!{\src2tex{textfont=tt}}
!!****f* ABINIT/bloch_interp
!! NAME
!! bloch_interp
!!
!! FUNCTION
!!
!! COPYRIGHT
!! Copyright (C) 2010 ABINIT group (MG)
!! This file is distributed under the terms of the
!! GNU General Public License, see ~abinit/COPYING
!! or http://www.gnu.org/copyleft/gpl.txt .
!! For the initials of contributors, see ~abinit/doc/developers/contributors.txt .
!!
!! INPUTS
!! jobz
!! accesswff
!! ofname
!! codvsn
!! Dtset
!! Wfd
!! Cryst<Crystal_structure>= data type gathering info on symmetries and unit cell
!! Psps <type(pseudopotential_type)>=variables related to pseudopotentials
!! Pawtab(Psps%ntypat) <type(pawtab_type)>=paw tabulated starting data
!! Pawang<pawang_type> angular mesh discretization and related data:
!! Pawrhoij
!! Paw_ij(natom)<type(paw_ij_type)>=data structure containing PAW arrays given on (i,j) channels.
!! spline_opt
!! ovlp_toleig
!! ngfft(18)=Information about 3D FFT.
!! nfft=Number of points in the FFT grid in vtrial. Might different from the FFT mesh used for the wavefunctions.
!! vtrial(nfft,nspden)= trial potential (Hartree)
!!
!! SIDE EFFECTS
!!  ioBSt
!!
!! PARENTS
!!
!! CHILDREN
!!
!! SOURCE

#if defined HAVE_CONFIG_H
#include "config.h"
#endif

#include "abi_common.h"

!#define DEV_BLC_SYMS

subroutine bloch_interp(jobz,accesswff,ofname,codvsn,Dtset,Wfd,Cryst,Psps,Pawtab,Pawang,Pawrhoij,Paw_ij,&
&  spline_opt,ovlp_toleig,ngfft,nfft,vtrial,ioBst)

 use defs_basis
 use defs_datatypes
 use defs_abitypes
 use m_xmpi
 use m_errors
 use m_cprj_bspline
 use m_header

 use m_fstrings,       only : starts_with
 use m_io_tools,       only : get_unit, flush_unit
 use m_numeric_tools,  only : print_arr
 use m_geometry,       only : vdotw !,normv
 use m_blas,           only : xdotc
 use m_abilasi,        only : xheev, xhegv, xheevx, xhegvx
 use m_crystal,        only : crystal_structure
 use m_gsphere,        only : get_kg
 use m_ebands,         only : update_occ, bst_write_bands, get_eneocc_vect
 use m_fft_mesh,       only : rotate_fft_mesh
 use m_bz_mesh,        only : bz_mesh_type, init_kmesh, get_BZ_item, destroy_bz_mesh_type
 use m_wfs,            only : wfd_get_ur, wfs_descriptor, fft_onewfn, wfd_distribute_kb_kpbp, wfd_reset_ur, wfd_get_cprj, &
&                             kdata_init, kdata_free, kdata_t, wfd_iam_master, wfd_barrier, wfd_change_ngfft, wfd_sym_ur
 use m_wffile,         only : wffile_type

!This section has been created automatically by the script Abilint (TD).
!Do not modify the following lines by hand.
 use interfaces_14_hidewrite
 use interfaces_32_util
 use interfaces_42_geometry
 use interfaces_51_manage_mpi
 use interfaces_53_abiutil
 use interfaces_53_ffts
 use interfaces_56_recipspace
 use interfaces_59_io_mpi
 use interfaces_61_ionetcdf
 use interfaces_62_iowfdenpot
 use interfaces_65_nonlocal
!End of the abilint section

 implicit none

!Arguments ------------------------------------
!scalars
 integer,intent(in) :: nfft,spline_opt,accesswff
 real(dp),intent(in) :: ovlp_toleig
 character(len=*),intent(in) :: jobz,ofname,codvsn
 type(Crystal_structure),intent(in) :: Cryst
 type(Pawang_type),intent(in) :: Pawang
 type(Pseudopotential_type),intent(in) :: Psps
 type(wfs_descriptor),intent(inout) :: Wfd
 type(bandstructure_type),intent(inout) :: ioBst
 type(dataset_type),intent(in) :: Dtset
!arrays
 integer,intent(in) :: ngfft(18)
 real(dp),intent(in) :: vtrial(nfft,Wfd%nspden)
 type(Pawtab_type),intent(in) :: Pawtab(Psps%ntypat*Psps%usepaw)
 type(paw_ij_type),intent(in) :: Paw_ij(Wfd%natom*Psps%usepaw)

!Local variables ------------------------------
!scalars
 integer,parameter :: EVAL_VNLK=0,USE_BSPLINE=1
 integer,parameter :: idir0=0,ider0=0,nnlout0=0,icg0=0,tim_rwwf0=0,tim_nonlop0=0
 integer :: ik1_bz,ik2_bz,ik1_ibz,ik2_ibz,ik_ibz,istat,ii,ib,jj,base,ierr
 integer :: band,band1,band2,nband_k,nband_k2,nband_k1,ikpt,ig,ik_bz,ll,rr
 integer :: optder,mkmem_,nkpg,dimffnl,umt_unt,blc_unt !sym3
 integer :: daf_reclen,daf_npw,daf_nspinor,daf_nband,daf_spin,daf_nkpt,daf_blc_size
 integer :: choice,cpopt,matblk,paw_opt,signs,step,nefound
 integer :: cplex_dij,ilmn,itypat
 integer :: ispden,isp,dimdij
 integer :: pertcase,fform,reclen,headform,optkg,rwwf_option,mcg,nband_disk !unwff,
 integer :: blc_size,blc,blc1,blc2,istwf_k,midx,formeig,rec_st,ldz
 !integer :: n1,n2,n3,n4,n5,n6
 integer :: spin,ovlp_size,row,col,useylm_ !,comm_self
 integer :: k1_sym,k2_sym,k1_tim,k2_tim,fft_idx !,inv_k2_sym,inv_k1_sym
 integer :: npw_k,onpw_k,iat,iatom,dimenl1,dimenl2,nnlout,band1_stop,kptopt,nspinor !,lk,rk
 integer :: indx !,with_sym,without_sym
 real(dp),parameter :: tnons_tol=tol8,lambda0=zero
 real(dp) :: sum_eige,ovlp_trace,shift,arg,fft_fact,ecut_eff
 real(dp) :: ene_fact,min_ovlp,max_ovlp,min_norm,max_norm
 complex(dpc) :: blk_ovlp,kin_ij,vnl_ij,cnorm,covlp
 complex(dpc) :: k1_eimkt,k2_eimkt !,tnons_fact
 logical :: k1_isirred,k2_isirred,want_eigenvectors,ltest,do_full_diago !,can_use_sym, take_cnj,
 character(len=500) :: msg,frmt1,frmt2
 character(len=fnlen) :: umt_fname,blc_fname
 type(MPI_type) :: MPI_enreg_seq
 type(BZ_mesh_type) :: Kmesh
 type(Kdata_t) :: Gdata
 type(Hdr_type) :: Hdr
 type(Wffile_type) :: Wff
!arrays
 !integer :: got(Wfd%nproc)
 integer :: nloalg(5),k1_umklp(3),k2_umklp(3),cp_idx(2,4),bspl_kdiv(3),bspl_kord(3)
 integer :: umt_unt_spin(Wfd%nsppol),blc_unt_spin(Wfd%nsppol)
 !integer :: l_gamma(3)=(/0,0,0/)
 !integer,allocatable :: bbp_ks_distrb(:,:,:,:)
 integer,allocatable :: bk2idx(:,:),idx2bk(:,:),typat_sort(:)
 integer,allocatable :: toinv(:,:),multable(:,:,:)
 integer,pointer :: kg_k(:,:),nlmn_sort(:)
 real(dp) :: k4intp(3),k_gamma(3)=(/zero,zero,zero/) !,spinrot_k(4),spinrot_kmq(4)
 real(dp) :: kptns_(3,1),kpoint(3),ylmgr_dum(1),enlout(1),dummy_ekb(0,0),dum_enlout(0)
 real(dp) :: kk1(3),kk2(3),estep(4),ovlp_paw(2) !r1_tau3(3),tau3(3)
 real(dp),pointer :: eig_k(:),occ_k(:)
 real(dp),allocatable :: ovlp_eigene(:),kdotg(:),half_gsq(:),intp_ene(:,:,:),enek_ij(:) !ks_ene(:)
 real(dp),allocatable :: ylm_k(:,:),dum_ylm_gr_k(:,:,:) 
 real(dp),allocatable :: ffnl(:,:,:,:),kpg_k(:,:)
 real(dp),allocatable :: ph3d(:,:,:),occfact(:)
 real(dp),allocatable :: enl(:,:,:),sij(:,:),phkxred(:,:),vectin(:,:),vnl_psi(:,:)
 real(dp),allocatable :: s_psi(:,:)
 real(dp),allocatable :: cg_k(:,:)
 complex(dpc),allocatable :: ur1_dpc(:)
 complex(gwpc),allocatable :: ur1(:),ur2(:)
 complex(gwpc),pointer :: ug1(:),ug2(:)
 complex(gwpc),target,allocatable :: blc_ug(:,:)
 complex(gwpc),allocatable :: blc_ur(:,:)
 complex(dpc),allocatable :: hk_ij(:,:),sk_ij(:,:),sum_ur(:),eig_vec(:,:),umat_k(:,:),vloc_ij(:,:)
 complex(gwpc),allocatable :: cvnl_psi(:),cs_psi(:),vloc_psi(:),wsg(:)
 complex(dpc),allocatable :: ovlp_mat(:,:),sym_ovlp_mat(:,:),ovlp_ikfk(:,:,:,:)
 !logical,allocatable :: bbp_mask(:,:)
 !logical :: k_needs_tr(2)
 character(len=10) :: spin_name(2)=(/'          ','          '/)
 character(len=fnlen) :: umt_fname_spin(Wfd%nsppol),blc_fname_spin(Wfd%nsppol)
 type(cprj_type),pointer :: Cp_k1(:,:),Cp_k2(:,:)
 type(cprj_type),allocatable :: Cp_blc1(:,:),Cp_blc2(:,:)
 type(cprj_type),allocatable,target :: Cp_left(:,:),Cp_right(:,:),Pku_bz(:,:,:,:) 
 type(pawrhoij_type),intent(in) :: Pawrhoij(Cryst%natom*Wfd%usepaw)
 type(cprj_bspl_t) :: Cp_bspl

 type :: blc_t

   integer :: blc_size
   ! size of the Bloch linear combination subspace.

   integer :: u_size

   ! size of the matrix.
   integer :: nspinor

   integer :: npw
   ! Nuber of planewaves in the blc_ug basis set.

   integer :: nkpt
   ! Number of k-points used for u_nk.

   integer :: fform
   ! The fileformat of blc_fname.

   character(len=fnlen) :: blc_fname
   ! The name of the file storing the transformation.

   integer,pointer :: idx2bk(:,:)
    ! idx2bk(2,u_size)
    ! band1  = idx2bk(1,ii) band index of u_nk for ii=1,u_size.
    ! ik1_bz = idx2bk(2,ii) k-point index in k_list of u_nk for ii=1,u_size.

   integer,pointer :: kg(:,:)
   ! kg(3,npw).

   real(dp),pointer :: klist(:,:)
   ! klist(3,nkpt)
   ! The set of k-points used for the u_nk.

   complex(dpc),pointer :: u_dot_blc(:,:)
   ! u_dot_blc(u_size,blc_size)
   ! Array storing the transformation <u_i|B_j>.
   
   complex(dpc),pointer :: blc_ug(:,:) 
   ! blc_ug(npw*nspinor,blc_size).

 end type blc_t

!************************************************************************

 DBG_ENTER("COLL")

 ABI_CHECK(nfft==PRODUCT(ngfft(1:3)),"FFT parallelism not coded")
 !
 ! vtrial might be given on a FFT mesh that is denser than the FFT used for wfs.
 if (ANY(ngfft(1:3)/=Wfd%ngfft(1:3))) call wfd_change_ngfft(Wfd,Cryst,Psps,ngfft) 

 if (Wfd%nsppol==2) spin_name=(/'SPIN_UP  ','SPIN_DOWN'/)

 want_eigenvectors = starts_with(jobz,(/"V"/),csens=.FALSE.)        
 if (want_eigenvectors) then
   call wrtout(std_out," Eigenvectors will be calculated and saved on file "//TRIM(ofname),"COLL")
   ABI_CHECK(accesswff==IO_MODE_FORTRAN,"Unsupported accesswff!")
   do spin=1,Wfd%nsppol
     umt_unt_spin(spin) = get_unit()
     umt_fname_spin(spin) = "UMAT_"//spin_name(spin)  
   end do
 else 
   call wrtout(std_out," Interpolating energies only. Eigenvectors won't be calculated.","COLL")
 end if

 if (spline_opt==EVAL_VNLK) then 
   call wrtout(std_out," Vnlk_ij will be evaluated on the interpolating k-mesh.","COLL")
 else if (spline_opt==USE_BSPLINE) then
   call wrtout(std_out," Vnlk_ij will be obtained on the interpolating k-mesh via B-spline functions.","COLL")
   ABI_CHECK(Psps%useylm==1,"B-spline technique requires useylm==1") ! Not completely true but it makes life easier.
 else 
   write(msg,'(a,i0)')" Wrong value for spline_opt:",spline_opt
   MSG_ERROR(msg)
 end if

 write(msg,'(a,f8.5)')" ovlp_toleig is ",ovlp_toleig
 call wrtout(std_out,msg,"COLL")

 do spin=1,Wfd%nsppol
   blc_unt_spin(spin) = get_unit()
   blc_fname_spin(spin) = "BLC_"//spin_name(spin)  
 end do

 call initmpi_seq(MPI_enreg_seq) ! Fake MPI_type for sequential part.

 nspinor = Wfd%nspinor
 ABI_CHECK(nspinor==1,"nspinor==2 not coded")

 kptopt=1 ! FIXME Time-reversal is always assumed.
 !call init_kmesh(Kmesh,Cryst,Wfd%nkibz,Wfd%kibz,kptopt,wrap_1zone=.TRUE.)
 call init_kmesh(Kmesh,Cryst,Wfd%nkibz,Wfd%kibz,kptopt,wrap_1zone=.FALSE.)
 !
 allocate(nlmn_sort(Cryst%natom)); iat=0 ! nlmn dims sorted by atom type.
 if (Wfd%usepaw==1) then
   nlmn_sort = Wfd%nlmn_sort
 else  ! FIXME here lmnmax == lnmax if useylm_==0
   nlmn_sort = 9
   if (Psps%useylm==1) then 
     nlmn_sort=Psps%lmnmax
   else
     MSG_ERROR("useylm==0 not coded")
   end if
   !do itypat=1,Cryst%ntypat
   !  nlmn_sort(iat+1:iat+Cryst%nattyp(itypat))=Pawtab(itypat)%lmn_size
   !  iat=iat+Cryst%nattyp(itypat)
   !end do
   !write(*,*)" hacking nlmn_sort",nlmn_sort
   write(*,*)" Psps%lmnmax is ",Psps%lmnmax
 end if

 allocate(typat_sort(Cryst%natom))
 do indx=1,Cryst%natom
   iat=Cryst%atindx1(indx)
   typat_sort(indx) = Cryst%typat(iat)
 end do

 fft_fact = one/Wfd%nfft
 allocate(ur1(Wfd%nfft*nspinor),ur2(Wfd%nfft*nspinor),ur1_dpc(Wfd%nfft*nspinor))

 nloalg = Wfd%nloalg

 allocate(multable(4,Cryst%nsym,Cryst%nsym),toinv(4,Cryst%nsym))
 call sg_multable(Cryst%nsym,Cryst%symafm,Cryst%symrel,Cryst%tnons,tnons_tol,ierr,multable=multable,toinv=toinv)
 ABI_CHECK(ierr==0,"Group error, cannot continue")

 allocate(intp_ene(ioBst%mband,ioBst%nkpt,Wfd%nsppol)); intp_ene=zero
 !
 ! ==== Begin big loop over spins ====
 do spin=1,Wfd%nsppol
   !
   ! ======================================================
   ! ====  <u_{ik b1}| u_{fk b2}>, ik in IBZ, fk in BZ ====
   ! ======================================================
   !
   ! For PAW we have to recalculate set projections in the IBZ setting k=0 in the exponential.
   ! TODO: For the time being, these terms are calculated in the BZ, symmetrization will be aded afterwards.
   if (Wfd%usepaw==1) then

     cpopt   = 0 ! Nothing is already calculated.
     choice  = 1
     matblk  = Cryst%natom
     allocate(Pku_bz(Wfd%natom,nspinor,Wfd%mband,Kmesh%nbz))

     do ik1_bz=1,Kmesh%nbz 

       call get_BZ_item(Kmesh,ik1_bz,kk1,ik1_ibz,k1_sym,k1_tim,k1_eimkt,k1_umklp,k1_isirred)

       kpoint  =  k_gamma
       npw_k   =  Wfd%npwwfn !Wfd%npwarr(ik1_ibz) ! TODO these quantities should be k-dependent.
       istwf_k =  Wfd%istwfk(ik1_ibz)
       kg_k    => Wfd%Kdata(ik1_ibz)%kg_k

       ! TODO use optional argument ecut
       call kdata_init(Gdata,Cryst,Psps,k_gamma,istwf_k,Wfd%ngfft,Wfd%MPI_enreg,kg_k=kg_k)
       !
       ! Compute (k+G) vectors
       nkpg=0
       allocate(kpg_k(npw_k,nkpg)); if (nkpg>0) call mkkpg(kg_k,kpg_k,k_gamma,nkpg,npw_k)

       allocate(vectin(2,npw_k*nspinor))
       ABI_CHECK(npw_k==Wfd%npwwfn,"Wrong npw")

       do band1=1,Wfd%nband(ik1_ibz,spin)

         call cprj_alloc(Pku_bz(:,:,band1,ik1_bz),0,nlmn_sort)

         if (.not.k1_isirred) then ! Perform Symmetrization in real space. 

           call wfd_sym_ur(Wfd,Cryst,Kmesh,band1,ik1_bz,spin,ur1)
           !
           ! FFT R -> G TODO Fix issue with double precision complex.
           ur1_dpc = ur1
           call fourdp_c2c_ip(ur1_dpc,-1,Wfd%MPI_enreg,Wfd%nfft,Wfd%ngfft,Wfd%paral_kgb,0)

           do ig=1,npw_k ! FFT box to G-sphere.
             fft_idx = Wfd%igfft0(ig)
             if (fft_idx/=0) then ! G-G0 belong to the FFT mesh.
               vectin(1,ig) = DBLE (ur1_dpc(fft_idx))
               vectin(2,ig) = AIMAG(ur1_dpc(fft_idx))
             else               
               vectin(:,ig) = zero ! Set this component to zero.
             end if
           end do
         else  ! Copy wavefunction in reciprocal space.
           vectin(1,:) = DBLE (Wfd%Wave(band1,ik1_ibz,spin)%ug)
           vectin(2,:) = AIMAG(Wfd%Wave(band1,ik1_ibz,spin)%ug)
         end if
         !
         ! Calculate sorted cprj.
         call getcprj(choice,cpopt,vectin,Pku_bz(:,:,band1,ik1_bz),0,0,Gdata%dim_fnlylm,dummy_ekb,Gdata%fnlylm,&
&          0,Wfd%indlmn,istwf_k,kg_k,kpg_k,kpoint,Wfd%lmnmax,matblk,Wfd%mgfft,Wfd%MPI_enreg,&
&          Cryst%natom,Cryst%nattyp,Wfd%ngfft,nkpg,nloalg,npw_k,nspinor,Cryst%ntypat,&
&          Gdata%phkxred,Wfd%ph1d,Gdata%ph3d,Cryst%ucvol,Wfd%usepaw,1)
       end do

       deallocate(vectin)
       deallocate(kpg_k,STAT=istat)
     end do

     call kdata_free(Gdata)
   end if
   !
   ! ==============================================================
   ! ==== Reconstruct full <u_{kb}| u_{k'b'}> matrix in the BZ ====
   ! ==============================================================
   ! 1) Symmetrization is done in real space. Easier, especially when k-centered G-sphere are used.
   !     u(r,b,kbz)=e^{-2i\pi kibz.(R^{-1}t} u (R{^-1}(r-t),b,kibz) 
   !               =e^{+2i\pi kibz.(R^{-1}t} u*({R^-1}(r-t),b,kibz) for time-reversal
   ! 2) Matrix is Hermitian.
   ! 3) <u_{Sk b}| u_{Sk b'}> are obtained from the previously calculated <u_{kb}| u_{kb'}> table.
   !
   ! {A,a} {B,b} = {AB, a+Ab}
   !
   ! Calculate the size of the overlap matrix and useful tables for MPI looping.
   allocate(bk2idx(Wfd%mband,Kmesh%nbz)); bk2idx=0
   allocate(idx2bk(2,Wfd%mband*Kmesh%nbz)); idx2bk=0

   ovlp_size=0
   do ik2_bz=1,Kmesh%nbz
     ik2_ibz = Kmesh%tab(ik2_bz)
     nband_k2 = Wfd%nband(ik2_ibz,spin)
     do band2=1,nband_k2
       ovlp_size = ovlp_size + 1
       bk2idx(band2,ik2_bz) = ovlp_size
       idx2bk(1,ovlp_size) = band2
       idx2bk(2,ovlp_size) = ik2_bz
     end do
   end do

   write(msg,'(a,f12.1,a,i0)')" Memory required for overlap matrix: ",2*dpc*ovlp_size**2*b2Mb," Mb; ovlp_size= ",ovlp_size
   call wrtout(std_out,msg,"COLL")

   allocate(ovlp_mat(ovlp_size,ovlp_size), STAT=istat) ! Might use packed matrix to save memory, but Lapack call is slower.
   ABI_CHECK(istat==0,"Out of memory in ovlp_mat")
   ovlp_mat=-HUGE(zero)

#ifdef DEV_BLC_SYMS
! FIXME does not work yet. Problems with umklapp symmorphic operations and symmetry tables somewhere.
   allocate(ovlp_ikfk(Wfd%mband,Wfd%nkibz,Wfd%mband,Kmesh%nbz)); ovlp_ikfk=+HUGE(one)
   
   do ik2_bz=1,Kmesh%nbz
     call get_BZ_item(Kmesh,ik2_bz,kk2,ik2_ibz,k2_sym,k2_tim,k2_eimkt,k2_umklp)
     !ik2_ibz = Kmesh%tab(ik2_bz)
     nband_k2 = Wfd%nband(ik2_ibz,spin)

     do band2=1,nband_k2 
       call wfd_sym_ur(Wfd,Cryst,Kmesh,band2,ik2_bz,spin,ur2)

       do ik1_ibz=1,Wfd%nkibz
         nband_k = Wfd%nband(ik1_ibz,spin)
         do band1=1,nband_k

           call wfd_get_ur(Wfd,band1,ik1_ibz,spin,ur1)
           covlp = xdotc(Wfd%nfft*nspinor,ur1,1,ur2,1) * fft_fact

           if (Wfd%usepaw==1) then ! TODO Add onsite term after IBZ-->BZ symmetrization
           end if

           ovlp_ikfk(band1,ik1_ibz,band2,ik2_bz) = covlp 
         end do
       end do
       !
     end do
   end do
   !
   ! FIXME: Equations are not completed. non-symmorphic phases are missing!
   ! Let <r|k> indicate the periodic part of the Bloch wavefunction that transforms according to:
   !   u_{Sk} = e^{-iSk.t} u_k(S^{-1}(r-t))
   !   S3 = S1^{-1} S2
   !
   ! 1) <S1 k1 | S2 k2> = <k1| S1^{-1}   S2 k2>  e^{i (S1 k1 - S2 k2).t1}  
   !
   ! 2) <T S1 k1 | T S2 k2> = <S1 k1| S2 k2>*     

   ! <S1 k1   | T S2 k2> = <k1| S1^{-1} T S2 k2>
   ! <T S1 k1 |   S2 k2> = <k2| S2^{-1} T S1 k1>^*
   ! <T S1 k1 | T S2 k2> = <k2| S2^{-1}   S1 k1>     ! Problematic if the BZ mesh is not invariant under inversion.
   !                                                   e.g. randomly shifted k-meshes. In this case one should use kptopt=3
   allocate(sym_ovlp_mat(ovlp_size,ovlp_size), STAT=istat) ! Might use packed Hermitian matrices but Lapack call should be slower.
   ABI_CHECK(istat==0,"Out of memory")
   sym_ovlp_mat=-HUGE(zero) !czero 

   call wrtout(std_out,"Using version with symmetries","COLL")
   with_sym=0; without_sym=0
   do ik2_bz=1,Kmesh%nbz

     call get_BZ_item(Kmesh,ik2_bz,kk2,ik2_ibz,k2_sym,k2_tim,k2_eimkt,k2_umklp)

     !$ik2_ibz = Kmesh%tab(ik2_bz)
     nband_k2 = Wfd%nband(ik2_ibz,spin)

     inv_k2_sym = toinv(1,k2_sym)
     k_needs_tr(2) = (k2_tim==2)

     do ik1_bz=1,ik2_bz !Kmesh%nbz
       call get_BZ_item(Kmesh,ik1_bz,kk1,ik1_ibz,k1_sym,k1_tim,k1_eimkt,k1_umklp)

       nband_k1 = Wfd%nband(ik1_ibz,spin)

       inv_k1_sym = toinv(1,k1_sym)

       k_needs_tr(1) = (k1_tim==2)

       !sym3 = multable(1,inv_k1_sym,k2_sym)

       rk= Kmesh%rottb(ik2_bz,1,inv_k1_sym)

       can_use_sym = .TRUE.
       !can_use_sym = ( ALL(ABS(Cryst%tnons(:,k2_sym))<tol6) .and. ALL(ABS(Cryst%tnons(:,k1_sym))<tol6) )
       !can_use_sym = can_use_sym .and. ( ALL(k2_umklp == l_gamma) .and. ALL(k1_umklp == l_gamma))

       can_use_sym = can_use_sym .and. ( ALL(k2_umklp == l_gamma) .and. ALL(k1_umklp == l_gamma) &
       & .and. ALL(multable(2:4,inv_k1_sym,k2_sym)==l_gamma) )

       !can_use_sym = ( can_use_sym .and. &
       !& ALL( ABS ( -Kmesh%bz(:,rk) + MATMUL(Cryst%symrec(:,:,inv_k1_sym),Kmesh%bz(:,ik2_bz)) ) < tol6 ) )

       !can_use_sym = ( can_use_sym .and. ALL(ABS(Cryst%tnons(:,k1_sym)) <tol6)  .and. ALL(ABS(Cryst%tnons(:,k2_sym)) <tol6) ) 

       kpoint = kk1 - kk2
       !do ii=1,3 ! Wrap in the first BZ thus enforcing traslational invariance.
       !  call wrap2_pmhalf(kk1(ii)-kk2(ii),kpoint(ii),shift)  ! TODO overloaded interface.
       !end do

       if (ANY (ABS(Cryst%tnons(:,k1_sym)) > tol6) ) then
         !tnons_fact = EXP(j_dpc*two_pi*DOT_PRODUCT(kk1-kk2,Cryst%tnons(:,k1_sym)))
         tnons_fact = EXP(j_dpc*two_pi*DOT_PRODUCT(kpoint,Cryst%tnons(:,k1_sym)))
       else 
         tnons_fact = cone
       end if

       take_cnj=ALL(k_needs_tr) 

       if (ALL(k_needs_tr) .or. ALL(.not.k_needs_tr) ) then
         lk = ik1_ibz
         rk= Kmesh%rottb(ik2_bz,1,inv_k1_sym)
         !rk= Kmesh%rottbm1(ik2_bz,1,k1_sym)
       else 
         MSG_ERROR("Need TR")
         take_cnj=.FALSE.
         !if (k_needs_tr(2)) then
         !lk = ik1_ibz
         !rk=Kmesh%rottb(ik2_bz,1,inv_k1_sym)
         !tnons_fact = CONJG(k1_eimkt) * CONJG(k2_eimkt) * EXP(-j_dpc*two_pi*DOT_PRODUCT(kk2,r1_tau3))
         !end if
       end if

       if (can_use_sym) then

         with_sym=with_sym+1
         do band2=1,nband_k2 
           col=bk2idx(band2,ik2_bz)

           band1_stop = nband_k1; if (ik1_bz==ik2_bz) band1_stop = band2 
           do band1=1,band1_stop
             blk_ovlp = tnons_fact * ovlp_ikfk(band1,lk,band2,rk)
             if (take_cnj) blk_ovlp = DCONJG(blk_ovlp)
             row=bk2idx(band1,ik1_bz)
             if (col>=row) sym_ovlp_mat(row,col) = blk_ovlp
           end do
         end do

       else 
         without_sym=without_sym+1
         do band2=1,nband_k2 
           col=bk2idx(band2,ik2_bz)

           call wfd_sym_ur(Wfd,Cryst,Kmesh,band2,ik2_bz,spin,ur2)

           band1_stop = nband_k1; if (ik1_bz==ik2_bz) band1_stop = band2 

           do band1=1,band1_stop

             call wfd_sym_ur(Wfd,Cryst,Kmesh,band1,ik1_bz,spin,ur1)

             blk_ovlp = xdotc(Wfd%nfft,ur1,1,ur2,1) * fft_fact
             row=bk2idx(band1,ik1_bz)
             if (col>=row) sym_ovlp_mat(row,col) = blk_ovlp

           end do
         end do
       end if

     end do
   end do

   write(*,*)"with_sym",with_sym," without_sym",without_sym
#endif

   !$if (Wfd%usepaw==1) then
   !$  allocate(Cp_k1 (Wfd%natom,nspinor))
   !$  call cprj_alloc(Cp_k1,0,Wfd%nlmn_atm)

   !$  allocate(Cp_k2 (Wfd%natom,nspinor))
   !$  call cprj_alloc(Cp_k2,0,Wfd%nlmn_atm)
   !$end if

   call wrtout(std_out,"Using version without symmetries","COLL")
   do ik2_bz=1,Kmesh%nbz
     call get_BZ_item(Kmesh,ik2_bz,kk2,ik2_ibz,k2_sym,k2_tim,k2_eimkt,k2_umklp,k2_isirred)

     !$ik2_ibz = Kmesh%tab(ik2_bz)

     nband_k2 = Wfd%nband(ik2_ibz,spin)

     do band2=1,nband_k2 

       col=bk2idx(band2,ik2_bz)

       ug2 => Wfd%Wave(band2,ik2_ibz,spin)%ug
       call wfd_sym_ur(Wfd,Cryst,Kmesh,band2,ik2_bz,spin,ur2)

       if (Wfd%usepaw==1) then 
         Cp_k2 => Pku_bz(:,:,band2,ik2_bz)
         !call paw_symcprj(ik2_bz,nspinor,1,Cryst,Kmesh,Psps,Pawtab,Pawang,Cp_k2) 
       end if 

       do ik1_bz=1,ik2_bz 

         !$ik1_ibz = Kmesh%tab(ik1_bz)
         call get_BZ_item(Kmesh,ik1_bz,kk1,ik1_ibz,k1_sym,k1_tim,k1_eimkt,k1_umklp,k1_isirred)

         nband_k1 = Wfd%nband(ik1_ibz,spin)
         band1_stop = Wfd%nband(ik1_ibz,spin); if (ik1_bz==ik2_bz) band1_stop = band2 

         do band1=1,band1_stop
       
           row=bk2idx(band1,ik1_bz)

           if (k2_isirred.and.k1_isirred) then
             ug1 => Wfd%Wave(band1,ik1_ibz,spin)%ug
             blk_ovlp = xdotc(Wfd%npwwfn*nspinor,ug1,1,ug2,1)
           else 
             call wfd_sym_ur(Wfd,Cryst,Kmesh,band1,ik1_bz,spin,ur1)
             blk_ovlp = xdotc(Wfd%nfft*nspinor,ur1,1,ur2,1)/Wfd%nfft
           end if

           if (Wfd%usepaw==1) then
             !$call paw_symcprj(ik1_bz,nspinor,1,Cryst,Kmesh,Psps,Pawtab,Pawang,Cp_k1) 
             Cp_k1 => Pku_bz(:,:,band1,ik1_bz)
             ovlp_paw = paw_overlap(Cp_k1,Cp_k2,typat_sort,Pawtab) ! Be careful as Cp are always sorted.
             blk_ovlp = blk_ovlp + DCMPLX(ovlp_paw(1),ovlp_paw(2))
           end if

           ovlp_mat(row,col) = blk_ovlp
           !if (row==col) ovlp_mat(row,col) = cone ! This  might be needed if elements are not accurate enough.
         end do
       end do
     end do
   end do

!DEBUG
#ifdef  DEV_BLC_SYMS
   write(66,*)"Error in matrix elements"
   do ik2_bz=1,Kmesh%nbz
      ik2_ibz = Kmesh%tab(ik2_bz)
      nband_k2 = Wfd%nband(ik2_ibz,spin)
      do band2=1,nband_k2 
        col=bk2idx(band2,ik2_bz)
        do ik1_bz=1,ik2_bz 
          ik1_ibz = Kmesh%tab(ik1_bz)
          nband_k1 = Wfd%nband(ik1_ibz,spin)
          band1_stop = Wfd%nband(ik1_ibz,spin); if (ik1_bz==ik2_bz) band1_stop = band2 
          do band1=1,band1_stop
            row=bk2idx(band1,ik1_bz)
            if (ABS(sym_ovlp_mat(row,col)-ovlp_mat(row,col))>tol6) then
              call get_BZ_item(Kmesh,ik2_bz,kk2,ik2_ibz,k2_sym,k2_tim,k2_eimkt,k2_umklp)
              call get_BZ_item(Kmesh,ik1_bz,kk1,ik1_ibz,k1_sym,k1_tim,k1_eimkt,k1_umklp)
              write(66,'(2i3,2(2x,a,3f7.3),4f8.4)')&
&               row,col," bz1 ",Kmesh%bz(:,ik1_bz)," bz2 ",Kmesh%bz(:,ik2_bz),ovlp_mat(row,col),sym_ovlp_mat(row,col)
              write(66,'(a,i3,3i3,2f4.1,3i3)')"k2 ",ik2_bz,ik2_ibz,k2_sym,k2_tim,k2_eimkt,k2_umklp
              write(66,'(a,i3,3i3,2f4.1,3i3)')"k1 ",ik1_bz,ik1_ibz,k1_sym,k1_tim,k1_eimkt,k1_umklp
            end if
          end do
        end do
     end do
   end do
#endif
!ENDDEBUG

   call print_arr(ovlp_mat,max_r=40,max_c=12,unit=std_out,mode_paral="COLL")
   if (allocated(sym_ovlp_mat)) deallocate(sym_ovlp_mat)
   if (allocated(ovlp_ikfk)) deallocate(ovlp_ikfk)
   deallocate(bk2idx)

   if (Wfd%usepaw==1) then
     do ik_bz=1,Kmesh%nbz
       ik_ibz=Kmesh%tab(ik_bz)
       do band1=1,Wfd%nband(ik1_ibz,spin)
         call cprj_free(Pku_bz(:,:,band1,ik_bz))
       end do
     end do
     deallocate(Pku_bz)
     !$call cprj_free(Cp_k1); deallocate(Cp_k1)
     !$call cprj_free(Cp_k2); deallocate(Cp_k2)
   end if
   !
   ! ========================================
   ! ==== Diagonalize the overlap matrix ====
   ! ========================================
   allocate(ovlp_eigene(ovlp_size))
   call xheev("Vectors","Upper",ovlp_size,ovlp_mat,ovlp_eigene)

   ovlp_trace = SUM(ovlp_eigene)
   write(msg,'(3(a,f8.2))')" Trace: ",ovlp_trace," Min eig: ",MINVAL(ovlp_eigene)," Max eig: ",MAXVAL(ovlp_eigene)
   call wrtout(std_out,msg,"COLL")
   !
   ! Select the optimal subspace.
   sum_eige = zero; ii=1 !ovlp_size+1
   do while (ovlp_eigene(ii) < ovlp_toleig .and. ii/=ovlp_size)
     sum_eige = sum_eige + ovlp_eigene(ii)
     ii = ii+1
   end do
   blc_size = ovlp_size-ii+1; base = ii-1 

   write(msg,'(a,i0,a)')" Bloch states linear combination sub-space contains ",blc_size," elements."
   call wrtout(std_out,msg,"COLL")

   if (blc_size<ioBst%mband) then
     if (ovlp_size<ioBst%mband) then
       write(msg,'(2(a,i0),2a)')&
&        " Bloch state manifold has only ",ovlp_size," states while ioBst%mband= ",ioBst%mband,ch10,&
&        " Decrease mband or increase the number of input ab-initio Bloch states."
       MSG_ERROR(msg)
     end if
     blc_size = ioBst%mband
     write(msg,'(a,i0)')" Had to enlarge Bloch-state manifold due to input mband; new blc_size: ",blc_size
     MSG_COMMENT(msg)
   end if
   !
   ! =====================================================================
   ! ==== Rotate the input wavefunctions to get the optimal basis set ====
   ! =====================================================================
   ! *) Keep the optimal wavefunctions on each node (if possible) to facilitate
   !    the interpolation over the fine k-mesh. 
   ! *) Use Gamma-centered basis set to facilitate the operations in reciprocal space.
   ! *) The new basis is orthogonal, but not normalized since <U_i|U_j> = delta_ij e_i.
   !

   !istwfk=1; nsppol_=1; nkibz_=1: nband_(:,:)=blc_size
   !$call wfd_init(Wlcd,Cryst,Pawtab,Psps,keep_ur,Wfd%paral_kgb,Wfd%npwwfn,blc_size,nband_,nkibz_,nsppol_,bks_mask,&
   !$&  Wfd%nspden,Wfd%nspinor,istwfk,kibz,ngfft,mg0,gvec,nloalg,comm)

   allocate(blc_ug(Wfd%npwwfn*nspinor,blc_size))
   allocate(sum_ur(Wfd%nfft*nspinor))

   ! TODO invert the loop below (more memory but faster).

   do blc=1,blc_size ! Construct new optimal basis set.

     sum_ur = czero
     do midx=1,ovlp_size ! Loop over the Bloch set in the BZ.
       band1  = idx2bk(1,midx)
       ik1_bz = idx2bk(2,midx)

       call wfd_sym_ur(Wfd,Cryst,Kmesh,band1,ik1_bz,spin,ur1)

       sum_ur = sum_ur + ovlp_mat(midx,base+blc) * ur1
     end do

     cnorm = xdotc(Wfd%nfft*nspinor,sum_ur,1,sum_ur,1) * fft_fact
     !write(std_out,*)" sum_ur norm = ",cnorm," eigene ",ovlp_eigene(base+blc)

     ! FFT R -> G
     kg_k => Wfd%Kdata(1)%kg_k

     ! FIXME Buggy call if SG routines are used due to sg_fftrisc with option=3.
     !call padded_fourwf_cplx(sum_ur,Wfd%ngfft,n1,n2,n3,n1,n2,n3,Wfd%mgfft,-1,Wfd%gbound)

     call fourdp_c2c_ip(sum_ur,-1,Wfd%MPI_enreg,Wfd%nfft*nspinor,Wfd%ngfft,Wfd%paral_kgb,0)

     do ig=1,Wfd%npwwfn       ! Have to map FFT to G-sphere.
       fft_idx = Wfd%igfft0(ig)
       if (fft_idx/=0) then ! G-G0 belong to the FFT mesh.
         blc_ug(ig,blc)=sum_ur(fft_idx) 
       else               ! Set this component to zero.
         blc_ug(ig,blc)=czero
       end if
     end do

     ! Normalize the basis set
     blc_ug(:,blc) = blc_ug(:,blc)/SQRT(ovlp_eigene(base+blc)) ! Normalize the basis set using the eigenvalues of the overlap matrix.

     !cnorm = xdotc(Wfd%npwwfn*nspinor,blc_ug(:,blc),1,blc_ug(:,blc),1)
     !cnorm = DSQRT(DBLE(cnorm)) 
     !blc_ug(:,blc) = blc_ug(:,blc)/cnorm 
   end do ! blc

   deallocate(idx2bk)
   deallocate(sum_ur)
   deallocate(ovlp_mat,ovlp_eigene)
   !
   !  Test on the orthogonalization of wavefunctions. meaningless for PAW as Cprj have are not consistent.
   min_ovlp=HUGE(one); max_ovlp=-HUGE(0)
   min_norm=HUGE(one); max_norm=-HUGE(0)
   do blc2=1,blc_size
     do blc1=1,blc2-1
       covlp = xdotc(Wfd%npwwfn,blc_ug(:,blc1),1,blc_ug(:,blc2),1)
       if (ABS(covlp)<min_ovlp) min_ovlp=ABS(covlp)
       if (ABS(covlp)>max_ovlp) max_ovlp=ABS(covlp)
     end do
     covlp = xdotc(Wfd%npwwfn,blc_ug(:,blc2),1,blc_ug(:,blc2),1)
     if (ABS(covlp)<min_norm) min_norm=ABS(covlp)
     if (ABS(covlp)>max_norm) max_norm=ABS(covlp)
   end do

   write(std_out,*)" Min |<U_i|U_j>|= ",min_ovlp,", Max |<U_i|U_j>|= ",max_ovlp
   write(std_out,*)" Min SQRT(|<U_i|U_i>|)= ",min_norm,", Max SQRT(|<U_i|U_i>|)= ",max_norm
   call flush_unit()
   !
   ! =====================================
   ! ==== Write optimal basis on file ====
   ! =====================================
   ! TODO Now I really need to introduce communicators for the two spins.
   ! as only the master for this spin has to write data!
   if ( wfd_iam_master(Wfd,spin) ) then 
     blc_fname = blc_fname_spin(spin); blc_unt = blc_unt_spin(spin); reclen = Wfd%npwwfn*get_reclen("dpc") 
     open(unit=blc_unt,file=blc_fname,access='direct',recl=reclen)

     write(blc_unt,rec=1)reclen,Wfd%npwwfn,nspinor,blc_size,spin
     do blc=1,blc_size 
       !$write(blc_unt,rec=1+blc)blc_ug(:,blc)
     end do

     close(unit=blc_unt)
   end if
   !
   ! ============================================================================
   ! ==== Evaluate <p_lmn|e^(ikr)U_i> for each k on the k-grid  and each U_i ====
   ! ============================================================================
   !
   ! Here I assume that the G-sphere is gamma-centered.
   ! Real Spherical Harmonics are always used to apply the non-local part even for NC pseudos.
   ! I did non find any easy way to extract only <p_nl|psi> from nonlop_pl.
   !useylm_=1
   useylm_=Psps%useylm
   npw_k=Wfd%npwwfn
   kg_k => Wfd%Kdata(1)%kg_k

   allocate(Cp_blc2 (Wfd%natom,nspinor))
   call cprj_alloc(Cp_blc2, 0,nlmn_sort)

   if (spline_opt==USE_BSPLINE) then
     allocate(Cp_blc1 (Wfd%natom,nspinor))
     call cprj_alloc(Cp_blc1, 0,nlmn_sort)
     allocate(Cp_left (Wfd%natom,nspinor))
     allocate(Cp_right(Wfd%natom,nspinor))
     call cprj_alloc(Cp_left, 0,nlmn_sort)
     call cprj_alloc(Cp_right,0,nlmn_sort)
   end if
   !
   ! ==============================
   ! ==== Prepare NL strengths ====
   ! ==============================
   !
   if (Wfd%usepaw==0) then ! Norm-conserving => constant Kleimann-Bylander energies function of (n,l).
     dimenl1=Psps%dimekb
     dimenl2=Cryst%ntypat
     allocate(enl(Psps%dimekb,dimenl2,nspinor**2))
     allocate(sij(0,0))
     enl(:,:,1)=Psps%ekb(:,:)
     if (nspinor==2) then
       enl(:,:,2)=Psps%ekb(:,:)
       enl(:,:,3:4)=zero
     end if
                                                                                                     
   else ! PAW: store overlap coefficients and allocate memory for Dij coefficients (spin dependent) 
     cplex_dij=Paw_ij(1)%cplex_dij
     dimenl1=Psps%dimekb*cplex_dij
     dimenl2=Cryst%natom
     allocate(enl(dimenl1,dimenl2,nspinor**2))
     allocate(sij(dimenl1,Cryst%ntypat))
     do itypat=1,Cryst%ntypat
       if (cplex_dij==1) then
         sij(1:pawtab(itypat)%lmn2_size,itypat)=pawtab(itypat)%sij(:)
       else
         do ilmn=1,pawtab(itypat)%lmn2_size
           sij(2*ilmn-1,itypat)=pawtab(itypat)%sij(ilmn)
           sij(2*ilmn  ,itypat)=zero
         end do
       end if
     end do
     !
     ! Retrieve PAW Dij coefficients for this spin component
     do ispden=1,nspinor**2
       isp=spin; if (nspinor==2) isp=ispden
       do iatom=1,Wfd%natom
         dimdij=Paw_ij(iatom)%cplex_dij*Paw_ij(iatom)%lmn2_size
         do ilmn=1,dimdij
           enl(ilmn,iatom,ispden)=paw_ij(iatom)%dij(ilmn,isp)
         end do
         if (dimdij+1<=dimenl1) enl(dimdij+1:dimenl1,iatom,ispden)=zero
       end do
     end do
   end if
   !
   ! ============================
   ! ==== B-spline for Vnl_k ====
   ! ============================
   ! * Tabulate Vnlk_ij on the homogeneous mesh of k-points. 
   ! * useylm is used.
   if (spline_opt==USE_BSPLINE) then 

     ABI_CHECK(Wfd%usepaw==0,"PAW with B-splines not coded")
     ABI_CHECK(Psps%useylm==1,"useylm must be 1")

     bspl_kdiv = (/4,4,4/) ! TODO should be input.
     !bspl_kdiv = (/8,8,8/) ! TODO should be input.
     !bspl_kord = (/1,1,1/) ! TODO should be input.
     bspl_kord = bspl_kdiv ! TODO should be input.

     kg_k    => Wfd%Kdata(1)%kg_k

     call cprj_bspline_init(Cp_bspl,Cryst,Psps,Pawtab,Pawang,nspinor,Wfd%ngfft,bspl_kdiv,bspl_kord,&
&      Wfd%npwwfn,blc_size,kg_k,blc_ug,nlmn_sort) 
   end if
   !
   ! ========================================
   ! ==== Precompute <blc2| vloc |blc1> ====
   ! ========================================
   !
   allocate(vloc_psi(Wfd%nfft))  ! <r|Vloc|Cnk>

   allocate(vloc_ij(blc_size,blc_size), STAT=istat)
   ABI_CHECK(istat==0,"Out of memory in vloc_ij")

   allocate(blc_ur(Wfd%nfft*nspinor,blc_size))

   do blc2=1,blc_size  
     ug2 => blc_ug(:,blc2) 

     istwf_k = 1
     kg_k => Wfd%Kdata(1)%kg_k

     call fft_onewfn(Wfd%paral_kgb,istwf_k,nspinor,Wfd%npwwfn,Wfd%nfft,ug2,ur2,&
&      Wfd%igfft0,Wfd%ngfft,kg_k,Wfd%gbound,0,Wfd%MPI_enreg)

      blc_ur(:,blc2) = ur2
   end do

   do blc2=1,blc_size  
     ug2 => blc_ug(:,blc2) 

     istwf_k = 1
     kg_k => Wfd%Kdata(1)%kg_k

     ur2 = blc_ur(:,blc2)

     if (Wfd%nspinor==1) then
       vloc_psi = ur2*vtrial(:,spin)
     else 
       MSG_ERROR("vloc_psi doesn't support nspinor==2")
     end if

     vloc_ij(blc2,blc2) = xdotc(Wfd%nfft,ur2,1,vloc_psi,1)*fft_fact

     do blc1=1,blc2-1 ! Upper triangle of the vloc_ij matrix
       ur1 = blc_ur(:,blc1)
       vloc_ij(blc1,blc2) = xdotc(Wfd%nfft,ur1,1,vloc_psi,1)*fft_fact
     end do
   end do

   deallocate(vloc_psi)
   deallocate(blc_ur)
   !
   ! =======================
   ! ==== Interpolation ====
   ! =======================
   !
   allocate(hk_ij(blc_size,blc_size)); hk_ij=HUGE(one)
   if (Wfd%usepaw==1) then 
     allocate(sk_ij(blc_size,blc_size)); sk_ij=HUGE(one)
   end if

   if (want_eigenvectors) then ! Open tmp file for unitary transformation.
     umt_fname = umt_fname_spin(spin); umt_unt = umt_unt_spin(spin); reclen = blc_size*get_reclen("dpc") 
     open(unit=umt_unt,file=umt_fname,access='direct',recl=reclen)
     write(umt_unt,rec=1)reclen,blc_size,ioBst%nkpt,spin
   end if

   matblk=nloalg(4); if (nloalg(1)>0) matblk=Cryst%natom

   do ikpt=1,ioBst%nkpt ! K-points for interpolation.
     !
     do ii=1,3 ! Wrap in the first BZ thus enforcing traslational invariance.
       call wrap2_pmhalf(ioBst%kptns(ii,ikpt),k4intp(ii),shift)  ! TODO overloaded interface.
     end do
     !
     ! Gamma-centered basis set.
     istwf_k = 1
     kg_k => Wfd%Kdata(1)%kg_k
     npw_k = Wfd%Kdata(1)%npw
     !
     ! Prepare the kinetic term.
     allocate(kdotg(npw_k),half_gsq(npw_k),wsg(npw_k))
     allocate(vectin(2, npw_k*nspinor))  
     allocate(vnl_psi(2,npw_k*nspinor)) ! <G|Vnl|Cnk>
     allocate(cvnl_psi(npw_k*nspinor))  ! <G|Vnl|Cnk>, complex version!

     if (Wfd%usepaw==1) then 
       allocate(s_psi(2,npw_k*nspinor)) ! <G|1+S|Cnk>
       allocate(cs_psi(npw_k*nspinor))
     end if

     do ig=1,npw_k 
       kdotg(ig)    = two_pi**2 * DOT_PRODUCT(k4intp,MATMUL(Cryst%gmet,kg_k(:,ig)))
       half_gsq(ig) = half * vdotw(one*kg_k(:,ig),one*kg_k(:,ig),Cryst%gmet,"G")  ! TODO Add new overloaded interface.
     end do

! THIS PART IS NEEDED FOR THE CALL TO opernl although some quantities won't be used.
! Now I do things cleanly then we try to pass zero-sized arrays!
     allocate(ylm_k(npw_k,Psps%mpsang**2*useylm_))

     if (useylm_==1) then
       kptns_(:,1)=k4intp; optder=0; mkmem_=1
       allocate(dum_ylm_gr_k(npw_k,3+6*(optder/2),Psps%mpsang**2))

       !  Here mband is not used if paral_compil_kpt=0
       call initylmg(Cryst%gprimd,kg_k,kptns_,mkmem_,MPI_enreg_seq,Psps%mpsang,npw_k,(/1/),1,&
&        (/npw_k/),1,optder,Cryst%rprimd,0,0,ylm_k,dum_ylm_gr_k)

       deallocate(dum_ylm_gr_k, STAT=istat)
     end if
     !
     ! Compute (k+G) vectors (only if useylm_=1)
     nkpg=3*nloalg(5)  
     allocate(kpg_k(npw_k,nkpg))
     if (nkpg>0) call mkkpg(kg_k,kpg_k,k4intp,nkpg,npw_k)
     !
     ! ========================================================
     ! ==== Compute nonlocal form factors ffnl at all (k+G) ====
     ! ========================================================
     !
     dimffnl=1+ider0 ! Derivatives are not needed. 
     allocate(ffnl(npw_k,dimffnl,Psps%lmnmax,Psps%ntypat))

     call mkffnl(Psps%dimekb,dimffnl,Psps%ekb,ffnl,Psps%ffspl,Cryst%gmet,Cryst%gprimd,ider0,idir0,Psps%indlmn,&
&      kg_k,kpg_k,k4intp,Psps%lmnmax,Psps%lnmax,Psps%mpsang,Psps%mqgrid_ff,nkpg,npw_k,& 
&      Psps%ntypat,Psps%pspso,Psps%qgrid_ff,Cryst%rmet,Psps%usepaw,useylm_,ylm_k,ylmgr_dum)

     deallocate(ylm_k)
     !
     ! Allocate the arrays phkxred and ph3d, compute phkxred and eventually ph3d.
     allocate(phkxred(2,Cryst%natom))
     do iat=1,Cryst%natom
       iatom=Cryst%atindx(iat)
       arg=two_pi*DOT_PRODUCT(k4intp,Cryst%xred(:,iat))
       phkxred(1,iatom)=DCOS(arg)
       phkxred(2,iatom)=DSIN(arg)
     end do

     allocate(ph3d(2,npw_k,matblk), STAT=istat)
     if (nloalg(1)>0) then ! Allocation as well as precomputation
       if (MPI_enreg_seq%mode_para/='b') then
         call ph1d3d(1,Cryst%natom,kg_k,matblk,Cryst%natom,npw_k,Wfd%ngfft(1),Wfd%ngfft(2),Wfd%ngfft(3),phkxred,Wfd%ph1d,ph3d)
       else 
         MSG_ERROR("Stop not coded")
       end if
     end if
!END BORING CODE NEEDED TO CALL opernl
     !
     ! Calculate the upper triangle of <blc2| H_k |blc1>.
     do blc2=1,blc_size  
       ug2 => blc_ug(:,blc2) 

       if (spline_opt==EVAL_VNLK) then
         vectin(1,:) = DBLE (blc_ug(:,blc2))  ! Input wavefunction coefficients <G|Cnk>
         vectin(2,:) = AIMAG(blc_ug(:,blc2))

         signs  =2     ! => apply the non-local operator to a function in G-space.
         choice =1     ! => <G|V_nonlocal|vectin>.
         cpopt  =-1
         if (Wfd%usepaw==1) cpopt=0     ! => <p_lmn|in> are computed and saved.
         !cpopt  =3    ! => <p_lmn|in> are already in memory.
         paw_opt=0; if (Wfd%usepaw==1) paw_opt=4 ! both PAW nonlocal part of H (Dij) and overlap matrix (Sij)
         !paw_opt=Wfd%usepaw  !Norm-conserving Vnl (use of Kleinman-Bylander ener.) PAW nonlocal part of H (use of Dij coeffs)

         call nonlop(Cryst%atindx1,choice,cpopt,Cp_blc2,dimenl1,dimenl2,dimffnl,dimffnl,&
&          enl,dum_enlout,ffnl,ffnl,Cryst%gmet,Cryst%gprimd,idir0,Psps%indlmn,istwf_k,&
&          kg_k,kg_k,kpg_k,kpg_k,k4intp,k4intp,lambda0,Psps%lmnmax,matblk,Wfd%mgfft,&
&          MPI_enreg_seq,Psps%mpsang,Psps%mpssoang,Cryst%natom,Cryst%nattyp,Wfd%ngfft,nkpg,nkpg,nloalg,&
&          nnlout0,npw_k,npw_k,nspinor,Cryst%ntypat,0,paw_opt,phkxred,phkxred,Wfd%ph1d,ph3d,ph3d,signs,sij,&
&          s_psi,tim_nonlop0,Cryst%ucvol,Psps%useylm,vectin,vnl_psi)

         cvnl_psi = DCMPLX(vnl_psi(1,:),vnl_psi(2,:))
         if (Wfd%usepaw==1) cs_psi = DCMPLX(s_psi(1,:),s_psi(2,:))

       else if (spline_opt==USE_BSPLINE) then
         call cprj_bspline_eval(Cp_bspl,blc2,k4intp,Cp_blc2)
       end if

       do blc1=1,blc2 ! Upper triangle of the hk_ij matrix
         ug1 => blc_ug(:,blc1) 

         if (spline_opt==USE_BSPLINE) then

           call cprj_bspline_eval(Cp_bspl,blc1,k4intp,Cp_blc1)

           ! nonlop_ylm calculates diagonal matrix elements of Vnl while we need 
           ! complex off-diagonal terms as well. Assuming that the pseudotential strengths are
           ! real we can rewrite A_i^* D_{ij} B_j as.
           !
           ! A_i^* D_ij B_j = [rea_i rb_j + ima_i imb_j] D_ij +i [-ima_i reb_j + rea_i imb_j ] D_ij

           cp_idx = RESHAPE( (/1,1,2,2,2,1,1,2/),(/2,4/) )

           do iat=1,Cryst%natom ! already done in cprj_alloc but oh well.
             Cp_left (iat,1)%cp(2,:)=zero
             Cp_right(iat,1)%cp(2,:)=zero
           end do
           !
           ! Non-local part. This coding assumes that Dij are real Spinor not coded! 
           ! TODO Sk_ij for PAW
           do step=1,4    
             ll = cp_idx(1,step)
             rr = cp_idx(2,step)

             do iat=1,Cryst%natom 
               if (ll==2.and.rr==1) then
                 Cp_left (iat,1)%cp(1,:) = -Cp_blc1(iat,1)%cp(ll,:) 
               else
                 Cp_left (iat,1)%cp(1,:) = Cp_blc1(iat,1)%cp(ll,:) 
               end if
               Cp_right(iat,1)%cp(1,:) = Cp_blc2(iat,1)%cp(rr,:) 
             end do

             signs  =1     ! get contracted elements (energy, forces, stress, ...)
             choice =1     ! => a non-local energy contribution
             cpopt  =2     ! right <p_lmn|in> are already in memory;
             nnlout =1
             paw_opt=0; if (Wfd%usepaw==1) paw_opt=4 ! both PAW nonlocal part of H (Dij) and overlap matrix (Sij)
             istwf_k=2
             !paw_opt=Wfd%usepaw  !Norm-conserving Vnl (use of Kleinman-Bylander ener.) PAW nonlocal part of H (use of Dij coeffs)
             !paw_opt=1

             ! Calling hacked version of nonlop_ylm TODO implement other cases and sij as well.
             call nonlop_ylm(Cryst%atindx1,choice,cpopt,Cp_right,dimenl1,dimenl2,dimffnl,dimffnl,&
&              enl,enlout,ffnl,ffnl,Cryst%gprimd,idir0,Psps%indlmn,istwf_k,&
&              kg_k,kg_k,kpg_k,kpg_k,k4intp,k4intp,lambda0,Psps%lmnmax,matblk,Wfd%mgfft,&
&              MPI_enreg_seq,Cryst%natom,Cryst%nattyp,Wfd%ngfft,nkpg,nkpg,nloalg,nnlout,&
&              npw_k,npw_k,nspinor,Cryst%ntypat,paw_opt,phkxred,phkxred,Wfd%ph1d,&
&              ph3d,ph3d,signs,sij,vectin,Cryst%ucvol,vectin,vectin,cprjin_left=Cp_left)

             estep(step) = enlout(1)
           end do ! step
           vnl_ij  = DCMPLX( SUM(estep(1:2)), SUM(estep(3:4)) )

          if (blc1==blc2) then ! Calling hacked version of nonlop_ylm TODO implement other cases and sij as well.
             call nonlop_ylm(Cryst%atindx1,choice,cpopt,Cp_blc1,dimenl1,dimenl2,dimffnl,dimffnl,&
&              enl,enlout,ffnl,ffnl,Cryst%gprimd,idir0,Psps%indlmn,istwf_k,&
&              kg_k,kg_k,kpg_k,kpg_k,k4intp,k4intp,lambda0,Psps%lmnmax,matblk,Wfd%mgfft,&
&              MPI_enreg_seq,Cryst%natom,Cryst%nattyp,Wfd%ngfft,nkpg,nkpg,nloalg,nnlout,&
&              npw_k,npw_k,nspinor,Cryst%ntypat,paw_opt,phkxred,phkxred,Wfd%ph1d,&
&              ph3d,ph3d,signs,sij,vectin,Cryst%ucvol,vectin,vectin)

             vnl_ij= DCMPLX( enlout(1), zero)
           end if

           if(blc1==blc2) write(777,*)blc1,blc2,vnl_ij
         end if
         !
         ! ====================================
         ! ==== Assemble final Hamiltonian ====
         ! ====================================
         !
         ! Kinetic energy.
         wsg = kdotg*ug2
         kin_ij = xdotc(npw_k,ug1,1,wsg,1) 

         wsg = half_gsq*ug2
         kin_ij = kin_ij + xdotc(npw_k,ug1,1,wsg,1) 
         if (blc1==blc2) kin_ij = kin_ij + half * vdotw(k4intp,k4intp,Cryst%gmet,"G")
         !
         ! Vnl term.
         if (spline_opt==EVAL_VNLK) then
           vnl_ij  = xdotc(npw_k*nspinor,ug1,1,cvnl_psi,1)
           !if (blc1==blc2) write(778,*)blc1,blc2,vnl_ij
         end if
         !
         ! Total Hamiltonian.
         hk_ij(blc1,blc2) = kin_ij + vloc_ij(blc1,blc2) + vnl_ij

         if (Wfd%usepaw==1) then  ! Overlap operator.
           sk_ij(blc1,blc2) = xdotc(npw_k*nspinor,ug1,1,cs_psi,1)
           !if (blc1==blc2) sk_ij(blc1,blc2) = sk_ij(blc1,blc2) + xdotc(npw_k*nspinor,ug1,1,ug2,1)
         end if
       end do
     end do
     !
     ! =========================================================
     ! ==== Diagonalize Hk_ij in the optimal Bloch supspace ====
     ! =========================================================
     nband_k = ioBst%nband(ikpt+(spin-1)*ioBst%nkpt)
     allocate(enek_ij(blc_size))

     do_full_diago = (nband_k == blc_size)
     if (.not.do_full_diago) then
       ldz=1; if (want_eigenvectors) ldz=blc_size
       allocate(eig_vec(ldz,nband_k), STAT=istat)
       if (istat/=0) then ! Try to continue.
         MSG_WARNING("Allocation of eig_vec failed. Full diago will be performed")
         do_full_diago = .TRUE.
       end if
     end if

     if (Wfd%usepaw==0) then  ! Solve H*v = e*v
       if (do_full_diago) then
         call xheev(jobz,"Upper",blc_size,hk_ij,enek_ij) 
       else
         call xheevx(jobz,"Irange","Upper",blc_size,hk_ij,zero,zero,1,nband_k,-tol8,nefound,enek_ij,eig_vec,ldz)
         if (want_eigenvectors) hk_ij(:,1:ldz) = eig_vec
       end if
     else  ! Solve H*v = e*S*v
       if (do_full_diago) then
         call xhegv(1,jobz,"Upper",blc_size,hk_ij,sk_ij,enek_ij) 
       else
         call xhegvx(1,jobz,"Irange","Upper",blc_size,hk_ij,sk_ij,zero,zero,1,nband_k,-tol8,nefound,enek_ij,eig_vec,ldz)
         if (want_eigenvectors) hk_ij(:,1:ldz) = eig_vec
       end if
     end if

     if (allocated(eig_vec)) deallocate(eig_vec)
     !
     ! Write interpolated energies.
     ene_fact=one; frmt1='(i4,4x,9(1x,f7.4))'; frmt2='(8x,9(1x,f7.4))'
     write(msg,'(a,i3,3x,a)')' Eigenvalues in Hartree for ikpt= ',ikpt,spin_name(spin)
     call wrtout(std_out,msg,'COLL')

     write(msg,frmt1)ikpt,(enek_ij(ib)*ene_fact,ib=1,MIN(9,nband_k))
     call wrtout(std_out,msg,'COLL') 

     if (nband_k>9) then
       do jj=10,nband_k,9
         write(msg,frmt2) (enek_ij(ib)*ene_fact,ib=jj,MIN(jj+8,nband_k))
         call wrtout(std_out,msg,'COLL') 
       end do
     end if
     call flush_unit()
     !
     ! Save energies.
     intp_ene(:,ikpt,spin) = enek_ij(1:nband_k)
     deallocate(enek_ij)

     if (want_eigenvectors) then ! Write unitary transformation for this spin on file.        
       rec_st=1+(ikpt-1)*ioBst%mband 
       do band=1,nband_k
         write(umt_unt,rec=rec_st+band)hk_ij(:,band)
       end do
     end if

!DEALLOCATE BORING STUFF
     deallocate(ffnl)
     deallocate(kpg_k, STAT=istat)
     deallocate(phkxred,ph3d)
!END BORING STUFF

      deallocate(kdotg,half_gsq,wsg)
      deallocate(cvnl_psi,vnl_psi,vectin)
      if (Wfd%usepaw==1) deallocate(s_psi,cs_psi)
   end do ! intp_kpt

   deallocate(enl,sij) 
   deallocate(hk_ij,vloc_ij)

   if (Wfd%usepaw==1) deallocate(sk_ij)

   call cprj_free(Cp_blc2 ); deallocate(Cp_blc2 )

   if (spline_opt==USE_BSPLINE) then
     call cprj_bspline_free(Cp_bspl)
     call cprj_free(Cp_blc1 ); deallocate(Cp_blc1 )
     call cprj_free(Cp_left ); deallocate(Cp_left )
     call cprj_free(Cp_right); deallocate(Cp_right)
   end if

   deallocate(blc_ug)
   close(umt_unt_spin(spin))

 end do ! spin

 deallocate(ur1,ur2,ur1_dpc)
 deallocate(multable,toinv)
 deallocate(nlmn_sort,typat_sort)

 call destroy_bz_mesh_type(Kmesh)

 call wfd_barrier(Wfd)
 !
 ! Update the energies in ioBst.
 call xsum_mpi(intp_ene,Wfd%comm,ierr)

 ioBst%eig = intp_ene
 deallocate(intp_ene)

 ! Recalculate new occupation numbers and new Fermi level.
 ! FIXME Have to calculate new occ for semiconductors
 !$call update_occ(ioBSt,Dtset%fixmom,Dtset%stmbias,Dtset%prtvol)

 call bst_write_bands(ioBSt,Cryst%gmet,"interpolated",ierr)

 !$call bst_print_fs(ioBst,Cryst,Dtset%kptrlatt,Dtset%nshiftk,Dtset%shiftk,"Fermi_surface",ierr)            
 !
 ! ===================================
 ! ==== Optionally Write WFK file ====
 ! ===================================
 !
 if (.FALSE..and.want_eigenvectors.and. wfd_iam_master(Wfd)) then ! presently only master works!

   call wrtout(std_out," bloch_interp: about to write "//TRIM(ofname),"COLL")
   !
   ! Init new header.
   pertcase=0
   call hdr_init(ioBSt,codvsn,Dtset,Hdr,Pawtab,pertcase,Psps)
   !
   ! Update Fermi level. Write input Pawrhoij on file for PAW.
   allocate(occfact(ioBSt%bantot))
   call get_eneocc_vect(ioBSt,"occ",occfact)

   call hdr_update(ioBSt%bantot,1.0d20,ioBSt%fermie,Hdr,Cryst%natom,1.0d20,&
&    Cryst%rprimd,occfact,Wfd%MPI_enreg,Pawrhoij,Psps%usepaw,Cryst%xred)

   deallocate(occfact)
   !
   ! * Init Wff structure.
   call WffOpen(accesswff,Wfd%comm,ofname,ierr,Wff,Wfd%master,Wfd%my_rank,get_unit())

   ! * Write Header to unformatted file 
   fform=2 ! TODO Recheck this

   if (ANY(accesswff == (/IO_MODE_FORTRAN, IO_MODE_FORTRAN_MASTER, IO_MODE_MPI/) )) then
     call hdr_io_wfftype(fform,Hdr,2,Wff)
     call WffKg(Wff,1)  ! will write kg.
   else if (Wff%accesswff==IO_MODE_ETSF .and. wfd_iam_master(Wfd)) then
     call hdr_io_etsf(fform,Hdr,2,Wff%unwff)
   end if
   call hdr_clean(Hdr)
   !
   ! For each spin and k-point, do:
   !  1) Convert wfd from gamma-centered to k-centered basis set
   !  2) Write G vectors, energies, occ and u(G) on file.
   !  
   ! Options for rwwf.
   rwwf_option=2  ! for writing cg, eigen and occ,
   formeig=0      ! vector of eigenvalues
   headform=0     ! use the default (current) format and headform
   optkg=1        ! kg_k will be written.
   !
   nullify(kg_k)
   do spin=1,Wfd%nsppol

     blc_fname = blc_fname_spin(spin); blc_unt = blc_unt_spin(spin); reclen = Wfd%npwwfn*get_reclen("dpc") 
     open(unit=blc_unt,file=blc_fname,access='direct',recl=reclen)

     read(blc_unt,rec=1)daf_reclen,daf_npw,daf_nspinor,daf_nband,daf_spin
     ! TODO Likely here I need the table mat idx -> (kbz,band)

     ltest = ALL( (/daf_reclen,daf_npw,daf_nspinor,daf_spin/) == (/reclen,Wfd%npwwfn,Wfd%nspinor,spin/) )
     ABI_CHECK(ltest,"Mismatch in dimensions read from BLC file")

     allocate(blc_ug(Wfd%npwwfn*nspinor,daf_nband))
     do blc=1,blc_size 
       read(blc_unt,rec=1+blc)blc_ug(:,blc)
     end do

     umt_fname = umt_fname_spin(spin); umt_unt = umt_unt_spin(spin); reclen = Wfd%npwwfn*get_reclen("dpc") 
     open(unit=umt_unt,file=umt_fname,access='direct',recl=reclen)
                                                                                                              
     read(umt_unt,rec=1)daf_reclen,daf_blc_size,daf_nkpt,daf_spin
     ! TODO Likely here I need the table mat idx -> (kbz,band)
                                                                                                              
     ltest = ALL( (/daf_reclen,daf_nkpt,daf_spin/) == (/reclen,ioBSt%nkpt,spin/) )
     ABI_CHECK(ltest,"Mismatch in dimensions read from UMT file")

     do ikpt=1,ioBSt%nkpt

       kpoint     = ioBSt%kptns(:,ikpt)
       npw_k      = ioBSt%npwarr(ikpt)
       istwf_k    = ioBSt%istwfk(ikpt)
       nband_k    = ioBSt%nband(ikpt+(spin-1)*ioBSt%nkpt)     ! TODO use 2D array.
       nband_disk = nband_k
       mcg        = npw_k*nspinor*nband_k

       occ_k => ioBSt%occ(:,ikpt,spin)
       eig_k => ioBSt%eig(:,ikpt,spin)

       ecut_eff = Dtset%ecut * Dtset%dilatmx**2  
       call get_kg(kpoint,istwf_k,ecut_eff,Cryst%gmet,onpw_k,kg_k) 
       if (onpw_k /= npw_k) then
         write(msg,'(a,2(1x,i0),a)')" Mismatch between input npw_k and value calculated from ecut. ",npw_k,onpw_k," Will use ...."
         MSG_WARNING(msg)
       end if

       ! This part will not work if we switch to k-centered basis sets.
       allocate(cg_k(2,npw_k*nspinor*nband_k)); cg_k=zero

       allocate(umat_k(daf_blc_size,nband_k))  ! <BLC_i| interpu_j>
       rec_st=1+(ikpt-1)*ioBst%mband 
       do band=1,nband_k
         read(umt_unt,rec=rec_st+band)umat_k(:,band)
       end do
       deallocate(umat_k)
       
       !$call XGEMM('N','N',nsize,sizegw,rangeb,cone_gw,wf_ks(:,lowerb:upperb),nsize,&
       !$&          umat_k,rangeb,czero_gw,wf_qp(spad+1:spad+nsize,b1gw:b2gw),nsize)

#if 0
       ! * Perform conversion of the basis set.
       ikg=0
       call wfd_gamma2k(Wfd,ik_ibz,spin,ikg,kg_k,cg_k,Cryst%gmet,nmiss)
       if (nmiss>0) then 
         write(msg,'(2(a,i4))')' Missing ',nmiss,' components for ik_ibz= ',ik_ibz
         MSG_WARNING(msg)
       end if
#endif
       !
       ! Write set of bands for this (k,s).
       call rwwf(cg_k,eig_k,formeig,headform,icg0,ikpt,spin,kg_k,ioBSt%mband,mcg,MPI_enreg_seq,nband_k,&
&        nband_disk,npw_k,nspinor,occ_k,rwwf_option,optkg,tim_rwwf0,Wff)

       deallocate(cg_k,kg_k)

     end do !ikpt

     deallocate(blc_ug)
     close(unit=blc_unt)
   end do !spin
   !
   ! * Close the wavefunction file (and do NOT delete it !)
   call WffClose(Wff,ierr)

   ! TODO Close tmp files and delete them.
 end if

 call wfd_barrier(Wfd)

 DBG_EXIT("COLL")

end subroutine bloch_interp       
!!***

!----------------------------------------------------------------------

!!****f* ABINIT/interpolate_sigmak
!! NAME
!! interpolate_sigmak
!!
!! FUNCTION
!!
!! COPYRIGHT
!! Copyright (C) 2010 ABINIT group (MG)
!! This file is distributed under the terms of the
!! GNU General Public License, see ~abinit/COPYING
!! or http://www.gnu.org/copyleft/gpl.txt .
!! For the initials of contributors, see ~abinit/doc/developers/contributors.txt .
!!
!! INPUTS
!!
!! OUTPUT
!!
!! PARENTS
!!
!! CHILDREN
!!
!! SOURCE

#if defined HAVE_CONFIG_H
#include "config.h"
#endif

#include "abi_common.h"

subroutine interpolate_sigmak(Cryst,Kmesh,kptrlatt,nshiftk,shiftk,lbd,ubd,isigk_ij,onkpt,okpt,osigk_ij,ierr)

 use defs_basis
 use m_bspline
 use m_errors

 use m_numeric_tools,  only : set2unit
 use m_crystal,        only : crystal_structure
 use m_bz_mesh,        only : bz_mesh_type, isequalk

!This section has been created automatically by the script Abilint (TD).
!Do not modify the following lines by hand.
 use interfaces_28_numeric_noabirule
 use interfaces_56_recipspace
!End of the abilint section

 implicit none

!Arguments ------------------------------------
!scalars
 integer,intent(in) :: lbd,ubd,onkpt,nshiftk
 integer,intent(out) :: ierr
 type(Crystal_structure),intent(in) :: Cryst
 type(bz_mesh_type),intent(in) :: Kmesh
!arrays
 integer,intent(in) :: kptrlatt(3,3)
 real(dp),intent(in) :: okpt(3,onkpt) 
 real(dp),intent(in) :: shiftk(3,nshiftk)
 complex(dpc),intent(in) :: isigk_ij(lbd:ubd,lbd:ubd,Kmesh%nibz)
 complex(dpc),intent(out) :: osigk_ij(lbd:ubd,lbd:ubd,onkpt)

!Local variables ------------------------------
!scalars
 integer :: kxord,kyord,kzord
 integer :: nxknot,nyknot,nzknot
 integer :: b1,b2,reim,ik_bz,ik_ibz,nkz,nky,nkx,ii,nkptlatt
 integer :: dir,mkpt,brav,nseen,nvec,nmiss,smp_nk
 integer :: ladd,radd,ikpt,ik1,ik2,ik3,bs_idx
 real(dp),parameter :: KPT_TOL=tol12
 logical :: found
 character(len=500) :: msg
!arrays
 integer,allocatable :: bsp2bz(:),iperm(:)
 real(dp) :: bs_kpt(3)
 real(dp),allocatable :: spkpt(:,:),seen(:),xvec(:),yvec(:),zvec(:),xknot(:),yknot(:),zknot(:)
 real(dp),allocatable :: xyzdata(:,:,:),bs_coef(:,:,:),work_bz(:,:,:,:),rout(:,:)
 complex(dpc),allocatable :: cwork_bz(:,:,:),umat_k(:,:)

!************************************************************************

 DBG_ENTER("COLL")

 ABI_UNUSED(Cryst%nsym)  

 ierr=0
 if (Kmesh%nbz==1) then
   ierr=1
   MSG_WARNING("Cannot interpolate with a single k-point") 
   RETURN
 end if

 allocate(cwork_bz(Kmesh%nbz,lbd:ubd,lbd:ubd))
 allocate(umat_k(lbd:ubd,lbd:ubd))

 call set2unit(umat_k)
 !
 ! Reconstruct matrix elements in the full BZ, then rotate to the new basis.
 do ik_bz=1,Kmesh%nbz
   ik_ibz = Kmesh%tab(ik_bz)
   cwork_bz(ik_bz,:,:) =  isigk_ij(:,:,ik_ibz)
   cwork_bz(ik_bz,:,:) =  MATMUL( TRANSPOSE(CONJG(umat_k)), MATMUL(cwork_bz(ik_bz,:,:),umat_k) ) 
 end do

 deallocate(umat_k)

 allocate(work_bz(2,Kmesh%nbz,lbd:ubd,lbd:ubd))
 work_bz(1,:,:,:) = DBLE(cwork_bz)
 work_bz(2,:,:,:) = AIMAG(cwork_bz)
 deallocate(cwork_bz)
 !
 ! Prepare B-spline interpolation.
 ! FIXME: Here I am assuming that kptrlatt is diagonal, only one shift is expected!
 !

 !kptrlatt = Kmesh%kptrlatt  ! FIXME these quantities have to be intitialized!
 !nshiftk  = Kmesh%nshift
 !shiftk   => Kmesh%shift

 ! Compute the number of k points in the G-space unit cell (will be multiplied by nshiftk later).
 nkptlatt=kptrlatt(1,1)*kptrlatt(2,2)*kptrlatt(3,3) &
&   +kptrlatt(1,2)*kptrlatt(2,3)*kptrlatt(3,1) &
&   +kptrlatt(1,3)*kptrlatt(2,1)*kptrlatt(3,2) &
&   -kptrlatt(1,2)*kptrlatt(2,1)*kptrlatt(3,3) &
&   -kptrlatt(1,3)*kptrlatt(2,2)*kptrlatt(3,1) &
&   -kptrlatt(1,1)*kptrlatt(2,3)*kptrlatt(3,2)

 mkpt=nkptlatt*nshiftk; brav=1 !brav=1 is able to treat all bravais lattices.

 allocate(spkpt(3,mkpt))
 call smpbz(brav,std_out,kptrlatt,mkpt,smp_nk,nshiftk,0,shiftk,spkpt)

 allocate(seen(smp_nk),iperm(smp_nk))

 do dir=1,3
   nseen=1; seen(nseen) = spkpt(dir,1)
   do ik_bz=2,smp_nk
      if (ALL( ABS(spkpt(dir,ik_bz)-seen(1:nseen)) > KPT_TOL ) ) then
        nseen = nseen + 1
        seen(nseen) = spkpt(dir,ik_bz) 
      end if
   end do

   call sort_dp(nseen,seen,iperm, KPT_TOL)
   nvec = nseen
   radd = 0; ladd = 0
   if (ABS(seen(1)+one - seen(nseen)) > KPT_TOL) then 
     radd=1
     nvec = nvec + 1
   end if
   if (ABS(seen(nseen)-one - seen(1)) > KPT_TOL) then
     ladd=1
     nvec = nvec + 1
   end if

   if (dir==1) then
     allocate(xvec(nvec))
     xvec(1+ladd:nvec-radd) = seen(1:nseen)
     if (ladd==1) xvec(ladd) = seen(nseen)-one
     if (radd==1) xvec(nvec) = seen(1)+one

   else if (dir==2) then
     allocate(yvec(nvec))
     yvec(1+ladd:nvec-radd) = seen(1:nseen)
     if (ladd==1) yvec(ladd) = seen(nseen)-one
     if (radd==1) yvec(nvec) = seen(1)+one

   else if (dir==3) then
     allocate(zvec(nvec))
     zvec(1+ladd:nvec-radd) = seen(1:nseen)
     if (ladd==1) zvec(ladd) = seen(nseen)-one
     if (radd==1) zvec(nvec) = seen(1)+one
   end if
 end do

 deallocate(seen,iperm)

 ! TODO Add check on final partion. 
 deallocate(spkpt)
 ! 
 !
 ! Map B-spline mesh onto BZ.
 !
 nkx = SIZE(xvec)
 nky = SIZE(yvec)
 nkz = SIZE(zvec)

 write(*,*)" B-spline mesh: ",nkx,nky,nkz

 allocate(bsp2bz(nkx*nky*nkz))

 bs_idx=0; nmiss=0
 do ik3=1,nkz
   do ik2=1,nky
     do ik1=1,nkx
       bs_idx = bs_idx+1
       bs_kpt = (/xvec(ik1), yvec(ik2), zvec(ik3)/)
       
       ik_bz=0; found=.FALSE.
       do while (ik_bz<Kmesh%nbz .and. .not.found)
         ik_bz = ik_bz + 1
         found = isequalk(bs_kpt,Kmesh%bz(:,ik_bz))
       end do

       if (found) then
         bsp2bz(bs_idx) = ik_bz
       else 
         nmiss = nmiss+1
       end if

     end do
   end do
 end do

 if (nmiss>0) then
   ierr=ierr+1
   write(msg,'(a,i0,a)')"Found ",nmiss," k-points in B-spline mesh that does not belong to the BZ"
   MSG_ERROR(msg)
 end if
 !
 ! Generate knots (Order should be selected in a more careful way)
 kxord = nkx
 kyord = nky
 kzord = nkx

 nxknot = nkx + kxord
 nyknot = nky + kyord
 nzknot = nkz + kzord

 allocate(xknot(nxknot))
 allocate(yknot(nyknot))
 allocate(zknot(nzknot))

 call dbsnak (nkx, xvec, kxord, xknot)
 call dbsnak (nky, yvec, kyord, yknot)
 call dbsnak (nkz, zvec, kzord, zknot)
 !
 allocate(xyzdata(nkx,nky,nkz))
 allocate(bs_coef(nkx,nky,nkz))
 allocate(rout(2,onkpt))

 do b2=lbd,ubd
   do b1=lbd,ubd
     do reim=1,2
       !
       ! Load data.
       bs_idx = 0  
       do ik3=1,nkz
         do ik2=1,nky
           do ik1=1,nkx
             bs_idx = bs_idx+1
             ik_bz = bsp2bz(bs_idx)
             xyzdata(ik1,ik2,ik3) = work_bz(reim,ik_bz,b1,b2)
           end do
         end do
       end do
       !
       ! Construct 3D tensor for B-spline.
       call dbs3in(nkx,xvec,nky,yvec,nkz,zvec,xyzdata,nkx,nky,kxord,kyord,kzord,xknot,yknot,zknot,bs_coef)
       !
       do ikpt=1,onkpt ! B-spline interpolation.
         rout(reim,ikpt) = dbs3vl(okpt(1,ikpt),okpt(2,ikpt),okpt(3,ikpt),kxord,kyord,kzord,xknot,yknot,zknot,nkx,nky,nkz,bs_coef)
       end do
     end do ! reim

     osigk_ij(b1,b2,:) = DCMPLX(rout(1,:),rout(2,:))

   end do
 end do

 do ikpt=1,onkpt
   write(77,'(a,3es16.8,a)')"# kpt= (",okpt(:,ikpt),") "
   write(77,'(1x,(10f9.5))')(REAL(osigk_ij(ii,ii,ikpt))*Ha_eV,ii=lbd,ubd)
 end do

 deallocate(rout)
 deallocate(xyzdata)
 deallocate(bs_coef)

 deallocate(xknot,yknot,zknot)
 deallocate(xvec,yvec,zvec)

 deallocate(bsp2bz)
 deallocate(work_bz)

 DBG_EXIT("COLL")

end subroutine interpolate_sigmak
!!***

