!{\src2tex{textfont=tt}}
!!****f* ABINIT/ctocprj
!! NAME
!! ctocprj

!! FUNCTION
!!  Compute all <Proj_i|Cnk> for every wave function |Cnk> expressed in reciprocal space.
!!  |Proj_i> are non-local projectors (for each atom and each l,m,n)
!!  Can also compute derivatives of <Proj_i|Cnk> wrt to several parameters
!!
!! COPYRIGHT
!! Copyright (C) 1998-2010 ABINIT group (MT)
!! This file is distributed under the terms of the
!! GNU General Public License, see ~ABINIT/COPYING
!! or http://www.gnu.org/copyleft/gpl.txt .
!! For the initials of contributors, see ~ABINIT/Infos/contributors .
!!
!! INPUTS
!!  atindx(natom)=index table for atoms
!!  cg(2,mpw*nspinor*mband*mkmem*nsppol)=planewave coefficients of wavefunctions
!!  choice: chooses derivatives to compute:
!!          =1 => no derivatives
!!          =2 => 1st derivatives with respect to atomic position(s)
!!          =3 => 1st derivatives with respect to strain(s)
!!          =23=> 1st derivatives with respect to atm. pos. and strain(s)
!!          =4 => 2nd derivatives with respect to atomic pos.
!!          =24=> 1st and 2nd derivatives with respect to atomic pos.
!!          =5 => derivatives with respect to k wavevector
!!          =6 => 2nd derivatives with respect to strain and atm. pos.
!!  gmet(3,3)=reciprocal space metric tensor in bohr**-2
!!  gprimd(3,3)=dimensional reciprocal space primitive translations
!!  iatom= if <=0, cprj=<p_i|Cnk> are computed for all atoms 1...natom
!!         if  >0  cprj=<p_i|Cnk> are computed only for atom with index iatom
!!  idir=direction of the derivative, i.e. dir. of - atom to be moved  in the case choice=2
!!                                                 - strain component  in the case choice=3
!!                                                 - k point direction in the case choice=5
!!       Compatible only with choice=2,3,5; if idir=0, all derivatives are computed
!!  iorder_cprj=0 if output cprj=<p_i|Cnk> are sorted by atom type
!!                (first all elements of atom type 1, followed by those of atom type 2 and so on).
!!              1 if output cprj=<p_i|Cnk> are sorted according to
!!                the variable typat in the main input file
!!  istwfk(nkpt)=option parameter that describes the storage of wfs
!!  kg(3,mpw*mkmem)=reduced planewave coordinates
!!  kpt(3,nkpt)=reduced coordinates of k points.
!!  mband=maximum number of bands
!!  mgfft=maximum size of 1D FFTs
!!  mkmem=number of k points which can fit in memory; set to 0 if use disk
!!  mpi_enreg=informations about MPI parallelization
!!  mpsang=1+maximum angular momentum for nonlocal pseudopotentials
!!  mpw=maximum dimensioned size of npw
!!  natom=number of atoms in cell
!!  nattyp(ntypat)= # atoms of each type
!!  nband(nkpt*nsppol)=number of bands at this k point for that spin polarization
!!  ncprj=1st dim. of cprj array (natom if iatom<=0, 1 if iatom>0)
!!  ngfft(18)=contain all needed information about 3D FFT, see ~ABINIT/Infos/vargs.htm#ngfft
!!  nkpt=number of k points
!!  nloalg(5)=governs the choice of the algorithm for nonlocal operator
!!  npwarr(nkpt)=number of planewaves in basis at this k point
!!  nspinor=number of spinorial components of the wavefunctions
!!  nsppol=1 for unpolarized, 2 for spin-polarized
!!  ntypat=number of types of atoms in unit cell
!!  paral_kgb= 1 if kpt-band-FFT is activated
!!  ph1d(2,3*(2*mgfft+1)*natom)=1-dim structure factor phase information
!!  psps <type(pseudopotential_type)>=variables related to pseudopotentials
!!  rmet(3,3)=real space metric (bohr**2)
!!  tim_ctocprj=timing code of the calling routine
!!  typat(natom)= types of atoms
!!  uncp=unit number for <P_lmn|Cnk> data (if used)
!!  unkg=unit number for (k+G) data (if used)
!!  unylm=unit number for storage of Ylm on disk
!!  useylmgr=1 if gradients of spherical harmonics are used (see ylmgr)
!!  wffnow=struct infos for wf disk file
!!  xred(3,natom)=reduced dimensionless atomic coordinates
!!  ylm(mpw*mkmem,mpsang*mpsang)=real spherical harmonics for each G and k point
!!  ylmgr(npw*mkmem,3,mpsang*mpsang*useylmgr)=gradients of real spherical harmonics wrt (k+G)
!!!
!! OUTPUT
!!  cprj(ncprj,nspinor*mband*mkmem*nsppol) <type(cprj_type)>= projected input wave functions <Proj_i|Cnk> with NL projectors
!!                                                             Usually ncprj=natom
!!
!! PARENTS
!!      extrapwf,loper3,scfcv,vtorho
!!
!! CHILDREN
!!      cprj_alloc,cprj_diskinit_w,cprj_free,cprj_put,getcprj,hdr_skip,mkffnl
!!      mkkpg,ph1d3d,rdnpw,rwwf,xallgather_mpi,xallgatherv_mpi,xalltoallv_mpi
!!      xcomm_init,xdefineoff
!!
!! SOURCE

#if defined HAVE_CONFIG_H
#include "config.h"
#endif

#include "abi_common.h"

 subroutine ctocprj(atindx,cg,choice,cprj,gmet,gprimd,iatom,idir,&
& iorder_cprj,istwfk,kg,kpt,mband,mgfft,mkmem,mpi_enreg,mpsang,&
& mpw,natom,nattyp,nband,ncprj,ngfft,nkpt,nloalg,npwarr,nspinor,&
& nsppol,ntypat,paral_kgb,ph1d,psps,rmet,typat,ucvol,uncp,unkg,&
& unylm,useylmgr,wffnow,xred,ylm,ylmgr)

 use defs_basis
 use defs_datatypes
 use defs_abitypes
 use m_xmpi
 use m_errors
 use m_wffile
#if defined HAVE_NETCDF
 use netcdf
#endif

!This section has been created automatically by the script Abilint (TD).
!Do not modify the following lines by hand.
 use interfaces_51_manage_mpi
 use interfaces_53_abiutil
 use interfaces_59_io_mpi
 use interfaces_62_iowfdenpot
 use interfaces_65_nonlocal, except_this_one => ctocprj
!End of the abilint section

 implicit none

!Arguments -------------------------------
!scalars
 integer,intent(in) :: choice,iatom,idir,iorder_cprj,mband,mgfft,mkmem,mpsang,mpw
 integer,intent(in) :: natom,ncprj,nkpt,nsppol,ntypat,paral_kgb,uncp,unkg,unylm,useylmgr
 integer,intent(in) :: nspinor
 real(dp),intent(in) :: ucvol
 type(MPI_type),intent(inout) :: mpi_enreg
 type(pseudopotential_type),intent(in) :: psps
 type(wffile_type),intent(inout) :: wffnow
!arrays
 integer,intent(in) :: istwfk(nkpt),nband(nkpt*nsppol)
 integer,intent(in) :: ngfft(18),nloalg(5),npwarr(nkpt),kg(3,mpw*mkmem),typat(natom)
 integer,intent(in),target :: atindx(natom),nattyp(ntypat)
 real(dp),intent(in) :: cg(2,mpw*nspinor*mband*mkmem*nsppol)
 real(dp),intent(in) :: gmet(3,3),gprimd(3,3),kpt(3,nkpt),rmet(3,3)
 real(dp),intent(in) :: xred(3,natom),ylm(mpw*mkmem,mpsang*mpsang)
 real(dp),intent(in) :: ylmgr(mpw*mkmem,3,mpsang*mpsang*useylmgr)
 real(dp),intent(in),target :: ph1d(2,3*(2*mgfft+1)*natom)
 type(cprj_type),intent(out) :: cprj(ncprj,nspinor*mband*mkmem*nsppol)

!Local variables-------------------------------
!scalars
 integer :: blocksz,cg_bandpp,counter,cpopt,cprj_bandpp,dimffnl,formeig,ia,iatm,iatom1,iatom2
 integer :: iband_max,iband_min,ibg,iblockbd,ibp,icg,icgb,icp1,icp2,ider,idir0,ierr
 integer :: ig,ii,ikg,ikpt,ilm,ipw,isize,isppol,istwf_k,itypat,iwf1,iwf2
 integer :: matblk,mcg_disk,me_distrb,muig,n1,n1_2p1,n2,n2_2p1,n3,n3_2p1
 integer :: nband_k,nblockbd,ncpgr,nkpg,nprocband,npw_k,npw_nk,nsp,ntypat0
 integer :: shift1,shift1b,shift2,shift2b,shift3,shift3b
 integer :: spaceComm,spaceComm_band,spaceComm_fft,tim_rwwf
 logical :: one_atom
 real(dp) :: arg
 character(len=500) :: msg
!arrays
 integer,allocatable :: bufsize(:),bufsize_wf(:),bufdisp(:),bufdisp_wf(:)
 integer,allocatable :: dimlmn(:),kg_dum(:,:),kg_k(:,:),kg_k_loc(:,:)
 integer,allocatable :: npw_block(:),npw_disp(:)
 integer,pointer :: atindx_atm(:),indlmn_atm(:,:,:),nattyp_atm(:),pspso_atm(:)
 real(dp) :: kpoint(3)
 real(dp),allocatable :: cg_disk(:,:),cwavef(:,:),cwavef_tmp(:,:),eig_dum(:)
 real(dp),allocatable :: ffnl(:,:,:,:),ffnl_npw(:,:,:,:),ffnl_tmp(:,:,:,:),ffnl_tmp_npw(:,:,:,:)
 real(dp),allocatable :: kpg_k(:,:),occ_dum(:)
 real(dp),allocatable :: ph3d(:,:,:),ph3d_npw(:,:,:),ph3d_tmp(:,:,:),ph3d_tmp_npw(:,:,:)
 real(dp),allocatable :: phkxred(:,:),ylm_k(:,:),ylmgr_k(:,:,:)
 real(dp),pointer :: ekb_atm(:,:),ffspl_atm(:,:,:,:),ph1d_atm(:,:)
 type(cprj_type),allocatable :: cwaveprj(:,:)

! *********************************************************************

 DBG_ENTER('COLL')

!Preliminary tests
 if (psps%useylm==0) then
   MSG_ERROR(' Not available for useylm=0 !')
 end if
 if ((choice<1.or.choice>6).and.choice/=23.and.choice/=24) then
   MSG_BUG(' Bad choice !')
 end if
 if (idir>0.and.choice/=2.and.choice/=3.and.choice/=5) then
   msg=' Does not support idir>0 for that choice.'
   MSG_BUG(msg)
 end if
 if (mkmem==0.and.paral_kgb/=0) then
   msg=' Not available for mkmem=0 and band-FFT parallelism !'
   MSG_ERROR(msg)
 end if
 if (useylmgr==0.and.(choice==3.or.choice==5)) then
   msg=' Ylm gradients has to be in memory for choice=3 or 5 !'
   MSG_BUG(msg)
 end if

!Init parallelism
 call xcomm_init(mpi_enreg,spaceComm,spaceComm_bandfft=mpi_enreg%comm_kpt)
 if (paral_kgb/=0) then
   me_distrb = mpi_enreg%me_kpt
   spaceComm_band=mpi_enreg%comm_band
   spaceComm_fft=mpi_enreg%comm_fft
   nprocband=mpi_enreg%nproc_band
   cg_bandpp  =mpi_enreg%bandpp
   cprj_bandpp=mpi_enreg%bandpp
 else
   me_distrb = mpi_enreg%me
   spaceComm_band=0;spaceComm_fft=0
   nprocband=1;cg_bandpp=1;cprj_bandpp=1
 end if
 if (cg_bandpp/=cprj_bandpp) then
   MSG_BUG('  cg_bandpp must be equal to cprj_bandpp !')
 end if

!Check sizes for cprj (distribution is tricky)
 one_atom=(iatom>0)
 if (one_atom.and.ncprj/=1) then
   MSG_BUG('  Bad value for ncprj dimension (should be 1) !')
 end if
 if (.not.one_atom.and.ncprj/=natom) then
   MSG_BUG('  Bad value for ncprj dimension (should be natom) !')
 end if

!Prepare temporary files if mkmem==0
 if (mkmem==0) then
   formeig=0;mcg_disk=mpw*nspinor*mband
   call hdr_skip(wffnow,ierr)
   call xdefineOff(formeig,wffnow,mpi_enreg,nband,npwarr,nspinor,nsppol,nkpt)
   allocate(cg_disk(2,mcg_disk))
 end if

!Initialize some variables
 n1=ngfft(1);n2=ngfft(2);n3=ngfft(3)
 n1_2p1=2*n1+1;n2_2p1=2*n2+1;n3_2p1=2*n3+1
 ibg=0;icg=0;cpopt=0
 ider=0;idir0=0
 if (choice==3.or.choice==5) ider=1
 if (idir>0) then
   if (choice==3) idir0=-idir
   if (choice==5) idir0=idir
 else
   if (choice==3) idir0=-7
   if (choice==5) idir0=4
 end if
 if (idir0==0.or.idir0==4) then
   dimffnl=1+3*ider
 else if (idir0/=-7) then
   dimffnl=1+ider
 else
   dimffnl=1+6*ider
 end if
 nkpg=0
 if (choice==3.or.choice==2.or.choice==23) nkpg=3*nloalg(5)
 if (choice==4.or.choice==24) nkpg=9*nloalg(5)

!Set number of gradients for <p_i|Cnk>
 ncpgr=0
 if (idir==0) then
   if (choice==2) ncpgr=3
   if (choice==3) ncpgr=6
   if (choice==23)ncpgr=9
   if (choice==4) ncpgr=6
   if (choice==24)ncpgr=9
   if (choice==5) ncpgr=3
   if (choice==6) ncpgr=63
 else
   ncpgr=1
 end if

!Extract data for treated atom(s)
 if (one_atom) then
   iatom1=iatom;iatom2=iatom
   ntypat0=1;itypat=typat(iatom)
   allocate(nattyp_atm(ntypat0));nattyp_atm(1)=1
   allocate(atindx_atm(ntypat0));atindx_atm(1)=atindx(iatom)
   allocate(ph1d_atm(2,(n1_2p1+n2_2p1+n3_2p1)*ntypat0))
   shift1=(atindx(iatom)-1)*n1_2p1
   shift2=(atindx(iatom)-1)*n2_2p1+natom*n1_2p1
   shift3=(atindx(iatom)-1)*n3_2p1+natom*(n1_2p1+n2_2p1)
   shift1b=0;shift2b=n1_2p1;shift3b=n1_2p1+n2_2p1
   ph1d_atm(:,shift1b+1:shift1b+n1_2p1)=ph1d(:,shift1+1:shift1+n1_2p1)
   ph1d_atm(:,shift2b+1:shift2b+n2_2p1)=ph1d(:,shift2+1:shift2+n2_2p1)
   ph1d_atm(:,shift3b+1:shift3b+n3_2p1)=ph1d(:,shift3+1:shift3+n3_2p1)
   allocate(ekb_atm(psps%dimekb,ntypat0))
   allocate(indlmn_atm(6,psps%lmnmax,ntypat0))
   allocate(ffspl_atm(psps%mqgrid_ff,2,psps%lnmax,ntypat0))
   allocate(pspso_atm(ntypat0))
   ekb_atm(:,1)=psps%ekb(:,itypat)
   indlmn_atm(:,:,1)=psps%indlmn(:,:,itypat)
   ffspl_atm(:,:,:,1)=psps%ffspl(:,:,:,itypat)
   pspso_atm(1)=psps%pspso(itypat)
 else
   iatom1=1;iatom2=natom
   ntypat0=ntypat
   atindx_atm => atindx
   nattyp_atm => nattyp
   ph1d_atm => ph1d
   ekb_atm => psps%ekb
   indlmn_atm => psps%indlmn
   ffspl_atm => psps%ffspl
   pspso_atm => psps%pspso
 end if

!Dimensioning and allocation of <p_i|Cnk>
 allocate(dimlmn(ncprj));dimlmn=0  ! Type-sorted cprj
 if (one_atom) then
   itypat=typat(iatom)
   dimlmn(ia+1:ia+nattyp(itypat))=count(indlmn_atm(3,:,itypat)>0)
 else
   ia=0
   do itypat=1,ntypat0
     dimlmn(ia+1:ia+nattyp(itypat))=count(indlmn_atm(3,:,itypat)>0)
     ia=ia+nattyp(itypat)
   end do
 end if
 if (mkmem==0) then
   call cprj_diskinit_w(atindx_atm,ncprj,iorder_cprj,mkmem,natom,ncpgr,dimlmn,nspinor,uncp)
 end if
 allocate(cwaveprj(ncprj,nspinor*cprj_bandpp))
 call cprj_alloc(cwaveprj,ncpgr,dimlmn)

!Additional statements if band-fft parallelism
 if (nprocband>1) then
   allocate(npw_block(nprocband),npw_disp(nprocband))
   allocate(bufsize(nprocband*cg_bandpp),bufdisp(nprocband*cg_bandpp))
   allocate(bufsize_wf(nprocband*cg_bandpp),bufdisp_wf(nprocband*cg_bandpp))
 end if

!LOOP OVER SPINS
 do isppol=1,nsppol
   ikg=0

!  Rewind temporary files if needed
   if (mkmem==0) rewind(unkg)
   if (mkmem==0) rewind(unylm)

!  BIG FAT k POINT LOOP
   do ikpt=1,nkpt
     counter=100*ikpt+isppol

!    Select k point to be treated by this proc
     nband_k=nband(ikpt+(isppol-1)*nkpt)
     if(mpi_enreg%paral_compil_kpt==1)then
       if(minval(abs(mpi_enreg%proc_distrb(ikpt,1:nband_k,isppol)-me_distrb))/=0) cycle
     end if

!    Old FFT parallelism: define FFT communicator for this k-point
     if (mpi_enreg%paral_compil_fft==1.and.paral_kgb/=0) then
       mpi_enreg%num_group_fft=ikpt+(isppol-1)*nkpt
     end if

!    Retrieve k-point
     kpoint(:)=kpt(:,ikpt)
     istwf_k=istwfk(ikpt)

!    Retrieve number of plane waves
     npw_k=npwarr(ikpt)
     if (nprocband>1) then
!      Special treatment for band-fft //
       call xallgather_mpi(npw_k,npw_block,spaceComm_band,ierr)
       npw_nk=sum(npw_block);npw_disp(1)=0
       do ii=2,nprocband
         npw_disp(ii)=npw_disp(ii-1)+npw_block(ii-1)
       end do
     else
       npw_nk=npw_k
     end if

!    Test cprj gradients dimension (just to be sure)
     if (cprj(1,ibg+1)%ncpgr/=ncpgr) then
       MSG_BUG('  cprj are badly allocated !')
     end if

!    Retrieve (k+G) points and spherical harmonics
     allocate(ylm_k(npw_k,mpsang*mpsang),ylmgr_k(npw_k,3,mpsang*mpsang*useylmgr),kg_k(3,npw_nk))
     if (mkmem==0) then
       nsp=nspinor
       call rdnpw(ikpt,isppol,nband_k,npw_k,nsp,0,unkg)
       read (unkg) kg_k(1:3,1:npw_k)
       read(unylm)
       if (useylmgr==0) then
         read(unylm) ((ylm_k(muig,ilm),muig=1,npw_k),ilm=1,mpsang*mpsang)
       else
         read(unylm) ((ylm_k(muig,ilm),muig=1,npw_k),ilm=1,mpsang*mpsang),&
&         (((ylmgr_k(muig,ii,ilm),muig=1,npw_k),ii=1,3),ilm=1,mpsang*mpsang)
       end if
       tim_rwwf=1;allocate(eig_dum(mband),kg_dum(3,0),occ_dum(mband))
       call rwwf(cg_disk,eig_dum,0,0,0,ikpt,isppol,kg_dum,mband,mcg_disk,mpi_enreg,nband_k,&
&       nband_k,npw_k,nspinor,occ_dum,-2,0,tim_rwwf,wffnow)
       deallocate(eig_dum,kg_dum,occ_dum)
     else
       if (nprocband>1) then
!        Special treatment for band-fft //
         allocate(kg_k_loc(3,npw_k))
         kg_k_loc(:,1:npw_k)=kg(:,1+ikg:npw_k+ikg)
         bufsize(:)=3*npw_block(:);bufdisp(:)=3*npw_disp(:)
         call xallgatherv_mpi(kg_k_loc,3*npw_k,kg_k,bufsize,bufdisp,spaceComm_band,ierr)
       else
         kg_k(:,1:npw_k)=kg(:,1+ikg:npw_k+ikg)
       end if
       do ilm=1,mpsang*mpsang
         ylm_k(1:npw_k,ilm)=ylm(1+ikg:npw_k+ikg,ilm)
         if (useylmgr>0) ylmgr_k(1:npw_k,1:3,ilm)=ylmgr(1+ikg:npw_k+ikg,1:3,ilm)
       end do
     end if

!    Compute (k+G) vectors
     allocate(kpg_k(npw_nk,nkpg))
     if (nkpg>0) call mkkpg(kg_k,kpg_k,kpoint,nkpg,npw_nk)
!    Allocate and compute the arrays phkxred and ph3d
     allocate(phkxred(2,ncprj))
     do ia=iatom1,iatom2
       iatm=min(atindx_atm(ia),ncprj)
       arg=two_pi*(kpoint(1)*xred(1,ia)+kpoint(2)*xred(2,ia)+kpoint(3)*xred(3,ia))
       phkxred(1,iatm)=cos(arg);phkxred(2,iatm)=sin(arg)
     end do
     if(nloalg(1)<=0)then
!      Here, only the allocation of ph3d , not the precomputation
       matblk=min(nloalg(4),ncprj);allocate(ph3d(2,npw_k,matblk))
       if (nprocband>1) then
         MSG_ERROR('  Band-fft parallelism +nloag(1)<0 forbidden !')
       end if
     else
!      Here, allocation as well as precomputation
       matblk=ncprj;allocate(ph3d(2,npw_nk,matblk))
       if (nprocband>1) then
!        Special treatment for band-fft //
         allocate(ph3d_tmp(2,npw_k,matblk))
         call ph1d3d(1,ncprj,kg_k_loc,matblk,ncprj,npw_k,n1,n2,n3,phkxred,ph1d_atm,ph3d_tmp)
         allocate(ph3d_tmp_npw(2,matblk,npw_k),ph3d_npw(2,matblk,npw_nk))
         isize=2*matblk;bufsize(:)=isize*npw_block(:);bufdisp(:)=isize*npw_disp(:)
         do ipw=1,npw_k
           ph3d_tmp_npw(:,:,ipw)=ph3d_tmp(:,ipw,:)
         end do
         call xallgatherv_mpi(ph3d_tmp_npw,isize*npw_k,ph3d_npw,bufsize,bufdisp,spaceComm_band,ierr)
         do ipw=1,npw_nk
           ph3d(:,ipw,:)=ph3d_npw(:,:,ipw)
         end do
         deallocate(ph3d_npw,ph3d_tmp_npw,ph3d_tmp)
       else
         call ph1d3d(1,ncprj,kg_k,matblk,ncprj,npw_k,n1,n2,n3,phkxred,ph1d_atm,ph3d)
       end if
     end if

!    Compute nonlocal form factors ffnl at all (k+G)
     allocate(ffnl(npw_nk,dimffnl,psps%lmnmax,ntypat0))
     if (nprocband>1) then
!      Special treatment for band-fft //
       allocate(ffnl_tmp(npw_k,dimffnl,psps%lmnmax,ntypat0))
       call mkffnl(psps%dimekb,dimffnl,ekb_atm,ffnl_tmp,ffspl_atm,&
&       gmet,gprimd,ider,idir0,indlmn_atm,kg_k_loc,kpg_k,kpoint,psps%lmnmax,&
&       psps%lnmax,psps%mpsang,psps%mqgrid_ff,nkpg,npw_k,ntypat0,&
&       pspso_atm,psps%qgrid_ff,rmet,psps%usepaw,psps%useylm,ylm_k,ylmgr_k)
       allocate(ffnl_tmp_npw(dimffnl,psps%lmnmax,ntypat0,npw_k))
       allocate(ffnl_npw(dimffnl,psps%lmnmax,ntypat0,npw_nk))
       isize=dimffnl*psps%lmnmax*ntypat0
       bufsize(:)=isize*npw_block(:);bufdisp(:)=isize*npw_disp(:)
       do ipw=1,npw_k
         ffnl_tmp_npw(:,:,:,ipw)=ffnl_tmp(ipw,:,:,:)
       end do
       call xallgatherv_mpi(ffnl_tmp_npw,isize*npw_k,ffnl_npw,bufsize,bufdisp,spaceComm_band,ierr)
       do ipw=1,npw_nk
         ffnl(ipw,:,:,:)=ffnl_npw(:,:,:,ipw)
       end do
       deallocate(ffnl_npw,ffnl_tmp_npw,ffnl_tmp)
     else
       call mkffnl(psps%dimekb,dimffnl,ekb_atm,ffnl,ffspl_atm,&
&       gmet,gprimd,ider,idir0,indlmn_atm,kg_k,kpg_k,kpoint,psps%lmnmax,&
&       psps%lnmax,psps%mpsang,psps%mqgrid_ff,nkpg,npw_k,ntypat0,&
&       pspso_atm,psps%qgrid_ff,rmet,psps%usepaw,psps%useylm,ylm_k,ylmgr_k)
     end if

!    No more need of kg_g_tmp
     if (nprocband>1) deallocate(kg_k_loc)

!    Allocate arrays for a wave-function (or a block of WFs)
     allocate(cwavef(2,npw_nk*nspinor*cg_bandpp))
     if (nprocband>1) then
       isize=2*nspinor*cg_bandpp;bufsize(:)=isize*npw_block(:);bufdisp(:)=isize*npw_disp(:)
       isize=2*nspinor*npw_k*cg_bandpp;bufsize_wf(:)=isize
       do ii=1,nprocband*cg_bandpp
         bufdisp_wf(ii)=(ii-1)*isize
       end do
     end if

!    Loop over bands or blocks of bands
     icgb=icg
     blocksz=nprocband*cg_bandpp
     nblockbd=nband_k/blocksz
     do iblockbd=1,nblockbd
       iband_min=1+(iblockbd-1)*blocksz
       iband_max=iblockbd*blocksz

       if(mpi_enreg%paral_compil_kpt==1.and.paral_kgb/=0) then
         if (minval(abs(mpi_enreg%proc_distrb(ikpt,iband_min:iband_max,isppol)-me_distrb))/=0) cycle
       end if

!      Extract wavefunction information
       if (mkmem==0) then
         do ig=1,npw_k*nspinor
           cwavef(1,ig)=cg_disk(1,ig+icgb)
           cwavef(2,ig)=cg_disk(2,ig+icgb)
         end do
       else
         if (nprocband>1) then
!          Special treatment for band-fft //
           allocate(cwavef_tmp(2,npw_k*nspinor*blocksz))
           do ig=1,npw_k*nspinor*blocksz
             cwavef_tmp(1,ig)=cg(1,ig+icgb)
             cwavef_tmp(2,ig)=cg(2,ig+icgb)
           end do
           call xalltoallv_mpi(cwavef_tmp,bufsize_wf,bufdisp_wf,cwavef,bufsize,bufdisp,spaceComm_band,ierr)
           deallocate(cwavef_tmp)
         else
           do ig=1,npw_k*nspinor
             cwavef(1,ig)=cg(1,ig+icgb)
             cwavef(2,ig)=cg(2,ig+icgb)
           end do
         end if
       end if

!      Compute scalar product of wavefunction with all NL projectors
       do ibp=1,cg_bandpp   ! Note: we suppose cp_bandpp=cprj_bandpp
         iwf1=1+(ibp-1)*npw_nk*nspinor;iwf2=ibp*npw_nk*nspinor
         icp1=1+(ibp-1)*nspinor;icp2=ibp*nspinor
         call getcprj(choice,cpopt,cwavef(:,iwf1:iwf2),cwaveprj(:,icp1:icp2),psps%dimekb,ntypat0,dimffnl,&
&         ekb_atm,ffnl,idir,indlmn_atm,istwf_k,kg_k,kpg_k,kpoint,psps%lmnmax,&
&         matblk,mgfft,mpi_enreg,ncprj,nattyp_atm,ngfft,nkpg,nloalg,&
&         npw_nk,nspinor,ntypat0,phkxred,ph1d_atm,ph3d,ucvol,psps%usepaw,psps%useylm)
       end do
!      Export cwaveprj to big array cprj
       call cprj_put(atindx_atm,cwaveprj,cprj,ncprj,iband_min,ibg,ikpt,iorder_cprj,isppol,&
&       mband,mkmem,mpi_enreg,natom,cprj_bandpp,nband_k,dimlmn,nspinor,nsppol,spaceComm_band,uncp,&
&       to_be_gathered=.true.)

!      End loop over bands
       icgb=icgb+npw_k*nspinor*blocksz
     end do

!    Shift array memory (if mkmem/=0)
     if (mkmem/=0) then
       ibg=ibg+nspinor*nband_k
       icg=icg+nspinor*nband_k*npw_k
       ikg=ikg+npw_k
     end if

!    End big k point loop
     deallocate(ffnl,ph3d,phkxred,kg_k,kpg_k,ylm_k,ylmgr_k,cwavef)
   end do
!  End loop over spins
 end do

!Deallocate temporary storage
 if (one_atom) deallocate(atindx_atm,nattyp_atm,ph1d_atm,ekb_atm,indlmn_atm,ffspl_atm,pspso_atm)
 nullify(atindx_atm,nattyp_atm,ph1d_atm,ekb_atm,indlmn_atm,ffspl_atm,pspso_atm)
 call cprj_free(cwaveprj)
 deallocate(cwaveprj,dimlmn)
 if (nprocband>1) then
   deallocate(npw_block,npw_disp)
   deallocate(bufsize,bufdisp,bufsize_wf,bufdisp_wf)
 end if
 if(mkmem==0) deallocate(cg_disk)

 DBG_EXIT('COLL')

 end subroutine ctocprj
!!***
