!{\src2tex{textfont=tt}}
!!****f* ABINIT/nstwf3
!! NAME
!! nstwf3
!!
!! FUNCTION
!! This routine computes the non-local contribution to the
!! 2DTE matrix elements, in the non-stationary formulation
!!
!! COPYRIGHT
!! Copyright (C) 1999-2009 ABINIT group (XG,AR,MB,MVer)
!! This file is distributed under the terms of the
!! GNU General Public License, see ~abinit/COPYING
!! or http://www.gnu.org/copyleft/gpl.txt .
!! For the initials of contributors, see ~abinit/doc/developers/contributors.txt .
!!
!! INPUTS
!!  cg(2,mpw*nspinor*mband*mkmem*nsppol)=planewave coefficients of wavefunctions at k
!!  cgq(2,mpw*nspinor*mband*mkqmem*nsppol)=planewave coefficients of wavefunctions at k+q
!!  cg1(2,mpw1*nspinor*mband*mk1mem*nsppol)=pw coefficients of RF wavefunctions at k,q.
!!  cprj(natom,nspinor*mband*mkmem*nsppol*usecprj)=
!!          wave functions at k projected with non-local projectors: cprj=<p_i|Cnk>
!!  cprjq(natom,nspinor*mband*mkqmem*nsppol*usecprj)=
!!          wave functions at k+q projected with non-local projectors: cprjq=<p_i|Cnk+q>
!!  ddkfil(3)=unit numbers for the three possible ddk files for ipert1
!!       equal to 0 if no dot file is available for this direction
!!  dimcprj(natom*usepaw)=array of dimensions of arrays cprj, cprjq (ordered by atom-type)
!!  dtfil <type(datafiles_type)>=variables related to files
!!  ecut=cut-off energy for plane wave basis sphere (Ha)
!!  ecutsm=smearing energy for plane wave kinetic energy (Ha)  (NOT NEEDED !)
!!  eig_k(mband*nsppol)=GS eigenvalues at k (hartree)
!!  eig1_k(2*nsppol*mband**2)=matrix of first-order eigenvalues (hartree)
!!  gs_hamkq <type(gs_hamiltonian_type)>=all data for the Hamiltonian at k+q
!!  ibg=shift to be applied on the location of data in the array cprj
!!  ibgq=shift to be applied on the location of data in the array cprjq
!!  icg=shift to be applied on the location of data in the array cg
!!  icgq=shift to be applied on the location of data in the array cgq
!!  icg1=shift to be applied on the location of data in the array cg1
!!  idir=direction of the current perturbation
!!  ikpt=number of the k-point
!!  ipert=type of the perturbation
!!  isppol=1 for unpolarized, 2 for spin-polarized
!!  istwf_k=option parameter that describes the storage of wfs
!!  kg_k(3,npw_k)=reduced planewave coordinates.
!!  kg1_k(3,npw1_k)=reduced planewave coordinates at k+q, with RF k points
!!  kpt(3)=reduced coordinates of k points.
!!  mband=maximum number of bands
!!  mgfft=maximum size of 1D FFTs
!!  mkmem =number of k points which can fit in memory; set to 0 if use disk
!!  mkqmem =number of k+q points which can fit in memory; set to 0 if use disk
!!  mk1mem =number of k points which can fit in memory (RF data); 0 if use disk
!!  mpert =maximum number of ipert
!!  mpi_enreg=informations about MPI parallelization
!!  mpsang= 1+maximum angular momentum for nonlocal pseudopotentials
!!  mpw=maximum dimensioned size of npw or wfs at k
!!  mpw1=maximum dimensioned size of npw for wfs at k+q (also for 1-order wfs).
!!  natom=number of atoms in cell.
!!  nband_k=number of bands at this k point for that spin polarization
!!  nband_rbz=(nkpt_rbz*nsppol)=number of bands at each RF k point for each spin
!!  ncpgr=number of gradients stored in cprj array (cprj=<p_i|Cnk>)
!!  nfft=(effective) number of FFT grid points (for this processor)
!!  nkpt_rbz=number of k points in the reduced BZ for this perturbation
!!  npw_k=number of plane waves at this k point
!!  npw1_k=number of plane waves at this k+q point
!!  nspinor=number of spinorial components of the wavefunctions
!!  nsppol=1 for unpolarized, 2 for spin-polarized
!!  occopt=option for occupation numbers (not used !)
!!  occ_k(nband_k)=occupation number for each band (usually 2) for each k.
!!  paral_kgb=flag for (kpt,bandes,FFT) parallelism
!!  paw_ij(natom*usepaw) <type(paw_ij_type)>=paw arrays given on (i,j) channels for the GS
!!  paw_ij1(natom) <type(paw_ij_type)>=1st-order paw arrays given on (i,j) channels
!!  prtbbb=if 1, will compute the band-by-band decomposition
!!  prtvol=control print volume and debugging output
!!  psps <type(pseudopotential_type)>=variables related to pseudopotentials
!!  rmet(3,3)=real space metric (bohr**2)
!!  usecprj= 1 if cprj, cprjq, cprj1 arrays are stored in memory
!!  wffddk(3)=struct info for for the three possible dot files for ipert1
!!       equal to 0 if no dot file is available for this direction
!!  wffnow=struct info for INPUT 1st-order wf file
!!  wfftgs,wfftkq=struct info for ground-state wf disk files
!!  wtk_k=weight assigned to the k point.
!!  ylm(npw_k,mpsang*mpsang*useylm)= real spherical harmonics for each
!!    G and k point
!!  ylm1(npw1_k,mpsang*mpsang*useylm)= real spherical harmonics for each
!!    G and k+q point
!!
!! OUTPUT
!!  d2bbb_k(2,3,mband,mband*prtbbb)=band by band decomposition of the second
!!   order derivatives, for the present k point, and perturbation idir, ipert
!!  d2nl_k(2,3,mpert)=non-local contributions to
!!   non-stationary 2DTE, for the present k point, and perturbation idir, ipert
!!
!! PARENTS
!!      nstdy3
!!
!! CHILDREN
!!      cprj_alloc,cprj_free,cprj_get,dotprod_g,gaugetransfo,getgh1c,getgsc
!!      leave_new,mkffnl,mkkpg,ph1d3d,timab,wffreaddatarec,wffreadnpwrec
!!      wffreadskiprec,wrtout,xme_init
!!
!! SOURCE

#if defined HAVE_CONFIG_H
#include "config.h"
#endif

subroutine nstwf3(cg,cgq,cg1,cprj,cprjq,ddkfil,dimcprj,dtfil,d2bbb_k,d2nl_k,ecut,ecutsm,&
&          eig_k,eig1_k,gs_hamkq,ibg,ibgq,icg,icgq,icg1,idir,ikpt,ipert,&
&          isppol,istwf_k,kg_k,kg1_k,kpt,mband,mgfft,mkmem,mkqmem,mk1mem,mpert,&
&          mpi_enreg,mpsang,mpw,mpw1,natom,nband_k,nband_rbz,ncpgr,nfft,nkpt_rbz,&
&          npw_k,npw1_k,nspinor,nsppol,occopt,occ_k,paral_kgb,paw_ij,paw_ij1,prtbbb,&
&          prtvol,psps,rmet,usecprj,wffddk,wffnow,wfftgs,wfftkq,wtk_k,ylm,ylm1)

 use defs_basis
 use defs_datatypes
 use defs_abitypes

!This section has been created automatically by the script Abilint (TD).
!Do not modify the following lines by hand.
 use interfaces_14_hidewrite
 use interfaces_16_hideleave
 use interfaces_18_timing
 use interfaces_51_manage_mpi
 use interfaces_53_spacepar
 use interfaces_59_io_mpi
 use interfaces_65_nonlocal
 use interfaces_66_wfs
 use interfaces_72_response, except_this_one => nstwf3
!End of the abilint section

 implicit none

!Arguments ------------------------------------
!scalars
 integer,intent(in) :: ibg,ibgq,icg,icg1,icgq,idir,ikpt,ipert,isppol,istwf_k
 integer,intent(in) :: mband,mgfft,mk1mem,mkmem,mkqmem,mpert,mpsang,mpw,mpw1
 integer,intent(in) :: natom,ncpgr,nfft,nkpt_rbz,nsppol,occopt,paral_kgb,prtbbb
 integer,intent(in) :: prtvol,usecprj
 integer,intent(inout) :: nband_k,npw1_k,npw_k,nspinor
 real(dp),intent(in) :: ecut,ecutsm,wtk_k
 type(MPI_type),intent(inout) :: mpi_enreg
 type(datafiles_type),intent(in) :: dtfil
 type(gs_hamiltonian_type),intent(in) :: gs_hamkq
 type(pseudopotential_type),intent(in) :: psps
 type(wffile_type),intent(inout) :: wffnow,wfftgs,wfftkq
!arrays
 integer,intent(in) :: ddkfil(3),dimcprj(natom*psps%usepaw),kg1_k(3,npw1_k)
 integer,intent(in) :: kg_k(3,npw_k),nband_rbz(nkpt_rbz*nsppol)
 real(dp),intent(in) :: cg(2,mpw*nspinor*mband*mkmem*nsppol)
 real(dp),intent(in) :: cg1(2,mpw1*nspinor*mband*mk1mem*nsppol)
 real(dp),intent(in) :: cgq(2,mpw1*nspinor*mband*mkqmem*nsppol)
 real(dp),intent(in) :: eig_k(mband*nsppol),kpt(3),occ_k(nband_k),rmet(3,3)
 real(dp),intent(in) :: ylm(npw_k,mpsang*mpsang*psps%useylm)
 real(dp),intent(in) :: ylm1(npw1_k,mpsang*mpsang*psps%useylm)
 real(dp),intent(inout) :: eig1_k(2*nsppol*mband**2)
 real(dp),intent(out) :: d2bbb_k(2,3,mband,mband*prtbbb),d2nl_k(2,3,mpert)
 type(cprj_type),intent(in) :: cprj(natom,nspinor*mband*mkmem*nsppol*usecprj)
 type(cprj_type),intent(in) :: cprjq(natom,nspinor*mband*mkqmem*nsppol*usecprj)
 type(paw_ij_type),intent(in) :: paw_ij(natom*psps%usepaw)
 type(paw_ij_type),intent(in) :: paw_ij1(natom*psps%usepaw)
 type(wffile_type),intent(inout) :: wffddk(3)

!Local variables-------------------------------
!scalars
 integer :: berryopt,cplex,dimdij,dimdij1,dime1kb,dimffnl,dimphkxred,ia,iatom
 integer :: iband,ider,idir1,ierr,igscq,ii,ilmn,iorder_cprj,ipert1,ipw,isp
 integer :: ispden,itypat,jband,matblk,mcgq,mcprjq,me,mgscq,n1,n2,n3,nband_kocc
 integer :: nkpg,nkpg1,npw_disk,optlocal,optnl,sij_opt,tim_getgh1c,usee1kb
 integer :: usevnl
 real(dp) :: aa,arg,dot1i,dot1r,dot2i,dot2r,dot_ndiagi,dot_ndiagr,doti,dotr
 real(dp) :: lambda
 character(len=500) :: message
!arrays
 integer :: pspso_typ(1)
 integer,allocatable :: indlmn_typ(:,:,:)
 real(dp) :: dummy(1,1),tsec(2),ylmgr_dum(1)
 real(dp),allocatable :: cg_k(:,:),cgdot(:,:,:),cgnow(:,:),cgq_disk(:,:)
 real(dp),allocatable :: cgtgs(:,:),cwave0(:,:),cwavef(:,:),cwavef_da(:,:)
 real(dp),allocatable :: cwavef_db(:,:),dkinpw(:),e1kbfr(:,:,:),eig2_k(:)
 real(dp),allocatable :: eig_dot(:,:),ekb_typ(:,:,:),ffnl(:,:,:,:)
 real(dp),allocatable :: ffnl1(:,:,:,:),ffnlk(:,:,:,:),ffnlkq(:,:,:,:)
 real(dp),allocatable :: gscq(:,:),gvnl1(:,:),kinpw1(:),kpg1_k(:,:),kpg_k(:,:)
 real(dp),allocatable :: ph3d(:,:,:),phkxred(:,:),sij_typ(:,:)
 type(cprj_type),allocatable :: cprjq_disk(:,:),cwaveprj0(:,:)

! *********************************************************************

!Keep track of total time spent in nstwf3
 call timab(102,1,tsec)

!DEBUG
!write(6,*)' nstwf3 : enter '
!write(6,*)'ikpt,isppol,dimekb = ',ikpt,isppol
!stop
!ENDDEBUG

!Define me
 call xme_init(mpi_enreg,me)

 tim_getgh1c=2

!Nonlocal form factors
 dimffnl=1
 allocate(ffnl(npw_k,dimffnl,psps%lmnmax,psps%ntypat))
 allocate(ffnlk(npw_k,dimffnl,psps%lmnmax,1))
 allocate(ffnl1(npw1_k,dimffnl,psps%lmnmax,psps%ntypat))
 allocate(ffnlkq(npw1_k,dimffnl,psps%lmnmax,1))

!Nonlocal psps strengths (or KB energies)
 if (ipert/=natom+1) then
  if (psps%usepaw==0) then
   usee1kb=0;dime1kb=0
   allocate(ekb_typ(gs_hamkq%dimekb1,1,nspinor**2))
   allocate(sij_typ(gs_hamkq%dimekb1,usee1kb))
   allocate(e1kbfr(dime1kb,gs_hamkq%dimekb2,usee1kb))
  else
   usee1kb=1;dime1kb=psps%dimekb*paw_ij1(1)%cplex_dij
   allocate(ekb_typ(gs_hamkq%dimekb1,1,nspinor**2))
   allocate(sij_typ(gs_hamkq%dimekb1,1))
   allocate(e1kbfr(dime1kb,gs_hamkq%dimekb2,nspinor**2))
  end if
 end if

!Additional allocations
 if (ipert/=natom+1) then
  allocate(indlmn_typ(6,psps%lmnmax,1))
  dimphkxred=0
  if (psps%usepaw==0) dimphkxred=1
  if (psps%usepaw==1) dimphkxred=natom
  allocate(phkxred(2,dimphkxred))
  allocate(dkinpw(npw_k),kinpw1(npw1_k))
  kinpw1=zero
 end if

!Additional allocations
 allocate(gvnl1(2,npw1_k*nspinor))
 allocate(eig2_k(2*nsppol*mband**2))

!Compute (k+G) vectors
 nkpg=0;if (ipert/=natom+1) nkpg=3*gs_hamkq%nloalg(5)
 allocate(kpg_k(npw_k,nkpg))
 if (nkpg>0) call mkkpg(kg_k,kpg_k,kpt,nkpg,npw_k)

!Compute nonlocal form factors ffnl at (k+G), for all atoms
 ider=0
 call mkffnl(psps%dimekb,dimffnl,psps%ekb,ffnl,psps%ffspl,gs_hamkq%gmet,gs_hamkq%gprimd,ider,ider,&
& psps%indlmn,kg_k,kpg_k,kpt,psps%lmnmax,&
& psps%lnmax,psps%mpsang,psps%mqgrid_ff,nkpg,&
& npw_k,psps%ntypat,psps%pspso,psps%qgrid_ff,rmet,psps%usepaw,psps%useylm,ylm,ylmgr_dum)

!Compute (k+q+G) vectors
 nkpg1=0;if (ipert/=natom+1) nkpg1=3*gs_hamkq%nloalg(5)
 allocate(kpg1_k(npw1_k,nkpg1))
 if (nkpg1>0) call mkkpg(kg1_k,kpg1_k,gs_hamkq%kpoint,nkpg1,npw1_k)

!Compute nonlocal form factors ffnl1 at (k+q+G), for all atoms
 ider=0
 call mkffnl(psps%dimekb,dimffnl,psps%ekb,ffnl1,psps%ffspl,gs_hamkq%gmet,gs_hamkq%gprimd,ider,ider,&
& psps%indlmn,kg1_k,kpg1_k,gs_hamkq%kpoint,psps%lmnmax,&
& psps%lnmax,psps%mpsang,psps%mqgrid_ff,nkpg1,&
& npw1_k,psps%ntypat,psps%pspso,psps%qgrid_ff,rmet,psps%usepaw,psps%useylm,ylm1,ylmgr_dum)

!Compute ph3d (note: use npw1_k)
 if(gs_hamkq%nloalg(1)<=0)then
! Here, only the allocation, not the precomputation.
  matblk=gs_hamkq%nloalg(4)
  allocate(ph3d(2,npw1_k,matblk))
 else
! Here, allocation as well as precomputation
  matblk=natom
  allocate(ph3d(2,npw1_k,matblk))
  n1=gs_hamkq%ngfft(1);n2=gs_hamkq%ngfft(2);n3=gs_hamkq%ngfft(3)
  call ph1d3d(1,natom,kg1_k,gs_hamkq%kpoint,matblk,natom,npw1_k,n1,n2,n3,gs_hamkq%phkxred,gs_hamkq%ph1d,ph3d)
 end if

 allocate(cwave0(2,npw_k*nspinor),cwavef(2,npw1_k*nspinor))

!DEBUG
!write(6,*)' nstwf3 : before loop , call rdnpw'
!stop
!ENDDEBUG

!Take care of the npw and kg records
!NOTE : one should be able to modify the rwwf routine to take care
!of the band parallelism, which is not the case yet ...
 do idir1=1,3
  if (ddkfil(idir1)/=0)then
!  Read npw record
   call WffReadNpwRec(ierr,ikpt,isppol,nband_k,npw_disk,nspinor,wffddk(idir1))
   if (npw_k /= npw_disk) then
    write(message,'(a,a,a,a,i3,a,i5,a,i3,a,a,i5,a,a,i5)')ch10,&
&    ' nstwf3: BUG - ',ch10,&
&    ' For isppol = ',isppol,', ikpt = ',ikpt,' and idir = ',idir,ch10,&
&    ' the number of plane waves in the ddk file is equal to', npw_disk,ch10,&
&    ' while it should be ',npw_k
    call wrtout(6,message,'PERS')
    call leave_new('PERS')
   end if
!  Skip k+G record
   call WffReadSkipRec(ierr,1,wffddk(idir1))
  end if
 end do
 if (mkmem==0) then
  call WffReadNpwRec(ierr,ikpt,isppol,nband_k,npw_k,nspinor,wfftgs)
! Skip k+G and eigenvalue records in wfftgs
  call WffReadSkipRec(ierr,2,wfftgs)
 end if
!DEBUG
!write(6,*)' nstwf3 : nband_k, prtbbb=',nband_k,prtbbb
!stop
!ENDDEBUG
 if (mk1mem==0) then
  call WffReadNpwRec(ierr,ikpt,isppol,nband_k,npw1_k,nspinor,wffnow)
! Skip k+G record
  call WffReadSkipRec(ierr,1,wffnow)
 end if

 if (ipert==natom+1) then
  nband_kocc = 0
  do iband = 1,nband_k
   if (abs(occ_k(iband)) > tol8) nband_kocc = nband_kocc + 1
  end do
 end if

 if(prtbbb==1)then
  allocate(cwavef_da(2,npw1_k*nspinor),cwavef_db(2,npw1_k*nspinor))
  allocate(cg_k(2,npw_k*nspinor*nband_k))
  if ((ipert == natom + 1).or.(ipert <= natom).or.(ipert == natom + 2)) then
   if (mkmem /= 0) then
    cg_k(:,:) = cg(:,1+icg:icg+nband_k*npw_k*nspinor)
   else
    do iband=1,nband_k
     call WffReadDataRec(cwave0,ierr,2*npw_k*nspinor,wfftgs)
     cg_k(:,(iband-1)*npw_k*nspinor+1:iband*npw_k*nspinor)=cwave0(:,:)
    end do
   end if
  end if
  d2bbb_k(:,:,:,:) = zero
 end if

!Additional stuff for PAW
 if (psps%usepaw==1) then
! 1-Read |Cnk+q> for this k+q
  if (mkqmem==0) then
   mcgq=npw1_k*nspinor*nband_k;allocate(cgq_disk(2,mcgq))
   call WffReadNpwRec(ierr,ikpt,isppol,nband_k,npw1_k,nspinor,wfftkq)
   call WffReadSkipRec(ierr,2,wfftkq)
   do iband=1,nband_k
    call WffReadDataRec(cwave0,ierr,2*npw1_k*nspinor,wfftkq)
    cgq_disk(:,(iband-1)*npw1_k*nspinor+1:iband*npw1_k*nspinor)=cwave0(:,:)
   end do
   iorder_cprj=0
   mcprjq=nspinor*nband_k*usecprj;allocate(cprjq_disk(natom,mcprjq))
   if (mcprjq>0) then
    call cprj_alloc(cprjq_disk,0,dimcprj)
    call cprj_get(gs_hamkq%atindx1,cprjq_disk,cprjq,natom,1,ibgq,ikpt,iorder_cprj,isppol,&
&    mband,mkqmem,mpi_enreg,natom,nband_k,nband_k,nspinor,nsppol,dtfil%unpawq)
   end if
  else
   mcgq=mpw1*nspinor*mband*mkqmem*nsppol
   mcprjq=nspinor*mband*mkqmem*nsppol*usecprj
  end if ! End if for choice governed by mkqmem
! 2-Compute all <g|S|Cnk+q>
  igscq=0;mgscq=npw1_k*nspinor*nband_k;allocate(gscq(2,mgscq))
  if (mkqmem/=0) then
   call getgsc(cgq,cprjq,dimcprj,dimffnl,ffnl1,gs_hamkq,gscq,ibgq,icgq,igscq,ikpt,isppol,&
&   kg1_k,psps%lmnmax,matblk,mcgq,mcprjq,mgfft,mgscq,mpi_enreg,psps%mpsang,&
&   psps%mpssoang,natom,nband_k,nkpt_rbz,npw1_k,nspinor,gs_hamkq%ntypat,ph3d)
  else
   call getgsc(cgq_disk,cprjq_disk,dimcprj,dimffnl,ffnl1,gs_hamkq,gscq,ibgq,icgq,igscq,ikpt,isppol,&
&   kg1_k,psps%lmnmax,matblk,mcgq,mcprjq,mgfft,mgscq,mpi_enreg,psps%mpsang,&
&   psps%mpssoang,natom,nband_k,nkpt_rbz,npw1_k,nspinor,gs_hamkq%ntypat,ph3d)
   deallocate(cgq_disk)
   call cprj_free(cprjq_disk)
   deallocate(cprjq_disk)
  end if
! 3-Initialize additional scalars/arrays
  iorder_cprj=0
  if (usecprj==1) then
   allocate(cwaveprj0(natom,nspinor))
   call cprj_alloc(cwaveprj0,ncpgr,dimcprj)
  end if
 end if

!Loop over bands
 do iband=1,nband_k

  if(mpi_enreg%paral_compil_kpt==1)then
   if(mpi_enreg%proc_distrb(ikpt,iband,isppol) /= me) then
    do idir1=1,3
!    Skip the eigenvalue and the wf records of this band
     if (ddkfil(idir1) /= 0) then
      call WffReadSkipRec(ierr,2,wffddk(idir1))
     end if
    end do
    if(mkmem==0)then
     if(prtbbb==0 .or. ipert==natom+2)then
      call WffReadSkipRec(ierr,1,wfftgs)
     end if
    end if
    if(mk1mem==0)then
     call WffReadSkipRec(ierr,2,wffnow)
    end if
    cycle
   end if
  end if ! mpi_enreg%paral_compil_kpt==1

! Read ground-state wavefunctions
  if (prtbbb==0 .or. ipert==natom+2) then
   if(mkmem/=0) then
    cwave0(:,:)=cg(:,1+(iband-1)*npw_k*nspinor+icg:iband*npw_k*nspinor+icg)
   else
    call timab(286,1,tsec)
    call WffReadDataRec(cwave0,ierr,2*npw_k*nspinor,wfftgs)
    call timab(286,2,tsec)
   end if
  else    ! prtbbb==1 and ipert<=natom , already in cg_k
   cwave0(:,:)=cg_k(:,1+(iband-1)*npw_k*nspinor:iband*npw_k*nspinor)
  end if
! Read PAW ground state projected WF (cprj)
  if (gs_hamkq%usepaw==1.and.usecprj==1) then
   call cprj_get(gs_hamkq%atindx1,cwaveprj0,cprj,natom,iband,ibg,ikpt,iorder_cprj,&
&   isppol,mband,mkmem,mpi_enreg,natom,1,nband_k,nspinor,nsppol,dtfil%unpaw)
  end if

! Read first-order wavefunctions
  if(mk1mem/=0)then
   cwavef(:,:)=cg1(:,1+(iband-1)*npw1_k*nspinor+icg1:iband*npw1_k*nspinor+icg1)
  else
   call timab(286,1,tsec)
   call WffReadDataRec(eig1_k(1+(iband-1)*2*nband_k:2*iband*nband_k),ierr,2*nband_k,wffnow)
   call WffReadDataRec(cwavef,ierr,2*npw1_k*nspinor,wffnow)
   call timab(286,2,tsec)
  end if

! In case non ddk perturbation
  if (ipert /= natom + 1) then

   do ipert1=1,mpert

    if( ipert1<=natom .or. ipert1==natom+2 )then

     if( ipert1<=natom )then

!     Transfer the infos relative to the displaced atom: ekb, indlmn, pspso, ffspl
      itypat=gs_hamkq%typat(ipert1)
      if (psps%usepaw==1) then
       dimdij=paw_ij(ipert1)%cplex_dij*paw_ij(ipert1)%lmn2_size
       do ispden=1,nspinor**2
        isp=isppol;if (nspinor==2) isp=ispden
        ekb_typ(1:dimdij,1,ispden)=paw_ij(ipert1)%dij(1:dimdij,isp)
        if (paw_ij(ipert1)%cplex_dij==1) then
         sij_typ(1:dimdij,1)=gs_hamkq%sij(1:dimdij,itypat)
        else
         do ilmn=1,dimdij/2
          sij_typ(2*ilmn-1,1)=gs_hamkq%sij(ilmn,itypat)
          sij_typ(2*ilmn  ,1)=zero
         end do
        end if
        if(dimdij<gs_hamkq%dimekb1) then
         ekb_typ(dimdij+1:gs_hamkq%dimekb1,1,ispden)=zero
         sij_typ(dimdij+1:gs_hamkq%dimekb1,1)=zero
        end if
        do iatom=1,natom
         dimdij1=paw_ij1(iatom)%cplex_dij*paw_ij1(iatom)%lmn2_size
         e1kbfr(1:dimdij1,iatom,ispden)=paw_ij1(iatom)%dijfr(1:dimdij1,isp)
         if(dimdij1<dime1kb) e1kbfr(dimdij1+1:dime1kb,iatom,ispden)=zero
        end do
       end do
      else
       ekb_typ(:,1,1)=psps%ekb(:,itypat)
       if (nspinor==2) then
        ekb_typ(:,1,2)=psps%ekb(:,itypat)
        ekb_typ(:,1,3:4)=zero
       end if
      end if
      indlmn_typ(:,:,1)=psps%indlmn(:,:,itypat)
      pspso_typ(1)=psps%pspso(itypat)

!     Copy the part needed for the displaced atom, in ffnlkq.
      do ilmn=1,psps%lmnmax
       ffnlkq(:,:,ilmn,1)=ffnl1(:,:,ilmn,itypat)
       ffnlk (:,:,ilmn,1)=ffnl (:,:,ilmn,itypat)
      end do
     end if ! ipert 1<=natom

     if (((ipert <= natom).or.(ipert == natom + 2)) &
&     .and.(ipert1 == natom+2).and. prtbbb==1) then
      call gaugetransfo(cg_k,cwavef,cwavef_db,eig_k,eig1_k,iband,nband_k, &
&      mband,npw_k,npw1_k,nspinor,nsppol,occ_k)
      cwavef(:,:) = cwavef_db(:,:)
     end if

!    Define the direction along which to move the atom :
!    the polarisation (ipert1,idir1) is refered as j1.
     do idir1=1,3
      if ( ipert1<=natom .or. &
&      (ipert1==natom+2.and.ddkfil(idir1)/=0) ) then

!      Get |Vnon-locj^(1)|u0> :
!      first-order non-local, applied to zero-order wavefunction
!      (??) this routine gives MINUS the non-local contribution
!      PAW: VHxc-dependant part not taken into account and -Eps.S^(1) added
       if( ipert1<=natom )then

!       Compute here phkxred for kpt
        if (psps%usepaw==0) then
         arg=two_pi*(kpt(1)*gs_hamkq%xred(1,ipert1)+kpt(2)*gs_hamkq%xred(2,ipert1)+kpt(3)*gs_hamkq%xred(3,ipert1))
         phkxred(1,1)=cos(arg) ; phkxred(2,1)=sin(arg)
        else
!        If PAW, need all phkxred
         do ia=1,natom
          iatom=gs_hamkq%atindx(ia)
          arg=two_pi*(kpt(1)*gs_hamkq%xred(1,ia)+kpt(2)*gs_hamkq%xred(2,ia)+kpt(3)*gs_hamkq%xred(3,ia))
          phkxred(1,iatom)=cos(arg) ; phkxred(2,iatom)=sin(arg)
         end do
        end if

        lambda=eig_k((isppol-1)*nband_k+iband)
        berryopt=1;cplex=1;optlocal=0;optnl=1;sij_opt=-1;usevnl=0
        call getgh1c(berryopt,cplex,cwave0,cwaveprj0,dimcprj,gs_hamkq%dimekb1,dime1kb,dimffnl,dimffnl,dimphkxred,&
&        dkinpw,ekb_typ,e1kbfr,e1kbfr,ffnlk,ffnlkq,ffnl1,dtfil%filstat,gs_hamkq%gbound,gvnl1,dummy,dummy,&
&        gs_hamkq,dummy,idir1,indlmn_typ,ipert1,kg_k,kg1_k,kinpw1,kpg_k,kpg1_k,kpt,lambda,psps%lmnmax,&
&        matblk,mgfft,mpi_enreg,psps%mpsang,psps%mpssoang,natom,nkpg,nkpg1,npw_k,npw1_k,nspinor,gs_hamkq%ntypat,&
&        gs_hamkq%n4,gs_hamkq%n5,gs_hamkq%n6,optlocal,optnl,paral_kgb,ph3d,phkxred,prtvol,pspso_typ,&
&        sij_opt,sij_typ,tim_getgh1c,usecprj,usee1kb,usevnl,dummy,dummy)

       else if( ipert1==natom+2 )then

        call WffReadDataRec(eig2_k(1+(iband-1)*2*nband_k:2*iband*nband_k),ierr,2*nband_k,wffddk(idir1))
        call WffReadDataRec(gvnl1,ierr,2*npw1_k*nspinor,wffddk(idir1))

!       In case of band-by-band,
!       construct the first-order wavefunctions in the diagonal gauge
        if (((ipert <= natom).or.(ipert == natom + 2)).and.(prtbbb==1)) then
         call gaugetransfo(cg_k,gvnl1,cwavef_da,eig_k,eig2_k,iband,nband_k, &
&         mband,npw_k,npw1_k,nspinor,nsppol,occ_k)
         gvnl1(:,:) = cwavef_da(:,:)
        end if

!       Multiplication by -i
        do ipw=1,npw1_k*nspinor
         aa=gvnl1(1,ipw)
         gvnl1(1,ipw)=gvnl1(2,ipw)
         gvnl1(2,ipw)=-aa
        end do
       end if

!      MVeithen 021212 :
!      1) Case ipert1 = natom + 2 and ipert = natom + 2:
!      the second derivative of the energy with respect to an electric
!      field is computed from Eq. (38) of X. Gonze, PRB 55 ,10355 (1997).
!      The evaluation of this formula needs the operator $i \frac{d}{dk}.
!      2) Case ipert1 = natom + 2 and ipert < natom:
!      the computation of the Born effective charge tensor uses
!      the operator $-i \frac{d}{dk}.
       if (ipert==natom+2) gvnl1(:,:) = -gvnl1(:,:)

!      <G|Vnl1|Cnk> is contained in gvnl1
!      construct the matrix element (<uj2|vj1|u0>)complex conjug.
!      and add it to the 2nd-order matrix
!      XG030513 : use dotprod_g, for future parallelisation
       call dotprod_g(dotr,doti,istwf_k,mpi_enreg,npw1_k*nspinor,2,cwavef,gvnl1)

       d2nl_k(1,idir1,ipert1)= &
&       d2nl_k(1,idir1,ipert1)+wtk_k*occ_k(iband)*two*dotr
       d2nl_k(2,idir1,ipert1)=&
&       d2nl_k(2,idir1,ipert1)-wtk_k*occ_k(iband)*two*doti

!      Band by band decomposition of the Born effective charges
!      calculated from a phonon perturbation
       if(prtbbb==1)then
        d2bbb_k(1,idir1,iband,iband) = wtk_k*occ_k(iband)*two*dotr
        d2bbb_k(2,idir1,iband,iband) = -one*wtk_k*occ_k(iband)*two*doti
       end if

!      DEBUG  Do not forget to restore the idir loop
!      write(6,*)' nstwf3 : cwave0 '
!      write(6,*)cwave0(:,1:2)
!      write(6,*)' nstwf3 : gvnl1 '
!      do ii=1,npw1_k*nspinor
!      write(6,*)ii,gvnl1(:,ii)
!      end do
!      write(6,*)' nstwf3 : cwavef '
!      do ii=1,npw1_k*nspinor
!      write(6,*)ii,cwavef(:,ii)
!      end do
!      if(idir1==3 .and. ipert1==4)then
!      write(6,*)' nstwf3 : ikpt,iband,ipert1,idir1,dotr,doti,d2nl_k'
!      write(6,*)ikpt, iband, ipert1, idir1
!      write(6,*)wtk_k,occ_k(iband),dotr, doti
!      write(6,*)d2nl_k(:,idir1,ipert1)
!      end if
!      stop
!      ENDDEBUG

      end if
     end do

    end if
   end do

  end if     ! ipert /= natom +1

! Compute the localization tensor

  if (ipert==natom+1) then

   ipert1=natom+1

   if(prtbbb==1)then
    call gaugetransfo(cg_k,cwavef,cwavef_db,eig_k,eig1_k,iband,nband_k, &
&    mband,npw_k,npw1_k,nspinor,nsppol,occ_k)
    cwavef(:,:) = cwavef_db(:,:)
   end if

   do idir1 = 1,3
    eig2_k(:) = zero
    gvnl1(:,:) = zero
    if (idir == idir1) then
     if (ddkfil(idir1) /= 0) then
      call WffReadSkipRec(ierr,2,wffddk(idir1))
     end if
     gvnl1(:,:) = cwavef(:,:)
     eig2_k(:) = eig1_k(:)
    else
     if (ddkfil(idir1) /= 0) then
      call WffReadDataRec(eig2_k(1+(iband-1)*2*nband_k:2*iband*nband_k),ierr,2*nband_k,wffddk(idir1))
      call WffReadDataRec(gvnl1,ierr,2*npw1_k*nspinor,wffddk(idir1))

      if(prtbbb==1)then
       call gaugetransfo(cg_k,gvnl1,cwavef_da,eig_k,eig2_k,iband,nband_k, &
&       mband,npw_k,npw1_k,nspinor,nsppol,occ_k)
       gvnl1(:,:) = cwavef_da(:,:)
      end if

     end if    !ddkfil(idir1)
    end if    !idir == idir1

!   <G|du/dqa> is contained in gvnl1 and <G|du/dqb> in cwavef
!   construct the matrix elements <du/dqa|du/dqb> -> dot
!   <u|du/dqa> -> dot1
!   <du/dqb|u> -> dot2
!   and add them to the 2nd-order matrix

!   XG030513 : use dotprod_g, for future parallelisation
    call dotprod_g(dotr,doti,istwf_k,mpi_enreg,npw1_k*nspinor,2,gvnl1,cwavef)
    d2nl_k(1,idir1,ipert1)=d2nl_k(1,idir1,ipert1)+wtk_k*occ_k(iband)*dotr/(nband_kocc*two)
    d2nl_k(2,idir1,ipert1)=d2nl_k(2,idir1,ipert1)+wtk_k*occ_k(iband)*doti/(nband_kocc*two)


!   XG 020216 : Marek, could you check the next forty lines
!   In the parallel gauge, dot1 and dot2 vanishes
    if(prtbbb==1)then
     d2bbb_k(1,idir1,iband,iband)=d2bbb_k(1,idir1,iband,iband)+dotr
     d2bbb_k(2,idir1,iband,iband)=d2bbb_k(2,idir1,iband,iband)+doti
     dot_ndiagr=zero ; dot_ndiagi=zero
     do jband = 1,nband_k              !compute dot1 and dot2
      if (abs(occ_k(jband)) > tol8) then

       dot1r=zero ; dot1i=zero
       dot2r=zero ; dot2i=zero

       cwave0(:,:)=cg_k(:,1+(jband-1)*npw_k*nspinor:jband*npw_k*nspinor)
!      XG030513 : use dotprod_g, for future parallelisation
       call dotprod_g(dot1r,dot1i,istwf_k,mpi_enreg,npw1_k*nspinor,2,cwave0,gvnl1)
       call dotprod_g(dot2r,dot2i,istwf_k,mpi_enreg,npw1_k*nspinor,2,cwavef,cwave0)

       dot_ndiagr = dot_ndiagr + dot1r*dot2r - dot1i*dot2i
       dot_ndiagi = dot_ndiagi + dot1r*dot2i + dot1i*dot2r

       d2bbb_k(1,idir1,iband,jband) = d2bbb_k(1,idir1,iband,jband) - &
&       (dot1r*dot2r - dot1i*dot2i)
       d2bbb_k(2,idir1,iband,jband) = d2bbb_k(2,idir1,iband,jband) - &
&       (dot1r*dot2i + dot1i*dot2r)

      end if  ! occ_k
     end do !jband

     d2bbb_k(:,idir1,iband,:)= &
&     d2bbb_k(:,idir1,iband,:)*wtk_k*occ_k(iband)*half

     d2nl_k(1,idir1,ipert1)= &
&     d2nl_k(1,idir1,ipert1)-wtk_k*occ_k(iband)*dot_ndiagr/(nband_kocc*two)
     d2nl_k(2,idir1,ipert1)=&
&     d2nl_k(2,idir1,ipert1)-wtk_k*occ_k(iband)*dot_ndiagi/(nband_kocc*two)

    end if ! prtbbb==1

   end do  ! idir1
  end if   ! Compute localization tensor, ipert=natom+1

! End loop over bands
 end do

!DEBUG
!write(6,*)' nstwf3 : after loop over bands '
!stop
!ENDDEBUG

!Final deallocations
 deallocate(cwave0,cwavef)
 deallocate(eig2_k,gvnl1)
 if (ipert/=natom+1) then
  deallocate(ffnl,ffnl1,ffnlkq,ffnlk)
  deallocate(kpg_k,kpg1_k,phkxred,ph3d)
  deallocate(ekb_typ,indlmn_typ)
  deallocate(dkinpw,kinpw1)
 end if
 if (psps%usepaw==1) then
  deallocate(gscq)
  if (usecprj==1) then
   call cprj_free(cwaveprj0)
   deallocate(cwaveprj0)
  end if
 end if
 if(prtbbb==1) deallocate(cg_k,cwavef_da,cwavef_db)

 call timab(102,2,tsec)

!DEBUG
!write(6,*)' nstwf3 : exit '
!write(6,*)' nstwf3 : d2nl_k(2,3,2:4:2)',d2nl_k(2,3,2:4:2)
!stop
!ENDDEBUG

end subroutine nstwf3
!!***
