!{\src2tex{textfont=tt}}
!!****f* ABINIT/cexch_haydock
!! NAME
!! cexch_haydock
!!
!! FUNCTION
!!  Calculate and write the excitonic Hamiltonian on an external binary file (Fortran file open
!!  in random mode) for subsequent treatment in the Bethe-Salpeter code.
!!
!! COPYRIGHT
!! Copyright (C) 1992-2009 EXC group (L.Reining, V.Olevano, F.Sottile, S.Albrecht, G.Onida)
!! Copyright (C) 2009-2010 ABINIT group (L.Reining, V.Olevano, F.Sottile, S.Albrecht, G.Onida, M.Giantomassi)
!! This file is distributed under the terms of the
!! GNU General Public License, see ~abinit/COPYING
!! or http://www.gnu.org/copyleft/gpl.txt .
!! For the initials of contributors, see ~abinit/doc/developers/contributors.txt .
!!
!! INPUTS
!!  BSp<excparam>=The parameters for the Bethe-Salpeter calculation. 
!!  Dtfil<datafiles_type>=File names and unit numbers.
!!  usepaw=1 if PAW is used, 0 otherwise.
!!  nspinor=Number of spinorial components.
!!  nsppol=Number of independent spin polarizations.
!!  Cryst<crystal_structure>=Info on the crystalline structure.
!!  Kmesh<BZ_mesh_type>=The list of k-points in the BZ, IBZ and symmetry tables.
!!  Qmesh<BZ_mesh_type>=The list of q-points for epsilon^{-1} and related symmetry tables. 
!!  ktabr(nfftot_osc,BSp%nkbz)=The FFT index of $(R^{-1}(r-\tau))$ where R is symmetry needed to obtains 
!!    the k-points from the irreducible image.  Used to symmetrize u_Sk where S = \transpose R^{-1}
!!  Gsph_Max<gvectors_type>=Info on the G-sphere used to describe wavefunctions and W (the largest one is actually stored).  
!!  igfft(Bsp%npwvec,2*BSp%mG0(1)+1,2*BSp%mG0(2)+1,BSp%mG0(3)+1)=Index of G-G0 in the FFT grid. The first 
!!    dimension refers to G, the remainig three dimensions define G0.
!!  Vcp<coulombian_type>=The Coulomb interaction in reciprocal space. A cutoff can be used
!!  Er<epsilonm1_results>=Data type gathering info and data for the symmetrized inverse dielectric matrix.
!!  trans<transition>=For each transition, it gives the index of the k-point, valence and conduction index
!!    as well as the transition energy.
!!  transtab(BSp%nkbz,BSp%nbndv,BSp%nbndc)=Correspondence between the tupla (k,v,c) and the transition index in trans.
!!  nfftot_osc=Total Number of FFT points used for the oscillator matrix elements.
!!  ngfft_osc(18)=Info on the FFT algorithm used to calculate the oscillator matrix elements.
!!  MPI_enreg<MPI_type>=Info on the parallelism.
!!  Psps<Pseudopotential_type>=Variables related to pseudopotentials
!!  Pawtab(Psps%ntypat)<pawtab_type>=PAW tabulated starting data.
!!  Pawang<pawang_type>=PAW angular mesh and related data.
!!  Paw_pwff(Cryst%ntypat*usepaw)<Paw_pwff_type>=Form factor used to calculate the onsite mat. elements of a plane wave.
!!  Wfd<wfs_descriptor>=Handler for the wavefunctions.
!!
!! OUTPUT
!!  The excitonic Hamiltonian is saved on an external binary file.
!!
!! PARENTS
!!
!! CHILDREN
!!
!! SOURCE

#if defined HAVE_CONFIG_H
#include "config.h"
#endif

#include "abi_common.h"

subroutine cexch_haydock(BSp,Dtfil,usepaw,nspinor,nsppol,Cryst,Kmesh,Qmesh,ktabr,Gsph_Max,igfft,Vcp,&
&  Wfd,Er,trans,transtab,nfftot_osc,ngfft_osc,MPI_enreg,Psps,Pawtab,Pawang,Paw_pwff)

 use defs_basis
 use defs_datatypes
 use defs_abitypes
 use defs_gw
 use m_bs_defs
 use m_errors

 use m_io_tools,     only : get_unit
 use m_blas,         only : xdotc
 use m_bz_mesh,      only : bz_mesh_type, get_BZ_item, get_BZ_diff, has_BZ_item, isamek, findqg0
 use m_crystal,      only : crystal_structure
 use m_gsphere,      only : gvectors_type
 use m_coulombian,   only : coulombian_type
 use m_screening,    only : make_W, Epsm1_symmetrizer
 use m_paw_pwij,     only : paw_pwff_type, paw_pwij_type, init_paw_pwij, destroy_paw_pwij, paw_rho_tw_g
 use m_wfs,          only : wfs_descriptor, wfd_get_ur, wfd_get_cprj, wfd_change_ngfft
 use m_oscillators,  only : rho_tw_g, sym_rhotwgq0

!This section has been created automatically by the script Abilint (TD).
!Do not modify the following lines by hand.
 use interfaces_14_hidewrite
 use interfaces_32_util
 use interfaces_53_abiutil
 use interfaces_66_paw
!End of the abilint section

 implicit none

!Arguments ------------------------------------
!scalars
 integer,intent(in) :: usepaw
 integer,intent(in) :: nspinor,nsppol
 integer,intent(in) :: nfftot_osc
 type(excparam),intent(inout) :: BSp
 type(MPI_type),intent(inout) :: MPI_enreg
 type(Epsilonm1_results),intent(inout) :: Er
 type(BZ_mesh_type),intent(in) :: Kmesh,Qmesh
 type(Crystal_structure),intent(in) :: Cryst
 type(Coulombian_type),intent(in) :: Vcp
 type(Gvectors_type),intent(in) :: Gsph_Max
 type(datafiles_type),intent(in) :: Dtfil
 type(Pseudopotential_type),intent(in) :: Psps
 type(pawang_type),intent(in) :: Pawang
 type(wfs_descriptor),intent(inout) :: Wfd
!arrays  
 integer,intent(in) :: ngfft_osc(18)
 integer,intent(in) :: transtab(BSp%nkbz,BSp%nbndv,BSp%nbndc)
 integer,intent(in) :: ktabr(nfftot_osc,BSp%nkbz)
 integer,intent(in) :: igfft(Bsp%npwvec,2*BSp%mG0(1)+1,2*BSp%mG0(2)+1,2*BSp%mG0(3)+1)
 type(transition),intent(in) :: trans(BSp%nh)
 type(Pawtab_type),intent(in) :: Pawtab(Psps%ntypat)
 type(Paw_pwff_type),intent(in) :: Paw_pwff(Psps%ntypat*Psps%usepaw)

!Local variables ------------------------------
!scalars
 integer,parameter :: tim_fourdp=0,map2sphere=1
 integer,parameter :: spin=1
! integer :: ik_bz,ik_ibz,ikp_bz,ikp_ibz 
 integer :: indx_k_bz,indx_kp_bz,nomega
 integer :: shift,indx_kibz,ibsp,spad,i1,i2
 integer :: paral_kgb,dim_rtwg,haydo_unt
 integer :: nh 
 integer :: ig
 integer :: ik,ik_ibz,itim_k,ikp,ikp_ibz,itim_kp,isym_k,isym_kp
 integer :: iq,iqbz,iq_ibz,isym_q,itim_q,iqbz_0
 integer :: ig01,ig02,ig03,mgfft_osc,fftalga_osc
 integer :: iv,ivp,ic,icp,it,itp
 integer :: irv,irc,irvp,ircp,ib
 integer :: istat,lastirec,use_padfft
 integer :: kstart=1
 integer :: sizeofcomplex
 real(dp) :: q0vol,faq,rtw,fcc_const
 real(dp) :: tenergy,tpenergy
 complex :: http,ctemp
 complex(dpc) :: ph_mkpt,ph_mkt
 logical,parameter :: timereversal = .FALSE.
 logical :: h_in_memory
 logical :: have_file = .FALSE.
 character(len=500) :: msg
!arrays
 integer :: g0(3)
 integer,allocatable :: igfftg0(:),ktabr_k(:),ktabr_kp(:),gbound(:,:),gmg0(:,:)
 integer,pointer :: grottb(:,:,:)
 real(dp) :: kbz(3),kpbz(3),spinrot_k(4),spinrot_kp(4),kmk1(3)
 !complex,allocatable :: bsh(:)
 complex,allocatable :: bsh_k(:,:,:,:,:)
 complex(gwpc),allocatable :: wfnr(:,:,:)
 complex(gwpc),allocatable :: vc_sqrt_qbz(:)
 complex(gwpc) :: Wstar(BSp%npweps,BSp%npweps)
 complex(gwpc),allocatable :: W(:,:,:) 
 complex(gwpc) :: Wstardiag(BSp%npweps)
 complex(gwpc) :: rhotwg1(BSp%npweps),rhotwg2(BSp%npweps)
 complex(gwpc),allocatable :: rhxtwg_vv(:,:,:),rhxtwg_cc(:,:,:),rhxtwg_q0(:,:,:,:) 
! complex(gwpc),allocatable :: wfr1(:),wfr2(:)
 complex(gwpc) :: ctccp(BSp%npweps)
 type(Cprj_type),allocatable :: Cprj_k(:,:),Cprj_kp(:,:)
 type(Cprj_type),allocatable :: Cp1(:,:),Cp2(:,:)
 type(Paw_pwij_type),allocatable :: Pwij_q0(:),Pwij_q(:)

!************************************************************************

 DBG_ENTER("COLL")

 ABI_CHECK(nsppol==1,"nsppol==2 not coded")
 ABI_CHECK(nspinor==1,"nspinor==2 not coded")

 if (nfftot_osc/=PRODUCT(ngfft_osc(1:3))) then
   MSG_ERROR("mismatch in FFT size")
 end if

 if ( ANY(ngfft_osc(1:3) /= Wfd%ngfft(1:3)) ) call wfd_change_ngfft(Wfd,Cryst,Psps,ngfft_osc) 

 grottb => Gsph_Max%rottb(:,:,:)    

 dim_rtwg=1
 paral_kgb=0
 nh = BSp%nh

 allocate(ktabr_k(nfftot_osc),ktabr_kp(nfftot_osc))

! Evaluate the tables needed for the padded FFT performed in rhotwg.
! There is no need to shift the G-sphere as we only have vertical transitions.
 mgfft_osc   = MAXVAL(ngfft_osc(1:3))
 fftalga_osc = ngfft_osc(7)/100 !; fftalgc_osc=MOD(ngfft_osc(7),10)
 use_padfft=0; !$if (fftalga_osc==3) use_padfft=1  ! Padded FFTW3 is safe instead!
 allocate(gbound(2*mgfft_osc+8,2*use_padfft))
 if (use_padfft==1) call sphereboundary(gbound,1,Gsph_Max%gvec,mgfft_osc,BSp%npweps)

 allocate(rhxtwg_vv(BSp%npweps,BSp%nbnds,BSp%nbnds),STAT=istat)
 allocate(rhxtwg_cc(BSp%npweps,BSp%nbnds,BSp%nbnds),STAT=istat)

 allocate(vc_sqrt_qbz(BSp%npwvec))

 allocate(W(BSp%npweps,BSp%npweps,1), STAT=istat) 

 sizeofcomplex = get_reclen("gwpc")

 !     Some constants  

 !faq = four_pi/(Cryst%ucvol*BSp%nkbz)
 faq = one/(Cryst%ucvol*BSp%nkbz)

 rtw = (6.0*pi**2/(Cryst%ucvol*BSp%nkbz))**(1./3.)

! Analytic integration of 4pi/q^2 over the volume element:
! $4pi/V \int_V d^3q 1/q^2 =4pi bz_geometric_factor V^(-2/3)$
! i_sz=4*pi*bz_geometry_factor*q0_vol**(-two_thirds) where q0_vol= V_BZ/N_k
! bz_geometry_factor: sphere=7.79, fcc=7.44, sc=6.188, bcc=6.946, wz=5.255
! (see gwa.pdf, appendix A.4)

 q0vol = (8.0*pi**3) / (Cryst%ucvol*BSp%nkbz)
 !     If q=0 and C=V then set up rho-twiddle(G=0) to reflect an
 !     analytic integration of q**-2 over the volume element:
 !     <q**-2> = 7.44 V**(-2/3)   (for fcc cell)
 fcc_const = SQRT(7.44*q0vol**(-2.0/3.0))

 !     Average of (q+q')**-2 integration for head of Coulomb matrix 
 !     INTRTW(QL) = (2*pi*rtw + pi*(rtw**2/QL-QL)*LOG((QL+rtw)/(QL-rtw)))
 !    &              * (Cryst%ucvol*BSp%nkbz)/(2*pi)**3. * QL*QL

 lastirec = nh*nh

 if (usepaw==1) then
   allocate(Cprj_k (Cryst%natom,nspinor*Bsp%nbnds)); call cprj_alloc(Cprj_k, 0,Wfd%nlmn_atm)
   allocate(Cprj_kp(Cryst%natom,nspinor*Bsp%nbnds)); call cprj_alloc(Cprj_kp,0,Wfd%nlmn_atm)
   allocate(Cp1(Wfd%natom,Wfd%nspinor)); call cprj_alloc(Cp1,0,Wfd%nlmn_atm)
   allocate(Cp2(Wfd%natom,Wfd%nspinor)); call cprj_alloc(Cp2,0,Wfd%nlmn_atm)
 end if

!-----------------------------------------------------------------------

 if (BSp%EXCHANGETERM) print *, 'exchange term'
 if (BSp%COULOMBTERM) print *, 'coulomb term'

 if (BSp%COULOMBTERM) then ! Construct W in the IBZ.
   call make_W(Er,Vcp) !TODO mqmem==0, change info ER%, and update Hscr, clean treatment of epsm1 and different IDs
   call pclock(950)
 end if

!----------------------------------------------------------------------
!     load wavefunctions in r-space
!----------------------------------------------------------------------

 write(msg,'(3a,i0,a)')&
&  ' Allocating real-space wavefunctions. ',ch10,&
&  ' memory size ',nfftot_osc*BSp%nbnds*BSp%nkibz, 'sizeof(complex)'
 call wrtout(std_out,msg,"COLL")

 allocate(wfnr(nfftot_osc*Wfd%nspinor,BSp%nbnds,BSp%nkibz),stat=istat)
 if (istat/=0) then 
  MSG_ERROR('out of memory: wfnr')
 end if

 ! SPIN support is missing here

 do ik=1,BSp%nkibz
   !rec_idx=ik+(spin-1)*BSp%nkibz
   !read(31,rec=rec_idx) ((wfnr(ir,ib,ik),ir=1,nfftot_osc),ib=1,BSp%nbnds)
   do ib=1,BSp%nbnds
     call wfd_get_ur(Wfd,ib,ik,spin,wfnr(:,ib,ik))
   end do
 end do
      
 ! calculate all matrix elements in IBZ for q=0 to save time
 call wrtout(std_out,'Calculating all matrix elements for q=0',"COLL")
 !print *, 'memory size ', BSp%npweps*BSp%nbnds**2*BSp%nkibz,' sizeof(complex)'
      
! identify q == 0
 iqbz_0 = 0
 do iqbz=1,Qmesh%nbz
   if (ALL(ABS(Qmesh%bz(:,iqbz))<1.0e-3)) iqbz_0 = iqbz
 end do
 if (iqbz_0 == 0) then
   MSG_ERROR("q=0 not found")
 end if

 iqbz  = iqbz_0
 iq    = iqbz_0
 iq_ibz = Qmesh%tab(iq) ; isym_q = Qmesh%tabo(iq) ; itim_q = (3-Qmesh%tabi(iq))/2
 ig01=0; ig02=0; ig03=0

 allocate(rhxtwg_q0(BSp%npweps,BSp%nbnds,BSp%nbnds,BSp%nkibz))
 rhxtwg_q0(:,:,:,:) = 0.0

 if (usepaw==1) then ! * Prepare onsite contributions at q==0
   allocate(Pwij_q0(Cryst%ntypat))
   call init_paw_pwij(Pwij_q0,BSp%npweps,Qmesh%bz(:,iqbz_0),Gsph_Max%gvec,Cryst%rprimd,Dtfil,Psps,Pawtab,Paw_pwff)
 end if

 ig01 = BSp%mG0(1)+1 ! The index of G in the FFT mesh.
 ig02 = BSp%mG0(2)+1
 ig03 = BSp%mG0(3)+1

 allocate(igfftg0(BSp%npweps))
 igfftg0(1:BSp%npweps) = igfft(1:BSp%npweps,ig01,ig02,ig03)
 
 do ik=1,BSp%nkibz ! Loop over all k-points in IBZ
  ik_ibz=ik; itim_k=1; isym_k=1; ph_mkt = cone
  spinrot_k(:) =Cryst%spinrot(:,isym_k)

  if (usepaw==1) then
    indx_kibz=nspinor*BSp%nbnds*(ik_ibz-1)+nspinor*BSp%nbnds*Kmesh%nibz*(spin-1)
    ibsp=0
    do ib=1,BSp%nbnds
      call wfd_get_cprj(Wfd,ib,ik_ibz,spin,Cryst,Cp1,sorted=.FALSE.)
      ibsp = ibsp + nspinor
      call cprj_copy(Cp1,Cprj_k(:,ibsp:ibsp+(nspinor-1)))
    end do
  end if

  do iv=BSp%lomo,BSp%nbnds ! loop over band V
   do ic=BSp%lomo,BSp%nbnds ! loop over band C 
!   Symmetry index for IBZ = 1 (in ktabr(:,1)

    !$call wfd_get_ur(Wfd,iv,ik,spin,wfr1)
    !$call wfd_get_ur(Wfd,ic,ik,spin,wfr2)

    call rho_tw_g(paral_kgb,nspinor,BSp%npweps,nfftot_osc,ngfft_osc,map2sphere,use_padfft,igfftg0,gbound,&
&     wfnr(:,iv,ik),1,ktabr(:,1),ph_mkt,spinrot_k,& 
&     wfnr(:,ic,ik),1,ktabr(:,1),ph_mkt,spinrot_k,&
&     dim_rtwg,rhotwg1,tim_fourdp,MPI_enreg)

    ! === Add PAW onsite contribution === 
    if (usepaw==1) then ! FIXME Find a clever way to deal with spinors
     spad=(nspinor-1)
     i1=iv; if (nspinor==2) i1=(2*iv-1)
     i2=ic; if (nspinor==2) i2=(2*ic-1)
     call paw_rho_tw_g(Bsp%npweps,dim_rtwg,nspinor,Cryst%natom,Cryst%ntypat,Cryst%typat,Cryst%xred,Gsph_Max%gvec,&
&      Cprj_k(:,i1:i1+spad),Cprj_k(:,i2:i2+spad),Pwij_q0,rhotwg1)
    end if

    if (iv <= BSp%homo .and. ic <= BSp%homo .or. & 
&       iv >  BSp%homo .and. ic >  BSp%homo) then
     if (iv/=ic) then ! COULOMB term: C/=V: ignore them
      rhotwg1(1) = 0.0
     else  ! iv=ic
!     If q=0 and C=V then set up rho-twiddle(G=0) to reflect an
!     analytic integration of q**-2 over the volume element:
!     <q**-2> = 7.44 V**(-2/3)   (for fcc cell)
      !rhotwg1(1) = fcc_const * qpg(1,iq)
      rhotwg1(1) = SQRT(Vcp%i_sz) / Vcp%vcqlwl_sqrt(1,iq)
      !if(vcut) rhotwg1(1) = 1.0
     end if  ! iv=ic
    else
     rhotwg1(1) = (0.,0.)
    end if

    rhxtwg_q0(:,iv,ic,ik) = rhotwg1(:)
   end do ! ic
  end do ! iv
 end do ! ik 

 deallocate(gbound, STAT=istat)
 if (usepaw==1) then
  call destroy_paw_pwij(Pwij_q0); deallocate(Pwij_q0)
 end if

 call pclock(900)

! try to allocate excitonic hamiltonian

 msg='trying to allocate excitonic hamiltonian'
 call wrtout(std_out,msg,"COLL")

 allocate(bsh_k(BSp%nbndv,BSp%nbndc,BSp%nkbz,BSp%nbndv,BSp%nbndc),stat=istat)
 if (istat/=0) then 
  MSG_ERROR('buy more memory')
 end if

 write(msg,'(a,i0,a)')'memory size ', lastirec, ' sizeof(complex)'
 call wrtout(std_out,msg,"COLL")

 !allocate(bsh(lastirec),stat=istat)
 !if (istat/=0) then
 ! h_in_memory = .FALSE.
 ! msg='not enough memory, writing Bethe-Salpeter Hamiltonian to disk'
 ! call wrtout(std_out,msg,"COLL")
 !else
 ! h_in_memory = .TRUE.
 ! kstart  = 1
 ! kcstart = 1
 ! msg=' Allocated'
 ! call wrtout(std_out,msg,"COLL")
 !end if

 h_in_memory = .FALSE.

!coulomb term
!-----------------------------------------------------------------------
            
 inquire(file='in_haydock.exh', exist=have_file)
 if (have_file) BSp%COULOMBTERM=.FALSE.

 if (BSp%COULOMBTERM) then
           
  msg=" Calculating Coulomb term using full W_{GG'}"
  if (BSp%WDIAG) msg=" Calculating Coulomb term using diagonal only approximation for W_{GG'}"
  call wrtout(std_out,msg,"COLL")

  haydo_unt = get_unit()
  open(unit=haydo_unt,file='out_haydock.exh',form='unformatted')

  if (BSp%EXCHANGETERM) then  ! setup of the exchange term inside the coulomb term
   ! q = 0 -> iqbz_0
   iqbz = iqbz_0
   iq = iqbz
   iq_ibz = Qmesh%tab(iq) ; isym_q = Qmesh%tabo(iq) ; itim_q = (3-Qmesh%tabi(iq))/2
   !print *,'iq,iq_ibz,isym_q,itim_q ',iq,iq_ibz,isym_q,itim_q

   ! Set up table of |q(BZ)+G|
    !qbzpg(grottb(ig,itim_q,isym_q)) = qpg(ig,iq_ibz)
   if (iq_ibz==1) then 
    do ig=1,BSp%npwvec
     vc_sqrt_qbz(grottb(ig,itim_q,isym_q))=Vcp%vcqlwl_sqrt(ig,1)
    end do
   else 
    do ig=1,BSp%npwvec
     vc_sqrt_qbz(grottb(ig,itim_q,isym_q))=Vcp%vc_sqrt(ig,iq_ibz)
    end do
   end if
  end if     

  do ik=kstart,BSp%nkbz ! loop over k
   write(msg,'(a,i5,a,i5,a)')' status: ',ik,'/',BSp%nkbz,' completed'
   call wrtout(std_out,msg,"COLL")

   ! * Get ik_ibz, non-symmorphic phase, ph_mkt, and symmetries from ik.
   call get_BZ_item(Kmesh,ik,kbz,ik_ibz,isym_k,itim_k,ph_mkt)

   ktabr_k(:) = ktabr(:,ik)
   spinrot_k(:)=Cryst%spinrot(:,isym_k)
   !if (itim_k==2) CYCLE ! time-reversal or not       

   if (usepaw==1) then ! Load cprj for this k in the BZ.
    ! * Do not care of umklapp G0 in k-q as the phase is already included.
    shift    =nspinor*BSp%nbnds*Kmesh%nbz*(spin-1)
    indx_k_bz=nspinor*BSp%nbnds*(ik-1)+shift
    ibsp=0
    do ib=1,Bsp%nbnds
      call wfd_get_cprj(Wfd,ib,ik_ibz,spin,Cryst,Cp1,sorted=.FALSE.)
      call paw_symcprj(ik,Wfd%nspinor,1,Cryst,Kmesh,Psps,Pawtab,Pawang,Cp1) 
      ibsp = ibsp + nspinor
      call cprj_copy(Cp1,Cprj_k(:,ibsp:ibsp+(nspinor-1)))
    end do
   end if

   ! do ikp = ik, BSp%nkbz     ! loop over kp da far partire da 1
   do ikp=1,BSp%nkbz           ! loop over kp da far partire da 1
    
    ! * Get ikp_ibz, non-symmorphic phase, ph_mkpt, and symmetries from ikp.
    call get_BZ_item(Kmesh,ikp,kpbz,ikp_ibz,isym_kp,itim_kp,ph_mkpt)

    spinrot_kp(:)=Cryst%spinrot(:,isym_kp)
    ktabr_kp(:) = ktabr(:,ikp)

    if (usepaw==1) then ! Load cprj for this kp in the BZ.
      ! * Do not care of umklapp G0 in k-q as the phase is already included.
      shift     =nspinor*BSp%nbnds*Kmesh%nbz*(spin-1)
      indx_kp_bz=nspinor*BSp%nbnds*(ikp-1)+shift
      ibsp=0
      do ib=1,Bsp%nbnds
        call wfd_get_cprj(Wfd,ib,ikp_ibz,spin,Cryst,Cp2,sorted=.FALSE.)
        call paw_symcprj(ikp,Wfd%nspinor,1,Cryst,Kmesh,Psps,Pawtab,Pawang,Cp2) 
        ibsp = ibsp + nspinor
        call cprj_copy(Cp2,Cprj_kp(:,ibsp:ibsp+(nspinor-1)))
      end do
    end if
    !
    ! * Find q = K - KP - G0.
    kmk1(:) = Kmesh%bz(:,ik) - Kmesh%bz(:,ikp)
    call findqg0(iq,g0,kmk1,Qmesh%nbz,Qmesh%bz,BSp%mG0)
    !
    ! * Get the G-G0 shift for the FFT of the oscillators.
    ig01 = g0(1) + BSp%mG0(1)+1
    ig02 = g0(2) + BSp%mG0(2)+1
    ig03 = g0(3) + BSp%mG0(3)+1
    igfftg0(1:BSp%npweps) = igfft(1:BSp%npweps,ig01,ig02,ig03)
    !
    ! Evaluate the tables needed for the padded FFT performed in rhotwg. Note that we have 
    ! to pass G-G0 to sphereboundary instead of G as we need FFT results on the shifted G-sphere, 
    ! If Gamma is not inside G-G0 one has to disable FFT padding as sphereboundary will give wrong tables.
    !
    use_padfft=0      
    allocate(gmg0(3,BSp%npweps))
    do ig=1,BSp%npweps
      gmg0(:,ig) = Gsph_Max%gvec(:,ig)-g0
      if (ALL(gmg0(:,ig) == 0)) use_padfft=1
    end do
                                                                                                           
    if (fftalga_osc/=3) use_padfft=0  ! Padded FFTW3 is safe instead!
    allocate(gbound(2*mgfft_osc+8,2*use_padfft))
    if (use_padfft==1) call sphereboundary(gbound,1,gmg0,mgfft_osc,BSp%npweps)
    deallocate(gmg0)

    iq_ibz = Qmesh%tab(iq) ; isym_q = Qmesh%tabo(iq) ; itim_q = (3-Qmesh%tabi(iq))/2

    ! Symmetrize W (omega=0) 
    nomega=1
    call Epsm1_symmetrizer(iq,nomega,BSp%npweps,Er,Gsph_Max,Qmesh,.FALSE.,W) 

    Wstar(:,:) = CONJG(W(:,:,1))
    do ig = 1, BSp%npweps
      Wstardiag(ig) = Wstar(ig,ig)
    end do

!   calculate matrix-elements rhxtwg_vv and rhxtwg_cc
    if (ik==ikp) then

      do iv=BSp%lomo,BSp%homo   ! loop over band V
        do ivp=BSp%lomo,BSp%homo ! loop over band VP
          rhxtwg_vv(:,ivp,iv) = sym_rhotwgq0(itim_k,isym_k,dim_rtwg,BSp%npweps,rhxtwg_q0(:,ivp,iv,ik_ibz),Gsph_Max)
        end do
      end do

      do icp=BSp%lumo,BSp%nbnds ! loop over band C_prime
       do ic=BSp%lumo,BSp%nbnds ! loop over band C
        rhxtwg_cc(:,icp,ic) = sym_rhotwgq0(itim_k,isym_k,dim_rtwg,BSp%npweps,rhxtwg_q0(:,icp,ic,ik_ibz),Gsph_Max)
       end do
      end do

    else ! ik /= ikp

      ! === Evaluate oscillator matrix elements ===
      ! * $ <phj/r|e^{-i(q+G)}|phi/r> - <tphj/r|e^{-i(q+G)}|tphi/r> $ in packed form.
      if (usepaw==1) then
        allocate(Pwij_q(Cryst%ntypat))
        call init_paw_pwij(Pwij_q,BSp%npweps,Qmesh%bz(:,iq),Gsph_Max%gvec,Cryst%rprimd,Dtfil,Psps,Pawtab,Paw_pwff)
      end if
        
      do iv=BSp%lomo,BSp%homo    ! loop over band V
        do ivp=BSp%lomo,BSp%homo  ! loop over band VP

          !$call wfd_get_ur(Wfd,ivp,ikp_ibz,spin,wfr1)
          !$call wfd_get_ur(Wfd,iv ,ik_ibz ,spin,wfr2)

          call rho_tw_g(paral_kgb,nspinor,BSp%npweps,nfftot_osc,ngfft_osc,map2sphere,use_padfft,igfftg0,gbound,&
&           wfnr(:,ivp,ikp_ibz),itim_kp,ktabr_kp,ph_mkpt,spinrot_kp,&
&           wfnr(:,iv ,ik_ibz ),itim_k ,ktabr_k ,ph_mkt ,spinrot_k ,&
&           dim_rtwg,rhxtwg_vv(:,ivp,iv),tim_fourdp,MPI_enreg)

          ! === Add PAW onsite contribution === 
          if (usepaw==1) then ! FIXME Find a clever way to deal with spinors
            spad=(nspinor-1)
            i1=ivp; if (nspinor==2) i1=(2*ivp-1)
            i2=iv ; if (nspinor==2) i2=(2*iv-1)
            call paw_rho_tw_g(Bsp%npweps,dim_rtwg,nspinor,Cryst%natom,Cryst%ntypat,Cryst%typat,Cryst%xred,Gsph_Max%gvec,&
              Cprj_kp(:,i1:i1+spad),Cprj_k(:,i2:i2+spad),Pwij_q,rhxtwg_vv(:,ivp,iv))
          end if

        end do ! ivp
      end do ! iv

      do icp=BSp%lumo,BSp%nbnds  ! loop over band C_prime
        do ic=BSp%lumo,BSp%nbnds  ! loop over band C

          !$call wfd_get_ur(Wfd,icp,ikp_ibz,spin,wfr1)
          !$call wfd_get_ur(Wfd,ic ,ik_ibz ,spin,wfr2)

          call rho_tw_g(paral_kgb,nspinor,BSp%npweps,nfftot_osc,ngfft_osc,map2sphere,use_padfft,igfftg0,gbound,&
&           wfnr(:,icp,ikp_ibz),itim_kp,ktabr_kp,ph_mkpt,spinrot_kp,&
&           wfnr(:,ic , ik_ibz),itim_k ,ktabr_k ,ph_mkt ,spinrot_k ,&
&           dim_rtwg,rhxtwg_cc(:,icp,ic),tim_fourdp,MPI_enreg)

          ! === Add PAW onsite contribution === 
          if (usepaw==1) then ! FIXME Find a clever way to deal with spinors                                                   
           spad=(nspinor-1)
           i1=icp; if (nspinor==2) i1=(2*icp-1)
           i2=ic ; if (nspinor==2) i2=(2*ic-1)
           call paw_rho_tw_g(Bsp%npweps,dim_rtwg,nspinor,Cryst%natom,Cryst%ntypat,Cryst%typat,Cryst%xred,Gsph_Max%gvec,&
             Cprj_kp(:,i1:i1+spad),Cprj_k(:,i2:i2+spad),Pwij_q,rhxtwg_cc(:,icp,ic))
          end if

        end do ! ic
      end do ! icp

      if (usepaw==1) then   
        call destroy_paw_pwij(Pwij_q); deallocate(Pwij_q)
      end if
    end if ! ik == ikp
        
!   now calculate: sum_G,G' rhotw(G)* W(G,G') rhotw(G')
    do ic=BSp%lumo, BSp%nbnds
      irc = ic-BSp%lumo + 1
      do icp=BSp%lumo,BSp%nbnds
        ircp = icp - BSp%lumo + 1

!       sum_GG' rho_c'c*(G) W(G,G') rho_v'v(G')
!       first sum on G: sum_G rho_c'c(G) W*(G,G')    (W conjguated)
        if (BSp%WDIAG) then
          ctccp(:) = rhxtwg_cc(:,icp,ic) * Wstardiag(:)
        else
          ctccp(:) = MATMUL(rhxtwg_cc(:,icp,ic),Wstar(:,:))
        end if
            
        do iv=BSp%lomo,BSp%homo
          irv = iv - BSp%lomo + 1
              
          it = transtab(ik,irv,irc)
          if (it==0) CYCLE                    ! ir-uv-cutoff
          tenergy = trans(it)%en
              
          do ivp=BSp%lomo,BSp%homo
            irvp = ivp - BSp%lomo + 1

            itp = transtab(ikp,irvp,ircp)
            if (itp==0) CYCLE                 ! ir-uv-cutoff
            tpenergy = trans(itp)%en

            if (ABS(tenergy - tpenergy) > BSp%stripecut) CYCLE    ! stripe

!           then sum on G': sum_G' (rho_c'c(G) W*(G,G'))* rho_v'v(G')
            http = -faq * DOT_PRODUCT(ctccp(:),rhxtwg_vv(:,ivp,iv))
            !$http = -faq * xdotc(BSp%npweps,ctccp,1,rhxtwg_vv(:,ivp,iv),1)

            bsh_k(irv,irc,ikp,irvp,ircp) = http
!           write(*,'("ik,ikp,iv,ivp,ic,icp, it, itp",8I3)')ik,ikp,iv,ivp,ic,icp, it, itp
          end do ! ivp
        end do ! iv
      end do ! icp
    end do ! ic

    deallocate(gbound, STAT=istat)
   end do ! ikp

   if (BSp%EXCHANGETERM) then
    do iv=BSp%lomo,BSp%homo
     irv = iv - BSp%lomo + 1
     do ic=BSp%lumo,BSp%nbnds
      irc = ic - BSp%lumo + 1
      it = transtab(ik,irv,irc); if (it==0) CYCLE ! ir-uv-cutoff
      tenergy = trans(it)%en
!   qui mettiamo il termine rhotwg_vc che serve per il termine exchange

      !do ig=2,BSp%npweps
      ! rhotwg1(grottb(ig,itim_k,isym_k)) = rhxtwg_q0(ig,iv,ic,ik_ibz) / (qbzpg(ig)*qbzpg(ig))
      !end do
      do ig=2,BSp%npweps
       rhotwg1(grottb(ig,itim_k,isym_k)) = rhxtwg_q0(ig,iv,ic,ik_ibz) * vc_sqrt_qbz(ig) * vc_sqrt_qbz(ig) !/ (qbzpg(ig)*qbzpg(ig))
      end do

      if (itim_k==2) rhotwg1(2:) = CONJG(rhotwg1(2:)) 

      do itp=1,nh
       tpenergy = trans(itp)%en
       if (ABS(tenergy - tpenergy) > BSp%stripecut) CYCLE !stripe cutoff

       ikp = trans(itp)%k
       ivp = trans(itp)%v
       irvp=ivp-BSp%lomo+1 
       icp = trans(itp)%c
       ircp=icp-BSp%lumo+1
       ikp_ibz = Kmesh%tab(ikp)
       itim_kp = (3-Kmesh%tabi(ikp))/2
       isym_kp = Kmesh%tabo(ikp)

       do ig=2,BSp%npweps
        rhotwg2(grottb(ig,itim_kp,isym_kp)) = rhxtwg_q0(ig,ivp,icp,ikp_ibz)
       end do
       if (itim_kp==2) rhotwg2(2:) = CONJG(rhotwg2(2:)) 

       ! sum over G
       ctemp = DOT_PRODUCT(rhotwg1(2:),rhotwg2(2:))
       !$ctemp = XDOTC(BSp%npweps-1,rhotwg1(2:),1,rhotwg2(2:),1)
       ctemp = two * ctemp * faq
               
       bsh_k(irv,irc,ikp,irvp,ircp)=bsh_k(irv,irc,ikp,irvp,ircp)+ctemp
      end do !itp
     end do !ic
    end do !iv
   end if ! exchange term

   do iv=1,BSp%nbndv
    do ic=1,BSp%nbndc
     it = transtab(ik,iv,ic)
     tenergy=trans(it)%en
     bsh_k(iv,ic,ik,iv,ic)=REAL(bsh_k(iv,ic,ik,iv,ic))+tenergy
     write(haydo_unt) (( (CMPLX(bsh_k(iv,ic,ikp,ivp,icp),kind=dpc),icp=1,BSp%nbndc),ivp=1,BSp%nbndv),ikp=1,BSp%nkbz)  
!    do ikp=1,BSp%nkbz
!     do ivp=1,BSp%nbndv
!      do icp=1,BSp%nbndc
!       write(66,*) it,transtab(ikp,ivp,icp),REAL(bsh_k(iv,ic,ikp,ivp,icp)),aimag(bsh_k(iv,ic,ikp,ivp,icp))
!      end do
!     end do
!    end do
    end do
   end do 

  end do ! ik

  close(haydo_unt)

  call wrtout(std_out,' Coulomb term completed',"COLL")

  !if (h_in_memory) write(*,'(" -W(1,1) = [eV] ",f7.2)') REAL(bsh(1))*Ha_eV
  call pclock(3000)
 end if ! BSp%COULOMBTERM

 ! * Free memory.
 deallocate(W)
 deallocate(igfftg0)

 deallocate(ktabr_k,ktabr_kp)
 deallocate(rhxtwg_q0)
 deallocate(rhxtwg_vv)
 deallocate(rhxtwg_cc)
 deallocate(wfnr)
 deallocate(vc_sqrt_qbz)

 deallocate(bsh_k)

 !if (allocated(bsh)) deallocate(bsh) 
 
 if (usepaw==1) then ! Optional deallocation for PAW.
   call cprj_free(Cprj_k ); deallocate(Cprj_k )
   call cprj_free(Cprj_kp); deallocate(Cprj_kp)
   call cprj_free(Cp1); deallocate(Cp1)
   call cprj_free(Cp2); deallocate(Cp2)
 end if

 DBG_EXIT("COLL")

end subroutine cexch_haydock
!!***

