!{\src2tex{textfont=tt}}
!!****f* ABINIT/calc_vHxc_braket
!! NAME
!!  calc_vHxc_braket
!!
!! FUNCTION
!!  Evaluate the matrix elements of $v_H$ and $v_{xc}$ and $v_U$
!!  both in case of NC pseudopotentials and PAW (LDA+U, presently, is only available in PAW)
!!  The matrix elements of $v_{xc}$ are calculated with and without the core contribution.
!!  The later quantity is required in case of GW calculations.
!!
!! COPYRIGHT
!!  Copyright (C) 2008-2010 ABINIT group (MG)
!!  This file is distributed under the terms of the
!!  GNU General Public License, see ~abinit/COPYING
!!  or http://www.gnu.org/copyleft/gpl.txt .
!!
!! INPUTS
!!  Mflags
!!  Sigp<Sigma_parameters>=All parameters for the sigma calculation.
!!    %nkptgw=number of points to be calculated
!!    %npwvec=Max number of planewaves
!!  b1,b2=min and max band index to be considered.
!!  gsqcutf_eff=Fourier cutoff on G^2 for "large sphere" of radius double
!!   that of the basis sphere--appropriate for charge density rho(G),Hartree potential, and pseudopotentials
!!  Dtset <type(dataset_type)>=all input variables in this dataset
!!     %nspden= number of spin-density components
!!     %nspinor=number of spinorial components
!!     %nsppol=number of independent spin polarizations
!!  igfftf(npwwfn)=indeg of each G vector in the fine FFT mesh
!!  MPI_enreg=informations about MPI parallelization
!!  ngfftf(18)contain all needed information about 3D fine FFT, see ~abinit/doc/input_variables/vargs.htm#ngfft
!!  nfftf=number of points in the fine FFT mesh (for this processor)
!!  Kmesh<BZ_mesh_type>
!!    %nibz=number of irreducible k-points
!!    %nbnds=number of bands treated (note: does not depend on k)
!!  Pawtab(Dtset%ntypat*Dtset%usepaw) <type(pawtab_type)>=paw tabulated starting data
!!  Paw_an(natom) <type(paw_an_type)>=paw arrays given on angular mesh
!!  Pawang <type(pawang_type)>=paw angular mesh and related data
!!  Paw_ij(natom) <type(paw_ij_type)>=paw arrays given on (i,j) channels
!!  Pawfgrtab(natom) <type(pawfgrtab_type)>=atomic data given on fine rectangular grid
!!  Cprj(Cryst%natom,Dtset%nspinor*nbnds*nibz*Dtset%nsppol*Dtset%usepaw) <type(Cprj_type)
!!   projected input wave functions <Proj_i|Cnk> with all NL projectors for each k-point in the IBZ.
!!  Cryst<Crystal_structure>=unit cell and symmetries
!!     %natom=number of atoms in the unit cell
!!     %rprimd(3,3)=direct lattice vectors
!!     %ucvol=unit cell volume
!!     %ntypat= number of type of atoms
!!     %typat(natom)=type of each atom
!!  vhartr(nfftf)= Hartree potential in real space on the fine FFT mesh
!!  vxc(nfftf,Dtset%nspden)= xc potential in real space on the fine FFT grid
!!  Wfd <type (wfs_descriptor)>=Structure gathering information on the wavefunctions.
!!  rhor(nfftf,nspden)=density in real space (smooth part if PAW).
!!  rhog(2,nfftf)=density in reciprocal space (smooth part if PAW).
!!  nhatgrdim= -PAW only- 0 if nhatgr array is not used ; 1 otherwise
!!  usexcnhat= -PAW only- 1 if nhat density has to be taken into account in Vxc
!!
!! OUTPUT
!!  Mels
!!   %vxc   (b1gw:b2gw,b1gw:b2gw,nibz,Dtset%nsppol)=matrix elements of $v_{xc}[nv+nc]$.
!!   %vxcval(b1gw:b2gw,b1gw:b2gw,nibz,Dtset%nsppol)=matrix elements of $v_{xc}[nv]$.
!!   %vhartr(b1gw:b2gw,b1gw:b2gw,nibz,Dtset%nsppol)=matrix elements of $v_H$.
!!   %vu    (b1gw:b2gw,b1gw:b2gw,nibz,Dtset%nsppol)=matrix elements of $v_U$.
!!
!! SIDE EFFECTS
!!  Paw_ij= In case of self-Consistency it is changed. It will contain the new H0
!!  Hamiltonian calculated using the QP densities. The valence contribution to XC
!!  is removed.
!!
!! NOTES
!!  All the quantities ($v_H$, $v_{xc}$ and $\psi$ are evaluated on the "fine" FFT mesh.
!!  In case of calculations with pseudopotials the usual mesh is defined by ecut.
!!  For PAW calculations the dense FFT grid defined bt ecutdg is used
!!  Besides, in case of PAW, the matrix elements of V_hartree do not contain the onsite
!!  contributions due to the coulombian potentials generate by ncore and tncore.
!!  These quantities, as well as the onsite kinetic terms, are stored in Paw_ij%dij0.
!!
!! PARENTS
!!      sigma
!!
!! CHILDREN
!!      cprj_alloc,cprj_free,fft_onewfn,wfd_get_ur,herm_melements,init_melements
!!      initmpi_seq,mkkin,paw_mknewh0,rhohxc,split_work2,wrtout,xcomm_init
!!      xmaster_init,xme_init,xmpi_nproc,xsum_melements
!!
!! SOURCE


#if defined HAVE_CONFIG_H
#include "config.h"
#endif

#include "abi_common.h"

subroutine calc_vHxc_braket(Mflags,Mels,Dtset,Sigp,Kmesh,b1,b2,gsqcutf_eff,nfftf,ngfftf,igfftf,&
&  Wfd,vtrial,vhartr,vxc,Psps,Cprj,Pawtab,Paw_an,Pawang,Pawfgrtab,Paw_ij,dijexc_core,&
&  MPI_enreg,Cryst,rhor,rhog,usexcnhat,nhat,nhatgr,nhatgrdim)

 use defs_basis
 use defs_datatypes
 use defs_abitypes
 use m_errors

 use m_gwdefs,      only : sigma_parameters
 use m_blas,        only : xdotc
 use m_wfs,         only : wfd_get_ur, fft_onewfn, wfs_descriptor
 use m_crystal,     only : crystal_structure
 use m_bz_mesh,     only : bz_mesh_type
 use m_melemts,     only : init_melements, herm_melements, xsum_melements, melements_flags_type, melements_type

!This section has been created automatically by the script Abilint (TD).
!Do not modify the following lines by hand.
 use interfaces_14_hidewrite
 use interfaces_51_manage_mpi
 use interfaces_53_abiutil
 use interfaces_53_ffts
 use interfaces_56_recipspace
 use interfaces_56_xc
 use interfaces_66_paw
!End of the abilint section

 implicit none

!Arguments ------------------------------------
!scalars
 integer,intent(in) :: nhatgrdim,usexcnhat
 integer,intent(in) :: b1,b2,nfftf
 real(dp),intent(in) :: gsqcutf_eff
 type(Dataset_type),intent(in) :: Dtset
 type(wfs_descriptor),intent(inout) :: Wfd
 type(MPI_type),intent(inout) :: MPI_enreg
 type(Pawang_type),intent(in) :: Pawang
 type(Crystal_structure),intent(in) :: Cryst
 type(BZ_mesh_type),intent(in) :: Kmesh
 type(Sigma_parameters),intent(in) :: Sigp
 type(melements_flags_type),intent(in) :: Mflags
 type(melements_type),intent(out) :: Mels
!arrays
 integer,intent(in) :: ngfftf(18),igfftf(Wfd%npwwfn)
 real(dp),intent(in) :: vhartr(nfftf),vxc(nfftf,Dtset%nspden),vtrial(nfftf,Dtset%nspden)
 real(dp),intent(in) :: rhor(nfftf,Dtset%nspden),rhog(2,nfftf)
 real(dp),intent(in) :: nhat(nfftf,Dtset%nspden*Dtset%usepaw)
 real(dp),intent(in) :: nhatgr(nfftf,Dtset%nspden,3*nhatgrdim)
 !real(dp),intent(in) :: dijexc_core(cplex_dij*lmn2_size_max,ndij,Cryst%ntypat)
 real(dp),intent(in) :: dijexc_core(:,:,:)
 type(Pawtab_type),intent(in) :: Pawtab(Cryst%ntypat*Dtset%usepaw)
 type(Cprj_type),intent(in) ::  Cprj(Cryst%natom,Dtset%nspinor*Sigp%nbnds*Kmesh%nibz*Dtset%nsppol*Dtset%usepaw)
 type(Paw_an_type),intent(in) :: Paw_an(Cryst%natom)
 type(Pseudopotential_type),intent(in) :: Psps
 type(Paw_ij_type),intent(inout) :: Paw_ij(Cryst%natom)
 type(Pawfgrtab_type),intent(inout) :: Pawfgrtab(Cryst%natom)

!Local variables-------------------------------
!scalars
 integer :: iat,ii,ikc,ik_ibz,ib,jb,is,shift,ispinor
 integer :: itypat,lmn_size,j0lmn,jlmn,ilmn,klmn,klmn1,lmn2_size_max
 integer :: isppol,izero,cplex_dij
 integer :: nspinor,nsppol,nspden
 integer :: nbcalc,ntasks,rank,spaceComm,master,nprocs,irank
 integer :: isp1,isp2,iab,nsploop,nkxc,option,n3xccc_,ibsp1,ibsp2,nk3xc,mgfftf
 real(dp) :: nfftfm1,fact,DijH,enxc_val,vxcval_avg,h0dij,vxc1,vxc1_val,re_p,im_p,dijsigcx
 logical :: use_fineFFT,ltest
 character(len=500) :: msg
 type(MPI_type) :: MPI_enreg_seq
!arrays
 integer,parameter :: spinor_idxs(2,4)=RESHAPE((/1,1,2,2,1,2,2,1/),(/2,4/))
 integer :: kcalc2ibz(Sigp%nkptgw)
 integer,allocatable :: dimlmn(:),task_distrb(:,:,:,:),tmp_distrb(:)
 integer,allocatable :: istart(:),istop(:),bands_idx(:,:,:),gboundf(:,:)
 real(dp) :: tmp_xc(2,Sigp%nsig_ab),tmp_xcval(2,Sigp%nsig_ab),tmp_H(2,Sigp%nsig_ab),tmp_U(2,Sigp%nsig_ab)
 real(dp) :: tmp_h0ij(2,Sigp%nsig_ab),tmp_sigcx(2,Sigp%nsig_ab)
 real(dp) :: dijU(2)
 real(dp) :: strsxc(6),kpt(3)
 real(dp) :: vxc1ab(2),vxc1ab_val(2)
 real(dp),allocatable :: kxc_(:,:),vh_(:),xccc3d_(:),vxc_val(:,:)
 real(dp),allocatable :: kinpw(:),veffh0(:,:)
 complex(dpc) :: tmp(3)
 complex(gwpc),pointer :: wfr1up(:),wfr1dwn(:)
 complex(gwpc),pointer :: wfr2up(:),wfr2dwn(:)
 complex(gwpc),pointer :: cg1(:),cg2(:)
 complex(gwpc),target,allocatable :: wfr1(:),wfr2(:)
 complex(dpc),allocatable :: vxcab(:),vxcab_val(:),u1cjg_u2dpc(:),kinwf2(:),veffh0_ab(:)
 type(Cprj_type),allocatable ::  Cprj_b1ks(:,:),Cprj_b2ks(:,:)

! *************************************************************************

 DBG_ENTER("COLL")

 spaceComm = Wfd%comm
 rank      = Wfd%my_rank
 master    = Wfd%master
 nprocs    = Wfd%nproc

 ! * Fake MPI_type for sequential part
 call initmpi_seq(MPI_enreg_seq)

 ! === Index in the IBZ of the GW k-points ===
 ! * Only these points will be considered.
 kcalc2ibz(:)=Kmesh%tab(Sigp%kptgw2bz)

 nspinor=Dtset%nspinor
 nsppol =Dtset%nsppol
 nspden =Dtset%nspden
 if (nspinor==2) then
   MSG_ERROR(" Remember to ADD SO")
 end if

 ! === Initialize structure to store matrix elements ===
 ! * Note that minbnd and maxbnd are not spin dependent!
 allocate(bands_idx(2,Kmesh%nibz,nsppol))
 bands_idx=0
 do ikc=1,Sigp%nkptgw
   ik_ibz = kcalc2ibz(ikc)
   bands_idx(1,ik_ibz,:)=Sigp%minbnd(ikc,1)
   bands_idx(2,ik_ibz,:)=Sigp%maxbnd(ikc,1)
 end do

 call init_melements(Mels,Mflags,Dtset%nsppol,Dtset%nspden,Dtset%nspinor,Kmesh%nibz,Kmesh%ibz,bands_idx)
 deallocate(bands_idx)

 if (Mels%bmin/= b1 .or. Mels%bmax/= b2) then
   MSG_BUG("Check init_melements")
 end if
 if (Mflags%has_lexexch==1) then
   MSG_ERROR("Local EXX not coded!")
 end if
 !
 ! === Evaluate $v_\xc$ using only the valence charge ====
 msg = ' calc_vHxc_braket : calculating v_xc[n_val] (excluding non-linear core corrections) '
 call wrtout(std_out,msg,'COLL')

 do isppol=1,nsppol
   write(msg,'(a,i2,a,e16.6)')&
&    ' For spin ',isppol,' Min density rhor = ',MINVAL(rhor(:,isppol))
   call wrtout(std_out,msg,'COLL')
   if (Dtset%usepaw==1) then
     write(msg,'(a,i2,a,e16.6)')&
&      ' For spin ',isppol,' Min density nhat = ',MINVAL(nhat(:,isppol))
     call wrtout(std_out,msg,'COLL')
     write(msg,'(a,i2,a,e16.6)')&
&      ' For spin ',isppol,' Min density trho-nhat = ',MINVAL(rhor(:,isppol)-nhat(:,isppol))
     call wrtout(std_out,msg,'COLL')
     write(msg,'(a,i2)')' using usexcnhat = ',usexcnhat
     call wrtout(std_out,msg,'COLL')
   end if
 end do

 option = 0 ! Only exc, vxc, strsxc
 nkxc   = 0 ! No computation of XC kernel
 n3xccc_= 0 ! No core
 nk3xc  = 0 ! k3xc not needed
 izero  = Dtset%usepaw

 allocate(xccc3d_(n3xccc_),vh_(nfftf),kxc_(nfftf,nkxc),vxc_val(nfftf,nspden))

 call rhohxc(Dtset,enxc_val,gsqcutf_eff,izero,kxc_,MPI_enreg_seq,nfftf,ngfftf,&
& nhat,Dtset%usepaw,nhatgr,nhatgrdim,nkxc,nk3xc,nspden,n3xccc_,option,rhog,rhor,Cryst%rprimd,&
& strsxc,usexcnhat,vh_,vxc_val,vxcval_avg,xccc3d_)

 deallocate(xccc3d_,vh_,kxc_)

 write(msg,'(a,f8.4,2a,f8.4,a)')&
&  ' E_xc[n_val]  = ',enxc_val,  ' [Ha]. ',&
&  '<V_xc[n_val]> = ',vxcval_avg,' [Ha]. '
 call wrtout(std_out,msg,'COLL')

 ! === If PAW and qp-SCGW then update Paw_ij and calculate the matrix elements ===
 ! * We cannot simply rely on gwcalctyp because I need KS vxc in sigma.
 if (Dtset%usepaw==1.and.Mflags%has_hbare==1) then
   ABI_CHECK(Mflags%only_diago==0,"Wrong only_diago")

   call paw_mknewh0(nsppol,Dtset%nspden,nfftf,Dtset%pawspnorb,Dtset%pawprtvol,Cryst,Psps,&
&    Pawtab,Paw_an,Paw_ij,Pawang,Pawfgrtab,vxc,vxc_val,vtrial)

   ! * Effective potential of the bare Hamiltonian: valence term is subtracted.
   allocate(veffh0(nfftf,Dtset%nspden))
   veffh0=vtrial-vxc_val
   !veffh0=vtrial !this is to retrieve the KS Hamiltonian
 end if

 ! === Setup of the hermitian operator vxcab ===
 ! * if nspden==4 vxc contains (v^11, v^22, Re[V^12], Im[V^12].
 ! * Cannot use directly Re and Im since we also need off-diagonal elements.
 if (nspinor==2) then
   allocate(vxcab(nfftf),vxcab_val(nfftf))
   vxcab    (:)=DCMPLX(vxc    (:,3),vxc    (:,4))
   vxcab_val(:)=DCMPLX(vxc_val(:,3),vxc_val(:,4))
   if (Mflags%has_hbare==1) then
     allocate(veffh0_ab(nfftf))
     veffh0_ab(:)=DCMPLX(veffh0(:,3),veffh0(:,4))
   end if
 end if

 ! Usually FFT meshes for wavefunctions and potentials are not equal. Two approaches are possible:
 ! Either we Fourier interpolate potentials on the coarse WF mesh or we FFT the wfs on the dense mesh.
 ! At the moment the later approach is used, more CPU demanding but more accurate.
 use_fineFFT=.FALSE.
 if (ANY(Wfd%ngfft(1:3)/=ngfftf(1:3))) then
   use_fineFFT=.TRUE.
   write(msg,'(a,3i4)')' calc_vHxc_braket : using fine FFT grid ',ngfftf(1:3)
   call wrtout(std_out,msg,'COLL')
   mgfftf = MAXVAL(ngfftf(1:3))
   allocate(gboundf(2*mgfftf+8,2)) ! Have to re-calculate gboundf for the dense FFT mesh.
   call sphereboundary(gboundf,Wfd%istwfk(1),Wfd%gvec,mgfftf,Wfd%npwwfn)
 end if
 !
 ! === Allocate matrix elements of vxc[n], vxc_val[n_v], vH and vU ===
 nfftfm1=one/nfftf

 ! * hbareme contains the matrix element of the new bare Hamiltonian h0.
 if (Mflags%has_hbare==1) then
   allocate(kinpw(Sigp%npwwfn),kinwf2(nspinor*Sigp%npwwfn))
 end if

 allocate(wfr1(nfftf*nspinor),wfr2(nfftf*nspinor))
 allocate(u1cjg_u2dpc(nfftf))

 ! === Create distribution table for tasks ===
 ! * This section is parallelized inside MPI_COMM_WORLD
 !   as all processors are calling the routine with all GW wavefunctions
 nbcalc=(b2-b1+1)
 allocate(task_distrb(Sigp%nkptgw,b1:b1+nbcalc-1,b1:b1+nbcalc-1,nsppol))
 task_distrb=rank

 if (nprocs/=1) then
   if (Mflags%only_diago==1) then 
     ntasks=Sigp%nkptgw*nbcalc*nsppol
   else 
     ntasks=Sigp%nkptgw*(nbcalc*(nbcalc+1)/2)*nsppol
   end if

   allocate(tmp_distrb(ntasks))
   allocate(istart(nprocs),istop(nprocs))
   call split_work2(ntasks,nprocs,istart,istop)
   do irank=0,nprocs-1
    if (istart(irank+1)==ntasks+1) CYCLE
    tmp_distrb(istart(irank+1):istop(irank+1))=irank
   end do
   deallocate(istart,istop)
   ! * Reshape tmp_distrb, just to keep the code readable.
   task_distrb=-999; ii=0
   do ikc=1,Sigp%nkptgw
     do is=1,nsppol
       do jb=b1,b2
         do ib=b1,jb ! Upper triangle
          if (Mflags%only_diago==1.and.ib/=jb) CYCLE
          ii=ii+1
          task_distrb(ikc,ib,jb,is)=tmp_distrb(ii)
          task_distrb(ikc,jb,ib,is)=tmp_distrb(ii) !just to be consistent
         end do
       end do
     end do
   end do
   if (.not.Mflags%only_diago==1) then
     ltest=ALL(task_distrb>=0.and.task_distrb<=nprocs-1)
     ABI_CHECK(ltest,"BUG in task_distrb")
   end if
   deallocate(tmp_distrb)
 end if
 !
 ! =====================================
 ! ==== Loop over required k-points ====
 ! =====================================
 do ikc=1,Sigp%nkptgw

   if (ALL(task_distrb(ikc,:,:,:)/=rank)) CYCLE
   ik_ibz=kcalc2ibz(ikc)

   ! Calculate |k+G|^2 needed by hbareme
   !FIXME Here I have a problem if I use ecutwfn there is a bug somewhere in setshell or invars2m!
   ! ecutwfn is slightly smaller than the max kinetic energy in gvec. The 0.1 pad should partially solve the problem
   if (Mflags%has_hbare==1) then
     kpt=Kmesh%ibz(:,ik_ibz)
     call mkkin(Dtset%ecutwfn+0.1_dp,Dtset%ecutsm,Dtset%effmass,Cryst%gmet,Wfd%gvec,kinpw,kpt,Sigp%npwwfn)
     where (kinpw>HUGE(zero)*1.d-11); kinpw=zero; end where
   end if

   do is=1,nsppol
     if (ALL(task_distrb(ikc,:,:,is)/=rank)) CYCLE

     do jb=b1,b2
       if (ALL(task_distrb(ikc,:,jb,is)/=rank)) CYCLE

       if (Mflags%has_hbare==1) then
         cg2 => Wfd%Wave(jb,ikc,is)%ug  ! Wfd contains 1:nkptgw wave functions
         kinwf2(1:Wfd%npwwfn)=cg2(1:Wfd%npwwfn)*kinpw(:)
         if (nspinor==2) kinwf2(Wfd%npwwfn+1:)=cg2(Wfd%npwwfn+1:)*kinpw(:)
       end if

       if (use_fineFFT) then
         call fft_onewfn(Wfd%paral_kgb,Wfd%istwfk(ikc),nspinor,Wfd%npwwfn,nfftf,Wfd%Wave(jb,ikc,is)%ug,wfr2,&
&           igfftf,ngfftf,Wfd%gvec,gboundf,0,MPI_enreg)
       else
         call wfd_get_ur(Wfd,jb,ikc,is,wfr2)
       end if

       do ib=b1,jb ! Upper triangle

         if (task_distrb(ikc,ib,jb,is)/=rank) CYCLE

         ! * Off-diagonal elements only for QPSCGW.
         if (Mflags%only_diago==1.and.ib/=jb) CYCLE

         if (use_fineFFT) then
           call fft_onewfn(Wfd%paral_kgb,Wfd%istwfk(ikc),nspinor,Wfd%npwwfn,nfftf,Wfd%Wave(ib,ikc,is)%ug,wfr1,&
&            igfftf,ngfftf,Wfd%gvec,gboundf,0,MPI_enreg)
         else
           call wfd_get_ur(Wfd,ib,ikc,is,wfr1)
         end if

         u1cjg_u2dpc(1:nfftf)=CONJG(wfr1(1:nfftf))*wfr2(1:nfftf)

         if (Mflags%has_vxc==1)      &
&          Mels%vxc     (ib,jb,ik_ibz,is)=SUM(u1cjg_u2dpc(1:nfftf)*vxc    (1:nfftf,is))*nfftfm1

         if (Mflags%has_vxcval==1)   &
&          Mels%vxcval  (ib,jb,ik_ibz,is)=SUM(u1cjg_u2dpc(1:nfftf)*vxc_val(1:nfftf,is))*nfftfm1

         if (Mflags%has_vhartree==1) &
&          Mels%vhartree(ib,jb,ik_ibz,is)=SUM(u1cjg_u2dpc(1:nfftf)*vhartr (1:nfftf))   *nfftfm1

         if (Mflags%has_hbare==1) then
           cg1 => Wfd%Wave(ib,ikc,is)%ug(1:Wfd%npwwfn)
           Mels%hbare(ib,jb,ik_ibz,is)=  &
&            DOT_PRODUCT(cg1,kinwf2(1:Wfd%npwwfn)) + SUM(u1cjg_u2dpc(1:nfftf)*veffh0(1:nfftf,is))*nfftfm1
!&            xdotc(Wfd%npwwfn,cg1(1:),1,kinwf2(1:),1) + SUM(u1cjg_u2dpc(1:nfftf)*veffh0(1:nfftf,is))*nfftfm1
         end if

         if (nspinor==2) then !Here I can skip 21 if ib==jb
           wfr1up  => wfr1(1:nfftf)
           wfr1dwn => wfr1(nfftf+1:2*nfftf)
           wfr2up  => wfr2(1:nfftf)
           wfr2dwn => wfr2(nfftf+1:2*nfftf)

           if (Mflags%has_hbare==1) then
             cg1 => Wfd%Wave(ib,ikc,is)%ug(Wfd%npwwfn+1:)
             tmp(1)=SUM(CONJG(wfr1dwn)*veffh0(:,2)*wfr2dwn)*nfftfm1 + DOT_PRODUCT(cg1,kinwf2(Wfd%npwwfn+1:))
             tmp(2)=SUM(CONJG(wfr1dwn)*      veffh0_ab(:) *wfr2dwn)*nfftfm1
             tmp(3)=SUM(CONJG(wfr1dwn)*CONJG(veffh0_ab(:))*wfr2dwn)*nfftfm1
             Mels%hbare(ib,jb,ik_ibz,2:4)=tmp(:)
           end if

           if (Mflags%has_vxc==1) then
             tmp(1) = SUM(CONJG(wfr1dwn)*      vxc(:,2) *wfr2dwn)*nfftfm1
             tmp(2) = SUM(CONJG(wfr1up )*      vxcab(:) *wfr2dwn)*nfftfm1
             tmp(3) = SUM(CONJG(wfr1dwn)*CONJG(vxcab(:))*wfr2up )*nfftfm1
             Mels%vxc(ib,jb,ik_ibz,2:4)=tmp(:)
           end if

           if (Mflags%has_vxcval==1) then
             tmp(1) = SUM(CONJG(wfr1dwn)*      vxc_val(:,2) *wfr2dwn)*nfftfm1
             tmp(2) = SUM(CONJG(wfr1up )*      vxcab_val(:) *wfr2dwn)*nfftfm1
             tmp(3) = SUM(CONJG(wfr1dwn)*CONJG(vxcab_val(:))*wfr2up )*nfftfm1
             Mels%vxcval(ib,jb,ik_ibz,2:4)=tmp(:)
           end if

           if (Mflags%has_vhartree==1) then
             tmp(1) = SUM(CONJG(wfr1dwn)*vhartr(:)*wfr2dwn)*nfftfm1
             Mels%vhartree(ib,jb,ik_ibz,2  )=tmp(1)
             Mels%vhartree(ib,jb,ik_ibz,3:4)=czero
           end if

         end if !nspinor==2

       end do !ib
     end do !jb

   end do !is
 end do !ikc

 deallocate(wfr1,wfr2,vxc_val)
 deallocate(u1cjg_u2dpc)
 if (nspinor==2) deallocate(vxcab,vxcab_val)
 if (use_fineFFT) deallocate(gboundf)

 if (Mflags%has_hbare==1) then
   deallocate(kinpw,kinwf2)
   deallocate(veffh0)
   if (nspinor==2) deallocate(veffh0_ab)
 end if
 !
 ! ====================================
 ! ===== Additional terms for PAW =====
 ! ====================================
 if (Dtset%usepaw==1) then

   ! * Tests if needed pointers in Paw_ij are associated.
   ltest=(associated(Paw_ij(1)%dijxc).and.associated(Paw_ij(1)%dijxc_val))
   ABI_CHECK(ltest,"dijxc or dijxc_val not associated")

   !* For LDA+U
   do iat=1,Cryst%natom
     itypat=Cryst%typat(iat)
     if (Pawtab(itypat)%usepawu>0) then
       ltest=(associated(Paw_ij(iat)%dijU))
       ABI_CHECK(ltest,"LDA+U but dijU not associated")
     end if
   end do

   if (Dtset%pawspnorb>0) then
     ltest=(associated(Paw_ij(1)%dijso))
     ABI_CHECK(ltest,"dijso not associated")
   end if

   lmn2_size_max=MAXVAL(Pawtab(:)%lmn2_size)

   if (Mflags%has_sxcore==1) then
     if (     SIZE(dijexc_core,DIM=1) /= lmn2_size_max  &
&        .or. SIZE(dijexc_core,DIM=2) /= 1              &
&        .or. SIZE(dijexc_core,DIM=3) /= Cryst%ntypat ) then
       MSG_BUG("Wrong sizes in dijexc_core")
     end if
   end if

   nsploop=nspinor**2

   ! ====================================
   ! === Assemble PAW matrix elements ===
   ! ====================================
   allocate(dimlmn(Cryst%natom))
   do iat=1,Cryst%natom
     dimlmn(iat)=Pawtab(Cryst%typat(iat))%lmn_size
   end do

   allocate(Cprj_b1ks(Cryst%natom,nspinor))
   allocate(Cprj_b2ks(Cryst%natom,nspinor))
   call cprj_alloc(Cprj_b1ks,0,dimlmn)
   call cprj_alloc(Cprj_b2ks,0,dimlmn)

   ! === Loop over required k-points ===
   do ikc=1,Sigp%nkptgw
     if (ALL(task_distrb(ikc,:,:,:)/=rank)) CYCLE
     ik_ibz=kcalc2ibz(ikc)

     do is=1,nsppol
       if (ALL(task_distrb(ikc,:,:,is)/=rank)) CYCLE
       shift=nspinor*Sigp%nbnds*Kmesh%nibz*(is-1)

       do jb=b1,b2
         if (ALL(task_distrb(ikc,:,jb,is)/=rank)) CYCLE

         ! === Load projected wavefunctions for this k-point, spin and band ===
         ! * Cprj are unsorted, full correspondence with xred. See ctocprj.F90!!
         ibsp2=nspinor*Sigp%nbnds*(ik_ibz-1)+jb+shift-1
         do ispinor=1,nspinor
           ibsp2=ibsp2+1
           do iat=1,Cryst%natom
            Cprj_b2ks(iat,ispinor)%cp(:,:)=Cprj(iat,ibsp2)%cp(:,:)
           end do
         end do

         do ib=b1,jb ! Upper triangle
           if (task_distrb(ikc,ib,jb,is)/=rank) CYCLE

           ! * Off-diagonal elements only for QPSCGW.
           if (Mflags%only_diago==1.and.ib/=jb) CYCLE

           ibsp1=nspinor*Sigp%nbnds*(ik_ibz-1)+ib+shift-1
           do ispinor=1,nspinor
             ibsp1=ibsp1+1
             do iat=1,Cryst%natom
               Cprj_b1ks(iat,ispinor)%cp(:,:)=Cprj(iat,ibsp1)%cp(:,:)
             end do
           end do
           !
           ! === Get onsite matrix elements summing over atoms and channels ===
           ! * Spin is external and fixed (1,2) if collinear.
           ! * if noncollinear loop internally over the four components ab.
           tmp_xc   =zero
           tmp_xcval=zero
           tmp_H    =zero
           tmp_U    =zero
           tmp_h0ij =zero
           tmp_sigcx=zero

           do iat=1,Cryst%natom
             itypat   =Cryst%typat(iat)
             lmn_size =Pawtab(itypat)%lmn_size
             cplex_dij=Paw_ij(iat)%cplex_dij
             klmn1=1

             do jlmn=1,lmn_size
               j0lmn=jlmn*(jlmn-1)/2
               do ilmn=1,jlmn
                 klmn=j0lmn+ilmn
                 ! TODO Be careful, here I assume that the onsite terms ij are symmetric
                 ! should check the spin-orbit case!
                 fact=one ; if (ilmn==jlmn) fact=half

                 ! === Loop over four components if nspinor==2 ===
                 ! * If collinear nsploop==1
                 do iab=1,nsploop
                   isp1=spinor_idxs(1,iab)
                   isp2=spinor_idxs(2,iab)

                   re_p=  Cprj_b1ks(iat,isp1)%cp(1,ilmn) * Cprj_b2ks(iat,isp2)%cp(1,jlmn) &
&                        +Cprj_b1ks(iat,isp1)%cp(2,ilmn) * Cprj_b2ks(iat,isp2)%cp(2,jlmn) &
&                        +Cprj_b1ks(iat,isp1)%cp(1,jlmn) * Cprj_b2ks(iat,isp2)%cp(1,ilmn) &
&                        +Cprj_b1ks(iat,isp1)%cp(2,jlmn) * Cprj_b2ks(iat,isp2)%cp(2,ilmn)

                   im_p=  Cprj_b1ks(iat,isp1)%cp(1,ilmn) * Cprj_b2ks(iat,isp2)%cp(2,jlmn) &
&                        -Cprj_b1ks(iat,isp1)%cp(2,ilmn) * Cprj_b2ks(iat,isp2)%cp(1,jlmn) &
&                        +Cprj_b1ks(iat,isp1)%cp(1,jlmn) * Cprj_b2ks(iat,isp2)%cp(2,ilmn) &
&                        -Cprj_b1ks(iat,isp1)%cp(2,jlmn) * Cprj_b2ks(iat,isp2)%cp(1,ilmn)

                   ! ==================================================
                   ! === Load onsite matrix elements and accumulate ===
                   ! ==================================================
                   if (nspinor==1) then

                     if (Mflags%has_hbare==1) then ! * Get new dij of h0 and accumulate.
                       h0dij=Paw_ij(iat)%dij(klmn,is)
                       tmp_h0ij(1,iab)=tmp_h0ij(1,iab) + h0dij*re_p*fact
                       tmp_h0ij(2,iab)=tmp_h0ij(2,iab) + h0dij*im_p*fact
                     end if

                     if (Mflags%has_sxcore==1) then ! * Fock operator generated by core electrons.
                       dijsigcx = dijexc_core(klmn,1,itypat)
                       tmp_sigcx(1,iab)=tmp_sigcx(1,iab) + dijsigcx*re_p*fact
                       tmp_sigcx(2,iab)=tmp_sigcx(2,iab) + dijsigcx*im_p*fact
                     end if

                     if (Mflags%has_vxc==1) then ! * Accumulate vxc[n1+nc] + vxc[n1+tn+nc].
                       vxc1 = Paw_ij(iat)%dijxc(klmn,is)
                       tmp_xc(1,iab)=tmp_xc(1,iab) + vxc1*re_p*fact
                       tmp_xc(2,iab)=tmp_xc(2,iab) + vxc1*im_p*fact
                     end if

                     if (Mflags%has_vxcval==1) then ! * Accumulate valence-only XC.
                       vxc1_val=Paw_ij(iat)%dijxc_val(klmn,is)
                       tmp_xcval(1,1)=tmp_xcval(1,1) + vxc1_val*re_p*fact
                       tmp_xcval(2,1)=tmp_xcval(2,1) + vxc1_val*im_p*fact
                     end if

                     if (Mflags%has_vhartree==1) then ! * Accumulate Hartree term of the PAW Hamiltonian.
                       DijH=Paw_ij(iat)%dijhartree(klmn)
                       tmp_H(1,1)=tmp_H(1,1) + DijH*re_p*fact
                       tmp_H(2,1)=tmp_H(2,1) + DijH*im_p*fact
                     end if

                     ! * Accumulate U term of the PAW Hamiltonian (only onsite AE contribution)
                     if (Mflags%has_vu==1) then
                       if (Pawtab(itypat)%usepawu>0) then
                         dijU(1)=Paw_ij(iat)%dijU(klmn,is)
                         tmp_U(1,1)=tmp_U(1,1) + dijU(1)*re_p*fact
                         tmp_U(2,1)=tmp_U(2,1) + dijU(1)*im_p*fact
                       end if
                     end if

                   else ! Spinorial case ===

                     ! FIXME H0 + spinor not implemented
                     if (Mflags%has_hbare==1.or.Mflags%has_sxcore==1) then
                       MSG_ERROR("not implemented")
                     end if

                     if (Mflags%has_vxc==1) then ! * Accumulate vxc[n1+nc] + vxc[n1+tn+nc].
                       vxc1ab(1) = Paw_ij(iat)%dijxc(klmn1,  iab)
                       vxc1ab(2) = Paw_ij(iat)%dijxc(klmn1+1,iab)
                       tmp_xc(1,iab) = tmp_xc(1,iab) + (vxc1ab(1)*re_p - vxc1ab(2)*im_p)*fact
                       tmp_xc(2,iab) = tmp_xc(2,iab) + (vxc1ab(2)*re_p + vxc1ab(1)*im_p)*fact
                     end if

                     if (Mflags%has_vxcval==1) then ! * Accumulate valence-only XC.
                       vxc1ab_val(1) = Paw_ij(iat)%dijxc_val(klmn1,  iab)
                       vxc1ab_val(2) = Paw_ij(iat)%dijxc_val(klmn1+1,iab)
                       tmp_xcval(1,iab) = tmp_xcval(1,iab) + (vxc1ab_val(1)*re_p - vxc1ab_val(2)*im_p)*fact
                       tmp_xcval(2,iab) = tmp_xcval(2,iab) + (vxc1ab_val(2)*re_p + vxc1ab_val(1)*im_p)*fact
                     end if

                     ! * In GW, dijhartree is always real.
                     if (Mflags%has_vhartree==1) then ! * Accumulate Hartree term of the PAW Hamiltonian.
                       if (iab==1.or.iab==2) then
                         DijH = Paw_ij(iat)%dijhartree(klmn)
                         tmp_H(1,iab) = tmp_H(1,iab) + DijH*re_p*fact
                         tmp_H(2,iab) = tmp_H(2,iab) + DijH*im_p*fact
                       end if
                     end if

                     ! TODO "ADD LDA+U and SO"
                     ! check this part

                     if (Mflags%has_vu==1) then
                       if (Pawtab(itypat)%usepawu>0) then ! * Accumulate the U term of the PAW Hamiltonian (only onsite AE contribution)
                         dijU(1)=Paw_ij(iat)%dijU(klmn1  ,iab)
                         dijU(2)=Paw_ij(iat)%dijU(klmn1+1,iab)
                         tmp_U(1,iab) = tmp_U(1,iab) + (dijU(1)*re_p - dijU(2)*im_p)*fact
                         tmp_U(2,iab) = tmp_U(2,iab) + (dijU(2)*re_p + dijU(1)*im_p)*fact
                       end if
                     end if

                   end if
                 end do !iab

                 klmn1=klmn1+cplex_dij

               end do !ilmn
             end do !jlmn
           end do !iat
           !
           ! ========================================
           ! ==== Add to plane wave contribution ====
           ! ========================================
           if (nspinor==1) then

             if (Mflags%has_hbare==1)    &
&              Mels%hbare   (ib,jb,ik_ibz,is) = Mels%hbare   (ib,jb,ik_ibz,is) + DCMPLX(tmp_h0ij(1,1),tmp_h0ij(2,1))

             if (Mflags%has_vxc==1)      &
&              Mels%vxc     (ib,jb,ik_ibz,is) = Mels%vxc     (ib,jb,ik_ibz,is) + DCMPLX(tmp_xc(1,1),tmp_xc(2,1))

             if (Mflags%has_vxcval==1)   &
&              Mels%vxcval  (ib,jb,ik_ibz,is) = Mels%vxcval  (ib,jb,ik_ibz,is) + DCMPLX(tmp_xcval(1,1),tmp_xcval(2,1))

             if (Mflags%has_vhartree==1) &
&              Mels%vhartree(ib,jb,ik_ibz,is) = Mels%vhartree(ib,jb,ik_ibz,is) + DCMPLX(tmp_H (1,1),tmp_H (2,1))

             if (Mflags%has_vu==1)       &
&              Mels%vu      (ib,jb,ik_ibz,is) = DCMPLX(tmp_U(1,1),tmp_U(2,1))

             if (Mflags%has_sxcore==1)   &
&              Mels%sxcore  (ib,jb,ik_ibz,is) = DCMPLX(tmp_sigcx(1,1),tmp_sigcx(2,1))

           else

             if (Mflags%has_hbare==1)    &
&              Mels%hbare   (ib,jb,ik_ibz,:) = Mels%hbare(ib,jb,ik_ibz,:) + DCMPLX(tmp_h0ij(1,:),tmp_h0ij(2,:))

             if (Mflags%has_vxc==1)      &
&              Mels%vxc     (ib,jb,ik_ibz,:) = Mels%vxc   (ib,jb,ik_ibz,:) + DCMPLX(tmp_xc(1,:),tmp_xc(2,:))

             if (Mflags%has_vxcval==1)   &
&              Mels%vxcval  (ib,jb,ik_ibz,:) = Mels%vxcval(ib,jb,ik_ibz,:) + DCMPLX(tmp_xcval(1,:),tmp_xcval(2,:))

             if (Mflags%has_vhartree==1) &
&              Mels%vhartree(ib,jb,ik_ibz,:) = Mels%vhartree(ib,jb,ik_ibz,:) + DCMPLX(tmp_H (1,:),tmp_H (2,:))

             if (Mflags%has_vu==1)       &
&              Mels%vu      (ib,jb,ik_ibz,:) = DCMPLX(tmp_U(1,:),tmp_U(2,:))
           end if

         end do !ib
       end do !jb

     end do !is
   end do !ikc

   deallocate(dimlmn)
   call cprj_free(Cprj_b1ks); deallocate(Cprj_b1ks)
   call cprj_free(Cprj_b2ks); deallocate(Cprj_b2ks)

 end if !PAW

 deallocate(task_distrb)

 ! === Sum up contributions on each node ===
 ! * Set the corresponding has_* flags to 2.
 call xsum_melements(Mels,spaceComm)

 ! * Reconstruct lower triangle.
 call herm_melements(Mels)

 DBG_EXIT("COLL")

end subroutine calc_vHxc_braket
!!***

!----------------------------------------------------------------------

!!****f* ABINIT/calc_wf_qp
!! NAME
!! calc_wf_qp
!!
!! FUNCTION
!!  Calculate QP amplitudes in real or reciprocal space starting from the 
!!  KS wavefunctions and the corresponding expansion coefficients.
!!  Take into account possible spreading of bands.
!!
!! INPUTS
!!  b1gw, b2gw = Min and max band index over k-point and spin for GW corrections.
!!  nkibz=number of k-points.
!!  nsize= number of points in real space or number of G vectors.
!!  nsppol=number of spin.
!!  nbnds=number of bands in the present GW calculation.
!!  my_minb, my_maxb = Indeces of the bands treated by this processor.
!!  m_lda_to_qp(nbnds,nbnds,nkibz,nsppol)= expansion of the QP amplitudes in terms of KS wavefunctions.
!!
!! OUTPUT
!!  wf(nsize,my_minb:my_maxb,nkibz,nsppol)= Updated QP amplitudes for this processor.
!!
!! TODO 
!!  Pass the object: avoid possible copy and moreover one can call wfd_reset_ur
!!
!! PARENTS
!!      screening,sigma
!!
!! CHILDREN
!!
!! SOURCE
!!

subroutine calc_wf_qp(MPI_enreg,gwpara,nkibz,nbnds,nsize,nsppol,nspinor,&
& m_lda_to_qp,my_minb,my_maxb,b1gw,b2gw,wf)

 use defs_basis
 use m_xmpi
 use m_gwdefs,       only : czero_gw, cone_gw
 use defs_abitypes
 use m_errors

 use m_blas,         only : xgemm

!This section has been created automatically by the script Abilint (TD).
!Do not modify the following lines by hand.
 use interfaces_14_hidewrite
 use interfaces_51_manage_mpi
!End of the abilint section

 implicit none

!Arguments ------------------------------------
!scalars
 integer,intent(in) :: nbnds,nkibz,nsize,nsppol,nspinor,my_minb,my_maxb,b1gw,b2gw,gwpara
 type(MPI_type),intent(inout) :: MPI_enreg
!arrays
 complex(dpc),intent(in) :: m_lda_to_qp(nbnds,nbnds,nkibz,nsppol)
 complex(gwpc),intent(inout) :: wf(nsize*nspinor,my_minb:my_maxb,nkibz,nsppol)

!Local variables-------------------------------
!scalars
 integer :: band,ik_ibz,spin,ierr,lowerb,upperb,rangeb,ispinor,spad
 integer :: spaceComm,sizegw,my_rank
 character(len=500) :: msg
!arrays
 complex(gwpc),allocatable :: umat_k(:,:)
 complex(gwpc),allocatable :: wf_ks(:,:),wf_qp(:,:)
!************************************************************************

 DBG_ENTER("COLL")

 call xcomm_init(MPI_enreg,spaceComm)
 call xme_init  (MPI_enreg,my_rank)
 !
 ! === Determine the range of bands this processor has to treat ===
 lowerb=0  ! NO overlap between [b1gw,b2gw] and [my_minb,my_maxb]
 if (b1gw<=my_maxb) lowerb=MAX(b1gw,my_minb)
 upperb=0  ! NO overlap between [b1gw,b2gw] and [my_minb,my_maxb]
 if (b2gw>=my_minb) upperb=MIN(b2gw,my_maxb)
 rangeb=0
 if (lowerb/=0.and.upperb/=0) rangeb=upperb-lowerb+1
 sizegw=b2gw-b1gw+1

 if (rangeb/=0) then
   write(msg,'(2a,i3,3a,i3,1x,i3,a,3i3,2a,3i3)')ch10,&
&    ' proc. ',my_rank,' will update its wavefunctions ',ch10,&
&    ' my_bands indeces: ',my_minb,my_maxb,' gwrange: ',b1gw,b2gw,sizegw,ch10,&
&    ' lowerb, upperb, rangeb: ',lowerb,upperb,rangeb
   call wrtout(std_out,msg,'PERS') 
 end if

 allocate(umat_k(lowerb:upperb,b1gw:b2gw))
 allocate(wf_qp(nsize*nspinor,b1gw:b2gw))  
 allocate(wf_ks(nsize,lowerb:upperb))
 wf_qp(:,:)=czero_gw ; wf_ks(:,:)=czero_gw 
 !
 ! === Calculate : $\Psi^{QP}_{r,b} = \sum_n \Psi^{KS}_{r,n} M_{n,b}$ ===
 do spin=1,nsppol
   do ik_ibz=1,nkibz

     umat_k(:,:)=m_lda_to_qp(lowerb:upperb,b1gw:b2gw,ik_ibz,spin)
     wf_qp(:,:)=czero_gw

     if (rangeb/=0) then
       do ispinor=1,nspinor
         spad=nsize*(ispinor-1)
         wf_ks(:,lowerb:upperb)=wf(spad+1:spad+nsize,lowerb:upperb,ik_ibz,spin)

         call XGEMM('N','N',nsize,sizegw,rangeb,cone_gw,wf_ks(:,lowerb:upperb),nsize,&
&          umat_k,rangeb,czero_gw,wf_qp(spad+1:spad+nsize,b1gw:b2gw),nsize)

       end do
     end if
     !
     ! =======================================
     ! === Update the input wave functions ===
     ! =======================================
     !
     SELECT CASE (gwpara)

     CASE (0,1)
       ! == Each node has the full set ==
       wf(:,b1gw:b2gw,ik_ibz,spin)=wf_qp(:,b1gw:b2gw)

     CASE (2)
       ! == Bands are spreaded across the nodes ==
       ! * Sum up all the partial QP amplitudes.
       ! * Keep the band in memory only if you are the right processor.
       call xsum_mpi(wf_qp(:,b1gw:b2gw),spaceComm,ierr)
       do band=b1gw,b2gw
         if (my_rank==MPI_enreg%proc_distrb(ik_ibz,band,spin)) wf(:,band,ik_ibz,spin)=wf_qp(:,band)
       end do

     CASE DEFAULT
       write(msg,'(a,i3,a)')' gwpara = ',gwpara,' not allowed '
       MSG_BUG(msg)
     END SELECT

   end do !ik_ibz
 end do !spin

 deallocate(umat_k)
 deallocate(wf_ks,wf_qp)

 !$call wfd_reset_ur(Wf_info)

 DBG_EXIT("COLL")

end subroutine calc_wf_qp
!!***

!----------------------------------------------------------------------

!!****f* ABINIt/calc_wf_qp_Wfval
!! NAME
!! calc_wf_qp_Wfval
!!
!! FUNCTION
!!  Calculate QP amplitudes in real or reciprocal space starting from the 
!!  KS wavefunctions and the corresponding expansion coefficients,
!!  Consider the case of two separated sets of wavefunctions: Wf and Wfval
!!
!! INPUTS
!!  b1gw, b2gw = Min and max band index over k-point and spin for GW corrections.
!!  nkibz=number of k-points.
!!  nsize= number of points in real space or number of G vectors.
!!  nsppol=number of spin.
!!  nbnds=number of bands in the present GW calculation.
!!  my_minb, my_maxb = Indeces of the bands treated by this processor.
!!  m_lda_to_qp(nbnds,nbnds,nkibz,nsppol)= expansion of the QP amplitudes in terms of KS wavefunctions.
!!
!! OUTPUT
!!  wf(nsize,my_minb:my_maxb,nkibz,nsppol)= Updated QP amplitudes for this processor.
!!
!! PARENTS
!!      screening
!!
!! CHILDREN
!!
!! SOURCE
!!

subroutine calc_wf_qp_Wfval(MPI_enreg,gwpara,nkibz,nbnds,nsize,nsppol,&
& m_lda_to_qp,my_minb,my_maxb,b1gw,b2gw,wf,nbvw,wfval)

 use defs_basis
 use m_gwdefs,       only : czero_gw, cone_gw
 use defs_abitypes
 use m_errors
 use m_xmpi

 use m_blas,         only : xgemm

!This section has been created automatically by the script Abilint (TD).
!Do not modify the following lines by hand.
 use interfaces_14_hidewrite
 use interfaces_51_manage_mpi
!End of the abilint section

 implicit none

!Arguments ------------------------------------
!scalars
 integer,intent(in) :: nbnds,nkibz,nsize,nsppol,my_minb,my_maxb,b1gw,b2gw,nbvw,gwpara
 type(MPI_type),intent(inout) :: MPI_enreg
!arrays
 complex(dpc),intent(in) :: m_lda_to_qp(nbnds,nbnds,nkibz,nsppol)
 complex(gwpc),intent(inout) :: wf(nsize,my_minb:my_maxb,nkibz,nsppol)
 complex(gwpc),intent(inout) :: wfval(nsize,nbvw,nkibz,nsppol)

!Local variables-------------------------------
!scalars
 integer :: band,ik_ibz,spin,ierr,lowerb,upperb,rangeb,b1gw_c,spaceComm,sizegw,my_rank
 character(len=500) :: msg
!arrays
 complex(gwpc),allocatable :: umat_k(:,:)
 complex(gwpc),allocatable :: wf_qp(:,:),wf_qp_valval(:,:),wf_qp_val(:,:),wf_qp_valcon(:,:)
!************************************************************************

 DBG_ENTER("COLL")

 call xcomm_init(MPI_enreg,spaceComm)
 call xme_init  (MPI_enreg,my_rank)
 !
 ! === Determine the range of bands this processor has to treat ===
 b1gw_c=MAX(b1gw,nbvw+1) ! Avoid double counting of the valence bands
 lowerb=0  ! NO overlap between [b1gw,b2gw] and [my_minb,my_maxb]
 if (b1gw_c<=my_maxb) lowerb=MAX(b1gw_c,my_minb)
 upperb=0  ! NO overlap between [b1gw,b2gw] and [my_minb,my_maxb]
 if (b2gw>=my_minb) upperb=MIN(b2gw,my_maxb)
 rangeb=0
 if (lowerb/=0.and.upperb/=0) rangeb=upperb-lowerb+1
 sizegw=b2gw-b1gw_c+1
 
 if (rangeb>0) then
   write(msg,'(2a,i3,3a,i3,1x,i3,a,3i3,2a,3(i3,1x))')ch10,&
&    ' proc ',my_rank,' will update its wavefunctions ',ch10,&
&    ' my_bands indeces: ',my_minb,my_maxb,' gwrange: ',b1gw_c,b2gw,sizegw,ch10,&
&    ' lowerb, upperb, rangeb: ',lowerb,upperb,rangeb
   call wrtout(std_out,msg,'PERS') 
 end if
 
 allocate(wf_qp_valval(nsize,nbvw))
 allocate(wf_qp_val   (nsize,nbvw))  

 if (sizegw>0) then
   allocate(wf_qp_valcon(nsize,b1gw_c:b2gw))  
   allocate(wf_qp(nsize,b1gw_c:b2gw))  
 end if
 !
 ! === Calculate : $\Psi^{QP}_{r,b} = \sum_n \Psi^{KS}_{r,n} M_{n,b}$ ===
 do spin=1,nsppol
   do ik_ibz=1,nkibz
    !
    ! I) Treat the valence bands
    !
    wf_qp_valval(:,:)=czero_gw ; wf_qp_val(:,:)=czero_gw 
    allocate(umat_k(nbvw,nbvw))
    umat_k(:,:)=m_lda_to_qp(1:nbvw,1:nbvw,ik_ibz,spin)
 
    call XGEMM('N','N',nsize,nbvw,nbvw,cone_gw,wfval(:,1:nbvw,ik_ibz,spin),nsize,&
&     umat_k,nbvw,czero_gw,wf_qp_valval(:,1:nbvw),nsize)

    deallocate(umat_k)
   
    if (rangeb>0) then
      allocate(umat_k(lowerb:upperb,1:nbvw))
      umat_k(:,:)=m_lda_to_qp(lowerb:upperb,1:nbvw,ik_ibz,spin)

      call XGEMM('N','N',nsize,nbvw,rangeb,cone_gw,wf(:,lowerb:upperb,ik_ibz,spin),nsize,&
&       umat_k,rangeb,czero_gw,wf_qp_val(:,1:nbvw),nsize)

      deallocate(umat_k)
    end if

    if (gwpara==2) then 
      ! Bands are spreaded among processors:
      ! * Sum up all the partial QP amplitudes.
      ! * Keep the band in memory only if you are the right processor.
      call xsum_mpi(wf_qp_val(:,1:nbvw),spaceComm,ierr)
      wf_qp_valval(:,1:nbvw)=wf_qp_valval(:,1:nbvw) + wf_qp_val(:,1:nbvw)
    else
      ! Each node has the full set
      wf_qp_valval(:,1:nbvw)=wf_qp_valval(:,1:nbvw) + wf_qp_val(:,1:nbvw)
    end if
    !
    ! II) Treat the NON-valence bands
    !
    if (sizegw>0) then
      wf_qp_valcon=czero_gw
      wf_qp       =czero_gw
      allocate(umat_k(1:nbvw,b1gw_c:b2gw))
      umat_k(:,:)=m_lda_to_qp(1:nbvw,b1gw_c:b2gw,ik_ibz,spin)
 
      call XGEMM('N','N',nsize,sizegw,nbvw,cone_gw,wfval(:,1:nbvw,ik_ibz,spin),nsize,&
&       umat_k,nbvw,czero_gw,wf_qp_valcon(:,b1gw_c:b2gw),nsize)

      deallocate(umat_k)
 
      if (rangeb>0) then
        allocate(umat_k(lowerb:upperb,b1gw_c:b2gw))
        umat_k(:,:)=m_lda_to_qp(lowerb:upperb,b1gw_c:b2gw,ik_ibz,spin)

        call XGEMM('N','N',nsize,sizegw,rangeb,cone_gw,wf(:,lowerb:upperb,ik_ibz,spin),nsize,&
&         umat_k,rangeb,czero_gw,wf_qp(:,b1gw_c:b2gw),nsize)

        deallocate(umat_k)
      end if
      !
      ! === Update the input wave functions ===
      if (gwpara==2) then 
        ! Bands are spreaded among processors:
        ! * Sum up all the partial QP amplitudes.
        ! * Keep the band in memory only if you are the right processor.
        call xsum_mpi(wf_qp(:,b1gw_c:b2gw),spaceComm,ierr)
        do band=b1gw_c,b2gw
         if (my_rank==MPI_enreg%proc_distrb(ik_ibz,band,spin)) wf(:,band,ik_ibz,spin)=wf_qp(:,band)+wf_qp_valcon(:,band)
        end do
      else
        ! Each node has the full set
        wf(:,b1gw_c:b2gw,ik_ibz,spin)=wf_qp_valcon(:,b1gw_c:b2gw)+wf_qp(:,b1gw_c:b2gw)
      end if
 
    end if !sizegw>0
 
    wfval(:,:,ik_ibz,spin)=wf_qp_valval(:,:)
    wf   (:,my_minb:b1gw_c-1,ik_ibz,spin)=wf_qp_valval(:,my_minb:b1gw_c-1)
 
   end do !ik_ibz
 end do !spin

 if (allocated(wf_qp       )) deallocate(wf_qp       )
 if (allocated(wf_qp_valval)) deallocate(wf_qp_valval)
 if (allocated(wf_qp_val   )) deallocate(wf_qp_val   )
 if (allocated(wf_qp_valcon)) deallocate(wf_qp_valcon)

 call xbarrier_mpi(spaceComm)

 DBG_EXIT("COLL")

end subroutine calc_wf_qp_Wfval
!!***
!{\src2tex{textfont=tt}}
!!****f* ABINIT/assemblychi0q0_sym
!! NAME
!! assemblychi0q0_sym
!!
!! FUNCTION
!! Update the independent particle susceptibility at q==0 for the contribution
!! of one pair of occupied-unoccupied band, for each frequency.
!! This routine takes advantage of the symmetries of the little group of the external q-point
!! to symmetrize the contribution arising from the input k-point located in the IBZ_q.
!! It computes:
!!
!!   $ \chi_0(G1,G2,io) = \chi_0(G1,G2,io)+\sum_S (rhotwg(G1)*rhotwg^\dagger(G2))*green_w(io) $
!!
!! where S is a symmetry in reciprocal space.
!! The matrix elements of the gradient operator and [V_{nl},r] are symmetrized as well.
!!
!! COPYRIGHT
!! Copyright (C) 1999-2010 ABINIT group (MG)
!! This file is distributed under the terms of the
!! GNU General Public License, see ~abinit/COPYING
!! or http://www.gnu.org/copyleft/gpl.txt .
!! For the initials of contributors, see ~abinit/doc/developers/contributors.txt .
!!
!! INPUTS
!!  ik_bz=Index of the k-point whose contribution has to be added to chi0.
!!  isym_kbz=Index of the symmetry such that k_bz = IS k_ibz
!!  itim_kbz=2 if time-reversal has to be used to obtain k_bz, 1 otherwise.
!!  npwepG0=Maximum number of G vectors
!!  nqlwl=Number of small q-points used to deal with the optical limit.
!!  qlwl(3,nqlwl)=reciprocal space coordinates of the q-point for long-wavelength limit treatment.
!!  rhotwg(npwepG0*nspinor**2)=Oscillator matrix elements corresponding to an occupied-unoccupied pair of states.
!!  rhotwx(3,nspinor**2)=Matrix element of the operator -i[H,r]/(e1-e2) in reciprocal lattice units.
!!  green_w(nomega)=Frequency dependent part of the Green function.
!!  Ltg_q<little_group_type>=Info on the little group associated to the external q-point.
!!    %timrev=2 it time-reversal is used, 1 otherwise.
!!    %nsym_sg=Number of space group symmetries.
!!    %wtksym(2,nsym,nkbz)=1 if the symmetry (with or without time-reversal) must be considered for this k-point.
!!  Gsph_epsG0<Gvectors_type> Information on the "enlarged" G-sphere used for chi0, it contains umklapp G0 vectors
!!    %ng=number of G vectors in the enlarged sphere, actually MUST be equal to the size of rhotwg.
!!    %rottbm1(ng,2,nsym)=index of (IR)^{-1} G where I is the identity or the inversion.
!!    %phmGt(ng,nsym)=phase factors associated to non-simmorphic operations.
!!  Cryst<Crystal_structure>=Structure defining the unit cell and its symmetries
!!    %nsym=Number of symmetries.
!!    %symrec(3,3,nsym)=Symmetry operation in reciprocal space (reduced coordinates)
!!  Ep<Epsilonm1_parameters>=Parameters of the chi0 calculation.
!!     %npwe=number of plane waves in chi0.
!!     %symchi=1 if symmetrization has to be performed.
!!     %nomega=number of frequencies in chi0.
!!
!! OUTPUT
!!  (see side effects)
!!
!! SIDE EFFECTS
!!  chi0(npwe,npwe,nomega)= Updated independent-particle susceptibility matrix in reciprocal space at q==0.
!!  lwing(Ep%npwe*Ep%nI,Ep%nomega,3)=Lower wing (calculated only if nqlwl > 1 )
!!  uwing(Ep%npwe*Ep%nJ,Ep%nomega,3)=Upper wing (calculated only if nqlwl > 1 )
!!
!! NOTES
!!
!!  1) Symmetrization of the oscilator matrix elements.
!!    If  Sq = q then  M_G( Sk,q)= e^{-i(q+G).\tau} M_{ S^-1G}  (k,q)
!!    If -Sq = q then  M_G(-Sk,q)= e^{-i(q+G).\tau} M_{-S^-1G}^*(k,q)
!!
!!    In the case of umklapps:
!!    If  Sq = q+G0 then  M_G( Sk,q)= e^{-i(q+G).\tau} M_{ S^-1(G-G0}   (k,q)
!!    If -Sq = q+G0 then  M_G(-Sk,q)= e^{-i(q+G).\tau} M_{-S^-1(G-G0)}^*(k,q)
!!
!!  In the equation below there is no need to take into account the phases due to q.t
!!  as they cancel each other in the scalar product ==> only phmGt(G,isym)=e^{-iG.\tau} is needed.
!!
!!  2) Symmetrization of the matrix elements of the position operator.
!!
!!    <Sk,b|\vec r| Sk,b'> = <k b| R\vec r + \tau|k b'> 
!! 
!!     where S is one of the symrec operation, R and \tau is the corresponding
!!     operation in real space. The term involving the fractional translation is zero provided that b /= b'.
!!
!! PARENTS
!!      cchi0q0
!!
!! CHILDREN
!!      cgerc,dosym,matrginv,mkrhotwg_sigma,zgerc
!!
!! SOURCE

#if defined HAVE_CONFIG_H
#include "config.h"
#endif

#include "abi_common.h"

subroutine assemblychi0q0_sym(nqlwl,qlwl,ik_bz,isym_kbz,itim_kbz,gwcomp,nspinor,npwepG0,Ep,Cryst,Ltg_q,Gsph_epsG0,&
& chi0,rhotwx,rhotwg,green_w,green_enhigh_w,deltaf_b1b2,lwing,uwing)

 use defs_basis
 use defs_datatypes
 use m_errors

 use m_gwdefs,   only : GW_TOL_DOCC, czero_gw, epsilonm1_parameters
 use m_blas,     only : xgerc
 use m_geometry, only : vdotw
 use m_crystal,  only : crystal_structure
 use m_gsphere,  only : gvectors_type
 use m_bz_mesh,  only : little_group

!This section has been created automatically by the script Abilint (TD).
!Do not modify the following lines by hand.
 use interfaces_32_util
 use interfaces_68_gw, except_this_one => assemblychi0q0_sym
!End of the abilint section

 implicit none

!Arguments ------------------------------------
!scalars
 integer,intent(in) :: ik_bz,isym_kbz,itim_kbz
 integer,intent(in) :: npwepG0,nqlwl,nspinor,gwcomp
 real(dp),intent(in) :: deltaf_b1b2
 type(Little_group),intent(in) :: Ltg_q
 type(Gvectors_type),intent(in) :: Gsph_epsG0 
 type(Crystal_structure),intent(in) :: Cryst
 type(Epsilonm1_parameters),intent(in) :: Ep
!arrays
 real(dp),intent(in) :: qlwl(3,nqlwl)
 complex(gwpc),intent(inout) :: rhotwg(npwepG0*nspinor**2)
 complex(gwpc),intent(in) :: rhotwx(3,nspinor**2)
 complex(gwpc),intent(inout) :: chi0(Ep%npwe*Ep%nI,Ep%npwe*Ep%nJ,Ep%nomega)
 complex(dpc),intent(in) :: green_w(Ep%nomega),green_enhigh_w(Ep%nomega)
 complex(dpc),intent(inout) :: lwing(Ep%npwe*Ep%nI,Ep%nomega,3)
 complex(dpc),intent(inout) :: uwing(Ep%npwe*Ep%nJ,Ep%nomega,3)

!Local variables-------------------------------
!scalars
 integer :: itim,io,isym,igp,ig,iqlwl
 integer :: jj,ii,s_jj,pad_jj,pad_ii
 complex(gwpc) :: dd,mqg0,mqg0_sym,rhotwg0_bkp
 character(len=500) :: msg
!arrays
 integer,pointer :: Sm1G(:) 
 real(dp) :: opinv(3,3),qrot(3)
 real(dp) :: b1(3),b2(3),b3(3)
 complex(gwpc),allocatable :: rhotwg_sym(:),rhotwg_I(:),rhotwg_J(:) 
 complex(gwpc),allocatable :: rhotwg_sym_star(:),rhotwg_star(:)
 complex(gwpc),pointer :: phmGt(:)

!************************************************************************

 b1=two_pi*Gsph_epsG0%gprimd(:,1)
 b2=two_pi*Gsph_epsG0%gprimd(:,2)
 b3=two_pi*Gsph_epsG0%gprimd(:,3)

 SELECT CASE (Ep%symchi)
 
 CASE (0) ! Do not use symmetries.

   if (nspinor==1) then
    ! * Accumulate over the full BZ i.e 
    !    chi0(G1,G2,io) = chi0(G1,G2,io) + (rhotwg(G1)*CONJG(rhotwg(G2)))*green_w(io)
    ! * The non-analytic term is symmetrized for this k-point in the BZ according to:
    !    rhotwg(1)= S^-1q * rhotwx_ibz
    !    rhotwg(1)=-S^-1q * CONJG(rhotwx_ibz) if time-reversal is used.
    opinv(:,:)=REAL(Cryst%symrec(:,:,isym_kbz),dp)
    call matrginv(opinv,3,3) 
    qrot = (3-2*itim_kbz) * MATMUL(opinv,qlwl(:,1))

    rhotwg(1)=dotproductqrc(qrot,rhotwx(:,1),b1,b2,b3) !TODO get rid of this
    if (itim_kbz==2) rhotwg(1)=CONJG(rhotwg(1))

    if (gwcomp==1) then ! Leave the head and wings uncorrected (does not matter much)
      if (ABS(deltaf_b1b2) < GW_TOL_DOCC) rhotwg(1)=czero_gw
      do igp=1,Ep%npwe
        chi0(1,igp,:) = chi0(1,igp,:) + rhotwg(1) *CONJG(rhotwg(igp))*green_enhigh_w(:)
      end do
      do ig=2,Ep%npwe
        chi0(ig,1,:)  = chi0(ig,1,:)  + rhotwg(ig)*CONJG(rhotwg(1))  *green_enhigh_w(:)
      end do
    end if

    ! Multiply elements G1,G2 of rhotwg_sym by green_w(io) and accumulate in chi0(G1,G2,io)
    do io=1,Ep%nomega
      dd=green_w(io) 
      call XGERC(Ep%npwe,Ep%npwe,dd,rhotwg,1,rhotwg,1,chi0(:,:,io),Ep%npwe)
    end do

    ! === Accumulate heads and wings for each small q ===
    ! * For better performance, this part is not done if nqlwl==1
    !   lwing and uwing will be filled in cchi0q0 after the MPI collective sum
    ! FIXME extrapolar method should be checked!!
    !do io=1,Ep%nomega                                                       
    ! lwing(:,io,1) =  chi0(:,1,io)
    ! uwing(:,io,1) =  chi0(1,:,io)
    !end do

    if (nqlwl>1.and..FALSE.) then 
      rhotwg0_bkp = rhotwg(1) ! Save G=0 value of the first q
      allocate(rhotwg_star(Ep%npwe))
      rhotwg_star = CONJG(rhotwg(1:Ep%npwe)) 

      do iqlwl=2,nqlwl
        qrot =  (3-2*itim_kbz) * MATMUL(opinv,qlwl(:,iqlwl))
        mqg0 = dotproductqrc(qrot,rhotwx(:,1),b1,b2,b3) !TODO get rid of this
        if (itim_kbz==2) mqg0=CONJG(mqg0)
        rhotwg     (1) =mqg0
        rhotwg_star(1) =CONJG(mqg0)
        !
        ! here we might take advantage of Hermiticity along Im axis in RPA (see mkG0w)
        do io=1,Ep%nomega                                                       
          lwing(:,io,iqlwl) = lwing(:,io,iqlwl) + rhotwg     (1:Ep%npwe) * CONJG(mqg0) * green_w(io)
          uwing(:,io,iqlwl) = uwing(:,io,iqlwl) + rhotwg_star(1:Ep%npwe) *       mqg0  * green_w(io)
        end do
      end do ! iqlwl

      deallocate(rhotwg_star)
      rhotwg(1) = rhotwg0_bkp ! Reinstate previous value of rhotwg(1).
    end if ! nqlwl > 1

  else ! spinorial case 
    allocate(rhotwg_I(Ep%npwe))
    allocate(rhotwg_J(Ep%npwe))

    ABI_CHECK(nqlwl==1,"nqlwl/=1 Not implemented")

    ! I can use symmetries to loop over the upper triangle but 
    ! this makes using BLAS more difficult
    ! Important NOTE: treatment of q-->0 limit is correct only
    ! for i=j=0. Other components require additional terms.

    do jj=1,Ep%nJ
      s_jj=1 ; if (jj==4) s_jj=-1
      pad_jj=(jj-1)*Ep%npwe
      call mkrhotwg_sigma(jj,nspinor,Ep%npwe,rhotwg,rhotwg_J)

      rhotwg_J(1) = q0limit(jj,qlwl(:,1),nspinor,rhotwx,b1,b2,b3) 
      !TODO RECHECK this
      if (itim_kbz==2) rhotwg_J(1)=-CONJG(rhotwg_J(1))

      do ii=1,Ep%nI
        pad_ii=(ii-1)*Ep%npwe

        if (ii/=jj) then
          call mkrhotwg_sigma(ii,nspinor,Ep%npwe,rhotwg,rhotwg_I)
          rhotwg_I(1) = q0limit(ii,qlwl(:,1),nspinor,rhotwx,b1,b2,b3) 
          if (itim_kbz==2) rhotwg_I(1)=-CONJG(rhotwg_I(1))
        else 
          rhotwg_I(:)=rhotwg_J(:)
        end if

        do io=1,Ep%nomega
          dd = s_jj*green_w(io) 
          call XGERC(Ep%npwe,Ep%npwe,dd,rhotwg_I,1,rhotwg_J,1,&
&           chi0(pad_ii+1:pad_ii+Ep%npwe,pad_jj+1:pad_jj+Ep%npwe,io),Ep%npwe)
        end do

      end do !ii
    end do !jj

    deallocate(rhotwg_I,rhotwg_J)
  end if

 CASE (1) ! Use symmetries to reconstruct the integrand.
   if (nspinor==1) then
     allocate(rhotwg_sym(Ep%npwe))

     ! === Loop over symmetries of the space group and time-reversal ===
     do isym=1,Ltg_q%nsym_sg
       do itim=1,Ltg_q%timrev

         if (Ltg_q%wtksym(itim,isym,ik_bz)==1) then 
           ! === This operation belongs to the little group and has to be considered to reconstruct the BZ ===
           ! TODO this is a hot-spot, should add a test on the umklapp
           phmGt => Gsph_epsG0%phmGt  (1:Ep%npwe,isym) ! In the 2 lines below note the slicing (1:npwe)
           Sm1G  => Gsph_epsG0%rottbm1(1:Ep%npwe,itim,isym)

           opinv(:,:)=REAL(Cryst%symrec(:,:,isym),dp)
           call matrginv(opinv,3,3) 
           qrot = (3-2*itim) * MATMUL(opinv,qlwl(:,1))

           SELECT CASE (itim)

           CASE (1)
             rhotwg_sym(1:Ep%npwe)=rhotwg(Sm1G(1:Ep%npwe))*phmGt(1:Ep%npwe) 
             rhotwg_sym(1)=dotproductqrc(qrot,rhotwx(:,1),b1,b2,b3)

           CASE (2) 
             rhotwg_sym(1:Ep%npwe)=CONJG(rhotwg(Sm1G(1:Ep%npwe)))*phmGt(1:Ep%npwe) 
             rhotwg_sym(1)=CONJG(dotproductqrc(qrot,rhotwx(:,1),b1,b2,b3))

           CASE DEFAULT 
             write(msg,'(a,i3)')'Wrong value of itim= ',itim
             MSG_BUG(msg)
           END SELECT 

           if (gwcomp==1) then ! Leave the head and wings uncorrected (does not matter much)
             if (ABS(deltaf_b1b2) < GW_TOL_DOCC) rhotwg_sym(1)=czero_gw
             do igp=1,Ep%npwe
               chi0(1,igp,:) = chi0(1,igp,:) + rhotwg_sym(1) *CONJG(rhotwg_sym(igp))*green_enhigh_w(:)
             end do
             do ig=2,Ep%npwe
               chi0(ig,1,:)  = chi0(ig,1,:)  + rhotwg_sym(ig)*CONJG(rhotwg_sym(1))  *green_enhigh_w(:)
             end do
           end if

           ! Multiply elements G1,G2 of rhotwg_sym by green_w(io) and accumulate in chi0(G,Gp,io)
           do io=1,Ep%nomega
             dd=green_w(io) 
             call XGERC(Ep%npwe,Ep%npwe,dd,rhotwg_sym,1,rhotwg_sym,1,chi0(:,:,io),Ep%npwe)
           end do

           ! === Accumulate heads and wings for each small q ===
           ! * For better performance, this part is not done if nqlwl==1
           !   lwing and uwing will be filled in cchi0q0 after the MPI collective sum
           ! FIXME extrapolar method should be checked!!
           if (nqlwl>1.and..FALSE.) then
             allocate(rhotwg_sym_star(Ep%npwe))
             rhotwg_sym_star = CONJG(rhotwg_sym)

             do iqlwl=2,nqlwl
               qrot = (3-2*itim) * MATMUL(opinv,qlwl(:,iqlwl))
               mqg0_sym = dotproductqrc(qrot,rhotwx(:,1),b1,b2,b3)
               if (itim==2) mqg0_sym = CONJG(mqg0_sym)

               rhotwg_sym     (1) =       mqg0_sym
               rhotwg_sym_star(1) = CONJG(mqg0_sym)

               ! here we might take advantage of Hermiticity along Im axis in RPA (see mkG0w)
               do io=1,Ep%nomega                                                       
                 lwing(:,io,iqlwl) = lwing(:,io,iqlwl) + rhotwg_sym     (1:Ep%npwe) * CONJG(mqg0_sym) * green_w(io)
                 uwing(:,io,iqlwl) = uwing(:,io,iqlwl) + rhotwg_sym_star(1:Ep%npwe) *       mqg0_sym  * green_w(io)
               end do
             end do !iqlwl

             deallocate(rhotwg_sym_star)
           end if !nqlwl>1

         end if !wtksym
       end do !itim
     end do !isym
   
     deallocate(rhotwg_sym)

   else  !spinorial case
     write(msg,'(a,i3)')' symchi=1 with spinor not implemented '
     MSG_BUG(msg)
     ABI_CHECK(nqlwl==1,"nqlwl/=1 Not implemented")
   end if

 CASE DEFAULT
   write(msg,'(a,i3)')'Wrong value of symchi= ',Ep%symchi
   MSG_BUG(msg)
 END SELECT

end subroutine assemblychi0q0_sym
!!***
!{\src2tex{textfont=tt}}
!!****f* ABINIT/assemblychi0_sym
!! NAME
!! assemblychi0_sym
!!
!! FUNCTION
!! Update the independent particle susceptibility for the contribution
!! of one pair of occupied-unoccupied band, for each frequency.
!! If symchi=1 the expression is symmetrized taking into account the symmetries 
!! of the little group associated to the external q-point.
!! Compute chi0(G1,G2,io)=chi0(G1,G2,io)+\sum_S \hat S (rhotwg(G1)*rhotwg*(G2))*green_w(io)
!! where S are the symmetries of the little group associated to the external q-point.
!!
!! COPYRIGHT
!! Copyright (C) 1999-2010 ABINIT group (MG)
!! This file is distributed under the terms of the
!! GNU General Public License, see ~abinit/COPYING
!! or http://www.gnu.org/copyleft/gpl.txt .
!! For the initials of contributors, see ~abinit/doc/developers/contributors.txt .
!!
!! INPUTS
!!  nspinor=Number of spinorial components.
!!  ik_bz=Index of the k-point in the BZ array whose contribution has to be symmetrized and added to cchi0 
!!  npwepG0=Maximum number of G vectors taking into account possible umklapp G0, ie enlarged sphere G-G0
!!  rhotwg(npwe*nspinor**2)=Oscillator matrix elements for this k-point and the transition that has to be summed
!!  green_w(nomega)=frequency dependent part coming from the green function
!!  Gsph_epsG0<Gvectors_type> Information on the "enlarged" G-sphere used for chi0, it contains umklapp G0 vectors
!!    %ng=number of G vectors in the enlarged sphere, actually MUST be equal to the size of rhotwg
!!    %rottbm1(ng,2,nsym)=index of (IR)^{-1} G where I is the identity or the inversion 
!!    %phmGt(ng,nsym)=phase factors associated to non-simmorphic operations
!!  Ltg_q<little_group_type>=Info on the little group associated to the external q-point.
!!    %timrev=2 it time-reversal is used, 1 otherwise
!!    %nsym_sg=Number of space group symmetries
!!    %wtksym(2,nsym,nkbz)=1 if the symmetry (with or without time-reversal) must be considered for this k-point
!!    %flag_umklp(timrev,nsym)= flag for umklapp processes 
!!      if 1 that the particular operation (IS) requires a G_o to preserve Q, 0 otherwise 
!!    %igmG0(npwepG0,timrev,nsym) index of G-G0 in the array gvec
!!  Ep<Epsilonm1_parameters>=Parameters related to the calculation of chi0/epsilon^-1
!!    %symchi
!!    %nomega=number of frequencies
!!    %npwe=number of plane waves for epsilon (input variable)
!!    
!! OUTPUT
!!  (see side effects)
!!
!! SIDE EFFECTS
!!  chi0(npwe,npwe,nomega)=independent-particle susceptibility matrix in reciprocal space
!!
!! PARENTS
!!      cchi0
!!
!! CHILDREN
!!
!! SOURCE

#if defined HAVE_CONFIG_H
#include "config.h"
#endif

#include "abi_common.h"

subroutine assemblychi0_sym(ik_bz,nspinor,Ep,Ltg_q,green_w,npwepG0,rhotwg,Gsph_epsG0,chi0)

 use defs_basis
 use defs_datatypes
 use m_errors

 use m_gwdefs,   only : epsilonm1_parameters
 use m_blas,     only : xgerc
 use m_gsphere,  only : gvectors_type
 use m_bz_mesh,  only : little_group

!This section has been created automatically by the script Abilint (TD).
!Do not modify the following lines by hand.
 use interfaces_68_gw, except_this_one => assemblychi0_sym
!End of the abilint section

 implicit none

!Arguments ------------------------------------
!scalars
 integer,intent(in) :: ik_bz,npwepG0,nspinor
 type(Gvectors_type),intent(in) :: Gsph_epsG0 
 type(Little_group),intent(in) :: Ltg_q
 type(Epsilonm1_parameters),intent(in) :: Ep
!arrays
 complex(gwpc),intent(in) :: rhotwg(npwepG0*nspinor**2)
 complex(dpc),intent(in) :: green_w(Ep%nomega)
 complex(gwpc),intent(inout) :: chi0(Ep%npwe*Ep%nI,Ep%npwe*Ep%nJ,Ep%nomega)

!Local variables-------------------------------
!scalars
 integer :: itim,io,isym
 integer :: jj,ii,s_jj,pad_jj,pad_ii
 complex(gwpc) :: dd 
 character(len=500) :: msg
!arrays
 integer,pointer :: gmG0(:) 
 integer,allocatable :: Sm1_gmG0(:)
 complex(gwpc),allocatable :: rhotwg_sym(:),rhotwg_I(:),rhotwg_J(:)
 complex(gwpc),pointer :: phmGt(:)

! *************************************************************************
 
 SELECT CASE (Ep%symchi)

 CASE (0) ! Do not use symmetries
  if (nspinor==1) then
   do io=1,Ep%nomega
    dd=green_w(io) 
!!#if defined HAVE_GW_DPC
!!    call ZGERC(Ep%npwe,Ep%npwe,dd,rhotwg,1,rhotwg,1,chi0(:,:,io),Ep%npwe)
!!#else
!!    call CGERC(Ep%npwe,Ep%npwe,dd,rhotwg,1,rhotwg,1,chi0(:,:,io),Ep%npwe)
!!#endif
    call XGERC(Ep%npwe,Ep%npwe,dd,rhotwg,1,rhotwg,1,chi0(:,:,io),Ep%npwe)
   end do

  else ! spinorial case
   allocate(rhotwg_I(Ep%npwe))
   allocate(rhotwg_J(Ep%npwe))

   ! I can use symmetries to loop over the upper triangle but 
   ! this makes using BLAS more difficult

   do jj=1,Ep%nJ
    s_jj=1 ; if (jj==4) s_jj=-1
    pad_jj=(jj-1)*Ep%npwe
    call mkrhotwg_sigma(jj,nspinor,Ep%npwe,rhotwg,rhotwg_J)

    do ii=1,Ep%nI
     pad_ii=(ii-1)*Ep%npwe

     if (ii/=jj) then
      call mkrhotwg_sigma(ii,nspinor,Ep%npwe,rhotwg,rhotwg_I)
     else 
      rhotwg_I(:)=rhotwg_J(:)
     end if

     do io=1,Ep%nomega
      dd = s_jj*green_w(io) 
!!#if defined HAVE_GW_DPC
!!      call ZGERC(Ep%npwe,Ep%npwe,dd,rhotwg_I,1,rhotwg_J,1,chi0(pad_ii+1:pad_ii+Ep%npwe,pad_jj+1:pad_jj+Ep%npwe,io),Ep%npwe)
!!#else
!!      call CGERC(Ep%npwe,Ep%npwe,dd,rhotwg_I,1,rhotwg_J,1,chi0(pad_ii+1:pad_ii+Ep%npwe,pad_jj+1:pad_jj+Ep%npwe,io),Ep%npwe)
!!#endif
      call XGERC(Ep%npwe,Ep%npwe,dd,rhotwg_I,1,rhotwg_J,1,chi0(pad_ii+1:pad_ii+Ep%npwe,pad_jj+1:pad_jj+Ep%npwe,io),Ep%npwe)
     end do

    end do !ii
   end do !jj

   deallocate(rhotwg_I,rhotwg_J)
  end if

 CASE (1) ! Use symmetries to reconstruct the integrand in the BZ.
  !
  ! Notes on the symmetrization of the oscillator matrix elements
  !  If  Sq = q then  M_G^( Sk,q)= e^{-i(q+G).t} M_{ S^-1G}  (k,q)
  !  If -Sq = q then  M_G^(-Sk,q)= e^{-i(q+G).t} M_{-S^-1G}^*(k,q)
  !
  ! In case of an umklapp process 
  !  If  Sq = q+G0 then  M_G( Sk,q)= e^{-i(q+G).t} M_{ S^-1(G-G0}   (k,q)
  !  If -Sq = q+G0 then  M_G(-Sk,q)= e^{-i(q+G).t} M_{-S^-1(G-G0)}^*(k,q)
  !
  ! Ltg_q%igmG0(ig,itim,isym) contains the index of G-G0 where ISq=q+G0
  ! Note that there is no need to take into account the phases due to q, 
  ! They cancel in the scalar product ==> phmGt(G,isym)=e^{-iG\cdot t}
  !
  ! Mind the slicing of %rottbm1(npwepG0,timrev,nsym) and %phmGt(npwepG0,nsym) as 
  ! these arrays, usually, do not conform to rho_twg_sym(npw) !
  !
  allocate(rhotwg_sym(Ep%npwe))
  allocate(Sm1_gmG0  (Ep%npwe))
  !
  ! === Loop over symmetries of the space group and time-reversal ===
  do isym=1,Ltg_q%nsym_sg
   do itim=1,Ltg_q%timrev

    if (Ltg_q%wtksym(itim,isym,ik_bz)==1) then 
     ! === This operation belongs to the little group and has to be used to reconstruct the BZ ===
     ! * In the following 3 lines mind the slicing (1:npwe)
     ! TODO this is a hot-spot, should add a test on the umklapp
     !
     phmGt => Gsph_epsG0%phmGt(1:Ep%npwe,isym) 
     gmG0  => Ltg_q%igmG0     (1:Ep%npwe,itim,isym)  
     Sm1_gmG0(1:Ep%npwe)=Gsph_epsG0%rottbm1(gmG0(1:Ep%npwe),itim,isym)

     SELECT CASE (itim)
     CASE (1) 
      rhotwg_sym(1:Ep%npwe)=rhotwg(Sm1_gmG0)*phmGt(1:Ep%npwe) 
     CASE (2) 
      rhotwg_sym(1:Ep%npwe)=CONJG(rhotwg(Sm1_gmG0))*phmGt(1:Ep%npwe) 
     CASE DEFAULT 
      write(msg,'(a,i3)')'Wrong itim= ',itim
      MSG_BUG(msg)
     END SELECT 

     ! Multiply rhotwg_sym by green_w(io) and accumulate in chi0(G,Gp,io)
     do io=1,Ep%nomega
      dd=green_w(io)  
!!#if defined HAVE_GW_DPC
!!      call ZGERC(Ep%npwe,Ep%npwe,dd,rhotwg_sym,1,rhotwg_sym,1,chi0(:,:,io),Ep%npwe)
!!#else
!!      call CGERC(Ep%npwe,Ep%npwe,dd,rhotwg_sym,1,rhotwg_sym,1,chi0(:,:,io),Ep%npwe)
!!#endif
      call XGERC(Ep%npwe,Ep%npwe,dd,rhotwg_sym,1,rhotwg_sym,1,chi0(:,:,io),Ep%npwe)
     end do

    end if
   end do 
  end do 

  deallocate(rhotwg_sym)
  deallocate(Sm1_gmG0)

 CASE DEFAULT
  write(msg,'(a,i3)')'Wrong symchi= ',Ep%symchi
  MSG_BUG(msg)
 END SELECT

end subroutine assemblychi0_sym
!!***

!!****f* ABINIT/mkrhotwg_sigma
!! NAME
!! mkrhotwg_sigma
!!
!! FUNCTION
!!  Helper function used to calculate selected linear combination
!!  of the oscillator matrix elements in the case of noncollinear magnetism.
!!
!! COPYRIGHT
!! Copyright (C) 1999-2010 ABINIT group (MG)
!! This file is distributed under the terms of the
!! GNU General Public License, see ~abinit/COPYING
!! or http://www.gnu.org/copyleft/gpl.txt .
!! For the initials of contributors, see ~abinit/doc/developers/contributors.txt .
!!
!! INPUTS
!!  ii=Index selecting the particolar combination of spin components.
!!  npw=Number of plane-waves in the oscillators.
!!  nspinor=Number of spinorial components.
!!  rhotwg(npw*nspinor**2)=OScillator matrix elements.
!!    
!! OUTPUT
!!  rhotwg_I(npw)=Required linear combination of the oscillator matrix elements.
!!
!! PARENTS
!!      assemblychi0_sym,assemblychi0q0_sym
!!
!! CHILDREN
!!
!! SOURCE

subroutine mkrhotwg_sigma(ii,nspinor,npw,rhotwg,rhotwg_I)

 use defs_basis
 use m_errors

 use m_gwdefs, only : j_gw

 implicit none

!Arguments ------------------------------------
!scalars
 integer,intent(in) :: ii,npw,nspinor
!arrays
 complex(gwpc),intent(in) :: rhotwg(npw*nspinor**2)
 complex(gwpc),intent(out) :: rhotwg_I(npw)

!Local variables-------------------------------
!scalars
 character(len=500) :: msg

! *************************************************************************

 SELECT CASE (ii)
 CASE (1) ! $ M_0 = M_{\up,\up} + M_{\down,\down} $
  rhotwg_I(:) = rhotwg(1:npw) + rhotwg(npw+1:2*npw)
 CASE (2) ! $ M_z = M_{\up,\up} - M_{\down,\down} $
  rhotwg_I(:) = rhotwg(1:npw) - rhotwg(npw+1:2*npw)
 CASE (3) ! $ M_x = M_{\up,\down} + M_{\down,\up} $
  rhotwg_I(:) = ( rhotwg(2*npw+1:3*npw) + rhotwg(3*npw+1:4*npw) )
 CASE (4) ! $ M_y = i * (M_{\up,\down} -M_{\down,\up}) $
  rhotwg_I(:) = (rhotwg(2*npw+1:3*npw) - rhotwg(3*npw+1:4*npw) )*j_gw
 CASE DEFAULT 
  write(msg,'(a,i3)')'Wrong ii value= ',ii
  MSG_BUG(msg)
 END SELECT

end subroutine mkrhotwg_sigma
!!***
!{\src2tex{textfont=tt}}
!!****f* ABINIT/assemblychi0sfq0
!! NAME
!! assemblychi0sfq0
!!
!! FUNCTION
!! Update the spectral function of the independent particle susceptibility at q==0 for the contribution
!! of one pair of occupied-unoccupied band, for each frequency.
!! If symchi==1, the symmetries belonging to the little group of the external point q are used 
!! to reconstrunct the contributions in the full Brillouin zone. In this case, the equation implented is: 
!!
!!  $ chi0(G1,G2,io)=chi0(G1,G2,io)+\sum_S (rhotwg(G1)*rhotwg^\dagger(G2))* \delta(\omega -trans) $
!!
!! where S is a symmetry belonging to the little group of q.
!! The subroutine also performs the symmetrization of the matrix elements of the 
!! gradient operator and of the commutator [V_{nl},r] with the position operator.
!!
!! COPYRIGHT
!! Copyright (C) 1999-2010 ABINIT group (MG)
!! This file is distributed under the terms of the
!! GNU General Public License, see ~abinit/COPYING
!! or http://www.gnu.org/copyleft/gpl.txt .
!! For the initials of contributors, see ~abinit/doc/developers/contributors.txt .
!!
!! INPUTS
!!  ikbz=Index in the BZ of the k-point whose contribution to chi0 has to be added, 
!!   if we use symmetries, the contribution to chi0 by this k-point has to be symmetrized.
!!  isym_kbz=Index of the symmetry such as k_bz = IS k_ibz
!!  itim_kbz=2 if time-reversal has to be used to obtain k_bz, 1 otherwise. 
!!  my_wl,my_wr=min and Max frequency index treated by this processor.
!!  npwe=Number of plane waves used to describe chi0.
!!  npwepG0=Maximum number of G vectors to account for umklapps.
!!  nomega=Number of frequencies in the imaginary part.
!!  nqlwl=Number of q-points used for the optical limit.
!!  qlwl(3,nqlwl)=Reciprocal space coordinates of the q-points for the long-wavelength limit treatment.
!!  rhotwg(npwepG0*nspinor**2)=Oscillator matrix elements corresponding to an occupied-unoccupied pair of states.
!!  rhotwx(3,nspinor**2)=Matrix elements of the gradient and of the commutator of the non-local operator with 
!!    the position operator. The second term is present only if inclvkb=1,2.
!!  Gsph_epsG0<Gvectors_type> Information on the "enlarged" G-sphere used for chi0, it contains umklapp G0 vectors
!!    %ng=number of G vectors in the enlarged sphere. It MUST be equal to the size of rhotwg
!!    %rottbm1(ng,2,nsym)=index of (IR)^{-1} G where I is the identity or the inversion 
!!    %phmGt(ng,nsym)=phase factors associated to non-symmorphic operations
!!  Ltg_q<little_group_type>=Info on the little group associated to the external q-point.
!!    %timrev=2 it time-reversal is used, 1 otherwise
!!    %nsym_sg=Number of space group symmetries
!!    %wtksym(2,nsym,nkbz)=1 if the symmetry (with or without time-reversal) must be considered for this k-point
!!    %flag_umklp(timrev,nsym)= flag for umklapp processes 
!!     if 1 that the particular operation (IS) requires a G_o to preserve Q, 0 otherwise 
!! Cryst<Crystal_structure>=Info on unit cell and it symmetries
!!    %nsym=Number of symmetry operations.
!!    %symrec(3,3,nsym)=Symmetry operations in reciprocal space (reduced coordinates).
!!    
!! OUTPUT
!!  (see side effects)
!!
!! SIDE EFFECTS
!!  chi0sf(npwe,npwe,my_wl:my_wr)=Updated spectral function at q==0.
!!  lwing_sf(npwe,nomega,nqlwl)=Updated lower wing of the spectral function.
!!  uwing_sf(npwe,nomega,nqlwl)=Updated Upper wing of the spectral function.
!!
!! PARENTS
!!      cchi0q0
!!
!! CHILDREN
!!      cgerc,zgerc
!!
!! SOURCE

#if defined HAVE_CONFIG_H
#include "config.h"
#endif

#include "abi_common.h"

subroutine assemblychi0sfq0(nqlwl,qlwl,ikbz,isym_kbz,itim_kbz,nspinor,symchi,npwepG0,npwe,Cryst,Ltg_q,Gsph_epsG0,&
& factocc,my_wl,iomegal,wl,my_wr,iomegar,wr,rhotwx,rhotwg,nomegasf,chi0sf,lwing_sf,uwing_sf)

 use defs_basis
 use m_errors

 use m_gwdefs,   only : epsilonm1_parameters
 use m_blas,     only : xgerc
 use m_crystal,  only : crystal_structure
 use m_gsphere,  only : gvectors_type
 use m_bz_mesh,  only : little_group

!This section has been created automatically by the script Abilint (TD).
!Do not modify the following lines by hand.
 use interfaces_32_util
 use interfaces_68_gw, except_this_one => assemblychi0sfq0
!End of the abilint section

 implicit none

!Arguments ------------------------------------
!scalars
 integer,intent(in) :: ikbz,my_wl,my_wr,nomegasf,npwe,npwepG0,nqlwl,nspinor
 integer,intent(in) :: isym_kbz,itim_kbz,symchi,iomegal,iomegar
 real(dp),intent(in) :: factocc,wl,wr 
 type(Little_group),intent(in) :: Ltg_q
 type(Gvectors_type),intent(in) :: Gsph_epsG0 
 type(Crystal_structure),intent(in) :: Cryst
!arrays
 real(dp),intent(in) :: qlwl(3,nqlwl)
 complex(gwpc),intent(inout) :: rhotwg(npwepG0*nspinor**2)
 complex(gwpc),intent(in) :: rhotwx(3)
 complex(gwpc),intent(inout) :: chi0sf(npwe,npwe,my_wl:my_wr)
 complex(dpc),intent(inout) :: lwing_sf(npwe,my_wl:my_wr,3)
 complex(dpc),intent(inout) :: uwing_sf(npwe,my_wl:my_wr,3)

!Local variables-------------------------------
!scalars
 integer :: itim,isym,iqlwl
 complex(gwpc) :: num 
 complex(gwpc) :: mqg0,mqg0_sym,rhotwg0_bkp
 character(len=500) :: msg
!arrays
 integer,pointer :: Sm1G(:) 
 real(dp) :: opinv(3,3),qrot(3),b1(3),b2(3),b3(3)
 complex(gwpc),allocatable :: rhotwg_sym(:)
 complex(gwpc),allocatable :: rhotwg_sym_star(:),rhotwg_star(:)
 complex(gwpc),pointer :: phmGt(:)
!************************************************************************

 if (iomegal<my_wl .or. iomegar>my_wr) then 
   write(msg,'(3a,2(a,i5,a,i5))')ch10,&
&    ' assemblychi0sfq0 : Indeces out of boundary ',ch10,&
&    '  my_wl = ',my_wl,' iomegal = ',iomegal,ch10,&
&    '  my_wr = ',my_wr,' iomegar = ',iomegar,ch10
   MSG_PERS_BUG(msg)
 end if 

 b1(:)=two_pi*Gsph_epsG0%gprimd(:,1)
 b2(:)=two_pi*Gsph_epsG0%gprimd(:,2)
 b3(:)=two_pi*Gsph_epsG0%gprimd(:,3)

 SELECT CASE (symchi)

 CASE (0)
   ! 
   ! === Calculation without symmetries ===
   ! * rhotwg(1)= R^-1q*rhotwx_ibz
   ! * rhotwg(1)=-R^-1q*conjg(rhotwx_ibz) for inversion
   ! FIXME My equation reads  -iq* <cSk|\nabla|vSk> = -i \transpose S <ck_i|\nabla\|vk_i>  
   if (nspinor==1) then
     opinv(:,:)=REAL(Cryst%symrec(:,:,isym_kbz),dp)
     call matrginv(opinv,3,3)
     qrot =  (3-2*itim_kbz) * MATMUL(opinv,qlwl(:,1))
     rhotwg(1)=dotproductqrc(qrot,rhotwx,b1,b2,b3)
     if (itim_kbz==2) rhotwg(1)=CONJG(rhotwg(1))

     if (wl<huge(0.0_dp)*1.d-11) then !this is awful but it is still a first coding
       num=-wl*factocc ! Num is single precision needed for cgerc check factocc
       call XGERC(npwe,npwe,num,rhotwg,1,rhotwg,1,chi0sf(:,:,iomegal),npwe)
     end if 
     ! Last point, must accumulate left point but not the right one
     if (iomegar/=nomegasf+1 .and. wr<huge(0.0_dp)*1.d-11) then 
       num=-wr*factocc
       call XGERC(npwe,npwe,num,rhotwg,1,rhotwg,1,chi0sf(:,:,iomegar),npwe)
     end if 
     !
     ! === Accumulate heads and wings for each small q ===
     ! * For better performance, this part is not done if nqlwl==1
     !   lwing and uwing will be filled in cchi0q0 after the MPI collective sum
     !
     if (nqlwl>1.and..FALSE.) then
       rhotwg0_bkp = rhotwg(1) ! Save G=0 value of the first q
       allocate(rhotwg_star(npwepG0))
       rhotwg_star = CONJG(rhotwg(1:npwepG0)) 

       do iqlwl=2,nqlwl
         qrot =  (3-2*itim_kbz) * MATMUL(opinv,qlwl(:,iqlwl))
         mqg0 = dotproductqrc(qrot,rhotwx,b1,b2,b3) !TODO get rid of this
         if (itim_kbz==2) mqg0=CONJG(mqg0)
         rhotwg     (1) =mqg0
         rhotwg_star(1) =CONJG(mqg0)
         !
         if (wl<huge(0.0_dp)*1.d-11) then !this is awful but it is still a first coding    
           num=-wl*factocc ! Num is single precision needed for cgerc check factocc
           lwing_sf(:,iomegal,iqlwl) = lwing_sf(:,iomegal,iqlwl) + rhotwg     (1:npwepG0) * CONJG(mqg0) * num 
           uwing_sf(:,iomegal,iqlwl) = uwing_sf(:,iomegal,iqlwl) + rhotwg_star(1:npwepG0) *       mqg0  * num 
         end if 
         !
         ! Last point, must accumulate left point but not the right one
         if (iomegar/=nomegasf+1 .and. wr<huge(0.0_dp)*1.d-11) then 
           num=-wr*factocc
           lwing_sf(:,iomegar,iqlwl) = lwing_sf(:,iomegar,iqlwl) + rhotwg     (1:npwepG0) * CONJG(mqg0) * num 
           uwing_sf(:,iomegar,iqlwl) = uwing_sf(:,iomegar,iqlwl) + rhotwg_star(1:npwepG0) *       mqg0  * num 
         end if 
       end do ! iqlwl

       deallocate(rhotwg_star)
       rhotwg(1) = rhotwg0_bkp ! Reinstate previous value of rhotwg(1).
     end if !nqlwl

   else ! spinorial case
     msg="Spectral method + nspinor==2 not implemented"
     MSG_BUG(msg)
   end if


 CASE (1)
   ! === Notes on the symmetrization of oscillator matrix elements ===
   ! If  Sq = q then  M_G( Sk,q)= e^{-i(q+G)\cdot t} M_{ S^-1G}  (k,q)
   ! If -Sq = q then  M_G(-Sk,q)= e^{-i(q+G)\cdot t} M_{-S^-1G}^*(k,q)
   ! 
   ! In case of an umklapp process 
   ! If  Sq = q+G_o then  M_G( Sk,q)= e^{-i(q+G)\cdot t} M_{ S^-1(G-G_o}   (k,q)
   ! If -Sq = q+G_o then  M_G(-Sk,q)= e^{-i(q+G)\cdot t} M_{-S^-1(G-G-o)}^*(k,q)
   ! 
   ! rhotwg(1)= R^-1q*rhotwx_ibz
   ! rhotwg(1)=-R^-1q*conjg(rhotwx_ibz) for inversion
   !
   if (nspinor==1) then
     allocate(rhotwg_sym(npwe))
     !
     ! === Loop over symmetries of the space group and time-reversal ===
     do isym=1,Ltg_q%nsym_sg
       do itim=1,Ltg_q%timrev

         if (Ltg_q%wtksym(itim,isym,ikbz)==1) then 
           ! === This operation belongs to the little group and has to be considered to reconstruct the BZ ===
           ! TODO this is a hot-spot, should add a test on the umklapp
           !
           phmGt => Gsph_epsG0%phmGt(1:npwe,isym) ! In these 2 lines mind the slicing (1:npwe)
           Sm1G  => Gsph_epsG0%rottbm1(1:npwe,itim,isym)

           opinv(:,:)=REAL(Cryst%symrec(:,:,isym),dp)
           call matrginv(opinv,3,3)
           qrot = (3-2*itim) * MATMUL(opinv,qlwl(:,1))

           SELECT CASE (itim)

           CASE (1)
             rhotwg_sym(1:npwe)=rhotwg(Sm1G(1:npwe))*phmGt(1:npwe)
             rhotwg_sym(1)=dotproductqrc(qrot,rhotwx,b1,b2,b3)

           CASE (2) 
             rhotwg_sym(1:npwe)=CONJG(rhotwg(Sm1G(1:npwe)))*phmGt(1:npwe)
             rhotwg_sym(1)=CONJG(dotproductqrc(qrot,rhotwx,b1,b2,b3))

           CASE DEFAULT
             write(msg,'(a,i4)')'Wrong value of itim= ',itim
             MSG_BUG(msg)
           END SELECT
           !
           ! === Multiply elements G,Gp of rhotwg_sym*num and accumulate in chi0sf(G,Gp,io) ===
           if (wl<huge(0.0_dp)*1.d-11) then
             num=-wl*factocc
             call XGERC(npwe,npwe,num,rhotwg_sym,1,rhotwg_sym,1,chi0sf(:,:,iomegal),npwe)
           end if
           !
           ! Last point, must accumulate left point but not the right one
           if (iomegar/=nomegasf+1 .and. wr<huge(0.0_dp)*1.d-11) then 
             num=-wr*factocc
             call XGERC(npwe,npwe,num,rhotwg_sym,1,rhotwg_sym,1,chi0sf(:,:,iomegar),npwe)
           end if 

           ! === Accumulate heads and wings for each small q ===
           ! * For better performance, this part is not done if nqlwl==1
           !   lwing and uwing will be filled in cchi0q0 after the MPI collective sum
           if (nqlwl>1.and..FALSE.) then
             allocate(rhotwg_sym_star(npwe))
             rhotwg_sym_star = CONJG(rhotwg_sym(1:npwe)) 
 
             do iqlwl=2,nqlwl
               qrot =  (3-2*itim_kbz) * MATMUL(opinv,qlwl(:,iqlwl))
               mqg0_sym = dotproductqrc(qrot,rhotwx,b1,b2,b3) !TODO get rid of this
               if (itim_kbz==2) mqg0_sym=CONJG(mqg0_sym)
               rhotwg_sym     (1) =mqg0_sym
               rhotwg_sym_star(1) =CONJG(mqg0_sym)
               !
               if (wl<huge(0.0_dp)*1.d-11) then !this is awful but it is still a first coding    
                 num=-wl*factocc ! Num is single precision needed for cgerc check factocc
                 lwing_sf(:,iomegal,iqlwl) = lwing_sf(:,iomegal,iqlwl) + rhotwg_sym_star(1:npwe) * CONJG(mqg0_sym) * num 
                 uwing_sf(:,iomegal,iqlwl) = uwing_sf(:,iomegal,iqlwl) + rhotwg_sym_star(1:npwe) *       mqg0_sym  * num 
               end if 
               ! Last point, must accumulate left point but not the right one
               if (iomegar/=nomegasf+1 .and. wr<huge(0.0_dp)*1.d-11) then 
                 num=-wr*factocc
                 lwing_sf(:,iomegar,iqlwl) = lwing_sf(:,iomegar,iqlwl) + rhotwg_sym_star(1:npwe) * CONJG(mqg0_sym) * num 
                 uwing_sf(:,iomegar,iqlwl) = uwing_sf(:,iomegar,iqlwl) + rhotwg_sym_star(1:npwe) *       mqg0_sym  * num 
               end if 
             end do ! iqlwl
 
             deallocate(rhotwg_sym_star)
           end if !nqlwl

         end if !wtksym
       end do !inv
     end do !isym
     deallocate(rhotwg_sym)

   else ! spinorial case
     msg="Spectral method + nspinor==2 not implemented"
     MSG_BUG(msg)
   end if

 CASE DEFAULT
   write(msg,'(a,i4)')'Wrong value of symchi= ',symchi
   MSG_BUG(msg)
 END SELECT

end subroutine assemblychi0sfq0
!!***
