!{\src2tex{textfont=tt}}
!!****f* ABINIT/cohsex_me
!! NAME
!! cohsex_me
!!
!! FUNCTION
!! Calculate diagonal and off-diagonal matrix elements of the SEX or COHSEX self-energy operator.
!!
!! COPYRIGHT
!! Copyright (C) 1999-2010 ABINIT group (FB, GMR, VO, LR, RWG, MG, RShaltaf)
!! This file is distributed under the terms of the
!! GNU General Public License, see ~abinit/COPYING
!! or http://www.gnu.org/copyleft/gpl.txt .
!! For the initials of contributors, see ~abinit/doc/developers/contributors.txt .
!!
!! INPUTS
!! minbnd, maxbnd= min and Max band index for GW correction (for this k-point)
!! Dtfil=filenames (only Dtfil%filsrc is used if the screening must be read from file)
!! accesswff=Option defining the file format of the SCR file (Fortran, NETCDF)
!! Er <Epsilonm1_results> (see the definition of this structured datatype)
!!    %mqmem=if 0 use out-of-core method in which a single q-slice of espilon is read inside the loop over k
!!    %nomega_i=Number of imaginary frequencies.
!!    %nomega_r=Number of real frequencies.
!!    %nomega=Total number of frequencies.
!! Gsph_Max<Gvectors_type>= info on biggest G-sphere
!!    %nsym=number of symmetry operations
!!    %rottb(Sigp%npwvec,timrev,nsym)=index of (IS) G where I is the identity or the inversion
!!      operation and G is one of the npwvec vectors in reciprocal space
!!    %timrev=2 if time-reversal symmetry is used, 1 otherwise
!! gvec(3,Sigp%npwvec)=integer coordinates of each plane wave in reciprocal space
!! ikcalc=index in the array Sigp%kptgw2bz of the k-point where GW corrections are calculated
!! Ltg_k datatype containing information on the little group
!! Kmesh <BZ_mesh_type>
!!    %nbz=Number of points in the BZ
!!    %nibz=Number of points in IBZ
!!    %kibz(3,nibz)=k-point coordinates, irreducible Brillouin zone
!!    %kbz(3,nbz)=k-point coordinates, full Brillouin zone
!!    %ktab(nbz)= table giving for each k-point in the BZ (kBZ), the corresponding
!!    %ktabi(nbz)= for each k-point in the BZ defines whether inversion has to be considered
!!    %ktabp(nbz)= phase factor associated to tnons
!! gwc_ngfft(18)=Information about 3D FFT for the oscillator strengths used for the correlation part,
!! Vcp <Coulombian_type datatype> containing information on the cutoff technique
!!    %vc_sqrt(npwc,nqibz)= square-root of the coulombian potential for q-points in the IBZ
!! Pawtab(Psps%ntypat) <type(pawtab_type)>=paw tabulated starting data
!! Pawang <type(pawang_type)>=paw angular mesh and related data
!! Psps <type(pseudopotential_type)>=variables related to pseudopotentials
!!    %usepaw=1 for PAW, 0 for NC pseudopotentials.
!! Qmesh <bz_mesh_type> : datatype gathering information of the q-mesh used
!!    %ibz=q points where $\tilde\epsilon^{-1}$ has been computed
!!    %bz(3,nqbz)=coordinates of all q-points in BZ
!! Sigp <sigma_parameters> (see the definition of this structured datatype)
!!    %npwvec= Max betwee npweps and npwwfn used to dimension arrays
!! Cryst<Crystal_structure>=Info on unit cell and symmetries
!!    %natom=number of atoms in unit cell
!!    %ucvol=unit cell volume
!!    %nsym=number of symmetry operations
!!    %typat(natom)=type of each atom
!!  much slower but it requires less memory
!! QP_BSt<Bandstructure_type>=Datatype gathering info on the QP energies (KS if one shot)
!!  eig(Sigp%nbnds,Kmesh%nibz,Sigp%nsppol)=KS or QP energies for k-points, bands and spin
!!  occ(Sigp%nbnds,Kmesh%nibz,Sigp%nsppol)=occupation numbers, for each k point in IBZ, each band and spin
!!  Paw_pwff<Paw_pwff_type>=Form factor used to calculate the onsite mat. elements of a plane wave.
!! QP_sym(Sigp%nsppol)<bands_symmetries>=Datatype collecting data on the irreducible representaions of the
!!  little group of kcalc in the KS representation as well as the symmetry of the bdgw_k states.
!!  Sr=sigma_results (see the definition of this structured datatype)
!!
!! OUTPUT
!!
!! NOTES
!!  1) The treatment of the divergence of Gygi+Baldereschi (PRB 1986) is included.
!!  2) The calculation of energy derivative is based on finite elements.
!!  3) On the symmetrization of Sigma matrix elements ***/
!!        If  Sk = k+G0 then  M_G(k, Sq)= e^{-i( Sq+G).t} M_{ S^-1(G}   (k,q)
!!        If -Sk = k+G0 then  M_G(k,-Sq)= e^{-i(-Sq+G).t} M_{-S^-1(G)}^*(k,q)
!!
!!     Notice the absence of G0 in the expression. Moreover, when we sum over the little group, it turns out
!!     that there is a cancellation of the phase factor associated to the non-symmorphic operations due to a
!!     similar term coming from the symmetrization of \epsilon^{-1}. Mind however that the nonsymmorphic phase
!!     has to be considered when epsilon^-1 is reconstructed starting from the q-points in the IBZ.
!!
!!  4) The unitary transformation relating wavefunctions
!!     at symmetric k-points should be taken into account during the symmetrization
!!     of the oscillator matrix elements. In case of G_oW_o and GW_o calculations, however,
!!     it is possible to make an invariant by just including all the degenerate states and
!!     averaging the final results over the degenerate subset. 
!!
!! PARENTS
!!
!! CHILDREN
!!
!! SOURCE

#if defined HAVE_CONFIG_H
#include "config.h"
#endif

#include "abi_common.h"

subroutine cohsex_me(ikcalc,nomega_sigc,minbnd,maxbnd,Dtfil,Cryst,QP_BSt,Sigp,Sr,Er,Gsph_Max,Vcp,Kmesh,Qmesh,&
& Ltg_k,Pawtab,Pawang,Paw_pwff,Psps,Wfd,QP_sym,gvec,gwc_ngfft,accesswff,prtvol,sigcme_tmp)

 use defs_basis
 use m_gwdefs !,        only : czero_gw, cone_gw, j_gw, sigma_parameters, sigma_type_from_key, sigma_is_herm
 use defs_datatypes
 use defs_abitypes
 use defs_gw
 use m_xmpi
 use m_defs_ptgroups
 use m_jolly_pointers
#ifdef HAVE_CLIB
 use m_clib
#endif
 use m_errors

 use m_blas,          only : xdotc, xgemv
 use m_numeric_tools, only : hermitianize, imin_loc
 use m_geometry,      only : normv
 use m_crystal,       only : crystal_structure
 use m_bz_mesh,       only : bz_mesh_type, get_BZ_item, findqg0, little_group, print_little_group
 use m_gsphere,       only : gvectors_type
 use m_fft_mesh,      only : get_gftt, rotate_fft_mesh, cigfft
 use m_coulombian,    only : coulombian_type
 use m_paw_pwij,      only : paw_pwff_type, paw_pwij_type, init_paw_pwij, destroy_paw_pwij, paw_rho_tw_g
 use m_wfs,           only : wfd_get_ur, wfs_descriptor, wfd_get_cprj, wfd_change_ngfft
 use m_oscillators,   only : rho_tw_g, calc_wfwfg
 use m_screening,     only : epsm1_symmetrizer, get_epsm1
 use m_bands_sym,     only : bands_symmetries, symmetrize_me, bsym_failed
 use m_ptgroups,      only : sum_irreps

!This section has been created automatically by the script Abilint (TD).
!Do not modify the following lines by hand.
 use interfaces_14_hidewrite
 use interfaces_18_timing
 use interfaces_53_abiutil
 use interfaces_66_paw
 use interfaces_68_gw, except_this_one => cohsex_me
!End of the abilint section

 implicit none

!Arguments ------------------------------------
!scalars
 integer,intent(in) :: ikcalc,prtvol,accesswff,nomega_sigc,minbnd,maxbnd
 type(Crystal_structure),intent(in) :: Cryst
 type(Bandstructure_type),intent(in) :: QP_BSt
 type(BZ_mesh_type),intent(in) :: Kmesh,Qmesh
 type(Coulombian_type),intent(in) :: Vcp
 type(Datafiles_type),intent(in) :: Dtfil
 type(Epsilonm1_results),intent(inout) :: Er
 type(Gvectors_type),intent(in) :: Gsph_Max
 type(Little_group),intent(in) :: Ltg_k
 type(Pseudopotential_type),intent(in) :: Psps
 type(pawang_type),intent(in) :: pawang
 type(Sigma_parameters),intent(in) :: Sigp
 type(Sigma_results),intent(in) :: Sr
 type(wfs_descriptor),intent(inout) :: Wfd
!arrays
 integer,intent(in) :: gvec(3,Sigp%npwvec)
 integer,intent(in) :: gwc_ngfft(18)
 !complex(dpc),intent(out) :: sigcme_tmp(:,:,:,:)
 !complex(dpc),pointer :: sigcme_tmp(:,:,:,:)
 complex(dpc),intent(out) :: sigcme_tmp(nomega_sigc,minbnd:maxbnd,minbnd:maxbnd,Sigp%nsppol*Sigp%nsig_ab)
 type(Pawtab_type),intent(in) :: Pawtab(Psps%ntypat)
 type(Paw_pwff_type),intent(in) :: Paw_pwff(Psps%ntypat*Psps%usepaw)
 type(bands_symmetries),intent(in) :: QP_sym(Sigp%nsppol)

!Local variables ------------------------------
!scalars
 integer,parameter :: level=41,tim_fourdp=2,localrdwf_=1
 integer :: iab,ib,ib1,ib2,ierr,ig,ig01,ig02,ig03,ii,iik,itim_q,i1,i2
 integer :: ik_bz,ik_ibz,io,isym_q,iq_bz,iq_ibz,is,isppol,istat,isym,jb,is_idx 
 integer :: band,band1,band2,idle,rank
 integer :: jik,jk_bz,jk_ibz,kb,nspinor
 integer :: nomega_tot,nq_summed,ispinor,ibsp,dimcprj_gw
 integer :: spad,spadc,spadc1,spadc2,irow,my_nbks
 integer :: comm,ndegs,wtqm,wtqp,mod10
 integer :: shift,isym_kgw,isym_ki,gwc_mgfft,use_padfft,gwc_fftalga,gwc_nfftot
 integer :: nG01d,nG02d,nG03d,ifft,spin
 real(dp) :: fact_sp,theta_mu_minus_e0i,tol_empty,norm,gw_gsq
 complex(dpc) :: ctmp,scprod,ph_mkgwt,ph_mkt
 logical :: iscompatibleFFT,q_is_gamma !,ltest
 character(len=500) :: msg
!arrays
 integer :: g0(3),spinor_padc(2,4),nbv_ks(Kmesh%nibz,Wfd%nsppol)
 integer,allocatable :: proc_distrb(:,:,:),coh_distrb(:,:,:,:)
 integer,allocatable :: degtab(:,:,:),grottb(:,:,:),gmg0(:,:)
 integer,allocatable :: igfftcg0(:),gw_gfft(:,:),gw_gbound(:,:),irottb(:,:),ktabr(:,:),igfft(:,:,:,:)
 integer :: got(Wfd%nproc)
 real(dp) :: ksum(3),kgw(3),kgw_m_ksum(3),q0(3),tsec(2),qbz(3)
 real(dp) :: spinrot_kbz(4),spinrot_kgw(4)
 real(dp),pointer :: qp_ene(:,:,:),qp_occ(:,:,:)
 !real(dp),allocatable :: ks_rhor_paw(:,:)
 complex(gwpc) :: sigcohme(Sigp%nsig_ab)
 complex(gwpc),allocatable :: vc_sqrt_qbz(:),rhotwg(:),rhotwgp(:),sigsex(:)
 complex(gwpc),allocatable :: epsm1_qbz(:,:,:)
 complex(gwpc),allocatable :: sigc_ket(:,:)
 complex(gwpc),allocatable :: rhotwg_ki(:,:)
 complex(gwpc),allocatable :: sigctmp(:,:)
 complex(gwpc),allocatable :: wfr_bdgw(:,:),wfr_sum(:),wf1swf2_g(:)
 complex(gwpc),pointer :: cg_jb(:),cg_sum(:)
 complex(dpc) :: ovlp(2)
 complex(dpc),allocatable :: sym_cme(:,:,:),sigc(:,:,:,:,:)
 logical :: rank_mask(Wfd%nproc),can_symmetrize(Wfd%nsppol)
 logical,allocatable :: bks_mask(:,:,:)
 type(sigijtab_t),pointer :: Sigcij_tab(:)
 type(Cprj_type),allocatable :: Cprj_kgw(:,:),Cprj_ksum(:,:)
 type(Paw_pwij_type),allocatable :: Pwij_qg(:),Pwij_fft(:)

!************************************************************************

 DBG_ENTER("COLL")
 !
 ! === Initial check ===
 ABI_CHECK(Sr%nomega_r==Sigp%nomegasr,"")
 ABI_CHECK(Sr%nomega4sd==Sigp%nomegasrd,"")
 ABI_CHECK(Sigp%npwvec==Gsph_Max%ng,"")

 ! === Initialize MPI variables === 
 comm  =Wfd%comm
                                                                                      
 ! === Initialize some values ===
 nspinor = Wfd%nspinor
 spinor_padc(:,:)=RESHAPE((/0,0,Sigp%npwc,Sigp%npwc,0,Sigp%npwc,Sigp%npwc,0/),(/2,4/))

 qp_ene => QP_BSt%eig(:,:,:)
 qp_occ => QP_BSt%occ(:,:,:)
 !
 ! Index of the GW point in the BZ array, its image in IBZ and time-reversal ===
 jk_bz=Sigp%kptgw2bz(ikcalc)
 call get_BZ_item(Kmesh,jk_bz,kgw,jk_ibz,isym_kgw,jik,ph_mkgwt)
 !$call get_IBZ_item(Kmesh,jk_ibz,kibz,wtk)
 spinrot_kgw=Cryst%spinrot(:,isym_kgw)
 !
 ib1=minbnd
 ib2=maxbnd

 write(msg,'(2a,3f8.3,2a,2(i3,a))')ch10,&
&  ' Calculating <nk|Sigma_c(omega)|nk> at k = ',kgw(:),ch10,&
&  ' bands n = from ',ib1,' to ',ib2,ch10
 call wrtout(std_out,msg,'COLL')

 !ltest = ALL(gwc_ngfft(1:3) == Wfd%ngfft(1:3))
 !ABI_CHECK(ltest," cannot change FFT on-the-fly yet")

 if ( ANY(gwc_ngfft(1:3) /= Wfd%ngfft(1:3)) ) call wfd_change_ngfft(Wfd,Cryst,Psps,gwc_ngfft)
 gwc_mgfft   = MAXVAL(gwc_ngfft(1:3))

 can_symmetrize = .FALSE.
 if (Sigp%symsigma>0) then
   can_symmetrize = .TRUE.
   if (Sigp%gwcalctyp >= 20) then
    do spin=1,Wfd%nsppol
      can_symmetrize(spin) = .not.bsym_failed(QP_sym(spin))
      if (.not.can_symmetrize(spin)) then
        write(msg,'(a,i0,4a)')" Symmetrization cannot be performed for spin: ",spin,ch10,&
&         " band classification encountered the following problem: ",ch10,TRIM(QP_sym(spin)%err_msg)
        MSG_WARNING(msg)
      end if
    end do
   end if
   ABI_CHECK(Sigp%nspinor==1,'Symmetrization with nspinor=2 not implemented')
 end if

 ABI_UNUSED(Pawang%l_max)

 mod10=MOD(Sigp%gwcalctyp,10)

 call timab(421,1,tsec) ! csigme(tot) Overall clock. TODO check this
 call timab(425,1,tsec) ! csigme (SigC)
 !
 !
 ! === Normalization of theta_mu_minus_e0i ===
 ! * If nsppol==2, qp_occ $\in [0,1]$
 SELECT CASE (Sigp%nsppol)
 CASE (1)
   fact_sp=half; tol_empty=0.01   ! below this value the state is assumed empty
   if (Sigp%nspinor==2) then
    fact_sp=one; tol_empty=0.005  ! below this value the state is assumed empty
   end if
 CASE (2)
   fact_sp=one; tol_empty=0.005   ! to be consistent and obtain similar results if a metallic
 CASE DEFAULT                     ! spin unpolarized system is treated using nsppol==2
   MSG_BUG('Wrong nsppol')
 END SELECT
 !
 ! Allocate arrays used to accumulate the matrix elements of \Sigma_c over
 ! k-points and bands. Note that for AC requires only the imaginary frequencies
 !nomega_sigc=Sr%nomega_r+Sr%nomega4sd
 !
 call timab(422,1,tsec) ! csigme(init0)
 !
 ! === Define the G-G0 shifts for the FFT of the oscillators ===
 ! * Sigp%mG0 gives the MAX G0 component to account for umklapp.
 ! * Note the size MAX(Sigp%npwx,Sigp%npwc).
 !
 nG01d = 2*Sigp%mG0(1)+1
 nG02d = 2*Sigp%mG0(2)+1
 nG03d = 2*Sigp%mG0(3)+1

 allocate(igfft(Sigp%npwc,nG01d,nG02d,nG03d))
 call cigfft(Sigp%mG0,Sigp%npwc,gwc_ngfft,Gsph_Max%gvec,igfft,ierr)

 if (ierr/=0) then
   write(msg,'(a,i4,3a)')&
&    ' Found ',ierr,' G-G0 vectors falling outside the FFT box. ',ch10,&
&    ' This is not allowed. Check mG0 and the setup of the GW FFT mesh. '
   MSG_ERROR(msg)
 end if
 !
 ! === Precalculate the FFT index of $(R^{-1}(r-\tau))$ ===
 ! * S=\transpose R^{-1} and k_BZ = S k_IBZ
 ! * irottb is the FFT index of $R^{-1} (r-\tau)$ used to symmetrize u_Sk.
 gwc_nfftot = PRODUCT(gwc_ngfft(1:3))
 allocate(irottb(gwc_nfftot,Cryst%nsym))
 call rotate_FFT_mesh(Cryst%nsym,Cryst%symrel,Cryst%tnons,gwc_ngfft,irottb,iscompatibleFFT)
 if (.not.iscompatibleFFT) then
   msg = "FFT mesh is not compatible with symmetries. Results might be affected by large errors!"
   MSG_WARNING(msg)
 end if

 allocate(ktabr(gwc_nfftot,Kmesh%nbz))
 do ik_bz=1,Kmesh%nbz
   isym=Kmesh%tabo(ik_bz)
   do ifft=1,gwc_nfftot
     ktabr(ifft,ik_bz)=irottb(ifft,isym)
   end do
 end do
 deallocate(irottb)
 !
 ! The number of occupied states for each point in the IBZ and spin.
 !nbv_ks(:,:) = COUNT(qp_occ>=tol_empty,DIM=1)  MG: g95 returns random numbers, likely a bug in the compiler
 do spin=1,Wfd%nsppol
   do ik_ibz=1,Kmesh%nibz
     nbv_ks(ik_ibz,spin) = COUNT(qp_occ(:,ik_ibz,spin)>=tol_empty) 
   end do
 end do
 !
 ! (b,k,s) mask for MPI distribution of the sum over occupied states in the BZ.
 allocate(bks_mask(Wfd%mband,Kmesh%nbz,Wfd%nsppol)); bks_mask=.FALSE. 
 do spin=1,Wfd%nsppol
   do ik_bz=1,Kmesh%nbz
      ik_ibz = Kmesh%tab(ik_bz)
      bks_mask(1:nbv_ks(ik_ibz,spin),ik_bz,spin) = .TRUE.
   end do
 end do
 ! 
 ! Distribute the individual terms of the sum over the BZ taking into account symmetries and MPI memory distribution.
 ! got is used to optimize the distribution if more than one node can calculate the same (b,k,s) element.
 got=0
 allocate(proc_distrb(Wfd%mband,Kmesh%nbz,Wfd%nsppol))
 call sigma_distribution(Wfd,Kmesh,Ltg_k,Qmesh,Sigp%nsppol,can_symmetrize,kgw,Sigp%mg0,my_nbks,&
&  proc_distrb,got,bks_mask,global=.TRUE.)

 deallocate(bks_mask)

 write(msg,'(a,i0,a)')" Will sum ",my_nbks," (b,k,s) occupied states in (COHSEX|SEX)."
 call wrtout(std_out,msg,'PERS')

 Sigcij_tab => Sigp%Sigcij_tab(ikcalc,1:Sigp%nsppol)

 if (mod10==SIG_COHSEX) then  ! Distribute the COHSEX terms, taking into account the symmetries of the Sigma_ij matrix. 
   allocate(coh_distrb(ib1:ib2,ib1:ib2,Kmesh%nbz,Wfd%nsppol))

   coh_distrb = xmpi_undefined_rank
   do spin=1,Wfd%nsppol
     do ik_bz=1,Kmesh%nbz
        if (ANY(proc_distrb(:,ik_bz,spin) /= xmpi_undefined_rank) ) then ! This BZ point will be calculated.
           rank_mask = .FALSE. ! To select only those nodes that will treat (k,s).
           do band=1,Wfd%mband
             rank = proc_distrb(band,ik_bz,spin) 
             if (rank /= xmpi_undefined_rank) rank_mask(rank+1)=.TRUE.
           end do
           do band2=ib1,ib2
             do irow=1,Sigcij_tab(spin)%col(band2)%size1   ! Looping over the upper triangle of sigma_ij with non-zero elements.
               band1 = Sigcij_tab(spin)%col(band2)%bidx(irow)
               idle = imin_loc(got,mask=rank_mask)
               got(idle) = got(idle)+1
               coh_distrb(band1,band2,ik_bz,spin) = idle-1
             end do
           end do
        end if
     end do
   end do

   write(msg,'(a,i0,a)')" will treat ",COUNT(coh_distrb==Wfd%my_rank)," COH terms."
   call wrtout(std_out,msg,'PERS')
 end if

 allocate(rhotwg_ki(Sigp%npwc*nspinor,minbnd:maxbnd)); rhotwg_ki=czero_gw
 allocate(rhotwg   (Sigp%npwc*nspinor))
 allocate(rhotwgp  (Sigp%npwc*nspinor))
 allocate(vc_sqrt_qbz(Sigp%npwc))

 ! Here I need two G spheres one for epsilon, the other one for vc, this part has to be cleaned
 allocate(grottb(Sigp%npwvec,Gsph_Max%timrev,Gsph_Max%nsym))
 grottb(:,:,:)=Gsph_Max%rottb(:,:,:)
 !
 ! === Additional allocations for PAW ===
 if (Psps%usepaw==1) then
   allocate(Cprj_ksum(Cryst%natom,nspinor))
   call cprj_alloc(Cprj_ksum,0,Wfd%nlmn_atm)
   !
   ! For COHSEX we need the onsite terms of the PW on the FFT mesh.
   ! * gw_gfft is the set of plane waves in the FFT Box for the oscillators.
   if (mod10==SIG_COHSEX) then
     allocate(gw_gfft(3,gwc_nfftot)); q0=zero
     call get_gftt(gwc_ngfft,q0,Cryst%gmet,gw_gsq,gw_gfft)
     allocate(Pwij_fft(Psps%ntypat))
     call init_paw_pwij(Pwij_fft,gwc_nfftot,(/zero,zero,zero/),gw_gfft,Cryst%rprimd,Dtfil,Psps,Pawtab,Paw_pwff)
   end if
 end if ! usepaw==1
 !
 ! === Calculate total number of frequencies and allocate related arrays ===
 ! * sigcme2 is used to accumulate the diagonal matrix elements over k-points and
 !   GW bands, used only in case of ppmodel 3 and 4 (TODO save memory)
 nomega_tot=Sr%nomega_r+Sr%nomega4sd

 allocate(sigctmp(nomega_sigc,Sigp%nsig_ab)); sigctmp=czero_gw
 allocate(sigc_ket(Sigp%npwc*nspinor,nomega_sigc))

 if (mod10==SIG_COHSEX) allocate(wf1swf2_g(gwc_nfftot*nspinor))

 ! Arrays storing the contribution given by the Hermitian/anti-Hermitian part of \Sigma_c
 !allocate(aherm_sigc_ket(Sigp%npwc*nspinor,nomega_sigc))
 !allocate( herm_sigc_ket(Sigp%npwc*nspinor,nomega_sigc))

 allocate(sigsex(Sigp%npwc))

 !allocate(sigcme_tmp(nomega_sigc,ib1:ib2,ib1:ib2,Sigp%nsppol*Sigp%nsig_ab))
 sigcme_tmp=czero

 allocate(sigc(2,nomega_sigc,ib1:ib2,ib1:ib2,Sigp%nsppol*Sigp%nsig_ab))
 sigc=czero
 !
 ! Here we divide the states where the QP energies are required into complexes. Note however that this approach is not
 ! based on group theory, and it might lead to spurious results in case of accidental degeneracies.
 !
 nq_summed=Kmesh%nbz
 if (Sigp%symsigma>0) then
   call print_little_group(Ltg_k,std_out,prtvol,'COLL')
   nq_summed=SUM(Ltg_k%ibzq(:))
   !
   ! === Find number of complexes and number of bands in each complex ===
   ! The tolerance is a little bit arbitrary (0.001 eV)
   ! It could be reduced, in particular in case of nearly accidental degeneracies
   allocate(degtab(ib1:ib2,ib1:ib2,Sigp%nsppol)); degtab=0
   do isppol=1,Sigp%nsppol
     do ib=ib1,ib2
       do jb=ib1,ib2 
        if (ABS(qp_ene(ib,jk_ibz,isppol)-qp_ene(jb,jk_ibz,isppol))<0.001/Ha_ev) then
          degtab(ib,jb,isppol)=1
        end if
       end do
     end do
   end do
!   if (ANY(degtab/=0)) then ! If two states do not belong to the same complex => matrix elements of v_xc differ
!     write(msg,'(a,3f8.3,a)')' Degenerate states at k-point = ( ',kgw(:),' ).'
!     call wrtout(std_out,msg,'COLL')
!     do isppol=1,Sigp%nsppol
!       do ib=ib1,ib2 
!         do jb=ib+1,ib2 
!           if (degtab(ib,jb,isppol)==1) then
!             write(msg,'(a,i2,a,i4,a,i4)')' (spin ',isppol,')',ib,' <====> ',jb
!             call wrtout(std_out,msg,'COLL')
!             if (ABS(Sr%vxcme(ib,jk_ibz,isppol)-Sr%vxcme(jb,jk_ibz,isppol))>ABS(tol6*Sr%vxcme(jb,jk_ibz,isppol))) then 
!               write(msg,'(7a)')&
!&                ' It seems that an accidental degeneracy is occurring at this k-point ',ch10,&
!&                ' In this case, using symsigma=1 might lead to spurious results as the algorithm ',ch10,&
!&                ' will treat these states as degenerate, and it won''t be able to remove the degeneracy. ',ch10,&
!&                ' In order to avoid this deficiency, run the calculation using symsigma=0'
!               MSG_WARNING(msg)
!             end if
!           end if
!         end do
!       end do
!     end do
!   end if
 end if !symsigma

#ifdef HAVE_CLIB
 call clib_progress_bar(-1,Kmesh%nbz)
#endif

 write(msg,'(2a,i6,a)')ch10,' calculation status ( ',nq_summed,' to be completed):'
 call wrtout(std_out,msg,'COLL')

 ! TODO if single q (ex molecule) dont allocate epsm1q, avoid waste of memory
 allocate(epsm1_qbz(Sigp%npwc,Sigp%npwc,1),STAT=istat)
 ABI_CHECK(istat==0,"out-of-memory in epsm1_qbz")

 allocate(igfftcg0(Sigp%npwc))
 !
 ! === Out-of-core solution for epsilon ===
 if (Er%mqmem==0) then
   MSG_COMMENT('Reading q-slices from file. Slower but less memory.')
 end if

 call timab(422,2,tsec)
 !
 ! ==========================================
 ! ==== Fat loop over k_i in the full BZ ====
 ! ==========================================
 !
 allocate(wfr_sum (gwc_nfftot*nspinor))

 do isppol=1,Sigp%nsppol

   if (ALL(proc_distrb(:,:,isppol)/=Wfd%my_rank)) CYCLE
   
   allocate(wfr_bdgw(gwc_nfftot*nspinor,ib1:ib2)) ! Load wavefunctions for GW corrections.
   do jb=ib1,ib2 
     call wfd_get_ur(Wfd,jb,jk_ibz,isppol,wfr_sum)
     wfr_bdgw(:,jb)=wfr_sum
   end do

   if (Wfd%usepaw==1) then ! * Load cprj for GW states, note the indexing.
     dimcprj_gw=nspinor*(ib2-ib1+1)
     allocate(Cprj_kgw(Cryst%natom,ib1:ib1+dimcprj_gw-1))
     call cprj_alloc(Cprj_kgw,0,Wfd%nlmn_atm)
     ibsp=ib1  
     do jb=ib1,ib2
       call wfd_get_cprj(Wfd,jb,jk_ibz,isppol,Cryst,Cprj_ksum,sorted=.FALSE.)
       call paw_symcprj(jk_bz,nspinor,1,Cryst,Kmesh,Psps,Pawtab,Pawang,Cprj_ksum) 
       call cprj_copy(Cprj_ksum,Cprj_kgw(:,ibsp:ibsp+(nspinor-1)))
       ibsp=ibsp+nspinor
     end do
   end if

   do ik_bz=1,Kmesh%nbz
     !
     ! === Parallelization over k-points and spin ===
     ! * For the spin there is another check in the inner loop
     if (ALL(proc_distrb(:,ik_bz,isppol)/=Wfd%my_rank)) CYCLE

     call timab(423,1,tsec) ! csigme (initq)
     !
     ! * Find the corresponding irreducible k-point
     call get_BZ_item(Kmesh,ik_bz,ksum,ik_ibz,isym_ki,iik,ph_mkt)
     spinrot_kbz(:)=Cryst%spinrot(:,isym_ki)

     ! * Identify q and G0 where q+G0=k_GW-k_i
     kgw_m_ksum=kgw-ksum
     call findqg0(iq_bz,g0,kgw_m_ksum,Qmesh%nbz,Qmesh%bz,Sigp%mG0)

     ! === Symmetrize the matrix elements ===
     ! * Sum only q"s in IBZ_k. In this case elements are weighted
     !   according to wtqp and wtqm. wtqm is for time-reversal.
     wtqp=1; wtqm=0
     !if (Sigp%symsigma>0) then
     if (can_symmetrize(isppol)) then
       if (Ltg_k%ibzq(iq_bz)/=1) CYCLE
       wtqp=0; wtqm=0
       do isym=1,Ltg_k%nsym_sg
         wtqp=wtqp+Ltg_k%wtksym(1,isym,iq_bz)
         wtqm=wtqm+Ltg_k%wtksym(2,isym,iq_bz)
       end do
     end if

#ifdef HAVE_CLIB
     call clib_progress_bar(ik_bz,Kmesh%nbz)
#else
     !$write(msg,'(2(a,i4),a,i3)')' csigme : ik_bz ',ik_bz,'/',Kmesh%nbz,' done by processor ',Wfd%my_rank
     !$call wrtout(std_out,msg,'PERS')
#endif
     !
     ! === Find the corresponding irred q-point ===
     call get_BZ_item(Qmesh,iq_bz,qbz,iq_ibz,isym_q,itim_q)
     q_is_gamma = (normv(qbz,Cryst%gmet,"G") < GW_TOL_W0)

     ! === Get the G-G0 shift for the FFT of the oscillators ===
     ig01 = g0(1) + Sigp%mG0(1)+1
     ig02 = g0(2) + Sigp%mG0(2)+1
     ig03 = g0(3) + Sigp%mG0(3)+1
     igfftcg0(1:Sigp%npwc) = igfft(:,ig01,ig02,ig03)

     ! Evaluate the tables needed for the padded FFT performed in rhotwg. Note that we have 
     ! to pass G-G0 to sphereboundary instead of G. as we need FFT results on the shifted G-sphere, 
     ! If Gamma is not inside G-G0 one has to disable FFT padding as sphereboundary will give wrong tables.
     use_padfft=0
     gwc_fftalga = gwc_ngfft(7)/100 !; gwc_fftalgc=MOD(gwc_ngfft(7),10)

     allocate(gmg0(3,Sigp%npwc))
     do ig=1,Sigp%npwc
       gmg0(:,ig) = Gsph_Max%gvec(:,ig)-g0
       if (ALL(gmg0(:,ig) == 0)) use_padfft=1
     end do

     !use_padfft=0  ! FIXME There is a bug somewhere in the padded-fft wrapper if we call sg_fftpad for G-->R
     if (gwc_fftalga/=3) use_padfft=0  ! Padded FFTW3 is safe instead!
     !use_padfft=1
     allocate(gw_gbound(2*gwc_mgfft+8,2*use_padfft))
     if (use_padfft==1) call sphereboundary(gw_gbound,1,gmg0,gwc_mgfft,Sigp%npwc)
     deallocate(gmg0)
     !
     ! Get PAW oscillator matrix elements $ <phj/r|e^{-i(q+G)}|phi/r> - <tphj/r|e^{-i(q+G)}|tphi/r> $ in packed form.
     if (Psps%usepaw==1) then
       allocate(Pwij_qg(Psps%ntypat))
       q0 = qbz !;if (q_is_gamma) q0 = (/0.00001_dp,0.00001_dp,0.00001_dp/) ! GW_Q0_DEFAULT
       call init_paw_pwij(Pwij_qg,Sigp%npwc,q0,Gsph_Max%gvec,Cryst%rprimd,Dtfil,Psps,Pawtab,Paw_pwff)
     end if

     if (Er%mqmem==0) then ! Read q-slice of epsilon^{-1}|chi0 in Er%epsm1(:,:,:,1) (much slower but less memory).
       call get_epsm1(Er,Vcp,0,0,accesswff,localrdwf_,comm,iqibzA=iq_ibz)
     end if
     !
     ! Only omega==0 for SEX or COHSEX 
     call Epsm1_symmetrizer(iq_bz,1,Sigp%npwc,Er,Gsph_Max,Qmesh,.TRUE.,epsm1_qbz)
     !
     ! === Get Fourier components of the Coulombian interaction in the BZ ===
     ! * In 3D systems, neglecting umklapp,  vc(Sq,sG)=vc(q,G)=4pi/|q+G|
     ! * The same relation holds for 0-D systems, but not in 1-D or 2D systems. It depends on S.
     do ig=1,Sigp%npwc
       vc_sqrt_qbz(grottb(ig,itim_q,isym_q))=Vcp%vc_sqrt(ig,iq_ibz)
     end do

     call timab(423,2,tsec) ! csigme (initq)
     !
     ! === Sum over bands ===
     !do isppol=1,Sigp%nsppol
     do ib=1,Sigp%nbnds
       !
       ! === Parallelism over spin ===
       ! * This processor has this k-point but what about spin?
       if (proc_distrb(ib,ik_bz,isppol)/=Wfd%my_rank) CYCLE
       !
       ! * Skip empty state ib for HF, SEX, and COHSEX.
       if (qp_occ(ib,ik_ibz,isppol)<tol_empty) CYCLE

       theta_mu_minus_e0i=fact_sp*qp_occ(ib,ik_ibz,isppol)

       call wfd_get_ur(Wfd,ib,ik_ibz,isppol,wfr_sum)

       if (Psps%usepaw==1) then ! Load cprj for point ksum, this spin or spinor and *THIS* band.
         ! TODO MG I could avoid doing this but I have to exchange spin and bands ???
         ! For sure there is a better way to do this!
         call wfd_get_cprj(Wfd,ib,ik_ibz,isppol,Cryst,Cprj_ksum,sorted=.FALSE.)
         call paw_symcprj(ik_bz,nspinor,1,Cryst,Kmesh,Psps,Pawtab,Pawang,Cprj_ksum) 
       end if

       do jb=ib1,ib2
         !
         ! === Get all <k-q,ib,s|e^{-i(q+G).r}|s,jb,k>, at once ===
         call rho_tw_g(Wfd%paral_kgb,nspinor,Sigp%npwc,gwc_nfftot,gwc_ngfft,1,use_padfft,igfftcg0,gw_gbound,&
&          wfr_sum       ,iik,ktabr(:,ik_bz),ph_mkt  ,spinrot_kbz,  &
&          wfr_bdgw(:,jb),jik,ktabr(:,jk_bz),ph_mkgwt,spinrot_kgw,&
&          nspinor,rhotwg_ki(:,jb),tim_fourdp,Wfd%MPI_enreg)

         if (Psps%usepaw==1) then ! Add on-site contribution, projectors are already in BZ !TODO Recheck this!
           shift=nspinor*(ib2-ib1+1)*(isppol-1)
           i2=jb+shift; if (nspinor==2) i2=(2*jb-1)
           spad=(nspinor-1)
           call paw_rho_tw_g(Sigp%npwc,nspinor,nspinor,Cryst%natom,Cryst%ntypat,Cryst%typat,Cryst%xred,gvec,&
&            Cprj_ksum(:,:),Cprj_kgw(:,i2:i2+spad),Pwij_qg,rhotwg_ki(:,jb))
         end if
         !
         ! === Multiply by the square root of the Coulomb term ===
         ! * In 3-D systems, the factor sqrt(4pi) is included)
         do ii=1,nspinor
           spad=(ii-1)*Sigp%npwc
           rhotwg_ki(spad+1:spad+Sigp%npwc,jb) = rhotwg_ki(spad+1:spad+Sigp%npwc,jb)*vc_sqrt_qbz(1:Sigp%npwc)
         end do
         !
         ! === Treat analytically the case q --> 0 ===
         ! * The oscillator is evaluated at q=O as it is considered constant in the small cube around Gamma
         !   while the Colulomb term is integrated out out out out.
         ! * In the scalar case we have nonzero contribution only if ib==jb
         ! * For nspinor==2 evalute <ib,up|jb,up> and <ib,dwn|jb,dwn>,
         !   impose orthonormalization since npwwfn might be < npwvec.
         if (ik_bz==jk_bz) then
           if (nspinor==1) then
             rhotwg_ki(1,jb)=czero_gw
             if (ib==jb) rhotwg_ki(1,jb)=CMPLX(SQRT(Vcp%i_sz),0.0_gwp)
           else
             ! TODO Recheck this!
             cg_sum  => Wfd%Wave(ib,ik_ibz,isppol)%ug
             cg_jb   => Wfd%Wave(jb,jk_ibz,isppol)%ug
             ctmp = xdotc(Wfd%npwwfn*Wfd%nspinor,cg_sum,1,cg_jb,1) 
             ovlp(1) = REAL(ctmp) 
             ovlp(2) = AIMAG(ctmp) 
             if (Psps%usepaw==1) then
               i2=(2*jb-1)
               ovlp = ovlp + paw_overlap(Cprj_ksum,Cprj_kgw(:,i2:i2+1),Cryst%typat,Pawtab)
             end if
             !ovlp(2) = -ovlp(1)
             !if (ib==jb) ovlp(2)=cone_gw-ovlp(1)
             if (ib==jb) then
               norm=DBLE(ovlp(1)+ovlp(2))
               ovlp(1)=DBLE(ovlp(1)/norm)
               ovlp(2)=DBLE(ovlp(2)/norm)
             else
               scprod=ovlp(1)+ovlp(2)
               ovlp(1)=ovlp(1)-scprod*half
               ovlp(2)=ovlp(2)-scprod*half
             end if
             rhotwg_ki(1          ,jb) = CMPLX(SQRT(Vcp%i_sz),0.0_gwp)*ovlp(1)
             rhotwg_ki(Sigp%npwc+1,jb) = CMPLX(SQRT(Vcp%i_sz),0.0_gwp)*ovlp(2)
           end if
         end if
       end do !jb  Got all matrix elements from minbnd up to maxbnd.

       do kb=ib1,ib2
         !
         ! === Get the ket \Sigma|\phi_{k,kb}> according to the method ===
         rhotwgp(:)=rhotwg_ki(:,kb)
         sigc_ket  = czero_gw

         ! SEX part. TODO add check on theta_mu_minus_e0i
         do ispinor=1,nspinor
           spadc=(ispinor-1)*Sigp%npwc
           call XGEMV('N',Sigp%npwc,Sigp%npwc,cone_gw,epsm1_qbz(:,:,1),Sigp%npwc,rhotwgp(1+spadc:),1,czero_gw,sigsex,1)

           sigsex(:)= -theta_mu_minus_e0i*sigsex(:)
           
           do io=1,nomega_tot ! nomega==1 as SEX is energy independent.
             sigc_ket(spadc+1:spadc+Sigp%npwc,io)=sigsex(:)
           end do
         end do
         !
         ! Loop over the non-zero row elements of this column.
         ! 1) If gwcalctyp<20 : only diagonal elements since QP==KS.
         ! 2) If gwcalctyp>=20:
         !     * Only off-diagonal elements connecting states with same character.
         !     * Only the upper triangle if HF, SEX, or COHSEX. 
         do irow=1,Sigcij_tab(isppol)%col(kb)%size1

           jb = Sigcij_tab(isppol)%col(kb)%bidx(irow)
           rhotwg=rhotwg_ki(:,jb)
           !
           ! === Calculate <\phi_j|\Sigma_c|\phi_k> ===
           ! * Different freqs according to method (AC or Perturbative), see nomega_sigc.


           do iab=1,Sigp%nsig_ab
             spadc1=spinor_padc(1,iab)
             spadc2=spinor_padc(2,iab)
             do io=1,nomega_sigc
               sigctmp(io,iab)=XDOTC(Sigp%npwc,rhotwg(spadc1+1:),1,sigc_ket(spadc2+1:,io),1)
             end do
           end do
           !
           ! TODO: save  wf1swf2_g to avoid having to recalculate it at each q-point.
           if (mod10==SIG_COHSEX) then  ! Evaluate Static COH. TODO add spinor.
             if ( coh_distrb(jb,kb,ik_bz,isppol) == Wfd%my_rank ) then
               coh_distrb(jb,kb,ik_bz,isppol) = xmpi_undefined_rank ! COH term is done only once for each k-point.
                                                                      ! It does not depend on the index ib summed over.
#if 1
               call calc_wfwfg(Wfd%MPI_enreg,Wfd%paral_kgb,tim_fourdp,ktabr(:,jk_ibz),jik,& ! why jk_ibz?
&                gwc_nfftot,gwc_ngfft,wfr_bdgw(:,jb),wfr_bdgw(:,kb),wf1swf2_g)
#else
               if (jik/=1) stop "jik"

               call calc_wfwfg(Wfd%MPI_enreg,Wfd%paral_kgb,tim_fourdp,ktabr(:,jk_bz),jik,&
&                gwc_nfftot,gwc_ngfft,wfr_bdgw(:,jb),wfr_bdgw(:,kb),wf1swf2_g)
#endif

               if (Psps%usepaw==1) then
                 shift=nspinor*(ib2-ib1+1)*(isppol-1)
                 i1=jb+shift
                 i2=kb+shift
                 if (nspinor==2) then
                   i1=(2*jb-1)
                   i2=(2*kb-1)
                 end if
                 spad=(nspinor-1)
                 call paw_rho_tw_g(gwc_nfftot,Sigp%nsig_ab,Sigp%nspinor,Cryst%natom,Cryst%ntypat,Cryst%typat,Cryst%xred,&
&                  gw_gfft,Cprj_kgw(:,i1:i1+spad),Cprj_kgw(:,i2:i2+spad),Pwij_fft,wf1swf2_g)
               end if

               call calc_coh(Sigp%nspinor,Sigp%nsig_ab,gwc_nfftot,gwc_ngfft,Sigp%npwc,gvec,wf1swf2_g,epsm1_qbz(:,:,1),&
&                vc_sqrt_qbz,Vcp%i_sz,iq_ibz,(jb==kb),sigcohme)

               do io=1,nomega_sigc ! Should be 1
                 sigctmp(io,:) = sigctmp(io,:)+sigcohme(:)
               end do
                 
             end if
           end if ! COHSEX
           !
           ! === Accumulate and, in case, symmetrize matrix elements of Sigma_c ===
           do iab=1,Sigp%nsig_ab
             is_idx=isppol; if (nspinor==2) is_idx=iab

             sigcme_tmp(:,jb,kb,is_idx)=sigcme_tmp(:,jb,kb,is_idx) + &
&              (wtqp+wtqm)*DBLE(sigctmp(:,iab)) + (wtqp-wtqm)*j_gw*AIMAG(sigctmp(:,iab))

             sigc(1,:,jb,kb,is_idx)=sigc(1,:,jb,kb,is_idx) + wtqp*      sigctmp(:,iab)
             sigc(2,:,jb,kb,is_idx)=sigc(2,:,jb,kb,is_idx) + wtqm*CONJG(sigctmp(:,iab))
             ! TODO this should be the contribution coming from the anti-hermitian part.
           end do
         end do !jb used to calculate matrix elements of $\Sigma$

       end do !kb to calculate matrix elements of $\Sigma$
     end do !ib
     !
     ! Deallocate k-dependent quantities.
     deallocate(gw_gbound, STAT=istat)
     if (Psps%usepaw==1) then
       call destroy_paw_pwij(Pwij_qg); deallocate(Pwij_qg)
     end if
   end do !ik_bz

   deallocate(wfr_bdgw)
   if (Wfd%usepaw==1) then
     call cprj_free(Cprj_kgw ); deallocate(Cprj_kgw)
   end if
 end do !isppol

 deallocate(igfftcg0)
 !
 ! === Gather contributions from all the CPUs ===
 call xbarrier_mpi(comm)
 call xsum_mpi(sigcme_tmp,comm,ierr)
 call xsum_mpi(sigc,comm,ierr)
 !
 ! === Multiply by constants ===
 ! * For 3D systems sqrt(4pi) is included in vc_sqrt_qbz ===
 sigcme_tmp = sigcme_tmp /(Cryst%ucvol*Kmesh%nbz)
 sigc       = sigc       /(Cryst%ucvol*Kmesh%nbz)
 !
 ! === If we have summed over the IBZ_q now we have to average over complexes ===
 ! * Presently only diagonal terms are considered
 ! * TODO it does not work if nspinor==2.

 do is=1,Sigp%nsppol
   if (can_symmetrize(is)) then
     allocate(sym_cme(nomega_tot,ib1:ib2,ib1:ib2)); sym_cme=czero
     !
     ! === Average over degenerate diagonal elements ===
     ! NOTE: frequencies for \Sigma_c(\omega) should be equal to avoid spurious results.
     ! another good reason to use a strict criterion for the tollerance on eigenvalues.
     do ib=ib1,ib2 
       ndegs=0
       do jb=ib1,ib2 
         if (degtab(ib,jb,is)==1) then
           sym_cme(:,ib,ib)=sym_cme(:,ib,ib)+SUM(sigc(:,:,jb,jb,is),DIM=1)
         end if
         ndegs=ndegs+degtab(ib,jb,is)
       end do
       sym_cme(:,ib,ib)=sym_cme(:,ib,ib)/ndegs
     end do

     if (Sigp%gwcalctyp >= 20) then 
       call symmetrize_me(QP_sym(is),ib1,ib2,sigc(:,1,:,:,is),sym_cme(1,:,:))
     end if
     !
     ! ==== Copy symmetrized values ====
     do ib=ib1,ib2 
       do jb=ib1,ib2 
         sigcme_tmp(:,ib,jb,is)=sym_cme(:,ib,jb)
       end do
     end do
     deallocate(sym_cme)
   end if
 end do

 ! Reconstruct the full sigma matrix from the upper triangle (only for HF, SEX and COHSEX)
 if (Sigp%gwcalctyp>=20 .and. sigma_is_herm(Sigp) ) then
   ABI_CHECK(nspinor==1,"cannot hermitianize non-collinear sigma!")
   do isppol=1,Sigp%nsppol
     do io=1,nomega_sigc
       call hermitianize(sigcme_tmp(io,:,:,isppol),"Upper")
     end do
   end do
 end if
 !
 ! ===========================
 ! ==== Deallocate memory ====
 ! ===========================
 if (Psps%usepaw==1) then
   if (allocated(gw_gfft))  deallocate(gw_gfft)
   call cprj_free(Cprj_ksum); deallocate(Cprj_ksum)
   if (allocated(Pwij_fft)) then
     call destroy_paw_pwij(Pwij_fft); deallocate(Pwij_fft)
   end if
 end if

 deallocate(igfft,ktabr)
 deallocate(wfr_sum)
 deallocate(rhotwg_ki,rhotwg,rhotwgp,vc_sqrt_qbz,sigc_ket)
 deallocate(epsm1_qbz)
 deallocate(sigctmp)
 deallocate(grottb)
 deallocate(sigc,sigsex)
 deallocate(proc_distrb)
 if (mod10==SIG_COHSEX) then  
   deallocate(wf1swf2_g)
   deallocate(coh_distrb)
 end if

 if (allocated(degtab ))  deallocate(degtab )


 call timab(425,2,tsec) ! csigme(SigC)
 call timab(421,2,tsec)

 DBG_EXIT("COLL")

end subroutine cohsex_me
!!***
