!{\src2tex{textfont=tt}}
!!****f* ABINIT/calc_exch
!! NAME
!!  calc_exch
!!
!! FUNCTION
!!  Calculate and write the excitonic Hamiltonian on an external binary file (Fortran file open
!!  in random mode) for subsequent treatment in the Bethe-Salpeter code.
!!
!! COPYRIGHT
!! Copyright (C) 1992-2009 EXC group (L.Reining, V.Olevano, F.Sottile, S.Albrecht, G.Onida)
!! Copyright (C) 2009-2010 ABINIT group (L.Reining, V.Olevano, F.Sottile, S.Albrecht, G.Onida, M.Giantomassi)
!! This file is distributed under the terms of the
!! GNU General Public License, see ~abinit/COPYING
!! or http://www.gnu.org/copyleft/gpl.txt .
!! For the initials of contributors, see ~abinit/doc/developers/contributors.txt .
!!
!! INPUTS
!!  BSp<excparam>=The parameters for the Bethe-Salpeter calculation. 
!!  Dtfil<datafiles_type>=File names and unit numbers.
!!  usepaw=1 if PAW is used, 0 otherwise.
!!  nspinor=Number of spinorial components.
!!  nsppol=Number of independent spin polarizations.
!!  Cryst<crystal_structure>=Info on the crystalline structure.
!!  Kmesh<BZ_mesh_type>=The list of k-points in the BZ, IBZ and symmetry tables.
!!  Qmesh<BZ_mesh_type>=The list of q-points for epsilon^{-1} and related symmetry tables. 
!!  ktabr(nfftot_osc,BSp%nkbz)=The FFT index of $(R^{-1}(r-\tau))$ where R is symmetry needed to obtains 
!!    the k-points from the irreducible image.  Used to symmetrize u_Sk where S = \transpose R^{-1}
!!  Gsph_Max<gvectors_type>=Info on the G-sphere used to describe wavefunctions and W (the largest one is actually stored).  
!!  igfft(Bsp%npwvec,2*BSp%mG0(1)+1,2*BSp%mG0(2)+1,BSp%mG0(3)+1)=Index of G-G0 in the FFT grid. The first 
!!    dimension refers to G, the remainig three dimensions define G0.
!!  Vcp<coulombian_type>=The Coulomb interaction in reciprocal space. A cutoff can be used
!!  Er<epsilonm1_results>=Data type gathering info and data for the symmetrized inverse dielectric matrix.
!!  trans<transition>=For each transition, it gives the index of the k-point, valence and conduction index
!!    as well as the transition energy.
!!  transtab(BSp%nkbz,BSp%nbndv,BSp%nbndc)=Correspondence between the tupla (k,v,c) and the transition index in trans.
!!  nfftot_osc=Total Number of FFT points used for the oscillator matrix elements.
!!  ngfft_osc(18)=Info on the FFT algorithm used to calculate the oscillator matrix elements.
!!  Psps<Pseudopotential_type>=Variables related to pseudopotentials
!!  Pawtab(Psps%ntypat)<pawtab_type>=PAW tabulated starting data.
!!  Pawang<pawang_type>=PAW angular mesh and related data.
!!  Paw_pwff(Cryst%ntypat*usepaw)<Paw_pwff_type>=Form factor used to calculate the onsite mat. elements of a plane wave.
!!  Wfd<wfs_descriptor>=Handler for the wavefunctions.
!!  rhxtwg_q0
!!  is_resonant
!!  fname
!!  spaceComm=MPI communicator.
!!  prtvol=Verbosity level.
!!
!! OUTPUT
!!  The excitonic Hamiltonian is saved on an external binary file (see below).
!!
!! SIDE EFFECTS 
!!  When the routine returns Er%epsm1 contains W (calculation of W is done in-place without having to allocated extra memory) 
!!
!! NOTES
!!  1) Version for K_V = K_C (q=0), thus KP_V = KP_C
!!  2) normal fcc supercell in bulk
!!  3) Omega = 0 only
!!  No exchange limit: use LDA energies in case 
!!  4) Hermiticity is exploited
!!  5) Symmetry of H-k-k' = H*kk' not used
!!  6) Coulomb term can be approssimateed as diagonal in G
!!  7) Valence bands treated from lomo on
!!
!!  The following files are used:
!!
!!  Input:
!!  Output (access direct):
!!    unit bsh_unt : h excitonic resonant Hamiltonian
!!
!! PARENTS
!!
!! CHILDREN
!!
!! SOURCE

#if defined HAVE_CONFIG_H
#include "config.h"
#endif

#include "abi_common.h"

!#define DEV_MG_DEBUG_MODE 1

subroutine calc_exch(BSp,Dtfil,usepaw,nspinor,nsppol,Cryst,Kmesh,Qmesh,ktabr,Gsph_Max,igfft,Vcp,&
& Wfd,Er,trans,transtab,nfftot_osc,ngfft_osc,Psps,Pawtab,Pawang,Paw_pwff,rhxtwg_q0,&
& is_resonant,fname,spaceComm,prtvol)

 use defs_basis
 use defs_datatypes
 use defs_abitypes
 use defs_gw
 use m_bs_defs
 use m_xmpi
 use m_errors
#if defined HAVE_MPI && defined HAVE_MPI2
 use mpi
#endif

 use m_gwdefs,       only : czero_gw
 use m_io_tools,     only : get_unit
 use m_blas,         only : xdotc
 use m_crystal,      only : crystal_structure
 use m_gsphere,      only : gvectors_type
 use m_coulombian,   only : coulombian_type
 use m_bz_mesh,      only : bz_mesh_type, get_BZ_item, get_BZ_diff, has_BZ_item, isamek, findqg0
 use m_screening,    only : make_W, Epsm1_symmetrizer
 use m_paw_pwij,     only : paw_pwff_type, paw_pwij_type, init_paw_pwij, destroy_paw_pwij, paw_rho_tw_g
 use m_wfs,          only : wfs_descriptor, wfd_get_ur, wfd_get_cprj, wfd_change_ngfft
 use m_oscillators,  only : rho_tw_g, sym_rhotwgq0

!This section has been created automatically by the script Abilint (TD).
!Do not modify the following lines by hand.
 use interfaces_14_hidewrite
 use interfaces_51_manage_mpi
 use interfaces_53_abiutil
 use interfaces_66_paw
!End of the abilint section

 implicit none

#if defined HAVE_MPI && defined HAVE_MPI1
 include 'mpif.h'
#endif

!Arguments ------------------------------------
!scalars
 integer,intent(in) :: usepaw,nspinor,nsppol,nfftot_osc,spaceComm,prtvol
 character(len=fnlen),intent(in) :: fname
 logical,intent(in) :: is_resonant
 type(excparam),intent(in) :: BSp
 type(Epsilonm1_results),intent(inout) :: Er
 type(BZ_mesh_type),intent(in) :: Kmesh,Qmesh
 type(crystal_structure),intent(in) :: Cryst
 type(Coulombian_type),intent(in) :: Vcp
 type(Gvectors_type),intent(in) :: Gsph_Max
 type(datafiles_type),intent(in) :: Dtfil
 type(Pseudopotential_type),intent(in) :: Psps
 type(pawang_type),intent(in) :: Pawang
 type(wfs_descriptor),intent(inout) :: Wfd
!arrays
 integer,intent(in) :: ngfft_osc(18)
 integer,intent(in) :: transtab(BSp%nkbz,BSp%nbndv,BSp%nbndc)
 integer,intent(in) :: igfft(Bsp%npwvec,2*BSp%mG0(1)+1,2*BSp%mG0(2)+1,2*BSp%mG0(3)+1)
 integer,intent(in) :: ktabr(nfftot_osc,BSp%nkbz)
 complex(gwpc),intent(in) :: rhxtwg_q0(BSp%npweps,BSp%nbnds,BSp%nbnds,BSp%nkibz)
 type(transition),intent(in) :: trans(BSp%nh)
 type(Pawtab_type),intent(in) :: Pawtab(Psps%ntypat)
 type(Paw_pwff_type),intent(in) :: Paw_pwff(Psps%ntypat*usepaw)

!Local variables ------------------------------
!scalars
 integer,parameter :: tim_fourdp=0,map2sphere=1,spin=1,paral_kgb=0
 integer :: ik_bz,ikp_bz
 integer :: indx_k_bz,indx_kp_bz,nomega,itpk_min,itpk_max
 integer :: shift,ibsp,spad,i1,i2
 integer :: dim_rtwg,bsh_unt 
#if defined HAVE_MPI_IO
 !integer :: my_spaceComm
 integer(kind=MPI_OFFSET_KIND) :: offset,rec_idx
 integer :: amode,fh,dpc_size
#else
 integer(i8b) :: offset,rec_idx
#endif
 integer :: nh,ig,ir
 integer :: ik_ibz,itim_k,ikp_ibz,itim_kp,isym_k,isym_kp
 integer :: iq_bz,iq_ibz,isym_q,itim_q,iqbz0 
 integer :: ig01,ig02,ig03,irank
 integer :: it,itp,iv,ivp,ic,icp
 integer :: irv,irc,irvp,ircp
 integer :: ib,istat,tot_hsize,my_hsize,sender
 integer :: recl4dpc,use_padfft
 integer :: ierr,nprocs,my_rank,master,mgfft_osc,fftalga_osc
 real(dp) :: faq
 complex(spc) :: http
 complex(spc) :: tenergy,tpenergy
 complex(spc) :: ctemp
 complex(dpc) :: ph_mkpt,ph_mkt
 logical,parameter :: time_reversal = .FALSE.
 logical :: use_mpiio
 logical :: do_coulomb_term,do_exchange_term,w_is_diagonal
 character(len=500) :: msg
 type(MPI_type) :: MPI_enreg_seq
!arrays
 integer :: bidx(2,4),g0(3)
 integer,pointer :: grottb(:,:,:)
 integer,allocatable :: igfftg0(:),ktabr_k(:),ktabr_kp(:),hsize_of(:),istart(:),istop(:)
 integer,allocatable :: gbound(:,:)
 integer,allocatable :: my_cols(:),gmg0(:,:)
 real(dp) :: kbz(3),kpbz(3),qbz(3),spinrot_k(4),spinrot_kp(4),kmkp(3)
 complex(spc),allocatable :: my_bsham(:),buffer(:)
 complex(gwpc),allocatable :: wfnr(:,:,:)
 complex(gwpc),allocatable :: W_qbz(:,:,:),Wstar(:,:),Wstardiag(:)
 complex(gwpc),allocatable :: vc_sqrt_qbz(:)
 complex(gwpc),allocatable :: rhotwg1(:),rhotwg2(:),rhxtwg_vpv(:),rhxtwg_cpc(:),ctccp(:)
 complex(gwpc),allocatable :: wfr_ckp(:),wfr_vkp(:),wfr_vk(:),wfr_ck(:)
 type(Cprj_type),allocatable :: Cprj_k(:,:),Cprj_kp(:,:)
 type(Cprj_type),allocatable :: Cp1(:,:),Cp2(:,:)
 type(Paw_pwij_type),allocatable :: Pwij_q(:)
#if defined HAVE_MPI_IO
 complex(dpc),allocatable :: buffer_dpc(:)
#endif
#if defined DEV_MG_DEBUG_MODE
 integer,allocatable :: ttp_check(:,:)
#endif

!************************************************************************

 DBG_ENTER("COLL")

 !TODO Add check on the content of Er%

 ABI_CHECK(nsppol==1,"nsppol==2 not coded")
 ABI_CHECK(nspinor==1,"nspinor==2 not coded")
 ABI_CHECK(nfftot_osc==PRODUCT(ngfft_osc(1:3)),"mismatch in FFT size")

 nprocs  = xcomm_size(spaceComm)
 my_rank = xcomm_rank(spaceComm)
 master=0

 if ( ANY(ngfft_osc(1:3) /= Wfd%ngfft(1:3)) ) call wfd_change_ngfft(Wfd,Cryst,Psps,ngfft_osc) 

 call initmpi_seq(MPI_enreg_seq) ! * Fake MPI_type for the sequential part.

 do_coulomb_term  = Bsp%COULOMBTERM
 do_exchange_term = Bsp%EXCHANGETERM
 w_is_diagonal    = BSp%WDIAG

 ! bidx gives the staring and final indeces used to loop over bands
 !
 !      (b3,b4)
 !         |... ...|
 ! (b1,b2) |... ...|

 ! Resonant matrix is given by
 !      (v',c')
 !       |... ...|
 ! (v,c) |... ...|


! Coupling matrix is given by
!       (c',v')
!       |... ...|
! (v,c) |... ...|

 if (is_resonant) then
   bidx(:,1) = (/BSp%lomo,BSp%homo/)  ! range for b1
   bidx(:,2) = (/BSp%lumo,BSp%nbnds/) ! range for b2
   bidx(:,3) = (/BSp%lomo,BSp%homo/)  ! range for b3
   bidx(:,4) = (/BSp%lumo,BSp%nbnds/) ! range for b4
 else 
   bidx(:,1) = (/BSp%lomo,BSp%homo/)  ! range for b1
   bidx(:,2) = (/BSp%lumo,BSp%nbnds/) ! range for b2
   bidx(:,3) = (/BSp%lumo,BSp%nbnds/) ! range for b3
   bidx(:,4) = (/BSp%lomo,BSp%homo/)  ! range for b4
 end if

 grottb => Gsph_Max%rottb(:,:,:) 

! Basic constants  
 dim_rtwg=1; faq = one/(Cryst%ucvol*BSp%nkbz)

 mgfft_osc = MAXVAL(ngfft_osc(1:3)) 
 fftalga_osc = ngfft_osc(7)/100 !; fftalgc_osc=MOD(ngfft_osc(7),10)

 ! Distribute the calculation of the matrix elements among the nodes.
 ! * istarst and istop gives the initial and final transition treated by each node.
 ! * my_hsize is the number of transitions treated by this node.
 ! * my_cols(1:2) gives the initial and final column treated by this node.

 nh=BSp%nh; tot_hsize=(nh*nh+nh)/2 ! Only the upper triangle is used.

 allocate(istart(0:nprocs-1),istop(0:nprocs-1))
 call split_work2(tot_hsize,nprocs,istart,istop) !check this but it should be OK.

 allocate(hsize_of(0:nprocs-1))
 hsize_of=0
 do irank=0,nprocs-1
   if (istop(irank)>=istart(irank)) hsize_of(irank) = istop(irank)-istart(irank)+1
 end do

 my_hsize = hsize_of(my_rank)
 ABI_CHECK(my_hsize>0,"One of the processors has zero transitions!")

 allocate(my_cols(2)); my_cols=0
 do itp=1,nh 
   do it=1,itp 
     ir = it + itp*(itp-1)/2
     if (ir==istart(my_rank)) my_cols(1) = itp
     if (ir==istop (my_rank)) my_cols(2) = itp
   end do
 end do

! * Announce the treatment of bands by each node.
 write(msg,'(4(a,i0))')&
&  ' Treating ',my_hsize,' matrix elements, from column ',my_cols(1),' up to column ',my_cols(2),' by node ',my_rank
 call wrtout(std_out,msg,'PERS')

 if (usepaw==1) then
   allocate(Cprj_k (Cryst%natom,nspinor*Bsp%nbnds)); call cprj_alloc(Cprj_k, 0,Wfd%nlmn_atm)
   allocate(Cprj_kp(Cryst%natom,nspinor*Bsp%nbnds)); call cprj_alloc(Cprj_kp,0,Wfd%nlmn_atm)
   allocate(Cp1(Wfd%natom,Wfd%nspinor)); call cprj_alloc(Cp1,0,Wfd%nlmn_atm)
   allocate(Cp2(Wfd%natom,Wfd%nspinor)); call cprj_alloc(Cp2,0,Wfd%nlmn_atm)
 end if
 !
 ! Get wavefunctions in r-space
 write(msg,'(a,f8.2,a)')&
&  ' Allocating real-space wavefunctions, Memory required ',nfftot_osc*BSp%nbnds*BSp%nkibz*gwpc*b2Mb,' Mb.'
 call wrtout(std_out,msg,"COLL")

 ! FIXME this part should be removed, Everything should be obtained from Wfd
 allocate(wfnr(nfftot_osc*Wfd%nspinor,BSp%nbnds,BSp%nkibz),STAT=istat)
 ABI_CHECK(istat==0,'out of memory in wfnr')

 ! SPIN support is missing here
 do ik_ibz=1,BSp%nkibz
   do ib=1,BSp%nbnds
     call wfd_get_ur(Wfd,ib,ik_ibz,spin,wfnr(:,ib,ik_ibz))
   end do
 end do

 allocate(wfr_ckp(Wfd%nspinor*nfftot_osc))
 allocate(wfr_vkp(Wfd%nspinor*nfftot_osc))
 allocate(wfr_ck (Wfd%nspinor*nfftot_osc))
 allocate(wfr_vk (Wfd%nspinor*nfftot_osc))
      
 iqbz0=0 ! Identify q==0
 do iq_bz=1,Qmesh%nbz
   if (ALL(ABS(Qmesh%bz(:,iq_bz))<tol3)) iqbz0 = iq_bz
 end do
 ABI_CHECK(iqbz0/=0,"q=0 not found")

 allocate(igfftg0(BSp%npweps))
 allocate(ktabr_k(nfftot_osc),ktabr_kp(nfftot_osc))
 allocate(rhxtwg_vpv(BSp%npweps),STAT=istat)
 allocate(rhxtwg_cpc(BSp%npweps),STAT=istat)

 if (is_resonant) then
   write(msg,'(a,f8.1,a)')&
&   ' Calculating excitonic Hamiltonian. Allocated buffer ', my_hsize*spc*b2Mb,' Mb. '
 else 
   write(msg,'(a,f8.1,a)')&
&   ' Calculating excitonic Hamiltonian. Allocated buffer ', my_hsize*spc*b2Mb,' Mb. '
 end if
 call wrtout(std_out,msg,"COLL")

 allocate(my_bsham(istart(my_rank):istop(my_rank)), STAT=istat)
 ABI_CHECK(istat==0,'Not enough memory for exc Hamiltonian')
 my_bsham = czero
  
 if (do_coulomb_term) then ! Construct Coulomb term.

   msg=" Calculating direct Coulomb term using full W_{GG'}"
   if (w_is_diagonal) msg=" Calculating direct Coulomb term using diagonal only approximation for W_{GG'}"
   call wrtout(std_out,msg,"COLL")

   allocate(ctccp(BSp%npweps))
   allocate(Wstardiag(BSp%npweps))
   allocate(W_qbz(BSp%npweps,BSp%npweps,1), STAT=istat) 
   allocate(Wstar(BSp%npweps,BSp%npweps),STAT=istat)

#if defined DEV_MG_DEBUG_MODE
   allocate(ttp_check(nh,nh)); ttp_check=0
#endif

   do ikp_bz=1,BSp%nkbz ! Loop over kp  
     ! NOTE: this way of looping is good for bulk but it's not optimal in the
     !       case of systems sampled only at Gamma e.g. isolated systems in which
     !       one should take advantage of Hermiticity by looping over c-v !!!!

     ! Check whether (ikp_bz,vp,cp) belongs to the set of columns treated by me for some vp,cp
     itpk_min = MINVAL(transtab(ikp_bz,:,:))
     itpk_max = MAXVAL(transtab(ikp_bz,:,:))
     if ( my_cols(2)<itpk_min .or. my_cols(1)>itpk_max) CYCLE 

     write(msg,'(3(a,i5))')" status: ",ikp_bz,"/",BSp%nkbz," done by node ",my_rank
     call wrtout(std_out,msg,"PERS")

     ! * Get ikp_ibz, non-symmorphic phase, ph_mkpt, and symmetries from ikp_bz.
     call get_BZ_item(Kmesh,ikp_bz,kpbz,ikp_ibz,isym_kp,itim_kp,ph_mkpt)

     ktabr_kp(:) = ktabr(:,ikp_bz)
     spinrot_kp(:)=Cryst%spinrot(:,isym_kp)

     if (usepaw==1) then ! Load cprj for this kp in the BZ. Here I assume that each node has all the bands.
       shift     =nspinor*BSp%nbnds*Kmesh%nbz*(spin-1)
       indx_kp_bz=nspinor*BSp%nbnds*(ikp_bz-1)+shift
       ibsp=0
       do ib=1,Bsp%nbnds
         call wfd_get_cprj(Wfd,ib,ikp_ibz,spin,Cryst,Cp1,sorted=.FALSE.)
         call paw_symcprj(ikp_bz,Wfd%nspinor,1,Cryst,Kmesh,Psps,Pawtab,Pawang,Cp1) 
         ibsp = ibsp + nspinor
         call cprj_copy(Cp1,Cprj_kp(:,ibsp:ibsp+(nspinor-1)))
       end do
     end if

     !do ik_bz=1,BSp%nkbz ! Loop over k
     do ik_bz=1,ikp_bz ! Loop over k
       !
       ! * Get ik_ibz, non-symmorphic phase, ph_mkt, and symmetries from ik_bz
       call get_BZ_item(Kmesh,ik_bz,kbz,ik_ibz,isym_k,itim_k,ph_mkt)
                                                                               
       ktabr_k(:) = ktabr(:,ik_bz)
       spinrot_k(:)=Cryst%spinrot(:,isym_k)
       !if(itim_k==2) CYCLE ! time-reversal or not       
                                                                               
       if (usepaw==1) then ! Load cprj for this k in the BZ.
         ! * Do not care of umklapp G0 in k-q as the phase is already included.
         shift    =nspinor*BSp%nbnds*Kmesh%nbz*(spin-1)
         indx_k_bz=nspinor*BSp%nbnds*(ik_bz-1)+shift
         ibsp=0
         do ib=1,Bsp%nbnds
           call wfd_get_cprj(Wfd,ib,ik_ibz,spin,Cryst,Cp2,sorted=.FALSE.)
           call paw_symcprj(ik_bz,Wfd%nspinor,1,Cryst,Kmesh,Psps,Pawtab,Pawang,Cp2) 
           ibsp = ibsp + nspinor
           call cprj_copy(Cp2,Cprj_k(:,ibsp:ibsp+(nspinor-1)))
         end do
       end if

       ! * Find q = K-KP-G0 in the full BZ.
       kmkp = Kmesh%bz(:,ik_bz) - Kmesh%bz(:,ikp_bz)
       call findqg0(iq_bz,g0,kmkp,Qmesh%nbz,Qmesh%bz,BSp%mG0)

       ! * Get the G-G0 shift for the FFT of the oscillators.
       ig01 = g0(1) + BSp%mG0(1)+1
       ig02 = g0(2) + BSp%mG0(2)+1
       ig03 = g0(3) + BSp%mG0(3)+1
       igfftg0(1:BSp%npweps) = igfft(1:BSp%npweps,ig01,ig02,ig03)

       ! Evaluate the tables needed for the padded FFT performed in rhotwg. Note that we have 
       ! to pass G-G0 to sphereboundary instead of G as we need FFT results on the shifted G-sphere, 
       ! If Gamma is not inside G-G0 one has to disable FFT padding as sphereboundary will give wrong tables.
       !
       use_padfft=0      
       allocate(gmg0(3,BSp%npweps))
       do ig=1,BSp%npweps
         gmg0(:,ig) = Gsph_Max%gvec(:,ig)-g0
         if (ALL(gmg0(:,ig) == 0)) use_padfft=1
       end do

       if (fftalga_osc/=3) use_padfft=0  ! Padded FFT with SG is not safe.
       allocate(gbound(2*mgfft_osc+8,2*use_padfft))
       if (use_padfft==1) call sphereboundary(gbound,1,gmg0,mgfft_osc,BSp%npweps)
       deallocate(gmg0)

       ! * Get iq_ibz, and symmetries from iq_bz
       call get_BZ_item(Qmesh,iq_bz,qbz,iq_ibz,isym_q,itim_q)

       ! Symmetrize W (omega=0) 
       nomega=1
       call Epsm1_symmetrizer(iq_bz,nomega,BSp%npweps,Er,Gsph_Max,Qmesh,.FALSE.,W_qbz) 

       Wstar(:,:) = CONJG(W_qbz(:,:,1))
       do ig=1,BSp%npweps
         Wstardiag(ig) = Wstar(ig,ig)
       end do

       ! === Evaluate oscillator matrix elements ===
       ! * $ <phj/r|e^{-i(q+G)}|phi/r> - <tphj/r|e^{-i(q+G)}|tphi/r> $ in packed form.
       if (usepaw==1.and.ik_bz/=ikp_bz) then
         allocate(Pwij_q(Cryst%ntypat))
         call init_paw_pwij(Pwij_q,BSp%npweps,Qmesh%bz(:,iq_bz),Gsph_Max%gvec,Cryst%rprimd,Dtfil,Psps,Pawtab,Paw_pwff)
       end if

       ! =======================================
       ! === Loop over the four band indeces ===
       ! =======================================
       !
       do ic=bidx(1,2),bidx(2,2)     !do ic=BSp%lumo,BSp%nbnds
         irc = ic - bidx(1,2) + 1

         do icp=bidx(1,4),bidx(2,4)  !do icp=BSp%lumo,BSp%nbnds
           ircp = icp - bidx(1,4) + 1

           ! * Calculate matrix-elements rhxtwg_cpc
           !
           if (ik_bz==ikp_bz) then ! Already in memory.
             rhxtwg_cpc(:) = sym_rhotwgq0(itim_k,isym_k,dim_rtwg,BSp%npweps,rhxtwg_q0(:,icp,ic,ik_ibz),Gsph_Max)

           else ! Calculate matrix element from wfr.
#if 0
             ! TODO: use this but change the loops.
             call wfd_get_ur(Wfd,icp,ikp_ibz,1,wfr_ckp) ! SPIN support is missing
             call wfd_get_ur(Wfd,ic , ik_ibz,1,wfr_ck )

             call rho_tw_g(paral_kgb,nspinor,BSp%npweps,nfftot_osc,ngfft_osc,map2sphere,use_padfft,igfftg0,gbound,&
&              wfr_ckp,itim_kp,ktabr_kp,ph_mkpt,spinrot_kp,&
&              wfr_ck ,itim_k ,ktabr_k ,ph_mkt ,spinrot_k ,&
&              dim_rtwg,rhxtwg_cpc,tim_fourdp,MPI_enreg_seq)
#else
             call rho_tw_g(paral_kgb,nspinor,BSp%npweps,nfftot_osc,ngfft_osc,map2sphere,use_padfft,igfftg0,gbound,&
&              wfnr(:,icp,ikp_ibz),itim_kp,ktabr_kp,ph_mkpt,spinrot_kp,&
&              wfnr(:,ic ,ik_ibz ),itim_k ,ktabr_k ,ph_mkt ,spinrot_k ,&
&              dim_rtwg,rhxtwg_cpc,tim_fourdp,MPI_enreg_seq)
#endif 
             
             if (usepaw==1) then ! Add PAW onsite contribution.   
               spad=(nspinor-1)
               i1=icp; if (nspinor==2) i1=(2*icp-1)
               i2=ic ; if (nspinor==2) i2=(2*ic-1)
               call paw_rho_tw_g(Bsp%npweps,dim_rtwg,nspinor,Cryst%natom,Cryst%ntypat,Cryst%typat,Cryst%xred,Gsph_Max%gvec,&
                 Cprj_kp(:,i1:i1+spad),Cprj_k(:,i2:i2+spad),Pwij_q,rhxtwg_cpc)
             end if
           end if

           ! Prepare sum_GG' rho_c'c*(G) W_qbz(G,G') rho_v'v(G')
           ! first sum on G: sum_G rho_c'c(G) W_qbz*(G,G')    (W_qbz conjugated)
           if (w_is_diagonal) then
             ctccp(:) = rhxtwg_cpc * Wstardiag
           else
             ctccp(:) = MATMUL(rhxtwg_cpc,Wstar(:,:))
           end if
           
           do iv=bidx(1,1),bidx(2,1)    !do iv=BSp%lomo,BSp%homo
             irv = iv - bidx(1,1) + 1
             it = transtab(ik_bz,irv,irc); if (it==0) CYCLE ! ir-uv-cutoff
             tenergy = trans(it)%en
             
             do ivp=bidx(1,3),bidx(2,3) !do ivp=BSp%lomo,BSp%homo
               irvp = ivp - bidx(1,3) + 1

               if (is_resonant) then
                 itp = transtab(ikp_bz,irvp,ircp) 
               else ! have to exchange band indeces
                 itp = transtab(ikp_bz,ircp,irvp) 
               end if
               
               if (itp==0) CYCLE ! ir-uv-cutoff

               ! FIXME Temporary work around, when ikp_bz == ik it might happen that itp<it
               ! should rewrite the loops using contracted k-dependent indeces for bands
               if (itp<it) CYCLE 

               !if (itp<my_cols(1).or.itp>my_cols(2)) CYCLE 
               ir = it + itp*(itp-1)/2
               if (ir<istart(my_rank).or.ir>istop(my_rank)) CYCLE

               tpenergy = trans(itp)%en
               if (ABS(DBLE(tenergy - tpenergy)) > BSp%stripecut) CYCLE ! stripe

               ! ============================================
               ! === Calculate matrix elements rhxtwg_vpv ===
               ! ============================================
               if (ik_bz==ikp_bz) then ! Already in memory.

                 rhxtwg_vpv(:) = sym_rhotwgq0(itim_k,isym_k,dim_rtwg,BSp%npweps,rhxtwg_q0(:,ivp,iv,ik_ibz),Gsph_Max)

               else ! Calculate matrix element from wfr.
#if 0
                 ! TODO: use this but change the loops.
                 call wfd_get_ur(Wfd,ivp,ikp_ibz,1,wfr_vkp) ! SPIN support is missing
                 call wfd_get_ur(Wfd,iv , ik_ibz,1,wfr_vk )

                 call rho_tw_g(paral_kgb,nspinor,BSp%npweps,nfftot_osc,ngfft_osc,map2sphere,use_padfft,igfftg0,gbound,&
&                   wfr_vkp,itim_kp,ktabr_kp,ph_mkpt,spinrot_kp,&
&                   wfr_vk ,itim_k ,ktabr_k ,ph_mkt ,spinrot_k ,&
&                   dim_rtwg,rhxtwg_vpv,tim_fourdp,MPI_enreg_seq)
#else
                 call rho_tw_g(paral_kgb,nspinor,BSp%npweps,nfftot_osc,ngfft_osc,map2sphere,use_padfft,igfftg0,gbound,&
&                   wfnr(:,ivp,ikp_ibz),itim_kp,ktabr_kp,ph_mkpt,spinrot_kp,&
&                   wfnr(:,iv,  ik_ibz),itim_k ,ktabr_k ,ph_mkt ,spinrot_k ,&
&                   dim_rtwg,rhxtwg_vpv,tim_fourdp,MPI_enreg_seq)
#endif 
                
                 if (usepaw==1) then ! Add PAW onsite contribution.
                   spad=(nspinor-1)
                   i1=ivp; if (nspinor==2) i1=(2*ivp-1)
                   i2=iv ; if (nspinor==2) i2=(2*iv-1)
                   call paw_rho_tw_g(Bsp%npweps,dim_rtwg,nspinor,Cryst%natom,Cryst%ntypat,Cryst%typat,Cryst%xred,&
&                    Gsph_Max%gvec,Cprj_kp(:,i1:i1+spad),Cprj_k(:,i2:i2+spad),Pwij_q,rhxtwg_vpv)
                 end if
               end if

               ! sum on G2: sum_G2 rho_c'c(G) W_qbz(G,G') rho_v'v(G')
               http = -faq * XDOTC(BSp%npweps,ctccp,1,rhxtwg_vpv,1)

               ! Save result taking into account the symmetry of the matrix.
               ! Note that the diagonal of the resonant block is not forced to be real 
               if (itp >= it) then
                 ir = it + itp*(itp-1)/2
                 if (ir<istart(my_rank).or.ir>istop(my_rank)) then 
                   write(msg,'(a,3i5)')"Gonna SIGFAULT",ir,istart(my_rank),istop(my_rank)
                   MSG_PERS_ERROR(msg)
                 end if
                 my_bsham(ir) = http
               else
                 MSG_ERROR("itp < it but You should not be here")
                 ir = itp + it*(it-1)/2
                 if (is_resonant) then
                   my_bsham(ir) = CONJG(http)
                 else ! Code for coupling block
                   my_bsham(ir) =       http
                 end if
               end if

#ifdef DEV_MG_DEBUG_MODE
               ttp_check(it,itp) = ttp_check(it,itp)+1
#endif
             end do ! ivp
           end do ! iv
         end do ! icp
       end do ! ic

       deallocate(gbound, STAT=istat)     

       if (usepaw==1.and.ik_bz/=ikp_bz) then ! Free the onsite contribution for this q.
         call destroy_paw_pwij(Pwij_q); deallocate(Pwij_q)
       end if

     end do ! ik_bz
   end do ! Fat loop over ikp_bz

#if defined DEV_MG_DEBUG_MODE
   do itp=1,nh
     do it=1,nh
      ir = it + itp*(itp-1)/2
       if (itp>=it .and. ttp_check(it,itp) /= 1) then
         if (ir>=istart(my_rank).and.ir<=istop(my_rank)) then 
           write(std_out,*)"WARNING upper triangle is not 1 ",it,itp,ttp_check(it,itp)
           write(std_out,*)TRIM(repr_trans(trans(it)))
           write(std_out,*)TRIM(repr_trans(trans(itp)))
         end if
       end if
       if (itp< it .and. ttp_check(it,itp) /= 0) then
         write(std_out,*)"WARNING lower triangle is not 0 ",it,itp,ttp_check(it,itp)
         write(std_out,*)TRIM(repr_trans(trans(it)))
         write(std_out,*)TRIM(repr_trans(trans(itp)))
       end if
     end do
   end do
   ierr = SUM(SUM(ttp_check,DIM=2),DIM=1)
   if (ierr/=my_hsize) then 
    write(msg,'(a,2i5)')"ierr/=my_hsize",ierr,my_hsize
    MSG_PERS_ERROR(msg)
   end if
   deallocate(ttp_check)
#endif

   deallocate(ctccp)
   deallocate(Wstardiag,W_qbz,Wstar)

   call wrtout(std_out,' Coulomb term completed',"COLL")
 end if ! do_coulomb_term

! =====================
! === Exchange term ===
! =====================
! TODO might add treatment of <psi|q+G|psi> for q+G -> 0
 if (do_exchange_term) then
   call wrtout(std_out," Calculating exchange term","COLL")

   allocate(rhotwg1(BSp%npweps),rhotwg2(BSp%npweps))
   allocate(vc_sqrt_qbz(BSp%npwvec))

   ! * Get iq_ibz, and symmetries from iq_bz.
   iq_bz = iqbz0 ! q = 0 -> iqbz0
   call get_BZ_item(Qmesh,iq_bz,qbz,iq_ibz,isym_q,itim_q)

   ! * Set up table of |q(BZ)+G|
   if (iq_ibz==1) then 
     do ig=1,BSp%npwvec
      vc_sqrt_qbz(grottb(ig,itim_q,isym_q))=Vcp%vcqlwl_sqrt(ig,1)
     end do
   else 
     MSG_ERROR("iq_ibz should be 1")
   end if

   do itp=1,nh ! Loop over transition tp = (kp,vp,cp)

     if (itp<my_cols(1) .or. itp>my_cols(2)) CYCLE ! I dont have this column.
     tpenergy = trans(itp)%en
     ikp_bz   = trans(itp)%k
     ivp      = trans(itp)%v
     icp      = trans(itp)%c
                                                                                                           
     ikp_ibz = Kmesh%tab (ikp_bz)
     isym_kp = Kmesh%tabo(ikp_bz)
     itim_kp = (3-Kmesh%tabi(ikp_bz))/2
                                                                                                           
     if (is_resonant) then 
       rhotwg2(:) = sym_rhotwgq0(itim_kp,isym_kp,dim_rtwg,BSp%npweps,rhxtwg_q0(:,ivp,icp,ikp_ibz),Gsph_Max)
     else ! Code for coupling block.
       rhotwg2(:) = sym_rhotwgq0(itim_kp,isym_kp,dim_rtwg,BSp%npweps,rhxtwg_q0(:,icp,ivp,ikp_ibz),Gsph_Max)
     end if

     do it=1,itp ! Loop over transition t = (k,v,c)
       ir = it + itp*(itp-1)/2
       if (ir<istart(my_rank) .or. ir>istop(my_rank)) CYCLE

       tenergy = trans(it)%en
       if (ABS(DBLE(tenergy - tpenergy)) > BSp%stripecut) CYCLE  ! Stripe cutoff
       ik_bz   = trans(it)%k
       iv      = trans(it)%v
       ic      = trans(it)%c

       ik_ibz = Kmesh%tab(ik_bz)
       isym_k = Kmesh%tabo(ik_bz)
       itim_k = (3-Kmesh%tabi(ik_bz))/2

       !if (itim_k==2) CYCLE ! time-reversal or not       

       rhotwg1(:) = sym_rhotwgq0(itim_k,isym_k,dim_rtwg,BSp%npweps,rhxtwg_q0(:,iv,ic,ik_ibz),Gsph_Max)
       do ig=2,BSp%npweps
         rhotwg1(ig) = rhotwg1(ig) * vc_sqrt_qbz(ig) * vc_sqrt_qbz(ig) 
       end do

       ! sum over G/=0
       ctemp = XDOTC(BSp%npweps-1,rhotwg1(2:),1,rhotwg2(2:),1)
       ctemp = two * ctemp * faq

       my_bsham(ir) = my_bsham(ir) + ctemp

     end do ! it
   end do ! itp

   deallocate(rhotwg1,rhotwg2)
   deallocate(vc_sqrt_qbz)

   call wrtout(std_out," Exchange term finished ","COLL")
 end if ! do_exchange_term
      
! =====================
! === Diagonal term ===
! =====================
 if (is_resonant) then
   call wrtout(std_out," Adding diagonal term","COLL")
   do it=1,nh
     ir = it + it*(it-1)/2
     if (ir>=istart(my_rank) .and. ir<=istop(my_rank)) my_bsham(ir) = my_bsham(ir) + trans(it)%en
   end do
 end if

!----------------------------------------------------------------------
!     fill up hamiltonian by time-reversal
!----------------------------------------------------------------------

 if (time_reversal) then
! print *, 'fill hamiltonian by time reversal'
! do ik_bz = 1, BSp%nkbz
!  if(Kmesh%tabi(ik_bz) == 1) CYCLE
!  iki = kbzrotm1(ik_bz)
!  print *, 'ik_bz = ', ik_bz, ' from  ', iki
!  do ikp_bz = ik_bz, BSp%nkbz
!   ikpi = kbzrotm1(ikp_bz)
!   print *, 'ikp_bz = ', ikp_bz, ' from  ', ikpi
!  do iv = BSp%lomo, BSp%homo
!   do ic = BSp%lumo, BSp%nbnds
!    ib = (ik_bz-1)*BSp%nbndv*BSp%nbndc+(iv-BSp%lomo)*BSp%nbndc + ic-BSp%homo
!    ibi = (iki-1)*BSp%nbndv*BSp%nbndc+(iv-BSp%lomo)*BSp%nbndc + ic-BSp%homo
!     do ivp = BSp%lomo, BSp%homo
!      do icp = BSp%lumo, BSp%nbnds
!      ibp = (ikp_bz-1)*BSp%nbndv*BSp%nbndc+(ivp-BSp%lomo)*BSp%nbndc + icp-BSp%homo
!      ibpi = (ikpi-1)*BSp%nbndv*BSp%nbndc+(ivp-BSp%lomo)*BSp%nbndc + icp-BSp%homo
!      my_bsham(ib,ibp) = CONJG(my_bsham(ibi,ibpi))
!      if(ik_bz.ne.ikp_bz) my_bsham(ibp,ib) = my_bsham(ibi,ibpi)
!      end do
!     end do
!    end do
!   end do
!  end do
! end do
 end if ! time_reversal

 if (prtvol>10.and.nh<3000) then 
   msg=' Coupling Hamiltonian matrix elements: '
   if (is_resonant) msg=' Reasonant Hamiltonian matrix elements: '
   call wrtout(std_out,msg,"PERS")
   call wrtout(std_out,'    k  v  c       k" v" c        H',"PERS")
   do itp=1,nh      
     ikp_bz = trans(itp)%k
     ivp    = trans(itp)%v
     icp    = trans(itp)%c
     do it=1,itp
       ik_bz = trans(it)%k
       iv    = trans(it)%v
       ic    = trans(it)%c
       ir = it + itp*(itp-1)/2
       if (ir>=istart(my_rank).and.ir<=istop(my_rank)) then 
         http = my_bsham(ir)
         if (ABS(http) > tol3) then
           write(msg,'(i5,2i3,3x,i5,2i3,2x,2f7.3)')ik_bz, iv, ic, ikp_bz, ivp, icp, http*Ha_eV
           call wrtout(std_out,msg,"PERS")
         end if
       end if
     end do
   end do
 end if

 ! * Free memory.
 deallocate(igfftg0)
 deallocate(ktabr_k,ktabr_kp)
 deallocate(rhxtwg_vpv,rhxtwg_cpc)
 deallocate(wfnr)
 deallocate(wfr_ckp)
 deallocate(wfr_vkp)
 deallocate(wfr_vk)
 deallocate(wfr_ck)
 !
 ! * Optional deallocation for PAW.
 if (usepaw==1) then
   call cprj_free(Cprj_k ); deallocate(Cprj_k )
   call cprj_free(Cprj_kp); deallocate(Cprj_kp)
   call cprj_free(Cp1); deallocate(Cp1)
   call cprj_free(Cp2); deallocate(Cp2)
 end if

 call xbarrier_mpi(spaceComm)

 ! =================================
 ! === Write Hamiltonian on disk ===
 ! =================================
 use_mpiio = (nprocs>1)
 use_mpiio=.FALSE.
 !use_mpiio=.FALSE.
#ifndef HAVE_MPI_IO 
 use_mpiio = .FALSE.
#endif

 if (use_mpiio) then
#ifdef HAVE_MPI_IO 
   ! Writing of the Hamiltonian via MPI-IO.
   ! * Individual MPI-IO is used as the data to be written in contiguous.
   !   and the advantage wrt collective IO should be marginal.

   if (is_resonant) then
     write(msg,'(2a,f6.2,a)')&
&      " Writing resonant excitonic Hamiltonian on file "//TRIM(fname)," via MPI-IO; file size= ",&
&      tot_hsize*2*spc*b2Gb," [Gb]."
   else
     write(msg,'(2a,f6.2,a)')&
&      " Writing coupling excitonic Hamiltonian on file "//TRIM(fname)," via MPI-IO; file size= ",&
&      tot_hsize*2*spc*b2Gb," [Gb]."
   end if
   call wrtout(std_out,msg,"COLL")
   call wrtout(ab_out,msg,"COLL")

   ! Open the file.
   !amode=MPI_MODE_CREATE + MPI_MODE_WRONLY + MPI_MODE_EXCL + MPI_MODE_APPEND
   !amode=MPI_MODE_CREATE + MPI_MODE_WRONLY + MPI_MODE_EXCL 
   !amode=MPI_MODE_CREATE + MPI_MODE_WRONLY 
   amode=MPI_MODE_CREATE + MPI_MODE_RDWR
   !amode= MPI_MODE_WRONLY 

   !my_spaceComm = MPI_COMM_WORLD
   !write(*,*)"rank, spaceComm, nprocs",my_rank,spaceComm,nprocs
   !write(*,*)"rank, my_spaceComm, nprocs",my_rank,my_spaceComm,nprocs
   !call xbarrier_mpi(spaceComm)
   !if(my_rank==master)then
   !  open(unit=888,file=fname,form='unformatted')
   !  rewind(888)
   !end if
   call xbarrier_mpi(spaceComm)
   
   !call MPI_FILE_OPEN(spaceComm, fname, amode, MPI_INFO_NULL, fh, ierr)
   call MPI_FILE_OPEN(MPI_COMM_WORLD, fname, MPI_MODE_CREATE + MPI_MODE_RDWR, MPI_INFO_NULL, fh, ierr)
   !call MPI_FILE_OPEN(MPI_COMM_SELF, fname, MPI_MODE_RDWR, MPI_INFO_NULL, fh, ierr)
   write(*,*)"from rank",my_rank," MPI_FILE_OPEN returned", ierr, MPI_SUCCESS
   !call xbarrier_mpi(spaceComm)
   msg = " MPI_IO error opening file "//TRIM(fname)//": "
   ABI_CHECK_MPI(ierr,msg)

   call xbarrier_mpi(spaceComm)

   call MPI_TYPE_SIZE(MPI_DOUBLE_COMPLEX, dpc_size, ierr)
   ABI_CHECK_MPI(ierr,"")

   if (my_rank==0) then 
     offset = 0
   else 
     offset = SUM(hsize_of(:my_rank-1)) * dpc_size
   end if

   call MPI_FILE_SET_VIEW(fh, offset, MPI_DOUBLE_COMPLEX, MPI_DOUBLE_COMPLEX, 'native', MPI_INFO_NULL, ierr)
   ABI_CHECK_MPI(ierr,"SET_VIEW")

   ! FIXME Quick hacking: file is always written in double precision. 
   ! One should define a file format to allow for calculations both in double and single precision.
   ! It seems that MPI-IO does not provide options to change the precision inside the MPI routine.
   allocate(buffer_dpc(hsize_of(my_rank)))
   buffer_dpc = my_bsham

   call MPI_FILE_WRITE(fh, buffer_dpc, hsize_of(my_rank), MPI_DOUBLE_COMPLEX, MPI_STATUS_IGNORE, ierr)
   ABI_CHECK_MPI(ierr,"FILE_WRITE")

   deallocate(buffer_dpc)
   
   ! Close the file.
   call MPI_FILE_CLOSE(fh, ierr)               
   ABI_CHECK_MPI(ierr,"FILE_CLOSE")

#else
   MSG_BUG("You should not be here!")
#endif
 else
    !
    ! Use FORTRAN IO with direct access mode.
    if (is_resonant) then
      write(msg,'(2a,f6.2,a)')&
&       " Writing resonant excitonic Hamiltonian on file "//TRIM(fname),"; file size= ",tot_hsize*2*spc*b2Gb," [Gb]."
    else
      write(msg,'(2a,f6.2,a)')&
&       " Writing coupling excitonic Hamiltonian on file "//TRIM(fname),"; file size= ",tot_hsize*2*spc*b2Gb," [Gb]."
    end if
    call wrtout(std_out,msg,"COLL")
    call wrtout(ab_out,msg,"COLL")

    ! * Each node sends its data to master node. 
    ! * Block are distributed according to the rank of the node.
    if (my_rank==master) then
      bsh_unt = get_unit()
      recl4dpc = get_reclen("dpc")
      open(unit=bsh_unt,file=fname,access='direct',recl=recl4dpc)
      do ir=1,my_hsize
        write(bsh_unt,rec=ir) CMPLX(my_bsham(ir),kind=dpc)
      end do                             
      deallocate(my_bsham)
    end if

    ! Collect data from other nodes.
    do sender=1,nprocs-1
      if (my_rank==master) allocate(buffer(hsize_of(sender)))
      call xexch_mpi(my_bsham,hsize_of(sender),sender,buffer,master,spaceComm,ierr)

      if (my_rank==master) then
        offset = SUM(hsize_of(:sender-1))
        do ir=1,hsize_of(sender)
          rec_idx= ir + offset
          write(bsh_unt,rec=rec_idx) CMPLX(buffer(ir),kind=dpc)
        end do                               
        deallocate(buffer)
      end if
      call xbarrier_mpi(spaceComm)
    end do

    if (my_rank==master) then ! Append the size of hamiltonian matrix to check TODO rethink the fileformat
      write(bsh_unt,rec=tot_hsize+1) nh
      write(bsh_unt,rec=tot_hsize+2) BSp%nkbz
      close(bsh_unt)
    end if

 end if ! use_mpiio

 if (allocated(my_bsham)) deallocate(my_bsham)
 deallocate(istart,istop,hsize_of,my_cols)

 call xbarrier_mpi(spaceComm) 

 DBG_EXIT("COLL")

end subroutine calc_exch
!!***
