!{\src2tex{textfont=tt}}
!!****f* ABINIT/update_mmat
!! NAME
!! update_mmat
!!
!! FUNCTION
!! This routine updates the M matrix for a Berrys phase magnetization calculation.
!!
!! COPYRIGHT
!!
!! INPUTS
!!  berryopt 5 is finite B field, -5 is just magnetization
!!  cg(2,mcg) wavefunction <G|C band,k> coefficients for ALL bands
!!  cgq(2,mcgq) wavefunctions at neighbouring k points
!!  dimffnl = 2nd dimension of ffnl (1 + number of derivatives)
!!  ecut=cut-off energy for plane wave basis sphere (Ha)
!!  ecutsm=smearing energy for plane wave kinetic energy (Ha)
!!  effmass=effective mass for electrons (1. in common case)
!!  ffnl(npw_k,dimffnl,lmnmax,ntypat) = nonlocal form factors
!!  filstat=name of the status file
!!  gs_hamk <type(gs_hamiltonian_type)>=all data for the Hamiltonian at k
!!  icg=shift to be applied on the location of data in the array cg
!!  ikpt=number of the k-point
!!  kg_k(3,npw_k) = integer coords of planewave in basis sphere
!!  kinpw(npw_k)=(modified) kinetic energy for each plane wave (Hartree)
!!  lmnmax = max number of (l,m,n) components over all atom types
!!  matblk = dimension of array ph3d
!!  mband =maximum number of bands
!!  mcg=second dimension of the cg array
!!  mcgq=second dimension of the cgq array
!!  mgfft = max size of 1D FFTs
!!  mkgq = second dimension of pwnsfacq
!!  mpi_enreg=information about MPI parallelization
!!  mpsang= 1+maximum angular momentum for nonlocal pseudopotentials
!!  mpssoang= 1+max(spin*angular momentum) for nonlocal pseudopotentials
!!  mpw=maximum dimensioned size of npw
!!  natom=number of atoms in cell.
!!  nkpt=number of k points.
!!  npwarr(nkpt)=number of planewaves in basis at this k point
!!  npw_k = number of plane waves for this k point
!!  nspinor=number of spinorial components of the wavefunctions
!!  ntypat=number of types of atoms in cell.
!!  nvloc=final dimension of vlocal (usually 1, but 4 for non-collinear
!!  n4,n5,n6 used for dimensionning of vlocal
!!  paral_kgb = flag defining parallelism in getghc
!!  pawtab(ntypat*usepaw) <type(pawtab_type)>=paw tabulated starting data
!!  ph3d(2,npw_k,matblk) = 3D structure factors for each atom and planewave
!!  prtvol = flag controlling verbosity of output
!!  pwind(pwind_alloc,2,3) = array used to compute
!!           the overlap matrix smat between k-points (see initberry.f)
!!  pwind_alloc = first dimension of pwind
!!  pwnsfac(2,pwind_alloc) = phase factors for non-symmorphic translations
!!                           (see initberry.f)
!!  pwnsfacq(2,mkgq) = phase factors for the nearest neighbours of the
!!                     current k-point
!!  vlocal(n4,n5,n6,nvloc)= local potential in real space, on the augmented fft grid
!!
!! OUTPUT
!!
!! SIDE EFFECTS
!!  dtefield <type(efield_type)> = variables related to Berry phase. The contribution to the bare
!!   magnetization at the given k point is stored in dtefield%mag_bare_k(idir,ikpt)
!!
!! NOTES
!!
!! PARENTS
!!
!! CHILDREN
!!      cprj_alloc,cprj_free,cprj_get,getghc,mkkin,smatrix,smatrix_k_paw
!!
!! SOURCE

#if defined HAVE_CONFIG_H
#include "config.h"
#endif

subroutine update_mmat(berryopt,cg,cgq,dimffnl,dtefield,ecut,ecutsm,effmass,ffnl,filstat,&
&                   gs_hamk,icg,ikpt,kg_k,&
&                   kinpw,lmnmax,matblk,&
&                   mband,mcg,mcgq,mgfft,mkgq,&
&                   mpi_enreg,mpsang,mpssoang,mpw,&
&                   natom,nkpt,npw_k,npwarr,nspinor,ntypat,&
&                   nvloc,n4,n5,n6,pawtab,pwind,pwind_alloc,&
&                   paral_kgb,ph3d,prtvol,pwnsfac,pwnsfacq,vlocal)

 use defs_basis
 use defs_datatypes
 use defs_abitypes
 use m_wfs

#if defined HAVE_MPI && defined HAVE_MPI2
 use mpi
#endif

!This section has been created automatically by the script Abilint (TD).
!Do not modify the following lines by hand.
 use interfaces_53_abiutil
 use interfaces_56_recipspace
 use interfaces_66_paw
 use interfaces_66_wfs
 use interfaces_67_common, except_this_one => update_mmat
!End of the abilint section

 implicit none

#if defined HAVE_MPI && defined HAVE_MPI1
 include 'mpif.h'
#endif

!Arguments ------------------------------------
!scalars
 integer,intent(in) :: berryopt,icg,ikpt,dimffnl,lmnmax,matblk,mband,mcg,mcgq,mgfft
 integer,intent(in) :: mkgq,mpsang,mpssoang,mpw,natom
 integer,intent(in) :: nkpt,npw_k,nspinor,ntypat,nvloc,n4,n5,n6
 integer,intent(in) :: paral_kgb,prtvol,pwind_alloc
 real(dp),intent(in) :: ecut,ecutsm,effmass
 character(len=fnlen),intent(in) :: filstat
 type(efield_type),intent(inout) :: dtefield
 type(gs_hamiltonian_type), intent(in) :: gs_hamk
 type(MPI_type),intent(inout) :: mpi_enreg

!arrays
 integer, intent(in) :: kg_k(3,npw_k),npwarr(nkpt),pwind(pwind_alloc,2,3)
 real(dp),intent(in) :: cg(2,mcg),cgq(2,mcgq),kinpw(npw_k)
 real(dp),intent(in) :: ffnl(npw_k,dimffnl,lmnmax,ntypat)
 real(dp),intent(inout) :: ph3d(2,npw_k,matblk)
 real(dp),intent(in) :: pwnsfac(2,pwind_alloc),pwnsfacq(2,mkgq)
 real(dp), intent(inout) :: vlocal(n4,n5,n6,nvloc) ! this variable is inout in getghc
 type(pawtab_type),intent(in)  :: pawtab(ntypat)

!Local variables-------------------------------
!scalars
 integer :: bsig,cpopt,ddkflag,iat,iatom,iband,icg1,icp,icp2,idir,idum1
 integer :: ifor,ihcg,ikgf,ikptf
 integer :: bband,bdir,bfor,kdir,kfor
 integer :: ikpt2,ikpt2f,ilmn,istep,itrs,itypat,jband,job,kband
 integer :: ksig,mcg1_k,mcg_q,nband_k
 integer :: ndat,npw_k2,shiftbd,sij_opt,tim_getghc,type_calc
 real(dp) :: dkinv,lambda
 real(dp) :: sfac
 complex(dpc) :: c1,c2,c3
 
!arrays
 integer,allocatable :: dimlmn(:),pwind_k(:),sflag_k(:)
 real(dp) :: dk(3),dtm_k(2),dotri(2),kpt_ksig(3)
 real(dp),allocatable :: bwave(:,:),kinpw_ksig(:),kwave(:,:)
 real(dp),allocatable :: cg1_k(:,:),cgq_k(:,:)
 real(dp),allocatable :: emat(:,:,:),ghc(:,:),gvnlc(:,:),gsc(:,:)
 real(dp),allocatable :: tcg(:,:,:,:),pwnsfac_k(:,:)
! real(dp),allocatable :: dcg(:,:,:),dsmat_k_paw(:,:,:,:) ! used for debugging and testing below
 real(dp),allocatable :: smat_inv(:,:,:),smat_inv_all(:,:,:,:,:)
 real(dp),allocatable :: smat_k(:,:,:),smat_k_paw(:,:,:)
 type(cprj_type),allocatable :: cprj_k(:,:),cprj_kb(:,:),tcprj(:,:,:,:)
 type(cprj_type),allocatable :: kcprj(:,:)

! ************************************************************************

 dtefield%mmat(:,:,:,ikpt,:,:,:) = zero
 nband_k = dtefield%nband_occ

 if (berryopt == 5) then
   ihcg = dtefield%cgindex(ikpt,1)
   dtefield%hcg(:,ihcg+1:ihcg+nband_k*npw_k,:) = zero
 end if

!=============================================================================
!compute finite-difference DDK wavefunctions at present k point for all bands
!
!the following code is largely copied from cgwf, where the same thing is done
!=============================================================================

 mcg1_k = mpw*nband_k

 allocate(tcg(2,mcg1_k,3,2),cg1_k(2,mcg1_k))
 tcg(:,:,:,:) = zero
 
 mcg_q = mpw*mband*nspinor
 allocate(cgq_k(2,mcg_q))
 allocate(sflag_k(nband_k),pwind_k(mpw),pwnsfac_k(4,mpw))

 allocate(cprj_k(natom,nband_k))
 allocate(cprj_kb(natom,nband_k))
 allocate(dimlmn(natom))
 iatom = 0
 do itypat = 1, ntypat
   do iat = 1, gs_hamk%nattyp(itypat)
     iatom = iatom + 1
     dimlmn(iatom)=pawtab(itypat)%lmn_size
   end do
 end do
 call cprj_alloc(cprj_k,0,dimlmn)
 call cprj_alloc(cprj_kb,0,dimlmn)

 allocate(tcprj(natom,nband_k,3,2))

 do idir = 1, 3
   do ifor = 1, 2
     call cprj_alloc(tcprj(:,:,idir,ifor),0,dimlmn)
   end do
 end do

 ikptf = dtefield%i2fbz(ikpt)
 ikgf = dtefield%fkgindex(ikptf)  ! this is the shift for pwind

 icp=nband_k*(ikptf-1)
 call cprj_get(gs_hamk%atindx1,cprj_k,dtefield%cprj,natom,1,icp,ikpt,0,1,&
& nband_k,dtefield%fnkpt,mpi_enreg,natom,&
& nband_k,nband_k,1,1,0)

 allocate(smat_k(2,nband_k,nband_k))
 allocate(smat_inv(2,nband_k,nband_k))
 allocate(smat_inv_all(2,nband_k,nband_k,3,2))
 allocate(smat_k_paw(2,nband_k,nband_k))

 job = 1 ; shiftbd = 1; ddkflag = 1

 do idir = 1, 3

   dk(:) = dtefield%dkvecs(:,idir)
   dkinv = one/(two*dk(idir))

   do ifor = 1, 2

     if (ifor == 2) dkinv = -dkinv

     ikpt2f = dtefield%ikpt_dk(ikptf,ifor,idir)
     if (dtefield%indkk_f2ibz(ikpt2f,6) == 1) then
       itrs = 10
     else
       itrs = 0
     end if
     ikpt2 = dtefield%indkk_f2ibz(ikpt2f,1)
     npw_k2 = npwarr(ikpt2)
     pwind_k(1:npw_k) = pwind(ikgf+1:ikgf+npw_k,ifor,idir)
     pwnsfac_k(1:2,1:npw_k) = pwnsfac(1:2,ikgf+1:ikgf+npw_k)
     sflag_k(:) = dtefield%sflag(:,ikpt,ifor,idir)
     smat_k(:,:,:) = dtefield%smat(:,:,:,ikpt,ifor,idir)

     if (mpi_enreg%paral_compil_kpt == 1) then
       icg1 = dtefield%cgqindex(2,ifor+2*(idir-1),ikpt) ! nsppol implicitly = 1 
       cgq_k(:,1:nband_k*nspinor*npw_k2) = &
&       cgq(:,icg1+1:icg1+nband_k*nspinor*npw_k2)
       idum1 = dtefield%cgqindex(3,ifor+2*(idir-1),ikpt) ! nsppol implicitly = 1
       pwnsfac_k(3:4,1:npw_k2) = pwnsfacq(1:2,idum1+1:idum1+npw_k2)
     else
       icg1 = dtefield%cgindex(ikpt2,1) ! nsppol implicitly = 1
       cgq_k(:,1:nband_k*nspinor*npw_k2) = &
&       cg(:,icg1+1:icg1+nband_k*nspinor*npw_k2)
       idum1 = dtefield%fkgindex(ikpt2f)
       pwnsfac_k(3:4,1:npw_k2) = pwnsfac(1:2,idum1+1:idum1+npw_k2)
     end if

     icp2=nband_k*(ikpt2-1)
     call cprj_get(gs_hamk%atindx1,cprj_kb,dtefield%cprj,natom,1,icp2,ikpt,0,1,&
&     nband_k,dtefield%fnkpt,mpi_enreg,natom,&
&     nband_k,nband_k,1,1,0)
     call smatrix_k_paw(cprj_k,cprj_kb,dtefield,idir,ifor,natom,smat_k_paw,gs_hamk%typat)

     icg1 = 0
     call smatrix(cg,cgq_k,cg1_k,ddkflag,dtm_k,icg,icg1,itrs,&
&     job,nband_k,mcg,mcg_q,mcg1_k,1,&
&     mpw,nband_k,&
&     npw_k,npw_k2,nspinor,pwind_k,pwnsfac_k,sflag_k,&
&     shiftbd,smat_inv,smat_k,smat_k_paw,gs_hamk%usepaw)
     smat_inv_all(:,:,:,idir,ifor) = smat_inv(:,:,:)

!    now cg1_k contains |\tilde{u}_kb>, and these have also been shifted by pwind so 
!    that their expansion on G is in registry with the G vectors of |u_k>

     tcg(:,:,idir,ifor) = cg1_k(:,:)

!    must assemble \tilde{cprj} following the same algorithm as |\tilde{u}>

     do jband = 1, nband_k
       do iband = 1, nband_k
         iatom = 0
         do itypat = 1, ntypat
           do iat = 1, gs_hamk%nattyp(itypat)
             iatom = iatom+1
             do ilmn = 1, dimlmn(iatom)
               tcprj(iatom,jband,idir,ifor)%cp(1,ilmn) = tcprj(iatom,jband,idir,ifor)%cp(1,ilmn) + &
&               cprj_kb(iatom,iband)%cp(1,ilmn)*smat_inv(1,iband,jband)-&
&               cprj_kb(iatom,iband)%cp(2,ilmn)*smat_inv(2,iband,jband)
               tcprj(iatom,jband,idir,ifor)%cp(2,ilmn) = tcprj(iatom,jband,idir,ifor)%cp(2,ilmn) + &
&               cprj_kb(iatom,iband)%cp(2,ilmn)*smat_inv(1,iband,jband)+&
&               cprj_kb(iatom,iband)%cp(1,ilmn)*smat_inv(2,iband,jband)
             end do
           end do
         end do
       end do
     end do

!    debugging to check overlap of cg1_k with cg 
!    
!    call smatrix_k_paw(cprj_k,tcprj(:,:,idir,ifor),dtefield,idir,ifor,natom,smat_k_paw)
!    do iband = 1, nband_k
!    do jband = 1, nband_k
!    dotr = zero; doti = zero;
!    do ipw = 1, npw_k
!    dotr = dotr + cg(1,icg+(iband-1)*npw_k+ipw)*tcg(1,(jband-1)*npw_k+ipw,idir,ifor)+&
!    &           cg(2,icg+(iband-1)*npw_k+ipw)*tcg(2,(jband-1)*npw_k+ipw,idir,ifor)
!    doti = doti - cg(2,icg+(iband-1)*npw_k+ipw)*tcg(1,(jband-1)*npw_k+ipw,idir,ifor)+&
!    &           cg(1,icg+(iband-1)*npw_k+ipw)*tcg(2,(jband-1)*npw_k+ipw,idir,ifor)
!    end do
!    write(6,'(a,5i4,2f16.8)')'JWZ debug ',&
!    &              ikpt,idir,ifor,iband,jband,dotr+smat_k_paw(1,iband,jband),doti+smat_k_paw(2,iband,jband)
!    end do
!    end do
!    
!    end debugging to check overlap of cg1_k with cg 
!    above debugging section checks that <u_nk|\tilde{u}_mk+b> = \delta_{nm}

   end do ! end loop over ifor

 end do ! end loop over idir
!Now tcg(2,mpw,idir,ifor) contains the tilde wavefunctions at this k point for all bands
!for +/- b
!and tcprj(natom,nband_k,idir,ifor) contains the corresponding cprjs at this k point for all
!bands and +/- b

!compute dcg(2,mpw,idir)
!allocate(dcg(2,mcg1_k,3))
!dcg(:,:,:) = zero
!
!allocate(dsmat_k_paw(2,nband_k,nband_k,3))
!dsmat_k_paw(:,:,:,:) = zero
!
!do idir = 1, 3
!sfac = 0.5
!do ifor = 1, 2
!dcg(:,:,idir) = dcg(:,:,idir) + sfac*tcg(:,:,idir,ifor)
!call smatrix_k_paw(cprj_k,tcprj(:,:,idir,ifor),dtefield,idir,ifor,natom,smat_k_paw)
!dsmat_k_paw(:,:,:,idir) = dsmat_k_paw(:,:,:,idir) + sfac*smat_k_paw(:,:,:)
!sfac = -sfac
!end do
!end do
!
!test overlap of cg with dcg
!allocate(bwave(2,npw_k),kwave(2,npw_k))
!do idir = 1, 3
!do bband = 1, nband_k
!bwave(:,:) = cg(:,icg+(bband-1)*npw_k+1:icg+bband*npw_k)
!do kband = 1, nband_k
!kwave(:,:) = dcg(:,(kband-1)*npw_k+1:kband*npw_k,idir)
!dotr = dot_product(bwave(1,:),kwave(1,:))+dot_product(bwave(2,:),kwave(2,:))
!dotr = dotr + dsmat_k_paw(1,bband,kband,idir)
!doti = -dot_product(bwave(2,:),kwave(1,:))+dot_product(bwave(1,:),kwave(2,:))
!doti = doti + dsmat_k_paw(2,bband,kband,idir)
!write(6,'(a,4i4,2f16.8)')'JWZ debug ',&
!&            ikpt,idir,bband,kband,dotr,doti
!end do       
!end do
!end do
!deallocate(bwave,kwave)

!test overlap of cg with dcg based on smatrix only

!do idir = 1, 3
!do bband = 1, nband_k
!do kband = 1, nband_k
!dotr = zero; doti = zero
!do ksig = -1, 1, 2
!kfor = (3-ksig)/2
!sfac = 0.5*ksig
!do jband = 1, nband_k
!dotr = dotr + sfac*smat_inv_all(1,jband,kband,idir,kfor)*dtefield%smat(1,bband,jband,ikpt,idir,kfor) - &
!&            sfac*smat_inv_all(2,jband,kband,idir,kfor)*dtefield%smat(2,bband,jband,ikpt,idir,kfor)
!doti = doti + sfac*smat_inv_all(1,jband,kband,idir,kfor)*dtefield%smat(2,bband,jband,ikpt,idir,kfor) + &
!&            sfac*smat_inv_all(2,jband,kband,idir,kfor)*dtefield%smat(1,bband,jband,ikpt,idir,kfor)
!end do
!end do
!if ( abs(dotr) > tol12 .or. abs(doti) > tol12 ) then
!write(6,'(a,4i4,2f16.8)')'JWZ debug ',&
!&              ikpt,idir,bband,kband,dotr,doti
!end if
!end do
!end do
!end do

!compute emat(2,bband,kband) = <u_bband|H_k|u_kband>

 sij_opt = 0 ! compute <G|H|u> only, not gsc
 cpopt = -1 ! do not return cprj in getghc calls
 lambda = 0.0 ! no shift to H used here
 ndat = 1 ! number of FFTs to do in parallel
 tim_getghc = 1 ! as in cgwf.F90
 type_calc = 0 ! use entire Hamiltonian

 allocate(bwave(2,npw_k),kwave(2,npw_k))
 allocate(emat(2,nband_k,nband_k))
 allocate(ghc(2,npw_k),gvnlc(2,npw_k))
 allocate(gsc(2,npw_k*ndat*(sij_opt+1)/2))
 allocate(kcprj(natom,nspinor*(1+cpopt)))
 call cprj_alloc(kcprj,0,dimlmn)

 do kband = 1, nband_k

   kwave(:,:) = cg(:,icg+(kband-1)*npw_k+1:icg+kband*npw_k)
   call  getghc(cpopt,kwave,kcprj,dimffnl,ffnl,filstat,ghc,gsc,&
&   gs_hamk,gvnlc,kg_k,kinpw,lambda,lmnmax,&
&   matblk,mgfft,mpi_enreg,mpsang,mpssoang,&
&   natom,ndat,npw_k,nspinor,ntypat,nvloc,n4,n5,n6,&
&   paral_kgb,ph3d,prtvol,sij_opt,tim_getghc,type_calc,vlocal)

   do bband = 1, nband_k 
     
     bwave(:,:) = cg(:,icg+(bband-1)*npw_k+1:icg+bband*npw_k) 

     emat(1,bband,kband) = dot_product(bwave(1,:),ghc(1,:)) + dot_product(bwave(2,:),ghc(2,:)) 
     emat(2,bband,kband) = dot_product(bwave(1,:),ghc(2,:)) - dot_product(bwave(2,:),ghc(1,:)) 

   end do
 end do

!compute cmat(2,bband,kband,ikpt,idir)
!for example, cmat(2,bband,kband,ikpt, z ) = <du_bband/dk_x|du_kband/dk_y> - (x<->y)

 dtefield%cmat(:,:,:,ikpt,:) = zero

 do idir = 1, 3
   select case(idir)
     case(1)
       bdir = 2; kdir = 3
     case(2)
       bdir = 3; kdir = 1
     case(3)
       bdir = 1; kdir = 2
   end select
   sfac = 0.25
   do istep = 1, 2
     do bsig = -1, 1, 2
       do ksig= -1, 1, 2
         bfor = (-bsig+3)/2; kfor = (-ksig+3)/2

         call smatrix_k_paw(tcprj(:,:,bdir,bfor),tcprj(:,:,kdir,kfor),dtefield,kdir,kfor,natom,&
&         smat_k_paw,gs_hamk%typat,bdir,bfor)

         do bband = 1, nband_k
           bwave(:,:) = tcg(:,(bband-1)*npw_k+1:bband*npw_k,bdir,bfor)

           do kband = 1, nband_k
             kwave(:,:) = tcg(:,(kband-1)*npw_k+1:kband*npw_k,kdir,kfor)

             dotri(1) = dot_product(bwave(1,:),kwave(1,:))+dot_product(bwave(2,:),kwave(2,:))
             dotri(2) = -dot_product(bwave(2,:),kwave(1,:))+dot_product(bwave(1,:),kwave(2,:))

             dtefield%cmat(1,bband,kband,ikpt,idir) = dtefield%cmat(1,bband,kband,ikpt,idir) + &
&             bsig*ksig*(dotri(1)+smat_k_paw(1,bband,kband))
             dtefield%cmat(2,bband,kband,ikpt,idir) = dtefield%cmat(2,bband,kband,ikpt,idir) + &
&             bsig*ksig*(dotri(2)+smat_k_paw(2,bband,kband))
           end do ! end loop over kband
         end do ! end loop over bband
       end do ! end loop over ksig
     end do ! end loop over bsig
     
     sfac = -sfac
     idum1 = bdir; bdir = kdir; kdir = idum1
   end do ! loop over istep
 end do ! loop over idir

 dtefield%mmat(:,:,:,ikpt,:,:,:) = zero

 sij_opt = 0 ! compute <G|H|u> only, not gsc
 cpopt = -1 ! do not return cprj in getghc calls
 lambda = 0.0 ! no shift to H used here
 ndat = 1 ! number of FFTs to do in parallel
 tim_getghc = 1 ! as in cgwf.F90

 allocate(kinpw_ksig(npw_k))
 
 sfac = 0.5
 do bdir = 1, 3
   do kdir = 1, 3
     if (bdir /= kdir) then

       do bsig = -1, 1, 2
         bfor = (-bsig+3)/2

         do ksig= -1, 1, 2
           kfor = (-ksig+3)/2

           kpt_ksig(:) = gs_hamk%kpoint(:)+ksig*dtefield%dkvecs(:,kdir)
           call mkkin(ecut,ecutsm,effmass,gs_hamk%gmet,kg_k,kinpw_ksig,kpt_ksig,npw_k)

           call smatrix_k_paw(tcprj(:,:,bdir,bfor),tcprj(:,:,kdir,kfor),dtefield,&
&           kdir,kfor,natom,smat_k_paw,gs_hamk%typat,bdir,bfor)

           do bband = 1, nband_k
             bwave(:,:) = tcg(:,(bband-1)*npw_k+1:bband*npw_k,bdir,bfor)

             do kband = 1, nband_k
               do iband = 1, nband_k
                 kwave(:,:) = tcg(:,(iband-1)*npw_k+1:iband*npw_k,kdir,kfor)

                 dotri(1) = dot_product(bwave(1,:),kwave(1,:))+dot_product(bwave(2,:),kwave(2,:))
                 dotri(2) = -dot_product(bwave(2,:),kwave(1,:))+dot_product(bwave(1,:),kwave(2,:))

                 c1 = cmplx(emat(1,iband,kband),emat(2,iband,kband))
                 c2 = cmplx(dotri(1)+smat_k_paw(1,bband,iband),dotri(2)+smat_k_paw(2,bband,iband))
                 c3 = sfac*ksig*c1*c2

                 dtefield%mmat(1,bband,kband,ikpt,bdir,bfor,kdir) = &
&                 dtefield%mmat(1,bband,kband,ikpt,bdir,bfor,kdir) + &
&                 real(c3)
                 dtefield%mmat(2,bband,kband,ikpt,bdir,bfor,kdir) = &
&                 dtefield%mmat(2,bband,kband,ikpt,bdir,bfor,kdir) + &
&                 aimag(c3)

                 if (iband == kband) then
!                  TODO: add nonlocal part
                   type_calc = 1 ! use local part only
!                  note use of kinpw_ksig to refer to k+ksig*kdir, not k
!                  note that tilde cg wavefunctions tcg have been adjusted by smatrix.F90 such that they are
!                  expanded on the same set of G vectors as |u_nk>, therefore use of kg_k is still correct
                   call  getghc(cpopt,kwave,kcprj,dimffnl,ffnl,filstat,ghc,gsc,&
&                   gs_hamk,gvnlc,kg_k,kinpw_ksig,lambda,lmnmax,&
&                   matblk,mgfft,mpi_enreg,mpsang,mpssoang,&
&                   natom,ndat,npw_k,nspinor,ntypat,nvloc,n4,n5,n6,&
&                   paral_kgb,ph3d,prtvol,sij_opt,tim_getghc,type_calc,vlocal)

                   dotri(1) = dot_product(bwave(1,:),ghc(1,:))+dot_product(bwave(2,:),ghc(2,:))
                   dotri(2) = -dot_product(bwave(2,:),ghc(1,:))+dot_product(bwave(1,:),ghc(2,:))
                   c3 = sfac*ksig*cmplx(dotri(1),dotri(2))
                   dtefield%mmat(1,bband,kband,ikpt,bdir,bfor,kdir) = &
&                   dtefield%mmat(1,bband,kband,ikpt,bdir,bfor,kdir) + &
&                   real(c3)
                   dtefield%mmat(2,bband,kband,ikpt,bdir,bfor,kdir) = &
&                   dtefield%mmat(2,bband,kband,ikpt,bdir,bfor,kdir) + &
&                   aimag(c3)

                 end if

               end do ! end loop over iband

             end do ! end loop over kband
           end do ! end loop over bband

         end do ! end loop over ksig
       end do ! end loop over bsig

     end if ! end check that bdir /= kdir 
   end do ! end loop over kdir 
 end do ! loop over bdir

 deallocate(kinpw_ksig)
 
 deallocate(bwave,kwave)
 deallocate(ghc,gvnlc,gsc)
 deallocate(emat)
 call cprj_free(kcprj)
 deallocate(kcprj)

 deallocate(tcg,cg1_k)
 deallocate(cgq_k,sflag_k,pwind_k,pwnsfac_k)
 call cprj_free(cprj_k)
 deallocate(cprj_k)
 call cprj_free(cprj_kb)
 deallocate(cprj_kb)
 do idir =1, 3
   do ifor = 1, 2
     call cprj_free(tcprj(:,:,idir,ifor))
   end do
 end do
 deallocate(tcprj)


 deallocate(smat_k,smat_inv,smat_inv_all,smat_k_paw)

end subroutine update_mmat
!!***
