!{\src2tex{textfont=tt}}
!!****f* ABINIT/exc_iterative_diago
!!
!! NAME
!!  exc_iterative_diago
!!
!! FUNCTION
!!  Calculates eigenvalues and eigenvectors of the Hermitian excitonic Hamiltonian (coupling is neglected).
!!
!! COPYRIGHT
!! Copyright (C) 2009-2010 ABINIT group (MG)
!! This file is distributed under the terms of the
!! GNU General Public License, see ~abinit/COPYING
!! or http://www.gnu.org/copyleft/gpl.txt .
!! For the initials of contributors, see ~abinit/doc/developers/contributors.txt .
!!
!! INPUTS
!!  ntrans=Rank of the resonant block of the Hamiltoninan.
!!  BS_files<excfiles>=Datatype storing names and files used in the Bethe-Salpeter code.
!!    %exh=Name of the file storing the excitonin resonant part.
!!    %out_eig_out=Name of the file where final results are store.
!!    %in_eig=Name of the file used to initialize the calculation.
!!  spaceComm=MPI communicator.
!!  nbands=Number of eigenstates required.
!!  nline=Max number of line minimizations.
!!  tolwfr=Tolerance on the residuals.
!!  nbdbuf
!!  nstep
!!
!! OUTPUT
!!  Eigenvalues and eigenvectors are written on file %out_eig
!!
!! NOTES
!!  Concernig the approach followed to parallelize this routine: the most important
!!  bottleneck is represent by the storage of the excitonic Hamiltonian since a 
!!  large number of k-points is needed to obtain converged exciton energies.
!!  The number of eigenstates is usually much smaller than the rank of the full matrix,
!!  this is especially true if we are only interested in the binding energy of the 
!!  exciton or in the excitonin states close to the single-particle gap. 
!!  Therefore good scaling and good performace should be obtained by distributing 
!!  the row of the excitonic Hamiltonina among the nodes while the required 
!!  eigenvectors are duplicated on each node. The memory needed to stores the eigenvalues
!!  scales like ntrans*nbands where ntrans is the rank of the Hamiltonian and this might 
!!  render the calculation unfeasible when nbands is large. 
!!  On the other hand, having the complex set of trial eigenvectors on each node permits to parallelize
!!  tasks such as the application of the Hamiltonian as well as the orthogonalization or the sub-space rotation 
!!  the later two algorithms represent the most CPU demanding part in standard KS calculations
!!  as they scale with the third power of the number of atoms.
!!  The conjugate direction and the gradient as well as Hphi are not distributed as the line minimization 
!!  requires the evalauation of <cg_dir_|H_exc|cg_dir>. 
!!  Note that this routine has been written having in mind an homogeneous network of machines.
!!  A network made of different CPU will lead to unpredictable results as each node has 
!!  to check for the converge of the calculation.
!!
!! PARENTS
!!
!! CHILDREN
!!
!! SOURCE

#if defined HAVE_CONFIG_H
#include "config.h"
#endif

#include "abi_common.h"

#define DEV_MG_DEBUG_PARA 0

subroutine exc_iterative_diago(ntrans,nbands,nstep,nline,nbdbuf,tolwfr,BS_files,prtvol,spaceComm)

 use defs_basis
 use defs_abitypes
 use m_bs_defs
 use m_xmpi
 use m_errors

 use m_io_tools,   only : get_unit
 use m_abilasi,    only : xheev, xhpev

!This section has been created automatically by the script Abilint (TD).
!Do not modify the following lines by hand.
 use interfaces_14_hidewrite
 use interfaces_51_manage_mpi
 use interfaces_69_bse, except_this_one => exc_iterative_diago
!End of the abilint section

 implicit none

!Arguments ------------------------------------
!scalars
 integer,intent(in) :: ntrans,spaceComm,nbands,nline,prtvol,nbdbuf,nstep
 real(dp),intent(in) :: tolwfr
 type(excfiles),intent(in) ::  BS_files

!Local variables ------------------------------
!scalars
 integer,parameter :: STRICT=2,MEDIUM=1,WORST=0 
 integer :: cg_nsteps,my_t1,my_t2,my_nt
 integer :: ii,jj,it,itp,ir,hreso_unt,eig_unt,iband,iline,nh_check
 integer :: recl4dpc,ios,max_nline
 integer :: istat,itemp,seed,cg_step,nbdbuf_
 integer ::  fold1,fold2,foldim,foldre
 integer :: ierr,nprocs,my_rank,master
 real(dp) :: excgap,maxexc,norm,lambda,etrial,etrial_old,deltae
! real(dp) :: deold
 real(dp) :: dhd,dhc,den
 real(dp) :: fac,poly,xx
 real(dp) :: root,swap,tan2th,diff
 real(dp) :: tolwfr_
 !complex(dpc) :: cg_gamma,dotgg,old_dotgg
 real(dp) :: cg_gamma,dotgg,old_dotgg
 real(dp) :: max_resid,costh,sinth
 complex(dpc) :: zz,kprc,ctemp_dpc
 logical :: phi_on_file
 logical,parameter :: use_precond=.TRUE.
 character(len=500) :: msg
!arrays
 integer :: nline_for(nbands),convergence_of(nbands)
 real(dp) :: resid(nbands),energy(nbands),rbuf2(2)
! real(dp),allocatable :: gsc(:,:)
! real(dp),allocatable :: cg(:,:)
 !complex,allocatable :: hexc(:,:)
 complex(dpc),allocatable :: hexc(:,:),hji(:),vec_tmp(:)
 complex(dpc),pointer :: my_phi(:)
 real(dp),allocatable :: hexc_diagonal(:)
 complex(dpc),target,allocatable :: phi_block(:,:)
 complex(dpc),allocatable :: hphi(:),buffer_dpc(:)
 complex(dpc),allocatable :: cg_dir(:),grad_dir(:),prc_dir(:)
 complex(dpc),allocatable :: old_cg_dir(:)
 type(MPI_type) :: MPI_enreg_seq

!************************************************************************

 DBG_ENTER("COLL")

 nprocs  = xcomm_size(spaceComm)
 my_rank = xcomm_rank(spaceComm)
 master=0

!* Fake MPI_type for the sequential part.
 call initmpi_seq(MPI_enreg_seq)

 ABI_CHECK(ntrans>=nprocs,"ntrans<nprocs!")

 ! Divide the columns of the Hamiltoninan among the nodes.
 call split_work(ntrans,spaceComm,my_t1,my_t2)
 my_nt = my_t2-my_t1+1
 write(msg,'(2(a,i0),a)')" rank ",my_rank," handles ",my_nt," columns of the excitonic Hamiltoninan. "
 call wrtout(std_out,msg,"PERS")

 tolwfr_ = tolwfr
 if (tolwfr < 10**(-30)) then
  tolwfr_ = tol12
  write(msg,'(2(a,es12.4))')" Input tolwfr= ",tolwfr," Using tolwfr= ",tolwfr_
  MSG_WARNING(msg)
 end if

 cg_nsteps = nstep
 if (cg_nsteps<=0) then 
  cg_nsteps = 30
  write(msg,'(2(a,es12.4))')" Input nstep= ",nstep," Using cg_nsteps= ",cg_nsteps
  MSG_WARNING(msg)
 end if

 nbdbuf_ = nbdbuf
 if (nbdbuf<=0) then 
  nbdbuf_ = 4
  write(msg,'(2(a,es12.4))')" Input nbdbuf= ",nbdbuf," Using nbdbuf= ",nbdbuf_
  MSG_WARNING(msg)
 end if

 write(*,*)"cg_nsteps",cg_nsteps
 write(*,*)"nbands",nbands
 write(*,*)"nline",nline
 write(*,*)"nbdbuf_",nbdbuf_
 write(*,*)"tolwfr_",tolwfr_

 write(msg,'(3a,f8.1,a)')&
& ' Allocating excitonic hamiltonian. ',ch10,&
& ' Memory requested: ',my_nt*ntrans*spc*b2Mb,' Mb.'
 call wrtout(std_out,msg,"COLL")

 hreso_unt = get_unit()
 recl4dpc = get_reclen("dpc")
 open(unit=hreso_unt,file=BS_files%exh,status='old',access='direct',recl=recl4dpc,iostat=ios)

 msg=' Opening file '//TRIM(BS_files%exh)//' as old with direct access.'
 ABI_CHECK(ios==0,msg)

 read(hreso_unt,rec=((ntrans*ntrans + ntrans)/2)+1) itemp
 if (itemp/=ntrans) then
  write(msg,'(a,i0,a,i0,a)')'nt should be ',ntrans,' but read ',itemp,' wrong file fort.55 *.exh'
  MSG_ERROR(msg)
 end if

 ! Construct full excitonic Hamiltonian using Hermiticity.
 !allocate(hexc(my_nt,ntrans), STAT=istat)
 allocate(hexc(ntrans,my_t1:my_t2), STAT=istat)
 ABI_CHECK(istat==0,'out of memory: excitonic hamiltonian')

 hexc = -999
 do itp=1,ntrans
  do it=1,itp
   if ( (it >=my_t1 .and. it <=my_t2) .or. &
&       (itp>=my_t1 .and. itp<=my_t2) ) then ! I have either (i,j) or (j,j)
    ir = it + itp*(itp-1)/2
    read(hreso_unt,rec=ir) ctemp_dpc
    if (itp==it) ctemp_dpc = REAL(ctemp_dpc,kind=dp)
    if (itp>=my_t1 .and. itp<=my_t2) hexc(it,itp) = ctemp_dpc !FIXME this is wrong is QP lifetimes are included
    if (it >=my_t1 .and. it <=my_t2) hexc(itp,it) = CONJG(ctemp_dpc)
   end if
  end do
  !hexc(itp,itp) = REAL(hexc(itp,itp)) 
 end do

 ABI_CHECK(ALL(hexc/=-999),"-999")

 ! Save also the diagonal part for preconditioning
 if (use_precond) then
  allocate(hexc_diagonal(my_t1:my_t2))
  do it=my_t1,my_t2
   ir = it + it*(it-1)/2
   read(hreso_unt,rec=ir) ctemp_dpc
   hexc_diagonal(it) = REAL(ctemp_dpc,kind=dp)
  end do
 end if

 close(hreso_unt)

 !
 ! === Initialisation of the excitonic wavefunctions ===
 ! Two cases are possible.
 ! 1) Fill trial eigenvectors with random numbers
 !    One needs to initialize wfs in such a way to avoid symmetry traps,
 !    and to avoid linear dependencies between wavefunctions
 ! 2) Read eigenstates generated by a previous calculation.

 allocate(phi_block(my_t1:my_t2,nbands), STAT=istat)
 ABI_CHECK(istat==0," out-of-memory phi_block")

 phi_on_file= BS_files%in_eig_exists

 SELECT CASE (phi_on_file)

 CASE (.TRUE.) 
  msg=' Initializing eigenvectors from file: '//TRIM(BS_files%in_eig)
  call wrtout(std_out,msg,"COLL")
                                                    
  eig_unt=get_unit()
  open(eig_unt,file=BS_files%in_eig,form='unformatted')
                                                    
  read(eig_unt) nh_check
  ABI_CHECK(nh_check==ntrans,"nh_check /= ntrans")

  read(eig_unt) !skip CMPLX(exevl(1:nh))
  allocate(buffer_dpc(ntrans))
  do ii=1,nbands
   read(eig_unt) buffer_dpc
   phi_block(my_t1:my_t2,ii) = buffer_dpc(my_t1:my_t2) 
  end do
  deallocate(buffer_dpc)

  close(eig_unt)

 CASE (.FALSE.)
  ! Use random number generator. For portability, use only integer numbers
  ! The series of couples (fold1,fold2) is periodic with a period of
  ! 3x5x7x11x13x17x19x23x29x31, that is, larger than 2**32, the largest integer*4
  ! fold1 is between 0 and 34, fold2 is between 0 and 114. As sums of five
  ! uniform random variables, their distribution is close to a gaussian
  ! the gaussian distributions are folded, in order to be back to a uniform distribution
  ! foldre is between 0 and 20, foldim is between 0 and 18.

  do iband=1,nbands
   do ii=my_t1,my_t2
    seed=ii+(iband-1)*ntrans ! Different seed for different transitions and bands
    fold1=mod(seed,3)+mod(seed,5)+mod(seed,7)+mod(seed,11)+mod(seed,13)
    fold2=mod(seed,17)+mod(seed,19)+mod(seed,23)+mod(seed,29)+mod(seed,31)
    foldre=mod(fold1+fold2,21)
    foldim=mod(3*fold1+2*fold2,19)

    phi_block(ii,iband) = DCMPLX(foldre,foldim)
   end do
  end do

 END SELECT

 ! =========================
 ! === Orthogonalization ===
 ! =========================
 call cholesky_ortho()
 call check_phi_block("After ortho")

 ! * Sub-space rotation.
 call subspace_rotation()
 call check_phi_block("After subspace_rotation")

! ===========================
! ==== Conjugate gradient ===
! ===========================

 allocate(hphi(ntrans),cg_dir(ntrans),old_cg_dir(ntrans), grad_dir(ntrans),prc_dir(ntrans), STAT=istat)
 ABI_CHECK(istat==0,"out-of-memory in cg vectors")

 max_nline=4
 resid(:)=-999; nline_for(1:nbands)=max_nline; convergence_of(1:nbands)=WORST

 do cg_step=1,cg_nsteps

  do iband=1,nbands

   if (prtvol>=10) then ! Tell us what is going on:
    write(msg,'(a,i6,2x,a,i3,a)')' --- exc_iterative_diago is called for band',iband,'for',nline_for(iband),' lines'
    call wrtout(std_out,msg,'PERS')
   end if

   my_phi => phi_block(my_t1:my_t2,iband)  ! Extraction of the vector that is iteratively updated.

   do iline=1,nline_for(iband)
    
    hphi = czero
    hphi = MATMUL(hexc, my_phi)  ! Compute etrial=<phi|H|phi> and the residual [H-etrial]|phi>.
    call xsum_mpi(hphi,spaceComm,ierr)

    etrial = DOT_PRODUCT(my_phi, hphi(my_t1:my_t2))
    call xsum_mpi(etrial,spaceComm,ierr)
    energy(iband) =  etrial
    
    grad_dir(my_t1:my_t2) = hphi(my_t1:my_t2) - etrial*my_phi      ! Compute residual (squared) norm.
    resid(iband) =  DOT_PRODUCT(grad_dir(my_t1:my_t2), grad_dir(my_t1:my_t2)) 
    call xsum_mpi(resid(iband),spaceComm,ierr)
    convergence_of(iband) = convergence_degree(resid(iband))

    ! Check that etrial is decreasing on succeeding lines:
    if (iline>1 .and. (etrial > etrial_old+tol12)) then
     write(msg,'(a,i8,a,1p,e14.6,a1,3x,a,1p,e14.6,a1)')&
&     ' New trial energy at line',iline,' = ',etrial,ch10,&
&     ' is higher than former:',etrial_old,ch10
     MSG_WARNING(msg)
    end if
    etrial_old = etrial

    ! If residual sufficiently small stop line minimization.
    if (convergence_of(iband)==STRICT) then
     if (prtvol>=10) then
      write(msg,'(a,i4,a,i2,a,es12.4)')&
&      ' cgwf: band',iband,' converged after ',iline,&
&      ' line minimizations : resid =',resid(iband)
      call wrtout(std_out,msg,'PERS')
     end if
     EXIT !iline
    end if

    ! === PROJECT THE STEEPEST DESCENT DIRECTION OVER THE SUBSPACE ORTHOGONAL TO OTHER BANDS ===

!   The following projection over the subspace orthogonal to occupied bands
!   is optional. It is a bit more accurate, but doubles the number of N^3 ops.
!   It is done only if ortalg>=0.

!   Project the steepest descent direction: direc(2,npw)=<G|H|Cnk> - \sum_{(i<=n)} <G|H|Cik> , normalized.

!    !!!!
!     MG Don't know why here we sum over i=<=n!!!!!!!!

    ! Grad_dir is already orthogonal to this band
    allocate(hji(nbands)); hji=czero
    do jj=1,nbands
     if (jj/=iband) hji(jj) = DOT_PRODUCT(phi_block(:,jj), hphi(my_t1:my_t2) )     
    end do
    call xsum_mpi(hji,spaceComm,ierr)

    do jj=1,nbands
     if (jj/=iband) grad_dir(my_t1:my_t2) = grad_dir(my_t1:my_t2) - hji(jj)*phi_block(:,jj)
    end do
    deallocate(hji)

   !=== PRECONDITION THE STEEPEST DESCENT DIRECTION ===
   if (use_precond) then

    den = DOT_PRODUCT(grad_dir(my_t1:my_t2), hexc_diagonal(my_t1:my_t2)*grad_dir(my_t1:my_t2) )
    call xsum_mpi(den,spaceComm,ierr)

    do ii=my_t1,my_t2
     xx = hexc_diagonal(ii)/den ! Teter polynomial ratio, modified according to Kresse, Furthmuller, PRB 54, 11169 (1996)
     poly=27._dp+xx*(18._dp+xx*(12._dp+xx*8._dp))
     fac=poly/(poly+16._dp*xx**4)
     kprc = fac*four/(three*den)
     prc_dir(ii) = kprc * grad_dir(ii)
    end do

    ! * PROJECT THE PRECOND. STEEPEST DESCENT DIRECTION OVER THE SUBSPACE ORTHOGONAL TO OTHER BANDS.
    allocate(hji(nbands)); hji=czero
    do jj=1,nbands
     hji(jj) = DOT_PRODUCT(phi_block(:,jj), prc_dir(my_t1:my_t2) )   
    end do
    call xsum_mpi(hji,spaceComm,ierr)

    do jj=1,nbands
     prc_dir(my_t1:my_t2) = prc_dir(my_t1:my_t2) - hji(jj)*phi_block(:,jj)
    end do
    deallocate(hji)

   else 
    prc_dir(my_t1:my_t2) = grad_dir(my_t1:my_t2)
   end if
    

   !=== COMPUTE THE CONJUGATE-GRADIENT ===
   dotgg = DOT_PRODUCT(prc_dir(my_t1:my_t2),grad_dir(my_t1:my_t2))
   call xsum_mpi(dotgg,spaceComm,ierr)

   if (iline==1) then ! At first iteration, cg_gamma is set to zero
    cg_gamma=zero
    old_dotgg=dotgg
    cg_dir = prc_dir
    old_cg_dir = cg_dir
   else
    cg_gamma=dotgg/old_dotgg
    old_dotgg=dotgg
    !write(*,*)"cg_gamma= ",cg_gamma
    !cg_dir = prc_dir + cg_gamma*cg_dir
    cg_dir = prc_dir + cg_gamma*old_cg_dir !TODO check this, anyhow it is much faster.
    old_cg_dir =cg_dir  ! old_cg_dir is used to store the previsou CG direction, cg_dir will be orthonormalized to the band
   end if

    ! === PROJECTION OF THE CONJUGATED GRADIENT ===
    zz = DOT_PRODUCT(my_phi, cg_dir(my_t1:my_t2))
    call xsum_mpi(zz,spaceComm,ierr)
    cg_dir(my_t1:my_t2) = cg_dir(my_t1:my_t2) -zz*my_phi(:)

    norm = DOT_PRODUCT(cg_dir(my_t1:my_t2), cg_dir(my_t1:my_t2) )
    call xsum_mpi(norm,spaceComm,ierr)
    norm = SQRT(norm) 
    cg_dir = cg_dir/norm ! Have to normalize it.

    ! Line minimization of the Raileigh functional. 
    allocate(vec_tmp(ntrans)); vec_tmp=czero
    vec_tmp = MATMUL(hexc, cg_dir(my_t1:my_t2))
    call xsum_mpi(vec_tmp,spaceComm,ierr)

#if DEV_MG_DEBUG_PARA
    if (my_rank==master) then 
     write(777,*)"cg_step, iband, iline",cg_step, iband, iline 
     write(777,*)vec_tmp
    end if
#endif

    dhd = DOT_PRODUCT( cg_dir(my_t1:my_t2), vec_tmp(my_t1:my_t2))      ! is this always real?
    dhc = REAL( DOT_PRODUCT( my_phi, vec_tmp(my_t1:my_t2) ))
    deallocate(vec_tmp)

    rbuf2 = (/dhd,dhc/)
    call xsum_mpi(rbuf2,spaceComm,ierr)
    dhd = rbuf2(1)
    dhc = rbuf2(2)

#if DEV_MG_DEBUG_PARA
   write(201*(my_rank+1),*)"cg_step, iband, iline dotgg dhd dhc",cg_step,iband,iline,dotgg,dhd,dhc
#endif

! Taken from cgwf
    ! Compute tan(2 theta),sin(theta) and cos(theta)
    tan2th=2.0_dp*dhc/(etrial-dhd)

    if (abs(tan2th)<1.d-05) then

     costh=1.0_dp-0.125_dp*tan2th**2
     sinth=0.5_dp*tan2th*(1.0_dp-0.375_dp*tan2th**2)

     ! Check that result is above machine precision 
! FIXME  This part is not safe on clusters made of different machines or different compiation options.
     if (abs(sinth)<epsilon(0._dp)) then
      write(msg, '(a,es16.4)' ) ' cgwf: converged with tan2th=',tan2th
      call wrtout(std_out,msg,'PERS')
      EXIT !Exit from the loop on iline
     end if

    else
     root=sqrt(1.0_dp+tan2th**2)
     costh=sqrt(0.5_dp+0.5_dp/root)
     sinth=sign(sqrt(0.5_dp-0.5_dp/root),tan2th)
    end if

    ! Check for lower of two possible roots (same sign as curvature at theta where slope is zero)
    diff=(etrial-dhd)
!   Swap c and d if value of diff is positive
    if (diff>zero) then
     swap=costh
     costh=-sinth
     sinth=swap
     if (prtvol<0 .or. prtvol>=10) then
      write(msg,'(a,2i4)')' Note: swap roots, iline,diff=',iline,diff
      call wrtout(std_out,msg,'PERS')
     end if
    end if

!=== GENERATE NEW |wf>, H|wf>  =============

    my_phi = costh*my_phi + sinth*cg_dir(my_t1:my_t2)
#if DEV_MG_DEBUG_PARA
    write(100*(my_rank+1),*)"cg_step iband, iline costh sinth etrial",cg_step,iband,iline,costh,sinth,etrial
#endif
!end taken from cgwf
 
    !norm = SQRT( DOT_PRODUCT(my_phi,my_phi) )
    !my_phi = my_phi /norm
    !write(*,*)norm
    !write(*,*)DOT_PRODUCT(hphi,my_phi),cos(theta_min)

!======================================================================
!=========== CHECK CONVERGENCE AGAINST TRIAL ENERGY ===================
!======================================================================
    ! Compute delta(E)
    !deltae=chc*(costh**2-1._dp)+dhd*sinth**2+2._dp*costh*sinth*dhc
    deltae=etrial*(costh**2-1._dp)+dhd*sinth**2+2._dp*costh*sinth*dhc

!   Check convergence and eventually exit
!    if (iline==1) then
!      deold=deltae
!    else if (abs(deltae)<0.005_dp*abs(deold) .and. iline/=nline_for(iband))then
!     if (prtvol>=10)then
!      write(msg, '(a,i4,1x,a,1p,e12.4,a,e12.4,a)' ) &
!&      ' cgwf: line',iline,&
!&      ' deltae=',deltae,' < 0.005*',deold,' =>skip lines'
!      call wrtout(std_out,msg,'PERS')
!     end if
!     energy(iband) = energy(iband) + deltae
!     EXIT
!    end if

   end do ! LOOP FOR A GIVEN BAND. Note that there are three "exit" instructions inside

   ! Modify nline_for(iband) according to converge degree.
   !if (convergence_of(iband) == STRICT) nline_for(iband) = MAX(max_nline-2,2)
   !if (convergence_of(iband) == MEDIUM) nline_for(iband) = MAX(max_nline-1,2)
   !if (convergence_of(iband) == WORST ) nline_for(iband) = max_nline
  end do !iband

  if (prtvol>2) then
   do ii=0,(nbands-1)/8
    write(msg,'(a,8es10.2)')' res:',(resid(iband),iband=1+ii*8,min(nbands,8+ii*8))
    call wrtout(std_out,msg,'COLL')
   end do
   do ii=0,(nbands-1)/8
    write(msg,'(a,8es10.2)')' ene:',(energy(iband),iband=1+ii*8,min(nbands,8+ii*8))
    call wrtout(std_out,msg,'COLL')
   end do
  end if

  write(msg,'(a,i0)')"After cg_step: ",cg_step
  call check_phi_block(msg)

  ! Find largest residual over bands and Print residuals
  max_resid=MAXVAL( resid(:MAX(1,nbands-nbdbuf_)) )

  if (max_resid < tolwfr_) then 
   write(msg,'(a,i0,2(a,es10.2),a,i0,a)')&
&   " After ",cg_step," iterations, max_resid= ",max_resid," < tolwfr= ",tolwfr_," ( Excluding nbdbuf= ",nbdbuf_,")"
   call wrtout(std_out,msg,'COLL')
   EXIT ! cg_step
  end if

  if (cg_step==1.or.MOD(cg_step,1)==0) then
   !msg = " Sub-space rotation + ortho + Fix phase inside CG loop"
   !call wrtout(std_out,msg,"COLL")

   call subspace_rotation()

   call cholesky_ortho() 

   !mcg=ntrans; mgsc=ntrans; useoverlap=0
   !allocate(cg(2,mcg),gsc(2,mgsc*useoverlap))
   !do ii=1,nbands
   ! cg(1,:) = REAL (phi_block(:,ii))
   ! cg(2,:) = AIMAG(phi_block(:,ii))
   ! call fxphas(cg,gsc,0,0,1,mcg,mgsc,MPI_enreg_seq,1,ntrans,useoverlap)
   ! phi_block(:,ii)=CMPLX(cg(1,:),cg(2,:))
   !end do
   !deallocate(cg,gsc)
  end if

 end do !cg_step

 ! Release some memory before entering RMM-DIIS
 deallocate(hphi,cg_dir,old_cg_dir,grad_dir,prc_dir)

 do ii=0,(nbands-1)/8
  write(msg,'(a,8es10.2)')' res:',(resid(iband),iband=1+ii*8,min(nbands,8+ii*8))
  call wrtout(std_out,msg,'COLL')
 end do
 do ii=0,(nbands-1)/8
  write(msg,'(a,8es10.2)')' ene:',(energy(iband),iband=1+ii*8,min(nbands,8+ii*8))
  call wrtout(std_out,msg,'COLL')
 end do

 if (max_resid > tolwfr_) then 
  write(msg,'(2a,i5,2a,2(a,es10.2),a,i3,a)')ch10,&
&   " WARNING: conjugate-gradient not converged after ",cg_step," iterations.",ch10,&
&   " max_resid= ",max_resid," > tolwfr= ",tolwfr_," ( Excluding nbdbuf= ",nbdbuf_,")"
  call wrtout(std_out,msg,'COLL')
  call wrtout(ab_out,msg,'COLL')
 end if

 excgap = MINVAL(energy(:))
 maxexc = MAXVAL(energy(:))

 write(msg,'(a,2(a,f7.2,2a))')ch10,&
& " First excitonic eigenvalue= ",excgap*Ha_eV," [eV]",ch10,&
& " Last  excitonic eigenvalue= ",maxexc*Ha_eV," [eV]",ch10
 call wrtout(std_out,msg,"COLL")
 call wrtout(ab_out,msg,"COLL")

 call check_phi_block("END OF CONJUGATE-GRADIENT")

 ! RMM-DIIS Algorithm.
 !do iband=1,nbands
 ! call rmm_diis_for(iband)
 !end do
 !call cholesky_ortho() 

 ! * Master writes the final results on file.
 if (my_rank==master) then
  msg=" Writing eigenvalues/vectors on file "//BS_files%out_eig
  call wrtout(std_out,msg,"COLL")
  eig_unt=get_unit()
  open(eig_unt,file=BS_files%out_eig,form='unformatted')
  write(eig_unt) nbands
  write(eig_unt) CMPLX(energy(1:nbands))
 end if

 ! Wavefunctions are gathered on the master node band-by-band.
 allocate(buffer_dpc(ntrans))
 do iband=1,nbands
  buffer_dpc=czero
  buffer_dpc(my_t1:my_t2) = phi_block(:,iband)
  call xsum_master(buffer_dpc,master,spaceComm,ierr) 
  if (my_rank==master) write(eig_unt) buffer_dpc(1:ntrans)
 end do
 deallocate(buffer_dpc)
                                                      
 if (my_rank==master) close(eig_unt)
 call xbarrier_mpi(spaceComm)

 deallocate(hexc)
 deallocate(phi_block)

 if (use_precond) then
  deallocate(hexc_diagonal)
 end if

 call xbarrier_mpi(spaceComm)

 MSG_ERROR("Done")

 DBG_EXIT("COLL")

CONTAINS  !===========================================================
!!***

!!****f* exc_iterative_diago/subspace_rotation
!! NAME
!! subspace_rotation
!!
!! FUNCTION
!!
!! INPUTS
!!
!! OUTPUT
!!
!! PARENTS
!!      exc_iterative_diago
!!
!! CHILDREN
!!      wrtout,zhegv
!!
!! SOURCE

subroutine subspace_rotation()


 implicit none

!Local variables ------------------------------
 integer :: ii,jj,ipack
!arrays
 real(dp),allocatable :: sub_ene_(:)
! real(dp),allocatable :: evec_(:,:) 
 complex(dpc),allocatable :: sub_ham_(:,:),sub_pham_(:),hphi_tot(:)
! complex(dpc),allocatable :: phi_tmp_(:,:)

!************************************************************************

 ! * Sub-space rotation. Calculate <phi_i|H|phi_j> in packed form.
 ! TODO: this part can be rewritten using BLAS3 routines.

 allocate(hphi_tot(ntrans))
 allocate(sub_pham_(nbands*(nbands+1)/2))
 sub_pham_=czero; ipack=0

 do jj=1,nbands
  hphi_tot = czero
  hphi_tot(:) = MATMUL(hexc, phi_block(:,jj))
  call xsum_mpi(hphi_tot,spaceComm,ierr)

  do ii=1,jj
   ipack=ipack+1
   sub_pham_(ipack) = DOT_PRODUCT(phi_block(my_t1:my_t2,ii), hphi_tot(my_t1:my_t2) )
   if (ii==jj) sub_pham_(ipack) = REAL(sub_pham_(ipack),kind=dp)
  end do
 end do
 call xsum_mpi(sub_pham_,spaceComm,ierr)

 allocate(sub_ham_(nbands,nbands)); sub_ham_=czero
 allocate(sub_ene_(nbands))

 call xhpev("Vectors","Upper",nbands,sub_pham_,sub_ene_,sub_ham_,nbands) !,comm)

 deallocate(hphi_tot)
 deallocate(sub_pham_)
 deallocate(sub_ene_)

 !do ii=1,nbands
 ! norm = DOT_PRODUCT(sub_ham_(:,ii),sub_ham_(:,ii))
 ! write(*,*)"norm subspac",norm
 ! sub_ham_(:,ii) = sub_ham_(:,ii)/norm
 !end do

 !allocate(evec_(2*nbands,nbands))

 !do ii=1,nbands
 ! do jj=1,nbands
 ! evec_(jj,  ii) = REAL (sub_ham_(jj,ii))
 ! evec_(jj+1,ii) = AIMAG(sub_ham_(jj,ii))
 ! end do
 !end do

 !call normev(evec_,nbands,nbands)

 !do ii=1,nbands
 ! do jj=1,nbands
 !  sub_ham_(jj,ii) = CMPLX( evec_(jj,ii),evec_(jj+1,ii) )
 ! end do
 !end do
 !deallocate(evec_)

#if 0
 allocate(phi_tmp_(my_nt,nbands), STAT=istat)
 ABI_CHECK(istat==0,"out-of-memory in phi_tmp_")
 phi_tmp_ = phi_block

 call ZGEMM('N','N',my_nt,nbands,nbands,cone,phi_tmp_,my_nt,sub_ham_,nbands,czero,phi_block,my_nt)

 deallocate(phi_tmp_)
#else
 phi_block(:,:) = MATMUL(phi_block,sub_ham_)
#endif

 deallocate(sub_ham_)

end subroutine subspace_rotation
!!***

!!****f* exc_iterative_diago/cholesky_ortho
!! NAME
!! cholesky_ortho
!!
!! FUNCTION
!!
!! INPUTS
!!
!! OUTPUT 
!!
!! PARENTS
!!      exc_iterative_diago
!!
!! CHILDREN
!!      wrtout,zhegv
!!
!! SOURCE

subroutine cholesky_ortho()


!This section has been created automatically by the script Abilint (TD).
!Do not modify the following lines by hand.
 use interfaces_linalg
!End of the abilint section

 implicit none

!Local variables ------------------------------
 integer :: my_info,ii,jj,ipack
!arrays
 complex(dpc),allocatable :: overlap_(:,:),povlp_(:)

!************************************************************************

 ! 1) overlap_ij =  <phi_i|phi_j>
 allocate(overlap_(nbands,nbands))

#if defined HAVE_BSE_UNPACKED
 overlap_ = czero

 call ZGEMM('C','N',nbands,nbands,my_nt,cone,phi_block,my_nt,phi_block,my_nt,czero,overlap_,nbands)
 call xsum_mpi(overlap_,spaceComm,ierr)
 
 do ii=1,nbands
  overlap_(ii,ii)=REAL(overlap_(ii,ii),kind=dp)
 end do

 ! 2) Cholesky factorization: overlap_ = U^H U with U upper triangle matrix.
 call ZPOTRF('U',nbands,overlap_,nbands,my_info)
 if (my_info/=0)  then
  write(msg,'(a,i3)')' ZPOTRF returned info= ',my_info
  MSG_ERROR(msg)
 end if 

#else

 ! 1) Calculate overlap_ij =  <phi_i|phi_j> in packed form.
 allocate(povlp_(nbands*(nbands+1)/2))
 povlp_ = czero; ipack=0
 do jj=1,nbands
  do ii=1,jj
   ipack=ipack+1
   povlp_(ipack) = DOT_PRODUCT( phi_block(my_t1:my_t2,ii), phi_block(my_t1:my_t2,jj) ) 
   if (ii==jj) povlp_(ipack) = REAL(povlp_(ipack),kind=dp)
  end do
 end do
 call xsum_mpi(povlp_,spaceComm,ierr)
                                                                                                  
 ! 2) Cholesky factorization: overlap_ = U^H U with U upper triangle matrix.
 call ZPPTRF ("U",nbands,povlp_,my_info)
 if (my_info/=0)  then
  write(msg,'(a,i3)')' ZPPTRF returned info= ',my_info
  MSG_ERROR(msg)
 end if 
 !call xsum_mpi(povlp_,spaceComm,ierr)
 !povlp_=povlp_/nprocs

 !unpack povlp_ to prepare call to ZTRSM.
 ipack=0
 do jj=1,nbands
  do ii=1,jj
   ipack=ipack+1
   if (ii/=jj) then
    overlap_(ii,jj)=      povlp_(ipack)
    overlap_(jj,ii)=CONJG(povlp_(ipack))
   else 
    overlap_(ii,ii)=REAL(povlp_(ipack),kind=dp)
   end if
  end do
 end do
 deallocate(povlp_)
#endif
 
 ! Check if this can be done with Scalapack. Direct PZTRSM is not provided

 ! 3) Solve X U = phi_block, on exit the phi_block treated by this node is orthonormalized.
 !call ZTRSM('R','U','N','N',ntrans,nbands,cone,overlap_,nbands,phi_block,ntrans)
 call ZTRSM('R','U','N','N',my_nt,nbands,cone,overlap_,nbands,phi_block,my_nt)
 deallocate(overlap_)

end subroutine cholesky_ortho
!!***

!!****f* exc_iterative_diago/convergence_degree
!! NAME
!! convergence_degree
!!
!! FUNCTION
!!
!! INPUTS
!!
!! OUTPUT 
!!
!! PARENTS
!!
!! CHILDREN
!!
!! SOURCE

function convergence_degree(resid)


 implicit none

!Arguments
 integer :: convergence_degree
 real(dp),intent(in) :: resid

!************************************************************************

 if (resid<tolwfr_) then 
  convergence_degree = STRICT
 else 
  convergence_degree = WORST
  if (resid<tolwfr_*10**5) convergence_degree = MEDIUM
 end if

end function convergence_degree
!!***

!!****f* exc_iterative_diago/check_phi_block
!! NAME
!! check_phi_block
!!
!! FUNCTION
!!
!! INPUTS
!!
!! OUTPUT 
!!
!! PARENTS
!!      exc_iterative_diago
!!
!! CHILDREN
!!      wrtout,zhegv
!!
!! SOURCE

subroutine check_phi_block(string)


!This section has been created automatically by the script Abilint (TD).
!Do not modify the following lines by hand.
!End of the abilint section

 implicit none

!Arguments ------------------------------------
!scalars
 character(len=*),intent(in) :: string

!Local variables ------------------------------
!scalars
 integer :: ii,jj
 real(dp) :: err,rdum
!arrays
 complex(dpc),allocatable :: buff_(:,:)
!no_abirules
!integer :: irank

!************************************************************************

 !if (nprocs==1) RETURN
 !return

#if 0
 allocate(buff_(ntrans,nbands))
 err = -one
 do irank=1,nprocs-1
  call xexch_mpi(phi_block,ntrans*nbands,irank,buff_,master,spaceComm,ierr)
  if (my_rank==master) then
   buff_ = buff_-phi_block
   err = MAX(err,MAXVAL(MAXVAL(ABS(buff_),DIM=1)))
  end if
  call xbarrier_mpi(spaceComm)
 end do
 deallocate(buff_)
#else

 allocate(buff_(nbands,nbands)); buff_=czero
 do jj=1,nbands
  do ii=1,jj
   buff_(ii,jj) = DOT_PRODUCT( phi_block(my_t1:my_t2,ii), phi_block(my_t1:my_t2,jj) )
  end do
 end do
 call xsum_mpi(buff_,spaceComm,ierr)

 err = -one
 do jj=1,nbands
  do ii=1,jj
   if (ii==jj) then 
    rdum =  ABS(buff_(ii,jj)-one)
   else 
    rdum =  ABS(buff_(ii,jj))
   end if
   err = MAX(err,rdum)
  end do
 end do
 deallocate(buff_)
#endif

 if (my_rank==master) then
  write(*,*)"After ",TRIM(string),", inconsistency errors in phi_block =",err
 end if

 !write(*,*)"master casts its own data"
 !call xcast_mpi(phi_block,master,spaceComm,ierr)

end subroutine check_phi_block
!!***

!!****f* exc_iterative_diago/rmm_diis_for
!! NAME
!! rmm_diis_for
!!
!! FUNCTION
!!
!! INPUTS
!!
!! OUTPUT 
!!
!! PARENTS
!!
!! CHILDREN
!!      wrtout,zhegv
!!
!! SOURCE

subroutine rmm_diis_for(iband)


!This section has been created automatically by the script Abilint (TD).
!Do not modify the following lines by hand.
 use interfaces_14_hidewrite
 use interfaces_69_bse
!End of the abilint section

 implicit none

!Arguments
 integer,intent(in) :: iband

!Local variables
!scalars
 integer,parameter :: DIIS_DIM=10
 integer :: idiis,lwork,info,ii
 real(dp) :: etrial_old,etrial,norm,den
 real(dp) :: fac,poly,xx
 complex(dpc) :: kprc
 character(len=500) :: msg
!arrays
 real(dp),allocatable :: diis_ene(:),rwork(:)
 complex(dpc),allocatable :: work(:)
 complex(dpc),allocatable :: hphi(:)
 complex(dpc),allocatable,target :: phi_diis(:,:),res_diis(:,:)
 complex(dpc),pointer :: res(:),phi(:),hexc_diagonal(:)
 complex(dpc),allocatable :: diis_mat1(:,:),diis_mat2(:,:)

!************************************************************************

 write(msg,'(a,i0)')" Entering rmm-diis for band: ",iband
 call wrtout(std_out,msg,"COLL")

 allocate(hphi(ntrans)) !,cg_dir(ntrans),old_cg_dir(ntrans), grad_dir(ntrans),prc_dir(ntrans), STAT=istat)

 allocate(phi_diis(ntrans,DIIS_DIM)) ! Arrays containing the iterative space.
 allocate(res_diis(ntrans,DIIS_DIM))

 ! phi_block is assumed to contains a good set of orthonormal vectors to be used as starting points. 
 ! Switch to RMM-DIIS.

 phi_diis(:,1)   = phi_block(:,iband) ! Exctract the state to be optimized
 phi => phi_diis(:,1)
 res => res_diis(:,1)

 hphi = MATMUL(hexc, phi) 
 norm = SQRT( DOT_PRODUCT(phi, phi) )
 etrial = DOT_PRODUCT(phi, hphi) / norm
 res = hphi - etrial*phi
 etrial_old = etrial

 ! TODO find optimal value for lambda.
 lambda = 0.5

 do idiis=2,DIIS_DIM

  phi => phi_diis(:,idiis)
  res => res_diis(:,idiis)

  ! Update trial vector.
  phi = phi_diis(:,idiis-1) + lambda*res_diis(:,idiis-1)

  hphi = MATMUL(hexc, phi) 
  norm = SQRT( DOT_PRODUCT(phi, phi) )
  etrial = DOT_PRODUCT(phi, hphi) / norm
  res = hphi - etrial*phi

  resid(iband) =  DOT_PRODUCT(res,res) 
  convergence_of(iband) = convergence_degree(resid(iband))

  ! Check that etrial is decreasing on succeeding lines:  
  if (idiis>1 .and. (etrial > etrial_old+tol12)) then
   write(msg,'(a,i8,a,1p,e14.6,a1,3x,a,1p,e14.6,a1)')&
&   ' DIIS: New trial energy at idiis',idiis,' = ',etrial,ch10,&
&   ' is higher than former:',etrial_old,ch10
   MSG_WARNING(msg)
  end if
  etrial_old = etrial

  ! If residual sufficiently small, stop line minimization.
  if (convergence_of(iband)==STRICT) then
   if (prtvol>=10) then
    write(msg,'(a,i4,a,i2,a,es12.4)')&
&    ' rmm-diis: band',iband,' converged after ',idiis,&
&    ' RMM-DIIS iterations : resid =',resid(iband)
    call wrtout(std_out,msg,'PERS')
   end if
   EXIT !iline
  end if

  ! preconditioning                                   
  if (use_precond) then
   den = DOT_PRODUCT(res, hexc_diagonal(:)*res )
   !call xsum_mpi(den,spaceComm,ierr)
                                                                                                                         
   do ii=1,ntrans
    xx = hexc_diagonal(ii)/den ! Teter polynomial ratio, modified according to Kresse, Furthmuller, PRB 54, 11169 (1996)
    poly=27._dp+xx*(18._dp+xx*(12._dp+xx*8._dp))
    fac=poly/(poly+16._dp*xx**4)
    kprc = fac*four/(three*den)
    res(ii) = kprc * res(ii)
   end do
  end if

  ! Direct inversion in the iterative subspace.
  allocate(diis_mat1(idiis,idiis))
  allocate(diis_mat2(idiis,idiis))

  do jj=1,idiis
   do ii=jj,idiis
    diis_mat1(ii,jj) = DOT_PRODUCT( res_diis(:,ii), res_diis(:,jj) ) 
    diis_mat2(ii,jj) = DOT_PRODUCT( phi_diis(:,ii), phi_diis(:,jj) ) 
   end do
  end do

  ! HereI can use zhegvx that however is not shipped with abinit.
  ! On exit, if JOBZ = 'V', then if INFO = 0, A contains the
  ! matrix Z of eigenvectors.  The eigenvectors are normalizedas follows:
  ! if ITYPE = 1 or 2, Z**H*B*Z = I;
  allocate(diis_ene(idiis))

  lwork = MAX(1,2*idiis-1)
  allocate(work(lwork))
  allocate(rwork(MAX(1,3*idiis-2)))

  call ZHEGV(1,"Vectors","Upper",idiis,diis_mat1,idiis,diis_mat2,idiis,diis_ene,work,lwork,rwork,info)

  if (info /=0) then
   write(msg,'(a,i4)')" ZHEGV returned info :",info
   MSG_ERROR(msg)
  end if

  deallocate(work,rwork)
  deallocate(diis_ene)
  deallocate(diis_mat2)

  ! Linear combination in the iterative subspace.
  !call ZGEMM('N','N',ntrans,nbands,nbands,cone,phi_block,ntrans,sub_ham,nbands,czero,phi_block,ntrans)
  hphi = MATMUL( phi_diis(:,1:idiis), diis_mat1(:,1) )
  phi_diis(:,idiis) = hphi

  deallocate(diis_mat1)

  !converged = .FALSE.
  !if (converged) ! Save result in phi_block(:,iband)
  ! phi_block(:,iband) = phi_diis(:,idiis); EXIT 
  !end if

 end do

 deallocate(hphi)
 deallocate(phi_diis,res_diis)

 ! Now phi_block contains the converged eigenvectors.
 ! To be orthogonalized when all states have been optimized.
 write(*,*)" RMM-DISS res: ",resid(iband)

end subroutine rmm_diis_for

end subroutine exc_iterative_diago
!!***
