!{\src2tex{textfont=tt}}
!!****f* ABINIT/exceig
!! NAME
!!  exceig
!!
!! FUNCTION
!!  Calculates eigenvalues and eigenvectors of the Hermitian excitonic Hamiltonian (coupling is neglected).
!!
!! COPYRIGHT
!! Copyright (C) 2009-2010 EXC and ABINIT group (L.Reining, V.Olevano, F.Sottile, S.Albrecht, G.Onida, M.Giantomassi)
!! This file is distributed under the terms of the
!! GNU General Public License, see ~abinit/COPYING
!! or http://www.gnu.org/copyleft/gpl.txt .
!! For the initials of contributors, see ~abinit/doc/developers/contributors.txt .
!!
!! INPUTS
!!  nh=Rank of the resonant block of the Hamiltoninan.
!!  BS_files<excfiles>=Datatype storing names and files used in the Bethe-Salpeter code.
!!    %exh=Name of the file storing the excitonic resonant part.
!!  spaceComm=MPI communicator.
!!  filbseig=The name of the output file
!!  prtvol=Verbosity level.
!!
!! OUTPUT
!!  Eigenvalues and eigenvectors are written on file filbseig
!!
!! PARENTS
!!      bethe_salpeter
!!
!! CHILDREN
!!      delete_file,destruction_matrix_scalapack,end_scalapack,idx_glob
!!      init_matrix_scalapack,init_scalapack,slk_write,slk_zinvert,wrtout,xginv
!!
!! SOURCE

#if defined HAVE_CONFIG_H
#include "config.h"
#endif

#include "abi_common.h"

! undef this option to perform the diagonalization in single-precision.
#define DEV_DIAGO_DP

! undef this option to perform the diagonalization in single-precision.
#define DEV_DIAGO_COUPL_DP

subroutine exceig(nh,BS_files,filbseig,prtvol,spaceComm)

 use defs_basis
 use m_bs_defs
 use defs_scalapack
 use m_xmpi
 use m_errors
#if defined HAVE_MPI && defined HAVE_MPI2
 use mpi
#endif

 use m_io_tools,   only : get_unit, delete_file, pick_aname
 use m_abilasi,    only : xheev, xheevx

!This section has been created automatically by the script Abilint (TD).
!Do not modify the following lines by hand.
 use interfaces_14_hidewrite
 use interfaces_51_manage_mpi
!End of the abilint section

 implicit none

#if defined HAVE_MPI && defined HAVE_MPI1
 include 'mpif.h'
#endif

!Arguments ------------------------------------
!scalars
 integer,intent(in) :: nh,spaceComm,prtvol
 type(excfiles),intent(in) ::  BS_files
 character(len=*),intent(in) :: filbseig

!Local variables ------------------------------
!scalars
#ifdef DEV_DIAGO_DP
 integer,parameter :: dgp=dp,dgpc=dpc
#else
 integer,parameter :: dgp=sp,dgpc=spc
#endif
 integer :: ii,it,itp,ir,mi,hreso_unt,eig_unt
 integer :: recl4dpc,ios
 integer :: istat
 integer :: nprocs,my_rank,master
 integer :: ldz,nene_printed
 real(dp) :: exc_gap,maxexc
 real(dp) :: abstol
 complex(dpc) :: ctemp
 logical :: use_scalapack,do_full_diago,use_mpiio
 character(len=500) :: msg
!arrays
 real(dgp),allocatable :: exc_ene(:)
 complex(dgpc),allocatable :: exc_mat(:,:)
 complex(dpc),allocatable :: exc_vec(:,:)
#if defined HAVE_LINALG_MPI
 integer :: il,iu,itloc,jj,jtloc,itglob,jtglob,istwf_k,tbloc,tmp_unt,mene_found,ierr
 real(dp) :: vl,vu
 real(dp),external :: PDLAMCH
 real(dp),allocatable :: exc_ene_dp(:)
 complex(dpc),allocatable :: cbuff_dpc(:)
 !complex(dpc),allocatable :: exc_mat_dpc(:,:)
 character(len=fnlen) :: tmp_fname
 type(matrix_scalapack)    :: Slk_mat,Slk_vec 
 type(processor_scalapack) :: Slk_processor
#endif

!************************************************************************

 nprocs  = xcomm_size(spaceComm)
 my_rank = xcomm_rank(spaceComm)
 master=0

 use_scalapack = .FALSE.
#if defined HAVE_LINALG_MPI
 use_scalapack = (nprocs > 1)
#endif

 use_mpiio=.FALSE.

 recl4dpc = get_reclen("dpc")
 do_full_diago=.TRUE.

 nene_printed = MIN(32,nh)
 if (prtvol>10) nene_printed = nh

 SELECT CASE (use_scalapack)
 CASE (.FALSE.)

   write(msg,'(a,i8)')' Direct diagonalization of the resonant excitonic Hamiltonian, no. transitions= ',nh
   call wrtout(std_out,msg,"COLL")
   call wrtout(ab_out,msg,"COLL")

   write(msg,'(a,f8.1,a)')' Allocating excitonic eigenvalues. Memory required: ', nh*dgp*b2Mb,' Mb. '
   call wrtout(std_out,msg,"COLL")
                                                                                                      
   allocate(exc_ene(nh), STAT=istat)
   ABI_CHECK(istat==0,'out of memory: excitonic eigenvalues')
                                                                                                        
   write(msg,'(a,f8.1,a)')' Allocating excitonic hamiltonian.  Memory required: ',nh*nh*dgpc*b2Mb,' Mb.'
   call wrtout(std_out,msg,"COLL")
                                                                                                        
   allocate(exc_mat(nh,nh), STAT=istat)
   ABI_CHECK(istat==0,'out of memory: excitonic hamiltonian')

   hreso_unt = get_unit()
   open(unit=hreso_unt,file=BS_files%exh,status='old',access='direct',recl=recl4dpc,iostat=ios)
   msg=' Opening file '//TRIM(BS_files%exh)//' as old with direct access.'
   ABI_CHECK(ios==0,msg)

   ! Check the size of hamiltonian matrix
   ! Not possible anymore, this info should be reported in the header.
   !$read(hreso_unt,rec=((nh*nh + nh)/2)+1) itemp
   !$if(itemp/=nh) then
   !$ write(msg,'(a,i0,a,i0,a)')'nt should be ',nh,' but read ',itemp,' wrong file fort.55 *.exh'
   !$ MSG_ERROR(msg)
   !$end if

   ! Construct full excitonic Hamiltonian using Hermiticity. file is always in double precision.
   do itp=1,nh
     do it=1,itp
       ir = it + itp * (itp - 1) / 2
       read(hreso_unt,rec=ir) ctemp
       exc_mat(it,itp) = ctemp
       exc_mat(itp,it) = CONJG(ctemp)
     end do
     exc_mat(itp,itp) = REAL(exc_mat(itp,itp)) !FIXME this is wrong is QP lifetimes are included
   end do

   close(hreso_unt)

   if (do_full_diago) then 
     call wrtout(std_out," Full diagonalization via XHEEV","COLL")
     call xheev("Vectors","Upper",nh,exc_mat,exc_ene)
   else
     call wrtout(std_out," Partial diagonalization via XHEEVX","COLL")
     abstol=zero; ldz=nh
     allocate(exc_vec(ldz,nh)) ! TODO Single precision is not available.
     !$call xheevx("Vectors","All","Upper",nh,exc_mat,vl,vu,il,iu,abstol,mene_found,exc_ene,exc_vec,ldz)
     exc_mat = exc_vec
     deallocate(exc_vec)
   end if

  ! ==============================================
  ! === Now exc_mat contains the eigenvectors ====
  ! ==============================================

  ! * Master node writes the final results.
  if (my_rank==master) then
    call wrtout(std_out,' Writing eigenvalues/vectors on file: '//TRIM(filbseig),"COLL")

    eig_unt=get_unit()
    open(eig_unt,file=filbseig,form='unformatted')

    write(eig_unt) nh
    write(eig_unt) CMPLX(exc_ene(1:nh),kind=dpc)
    do mi=1,nh
      write(eig_unt) CMPLX(exc_mat(1:nh,mi),kind=dpc)
    end do

    close(eig_unt)
  end if

  write(msg,'(a,i4)')' Excitonic eigenvalues in eV up to n= ',nene_printed 
  call wrtout(std_out,msg,"PERS")
  call wrtout(ab_out,msg,"COLL")
  do it=0,(nene_printed-1)/8
    write(msg,'(8f10.5)') ( exc_ene(ii)*Ha_eV, ii=1+it*8,MIN(it*8+8,nene_printed) )
    call wrtout(std_out,msg,"PERS")
    call wrtout(ab_out,msg,"COLL")
  end do

  exc_gap = MINVAL(exc_ene)
  maxexc  = MAXVAL(exc_ene)

  deallocate(exc_mat,exc_ene)

 CASE (.TRUE.)

#if defined HAVE_LINALG_MPI

   ! Here we always work in double-precision.
   write(msg,'(a,i8)')' Direct diagonalization of the resonant excitonic Hamiltonian, no. transitions= ',nh
   call wrtout(std_out,msg,"PERS")
   call wrtout(ab_out,msg,"COLL")

   istwf_k=1; tbloc=50
   write(msg,'(2(a,i0))')" Using scaLAPACK version with nprocs= ",nprocs,"; block size= ",tbloc
   call wrtout(std_out,msg,"PERS")
   call wrtout(ab_out,msg,"COLL")

   write(msg,'(a,f8.1,a)')' Allocating excitonic eigenvalues. Memory required: ', nh*dp*b2Mb,' Mb. '
   call wrtout(std_out,msg,"PERS")

   !call wrtout(std_out,"before init_scalapack","PERS")
   !call xbarrier_mpi(spaceComm)

   call init_scalapack(Slk_processor,spaceComm)

   !call wrtout(std_out,"init_scalapack","PERS")
   !call xbarrier_mpi(spaceComm)

   call init_matrix_scalapack(Slk_mat,nh,nh,Slk_processor,istwf_k,tbloc=tbloc)

   !call wrtout(std_out,"init_matrix1","PERS")
   !call xbarrier_mpi(spaceComm)

   call init_matrix_scalapack(Slk_vec,nh,nh,Slk_processor,istwf_k,tbloc=tbloc) 
   !call wrtout(std_out,"init_matrix2","PERS")

   !call xbarrier_mpi(spaceComm)

   ! Initialize scaLAPACK matrix from file.
   call slk_read(Slk_mat,"Upper","Hermitian",BS_files%exh,use_mpiio)

   !do jtloc=1,Slk_mat%sizeb_local(2)
   ! do itloc=1,Slk_mat%sizeb_local(1)
   !  call idx_glob(Slk_mat,itloc,jtloc,itglob,jtglob)
   !  write(100+my_rank,*)itglob,jtglob,Slk_mat%buffer_cplx(itloc,jtloc) 
   !  if (jtglob>=itglob) then 
   !   ir = itglob + jtglob*(jtglob-1)/2 ! Upper triagle is already stored on file.
   !   read(hreso_unt,rec=ir) ctemp
   !   Slk_mat%buffer_cplx(itloc,jtloc) = CMPLX(ctemp,kind=dpc)
   !  else                     
   !   ir = jtglob + itglob*(itglob-1)/2 ! Read the transposed elements and do the conjugate.
   !   read(hreso_unt,rec=ir) ctemp
   !   Slk_mat%buffer_cplx(itloc,jtloc) = CONJG( CMPLX(ctemp,kind=dpc) )
   !  end if
   ! end do
   !end do
   allocate(exc_ene_dp(nh))

   if (do_full_diago) then ! Direct diagonalization with scaLAPACK (full diago).
     call slk_pzheev("Vectors","Upper",Slk_mat,Slk_vec,exc_ene_dp)
   else ! Direct diagonalization with scaLAPACK (partial diago).
     abstol=zero !ABSTOL = PDLAMCH(spaceComm,'U')
     call slk_pzheevx("Vectors","All","Upper",Slk_mat,vl,vu,il,iu,abstol,Slk_vec,mene_found,exc_ene_dp)
     !call slk_pzheevx("Vectors","I","Upper",Slk_mat,vl,vu,1,125,abstol,Slk_vec,mene_found,exc_ene_dp)
   end if

   call destruction_matrix_scalapack(Slk_mat)

   call wrtout(std_out,' Writing eigenvalues/vectors on file: '//TRIM(filbseig),"COLL")

   !$ Very inefficient coding. Allocate global array to store the distributed eigenvectors  
   !$ Fill the matrix, then the master node writes the final results on file 
   !$allocate(exc_mat_dpc(nh,nh), STAT=istat)
   !$ABI_CHECK(istat==0,'out of memory: excitonic eigenvectors')
   !$exc_mat_dpc = czero

   !$call slk_matrix_to_global_dpc_2D(Slk_vec,"All",exc_mat_dpc)
   !$call xsum_master(exc_mat_dpc,master,spaceComm,ierr)

   !$if (my_rank==master) then
   !$ eig_unt=get_unit()
   !$ open(eig_unt,file=filbseig,form='unformatted')
   !$                                                      
   !$ write(eig_unt) nh
   !$ write(eig_unt) CMPLX(exc_ene(1:nh),kind=dpc)
   !$ do mi=1,nh
   !$  write(eig_unt) CMPLX(exc_mat_dpc(1:nh,mi),kind=dpc)
   !$ end do
   !$ close(eig_unt)
   !$end if
   !$
   !$deallocate(exc_mat_dpc)

   ! Write distributed matrix on file tmp_fname using streams instead of Fortran records.
   tmp_fname=pick_aname()
   call slk_write(Slk_vec,"All",tmp_fname,use_mpiio)

   call destruction_matrix_scalapack(Slk_vec)
   call end_scalapack(Slk_processor)

   ! Conversion: stream --> Fortran file with records
   if (my_rank==master) then 
     eig_unt=get_unit()
     open(eig_unt,file=filbseig,form='unformatted')

     write(eig_unt) nh
     write(eig_unt) CMPLX(exc_ene_dp(1:nh),kind=dpc)

     tmp_unt=get_unit()
     open(tmp_unt,file=tmp_fname,form='unformatted',access="direct",recl=recl4dpc)
     allocate(cbuff_dpc(nh))
     it=0
     do ii=1,nh
       do jj=1,nh
         it=it+1
         read(tmp_unt,rec=it) cbuff_dpc(jj)
       end do
       write(eig_unt) cbuff_dpc
     end do

     deallocate(cbuff_dpc)
     close(tmp_unt)
     close(eig_unt)
     call delete_file(tmp_fname,ierr)
   end if

   write(msg,'(a,i4)')' Excitonic eigenvalues in eV up to n= ',nene_printed 
   call wrtout(std_out,msg,"PERS")
   call wrtout(ab_out,msg,"COLL")
   do it=0,(nene_printed-1)/8
     write(msg,'(8f10.5)') ( exc_ene_dp(ii)*Ha_eV, ii=1+it*8,MIN(it*8+8,nene_printed) )
     call wrtout(std_out,msg,"PERS")
     call wrtout(ab_out,msg,"COLL")
   end do

   exc_gap = MINVAL(exc_ene_dp)
   maxexc  = MAXVAL(exc_ene_dp)

   deallocate(exc_ene_dp)

   call xbarrier_mpi(spaceComm)

#else
   MSG_BUG("You should not be here!")
#endif

 END SELECT

 write(msg,'(a,2(a,f7.2,2a))')ch10,&                          
&  " First excitonic eigenvalue= ",exc_gap*Ha_eV," [eV]",ch10,&
&  " Last  excitonic eigenvalue= ",maxexc*Ha_eV, " [eV]",ch10
 call wrtout(std_out,msg,"COLL")
 call wrtout(ab_out,msg,"COLL")

 msg=" "
 call wrtout(std_out,msg,"PERS")
 call wrtout(ab_out,msg,"COLL")

 call xbarrier_mpi(spaceComm)

end subroutine exceig
!!***

!----------------------------------------------------------------------

!!****f* ABINIT/printexcevl
!! NAME
!!  printexcevl
!!
!! FUNCTION
!!  Print excitonic eigenvalues on std_out and ab_out.
!!
!! COPYRIGHT
!! Copyright (C) 2009-2010 ABINIT group (L.Reining, V.Olevano, F.Sottile, S.Albrecht, G.Onida, M.Giantomassi)
!! This file is distributed under the terms of the
!! GNU General Public License, see ~abinit/COPYING
!! or http://www.gnu.org/copyleft/gpl.txt .
!! For the initials of contributors, see ~abinit/doc/developers/contributors.txt .
!!
!! INPUTS
!!  nh=Rank of the resonant block of the Hamiltoninan.
!!  gw_gap=GW direct gap.
!!  filbseig=The name of file containing eigenvalues and eigenvectors
!!
!! OUTPUT
!!  exc_gap=Excitonic direct gap.
!!  Additional info on the Excitonic spectrum are reported on standard output.
!!
!! PARENTS
!!      bethe_salpeter
!!
!! CHILDREN
!!      delete_file,destruction_matrix_scalapack,end_scalapack,idx_glob
!!      init_matrix_scalapack,init_scalapack,slk_write,slk_zinvert,wrtout,xginv
!!
!! SOURCE

subroutine printexcevl(nh,filbseig,gw_gap,exc_gap)

 use defs_basis
 use m_bs_defs
 use m_errors

 use m_io_tools,   only : get_unit

!This section has been created automatically by the script Abilint (TD).
!Do not modify the following lines by hand.
 use interfaces_14_hidewrite
!End of the abilint section

 implicit none

!Arguments ------------------------------------
!scalars
 integer,intent(in) :: nh
 complex(dpc),intent(in) :: gw_gap 
 complex(dpc),intent(out) :: exc_gap 
 character(len=*),intent(in) :: filbseig

!Local variables ------------------------------
!scalars
 integer :: n,ii,j,k,eig_unt,ieig,ios
 complex(dpc) :: bindenergy,ctemp
 character(len=500) :: msg
!arrays
 complex(dpc),allocatable :: exc_cene(:)

!************************************************************************

 exc_gap = czero

 eig_unt = get_unit()
 open(unit=eig_unt,file=filbseig,form='unformatted',status='old',iostat=ios)
 msg = "Opening file: "//TRIM(filbseig)
 ABI_CHECK(ios==0,msg)

 read(eig_unt) n

 if (n/=nh.and.n/=nh*2) then
   write(msg,'(a,i0)')' Wrong file'//filbseig, n
   close(eig_unt)
   MSG_ERROR(msg)
   !RETURN
 end if

 allocate(exc_cene(n))

 read(eig_unt) exc_cene(:)
 close(eig_unt)

 ! put in ascending order
 do ii=n,2,-1
   do j=1,ii-1
     if (DBLE(exc_cene(j)) > DBLE(exc_cene(j+1))) then
       ctemp = exc_cene(j)
       exc_cene(j) = exc_cene(j+1)
       exc_cene(j+1) = ctemp
     end if
   end do
 end do

 exc_gap = DCMPLX(ABS(DBLE(exc_cene(1))),AIMAG(exc_cene(1)))

 do ii=1,n
   if (ABS(DBLE(exc_cene(ii))) < DBLE(exc_gap)) then
     exc_gap = CMPLX(ABS(DBLE(exc_cene(ii))),AIMAG(exc_cene(ii)))
   end if
 end do

 bindenergy = gw_gap - exc_gap

 write(msg,"(3(a,2f6.2,2a))")&
&  " GW  direct gap     ",gw_gap*Ha_eV,     " [eV] ",ch10,&
&  " EXC direct gap     ",exc_gap*Ha_eV,    " [eV] ",ch10,&
&  " EXC binding energy ",bindenergy*Ha_eV," [eV] ",ch10
 call wrtout(std_out,msg,"COLL")
 call wrtout(ab_out,msg,"COLL")
 
 msg=' Excitonic eigenvalues up to the GW energy gap [eV]'
 call wrtout(std_out,msg,"COLL")
 call wrtout(ab_out,msg,"COLL")

 do ii=1,n
   if (DBLE(exc_cene(ii)) > zero) EXIT
 end do

 do j=ii,n
   if (DBLE(exc_cene(j)) > DBLE(gw_gap)) exit
 end do
 j=j-1

 do ieig=ii,j
   write(msg,'(i3,a,2f6.2,a)')ieig," (",exc_cene(ieig)*Ha_eV,")"
   call wrtout(std_out,msg,"COLL")
   call wrtout(ab_out,msg,"COLL")
 end do
 
 ii=ii-1
 do j=ii,1,-1
   if (ABS(DBLE(exc_cene(j))) > DBLE(gw_gap)) EXIT
 end do
 j=j+1

 ! This coding is not portable, write to ab_out has been disabled.
 if (ii>0) then 
   do k=ii,j,-1
     write(msg,'(i3,a,2f6.2,a)')k," (",exc_cene(k)*Ha_eV,")"
     call wrtout(std_out,msg,"COLL")
     !call wrtout(ab_out,msg,"COLL")
   end do
 end if

end subroutine printexcevl
!!***

!----------------------------------------------------------------------

!!****f* ABINIT/exccoupl
!! NAME
!!  exccoupl
!!
!! FUNCTION
!!  Calculate excitonic eigenvalues and eigenvectors by performing a direct diagonalization.
!!  of the non Hermitian excitonic Hamiltoninan (resonant + coupling). 
!!
!! COPYRIGHT
!! Copyright (C) 2009-2010 ABINIT group (L.Reining, V.Olevano, F.Sottile, S.Albrecht, G.Onida, M.Giantomassi)
!! This file is distributed under the terms of the
!! GNU General Public License, see ~abinit/COPYING
!! or http://www.gnu.org/copyleft/gpl.txt .
!! For the initials of contributors, see ~abinit/doc/developers/contributors.txt .
!!
!! INPUTS
!!  filbseig=The name of the output file.
!!  nh=Rank of the resonant block of the Hamiltoninan (equal to the rank of the coupling part)
!!  spaceComm=MPI communicator.
!!  BS_files<excfiles>=Datatype storing names and files used in the Bethe-Salpeter code.
!!    %exh=Name of the file storing the excitonin resonant part.
!!    %exc=Name of the file storing the coupling part.
!!
!! OUTPUT
!!  Excitonic eigenvectors and eigenvalues are written on file BS_files%exeig.
!!
!! PARENTS
!!      bethe_salpeter
!!
!! CHILDREN
!!      delete_file,destruction_matrix_scalapack,end_scalapack,idx_glob
!!      init_matrix_scalapack,init_scalapack,slk_write,slk_zinvert,wrtout,xginv
!!
!! SOURCE

subroutine exccoupl(nh,filbseig,BS_files,prtvol,spaceComm)
      
 use defs_basis
 use defs_abitypes
 use m_bs_defs
 use m_xmpi
 use m_errors
#if defined HAVE_MPI && defined HAVE_MPI2
 use mpi
#endif

 use m_io_tools, only : get_unit
 use m_abilasi,  only : xgeev

!This section has been created automatically by the script Abilint (TD).
!Do not modify the following lines by hand.
 use interfaces_14_hidewrite
!End of the abilint section

 implicit none

#if defined HAVE_MPI && defined HAVE_MPI1
 include 'mpif.h'
#endif

!Arguments ------------------------------------
!scalars
 integer,intent(in) :: nh,spaceComm,prtvol
 type(excfiles),intent(in) :: BS_files
 character(len=*),intent(in) :: filbseig

!Local variables ------------------------------
!scalars
#ifdef DEV_DIAGO_COUPL_DP
 integer,parameter :: dgp=dp,dgpc=dpc
#else
 integer,parameter :: dgp=sp,dgpc=spc
#endif
 integer,parameter :: ldvl = 1
 integer :: ii,nh2,hreso_unt,ios,hcoup_unt,eig_unt
 integer :: mi,it,itp,ir,istat,nprocs,master,my_rank
 integer :: recl4dpc,nene_printed !ierr,
 real(dp) :: exc_gap,maxexc,temp
 complex(dpc) :: cttp,http
 character(len=500) :: msg
!arrays
 complex(dgpc),allocatable :: exc_ham(:,:),exc_rvect(:,:),exc_ene(:)
 complex(dgpc) :: vl_dpc(ldvl,1)

!************************************************************************

 nprocs  = xcomm_size(spaceComm)
 my_rank = xcomm_rank(spaceComm)
 master=0

 ! Only master performs the diagonalization. ScaLAPACK does not provide the parallel version of ZGEEV. 
 if (my_rank/=master) GOTO 10
      
 nh2 = 2*nh
 write(msg,'(a,i8)')' Direct diagonalization of the full excitonic Hamiltonian, no. transitions= ',nh2
 call wrtout(std_out,msg,"COLL")
 call wrtout(ab_out,msg,"COLL")

 write(msg,'(3a,f8.1,a)')&
&  ' Allocating full excitonic hamiltonian ',ch10,&
&  ' memory-space requested: ',nh2*nh2*dgpc*b2MB,' Mb. '
 call wrtout(std_out,msg,"COLL")

 allocate(exc_ham(nh2,nh2), STAT=istat)
 ABI_CHECK(istat==0,'out of memory: full excitonic hamiltonian')

 write(msg,'(3a,f8.1,3a,f8.1,a)')&
&  ' Allocating excitonic eigenvalues and eigenvectors. ',ch10,&
&  ' Memory-space requested: ',nh2*dgpc*b2MB   ,' Mb. ',ch10,&
&  ' Memory-space requested: ',nh2**2*dgpc*b2MB,' Mb. '
 call wrtout(std_out,msg,"COLL")

 allocate(exc_ene(nh2), STAT=istat)
 ABI_CHECK(istat==0,'out of memory: exc_ene')

 allocate(exc_rvect(nh2,nh2), STAT=istat)
 ABI_CHECK(istat==0,"out of memory: excitonic eigenvectors")

 hreso_unt = get_unit()
 recl4dpc = get_reclen("dpc")
 open(unit=hreso_unt,file=BS_files%exh,status='old',access='direct',recl=recl4dpc,iostat=ios)
 ABI_CHECK(ios==0,"Opening reso unit")

 call wrtout(std_out,' Reading resonant excitonic Hamiltonian ',"COLL")

 ! Check the size of hamiltonian matrix
 ! Not possible anymore, this info should be reported in the header.
 !$read(hreso_unt,rec=((nh*nh + nh)/2)+1) itemp
 !$ABI_CHECK(itemp==nh,'Wrong resonant file')

 do itp=1,nh
   do it=1,itp
     ir = it + itp * (itp - 1) / 2
     read(hreso_unt,rec=ir) http
     exc_ham(it ,itp) = http
     exc_ham(itp,it ) = CONJG(http)
     exc_ham(nh+it ,nh+itp) = - CONJG(http)
     exc_ham(nh+itp,nh+it ) = - http
   end do
   exc_ham(itp,itp)       = REAL(exc_ham(itp   ,itp   ),kind=dgpc)  !FIXME this is wrong is life times are included
   exc_ham(nh+itp,nh+itp) = REAL(exc_ham(nh+itp,nh+itp),kind=dgpc)
 end do

 close(hreso_unt)

 call wrtout(std_out,' Reading coupling excitonic Hamiltonian',"COLL")

 hcoup_unt = get_unit()
 open(unit=hcoup_unt,file=BS_files%exc,status='old',access='direct',recl=recl4dpc,iostat=ios)
 ABI_CHECK(ios==0,"opening coupling h")

 ! Check the size of hamiltonian matrix
 ! Not possible anymore, this info should be reported in the header.
 !$read(hcoup_unt,rec=((nh*nh + nh)/2)+1) itemp
 !$ABI_CHECK(itemp==nh,'wrong file .exc')

 do itp=1,nh
   do it=1,itp
     ir = it + itp * (itp - 1) / 2
     read(hcoup_unt,rec=ir) cttp
     exc_ham(it    ,nh+itp) = cttp
     exc_ham(itp   ,nh+it ) = cttp
     exc_ham(nh+it ,itp   ) = - CONJG(cttp)
     exc_ham(nh+itp,it    ) = - CONJG(cttp)
   end do
 end do

 close(hcoup_unt)

 ! ======================================================
 ! ==== Calculate right eigenvectors and eigenvalues ====
 ! ======================================================

#ifdef DEV_DIAGO_COUPL_DP
 call wrtout(std_out,' Diagonalizing full excitonic Hamiltonian. double precision version ',"COLL")
#else
 call wrtout(std_out,' Diagonalizing full excitonic Hamiltonian. double precision version ',"COLL")
#endif

 call xgeev("No_left_eigen","Vectors",nh2,exc_ham,nh2,exc_ene,vl_dpc,ldvl,exc_rvect,nh2)

 deallocate(exc_ham)

 exc_gap = MINVAL(ABS(DBLE (exc_ene)))
 maxexc  = MAXVAL(ABS(DBLE (exc_ene)))
 temp    = MAXVAL(ABS(AIMAG(exc_ene)))

 write(msg,'(2(a,f7.2,2a),a,es8.2,2a)')&
&  " First excitonic eigenvalue: ",exc_gap*Ha_eV," [eV].",ch10,&
&  " Last  excitonic eigenvalue: ",maxexc*Ha_eV, " [eV].",ch10,& 
&  " Largest imaginary part:     ",temp*Ha_eV,   " [eV] ",ch10
 call wrtout(std_out,msg,"COLL")
 call wrtout(ab_out,msg,"COLL")

 nene_printed = MIN(32,nh2)
 if (prtvol>10) nene_printed = nh2

 ! This is not portable as the the eigenvalues calculated by ZGEEV are not sorted.
 ! Even two subsequent calculations with the same input on the same machine
 ! might produce differing orderings. Might sort the eigenvalues though, just for printing.

 write(msg,'(a,i4)')' Complex excitonic eigenvalues in eV up to n= ',nene_printed 
 call wrtout(std_out,msg,"PERS")
 !call wrtout(ab_out,msg,"COLL")

 do it=0,(nene_printed-1)/4
   write(msg,'(8f10.5)') ( exc_ene(ii)*Ha_eV, ii=1+it*4,MIN(it*4+4,nene_printed) )
   call wrtout(std_out,msg,"PERS")
   !call wrtout(ab_out,msg,"COLL")
 end do

 msg=" "
 call wrtout(std_out,msg,"PERS")
 !call wrtout(ab_out,msg,"COLL")
      
 call wrtout(std_out," Writing eigenvalues and eigenvectors on file","COLL")

 eig_unt = get_unit()
 open(eig_unt,file=filbseig,form='unformatted',iostat=ios)
 msg=" Error Opening file: "//TRIM(filbseig)
 ABI_CHECK(ios==0,msg)

 write(eig_unt) nh2
 write(eig_unt) CMPLX(exc_ene(:),kind=dpc)
 do mi=1,nh2
   write(eig_unt) CMPLX(exc_rvect(:,mi),kind=dpc)
 end do

 close(eig_unt)
 deallocate(exc_ene,exc_rvect)

10 call xbarrier_mpi(spaceComm)
          
end subroutine exccoupl
!!***

!----------------------------------------------------------------------

!!****f* ABINIT/excoverlap
!! NAME
!!  excoverlap
!!
!! FUNCTION
!!  Calculate the overlap matrix between excitonic eigenvectors if the case in which
!!  the coupling between resonant and anti-resonant transitions is treated exactly.
!!
!! COPYRIGHT
!! Copyright (C) 2009-2010 ABINIT group (L.Reining, V.Olevano, F.Sottile, S.Albrecht, G.Onida, M.Giantomassi)
!! This file is distributed under the terms of the
!! GNU General Public License, see ~abinit/COPYING
!! or http://www.gnu.org/copyleft/gpl.txt .
!! For the initials of contributors, see ~abinit/doc/developers/contributors.txt .
!!
!! INPUTS
!!  nh=Rank of the resonant block of the Hamiltonian (equal to the rank of the coupling part)
!!  spaceComm=MPI communicator
!!  BS_files<excfiles>=Datatype storing names and files used in the Bethe-Salpeter code.
!!
!! OUTPUT
!!  The overlap matrix s^-1 is written of file BS_files%exovl
!!
!! PARENTS
!!      bethe_salpeter
!!
!! CHILDREN
!!      delete_file,destruction_matrix_scalapack,end_scalapack,idx_glob
!!      init_matrix_scalapack,init_scalapack,slk_write,slk_zinvert,wrtout,xginv
!!
!! SOURCE

subroutine excoverlap(nh,filbseig,BS_files,spaceComm)

 use defs_basis
 use defs_abitypes
 use defs_scalapack
 use m_bs_defs
 use m_xmpi
 use m_errors

 use m_io_tools,  only : get_unit, delete_file, pick_aname
 use m_blas,      only : xdotc
 use m_abilasi,   only : xginv

!This section has been created automatically by the script Abilint (TD).
!Do not modify the following lines by hand.
 use interfaces_14_hidewrite
 use interfaces_51_manage_mpi
!End of the abilint section

 implicit none

!Arguments ------------------------------------
!scalars
 integer,intent(in) :: nh,spaceComm
 type(excfiles),intent(in) :: BS_files
 character(len=*),intent(in) :: filbseig

!Local variables ------------------------------
!scalars
 integer :: nh2,ios,eig_unt,ovlp_unt,nprocs,master,my_rank,mi,mip,istat,itemp
 logical :: use_scalapack,use_mpiio
 character(len=500) :: msg
!arrays
 complex(spc),allocatable :: rvec(:,:),ovlp(:,:)
 complex(dpc),allocatable :: buff_dpc(:)
#if defined HAVE_LINALG_MPI
 integer :: itloc,jtloc,itglob,jtglob,istwf_k,tbloc,tmp_unt,recl4dpc,it,ii,jj,ierr
 !real(dp),external :: PDLAMCH
 character(len=fnlen) :: tmp_fname
 type(matrix_scalapack)    :: Slk_mat
 type(processor_scalapack) :: Slk_processor
 complex(dpc),allocatable :: cbuff(:)
#endif

!************************************************************************

 nprocs  = xcomm_size(spaceComm)
 my_rank = xcomm_rank(spaceComm)
 master=0

 use_scalapack = .FALSE.
#if defined HAVE_LINALG_MPI
 use_scalapack = (nprocs > 1)
#endif

 use_mpiio=.FALSE.

 if (.not.use_scalapack .and. my_rank/=master) GOTO 10

 nh2=2*nh
 write(msg,'(a,i0)')' Calculating overlap matrix, matrix rank= ',nh2
 call wrtout(std_out,msg,"COLL")

 eig_unt = get_unit()
 open(unit=eig_unt,file=filbseig,form='unformatted',status='old',iostat=ios)
 msg = " Opening file: "//TRIM(filbseig)
 ABI_CHECK(ios==0,msg)

 read(eig_unt) itemp
 ABI_CHECK(itemp==nh2," Wrong file "//TRIM(filbseig))

 SELECT CASE (use_scalapack)
 CASE (.FALSE.)

  allocate(rvec(nh2,nh2), STAT=istat)
  ABI_CHECK(istat==0,'out of memory rvec')

  allocate(ovlp(nh2,nh2), STAT=istat)
  ABI_CHECK(istat==0,'out of memory in ovlp matrix')

  call wrtout(std_out," Reading excitonic eigenvectors ","COLL")

  allocate(buff_dpc(nh2))

  read(eig_unt)  ! Skip eigenvalues.
  do mi=1,nh2    ! Read right eigenvectors.
    read(eig_unt) buff_dpc(:)
    rvec(:,mi) = buff_dpc
  end do

  deallocate(buff_dpc)
 
  call wrtout(std_out,' Calculating overlap matrix ',"COLL")

  do mi=1,nh2
    do mip=1,nh2
      ovlp(mi,mip) = xdotc(nh2,rvec(:,mi),1,rvec(:,mip),1)
    end do
  end do
  deallocate(rvec)
       
  call wrtout(std_out," Inverting overlap matrix ","COLL")

  call xginv(ovlp,nh2)

  if (my_rank==master) then
    call wrtout(std_out,' Writing overlap matrix s^-1 on file '//TRIM(BS_files%exovl),"COLL")

    ovlp_unt = get_unit()
    open(unit=ovlp_unt,file=BS_files%exovl,form='unformatted',iostat=ios)

    write(ovlp_unt) nh2
    do mi=1,nh2 ! write overlap matrix s^-1(mi,:)
      write(ovlp_unt) CMPLX(ovlp(mi,:),kind=dpc)
    end do
    close(ovlp_unt)
  end if

  deallocate(ovlp)

 CASE (.TRUE.)
                           
#if defined HAVE_LINALG_MPI

  istwf_k=1; tbloc=50
  write(msg,'(2(a,i0))')" Using scaLAPACK version with nprocs= ",nprocs,"; block size= ",tbloc
  call wrtout(std_out,msg,"PERS")
  call wrtout(ab_out,msg,"COLL")

  call init_scalapack(Slk_processor,spaceComm)
  call init_matrix_scalapack(Slk_mat,nh2,nh2,Slk_processor,istwf_k,tbloc=tbloc)

  ! Very inefficient coding, it can be optimized. 
  ! For example each node reads the subset it is dealing with. 
  allocate(rvec(nh2,nh2), STAT=istat)
  ABI_CHECK(istat==0,'out of memory in rvec')
  allocate(buff_dpc(nh2))

  read(eig_unt) ! Skip eigenvalues.
  do mi=1,nh2
    read(eig_unt) buff_dpc(:)
    rvec(:,mi) = buff_dpc
  end do

  deallocate(buff_dpc)

  call wrtout(std_out," Calculating overlap matrix ","COLL")

  ! Fill the local matrix
  do jtloc=1,Slk_mat%sizeb_local(2)
    do itloc=1,Slk_mat%sizeb_local(1)
      call idx_glob(Slk_mat,itloc,jtloc,itglob,jtglob)
      Slk_mat%buffer_cplx(itloc,jtloc) = xdotc(nh2,rvec(:,itglob),1,rvec(:,jtglob),1)
    end do
  end do

  deallocate(rvec)

  ! Invert the matrix with scaLAPACK.
  call slk_zinvert(Slk_mat)

  ! Gather the results on the master node.
  ! Very inefficient numb coding. Allocate global array to store the distributed eigenvectors  
  ! Fill the matrix, then master node writes the final results on file 
  ! (better coding requires either scaLAPACK tool or MPI-IO (the later should be much faster).

  !$allocate(ovlp_dpc(nh2,nh2), STAT=istat)
  !$ABI_CHECK(istat==0,'out of memory ovlp_dpc')
  !$ovlp_dpc = czero

  !$call slk_matrix_to_global_dpc_2D(Slk_mat,"All",ovlp_dpc)
  !$call xsum_master(ovlp_dpc,master,spaceComm,ierr)

  !$if (my_rank==master) then
  !$ msg=' Writing overlap matrix s^-1 on file '//TRIM(BS_files%exovl)
  !$ call wrtout(std_out,msg,"PERS")

  !$ ovlp_unt = get_unit()
  !$ open(unit=ovlp_unt,file=BS_files%exovl,form='unformatted',iostat=ios)

  !$ write(ovlp_unt) nh2
  !$ do mi=1,nh2 ! write overlap matrix s^-1(mi,:)
  !$  write(ovlp_unt) CMPLX(ovlp_dpc(mi,:),kind=dpc)
  !$ end do
  !$ close(ovlp_unt)
  !$end if

  !$deallocate(ovlp_dpc)

  ! Use MPI-IO
  ! Write distributed matrix on file tmp_fname using streams instead of Fortran records.
  tmp_fname=pick_aname()
  call slk_write(Slk_mat,"All",tmp_fname,use_mpiio)

  ! Conversion: stream --> Fortran sequential mode with records
  if (my_rank==master) then 
    allocate(cbuff(nh2))

    ovlp_unt = get_unit()
    open(unit=ovlp_unt,file=BS_files%exovl,form='unformatted',iostat=ios)

    tmp_unt=get_unit()
    recl4dpc = get_reclen("dpc")
    open(tmp_unt,file=tmp_fname,form='unformatted',access="direct",recl=recl4dpc)

    write(ovlp_unt) nh2
    
    it=0 
    do ii=1,nh2 ! write overlap matrix s^-1(mi,:)
      do jj=1,nh2
        it=ii+(jj-1)*nh2
        read(tmp_unt,rec=it) cbuff(jj)
      end do
      write(ovlp_unt) cbuff
    end do

    close(ovlp_unt)
    close(tmp_unt)
                                                                                  
    deallocate(cbuff)
    call delete_file(tmp_fname,ierr)
  end if

  call destruction_matrix_scalapack(Slk_mat)
  call end_scalapack(Slk_processor)

#else
 MSG_BUG("You should not be here!")
#endif

 END SELECT

 close(eig_unt)

10 call xbarrier_mpi(spaceComm)
       
end subroutine excoverlap
!!***
