!{\src2tex{textfont=tt}}
!!****p* ABINIT/fftprof
!! NAME
!! fftprof
!!
!! FUNCTION
!!  Utility for profiling the FFT libraries supported by abinit.
!!
!! COPYRIGHT
!! Copyright (C) 2004-2009 ABINIT group (MG)
!! This file is distributed under the terms of the
!! GNU General Public License, see ~abinit/COPYING
!! or http://www.gnu.org/copyleft/gpl.txt .
!!
!! INPUTS
!!  (main program)
!!
!! OUTPUT
!!  Timing analysis of the different FFT libraries and algorithms.
!!
!! NOTES
!!  Point-group symmetries are not taken into account in getng during the generation
!!  of the FFT mesh. Therefore the FFT mesh might differ from the one
!!  found by abinit for the same cutoff and Bravais lattice (actually it might be smaller).
!!
!! PARENTS
!!
!! CHILDREN
!!      destroy_fft_prof,destroy_fft_test,destroy_mpi_enreg
!!      fftprof_ncalls_per_test,fftw3_gain_wisdom,fftw3_set_in_place,get_kg
!!      init_fft_test,initmpi_seq,metric,mpi_comm_rank,mpi_comm_size
!!      nullify_mpi_enreg,print_fft_profs,prompt,time_fourdp,time_fourdp_cplx
!!      time_fourwf,time_rhotwg,wrtout,xmpi_end,xmpi_init
!!
!! SOURCE

#if defined HAVE_CONFIG_H
#include "config.h"
#endif

#include "abi_common.h"

program fftprof

 use defs_basis
 use defs_abitypes
 use m_xmpi
 use m_FFT_prof
#if defined HAVE_MPI && defined HAVE_MPI2
 use mpi
#endif

 use m_io_tools,   only : prompt
 use m_gsphere,    only : get_kg

!This section has been created automatically by the script Abilint (TD).
!Do not modify the following lines by hand.
 use interfaces_14_hidewrite
 use interfaces_42_geometry
 use interfaces_51_manage_mpi
!End of the abilint section

 implicit none

#if defined HAVE_MPI && defined HAVE_MPI1
 include 'mpif.h'
#endif

!Arguments -----------------------------------

!Local variables-------------------------------
!scalars
 integer,parameter :: nsym1=1
 integer :: ndat,ierr,isym,fftcache,it,cplex,ntests,option_fourwf,osc_npw
 integer :: n1,n2,n3,use_wisdom,do_outofplace,ncalls,map2sphere,use_padfft
 character(len=500) :: header
 real(dp) :: ecut,ucvol,boxcutmin,osc_ecut
 type(MPI_type) :: MPI_enreg
!arrays
 integer :: symrel(3,3,nsym1)
 real(dp),parameter :: gamma_point(3)=(/zero,zero,zero/)
 real(dp) :: rprimd(3,3),gmet(3,3),gprimd(3,3),rmet(3,3),kpoint(3)
 type(FFT_test_t),allocatable :: Ftest(:)
 type(FFT_prof_t),allocatable :: Ftprof(:)
 integer,pointer :: osc_gvec(:,:)
 integer,allocatable :: fft_setups(:,:)

! *************************************************************************

 call xmpi_init()

 call nullify_mpi_enreg(mpi_enreg)
 call initmpi_seq(MPI_enreg)

#ifdef HAVE_MPI
 MPI_enreg%world_comm=MPI_COMM_WORLD
 MPI_enreg%world_group=MPI_GROUP_NULL
 call MPI_COMM_RANK(MPI_COMM_WORLD,MPI_enreg%me,ierr)
 call MPI_COMM_SIZE(MPI_COMM_WORLD,MPI_enreg%nproc,ierr)
 MPI_enreg%paral_compil=1
 MPI_enreg%paral_compil_respfn=0
 MPI_enreg%paral_level=2
#endif

 call prompt("Enter ecut in Hartree: ",ecut)
 call prompt("Enter rprimd in Bohr: ",rprimd)
 
 call metric(gmet,gprimd,-1,rmet,rprimd,ucvol)

 do isym=1,nsym1
   symrel(:,:,isym) = RESHAPE((/1,0,0,0,1,0,0,0,1/),(/3,3/))
 end do

!kpoint  = gamma_point
 kpoint = (/0.3,0.5,0.25/)

 call prompt("Number of calls for each test? ",ncalls)
 if (ncalls>0) then 
   call fftprof_ncalls_per_test(ncalls)
 else
   call fftprof_ncalls_per_test(20)
 end if

!fftcache is machine-dependent.
!ngfft_sg(8)=16
!#if defined FC_FUJITSU || defined FC_NEC || defined FC_HITACHI
!ngfft_sg(8)=8096    ! Large value for the vector machines
!#elif defined i386
!ngfft_sg(8)=256     ! Was optimized for my PII 450MHz
!#endif

 ntests=2; ndat=1
 allocate(fft_setups(3,ntests)) ! First dimension contains fftalg,fftcache, ndat
 fftcache = 0
 fft_setups(:,1) = (/312,fftcache,ndat/)
 fftcache = 16
!fft_setups(:,2) = (/412,16,ndat/)
 fft_setups(:,2) = (/112,fftcache,ndat/)

 allocate(Ftest(ntests))
 allocate(Ftprof(ntests))

!Needed for crappy compilers that do not support => null() in type declarations.
 do it=1,ntests
   nullify(Ftest(it)%kg_k)
   nullify(Ftprof(it)%results)
 end do

 boxcutmin = two
!boxcutmin = one
 do it=1,ntests
   call init_FFT_test(Ftest(it),fft_setups(:,it),kpoint,ecut,boxcutmin,gmet,nsym1,symrel,MPI_enreg,&
&   use_istwfk=.FALSE.,store_results=.FALSE.)
!  &    use_istwfk=.TRUE.,store_results=.FALSE.)
 end do

 use_wisdom = 0
!call prompt(" Do you want to to use FFTW3 wisdom?: ",use_wisdom)
 if (use_wisdom/=0) then
   call wrtout(std_out," gaining FFTW wisdom","COLL")
   n1 = Ftest(1)%ngfft(1)
   n2 = Ftest(1)%ngfft(2)
   n3 = Ftest(1)%ngfft(3)
   call fftw3_gain_wisdom(n1,n2,n3,ndat)
 end if

!call prompt("Do you wanto to run FFTW3 out-of-place?: ",do_outofplace)
 do_outofplace = 0
 if (do_outofplace /= 0) then 
   call fftw3_set_in_place(.FALSE.)
 else
   call fftw3_set_in_place(.TRUE.)
 end if

!call fftw3_set_nthreads(nthreads=1)

!=======================
!==== fourdp timing ====
!=======================

 do cplex=1,2
   write(header,'(a,i2)')"fourdp with cplex: ",cplex     
   do it=1,ntests
     call time_fourdp(Ftest(it),cplex,Ftprof(it))
   end do
   call print_FFT_profs(Ftprof,header)
 end do

!===========================================
!==== fourdp timing with complex arrays ====
!===========================================
!These routines are used in the GW part, FFTW3 is expected to                 
!be more efficient as the conversion complex(:) <--> real(2,:) is not needed. 

 header = "fourdp_cplx in-place (GW code)"
 do it=1,ntests
   call time_fourdp_cplx(Ftest(it),Ftprof(it),.TRUE.)
 end do
 call print_FFT_profs(Ftprof,header)

 header = "fourdp_cplx out-of-place (GW code)"
 do it=1,ntests
   call time_fourdp_cplx(Ftest(it),Ftprof(it),.FALSE.)
 end do
 call print_FFT_profs(Ftprof,header)

!=======================
!==== fourwf timing ====
!=======================

 cplex = 2; option_fourwf=0
!cplex = 1; option_fourwf=1
!cplex = 2; option_fourwf=2
!cplex = 2; option_fourwf=3
 header = "fourwf with cplex=2 and option=0"
!call fftw3_set_in_place(.FALSE.)

 do it=1,ntests
   call time_fourwf(Ftest(it),cplex,option_fourwf,Ftprof(it))
 end do
 call print_FFT_profs(Ftprof,header)

!=========================
!==== rho_tw_g timing ====
!=========================

 call prompt("Enter ecut for GW oscillators in Hartree: ",osc_ecut)

 nullify(osc_gvec)
 call get_kg(gamma_point,1,osc_ecut,gmet,osc_npw,osc_gvec) 
!TODO should reorder by shells to be consistent with the GW part!
!Moreover I guess this ordering is more efficient when we have
!to map the box to the G-sphere!
!write(*,*)" got ",osc_npw," Gs for oscillators"

 header = "rho_tw_g with use_padfft=1"
 map2sphere=1; use_padfft=1

 do it=1,ntests
   if  (Ftest(it)%ngfft(7) /= 312) cycle
   call time_rhotwg(Ftest(it),map2sphere,use_padfft,osc_npw,osc_gvec,Ftprof(it))
   call print_FFT_profs(Ftprof(it:it),header)
 end do

 header = "rho_tw_g with use_padfft=0"
 map2sphere=1; use_padfft=0
 
 do it=1,ntests
   call time_rhotwg(Ftest(it),map2sphere,use_padfft,osc_npw,osc_gvec,Ftprof(it))
 end do
 call print_FFT_profs(Ftprof,header)

 deallocate(osc_gvec)

!===============================
!=== End of run, free memory ===
!===============================

 deallocate(fft_setups)

 call destroy_FFT_test(Ftest)
 deallocate(Ftest)

 call destroy_FFT_prof(Ftprof)
 deallocate(Ftprof)

 call destroy_mpi_enreg(mpi_enreg)
 call xmpi_end()

end program fftprof
!!***
