Logo Search packages:      
Sourcecode: openmx version File versions  Download package

Band_DFT_Dosout.c

/**********************************************************************
  Band_DFT_Dosout.c:

     Band_DFT_Dosout.c is a subroutine to calculate the density of
     states based on the band calculation.

  Log of Band_DFT_Dosout.c:

     12/May/2003  Released by H.Kino
     25/Dec/2003  a non-collinear part (added by T.Ozaki)

***********************************************************************/

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>
#include <time.h>
#include "openmx_common.h"


#define  measure_time   0


#ifdef nompi
#include "mimic_mpi.h"
#else
#include "mpi.h"
#endif


static double Band_DFT_Dosout_Col(
                           int knum_i, int knum_j, int knum_k,
                           int SpinP_switch,
                           double *****nh,
                           double *****ImNL,
                           double ****CntOLP);

static double Band_DFT_DosoutGauss_Col(
                           int knum_i, int knum_j, int knum_k,
                           int SpinP_switch,
                           double *****nh,
                           double *****ImNL,
                           double ****CntOLP);

static double Band_DFT_Dosout_NonCol(
                           int knum_i, int knum_j, int knum_k,
                           int SpinP_switch,
                           double *****nh,
                           double *****ImNL,
                           double ****CntOLP);

static double Band_DFT_DosoutGauss_NonCol(
                           int knum_i, int knum_j, int knum_k,
                           int SpinP_switch,
                           double *****nh,
                           double *****ImNL,
                           double ****CntOLP);




double Band_DFT_Dosout( int knum_i, int knum_j, int knum_k,
                        int SpinP_switch,
                        double *****nh,
                        double *****ImNL,
                        double ****CntOLP)
{
  static double time0;

  if ( (SpinP_switch==0 || SpinP_switch==1) && Dos_fileout){
    time0 = Band_DFT_Dosout_Col( knum_i, knum_j, knum_k, SpinP_switch, nh, ImNL, CntOLP);
  }

  else if ( (SpinP_switch==0 || SpinP_switch==1) && DosGauss_fileout){
    time0 = Band_DFT_DosoutGauss_Col( knum_i, knum_j, knum_k, SpinP_switch, nh, ImNL, CntOLP);
  }

  else if (SpinP_switch==3 && Dos_fileout){
    time0 = Band_DFT_Dosout_NonCol( knum_i, knum_j, knum_k, SpinP_switch, nh, ImNL, CntOLP);
  }

  else if (SpinP_switch==3 && DosGauss_fileout){
    time0 = Band_DFT_DosoutGauss_NonCol( knum_i, knum_j, knum_k, SpinP_switch, nh, ImNL, CntOLP);
  }

  return time0;
}








static double Band_DFT_DosoutGauss_Col(
                           int knum_i, int knum_j, int knum_k,
                           int SpinP_switch,
                           double *****nh,
                           double *****ImNL,
                           double ****CntOLP)
{
  int i,j,k,spin,l,i1,j1,n1;
  int n, wanA;
  int *MP;
  int MA_AN, GA_AN, tnoA,Anum, LB_AN;
  int GB_AN, wanB, tnoB, Bnum, RnB;
  int l1,l2,l3,ik;
  int kloop,kloop0,S_knum,E_knum,T_knum,num_kloop0; 
  int ie,iemin,iemax,n1min,iecenter,iewidth;
  int MaxL,e1,s1;
  int i_vec[10];

  double sum,sumi,u2,v2,uv,vu,tmp,pi2,xa,eg,x,de;
  double factor,EV_cut0;
  double kRn,si,co,Redum,Imdum,Redum2,Resum;
  double TStime,TEtime,time0;
  double OLP_eigen_cut=Threshold_OLP_Eigen;

  double *koS;
  double **ko; int N_ko, i_ko[10];
  dcomplex **H;  int N_H,  i_H[10];
  dcomplex **S;  int N_S,  i_S[10];
  double *M1,***EIGEN;
  dcomplex **C;  int N_C,  i_C[10];
  double *KGrids1, *KGrids2, *KGrids3;
  double ****CDM; int N_CDM, i_CDM[10];
  double **SD; int N_SD, i_SD[10];
  double *tmp_array0;
  dcomplex ***H2,**TmpM;
  double *T_KGrids1,*T_KGrids2,*T_KGrids3;
  int *Ti_KGrids1,*Tj_KGrids2,*Tk_KGrids3,*arpo;
  dcomplex Ctmp1,Ctmp2;

  float *fSD;
  double ****Dummy_ImNL;
  double snum_i, snum_j, snum_k; 
  double k1,k2,k3;
  double *r_energy;
  double time1,time2,time3;
  double time4,time5,time6;
  double time7,time8,time9,time10;
  double Stime1,Etime1;
  float ****Dos;
  float *DosVec;
  char file_ev[YOUSO10],file_eig[YOUSO10];
  FILE *fp_ev,*fp_eig;
  char buf1[fp_bsize];          /* setvbuf */
  char buf2[fp_bsize];          /* setvbuf */
  int numprocs,myid,ID,ID1,ID2,tag;

  MPI_Status stat;
  MPI_Request request;

  /* MPI */
  MPI_Comm_size(mpi_comm_level1,&numprocs);
  MPI_Comm_rank(mpi_comm_level1,&myid);
  MPI_Barrier(mpi_comm_level1);

  if (myid==Host_ID){
    printf("\n<Band_DFT_Dosout>\n"); fflush(stdout);
  }

  dtime(&TStime);

  time1 = 0.0;
  time2 = 0.0;
  time3 = 0.0;
  time4 = 0.0;
  time5 = 0.0;
  time6 = 0.0;
  time7 = 0.0;
  time8 = 0.0;
  time9 = 0.0;
  time10 = 0.0;

  /****************************************************
             calculation of the array size
  ****************************************************/

  n = 0;
  for (i=1; i<=atomnum; i++){
    wanA  = WhatSpecies[i];
    n  += Spe_Total_CNO[wanA];
  }

  /****************************************************
   Allocation

   int       MP[List_YOUSO[1]]
   double    ko[List_YOUSO[23]][n+1]
   double    koS[n+1];
   dcomplex  H[n+1][n+1]
   dcomplex  S[n+1][n+1]
   double    M1[n+1]
   dcomplex  C[n+1][n+1]
   double    KGrids1[knum_i]
   double    KGrids2[knum_j]
   double    KGrids3[knum_k]
   double    CDM[Matomnum+1][List_YOUSO[8]]
                [List_YOUSO[7]][List_YOUSO[7]]
   double    SD[List_YOUSO[1]][List_YOUSO[7]]
   float     fSD[List_YOUSO[7]]
  ****************************************************/

  DosVec = (float*)malloc(sizeof(float)*atomnum*List_YOUSO[7]);

  MP = (int*)malloc(sizeof(int)*List_YOUSO[1]);
  arpo = (int*)malloc(sizeof(int)*numprocs);

  N_ko=2; i_ko[0]=List_YOUSO[23]; i_ko[1]=n+1;
  ko=(double**)malloc_multidimarray("double",N_ko, i_ko); 

  koS = (double*)malloc(sizeof(double)*(n+1));

  N_H=2; i_H[0]=i_H[1]=n+1;
  H=(dcomplex**)malloc_multidimarray("dcomplex",N_H, i_H);

  N_S=2; i_S[0]=i_S[1]=n+1;
  S=(dcomplex**)malloc_multidimarray("dcomplex",N_S, i_S);

  M1 = (double*)malloc(sizeof(double)*(n+1));

  N_C=2; i_C[0]=i_C[1]=n+1;
  C=(dcomplex**)malloc_multidimarray("dcomplex",N_C, i_C);

  KGrids1 = (double*)malloc(sizeof(double)*knum_i);
  KGrids2 = (double*)malloc(sizeof(double)*knum_j);
  KGrids3 = (double*)malloc(sizeof(double)*knum_k);

  N_CDM=4; i_CDM[0]=Matomnum+1; i_CDM[1]=List_YOUSO[8];
  i_CDM[2]=List_YOUSO[7]; i_CDM[3]=List_YOUSO[7];
  CDM=(double****)malloc_multidimarray("double",N_CDM,i_CDM);

  N_SD=2;  i_SD[0]=List_YOUSO[1]; i_SD[1]=List_YOUSO[7];
  SD=(double**)malloc_multidimarray("double",N_SD,i_SD);

  fSD=(float*)malloc(sizeof(float)*List_YOUSO[7]);

  tmp_array0 = (double*)malloc(sizeof(double)*(n+1)*(n+1)*2);

  TmpM = (dcomplex**)malloc(sizeof(dcomplex*)*(n+1));
  for (j=0; j<n+1; j++){
    TmpM[j] = (dcomplex*)malloc(sizeof(dcomplex)*(n+1));
  }

  H2 = (dcomplex***)malloc(sizeof(dcomplex**)*List_YOUSO[23]);
  for (i=0; i<List_YOUSO[23]; i++){
    H2[i] = (dcomplex**)malloc(sizeof(dcomplex*)*(n+1));
    for (j=0; j<n+1; j++){
      H2[i][j] = (dcomplex*)malloc(sizeof(dcomplex)*(n+1));
    }
  }

  /* allocate Dos */

  Dos = (float****)malloc(sizeof(float***)*DosGauss_Num_Mesh);
  for (ik=0; ik<DosGauss_Num_Mesh; ik++) {
    Dos[ik] = (float***)malloc(sizeof(float**)*(SpinP_switch+1) );
    for (spin=0; spin<=SpinP_switch; spin++) {
      Dos[ik][spin] = (float**)malloc(sizeof(float*)*(atomnum+1) );
      for (GA_AN=0; GA_AN<=atomnum; GA_AN++) {
        Dos[ik][spin][GA_AN] = (float*)malloc(sizeof(float)*List_YOUSO[7] );
        for (i=0; i<List_YOUSO[7]; i++)  Dos[ik][spin][GA_AN][i] = 0.0;
      }
    }
  }

  /* allocate r_energy */

  r_energy = (double*)malloc(sizeof(double)*DosGauss_Num_Mesh);

  /* set up energies where DOS is calculated */

  Dos_Erange[0] += ChemP;
  Dos_Erange[1] += ChemP;

  de = (Dos_Erange[1]-Dos_Erange[0])/(double)DosGauss_Num_Mesh;
  for (i=0; i<DosGauss_Num_Mesh; i++) {
    r_energy[i] = Dos_Erange[0] + de*(double)i;
  }

  /* no spin-orbit coupling */
  if (SO_switch==0){
    Dummy_ImNL = (double****)malloc(sizeof(double***)*1);
    Dummy_ImNL[0] = (double***)malloc(sizeof(double**)*1);
    Dummy_ImNL[0][0] = (double**)malloc(sizeof(double*)*1);
    Dummy_ImNL[0][0][0] = (double*)malloc(sizeof(double)*1);
  }

  snum_i = knum_i;
  snum_j = knum_j;
  snum_k = knum_k;

  /* set up  k-grids */
  for (i=0; i<=(knum_i-1); i++){
    if (knum_i==1){
      k1 = 0.0;
    }
    else {
      k1 = -0.5 + (2.0*(double)i+1.0)/(2.0*snum_i) + Shift_K_Point;
    }
    KGrids1[i]=k1;
  }
  for (i=0; i<=(knum_j-1); i++){
    if (knum_j==1){
      k1 = 0.0;
    }
    else {
      k1 = -0.5 + (2.0*(double)i+1.0)/(2.0*snum_j) - Shift_K_Point;
    }
    KGrids2[i]=k1;
  }
  for (i=0; i<=(knum_k-1); i++){
    if (knum_k==1){
      k1 = 0.0;
    }
    else {
      k1 = -0.5 + (2.0*(double)i+1.0)/(2.0*snum_k) + 2.0*Shift_K_Point;
    }
    KGrids3[i]=k1;
  }

  if (myid==Host_ID){
    printf(" KGrids1: ");
    for (i=0;i<=knum_i-1;i++) printf("%f ",KGrids1[i]);
    printf("\n");
    printf(" KGrids2: ");
    for (i=0;i<=knum_j-1;i++) printf("%f ",KGrids2[i]);
    printf("\n");
    printf(" KGrids3: ");
    for (i=0;i<=knum_k-1;i++) printf("%f ",KGrids3[i]);
    printf("\n");
  }

  /***********************************
       one-dimentionalize for MPI
  ************************************/

  T_knum = 0;
  for (i=0; i<=(knum_i-1); i++){
    for (j=0; j<=(knum_j-1); j++){
      for (k=0; k<=(knum_k-1); k++){
        T_knum++;
      }
    }
  }

  T_KGrids1 = (double*)malloc(sizeof(double)*T_knum);
  T_KGrids2 = (double*)malloc(sizeof(double)*T_knum);
  T_KGrids3 = (double*)malloc(sizeof(double)*T_knum);

  Ti_KGrids1 = (int*)malloc(sizeof(int)*T_knum);
  Tj_KGrids2 = (int*)malloc(sizeof(int)*T_knum);
  Tk_KGrids3 = (int*)malloc(sizeof(int)*T_knum);

  EIGEN  = (double***)malloc(sizeof(double**)*List_YOUSO[23]);
  for (i=0; i<List_YOUSO[23]; i++){
    EIGEN[i] = (double**)malloc(sizeof(double*)*T_knum);
    for (j=0; j<T_knum; j++){
      EIGEN[i][j] = (double*)malloc(sizeof(double)*(n+1));
    }
  }

  /* set T_KGrid1,2,3 */

  T_knum = 0;
  for (i=0; i<=(knum_i-1); i++){
    for (j=0; j<=(knum_j-1); j++){
      for (k=0; k<=(knum_k-1); k++){

      T_KGrids1[T_knum] = KGrids1[i];
      T_KGrids2[T_knum] = KGrids2[j];
      T_KGrids3[T_knum] = KGrids3[k];

      Ti_KGrids1[T_knum] = i;
      Tj_KGrids2[T_knum] = j;
      Tk_KGrids3[T_knum] = k;

        T_knum++;
      }
    }
  }

  /* allocate k-points into proccessors */

  if (T_knum<=myid){
    S_knum = -10;
    E_knum = -100;
    num_kloop0 = 1;
  }
  else if (T_knum<numprocs) {
    S_knum = myid;
    E_knum = myid;
    num_kloop0 = 1;
  }
  else {
    tmp = (double)T_knum/(double)numprocs; 
    num_kloop0 = (int)tmp + 1;

    S_knum = (int)((double)myid*(tmp+0.0001)); 
    E_knum = (int)((double)(myid+1)*(tmp+0.0001)) - 1;
    if (myid==(numprocs-1)) E_knum = T_knum - 1;
    if (E_knum<0)           E_knum = 0;
  }

  factor = 1.0/(double)(knum_i * knum_j * knum_k); /* normalization factor */
  pi2 = sqrt(PI);

  /* for kloop */

  for (kloop0=0; kloop0<num_kloop0; kloop0++){

    kloop = kloop0 + S_knum;
    arpo[myid] = -1;
    if (S_knum<=kloop && kloop<=E_knum) arpo[myid] = kloop;
    for (ID=0; ID<numprocs; ID++){
      MPI_Bcast(&arpo[ID], 1, MPI_INT, ID, mpi_comm_level1);
    }

    /* set S */

    for (ID=0; ID<numprocs; ID++){

      kloop = arpo[ID];

      if (measure_time==1) dtime(&Stime1);
      
      if (0<=kloop){
      
        k1 = T_KGrids1[kloop];
        k2 = T_KGrids2[kloop];
        k3 = T_KGrids3[kloop];
      
        Overlap_Band(ID,CntOLP,TmpM,MP,k1,k2,k3);
        n = TmpM[0][0].r;
      
        if (myid==ID){
        for (i1=1; i1<=n; i1++){
          for (j1=1; j1<=n; j1++){
            S[i1][j1].r = TmpM[i1][j1].r;
            S[i1][j1].i = TmpM[i1][j1].i;
          } 
        } 
        } 
       
      }

      if (measure_time==1){
        dtime(&Etime1);
        time1 += Etime1 - Stime1;
      }
    }

    kloop = arpo[myid];

    if (0<=kloop){

      if (measure_time==1) dtime(&Stime1);

      EigenBand_lapack(S,ko[0],n);

      if (measure_time==1){
        dtime(&Etime1);
        time2 += Etime1 - Stime1;
      }

      if (3<=level_stdout && 0<=kloop){
      printf("  kloop %2d  k1 k2 k3 %10.6f %10.6f %10.6f\n",
             kloop,T_KGrids1[kloop],T_KGrids2[kloop],T_KGrids3[kloop]);
      for (i1=1; i1<=n; i1++){
        printf("  Eigenvalues of OLP  %2d  %15.12f\n",i1,ko[0][i1]);
      }
      }

      /* minus eigenvalues to 1.0e-14 */

      for (l=1; l<=n; l++){
        if (ko[0][l]<0.0) ko[0][l] = 1.0e-14;
        koS[l] = ko[0][l];
      }

      /* calculate S*1/sqrt(ko) */

      for (l=1; l<=n; l++) M1[l] = 1.0/sqrt(ko[0][l]);

      /* S * M1  */

      for (i1=1; i1<=n; i1++){
      for (j1=1; j1<=n; j1++){
        S[i1][j1].r = S[i1][j1].r*M1[j1];
        S[i1][j1].i = S[i1][j1].i*M1[j1];
      } 
      } 

    } /* if (0<=kloop) */

    /* set H */

    for (spin=0; spin<=SpinP_switch; spin++){

      for (ID=0; ID<numprocs; ID++){

        kloop = arpo[ID];

        if (0<=kloop){
          k1 = T_KGrids1[kloop];
          k2 = T_KGrids2[kloop];
          k3 = T_KGrids3[kloop];

          Hamiltonian_Band(ID, nh[spin], TmpM, MP, k1, k2, k3);

          if (myid==ID){
          for (i1=1; i1<=n; i1++){
            for (j1=1; j1<=n; j1++){
              H[i1][j1].r = TmpM[i1][j1].r;
              H[i1][j1].i = TmpM[i1][j1].i;
            } 
          } 
          } 
      }
      }

      kloop = arpo[myid];

      if (0<=kloop){

        /****************************************************
                       M1 * U^t * H * U * M1
        ****************************************************/

        if (measure_time==1) dtime(&Stime1);

        /* transpose S */

      for (i1=1; i1<=n; i1++){
        for (j1=i1+1; j1<=n; j1++){
          Ctmp1 = S[i1][j1];
          Ctmp2 = S[j1][i1];
          S[i1][j1] = Ctmp2;
          S[j1][i1] = Ctmp1;
        }
      }

        /* H * U * M1 */
 
      for (i1=1; i1<=n; i1++){
        for (j1=1; j1<=n; j1++){

          sum  = 0.0;
          sumi = 0.0;

          for (l=1; l<=n; l++){
            sum  += H[i1][l].r*S[j1][l].r - H[i1][l].i*S[j1][l].i;
            sumi += H[i1][l].r*S[j1][l].i + H[i1][l].i*S[j1][l].r;
          }

          C[j1][i1].r = sum;
          C[j1][i1].i = sumi;
        }
      }     

        /* M1 * U^+ H * U * M1 */

      for (i1=1; i1<=n; i1++){
        for (j1=1; j1<=n; j1++){

          sum  = 0.0;
            sumi = 0.0;

          for (l=1; l<=n; l++){
            sum  +=  S[i1][l].r*C[j1][l].r + S[i1][l].i*C[j1][l].i;
            sumi +=  S[i1][l].r*C[j1][l].i - S[i1][l].i*C[j1][l].r;
          }
          H[i1][j1].r = sum;
          H[i1][j1].i = sumi;
        }
      }     

        /* H to C */

      for (i1=1; i1<=n; i1++){
        for (j1=1; j1<=n; j1++){
          C[i1][j1] = H[i1][j1];
        }
      }

       /* penalty for ill-conditioning states */

      EV_cut0 = Threshold_OLP_Eigen;

      for (i1=1; i1<=n; i1++){

        if (koS[i1]<EV_cut0){
          C[i1][i1].r += pow((koS[i1]/EV_cut0),-2.0) - 1.0;
        }
 
        /* cutoff the interaction between the ill-conditioned state */
 
        if (1.0e+3<C[i1][i1].r){
          for (j1=1; j1<=n; j1++){
            C[i1][j1] = Complex(0.0,0.0);
            C[j1][i1] = Complex(0.0,0.0);
          }
          C[i1][i1].r = 1.0e+4;
        }
      }

        if (measure_time==1){
          dtime(&Etime1);
          time3 += Etime1 - Stime1;
        }

        /* solve eigenvalue problem */

        if (measure_time==1) dtime(&Stime1);

      n1 = n;
        EigenBand_lapack(C,ko[spin],n1);

      for (i1=1; i1<=n; i1++) EIGEN[spin][kloop][i1] = ko[spin][i1];

        if (measure_time==1){
          dtime(&Etime1);
          time4 += Etime1 - Stime1;
        }

        /****************************************************
           transformation to the original eigen vectors.
           NOTE JRCAT-244p and JAIST-2122p 
           C = U * lambda^{-1/2} * D
        ****************************************************/

        if (measure_time==1) dtime(&Stime1);

      /* transpose */

      for (i1=1; i1<=n; i1++){
        for (j1=i1+1; j1<=n; j1++){
          Ctmp1 = S[i1][j1];
          Ctmp2 = S[j1][i1];
          S[i1][j1] = Ctmp2;
          S[j1][i1] = Ctmp1;
        }
      }

      /* transpose */

      for (i1=1; i1<=n; i1++){
        for (j1=i1+1; j1<=n; j1++){
          Ctmp1 = C[i1][j1];
          Ctmp2 = C[j1][i1];
          C[i1][j1] = Ctmp2;
          C[j1][i1] = Ctmp1;
        }
      }

        /* shift */

      for (j1=1; j1<=n; j1++){
        for (l=n; 1<=l; l--){
          C[j1][l] = C[j1][l];
        }
      }

      for (i1=1; i1<=n; i1++){
        for (j1=1; j1<=n; j1++){
          H2[spin][i1][j1].r = 0.0;
          H2[spin][i1][j1].i = 0.0;
        }
      }

      for (i1=1; i1<=n; i1++){
        for (j1=1; j1<=n1; j1++){

          sum  = 0.0;
          sumi = 0.0;

          for (l=1; l<=n; l++){
            sum  +=  S[i1][l].r*C[j1][l].r - S[i1][l].i*C[j1][l].i;
            sumi +=  S[i1][l].r*C[j1][l].i + S[i1][l].i*C[j1][l].r;
          }

          H2[spin][i1][j1].r = sum;
          H2[spin][i1][j1].i = sumi;

        }
      }

        if (measure_time==1){
          dtime(&Etime1);
          time5 += Etime1 - Stime1;
        }

      } /* if (0<=kloop) */
    } /* spin */

    /****************************************************
        store LDOS
    ****************************************************/

    for (ID=0; ID<numprocs; ID++){

      kloop = arpo[ID];

      if (0<=kloop){

      for (spin=0; spin<=SpinP_switch; spin++){

          if (measure_time==1) dtime(&Stime1);

          /* MPI: EIGEN */ 

          MPI_Bcast(&EIGEN[spin][kloop][0], n+1, MPI_DOUBLE, ID, mpi_comm_level1);

          /* MPI: H2 */

        k = 0;

        for (i1=0; i1<=n; i1++){
          for (j1=0; j1<=n; j1++){
            tmp_array0[k] = H2[spin][i1][j1].r;
            k++;
          }
        }

        for (i1=0; i1<=n; i1++){
          for (j1=0; j1<=n; j1++){
            tmp_array0[k] = H2[spin][i1][j1].i;
            k++;
          }
        }

          MPI_Barrier(mpi_comm_level1);
          MPI_Bcast(&tmp_array0[0], 2*(n+1)*(n+1), MPI_DOUBLE, ID, mpi_comm_level1);

        k = 0;

        for (i1=0; i1<=n; i1++){
          for (j1=0; j1<=n; j1++){
            H[i1][j1].r = tmp_array0[k];
            k++;
          }
        }
         
        for (i1=0; i1<=n; i1++){
          for (j1=0; j1<=n; j1++){
            H[i1][j1].i = tmp_array0[k];
            k++;
          }
        }

        /* transpose */

        for (i1=1; i1<=n; i1++){
          for (j1=i1+1; j1<=n; j1++){
            Ctmp1 = H[i1][j1];
            Ctmp2 = H[j1][i1];
            H[i1][j1] = Ctmp2;
            H[j1][i1] = Ctmp1;
          }
        }

          /****************************************************
                               store LDOS
          ****************************************************/

          k1 = T_KGrids1[kloop];
          k2 = T_KGrids2[kloop];
          k3 = T_KGrids3[kloop];

          i = Ti_KGrids1[kloop];
          j = Tj_KGrids2[kloop];
          k = Tk_KGrids3[kloop];

          if (measure_time==1){
            dtime(&Etime1);
            time6 += Etime1 - Stime1;
          }

          for (l=1; l<=n; l++){

            if (measure_time==1) dtime(&Stime1);

            /* initialize */
            for (MA_AN=1; MA_AN<=Matomnum; MA_AN++){
              GA_AN = M2G[MA_AN];
              wanA = WhatSpecies[GA_AN];
              tnoA = Spe_Total_CNO[wanA];
              for (LB_AN=0; LB_AN<=FNAN[GA_AN]; LB_AN++){
                GB_AN = natn[GA_AN][LB_AN];
                wanB = WhatSpecies[GB_AN];
                tnoB = Spe_Total_CNO[wanB];
                for (i1=0; i1<tnoA; i1++){
                  for (j1=0; j1<tnoB; j1++){
                    CDM[MA_AN][LB_AN][i1][j1] =0.0;
                  }
                }
              }
            }

            /* theta_ij = C_ni C_nj */

            for (MA_AN=1; MA_AN<=Matomnum; MA_AN++){

              GA_AN = M2G[MA_AN];
              wanA = WhatSpecies[GA_AN];
              tnoA = Spe_Total_CNO[wanA];
              Anum = MP[GA_AN];

              for (LB_AN=0; LB_AN<=FNAN[GA_AN]; LB_AN++){

                GB_AN = natn[GA_AN][LB_AN];
                RnB = ncn[GA_AN][LB_AN];
                wanB = WhatSpecies[GB_AN];
                tnoB = Spe_Total_CNO[wanB];
                Bnum = MP[GB_AN];
                l1 = atv_ijk[RnB][1];
                l2 = atv_ijk[RnB][2];
                l3 = atv_ijk[RnB][3];
                kRn = k1*l1 + k2*l2 + k3*l3;

                si = sin(2.0*PI*kRn);     /* imaginary part */
                co = cos(2.0*PI*kRn);     /* real part      */

                for (i1=0; i1<tnoA; i1++){
                  for (j1=0; j1<tnoB; j1++){

                    u2 = H[l][Anum+i1].r*H[l][Bnum+j1].r;
                    v2 = H[l][Anum+i1].i*H[l][Bnum+j1].i;
                    uv = H[l][Anum+i1].r*H[l][Bnum+j1].i;
                    vu = H[l][Anum+i1].i*H[l][Bnum+j1].r;

                    Redum = (u2 + v2);
                    Imdum = (uv - vu);
                    Redum2 = co*Redum - si*Imdum;
                    Resum  = Redum2;
                    CDM[MA_AN][LB_AN][i1][j1] += Resum;

              } /* j1 */
            }   /* i1 */

            }     /* LB_AN */
          }       /* MA_AN */

            /*******************************************
                 M_i = S_ij D_ji
                 D_ji = C_nj C_ni

                 S_ij : CntOLP
                 D_ji : CDM
            ******************************************/

            for (GA_AN=1; GA_AN<=atomnum; GA_AN++){
              wanA = WhatSpecies[GA_AN];
              tnoA = Spe_Total_CNO[wanA];
              for (i1=0; i1<tnoA; i1++){
                SD[GA_AN][i1] = 0.0;
              }
            }

            for (MA_AN=1; MA_AN<=Matomnum; MA_AN++){
              GA_AN = M2G[MA_AN];
              wanA = WhatSpecies[GA_AN];
              tnoA = Spe_Total_CNO[wanA];
              for (i1=0; i1<tnoA; i1++){
                for (LB_AN=0; LB_AN<=FNAN[GA_AN]; LB_AN++){
                  GB_AN = natn[GA_AN][LB_AN];
                  wanB = WhatSpecies[GB_AN];
                  tnoB = Spe_Total_CNO[wanB];
                  for (j1=0; j1<tnoB; j1++){
                     SD[GA_AN][i1] += CDM[MA_AN][LB_AN][i1][j1]*
                                   CntOLP[MA_AN][LB_AN][i1][j1];
                  }
                }
              }
            }

            if (measure_time==1){
              dtime(&Etime1);
              time7 += Etime1 - Stime1;
            }

            if (measure_time==1) dtime(&Stime1);

            /*  calculate contribution to Gaussian DOS */

          for (GA_AN=1; GA_AN<=atomnum; GA_AN++){

            wanA = WhatSpecies[GA_AN];
            tnoA = Spe_Total_CNO[wanA];
            ID2 = G2ID[GA_AN];

            if (ID2==myid){

            for (i1=0; i1<tnoA; i1++){
              fSD[i1]=(float)SD[GA_AN][i1];
            }

            /*************************
                  exp( -(x/a)^2 ) 
                  a= DosGauss_Width
                x=-3a : 3a is counted 
            **************************/

                eg = EIGEN[spin][kloop][l];
                x = (eg-Dos_Erange[0])/(Dos_Erange[1]-Dos_Erange[0])*(DosGauss_Num_Mesh-1);
              iecenter = (int)x ; 

                iewidth = DosGauss_Width*3.0/(Dos_Erange[1]-Dos_Erange[0])*(DosGauss_Num_Mesh-1)+3;

              iemin = iecenter - iewidth;
              iemax = iecenter + iewidth;

                if (iemin<0) iemin=0;
                if (iemax>=DosGauss_Num_Mesh) iemax=DosGauss_Num_Mesh-1;

                if ( 0<=iemin && iemin<DosGauss_Num_Mesh && 0<=iemax && iemax<DosGauss_Num_Mesh ) {

                  for (ie=iemin;ie<=iemax;ie++) {
                    xa = (eg-r_energy[ie])/DosGauss_Width;

                  for (i1=0; i1<tnoA; i1++){
                  Dos[ie][spin][GA_AN][i1] += factor*fSD[i1]* exp(-xa*xa)/(DosGauss_Width*pi2);
                }
              }
            }

            }
          }   /* GA_AN */

            if (measure_time==1){
              dtime(&Etime1);
              time8 += Etime1 - Stime1;
            }

        }      /* l             */
      }        /* spin          */
      }          /* if (0<=kloop) */
    }            /* ID            */
  }              /* kloop0        */

  /****************************************************************
                       Dos by MPI
  ****************************************************************/

  if (measure_time==1) dtime(&Stime1);

  {
 
    float *DosTmp;

    DosTmp = (float*)malloc(sizeof(float)*DosGauss_Num_Mesh*(SpinP_switch+1)*List_YOUSO[7] );

    for (GA_AN=1; GA_AN<=atomnum; GA_AN++){
      ID2 = G2ID[GA_AN];

      if (ID2==myid){

        k = 0;

        for (ik=0; ik<DosGauss_Num_Mesh; ik++) {
        for (spin=0; spin<=SpinP_switch; spin++){
          for (i=0; i<List_YOUSO[7]; i++){
            DosTmp[k] = Dos[ik][spin][GA_AN][i];
              k++;
          }
        }
        }

      if (ID2!=Host_ID){
        tag = 999;
        MPI_Isend(&DosTmp[0], DosGauss_Num_Mesh*(SpinP_switch+1)*List_YOUSO[7],
                    MPI_FLOAT,Host_ID,tag,mpi_comm_level1,&request);
        MPI_Wait(&request,&stat);
      }
      }

      if (myid==Host_ID && ID2!=Host_ID){
      tag = 999;
      MPI_Recv(&DosTmp[0], DosGauss_Num_Mesh*(SpinP_switch+1)*List_YOUSO[7],
                 MPI_FLOAT, ID2, tag, mpi_comm_level1, &stat);

        k = 0;

        for (ik=0; ik<DosGauss_Num_Mesh; ik++) {
        for (spin=0; spin<=SpinP_switch; spin++){
          for (i=0; i<List_YOUSO[7]; i++){
            Dos[ik][spin][GA_AN][i] = DosTmp[k];
              k++;
          }
        }
        }
      }

      MPI_Barrier(mpi_comm_level1);

    }

    free(DosTmp);
  }

  if (measure_time==1){
    dtime(&Etime1);
    time9 += Etime1 - Stime1;
  }

  /****************************************************************
                    write eigenvalues and eigenvectors
  ****************************************************************/

  if (measure_time==1) dtime(&Stime1);

  if (myid==Host_ID){

    sprintf(file_eig,"%s%s.Dos.val",filepath,filename);

    if ( (fp_eig=fopen(file_eig,"w"))==NULL ) {
      printf("can not open a file %s\n",file_eig);
    }
  
    else{

      /* write *.Dos.val */

      printf("write eigenvalues\n");

      fprintf(fp_eig,"mode        7\n");
      fprintf(fp_eig,"NonCol      0\n");
      fprintf(fp_eig,"N           %d\n",n);
      fprintf(fp_eig,"Nspin       %d\n",SpinP_switch);
      fprintf(fp_eig,"Erange      %lf %lf\n",Dos_Erange[0],Dos_Erange[1]);
      fprintf(fp_eig,"Kgrid       %d %d %d\n",1,1,1);
      fprintf(fp_eig,"atomnum     %d\n",atomnum);
      fprintf(fp_eig,"<WhatSpecies\n");
      for (i=1; i<=atomnum; i++) {
        fprintf(fp_eig,"%d ",WhatSpecies[i]);
      }
      fprintf(fp_eig,"\nWhatSpecies>\n");
      fprintf(fp_eig,"SpeciesNum     %d\n",SpeciesNum);
      fprintf(fp_eig,"<Spe_Total_CNO\n");
      for (i=0;i<SpeciesNum;i++) {
        fprintf(fp_eig,"%d ",Spe_Total_CNO[i]);
      }
      fprintf(fp_eig,"\nSpe_Total_CNO>\n");

      MaxL = Supported_MaxL; 

      fprintf(fp_eig,"MaxL           %d\n",Supported_MaxL);
      fprintf(fp_eig,"<Spe_Num_CBasis\n");
      for (i=0;i<SpeciesNum;i++) {
        for (l=0;l<=MaxL;l++) {
        fprintf(fp_eig,"%d ",Spe_Num_CBasis[i][l]);
        }
        fprintf(fp_eig,"\n");
      }
      fprintf(fp_eig,"Spe_Num_CBasis>\n");
      fprintf(fp_eig,"ChemP       %lf\n",ChemP);

      fprintf(fp_eig,"irange      %d %d\n",0,DosGauss_Num_Mesh-1);
      fprintf(fp_eig,"<Eigenvalues\n");
      for (spin=0; spin<=SpinP_switch; spin++) {
        fprintf(fp_eig,"%d %d %d ",0,0,0);

        for (ik=0; ik<DosGauss_Num_Mesh; ik++) {
          fprintf(fp_eig,"%lf ",r_energy[ik]);
      }
        fprintf(fp_eig,"\n");
      }  
      fprintf(fp_eig,"Eigenvalues>\n");

      fclose(fp_eig);
    }

    /* write *.Dos.vec */

    printf("write eigenvectors\n");

    sprintf(file_ev,"%s%s.Dos.vec",filepath,filename);

    if ( (fp_ev=fopen(file_ev,"w"))==NULL ) {
      printf("can not open a file %s\n",file_ev);
    }
    else {

      for (spin=0; spin<=SpinP_switch; spin++) {
        for (ik=0; ik<DosGauss_Num_Mesh; ik++) {

          i_vec[0] = 0; 
          i_vec[1] = 0;
          i_vec[2] = 0;

          if (myid==Host_ID) fwrite(i_vec,sizeof(int),3,fp_ev);

          k = 0;
          for (GA_AN=1; GA_AN<=atomnum; GA_AN++) {
          wanA = WhatSpecies[GA_AN];
          tnoA = Spe_Total_CNO[wanA];
          for (i=0; i<tnoA; i++){
              DosVec[k] = Dos[ik][spin][GA_AN][i];
              k++;
          }
        }

          fwrite(DosVec,sizeof(float),k,fp_ev);

      }
      }

      fclose(fp_ev);
    }

  } /* if (myid==Host_ID) */

  if (measure_time==1){
    dtime(&Etime1);
    time10 += Etime1 - Stime1;
  }

  if (measure_time==1){
    printf("myid=%4d  time1=%15.12f\n",myid,time1);
    printf("myid=%4d  time2=%15.12f\n",myid,time2);
    printf("myid=%4d  time3=%15.12f\n",myid,time3);
    printf("myid=%4d  time4=%15.12f\n",myid,time4);
    printf("myid=%4d  time5=%15.12f\n",myid,time5);
    printf("myid=%4d  time6=%15.12f\n",myid,time6);
    printf("myid=%4d  time7=%15.12f\n",myid,time7);
    printf("myid=%4d  time8=%15.12f\n",myid,time8);
    printf("myid=%4d  time9=%15.12f\n",myid,time9);
    printf("myid=%4d time10=%15.12f\n",myid,time10);
  }

  /****************************************************
                       free arrays
  ****************************************************/

  free(DosVec);

  free(MP);
  free(arpo);

  for (i=0; i<i_ko[0]; i++){
    free(ko[i]);
  }
  free(ko);

  free(koS);

  for (i=0; i<i_H[0]; i++){
    free(H[i]);
  }
  free(H);

  for (i=0; i<i_S[0]; i++){
    free(S[i]);
  }
  free(S);

  free(M1);

  for (i=0; i<i_C[0]; i++){
    free(C[i]);
  }
  free(C);

  free(KGrids1); free(KGrids2);free(KGrids3);
  
  for (i=0; i<i_CDM[0]; i++){
    for (j=0; j<i_CDM[1]; j++){
      for (k=0; k<i_CDM[2]; k++){
        free(CDM[i][j][k]);
      }
      free(CDM[i][j]);
    }
    free(CDM[i]);
  }
  free(CDM);

  for (i=0; i<i_SD[0]; i++){
    free(SD[i]);
  }
  free(SD);

  free(fSD);

  free(tmp_array0);

  for (j=0; j<n+1; j++){
    free(TmpM[j]);
  }
  free(TmpM);

  for (i=0; i<List_YOUSO[23]; i++){
    for (j=0; j<n+1; j++){
      free(H2[i][j]);
    }
    free(H2[i]);
  }
  free(H2);

  /* no spin-orbit coupling */
  if (SO_switch==0){
    free(Dummy_ImNL[0][0][0]);
    free(Dummy_ImNL[0][0]);
    free(Dummy_ImNL[0]);
    free(Dummy_ImNL);
  }

  free(T_KGrids1);
  free(T_KGrids2);
  free(T_KGrids3);

  free(Ti_KGrids1);
  free(Tj_KGrids2);
  free(Tk_KGrids3);

  for (i=0; i<List_YOUSO[23]; i++){
    for (j=0; j<T_knum; j++){
      free(EIGEN[i][j]);
    }
    free(EIGEN[i]);
  }
  free(EIGEN);

  for (ik=0; ik<DosGauss_Num_Mesh; ik++) {
    for (spin=0; spin<=SpinP_switch; spin++) {
      for (GA_AN=0; GA_AN<=atomnum; GA_AN++) {
        free(Dos[ik][spin][GA_AN]);
      }
      free(Dos[ik][spin]);
    }
    free(Dos[ik]);
  }
  free(Dos);

  free(r_energy);

  /* for elapsed time */

  dtime(&TEtime);
  time0 = TEtime - TStime;
  return time0;

}







static double Band_DFT_DosoutGauss_NonCol(
                           int knum_i, int knum_j, int knum_k,
                           int SpinP_switch,
                           double *****nh,
                           double *****ImNL,
                           double ****CntOLP)
{
  int i,j,k,spin,l,i1,j1,ik,n1;
  int n, wanA,ii1,jj1,m,n2;
  int *MP;
  int MA_AN, GA_AN, tnoA,Anum, LB_AN;
  int GB_AN, wanB, tnoB, Bnum, RnB;
  int l1,l2,l3;
 
  int ie,iemin,iemax,iemin0,iemax0,n1min,mn;
  int MaxL,e1,s1,T_knum,S_knum,E_knum;
  int kloop,kloop0,num_kloop0;
  int i_vec[10];

  double EV_cut0;
  double sum,sumi,u2,v2,uv,vu,tmp;
  double kRn,si,co,Redum,Imdum;
  double Redum2,Resum,Imdum2;
  double TStime,TEtime,time0;
  double OLP_eigen_cut=Threshold_OLP_Eigen;
  double theta,phi,sit,cot,sip,cop,de;
  double tmp1,tmp2,tmp3,eg,x,xa,pi2,factor;
  double *ko; int N_ko, i_ko[10];
  double *koS;
  double *r_energy;
  dcomplex **H;  int N_H,  i_H[10];
  dcomplex **S;  int N_S,  i_S[10];
  dcomplex Ctmp1,Ctmp2;
  double *M1,**EIGEN;
  dcomplex **C;  int N_C,  i_C[10];
  double *KGrids1, *KGrids2, *KGrids3;
  double *****CDM; int N_CDM, i_CDM[10];
  double ***SD; int N_SD, i_SD[10];
  double *tmp_array0;
  dcomplex **H2,**TmpM;
  double *T_KGrids1,*T_KGrids2,*T_KGrids3;
  int *Ti_KGrids1,*Tj_KGrids2,*Tk_KGrids3,*arpo;
  int iecenter,iewidth;

  float *fSDup,*fSDdn;
  double *****Dummy_ImNL;
  float ****Dos;
  float *DosVec;
  double snum_i, snum_j, snum_k; 
  double k1,k2,k3;

  char file_ev[YOUSO10],file_eig[YOUSO10];
  FILE *fp_ev,*fp_eig;
  char buf1[fp_bsize];          /* setvbuf */
  char buf2[fp_bsize];          /* setvbuf */
  int numprocs,myid,ID,ID1,ID2,tag;

  MPI_Status stat;
  MPI_Request request;

  /* MPI */
  MPI_Comm_size(mpi_comm_level1,&numprocs);
  MPI_Comm_rank(mpi_comm_level1,&myid);
  MPI_Barrier(mpi_comm_level1);

  if (myid==Host_ID){
    printf("\n<Band_DFT_Dosout>\n"); fflush(stdout);
  }

  dtime(&TStime);

  /****************************************************
             calculation of the array size
  ****************************************************/

  n = 0;
  for (i=1; i<=atomnum; i++){
    wanA  = WhatSpecies[i];
    n  = n + Spe_Total_CNO[wanA];
  }
  n2 = 2*n + 2;

  /****************************************************
   Allocation

   int       MP[List_YOUSO[1]]
   double    ko[n2]
   double    koS[n+1];
   dcomplex  H[n2][n2]
   dcomplex  S[n2][n2]
   double    M1[n2]
   dcomplex  C[n2][n2]
   double    KGrids1[knum_i]
   double    KGrids2[knum_j]
   double    KGrids3[knum_k]
   double    CDM[4][Matomnum+1][List_YOUSO[8]]
                [List_YOUSO[7]][List_YOUSO[7]]
   double    SD[4][List_YOUSO[1]][List_YOUSO[7]]
   float     fSDup[List_YOUSO[7]]
   float     fSDdn[List_YOUSO[7]]
   double    Htmp[n2][n2]
  ****************************************************/

  DosVec = (float*)malloc(sizeof(float)*2*atomnum*List_YOUSO[7]);

  MP = (int*)malloc(sizeof(int)*List_YOUSO[1]);

  arpo = (int*)malloc(sizeof(int)*numprocs);

  ko = (double*)malloc(sizeof(double)*n2);
  koS = (double*)malloc(sizeof(double)*(n+1));

  H = (dcomplex**)malloc(sizeof(dcomplex*)*n2);
  for (j=0; j<n2; j++){
    H[j] = (dcomplex*)malloc(sizeof(dcomplex)*n2);
  }

  S = (dcomplex**)malloc(sizeof(dcomplex*)*n2);
  for (i=0; i<n2; i++){
    S[i] = (dcomplex*)malloc(sizeof(dcomplex)*n2);
  }

  M1 = (double*)malloc(sizeof(double)*n2);

  tmp_array0 = (double*)malloc(sizeof(double)*2*n2*n2);

  C = (dcomplex**)malloc(sizeof(dcomplex*)*n2);
  for (j=0; j<n2; j++){
    C[j] = (dcomplex*)malloc(sizeof(dcomplex)*n2);
  }

  KGrids1 = (double*)malloc(sizeof(double)*knum_i);
  KGrids2 = (double*)malloc(sizeof(double)*knum_j);
  KGrids3 = (double*)malloc(sizeof(double)*knum_k);

  N_CDM=5; i_CDM[0]=4; i_CDM[1]=Matomnum+1; i_CDM[2]=List_YOUSO[8];
  i_CDM[3]=List_YOUSO[7]; i_CDM[4]=List_YOUSO[7];
  CDM=(double*****)malloc_multidimarray("double",N_CDM,i_CDM);

  N_SD=3;  i_SD[0]=4; i_SD[1]=List_YOUSO[1]; i_SD[2]=List_YOUSO[7];
  SD=(double***)malloc_multidimarray("double",N_SD,i_SD);

  fSDup = (float*)malloc(sizeof(float)*List_YOUSO[7]);
  fSDdn = (float*)malloc(sizeof(float)*List_YOUSO[7]);

  TmpM = (dcomplex**)malloc(sizeof(dcomplex*)*n2);
  for (j=0; j<n2; j++){
    TmpM[j] = (dcomplex*)malloc(sizeof(dcomplex)*n2);
  }

  H2 = (dcomplex**)malloc(sizeof(dcomplex*)*n2);
  for (j=0; j<n2; j++){
    H2[j] = (dcomplex*)malloc(sizeof(dcomplex)*n2);
  }

  /* allocate Dos */

  Dos = (float****)malloc(sizeof(float***)*DosGauss_Num_Mesh);
  for (ik=0; ik<DosGauss_Num_Mesh; ik++) {
    Dos[ik] = (float***)malloc(sizeof(float**)*2 );
    for (spin=0; spin<=1; spin++) {
      Dos[ik][spin] = (float**)malloc(sizeof(float*)*(atomnum+1) );
      for (GA_AN=0; GA_AN<=atomnum; GA_AN++) {
        Dos[ik][spin][GA_AN] = (float*)malloc(sizeof(float)*List_YOUSO[7] );
        for (i=0; i<List_YOUSO[7]; i++)  Dos[ik][spin][GA_AN][i] = 0.0;
      }
    }
  }

  /* allocate r_energy */

  r_energy = (double*)malloc(sizeof(double)*DosGauss_Num_Mesh);

  /* set up energies where DOS is calculated */

  Dos_Erange[0] += ChemP;
  Dos_Erange[1] += ChemP;

  de = (Dos_Erange[1]-Dos_Erange[0])/(double)DosGauss_Num_Mesh;
  for (i=0; i<DosGauss_Num_Mesh; i++) {
    r_energy[i] = Dos_Erange[0] + de*(double)i;
  }

  /* non-spin-orbit coupling and non-LDA+U */
  if (SO_switch==0 && Hub_U_switch==0 && Constraint_NCS_switch==0 
      && Zeeman_NCS_switch==0 && Zeeman_NCO_switch==0){

    Dummy_ImNL = (double*****)malloc(sizeof(double****)*1);
    Dummy_ImNL[0] = (double****)malloc(sizeof(double***)*1);
    Dummy_ImNL[0][0] = (double***)malloc(sizeof(double**)*1);
    Dummy_ImNL[0][0][0] = (double**)malloc(sizeof(double*)*1);
    Dummy_ImNL[0][0][0][0] = (double*)malloc(sizeof(double)*1);
  }

  /****************************************************
    set up k-grids
  ****************************************************/

  snum_i = knum_i;
  snum_j = knum_j;
  snum_k = knum_k;

  for (i=0; i<=(knum_i-1); i++){
    if (knum_i==1){
      k1 = 0.0;
    }
    else {
      k1 = -0.5 + (2.0*(double)i+1.0)/(2.0*snum_i) + Shift_K_Point;
    }
    KGrids1[i]=k1;
  }
  for (i=0; i<=(knum_j-1); i++){
    if (knum_j==1){
      k1 = 0.0;
    }
    else {
      k1 = -0.5 + (2.0*(double)i+1.0)/(2.0*snum_j) - Shift_K_Point;
    }
    KGrids2[i]=k1;
  }
  for (i=0; i<=(knum_k-1); i++){
    if (knum_k==1){
      k1 = 0.0;
    }
    else {
      k1 = -0.5 + (2.0*(double)i+1.0)/(2.0*snum_k) + 2.0*Shift_K_Point;
    }
    KGrids3[i]=k1;
  }

  if (myid==Host_ID){
    printf(" KGrids1: ");
    for (i=0;i<=knum_i-1;i++) printf("%f ",KGrids1[i]);
    printf("\n");
    printf(" KGrids2: ");
    for (i=0;i<=knum_j-1;i++) printf("%f ",KGrids2[i]);
    printf("\n");
    printf(" KGrids3: ");
    for (i=0;i<=knum_k-1;i++) printf("%f ",KGrids3[i]);
    printf("\n");
  }

  /***********************************
       one-dimentionalize for MPI
  ************************************/

  T_knum = 0;
  for (i=0; i<=(knum_i-1); i++){
    for (j=0; j<=(knum_j-1); j++){
      for (k=0; k<=(knum_k-1); k++){
        T_knum++;
      }
    }
  }

  T_KGrids1 = (double*)malloc(sizeof(double)*T_knum);
  T_KGrids2 = (double*)malloc(sizeof(double)*T_knum);
  T_KGrids3 = (double*)malloc(sizeof(double)*T_knum);

  Ti_KGrids1 = (int*)malloc(sizeof(int)*T_knum);
  Tj_KGrids2 = (int*)malloc(sizeof(int)*T_knum);
  Tk_KGrids3 = (int*)malloc(sizeof(int)*T_knum);

  EIGEN = (double**)malloc(sizeof(double*)*T_knum);
  for (j=0; j<T_knum; j++){
    EIGEN[j] = (double*)malloc(sizeof(double)*n2);
  }

  /* set T_KGrid1,2,3 */

  T_knum = 0;
  for (i=0; i<=(knum_i-1); i++){
    for (j=0; j<=(knum_j-1); j++){
      for (k=0; k<=(knum_k-1); k++){

      T_KGrids1[T_knum] = KGrids1[i];
      T_KGrids2[T_knum] = KGrids2[j];
      T_KGrids3[T_knum] = KGrids3[k];

      Ti_KGrids1[T_knum] = i;
      Tj_KGrids2[T_knum] = j;
      Tk_KGrids3[T_knum] = k;

        T_knum++;
      }
    }
  }

  /* allocate k-points into proccessors */

  if (T_knum<=myid){
    S_knum = -10;
    E_knum = -100;
    num_kloop0 = 1;
  }
  else if (T_knum<numprocs) {
    S_knum = myid;
    E_knum = myid;
    num_kloop0 = 1;
  }
  else {
    tmp = (double)T_knum/(double)numprocs; 
    num_kloop0 = (int)tmp + 1;

    S_knum = (int)((double)myid*(tmp+0.0001)); 
    E_knum = (int)((double)(myid+1)*(tmp+0.0001)) - 1;
    if (myid==(numprocs-1)) E_knum = T_knum - 1;
    if (E_knum<0)           E_knum = 0;
  }

  factor = 1.0/(double)(knum_i * knum_j * knum_k); /* normalization factor */
  pi2 = sqrt(PI);

  /* for kloop */

  for (kloop0=0; kloop0<num_kloop0; kloop0++){

    kloop = kloop0 + S_knum;
    arpo[myid] = -1;
    if (S_knum<=kloop && kloop<=E_knum) arpo[myid] = kloop;
    for (ID=0; ID<numprocs; ID++){
      MPI_Bcast(&arpo[ID], 1, MPI_INT, ID, mpi_comm_level1);
    }

    /* set S */

    for (ID=0; ID<numprocs; ID++){

      kloop = arpo[ID];

      if (0<=kloop){

        k1 = T_KGrids1[kloop];
        k2 = T_KGrids2[kloop];
        k3 = T_KGrids3[kloop];

        Overlap_Band(ID,CntOLP,TmpM,MP,k1,k2,k3);
        n = TmpM[0][0].r;

        if (myid==ID){
        for (i1=1; i1<=n; i1++){
          for (j1=1; j1<=n; j1++){
            S[i1][j1].r = TmpM[i1][j1].r;
            S[i1][j1].i = TmpM[i1][j1].i;
          } 
        } 
        } 

      }
    }

    kloop = arpo[myid];

    if (0<=kloop){

      EigenBand_lapack(S,ko,n);

      if (2<=level_stdout && 0<=kloop){
      printf("  kloop %2d  k1 k2 k3 %10.6f %10.6f %10.6f\n",
             kloop,T_KGrids1[kloop],T_KGrids2[kloop],T_KGrids3[kloop]);
      for (i1=1; i1<=n; i1++){
        printf("  Eigenvalues of OLP  %2d  %15.12f\n",i1,ko[i1]);
      }
      }

      /* minus eigenvalues to 1.0e-14 */
      for (l=1; l<=n; l++){
        if (ko[l]<0.0){
          ko[l] = 1.0e-14;

          if (2<=level_stdout){
          printf("found an eigenvalue smaller than %10.8f of OLP kloop=%2d\n",
                           Threshold_OLP_Eigen,kloop);
        }
      }

        koS[l] = ko[l];
      }

      /* calculate S*1/sqrt(ko) */

      for (l=1; l<=n; l++) M1[l] = 1.0/sqrt(ko[l]);

      /* S * M1  */

      for (i1=1; i1<=n; i1++){
      for (j1=1; j1<=n; j1++){
        S[i1][j1].r = S[i1][j1].r*M1[j1];
        S[i1][j1].i = S[i1][j1].i*M1[j1];
      } 
      } 

    }

    /****************************************************
       make a full Hamiltonian matrix
    ****************************************************/

    for (ID=0; ID<numprocs; ID++){

      kloop = arpo[ID];

      if (0<=kloop){
        k1 = T_KGrids1[kloop];
        k2 = T_KGrids2[kloop];
        k3 = T_KGrids3[kloop];

        /* non-spin-orbit coupling and non-LDA+U */  
        if (SO_switch==0 && Hub_U_switch==0 && Constraint_NCS_switch==0 
            && Zeeman_NCS_switch==0 && Zeeman_NCO_switch==0)
          Hamiltonian_Band_NC(ID, nh, Dummy_ImNL, TmpM, MP, k1, k2, k3);

        /* spin-orbit coupling or LDA+U */  
        else
          Hamiltonian_Band_NC(ID, nh,       ImNL, TmpM, MP, k1, k2, k3);

      if (myid==ID){
        for (i1=1; i1<=2*n; i1++){
          for (j1=1; j1<=2*n; j1++){
            H[i1][j1].r = TmpM[i1][j1].r;
            H[i1][j1].i = TmpM[i1][j1].i;
          } 
        } 
      } 
      }
    }

    kloop = arpo[myid];

    if (0<=kloop){

      /****************************************************
                     M1 * U^t * H * U * M1
      ****************************************************/

      /* transpose S */

      for (i1=1; i1<=n; i1++){
      for (j1=i1+1; j1<=n; j1++){
        Ctmp1 = S[i1][j1];
        Ctmp2 = S[j1][i1];
        S[i1][j1] = Ctmp2;
        S[j1][i1] = Ctmp1;
      }
      }

      /* H * U * M1 */

      for (i1=1; i1<=2*n; i1++){
      for (j1=1; j1<=n; j1++){

        for (m=0; m<=1; m++){

          sum  = 0.0;
          sumi = 0.0;
            mn = m*n;

          for (l=1; l<=n; l++){

            sum  += H[i1][l+mn].r*S[j1][l].r - H[i1][l+mn].i*S[j1][l].i;
            sumi += H[i1][l+mn].r*S[j1][l].i + H[i1][l+mn].i*S[j1][l].r;
          }

          jj1 = 2*j1 - 1 + m;

          C[jj1][i1].r = sum;
          C[jj1][i1].i = sumi;
        }
      }
      }     

      /* M1 * U^+ H * U * M1 */

      for (i1=1; i1<=n; i1++){

      for (m=0; m<=1; m++){

        ii1 = 2*i1 - 1 + m;

        for (j1=1; j1<=2*n; j1++){

          sum  = 0.0;
          sumi = 0.0;

            mn = m*n;

          for (l=1; l<=n; l++){
            sum  +=  S[i1][l].r*C[j1][l+mn].r + S[i1][l].i*C[j1][l+mn].i;
            sumi +=  S[i1][l].r*C[j1][l+mn].i - S[i1][l].i*C[j1][l+mn].r;
          }
          H[ii1][j1].r = sum;
          H[ii1][j1].i = sumi;
        }
      }
      }     

      /* H to C */

      for (i1=1; i1<=2*n; i1++){
      for (j1=1; j1<=2*n; j1++){
        C[i1][j1].r = H[i1][j1].r;
        C[i1][j1].i = H[i1][j1].i;
      }
      }

      /* penalty for ill-conditioning states */

      EV_cut0 = Threshold_OLP_Eigen;

      for (i1=1; i1<=n; i1++){

      if (koS[i1]<EV_cut0){
        C[2*i1-1][2*i1-1].r += pow((koS[i1]/EV_cut0),-2.0) - 1.0;
        C[2*i1  ][2*i1  ].r += pow((koS[i1]/EV_cut0),-2.0) - 1.0;
      }

      /* cutoff the interaction between the ill-conditioned state */

      if (1.0e+3<C[2*i1-1][2*i1-1].r){
        for (j1=1; j1<=2*n; j1++){
          C[2*i1-1][j1    ] = Complex(0.0,0.0);
          C[j1    ][2*i1-1] = Complex(0.0,0.0);
          C[2*i1  ][j1    ] = Complex(0.0,0.0);
          C[j1    ][2*i1  ] = Complex(0.0,0.0);
        }
        C[2*i1-1][2*i1-1] = Complex(1.0e+4,0.0);
        C[2*i1  ][2*i1  ] = Complex(1.0e+4,0.0);
      }
      }

      /* solve eigenvalue problem */

      n1 = 2*n;
      EigenBand_lapack(C, ko, n1);

      for (i1=1; i1<=2*n; i1++) EIGEN[kloop][i1] = ko[i1];

      /****************************************************
         Transformation to the original eigenvectors.
         NOTE JRCAT-244p and JAIST-2122p 
         C = U * lambda^{-1/2} * D
      ****************************************************/

      /* transpose */
      for (i1=1; i1<=n; i1++){
      for (j1=i1+1; j1<=n; j1++){
        Ctmp1 = S[i1][j1];
        Ctmp2 = S[j1][i1];
        S[i1][j1] = Ctmp2;
        S[j1][i1] = Ctmp1;
      }
      }

      for (i1=1; i1<=2*n; i1++){
      for (j1=1; j1<=2*n; j1++){
        H2[i1][j1].r = 0.0;
        H2[i1][j1].i = 0.0;
      }
      }

      for (m=0; m<=1; m++){
      for (i1=1; i1<=n; i1++){
        for (j1=1; j1<=n1; j1++){

          sum  = 0.0; 
          sumi = 0.0;

          for (l=1; l<=n; l++){
            sum  +=  S[i1][l].r*C[2*(l-1)+1+m][j1].r
                   - S[i1][l].i*C[2*(l-1)+1+m][j1].i;
            sumi +=  S[i1][l].r*C[2*(l-1)+1+m][j1].i
                 + S[i1][l].i*C[2*(l-1)+1+m][j1].r;
          } 
          H2[i1+m*n][j1].r = sum;
          H2[i1+m*n][j1].i = sumi;
        }
      }
      }

    } /* if (0<=kloop) */

    /****************************************************
        store LDOS
    ****************************************************/

    for (ID=0; ID<numprocs; ID++){

      kloop = arpo[ID];

      if (0<=kloop){

        /* MPI: EIGEN */ 

        MPI_Bcast(&EIGEN[kloop][0], 2*n+1, MPI_DOUBLE, ID, mpi_comm_level1);

      /* MPI: H2 */

      k = 0;

      for (i1=0; i1<=2*n; i1++){
        for (j1=0; j1<=2*n; j1++){
          tmp_array0[k] = H2[i1][j1].r;
          k++;
        }
      }
         
      for (i1=0; i1<=2*n; i1++){
        for (j1=0; j1<=2*n; j1++){
          tmp_array0[k] = H2[i1][j1].i;
          k++;
        }
      }

      MPI_Barrier(mpi_comm_level1);
      MPI_Bcast(&tmp_array0[0], 2*(2*n+1)*(2*n+1), MPI_DOUBLE, ID, mpi_comm_level1);

      k = 0;

        /* transpose */
      for (i1=0; i1<=2*n; i1++){
        for (j1=0; j1<=2*n; j1++){
          H[j1][i1].r = tmp_array0[k];
          k++;
        }
      }

        /* transpose */
      for (i1=0; i1<=2*n; i1++){
        for (j1=0; j1<=2*n; j1++){
          H[j1][i1].i = tmp_array0[k];
          k++;
        }
      }

        /****************************************************
                             store LDOS
        ****************************************************/

      k1 = T_KGrids1[kloop];
      k2 = T_KGrids2[kloop];
      k3 = T_KGrids3[kloop];

      i = Ti_KGrids1[kloop];
      j = Tj_KGrids2[kloop];
      k = Tk_KGrids3[kloop];

      for (l=1; l<=2*n; l++){

        /* initialize */
        for (MA_AN=1; MA_AN<=Matomnum; MA_AN++){
          GA_AN = M2G[MA_AN];
          wanA = WhatSpecies[GA_AN];
          tnoA = Spe_Total_CNO[wanA];
          for (LB_AN=0; LB_AN<=FNAN[GA_AN]; LB_AN++){
            GB_AN = natn[GA_AN][LB_AN];
            wanB = WhatSpecies[GB_AN];
            tnoB = Spe_Total_CNO[wanB];
            for (i1=0; i1<tnoA; i1++){
            for (j1=0; j1<tnoB; j1++){
              CDM[0][MA_AN][LB_AN][i1][j1] = 0.0;
              CDM[1][MA_AN][LB_AN][i1][j1] = 0.0;
              CDM[2][MA_AN][LB_AN][i1][j1] = 0.0;
              CDM[3][MA_AN][LB_AN][i1][j1] = 0.0;
            }
            }
          }
        }

        /* theta_ij = C_ni C_nj */

        for (MA_AN=1; MA_AN<=Matomnum; MA_AN++){
          GA_AN = M2G[MA_AN];
          wanA = WhatSpecies[GA_AN];
          tnoA = Spe_Total_CNO[wanA];
          Anum = MP[GA_AN];
          for (LB_AN=0; LB_AN<=FNAN[GA_AN]; LB_AN++){
            GB_AN = natn[GA_AN][LB_AN];
            RnB = ncn[GA_AN][LB_AN];
            wanB = WhatSpecies[GB_AN];
            tnoB = Spe_Total_CNO[wanB];
            Bnum = MP[GB_AN];
            l1 = atv_ijk[RnB][1];
            l2 = atv_ijk[RnB][2];
            l3 = atv_ijk[RnB][3];
            kRn = k1*l1 + k2*l2 + k3*l3;
            si = sin(2.0*PI*kRn);  /* Imaginary part */
            co = cos(2.0*PI*kRn);  /* Real part      */
            for (i1=0; i1<=(tnoA-1); i1++){
            for (j1=0; j1<=(tnoB-1); j1++){

              /* Re11 */

              u2 = H[l][Anum+i1].r*H[l][Bnum+j1].r;
              v2 = H[l][Anum+i1].i*H[l][Bnum+j1].i;
              uv = H[l][Anum+i1].r*H[l][Bnum+j1].i;
              vu = H[l][Anum+i1].i*H[l][Bnum+j1].r;
              Redum = (u2 + v2);
              Imdum = (uv - vu);
              Redum2 = co*Redum - si*Imdum;
              CDM[0][MA_AN][LB_AN][i1][j1] += Redum2;

              /* Re22 */

              u2 = H[l][Anum+i1+n].r*H[l][Bnum+j1+n].r;
              v2 = H[l][Anum+i1+n].i*H[l][Bnum+j1+n].i;
              uv = H[l][Anum+i1+n].r*H[l][Bnum+j1+n].i;
              vu = H[l][Anum+i1+n].i*H[l][Bnum+j1+n].r;
              Redum = (u2 + v2);
              Imdum = (uv - vu);
              Redum2 = co*Redum - si*Imdum;
              CDM[1][MA_AN][LB_AN][i1][j1] += Redum2;

              /* Re12 */
 
              u2 = H[l][Anum+i1].r*H[l][Bnum+j1+n].r;
              v2 = H[l][Anum+i1].i*H[l][Bnum+j1+n].i;
              uv = H[l][Anum+i1].r*H[l][Bnum+j1+n].i;
              vu = H[l][Anum+i1].i*H[l][Bnum+j1+n].r;

              Redum = (u2 + v2);
              Imdum = (uv - vu);
              Redum2 = co*Redum - si*Imdum;
              CDM[2][MA_AN][LB_AN][i1][j1] = Redum2;

              /* Im12
                 conjugate complex of Im12 due to difference in the definition
                 between density matrix and charge density
              */

              u2 = H[l][Anum+i1].r*H[l][Bnum+j1+n].r;
              v2 = H[l][Anum+i1].i*H[l][Bnum+j1+n].i;
              uv = H[l][Anum+i1].r*H[l][Bnum+j1+n].i;
              vu = H[l][Anum+i1].i*H[l][Bnum+j1+n].r;

              Redum = (u2 + v2);
              Imdum = (uv - vu);
              Imdum2 = -(co*Imdum + si*Redum);
              CDM[3][MA_AN][LB_AN][i1][j1] = Imdum2;


            } /* j1 */
            }   /* i1 */
          }     /* LB_AN */
        }       /* MA_AN */

        /*******************************************
            M_i = S_ij D_ji
            D_ji = C_nj C_ni
            S_ij : CntOLP
            D_ji : CDM
        ******************************************/

        for (GA_AN=1; GA_AN<=atomnum; GA_AN++){
          wanA = WhatSpecies[GA_AN];
          tnoA = Spe_Total_CNO[wanA];
          for (i1=0; i1<tnoA; i1++){
            SD[0][GA_AN][i1]=0.0;
            SD[1][GA_AN][i1]=0.0;
            SD[2][GA_AN][i1]=0.0;
            SD[3][GA_AN][i1]=0.0;
          }
        }

        for (spin=0; spin<=3; spin++){
          for (MA_AN=1; MA_AN<=Matomnum; MA_AN++){
            GA_AN = M2G[MA_AN];
            wanA = WhatSpecies[GA_AN];
            tnoA = Spe_Total_CNO[wanA];
            for (i1=0; i1<tnoA; i1++){
            for (LB_AN=0; LB_AN<=FNAN[GA_AN]; LB_AN++){
              GB_AN = natn[GA_AN][LB_AN];
              wanB = WhatSpecies[GB_AN];
              tnoB = Spe_Total_CNO[wanB];
              for (j1=0; j1<tnoB; j1++){
                SD[spin][GA_AN][i1] += CDM[spin][MA_AN][LB_AN][i1][j1]*
                                          CntOLP[MA_AN][LB_AN][i1][j1];
              }
            }
            }
          }
        }

          /*  calculate contribution to Gaussian DOS */

        for (GA_AN=1; GA_AN<=atomnum; GA_AN++){

          wanA = WhatSpecies[GA_AN];
          tnoA = Spe_Total_CNO[wanA];
          ID2 = G2ID[GA_AN];

          if (ID2==myid){

            theta = Angle0_Spin[GA_AN];
            phi   = Angle1_Spin[GA_AN];

            sit = sin(theta);
            cot = cos(theta);
            sip = sin(phi);
            cop = cos(phi);

            for (i1=0; i1<tnoA; i1++){

              tmp1 = 0.5*(SD[0][GA_AN][i1] + SD[1][GA_AN][i1]);
              tmp2 = 0.5*cot*(SD[0][GA_AN][i1] - SD[1][GA_AN][i1]);
              tmp3 = (SD[2][GA_AN][i1]*cop - SD[3][GA_AN][i1]*sip)*sit;

            fSDup[i1]=(float)(tmp1 + tmp2 + tmp3);
            fSDdn[i1]=(float)(tmp1 - tmp2 - tmp3);
            }

            /*************************
                  exp( -(x/a)^2 ) 
                  a= DosGauss_Width
                x=-3a : 3a is counted 
            **************************/

            eg = EIGEN[kloop][l];
            x = (eg-Dos_Erange[0])/(Dos_Erange[1]-Dos_Erange[0])*(DosGauss_Num_Mesh-1);
            iecenter = (int)x ; 

            iewidth = DosGauss_Width*3.0/(Dos_Erange[1]-Dos_Erange[0])*(DosGauss_Num_Mesh-1)+3;

            iemin = iecenter - iewidth;
            iemax = iecenter + iewidth;

            if (iemin<0) iemin=0;
            if (iemax>=DosGauss_Num_Mesh) iemax=DosGauss_Num_Mesh-1;

            if ( 0<=iemin && iemin<DosGauss_Num_Mesh && 0<=iemax && iemax<DosGauss_Num_Mesh ) {

            for (ie=iemin;ie<=iemax;ie++) {
              xa = (eg-r_energy[ie])/DosGauss_Width;

              for (i1=0; i1<tnoA; i1++){
                Dos[ie][0][GA_AN][i1] += factor*fSDup[i1]* exp(-xa*xa)/(DosGauss_Width*pi2);
                Dos[ie][1][GA_AN][i1] += factor*fSDdn[i1]* exp(-xa*xa)/(DosGauss_Width*pi2);
              }
            }
            }
          }

        }    /* GA_AN */
      }      /* l             */ 
      }        /* if (0<=kloop) */
    }          /* ID            */
  }            /* kloop0        */

  /****************************************************************
                            Dos by MPI
  ****************************************************************/

  {
 
    float *DosTmp;

    DosTmp = (float*)malloc(sizeof(float)*DosGauss_Num_Mesh*2*List_YOUSO[7] );

    for (GA_AN=1; GA_AN<=atomnum; GA_AN++){
      ID2 = G2ID[GA_AN];

      if (ID2==myid){

        k = 0;

        for (ik=0; ik<DosGauss_Num_Mesh; ik++) {
        for (spin=0; spin<=1; spin++){
          for (i=0; i<List_YOUSO[7]; i++){
            DosTmp[k] = Dos[ik][spin][GA_AN][i];
              k++;
          }
        }
        }

      if (ID2!=Host_ID){
        tag = 999;
        MPI_Isend(&DosTmp[0], DosGauss_Num_Mesh*2*List_YOUSO[7],
                    MPI_FLOAT,Host_ID,tag,mpi_comm_level1,&request);
        MPI_Wait(&request,&stat);
      }
      }

      if (myid==Host_ID && ID2!=Host_ID){
      tag = 999;
      MPI_Recv(&DosTmp[0], DosGauss_Num_Mesh*2*List_YOUSO[7],
                 MPI_FLOAT, ID2, tag, mpi_comm_level1, &stat);

        k = 0;

        for (ik=0; ik<DosGauss_Num_Mesh; ik++) {
        for (spin=0; spin<=1; spin++){
          for (i=0; i<List_YOUSO[7]; i++){
            Dos[ik][spin][GA_AN][i] = DosTmp[k];
              k++;
          }
        }
        }
      }

      MPI_Barrier(mpi_comm_level1);

    }

    free(DosTmp);
  }

  /****************************************************************
                    write eigenvalues and eigenvectors
  ****************************************************************/

  if (myid==Host_ID){

    sprintf(file_eig,"%s%s.Dos.val",filepath,filename);

    if ( (fp_eig=fopen(file_eig,"w"))==NULL ) {
      printf("can not open a file %s\n",file_eig);
    }
  
    else{

      /* write *.Dos.val */

      printf("write eigenvalues\n");

      fprintf(fp_eig,"mode        7\n");
      fprintf(fp_eig,"NonCol      1\n");
      fprintf(fp_eig,"N           %d\n",n);
      fprintf(fp_eig,"Nspin       %d\n",1);  /* switch to 1 */
      fprintf(fp_eig,"Erange      %lf %lf\n",Dos_Erange[0],Dos_Erange[1]);
      fprintf(fp_eig,"Kgrid       %d %d %d\n",1,1,1);
      fprintf(fp_eig,"atomnum     %d\n",atomnum);
      fprintf(fp_eig,"<WhatSpecies\n");
      for (i=1; i<=atomnum; i++) {
        fprintf(fp_eig,"%d ",WhatSpecies[i]);
      }
      fprintf(fp_eig,"\nWhatSpecies>\n");
      fprintf(fp_eig,"SpeciesNum     %d\n",SpeciesNum);
      fprintf(fp_eig,"<Spe_Total_CNO\n");
      for (i=0;i<SpeciesNum;i++) {
        fprintf(fp_eig,"%d ",Spe_Total_CNO[i]);
      }
      fprintf(fp_eig,"\nSpe_Total_CNO>\n");

      MaxL = Supported_MaxL; 

      fprintf(fp_eig,"MaxL           %d\n",Supported_MaxL);
      fprintf(fp_eig,"<Spe_Num_CBasis\n");
      for (i=0;i<SpeciesNum;i++) {
        for (l=0;l<=MaxL;l++) {
        fprintf(fp_eig,"%d ",Spe_Num_CBasis[i][l]);
        }
        fprintf(fp_eig,"\n");
      }
      fprintf(fp_eig,"Spe_Num_CBasis>\n");
      fprintf(fp_eig,"ChemP       %lf\n",ChemP);

      fprintf(fp_eig,"<SpinAngle\n");
      for (i=1; i<=atomnum; i++) {
        fprintf(fp_eig,"%lf %lf\n",Angle0_Spin[i],Angle1_Spin[i]);
      }
      fprintf(fp_eig,"SpinAngle>\n");

      fprintf(fp_eig,"irange      %d %d\n",0,DosGauss_Num_Mesh-1);
      fprintf(fp_eig,"<Eigenvalues\n");
      for (spin=0; spin<=1; spin++) {
        fprintf(fp_eig,"%d %d %d ",0,0,0);

        for (ik=0; ik<DosGauss_Num_Mesh; ik++) {
          fprintf(fp_eig,"%lf ",r_energy[ik]);
      }
        fprintf(fp_eig,"\n");
      }  
      fprintf(fp_eig,"Eigenvalues>\n");

      fclose(fp_eig);
    }

    /* write *.Dos.vec */

    printf("write eigenvectors\n");

    sprintf(file_ev,"%s%s.Dos.vec",filepath,filename);

    if ( (fp_ev=fopen(file_ev,"w"))==NULL ) {
      printf("can not open a file %s\n",file_ev);
    }
    else {

      for (ik=0; ik<DosGauss_Num_Mesh; ik++) {

      i_vec[0] = 0; 
      i_vec[1] = 0;
      i_vec[2] = 0;

      if (myid==Host_ID) fwrite(i_vec,sizeof(int),3,fp_ev);

        k = 0;
      for (GA_AN=1; GA_AN<=atomnum; GA_AN++) {
        wanA = WhatSpecies[GA_AN];
        tnoA = Spe_Total_CNO[wanA];
          for (i=0; i<tnoA; i++){
            DosVec[k] = Dos[ik][0][GA_AN][i];  k++;
        }
          for (i=0; i<tnoA; i++){
            DosVec[k] = Dos[ik][1][GA_AN][i];  k++;
        }
      }

        fwrite(DosVec,sizeof(float),k,fp_ev);

      }

      fclose(fp_ev);
    }

  } /* if (myid==Host_ID) */

  /****************************************************
                       free arrays
  ****************************************************/

  free(DosVec);

  free(MP);

  free(arpo);

  free(ko);
  free(koS);

  free(tmp_array0);

  for (j=0; j<n2; j++){
    free(H[j]);
  }
  free(H);

  for (i=0; i<n2; i++){
    free(S[i]);
  }
  free(S);

  free(M1);

  for (j=0; j<n2; j++){
    free(C[j]);
  }
  free(C);

  free(KGrids1); free(KGrids2);free(KGrids3);

  for (i=0; i<i_CDM[0]; i++){
    for (j=0; j<i_CDM[1]; j++){
      for (k=0; k<i_CDM[2]; k++){
        for (l=0; l<i_CDM[3]; l++){
          free(CDM[i][j][k][l]);
      }
        free(CDM[i][j][k]);
      }
      free(CDM[i][j]);
    }
    free(CDM[i]);
  }
  free(CDM);

  for (i=0; i<i_SD[0]; i++){
    for (j=0; j<i_SD[1]; j++){
      free(SD[i][j]);
    }
    free(SD[i]);
  }
  free(SD);

  free(fSDup);
  free(fSDdn);

  for (j=0; j<n2; j++){
    free(TmpM[j]);
  }
  free(TmpM);

  for (j=0; j<n2; j++){
    free(H2[j]);
  }
  free(H2);

  for (ik=0; ik<DosGauss_Num_Mesh; ik++) {
    for (spin=0; spin<=1; spin++) {
      for (GA_AN=0; GA_AN<=atomnum; GA_AN++) {
        free(Dos[ik][spin][GA_AN]);
      }
      free(Dos[ik][spin]);
    }
    free(Dos[ik]);
  }
  free(Dos);

  free(r_energy);

  /* non-spin-orbit coupling and non-LDA+U */  
  if (SO_switch==0 && Hub_U_switch==0 && Constraint_NCS_switch==0 
      && Zeeman_NCS_switch==0 && Zeeman_NCO_switch==0){
    free(Dummy_ImNL[0][0][0][0]);
    free(Dummy_ImNL[0][0][0]);
    free(Dummy_ImNL[0][0]);
    free(Dummy_ImNL[0]);
    free(Dummy_ImNL);
  }

  free(T_KGrids1);
  free(T_KGrids2);
  free(T_KGrids3);

  free(Ti_KGrids1);
  free(Tj_KGrids2);
  free(Tk_KGrids3);

  for (j=0; j<T_knum; j++){
    free(EIGEN[j]);
  }
  free(EIGEN);

  /* for elapsed time */

  dtime(&TEtime);
  time0 = TEtime - TStime;
  return time0;
}







static double Band_DFT_Dosout_Col(
                           int knum_i, int knum_j, int knum_k,
                           int SpinP_switch,
                           double *****nh,
                           double *****ImNL,
                           double ****CntOLP)
{
  int i,j,k,spin,l,i1,j1,n1;
  int n, wanA;
  int *MP;
  int MA_AN, GA_AN, tnoA,Anum, LB_AN;
  int GB_AN, wanB, tnoB, Bnum, RnB;
  int l1,l2,l3;
  int kloop,kloop0,S_knum,E_knum,T_knum,num_kloop0; 
  int ie,iemin,iemax,iemin0,iemax0,n1min;
  int MaxL,e1,s1;
  int i_vec[10];

  double sum,sumi,u2,v2,uv,vu,tmp;
  double kRn,si,co,Redum,Imdum,Redum2,Resum;
  double TStime,TEtime,time0;
  double OLP_eigen_cut=Threshold_OLP_Eigen;

  double EV_cut0;
  double *koS;
  double **ko; int N_ko, i_ko[10];
  dcomplex **H;  int N_H,  i_H[10];
  dcomplex **S;  int N_S,  i_S[10];
  double *M1,***EIGEN;
  dcomplex **C;  int N_C,  i_C[10];
  double *KGrids1, *KGrids2, *KGrids3;
  double ****CDM; int N_CDM, i_CDM[10];
  double **SD; int N_SD, i_SD[10];
  double *tmp_array0;
  dcomplex ***H2,**TmpM;
  double *T_KGrids1,*T_KGrids2,*T_KGrids3;
  int *Ti_KGrids1,*Tj_KGrids2,*Tk_KGrids3,*arpo;
  dcomplex Ctmp1,Ctmp2;

  float *fSD;
  double ****Dummy_ImNL;
  double snum_i, snum_j, snum_k; 
  double k1,k2,k3;
  char file_ev[YOUSO10],file_eig[YOUSO10];
  FILE *fp_ev,*fp_eig;
  char buf1[fp_bsize];          /* setvbuf */
  char buf2[fp_bsize];          /* setvbuf */
  int numprocs,myid,ID,ID1,ID2,tag;

  MPI_Status stat;
  MPI_Request request;

  /* MPI */
  MPI_Comm_size(mpi_comm_level1,&numprocs);
  MPI_Comm_rank(mpi_comm_level1,&myid);
  MPI_Barrier(mpi_comm_level1);

  if (myid==Host_ID){
    printf("\n<Band_DFT_Dosout>\n"); fflush(stdout);
  }

  dtime(&TStime);

  /****************************************************
             calculation of the array size
  ****************************************************/

  n = 0;
  for (i=1; i<=atomnum; i++){
    wanA  = WhatSpecies[i];
    n  += Spe_Total_CNO[wanA];
  }

  /****************************************************
   Allocation

   int       MP[List_YOUSO[1]]
   double    ko[List_YOUSO[23]][n+1]
   double    koS[n+1];
   dcomplex  H[n+1][n+1]
   dcomplex  S[n+1][n+1]
   double    M1[n+1]
   dcomplex  C[n+1][n+1]
   double    KGrids1[knum_i]
   double    KGrids2[knum_j]
   double    KGrids3[knum_k]
   double    CDM[Matomnum+1][List_YOUSO[8]]
                [List_YOUSO[7]][List_YOUSO[7]]
   double    SD[List_YOUSO[1]][List_YOUSO[7]]
   float     fSD[List_YOUSO[7]]
  ****************************************************/

  MP = (int*)malloc(sizeof(int)*List_YOUSO[1]);
  arpo = (int*)malloc(sizeof(int)*numprocs);

  N_ko=2; i_ko[0]=List_YOUSO[23]; i_ko[1]=n+1;
  ko=(double**)malloc_multidimarray("double",N_ko, i_ko); 

  koS = (double*)malloc(sizeof(double)*(n+1));

  N_H=2; i_H[0]=i_H[1]=n+1;
  H=(dcomplex**)malloc_multidimarray("dcomplex",N_H, i_H);

  N_S=2; i_S[0]=i_S[1]=n+1;
  S=(dcomplex**)malloc_multidimarray("dcomplex",N_S, i_S);

  M1 = (double*)malloc(sizeof(double)*(n+1));

  N_C=2; i_C[0]=i_C[1]=n+1;
  C=(dcomplex**)malloc_multidimarray("dcomplex",N_C, i_C);

  KGrids1 = (double*)malloc(sizeof(double)*knum_i);
  KGrids2 = (double*)malloc(sizeof(double)*knum_j);
  KGrids3 = (double*)malloc(sizeof(double)*knum_k);

  N_CDM=4; i_CDM[0]=Matomnum+1; i_CDM[1]=List_YOUSO[8];
  i_CDM[2]=List_YOUSO[7]; i_CDM[3]=List_YOUSO[7];
  CDM=(double****)malloc_multidimarray("double",N_CDM,i_CDM);

  N_SD=2;  i_SD[0]=List_YOUSO[1]; i_SD[1]=List_YOUSO[7];
  SD=(double**)malloc_multidimarray("double",N_SD,i_SD);

  fSD=(float*)malloc(sizeof(float)*List_YOUSO[7]);

  tmp_array0 = (double*)malloc(sizeof(double)*(n+1)*(n+1)*2);

  TmpM = (dcomplex**)malloc(sizeof(dcomplex*)*(n+1));
  for (j=0; j<n+1; j++){
    TmpM[j] = (dcomplex*)malloc(sizeof(dcomplex)*(n+1));
  }

  H2 = (dcomplex***)malloc(sizeof(dcomplex**)*List_YOUSO[23]);
  for (i=0; i<List_YOUSO[23]; i++){
    H2[i] = (dcomplex**)malloc(sizeof(dcomplex*)*(n+1));
    for (j=0; j<n+1; j++){
      H2[i][j] = (dcomplex*)malloc(sizeof(dcomplex)*(n+1));
    }
  }

  /* no spin-orbit coupling */
  if (SO_switch==0){
    Dummy_ImNL = (double****)malloc(sizeof(double***)*1);
    Dummy_ImNL[0] = (double***)malloc(sizeof(double**)*1);
    Dummy_ImNL[0][0] = (double**)malloc(sizeof(double*)*1);
    Dummy_ImNL[0][0][0] = (double*)malloc(sizeof(double)*1);
  }

  snum_i = knum_i;
  snum_j = knum_j;
  snum_k = knum_k;

  /* set up  k-grids */
  for (i=0; i<=(knum_i-1); i++){
    if (knum_i==1){
      k1 = 0.0;
    }
    else {
      k1 = -0.5 + (2.0*(double)i+1.0)/(2.0*snum_i) + Shift_K_Point;
    }
    KGrids1[i]=k1;
  }
  for (i=0; i<=(knum_j-1); i++){
    if (knum_j==1){
      k1 = 0.0;
    }
    else {
      k1 = -0.5 + (2.0*(double)i+1.0)/(2.0*snum_j) - Shift_K_Point;
    }
    KGrids2[i]=k1;
  }
  for (i=0; i<=(knum_k-1); i++){
    if (knum_k==1){
      k1 = 0.0;
    }
    else {
      k1 = -0.5 + (2.0*(double)i+1.0)/(2.0*snum_k) + 2.0*Shift_K_Point;
    }
    KGrids3[i]=k1;
  }

  if (myid==Host_ID){
    printf(" KGrids1: ");
    for (i=0;i<=knum_i-1;i++) printf("%f ",KGrids1[i]);
    printf("\n");
    printf(" KGrids2: ");
    for (i=0;i<=knum_j-1;i++) printf("%f ",KGrids2[i]);
    printf("\n");
    printf(" KGrids3: ");
    for (i=0;i<=knum_k-1;i++) printf("%f ",KGrids3[i]);
    printf("\n");
  }

  /***********************************
       one-dimentionalize for MPI
  ************************************/

  T_knum = 0;
  for (i=0; i<=(knum_i-1); i++){
    for (j=0; j<=(knum_j-1); j++){
      for (k=0; k<=(knum_k-1); k++){
        T_knum++;
      }
    }
  }

  T_KGrids1 = (double*)malloc(sizeof(double)*T_knum);
  T_KGrids2 = (double*)malloc(sizeof(double)*T_knum);
  T_KGrids3 = (double*)malloc(sizeof(double)*T_knum);

  Ti_KGrids1 = (int*)malloc(sizeof(int)*T_knum);
  Tj_KGrids2 = (int*)malloc(sizeof(int)*T_knum);
  Tk_KGrids3 = (int*)malloc(sizeof(int)*T_knum);

  EIGEN  = (double***)malloc(sizeof(double**)*List_YOUSO[23]);
  for (i=0; i<List_YOUSO[23]; i++){
    EIGEN[i] = (double**)malloc(sizeof(double*)*T_knum);
    for (j=0; j<T_knum; j++){
      EIGEN[i][j] = (double*)malloc(sizeof(double)*(n+1));
    }
  }

  /* set T_KGrid1,2,3 */

  T_knum = 0;
  for (i=0; i<=(knum_i-1); i++){
    for (j=0; j<=(knum_j-1); j++){
      for (k=0; k<=(knum_k-1); k++){

      T_KGrids1[T_knum] = KGrids1[i];
      T_KGrids2[T_knum] = KGrids2[j];
      T_KGrids3[T_knum] = KGrids3[k];

      Ti_KGrids1[T_knum] = i;
      Tj_KGrids2[T_knum] = j;
      Tk_KGrids3[T_knum] = k;

        T_knum++;
      }
    }
  }

  /* allocate k-points into proccessors */

  if (T_knum<=myid){
    S_knum = -10;
    E_knum = -100;
    num_kloop0 = 1;
  }
  else if (T_knum<numprocs) {
    S_knum = myid;
    E_knum = myid;
    num_kloop0 = 1;
  }
  else {
    tmp = (double)T_knum/(double)numprocs; 
    num_kloop0 = (int)tmp + 1;

    S_knum = (int)((double)myid*(tmp+0.0001)); 
    E_knum = (int)((double)(myid+1)*(tmp+0.0001)) - 1;
    if (myid==(numprocs-1)) E_knum = T_knum - 1;
    if (E_knum<0)           E_knum = 0;
  }

  /****************************************************************
                      find iemin and iemax
  *****************************************************************/

  iemin=n; iemax=1; n1min=1;

  k1 = 0.0;
  k2 = 0.0;
  k3 = 0.0;

  Overlap_Band(Host_ID,CntOLP,S,MP,k1,k2,k3);

  if (myid==Host_ID){

    n = S[0][0].r;
    EigenBand_lapack(S, ko[0], n);

    if (3<=level_stdout){
      printf("  k1 k2 k3 %10.6f %10.6f %10.6f\n",k1,k2,k3);
      for (i1=1; i1<=n; i1++){
      printf("  Eigenvalues of OLP  %2d  %15.12f\n",i1,ko[0][i1]);
      }
    }

    /* minus eigenvalues to 1.0e-14 */

    for (l=1; l<=n; l++){
      if (ko[0][l]<0.0) ko[0][l] = 1.0e-14;
      koS[l] = ko[0][l];
    }

    /* calculate S*1/sqrt(ko) */

    for (l=1; l<=n; l++) M1[l] = 1.0/sqrt(ko[0][l]);

    /* S * M1  */

    for (i1=1; i1<=n; i1++){
      for (j1=1; j1<=n; j1++){
      S[i1][j1].r = S[i1][j1].r*M1[j1];
      S[i1][j1].i = S[i1][j1].i*M1[j1];
      } 
    } 

  } /* if (myid==Host_ID) */

  for (spin=0; spin<=SpinP_switch; spin++){

    /****************************************************
               make a full Hamiltonian matrix
    ****************************************************/

    Hamiltonian_Band(Host_ID, nh[spin], H, MP, k1, k2, k3);

    if (myid==Host_ID){

      /****************************************************
                    M1 * U^t * H * U * M1
      ****************************************************/

      /* transpose S */

      if (spin==0){
      for (i1=1; i1<=n; i1++){
        for (j1=i1+1; j1<=n; j1++){
          Ctmp1 = S[i1][j1];
          Ctmp2 = S[j1][i1];
          S[i1][j1] = Ctmp2;
          S[j1][i1] = Ctmp1;
        }
      }
      }

      /* H * U * M1 */
 
      for (i1=1; i1<=n; i1++){
      for (j1=1; j1<=n; j1++){
        sum = 0.0; sumi=0.0;
        for (l=1; l<=n; l++){
          sum  += H[i1][l].r*S[j1][l].r - H[i1][l].i*S[j1][l].i;
          sumi += H[i1][l].r*S[j1][l].i + H[i1][l].i*S[j1][l].r;
        }

        C[j1][i1].r = sum;
        C[j1][i1].i = sumi;
      }
      }     

      /* M1 * U^+ H * U * M1 */

      for (i1=1; i1<=n; i1++){
      for (j1=1; j1<=n; j1++){
        sum = 0.0;
          sumi=0.0;
        for (l=1; l<=n; l++){
          sum  +=  S[i1][l].r*C[j1][l].r + S[i1][l].i*C[j1][l].i;
          sumi +=  S[i1][l].r*C[j1][l].i - S[i1][l].i*C[j1][l].r;
        }
        H[i1][j1].r = sum;
        H[i1][j1].i = sumi;
      }
      }     

      /* H to C */

      for (i1=1; i1<=n; i1++){
      for (j1=1; j1<=n; j1++){
        C[i1][j1] = H[i1][j1];
      }
      }

      /* penalty for ill-conditioning states */

      EV_cut0 = Threshold_OLP_Eigen;

      for (i1=1; i1<=n; i1++){

      if (koS[i1]<EV_cut0){
        C[i1][i1].r += pow((koS[i1]/EV_cut0),-2.0) - 1.0;
      }
 
      /* cutoff the interaction between the ill-conditioned state */
 
      if (1.0e+3<C[i1][i1].r){
        for (j1=1; j1<=n; j1++){
          C[i1][j1] = Complex(0.0,0.0);
          C[j1][i1] = Complex(0.0,0.0);
        }
        C[i1][i1].r = 1.0e+4;
      }
      }

      /* solve eigenvalue problem */

      n1 = n;
      EigenBand_lapack(C,ko[spin],n1);

      if (n1min<n1) n1min=n1;

      iemin0=1;
      for (i1=1;i1<=n1;i1++) {
      if (ko[spin][i1]>(ChemP+Dos_Erange[0]) ) {
        iemin0=i1-1;
        break;
      }
      }
      if (iemin0<1) iemin0=1;

      iemax0=n1;
      for (i1=iemin0;i1<=n1;i1++) {
      if (ko[spin][i1]>(ChemP+Dos_Erange[1]) ) {
        iemax0=i1;
        break;
      }
      }
      if (iemax0>n1) iemax0=n1;

      if (iemin>iemin0) iemin=iemin0;
      if (iemax<iemax0) iemax=iemax0;

    } /* if (myid==Host_ID) */
  }   /* spin */

  /* add a buffer to iemin and iemax */

  iemin -= 5;
  iemax += 5;

  if (iemin<1) iemin = 1;
  if (n<iemax) iemax = n;

  if (myid==Host_ID){
    if (n1min<iemax) iemax=n1min;
    printf(" iemin and iemax= %d %d\n",iemin,iemax);
  }

  /* MPI, iemin, iemax */
  MPI_Barrier(mpi_comm_level1);
  MPI_Bcast(&iemin, 1, MPI_INT, Host_ID, mpi_comm_level1);
  MPI_Bcast(&iemax, 1, MPI_INT, Host_ID, mpi_comm_level1);

  /****************************************************************
                     eigenvalues and eigenvectors
   ****************************************************************/

  if (myid==Host_ID){
    strcpy(file_eig,".Dos.val");
    fnjoint(filepath,filename,file_eig);
    if ( (fp_eig=fopen(file_eig,"w"))==NULL ) {

#ifdef xt3
      setvbuf(fp_eig,buf1,_IOFBF,fp_bsize);  /* setvbuf */
#endif

      printf("can not open a file %s\n",file_eig);
    }

    strcpy(file_ev,".Dos.vec");
    fnjoint(filepath,filename,file_ev);
    if ( (fp_ev=fopen(file_ev,"w"))==NULL ) {

#ifdef xt3
      setvbuf(fp_ev,buf2,_IOFBF,fp_bsize);  /* setvbuf */
#endif

      printf("can not open a file %s\n",file_ev);
    }
    if ( fp_eig==NULL || fp_ev==NULL ) {
      goto Finishing;
    }
  }

  if (myid==Host_ID){

    if (fp_eig) {
      fprintf(fp_eig,"mode        1\n");
      fprintf(fp_eig,"NonCol      0\n");
      fprintf(fp_eig,"N           %d\n",n);
      fprintf(fp_eig,"Nspin       %d\n",SpinP_switch);
      fprintf(fp_eig,"Erange      %lf %lf\n",Dos_Erange[0],Dos_Erange[1]);
      fprintf(fp_eig,"irange      %d %d\n",iemin,iemax);
      fprintf(fp_eig,"Kgrid       %d %d %d\n",knum_i,knum_j,knum_k);
      fprintf(fp_eig,"atomnum     %d\n",atomnum);
      fprintf(fp_eig,"<WhatSpecies\n");
      for (i=1;i<=atomnum;i++) {
        fprintf(fp_eig,"%d ",WhatSpecies[i]);
      }
      fprintf(fp_eig,"\nWhatSpecies>\n");
      fprintf(fp_eig,"SpeciesNum     %d\n",SpeciesNum);
      fprintf(fp_eig,"<Spe_Total_CNO\n");
      for (i=0;i<SpeciesNum;i++) {
        fprintf(fp_eig,"%d ",Spe_Total_CNO[i]);
      }
      fprintf(fp_eig,"\nSpe_Total_CNO>\n");

      MaxL=Supported_MaxL; 

      fprintf(fp_eig,"MaxL           %d\n",Supported_MaxL);
      fprintf(fp_eig,"<Spe_Num_CBasis\n");
      for (i=0;i<SpeciesNum;i++) {
        for (l=0;l<=MaxL;l++) {
          fprintf(fp_eig,"%d ",Spe_Num_CBasis[i][l]);
        }
        fprintf(fp_eig,"\n");
      }
      fprintf(fp_eig,"Spe_Num_CBasis>\n");

      fprintf(fp_eig,"ChemP       %lf\n",ChemP);

      fprintf(fp_eig,"<Eigenvalues\n");

    }
    if (fp_eig) {
      printf(" write eigenvalues\n");
    }
    if (fp_ev) {
      printf(" write eigenvectors\n");
    }
  }

  /* for kloop */

  for (kloop0=0; kloop0<num_kloop0; kloop0++){

    kloop = kloop0 + S_knum;
    arpo[myid] = -1;
    if (S_knum<=kloop && kloop<=E_knum) arpo[myid] = kloop;
    for (ID=0; ID<numprocs; ID++){
      MPI_Bcast(&arpo[ID], 1, MPI_INT, ID, mpi_comm_level1);
    }

    /* set S */

    for (ID=0; ID<numprocs; ID++){

      kloop = arpo[ID];
      
      if (0<=kloop){
      
        k1 = T_KGrids1[kloop];
        k2 = T_KGrids2[kloop];
        k3 = T_KGrids3[kloop];
      
        Overlap_Band(ID,CntOLP,TmpM,MP,k1,k2,k3);
        n = TmpM[0][0].r;
      
        if (myid==ID){
        for (i1=1; i1<=n; i1++){
          for (j1=1; j1<=n; j1++){
            S[i1][j1].r = TmpM[i1][j1].r;
            S[i1][j1].i = TmpM[i1][j1].i;
          } 
        } 
        } 
       
      }
    }

    kloop = arpo[myid];

    if (0<=kloop){

      EigenBand_lapack(S,ko[0],n);

      if (3<=level_stdout && 0<=kloop){
      printf("  kloop %2d  k1 k2 k3 %10.6f %10.6f %10.6f\n",
             kloop,T_KGrids1[kloop],T_KGrids2[kloop],T_KGrids3[kloop]);
      for (i1=1; i1<=n; i1++){
        printf("  Eigenvalues of OLP  %2d  %15.12f\n",i1,ko[0][i1]);
      }
      }

      /* minus eigenvalues to 1.0e-14 */

      for (l=1; l<=n; l++){
      if (ko[0][l]<0.0) ko[0][l] = 1.0e-14;
      koS[l] = ko[0][l];
      }

      /* calculate S*1/sqrt(ko) */

      for (l=1; l<=n; l++) M1[l] = 1.0/sqrt(ko[0][l]);

      /* S * M1  */

      for (i1=1; i1<=n; i1++){
      for (j1=1; j1<=n; j1++){
        S[i1][j1].r = S[i1][j1].r*M1[j1];
        S[i1][j1].i = S[i1][j1].i*M1[j1];
      } 
      } 

    } /* if (0<=kloop) */

    /* set H */

    for (spin=0; spin<=SpinP_switch; spin++){

      for (ID=0; ID<numprocs; ID++){

        kloop = arpo[ID];

        if (0<=kloop){
          k1 = T_KGrids1[kloop];
          k2 = T_KGrids2[kloop];
          k3 = T_KGrids3[kloop];

          Hamiltonian_Band(ID, nh[spin], TmpM, MP, k1, k2, k3);

          if (myid==ID){
          for (i1=1; i1<=n; i1++){
            for (j1=1; j1<=n; j1++){
              H[i1][j1].r = TmpM[i1][j1].r;
              H[i1][j1].i = TmpM[i1][j1].i;
            } 
          } 
          } 
      }
      }

      kloop = arpo[myid];

      if (0<=kloop){

        /****************************************************
                       M1 * U^t * H * U * M1
        ****************************************************/

        /* transpose S */

      for (i1=1; i1<=n; i1++){
        for (j1=i1+1; j1<=n; j1++){
          Ctmp1 = S[i1][j1];
          Ctmp2 = S[j1][i1];
          S[i1][j1] = Ctmp2;
          S[j1][i1] = Ctmp1;
        }
      }

        /* H * U * M1 */
 
      for (i1=1; i1<=n; i1++){
        for (j1=1; j1<=n; j1++){

          sum  = 0.0;
          sumi = 0.0;

          for (l=1; l<=n; l++){
            sum  += H[i1][l].r*S[j1][l].r - H[i1][l].i*S[j1][l].i;
            sumi += H[i1][l].r*S[j1][l].i + H[i1][l].i*S[j1][l].r;
          }

          C[j1][i1].r = sum;
          C[j1][i1].i = sumi;
        }
      }     

        /* M1 * U^+ H * U * M1 */

      for (i1=1; i1<=n; i1++){
        for (j1=1; j1<=n; j1++){

          sum  = 0.0;
            sumi = 0.0;

          for (l=1; l<=n; l++){
            sum  +=  S[i1][l].r*C[j1][l].r + S[i1][l].i*C[j1][l].i;
            sumi +=  S[i1][l].r*C[j1][l].i - S[i1][l].i*C[j1][l].r;
          }
          H[i1][j1].r = sum;
          H[i1][j1].i = sumi;
        }
      }     

        /* H to C */

      for (i1=1; i1<=n; i1++){
        for (j1=1; j1<=n; j1++){
          C[i1][j1] = H[i1][j1];
        }
      }

      /* penalty for ill-conditioning states */

      EV_cut0 = Threshold_OLP_Eigen;

      for (i1=1; i1<=n; i1++){

        if (koS[i1]<EV_cut0){
          C[i1][i1].r += pow((koS[i1]/EV_cut0),-2.0) - 1.0;
        }
 
        /* cutoff the interaction between the ill-conditioned state */
 
        if (1.0e+3<C[i1][i1].r){
          for (j1=1; j1<=n; j1++){
            C[i1][j1] = Complex(0.0,0.0);
            C[j1][i1] = Complex(0.0,0.0);
          }
          C[i1][i1].r = 1.0e+4;
        }
      }

        /* solve eigenvalue problem */

      n1 = n;
        EigenBand_lapack(C,ko[spin],n1);

      for (i1=1; i1<=n; i1++) EIGEN[spin][kloop][i1] = ko[spin][i1];

        /****************************************************
           transformation to the original eigen vectors.
           NOTE JRCAT-244p and JAIST-2122p 
           C = U * lambda^{-1/2} * D
        ****************************************************/

      /* transpose */

      for (i1=1; i1<=n; i1++){
        for (j1=i1+1; j1<=n; j1++){
          Ctmp1 = S[i1][j1];
          Ctmp2 = S[j1][i1];
          S[i1][j1] = Ctmp2;
          S[j1][i1] = Ctmp1;
        }
      }

      /* transpose */

      for (i1=1; i1<=n; i1++){
        for (j1=i1+1; j1<=n; j1++){
          Ctmp1 = C[i1][j1];
          Ctmp2 = C[j1][i1];
          C[i1][j1] = Ctmp2;
          C[j1][i1] = Ctmp1;
        }
      }

        /* shift */

      for (j1=1; j1<=n; j1++){
        for (l=n; 1<=l; l--){
          C[j1][l] = C[j1][l];
        }
      }

      for (i1=1; i1<=n; i1++){
        for (j1=1; j1<=n; j1++){
          H2[spin][i1][j1].r = 0.0;
          H2[spin][i1][j1].i = 0.0;
        }
      }

      for (i1=1; i1<=n; i1++){
        for (j1=1; j1<=n1; j1++){

          sum  = 0.0;
          sumi = 0.0;

          for (l=1; l<=n; l++){
            sum  +=  S[i1][l].r*C[j1][l].r - S[i1][l].i*C[j1][l].i;
            sumi +=  S[i1][l].r*C[j1][l].i + S[i1][l].i*C[j1][l].r;
          }

          H2[spin][i1][j1].r = sum;
          H2[spin][i1][j1].i = sumi;

        }
      }
      } /* if (0<=kloop) */
    } /* spin */

    /****************************************************
        store LDOS
    ****************************************************/

    for (ID=0; ID<numprocs; ID++){

      kloop = arpo[ID];

      if (0<=kloop){

      for (spin=0; spin<=SpinP_switch; spin++){

          /* MPI: H2 */

        k = 0;

        for (i1=0; i1<=n; i1++){
          for (j1=0; j1<=n; j1++){
            tmp_array0[k] = H2[spin][i1][j1].r;
            k++;
          }
        }

        for (i1=0; i1<=n; i1++){
          for (j1=0; j1<=n; j1++){
            tmp_array0[k] = H2[spin][i1][j1].i;
            k++;
          }
        }

          MPI_Barrier(mpi_comm_level1);
          MPI_Bcast(&tmp_array0[0], 2*(n+1)*(n+1), MPI_DOUBLE, ID, mpi_comm_level1);

        k = 0;

        for (i1=0; i1<=n; i1++){
          for (j1=0; j1<=n; j1++){
            H[i1][j1].r = tmp_array0[k];
            k++;
          }
        }
         
        for (i1=0; i1<=n; i1++){
          for (j1=0; j1<=n; j1++){
            H[i1][j1].i = tmp_array0[k];
            k++;
          }
        }

        /* transpose */

        for (i1=1; i1<=n; i1++){
          for (j1=i1+1; j1<=n; j1++){
            Ctmp1 = H[i1][j1];
            Ctmp2 = H[j1][i1];
            H[i1][j1] = Ctmp2;
            H[j1][i1] = Ctmp1;
          }
        }

          /****************************************************
                               store LDOS
          ****************************************************/

          k1 = T_KGrids1[kloop];
          k2 = T_KGrids2[kloop];
          k3 = T_KGrids3[kloop];

          i = Ti_KGrids1[kloop];
          j = Tj_KGrids2[kloop];
          k = Tk_KGrids3[kloop];

          for (l=iemin; l<=iemax; l++){

            /* initialize */
            for (MA_AN=1; MA_AN<=Matomnum; MA_AN++){
              GA_AN = M2G[MA_AN];
              wanA = WhatSpecies[GA_AN];
              tnoA = Spe_Total_CNO[wanA];
              for (LB_AN=0; LB_AN<=FNAN[GA_AN]; LB_AN++){
                GB_AN = natn[GA_AN][LB_AN];
                wanB = WhatSpecies[GB_AN];
                tnoB = Spe_Total_CNO[wanB];
                for (i1=0; i1<tnoA; i1++){
                  for (j1=0; j1<tnoB; j1++){
                    CDM[MA_AN][LB_AN][i1][j1] =0.0;
                  }
                }
              }
            }

            /* theta_ij = C_ni C_nj */

            for (MA_AN=1; MA_AN<=Matomnum; MA_AN++){

              GA_AN = M2G[MA_AN];
              wanA = WhatSpecies[GA_AN];
              tnoA = Spe_Total_CNO[wanA];
              Anum = MP[GA_AN];

              for (LB_AN=0; LB_AN<=FNAN[GA_AN]; LB_AN++){

                GB_AN = natn[GA_AN][LB_AN];
                RnB = ncn[GA_AN][LB_AN];
                wanB = WhatSpecies[GB_AN];
                tnoB = Spe_Total_CNO[wanB];
                Bnum = MP[GB_AN];
                l1 = atv_ijk[RnB][1];
                l2 = atv_ijk[RnB][2];
                l3 = atv_ijk[RnB][3];
                kRn = k1*l1 + k2*l2 + k3*l3;

                si = sin(2.0*PI*kRn);     /* imaginary part */
                co = cos(2.0*PI*kRn);     /* real part      */

                for (i1=0; i1<tnoA; i1++){
                  for (j1=0; j1<tnoB; j1++){

                    u2 = H[l][Anum+i1].r*H[l][Bnum+j1].r;
                    v2 = H[l][Anum+i1].i*H[l][Bnum+j1].i;
                    uv = H[l][Anum+i1].r*H[l][Bnum+j1].i;
                    vu = H[l][Anum+i1].i*H[l][Bnum+j1].r;

                    Redum = (u2 + v2);
                    Imdum = (uv - vu);
                    Redum2 = co*Redum - si*Imdum;
                    Resum  = Redum2;
                    CDM[MA_AN][LB_AN][i1][j1] += Resum;

              } /* j1 */
            }   /* i1 */

            }     /* LB_AN */
          }       /* MA_AN */

            /*******************************************
                 M_i = S_ij D_ji
                 D_ji = C_nj C_ni

                 S_ij : CntOLP
                 D_ji : CDM
            ******************************************/

            for (GA_AN=1; GA_AN<=atomnum; GA_AN++){
              wanA = WhatSpecies[GA_AN];
              tnoA = Spe_Total_CNO[wanA];
              for (i1=0; i1<tnoA; i1++){
                SD[GA_AN][i1] = 0.0;
              }
            }

            for (MA_AN=1; MA_AN<=Matomnum; MA_AN++){
              GA_AN = M2G[MA_AN];
              wanA = WhatSpecies[GA_AN];
              tnoA = Spe_Total_CNO[wanA];
              for (i1=0; i1<tnoA; i1++){
                for (LB_AN=0; LB_AN<=FNAN[GA_AN]; LB_AN++){
                  GB_AN = natn[GA_AN][LB_AN];
                  wanB = WhatSpecies[GB_AN];
                  tnoB = Spe_Total_CNO[wanB];
                  for (j1=0; j1<tnoB; j1++){
                     SD[GA_AN][i1] += CDM[MA_AN][LB_AN][i1][j1]*
                                   CntOLP[MA_AN][LB_AN][i1][j1];
                  }
                }
              }
            }

            /*  writing a binary file */

          i_vec[0] = Ti_KGrids1[kloop];
            i_vec[1] = Tj_KGrids2[kloop];
            i_vec[2] = Tk_KGrids3[kloop];

          if (myid==Host_ID){
              fwrite(i_vec,sizeof(int),3,fp_ev);
          }
            MPI_Barrier(mpi_comm_level1);

          for (GA_AN=1; GA_AN<=atomnum; GA_AN++){
            wanA = WhatSpecies[GA_AN];
            tnoA = Spe_Total_CNO[wanA];
            ID2 = G2ID[GA_AN];
                   
            if (ID2==myid){
            for (i1=0; i1<tnoA; i1++){
              fSD[i1]=(float)SD[GA_AN][i1];
            }

            if (ID2!=Host_ID){
                  tag = 999;
              MPI_Isend(&fSD[0],tnoA,MPI_FLOAT,Host_ID,tag,mpi_comm_level1,&request);
              MPI_Wait(&request,&stat);
            }
            }

            if (myid==Host_ID && ID2!=Host_ID){
                tag = 999;
            MPI_Recv(&fSD[0], tnoA, MPI_FLOAT, ID2, tag, mpi_comm_level1, &stat);
            }

            if (myid==Host_ID) fwrite(fSD,sizeof(float),tnoA,fp_ev);
              MPI_Barrier(mpi_comm_level1);

          }
        }      /* l             */
      }        /* spin          */
      }          /* if (0<=kloop) */
    }            /* ID            */
  }              /* kloop0        */

  /****************************************************
     MPI:

     EIGEN
  ****************************************************/

  tmp = (double)T_knum/(double)numprocs; 

  for (spin=0; spin<=SpinP_switch; spin++){
    for (kloop=0; kloop<T_knum; kloop++){

      for (ID=0; ID<numprocs; ID++){

      if (T_knum<=ID){
        s1 = -10;
        e1 = -100;
      }
      else if (T_knum<numprocs) {
        s1 = ID;
        e1 = ID;
      }
      else {
        s1 = (int)((double)ID*(tmp+0.0001)); 
          e1 = (int)((double)(ID+1)*(tmp+0.0001)) - 1;
          if (ID==(numprocs-1)) e1 = T_knum - 1;
          if (e1<0)             e1 = 0;
      }

        if (s1<=kloop && kloop<=e1)  ID1 = ID;                    
      }

      MPI_Bcast(&EIGEN[spin][kloop][0], n+1, MPI_DOUBLE, ID1, mpi_comm_level1);
    }
  }


  if (myid==Host_ID){
    if (fp_eig) {

      for (kloop=0; kloop<T_knum; kloop++){
      for (spin=0; spin<=SpinP_switch; spin++){

          i = Ti_KGrids1[kloop];
          j = Tj_KGrids2[kloop];
          k = Tk_KGrids3[kloop];

        fprintf(fp_eig,"%d %d %d ",i,j,k);
        for (ie=iemin;ie<=iemax;ie++) {
          fprintf(fp_eig,"%lf ",EIGEN[spin][kloop][ie]);
        }
        fprintf(fp_eig,"\n");

      }
      }

      fprintf(fp_eig,"Eigenvalues>\n");
    }
  }

Finishing: 

  if (myid==Host_ID){
    if (fp_eig) { 
      fclose(fp_eig);
    }
    if (fp_ev) {
      fclose(fp_ev);
    }
  }

  /****************************************************
                       free arrays
  ****************************************************/

  free(MP);
  free(arpo);

  for (i=0; i<i_ko[0]; i++){
    free(ko[i]);
  }
  free(ko);

  free(koS);

  for (i=0; i<i_H[0]; i++){
    free(H[i]);
  }
  free(H);

  for (i=0; i<i_S[0]; i++){
    free(S[i]);
  }
  free(S);

  free(M1);

  for (i=0; i<i_C[0]; i++){
    free(C[i]);
  }
  free(C);

  free(KGrids1); free(KGrids2);free(KGrids3);
  
  for (i=0; i<i_CDM[0]; i++){
    for (j=0; j<i_CDM[1]; j++){
      for (k=0; k<i_CDM[2]; k++){
        free(CDM[i][j][k]);
      }
      free(CDM[i][j]);
    }
    free(CDM[i]);
  }
  free(CDM);

  for (i=0; i<i_SD[0]; i++){
    free(SD[i]);
  }
  free(SD);

  free(fSD);

  free(tmp_array0);

  for (j=0; j<n+1; j++){
    free(TmpM[j]);
  }
  free(TmpM);

  for (i=0; i<List_YOUSO[23]; i++){
    for (j=0; j<n+1; j++){
      free(H2[i][j]);
    }
    free(H2[i]);
  }
  free(H2);

  /* no spin-orbit coupling */
  if (SO_switch==0){
    free(Dummy_ImNL[0][0][0]);
    free(Dummy_ImNL[0][0]);
    free(Dummy_ImNL[0]);
    free(Dummy_ImNL);
  }

  free(T_KGrids1);
  free(T_KGrids2);
  free(T_KGrids3);

  free(Ti_KGrids1);
  free(Tj_KGrids2);
  free(Tk_KGrids3);

  for (i=0; i<List_YOUSO[23]; i++){
    for (j=0; j<T_knum; j++){
      free(EIGEN[i][j]);
    }
    free(EIGEN[i]);
  }
  free(EIGEN);

  /* for elapsed time */

  dtime(&TEtime);
  time0 = TEtime - TStime;
  return time0;

}







static double Band_DFT_Dosout_NonCol(
                           int knum_i, int knum_j, int knum_k,
                           int SpinP_switch,
                           double *****nh,
                           double *****ImNL,
                           double ****CntOLP)
{
  int i,j,k,spin,l,i1,j1,n1;
  int n, wanA,ii1,jj1,m,n2;
  int *MP;
  int MA_AN, GA_AN, tnoA,Anum, LB_AN;
  int GB_AN, wanB, tnoB, Bnum, RnB;
  int l1,l2,l3;
 
  int ie,iemin,iemax,iemin0,iemax0,n1min,mn;
  int MaxL,e1,s1,T_knum,S_knum,E_knum;
  int kloop,kloop0,num_kloop0;
  int i_vec[10];

  double EV_cut0;
  double sum,sumi,u2,v2,uv,vu,tmp;
  double kRn,si,co,Redum,Imdum,Redum2,Resum,Imdum2;
  double theta,phi,sit,cot,sip,cop,tmp1,tmp2,tmp3;
  double TStime,TEtime,time0;
  double OLP_eigen_cut=Threshold_OLP_Eigen;

  double *ko; int N_ko, i_ko[10];
  double *koS;
  dcomplex **H;  int N_H,  i_H[10];
  dcomplex **S;  int N_S,  i_S[10];
  dcomplex Ctmp1,Ctmp2;
  double *M1,**EIGEN;
  dcomplex **C;  int N_C,  i_C[10];
  double *KGrids1, *KGrids2, *KGrids3;
  double *****CDM; int N_CDM, i_CDM[10];
  double ***SD; int N_SD, i_SD[10];
  double **SDup,**SDdn;
  double *tmp_array0;
  dcomplex **H2,**TmpM;
  double *T_KGrids1,*T_KGrids2,*T_KGrids3;
  int *Ti_KGrids1,*Tj_KGrids2,*Tk_KGrids3,*arpo;

  float *fSD;
  double *****Dummy_ImNL;

  double snum_i, snum_j, snum_k; 
  double k1,k2,k3;

  char file_ev[YOUSO10],file_eig[YOUSO10];
  FILE *fp_ev,*fp_eig;
  char buf1[fp_bsize];          /* setvbuf */
  char buf2[fp_bsize];          /* setvbuf */
  int numprocs,myid,ID,ID1,ID2,tag;

  MPI_Status stat;
  MPI_Request request;

  /* MPI */
  MPI_Comm_size(mpi_comm_level1,&numprocs);
  MPI_Comm_rank(mpi_comm_level1,&myid);
  MPI_Barrier(mpi_comm_level1);

  if (myid==Host_ID){
    printf("\n<Band_DFT_Dosout>\n"); fflush(stdout);
  }

  dtime(&TStime);

  /****************************************************
             calculation of the array size
  ****************************************************/

  n = 0;
  for (i=1; i<=atomnum; i++){
    wanA  = WhatSpecies[i];
    n  = n + Spe_Total_CNO[wanA];
  }
  n2 = 2*n + 2;

  /****************************************************
   Allocation

   int       MP[List_YOUSO[1]]
   double    ko[n2]
   double    koS[n+1];
   dcomplex  H[n2][n2]
   dcomplex  S[n2][n2]
   double    M1[n2]
   dcomplex  C[n2][n2]
   double    KGrids1[knum_i]
   double    KGrids2[knum_j]
   double    KGrids3[knum_k]
   double    CDM[4][Matomnum+1][List_YOUSO[8]]
                [List_YOUSO[7]][List_YOUSO[7]]
   double    SD[4][List_YOUSO[1]][List_YOUSO[7]]
   float     fSD[List_YOUSO[7]*2]
   double    Htmp[n2][n2]
  ****************************************************/

  MP = (int*)malloc(sizeof(int)*List_YOUSO[1]);

  arpo = (int*)malloc(sizeof(int)*numprocs);

  ko = (double*)malloc(sizeof(double)*n2);
  koS = (double*)malloc(sizeof(double)*(n+1));

  H = (dcomplex**)malloc(sizeof(dcomplex*)*n2);
  for (j=0; j<n2; j++){
    H[j] = (dcomplex*)malloc(sizeof(dcomplex)*n2);
  }

  S = (dcomplex**)malloc(sizeof(dcomplex*)*n2);
  for (i=0; i<n2; i++){
    S[i] = (dcomplex*)malloc(sizeof(dcomplex)*n2);
  }

  M1 = (double*)malloc(sizeof(double)*n2);

  tmp_array0 = (double*)malloc(sizeof(double)*2*n2*n2);

  C = (dcomplex**)malloc(sizeof(dcomplex*)*n2);
  for (j=0; j<n2; j++){
    C[j] = (dcomplex*)malloc(sizeof(dcomplex)*n2);
  }

  KGrids1 = (double*)malloc(sizeof(double)*knum_i);
  KGrids2 = (double*)malloc(sizeof(double)*knum_j);
  KGrids3 = (double*)malloc(sizeof(double)*knum_k);

  N_CDM=5; i_CDM[0]=4; i_CDM[1]=Matomnum+1; i_CDM[2]=List_YOUSO[8];
  i_CDM[3]=List_YOUSO[7]; i_CDM[4]=List_YOUSO[7];
  CDM=(double*****)malloc_multidimarray("double",N_CDM,i_CDM);

  N_SD=3;  i_SD[0]=4; i_SD[1]=List_YOUSO[1]; i_SD[2]=List_YOUSO[7];
  SD=(double***)malloc_multidimarray("double",N_SD,i_SD);

  SDup = (double**)malloc(sizeof(double*)*List_YOUSO[1]); 
  for (i=0; i<List_YOUSO[1]; i++){ 
    SDup[i] = (double*)malloc(sizeof(double)*List_YOUSO[7]); 
  }

  SDdn = (double**)malloc(sizeof(double*)*List_YOUSO[1]); 
  for (i=0; i<List_YOUSO[1]; i++){ 
    SDdn[i] = (double*)malloc(sizeof(double)*List_YOUSO[7]); 
  }

  fSD=(float*)malloc(sizeof(float)*List_YOUSO[7]*2);

  TmpM = (dcomplex**)malloc(sizeof(dcomplex*)*n2);
  for (j=0; j<n2; j++){
    TmpM[j] = (dcomplex*)malloc(sizeof(dcomplex)*n2);
  }

  H2 = (dcomplex**)malloc(sizeof(dcomplex*)*n2);
  for (j=0; j<n2; j++){
    H2[j] = (dcomplex*)malloc(sizeof(dcomplex)*n2);
  }

  /* non-spin-orbit coupling and non-LDA+U */
  if (SO_switch==0 && Hub_U_switch==0 && Constraint_NCS_switch==0 
       && Zeeman_NCS_switch==0 && Zeeman_NCO_switch==0){
    Dummy_ImNL = (double*****)malloc(sizeof(double****)*1);
    Dummy_ImNL[0] = (double****)malloc(sizeof(double***)*1);
    Dummy_ImNL[0][0] = (double***)malloc(sizeof(double**)*1);
    Dummy_ImNL[0][0][0] = (double**)malloc(sizeof(double*)*1);
    Dummy_ImNL[0][0][0][0] = (double*)malloc(sizeof(double)*1);
  }

  /****************************************************
    set up k-grids
  ****************************************************/

  snum_i = knum_i;
  snum_j = knum_j;
  snum_k = knum_k;

  for (i=0; i<=(knum_i-1); i++){
    if (knum_i==1){
      k1 = 0.0;
    }
    else {
      k1 = -0.5 + (2.0*(double)i+1.0)/(2.0*snum_i) + Shift_K_Point;
    }
    KGrids1[i]=k1;
  }
  for (i=0; i<=(knum_j-1); i++){
    if (knum_j==1){
      k1 = 0.0;
    }
    else {
      k1 = -0.5 + (2.0*(double)i+1.0)/(2.0*snum_j) - Shift_K_Point;
    }
    KGrids2[i]=k1;
  }
  for (i=0; i<=(knum_k-1); i++){
    if (knum_k==1){
      k1 = 0.0;
    }
    else {
      k1 = -0.5 + (2.0*(double)i+1.0)/(2.0*snum_k) + 2.0*Shift_K_Point;
    }
    KGrids3[i]=k1;
  }

  if (myid==Host_ID){
    printf(" KGrids1: ");
    for (i=0;i<=knum_i-1;i++) printf("%f ",KGrids1[i]);
    printf("\n");
    printf(" KGrids2: ");
    for (i=0;i<=knum_j-1;i++) printf("%f ",KGrids2[i]);
    printf("\n");
    printf(" KGrids3: ");
    for (i=0;i<=knum_k-1;i++) printf("%f ",KGrids3[i]);
    printf("\n");
  }

  /***********************************
       one-dimentionalize for MPI
  ************************************/

  T_knum = 0;
  for (i=0; i<=(knum_i-1); i++){
    for (j=0; j<=(knum_j-1); j++){
      for (k=0; k<=(knum_k-1); k++){
        T_knum++;
      }
    }
  }

  T_KGrids1 = (double*)malloc(sizeof(double)*T_knum);
  T_KGrids2 = (double*)malloc(sizeof(double)*T_knum);
  T_KGrids3 = (double*)malloc(sizeof(double)*T_knum);

  Ti_KGrids1 = (int*)malloc(sizeof(int)*T_knum);
  Tj_KGrids2 = (int*)malloc(sizeof(int)*T_knum);
  Tk_KGrids3 = (int*)malloc(sizeof(int)*T_knum);

  EIGEN = (double**)malloc(sizeof(double*)*T_knum);
  for (j=0; j<T_knum; j++){
    EIGEN[j] = (double*)malloc(sizeof(double)*n2);
  }

  /* set T_KGrid1,2,3 */

  T_knum = 0;
  for (i=0; i<=(knum_i-1); i++){
    for (j=0; j<=(knum_j-1); j++){
      for (k=0; k<=(knum_k-1); k++){

      T_KGrids1[T_knum] = KGrids1[i];
      T_KGrids2[T_knum] = KGrids2[j];
      T_KGrids3[T_knum] = KGrids3[k];

      Ti_KGrids1[T_knum] = i;
      Tj_KGrids2[T_knum] = j;
      Tk_KGrids3[T_knum] = k;

        T_knum++;
      }
    }
  }

  /* allocate k-points into proccessors */

  if (T_knum<=myid){
    S_knum = -10;
    E_knum = -100;
    num_kloop0 = 1;
  }
  else if (T_knum<numprocs) {
    S_knum = myid;
    E_knum = myid;
    num_kloop0 = 1;
  }
  else {
    tmp = (double)T_knum/(double)numprocs; 
    num_kloop0 = (int)tmp + 1;

    S_knum = (int)((double)myid*(tmp+0.0001)); 
    E_knum = (int)((double)(myid+1)*(tmp+0.0001)) - 1;
    if (myid==(numprocs-1)) E_knum = T_knum - 1;
    if (E_knum<0)           E_knum = 0;
  }

  /****************************************************************
                      find iemin and iemax
  *****************************************************************/

  iemin=n2; iemax=1; n1min=1;

  k1 = 0.0;
  k2 = 0.0;
  k3 = 0.0;

  Overlap_Band(Host_ID,CntOLP,S,MP,k1,k2,k3);

  if (myid==Host_ID){

    n = S[0][0].r;
    EigenBand_lapack(S, ko, n);

    if (2<=level_stdout){
      printf("  k1 k2 k3 %10.6f %10.6f %10.6f\n",k1,k2,k3);
      for (i1=1; i1<=n; i1++){
      printf("  Eigenvalues of OLP  %2d  %15.12f\n",i1,ko[i1]);
      }
    }

    /* minus eigenvalues to 1.0e-14 */
    for (l=1; l<=n; l++){
      if (ko[l]<0.0){
      ko[l] = 1.0e-14;

      if (2<=level_stdout){
        printf("found an eigenvalue smaller than %10.8f of OLP kloop=%2d\n",
             Threshold_OLP_Eigen,kloop);
      }
      }

      koS[l] = ko[l];
    }

    /* calculate S*1/sqrt(ko) */

    for (l=1; l<=n; l++) M1[l] = 1.0/sqrt(ko[l]);

    /* S * M1  */

    for (i1=1; i1<=n; i1++){
      for (j1=1; j1<=n; j1++){
      S[i1][j1].r = S[i1][j1].r*M1[j1];
      S[i1][j1].i = S[i1][j1].i*M1[j1];
      } 
    } 

  } /* if (myid==Host_ID) */

  /****************************************************
             make a full Hamiltonian matrix
  ****************************************************/

  /* non-spin-orbit coupling and non-LDA+U */
  if (SO_switch==0 && Hub_U_switch==0 && Constraint_NCS_switch==0 
      && Zeeman_NCS_switch==0 && Zeeman_NCO_switch==0)
    Hamiltonian_Band_NC(Host_ID, nh, Dummy_ImNL, H, MP, k1, k2, k3);

  /* spin-orbit coupling or LDA+U */
  else
    Hamiltonian_Band_NC(Host_ID, nh, ImNL, H, MP, k1, k2, k3);

  if (myid==Host_ID){

    /****************************************************
                    M1 * U^+ * H * U * M1
    ****************************************************/

    /* transpose S */

    for (i1=1; i1<=n; i1++){
      for (j1=i1+1; j1<=n; j1++){
      Ctmp1 = S[i1][j1];
      Ctmp2 = S[j1][i1];
      S[i1][j1] = Ctmp2;
      S[j1][i1] = Ctmp1;
      }
    }

    /* H * U * M1 */

    for (i1=1; i1<=2*n; i1++){
      for (j1=1; j1<=n; j1++){

      for (m=0; m<=1; m++){

        sum  = 0.0;
        sumi = 0.0;

          mn = m*n;

        for (l=1; l<=n; l++){
          sum  += H[i1][l+mn].r*S[j1][l].r - H[i1][l+mn].i*S[j1][l].i;
          sumi += H[i1][l+mn].r*S[j1][l].i + H[i1][l+mn].i*S[j1][l].r;
        }

        jj1 = 2*j1 - 1 + m;

        C[jj1][i1].r = sum;
        C[jj1][i1].i = sumi;
      }
      }
    }     

    /* M1 * U^+ H * U * M1 */

    for (i1=1; i1<=n; i1++){

      for (m=0; m<=1; m++){

      ii1 = 2*i1 - 1 + m;

      for (j1=1; j1<=2*n; j1++){

        sum  = 0.0;
        sumi = 0.0;

          mn = m*n;

        for (l=1; l<=n; l++){
          sum  +=  S[i1][l].r*C[j1][l+mn].r + S[i1][l].i*C[j1][l+mn].i;
          sumi +=  S[i1][l].r*C[j1][l+mn].i - S[i1][l].i*C[j1][l+mn].r;
        }
        H[ii1][j1].r = sum;
        H[ii1][j1].i = sumi;
      }
      }
    }     

    /* H to C */

    for (i1=1; i1<=2*n; i1++){
      for (j1=1; j1<=2*n; j1++){
      C[i1][j1].r = H[i1][j1].r;
      C[i1][j1].i = H[i1][j1].i;
      }
    }

    /* penalty for ill-conditioning states */

    EV_cut0 = Threshold_OLP_Eigen;

    for (i1=1; i1<=n; i1++){

      if (koS[i1]<EV_cut0){
      C[2*i1-1][2*i1-1].r += pow((koS[i1]/EV_cut0),-2.0) - 1.0;
      C[2*i1  ][2*i1  ].r += pow((koS[i1]/EV_cut0),-2.0) - 1.0;
      }

      /* cutoff the interaction between the ill-conditioned state */

      if (1.0e+3<C[2*i1-1][2*i1-1].r){
      for (j1=1; j1<=2*n; j1++){
        C[2*i1-1][j1    ] = Complex(0.0,0.0);
        C[j1    ][2*i1-1] = Complex(0.0,0.0);
        C[2*i1  ][j1    ] = Complex(0.0,0.0);
        C[j1    ][2*i1  ] = Complex(0.0,0.0);
      }
      C[2*i1-1][2*i1-1] = Complex(1.0e+4,0.0);
      C[2*i1  ][2*i1  ] = Complex(1.0e+4,0.0);
      }
    }

    /* solve eigenvalue problem */

    n1 = 2*n;
    EigenBand_lapack(C, ko, n1);

    if (n1min<n1) n1min=n1;

    iemin0=1;
    for (i1=1;i1<=n1;i1++) {
      if (ko[i1]> (ChemP+Dos_Erange[0])) {
      iemin0=i1-1;
      break;
      }
    }
    if (iemin0<1) iemin0=1;

    iemax0=n1;
    for (i1=iemin0;i1<=n1;i1++) {
      if (ko[i1]> (ChemP+Dos_Erange[1])) {
      iemax0=i1;
      break;
      }
    }
    if (iemax0>n1) iemax0=n1;

    if (iemin>iemin0) iemin=iemin0;
    if (iemax<iemax0) iemax=iemax0;

  }   /* if (myid==Host_ID) */

  /* add a buffer to iemin and iemax */

  iemin -= 5;
  iemax += 5;

  if (iemin<1)  iemin = 1;
  if (n2<iemax) iemax = n2;

  if (myid==Host_ID){
    if (n1min<iemax) iemax=n1min;
    printf(" iemin and iemax= %d %d\n",iemin,iemax);fflush(stdout);
  }

  /* MPI, iemin, iemax */
  MPI_Barrier(mpi_comm_level1);
  MPI_Bcast(&iemin, 1, MPI_INT, Host_ID, mpi_comm_level1);
  MPI_Bcast(&iemax, 1, MPI_INT, Host_ID, mpi_comm_level1);

  /****************************************************************
                     eigenvalues and eigenvectors
   ****************************************************************/

  if (myid==Host_ID){
    strcpy(file_eig,".Dos.val");
    fnjoint(filepath,filename,file_eig);
    if ( (fp_eig=fopen(file_eig,"w"))==NULL ) {

#ifdef xt3
      setvbuf(fp_eig,buf1,_IOFBF,fp_bsize);  /* setvbuf */
#endif

      printf("can not open a file %s\n",file_eig);
    }

    strcpy(file_ev,".Dos.vec");
    fnjoint(filepath,filename,file_ev);
    if ( (fp_ev=fopen(file_ev,"w"))==NULL ) {

#ifdef xt3
      setvbuf(fp_ev,buf2,_IOFBF,fp_bsize);  /* setvbuf */
#endif

      printf("can not open a file %s\n",file_ev);
    }
    if ( fp_eig==NULL || fp_ev==NULL ) {
      goto Finishing;
    }
  }

  if (myid==Host_ID){

    if (fp_eig) {
      fprintf(fp_eig,"mode        1\n");
      fprintf(fp_eig,"NonCol      1\n");
      fprintf(fp_eig,"N           %d\n",n);
      fprintf(fp_eig,"Nspin       %d\n",1); /* switch to 1 */
      fprintf(fp_eig,"Erange      %lf %lf\n",Dos_Erange[0],Dos_Erange[1]);
      fprintf(fp_eig,"Kgrid       %d %d %d\n",knum_i,knum_j,knum_k);
      fprintf(fp_eig,"atomnum     %d\n",atomnum);
      fprintf(fp_eig,"<WhatSpecies\n");
      for (i=1;i<=atomnum;i++) {
        fprintf(fp_eig,"%d ",WhatSpecies[i]);
      }
      fprintf(fp_eig,"\nWhatSpecies>\n");
      fprintf(fp_eig,"SpeciesNum     %d\n",SpeciesNum);
      fprintf(fp_eig,"<Spe_Total_CNO\n");
      for (i=0;i<SpeciesNum;i++) {
        fprintf(fp_eig,"%d ",Spe_Total_CNO[i]);
      }
      fprintf(fp_eig,"\nSpe_Total_CNO>\n");

      MaxL=Supported_MaxL; 

      fprintf(fp_eig,"MaxL           %d\n",Supported_MaxL);
      fprintf(fp_eig,"<Spe_Num_CBasis\n");
      for (i=0;i<SpeciesNum;i++) {
        for (l=0;l<=MaxL;l++) {
          fprintf(fp_eig,"%d ",Spe_Num_CBasis[i][l]);
        }
        fprintf(fp_eig,"\n");
      }
      fprintf(fp_eig,"Spe_Num_CBasis>\n");

      fprintf(fp_eig,"ChemP       %lf\n",ChemP);

      fprintf(fp_eig,"<SpinAngle\n");
      for (i=1; i<=atomnum; i++) {
        fprintf(fp_eig,"%lf %lf\n",Angle0_Spin[i],Angle1_Spin[i]);
      }
      fprintf(fp_eig,"SpinAngle>\n");

      printf(" write eigenvalues\n");
      printf(" write eigenvectors\n");
    }
  }

  /* for kloop */

  for (kloop0=0; kloop0<num_kloop0; kloop0++){

    kloop = kloop0 + S_knum;
    arpo[myid] = -1;
    if (S_knum<=kloop && kloop<=E_knum) arpo[myid] = kloop;
    for (ID=0; ID<numprocs; ID++){
      MPI_Bcast(&arpo[ID], 1, MPI_INT, ID, mpi_comm_level1);
    }

    /* set S */

    for (ID=0; ID<numprocs; ID++){

      kloop = arpo[ID];

      if (0<=kloop){

        k1 = T_KGrids1[kloop];
        k2 = T_KGrids2[kloop];
        k3 = T_KGrids3[kloop];

        Overlap_Band(ID,CntOLP,TmpM,MP,k1,k2,k3);
        n = TmpM[0][0].r;

        if (myid==ID){
        for (i1=1; i1<=n; i1++){
          for (j1=1; j1<=n; j1++){
            S[i1][j1].r = TmpM[i1][j1].r;
            S[i1][j1].i = TmpM[i1][j1].i;
          } 
        } 
        } 

      }
    }

    kloop = arpo[myid];

    if (0<=kloop){

      EigenBand_lapack(S,ko,n);

      if (2<=level_stdout && 0<=kloop){
      printf("  kloop %2d  k1 k2 k3 %10.6f %10.6f %10.6f\n",
             kloop,T_KGrids1[kloop],T_KGrids2[kloop],T_KGrids3[kloop]);
      for (i1=1; i1<=n; i1++){
        printf("  Eigenvalues of OLP  %2d  %15.12f\n",i1,ko[i1]);
      }
      }

      /* minus eigenvalues to 1.0e-14 */
      for (l=1; l<=n; l++){
      if (ko[l]<0.0){
        ko[l] = 1.0e-14;

        if (2<=level_stdout){
          printf("found an eigenvalue smaller than %10.8f of OLP kloop=%2d\n",
               Threshold_OLP_Eigen,kloop);
        }
      }

      koS[l] = ko[l];
      }

      /* calculate S*1/sqrt(ko) */

      for (l=1; l<=n; l++) M1[l] = 1.0/sqrt(ko[l]);

      /* S * M1  */

      for (i1=1; i1<=n; i1++){
      for (j1=1; j1<=n; j1++){
        S[i1][j1].r = S[i1][j1].r*M1[j1];
        S[i1][j1].i = S[i1][j1].i*M1[j1];
      } 
      } 

    }

    /****************************************************
       make a full Hamiltonian matrix
    ****************************************************/

    for (ID=0; ID<numprocs; ID++){

      kloop = arpo[ID];

      if (0<=kloop){
        k1 = T_KGrids1[kloop];
        k2 = T_KGrids2[kloop];
        k3 = T_KGrids3[kloop];

        /* non-spin-orbit coupling and non-LDA+U */  
        if (SO_switch==0 && Hub_U_switch==0 && Constraint_NCS_switch==0 
            && Zeeman_NCS_switch==0 && Zeeman_NCO_switch==0)
          Hamiltonian_Band_NC(ID, nh, Dummy_ImNL, TmpM, MP, k1, k2, k3);

        /* spin-orbit coupling or LDA+U */  
        else
          Hamiltonian_Band_NC(ID, nh,       ImNL, TmpM, MP, k1, k2, k3);

      if (myid==ID){
        for (i1=1; i1<=2*n; i1++){
          for (j1=1; j1<=2*n; j1++){
            H[i1][j1].r = TmpM[i1][j1].r;
            H[i1][j1].i = TmpM[i1][j1].i;
          } 
        } 
      } 
      }
    }

    kloop = arpo[myid];

    if (0<=kloop){

      /****************************************************
                     M1 * U^t * H * U * M1
      ****************************************************/

      /* transpose S */

      for (i1=1; i1<=n; i1++){
      for (j1=i1+1; j1<=n; j1++){
        Ctmp1 = S[i1][j1];
        Ctmp2 = S[j1][i1];
        S[i1][j1] = Ctmp2;
        S[j1][i1] = Ctmp1;
      }
      }

      /* H * U * M1 */

      for (i1=1; i1<=2*n; i1++){
      for (j1=1; j1<=n; j1++){

        for (m=0; m<=1; m++){

          sum  = 0.0;
          sumi = 0.0;
            mn = m*n;

          for (l=1; l<=n; l++){

            sum  += H[i1][l+mn].r*S[j1][l].r - H[i1][l+mn].i*S[j1][l].i;
            sumi += H[i1][l+mn].r*S[j1][l].i + H[i1][l+mn].i*S[j1][l].r;
          }

          jj1 = 2*j1 - 1 + m;

          C[jj1][i1].r = sum;
          C[jj1][i1].i = sumi;
        }
      }
      }     

      /* M1 * U^+ H * U * M1 */

      for (i1=1; i1<=n; i1++){

      for (m=0; m<=1; m++){

        ii1 = 2*i1 - 1 + m;

        for (j1=1; j1<=2*n; j1++){

          sum  = 0.0;
          sumi = 0.0;

            mn = m*n;

          for (l=1; l<=n; l++){
            sum  +=  S[i1][l].r*C[j1][l+mn].r + S[i1][l].i*C[j1][l+mn].i;
            sumi +=  S[i1][l].r*C[j1][l+mn].i - S[i1][l].i*C[j1][l+mn].r;
          }
          H[ii1][j1].r = sum;
          H[ii1][j1].i = sumi;
        }
      }
      }     

      /* H to C */

      for (i1=1; i1<=2*n; i1++){
      for (j1=1; j1<=2*n; j1++){
        C[i1][j1].r = H[i1][j1].r;
        C[i1][j1].i = H[i1][j1].i;
      }
      }

      /* penalty for ill-conditioning states */

      EV_cut0 = Threshold_OLP_Eigen;

      for (i1=1; i1<=n; i1++){

      if (koS[i1]<EV_cut0){
        C[2*i1-1][2*i1-1].r += pow((koS[i1]/EV_cut0),-2.0) - 1.0;
        C[2*i1  ][2*i1  ].r += pow((koS[i1]/EV_cut0),-2.0) - 1.0;
      }

      /* cutoff the interaction between the ill-conditioned state */

      if (1.0e+3<C[2*i1-1][2*i1-1].r){
        for (j1=1; j1<=2*n; j1++){
          C[2*i1-1][j1    ] = Complex(0.0,0.0);
          C[j1    ][2*i1-1] = Complex(0.0,0.0);
          C[2*i1  ][j1    ] = Complex(0.0,0.0);
          C[j1    ][2*i1  ] = Complex(0.0,0.0);
        }
        C[2*i1-1][2*i1-1] = Complex(1.0e+4,0.0);
        C[2*i1  ][2*i1  ] = Complex(1.0e+4,0.0);
      }
      }

      /* solve eigenvalue problem */

      n1 = 2*n;
      EigenBand_lapack(C, ko, n1);

      for (i1=1; i1<=2*n; i1++) EIGEN[kloop][i1] = ko[i1];

      /****************************************************
         Transformation to the original eigenvectors.
         NOTE JRCAT-244p and JAIST-2122p 
         C = U * lambda^{-1/2} * D
      ****************************************************/

      /* transpose */
      for (i1=1; i1<=n; i1++){
      for (j1=i1+1; j1<=n; j1++){
        Ctmp1 = S[i1][j1];
        Ctmp2 = S[j1][i1];
        S[i1][j1] = Ctmp2;
        S[j1][i1] = Ctmp1;
      }
      }

      for (i1=1; i1<=2*n; i1++){
      for (j1=1; j1<=2*n; j1++){
        H2[i1][j1].r = 0.0;
        H2[i1][j1].i = 0.0;
      }
      }

      for (m=0; m<=1; m++){
      for (i1=1; i1<=n; i1++){
        for (j1=1; j1<=n1; j1++){

          sum  = 0.0; 
          sumi = 0.0;

          for (l=1; l<=n; l++){
            sum  +=  S[i1][l].r*C[2*(l-1)+1+m][j1].r
                   - S[i1][l].i*C[2*(l-1)+1+m][j1].i;
            sumi +=  S[i1][l].r*C[2*(l-1)+1+m][j1].i
                 + S[i1][l].i*C[2*(l-1)+1+m][j1].r;
          } 
          H2[i1+m*n][j1].r = sum;
          H2[i1+m*n][j1].i = sumi;
        }
      }
      }

    } /* if (0<=kloop) */

    /****************************************************
        store LDOS
    ****************************************************/

    for (ID=0; ID<numprocs; ID++){

      kloop = arpo[ID];

      if (0<=kloop){


      /* MPI: H2 */

      k = 0;

      for (i1=0; i1<=2*n; i1++){
        for (j1=0; j1<=2*n; j1++){
          tmp_array0[k] = H2[i1][j1].r;
          k++;
        }
      }
         
      for (i1=0; i1<=2*n; i1++){
        for (j1=0; j1<=2*n; j1++){
          tmp_array0[k] = H2[i1][j1].i;
          k++;
        }
      }

      MPI_Barrier(mpi_comm_level1);
      MPI_Bcast(&tmp_array0[0], 2*(2*n+1)*(2*n+1), MPI_DOUBLE, ID, mpi_comm_level1);

      k = 0;

        /* transpose */
      for (i1=0; i1<=2*n; i1++){
        for (j1=0; j1<=2*n; j1++){
          H[j1][i1].r = tmp_array0[k];
          k++;
        }
      }

        /* transpose */
      for (i1=0; i1<=2*n; i1++){
        for (j1=0; j1<=2*n; j1++){
          H[j1][i1].i = tmp_array0[k];
          k++;
        }
      }

        /****************************************************
         store LDOS
        ****************************************************/

      k1 = T_KGrids1[kloop];
      k2 = T_KGrids2[kloop];
      k3 = T_KGrids3[kloop];

      i = Ti_KGrids1[kloop];
      j = Tj_KGrids2[kloop];
      k = Tk_KGrids3[kloop];

      for (l=iemin; l<=iemax; l++){

        /* theta_ij = C_ni C_nj */

        for (MA_AN=1; MA_AN<=Matomnum; MA_AN++){
          GA_AN = M2G[MA_AN];
          wanA = WhatSpecies[GA_AN];
          tnoA = Spe_Total_CNO[wanA];
          Anum = MP[GA_AN];
          for (LB_AN=0; LB_AN<=FNAN[GA_AN]; LB_AN++){
            GB_AN = natn[GA_AN][LB_AN];
            RnB = ncn[GA_AN][LB_AN];
            wanB = WhatSpecies[GB_AN];
            tnoB = Spe_Total_CNO[wanB];
            Bnum = MP[GB_AN];
            l1 = atv_ijk[RnB][1];
            l2 = atv_ijk[RnB][2];
            l3 = atv_ijk[RnB][3];
            kRn = k1*l1 + k2*l2 + k3*l3;
            si = sin(2.0*PI*kRn);  /* Imaginary part */
            co = cos(2.0*PI*kRn);  /* Real part      */

            for (i1=0; i1<tnoA; i1++){
            for (j1=0; j1<tnoB; j1++){

              /* Re11 */
              u2 = H[l][Anum+i1].r*H[l][Bnum+j1].r;
              v2 = H[l][Anum+i1].i*H[l][Bnum+j1].i;
              uv = H[l][Anum+i1].r*H[l][Bnum+j1].i;
              vu = H[l][Anum+i1].i*H[l][Bnum+j1].r;
              Redum = (u2 + v2);
              Imdum = (uv - vu);
              Redum2 = co*Redum - si*Imdum;
              CDM[0][MA_AN][LB_AN][i1][j1] = Redum2;

              /* Re22 */
              u2 = H[l][Anum+i1+n].r*H[l][Bnum+j1+n].r;
              v2 = H[l][Anum+i1+n].i*H[l][Bnum+j1+n].i;
              uv = H[l][Anum+i1+n].r*H[l][Bnum+j1+n].i;
              vu = H[l][Anum+i1+n].i*H[l][Bnum+j1+n].r;
              Redum = (u2 + v2);
              Imdum = (uv - vu);
              Redum2 = co*Redum - si*Imdum;
              CDM[1][MA_AN][LB_AN][i1][j1] = Redum2;

              /* Re12 */
 
              u2 = H[l][Anum+i1].r*H[l][Bnum+j1+n].r;
              v2 = H[l][Anum+i1].i*H[l][Bnum+j1+n].i;
              uv = H[l][Anum+i1].r*H[l][Bnum+j1+n].i;
              vu = H[l][Anum+i1].i*H[l][Bnum+j1+n].r;

              Redum = (u2 + v2);
              Imdum = (uv - vu);
              Redum2 = co*Redum - si*Imdum;
              CDM[2][MA_AN][LB_AN][i1][j1] = Redum2;

              /* Im12
                 conjugate complex of Im12 due to difference in the definition
                 between density matrix and charge density
              */

              u2 = H[l][Anum+i1].r*H[l][Bnum+j1+n].r;
              v2 = H[l][Anum+i1].i*H[l][Bnum+j1+n].i;
              uv = H[l][Anum+i1].r*H[l][Bnum+j1+n].i;
              vu = H[l][Anum+i1].i*H[l][Bnum+j1+n].r;

              Redum = (u2 + v2);
              Imdum = (uv - vu);
              Imdum2 = -(co*Imdum + si*Redum);
              CDM[3][MA_AN][LB_AN][i1][j1] = Imdum2;

            } /* j1 */
            }   /* i1 */
          }     /* LB_AN */
        }       /* MA_AN */

        /*******************************************
                  M_i = S_ij D_ji
                  D_ji = C_nj C_ni
                  S_ij : CntOLP
                  D_ji : CDM
        ******************************************/

        for (GA_AN=1; GA_AN<=atomnum; GA_AN++){
          wanA = WhatSpecies[GA_AN];
          tnoA = Spe_Total_CNO[wanA];
          for (i1=0; i1<tnoA; i1++){
            SD[0][GA_AN][i1] = 0.0;
            SD[1][GA_AN][i1] = 0.0;
            SD[2][GA_AN][i1] = 0.0;
            SD[3][GA_AN][i1] = 0.0;
          }
        }

        for (spin=0; spin<=3; spin++){
          for (MA_AN=1; MA_AN<=Matomnum; MA_AN++){
            GA_AN = M2G[MA_AN];
            wanA = WhatSpecies[GA_AN];
            tnoA = Spe_Total_CNO[wanA];
            for (i1=0; i1<tnoA; i1++){
            for (LB_AN=0; LB_AN<=FNAN[GA_AN]; LB_AN++){
              GB_AN = natn[GA_AN][LB_AN];
              wanB = WhatSpecies[GB_AN];
              tnoB = Spe_Total_CNO[wanB];
              for (j1=0; j1<tnoB; j1++){
                SD[spin][GA_AN][i1] += CDM[spin][MA_AN][LB_AN][i1][j1]*
                                              CntOLP[MA_AN][LB_AN][i1][j1];
              }
            }
            }
          }
        }

        /*  transform to up and down states */

        for (MA_AN=1; MA_AN<=Matomnum; MA_AN++){
          GA_AN = M2G[MA_AN];
          wanA = WhatSpecies[GA_AN];
          tnoA = Spe_Total_CNO[wanA];

          theta = Angle0_Spin[GA_AN];
          phi   = Angle1_Spin[GA_AN];

          sit = sin(theta);
          cot = cos(theta);
          sip = sin(phi);
          cop = cos(phi);

          for (i1=0; i1<tnoA; i1++){

            tmp1 = 0.5*(SD[0][GA_AN][i1] + SD[1][GA_AN][i1]);
            tmp2 = 0.5*cot*(SD[0][GA_AN][i1] - SD[1][GA_AN][i1]);
            tmp3 = (SD[2][GA_AN][i1]*cop - SD[3][GA_AN][i1]*sip)*sit;

            SDup[GA_AN][i1] = tmp1 + tmp2 + tmp3;
            SDdn[GA_AN][i1] = tmp1 - tmp2 - tmp3;
          }
        }

        /*  writing a binary file */

        i_vec[0]=i; i_vec[1]=j; i_vec[2]=k;
        if (myid==Host_ID) fwrite(i_vec,sizeof(int),3,fp_ev);

        for (GA_AN=1; GA_AN<=atomnum; GA_AN++){

          wanA = WhatSpecies[GA_AN];
          tnoA = Spe_Total_CNO[wanA];
          ID2 = G2ID[GA_AN];
                   
          if (ID2==myid){

            for (i1=0; i1<tnoA; i1++){
            fSD[i1] = SDup[GA_AN][i1];
            }
            for (i1=0; i1<tnoA; i1++){
            fSD[tnoA+i1] = SDdn[GA_AN][i1];
            }

            if (myid!=Host_ID){
            tag = 999;
            MPI_Isend(&fSD[0],2*tnoA,MPI_FLOAT,Host_ID,tag,mpi_comm_level1,&request);
            MPI_Wait(&request,&stat);
            }
          }

          if (myid==Host_ID && ID2!=Host_ID){
            tag = 999;
            MPI_Recv(&fSD[0], 2*tnoA, MPI_FLOAT, ID2, tag, mpi_comm_level1, &stat);
          }

          if (myid==Host_ID) fwrite(fSD,sizeof(float),2*tnoA,fp_ev);
          MPI_Barrier(mpi_comm_level1);

        }
      }      /* l             */ 
      }        /* if (0<=kloop) */
    }          /* ID            */
  }            /* kloop0        */

  /****************************************************
     MPI:

     EIGEN
  ****************************************************/

  tmp = (double)T_knum/(double)numprocs; 

  for (kloop=0; kloop<T_knum; kloop++){

    for (ID=0; ID<numprocs; ID++){

      if (T_knum<=ID){
      s1 = -10;
      e1 = -100;
      }
      else if (T_knum<numprocs) {
      s1 = ID;
      e1 = ID;
      }
      else {
      s1 = (int)((double)ID*(tmp+0.0001)); 
      e1 = (int)((double)(ID+1)*(tmp+0.0001)) - 1;
      if (ID==(numprocs-1)) e1 = T_knum - 1;
      if (e1<0)             e1 = 0;
      }

      if (s1<=kloop && kloop<=e1)  ID1 = ID;                    
    }

    MPI_Bcast(&EIGEN[kloop][0], 2*n+1, MPI_DOUBLE, ID1, mpi_comm_level1);
  }

  if (myid==Host_ID){
    if (fp_eig) {

      fprintf(fp_eig,"irange      %d %d\n",iemin,iemax);
      fprintf(fp_eig,"<Eigenvalues\n");

      for (kloop=0; kloop<T_knum; kloop++){
        for (spin=0; spin<=1; spin++){

          i = Ti_KGrids1[kloop];
          j = Tj_KGrids2[kloop];
          k = Tk_KGrids3[kloop];

          fprintf(fp_eig,"%d %d %d ",i,j,k);
        for (ie=iemin; ie<=iemax; ie++) {
          fprintf(fp_eig,"%lf ",EIGEN[kloop][ie]);
        }
        fprintf(fp_eig,"\n");

      }
      }

      fprintf(fp_eig,"Eigenvalues>\n");

    } /* fp_eig */
  } /* if (myid==Host_ID) */

Finishing: 

  if (myid==Host_ID){
    if (fp_eig) { 
      fclose(fp_eig);
    }
    if (fp_ev) {
      fclose(fp_ev);
    }
  }

  /****************************************************
                       free arrays
  ****************************************************/

  free(MP);

  free(arpo);

  free(ko);
  free(koS);

  free(tmp_array0);

  for (j=0; j<n2; j++){
    free(H[j]);
  }
  free(H);

  for (i=0; i<n2; i++){
    free(S[i]);
  }
  free(S);

  free(M1);

  for (j=0; j<n2; j++){
    free(C[j]);
  }
  free(C);

  free(KGrids1); free(KGrids2);free(KGrids3);
  
  for (i=0; i<i_CDM[0]; i++){
    for (j=0; j<i_CDM[1]; j++){
      for (k=0; k<i_CDM[2]; k++){
        for (l=0; l<i_CDM[3]; l++){
          free(CDM[i][j][k][l]);
      }
        free(CDM[i][j][k]);
      }
      free(CDM[i][j]);
    }
    free(CDM[i]);
  }
  free(CDM);

  for (i=0; i<i_SD[0]; i++){
    for (j=0; j<i_SD[1]; j++){
      free(SD[i][j]);
    }
    free(SD[i]);
  }
  free(SD);

  for (i=0; i<List_YOUSO[1]; i++){ 
    free(SDup[i]);
  }
  free(SDup);

  for (i=0; i<List_YOUSO[1]; i++){ 
    free(SDdn[i]);
  }
  free(SDdn);

  free(fSD);

  for (j=0; j<n2; j++){
    free(TmpM[j]);
  }
  free(TmpM);

  for (j=0; j<n2; j++){
    free(H2[j]);
  }
  free(H2);

  /* non-spin-orbit coupling and non-LDA+U */  
  if (SO_switch==0 && Hub_U_switch==0 && Constraint_NCS_switch==0 
      && Zeeman_NCS_switch==0 && Zeeman_NCO_switch==0){
    free(Dummy_ImNL[0][0][0][0]);
    free(Dummy_ImNL[0][0][0]);
    free(Dummy_ImNL[0][0]);
    free(Dummy_ImNL[0]);
    free(Dummy_ImNL);
  }

  free(T_KGrids1);
  free(T_KGrids2);
  free(T_KGrids3);

  free(Ti_KGrids1);
  free(Tj_KGrids2);
  free(Tk_KGrids3);

  for (j=0; j<T_knum; j++){
    free(EIGEN[j]);
  }
  free(EIGEN);

  /* for elapsed time */

  dtime(&TEtime);
  time0 = TEtime - TStime;
  return time0;
}

Generated by  Doxygen 1.6.0   Back to index