diff --git a/source/module_elecstate/module_charge/charge_init.cpp b/source/module_elecstate/module_charge/charge_init.cpp index 0c4737856b..c0806da937 100644 --- a/source/module_elecstate/module_charge/charge_init.cpp +++ b/source/module_elecstate/module_charge/charge_init.cpp @@ -53,61 +53,37 @@ void Charge::init_rho(elecstate::efermi& eferm_iout, { std::stringstream ssc; ssc << PARAM.globalv.global_readin_dir << "SPIN" << is + 1 << "_CHG.cube"; - double& ef_tmp = eferm_iout.get_ef(is); - if (ModuleIO::read_cube( -#ifdef __MPI - & (GlobalC::Pgrid), -#endif + if (ModuleIO::read_vdata_palgrid(GlobalC::Pgrid, (PARAM.inp.esolver_type == "sdft" ? GlobalV::RANK_IN_STOGROUP : GlobalV::MY_RANK), - is, GlobalV::ofs_running, - PARAM.inp.nspin, ssc.str(), this->rho[is], - this->rhopw->nx, - this->rhopw->ny, - this->rhopw->nz, - ef_tmp, - & (GlobalC::ucell), - this->prenspin)) + GlobalC::ucell.nat)) { GlobalV::ofs_running << " Read in the charge density: " << ssc.str() << std::endl; } - else if (is > 0) + else if (is > 0) // nspin=2 or 4 { - if (prenspin == 1) + if (is == 1) // failed at the second spin { - GlobalV::ofs_running << " Didn't read in the charge density but autoset it for spin " << is + 1 - << std::endl; - for (int ir = 0; ir < this->rhopw->nrxx; ir++) - { - this->rho[is][ir] = 0.0; - } + std::cout << "Incomplete charge density file!" << std::endl; + read_error = true; + break; } - // - else if (prenspin == 2) - { // read up and down , then rearrange them. - if (is == 1) - { - std::cout << "Incomplete charge density file!" << std::endl; - read_error = true; - break; - } - else if (is == 2) - { - GlobalV::ofs_running << " Didn't read in the charge density but would rearrange it later. " - << std::endl; - } - else if (is == 3) + else if (is == 2) // read 2 files when nspin=4 + { + GlobalV::ofs_running << " Didn't read in the charge density but would rearrange it later. " + << std::endl; + } + else if (is == 3) // read 2 files when nspin=4 + { + GlobalV::ofs_running << " rearrange charge density " << std::endl; + for (int ir = 0; ir < this->rhopw->nrxx; ir++) { - GlobalV::ofs_running << " rearrange charge density " << std::endl; - for (int ir = 0; ir < this->rhopw->nrxx; ir++) - { - this->rho[3][ir] = this->rho[0][ir] - this->rho[1][ir]; - this->rho[0][ir] = this->rho[0][ir] + this->rho[1][ir]; - this->rho[1][ir] = 0.0; - this->rho[2][ir] = 0.0; - } + this->rho[3][ir] = this->rho[0][ir] - this->rho[1][ir]; + this->rho[0][ir] = this->rho[0][ir] + this->rho[1][ir]; + this->rho[1][ir] = 0.0; + this->rho[2][ir] = 0.0; } } } @@ -127,22 +103,12 @@ void Charge::init_rho(elecstate::efermi& eferm_iout, GlobalV::ofs_running << " try to read kinetic energy density from file : " << ssc.str() << std::endl; // mohan update 2012-02-10, sunliang update 2023-03-09 - if (ModuleIO::read_cube( -#ifdef __MPI - & (GlobalC::Pgrid), -#endif + if (ModuleIO::read_vdata_palgrid(GlobalC::Pgrid, (PARAM.inp.esolver_type == "sdft" ? 
GlobalV::RANK_IN_STOGROUP : GlobalV::MY_RANK), - is, GlobalV::ofs_running, - PARAM.inp.nspin, ssc.str(), this->kin_r[is], - this->rhopw->nx, - this->rhopw->ny, - this->rhopw->nz, - eferm_iout.ef, - & (GlobalC::ucell), - this->prenspin)) + GlobalC::ucell.nat)) { GlobalV::ofs_running << " Read in the kinetic energy density: " << ssc.str() << std::endl; } diff --git a/source/module_elecstate/module_charge/symmetry_rho.cpp b/source/module_elecstate/module_charge/symmetry_rho.cpp index 3635fd1d51..3455402da5 100644 --- a/source/module_elecstate/module_charge/symmetry_rho.cpp +++ b/source/module_elecstate/module_charge/symmetry_rho.cpp @@ -97,7 +97,7 @@ void Symmetry_rho::psymm(double* rho_part, rhotot.resize(rho_basis->nxyz); ModuleBase::GlobalFunc::ZEROS(rhotot.data(), rho_basis->nxyz); } - Pgrid.reduce_to_fullrho(rhotot.data(), rho_part); + Pgrid.reduce(rhotot.data(), rho_part); // (2) if (GlobalV::MY_RANK == 0) @@ -127,24 +127,7 @@ void Symmetry_rho::psymm(double* rho_part, } // (3) - const int ncxy = rho_basis->nx * rho_basis->ny; - std::vector zpiece(ncxy); - for (int iz = 0; iz < rho_basis->nz; iz++) - { - ModuleBase::GlobalFunc::ZEROS(zpiece.data(), ncxy); - if (GlobalV::MY_RANK == 0) - { - for (int ix = 0; ix < rho_basis->nx; ix++) - { - for (int iy = 0; iy < rho_basis->ny; iy++) - { - const int ir = ix * rho_basis->ny + iy; - zpiece[ir] = rhotot[ix * rho_basis->ny * rho_basis->nz + iy * rho_basis->nz + iz]; - } - } - } - Pgrid.zpiece_to_all(zpiece.data(), iz, rho_part); - } +Pgrid.bcast(rhotot.data(), rho_part, GlobalV::MY_RANK); #endif return; } \ No newline at end of file diff --git a/source/module_esolver/esolver_fp.cpp b/source/module_esolver/esolver_fp.cpp index e672797f73..fa7b6615bd 100644 --- a/source/module_esolver/esolver_fp.cpp +++ b/source/module_esolver/esolver_fp.cpp @@ -156,21 +156,12 @@ void ESolver_FP::after_scf(const int istep) this->pw_rhod->real2recip(this->pelec->charge->rho_save[is], this->pelec->charge->rhog_save[is]); } std::string fn =PARAM.globalv.global_out_dir + "/SPIN" + std::to_string(is + 1) + "_CHG.cube"; - ModuleIO::write_cube( -#ifdef __MPI - this->pw_big->bz, - this->pw_big->nbz, - this->pw_rhod->nplane, - this->pw_rhod->startz_current, -#endif + ModuleIO::write_vdata_palgrid(GlobalC::Pgrid, data, is, PARAM.inp.nspin, istep, fn, - this->pw_rhod->nx, - this->pw_rhod->ny, - this->pw_rhod->nz, this->pelec->eferm.get_efval(is), &(GlobalC::ucell), PARAM.inp.out_chg[1], @@ -178,21 +169,12 @@ void ESolver_FP::after_scf(const int istep) if (XC_Functional::get_func_type() == 3 || XC_Functional::get_func_type() == 5) { fn =PARAM.globalv.global_out_dir + "/SPIN" + std::to_string(is + 1) + "_TAU.cube"; - ModuleIO::write_cube( -#ifdef __MPI - this->pw_big->bz, - this->pw_big->nbz, - this->pw_rhod->nplane, - this->pw_rhod->startz_current, -#endif + ModuleIO::write_vdata_palgrid(GlobalC::Pgrid, this->pelec->charge->kin_r_save[is], is, PARAM.inp.nspin, istep, fn, - this->pw_rhod->nx, - this->pw_rhod->ny, - this->pw_rhod->nz, this->pelec->eferm.get_efval(is), &(GlobalC::ucell)); } @@ -224,21 +206,12 @@ void ESolver_FP::after_scf(const int istep) { std::string fn =PARAM.globalv.global_out_dir + "/SPIN" + std::to_string(is + 1) + "_POT.cube"; - ModuleIO::write_cube( -#ifdef __MPI - this->pw_big->bz, - this->pw_big->nbz, - this->pw_rhod->nplane, - this->pw_rhod->startz_current, -#endif + ModuleIO::write_vdata_palgrid(GlobalC::Pgrid, this->pelec->pot->get_effective_v(is), is, PARAM.inp.nspin, istep, fn, - this->pw_rhod->nx, - this->pw_rhod->ny, - this->pw_rhod->nz, 0.0, 
// efermi &(GlobalC::ucell), 3, // precision diff --git a/source/module_esolver/esolver_ks_lcao.cpp b/source/module_esolver/esolver_ks_lcao.cpp index 3e8e9b1f41..cb455ace00 100644 --- a/source/module_esolver/esolver_ks_lcao.cpp +++ b/source/module_esolver/esolver_ks_lcao.cpp @@ -1029,21 +1029,12 @@ void ESolver_KS_LCAO::iter_finish(int& iter) data = this->pelec->charge->rho_save[is]; } std::string fn = PARAM.globalv.global_out_dir + "/tmp_SPIN" + std::to_string(is + 1) + "_CHG.cube"; - ModuleIO::write_cube( -#ifdef __MPI - this->pw_big->bz, - this->pw_big->nbz, - this->pw_rhod->nplane, - this->pw_rhod->startz_current, -#endif + ModuleIO::write_vdata_palgrid(GlobalC::Pgrid, data, is, PARAM.inp.nspin, 0, fn, - this->pw_rhod->nx, - this->pw_rhod->ny, - this->pw_rhod->nz, this->pelec->eferm.get_efval(is), &(GlobalC::ucell), 3, @@ -1051,21 +1042,12 @@ void ESolver_KS_LCAO::iter_finish(int& iter) if (XC_Functional::get_func_type() == 3 || XC_Functional::get_func_type() == 5) { fn = PARAM.globalv.global_out_dir + "/tmp_SPIN" + std::to_string(is + 1) + "_TAU.cube"; - ModuleIO::write_cube( -#ifdef __MPI - this->pw_big->bz, - this->pw_big->nbz, - this->pw_rhod->nplane, - this->pw_rhod->startz_current, -#endif + ModuleIO::write_vdata_palgrid(GlobalC::Pgrid, this->pelec->charge->kin_r_save[is], is, PARAM.inp.nspin, 0, fn, - this->pw_rhod->nx, - this->pw_rhod->ny, - this->pw_rhod->nz, this->pelec->eferm.get_efval(is), &(GlobalC::ucell)); } diff --git a/source/module_esolver/esolver_ks_pw.cpp b/source/module_esolver/esolver_ks_pw.cpp index f79454891f..968bcea82f 100644 --- a/source/module_esolver/esolver_ks_pw.cpp +++ b/source/module_esolver/esolver_ks_pw.cpp @@ -239,21 +239,12 @@ void ESolver_KS_PW::before_scf(const int istep) { std::stringstream ss; ss << PARAM.globalv.global_out_dir << "SPIN" << is + 1 << "_CHG_INI.cube"; - ModuleIO::write_cube( -#ifdef __MPI - this->pw_big->bz, - this->pw_big->nbz, - this->pw_rhod->nplane, - this->pw_rhod->startz_current, -#endif + ModuleIO::write_vdata_palgrid(GlobalC::Pgrid, this->pelec->charge->rho[is], is, PARAM.inp.nspin, istep, ss.str(), - this->pw_rhod->nx, - this->pw_rhod->ny, - this->pw_rhod->nz, this->pelec->eferm.ef, &(GlobalC::ucell)); } @@ -266,21 +257,12 @@ void ESolver_KS_PW::before_scf(const int istep) { std::stringstream ss; ss << PARAM.globalv.global_out_dir << "SPIN" << is + 1 << "_POT_INI.cube"; - ModuleIO::write_cube( -#ifdef __MPI - this->pw_big->bz, - this->pw_big->nbz, - this->pw_rhod->nplane, - this->pw_rhod->startz_current, -#endif + ModuleIO::write_vdata_palgrid(GlobalC::Pgrid, this->pelec->pot->get_effective_v(is), is, PARAM.inp.nspin, istep, ss.str(), - this->pw_rhod->nx, - this->pw_rhod->ny, - this->pw_rhod->nz, 0.0, // efermi &(GlobalC::ucell), 11, // precsion @@ -480,21 +462,12 @@ void ESolver_KS_PW::iter_finish(int& iter) data = this->pelec->charge->rho_save[is]; } std::string fn = PARAM.globalv.global_out_dir + "/tmp_SPIN" + std::to_string(is + 1) + "_CHG.cube"; - ModuleIO::write_cube( -#ifdef __MPI - this->pw_big->bz, - this->pw_big->nbz, - this->pw_rhod->nplane, - this->pw_rhod->startz_current, -#endif + ModuleIO::write_vdata_palgrid(GlobalC::Pgrid, data, is, PARAM.inp.nspin, 0, fn, - this->pw_rhod->nx, - this->pw_rhod->ny, - this->pw_rhod->nz, this->pelec->eferm.get_efval(is), &(GlobalC::ucell), 3, @@ -502,21 +475,12 @@ void ESolver_KS_PW::iter_finish(int& iter) if (XC_Functional::get_func_type() == 3 || XC_Functional::get_func_type() == 5) { fn = PARAM.globalv.global_out_dir + "/tmp_SPIN" + std::to_string(is + 1) + "_TAU.cube"; 
- ModuleIO::write_cube( -#ifdef __MPI - this->pw_big->bz, - this->pw_big->nbz, - this->pw_rhod->nplane, - this->pw_rhod->startz_current, -#endif + ModuleIO::write_vdata_palgrid(GlobalC::Pgrid, this->pelec->charge->kin_r_save[is], is, PARAM.inp.nspin, 0, fn, - this->pw_rhod->nx, - this->pw_rhod->ny, - this->pw_rhod->nz, this->pelec->eferm.get_efval(is), &(GlobalC::ucell)); } diff --git a/source/module_esolver/io_npz.cpp b/source/module_esolver/io_npz.cpp index 062e16f51c..ea354438ec 100644 --- a/source/module_esolver/io_npz.cpp +++ b/source/module_esolver/io_npz.cpp @@ -73,7 +73,7 @@ void ESolver_KS_LCAO::read_mat_npz(std::string& zipname, hamilt::HContai const int it = GlobalC::ucell.iat2it[iat]; const int ia = GlobalC::ucell.iat2ia[iat]; - //get atomic number (copied from write_cube.cpp) + //get atomic number (copied from write_vdata_palgrid.cpp) std::string element = ""; element = GlobalC::ucell.atoms[it].label; std::string::iterator temp = element.begin(); @@ -368,7 +368,7 @@ void ESolver_KS_LCAO::output_mat_npz(std::string& zipname, const hamilt: const int it = GlobalC::ucell.iat2it[iat]; const int ia = GlobalC::ucell.iat2ia[iat]; - //get atomic number (copied from write_cube.cpp) + //get atomic number (copied from write_vdata_palgrid.cpp) std::string element = ""; element = GlobalC::ucell.atoms[it].label; std::string::iterator temp = element.begin(); diff --git a/source/module_esolver/lcao_before_scf.cpp b/source/module_esolver/lcao_before_scf.cpp index 127ce080d9..3fe23490a5 100644 --- a/source/module_esolver/lcao_before_scf.cpp +++ b/source/module_esolver/lcao_before_scf.cpp @@ -227,21 +227,12 @@ void ESolver_KS_LCAO::before_scf(const int istep) { std::stringstream ss; ss << PARAM.globalv.global_out_dir << "SPIN" << is + 1 << "_CHG_INI.cube"; - ModuleIO::write_cube( -#ifdef __MPI - this->pw_big->bz, // bz first, then nbz - this->pw_big->nbz, - this->pw_rhod->nplane, - this->pw_rhod->startz_current, -#endif + ModuleIO::write_vdata_palgrid(GlobalC::Pgrid, this->pelec->charge->rho[is], is, PARAM.inp.nspin, istep, ss.str(), - this->pw_rhod->nx, - this->pw_rhod->ny, - this->pw_rhod->nz, this->pelec->eferm.ef, &(GlobalC::ucell)); } @@ -254,21 +245,12 @@ void ESolver_KS_LCAO::before_scf(const int istep) { std::stringstream ss; ss << PARAM.globalv.global_out_dir << "SPIN" << is + 1 << "_POT_INI.cube"; - ModuleIO::write_cube( -#ifdef __MPI - this->pw_big->bz, - this->pw_big->nbz, - this->pw_rhod->nplane, - this->pw_rhod->startz_current, -#endif + ModuleIO::write_vdata_palgrid(GlobalC::Pgrid, this->pelec->pot->get_effective_v(is), is, PARAM.inp.nspin, istep, ss.str(), - this->pw_rhod->nx, - this->pw_rhod->ny, - this->pw_rhod->nz, 0.0, // efermi &(GlobalC::ucell), 11, // precsion @@ -309,21 +291,12 @@ void ESolver_KS_LCAO::before_scf(const int istep) for (int is = 0; is < nspin0; is++) { std::string fn = PARAM.globalv.global_out_dir + "/SPIN" + std::to_string(is + 1) + "_CHG.cube"; - ModuleIO::write_cube( -#ifdef __MPI - this->pw_big->bz, - this->pw_big->nbz, - this->pw_rhod->nplane, - this->pw_rhod->startz_current, -#endif + ModuleIO::write_vdata_palgrid(GlobalC::Pgrid, this->pelec->charge->rho[is], is, PARAM.inp.nspin, istep, fn, - this->pw_rhod->nx, - this->pw_rhod->ny, - this->pw_rhod->nz, this->pelec->eferm.get_efval(is), &(GlobalC::ucell), 3, diff --git a/source/module_esolver/lcao_nscf.cpp b/source/module_esolver/lcao_nscf.cpp index 27dd4b0c45..4bb4624e6a 100644 --- a/source/module_esolver/lcao_nscf.cpp +++ b/source/module_esolver/lcao_nscf.cpp @@ -196,21 +196,12 @@ void 
ESolver_KS_LCAO::nscf() { { std::string fn = PARAM.globalv.global_out_dir + "/SPIN" + std::to_string(is + 1) + "_POT.cube"; - ModuleIO::write_cube( -#ifdef __MPI - this->pw_big->bz, - this->pw_big->nbz, - this->pw_rhod->nplane, - this->pw_rhod->startz_current, -#endif + ModuleIO::write_vdata_palgrid(GlobalC::Pgrid, this->pelec->pot->get_effective_v(is), is, PARAM.inp.nspin, 0, fn, - this->pw_rhod->nx, - this->pw_rhod->ny, - this->pw_rhod->nz, 0.0, // efermi &(GlobalC::ucell), 3, // precision diff --git a/source/module_esolver/pw_nscf.cpp b/source/module_esolver/pw_nscf.cpp index 006efbe8b9..09217ff02f 100644 --- a/source/module_esolver/pw_nscf.cpp +++ b/source/module_esolver/pw_nscf.cpp @@ -165,21 +165,12 @@ void ESolver_KS_PW::nscf() { { std::string fn = PARAM.globalv.global_out_dir + "/SPIN" + std::to_string(is + 1) + "_POT.cube"; - ModuleIO::write_cube( -#ifdef __MPI - this->pw_big->bz, - this->pw_big->nbz, - this->pw_rhod->nplane, - this->pw_rhod->startz_current, -#endif + ModuleIO::write_vdata_palgrid(GlobalC::Pgrid, this->pelec->pot->get_effective_v(is), is, PARAM.inp.nspin, 0, fn, - this->pw_rhod->nx, - this->pw_rhod->ny, - this->pw_rhod->nz, 0.0, // efermi &(GlobalC::ucell), 3, // precision diff --git a/source/module_hamilt_pw/hamilt_pwdft/parallel_grid.cpp b/source/module_hamilt_pw/hamilt_pwdft/parallel_grid.cpp index d215f2f391..829029e4bb 100644 --- a/source/module_hamilt_pw/hamilt_pwdft/parallel_grid.cpp +++ b/source/module_hamilt_pw/hamilt_pwdft/parallel_grid.cpp @@ -35,10 +35,6 @@ void Parallel_Grid::init( const int &bz_in) { -#ifndef __MPI - return; -#endif - ModuleBase::TITLE("Parallel_Grid","init"); this->ncx = ncx_in; @@ -47,7 +43,7 @@ void Parallel_Grid::init( this->nczp = nczp_in; this->nrxx = nrxx_in; this->nbz = nbz_in; - this->bz = bz_in; + this->bz = bz_in; if(nczp<0) { @@ -60,7 +56,11 @@ void Parallel_Grid::init( assert(ncz > 0); this->ncxy = ncx * ncy; - this->ncxyz = ncxy * ncz; + this->ncxyz = ncxy * ncz; + +#ifndef __MPI + return; +#endif // enable to call this function again liuyu 2023-03-10 if(this->allocate) @@ -84,14 +84,16 @@ void Parallel_Grid::init( this->nproc_in_pool = new int[GlobalV::KPAR]; int nprocgroup; - if(PARAM.inp.esolver_type == "sdft") nprocgroup = GlobalV::NPROC_IN_STOGROUP; - else nprocgroup = GlobalV::NPROC; + if(PARAM.inp.esolver_type == "sdft") { nprocgroup = GlobalV::NPROC_IN_STOGROUP; + } else { nprocgroup = GlobalV::NPROC; +} const int remain_pro = nprocgroup%GlobalV::KPAR; for(int i=0; inproc_in_pool[i]++; + if(inproc_in_pool[i]++; +} } this->numz = new int*[GlobalV::KPAR]; @@ -115,7 +117,7 @@ void Parallel_Grid::init( return; } -void Parallel_Grid::z_distribution(void) +void Parallel_Grid::z_distribution() { assert(allocate); @@ -126,7 +128,8 @@ void Parallel_Grid::z_distribution(void) // GlobalV::ofs_running << "\n now POOL=" << ip; const int nproc = nproc_in_pool[ip]; - if(ip>0) startp[ip] = startp[ip-1] + nproc_in_pool[ip-1]; + if(ip>0) { startp[ip] = startp[ip-1] + nproc_in_pool[ip-1]; +} // (1) how many z on each 'proc' in each 'pool' for(int iz=0; iz zpiece(ncxy); + for (int iz = 0; iz < this->ncz; ++iz) + { + ModuleBase::GlobalFunc::ZEROS(zpiece.data(), ncxy); + if (rank == 0) + { + for (int ix = 0; ix < ncx; ix++) + { + for (int iy = 0; iy < ncy; iy++) + { + const int ixy = ix * ncy + iy; + zpiece[ixy] = data_global[ixy * ncz + iz]; + } + } + } + this->zpiece_to_all(zpiece.data(), iz, data_local); + } +} + +void Parallel_Grid::zpiece_to_all(double* zpiece, const int& iz, double* rho) const { if(PARAM.inp.esolver_type == 
"sdft") { @@ -323,9 +347,9 @@ void Parallel_Grid::zpiece_to_stogroup(double *zpiece, const int &iz, double *rh return; } -void Parallel_Grid::reduce_to_fullrho(double *rhotot, double *rhoin) +void Parallel_Grid::reduce(double* rhotot, const double* const rhoin)const { - //ModuleBase::TITLE("Parallel_Grid","reduce_to_fullrho"); + //ModuleBase::TITLE("Parallel_Grid","reduce"); // if not the first pool, wait here until processpr 0 // send the Barrier command. @@ -374,13 +398,9 @@ void Parallel_Grid::reduce_to_fullrho(double *rhotot, double *rhoin) if(GlobalV::MY_RANK==0) { - for(int ix=0; ixncx; ix++) - { - for(int iy=0; iyncy; iy++) - { - const int ir = ix * this->ncy + iy; - rhotot[ix * ncy * ncz + iy * ncz + iz] = zpiece[ir]; - } + for (int ixy = 0; ixy < this->ncxy;++ixy) + { + rhotot[ixy * ncz + iz] = zpiece[ixy]; } } } @@ -397,10 +417,6 @@ void Parallel_Grid::init_final_scf(const int &ncx_in, const int &ncy_in, const i const int &nrxx_in, const int &nbz_in, const int &bz_in) { -#ifndef __MPI - return; -#endif - ModuleBase::TITLE("Parallel_Grid","init"); this->ncx = ncx_in; @@ -409,7 +425,7 @@ const int &nrxx_in, const int &nbz_in, const int &bz_in) this->nczp = nczp_in; this->nrxx = nrxx_in; this->nbz = nbz_in; - this->bz = bz_in; + this->bz = bz_in; if(nczp<0) { @@ -422,7 +438,11 @@ const int &nrxx_in, const int &nbz_in, const int &bz_in) assert(ncz > 0); this->ncxy = ncx * ncy; - this->ncxyz = ncxy * ncz; + this->ncxyz = ncxy * ncz; + +#ifndef __MPI + return; +#endif // (2) assert(allocate_final_scf==false); @@ -433,7 +453,8 @@ const int &nrxx_in, const int &nbz_in, const int &bz_in) for(int i=0; inproc_in_pool[i]++; + if(inproc_in_pool[i]++; +} } this->numz = new int*[GlobalV::KPAR]; diff --git a/source/module_hamilt_pw/hamilt_pwdft/parallel_grid.h b/source/module_hamilt_pw/hamilt_pwdft/parallel_grid.h index d38997c264..f6e4811c03 100644 --- a/source/module_hamilt_pw/hamilt_pwdft/parallel_grid.h +++ b/source/module_hamilt_pw/hamilt_pwdft/parallel_grid.h @@ -11,8 +11,14 @@ class Parallel_Grid friend class Efield; friend class Symmetry_rho; - Parallel_Grid(); - ~Parallel_Grid(); + Parallel_Grid(); + Parallel_Grid(const int ncx_in, const int ncy_in, const int ncz_in, const int nczp_in, const int nrxx_in, const int nbz_in, const int bz_in) + :ncx(ncx_in), ncy(ncy_in), ncz(ncz_in), nczp(nczp_in), nrxx(nrxx_in), nbz(nbz_in), bz(bz_in), + ncxy(ncx_in* ncy_in), ncxyz(ncx_in* ncy_in* ncz_in) + { + assert(ncx > 0 && ncy > 0 && ncz > 0 && nczp >= 0 && nrxx > 0 && nbz > 0 && bz > 0); + } + ~Parallel_Grid(); void init(const int &ncx, const int &ncy, const int &ncz, const int &nczp, const int &nrxx, const int &nbz, const int &bz); @@ -21,12 +27,19 @@ class Parallel_Grid const int &nczp, const int &nrxx, const int &nbz, const int &bz); //LiuXh add 20180606 #ifdef __MPI - void zpiece_to_all(double *zpiece, const int &iz, double *rho) const; - void zpiece_to_stogroup(double *zpiece, const int &iz, double *rho) const; //qainrui add for sto-dft 2021-7-21 - - void reduce_to_fullrho(double *rhotot, double *rhoin); + void zpiece_to_all(double* zpiece, const int& iz, double* rho) const; + void zpiece_to_stogroup(double* zpiece, const int& iz, double* rho) const; //qainrui add for sto-dft 2021-7-21 + + /// @brief Broadcast data from root to all processors. The index order is [x][y][z]. + void bcast(const double* const data_global, double* data_local, const int& rank)const; + /// @brief Reduce data from all processors to root. The index order is [x][y][z]. 
+ void reduce(double* rhotot, const double* const rhoin)const; #endif - + + const int& nx = this->ncx; + const int& ny = this->ncy; + const int& nz = this->ncz; + private: void z_distribution(void); @@ -41,13 +54,13 @@ class Parallel_Grid int ncz; int ncxy; int ncxyz; - int nczp; // different processors have different values. + int nczp; // number of z-layers (xy-planes) in each processor int nrxx; int nbz; int bz; - bool allocate; - bool allocate_final_scf; //LiuXh add 20180619 + bool allocate = false; + bool allocate_final_scf = false; //LiuXh add 20180619 }; #endif diff --git a/source/module_io/cube_io.h b/source/module_io/cube_io.h index 89bdb2e75a..d7e6b706a6 100644 --- a/source/module_io/cube_io.h +++ b/source/module_io/cube_io.h @@ -2,95 +2,65 @@ #define CUBE_IO_H #include #include "module_cell/unitcell.h" -#ifdef __MPI -#include "module_hamilt_pw/hamilt_pwdft/parallel_grid.h" -#endif +class Parallel_Grid; namespace ModuleIO { -bool read_cube( -#ifdef __MPI - const Parallel_Grid*const Pgrid, -#endif + /// read volumetric data from .cube file into the parallel distributed grid. + bool read_vdata_palgrid( + const Parallel_Grid& pgrid, const int my_rank, - const int is, std::ofstream& ofs_running, - const int nspin, const std::string& fn, - double*const data, - const int nx, - const int ny, - const int nz, - double& ef, - const UnitCell*const ucell, - int& prenspin, - const bool warning_flag = true); + double* const data, + const int nat); -void write_cube( -#ifdef __MPI - const int bz, - const int nbz, - const int nplane, - const int startz_current, -#endif - const double*const data, + /// write volumetric data on the parallelized grid into a .cube file + void write_vdata_palgrid( + const Parallel_Grid& pgrid, + const double* const data, const int is, const int nspin, const int iter, const std::string& fn, - const int nx, - const int ny, - const int nz, const double ef, const UnitCell*const ucell, const int precision = 11, const int out_fermi = 1); // mohan add 2007-10-17 + /// read the full data from a cube file + bool read_cube(const std::string& file, + std::vector<std::string>& comment, + int& natom, + std::vector<double>& origin, + int& nx, + int& ny, + int& nz, + std::vector<double>& dx, + std::vector<double>& dy, + std::vector<double>& dz, + std::vector<int>& atom_type, + std::vector<double>& atom_charge, + std::vector<std::vector<double>>& atom_pos, + std::vector<double>& data); -// when MPI: -// read file as order (ixy,iz) to data[ixy*nz+iz] -// when serial: -// read file as order (ixy,iz) to data[iz*nxy+ixy] -void read_cube_core_match( - std::ifstream &ifs, -#ifdef __MPI - const Parallel_Grid*const Pgrid, - const bool flag_read_rank, -#endif - double*const data, - const int nxy, - const int nz); - -void read_cube_core_mismatch( - std::ifstream &ifs, -#ifdef __MPI - const Parallel_Grid*const Pgrid, - const bool flag_read_rank, -#endif - double*const data, - const int nx, - const int ny, - const int nz, - const int nx_read, - const int ny_read, - const int nz_read); - -// when MPI: -// write data[ixy*nplane+iz] to file as order (ixy,iz) -// when serial: -// write data[iz*nxy+ixy] to file as order (ixy,iz) -void write_cube_core( - std::ofstream &ofs_cube, -#ifdef __MPI - const int bz, - const int nbz, - const int nplane, - const int startz_current, -#endif - const double*const data, - const int nxy, - const int nz, - const int n_data_newline); + /// write a cube file + void write_cube(const std::string& file, + const std::vector& comment, + const int& natom, + const std::vector& origin, + const int& nx, + const int& ny, + const int& nz, + const std::vector& dx, 
const std::vector& dy, + const std::vector& dz, + const std::vector& atom_type, + const std::vector& atom_charge, + const std::vector>& atom_pos, + const std::vector& data, + const int precision, + const int ndata_line = 6); /** * @brief The trilinear interpolation method @@ -113,28 +83,23 @@ void write_cube_core( * directions, divided by the grid spacing. Here, it is assumed that the grid spacing is equal and can be * omitted during computation. * - * @param ifs the ifstream used to read charge density + * @param data_in the input data of size nxyz_read * @param nx_read nx read from file * @param ny_read ny read from file * @param nz_read nz read from file * @param nx the dimension of grids along x * @param ny the dimension of grids along y * @param nz the dimension of grids along z - * @param data the interpolated results + * @param data_out the interpolated results of size nxyz */ - void trilinear_interpolate(std::ifstream& ifs, - const int& nx_read, - const int& ny_read, - const int& nz_read, - const int& nx, - const int& ny, - const int& nz, -#ifdef __MPI - std::vector> &data -#else - double* data -#endif - ); +void trilinear_interpolate(const double* const data_in, + const int& nx_read, + const int& ny_read, + const int& nz_read, + const int& nx, + const int& ny, + const int& nz, + double* data_out); } #endif diff --git a/source/module_io/get_pchg_lcao.cpp b/source/module_io/get_pchg_lcao.cpp index 33d1c4d645..f4567a67da 100644 --- a/source/module_io/get_pchg_lcao.cpp +++ b/source/module_io/get_pchg_lcao.cpp @@ -130,21 +130,12 @@ void IState_Charge::begin(Gint_Gamma& gg, // Use a const vector to store efermi for all spins, replace the original implementation: // const double ef_tmp = pelec->eferm.get_efval(is); double ef_spin = ef_all_spin[is]; - ModuleIO::write_cube( -#ifdef __MPI - bigpw_bz, - bigpw_nbz, - rhopw_nplane, - rhopw_startz_current, -#endif + ModuleIO::write_vdata_palgrid(GlobalC::Pgrid, rho_save[is].data(), is, nspin, 0, ssc.str(), - rhopw_nx, - rhopw_ny, - rhopw_nz, ef_spin, ucell_in); } @@ -265,21 +256,12 @@ void IState_Charge::begin(Gint_k& gk, ssc << global_out_dir << "BAND" << ib + 1 << "_K" << ik + 1 << "_SPIN" << is + 1 << "_CHG.cube"; double ef_spin = ef_all_spin[is]; - ModuleIO::write_cube( -#ifdef __MPI - bigpw_bz, - bigpw_nbz, - rhopw_nplane, - rhopw_startz_current, -#endif + ModuleIO::write_vdata_palgrid(GlobalC::Pgrid, rho_save[is].data(), is, nspin, 0, ssc.str(), - rhopw_nx, - rhopw_ny, - rhopw_nz, ef_spin, ucell_in); } @@ -333,21 +315,12 @@ void IState_Charge::begin(Gint_k& gk, ssc << global_out_dir << "BAND" << ib + 1 << "_SPIN" << is + 1 << "_CHG.cube"; double ef_spin = ef_all_spin[is]; - ModuleIO::write_cube( -#ifdef __MPI - bigpw_bz, - bigpw_nbz, - rhopw_nplane, - rhopw_startz_current, -#endif + ModuleIO::write_vdata_palgrid(GlobalC::Pgrid, rho_save[is].data(), is, nspin, 0, ssc.str(), - rhopw_nx, - rhopw_ny, - rhopw_nz, ef_spin, ucell_in); } diff --git a/source/module_io/get_pchg_pw.h b/source/module_io/get_pchg_pw.h index 1dfc797b32..e7ba789ede 100644 --- a/source/module_io/get_pchg_pw.h +++ b/source/module_io/get_pchg_pw.h @@ -118,21 +118,12 @@ void get_pchg_pw(const std::vector& bands_to_print, ssc << global_out_dir << "BAND" << ib + 1 << "_K" << ik % (nks / nspin) + 1 << "_SPIN" << spin_index + 1 << "_CHG.cube"; - ModuleIO::write_cube( -#ifdef __MPI - pw_big_bz, - pw_big_nbz, - pw_rhod->nplane, - pw_rhod->startz_current, -#endif + ModuleIO::write_vdata_palgrid(GlobalC::Pgrid, rho_band[spin_index].data(), spin_index, nspin, 0, ssc.str(), - nx, - 
ny, - nz, 0.0, ucell); @@ -195,21 +186,12 @@ void get_pchg_pw(const std::vector& bands_to_print, std::stringstream ssc; ssc << global_out_dir << "BAND" << ib + 1 << "_SPIN" << is + 1 << "_CHG.cube"; - ModuleIO::write_cube( -#ifdef __MPI - pw_big_bz, - pw_big_nbz, - pw_rhod->nplane, - pw_rhod->startz_current, -#endif + ModuleIO::write_vdata_palgrid(GlobalC::Pgrid, rho_band[is].data(), is, nspin, 0, ssc.str(), - nx, - ny, - nz, 0.0, ucell); } diff --git a/source/module_io/get_wf_lcao.cpp b/source/module_io/get_wf_lcao.cpp index 24a98757d6..d1257de1bb 100644 --- a/source/module_io/get_wf_lcao.cpp +++ b/source/module_io/get_wf_lcao.cpp @@ -121,21 +121,12 @@ void IState_Envelope::begin(const psi::Psi* psid, ss << global_out_dir << "BAND" << ib + 1 << "_GAMMA" << "_SPIN" << is + 1 << "_ENV.cube"; const double ef_tmp = this->pes_->eferm.get_efval(is); - ModuleIO::write_cube( -#ifdef __MPI - pw_big->bz, - pw_big->nbz, - pw_rhod->nplane, - pw_rhod->startz_current, -#endif + ModuleIO::write_vdata_palgrid(GlobalC::Pgrid, pes_->charge->rho_save[is], is, nspin, 0, ss.str(), - pw_rhod->nx, - pw_rhod->ny, - pw_rhod->nz, ef_tmp, &(GlobalC::ucell)); } @@ -212,42 +203,24 @@ void IState_Envelope::begin(const psi::Psi* psid, // Output real part std::stringstream ss_real; ss_real << global_out_dir << "BAND" << ib + 1 << "_GAMMA" << "_SPIN" << is + 1 << "_REAL.cube"; - ModuleIO::write_cube( -#ifdef __MPI - pw_big->bz, - pw_big->nbz, - pw_rhod->nplane, - pw_rhod->startz_current, -#endif + ModuleIO::write_vdata_palgrid(GlobalC::Pgrid, wfc_real.data(), is, nspin, 0, ss_real.str(), - pw_rhod->nx, - pw_rhod->ny, - pw_rhod->nz, ef_tmp, &(GlobalC::ucell)); // Output imaginary part std::stringstream ss_imag; ss_imag << global_out_dir << "BAND" << ib + 1 << "_GAMMA" << "_SPIN" << is + 1 << "_IMAG.cube"; - ModuleIO::write_cube( -#ifdef __MPI - pw_big->bz, - pw_big->nbz, - pw_rhod->nplane, - pw_rhod->startz_current, -#endif + ModuleIO::write_vdata_palgrid(GlobalC::Pgrid, wfc_imag.data(), is, nspin, 0, ss_imag.str(), - pw_rhod->nx, - pw_rhod->ny, - pw_rhod->nz, ef_tmp, &(GlobalC::ucell)); } @@ -381,21 +354,12 @@ void IState_Envelope::begin(const psi::Psi>* psi, ss << global_out_dir << "BAND" << ib + 1 << "_k_" << ik + 1 << "_s_" << ispin + 1 << "_ENV.cube"; const double ef_tmp = this->pes_->eferm.get_efval(ispin); - ModuleIO::write_cube( -#ifdef __MPI - pw_big->bz, - pw_big->nbz, - pw_rhod->nplane, - pw_rhod->startz_current, -#endif + ModuleIO::write_vdata_palgrid(GlobalC::Pgrid, pes_->charge->rho[ispin], ispin, nspin, 0, ss.str(), - pw_rhod->nx, - pw_rhod->ny, - pw_rhod->nz, ef_tmp, &(GlobalC::ucell), 3, @@ -458,21 +422,12 @@ void IState_Envelope::begin(const psi::Psi>* psi, ss_real << global_out_dir << "BAND" << ib + 1 << "_k_" << ik + 1 << "_s_" << ispin + 1 << "_REAL.cube"; const double ef_tmp = this->pes_->eferm.get_efval(ispin); - ModuleIO::write_cube( -#ifdef __MPI - pw_big->bz, - pw_big->nbz, - pw_rhod->nplane, - pw_rhod->startz_current, -#endif + ModuleIO::write_vdata_palgrid(GlobalC::Pgrid, wfc_real.data(), ispin, nspin, 0, ss_real.str(), - pw_rhod->nx, - pw_rhod->ny, - pw_rhod->nz, ef_tmp, &(GlobalC::ucell)); @@ -480,21 +435,12 @@ void IState_Envelope::begin(const psi::Psi>* psi, std::stringstream ss_imag; ss_imag << global_out_dir << "BAND" << ib + 1 << "_k_" << ik + 1 << "_s_" << ispin + 1 << "_IMAG.cube"; - ModuleIO::write_cube( -#ifdef __MPI - pw_big->bz, - pw_big->nbz, - pw_rhod->nplane, - pw_rhod->startz_current, -#endif + ModuleIO::write_vdata_palgrid(GlobalC::Pgrid, wfc_imag.data(), ispin, nspin, 0, 
ss_imag.str(), - pw_rhod->nx, - pw_rhod->ny, - pw_rhod->nz, ef_tmp, &(GlobalC::ucell)); } diff --git a/source/module_io/read_cube.cpp b/source/module_io/read_cube.cpp index 78ea02c445..f34c1f4680 100644 --- a/source/module_io/read_cube.cpp +++ b/source/module_io/read_cube.cpp @@ -1,211 +1,87 @@ #include "module_io/cube_io.h" +#include +#include "module_hamilt_pw/hamilt_pwdft/parallel_grid.h" // #include "module_base/global_variable.h" // GlobalV reference removed -bool ModuleIO::read_cube( -#ifdef __MPI - const Parallel_Grid*const Pgrid, -#endif +bool ModuleIO::read_vdata_palgrid( + const Parallel_Grid& pgrid, const int my_rank, - const int is, std::ofstream& ofs_running, - const int nspin, const std::string& fn, - double*const data, - const int nx, - const int ny, - const int nz, - double& ef, - const UnitCell*const ucell, - int& prenspin, - const bool warning_flag) + double* const data, + const int natom) { - ModuleBase::TITLE("ModuleIO","read_cube"); + ModuleBase::TITLE("ModuleIO", "read_vdata_palgrid"); + + // check if the file exists std::ifstream ifs(fn.c_str()); if (!ifs) { - std::string tmp_warning_info = "!!! Couldn't find the charge file of "; - tmp_warning_info += fn; + std::string tmp_warning_info = "!!! Couldn't find the file: " + fn; ofs_running << tmp_warning_info << std::endl; return false; } else { - ofs_running << " Find the file, try to read charge from file." << std::endl; + ofs_running << " Find the file " << fn << " , try to read it." << std::endl; } - bool quit=false; - - ifs.ignore(300, '\n'); // skip the header - - if(nspin != 4) + // read the full grid data + const int& nx = pgrid.nx; + const int& ny = pgrid.ny; + const int& nz = pgrid.nz; + const int& nxyz = nx * ny * nz; + std::vector data_xyz_full(nxyz, 0.0); + if (my_rank == 0) { - int v_in; - ifs >> v_in; - if (v_in != nspin) + std::vector comment; + int natom = 0; + std::vector origin; + std::vector nvoxel; + int nx_read = 0; + int ny_read = 0; + int nz_read = 0; + std::vector dx(3); + std::vector dy(3); + std::vector dz(3); + std::vector> axis_vecs; + std::vector atom_type; + std::vector atom_charge; + std::vector> atom_pos; + std::vector data_read; + + // we've already checked the file existence, so we don't need the returned value here + ModuleIO::read_cube(fn, comment, natom, origin, nx_read, ny_read, nz_read, dx, dy, dz, atom_type, atom_charge, atom_pos, data_read); + + // if mismatch, trilinear interpolate + if (nx == nx_read && ny == ny_read && nz == nz_read) { - std::cout << " WARNING: nspin mismatch: " << nspin << " in INPUT parameters but " << v_in << " in " << fn << std::endl; - return false; + std::memcpy(data_xyz_full.data(), data_read.data(), nxyz * sizeof(double)); } - } - else - { - ifs >> prenspin; - } - ifs.ignore(150, ')'); - - ifs >> ef; - ofs_running << " read in fermi energy = " << ef << std::endl; - - ifs.ignore(150, '\n'); - - ModuleBase::CHECK_INT(ifs, ucell->nat); - ifs.ignore(150, '\n'); - - int nx_read = 0; - int ny_read = 0; - int nz_read = 0; - double fac = ucell->lat0; - std::string temp; - if (warning_flag) - { - ifs >> nx_read; - ModuleBase::CHECK_DOUBLE(ifs, fac * ucell->latvec.e11 / double(nx), quit); - ModuleBase::CHECK_DOUBLE(ifs, fac * ucell->latvec.e12 / double(nx), quit); - ModuleBase::CHECK_DOUBLE(ifs, fac * ucell->latvec.e13 / double(nx), quit); - ifs >> ny_read; - ModuleBase::CHECK_DOUBLE(ifs, fac * ucell->latvec.e21 / double(ny), quit); - ModuleBase::CHECK_DOUBLE(ifs, fac * ucell->latvec.e22 / double(ny), quit); - ModuleBase::CHECK_DOUBLE(ifs, fac * 
ucell->latvec.e23 / double(ny), quit); - ifs >> nz_read; - ModuleBase::CHECK_DOUBLE(ifs, fac * ucell->latvec.e31 / double(nz), quit); - ModuleBase::CHECK_DOUBLE(ifs, fac * ucell->latvec.e32 / double(nz), quit); - ModuleBase::CHECK_DOUBLE(ifs, fac * ucell->latvec.e33 / double(nz), quit); - } - else - { - ifs >> nx_read; - ifs >> temp >> temp >> temp; - ifs >> ny_read; - ifs >> temp >> temp >> temp; - ifs >> nz_read; - ifs >> temp >> temp >> temp; - } - - for (int it = 0; it < ucell->ntype; it++) - { - for (int ia = 0; ia < ucell->atoms[it].na; ia++) + else { - ifs >> temp; // skip atomic number - ifs >> temp; // skip Z valance - if (warning_flag) - { - ModuleBase::CHECK_DOUBLE(ifs, fac * ucell->atoms[it].tau[ia].x, quit); - ModuleBase::CHECK_DOUBLE(ifs, fac * ucell->atoms[it].tau[ia].y, quit); - ModuleBase::CHECK_DOUBLE(ifs, fac * ucell->atoms[it].tau[ia].z, quit); - } - else - { - ifs >> temp >> temp >> temp; - } + trilinear_interpolate(data_read.data(), nx_read, ny_read, nz_read, nx, ny, nz, data_xyz_full.data()); } } -#ifdef __MPI - if(nx == nx_read && ny == ny_read && nz == nz_read) - ModuleIO::read_cube_core_match(ifs, Pgrid, (my_rank == 0), data, nx * ny, nz); - else - ModuleIO::read_cube_core_mismatch(ifs, Pgrid, (my_rank == 0), data, nx, ny, nz, nx_read, ny_read, nz_read); + // distribute +#ifdef __MPI + pgrid.bcast(data_xyz_full.data(), data, my_rank); #else - ofs_running << " Read SPIN = " << is + 1 << " charge now." << std::endl; - if(nx == nx_read && ny == ny_read && nz == nz_read) - ModuleIO::read_cube_core_match(ifs, data, nx*ny, nz); - else - ModuleIO::read_cube_core_mismatch(ifs, data, nx, ny, nz, nx_read, ny_read, nz_read); + std::memcpy(data, data_xyz_full.data(), nxyz * sizeof(double)); #endif - return true; } -void ModuleIO::read_cube_core_match( - std::ifstream &ifs, -#ifdef __MPI - const Parallel_Grid*const Pgrid, - const bool flag_read_rank, -#endif - double*const data, - const int nxy, - const int nz) -{ -#ifdef __MPI - if (flag_read_rank) - { - std::vector> read_rho(nz, std::vector(nxy)); - for (int ixy = 0; ixy < nxy; ixy++) - for (int iz = 0; iz < nz; iz++) - ifs >> read_rho[iz][ixy]; - for (int iz = 0; iz < nz; iz++) - Pgrid->zpiece_to_all(read_rho[iz].data(), iz, data); - } - else - { - std::vector zpiece(nxy); - for (int iz = 0; iz < nz; iz++) - Pgrid->zpiece_to_all(zpiece.data(), iz, data); - } -#else - for (int ixy = 0; ixy < nxy; ixy++) - for (int iz = 0; iz < nz; iz++) - ifs >> data[iz * nxy + ixy]; -#endif -} - -void ModuleIO::read_cube_core_mismatch( - std::ifstream &ifs, -#ifdef __MPI - const Parallel_Grid*const Pgrid, - const bool flag_read_rank, -#endif - double*const data, - const int nx, - const int ny, - const int nz, - const int nx_read, - const int ny_read, - const int nz_read) -{ -#ifdef __MPI - const int nxy = nx * ny; - if (flag_read_rank) - { - std::vector> read_rho(nz, std::vector(nxy)); - ModuleIO::trilinear_interpolate(ifs, nx_read, ny_read, nz_read, nx, ny, nz, read_rho); - for (int iz = 0; iz < nz; iz++) - Pgrid->zpiece_to_all(read_rho[iz].data(), iz, data); - } - else - { - std::vector zpiece(nxy); - for (int iz = 0; iz < nz; iz++) - Pgrid->zpiece_to_all(zpiece.data(), iz, data); - } -#else - ModuleIO::trilinear_interpolate(ifs, nx_read, ny_read, nz_read, nx, ny, nz, data); -#endif -} - -void ModuleIO::trilinear_interpolate(std::ifstream& ifs, - const int& nx_read, - const int& ny_read, - const int& nz_read, - const int& nx, - const int& ny, - const int& nz, -#ifdef __MPI - std::vector> &data -#else - double* data -#endif -) +void 
ModuleIO::trilinear_interpolate( + const double* const data_in, + const int& nx_read, + const int& ny_read, + const int& nz_read, + const int& nx, + const int& ny, + const int& nz, + double* data_out) { ModuleBase::TITLE("ModuleIO", "trilinear_interpolate"); @@ -220,7 +96,7 @@ void ModuleIO::trilinear_interpolate(std::ifstream& ifs, { for (int iz = 0; iz < nz_read; iz++) { - ifs >> read_rho[iz][ix * ny_read + iy]; + read_rho[iz][ix * ny_read + iy] = data_in[(ix * ny_read + iy) * nz_read + iz]; } } } @@ -256,11 +132,7 @@ void ModuleIO::trilinear_interpolate(std::ifstream& ifs, + read_rho[highz][lowx * ny_read + highy] * (1 - dx) * dy * dz + read_rho[highz][highx * ny_read + highy] * dx * dy * dz; -#ifdef __MPI - data[iz][ix * ny + iy] = result; -#else - data[iz * nx * ny + ix * ny + iy] = result; -#endif + data_out[(ix * ny + iy) * nz + iz] = result; // x > y > z order, consistent with the cube file } } } @@ -270,4 +142,53 @@ void ModuleIO::trilinear_interpolate(std::ifstream& ifs, delete[] read_rho[iz]; } delete[] read_rho; +} + +bool ModuleIO::read_cube(const std::string& file, + std::vector& comment, + int& natom, + std::vector& origin, + int& nx, + int& ny, + int& nz, + std::vector& dx, + std::vector& dy, + std::vector& dz, + std::vector& atom_type, + std::vector& atom_charge, + std::vector>& atom_pos, + std::vector& data) +{ + std::ifstream ifs(file); + + if (!ifs) { return false; } + + comment.resize(2); + for (auto& c : comment) { std::getline(ifs, c); } + + ifs >> natom; + origin.resize(3); + for (auto& cp : origin) { ifs >> cp; } + + dx.resize(3); + dy.resize(3); + dz.resize(3); + ifs >> nx >> dx[0] >> dx[1] >> dx[2]; + ifs >> ny >> dy[0] >> dy[1] >> dy[2]; + ifs >> nz >> dz[0] >> dz[1] >> dz[2]; + + atom_type.resize(natom); + atom_charge.resize(natom); + atom_pos.resize(natom, std::vector(3)); + for (int i = 0;i < natom;++i) + { + ifs >> atom_type[i] >> atom_charge[i] >> atom_pos[i][0] >> atom_pos[i][1] >> atom_pos[i][2]; + } + + const int nxyz = nx * ny * nz; + data.resize(nxyz); + for (int i = 0;i < nxyz;++i) { ifs >> data[i]; } + + ifs.close(); + return true; } \ No newline at end of file diff --git a/source/module_io/test_serial/rho_io_test.cpp b/source/module_io/test_serial/rho_io_test.cpp index a478b2d8b3..0fdb484627 100644 --- a/source/module_io/test_serial/rho_io_test.cpp +++ b/source/module_io/test_serial/rho_io_test.cpp @@ -5,6 +5,7 @@ #include "module_base/global_variable.h" #include "module_io/cube_io.h" #include "prepare_unitcell.h" +#include "module_hamilt_pw/hamilt_pwdft/parallel_grid.h" #ifdef __LCAO InfoNonlocal::InfoNonlocal() @@ -30,6 +31,7 @@ Magnetism::~Magnetism() { delete[] this->start_magnetization; } +Parallel_Grid::~Parallel_Grid() {} #define private public #include "module_parameter/parameter.h" @@ -91,15 +93,14 @@ TEST_F(RhoIOTest, Read) double ef; UcellTestPrepare utp = UcellTestLib["Si"]; ucell = utp.SetUcellInfo(); - ModuleIO::read_cube(my_rank, is, ofs_running, nspin, fn, rho[is], nx, ny, nz, ef, ucell, prenspin); - EXPECT_DOUBLE_EQ(ef, 0.461002); + Parallel_Grid pgrid(nx, ny, nz, nz, nrxx, nz, 1); + ModuleIO::read_vdata_palgrid(pgrid, my_rank, ofs_running, fn, rho[is], ucell->nat); EXPECT_DOUBLE_EQ(rho[0][0], 1.27020863940e-03); EXPECT_DOUBLE_EQ(rho[0][46655], 1.33581335706e-02); } TEST_F(RhoIOTest, TrilinearInterpolate) { - double data[36 * 40 * 44]; int nx = 36; int ny = 40; int nz = 44; @@ -111,7 +112,38 @@ TEST_F(RhoIOTest, TrilinearInterpolate) { ifs.ignore(300, '\n'); } - ModuleIO::trilinear_interpolate(ifs, nx_read, ny_read, nz_read, nx, ny, 
nz, data); + std::vector<double> data_read(nx_read * ny_read * nz_read); + for (int ix = 0; ix < nx_read; ix++) + { + for (int iy = 0; iy < ny_read; iy++) + { + for (int iz = 0; iz < nz_read; iz++) + { + ifs >> data_read[(ix * ny_read + iy) * nz_read + iz]; + } + } + } + + // The old implementation is inconsistent: ifdef MPI, [x][y][z]; else, [z][x][y]. + // Now we use [x][y][z] for both MPI and non-MPI, so here we need to change the index order. + auto permute_xyz2zxy = [&](const double* const xyz, double* const zxy) -> void + { + for (int ix = 0; ix < nx; ix++) + { + for (int iy = 0; iy < ny; iy++) + { + for (int iz = 0; iz < nz; iz++) + { + zxy[(iz * nx + ix) * ny + iy] = xyz[(ix * ny + iy) * nz + iz]; + } + } + } + }; + const int nxyz = nx * ny * nz; + std::vector<double> data_xyz(nxyz); + std::vector<double> data(nxyz); // z > x > y + ModuleIO::trilinear_interpolate(data_read.data(), nx_read, ny_read, nz_read, nx, ny, nz, data_xyz.data()); + permute_xyz2zxy(data_xyz.data(), data.data()); EXPECT_DOUBLE_EQ(data[0], 0.0010824725010374092); EXPECT_DOUBLE_EQ(data[10], 0.058649850374240906); EXPECT_DOUBLE_EQ(data[100], 0.018931708073604996); diff --git a/source/module_io/write_cube.cpp b/source/module_io/write_cube.cpp index 2573740218..8fa212c60f 100644 --- a/source/module_io/write_cube.cpp +++ b/source/module_io/write_cube.cpp @@ -2,92 +2,92 @@ #include "module_io/cube_io.h" #include "module_parameter/parameter.h" #include +#include "module_hamilt_pw/hamilt_pwdft/parallel_grid.h" -void ModuleIO::write_cube( -#ifdef __MPI - const int bz, - const int nbz, - const int nplane, - const int startz_current, -#endif - const double*const data, +void ModuleIO::write_vdata_palgrid( + const Parallel_Grid& pgrid, + const double* const data, const int is, const int nspin, const int iter, const std::string& fn, - const int nx, - const int ny, - const int nz, const double ef, const UnitCell*const ucell, const int precision, const int out_fermi) { - ModuleBase::TITLE("ModuleIO", "write_cube"); + ModuleBase::TITLE("ModuleIO", "write_vdata_palgrid"); const int my_rank = GlobalV::MY_RANK; + const int my_pool = GlobalV::MY_POOL; time_t start; time_t end; - std::ofstream ofs_cube; + std::stringstream ss; - if (my_rank == 0) - { - start = time(NULL); + const int& nx = pgrid.nx; + const int& ny = pgrid.ny; + const int& nz = pgrid.nz; + const int& nxyz = nx * ny * nz; - if (iter == 0) - { - ofs_cube.open(fn.c_str()); - } - else - { - ofs_cube.open(fn.c_str(), std::ios::app); - } + start = time(nullptr); - if (!ofs_cube) - { - ModuleBase::WARNING("ModuleIO::write_cube", "Can't create Output File!"); - } + // reduce + std::vector<double> data_xyz_full(nxyz); // data to be written +#ifdef __MPI // reduce to rank 0 + if (my_pool == 0) + { + pgrid.reduce(data_xyz_full.data(), data); + } + MPI_Barrier(MPI_COMM_WORLD); +#else + std::memcpy(data_xyz_full.data(), data, nxyz * sizeof(double)); +#endif + // build the info structure + if (my_rank == 0) + { /// output header for cube file - ofs_cube << "STEP: " << iter << " Cubefile created from ABACUS. Inner loop is z, followed by y and x" << std::endl; - ofs_cube << nspin << " (nspin) "; + ss << "STEP: " << iter << " Cubefile created from ABACUS. 
Inner loop is z, followed by y and x" << std::endl; - ofs_cube << std::fixed; - ofs_cube << std::setprecision(6); + ss << nspin << " (nspin) "; + ss << std::fixed; + ss << std::setprecision(6); if (out_fermi == 1) { if (PARAM.globalv.two_fermi) { if (is == 0) { - ofs_cube << ef << " (fermi energy for spin=1, in Ry)" << std::endl; + ss << ef << " (fermi energy for spin=1, in Ry)" << std::endl; } else if (is == 1) { - ofs_cube << ef << " (fermi energy for spin=2, in Ry)" << std::endl; + ss << ef << " (fermi energy for spin=2, in Ry)" << std::endl; } } else { - ofs_cube << ef << " (fermi energy, in Ry)" << std::endl; + ss << ef << " (fermi energy, in Ry)" << std::endl; } } else { - ofs_cube << std::endl; + ss << std::endl; } - ofs_cube << ucell->nat << " 0.0 0.0 0.0 " << std::endl; + std::vector comment(2); + for (int i = 0;i < 2;++i) { std::getline(ss, comment[i]); } + double fac = ucell->lat0; - ofs_cube << nx << " " << fac * ucell->latvec.e11 / double(nx) << " " << fac * ucell->latvec.e12 / double(nx) - << " " << fac * ucell->latvec.e13 / double(nx) << std::endl; - ofs_cube << ny << " " << fac * ucell->latvec.e21 / double(ny) << " " << fac * ucell->latvec.e22 / double(ny) - << " " << fac * ucell->latvec.e23 / double(ny) << std::endl; - ofs_cube << nz << " " << fac * ucell->latvec.e31 / double(nz) << " " << fac * ucell->latvec.e32 / double(nz) - << " " << fac * ucell->latvec.e33 / double(nz) << std::endl; + std::vector dx = { fac * ucell->latvec.e11 / double(nx), fac * ucell->latvec.e12 / double(nx), fac * ucell->latvec.e13 / double(nx) }; + std::vector dy = { fac * ucell->latvec.e21 / double(ny), fac * ucell->latvec.e22 / double(ny), fac * ucell->latvec.e23 / double(ny) }; + std::vector dz = { fac * ucell->latvec.e31 / double(nz), fac * ucell->latvec.e32 / double(nz), fac * ucell->latvec.e33 / double(nz) }; std::string element = ""; + std::vector atom_type; + std::vector atom_charge; + std::vector> atom_pos; for (int it = 0; it < ucell->ntype; it++) { // erase the number in label, such as Fe1. 
@@ -117,184 +117,79 @@ void ModuleIO::write_cube( break; } } - ofs_cube << " " << z << " " << ucell->atoms[it].ncpp.zv << " " << fac * ucell->atoms[it].tau[ia].x - << " " << fac * ucell->atoms[it].tau[ia].y << " " << fac * ucell->atoms[it].tau[ia].z - << std::endl; + atom_type.push_back(z); + atom_charge.push_back(ucell->atoms[it].ncpp.zv); + atom_pos.push_back({ fac * ucell->atoms[it].tau[ia].x, fac * ucell->atoms[it].tau[ia].y, fac * ucell->atoms[it].tau[ia].z }); } } - ofs_cube.unsetf(std::ostream::fixed); - ofs_cube << std::setprecision(precision); - ofs_cube << std::scientific; - } - -#ifdef __MPI - ModuleIO::write_cube_core(ofs_cube, bz, nbz, nplane, startz_current, data, nx*ny, nz, 6); -#else - ModuleIO::write_cube_core(ofs_cube, data, nx*ny, nz, 6); -#endif - - if (my_rank == 0) - { - end = time(NULL); - ModuleBase::GlobalFunc::OUT_TIME("write_cube", start, end); - - /// for cube file - ofs_cube.close(); + write_cube(fn, comment, ucell->nat, { 0.0, 0.0, 0.0 }, nx, ny, nz, dx, dy, dz, atom_type, atom_charge, atom_pos, data_xyz_full, precision); + end = time(nullptr); + ModuleBase::GlobalFunc::OUT_TIME("write_vdata_palgrid", start, end); } return; } - -void ModuleIO::write_cube_core( - std::ofstream &ofs_cube, -#ifdef __MPI - const int bz, - const int nbz, - const int nplane, - const int startz_current, -#endif - const double*const data, - const int nxy, - const int nz, - const int n_data_newline) +void ModuleIO::write_cube(const std::string& file, + const std::vector& comment, + const int& natom, + const std::vector& origin, + const int& nx, + const int& ny, + const int& nz, + const std::vector& dx, + const std::vector& dy, + const std::vector& dz, + const std::vector& atom_type, + const std::vector& atom_charge, + const std::vector>& atom_pos, + const std::vector& data, + const int precision, + const int ndata_line) { - ModuleBase::TITLE("ModuleIO", "write_cube_core"); - -#ifdef __MPI - - const int my_rank = GlobalV::MY_RANK; - const int my_pool = GlobalV::MY_POOL; - const int rank_in_pool = GlobalV::RANK_IN_POOL; - const int nproc_in_pool = GlobalV::NPROC_IN_POOL; - - // only do in the first pool. - if (my_pool == 0) - { - /// for cube file - const int nxyz = nxy * nz; - std::vector data_cube(nxyz, 0.0); - - // num_z: how many planes on processor 'ip' - std::vector num_z(nproc_in_pool, 0); - - for (int iz = 0; iz < nbz; iz++) - { - const int ip = iz % nproc_in_pool; - num_z[ip] += bz; - } - - // start_z: start position of z in - // processor ip. - std::vector start_z(nproc_in_pool, 0); - for (int ip = 1; ip < nproc_in_pool; ip++) - { - start_z[ip] = start_z[ip - 1] + num_z[ip - 1]; - } - - // which_ip: found iz belongs to which ip. - std::vector which_ip(nz, 0); - for (int iz = 0; iz < nz; iz++) - { - for (int ip = 0; ip < nproc_in_pool; ip++) - { - if (iz >= start_z[nproc_in_pool - 1]) - { - which_ip[iz] = nproc_in_pool - 1; - break; - } - else if (iz >= start_z[ip] && iz < start_z[ip + 1]) - { - which_ip[iz] = ip; - break; - } - } - } + assert(comment.size() >= 2); + for (int i = 0;i < 2;++i) { assert(comment[i].find("\n") == std::string::npos); } + assert(origin.size() >= 3); + assert(dx.size() >= 3); + assert(dy.size() >= 3); + assert(dz.size() >= 3); + assert(atom_type.size() >= natom); + assert(atom_charge.size() >= natom); + assert(atom_pos.size() >= natom); + for (int i = 0;i < natom;++i) { assert(atom_pos[i].size() >= 3); } + assert(data.size() >= nx * ny * nz); - int count = 0; - std::vector zpiece(nxy, 0.0); + std::ofstream ofs(file); - // save the rho one z by one z. 
- for (int iz = 0; iz < nz; iz++) - { - zpiece.assign(nxy, 0.0); + for (int i = 0;i < 2;++i) { ofs << comment[i] << "\n"; } - // tag must be different for different iz. - const int tag = iz; - MPI_Status ierror; + ofs << std::fixed; + ofs << std::setprecision(1); // as before - // case 1: the first part of rho in processor 0. - if (which_ip[iz] == 0 && rank_in_pool == 0) - { - for (int ixy = 0; ixy < nxy; ixy++) - { - // mohan change to rho_save on 2012-02-10 - // because this can make our next restart calculation lead - // to the same scf_thr as the one saved. - zpiece[ixy] = data[ixy * nplane + iz - startz_current]; - } - } - // case 2: > first part rho: send the rho to - // processor 0. - else if (which_ip[iz] == rank_in_pool) - { - for (int ixy = 0; ixy < nxy; ixy++) - { - zpiece[ixy] = data[ixy * nplane + iz - startz_current]; - } - MPI_Send(zpiece.data(), nxy, MPI_DOUBLE, 0, tag, POOL_WORLD); - } + ofs << natom << " " << origin[0] << " " << origin[1] << " " << origin[2] << " \n"; - // case 2: > first part rho: processor 0 receive the rho - // from other processors - else if (rank_in_pool == 0) - { - MPI_Recv(zpiece.data(), nxy, MPI_DOUBLE, which_ip[iz], tag, POOL_WORLD, &ierror); - } + ofs << std::setprecision(6); //as before + ofs << nx << " " << dx[0] << " " << dx[1] << " " << dx[2] << "\n"; + ofs << ny << " " << dy[0] << " " << dy[1] << " " << dy[2] << "\n"; + ofs << nz << " " << dz[0] << " " << dz[1] << " " << dz[2] << "\n"; - if (my_rank == 0) - { - /// for cube file - for (int ixy = 0; ixy < nxy; ixy++) - { - data_cube[ixy * nz + iz] = zpiece[ixy]; - } - /// for cube file - } - } // end iz - - // for cube file - if (my_rank == 0) - { - for (int ixy = 0; ixy < nxy; ixy++) - { - for (int iz = 0; iz < nz; iz++) - { - ofs_cube << " " << data_cube[ixy * nz + iz]; - if ((iz % n_data_newline == n_data_newline-1) && (iz != nz - 1)) - { - ofs_cube << "\n"; - } - } - ofs_cube << "\n"; - } - } - /// for cube file + for (int i = 0;i < natom;++i) + { + ofs << " " << atom_type[i] << " " << atom_charge[i] << " " << atom_pos[i][0] << " " << atom_pos[i][1] << " " << atom_pos[i][2] << "\n"; } - MPI_Barrier(MPI_COMM_WORLD); -#else - for (int ixy = 0; ixy < nxy; ixy++) + + ofs.unsetf(std::ofstream::fixed); + ofs << std::setprecision(precision); + ofs << std::scientific; + const int nxy = nx * ny; + for (int ixy = 0; ixy < nxy; ++ixy) { - for (int iz = 0; iz < nz; iz++) + for (int iz = 0;iz < nz;++iz) { - ofs_cube << " " << data[iz * nxy + ixy]; - // ++count_cube; - if ((iz % n_data_newline == n_data_newline-1) && (iz != nz - 1)) - { - ofs_cube << "\n"; - } + ofs << " " << data[ixy * nz + iz]; + if ((iz + 1) % ndata_line == 0 && iz != nz - 1) { ofs << "\n"; } } - ofs_cube << "\n"; + ofs << "\n"; } -#endif -} + ofs.close(); +} \ No newline at end of file diff --git a/source/module_io/write_elecstat_pot.cpp b/source/module_io/write_elecstat_pot.cpp index 4a620a3efa..eaf7762285 100644 --- a/source/module_io/write_elecstat_pot.cpp +++ b/source/module_io/write_elecstat_pot.cpp @@ -93,21 +93,12 @@ void write_elecstat_pot( double ef_tmp = 0.0; int out_fermi = 0; - ModuleIO::write_cube( -#ifdef __MPI - bz, - nbz, - rho_basis->nplane, - rho_basis->startz_current, -#endif + ModuleIO::write_vdata_palgrid(GlobalC::Pgrid, v_elecstat.data(), is, nspin, istep, fn, - rho_basis->nx, - rho_basis->ny, - rho_basis->nz, ef_tmp, &(GlobalC::ucell), precision, diff --git a/source/module_io/write_elf.cpp b/source/module_io/write_elf.cpp index 452c9b6e45..a50063c395 100644 --- a/source/module_io/write_elf.cpp +++ 
b/source/module_io/write_elf.cpp @@ -1,5 +1,6 @@ #include "write_elf.h" #include "module_io/cube_io.h" +#include "module_hamilt_pw/hamilt_pwdft/global.h" namespace ModuleIO { @@ -88,21 +89,12 @@ void write_elf( std::string fn = out_dir + "/ELF.cube"; int is = -1; - ModuleIO::write_cube( - #ifdef __MPI - bz, - nbz, - rho_basis->nplane, - rho_basis->startz_current, - #endif + ModuleIO::write_vdata_palgrid(GlobalC::Pgrid, elf[0].data(), is, nspin, istep, fn, - rho_basis->nx, - rho_basis->ny, - rho_basis->nz, ef_tmp, ucell_, precision, @@ -115,21 +107,12 @@ void write_elf( std::string fn_temp = out_dir + "/ELF_SPIN" + std::to_string(is + 1) + ".cube"; int ispin = is + 1; - ModuleIO::write_cube( - #ifdef __MPI - bz, - nbz, - rho_basis->nplane, - rho_basis->startz_current, - #endif + ModuleIO::write_vdata_palgrid(GlobalC::Pgrid, elf[is].data(), ispin, nspin, istep, fn_temp, - rho_basis->nx, - rho_basis->ny, - rho_basis->nz, ef_tmp, ucell_, precision, @@ -145,21 +128,12 @@ void write_elf( std::string fn = out_dir + "/ELF.cube"; int is = -1; - ModuleIO::write_cube( - #ifdef __MPI - bz, - nbz, - rho_basis->nplane, - rho_basis->startz_current, - #endif + ModuleIO::write_vdata_palgrid(GlobalC::Pgrid, elf_tot.data(), is, nspin, istep, fn, - rho_basis->nx, - rho_basis->ny, - rho_basis->nz, ef_tmp, ucell_, precision, diff --git a/source/module_lr/esolver_lrtd_lcao.cpp b/source/module_lr/esolver_lrtd_lcao.cpp index 680e5bc84d..8ecc865b97 100644 --- a/source/module_lr/esolver_lrtd_lcao.cpp +++ b/source/module_lr/esolver_lrtd_lcao.cpp @@ -654,22 +654,12 @@ void LR::ESolver_LR::read_ks_chg(Charge& chg_gs) ssc << PARAM.globalv.global_readin_dir << "SPIN" << is + 1 << "_CHG.cube"; GlobalV::ofs_running << ssc.str() << std::endl; double ef; - if (ModuleIO::read_cube( -#ifdef __MPI - & (GlobalC::Pgrid), -#endif + if (ModuleIO::read_vdata_palgrid(GlobalC::Pgrid, GlobalV::MY_RANK, - is, GlobalV::ofs_running, - this->nspin, ssc.str(), chg_gs.rho[is], - this->pw_rho->nx, - this->pw_rho->ny, - this->pw_rho->nz, - ef, - &(GlobalC::ucell), - chg_gs.prenspin)) { + ucell.nat)) { GlobalV::ofs_running << " Read in the charge density: " << ssc.str() << std::endl; } else { // prenspin for nspin=4 is not supported currently ModuleBase::WARNING_QUIT(