diff --git a/CHANGELOG.md b/CHANGELOG.md index 75af5387..161118d8 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,6 +1,22 @@ # Change Log All notable changes to this project will be documented in this file. +## OpenFPM 5.1.0 - Jun 2024 +- Refactor implementations of cell list and Verlet list `CellList`, `CellList_gpu`,`VerletList` and all neighborhood iterators. Move from keeping two sets (unordered and ordered) of positions/property vectors to reordering explicitly before launching CUDA kernels that utilize this feature + +### Added +- Add adaptive cut-off radius Verlet list on CPU. Particles individual radii are passed as a vector to `getVerletAdaptRCut` of `openfpm_pdata`. Time complexity: O(N^2) + +### Changes +- `cuda_launch` is a function instead of a macro +- Move project c++ standard to c++17 +- Add bit-wise option flags that control lell list behaviour. Pass as a parameter `opt` to `getCellList` of `openfpm_pdata` or set directly on cell list via `setOpt` +- Add support for the following cell list bit-wise option flags: `CL_SYMMETRIC`, `CL_NON_SYMMETRIC`, `CL_LOCAL_SYMMETRIC`, `CL_LINEAR_CELL_KEYS`, `CL_HILBERT_CELL_KEYS`, `CL_GPU_REORDER`, where for the last one `CL_GPU_REORDER` control options for reordering/restoring operations could be more fine-tuned: `CL_GPU_REORDER_POSITION`, `CL_GPU_REORDER_PROPERTY`, `CL_GPU_RESTORE_POSITION`, `CL_GPU_RESTORE_PROPERTY` +- Add bit-wise option flags that control Verlet list behaviour. Pass as a template parameter `opt` in `VerletList` or pass Verlet list type with `opt` set to `getVerlet` of `openfpm_pdata`. The difference in how `opt` is set for cell list and Verlet list is due to legacy code +- Add support for the following Verlet list bit-wise option flags: `VL_NON_SYMMETRIC`, `VL_SYMMETRIC`, `VL_CRS_SYMMETRIC`, `VL_ADAPTIVE_RCUT`, `VL_NMAX_NEIGHBOR`, `VL_SKIP_REF_PART` +- `getCellListGPU` of `openfpm_pdata` doesn't fill a cell list with particle locations. Additionally, `updateCellListGPU` has to be called to perform this operation. The reason is to remove two identical fill operations in a row in simulations +- To reorder `vector_dist` for GPU coalesced memory access dictated by the cell list structure, cell list on GPU has to be constructed with one of the following flags enabled: `CL_GPU_REORDER`, `CL_GPU_REORDER_POSITION`, `CL_GPU_REORDER_PROPERTY`, `CL_GPU_RESTORE_POSITION`, `CL_GPU_RESTORE_PROPERTY`. Properties to be copied to reordered copies of position/property vectors have to be passed to `updateCellListGPU(CELL LIST)` of `vector_dist`. Properties that undergone changes in reordered copies and have to be restored to the original position/property vectors have to be passed to `restoreOrder(CELL LIST)` of `vector_dist` + ## OpenFPM 5.0.0 - Feb 2024 - Move to `openfpm` meta OpenFPM project structure with git subprojects `openfpm_data`, `openfpm_devices`, `openfpm_io`, `openfpm_vcluster`, `openfpm_pdata`, `openfpm_numerics`. Example codes, installation scripts for dependencies, configuration scripts moved to `openfpm`. Only source code of `openfpm_pdata` kept in `openfpm_pdata` diff --git a/CMakeLists.txt b/CMakeLists.txt index 17c54ec5..2118f670 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -131,6 +131,9 @@ if(APPLE) endif() if (Boost_FOUND) + # ignore BOOST deprecated headers + add_definitions("-DBOOST_ALLOW_DEPRECATED_HEADERS") + if ((CUDA_ON_BACKEND STREQUAL "SEQUENTIAL" OR CUDA_ON_BACKEND STREQUAL "OpenMP") AND NOT Boost_CONTEXT_FOUND) message( FATAL_ERROR "BOOST is invalid reinstalling" ) endif() @@ -163,6 +166,7 @@ add_subdirectory(openfpm_pdata) if (ENABLE_NUMERICS) add_subdirectory(openfpm_numerics) + set(DEFINE_ENABLE_NUMERICS "#define ENABLE_NUMERICS") endif() configure_file(${CMAKE_CURRENT_SOURCE_DIR}/config/config_cmake.h.in ${PROJECT_BINARY_DIR}/config/config.h) diff --git a/config/config_cmake.h.in b/config/config_cmake.h.in index b896e8b5..2d69d31c 100644 --- a/config/config_cmake.h.in +++ b/config/config_cmake.h.in @@ -195,6 +195,9 @@ ${DEFINE_VCLUSTER_GARBAGE_INJECTOR} /* Test coverage mode */ ${DEFINE_TEST_COVERAGE_MODE} +/* Enable numerics module */ +${DEFINE_ENABLE_NUMERICS} + /* when an error accur continue but avoid unsafe operation */ /* #undef THROW_ON_ERROR */ diff --git a/example/Numerics/PSE/0_Derivative_approx_1D/main.cpp b/example/Numerics/PSE/0_Derivative_approx_1D/main.cpp index 955a7719..044147ff 100644 --- a/example/Numerics/PSE/0_Derivative_approx_1D/main.cpp +++ b/example/Numerics/PSE/0_Derivative_approx_1D/main.cpp @@ -210,9 +210,7 @@ int main(int argc, char* argv[]) // // get and construct the Cell list - - Ghost<1,double> gp(enlarge); - auto cl = vd.getCellList(12*eps,gp); + auto cl = vd.getCellList(12*eps, CL_NON_SYMMETRIC, false, enlarge); // Maximum infinity norm double linf = 0.0; @@ -251,7 +249,7 @@ int main(int argc, char* argv[]) double prp_x = vd.template getProp<0>(key); // Get the neighborhood of the particles - auto NN = cl.getNNIterator(cl.getCell(p)); + auto NN = cl.getNNIteratorBox(cl.getCell(p)); while(NN.isNext()) { auto nnp = NN.get(); diff --git a/example/Numerics/PSE/1_Derivative_approx_1D_mp/main_float128.cpp b/example/Numerics/PSE/1_Derivative_approx_1D_mp/main_float128.cpp index cd7629f1..0c72ed0f 100644 --- a/example/Numerics/PSE/1_Derivative_approx_1D_mp/main_float128.cpp +++ b/example/Numerics/PSE/1_Derivative_approx_1D_mp/main_float128.cpp @@ -218,9 +218,8 @@ int main(int argc, char* argv[]) // // get and construct the Cell list + auto cl = vd.getCellList(12*eps, CL_NON_SYMMETRIC, false, enlarge); - Ghost<1,float128> gp(enlarge); - auto cl = vd.getCellList(12*eps,gp); // Maximum infinity norm double linf = 0.0; @@ -260,7 +259,7 @@ int main(int argc, char* argv[]) float128 prp_x = vd.template getProp<0>(key); // Get the neighborhood of the particles - auto NN = cl.getNNIterator(cl.getCell(p)); + auto NN = cl.getNNIteratorBox(cl.getCell(p)); while(NN.isNext()) { auto nnp = NN.get(); diff --git a/example/Numerics/PSE/1_Diffusion_1D/main.cpp b/example/Numerics/PSE/1_Diffusion_1D/main.cpp index 7185298e..97873abd 100644 --- a/example/Numerics/PSE/1_Diffusion_1D/main.cpp +++ b/example/Numerics/PSE/1_Diffusion_1D/main.cpp @@ -69,7 +69,7 @@ template double calcLap(Point<1,double> p, vect_dist_key_dx key, double prp_x = vd.template getProp<0>(key); // Get the neighborhood of the particles - auto NN = cl.getNNIterator(cl.getCell(p)); + auto NN = cl.getNNIteratorBox(cl.getCell(p)); while(NN.isNext()) { auto nnp = NN.get(); @@ -339,13 +339,11 @@ int main(int argc, char* argv[]) // keeping ghost size and padding area unrelated give us the possibility to show how to create a CellList // on a area bigger than the domain + ghost - Ghost<1,double> gp(enlarge); - - // Create a Cell list with Cell spaping 8*epsilon, the CellList is created on a domain+ghost+enlarge space - auto cl = vd.getCellList(8*eps,gp); + // Create a Cell list with Cell spaping 8*epsilon, the CellList is created on a domain+ghost+enlarge space + auto cl = vd.getCellList(8*eps, CL_NON_SYMMETRIC, false, enlarge); - // Maximum infinity norm - double linf = 0.0; + // Maximum infinity norm + double linf = 0.0; // // ### WIKI 13 ### diff --git a/example/SparseGrid/8_filling_benchmark_gpu/main.cu b/example/SparseGrid/8_filling_benchmark_gpu/main.cu index 644a4cc1..a8456118 100644 --- a/example/SparseGrid/8_filling_benchmark_gpu/main.cu +++ b/example/SparseGrid/8_filling_benchmark_gpu/main.cu @@ -2,7 +2,7 @@ #define SYNC_BEFORE_TAKE_TIME #define ENABLE_GRID_DIST_ID_PERF_STATS #include "Decomposition/Distribution/BoxDistribution.hpp" -#include "util/cuda_launch.hpp" +#include "util/cuda_util.hpp" #include "Grid/grid_dist_id.hpp" #include "data_type/aggregate.hpp" #include "timer.hpp" diff --git a/example/Vector/10_level_set/main.cpp b/example/Vector/10_level_set/main.cpp index 9860bb30..cc3b6dc6 100644 --- a/example/Vector/10_level_set/main.cpp +++ b/example/Vector/10_level_set/main.cpp @@ -63,7 +63,11 @@ const double rdist_cutoff_factor = 2.8; // dimensions of spatial and temporal domain const double l = 2.44; +#ifdef TEST_RUN +const double t_end = 0.1; +#else const double t_end = 1.0; +#endif // total mass in the domain to compute individual particle masses from const double M = l*l*rho_0; // number of particles in total @@ -89,7 +93,7 @@ template inline void density_summation(particles & vd, CellL // intitialize sum that yields 1/(particle volume) double V_inv = 0.0; - auto Np = NN.getNNIterator(NN.getCell(vd.getPos(a))); + auto Np = NN.getNNIteratorBox(NN.getCell(vd.getPos(a))); // iterate over particles b (neighboring particles) while (Np.isNext() == true) { @@ -171,7 +175,7 @@ template inline void calc_forces(particles & vd, CellList & N Point<2, double> va = vd.getProp(a); // Get an iterator over the neighborhood particles of p - auto Np = NN.getNNIterator(NN.getCell(vd.getPos(a))); + auto Np = NN.getNNIteratorBox(NN.getCell(vd.getPos(a))); // For each neighborhood particle b while (Np.isNext() == true) diff --git a/example/Vector/1_celllist/main.cpp b/example/Vector/1_celllist/main.cpp index 4df03e5f..09dec2bb 100644 --- a/example/Vector/1_celllist/main.cpp +++ b/example/Vector/1_celllist/main.cpp @@ -247,7 +247,7 @@ int main(int argc, char* argv[]) Point<3,float> xp = vd.getPos(p); // Get an iterator of all the particles neighborhood of p - auto Np = NN.getNNIterator(NN.getCell(vd.getPos(p))); + auto Np = NN.getNNIteratorBox(NN.getCell(vd.getPos(p))); // For each particle near p while (Np.isNext()) @@ -392,8 +392,8 @@ int main(int argc, char* argv[]) //! \cond [cell_list_types] \endcond // Get cell list - auto NN4 = vd.getCellList(r_cut); - auto NN5 = vd.getCellList(r_cut); + auto NN4 = vd.getCellList>(r_cut); + auto NN5 = vd.getCellList>(r_cut); //! \cond [cell_list_types] \endcond diff --git a/example/Vector/1_gpu_first_step/main.cu b/example/Vector/1_gpu_first_step/main.cu index 30a69fcd..28d514e2 100644 --- a/example/Vector/1_gpu_first_step/main.cu +++ b/example/Vector/1_gpu_first_step/main.cu @@ -127,7 +127,6 @@ #define OPENMPI //! \cond [using_openmpi] \endcond -#define SCAN_WITH_CUB <------ MODERNGPU is broken on RTX use CUB library for scan //#define EXTERNAL_SET_GPU <----- In case you want to distribute the GPUs differently from the default #include "Vector/vector_dist.hpp" diff --git a/example/Vector/3_molecular_dynamic/main.cpp b/example/Vector/3_molecular_dynamic/main.cpp index eaab13e3..c70d3b43 100644 --- a/example/Vector/3_molecular_dynamic/main.cpp +++ b/example/Vector/3_molecular_dynamic/main.cpp @@ -115,7 +115,7 @@ template void calc_forces(vector_dist<3,double, aggregate(p)[2] = 0.0; // Get an iterator over the neighborhood particles of p - auto Np = NN.getNNIterator(NN.getCell(vd.getPos(p))); + auto Np = NN.getNNIteratorBox(NN.getCell(vd.getPos(p))); // For each neighborhood particle ... while (Np.isNext()) @@ -232,7 +232,7 @@ template double calc_energy(vector_dist<3,double, aggregate xp = vd.getPos(p); // Get an iterator over the neighborhood of the particle p - auto Np = NN.getNNIterator(NN.getCell(vd.getPos(p))); + auto Np = NN.getNNIteratorBox(NN.getCell(vd.getPos(p))); // For each neighborhood of the particle p while (Np.isNext()) @@ -438,7 +438,7 @@ int main(int argc, char* argv[]) //! \cond [md steps] \endcond // Get the Cell list structure - auto NN = vd.getCellList(r_cut); + auto NN = vd.getCellList>(r_cut); // The standard // auto NN = vd.getCellList(r_cut); diff --git a/example/Vector/3_molecular_dynamic/main_expr_paper.cpp b/example/Vector/3_molecular_dynamic/main_expr_paper.cpp index e166299a..cc916483 100644 --- a/example/Vector/3_molecular_dynamic/main_expr_paper.cpp +++ b/example/Vector/3_molecular_dynamic/main_expr_paper.cpp @@ -94,7 +94,7 @@ int main(int argc, char* argv[]) particles.ghost_get<>(); // Calculate the force at t + dt - particles.updateCellListSym(NN); + particles.updateCellList(NN); force = applyKernel_in_sim(particles,NN,lennard_jones); // 2-step Verlet velocity diff --git a/example/Vector/3_molecular_dynamic/main_vl.cpp b/example/Vector/3_molecular_dynamic/main_vl.cpp index 78ab671c..7ad485c4 100644 --- a/example/Vector/3_molecular_dynamic/main_vl.cpp +++ b/example/Vector/3_molecular_dynamic/main_vl.cpp @@ -62,7 +62,7 @@ constexpr int force = 1; //! \cond [arg diff] \endcond -void calc_forces(vector_dist<3,double, aggregate > & vd, VerletList<3, double, Mem_fast<>, shift<3, double> > & NN, double sigma12, double sigma6, double r_cut) +void calc_forces(vector_dist<3,double, aggregate > & vd, VerletList<3, double, VL_NON_SYMMETRIC, Mem_fast<>, shift<3, double> > & NN, double sigma12, double sigma6, double r_cut) { //! \cond [arg diff] \endcond @@ -150,7 +150,7 @@ void calc_forces(vector_dist<3,double, aggregate > & vd, Ve //! \cond [calc energy vl] \endcond -double calc_energy(vector_dist<3,double, aggregate > & vd, VerletList<3, double, Mem_fast<>, shift<3, double> > & NN, double sigma12, double sigma6, double r_cut) +double calc_energy(vector_dist<3,double, aggregate > & vd, VerletList<3, double, VL_NON_SYMMETRIC, Mem_fast<>, shift<3, double> > & NN, double sigma12, double sigma6, double r_cut) { double E = 0.0; diff --git a/example/Vector/3_molecular_dynamic_gpu/main.cu b/example/Vector/3_molecular_dynamic_gpu/main.cu index 174ee030..395fbf84 100644 --- a/example/Vector/3_molecular_dynamic_gpu/main.cu +++ b/example/Vector/3_molecular_dynamic_gpu/main.cu @@ -1,5 +1,3 @@ -#define SCAN_WITH_CUB - /*! * \page Vector_3_md_dyn_gpu Vector 3 molecular dynamic on GPU * @@ -111,7 +109,7 @@ __global__ void calc_force_gpu(vector_dist_type vd, NN_type NN, real_number sigm // Get an iterator over the neighborhood particles of p - auto Np = NN.getNNIterator(NN.getCell(vd.getPos(p))); + auto Np = NN.getNNIteratorBox(NN.getCell(vd.getPos(p))); // For each neighborhood particle ... while (Np.isNext()) @@ -189,7 +187,7 @@ __global__ void particle_energy(vector_dist_type vd, NN_type NN, real_number sig Point<3,real_number> xp = vd.getPos(p); // Get an iterator over the neighborhood of the particle p - auto Np = NN.getNNIterator(NN.getCell(vd.getPos(p))); + auto Np = NN.getNNIteratorBox(NN.getCell(vd.getPos(p))); real_number E = 0; @@ -230,7 +228,7 @@ __global__ void particle_energy(vector_dist_type vd, NN_type NN, real_number sig template void calc_forces(vector_dist_gpu<3,real_number, aggregate > & vd, CellList & NN, real_number sigma12, real_number sigma6, real_number r_cut2) { - vd.updateCellList(NN); + vd.updateCellListGPU(NN); // Get an iterator over particles auto it2 = vd.getDomainIteratorGPU(); @@ -245,7 +243,7 @@ template real_number calc_energy(vector_dist_gpu<3,real_numbe real_number rc = r_cut2; real_number shift = 2.0 * ( sigma12 / (rc*rc*rc*rc*rc*rc) - sigma6 / ( rc*rc*rc) ); - vd.updateCellList(NN); + vd.updateCellListGPU(NN); auto it2 = vd.getDomainIteratorGPU(); @@ -328,7 +326,7 @@ int main(int argc, char* argv[]) //! \cond [md steps] \endcond // Get the Cell list structure - auto NN = vd.getCellListGPU(r_cut, 2); + auto NN = vd.getCellListGPU(r_cut, CL_NON_SYMMETRIC, 2); // The standard // auto NN = vd.getCellList(r_cut); diff --git a/example/Vector/3_molecular_dynamic_gpu_opt/main_cpu.cpp b/example/Vector/3_molecular_dynamic_gpu_opt/main_cpu.cpp index a31730c1..b87afff3 100644 --- a/example/Vector/3_molecular_dynamic_gpu_opt/main_cpu.cpp +++ b/example/Vector/3_molecular_dynamic_gpu_opt/main_cpu.cpp @@ -37,7 +37,7 @@ template void calc_forces(vector_dist<3,real_number, aggregat vd.template getProp(p)[2] = 0.0; // Get an iterator over the neighborhood particles of p - auto Np = NN.getNNIterator(NN.getCell(vd.getPos(p))); + auto Np = NN.getNNIteratorBox(NN.getCell(vd.getPos(p))); // For each neighborhood particle ... while (Np.isNext()) @@ -97,7 +97,7 @@ template real_number calc_energy(vector_dist<3,real_number, a Point<3,real_number> xp = vd.getPos(p); // Get an iterator over the neighborhood of the particle p - auto Np = NN.getNNIterator(NN.getCell(vd.getPos(p))); + auto Np = NN.getNNIteratorBox(NN.getCell(vd.getPos(p))); // For each neighborhood of the particle p while (Np.isNext()) diff --git a/example/Vector/3_molecular_dynamic_gpu_opt/main_cpu_best.cpp b/example/Vector/3_molecular_dynamic_gpu_opt/main_cpu_best.cpp index ea250bb4..236e31f7 100644 --- a/example/Vector/3_molecular_dynamic_gpu_opt/main_cpu_best.cpp +++ b/example/Vector/3_molecular_dynamic_gpu_opt/main_cpu_best.cpp @@ -241,7 +241,7 @@ int main(int argc, char* argv[]) tsim.start(); // Get the Cell list structure - auto NN = vd.getVerletCrs(r_gskin);; + auto NN = vd.getVerletCrs(r_gskin); // calculate forces calc_forces(vd,NN,sigma12,sigma6,r_cut); @@ -290,7 +290,7 @@ int main(int argc, char* argv[]) vd.map(); vd.template ghost_get<>(); // Get the Cell list structure - vd.updateVerlet(NN,r_gskin,VL_CRS_SYMMETRIC); + vd.updateVerlet(NN,r_gskin); } else { diff --git a/example/Vector/3_molecular_dynamic_gpu_opt/main_gpu.cu b/example/Vector/3_molecular_dynamic_gpu_opt/main_gpu.cu index 9de319f2..8df92655 100644 --- a/example/Vector/3_molecular_dynamic_gpu_opt/main_gpu.cu +++ b/example/Vector/3_molecular_dynamic_gpu_opt/main_gpu.cu @@ -46,8 +46,7 @@ * all particles processed by one SM stay in one cell or few neighborhood cell, the number of cache line that an SM has to read is * reduced, with a significant speed-up. * - * In OpenFPM get a Cell-list produce a re-ordered version of the original vector by default. It is possible to offload the sorted - * version vector_dist_gpu instead of the normal one using the function \b toKernel_sorted() \b instead of the function \b toKernel \b. + * In OpenFPM \b getCellListGPU() \b produces a re-ordered version of the original vector when CL_GPU_REORDER option is enabled. * * \snippet Vector/3_molecular_dynamic_gpu_opt/main_gpu.cu calc_force_sorted * @@ -255,22 +254,19 @@ __global__ void particle_energy(vector_dist_type vd, NN_type NN, real_number sig template void calc_forces(vector_dist_gpu<3,real_number, aggregate > & vd, CellList & NN, real_number sigma12, real_number sigma6, real_number r_cut2) { - vd.updateCellList(NN); - // Get an iterator over particles auto it2 = vd.getDomainIteratorGPU(); //! \cond [calc_force_sorted] \endcond - CUDA_LAUNCH(calc_force_gpu,it2,vd.toKernel_sorted(),NN.toKernel(),sigma12,sigma6,r_cut2); - - //! \cond [calc_force_sorted] \endcond - - //! \cond [merge_sort] \endcond + // reorder positions only (no properties) + // as nothing else is needed to be read in calc_force_gpu + vd.template updateCellListGPU<>(NN); - vd.merge_sort(NN); + CUDA_LAUNCH(calc_force_gpu,it2,vd.toKernel(),NN.toKernel(),sigma12,sigma6,r_cut2); - //! \cond [merge_sort] \endcond + //! \cond [calc_force_sorted] \endcond + vd.template restoreOrder(NN); } template real_number calc_energy(vector_dist_gpu<3,real_number, aggregate > & vd, CellList & NN, real_number sigma12, real_number sigma6, real_number r_cut2) @@ -278,13 +274,13 @@ template real_number calc_energy(vector_dist_gpu<3,real_numbe real_number rc = r_cut2; real_number shift = 2.0 * ( sigma12 / (rc*rc*rc*rc*rc*rc) - sigma6 / ( rc*rc*rc) ); - vd.updateCellList(NN); + vd.template updateCellListGPU(NN); auto it2 = vd.getDomainIteratorGPU(); - CUDA_LAUNCH(particle_energy,it2,vd.toKernel_sorted(),NN.toKernel(),sigma12,sigma6,shift,r_cut2); + CUDA_LAUNCH(particle_energy,it2,vd.toKernel(),NN.toKernel(),sigma12,sigma6,shift,r_cut2); - vd.merge_sort(NN); + vd.template restoreOrder(NN); // Calculated energy return reduce_local(vd); @@ -361,7 +357,7 @@ int main(int argc, char* argv[]) //! \cond [get_half_cl] \endcond // Get the Cell list structure - auto NN = vd.getCellListGPU(r_cut / 2.0, 2); + auto NN = vd.getCellListGPU(r_cut / 2.0, CL_NON_SYMMETRIC | CL_GPU_REORDER, 2); //! \cond [get_half_cl] \endcond @@ -377,7 +373,6 @@ int main(int argc, char* argv[]) { // Get the iterator auto it3 = vd.getDomainIteratorGPU(); - CUDA_LAUNCH(update_velocity_position,it3,vd.toKernel(),dt); // Because we moved the particles in space we have to map them and re-sync the ghost @@ -393,7 +388,7 @@ int main(int argc, char* argv[]) CUDA_LAUNCH(update_velocity,it4,vd.toKernel(),dt); // After every iteration collect some statistic about the configuration - if (i % 1000 == 0) + if (i % 100 == 0) { vd.deviceToHostPos(); vd.deviceToHostProp<0,1,2>(); diff --git a/example/Vector/4_multiphase_celllist_verlet/main.cpp b/example/Vector/4_multiphase_celllist_verlet/main.cpp index 2d91d3b7..c63e9b1b 100644 --- a/example/Vector/4_multiphase_celllist_verlet/main.cpp +++ b/example/Vector/4_multiphase_celllist_verlet/main.cpp @@ -2,7 +2,7 @@ #include "Vector/vector_dist.hpp" #include "Decomposition/CartDecomposition.hpp" #include "data_type/aggregate.hpp" -#include "NN/CellList/CellListM.hpp" +#include "NN/CellList/multiphase/CellListM.hpp" #include "Vector/vector_dist_multiphase_functions.hpp" /*! @@ -128,7 +128,7 @@ int main(int argc, char* argv[]) * * ## Multi phase Verlet-list ## * - * In general verlet-list can be constructed from the vector itself using **getVerlerList()** + * In general verlet-list can be constructed from the vector itself using **getVerlet()** * * \see \ref Vector_3_md_vl * @@ -529,7 +529,7 @@ int main(int argc, char* argv[]) current_phase.getProp<0>(p) = 0.0; // Get an iterator of all the particles neighborhood of p - auto Np = CL_all.getNNIterator(CL_all.getCell(current_phase.getPos(p))); + auto Np = CL_all.getNNIteratorBox(CL_all.getCell(current_phase.getPos(p))); // For each particle near p while (Np.isNext()) diff --git a/example/Vector/4_reorder/main_comp_ord.cpp b/example/Vector/4_reorder/main_comp_ord.cpp index 2368d460..671f153c 100644 --- a/example/Vector/4_reorder/main_comp_ord.cpp +++ b/example/Vector/4_reorder/main_comp_ord.cpp @@ -57,7 +57,7 @@ template void calc_forces(vector_dist<3,double, aggregate void calc_forces(vector_dist<3,double, aggregate(p)[2] = 0.0; // Get an iterator over the neighborhood particles of p - auto Np = NN.getNNIterator(NN.getCell(vd.getPos(p))); + auto Np = NN.getNNIteratorBox(NN.getCell(vd.getPos(p))); // For each neighborhood particle ... while (Np.isNext()) @@ -148,7 +148,7 @@ template double calc_energy(vector_dist<3,double, aggregate double calc_energy(vector_dist<3,double, aggregate(p)[2] = 0.0; // Get an iterator over the neighborhood of the particle p - auto Np = NN.getNNIterator(NN.getCell(vd.getPos(p))); + auto Np = NN.getNNIteratorBox(NN.getCell(vd.getPos(p))); // For each neighborhood of the particle p while (Np.isNext()) @@ -304,8 +304,9 @@ int main(int argc, char* argv[]) * * ### Cell lists ### * - * Instead of getting the normal cell list we get an hilbert curve cell-list. Such cell list has a - * function called **getIterator** used inside the function **calc_forces** and **calc_energy** + * Instead of getting the normal cell list we get an hilbert curve cell-list by passing flag **CL_HILBERT_CELL_KEYS** + * to **getCellList** or by directly calling **getCellList_hilb**. Such cell list has a + * function called **getCellParticleIterator** used inside the function **calc_forces** and **calc_energy** * that iterate across all the particles but in a smart-way. In practice * given an r-cut a cell-list is constructed with the provided spacing. Suppose to have a cell-list * \f$ m \times n \f$, an hilbert curve \f$ 2^k \times 2^k \f$ is contructed with \f$ k = ceil(log_2(max(m,n))) \f$. diff --git a/example/Vector/5_molecular_dynamic_sym/main.cpp b/example/Vector/5_molecular_dynamic_sym/main.cpp index f5d27cb6..07e2dccd 100644 --- a/example/Vector/5_molecular_dynamic_sym/main.cpp +++ b/example/Vector/5_molecular_dynamic_sym/main.cpp @@ -387,7 +387,7 @@ int main(int argc, char* argv[]) vd.map(); vd.template ghost_get<>(); // Get the Cell list structure - vd.updateVerlet(NN,r_gskin,VL_SYMMETRIC); + vd.updateVerlet(NN,r_gskin); } else { diff --git a/example/Vector/5_molecular_dynamic_sym_crs/main.cpp b/example/Vector/5_molecular_dynamic_sym_crs/main.cpp index 4e5ccd33..f5dcf81a 100644 --- a/example/Vector/5_molecular_dynamic_sym_crs/main.cpp +++ b/example/Vector/5_molecular_dynamic_sym_crs/main.cpp @@ -471,7 +471,7 @@ int main(int argc, char* argv[]) vd.map(); vd.template ghost_get<>(); // Get the Cell list structure - vd.updateVerlet(NN,r_gskin,VL_CRS_SYMMETRIC); + vd.updateVerlet(NN,r_gskin); } else { diff --git a/example/Vector/6_complex_usage/main.cpp b/example/Vector/6_complex_usage/main.cpp index f2e833df..b554918e 100644 --- a/example/Vector/6_complex_usage/main.cpp +++ b/example/Vector/6_complex_usage/main.cpp @@ -199,7 +199,7 @@ int main(int argc, char* argv[]) Point<3,float> xp = vd.getPos(p); - auto Np = NN.getNNIterator(NN.getCell(vd.getPos(p))); + auto Np = NN.getNNIteratorBox(NN.getCell(vd.getPos(p))); while (Np.isNext()) { @@ -265,7 +265,7 @@ int main(int argc, char* argv[]) Point<3,float> xp = vd.getPos(p); - auto Np = NN2.getNNIteratorSym(NN2.getCell(vd.getPos(p)),p.getKey(),vd.getPosVector()); + auto Np = NN2.getNNIteratorBoxSym(NN2.getCell(vd.getPos(p)),p.getKey(),vd.getPosVector()); while (Np.isNext()) { diff --git a/example/Vector/7_SPH_dlb/main.cpp b/example/Vector/7_SPH_dlb/main.cpp index d8562d3e..33448c05 100644 --- a/example/Vector/7_SPH_dlb/main.cpp +++ b/example/Vector/7_SPH_dlb/main.cpp @@ -493,7 +493,7 @@ template inline void calc_forces(particles & vd, CellList & N { // If it is a boundary particle calculate the delta rho based on equation 2 // This require to run across the neighborhoods particles of a - auto Np = NN.getNNIterator(NN.getCell(vd.getPos(a))); + auto Np = NN.getNNIteratorBox(NN.getCell(vd.getPos(a))); // For each neighborhood particle while (Np.isNext() == true) @@ -548,7 +548,7 @@ template inline void calc_forces(particles & vd, CellList & N // If it is a fluid particle calculate based on equation 1 and 2 // Get an iterator over the neighborhood particles of p - auto Np = NN.getNNIterator(NN.getCell(vd.getPos(a))); + auto Np = NN.getNNIteratorBox(NN.getCell(vd.getPos(a))); // For each neighborhood particle while (Np.isNext() == true) @@ -936,7 +936,7 @@ inline void sensor_pressure(Vector & vd, Point<3,double> xp = probes.get(i); // get the iterator over the neighbohood particles of the probes position - auto itg = NN.getNNIterator(NN.getCell(probes.get(i))); + auto itg = NN.getNNIteratorBox(NN.getCell(probes.get(i))); while (itg.isNext()) { auto q = itg.get(); diff --git a/example/Vector/7_SPH_dlb_gpu/main.cu b/example/Vector/7_SPH_dlb_gpu/main.cu index be528601..c8509d77 100644 --- a/example/Vector/7_SPH_dlb_gpu/main.cu +++ b/example/Vector/7_SPH_dlb_gpu/main.cu @@ -210,10 +210,7 @@ inline void EqState(particles & vd) { auto it = vd.getDomainIteratorGPU(); - // You can use standard CUDA kernel launch or the macro CUDA_LAUNCH - - //EqState_gpuning<<>>(vd.toKernel(),B); - CUDA_LAUNCH(EqState_gpu,it,vd.toKernel(),B) + CUDA_LAUNCH(EqState_gpu,it,vd.toKernel(),B); } @@ -239,17 +236,17 @@ const real_number a2_4 = 0.25*a2; // Filled later real_number W_dap = 0.0; -inline __device__ __host__ void DWab(Point<3,real_number> & dx, Point<3,real_number> & DW, real_number r, bool print) +inline __device__ __host__ void DWab(Point<3,real_number> & dx, Point<3,real_number> & DW, real_number r) { const real_number qq=r/H; real_number qq2 = qq * qq; real_number fac1 = (c1*qq + d1*qq2)/r; - real_number b1 = (qq < 1.0)?1.0f:0.0f; + real_number b1 = (qq < 1.0f)?1.0f:0.0f; - real_number wqq = (2.0 - qq); + real_number wqq = (2.0f - qq); real_number fac2 = c2 * wqq * wqq / r; - real_number b2 = (qq >= 1.0 && qq < 2.0)?1.0f:0.0f; + real_number b2 = (qq >= 1.0f && qq < 2.0f)?1.0f:0.0f; real_number factor = (b1*fac1 + b2*fac2); @@ -282,8 +279,8 @@ inline __device__ __host__ real_number Tensile(real_number r, real_number rhoa, //-Tensile correction. real_number fab=wab*W_dap; fab*=fab; fab*=fab; //fab=fab^4 - const real_number tensilp1=(prs1/(rhoa*rhoa))*(prs1>0? 0.01: -0.2); - const real_number tensilp2=(prs2/(rhob*rhob))*(prs2>0? 0.01: -0.2); + const real_number tensilp1=(prs1/(rhoa*rhoa))*(prs1>0.0f? 0.01f: -0.2f); + const real_number tensilp2=(prs2/(rhob*rhob))*(prs2>0.0f? 0.01f: -0.2f); return (fab*(tensilp1+tensilp2)); } @@ -304,22 +301,21 @@ inline __device__ __host__ real_number Pi(const Point<3,real_number> & dr, real_ return pi_visc; } else - return 0.0; + return 0.0f; } template __global__ void calc_forces_gpu(particles_type vd, NN_type NN, real_number W_dap, real_number cbar) { - // ... a auto a = GET_PARTICLE(vd); - real_number max_visc = 0.0; + real_number max_visc = 0.0f; // Get the position xp of the particle Point<3,real_number> xa = vd.getPos(a); - // Take the mass of the particle dependently if it is FLUID or BOUNDARY - real_number massa = (vd.template getProp(a) == FLUID)?MassFluid:MassBound; + // Type of the particle + unsigned int typea = vd.template getProp(a); // Get the density of the of the particle a real_number rhoa = vd.template getProp(a); @@ -331,122 +327,73 @@ __global__ void calc_forces_gpu(particles_type vd, NN_type NN, real_number W_dap Point<3,real_number> va = vd.template getProp(a); // Reset the force counter (- gravity on zeta direction) - vd.template getProp(a)[0] = 0.0; - vd.template getProp(a)[1] = 0.0; - vd.template getProp(a)[2] = -gravity; - vd.template getProp(a) = 0.0; + Point<3,real_number> force_; + force_.get(0) = 0.0f; + force_.get(1) = 0.0f; + force_.get(2) = -gravity; + real_number drho_ = 0.0f; - // We threat FLUID particle differently from BOUNDARY PARTICLES ... - if (vd.template getProp(a) != FLUID) - { + // Get an iterator over the neighborhood particles of p + auto Np = NN.getNNIteratorBox(NN.getCell(xa)); - // If it is a boundary particle calculate the delta rho based on equation 2 - // This require to run across the neighborhoods particles of a - auto Np = NN.getNNIterator(NN.getCell(vd.getPos(a))); + // For each neighborhood particle + while (Np.isNext() == true) + { + // ... q + auto b = Np.get(); - // For each neighborhood particle - while (Np.isNext() == true) - { - // ... q - auto b = Np.get(); + // Get the position xp of the particle + Point<3,real_number> xb = vd.getPos(b); - // Get the position xp of the particle - Point<3,real_number> xb = vd.getPos(b); + if (a == b) {++Np; continue;}; - // if (p == q) skip this particle - if (a == b) {++Np; continue;}; + unsigned int typeb = vd.template getProp(b); - // get the mass of the particle - real_number massb = (vd.template getProp(b) == FLUID)?MassFluid:MassBound; + real_number massb = (typeb == FLUID)?MassFluid:MassBound; + Point<3,real_number> vb = vd.template getProp(b); + real_number Pb = vd.template getProp(b); + real_number rhob = vd.template getProp(b); - // Get the velocity of the particle b - Point<3,real_number> vb = vd.template getProp(b); + // Get the distance between p and q + Point<3,real_number> dr = xa - xb; + // take the norm of this vector + real_number r2 = norm2(dr); - // Get the pressure and density of particle b - real_number Pb = vd.template getProp(b); - real_number rhob = vd.template getProp(b); + // if they interact + if (r2 < 4.0*H*H && r2 >= 1e-16) + { + real_number r = sqrt(r2); - // Get the distance between p and q - Point<3,real_number> dr = xa - xb; - // take the norm of this vector - real_number r2 = norm2(dr); + Point<3,real_number> v_rel = va - vb; - // If the particles interact ... - if (r2 < 4.0*H*H) - { - // ... calculate delta rho - real_number r = sqrt(r2); + Point<3,real_number> DW; + DWab(dr,DW,r); - Point<3,real_number> dv = va - vb; + real_number factor = - massb*((Pa + Pb) / (rhoa * rhob) + Tensile(r,rhoa,rhob,Pa,Pb,W_dap) + Pi(dr,r2,v_rel,rhoa,rhob,massb,cbar,max_visc)); - Point<3,real_number> DW; - DWab(dr,DW,r,false); + // Bound - Bound does not produce any change + // factor = (typea == BOUNDARY && typeb == BOUNDARY)?0.0f:factor; + factor = (typea != FLUID)?0.0f:factor; - const real_number dot = dr.get(0)*dv.get(0) + dr.get(1)*dv.get(1) + dr.get(2)*dv.get(2); - const real_number dot_rr2 = dot/(r2+Eta2); - max_visc = (dot_rr2 < max_visc)?max_visc:dot_rr2; + force_.get(0) += factor * DW.get(0); + force_.get(1) += factor * DW.get(1); + force_.get(2) += factor * DW.get(2); - vd.template getProp(a) += massb*(dv.get(0)*DW.get(0)+dv.get(1)*DW.get(1)+dv.get(2)*DW.get(2)); - } + real_number scal = massb*(v_rel.get(0)*DW.get(0)+v_rel.get(1)*DW.get(1)+v_rel.get(2)*DW.get(2)); + scal = (typea == BOUNDARY && typeb == BOUNDARY)?0.0f:scal; - ++Np; + drho_ += scal; } - vd.template getProp(a) = max_visc; + ++Np; } - else - { - // If it is a fluid particle calculate based on equation 1 and 2 - - // Get an iterator over the neighborhood particles of p - auto Np = NN.getNNIterator(NN.getCell(vd.getPos(a))); - - // For each neighborhood particle - while (Np.isNext() == true) - { - // ... q - auto b = Np.get(); - - // Get the position xp of the particle - Point<3,real_number> xb = vd.getPos(b); - - // if (p == q) skip this particle - if (a == b) {++Np; continue;}; - - real_number massb = (vd.template getProp(b) == FLUID)?MassFluid:MassBound; - Point<3,real_number> vb = vd.template getProp(b); - real_number Pb = vd.template getProp(b); - real_number rhob = vd.template getProp(b); - // Get the distance between p and q - Point<3,real_number> dr = xa - xb; - // take the norm of this vector - real_number r2 = norm2(dr); + vd.template getProp(a) = max_visc; - // if they interact - if (r2 < 4.0*H*H) - { - real_number r = sqrt(r2); - - Point<3,real_number> v_rel = va - vb; - - Point<3,real_number> DW; - DWab(dr,DW,r,false); - - real_number factor = - massb*((vd.template getProp(a) + vd.template getProp(b)) / (rhoa * rhob) + Tensile(r,rhoa,rhob,Pa,Pb,W_dap) + Pi(dr,r2,v_rel,rhoa,rhob,massb,cbar,max_visc)); - - vd.template getProp(a)[0] += factor * DW.get(0); - vd.template getProp(a)[1] += factor * DW.get(1); - vd.template getProp(a)[2] += factor * DW.get(2); - - vd.template getProp(a) += massb*(v_rel.get(0)*DW.get(0)+v_rel.get(1)*DW.get(1)+v_rel.get(2)*DW.get(2)); - } - - ++Np; - } - - vd.template getProp(a) = max_visc; - } + vd.template getProp(a)[0] = force_.get(0); + vd.template getProp(a)[1] = force_.get(1); + vd.template getProp(a)[2] = force_.get(2); + vd.template getProp(a) = drho_; } template inline void calc_forces(particles & vd, CellList & NN, real_number & max_visc, size_t cnt) @@ -454,10 +401,9 @@ template inline void calc_forces(particles & vd, CellList & N auto part = vd.getDomainIteratorGPU(32); // Update the cell-list - vd.updateCellList(NN); + vd.updateCellListGPU(NN); - //calc_forces_gpu<<>>(vd.toKernel(),NN.toKernel(),W_dap,cbar); - CUDA_LAUNCH(calc_forces_gpu,part,vd.toKernel(),NN.toKernel(),W_dap,cbar) + CUDA_LAUNCH(calc_forces_gpu,part,vd.toKernel(),NN.toKernel(),W_dap,cbar); max_visc = reduce_local(vd); } @@ -479,7 +425,6 @@ void max_acceleration_and_velocity(particles & vd, real_number & max_acc, real_n // Calculate the maximum acceleration auto part = vd.getDomainIteratorGPU(); - // max_acceleration_and_velocity_gpu<<>>(vd.toKernel()); CUDA_LAUNCH(max_acceleration_and_velocity_gpu,part,vd.toKernel()); max_acc = reduce_local(vd); @@ -499,7 +444,7 @@ real_number calc_deltaT(particles & vd, real_number ViscDtMax) max_acceleration_and_velocity(vd,Maxacc,Maxvel); //-dt1 depends on force per unit mass. - const real_number dt_f = (Maxacc)?sqrt(H/Maxacc):std::numeric_limits::max(); + const real_number dt_f = (Maxacc)?sqrt(H/Maxacc):std::numeric_limits::max(); //-dt2 combines the Courant and the viscous time-step controls. const real_number dt_cv = H/(std::max(cbar,Maxvel*10.f) + H*ViscDtMax); @@ -538,7 +483,7 @@ __global__ void verlet_int_gpu(vector_dist_type vd, real_number dt, real_number return; } - //-Calculate displacement and update position / Calcula desplazamiento y actualiza posicion. + //-Calculate displacement and update position real_number dx = vd.template getProp(a)[0]*dt + vd.template getProp(a)[0]*dt205; real_number dy = vd.template getProp(a)[1]*dt + vd.template getProp(a)[1]*dt205; real_number dz = vd.template getProp(a)[2]*dt + vd.template getProp(a)[2]*dt205; @@ -558,17 +503,14 @@ __global__ void verlet_int_gpu(vector_dist_type vd, real_number dt, real_number vd.template getProp(a)[2] = vd.template getProp(a)[2] + vd.template getProp(a)[2]*dt2; vd.template getProp(a) = vd.template getProp(a) + dt2*vd.template getProp(a); - //! \cond [mark_to_remove_kernel] \endcond - // Check if the particle go out of range in space and in density, if they do mark them to remove it later - if (vd.getPos(a)[0] < 0.000263878 || vd.getPos(a)[1] < 0.000263878 || vd.getPos(a)[2] < 0.000263878 || - vd.getPos(a)[0] > 0.000263878+1.59947 || vd.getPos(a)[1] > 0.000263878+0.672972 || vd.getPos(a)[2] > 0.50 || + if (vd.getPos(a)[0] < 0.0 || vd.getPos(a)[1] < 0.0 || vd.getPos(a)[2] < 0.0 || + vd.getPos(a)[0] > 1.61 || vd.getPos(a)[1] > 0.68 || vd.getPos(a)[2] > 0.50 || vd.template getProp(a) < RhoMin || vd.template getProp(a) > RhoMax) {vd.template getProp(a) = 1;} else {vd.template getProp(a) = 0;} - //! \cond [mark_to_remove_kernel] \endcond vd.template getProp(a)[0] = velX; vd.template getProp(a)[1] = velY; @@ -586,16 +528,11 @@ void verlet_int(particles & vd, real_number dt) real_number dt205 = dt*dt*0.5; real_number dt2 = dt*2.0; - // verlet_int_gpu<<>>(vd.toKernel(),dt,dt2,dt205); CUDA_LAUNCH(verlet_int_gpu,part,vd.toKernel(),dt,dt2,dt205); - //! \cond [remove_marked_part] \endcond - // remove the particles marked remove_marked(vd); - //! \cond [remove_marked_part] \endcond - // increment the iteration counter cnt++; } @@ -626,7 +563,7 @@ __global__ void euler_int_gpu(vector_type vd,real_number dt, real_number dt205) return; } - //-Calculate displacement and update position / Calcula desplazamiento y actualiza posicion. + //-Calculate displacement and update position real_number dx = vd.template getProp(a)[0]*dt + vd.template getProp(a)[0]*dt205; real_number dy = vd.template getProp(a)[1]*dt + vd.template getProp(a)[1]*dt205; real_number dz = vd.template getProp(a)[2]*dt + vd.template getProp(a)[2]*dt205; @@ -646,8 +583,8 @@ __global__ void euler_int_gpu(vector_type vd,real_number dt, real_number dt205) vd.template getProp(a) = vd.template getProp(a) + dt*vd.template getProp(a); // Check if the particle go out of range in space and in density - if (vd.getPos(a)[0] < 0.000263878 || vd.getPos(a)[1] < 0.000263878 || vd.getPos(a)[2] < 0.000263878 || - vd.getPos(a)[0] > 0.000263878+1.59947 || vd.getPos(a)[1] > 0.000263878+0.672972 || vd.getPos(a)[2] > 0.50 || + if (vd.getPos(a)[0] < 0.0 || vd.getPos(a)[1] < 0.0 || vd.getPos(a)[2] < 0.0 || + vd.getPos(a)[0] > 1.61 || vd.getPos(a)[1] > 0.68 || vd.getPos(a)[2] > 0.50 || vd.template getProp(a) < RhoMin || vd.template getProp(a) > RhoMax) {vd.template getProp(a) = 1;} else @@ -685,7 +622,7 @@ __global__ void sensor_pressure_gpu(vector_type vd, NN_type NN, Point<3,real_num Point<3,real_number> xp = probe; // get the iterator over the neighbohood particles of the probes position - auto itg = NN.getNNIterator(NN.getCell(xp)); + auto itg = NN.getNNIteratorBox(NN.getCell(xp)); while (itg.isNext()) { auto q = itg.get(); @@ -744,8 +681,10 @@ inline void sensor_pressure(Vector & vd, // if the probe is inside the processor domain if (vd.getDecomposition().isLocal(probes.get(i)) == true) { - // sensor_pressure_gpu<<<1,1>>>(vd.toKernel(),NN.toKernel(),probes.get(i),(real_number *)press_tmp_.toKernel()); - CUDA_LAUNCH_DIM3(sensor_pressure_gpu,1,1,vd.toKernel(),NN.toKernel(),probes.get(i),(real_number *)press_tmp_.toKernel()); + vd.updateCellListGPU(NN); + + Point<3,real_number> probe = probes.get(i); + CUDA_LAUNCH_DIM3(sensor_pressure_gpu,1,1,vd.toKernel(),NN.toKernel(),probe,(real_number *)press_tmp_.toKernel()); // move calculated pressure on press_tmp_.deviceToHost(); @@ -772,11 +711,11 @@ int main(int argc, char* argv[]) openfpm::vector> press_t; openfpm::vector> probes; - probes.add({0.8779,0.3,0.02}); - probes.add({0.754,0.31,0.02}); + probes.add({0.8779f,0.3f,0.02f}); + probes.add({0.754f,0.31f,0.02f}); // Here we define our domain a 2D box with internals from 0 to 1.0 for x and y - Box<3,real_number> domain({-0.05,-0.05,-0.05},{1.7010,0.7065,0.5025}); + Box<3,real_number> domain({-0.05f,-0.05f,-0.05f},{1.7010f,0.7065f,0.511f}); size_t sz[3] = {207,90,66}; // Fill W_dap @@ -919,7 +858,7 @@ int main(int argc, char* argv[]) vd.ghost_get(RUN_ON_DEVICE); - auto NN = vd.getCellListGPU(2*H, 2); + auto NN = vd.getCellListGPU(2*H, CL_NON_SYMMETRIC, 2); timer tot_sim; tot_sim.start(); @@ -932,6 +871,7 @@ int main(int argc, char* argv[]) { Vcluster<> & v_cl = create_vcluster(); timer it_time; + it_time.start(); ////// Do rebalancing every 200 timesteps it_reb++; @@ -939,6 +879,10 @@ int main(int argc, char* argv[]) { vd.map(RUN_ON_DEVICE); + // Rebalancer for now work on CPU , so move to CPU + vd.deviceToHostPos(); + vd.template deviceToHostProp(); + it_reb = 0; ModelCustom md; vd.addComputationCosts(md); @@ -985,7 +929,6 @@ int main(int argc, char* argv[]) // and ghost are updated vd.map(RUN_ON_DEVICE); vd.ghost_get(RUN_ON_DEVICE); - vd.updateCellList(NN); // calculate the pressure at the sensor points //sensor_pressure(vd,NN,press_t,probes); diff --git a/example/Vector/7_SPH_dlb_gpu_more_opt/main.cu b/example/Vector/7_SPH_dlb_gpu_more_opt/main.cu index 08e0f0d2..e81c53e8 100644 --- a/example/Vector/7_SPH_dlb_gpu_more_opt/main.cu +++ b/example/Vector/7_SPH_dlb_gpu_more_opt/main.cu @@ -38,8 +38,8 @@ * * the function get_indexes_by_type has three arguments the first is the vector of the properties of the particles. In * this case because we use the sorted particles to calculate forces, so we have to get the indexes for the sorted - * particles with vd.getPropVectorSort(). In case we want to use the non sorted we use vd.getPropVector(). The second - * argument is the output containing the indexes of the particles types we want to get. Because the vector can contain + * particles with vd.getPropVector() when enabled with flag CL_GPU_REORDER. The second argument is the output containing + * the indexes of the particles types we want to get. Because the vector can contain * ghost particles and real particles setting with the third argument we indicate we want only real particles and no ghost particles * The last argument is the GPU context handle * @@ -53,12 +53,8 @@ #define PRINT_STACKTRACE #define STOP_ON_ERROR #define OPENMPI -#define SCAN_WITH_CUB -#define SORT_WITH_CUB //#define SE_CLASS1 -//#define USE_LOW_REGISTER_ITERATOR - #include "Vector/vector_dist.hpp" #include #include "Draw/DrawParticles.hpp" @@ -74,7 +70,7 @@ typedef float real_number; #define FLUID 1 // initial spacing between particles dp in the formulas -const real_number dp = 0.00425; +const real_number dp = 0.0085; // Maximum height of the fluid water // is going to be calculated and filled later on real_number h_swl = 0.0; @@ -86,7 +82,7 @@ const real_number coeff_sound = 20.0; const real_number gamma_ = 7.0; // sqrt(3.0*dp*dp) support of the kernel -const real_number H = 0.00736121593217; +const real_number H = 0.0147224318643; // Eta in the formulas const real_number Eta2 = 0.01 * H*H; @@ -100,10 +96,10 @@ const real_number visco = 0.1; real_number cbar = 0.0; // Mass of the fluid particles -const real_number MassFluid = 0.0000767656; +const real_number MassFluid = 0.000614125; // Mass of the boundary particles -const real_number MassBound = 0.0000767656; +const real_number MassBound = 0.000614125; // @@ -178,10 +174,11 @@ typedef vector_dist_gpu<3,real_number,aggregate inline void addComputation(Decomposition & dec, - vector & vd, - size_t v, - size_t p) + template inline void addComputation( + Decomposition & dec, + vector & vd, + size_t v, + size_t p) { if (vd.template getProp(p) == FLUID) dec.addComputationCost(v,4); @@ -245,19 +242,19 @@ inline __device__ __host__ void DWab(Point<3,real_number> & dx, Point<3,real_num { const real_number qq=r/H; - real_number qq2 = qq * qq; - real_number fac1 = (c1*qq + d1*qq2)/r; - real_number b1 = (qq < 1.0f)?1.0f:0.0f; + real_number qq2 = qq * qq; + real_number fac1 = (c1*qq + d1*qq2)/r; + real_number b1 = (qq < 1.0f)?1.0f:0.0f; - real_number wqq = (2.0f - qq); - real_number fac2 = c2 * wqq * wqq / r; - real_number b2 = (qq >= 1.0f && qq < 2.0f)?1.0f:0.0f; + real_number wqq = (2.0f - qq); + real_number fac2 = c2 * wqq * wqq / r; + real_number b2 = (qq >= 1.0f && qq < 2.0f)?1.0f:0.0f; - real_number factor = (b1*fac1 + b2*fac2); + real_number factor = (b1*fac1 + b2*fac2); - DW.get(0) = factor * dx.get(0); - DW.get(1) = factor * dx.get(1); - DW.get(2) = factor * dx.get(2); + DW.get(0) = factor * dx.get(0); + DW.get(1) = factor * dx.get(1); + DW.get(2) = factor * dx.get(2); } // Tensile correction @@ -275,10 +272,10 @@ inline __device__ __host__ real_number Tensile(real_number r, real_number rhoa, } else { - real_number wqq2=qq*qq; - real_number wqq3=wqq2*qq; + real_number wqq2=qq*qq; + real_number wqq3=wqq2*qq; - wab=a2*(1.0f-1.5f*wqq2+0.75f*wqq3); + wab=a2*(1.0f-1.5f*wqq2+0.75f*wqq3); } //-Tensile correction. @@ -304,7 +301,7 @@ inline __device__ __host__ real_number Pi(const Point<3,real_number> & dr, real_ const float pi_visc=(-visco*cbar*amubar/robar); return pi_visc; - } + } else return 0.0f; } @@ -355,16 +352,15 @@ __global__ void calc_forces_fluid_gpu(particles_type vd, fluid_ids_type fids, NN // if (p == q) skip this particle this condition should be done in the r^2 = 0 //if (a == b) {++Np; continue;}; - unsigned int typeb = vd.template getProp(b); + unsigned int typeb = vd.template getProp(b); - real_number massb = (typeb == FLUID)?MassFluid:MassBound; - Point<3,real_number> vb = vd.template getProp(b); - real_number Pb = vd.template getProp(b); - real_number rhob = vd.template getProp(b); + real_number massb = (typeb == FLUID)?MassFluid:MassBound; + Point<3,real_number> vb = vd.template getProp(b); + real_number Pb = vd.template getProp(b); + real_number rhob = vd.template getProp(b); // Get the distance between p and q Point<3,real_number> dr = xa - xb; - Point<3,real_number> v_rel = va - vb; // take the norm of this vector real_number r2 = norm2(dr); @@ -373,6 +369,8 @@ __global__ void calc_forces_fluid_gpu(particles_type vd, fluid_ids_type fids, NN { real_number r = sqrtf(r2); + Point<3,real_number> v_rel = va - vb; + Point<3,real_number> DW; DWab(dr,DW,r); @@ -438,10 +436,10 @@ __global__ void calc_forces_border_gpu(particles_type vd, fluid_ids_type fbord, // if (p == q) skip this particle this condition should be done in the r^2 = 0 //if (a == b) {++Np; continue;}; - unsigned int typeb = vd.template getProp(b); + unsigned int typeb = vd.template getProp(b); - real_number massb = (typeb == FLUID)?MassFluid:MassBound; - Point<3,real_number> vb = vd.template getProp(b); + real_number massb = (typeb == FLUID)?MassFluid:MassBound; + Point<3,real_number> vb = vd.template getProp(b); // Get the distance between p and q Point<3,real_number> dr = xa - xb; @@ -481,34 +479,34 @@ struct type_is_fluid struct type_is_border { - __device__ static bool check(int c) - { - return c == BOUNDARY; - } + __device__ static bool check(int c) + { + return c == BOUNDARY; + } }; template inline void calc_forces(particles & vd, CellList & NN, real_number & max_visc, size_t cnt, openfpm::vector_gpu> & fluid_ids, openfpm::vector_gpu> & border_ids) { // Update the cell-list - vd.updateCellList(NN); + vd.template updateCellListGPU(NN); //! \cond [get indexes by type] \endcond // get the particles fluid ids - get_indexes_by_type(vd.getPropVectorSort(),fluid_ids,vd.size_local(),vd.getVC().getGpuContext()); + get_indexes_by_type(vd.getPropVector(),fluid_ids,vd.size_local(),vd.getVC().getGpuContext()); // get the particles fluid ids - get_indexes_by_type(vd.getPropVectorSort(),border_ids,vd.size_local(),vd.getVC().getGpuContext()); + get_indexes_by_type(vd.getPropVector(),border_ids,vd.size_local(),vd.getVC().getGpuContext()); auto part = fluid_ids.getGPUIterator(96); - CUDA_LAUNCH(calc_forces_fluid_gpu,part,vd.toKernel_sorted(),fluid_ids.toKernel(),NN.toKernel(),W_dap,cbar); + CUDA_LAUNCH(calc_forces_fluid_gpu,part,vd.toKernel(),fluid_ids.toKernel(),NN.toKernel(),W_dap,cbar); part = border_ids.getGPUIterator(96); - CUDA_LAUNCH(calc_forces_border_gpu,part,vd.toKernel_sorted(),border_ids.toKernel(),NN.toKernel(),W_dap,cbar); + CUDA_LAUNCH(calc_forces_border_gpu,part,vd.toKernel(),border_ids.toKernel(),NN.toKernel(),W_dap,cbar); //! \cond [get indexes by type] \endcond - vd.merge_sort(NN); + vd.template restoreOrder(NN); max_visc = reduce_local(vd); } @@ -575,56 +573,57 @@ __global__ void verlet_int_gpu(vector_dist_type vd, real_number dt, real_number real_number rhop = vd.template getProp(a); // Update only the density - vd.template getProp(a)[0] = 0.0; - vd.template getProp(a)[1] = 0.0; - vd.template getProp(a)[2] = 0.0; - real_number rhonew = vd.template getProp(a) + dt2*vd.template getProp(a); - vd.template getProp(a) = (rhonew < rho_zero)?rho_zero:rhonew; + vd.template getProp(a)[0] = 0.0; + vd.template getProp(a)[1] = 0.0; + vd.template getProp(a)[2] = 0.0; + real_number rhonew = vd.template getProp(a) + dt2*vd.template getProp(a); + vd.template getProp(a) = (rhonew < rho_zero)?rho_zero:rhonew; - vd.template getProp(a) = rhop; + vd.template getProp(a) = rhop; - vd.template getProp(a) = 0; + vd.template getProp(a) = 0; return; } //-Calculate displacement and update position real_number dx = vd.template getProp(a)[0]*dt + vd.template getProp(a)[0]*dt205; - real_number dy = vd.template getProp(a)[1]*dt + vd.template getProp(a)[1]*dt205; - real_number dz = vd.template getProp(a)[2]*dt + vd.template getProp(a)[2]*dt205; + real_number dy = vd.template getProp(a)[1]*dt + vd.template getProp(a)[1]*dt205; + real_number dz = vd.template getProp(a)[2]*dt + vd.template getProp(a)[2]*dt205; - vd.getPos(a)[0] += dx; - vd.getPos(a)[1] += dy; - vd.getPos(a)[2] += dz; + vd.getPos(a)[0] += dx; + vd.getPos(a)[1] += dy; + vd.getPos(a)[2] += dz; - real_number velX = vd.template getProp(a)[0]; - real_number velY = vd.template getProp(a)[1]; - real_number velZ = vd.template getProp(a)[2]; + real_number velX = vd.template getProp(a)[0]; + real_number velY = vd.template getProp(a)[1]; + real_number velZ = vd.template getProp(a)[2]; - real_number rhop = vd.template getProp(a); + real_number rhop = vd.template getProp(a); vd.template getProp(a)[0] = vd.template getProp(a)[0] + vd.template getProp(a)[0]*dt2; vd.template getProp(a)[1] = vd.template getProp(a)[1] + vd.template getProp(a)[1]*dt2; vd.template getProp(a)[2] = vd.template getProp(a)[2] + vd.template getProp(a)[2]*dt2; vd.template getProp(a) = vd.template getProp(a) + dt2*vd.template getProp(a); - // Check if the particle go out of range in space and in density - if (vd.getPos(a)[0] < 0.0 || vd.getPos(a)[1] < 0.0 || vd.getPos(a)[2] < 0.0 || - vd.getPos(a)[0] > 1.61 || vd.getPos(a)[1] > 0.68 || vd.getPos(a)[2] > 0.50 || + // Check if the particle go out of range in space and in density + if (vd.getPos(a)[0] < 0.0 || vd.getPos(a)[1] < 0.0 || vd.getPos(a)[2] < 0.0 || + vd.getPos(a)[0] > 1.61 || vd.getPos(a)[1] > 0.68 || vd.getPos(a)[2] > 0.50 || vd.template getProp(a) < RhoMin || vd.template getProp(a) > RhoMax) - { - vd.template getProp(a) = 1; - } - else - { - vd.template getProp(a) = 0; - } - - - vd.template getProp(a)[0] = velX; - vd.template getProp(a)[1] = velY; - vd.template getProp(a)[2] = velZ; - vd.template getProp(a) = rhop; + { + vd.template getProp(a) = 1; + } + + else + { + vd.template getProp(a) = 0; + } + + + vd.template getProp(a)[0] = velX; + vd.template getProp(a)[1] = velY; + vd.template getProp(a)[2] = velZ; + vd.template getProp(a) = rhop; } size_t cnt = 0; @@ -659,50 +658,50 @@ __global__ void euler_int_gpu(vector_type vd,real_number dt, real_number dt205) real_number rhop = vd.template getProp(a); // Update only the density - vd.template getProp(a)[0] = 0.0; - vd.template getProp(a)[1] = 0.0; - vd.template getProp(a)[2] = 0.0; - real_number rhonew = vd.template getProp(a) + dt*vd.template getProp(a); - vd.template getProp(a) = (rhonew < rho_zero)?rho_zero:rhonew; + vd.template getProp(a)[0] = 0.0; + vd.template getProp(a)[1] = 0.0; + vd.template getProp(a)[2] = 0.0; + real_number rhonew = vd.template getProp(a) + dt*vd.template getProp(a); + vd.template getProp(a) = (rhonew < rho_zero)?rho_zero:rhonew; - vd.template getProp(a) = rhop; + vd.template getProp(a) = rhop; - vd.template getProp(a) = 0; + vd.template getProp(a) = 0; return; } //-Calculate displacement and update position / Calcula desplazamiento y actualiza posicion. real_number dx = vd.template getProp(a)[0]*dt + vd.template getProp(a)[0]*dt205; - real_number dy = vd.template getProp(a)[1]*dt + vd.template getProp(a)[1]*dt205; - real_number dz = vd.template getProp(a)[2]*dt + vd.template getProp(a)[2]*dt205; + real_number dy = vd.template getProp(a)[1]*dt + vd.template getProp(a)[1]*dt205; + real_number dz = vd.template getProp(a)[2]*dt + vd.template getProp(a)[2]*dt205; - vd.getPos(a)[0] += dx; - vd.getPos(a)[1] += dy; - vd.getPos(a)[2] += dz; + vd.getPos(a)[0] += dx; + vd.getPos(a)[1] += dy; + vd.getPos(a)[2] += dz; - real_number velX = vd.template getProp(a)[0]; - real_number velY = vd.template getProp(a)[1]; - real_number velZ = vd.template getProp(a)[2]; - real_number rhop = vd.template getProp(a); + real_number velX = vd.template getProp(a)[0]; + real_number velY = vd.template getProp(a)[1]; + real_number velZ = vd.template getProp(a)[2]; + real_number rhop = vd.template getProp(a); vd.template getProp(a)[0] = vd.template getProp(a)[0] + vd.template getProp(a)[0]*dt; vd.template getProp(a)[1] = vd.template getProp(a)[1] + vd.template getProp(a)[1]*dt; - vd.template getProp(a)[2] = vd.template getProp(a)[2] + vd.template getProp(a)[2]*dt; - vd.template getProp(a) = vd.template getProp(a) + dt*vd.template getProp(a); + vd.template getProp(a)[2] = vd.template getProp(a)[2] + vd.template getProp(a)[2]*dt; + vd.template getProp(a) = vd.template getProp(a) + dt*vd.template getProp(a); - // Check if the particle go out of range in space and in density - if (vd.getPos(a)[0] < 0.0 || vd.getPos(a)[1] < 0.0 || vd.getPos(a)[2] < 0.0 || - vd.getPos(a)[0] > 1.61 || vd.getPos(a)[1] > 0.68 || vd.getPos(a)[2] > 0.50 || + // Check if the particle go out of range in space and in density + if (vd.getPos(a)[0] < 0.0 || vd.getPos(a)[1] < 0.0 || vd.getPos(a)[2] < 0.0 || + vd.getPos(a)[0] > 1.61 || vd.getPos(a)[1] > 0.68 || vd.getPos(a)[2] > 0.50 || vd.template getProp(a) < RhoMin || vd.template getProp(a) > RhoMax) - {vd.template getProp(a) = 1;} - else - {vd.template getProp(a) = 0;} - - vd.template getProp(a)[0] = velX; - vd.template getProp(a)[1] = velY; - vd.template getProp(a)[2] = velZ; - vd.template getProp(a) = rhop; + {vd.template getProp(a) = 1;} + else + {vd.template getProp(a) = 0;} + + vd.template getProp(a)[0] = velX; + vd.template getProp(a)[1] = velY; + vd.template getProp(a)[2] = velZ; + vd.template getProp(a) = rhop; } void euler_int(particles & vd, real_number dt) @@ -765,33 +764,36 @@ __global__ void sensor_pressure_gpu(vector_type vd, NN_type NN, Point<3,real_num // We calculate the pressure normalizing the // sum over all kernels if (tot_ker == 0.0) - {*press_tmp = 0.0;} + {*press_tmp = 0.0;} else - {*press_tmp = 1.0 / tot_ker * *press_tmp;} + {*press_tmp = 1.0 / tot_ker * *press_tmp;} } template inline void sensor_pressure(Vector & vd, - CellList & NN, - openfpm::vector> & press_t, - openfpm::vector> & probes) + CellList & NN, + openfpm::vector> & press_t, + openfpm::vector> & probes) { - Vcluster<> & v_cl = create_vcluster(); + Vcluster<> & v_cl = create_vcluster(); - press_t.add(); + press_t.add(); - for (size_t i = 0 ; i < probes.size() ; i++) - { - // A float variable to calculate the pressure of the problem - CudaMemory press_tmp_(sizeof(real_number)); - real_number press_tmp; + for (size_t i = 0 ; i < probes.size() ; i++) + { + // A float variable to calculate the pressure of the problem + CudaMemory press_tmp_(sizeof(real_number)); + real_number press_tmp; - // if the probe is inside the processor domain + // if the probe is inside the processor domain if (vd.getDecomposition().isLocal(probes.get(i)) == true) { - CUDA_LAUNCH_DIM3(sensor_pressure_gpu,1,1,vd.toKernel_sorted(),NN.toKernel(),probes.get(i),(real_number *)press_tmp_.toKernel()); + vd.template updateCellListGPU(NN); - vd.merge(NN); + Point<3,real_number> probe = probes.get(i); + CUDA_LAUNCH_DIM3(sensor_pressure_gpu,1,1,vd.toKernel(),NN.toKernel(),probe,(real_number *)press_tmp_.toKernel()); + + vd.template restoreOrder<>(NN); // move calculated pressure on press_tmp_.deviceToHost(); @@ -811,7 +813,7 @@ inline void sensor_pressure(Vector & vd, int main(int argc, char* argv[]) { - // initialize the library + // initialize the library openfpm_init(&argc,&argv); openfpm::vector_gpu> fluid_ids; @@ -825,18 +827,18 @@ int main(int argc, char* argv[]) openfpm::vector> press_t; openfpm::vector> probes; - probes.add({0.8779,0.3,0.02}); - probes.add({0.754,0.31,0.02}); + probes.add({0.8779f,0.3f,0.02f}); + probes.add({0.754f,0.31f,0.02f}); // Here we define our domain a 2D box with internals from 0 to 1.0 for x and y - Box<3,real_number> domain({-0.05,-0.05,-0.05},{1.7010,0.7065,0.511}); - size_t sz[3] = {413,179,133}; + Box<3,real_number> domain({-0.05f,-0.05f,-0.05f},{1.7010f,0.7065f,0.511f}); + size_t sz[3] = {207,90,66}; // Fill W_dap W_dap = 1.0/Wab(H/1.5); // Here we define the boundary conditions of our problem - size_t bc[3]={NON_PERIODIC,NON_PERIODIC,NON_PERIODIC}; + size_t bc[3]={NON_PERIODIC,NON_PERIODIC,NON_PERIODIC}; // extended boundary around the domain, and the processor domain Ghost<3,real_number> g(2*H); @@ -847,7 +849,7 @@ int main(int argc, char* argv[]) // You can ignore all these dp/2.0 is a trick to reach the same initialization // of Dual-SPH that use a different criteria to draw particles - Box<3,real_number> fluid_box({dp/2.0,dp/2.0,dp/2.0},{0.4+dp/2.0,0.67-dp/2.0,0.3+dp/2.0}); + Box<3,real_number> fluid_box({dp/2.0f,dp/2.0f,dp/2.0f},{0.4f+dp/2.0f,0.67f-dp/2.0f,0.3f+dp/2.0f}); // return an iterator to the fluid particles to add to vd auto fluid_it = DrawParticles::DrawBox(vd,sz,domain,fluid_box); @@ -896,12 +898,12 @@ int main(int argc, char* argv[]) } // Recipient - Box<3,real_number> recipient1({0.0,0.0,0.0},{1.6+dp/2.0,0.67+dp/2.0,0.4+dp/2.0}); - Box<3,real_number> recipient2({dp,dp,dp},{1.6-dp/2.0,0.67-dp/2.0,0.4+dp/2.0}); + Box<3,real_number> recipient1({0.0f,0.0f,0.0f},{1.6f+dp/2.0f,0.67f+dp/2.0f,0.4f+dp/2.0f}); + Box<3,real_number> recipient2({dp,dp,dp},{1.6f-dp/2.0f,0.67f-dp/2.0f,0.4f+dp/2.0f}); - Box<3,real_number> obstacle1({0.9,0.24-dp/2.0,0.0},{1.02+dp/2.0,0.36,0.45+dp/2.0}); - Box<3,real_number> obstacle2({0.9+dp,0.24+dp/2.0,0.0},{1.02-dp/2.0,0.36-dp,0.45-dp/2.0}); - Box<3,real_number> obstacle3({0.9+dp,0.24,0.0},{1.02,0.36,0.45}); + Box<3,real_number> obstacle1({0.9f,0.24f-dp/2.0f,0.0f},{1.02f+dp/2.0f,0.36f,0.45f+dp/2.0f}); + Box<3,real_number> obstacle2({0.9f+dp,0.24f+dp/2.0f,0.0f},{1.02f-dp/2.0f,0.36f-dp,0.45f-dp/2.0f}); + Box<3,real_number> obstacle3({0.9f+dp,0.24f,0.0f},{1.02f,0.36f,0.45f}); openfpm::vector> holes; holes.add(recipient2); @@ -973,8 +975,7 @@ int main(int argc, char* argv[]) vd.ghost_get(RUN_ON_DEVICE); - auto NN = vd.getCellListGPU/*>*/(2*H / 2.0, 2); - NN.setBoxNN(2); + auto NN = vd.getCellListGPU/*>*/(2*H / 2.0, CL_NON_SYMMETRIC | CL_GPU_REORDER, 2); timer tot_sim; tot_sim.start(); @@ -996,8 +997,8 @@ int main(int argc, char* argv[]) vd.map(RUN_ON_DEVICE); // Rebalancer for now work on CPU , so move to CPU - vd.deviceToHostPos(); - vd.template deviceToHostProp(); + vd.deviceToHostPos(); + vd.template deviceToHostProp(); it_reb = 0; ModelCustom md; @@ -1040,13 +1041,12 @@ int main(int argc, char* argv[]) t += dt; - if (write < t*10) + if (write < t*100) { // Sensor pressure require update ghost, so we ensure that particles are distributed correctly // and ghost are updated vd.map(RUN_ON_DEVICE); vd.ghost_get(RUN_ON_DEVICE); - vd.updateCellList(NN); // calculate the pressure at the sensor points //sensor_pressure(vd,NN,press_t,probes); @@ -1106,7 +1106,7 @@ int main(int argc, char* argv[]) int main(int argc, char* argv[]) { - return 0; + return 0; } #endif diff --git a/example/Vector/7_SPH_dlb_gpu_opt/main.cu b/example/Vector/7_SPH_dlb_gpu_opt/main.cu index cf565f05..92cf1114 100644 --- a/example/Vector/7_SPH_dlb_gpu_opt/main.cu +++ b/example/Vector/7_SPH_dlb_gpu_opt/main.cu @@ -44,9 +44,6 @@ #define OPENMPI //#define SE_CLASS1 -//#define USE_LOW_REGISTER_ITERATOR -#define SCAN_WITH_CUB //<------ In case you want to use CUB for scan operations -#define SORT_WITH_CUB //#define EXTERNAL_SET_GPU <----- In case you want to distribute the GPUs differently from the default #include "Vector/vector_dist.hpp" @@ -64,7 +61,7 @@ typedef float real_number; #define FLUID 1 // initial spacing between particles dp in the formulas -const real_number dp = 0.00425; +const real_number dp = 0.0085; // Maximum height of the fluid water // is going to be calculated and filled later on real_number h_swl = 0.0; @@ -76,7 +73,7 @@ const real_number coeff_sound = 20.0; const real_number gamma_ = 7.0; // sqrt(3.0*dp*dp) support of the kernel -const real_number H = 0.00736121593217; +const real_number H = 0.0147224318643; // Eta in the formulas const real_number Eta2 = 0.01 * H*H; @@ -90,10 +87,10 @@ const real_number visco = 0.1; real_number cbar = 0.0; // Mass of the fluid particles -const real_number MassFluid = 0.0000767656; +const real_number MassFluid = 0.000614125; // Mass of the boundary particles -const real_number MassBound = 0.0000767656; +const real_number MassBound = 0.000614125; // @@ -303,9 +300,7 @@ inline __device__ __host__ real_number Pi(const Point<3,real_number> & dr, real_ template __global__ void calc_forces_gpu(particles_type vd, NN_type NN, real_number W_dap, real_number cbar) { - // ... a unsigned int a; - GET_PARTICLE_SORT(a,NN); real_number max_visc = 0.0f; @@ -316,9 +311,6 @@ __global__ void calc_forces_gpu(particles_type vd, NN_type NN, real_number W_dap // Type of the particle unsigned int typea = vd.template getProp(a); - // Take the mass of the particle dependently if it is FLUID or BOUNDARY - //real_number massa = (typea == FLUID)?MassFluid:MassBound; - // Get the density of the of the particle a real_number rhoa = vd.template getProp(a); @@ -349,16 +341,15 @@ __global__ void calc_forces_gpu(particles_type vd, NN_type NN, real_number W_dap // if (p == q) skip this particle this condition should be done in the r^2 = 0 if (a == b) {++Np; continue;}; - unsigned int typeb = vd.template getProp(b); + unsigned int typeb = vd.template getProp(b); - real_number massb = (typeb == FLUID)?MassFluid:MassBound; - Point<3,real_number> vb = vd.template getProp(b); - real_number Pb = vd.template getProp(b); - real_number rhob = vd.template getProp(b); + real_number massb = (typeb == FLUID)?MassFluid:MassBound; + Point<3,real_number> vb = vd.template getProp(b); + real_number Pb = vd.template getProp(b); + real_number rhob = vd.template getProp(b); // Get the distance between p and q Point<3,real_number> dr = xa - xb; - Point<3,real_number> v_rel = va - vb; // take the norm of this vector real_number r2 = norm2(dr); @@ -367,6 +358,8 @@ __global__ void calc_forces_gpu(particles_type vd, NN_type NN, real_number W_dap { real_number r = sqrtf(r2); + Point<3,real_number> v_rel = va - vb; + Point<3,real_number> DW; DWab(dr,DW,r); @@ -401,11 +394,11 @@ template inline void calc_forces(particles & vd, CellList & N auto part = vd.getDomainIteratorGPU(96); // Update the cell-list - vd.updateCellList(NN); + vd.updateCellListGPU(NN); - CUDA_LAUNCH(calc_forces_gpu,part,vd.toKernel_sorted(),NN.toKernel(),W_dap,cbar); + CUDA_LAUNCH(calc_forces_gpu,part,vd.toKernel(),NN.toKernel(),W_dap,cbar); - vd.merge_sort(NN); + vd.restoreOrder(NN); max_visc = reduce_local(vd); } @@ -505,7 +498,7 @@ __global__ void verlet_int_gpu(vector_dist_type vd, real_number dt, real_number vd.template getProp(a)[2] = vd.template getProp(a)[2] + vd.template getProp(a)[2]*dt2; vd.template getProp(a) = vd.template getProp(a) + dt2*vd.template getProp(a); - // Check if the particle go out of range in space and in density + // Check if the particle go out of range in space and in density, if they do mark them to remove it later if (vd.getPos(a)[0] < 0.0 || vd.getPos(a)[1] < 0.0 || vd.getPos(a)[2] < 0.0 || vd.getPos(a)[0] > 1.61 || vd.getPos(a)[1] > 0.68 || vd.getPos(a)[2] > 0.50 || vd.template getProp(a) < RhoMin || vd.template getProp(a) > RhoMax) @@ -565,7 +558,7 @@ __global__ void euler_int_gpu(vector_type vd,real_number dt, real_number dt205) return; } - //-Calculate displacement and update position / Calcula desplazamiento y actualiza posicion. + //-Calculate displacement and update position real_number dx = vd.template getProp(a)[0]*dt + vd.template getProp(a)[0]*dt205; real_number dy = vd.template getProp(a)[1]*dt + vd.template getProp(a)[1]*dt205; real_number dz = vd.template getProp(a)[2]*dt + vd.template getProp(a)[2]*dt205; @@ -682,9 +675,12 @@ inline void sensor_pressure(Vector & vd, // if the probe is inside the processor domain if (vd.getDecomposition().isLocal(probes.get(i)) == true) { - CUDA_LAUNCH_DIM3(sensor_pressure_gpu,1,1,vd.toKernel_sorted(),NN.toKernel(),probes.get(i),(real_number *)press_tmp_.toKernel()); + vd.template updateCellListGPU(NN); + + Point<3,real_number> probe = probes.get(i); + CUDA_LAUNCH_DIM3(sensor_pressure_gpu,1,1,vd.toKernel(),NN.toKernel(),probe,(real_number *)press_tmp_.toKernel()); - vd.merge(NN); + vd.template restoreOrder<>(NN); // move calculated pressure on press_tmp_.deviceToHost(); @@ -734,7 +730,7 @@ int main(int argc, char* argv[]) // Here we define our domain a 2D box with internals from 0 to 1.0 for x and y Box<3,real_number> domain({-0.05f,-0.05f,-0.05f},{1.7010f,0.7065f,0.511f}); - size_t sz[3] = {413,179,133}; + size_t sz[3] = {207,90,66}; // Fill W_dap W_dap = 1.0/Wab(H/1.5); @@ -876,8 +872,7 @@ int main(int argc, char* argv[]) vd.ghost_get(RUN_ON_DEVICE); - auto NN = vd.getCellListGPU/*>*/(2*H / 2.0, 2); - //NN.setBoxNN(2); + auto NN = vd.getCellListGPU(2*H / 2.0, CL_NON_SYMMETRIC | CL_GPU_REORDER_POSITION | CL_GPU_REORDER_PROPERTY | CL_GPU_RESTORE_PROPERTY, 2); timer tot_sim; tot_sim.start(); @@ -899,8 +894,8 @@ int main(int argc, char* argv[]) vd.map(RUN_ON_DEVICE); // Rebalancer for now work on CPU , so move to CPU - vd.deviceToHostPos(); - vd.template deviceToHostProp(); + vd.deviceToHostPos(); + vd.template deviceToHostProp(); it_reb = 0; ModelCustom md; @@ -913,10 +908,6 @@ int main(int argc, char* argv[]) vd.map(RUN_ON_DEVICE); - // it sort the vector (doesn not seem to produce some advantage) - // note force calculation is anyway sorted calculation - //vd.make_sort(NN); - // Calculate pressure from the density EqState(vd); @@ -947,13 +938,12 @@ int main(int argc, char* argv[]) t += dt; - if (write < t*10) + if (write < t*100) { // Sensor pressure require update ghost, so we ensure that particles are distributed correctly // and ghost are updated vd.map(RUN_ON_DEVICE); vd.ghost_get(RUN_ON_DEVICE); - vd.updateCellList(NN); // calculate the pressure at the sensor points //sensor_pressure(vd,NN,press_t,probes); diff --git a/example/Vector/7_SPH_dlb_opt/main.cpp b/example/Vector/7_SPH_dlb_opt/main.cpp index 77e39c05..4872e4b3 100644 --- a/example/Vector/7_SPH_dlb_opt/main.cpp +++ b/example/Vector/7_SPH_dlb_opt/main.cpp @@ -1073,7 +1073,7 @@ int main(int argc, char* argv[]) vd.ghost_get(); /*! \cond [update_verlet_crs] \endcond */ - vd.updateVerlet(NN,r_gskin,VL_CRS_SYMMETRIC); + vd.updateVerlet(NN,r_gskin); /*! \cond [update_verlet_crs] \endcond */ tot_disp = 0.0; diff --git a/example/Vector/7_SPH_dlb_opt/main_dbg.cpp b/example/Vector/7_SPH_dlb_opt/main_dbg.cpp index c5105744..545341a2 100644 --- a/example/Vector/7_SPH_dlb_opt/main_dbg.cpp +++ b/example/Vector/7_SPH_dlb_opt/main_dbg.cpp @@ -415,7 +415,7 @@ template inline double calc_forces(particles & vd, VerletLi ////////////////////// NN2 // Get an iterator over the neighborhood particles of p - auto Np2 = NN2.getNNIterator(NN2.getCell(vd.getPos(a))); + auto Np2 = NN2.getNNIteratorBox(NN2.getCell(vd.getPos(a))); // For each neighborhood particle while (Np2.isNext() == true) @@ -560,7 +560,7 @@ template inline double calc_forces(particles & vd, VerletLi ////////////////////// NN2 // Get an iterator over the neighborhood particles of p - auto Np2 = NN2.getNNIterator(NN2.getCell(vd.getPos(a))); + auto Np2 = NN2.getNNIteratorBox(NN2.getCell(vd.getPos(a))); if (a == 0 && create_vcluster().getProcessUnitID() == 0) { @@ -1148,7 +1148,7 @@ int main(int argc, char* argv[]) vd.ghost_get(); - //vd.updateVerlet(NN,r_gskin,VL_CRS_SYMMETRIC); + //vd.updateVerlet(NN,r_gskin); NN = vd.getVerletCrs(r_gskin); vd.write("Debug"); diff --git a/example/Vector/8_DEM/main.cpp b/example/Vector/8_DEM/main.cpp index 04227b07..9933aa67 100644 --- a/example/Vector/8_DEM/main.cpp +++ b/example/Vector/8_DEM/main.cpp @@ -347,7 +347,7 @@ int main(int argc, char* argv[]) size_t cnt = 0; size_t cnt_reb = 0; - auto nlist = parts.getVerlet(cutoff + skin); + auto nlist = parts.getVerlet>(cutoff + skin); double tot_sp = 0.0; diff --git a/example/Vector/9_gpu_cuda_interop/main.cu b/example/Vector/9_gpu_cuda_interop/main.cu index b8e3a0b0..abdce17f 100644 --- a/example/Vector/9_gpu_cuda_interop/main.cu +++ b/example/Vector/9_gpu_cuda_interop/main.cu @@ -124,10 +124,11 @@ Second particle property 2 component 11, address: 0x7f8d83400604 #ifdef __NVCC__ #include "Vector/vector_dist.hpp" +#include "util/cuda_util.hpp" //! [print_data_kernel] -__global__ void print_data_particle_50(float * scalar, float * vector, float * tensor, int capacity) +__global__ void print_data_particle_50(float * scalar, float * vector, float * tensor, size_t capacity) { int p = threadIdx.x + blockIdx.x * blockDim.x; diff --git a/example/common.mk b/example/common.mk index ae5dc686..f6722e70 100644 --- a/example/common.mk +++ b/example/common.mk @@ -8,7 +8,7 @@ CUDA_CC_LINK= ifdef HIP CUDA_CC=hipcc - CUDA_OPTIONS=-O3 --std=c++14 -D__NVCC__ -D__HIP__ -DCUDART_VERSION=11000 -D__CUDACC__ -D__CUDACC_VER_MAJOR__=11 -D__CUDACC_VER_MINOR__=0 -D__CUDACC_VER_BUILD__=0 + CUDA_OPTIONS=-O3 --std=c++17 -D__NVCC__ -D__HIP__ -DCUDART_VERSION=11000 -D__CUDACC__ -D__CUDACC_VER_MAJOR__=11 -D__CUDACC_VER_MINOR__=0 -D__CUDACC_VER_BUILD__=0 LIBS_SELECT=$(LIBS) CC=hipcc CUDA_CC_LINK=hipcc @@ -18,7 +18,7 @@ else CUDA_CC=mpic++ -x c++ INCLUDE_PATH_NVCC=$(INCLUDE_PATH) CUDA_CC_LINK=mpic++ - CUDA_OPTIONS=-O3 --std=c++14 -DTEST_RUN + CUDA_OPTIONS=-O3 --std=c++17 -DTEST_RUN LIBS_SELECT=$(LIBS) -lboost_context else ifeq (, $(shell which nvcc)) @@ -31,7 +31,7 @@ else INCLUDE_PATH_NVCC:=-Xcompiler=-Wno-deprecated-declarations $(INCLUDE_PATH_NVCC) CUDA_CC=nvcc -ccbin=mpic++ CUDA_CC_LINK=nvcc -ccbin=mpic++ - CUDA_OPTIONS=-O3 --std=c++14 -DTEST_RUN -use_fast_math -arch=sm_61 -lineinfo --extended-lambda --expt-relaxed-constexpr + CUDA_OPTIONS=-O3 --std=c++17 -DTEST_RUN -use_fast_math -arch=sm_61 -lineinfo --extended-lambda --expt-relaxed-constexpr LIBS_SELECT=$(LIBS_NVCC) endif endif @@ -46,7 +46,7 @@ else CUDA_CC_LINK:=$(CUDA_CC_LINK) endif -OPT=-g -O3 --std=c++14 -DTEST_RUN +OPT=-g -O3 --std=c++17 -DTEST_RUN INCLUDE_PATH:=-Wno-deprecated-declarations $(INCLUDE_PATH) INCLUDE_PATH_NVCC:=$(INCLUDE_PATH_NVCC) -I./include diff --git a/openfpm_data b/openfpm_data index 57c56d6e..cc219d5d 160000 --- a/openfpm_data +++ b/openfpm_data @@ -1 +1 @@ -Subproject commit 57c56d6ecb76b2407b797d3fb323648c1b1199c9 +Subproject commit cc219d5debc37dc29b0f5e2e054848e63e9b06aa diff --git a/openfpm_devices b/openfpm_devices index 47c09a15..5234c53d 160000 --- a/openfpm_devices +++ b/openfpm_devices @@ -1 +1 @@ -Subproject commit 47c09a1505df443fb5850695f4fa25b57530680f +Subproject commit 5234c53d59bffd4b1ec34f7c6832be97b2923b23 diff --git a/openfpm_io b/openfpm_io index ffd00931..65af8658 160000 --- a/openfpm_io +++ b/openfpm_io @@ -1 +1 @@ -Subproject commit ffd00931bd0c8133cfefd86697a48063cbc95907 +Subproject commit 65af8658a778c99429f769e7bc1c830fc32e5f25 diff --git a/openfpm_numerics b/openfpm_numerics index 3604644b..1f1619c3 160000 --- a/openfpm_numerics +++ b/openfpm_numerics @@ -1 +1 @@ -Subproject commit 3604644ba879a566d0fa5e2a96e66c861c6e7ed5 +Subproject commit 1f1619c31738021bbfe4f0d2db092f01568a2889 diff --git a/openfpm_pdata b/openfpm_pdata index 995d66dd..71b5c454 160000 --- a/openfpm_pdata +++ b/openfpm_pdata @@ -1 +1 @@ -Subproject commit 995d66dd3271b5e3238b66e4dbcac4befb2e3d94 +Subproject commit 71b5c45455a262b71f82f880d803f9d4620e52a2 diff --git a/openfpm_vcluster b/openfpm_vcluster index de9bc6a7..d85989d8 160000 --- a/openfpm_vcluster +++ b/openfpm_vcluster @@ -1 +1 @@ -Subproject commit de9bc6a7616451a00ccf0745e61039e48984bb0a +Subproject commit d85989d87a6509e50eed2756258d95b590932ede