diff --git a/CHANGELOG.md b/CHANGELOG.md
index 75af5387..161118d8 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,6 +1,22 @@
 # Change Log
 All notable changes to this project will be documented in this file.
 
+## OpenFPM 5.1.0 - Jun 2024
+- Refactor implementations of cell list and Verlet list `CellList`, `CellList_gpu`,`VerletList` and all neighborhood iterators. Move from keeping two sets (unordered and ordered) of positions/property vectors to reordering explicitly before launching CUDA kernels that utilize this feature
+
+### Added
+- Add adaptive cut-off radius Verlet list on CPU. Particles individual radii are passed as a vector to `getVerletAdaptRCut` of `openfpm_pdata`. Time complexity: O(N^2)
+
+### Changes
+- `cuda_launch` is a function instead of a macro
+- Move project c++ standard to c++17
+- Add bit-wise option flags that control lell list behaviour. Pass as a parameter `opt` to `getCellList` of `openfpm_pdata` or set directly on cell list via `setOpt`
+- Add support for the following cell list bit-wise option flags: `CL_SYMMETRIC`, `CL_NON_SYMMETRIC`, `CL_LOCAL_SYMMETRIC`, `CL_LINEAR_CELL_KEYS`, `CL_HILBERT_CELL_KEYS`, `CL_GPU_REORDER`, where for the last one `CL_GPU_REORDER` control options for reordering/restoring operations could be more fine-tuned: `CL_GPU_REORDER_POSITION`, `CL_GPU_REORDER_PROPERTY`, `CL_GPU_RESTORE_POSITION`, `CL_GPU_RESTORE_PROPERTY`
+- Add bit-wise option flags that control Verlet list behaviour. Pass as a template parameter `opt` in `VerletList` or pass Verlet list type with `opt` set to `getVerlet` of `openfpm_pdata`. The difference in how `opt` is set for cell list and Verlet list is due to legacy code
+- Add support for the following Verlet list bit-wise option flags: `VL_NON_SYMMETRIC`, `VL_SYMMETRIC`, `VL_CRS_SYMMETRIC`, `VL_ADAPTIVE_RCUT`, `VL_NMAX_NEIGHBOR`, `VL_SKIP_REF_PART`
+- `getCellListGPU` of `openfpm_pdata` doesn't fill a cell list with particle locations. Additionally, `updateCellListGPU` has to be called to perform this operation. The reason is to remove two identical fill operations in a row in simulations
+- To reorder `vector_dist` for GPU coalesced memory access dictated by the cell list structure, cell list on GPU has to be constructed with one of the following flags enabled: `CL_GPU_REORDER`, `CL_GPU_REORDER_POSITION`, `CL_GPU_REORDER_PROPERTY`, `CL_GPU_RESTORE_POSITION`, `CL_GPU_RESTORE_PROPERTY`. Properties to be copied to reordered copies of position/property vectors have to be passed to `updateCellListGPU<PROPERTIES>(CELL LIST)` of `vector_dist`. Properties that undergone changes in reordered copies and have to be restored to the original position/property vectors have to be passed to `restoreOrder<PROPERTIES>(CELL LIST)` of `vector_dist`
+
 ## OpenFPM 5.0.0 - Feb 2024
 - Move to `openfpm` meta OpenFPM project structure with git subprojects `openfpm_data`, `openfpm_devices`, `openfpm_io`, `openfpm_vcluster`, `openfpm_pdata`, `openfpm_numerics`. Example codes, installation scripts for dependencies, configuration scripts moved to `openfpm`. Only source code of `openfpm_pdata` kept in `openfpm_pdata`
 
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 17c54ec5..2118f670 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -131,6 +131,9 @@ if(APPLE)
 endif()
 
 if (Boost_FOUND)
+	# ignore BOOST deprecated headers
+	add_definitions("-DBOOST_ALLOW_DEPRECATED_HEADERS")
+
 	if ((CUDA_ON_BACKEND STREQUAL "SEQUENTIAL" OR CUDA_ON_BACKEND STREQUAL "OpenMP") AND NOT Boost_CONTEXT_FOUND)
 		message( FATAL_ERROR "BOOST is invalid reinstalling" )
 	endif()
@@ -163,6 +166,7 @@ add_subdirectory(openfpm_pdata)
 
 if (ENABLE_NUMERICS)
 	add_subdirectory(openfpm_numerics)
+	set(DEFINE_ENABLE_NUMERICS "#define ENABLE_NUMERICS")
 endif()
 
 configure_file(${CMAKE_CURRENT_SOURCE_DIR}/config/config_cmake.h.in ${PROJECT_BINARY_DIR}/config/config.h)
diff --git a/config/config_cmake.h.in b/config/config_cmake.h.in
index b896e8b5..2d69d31c 100644
--- a/config/config_cmake.h.in
+++ b/config/config_cmake.h.in
@@ -195,6 +195,9 @@ ${DEFINE_VCLUSTER_GARBAGE_INJECTOR}
 /* Test coverage mode */
 ${DEFINE_TEST_COVERAGE_MODE}
 
+/* Enable numerics module */
+${DEFINE_ENABLE_NUMERICS}
+
 /* when an error accur continue but avoid unsafe operation */
 /* #undef THROW_ON_ERROR */
 
diff --git a/example/Numerics/PSE/0_Derivative_approx_1D/main.cpp b/example/Numerics/PSE/0_Derivative_approx_1D/main.cpp
index 955a7719..044147ff 100644
--- a/example/Numerics/PSE/0_Derivative_approx_1D/main.cpp
+++ b/example/Numerics/PSE/0_Derivative_approx_1D/main.cpp
@@ -210,9 +210,7 @@ int main(int argc, char* argv[])
 	//
 
     // get and construct the Cell list
-
-	Ghost<1,double> gp(enlarge);
-    auto cl = vd.getCellList(12*eps,gp);
+	auto cl = vd.getCellList(12*eps, CL_NON_SYMMETRIC, false, enlarge);
 
     // Maximum infinity norm
     double linf = 0.0;
@@ -251,7 +249,7 @@ int main(int argc, char* argv[])
     	double prp_x = vd.template getProp<0>(key);
 
     	// Get the neighborhood of the particles
-    	auto NN = cl.getNNIterator(cl.getCell(p));
+	auto NN = cl.getNNIteratorBox(cl.getCell(p));
     	while(NN.isNext())
     	{
     		auto nnp = NN.get();
diff --git a/example/Numerics/PSE/1_Derivative_approx_1D_mp/main_float128.cpp b/example/Numerics/PSE/1_Derivative_approx_1D_mp/main_float128.cpp
index cd7629f1..0c72ed0f 100644
--- a/example/Numerics/PSE/1_Derivative_approx_1D_mp/main_float128.cpp
+++ b/example/Numerics/PSE/1_Derivative_approx_1D_mp/main_float128.cpp
@@ -218,9 +218,8 @@ int main(int argc, char* argv[])
 	//
 
     // get and construct the Cell list
+    auto cl = vd.getCellList(12*eps, CL_NON_SYMMETRIC, false, enlarge);
 
-	Ghost<1,float128> gp(enlarge);
-    auto cl = vd.getCellList(12*eps,gp);
 
     // Maximum infinity norm
     double linf = 0.0;
@@ -260,7 +259,7 @@ int main(int argc, char* argv[])
     	float128 prp_x = vd.template getProp<0>(key);
 
     	// Get the neighborhood of the particles
-    	auto NN = cl.getNNIterator(cl.getCell(p));
+	auto NN = cl.getNNIteratorBox(cl.getCell(p));
     	while(NN.isNext())
     	{
     		auto nnp = NN.get();
diff --git a/example/Numerics/PSE/1_Diffusion_1D/main.cpp b/example/Numerics/PSE/1_Diffusion_1D/main.cpp
index 7185298e..97873abd 100644
--- a/example/Numerics/PSE/1_Diffusion_1D/main.cpp
+++ b/example/Numerics/PSE/1_Diffusion_1D/main.cpp
@@ -69,7 +69,7 @@ template<typename CellL> double calcLap(Point<1,double> p, vect_dist_key_dx key,
 	double prp_x = vd.template getProp<0>(key);
 
 	// Get the neighborhood of the particles
-	auto NN = cl.getNNIterator(cl.getCell(p));
+	auto NN = cl.getNNIteratorBox(cl.getCell(p));
 	while(NN.isNext())
 	{
 		auto nnp = NN.get();
@@ -339,13 +339,11 @@ int main(int argc, char* argv[])
 	// keeping ghost size and padding area unrelated give us the possibility to show how to create a CellList
 	// on a area bigger than the domain + ghost
 	
-	Ghost<1,double> gp(enlarge);
-        
-        // Create a Cell list with Cell spaping 8*epsilon, the CellList is created on a domain+ghost+enlarge space
-        auto cl = vd.getCellList(8*eps,gp);
+	// Create a Cell list with Cell spaping 8*epsilon, the CellList is created on a domain+ghost+enlarge space
+	auto cl = vd.getCellList(8*eps, CL_NON_SYMMETRIC, false, enlarge);
 
-        // Maximum infinity norm
-        double linf = 0.0;
+	// Maximum infinity norm
+	double linf = 0.0;
 
 	//
 	// ### WIKI 13 ###
diff --git a/example/SparseGrid/8_filling_benchmark_gpu/main.cu b/example/SparseGrid/8_filling_benchmark_gpu/main.cu
index 644a4cc1..a8456118 100644
--- a/example/SparseGrid/8_filling_benchmark_gpu/main.cu
+++ b/example/SparseGrid/8_filling_benchmark_gpu/main.cu
@@ -2,7 +2,7 @@
 #define SYNC_BEFORE_TAKE_TIME
 #define ENABLE_GRID_DIST_ID_PERF_STATS
 #include "Decomposition/Distribution/BoxDistribution.hpp"
-#include "util/cuda_launch.hpp"
+#include "util/cuda_util.hpp"
 #include "Grid/grid_dist_id.hpp"
 #include "data_type/aggregate.hpp"
 #include "timer.hpp"
diff --git a/example/Vector/10_level_set/main.cpp b/example/Vector/10_level_set/main.cpp
index 9860bb30..cc3b6dc6 100644
--- a/example/Vector/10_level_set/main.cpp
+++ b/example/Vector/10_level_set/main.cpp
@@ -63,7 +63,11 @@ const double rdist_cutoff_factor = 2.8;
 
 // dimensions of spatial and temporal domain
 const double l = 2.44;
+#ifdef TEST_RUN
+const double t_end = 0.1;
+#else
 const double t_end = 1.0;
+#endif
 // total mass in the domain to compute individual particle masses from
 const double M = l*l*rho_0;
 // number of particles in total
@@ -89,7 +93,7 @@ template <typename CellList> inline void density_summation(particles & vd, CellL
         // intitialize sum that yields 1/(particle volume)
 	double V_inv = 0.0;
 
-        auto Np = NN.getNNIterator(NN.getCell(vd.getPos(a)));
+        auto Np = NN.getNNIteratorBox(NN.getCell(vd.getPos(a)));
 	// iterate over particles b (neighboring particles)
         while (Np.isNext() == true)
         {
@@ -171,7 +175,7 @@ template<typename CellList> inline void calc_forces(particles & vd, CellList & N
         Point<2, double> va = vd.getProp<vel>(a);
 
         // Get an iterator over the neighborhood particles of p
-        auto Np = NN.getNNIterator(NN.getCell(vd.getPos(a)));
+        auto Np = NN.getNNIteratorBox(NN.getCell(vd.getPos(a)));
 
         // For each neighborhood particle b
         while (Np.isNext() == true)
diff --git a/example/Vector/1_celllist/main.cpp b/example/Vector/1_celllist/main.cpp
index 4df03e5f..09dec2bb 100644
--- a/example/Vector/1_celllist/main.cpp
+++ b/example/Vector/1_celllist/main.cpp
@@ -247,7 +247,7 @@ int main(int argc, char* argv[])
 		Point<3,float> xp = vd.getPos(p);
 
 		// Get an iterator of all the particles neighborhood of p
-		auto Np = NN.getNNIterator(NN.getCell(vd.getPos(p)));
+		auto Np = NN.getNNIteratorBox(NN.getCell(vd.getPos(p)));
 
 		// For each particle near p
 		while (Np.isNext())
@@ -392,8 +392,8 @@ int main(int argc, char* argv[])
 	//! \cond [cell_list_types] \endcond
 
 	// Get cell list
-	auto NN4 = vd.getCellList<CELL_MEMBAL(3,float)>(r_cut);
-	auto NN5 = vd.getCellList<CELL_MEMMW(3,float)>(r_cut);
+	auto NN4 = vd.getCellList<CELL_MEMBAL<3,float>>(r_cut);
+	auto NN5 = vd.getCellList<CELL_MEMMW<3,float>>(r_cut);
 
 	//! \cond [cell_list_types] \endcond
 
diff --git a/example/Vector/1_gpu_first_step/main.cu b/example/Vector/1_gpu_first_step/main.cu
index 30a69fcd..28d514e2 100644
--- a/example/Vector/1_gpu_first_step/main.cu
+++ b/example/Vector/1_gpu_first_step/main.cu
@@ -127,7 +127,6 @@
 #define OPENMPI
 //! \cond [using_openmpi] \endcond
 
-#define SCAN_WITH_CUB <------ MODERNGPU is broken on RTX use CUB library for scan
 //#define EXTERNAL_SET_GPU <----- In case you want to distribute the GPUs differently from the default
 
 #include "Vector/vector_dist.hpp"
diff --git a/example/Vector/3_molecular_dynamic/main.cpp b/example/Vector/3_molecular_dynamic/main.cpp
index eaab13e3..c70d3b43 100644
--- a/example/Vector/3_molecular_dynamic/main.cpp
+++ b/example/Vector/3_molecular_dynamic/main.cpp
@@ -115,7 +115,7 @@ template<typename CellList> void calc_forces(vector_dist<3,double, aggregate<dou
 		vd.template getProp<force>(p)[2] = 0.0;
 
 		// Get an iterator over the neighborhood particles of p
-		auto Np = NN.getNNIterator(NN.getCell(vd.getPos(p)));
+		auto Np = NN.getNNIteratorBox(NN.getCell(vd.getPos(p)));
 
 		// For each neighborhood particle ...
 		while (Np.isNext())
@@ -232,7 +232,7 @@ template<typename CellList> double calc_energy(vector_dist<3,double, aggregate<d
 		Point<3,double> xp = vd.getPos(p);
 
 		// Get an iterator over the neighborhood of the particle p
-		auto Np = NN.getNNIterator(NN.getCell(vd.getPos(p)));
+		auto Np = NN.getNNIteratorBox(NN.getCell(vd.getPos(p)));
 
 		// For each neighborhood of the particle p
 		while (Np.isNext())
@@ -438,7 +438,7 @@ int main(int argc, char* argv[])
 	//! \cond [md steps] \endcond
 
 	// Get the Cell list structure
-	auto NN = vd.getCellList<CELL_MEMBAL(3,double)>(r_cut);
+	auto NN = vd.getCellList<CELL_MEMBAL<3,double>>(r_cut);
 
 	// The standard
 	// auto NN = vd.getCellList(r_cut);
diff --git a/example/Vector/3_molecular_dynamic/main_expr_paper.cpp b/example/Vector/3_molecular_dynamic/main_expr_paper.cpp
index e166299a..cc916483 100644
--- a/example/Vector/3_molecular_dynamic/main_expr_paper.cpp
+++ b/example/Vector/3_molecular_dynamic/main_expr_paper.cpp
@@ -94,7 +94,7 @@ int main(int argc, char* argv[])
 		particles.ghost_get<>();
 
 		// Calculate the force at t + dt
-		particles.updateCellListSym(NN);
+		particles.updateCellList(NN);
 		force = applyKernel_in_sim(particles,NN,lennard_jones);
 
 		// 2-step Verlet velocity
diff --git a/example/Vector/3_molecular_dynamic/main_vl.cpp b/example/Vector/3_molecular_dynamic/main_vl.cpp
index 78ab671c..7ad485c4 100644
--- a/example/Vector/3_molecular_dynamic/main_vl.cpp
+++ b/example/Vector/3_molecular_dynamic/main_vl.cpp
@@ -62,7 +62,7 @@ constexpr int force = 1;
 
 //! \cond [arg diff] \endcond
 
-void calc_forces(vector_dist<3,double, aggregate<double[3],double[3]> > & vd, VerletList<3, double, Mem_fast<>, shift<3, double> > & NN, double sigma12, double sigma6, double r_cut)
+void calc_forces(vector_dist<3,double, aggregate<double[3],double[3]> > & vd, VerletList<3, double, VL_NON_SYMMETRIC, Mem_fast<>, shift<3, double> > & NN, double sigma12, double sigma6, double r_cut)
 {
 	//! \cond [arg diff] \endcond
 
@@ -150,7 +150,7 @@ void calc_forces(vector_dist<3,double, aggregate<double[3],double[3]> > & vd, Ve
 
 //! \cond [calc energy vl] \endcond
 
-double calc_energy(vector_dist<3,double, aggregate<double[3],double[3]> > & vd, VerletList<3, double, Mem_fast<>, shift<3, double> > & NN, double sigma12, double sigma6, double r_cut)
+double calc_energy(vector_dist<3,double, aggregate<double[3],double[3]> > & vd, VerletList<3, double, VL_NON_SYMMETRIC, Mem_fast<>, shift<3, double> > & NN, double sigma12, double sigma6, double r_cut)
 {
 	double E = 0.0;
 
diff --git a/example/Vector/3_molecular_dynamic_gpu/main.cu b/example/Vector/3_molecular_dynamic_gpu/main.cu
index 174ee030..395fbf84 100644
--- a/example/Vector/3_molecular_dynamic_gpu/main.cu
+++ b/example/Vector/3_molecular_dynamic_gpu/main.cu
@@ -1,5 +1,3 @@
-#define SCAN_WITH_CUB
-
 /*!
  * \page Vector_3_md_dyn_gpu Vector 3 molecular dynamic on GPU
  *
@@ -111,7 +109,7 @@ __global__ void calc_force_gpu(vector_dist_type vd, NN_type NN, real_number sigm
 
 
 	// Get an iterator over the neighborhood particles of p
-	auto Np = NN.getNNIterator(NN.getCell(vd.getPos(p)));
+	auto Np = NN.getNNIteratorBox(NN.getCell(vd.getPos(p)));
 
 	// For each neighborhood particle ...
 	while (Np.isNext())
@@ -189,7 +187,7 @@ __global__ void particle_energy(vector_dist_type vd, NN_type NN, real_number sig
 	Point<3,real_number> xp = vd.getPos(p);
 
 	// Get an iterator over the neighborhood of the particle p
-	auto Np = NN.getNNIterator(NN.getCell(vd.getPos(p)));
+	auto Np = NN.getNNIteratorBox(NN.getCell(vd.getPos(p)));
 
 	real_number E = 0;
 
@@ -230,7 +228,7 @@ __global__ void particle_energy(vector_dist_type vd, NN_type NN, real_number sig
 
 template<typename CellList> void calc_forces(vector_dist_gpu<3,real_number, aggregate<real_number[3],real_number[3],real_number> > & vd, CellList & NN, real_number sigma12, real_number sigma6, real_number r_cut2)
 {
-	vd.updateCellList(NN);
+	vd.updateCellListGPU(NN);
 
 	// Get an iterator over particles
 	auto it2 = vd.getDomainIteratorGPU();
@@ -245,7 +243,7 @@ template<typename CellList> real_number calc_energy(vector_dist_gpu<3,real_numbe
 	real_number rc = r_cut2;
 	real_number shift = 2.0 * ( sigma12 / (rc*rc*rc*rc*rc*rc) - sigma6 / ( rc*rc*rc) );
 
-	vd.updateCellList(NN);
+	vd.updateCellListGPU(NN);
 
 	auto it2 = vd.getDomainIteratorGPU();
 
@@ -328,7 +326,7 @@ int main(int argc, char* argv[])
 	//! \cond [md steps] \endcond
 
 	// Get the Cell list structure
-	auto NN = vd.getCellListGPU(r_cut, 2);
+	auto NN = vd.getCellListGPU(r_cut, CL_NON_SYMMETRIC, 2);
 
 	// The standard
 	// auto NN = vd.getCellList(r_cut);
diff --git a/example/Vector/3_molecular_dynamic_gpu_opt/main_cpu.cpp b/example/Vector/3_molecular_dynamic_gpu_opt/main_cpu.cpp
index a31730c1..b87afff3 100644
--- a/example/Vector/3_molecular_dynamic_gpu_opt/main_cpu.cpp
+++ b/example/Vector/3_molecular_dynamic_gpu_opt/main_cpu.cpp
@@ -37,7 +37,7 @@ template<typename CellList> void calc_forces(vector_dist<3,real_number, aggregat
 		vd.template getProp<force>(p)[2] = 0.0;
 
 		// Get an iterator over the neighborhood particles of p
-		auto Np = NN.getNNIterator(NN.getCell(vd.getPos(p)));
+		auto Np = NN.getNNIteratorBox(NN.getCell(vd.getPos(p)));
 
 		// For each neighborhood particle ...
 		while (Np.isNext())
@@ -97,7 +97,7 @@ template<typename CellList> real_number calc_energy(vector_dist<3,real_number, a
 		Point<3,real_number> xp = vd.getPos(p);
 
 		// Get an iterator over the neighborhood of the particle p
-		auto Np = NN.getNNIterator(NN.getCell(vd.getPos(p)));
+		auto Np = NN.getNNIteratorBox(NN.getCell(vd.getPos(p)));
 
 		// For each neighborhood of the particle p
 		while (Np.isNext())
diff --git a/example/Vector/3_molecular_dynamic_gpu_opt/main_cpu_best.cpp b/example/Vector/3_molecular_dynamic_gpu_opt/main_cpu_best.cpp
index ea250bb4..236e31f7 100644
--- a/example/Vector/3_molecular_dynamic_gpu_opt/main_cpu_best.cpp
+++ b/example/Vector/3_molecular_dynamic_gpu_opt/main_cpu_best.cpp
@@ -241,7 +241,7 @@ int main(int argc, char* argv[])
 	tsim.start();
 
 	// Get the Cell list structure
-	auto NN = vd.getVerletCrs(r_gskin);;
+	auto NN = vd.getVerletCrs(r_gskin);
 
 	// calculate forces
 	calc_forces(vd,NN,sigma12,sigma6,r_cut);
@@ -290,7 +290,7 @@ int main(int argc, char* argv[])
 			vd.map();
 			vd.template ghost_get<>();
 			// Get the Cell list structure
-			vd.updateVerlet(NN,r_gskin,VL_CRS_SYMMETRIC);
+			vd.updateVerlet(NN,r_gskin);
 		}
 		else
 		{
diff --git a/example/Vector/3_molecular_dynamic_gpu_opt/main_gpu.cu b/example/Vector/3_molecular_dynamic_gpu_opt/main_gpu.cu
index 9de319f2..8df92655 100644
--- a/example/Vector/3_molecular_dynamic_gpu_opt/main_gpu.cu
+++ b/example/Vector/3_molecular_dynamic_gpu_opt/main_gpu.cu
@@ -46,8 +46,7 @@
  * all particles processed by one SM stay in one cell or few neighborhood cell, the number of cache line that an SM has to read is
  * reduced, with a significant speed-up.
  *
- * In OpenFPM get a Cell-list produce a re-ordered version of the original vector by default. It is possible to offload the sorted
- *  version vector_dist_gpu instead of the normal one using the function \b toKernel_sorted() \b instead of the function \b toKernel \b.
+ * In OpenFPM \b getCellListGPU() \b produces a re-ordered version of the original vector when CL_GPU_REORDER option is enabled.
  *
  * \snippet Vector/3_molecular_dynamic_gpu_opt/main_gpu.cu calc_force_sorted
  *
@@ -255,22 +254,19 @@ __global__ void particle_energy(vector_dist_type vd, NN_type NN, real_number sig
 
 template<typename CellList> void calc_forces(vector_dist_gpu<3,real_number, aggregate<real_number[3],real_number[3],real_number> > & vd, CellList & NN, real_number sigma12, real_number sigma6, real_number r_cut2)
 {
-	vd.updateCellList(NN);
-
 	// Get an iterator over particles
 	auto it2 = vd.getDomainIteratorGPU();
 
 	//! \cond [calc_force_sorted] \endcond
 
-	CUDA_LAUNCH(calc_force_gpu,it2,vd.toKernel_sorted(),NN.toKernel(),sigma12,sigma6,r_cut2);
-
-	//! \cond [calc_force_sorted] \endcond
-
-	//! \cond [merge_sort] \endcond
+	// reorder positions only (no properties)
+	// as nothing else is needed to be read in calc_force_gpu
+	vd.template updateCellListGPU<>(NN);
 
-	vd.merge_sort<force>(NN);
+	CUDA_LAUNCH(calc_force_gpu,it2,vd.toKernel(),NN.toKernel(),sigma12,sigma6,r_cut2);
 
-	//! \cond [merge_sort] \endcond
+	//! \cond [calc_force_sorted] \endcond
+	vd.template restoreOrder<force>(NN);
 }
 
 template<typename CellList> real_number calc_energy(vector_dist_gpu<3,real_number, aggregate<real_number[3],real_number[3],real_number> > & vd, CellList & NN, real_number sigma12, real_number sigma6, real_number r_cut2)
@@ -278,13 +274,13 @@ template<typename CellList> real_number calc_energy(vector_dist_gpu<3,real_numbe
 	real_number rc = r_cut2;
 	real_number shift = 2.0 * ( sigma12 / (rc*rc*rc*rc*rc*rc) - sigma6 / ( rc*rc*rc) );
 
-	vd.updateCellList(NN);
+	vd.template updateCellListGPU<velocity>(NN);
 
 	auto it2 = vd.getDomainIteratorGPU();
 
-	CUDA_LAUNCH(particle_energy,it2,vd.toKernel_sorted(),NN.toKernel(),sigma12,sigma6,shift,r_cut2);
+	CUDA_LAUNCH(particle_energy,it2,vd.toKernel(),NN.toKernel(),sigma12,sigma6,shift,r_cut2);
 
-	vd.merge_sort<energy>(NN);
+	vd.template restoreOrder<energy>(NN);
 
 	// Calculated energy
 	return reduce_local<energy,_add_>(vd);
@@ -361,7 +357,7 @@ int main(int argc, char* argv[])
 	//! \cond [get_half_cl] \endcond
 
 	// Get the Cell list structure
-	auto NN = vd.getCellListGPU(r_cut / 2.0, 2);
+	auto NN = vd.getCellListGPU(r_cut / 2.0, CL_NON_SYMMETRIC | CL_GPU_REORDER, 2);
 
 	//! \cond [get_half_cl] \endcond
 
@@ -377,7 +373,6 @@ int main(int argc, char* argv[])
 	{
 		// Get the iterator
 		auto it3 = vd.getDomainIteratorGPU();
-
 		CUDA_LAUNCH(update_velocity_position,it3,vd.toKernel(),dt);
 
 		// Because we moved the particles in space we have to map them and re-sync the ghost
@@ -393,7 +388,7 @@ int main(int argc, char* argv[])
 		CUDA_LAUNCH(update_velocity,it4,vd.toKernel(),dt);
 
 		// After every iteration collect some statistic about the configuration
-		if (i % 1000 == 0)
+		if (i % 100 == 0)
 		{
 			vd.deviceToHostPos();
 			vd.deviceToHostProp<0,1,2>();
diff --git a/example/Vector/4_multiphase_celllist_verlet/main.cpp b/example/Vector/4_multiphase_celllist_verlet/main.cpp
index 2d91d3b7..c63e9b1b 100644
--- a/example/Vector/4_multiphase_celllist_verlet/main.cpp
+++ b/example/Vector/4_multiphase_celllist_verlet/main.cpp
@@ -2,7 +2,7 @@
 #include "Vector/vector_dist.hpp"
 #include "Decomposition/CartDecomposition.hpp"
 #include "data_type/aggregate.hpp"
-#include "NN/CellList/CellListM.hpp"
+#include "NN/CellList/multiphase/CellListM.hpp"
 #include "Vector/vector_dist_multiphase_functions.hpp"
 
 /*!
@@ -128,7 +128,7 @@ int main(int argc, char* argv[])
 	 *
 	 * ## Multi phase Verlet-list ##
 	 *
-	 * In general verlet-list can be constructed from the vector itself using **getVerlerList()**
+	 * In general verlet-list can be constructed from the vector itself using **getVerlet()**
 	 *
 	 * \see \ref Vector_3_md_vl
 	 *
@@ -529,7 +529,7 @@ int main(int argc, char* argv[])
 		current_phase.getProp<0>(p) = 0.0;
 
 		// Get an iterator of all the particles neighborhood of p
-		auto Np = CL_all.getNNIterator(CL_all.getCell(current_phase.getPos(p)));
+		auto Np = CL_all.getNNIteratorBox(CL_all.getCell(current_phase.getPos(p)));
 
 		// For each particle near p
 		while (Np.isNext())
diff --git a/example/Vector/4_reorder/main_comp_ord.cpp b/example/Vector/4_reorder/main_comp_ord.cpp
index 2368d460..671f153c 100644
--- a/example/Vector/4_reorder/main_comp_ord.cpp
+++ b/example/Vector/4_reorder/main_comp_ord.cpp
@@ -57,7 +57,7 @@ template<typename CellList> void calc_forces(vector_dist<3,double, aggregate<dou
 	//! \cond [NN iterator] \endcond
 
 	// Get an iterator over particles
-	auto it2 = NN.getIterator();
+	auto it2 = NN.getCellParticleIterator();
 
 	//! \cond [NN iterator] \endcond
 
@@ -76,7 +76,7 @@ template<typename CellList> void calc_forces(vector_dist<3,double, aggregate<dou
 		vd.template getProp<force>(p)[2] = 0.0;
 
 		// Get an iterator over the neighborhood particles of p
-		auto Np = NN.getNNIterator(NN.getCell(vd.getPos(p)));
+		auto Np = NN.getNNIteratorBox(NN.getCell(vd.getPos(p)));
 
 		// For each neighborhood particle ...
 		while (Np.isNext())
@@ -148,7 +148,7 @@ template<typename CellList> double calc_energy(vector_dist<3,double, aggregate<d
 	vd.updateCellList(NN);
 
 	// Get the iterator
-	auto it2 = NN.getIterator();
+	auto it2 = NN.getCellParticleIterator();
 
 	// For each particle ...
 	while (it2.isNext())
@@ -165,7 +165,7 @@ template<typename CellList> double calc_energy(vector_dist<3,double, aggregate<d
 		vd.template getProp<force>(p)[2] = 0.0;
 
 		// Get an iterator over the neighborhood of the particle p
-		auto Np = NN.getNNIterator(NN.getCell(vd.getPos(p)));
+		auto Np = NN.getNNIteratorBox(NN.getCell(vd.getPos(p)));
 
 		// For each neighborhood of the particle p
 		while (Np.isNext())
@@ -304,8 +304,9 @@ int main(int argc, char* argv[])
 	 *
 	 * ### Cell lists ###
 	 *
-	 * Instead of getting the normal cell list we get an hilbert curve cell-list. Such cell list has a
-	 * function called **getIterator** used inside the function **calc_forces** and **calc_energy**
+	 * Instead of getting the normal cell list we get an hilbert curve cell-list by passing flag **CL_HILBERT_CELL_KEYS**
+	 * to **getCellList** or by directly calling **getCellList_hilb**. Such cell list has a
+	 * function called **getCellParticleIterator** used inside the function **calc_forces** and **calc_energy**
 	 * that iterate across all the particles but in a smart-way. In practice
 	 * given an r-cut a cell-list is constructed with the provided spacing. Suppose to have a cell-list
 	 * \f$ m \times n \f$, an hilbert curve \f$ 2^k \times 2^k \f$ is contructed with \f$ k = ceil(log_2(max(m,n))) \f$.
diff --git a/example/Vector/5_molecular_dynamic_sym/main.cpp b/example/Vector/5_molecular_dynamic_sym/main.cpp
index f5d27cb6..07e2dccd 100644
--- a/example/Vector/5_molecular_dynamic_sym/main.cpp
+++ b/example/Vector/5_molecular_dynamic_sym/main.cpp
@@ -387,7 +387,7 @@ int main(int argc, char* argv[])
 			vd.map();
 			vd.template ghost_get<>();
 			// Get the Cell list structure
-			vd.updateVerlet(NN,r_gskin,VL_SYMMETRIC);
+			vd.updateVerlet(NN,r_gskin);
 		}
 		else
 		{
diff --git a/example/Vector/5_molecular_dynamic_sym_crs/main.cpp b/example/Vector/5_molecular_dynamic_sym_crs/main.cpp
index 4e5ccd33..f5dcf81a 100644
--- a/example/Vector/5_molecular_dynamic_sym_crs/main.cpp
+++ b/example/Vector/5_molecular_dynamic_sym_crs/main.cpp
@@ -471,7 +471,7 @@ int main(int argc, char* argv[])
 			vd.map();
 			vd.template ghost_get<>();
 			// Get the Cell list structure
-			vd.updateVerlet(NN,r_gskin,VL_CRS_SYMMETRIC);
+			vd.updateVerlet(NN,r_gskin);
 		}
 		else
 		{
diff --git a/example/Vector/6_complex_usage/main.cpp b/example/Vector/6_complex_usage/main.cpp
index f2e833df..b554918e 100644
--- a/example/Vector/6_complex_usage/main.cpp
+++ b/example/Vector/6_complex_usage/main.cpp
@@ -199,7 +199,7 @@ int main(int argc, char* argv[])
 
 		Point<3,float> xp = vd.getPos(p);
 
-		auto Np = NN.getNNIterator(NN.getCell(vd.getPos(p)));
+		auto Np = NN.getNNIteratorBox(NN.getCell(vd.getPos(p)));
 
 		while (Np.isNext())
 		{
@@ -265,7 +265,7 @@ int main(int argc, char* argv[])
 
 		Point<3,float> xp = vd.getPos(p);
 
-		auto Np = NN2.getNNIteratorSym(NN2.getCell(vd.getPos(p)),p.getKey(),vd.getPosVector());
+		auto Np = NN2.getNNIteratorBoxSym(NN2.getCell(vd.getPos(p)),p.getKey(),vd.getPosVector());
 
 		while (Np.isNext())
 		{
diff --git a/example/Vector/7_SPH_dlb/main.cpp b/example/Vector/7_SPH_dlb/main.cpp
index d8562d3e..33448c05 100644
--- a/example/Vector/7_SPH_dlb/main.cpp
+++ b/example/Vector/7_SPH_dlb/main.cpp
@@ -493,7 +493,7 @@ template<typename CellList> inline void calc_forces(particles & vd, CellList & N
 		{
 			// If it is a boundary particle calculate the delta rho based on equation 2
 			// This require to run across the neighborhoods particles of a
-			auto Np = NN.getNNIterator(NN.getCell(vd.getPos(a)));
+			auto Np = NN.getNNIteratorBox(NN.getCell(vd.getPos(a)));
 
 			// For each neighborhood particle
 			while (Np.isNext() == true)
@@ -548,7 +548,7 @@ template<typename CellList> inline void calc_forces(particles & vd, CellList & N
 			// If it is a fluid particle calculate based on equation 1 and 2
 
 			// Get an iterator over the neighborhood particles of p
-			auto Np = NN.getNNIterator(NN.getCell(vd.getPos(a)));
+			auto Np = NN.getNNIteratorBox(NN.getCell(vd.getPos(a)));
 
 			// For each neighborhood particle
 			while (Np.isNext() == true)
@@ -936,7 +936,7 @@ inline void sensor_pressure(Vector & vd,
 			Point<3,double> xp = probes.get(i);
 
 			// get the iterator over the neighbohood particles of the probes position
-			auto itg = NN.getNNIterator(NN.getCell(probes.get(i)));
+			auto itg = NN.getNNIteratorBox(NN.getCell(probes.get(i)));
 			while (itg.isNext())
 			{
 				auto q = itg.get();
diff --git a/example/Vector/7_SPH_dlb_gpu/main.cu b/example/Vector/7_SPH_dlb_gpu/main.cu
index be528601..c8509d77 100644
--- a/example/Vector/7_SPH_dlb_gpu/main.cu
+++ b/example/Vector/7_SPH_dlb_gpu/main.cu
@@ -210,10 +210,7 @@ inline void EqState(particles & vd)
 {
 	auto it = vd.getDomainIteratorGPU();
 
-	// You can use standard CUDA kernel launch or the macro CUDA_LAUNCH
-
-	//EqState_gpuning<<<it.wthr,it.thr>>>(vd.toKernel(),B);
-	CUDA_LAUNCH(EqState_gpu,it,vd.toKernel(),B)
+	CUDA_LAUNCH(EqState_gpu,it,vd.toKernel(),B);
 }
 
 
@@ -239,17 +236,17 @@ const real_number a2_4 = 0.25*a2;
 // Filled later
 real_number W_dap = 0.0;
 
-inline __device__ __host__ void DWab(Point<3,real_number> & dx, Point<3,real_number> & DW, real_number r, bool print)
+inline __device__ __host__ void DWab(Point<3,real_number> & dx, Point<3,real_number> & DW, real_number r)
 {
 	const real_number qq=r/H;
 
     real_number qq2 = qq * qq;
     real_number fac1 = (c1*qq + d1*qq2)/r;
-    real_number b1 = (qq < 1.0)?1.0f:0.0f;
+    real_number b1 = (qq < 1.0f)?1.0f:0.0f;
 
-    real_number wqq = (2.0 - qq);
+    real_number wqq = (2.0f - qq);
     real_number fac2 = c2 * wqq * wqq / r;
-    real_number b2 = (qq >= 1.0 && qq < 2.0)?1.0f:0.0f;
+    real_number b2 = (qq >= 1.0f && qq < 2.0f)?1.0f:0.0f;
 
     real_number factor = (b1*fac1 + b2*fac2);
 
@@ -282,8 +279,8 @@ inline __device__ __host__  real_number Tensile(real_number r, real_number rhoa,
 	//-Tensile correction.
 	real_number fab=wab*W_dap;
 	fab*=fab; fab*=fab; //fab=fab^4
-	const real_number tensilp1=(prs1/(rhoa*rhoa))*(prs1>0? 0.01: -0.2);
-	const real_number tensilp2=(prs2/(rhob*rhob))*(prs2>0? 0.01: -0.2);
+	const real_number tensilp1=(prs1/(rhoa*rhoa))*(prs1>0.0f? 0.01f: -0.2f);
+	const real_number tensilp2=(prs2/(rhob*rhob))*(prs2>0.0f? 0.01f: -0.2f);
 
 	return (fab*(tensilp1+tensilp2));
 }
@@ -304,22 +301,21 @@ inline __device__ __host__ real_number Pi(const Point<3,real_number> & dr, real_
 		return pi_visc;
     }
 	else
-		return 0.0;
+		return 0.0f;
 }
 
 template<typename particles_type, typename NN_type>
 __global__ void calc_forces_gpu(particles_type vd, NN_type NN, real_number W_dap, real_number cbar)
 {
-	// ... a
 	auto a = GET_PARTICLE(vd);
 
-	real_number max_visc = 0.0;
+	real_number max_visc = 0.0f;
 
 	// Get the position xp of the particle
 	Point<3,real_number> xa = vd.getPos(a);
 
-	// Take the mass of the particle dependently if it is FLUID or BOUNDARY
-	real_number massa = (vd.template getProp<type>(a) == FLUID)?MassFluid:MassBound;
+	// Type of the particle
+	unsigned int typea = vd.template getProp<type>(a);
 
 	// Get the density of the of the particle a
 	real_number rhoa = vd.template getProp<rho>(a);
@@ -331,122 +327,73 @@ __global__ void calc_forces_gpu(particles_type vd, NN_type NN, real_number W_dap
 	Point<3,real_number> va = vd.template getProp<velocity>(a);
 
 	// Reset the force counter (- gravity on zeta direction)
-	vd.template getProp<force>(a)[0] = 0.0;
-	vd.template getProp<force>(a)[1] = 0.0;
-	vd.template getProp<force>(a)[2] = -gravity;
-	vd.template getProp<drho>(a) = 0.0;
+	Point<3,real_number> force_;
+	force_.get(0) = 0.0f;
+	force_.get(1) = 0.0f;
+	force_.get(2) = -gravity;
+	real_number drho_ = 0.0f;
 
-	// We threat FLUID particle differently from BOUNDARY PARTICLES ...
-	if (vd.template getProp<type>(a) != FLUID)
-	{
+	// Get an iterator over the neighborhood particles of p
+	auto Np = NN.getNNIteratorBox(NN.getCell(xa));
 
-		// If it is a boundary particle calculate the delta rho based on equation 2
-		// This require to run across the neighborhoods particles of a
-		auto Np = NN.getNNIterator(NN.getCell(vd.getPos(a)));
+	// For each neighborhood particle
+	while (Np.isNext() == true)
+	{
+		// ... q
+		auto b = Np.get();
 
-		// For each neighborhood particle
-		while (Np.isNext() == true)
-		{
-			// ... q
-			auto b = Np.get();
+		// Get the position xp of the particle
+		Point<3,real_number> xb = vd.getPos(b);
 
-			// Get the position xp of the particle
-			Point<3,real_number> xb = vd.getPos(b);
+		if (a == b)	{++Np; continue;};
 
-			// if (p == q) skip this particle
-			if (a == b)	{++Np; continue;};
+		unsigned int typeb = vd.template getProp<type>(b);
 
-			// get the mass of the particle
-			real_number massb = (vd.template getProp<type>(b) == FLUID)?MassFluid:MassBound;
+		real_number massb = (typeb == FLUID)?MassFluid:MassBound;
+		Point<3,real_number> vb = vd.template getProp<velocity>(b);
+		real_number Pb = vd.template getProp<Pressure>(b);
+		real_number rhob = vd.template getProp<rho>(b);
 
-			// Get the velocity of the particle b
-			Point<3,real_number> vb = vd.template getProp<velocity>(b);
+		// Get the distance between p and q
+		Point<3,real_number> dr = xa - xb;
+		// take the norm of this vector
+		real_number r2 = norm2(dr);
 
-			// Get the pressure and density of particle b
-			real_number Pb = vd.template getProp<Pressure>(b);
-			real_number rhob = vd.template getProp<rho>(b);
+		// if they interact
+		if (r2 < 4.0*H*H && r2 >= 1e-16)
+		{
+			real_number r = sqrt(r2);
 
-			// Get the distance between p and q
-			Point<3,real_number> dr = xa - xb;
-			// take the norm of this vector
-			real_number r2 = norm2(dr);
+			Point<3,real_number> v_rel = va - vb;
 
-			// If the particles interact ...
-			if (r2 < 4.0*H*H)
-			{
-				// ... calculate delta rho
-				real_number r = sqrt(r2);
+			Point<3,real_number> DW;
+			DWab(dr,DW,r);
 
-				Point<3,real_number> dv = va - vb;
+			real_number factor = - massb*((Pa + Pb) / (rhoa * rhob) + Tensile(r,rhoa,rhob,Pa,Pb,W_dap) + Pi(dr,r2,v_rel,rhoa,rhob,massb,cbar,max_visc));
 
-				Point<3,real_number> DW;
-				DWab(dr,DW,r,false);
+			// Bound - Bound does not produce any change
+			// factor = (typea == BOUNDARY && typeb == BOUNDARY)?0.0f:factor;
+			factor = (typea != FLUID)?0.0f:factor;
 
-				const real_number dot = dr.get(0)*dv.get(0) + dr.get(1)*dv.get(1) + dr.get(2)*dv.get(2);
-				const real_number dot_rr2 = dot/(r2+Eta2);
-				max_visc = (dot_rr2 < max_visc)?max_visc:dot_rr2;
+			force_.get(0) += factor * DW.get(0);
+			force_.get(1) += factor * DW.get(1);
+			force_.get(2) += factor * DW.get(2);
 
-				vd.template getProp<drho>(a) += massb*(dv.get(0)*DW.get(0)+dv.get(1)*DW.get(1)+dv.get(2)*DW.get(2));
-			}
+			real_number scal = massb*(v_rel.get(0)*DW.get(0)+v_rel.get(1)*DW.get(1)+v_rel.get(2)*DW.get(2));
+			scal = (typea == BOUNDARY && typeb == BOUNDARY)?0.0f:scal;
 
-			++Np;
+			drho_ += scal;
 		}
 
-		vd.template getProp<red>(a) = max_visc;
+		++Np;
 	}
-	else
-	{
-		// If it is a fluid particle calculate based on equation 1 and 2
-
-		// Get an iterator over the neighborhood particles of p
-		auto Np = NN.getNNIterator(NN.getCell(vd.getPos(a)));
-
-		// For each neighborhood particle
-		while (Np.isNext() == true)
-		{
-			// ... q
-			auto b = Np.get();
-
-			// Get the position xp of the particle
-			Point<3,real_number> xb = vd.getPos(b);
-
-			// if (p == q) skip this particle
-			if (a == b)	{++Np; continue;};
-
-			real_number massb = (vd.template getProp<type>(b) == FLUID)?MassFluid:MassBound;
-			Point<3,real_number> vb = vd.template getProp<velocity>(b);
-			real_number Pb = vd.template getProp<Pressure>(b);
-			real_number rhob = vd.template getProp<rho>(b);
 
-			// Get the distance between p and q
-			Point<3,real_number> dr = xa - xb;
-			// take the norm of this vector
-			real_number r2 = norm2(dr);
+	vd.template getProp<red>(a) = max_visc;
 
-			// if they interact
-			if (r2 < 4.0*H*H)
-			{
-				real_number r = sqrt(r2);
-
-				Point<3,real_number> v_rel = va - vb;
-
-				Point<3,real_number> DW;
-				DWab(dr,DW,r,false);
-
-				real_number factor = - massb*((vd.template getProp<Pressure>(a) + vd.template getProp<Pressure>(b)) / (rhoa * rhob) + Tensile(r,rhoa,rhob,Pa,Pb,W_dap) + Pi(dr,r2,v_rel,rhoa,rhob,massb,cbar,max_visc));
-
-				vd.template getProp<force>(a)[0] += factor * DW.get(0);
-				vd.template getProp<force>(a)[1] += factor * DW.get(1);
-				vd.template getProp<force>(a)[2] += factor * DW.get(2);
-
-				vd.template getProp<drho>(a) += massb*(v_rel.get(0)*DW.get(0)+v_rel.get(1)*DW.get(1)+v_rel.get(2)*DW.get(2));
-			}
-
-			++Np;
-		}
-
-		vd.template getProp<red>(a) = max_visc;
-	}
+	vd.template getProp<force>(a)[0] = force_.get(0);
+	vd.template getProp<force>(a)[1] = force_.get(1);
+	vd.template getProp<force>(a)[2] = force_.get(2);
+	vd.template getProp<drho>(a) = drho_;
 }
 
 template<typename CellList> inline void calc_forces(particles & vd, CellList & NN, real_number & max_visc, size_t cnt)
@@ -454,10 +401,9 @@ template<typename CellList> inline void calc_forces(particles & vd, CellList & N
 	auto part = vd.getDomainIteratorGPU(32);
 
 	// Update the cell-list
-	vd.updateCellList(NN);
+	vd.updateCellListGPU(NN);
 
-	//calc_forces_gpu<<<part.wthr,part.thr>>>(vd.toKernel(),NN.toKernel(),W_dap,cbar);
-	CUDA_LAUNCH(calc_forces_gpu,part,vd.toKernel(),NN.toKernel(),W_dap,cbar)
+	CUDA_LAUNCH(calc_forces_gpu,part,vd.toKernel(),NN.toKernel(),W_dap,cbar);
 
 	max_visc = reduce_local<red,_max_>(vd);
 }
@@ -479,7 +425,6 @@ void max_acceleration_and_velocity(particles & vd, real_number & max_acc, real_n
 	// Calculate the maximum acceleration
 	auto part = vd.getDomainIteratorGPU();
 
-	// max_acceleration_and_velocity_gpu<<<part.wthr,part.thr>>>(vd.toKernel());
 	CUDA_LAUNCH(max_acceleration_and_velocity_gpu,part,vd.toKernel());
 
 	max_acc = reduce_local<red,_max_>(vd);
@@ -499,7 +444,7 @@ real_number calc_deltaT(particles & vd, real_number ViscDtMax)
 	max_acceleration_and_velocity(vd,Maxacc,Maxvel);
 
 	//-dt1 depends on force per unit mass.
-	const real_number dt_f = (Maxacc)?sqrt(H/Maxacc):std::numeric_limits<int>::max();
+	const real_number dt_f = (Maxacc)?sqrt(H/Maxacc):std::numeric_limits<float>::max();
 
 	//-dt2 combines the Courant and the viscous time-step controls.
 	const real_number dt_cv = H/(std::max(cbar,Maxvel*10.f) + H*ViscDtMax);
@@ -538,7 +483,7 @@ __global__ void verlet_int_gpu(vector_dist_type vd, real_number dt, real_number
 		return;
 	}
 
-	//-Calculate displacement and update position / Calcula desplazamiento y actualiza posicion.
+	//-Calculate displacement and update position
 	real_number dx = vd.template getProp<velocity>(a)[0]*dt + vd.template getProp<force>(a)[0]*dt205;
     real_number dy = vd.template getProp<velocity>(a)[1]*dt + vd.template getProp<force>(a)[1]*dt205;
     real_number dz = vd.template getProp<velocity>(a)[2]*dt + vd.template getProp<force>(a)[2]*dt205;
@@ -558,17 +503,14 @@ __global__ void verlet_int_gpu(vector_dist_type vd, real_number dt, real_number
 	vd.template getProp<velocity>(a)[2] = vd.template getProp<velocity_prev>(a)[2] + vd.template getProp<force>(a)[2]*dt2;
 	vd.template getProp<rho>(a) = vd.template getProp<rho_prev>(a) + dt2*vd.template getProp<drho>(a);
 
-	//! \cond [mark_to_remove_kernel] \endcond
-
     // Check if the particle go out of range in space and in density, if they do mark them to remove it later
-    if (vd.getPos(a)[0] <  0.000263878 || vd.getPos(a)[1] < 0.000263878 || vd.getPos(a)[2] < 0.000263878 ||
-        vd.getPos(a)[0] >  0.000263878+1.59947 || vd.getPos(a)[1] > 0.000263878+0.672972 || vd.getPos(a)[2] > 0.50 ||
+    if (vd.getPos(a)[0] <  0.0 || vd.getPos(a)[1] < 0.0 || vd.getPos(a)[2] < 0.0 ||
+        vd.getPos(a)[0] >  1.61 || vd.getPos(a)[1] > 0.68 || vd.getPos(a)[2] > 0.50 ||
 		vd.template getProp<rho>(a) < RhoMin || vd.template getProp<rho>(a) > RhoMax)
     {vd.template getProp<red>(a) = 1;}
     else
     {vd.template getProp<red>(a) = 0;}
 
-    //! \cond [mark_to_remove_kernel] \endcond
 
     vd.template getProp<velocity_prev>(a)[0] = velX;
     vd.template getProp<velocity_prev>(a)[1] = velY;
@@ -586,16 +528,11 @@ void verlet_int(particles & vd, real_number dt)
 	real_number dt205 = dt*dt*0.5;
 	real_number dt2 = dt*2.0;
 
-	// verlet_int_gpu<<<part.wthr,part.thr>>>(vd.toKernel(),dt,dt2,dt205);
 	CUDA_LAUNCH(verlet_int_gpu,part,vd.toKernel(),dt,dt2,dt205);
 
-	//! \cond [remove_marked_part] \endcond
-
 	// remove the particles marked
 	remove_marked<red>(vd);
 
-	//! \cond [remove_marked_part] \endcond
-
 	// increment the iteration counter
 	cnt++;
 }
@@ -626,7 +563,7 @@ __global__ void euler_int_gpu(vector_type vd,real_number dt, real_number dt205)
 		return;
 	}
 
-	//-Calculate displacement and update position / Calcula desplazamiento y actualiza posicion.
+	//-Calculate displacement and update position
 	real_number dx = vd.template getProp<velocity>(a)[0]*dt + vd.template getProp<force>(a)[0]*dt205;
     real_number dy = vd.template getProp<velocity>(a)[1]*dt + vd.template getProp<force>(a)[1]*dt205;
     real_number dz = vd.template getProp<velocity>(a)[2]*dt + vd.template getProp<force>(a)[2]*dt205;
@@ -646,8 +583,8 @@ __global__ void euler_int_gpu(vector_type vd,real_number dt, real_number dt205)
    	vd.template getProp<rho>(a) = vd.template getProp<rho>(a) + dt*vd.template getProp<drho>(a);
 
     // Check if the particle go out of range in space and in density
-    if (vd.getPos(a)[0] <  0.000263878 || vd.getPos(a)[1] < 0.000263878 || vd.getPos(a)[2] < 0.000263878 ||
-        vd.getPos(a)[0] >  0.000263878+1.59947 || vd.getPos(a)[1] > 0.000263878+0.672972 || vd.getPos(a)[2] > 0.50 ||
+    if (vd.getPos(a)[0] <  0.0 || vd.getPos(a)[1] < 0.0 || vd.getPos(a)[2] < 0.0 ||
+        vd.getPos(a)[0] >  1.61 || vd.getPos(a)[1] > 0.68 || vd.getPos(a)[2] > 0.50 ||
 		vd.template getProp<rho>(a) < RhoMin || vd.template getProp<rho>(a) > RhoMax)
     {vd.template getProp<red>(a) = 1;}
     else
@@ -685,7 +622,7 @@ __global__ void sensor_pressure_gpu(vector_type vd, NN_type NN, Point<3,real_num
 	Point<3,real_number> xp = probe;
 
 	// get the iterator over the neighbohood particles of the probes position
-	auto itg = NN.getNNIterator(NN.getCell(xp));
+	auto itg = NN.getNNIteratorBox(NN.getCell(xp));
 	while (itg.isNext())
 	{
 		auto q = itg.get();
@@ -744,8 +681,10 @@ inline void sensor_pressure(Vector & vd,
         // if the probe is inside the processor domain
 		if (vd.getDecomposition().isLocal(probes.get(i)) == true)
 		{
-			// sensor_pressure_gpu<<<1,1>>>(vd.toKernel(),NN.toKernel(),probes.get(i),(real_number *)press_tmp_.toKernel());
-			CUDA_LAUNCH_DIM3(sensor_pressure_gpu,1,1,vd.toKernel(),NN.toKernel(),probes.get(i),(real_number *)press_tmp_.toKernel());
+			vd.updateCellListGPU(NN);
+
+			Point<3,real_number> probe = probes.get(i);
+			CUDA_LAUNCH_DIM3(sensor_pressure_gpu,1,1,vd.toKernel(),NN.toKernel(),probe,(real_number *)press_tmp_.toKernel());
 
 			// move calculated pressure on
 			press_tmp_.deviceToHost();
@@ -772,11 +711,11 @@ int main(int argc, char* argv[])
 	openfpm::vector<openfpm::vector<real_number>> press_t;
 	openfpm::vector<Point<3,real_number>> probes;
 
-	probes.add({0.8779,0.3,0.02});
-	probes.add({0.754,0.31,0.02});
+	probes.add({0.8779f,0.3f,0.02f});
+	probes.add({0.754f,0.31f,0.02f});
 
 	// Here we define our domain a 2D box with internals from 0 to 1.0 for x and y
-	Box<3,real_number> domain({-0.05,-0.05,-0.05},{1.7010,0.7065,0.5025});
+	Box<3,real_number> domain({-0.05f,-0.05f,-0.05f},{1.7010f,0.7065f,0.511f});
 	size_t sz[3] = {207,90,66};
 
 	// Fill W_dap
@@ -919,7 +858,7 @@ int main(int argc, char* argv[])
 
 	vd.ghost_get<type,rho,Pressure,velocity>(RUN_ON_DEVICE);
 
-	auto NN = vd.getCellListGPU(2*H, 2);
+	auto NN = vd.getCellListGPU(2*H, CL_NON_SYMMETRIC, 2);
 
 	timer tot_sim;
 	tot_sim.start();
@@ -932,6 +871,7 @@ int main(int argc, char* argv[])
 	{
 		Vcluster<> & v_cl = create_vcluster();
 		timer it_time;
+		it_time.start();
 
 		////// Do rebalancing every 200 timesteps
 		it_reb++;
@@ -939,6 +879,10 @@ int main(int argc, char* argv[])
 		{
 			vd.map(RUN_ON_DEVICE);
 
+			// Rebalancer for now work on CPU , so move to CPU
+			vd.deviceToHostPos();
+			vd.template deviceToHostProp<type>();
+
 			it_reb = 0;
 			ModelCustom md;
 			vd.addComputationCosts(md);
@@ -985,7 +929,6 @@ int main(int argc, char* argv[])
 			// and ghost are updated
 			vd.map(RUN_ON_DEVICE);
 			vd.ghost_get<type,rho,Pressure,velocity>(RUN_ON_DEVICE);
-			vd.updateCellList(NN);
 
 			// calculate the pressure at the sensor points
 			//sensor_pressure(vd,NN,press_t,probes);
diff --git a/example/Vector/7_SPH_dlb_gpu_more_opt/main.cu b/example/Vector/7_SPH_dlb_gpu_more_opt/main.cu
index 08e0f0d2..e81c53e8 100644
--- a/example/Vector/7_SPH_dlb_gpu_more_opt/main.cu
+++ b/example/Vector/7_SPH_dlb_gpu_more_opt/main.cu
@@ -38,8 +38,8 @@
  *
  * the function get_indexes_by_type has three arguments the first is the vector of the properties of the particles. In
  * this case because we use the sorted particles to calculate forces, so we have to get the indexes for the sorted
- * particles with vd.getPropVectorSort(). In case we want to use the non sorted we use vd.getPropVector(). The second
- * argument is the output containing the indexes of the particles types we want to get. Because the vector can contain
+ * particles with vd.getPropVector() when enabled with flag CL_GPU_REORDER. The second argument is the output containing
+ * the indexes of the particles types we want to get. Because the vector can contain
  * ghost particles and real particles setting with the third argument we indicate we want only real particles and no ghost particles
  * The last argument is the GPU context handle
  *
@@ -53,12 +53,8 @@
 #define PRINT_STACKTRACE
 #define STOP_ON_ERROR
 #define OPENMPI
-#define SCAN_WITH_CUB
-#define SORT_WITH_CUB
 //#define SE_CLASS1
 
-//#define USE_LOW_REGISTER_ITERATOR
-
 #include "Vector/vector_dist.hpp"
 #include <math.h>
 #include "Draw/DrawParticles.hpp"
@@ -74,7 +70,7 @@ typedef float real_number;
 #define FLUID 1
 
 // initial spacing between particles dp in the formulas
-const real_number dp = 0.00425;
+const real_number dp = 0.0085;
 // Maximum height of the fluid water
 // is going to be calculated and filled later on
 real_number h_swl = 0.0;
@@ -86,7 +82,7 @@ const real_number coeff_sound = 20.0;
 const real_number gamma_ = 7.0;
 
 // sqrt(3.0*dp*dp) support of the kernel
-const real_number H = 0.00736121593217;
+const real_number H = 0.0147224318643;
 
 // Eta in the formulas
 const real_number Eta2 = 0.01 * H*H;
@@ -100,10 +96,10 @@ const real_number visco = 0.1;
 real_number cbar = 0.0;
 
 // Mass of the fluid particles
-const real_number MassFluid = 0.0000767656;
+const real_number MassFluid = 0.000614125;
 
 // Mass of the boundary particles
-const real_number MassBound = 0.0000767656;
+const real_number MassBound = 0.000614125;
 
 //
 
@@ -178,10 +174,11 @@ typedef vector_dist_gpu<3,real_number,aggregate<unsigned int,real_number,  real_
 
 struct ModelCustom
 {
-	template<typename Decomposition, typename vector> inline void addComputation(Decomposition & dec,
-			                                                                     vector & vd,
-																				 size_t v,
-																				 size_t p)
+	template<typename Decomposition, typename vector> inline void addComputation(
+		Decomposition & dec,
+		vector & vd,
+		size_t v,
+		size_t p)
 	{
 		if (vd.template getProp<type>(p) == FLUID)
 			dec.addComputationCost(v,4);
@@ -245,19 +242,19 @@ inline __device__ __host__ void DWab(Point<3,real_number> & dx, Point<3,real_num
 {
 	const real_number qq=r/H;
 
-    real_number qq2 = qq * qq;
-    real_number fac1 = (c1*qq + d1*qq2)/r;
-    real_number b1 = (qq < 1.0f)?1.0f:0.0f;
+	real_number qq2 = qq * qq;
+	real_number fac1 = (c1*qq + d1*qq2)/r;
+	real_number b1 = (qq < 1.0f)?1.0f:0.0f;
 
-    real_number wqq = (2.0f - qq);
-    real_number fac2 = c2 * wqq * wqq / r;
-    real_number b2 = (qq >= 1.0f && qq < 2.0f)?1.0f:0.0f;
+	real_number wqq = (2.0f - qq);
+	real_number fac2 = c2 * wqq * wqq / r;
+	real_number b2 = (qq >= 1.0f && qq < 2.0f)?1.0f:0.0f;
 
-    real_number factor = (b1*fac1 + b2*fac2);
+	real_number factor = (b1*fac1 + b2*fac2);
 
-    DW.get(0) = factor * dx.get(0);
-    DW.get(1) = factor * dx.get(1);
-    DW.get(2) = factor * dx.get(2);
+	DW.get(0) = factor * dx.get(0);
+	DW.get(1) = factor * dx.get(1);
+	DW.get(2) = factor * dx.get(2);
 }
 
 // Tensile correction
@@ -275,10 +272,10 @@ inline __device__ __host__  real_number Tensile(real_number r, real_number rhoa,
 	}
 	else
 	{
-	    real_number wqq2=qq*qq;
-	    real_number wqq3=wqq2*qq;
+		real_number wqq2=qq*qq;
+		real_number wqq3=wqq2*qq;
 
-	    wab=a2*(1.0f-1.5f*wqq2+0.75f*wqq3);
+		wab=a2*(1.0f-1.5f*wqq2+0.75f*wqq3);
 	}
 
 	//-Tensile correction.
@@ -304,7 +301,7 @@ inline __device__ __host__ real_number Pi(const Point<3,real_number> & dr, real_
 		const float pi_visc=(-visco*cbar*amubar/robar);
 
 		return pi_visc;
-    }
+	}
 	else
 		return 0.0f;
 }
@@ -355,16 +352,15 @@ __global__ void calc_forces_fluid_gpu(particles_type vd, fluid_ids_type fids, NN
 		// if (p == q) skip this particle this condition should be done in the r^2 = 0
 		//if (a == b)	{++Np; continue;};
 
-        unsigned int typeb = vd.template getProp<type>(b);
+		unsigned int typeb = vd.template getProp<type>(b);
 
-        real_number massb = (typeb == FLUID)?MassFluid:MassBound;
-        Point<3,real_number> vb = vd.template getProp<velocity>(b);
-        real_number Pb = vd.template getProp<Pressure>(b);
-        real_number rhob = vd.template getProp<rho>(b);
+		real_number massb = (typeb == FLUID)?MassFluid:MassBound;
+		Point<3,real_number> vb = vd.template getProp<velocity>(b);
+		real_number Pb = vd.template getProp<Pressure>(b);
+		real_number rhob = vd.template getProp<rho>(b);
 
 		// Get the distance between p and q
 		Point<3,real_number> dr = xa - xb;
-		Point<3,real_number> v_rel = va - vb;
 		// take the norm of this vector
 		real_number r2 = norm2(dr);
 
@@ -373,6 +369,8 @@ __global__ void calc_forces_fluid_gpu(particles_type vd, fluid_ids_type fids, NN
 		{
 			real_number r = sqrtf(r2);
 
+			Point<3,real_number> v_rel = va - vb;
+
 			Point<3,real_number> DW;
 			DWab(dr,DW,r);
 
@@ -438,10 +436,10 @@ __global__ void calc_forces_border_gpu(particles_type vd, fluid_ids_type fbord,
 		// if (p == q) skip this particle this condition should be done in the r^2 = 0
 		//if (a == b)	{++Np; continue;};
 
-        unsigned int typeb = vd.template getProp<type>(b);
+		unsigned int typeb = vd.template getProp<type>(b);
 
-        real_number massb = (typeb == FLUID)?MassFluid:MassBound;
-        Point<3,real_number> vb = vd.template getProp<velocity>(b);
+		real_number massb = (typeb == FLUID)?MassFluid:MassBound;
+		Point<3,real_number> vb = vd.template getProp<velocity>(b);
 
 		// Get the distance between p and q
 		Point<3,real_number> dr = xa - xb;
@@ -481,34 +479,34 @@ struct type_is_fluid
 
 struct type_is_border
 {
-        __device__ static bool check(int c)
-        {
-                return c == BOUNDARY;
-        }
+		__device__ static bool check(int c)
+		{
+				return c == BOUNDARY;
+		}
 };
 
 template<typename CellList> inline void calc_forces(particles & vd, CellList & NN, real_number & max_visc, size_t cnt, openfpm::vector_gpu<aggregate<int>> & fluid_ids, openfpm::vector_gpu<aggregate<int>> & border_ids)
 {
 	// Update the cell-list
-	vd.updateCellList<type,rho,Pressure,velocity>(NN);
+	vd.template updateCellListGPU<type,rho,Pressure,velocity>(NN);
 
 	//! \cond [get indexes by type] \endcond
 
 	// get the particles fluid ids
-	get_indexes_by_type<type,type_is_fluid>(vd.getPropVectorSort(),fluid_ids,vd.size_local(),vd.getVC().getGpuContext());
+	get_indexes_by_type<type,type_is_fluid>(vd.getPropVector(),fluid_ids,vd.size_local(),vd.getVC().getGpuContext());
 
 	// get the particles fluid ids
-	get_indexes_by_type<type,type_is_border>(vd.getPropVectorSort(),border_ids,vd.size_local(),vd.getVC().getGpuContext());
+	get_indexes_by_type<type,type_is_border>(vd.getPropVector(),border_ids,vd.size_local(),vd.getVC().getGpuContext());
 
 	auto part = fluid_ids.getGPUIterator(96);
-	CUDA_LAUNCH(calc_forces_fluid_gpu,part,vd.toKernel_sorted(),fluid_ids.toKernel(),NN.toKernel(),W_dap,cbar);
+	CUDA_LAUNCH(calc_forces_fluid_gpu,part,vd.toKernel(),fluid_ids.toKernel(),NN.toKernel(),W_dap,cbar);
 
 	part = border_ids.getGPUIterator(96);
-	CUDA_LAUNCH(calc_forces_border_gpu,part,vd.toKernel_sorted(),border_ids.toKernel(),NN.toKernel(),W_dap,cbar);
+	CUDA_LAUNCH(calc_forces_border_gpu,part,vd.toKernel(),border_ids.toKernel(),NN.toKernel(),W_dap,cbar);
 
 	//! \cond [get indexes by type] \endcond
 
-	vd.merge_sort<force,drho,red>(NN);
+	vd.template restoreOrder<drho,force,red>(NN);
 
 	max_visc = reduce_local<red,_max_>(vd);
 }
@@ -575,56 +573,57 @@ __global__ void verlet_int_gpu(vector_dist_type vd, real_number dt, real_number
 		real_number rhop = vd.template getProp<rho>(a);
 
 		// Update only the density
-    	vd.template getProp<velocity>(a)[0] = 0.0;
-    	vd.template getProp<velocity>(a)[1] = 0.0;
-    	vd.template getProp<velocity>(a)[2] = 0.0;
-    	real_number rhonew = vd.template getProp<rho_prev>(a) + dt2*vd.template getProp<drho>(a);
-    	vd.template getProp<rho>(a) = (rhonew < rho_zero)?rho_zero:rhonew;
+		vd.template getProp<velocity>(a)[0] = 0.0;
+		vd.template getProp<velocity>(a)[1] = 0.0;
+		vd.template getProp<velocity>(a)[2] = 0.0;
+		real_number rhonew = vd.template getProp<rho_prev>(a) + dt2*vd.template getProp<drho>(a);
+		vd.template getProp<rho>(a) = (rhonew < rho_zero)?rho_zero:rhonew;
 
-	    vd.template getProp<rho_prev>(a) = rhop;
+		vd.template getProp<rho_prev>(a) = rhop;
 
-	    vd.template getProp<red>(a) = 0;
+		vd.template getProp<red>(a) = 0;
 
 		return;
 	}
 
 	//-Calculate displacement and update position
 	real_number dx = vd.template getProp<velocity>(a)[0]*dt + vd.template getProp<force>(a)[0]*dt205;
-    real_number dy = vd.template getProp<velocity>(a)[1]*dt + vd.template getProp<force>(a)[1]*dt205;
-    real_number dz = vd.template getProp<velocity>(a)[2]*dt + vd.template getProp<force>(a)[2]*dt205;
+	real_number dy = vd.template getProp<velocity>(a)[1]*dt + vd.template getProp<force>(a)[1]*dt205;
+	real_number dz = vd.template getProp<velocity>(a)[2]*dt + vd.template getProp<force>(a)[2]*dt205;
 
-    vd.getPos(a)[0] += dx;
-    vd.getPos(a)[1] += dy;
-    vd.getPos(a)[2] += dz;
+	vd.getPos(a)[0] += dx;
+	vd.getPos(a)[1] += dy;
+	vd.getPos(a)[2] += dz;
 
-    real_number velX = vd.template getProp<velocity>(a)[0];
-    real_number velY = vd.template getProp<velocity>(a)[1];
-    real_number velZ = vd.template getProp<velocity>(a)[2];
+	real_number velX = vd.template getProp<velocity>(a)[0];
+	real_number velY = vd.template getProp<velocity>(a)[1];
+	real_number velZ = vd.template getProp<velocity>(a)[2];
 
-    real_number rhop = vd.template getProp<rho>(a);
+	real_number rhop = vd.template getProp<rho>(a);
 
 	vd.template getProp<velocity>(a)[0] = vd.template getProp<velocity_prev>(a)[0] + vd.template getProp<force>(a)[0]*dt2;
 	vd.template getProp<velocity>(a)[1] = vd.template getProp<velocity_prev>(a)[1] + vd.template getProp<force>(a)[1]*dt2;
 	vd.template getProp<velocity>(a)[2] = vd.template getProp<velocity_prev>(a)[2] + vd.template getProp<force>(a)[2]*dt2;
 	vd.template getProp<rho>(a) = vd.template getProp<rho_prev>(a) + dt2*vd.template getProp<drho>(a);
 
-    // Check if the particle go out of range in space and in density
-    if (vd.getPos(a)[0] <  0.0 || vd.getPos(a)[1] < 0.0 || vd.getPos(a)[2] < 0.0 ||
-        vd.getPos(a)[0] >  1.61 || vd.getPos(a)[1] > 0.68 || vd.getPos(a)[2] > 0.50 ||
+	// Check if the particle go out of range in space and in density
+	if (vd.getPos(a)[0] <  0.0 || vd.getPos(a)[1] < 0.0 || vd.getPos(a)[2] < 0.0 ||
+		vd.getPos(a)[0] >  1.61 || vd.getPos(a)[1] > 0.68 || vd.getPos(a)[2] > 0.50 ||
 		vd.template getProp<rho>(a) < RhoMin || vd.template getProp<rho>(a) > RhoMax)
-    {
-    	vd.template getProp<red>(a) = 1;
-    }
-    else
-    {
-    	vd.template getProp<red>(a) = 0;
-    }
-
-
-    vd.template getProp<velocity_prev>(a)[0] = velX;
-    vd.template getProp<velocity_prev>(a)[1] = velY;
-    vd.template getProp<velocity_prev>(a)[2] = velZ;
-    vd.template getProp<rho_prev>(a) = rhop;
+	{
+		vd.template getProp<red>(a) = 1;
+	}
+
+	else
+	{
+		vd.template getProp<red>(a) = 0;
+	}
+
+
+	vd.template getProp<velocity_prev>(a)[0] = velX;
+	vd.template getProp<velocity_prev>(a)[1] = velY;
+	vd.template getProp<velocity_prev>(a)[2] = velZ;
+	vd.template getProp<rho_prev>(a) = rhop;
 }
 
 size_t cnt = 0;
@@ -659,50 +658,50 @@ __global__ void euler_int_gpu(vector_type vd,real_number dt, real_number dt205)
 		real_number rhop = vd.template getProp<rho>(a);
 
 		// Update only the density
-    	vd.template getProp<velocity>(a)[0] = 0.0;
-    	vd.template getProp<velocity>(a)[1] = 0.0;
-    	vd.template getProp<velocity>(a)[2] = 0.0;
-    	real_number rhonew = vd.template getProp<rho>(a) + dt*vd.template getProp<drho>(a);
-    	vd.template getProp<rho>(a) = (rhonew < rho_zero)?rho_zero:rhonew;
+		vd.template getProp<velocity>(a)[0] = 0.0;
+		vd.template getProp<velocity>(a)[1] = 0.0;
+		vd.template getProp<velocity>(a)[2] = 0.0;
+		real_number rhonew = vd.template getProp<rho>(a) + dt*vd.template getProp<drho>(a);
+		vd.template getProp<rho>(a) = (rhonew < rho_zero)?rho_zero:rhonew;
 
-	    vd.template getProp<rho_prev>(a) = rhop;
+		vd.template getProp<rho_prev>(a) = rhop;
 
-	    vd.template getProp<red>(a) = 0;
+		vd.template getProp<red>(a) = 0;
 
 		return;
 	}
 
 	//-Calculate displacement and update position / Calcula desplazamiento y actualiza posicion.
 	real_number dx = vd.template getProp<velocity>(a)[0]*dt + vd.template getProp<force>(a)[0]*dt205;
-    real_number dy = vd.template getProp<velocity>(a)[1]*dt + vd.template getProp<force>(a)[1]*dt205;
-    real_number dz = vd.template getProp<velocity>(a)[2]*dt + vd.template getProp<force>(a)[2]*dt205;
+	real_number dy = vd.template getProp<velocity>(a)[1]*dt + vd.template getProp<force>(a)[1]*dt205;
+	real_number dz = vd.template getProp<velocity>(a)[2]*dt + vd.template getProp<force>(a)[2]*dt205;
 
-    vd.getPos(a)[0] += dx;
-    vd.getPos(a)[1] += dy;
-    vd.getPos(a)[2] += dz;
+	vd.getPos(a)[0] += dx;
+	vd.getPos(a)[1] += dy;
+	vd.getPos(a)[2] += dz;
 
-    real_number velX = vd.template getProp<velocity>(a)[0];
-    real_number velY = vd.template getProp<velocity>(a)[1];
-    real_number velZ = vd.template getProp<velocity>(a)[2];
-    real_number rhop = vd.template getProp<rho>(a);
+	real_number velX = vd.template getProp<velocity>(a)[0];
+	real_number velY = vd.template getProp<velocity>(a)[1];
+	real_number velZ = vd.template getProp<velocity>(a)[2];
+	real_number rhop = vd.template getProp<rho>(a);
 
 	vd.template getProp<velocity>(a)[0] = vd.template getProp<velocity>(a)[0] + vd.template getProp<force>(a)[0]*dt;
 	vd.template getProp<velocity>(a)[1] = vd.template getProp<velocity>(a)[1] + vd.template getProp<force>(a)[1]*dt;
-   	vd.template getProp<velocity>(a)[2] = vd.template getProp<velocity>(a)[2] + vd.template getProp<force>(a)[2]*dt;
-   	vd.template getProp<rho>(a) = vd.template getProp<rho>(a) + dt*vd.template getProp<drho>(a);
+	vd.template getProp<velocity>(a)[2] = vd.template getProp<velocity>(a)[2] + vd.template getProp<force>(a)[2]*dt;
+	vd.template getProp<rho>(a) = vd.template getProp<rho>(a) + dt*vd.template getProp<drho>(a);
 
-    // Check if the particle go out of range in space and in density
-    if (vd.getPos(a)[0] <  0.0 || vd.getPos(a)[1] < 0.0 || vd.getPos(a)[2] < 0.0 ||
-        vd.getPos(a)[0] >  1.61 || vd.getPos(a)[1] > 0.68 || vd.getPos(a)[2] > 0.50 ||
+	// Check if the particle go out of range in space and in density
+	if (vd.getPos(a)[0] <  0.0 || vd.getPos(a)[1] < 0.0 || vd.getPos(a)[2] < 0.0 ||
+		vd.getPos(a)[0] >  1.61 || vd.getPos(a)[1] > 0.68 || vd.getPos(a)[2] > 0.50 ||
 		vd.template getProp<rho>(a) < RhoMin || vd.template getProp<rho>(a) > RhoMax)
-    {vd.template getProp<red>(a) = 1;}
-    else
-    {vd.template getProp<red>(a) = 0;}
-
-    vd.template getProp<velocity_prev>(a)[0] = velX;
-    vd.template getProp<velocity_prev>(a)[1] = velY;
-    vd.template getProp<velocity_prev>(a)[2] = velZ;
-    vd.template getProp<rho_prev>(a) = rhop;
+		{vd.template getProp<red>(a) = 1;}
+	else
+		{vd.template getProp<red>(a) = 0;}
+
+	vd.template getProp<velocity_prev>(a)[0] = velX;
+	vd.template getProp<velocity_prev>(a)[1] = velY;
+	vd.template getProp<velocity_prev>(a)[2] = velZ;
+	vd.template getProp<rho_prev>(a) = rhop;
 }
 
 void euler_int(particles & vd, real_number dt)
@@ -765,33 +764,36 @@ __global__ void sensor_pressure_gpu(vector_type vd, NN_type NN, Point<3,real_num
 	// We calculate the pressure normalizing the
 	// sum over all kernels
 	if (tot_ker == 0.0)
-	{*press_tmp = 0.0;}
+		{*press_tmp = 0.0;}
 	else
-	{*press_tmp = 1.0 / tot_ker * *press_tmp;}
+		{*press_tmp = 1.0 / tot_ker * *press_tmp;}
 }
 
 template<typename Vector, typename CellList>
 inline void sensor_pressure(Vector & vd,
-                            CellList & NN,
-                            openfpm::vector<openfpm::vector<real_number>> & press_t,
-                            openfpm::vector<Point<3,real_number>> & probes)
+							CellList & NN,
+							openfpm::vector<openfpm::vector<real_number>> & press_t,
+							openfpm::vector<Point<3,real_number>> & probes)
 {
-    Vcluster<> & v_cl = create_vcluster();
+	Vcluster<> & v_cl = create_vcluster();
 
-    press_t.add();
+	press_t.add();
 
-    for (size_t i = 0 ; i < probes.size() ; i++)
-    {
-    	// A float variable to calculate the pressure of the problem
-    	CudaMemory press_tmp_(sizeof(real_number));
-    	real_number press_tmp;
+	for (size_t i = 0 ; i < probes.size() ; i++)
+	{
+		// A float variable to calculate the pressure of the problem
+		CudaMemory press_tmp_(sizeof(real_number));
+		real_number press_tmp;
 
-        // if the probe is inside the processor domain
+		// if the probe is inside the processor domain
 		if (vd.getDecomposition().isLocal(probes.get(i)) == true)
 		{
-			CUDA_LAUNCH_DIM3(sensor_pressure_gpu,1,1,vd.toKernel_sorted(),NN.toKernel(),probes.get(i),(real_number *)press_tmp_.toKernel());
+			vd.template updateCellListGPU<type,Pressure>(NN);
 
-			vd.merge<Pressure>(NN);
+			Point<3,real_number> probe = probes.get(i);
+			CUDA_LAUNCH_DIM3(sensor_pressure_gpu,1,1,vd.toKernel(),NN.toKernel(),probe,(real_number *)press_tmp_.toKernel());
+
+			vd.template restoreOrder<>(NN);
 
 			// move calculated pressure on
 			press_tmp_.deviceToHost();
@@ -811,7 +813,7 @@ inline void sensor_pressure(Vector & vd,
 
 int main(int argc, char* argv[])
 {
-    // initialize the library
+	// initialize the library
 	openfpm_init(&argc,&argv);
 
 	openfpm::vector_gpu<aggregate<int>> fluid_ids;
@@ -825,18 +827,18 @@ int main(int argc, char* argv[])
 	openfpm::vector<openfpm::vector<real_number>> press_t;
 	openfpm::vector<Point<3,real_number>> probes;
 
-	probes.add({0.8779,0.3,0.02});
-	probes.add({0.754,0.31,0.02});
+	probes.add({0.8779f,0.3f,0.02f});
+	probes.add({0.754f,0.31f,0.02f});
 
 	// Here we define our domain a 2D box with internals from 0 to 1.0 for x and y
-	Box<3,real_number> domain({-0.05,-0.05,-0.05},{1.7010,0.7065,0.511});
-	size_t sz[3] = {413,179,133};
+	Box<3,real_number> domain({-0.05f,-0.05f,-0.05f},{1.7010f,0.7065f,0.511f});
+	size_t sz[3] = {207,90,66};
 
 	// Fill W_dap
 	W_dap = 1.0/Wab(H/1.5);
 
 	// Here we define the boundary conditions of our problem
-    size_t bc[3]={NON_PERIODIC,NON_PERIODIC,NON_PERIODIC};
+	size_t bc[3]={NON_PERIODIC,NON_PERIODIC,NON_PERIODIC};
 
 	// extended boundary around the domain, and the processor domain
 	Ghost<3,real_number> g(2*H);
@@ -847,7 +849,7 @@ int main(int argc, char* argv[])
 
 	// You can ignore all these dp/2.0 is a trick to reach the same initialization
 	// of Dual-SPH that use a different criteria to draw particles
-	Box<3,real_number> fluid_box({dp/2.0,dp/2.0,dp/2.0},{0.4+dp/2.0,0.67-dp/2.0,0.3+dp/2.0});
+	Box<3,real_number> fluid_box({dp/2.0f,dp/2.0f,dp/2.0f},{0.4f+dp/2.0f,0.67f-dp/2.0f,0.3f+dp/2.0f});
 
 	// return an iterator to the fluid particles to add to vd
 	auto fluid_it = DrawParticles::DrawBox(vd,sz,domain,fluid_box);
@@ -896,12 +898,12 @@ int main(int argc, char* argv[])
 	}
 
 	// Recipient
-	Box<3,real_number> recipient1({0.0,0.0,0.0},{1.6+dp/2.0,0.67+dp/2.0,0.4+dp/2.0});
-	Box<3,real_number> recipient2({dp,dp,dp},{1.6-dp/2.0,0.67-dp/2.0,0.4+dp/2.0});
+	Box<3,real_number> recipient1({0.0f,0.0f,0.0f},{1.6f+dp/2.0f,0.67f+dp/2.0f,0.4f+dp/2.0f});
+	Box<3,real_number> recipient2({dp,dp,dp},{1.6f-dp/2.0f,0.67f-dp/2.0f,0.4f+dp/2.0f});
 
-	Box<3,real_number> obstacle1({0.9,0.24-dp/2.0,0.0},{1.02+dp/2.0,0.36,0.45+dp/2.0});
-	Box<3,real_number> obstacle2({0.9+dp,0.24+dp/2.0,0.0},{1.02-dp/2.0,0.36-dp,0.45-dp/2.0});
-	Box<3,real_number> obstacle3({0.9+dp,0.24,0.0},{1.02,0.36,0.45});
+	Box<3,real_number> obstacle1({0.9f,0.24f-dp/2.0f,0.0f},{1.02f+dp/2.0f,0.36f,0.45f+dp/2.0f});
+	Box<3,real_number> obstacle2({0.9f+dp,0.24f+dp/2.0f,0.0f},{1.02f-dp/2.0f,0.36f-dp,0.45f-dp/2.0f});
+	Box<3,real_number> obstacle3({0.9f+dp,0.24f,0.0f},{1.02f,0.36f,0.45f});
 
 	openfpm::vector<Box<3,real_number>> holes;
 	holes.add(recipient2);
@@ -973,8 +975,7 @@ int main(int argc, char* argv[])
 
 	vd.ghost_get<type,rho,Pressure,velocity>(RUN_ON_DEVICE);
 
-	auto NN = vd.getCellListGPU/*<CELLLIST_GPU_SPARSE<3,float>>*/(2*H / 2.0, 2);
-	NN.setBoxNN(2);
+	auto NN = vd.getCellListGPU/*<CELLLIST_GPU_SPARSE<3,float>>*/(2*H / 2.0, CL_NON_SYMMETRIC | CL_GPU_REORDER, 2);
 
 	timer tot_sim;
 	tot_sim.start();
@@ -996,8 +997,8 @@ int main(int argc, char* argv[])
 			vd.map(RUN_ON_DEVICE);
 
 			// Rebalancer for now work on CPU , so move to CPU
-            vd.deviceToHostPos();
-            vd.template deviceToHostProp<type>();
+			vd.deviceToHostPos();
+			vd.template deviceToHostProp<type>();
 
 			it_reb = 0;
 			ModelCustom md;
@@ -1040,13 +1041,12 @@ int main(int argc, char* argv[])
 
 		t += dt;
 
-		if (write < t*10)
+		if (write < t*100)
 		{
 			// Sensor pressure require update ghost, so we ensure that particles are distributed correctly
 			// and ghost are updated
 			vd.map(RUN_ON_DEVICE);
 			vd.ghost_get<type,rho,Pressure,velocity>(RUN_ON_DEVICE);
-			vd.updateCellList(NN);
 
 			// calculate the pressure at the sensor points
 			//sensor_pressure(vd,NN,press_t,probes);
@@ -1106,7 +1106,7 @@ int main(int argc, char* argv[])
 
 int main(int argc, char* argv[])
 {
-        return 0;
+	return 0;
 }
 
 #endif
diff --git a/example/Vector/7_SPH_dlb_gpu_opt/main.cu b/example/Vector/7_SPH_dlb_gpu_opt/main.cu
index cf565f05..92cf1114 100644
--- a/example/Vector/7_SPH_dlb_gpu_opt/main.cu
+++ b/example/Vector/7_SPH_dlb_gpu_opt/main.cu
@@ -44,9 +44,6 @@
 #define OPENMPI
 //#define SE_CLASS1
 
-//#define USE_LOW_REGISTER_ITERATOR
-#define SCAN_WITH_CUB //<------ In case you want to use CUB for scan operations
-#define SORT_WITH_CUB
 //#define EXTERNAL_SET_GPU <----- In case you want to distribute the GPUs differently from the default
 
 #include "Vector/vector_dist.hpp"
@@ -64,7 +61,7 @@ typedef float real_number;
 #define FLUID 1
 
 // initial spacing between particles dp in the formulas
-const real_number dp = 0.00425;
+const real_number dp = 0.0085;
 // Maximum height of the fluid water
 // is going to be calculated and filled later on
 real_number h_swl = 0.0;
@@ -76,7 +73,7 @@ const real_number coeff_sound = 20.0;
 const real_number gamma_ = 7.0;
 
 // sqrt(3.0*dp*dp) support of the kernel
-const real_number H = 0.00736121593217;
+const real_number H = 0.0147224318643;
 
 // Eta in the formulas
 const real_number Eta2 = 0.01 * H*H;
@@ -90,10 +87,10 @@ const real_number visco = 0.1;
 real_number cbar = 0.0;
 
 // Mass of the fluid particles
-const real_number MassFluid = 0.0000767656;
+const real_number MassFluid = 0.000614125;
 
 // Mass of the boundary particles
-const real_number MassBound = 0.0000767656;
+const real_number MassBound = 0.000614125;
 
 //
 
@@ -303,9 +300,7 @@ inline __device__ __host__ real_number Pi(const Point<3,real_number> & dr, real_
 template<typename particles_type, typename NN_type>
 __global__ void calc_forces_gpu(particles_type vd, NN_type NN, real_number W_dap, real_number cbar)
 {
-	// ... a
 	unsigned int a;
-
 	GET_PARTICLE_SORT(a,NN);
 
 	real_number max_visc = 0.0f;
@@ -316,9 +311,6 @@ __global__ void calc_forces_gpu(particles_type vd, NN_type NN, real_number W_dap
 	// Type of the particle
 	unsigned int typea = vd.template getProp<type>(a);
 
-	// Take the mass of the particle dependently if it is FLUID or BOUNDARY
-	//real_number massa = (typea == FLUID)?MassFluid:MassBound;
-
 	// Get the density of the of the particle a
 	real_number rhoa = vd.template getProp<rho>(a);
 
@@ -349,16 +341,15 @@ __global__ void calc_forces_gpu(particles_type vd, NN_type NN, real_number W_dap
 		// if (p == q) skip this particle this condition should be done in the r^2 = 0
 		if (a == b)	{++Np; continue;};
 
-        	unsigned int typeb = vd.template getProp<type>(b);
+		unsigned int typeb = vd.template getProp<type>(b);
 
-        	real_number massb = (typeb == FLUID)?MassFluid:MassBound;
-        	Point<3,real_number> vb = vd.template getProp<velocity>(b);
-        	real_number Pb = vd.template getProp<Pressure>(b);
-        	real_number rhob = vd.template getProp<rho>(b);
+		real_number massb = (typeb == FLUID)?MassFluid:MassBound;
+		Point<3,real_number> vb = vd.template getProp<velocity>(b);
+		real_number Pb = vd.template getProp<Pressure>(b);
+		real_number rhob = vd.template getProp<rho>(b);
 
 		// Get the distance between p and q
 		Point<3,real_number> dr = xa - xb;
-		Point<3,real_number> v_rel = va - vb;
 		// take the norm of this vector
 		real_number r2 = norm2(dr);
 
@@ -367,6 +358,8 @@ __global__ void calc_forces_gpu(particles_type vd, NN_type NN, real_number W_dap
 		{
 			real_number r = sqrtf(r2);
 
+			Point<3,real_number> v_rel = va - vb;
+
 			Point<3,real_number> DW;
 			DWab(dr,DW,r);
 
@@ -401,11 +394,11 @@ template<typename CellList> inline void calc_forces(particles & vd, CellList & N
 	auto part = vd.getDomainIteratorGPU(96);
 
 	// Update the cell-list
-	vd.updateCellList(NN);
+	vd.updateCellListGPU<type,rho,Pressure,velocity>(NN);
 
-	CUDA_LAUNCH(calc_forces_gpu,part,vd.toKernel_sorted(),NN.toKernel(),W_dap,cbar);
+	CUDA_LAUNCH(calc_forces_gpu,part,vd.toKernel(),NN.toKernel(),W_dap,cbar);
 
-	vd.merge_sort<force,drho,red>(NN);
+	vd.restoreOrder<drho,force,red>(NN);
 
 	max_visc = reduce_local<red,_max_>(vd);
 }
@@ -505,7 +498,7 @@ __global__ void verlet_int_gpu(vector_dist_type vd, real_number dt, real_number
 	vd.template getProp<velocity>(a)[2] = vd.template getProp<velocity_prev>(a)[2] + vd.template getProp<force>(a)[2]*dt2;
 	vd.template getProp<rho>(a) = vd.template getProp<rho_prev>(a) + dt2*vd.template getProp<drho>(a);
 
-    // Check if the particle go out of range in space and in density
+    // Check if the particle go out of range in space and in density, if they do mark them to remove it later
     if (vd.getPos(a)[0] <  0.0 || vd.getPos(a)[1] < 0.0 || vd.getPos(a)[2] < 0.0 ||
         vd.getPos(a)[0] >  1.61 || vd.getPos(a)[1] > 0.68 || vd.getPos(a)[2] > 0.50 ||
 		vd.template getProp<rho>(a) < RhoMin || vd.template getProp<rho>(a) > RhoMax)
@@ -565,7 +558,7 @@ __global__ void euler_int_gpu(vector_type vd,real_number dt, real_number dt205)
 		return;
 	}
 
-	//-Calculate displacement and update position / Calcula desplazamiento y actualiza posicion.
+	//-Calculate displacement and update position
 	real_number dx = vd.template getProp<velocity>(a)[0]*dt + vd.template getProp<force>(a)[0]*dt205;
     real_number dy = vd.template getProp<velocity>(a)[1]*dt + vd.template getProp<force>(a)[1]*dt205;
     real_number dz = vd.template getProp<velocity>(a)[2]*dt + vd.template getProp<force>(a)[2]*dt205;
@@ -682,9 +675,12 @@ inline void sensor_pressure(Vector & vd,
         // if the probe is inside the processor domain
 		if (vd.getDecomposition().isLocal(probes.get(i)) == true)
 		{
-			CUDA_LAUNCH_DIM3(sensor_pressure_gpu,1,1,vd.toKernel_sorted(),NN.toKernel(),probes.get(i),(real_number *)press_tmp_.toKernel());
+			vd.template updateCellListGPU<type,Pressure>(NN);
+
+			Point<3,real_number> probe = probes.get(i);
+			CUDA_LAUNCH_DIM3(sensor_pressure_gpu,1,1,vd.toKernel(),NN.toKernel(),probe,(real_number *)press_tmp_.toKernel());
 
-			vd.merge<Pressure>(NN);
+			vd.template restoreOrder<>(NN);
 
 			// move calculated pressure on
 			press_tmp_.deviceToHost();
@@ -734,7 +730,7 @@ int main(int argc, char* argv[])
 
 	// Here we define our domain a 2D box with internals from 0 to 1.0 for x and y
 	Box<3,real_number> domain({-0.05f,-0.05f,-0.05f},{1.7010f,0.7065f,0.511f});
-	size_t sz[3] = {413,179,133};
+	size_t sz[3] = {207,90,66};
 
 	// Fill W_dap
 	W_dap = 1.0/Wab(H/1.5);
@@ -876,8 +872,7 @@ int main(int argc, char* argv[])
 
 	vd.ghost_get<type,rho,Pressure,velocity>(RUN_ON_DEVICE);
 
-	auto NN = vd.getCellListGPU/*<CELLLIST_GPU_SPARSE<3,float>>*/(2*H / 2.0, 2);
-	//NN.setBoxNN(2);
+	auto NN = vd.getCellListGPU(2*H / 2.0, CL_NON_SYMMETRIC | CL_GPU_REORDER_POSITION | CL_GPU_REORDER_PROPERTY | CL_GPU_RESTORE_PROPERTY, 2);
 
 	timer tot_sim;
 	tot_sim.start();
@@ -899,8 +894,8 @@ int main(int argc, char* argv[])
 			vd.map(RUN_ON_DEVICE);
 
 			// Rebalancer for now work on CPU , so move to CPU
-            vd.deviceToHostPos();
-            vd.template deviceToHostProp<type>();
+			vd.deviceToHostPos();
+			vd.template deviceToHostProp<type>();
 
 			it_reb = 0;
 			ModelCustom md;
@@ -913,10 +908,6 @@ int main(int argc, char* argv[])
 
 		vd.map(RUN_ON_DEVICE);
 
-		// it sort the vector (doesn not seem to produce some advantage)
-		// note force calculation is anyway sorted calculation
-		//vd.make_sort(NN);
-
 		// Calculate pressure from the density
 		EqState(vd);
 
@@ -947,13 +938,12 @@ int main(int argc, char* argv[])
 
 		t += dt;
 
-		if (write < t*10)
+		if (write < t*100)
 		{
 			// Sensor pressure require update ghost, so we ensure that particles are distributed correctly
 			// and ghost are updated
 			vd.map(RUN_ON_DEVICE);
 			vd.ghost_get<type,rho,Pressure,velocity>(RUN_ON_DEVICE);
-			vd.updateCellList(NN);
 
 			// calculate the pressure at the sensor points
 			//sensor_pressure(vd,NN,press_t,probes);
diff --git a/example/Vector/7_SPH_dlb_opt/main.cpp b/example/Vector/7_SPH_dlb_opt/main.cpp
index 77e39c05..4872e4b3 100644
--- a/example/Vector/7_SPH_dlb_opt/main.cpp
+++ b/example/Vector/7_SPH_dlb_opt/main.cpp
@@ -1073,7 +1073,7 @@ int main(int argc, char* argv[])
 			vd.ghost_get<type,rho,Pressure,velocity>();
 
 			/*! \cond [update_verlet_crs] \endcond */
-			vd.updateVerlet(NN,r_gskin,VL_CRS_SYMMETRIC);
+			vd.updateVerlet(NN,r_gskin);
 			/*! \cond [update_verlet_crs] \endcond */
 
 			tot_disp = 0.0;
diff --git a/example/Vector/7_SPH_dlb_opt/main_dbg.cpp b/example/Vector/7_SPH_dlb_opt/main_dbg.cpp
index c5105744..545341a2 100644
--- a/example/Vector/7_SPH_dlb_opt/main_dbg.cpp
+++ b/example/Vector/7_SPH_dlb_opt/main_dbg.cpp
@@ -415,7 +415,7 @@ template<typename VerletList> inline double calc_forces(particles & vd, VerletLi
 			////////////////////// NN2
 
 			// Get an iterator over the neighborhood particles of p
-			auto Np2 = NN2.getNNIterator(NN2.getCell(vd.getPos(a)));
+			auto Np2 = NN2.getNNIteratorBox(NN2.getCell(vd.getPos(a)));
 
 			// For each neighborhood particle
 			while (Np2.isNext() == true)
@@ -560,7 +560,7 @@ template<typename VerletList> inline double calc_forces(particles & vd, VerletLi
 			////////////////////// NN2
 
 			// Get an iterator over the neighborhood particles of p
-			auto Np2 = NN2.getNNIterator(NN2.getCell(vd.getPos(a)));
+			auto Np2 = NN2.getNNIteratorBox(NN2.getCell(vd.getPos(a)));
 
 			if (a == 0 && create_vcluster().getProcessUnitID() == 0)
 			{
@@ -1148,7 +1148,7 @@ int main(int argc, char* argv[])
 
 			vd.ghost_get<type,rho,Pressure,velocity,12>();
 
-			//vd.updateVerlet(NN,r_gskin,VL_CRS_SYMMETRIC);
+			//vd.updateVerlet(NN,r_gskin);
 			NN = vd.getVerletCrs(r_gskin);
 
 			vd.write("Debug");
diff --git a/example/Vector/8_DEM/main.cpp b/example/Vector/8_DEM/main.cpp
index 04227b07..9933aa67 100644
--- a/example/Vector/8_DEM/main.cpp
+++ b/example/Vector/8_DEM/main.cpp
@@ -347,7 +347,7 @@ int main(int argc, char* argv[])
 
 	size_t cnt = 0;
 	size_t cnt_reb = 0;
-	auto nlist = parts.getVerlet<VERLETLIST_BAL(3,double)>(cutoff + skin);
+	auto nlist = parts.getVerlet<VERLET_MEMBAL<3,double,VL_NON_SYMMETRIC>>(cutoff + skin);
 
 	double tot_sp = 0.0;
 
diff --git a/example/Vector/9_gpu_cuda_interop/main.cu b/example/Vector/9_gpu_cuda_interop/main.cu
index b8e3a0b0..abdce17f 100644
--- a/example/Vector/9_gpu_cuda_interop/main.cu
+++ b/example/Vector/9_gpu_cuda_interop/main.cu
@@ -124,10 +124,11 @@ Second particle property 2 component 11, address: 0x7f8d83400604
 #ifdef __NVCC__
 
 #include "Vector/vector_dist.hpp"
+#include "util/cuda_util.hpp"
 
 //! [print_data_kernel]
 
-__global__ void print_data_particle_50(float * scalar, float * vector, float * tensor, int capacity)
+__global__ void print_data_particle_50(float * scalar, float * vector, float * tensor, size_t capacity)
 {
 	int p = threadIdx.x + blockIdx.x * blockDim.x;
 
diff --git a/example/common.mk b/example/common.mk
index ae5dc686..f6722e70 100644
--- a/example/common.mk
+++ b/example/common.mk
@@ -8,7 +8,7 @@ CUDA_CC_LINK=
 
 ifdef HIP
     CUDA_CC=hipcc
-    CUDA_OPTIONS=-O3 --std=c++14 -D__NVCC__ -D__HIP__ -DCUDART_VERSION=11000 -D__CUDACC__ -D__CUDACC_VER_MAJOR__=11 -D__CUDACC_VER_MINOR__=0 -D__CUDACC_VER_BUILD__=0
+    CUDA_OPTIONS=-O3 --std=c++17 -D__NVCC__ -D__HIP__ -DCUDART_VERSION=11000 -D__CUDACC__ -D__CUDACC_VER_MAJOR__=11 -D__CUDACC_VER_MINOR__=0 -D__CUDACC_VER_BUILD__=0
     LIBS_SELECT=$(LIBS)
     CC=hipcc
     CUDA_CC_LINK=hipcc
@@ -18,7 +18,7 @@ else
         CUDA_CC=mpic++ -x c++
         INCLUDE_PATH_NVCC=$(INCLUDE_PATH)
         CUDA_CC_LINK=mpic++
-        CUDA_OPTIONS=-O3 --std=c++14 -DTEST_RUN
+        CUDA_OPTIONS=-O3 --std=c++17 -DTEST_RUN
         LIBS_SELECT=$(LIBS) -lboost_context
     else
         ifeq (, $(shell which nvcc))
@@ -31,7 +31,7 @@ else
             INCLUDE_PATH_NVCC:=-Xcompiler=-Wno-deprecated-declarations $(INCLUDE_PATH_NVCC)
             CUDA_CC=nvcc -ccbin=mpic++
             CUDA_CC_LINK=nvcc -ccbin=mpic++
-            CUDA_OPTIONS=-O3 --std=c++14 -DTEST_RUN -use_fast_math  -arch=sm_61 -lineinfo --extended-lambda --expt-relaxed-constexpr
+            CUDA_OPTIONS=-O3 --std=c++17 -DTEST_RUN -use_fast_math  -arch=sm_61 -lineinfo --extended-lambda --expt-relaxed-constexpr
             LIBS_SELECT=$(LIBS_NVCC)
         endif
     endif
@@ -46,7 +46,7 @@ else
     CUDA_CC_LINK:=$(CUDA_CC_LINK)
 endif
 
-OPT=-g -O3 --std=c++14 -DTEST_RUN
+OPT=-g -O3 --std=c++17 -DTEST_RUN
 INCLUDE_PATH:=-Wno-deprecated-declarations $(INCLUDE_PATH) 
 
 INCLUDE_PATH_NVCC:=$(INCLUDE_PATH_NVCC) -I./include 
diff --git a/openfpm_data b/openfpm_data
index 57c56d6e..cc219d5d 160000
--- a/openfpm_data
+++ b/openfpm_data
@@ -1 +1 @@
-Subproject commit 57c56d6ecb76b2407b797d3fb323648c1b1199c9
+Subproject commit cc219d5debc37dc29b0f5e2e054848e63e9b06aa
diff --git a/openfpm_devices b/openfpm_devices
index 47c09a15..5234c53d 160000
--- a/openfpm_devices
+++ b/openfpm_devices
@@ -1 +1 @@
-Subproject commit 47c09a1505df443fb5850695f4fa25b57530680f
+Subproject commit 5234c53d59bffd4b1ec34f7c6832be97b2923b23
diff --git a/openfpm_io b/openfpm_io
index ffd00931..65af8658 160000
--- a/openfpm_io
+++ b/openfpm_io
@@ -1 +1 @@
-Subproject commit ffd00931bd0c8133cfefd86697a48063cbc95907
+Subproject commit 65af8658a778c99429f769e7bc1c830fc32e5f25
diff --git a/openfpm_numerics b/openfpm_numerics
index 3604644b..1f1619c3 160000
--- a/openfpm_numerics
+++ b/openfpm_numerics
@@ -1 +1 @@
-Subproject commit 3604644ba879a566d0fa5e2a96e66c861c6e7ed5
+Subproject commit 1f1619c31738021bbfe4f0d2db092f01568a2889
diff --git a/openfpm_pdata b/openfpm_pdata
index 995d66dd..71b5c454 160000
--- a/openfpm_pdata
+++ b/openfpm_pdata
@@ -1 +1 @@
-Subproject commit 995d66dd3271b5e3238b66e4dbcac4befb2e3d94
+Subproject commit 71b5c45455a262b71f82f880d803f9d4620e52a2
diff --git a/openfpm_vcluster b/openfpm_vcluster
index de9bc6a7..d85989d8 160000
--- a/openfpm_vcluster
+++ b/openfpm_vcluster
@@ -1 +1 @@
-Subproject commit de9bc6a7616451a00ccf0745e61039e48984bb0a
+Subproject commit d85989d87a6509e50eed2756258d95b590932ede