Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

buffer cmake option for page locked memory in host transfers #71

Merged
merged 2 commits into from
Sep 25, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -158,6 +158,14 @@ else()
message(STATUS "HAMR: User defined objects -- disabled")
endif()

# page locked memory for host transfers
set(HAMR_ENABLE_PAGE_LOCKED_MEMORY OFF CACHE BOOL
"Enables the use of page locked memory for host transfers.")
if (HAMR_ENABLE_PAGE_LOCKED_MEMORY)
message(STATUS "HAMR: Page locked memory for host transfers -- enabled")
else()
message(STATUS "HAMR: Page locked memory for host transfers -- disabled")
endif()

# add the requisite flags. CMake enthusiasts will tell you that this is "not
# the CMake way". However, CMake has spotty coverage, is inconsistent in
Expand Down
11 changes: 7 additions & 4 deletions hamr_buffer_impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -1768,17 +1768,20 @@ std::shared_ptr<const T> buffer<T>::get_host_accessible() const
else if ((m_alloc == allocator::cuda) || (m_alloc == allocator::cuda_async))
{
// make a copy on the host.
std::shared_ptr<T> tmp = malloc_allocator<T>::allocate(m_size);
/*TODO:Using cudaMallocHost caused performance issues on Perlmutter
#if defined(HAMR_ENABLE_PAGE_LOCKED_MEMORY)
// Using cudaMallocHost caused performance issues on Perlmutter w. CUDA 11.7
// however, page locked memory is required for asynchronous transfers.
std::shared_ptr<T> tmp = cuda_malloc_host_allocator<T>::allocate(m_size);
if (!tmp)
{
std::cerr << "[" << __FILE__ << ":" << __LINE__ << "] ERROR:"
" CUDA failed to allocate host pinned memory, falling back"
" to the default system allocator." << std::endl;
tmp = malloc_allocator<T>::allocate(m_size);
}*/

}
#else
std::shared_ptr<T> tmp = malloc_allocator<T>::allocate(m_size);
#endif
activate_cuda_device dev(m_owner);

if (copy_to_host_from_cuda(m_stream, tmp.get(), m_data.get(), m_size))
Expand Down
1 change: 1 addition & 0 deletions hamr_config.cmake.in
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ set(HAMR_NVHPC_CUDA @HAMR_NVHPC_CUDA@)
set(HAMR_ENABLE_HIP @HAMR_ENABLE_HIP@)
set(HAMR_ENABLE_OPENMP @HAMR_ENABLE_OPENMP@)
set(HAMR_ENABLE_OBJECTS @HAMR_ENABLE_OBJECTS@)
set(HAMR_ENABLE_PAGE_LOCKED_MEMORY @HAMR_ENABLE_PAGE_LOCKED_MEMORY@)
set(HAMR_ENABLE_PYTHON @HAMR_ENABLE_PYTHON@)
set(HAMR_VERBOSE @HAMR_VERBOSE@)

Expand Down
1 change: 1 addition & 0 deletions hamr_config.h.in
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
#cmakedefine HAMR_ENABLE_OPENMP
#define HAMR_OPENMP_LOOP @HAMR_OPENMP_LOOP@
#cmakedefine HAMR_ENABLE_OBJECTS
#cmakedefine HAMR_ENABLE_PAGE_LOCKED_MEMORY
#cmakedefine HAMR_ENABLE_PYTHON
#cmakedefine HAMR_VERBOSE

Expand Down
39 changes: 39 additions & 0 deletions hamr_copier_traits.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
#ifndef hamr_copier_traits_h
#define hamr_copier_traits_h

#include "hamr_config.h"
#include <type_traits>

namespace hamr
{

/// @name use_object_copier
/// Selects the element-wise object copy path: true when at least one of
/// T or U is not an arithmetic type.
///@{
template <typename T, typename U,
    bool enabled = (!std::is_arithmetic<T>::value || !std::is_arithmetic<U>::value)>
struct use_object_copier : std::false_type {};

template <typename T, typename U>
struct use_object_copier<T, U, true> : std::true_type {};

/// SFINAE helper: well formed only when the object copier applies
template <typename T, typename U>
using use_object_copier_t =
    typename std::enable_if<use_object_copier<T,U>::value>::type;
///@}


/// @name use_cons_copier
/// Selects the converting (per-element construction) copy path for arrays
/// of differing element types. When object support is compiled in, any pair
/// of distinct types qualifies; otherwise T must also be arithmetic.
///@{
#if defined(HAMR_ENABLE_OBJECTS)
template <typename T, typename U,
    bool enabled = (!std::is_same<T,U>::value)>
struct use_cons_copier : std::false_type {};

template <typename T, typename U>
struct use_cons_copier<T, U, true> : std::true_type {};

/// SFINAE helper: well formed only when the converting copier applies
template <typename T, typename U>
using use_cons_copier_t =
    typename std::enable_if<use_cons_copier<T,U>::value>::type;
#else
template <typename T, typename U,
    bool enabled = (!std::is_same<T,U>::value && std::is_arithmetic<T>::value)>
struct use_cons_copier : std::false_type {};

template <typename T, typename U>
struct use_cons_copier<T, U, true> : std::true_type {};

/// SFINAE helper: well formed only when the converting copier applies
template <typename T, typename U>
using use_cons_copier_t =
    typename std::enable_if<use_cons_copier<T,U>::value>::type;
#endif
///@}

/// @name use_bytes_copier
/// Selects the fast byte-wise copy path: true only when T and U are the
/// same arithmetic type.
///@{
template <typename T, typename U,
    bool enabled = (std::is_same<T,U>::value && std::is_arithmetic<T>::value)>
struct use_bytes_copier : std::false_type {};

template <typename T, typename U>
struct use_bytes_copier<T, U, true> : std::true_type {};

/// SFINAE helper: well formed only when the byte-wise copier applies
template <typename T, typename U>
using use_bytes_copier_t =
    typename std::enable_if<use_bytes_copier<T,U>::value>::type;
///@}

}

#endif
29 changes: 9 additions & 20 deletions hamr_cuda_copy_async.cxx
Original file line number Diff line number Diff line change
Expand Up @@ -3,29 +3,18 @@
#include "hamr_cuda_copy_async.h"
#include "hamr_cuda_copy_async_impl.h"

#if !defined(HAMR_ENABLE_OBJECTS)

#define hamr_cuda_copy_async_instantiate_(T, U) \
template int hamr::copy_to_cuda_from_host<T,U>(cudaStream_t strm, T *dest, const U *src, size_t n_elem, void *); \
template int hamr::copy_to_cuda_from_cuda<T,U>(cudaStream_t strm, T *dest, const U *src, size_t n_elem, void *); \
template int hamr::copy_to_cuda_from_cuda<T,U>(cudaStream_t strm, T *dest, const U *src, int src_device, size_t n_elem, void *); \
template int hamr::copy_to_host_from_cuda<T,U>(cudaStream_t strm, T *dest, const U *src, size_t n_elem, void *);

#else

#define hamr_cuda_copy_async_instantiate_(T, U) \
template int hamr::copy_to_cuda_from_host<T,U>(cudaStream_t strm, T *dest, const U *src, size_t n_elem); \
template int hamr::copy_to_cuda_from_cuda<T,U>(cudaStream_t strm, T *dest, const U *src, size_t n_elem); \
template int hamr::copy_to_cuda_from_cuda<T,U>(cudaStream_t strm, T *dest, const U *src, int src_device, size_t n_elem); \
template int hamr::copy_to_host_from_cuda<T,U>(cudaStream_t strm, T *dest, const U *src, size_t n_elem);

#endif
template int hamr::copy_to_cuda_from_host<T,U>(cudaStream_t strm, T *dest, const U *src, size_t n_elem, hamr::use_cons_copier_t<T,U> *); \
template int hamr::copy_to_cuda_from_cuda<T,U>(cudaStream_t strm, T *dest, const U *src, size_t n_elem, hamr::use_cons_copier_t<T,U> *); \
template int hamr::copy_to_cuda_from_cuda<T,U>(cudaStream_t strm, T *dest, const U *src, int src_device, size_t n_elem, hamr::use_cons_copier_t<T,U> *); \
template int hamr::copy_to_host_from_cuda<T,U>(cudaStream_t strm, T *dest, const U *src, size_t n_elem, hamr::use_cons_copier_t<T,U> *);

#define hamr_cuda_copy_async_instantiate__(T) \
template int hamr::copy_to_cuda_from_host<T>(cudaStream_t strm, T *dest, const T *src, size_t n_elem, void *); \
template int hamr::copy_to_cuda_from_cuda<T>(cudaStream_t strm, T *dest, const T *src, size_t n_elem, void *); \
template int hamr::copy_to_cuda_from_cuda<T>(cudaStream_t strm, T *dest, const T *src, int src_device, size_t n_elem, void *); \
template int hamr::copy_to_host_from_cuda<T>(cudaStream_t strm, T *dest, const T *src, size_t n_elem, void *);
template int hamr::copy_to_cuda_from_host<T,T>(cudaStream_t strm, T *dest, const T *src, size_t n_elem, hamr::use_bytes_copier_t<T,T> *); \
template int hamr::copy_to_cuda_from_cuda<T,T>(cudaStream_t strm, T *dest, const T *src, size_t n_elem, hamr::use_bytes_copier_t<T,T> *); \
template int hamr::copy_to_cuda_from_cuda<T,T>(cudaStream_t strm, T *dest, const T *src, int src_device, size_t n_elem, hamr::use_bytes_copier_t<T,T> *); \
template int hamr::copy_to_host_from_cuda<T,T>(cudaStream_t strm, T *dest, const T *src, size_t n_elem, hamr::use_bytes_copier_t<T,T> *);


hamr_cuda_copy_async_instantiate__(float)
hamr_cuda_copy_async_instantiate__(double)
Expand Down
89 changes: 38 additions & 51 deletions hamr_cuda_copy_async.h
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@
#define hamr_cuda_copy_async_h

#include "hamr_config.h"
#include "hamr_copier_traits.h"

#include <memory>
#include <type_traits>

Expand All @@ -20,8 +22,9 @@ namespace hamr
*/
template <typename T, typename U>
int copy_to_cuda_from_host(cudaStream_t str, T *dest, const U *src, size_t n_elem,
typename std::enable_if<!std::is_arithmetic<T>::value>::type * = nullptr);
#else
hamr::use_object_copier_t<T,U> * = nullptr);
#endif

/** Copies an array to the active CUDA device (fast path for arrays of
* arithmetic types of the same type).
*
Expand All @@ -32,10 +35,9 @@ int copy_to_cuda_from_host(cudaStream_t str, T *dest, const U *src, size_t n_ele
*
* @returns 0 if there were no errors
*/
template <typename T>
int copy_to_cuda_from_host(cudaStream_t str, T *dest, const T *src, size_t n_elem,
typename std::enable_if<std::is_arithmetic<T>::value>::type * = nullptr);
#endif
template <typename T, typename U>
int copy_to_cuda_from_host(cudaStream_t str, T *dest, const U *src, size_t n_elem,
hamr::use_bytes_copier_t<T,U> * = nullptr);

/** Copies an array to the active CUDA device.
*
Expand All @@ -47,11 +49,9 @@ int copy_to_cuda_from_host(cudaStream_t str, T *dest, const T *src, size_t n_ele
* @returns 0 if there were no errors
*/
template <typename T, typename U>
int copy_to_cuda_from_host(cudaStream_t str, T *dest, const U *src, size_t n_elem
#if !defined(HAMR_ENABLE_OBJECTS)
,typename std::enable_if<std::is_arithmetic<T>::value>::type * = nullptr
#endif
);
int copy_to_cuda_from_host(cudaStream_t str, T *dest, const U *src, size_t n_elem,
hamr::use_cons_copier_t<T,U> * = nullptr);


#if !defined(HAMR_ENABLE_OBJECTS)
/** Copies an array on the active CUDA device.
Expand All @@ -65,8 +65,9 @@ int copy_to_cuda_from_host(cudaStream_t str, T *dest, const U *src, size_t n_ele
*/
template <typename T, typename U>
int copy_to_cuda_from_cuda(cudaStream_t str, T *dest, const U *src, size_t n_elem,
typename std::enable_if<!std::is_arithmetic<T>::value>::type * = nullptr);
#else
hamr::use_object_copier_t<T,U> * = nullptr);
#endif

/** Copies an array on the active CUDA device (fast path for arrays of
* arithmetic types of the same type).
*
Expand All @@ -77,10 +78,10 @@ int copy_to_cuda_from_cuda(cudaStream_t str, T *dest, const U *src, size_t n_ele
*
* @returns 0 if there were no errors
*/
template <typename T>
int copy_to_cuda_from_cuda(cudaStream_t str, T *dest, const T *src, size_t n_elem,
typename std::enable_if<std::is_arithmetic<T>::value>::type * = nullptr);
#endif
template <typename T, typename U>
int copy_to_cuda_from_cuda(cudaStream_t str, T *dest, const U *src, size_t n_elem,
hamr::use_bytes_copier_t<T,U> * = nullptr);


/** Copies an array on the active CUDA device.
*
Expand All @@ -92,11 +93,8 @@ int copy_to_cuda_from_cuda(cudaStream_t str, T *dest, const T *src, size_t n_ele
* @returns 0 if there were no errors
*/
template <typename T, typename U>
int copy_to_cuda_from_cuda(cudaStream_t str, T *dest, const U *src, size_t n_elem
#if !defined(HAMR_ENABLE_OBJECTS)
,typename std::enable_if<std::is_arithmetic<T>::value>::type * = nullptr
#endif
);
int copy_to_cuda_from_cuda(cudaStream_t str, T *dest, const U *src, size_t n_elem,
hamr::use_cons_copier_t<T,U> * = nullptr);

#if !defined(HAMR_ENABLE_OBJECTS)
/** Copies an array to the active CUDA device from the named CUDA device,
Expand All @@ -110,10 +108,10 @@ int copy_to_cuda_from_cuda(cudaStream_t str, T *dest, const U *src, size_t n_ele
* @returns 0 if there were no errors
*/
template <typename T, typename U>
int copy_to_cuda_from_cuda(cudaStream_t str, T *dest,
const U *src, int src_device, size_t n_elem,
typename std::enable_if<!std::is_arithmetic<T>::value>::type * = nullptr);
#else
int copy_to_cuda_from_cuda(cudaStream_t str, T *dest, const U *src,
int src_device, size_t n_elem, hamr::use_object_copier_t<T,U> * = nullptr);
#endif

/** Copies an array to the active CUDA device from the named CUDA device, (fast
* path for arrays of arithmetic types of the same type).
*
Expand All @@ -125,11 +123,9 @@ int copy_to_cuda_from_cuda(cudaStream_t str, T *dest,
*
* @returns 0 if there were no errors
*/
template <typename T>
int copy_to_cuda_from_cuda(cudaStream_t str, T *dest,
const T *src, int src_device, size_t n_elem,
typename std::enable_if<std::is_arithmetic<T>::value>::type * = nullptr);
#endif
template <typename T, typename U>
int copy_to_cuda_from_cuda(cudaStream_t str, T *dest, const U *src,
int src_device, size_t n_elem, hamr::use_bytes_copier_t<T,U> * = nullptr);

/** Copies an array on the active CUDA device.
*
Expand All @@ -142,12 +138,8 @@ int copy_to_cuda_from_cuda(cudaStream_t str, T *dest,
* @returns 0 if there were no errors
*/
template <typename T, typename U>
int copy_to_cuda_from_cuda(cudaStream_t str, T *dest,
const U *src, int src_device, size_t n_elem
#if !defined(HAMR_ENABLE_OBJECTS)
,typename std::enable_if<std::is_arithmetic<T>::value>::type * = nullptr
#endif
);
int copy_to_cuda_from_cuda(cudaStream_t str, T *dest, const U *src,
int src_device, size_t n_elem, hamr::use_cons_copier_t<T,U> * = nullptr);

#if !defined(HAMR_ENABLE_OBJECTS)
/** Copies an array from the active CUDA device.
Expand All @@ -160,10 +152,10 @@ int copy_to_cuda_from_cuda(cudaStream_t str, T *dest,
* @returns 0 if there were no errors
*/
template <typename T, typename U>
int copy_to_host_from_cuda(cudaStream_t str, T *dest,
const U *src, size_t n_elem,
typename std::enable_if<!std::is_arithmetic<T>::value>::type * = nullptr);
#else
int copy_to_host_from_cuda(cudaStream_t str, T *dest, const U *src, size_t n_elem,
hamr::use_object_copier_t<T,U> * = nullptr);
#endif

/** Copies an array from the active CUDA device (fast path for arrays of
* arithmetic types of the same type).
*
Expand All @@ -174,11 +166,9 @@ int copy_to_host_from_cuda(cudaStream_t str, T *dest,
*
* @returns 0 if there were no errors
*/
template <typename T>
int copy_to_host_from_cuda(cudaStream_t str, T *dest,
const T *src, size_t n_elem,
typename std::enable_if<std::is_arithmetic<T>::value>::type * = nullptr);
#endif
template <typename T, typename U>
int copy_to_host_from_cuda(cudaStream_t str, T *dest, const U *src, size_t n_elem,
hamr::use_bytes_copier_t<T,U> * = nullptr);

/** Copies an array from the active CUDA device.
*
Expand All @@ -188,11 +178,8 @@ int copy_to_host_from_cuda(cudaStream_t str, T *dest,
* @returns 0 if there were no errors
*/
template <typename T, typename U>
int copy_to_host_from_cuda(cudaStream_t str, T *dest, const U *src, size_t n_elem
#if !defined(HAMR_ENABLE_OBJECTS)
,typename std::enable_if<std::is_arithmetic<T>::value>::type * = nullptr
#endif
);
int copy_to_host_from_cuda(cudaStream_t str, T *dest, const U *src, size_t n_elem,
hamr::use_cons_copier_t<T,U> * = nullptr);

}

Expand Down
Loading
Loading