Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

buffer cmake option for page locked memory in host transfers #71

Merged
merged 2 commits into from
Sep 25, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -158,6 +158,14 @@ else()
message(STATUS "HAMR: User defined objects -- disabled")
endif()

# page locked memory for host transfers
set(HAMR_ENABLE_PAGE_LOCKED_MEMORY OFF CACHE BOOL
"Enables the use of page locked memory for host transfers.")
if (HAMR_ENABLE_PAGE_LOCKED_MEMORY)
message(STATUS "HAMR: Page locked memory for host transfers -- enabled")
else()
message(STATUS "HAMR: Page locked memory for host transfers -- disabled")
endif()

# add the requisite flags. CMake enthusiasts will tell you that this is "not
# the CMake way". However, CMake has spotty coverage, is inconsistent in
Expand Down
11 changes: 7 additions & 4 deletions hamr_buffer_impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -1768,17 +1768,20 @@ std::shared_ptr<const T> buffer<T>::get_host_accessible() const
else if ((m_alloc == allocator::cuda) || (m_alloc == allocator::cuda_async))
{
// make a copy on the host.
std::shared_ptr<T> tmp = malloc_allocator<T>::allocate(m_size);
/*TODO:Using cudaMallocHost caused performance issues on Perlmutter
#if defined(HAMR_ENABLE_PAGE_LOCKED_MEMORY)
// Using cudaMallocHost caused performance issues on Perlmutter w. CUDA 11.7
// however, page locked memory is required for asynchronous transfers.
std::shared_ptr<T> tmp = cuda_malloc_host_allocator<T>::allocate(m_size);
if (!tmp)
{
std::cerr << "[" << __FILE__ << ":" << __LINE__ << "] ERROR:"
" CUDA failed to allocate host pinned memory, falling back"
" to the default system allocator." << std::endl;
tmp = malloc_allocator<T>::allocate(m_size);
}*/

}
#else
std::shared_ptr<T> tmp = malloc_allocator<T>::allocate(m_size);
#endif
activate_cuda_device dev(m_owner);

if (copy_to_host_from_cuda(m_stream, tmp.get(), m_data.get(), m_size))
Expand Down
1 change: 1 addition & 0 deletions hamr_config.cmake.in
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ set(HAMR_NVHPC_CUDA @HAMR_NVHPC_CUDA@)
set(HAMR_ENABLE_HIP @HAMR_ENABLE_HIP@)
set(HAMR_ENABLE_OPENMP @HAMR_ENABLE_OPENMP@)
set(HAMR_ENABLE_OBJECTS @HAMR_ENABLE_OBJECTS@)
set(HAMR_ENABLE_PAGE_LOCKED_MEMORY @HAMR_ENABLE_PAGE_LOCKED_MEMORY@)
set(HAMR_ENABLE_PYTHON @HAMR_ENABLE_PYTHON@)
set(HAMR_VERBOSE @HAMR_VERBOSE@)

Expand Down
1 change: 1 addition & 0 deletions hamr_config.h.in
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
#cmakedefine HAMR_ENABLE_OPENMP
#define HAMR_OPENMP_LOOP @HAMR_OPENMP_LOOP@
#cmakedefine HAMR_ENABLE_OBJECTS
#cmakedefine HAMR_ENABLE_PAGE_LOCKED_MEMORY
#cmakedefine HAMR_ENABLE_PYTHON
#cmakedefine HAMR_VERBOSE

Expand Down
39 changes: 39 additions & 0 deletions hamr_copier_traits.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
#ifndef hamr_copier_traits_h
#define hamr_copier_traits_h

#include "hamr_config.h"
#include <type_traits>

namespace hamr
{

/// @name use_object_copier
/// Selects the element-wise object copy path: true when at least one of
/// T or U is not an arithmetic type.
///@{
template <typename T, typename U,
    bool enabled = (!std::is_arithmetic<T>::value || !std::is_arithmetic<U>::value)>
struct use_object_copier : std::false_type {};

template <typename T, typename U>
struct use_object_copier<T, U, true> : std::true_type {};

/// SFINAE helper: well formed only when the object copier applies
template <typename T, typename U>
using use_object_copier_t =
    typename std::enable_if<use_object_copier<T,U>::value>::type;
///@}


/// @name use_cons_copier
/// Selects the converting (per-element construction) copy path for arrays
/// of differing element types. When object support is compiled in, any pair
/// of distinct types qualifies; otherwise T must also be arithmetic.
///@{
#if defined(HAMR_ENABLE_OBJECTS)
template <typename T, typename U,
    bool enabled = (!std::is_same<T,U>::value)>
struct use_cons_copier : std::false_type {};

template <typename T, typename U>
struct use_cons_copier<T, U, true> : std::true_type {};

/// SFINAE helper: well formed only when the converting copier applies
template <typename T, typename U>
using use_cons_copier_t =
    typename std::enable_if<use_cons_copier<T,U>::value>::type;
#else
template <typename T, typename U,
    bool enabled = (!std::is_same<T,U>::value && std::is_arithmetic<T>::value)>
struct use_cons_copier : std::false_type {};

template <typename T, typename U>
struct use_cons_copier<T, U, true> : std::true_type {};

/// SFINAE helper: well formed only when the converting copier applies
template <typename T, typename U>
using use_cons_copier_t =
    typename std::enable_if<use_cons_copier<T,U>::value>::type;
#endif
///@}

/// @name use_bytes_copier
/// Selects the fast byte-wise copy path: true only when T and U are the
/// same arithmetic type.
///@{
template <typename T, typename U,
    bool enabled = (std::is_same<T,U>::value && std::is_arithmetic<T>::value)>
struct use_bytes_copier : std::false_type {};

template <typename T, typename U>
struct use_bytes_copier<T, U, true> : std::true_type {};

/// SFINAE helper: well formed only when the byte-wise copier applies
template <typename T, typename U>
using use_bytes_copier_t =
    typename std::enable_if<use_bytes_copier<T,U>::value>::type;
///@}

}

#endif
29 changes: 9 additions & 20 deletions hamr_cuda_copy_async.cxx
Original file line number Diff line number Diff line change
Expand Up @@ -3,29 +3,18 @@
#include "hamr_cuda_copy_async.h"
#include "hamr_cuda_copy_async_impl.h"

#if !defined(HAMR_ENABLE_OBJECTS)

#define hamr_cuda_copy_async_instantiate_(T, U) \
template int hamr::copy_to_cuda_from_host<T,U>(cudaStream_t strm, T *dest, const U *src, size_t n_elem, void *); \
template int hamr::copy_to_cuda_from_cuda<T,U>(cudaStream_t strm, T *dest, const U *src, size_t n_elem, void *); \
template int hamr::copy_to_cuda_from_cuda<T,U>(cudaStream_t strm, T *dest, const U *src, int src_device, size_t n_elem, void *); \
template int hamr::copy_to_host_from_cuda<T,U>(cudaStream_t strm, T *dest, const U *src, size_t n_elem, void *);

#else

#define hamr_cuda_copy_async_instantiate_(T, U) \
template int hamr::copy_to_cuda_from_host<T,U>(cudaStream_t strm, T *dest, const U *src, size_t n_elem); \
template int hamr::copy_to_cuda_from_cuda<T,U>(cudaStream_t strm, T *dest, const U *src, size_t n_elem); \
template int hamr::copy_to_cuda_from_cuda<T,U>(cudaStream_t strm, T *dest, const U *src, int src_device, size_t n_elem); \
template int hamr::copy_to_host_from_cuda<T,U>(cudaStream_t strm, T *dest, const U *src, size_t n_elem);

#endif
template int hamr::copy_to_cuda_from_host<T,U>(cudaStream_t strm, T *dest, const U *src, size_t n_elem, hamr::use_cons_copier_t<T,U> *); \
template int hamr::copy_to_cuda_from_cuda<T,U>(cudaStream_t strm, T *dest, const U *src, size_t n_elem, hamr::use_cons_copier_t<T,U> *); \
template int hamr::copy_to_cuda_from_cuda<T,U>(cudaStream_t strm, T *dest, const U *src, int src_device, size_t n_elem, hamr::use_cons_copier_t<T,U> *); \
template int hamr::copy_to_host_from_cuda<T,U>(cudaStream_t strm, T *dest, const U *src, size_t n_elem, hamr::use_cons_copier_t<T,U> *);

#define hamr_cuda_copy_async_instantiate__(T) \
template int hamr::copy_to_cuda_from_host<T>(cudaStream_t strm, T *dest, const T *src, size_t n_elem, void *); \
template int hamr::copy_to_cuda_from_cuda<T>(cudaStream_t strm, T *dest, const T *src, size_t n_elem, void *); \
template int hamr::copy_to_cuda_from_cuda<T>(cudaStream_t strm, T *dest, const T *src, int src_device, size_t n_elem, void *); \
template int hamr::copy_to_host_from_cuda<T>(cudaStream_t strm, T *dest, const T *src, size_t n_elem, void *);
template int hamr::copy_to_cuda_from_host<T,T>(cudaStream_t strm, T *dest, const T *src, size_t n_elem, hamr::use_bytes_copier_t<T,T> *); \
template int hamr::copy_to_cuda_from_cuda<T,T>(cudaStream_t strm, T *dest, const T *src, size_t n_elem, hamr::use_bytes_copier_t<T,T> *); \
template int hamr::copy_to_cuda_from_cuda<T,T>(cudaStream_t strm, T *dest, const T *src, int src_device, size_t n_elem, hamr::use_bytes_copier_t<T,T> *); \
template int hamr::copy_to_host_from_cuda<T,T>(cudaStream_t strm, T *dest, const T *src, size_t n_elem, hamr::use_bytes_copier_t<T,T> *);


hamr_cuda_copy_async_instantiate__(float)
hamr_cuda_copy_async_instantiate__(double)
Expand Down
89 changes: 38 additions & 51 deletions hamr_cuda_copy_async.h
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@
#define hamr_cuda_copy_async_h

#include "hamr_config.h"
#include "hamr_copier_traits.h"

#include <memory>
#include <type_traits>

Expand All @@ -20,8 +22,9 @@ namespace hamr
*/
template <typename T, typename U>
int copy_to_cuda_from_host(cudaStream_t str, T *dest, const U *src, size_t n_elem,
typename std::enable_if<!std::is_arithmetic<T>::value>::type * = nullptr);
#else
hamr::use_object_copier_t<T,U> * = nullptr);
#endif

/** Copies an array to the active CUDA device (fast path for arrays of
* arithmetic types of the same type).
*
Expand All @@ -32,10 +35,9 @@ int copy_to_cuda_from_host(cudaStream_t str, T *dest, const U *src, size_t n_ele
*
* @returns 0 if there were no errors
*/
template <typename T>
int copy_to_cuda_from_host(cudaStream_t str, T *dest, const T *src, size_t n_elem,
typename std::enable_if<std::is_arithmetic<T>::value>::type * = nullptr);
#endif
template <typename T, typename U>
int copy_to_cuda_from_host(cudaStream_t str, T *dest, const U *src, size_t n_elem,
hamr::use_bytes_copier_t<T,U> * = nullptr);

/** Copies an array to the active CUDA device.
*
Expand All @@ -47,11 +49,9 @@ int copy_to_cuda_from_host(cudaStream_t str, T *dest, const T *src, size_t n_ele
* @returns 0 if there were no errors
*/
template <typename T, typename U>
int copy_to_cuda_from_host(cudaStream_t str, T *dest, const U *src, size_t n_elem
#if !defined(HAMR_ENABLE_OBJECTS)
,typename std::enable_if<std::is_arithmetic<T>::value>::type * = nullptr
#endif
);
int copy_to_cuda_from_host(cudaStream_t str, T *dest, const U *src, size_t n_elem,
hamr::use_cons_copier_t<T,U> * = nullptr);


#if !defined(HAMR_ENABLE_OBJECTS)
/** Copies an array on the active CUDA device.
Expand All @@ -65,8 +65,9 @@ int copy_to_cuda_from_host(cudaStream_t str, T *dest, const U *src, size_t n_ele
*/
template <typename T, typename U>
int copy_to_cuda_from_cuda(cudaStream_t str, T *dest, const U *src, size_t n_elem,
typename std::enable_if<!std::is_arithmetic<T>::value>::type * = nullptr);
#else
hamr::use_object_copier_t<T,U> * = nullptr);
#endif

/** Copies an array on the active CUDA device (fast path for arrays of
* arithmetic types of the same type).
*
Expand All @@ -77,10 +78,10 @@ int copy_to_cuda_from_cuda(cudaStream_t str, T *dest, const U *src, size_t n_ele
*
* @returns 0 if there were no errors
*/
template <typename T>
int copy_to_cuda_from_cuda(cudaStream_t str, T *dest, const T *src, size_t n_elem,
typename std::enable_if<std::is_arithmetic<T>::value>::type * = nullptr);
#endif
template <typename T, typename U>
int copy_to_cuda_from_cuda(cudaStream_t str, T *dest, const U *src, size_t n_elem,
hamr::use_bytes_copier_t<T,U> * = nullptr);


/** Copies an array on the active CUDA device.
*
Expand All @@ -92,11 +93,8 @@ int copy_to_cuda_from_cuda(cudaStream_t str, T *dest, const T *src, size_t n_ele
* @returns 0 if there were no errors
*/
template <typename T, typename U>
int copy_to_cuda_from_cuda(cudaStream_t str, T *dest, const U *src, size_t n_elem
#if !defined(HAMR_ENABLE_OBJECTS)
,typename std::enable_if<std::is_arithmetic<T>::value>::type * = nullptr
#endif
);
int copy_to_cuda_from_cuda(cudaStream_t str, T *dest, const U *src, size_t n_elem,
hamr::use_cons_copier_t<T,U> * = nullptr);

#if !defined(HAMR_ENABLE_OBJECTS)
/** Copies an array to the active CUDA device from the named CUDA device,
Expand All @@ -110,10 +108,10 @@ int copy_to_cuda_from_cuda(cudaStream_t str, T *dest, const U *src, size_t n_ele
* @returns 0 if there were no errors
*/
template <typename T, typename U>
int copy_to_cuda_from_cuda(cudaStream_t str, T *dest,
const U *src, int src_device, size_t n_elem,
typename std::enable_if<!std::is_arithmetic<T>::value>::type * = nullptr);
#else
int copy_to_cuda_from_cuda(cudaStream_t str, T *dest, const U *src,
int src_device, size_t n_elem, hamr::use_object_copier_t<T,U> * = nullptr);
#endif

/** Copies an array to the active CUDA device from the named CUDA device, (fast
* path for arrays of arithmetic types of the same type).
*
Expand All @@ -125,11 +123,9 @@ int copy_to_cuda_from_cuda(cudaStream_t str, T *dest,
*
* @returns 0 if there were no errors
*/
template <typename T>
int copy_to_cuda_from_cuda(cudaStream_t str, T *dest,
const T *src, int src_device, size_t n_elem,
typename std::enable_if<std::is_arithmetic<T>::value>::type * = nullptr);
#endif
template <typename T, typename U>
int copy_to_cuda_from_cuda(cudaStream_t str, T *dest, const U *src,
int src_device, size_t n_elem, hamr::use_bytes_copier_t<T,U> * = nullptr);

/** Copies an array on the active CUDA device.
*
Expand All @@ -142,12 +138,8 @@ int copy_to_cuda_from_cuda(cudaStream_t str, T *dest,
* @returns 0 if there were no errors
*/
template <typename T, typename U>
int copy_to_cuda_from_cuda(cudaStream_t str, T *dest,
const U *src, int src_device, size_t n_elem
#if !defined(HAMR_ENABLE_OBJECTS)
,typename std::enable_if<std::is_arithmetic<T>::value>::type * = nullptr
#endif
);
int copy_to_cuda_from_cuda(cudaStream_t str, T *dest, const U *src,
int src_device, size_t n_elem, hamr::use_cons_copier_t<T,U> * = nullptr);

#if !defined(HAMR_ENABLE_OBJECTS)
/** Copies an array from the active CUDA device.
Expand All @@ -160,10 +152,10 @@ int copy_to_cuda_from_cuda(cudaStream_t str, T *dest,
* @returns 0 if there were no errors
*/
template <typename T, typename U>
int copy_to_host_from_cuda(cudaStream_t str, T *dest,
const U *src, size_t n_elem,
typename std::enable_if<!std::is_arithmetic<T>::value>::type * = nullptr);
#else
int copy_to_host_from_cuda(cudaStream_t str, T *dest, const U *src, size_t n_elem,
hamr::use_object_copier_t<T,U> * = nullptr);
#endif

/** Copies an array from the active CUDA device (fast path for arrays of
* arithmetic types of the same type).
*
Expand All @@ -174,11 +166,9 @@ int copy_to_host_from_cuda(cudaStream_t str, T *dest,
*
* @returns 0 if there were no errors
*/
template <typename T>
int copy_to_host_from_cuda(cudaStream_t str, T *dest,
const T *src, size_t n_elem,
typename std::enable_if<std::is_arithmetic<T>::value>::type * = nullptr);
#endif
template <typename T, typename U>
int copy_to_host_from_cuda(cudaStream_t str, T *dest, const U *src, size_t n_elem,
hamr::use_bytes_copier_t<T,U> * = nullptr);

/** Copies an array from the active CUDA device.
*
Expand All @@ -188,11 +178,8 @@ int copy_to_host_from_cuda(cudaStream_t str, T *dest,
* @returns 0 if there were no errors
*/
template <typename T, typename U>
int copy_to_host_from_cuda(cudaStream_t str, T *dest, const U *src, size_t n_elem
#if !defined(HAMR_ENABLE_OBJECTS)
,typename std::enable_if<std::is_arithmetic<T>::value>::type * = nullptr
#endif
);
int copy_to_host_from_cuda(cudaStream_t str, T *dest, const U *src, size_t n_elem,
hamr::use_cons_copier_t<T,U> * = nullptr);

}

Expand Down
Loading
Loading