Skip to content

Commit

Permalink
copiers use PM memcpy equivalent when possible
Browse files Browse the repository at this point in the history
When possible, use the programming model equivalent of memcpy. This had
inadvertently been disabled. This avoids a kernel launch.
  • Loading branch information
burlen committed Sep 25, 2023
1 parent 144928f commit e6f1cf8
Show file tree
Hide file tree
Showing 11 changed files with 274 additions and 346 deletions.
39 changes: 39 additions & 0 deletions hamr_copier_traits.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
#ifndef hamr_copier_traits_h
#define hamr_copier_traits_h

#include "hamr_config.h"
#include <type_traits>

namespace hamr
{
/// @name type trait that enables object copy
/// The value is true when either the destination type T or the source type U
/// is not an arithmetic type. The alias use_object_copier_t names void in
/// that case and is otherwise ill-formed, so it can be used as a SFINAE guard
/// on a trailing, defaulted pointer parameter.
///@{
template <typename T, typename U,
    bool val = (!std::is_arithmetic<T>::value || !std::is_arithmetic<U>::value)>
struct use_object_copier : std::false_type {};

template <typename T, typename U>
struct use_object_copier<T, U, true> : std::true_type {};

template <typename T, typename U>
using use_object_copier_t =
    typename std::enable_if<use_object_copier<T,U>::value>::type;
///@}


/// @name type trait that enables POD copy from different types
/// Selects the element wise, converting copy. The predicate is written so that
/// for any pair (T, U) it never overlaps use_bytes_copier, and when
/// HAMR_ENABLE_OBJECTS is not defined it never overlaps use_object_copier
/// either. Overlap would make a call to an overload set guarded by these
/// traits ambiguous.
///@{
#if defined(HAMR_ENABLE_OBJECTS)
// objects are enabled: this is the exact complement of use_bytes_copier, so
// identical non-arithmetic types (T == U) are covered here as well. Testing
// only !is_same<T,U> would leave that pair matched by neither trait.
template <typename T, typename U,
    bool val = !(std::is_same<T,U>::value && std::is_arithmetic<T>::value)>
struct use_cons_copier : std::false_type {};
#else
// objects are disabled: require both types to be arithmetic. Testing only T
// would make this trait and use_object_copier simultaneously true when T is
// arithmetic and U is not.
template <typename T, typename U,
    bool val = (!std::is_same<T,U>::value
        && std::is_arithmetic<T>::value && std::is_arithmetic<U>::value)>
struct use_cons_copier : std::false_type {};
#endif

// the specialization and the SFINAE alias do not depend on the configuration
template <typename T, typename U>
struct use_cons_copier<T, U, true> : std::true_type {};

template <typename T, typename U>
using use_cons_copier_t =
    typename std::enable_if<use_cons_copier<T,U>::value>::type;
///@}

/// @name type trait that enables POD copy from the same types
/// The value is true only when T and U are the same arithmetic type, i.e. when
/// the copy can be done as a plain copy of bytes with no conversion.
///@{
template <typename T, typename U,
    bool obj = (std::is_same<T,U>::value && std::is_arithmetic<T>::value)>
struct use_bytes_copier : std::false_type {};

template <typename T, typename U>
struct use_bytes_copier<T, U, true> : std::true_type {};

template <typename T, typename U>
using use_bytes_copier_t =
    typename std::enable_if<use_bytes_copier<T,U>::value>::type;
///@}

}

#endif
29 changes: 9 additions & 20 deletions hamr_cuda_copy_async.cxx
Original file line number Diff line number Diff line change
Expand Up @@ -3,29 +3,18 @@
#include "hamr_cuda_copy_async.h"
#include "hamr_cuda_copy_async_impl.h"

#if !defined(HAMR_ENABLE_OBJECTS)

#define hamr_cuda_copy_async_instantiate_(T, U) \
template int hamr::copy_to_cuda_from_host<T,U>(cudaStream_t strm, T *dest, const U *src, size_t n_elem, void *); \
template int hamr::copy_to_cuda_from_cuda<T,U>(cudaStream_t strm, T *dest, const U *src, size_t n_elem, void *); \
template int hamr::copy_to_cuda_from_cuda<T,U>(cudaStream_t strm, T *dest, const U *src, int src_device, size_t n_elem, void *); \
template int hamr::copy_to_host_from_cuda<T,U>(cudaStream_t strm, T *dest, const U *src, size_t n_elem, void *);

#else

#define hamr_cuda_copy_async_instantiate_(T, U) \
template int hamr::copy_to_cuda_from_host<T,U>(cudaStream_t strm, T *dest, const U *src, size_t n_elem); \
template int hamr::copy_to_cuda_from_cuda<T,U>(cudaStream_t strm, T *dest, const U *src, size_t n_elem); \
template int hamr::copy_to_cuda_from_cuda<T,U>(cudaStream_t strm, T *dest, const U *src, int src_device, size_t n_elem); \
template int hamr::copy_to_host_from_cuda<T,U>(cudaStream_t strm, T *dest, const U *src, size_t n_elem);

#endif
template int hamr::copy_to_cuda_from_host<T,U>(cudaStream_t strm, T *dest, const U *src, size_t n_elem, hamr::use_cons_copier_t<T,U> *); \
template int hamr::copy_to_cuda_from_cuda<T,U>(cudaStream_t strm, T *dest, const U *src, size_t n_elem, hamr::use_cons_copier_t<T,U> *); \
template int hamr::copy_to_cuda_from_cuda<T,U>(cudaStream_t strm, T *dest, const U *src, int src_device, size_t n_elem, hamr::use_cons_copier_t<T,U> *); \
template int hamr::copy_to_host_from_cuda<T,U>(cudaStream_t strm, T *dest, const U *src, size_t n_elem, hamr::use_cons_copier_t<T,U> *);

#define hamr_cuda_copy_async_instantiate__(T) \
template int hamr::copy_to_cuda_from_host<T>(cudaStream_t strm, T *dest, const T *src, size_t n_elem, void *); \
template int hamr::copy_to_cuda_from_cuda<T>(cudaStream_t strm, T *dest, const T *src, size_t n_elem, void *); \
template int hamr::copy_to_cuda_from_cuda<T>(cudaStream_t strm, T *dest, const T *src, int src_device, size_t n_elem, void *); \
template int hamr::copy_to_host_from_cuda<T>(cudaStream_t strm, T *dest, const T *src, size_t n_elem, void *);
template int hamr::copy_to_cuda_from_host<T,T>(cudaStream_t strm, T *dest, const T *src, size_t n_elem, hamr::use_bytes_copier_t<T,T> *); \
template int hamr::copy_to_cuda_from_cuda<T,T>(cudaStream_t strm, T *dest, const T *src, size_t n_elem, hamr::use_bytes_copier_t<T,T> *); \
template int hamr::copy_to_cuda_from_cuda<T,T>(cudaStream_t strm, T *dest, const T *src, int src_device, size_t n_elem, hamr::use_bytes_copier_t<T,T> *); \
template int hamr::copy_to_host_from_cuda<T,T>(cudaStream_t strm, T *dest, const T *src, size_t n_elem, hamr::use_bytes_copier_t<T,T> *); \


hamr_cuda_copy_async_instantiate__(float)
hamr_cuda_copy_async_instantiate__(double)
Expand Down
89 changes: 38 additions & 51 deletions hamr_cuda_copy_async.h
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@
#define hamr_cuda_copy_async_h

#include "hamr_config.h"
#include "hamr_copier_traits.h"

#include <memory>
#include <type_traits>

Expand All @@ -20,8 +22,9 @@ namespace hamr
*/
template <typename T, typename U>
int copy_to_cuda_from_host(cudaStream_t str, T *dest, const U *src, size_t n_elem,
typename std::enable_if<!std::is_arithmetic<T>::value>::type * = nullptr);
#else
hamr::use_object_copier_t<T,U> * = nullptr);
#endif

/** Copies an array to the active CUDA device (fast path for arrays of
* arithmetic types of the same type).
*
Expand All @@ -32,10 +35,9 @@ int copy_to_cuda_from_host(cudaStream_t str, T *dest, const U *src, size_t n_ele
*
* @returns 0 if there were no errors
*/
template <typename T>
int copy_to_cuda_from_host(cudaStream_t str, T *dest, const T *src, size_t n_elem,
typename std::enable_if<std::is_arithmetic<T>::value>::type * = nullptr);
#endif
template <typename T, typename U>
int copy_to_cuda_from_host(cudaStream_t str, T *dest, const U *src, size_t n_elem,
hamr::use_bytes_copier_t<T,U> * = nullptr);

/** Copies an array to the active CUDA device.
*
Expand All @@ -47,11 +49,9 @@ int copy_to_cuda_from_host(cudaStream_t str, T *dest, const T *src, size_t n_ele
* @returns 0 if there were no errors
*/
template <typename T, typename U>
int copy_to_cuda_from_host(cudaStream_t str, T *dest, const U *src, size_t n_elem
#if !defined(HAMR_ENABLE_OBJECTS)
,typename std::enable_if<std::is_arithmetic<T>::value>::type * = nullptr
#endif
);
int copy_to_cuda_from_host(cudaStream_t str, T *dest, const U *src, size_t n_elem,
hamr::use_cons_copier_t<T,U> * = nullptr);


#if !defined(HAMR_ENABLE_OBJECTS)
/** Copies an array on the active CUDA device.
Expand All @@ -65,8 +65,9 @@ int copy_to_cuda_from_host(cudaStream_t str, T *dest, const U *src, size_t n_ele
*/
template <typename T, typename U>
int copy_to_cuda_from_cuda(cudaStream_t str, T *dest, const U *src, size_t n_elem,
typename std::enable_if<!std::is_arithmetic<T>::value>::type * = nullptr);
#else
hamr::use_object_copier_t<T,U> * = nullptr);
#endif

/** Copies an array on the active CUDA device (fast path for arrays of
* arithmetic types of the same type).
*
Expand All @@ -77,10 +78,10 @@ int copy_to_cuda_from_cuda(cudaStream_t str, T *dest, const U *src, size_t n_ele
*
* @returns 0 if there were no errors
*/
template <typename T>
int copy_to_cuda_from_cuda(cudaStream_t str, T *dest, const T *src, size_t n_elem,
typename std::enable_if<std::is_arithmetic<T>::value>::type * = nullptr);
#endif
template <typename T, typename U>
int copy_to_cuda_from_cuda(cudaStream_t str, T *dest, const U *src, size_t n_elem,
hamr::use_bytes_copier_t<T,U> * = nullptr);


/** Copies an array on the active CUDA device.
*
Expand All @@ -92,11 +93,8 @@ int copy_to_cuda_from_cuda(cudaStream_t str, T *dest, const T *src, size_t n_ele
* @returns 0 if there were no errors
*/
template <typename T, typename U>
int copy_to_cuda_from_cuda(cudaStream_t str, T *dest, const U *src, size_t n_elem
#if !defined(HAMR_ENABLE_OBJECTS)
,typename std::enable_if<std::is_arithmetic<T>::value>::type * = nullptr
#endif
);
int copy_to_cuda_from_cuda(cudaStream_t str, T *dest, const U *src, size_t n_elem,
hamr::use_cons_copier_t<T,U> * = nullptr);

#if !defined(HAMR_ENABLE_OBJECTS)
/** Copies an array to the active CUDA device from the named CUDA device,
Expand All @@ -110,10 +108,10 @@ int copy_to_cuda_from_cuda(cudaStream_t str, T *dest, const U *src, size_t n_ele
* @returns 0 if there were no errors
*/
template <typename T, typename U>
int copy_to_cuda_from_cuda(cudaStream_t str, T *dest,
const U *src, int src_device, size_t n_elem,
typename std::enable_if<!std::is_arithmetic<T>::value>::type * = nullptr);
#else
int copy_to_cuda_from_cuda(cudaStream_t str, T *dest, const U *src,
int src_device, size_t n_elem, hamr::use_object_copier_t<T,U> * = nullptr);
#endif

/** Copies an array to the active CUDA device from the named CUDA device, (fast
* path for arrays of arithmetic types of the same type).
*
Expand All @@ -125,11 +123,9 @@ int copy_to_cuda_from_cuda(cudaStream_t str, T *dest,
*
* @returns 0 if there were no errors
*/
template <typename T>
int copy_to_cuda_from_cuda(cudaStream_t str, T *dest,
const T *src, int src_device, size_t n_elem,
typename std::enable_if<std::is_arithmetic<T>::value>::type * = nullptr);
#endif
template <typename T, typename U>
int copy_to_cuda_from_cuda(cudaStream_t str, T *dest, const U *src,
int src_device, size_t n_elem, hamr::use_bytes_copier_t<T,U> * = nullptr);

/** Copies an array on the active CUDA device.
*
Expand All @@ -142,12 +138,8 @@ int copy_to_cuda_from_cuda(cudaStream_t str, T *dest,
* @returns 0 if there were no errors
*/
template <typename T, typename U>
int copy_to_cuda_from_cuda(cudaStream_t str, T *dest,
const U *src, int src_device, size_t n_elem
#if !defined(HAMR_ENABLE_OBJECTS)
,typename std::enable_if<std::is_arithmetic<T>::value>::type * = nullptr
#endif
);
int copy_to_cuda_from_cuda(cudaStream_t str, T *dest, const U *src,
int src_device, size_t n_elem, hamr::use_cons_copier_t<T,U> * = nullptr);

#if !defined(HAMR_ENABLE_OBJECTS)
/** Copies an array from the active CUDA device.
Expand All @@ -160,10 +152,10 @@ int copy_to_cuda_from_cuda(cudaStream_t str, T *dest,
* @returns 0 if there were no errors
*/
template <typename T, typename U>
int copy_to_host_from_cuda(cudaStream_t str, T *dest,
const U *src, size_t n_elem,
typename std::enable_if<!std::is_arithmetic<T>::value>::type * = nullptr);
#else
int copy_to_host_from_cuda(cudaStream_t str, T *dest, const U *src, size_t n_elem,
hamr::use_object_copier_t<T,U> * = nullptr);
#endif

/** Copies an array from the active CUDA device (fast path for arrays of
* arithmetic types of the same type).
*
Expand All @@ -174,11 +166,9 @@ int copy_to_host_from_cuda(cudaStream_t str, T *dest,
*
* @returns 0 if there were no errors
*/
template <typename T>
int copy_to_host_from_cuda(cudaStream_t str, T *dest,
const T *src, size_t n_elem,
typename std::enable_if<std::is_arithmetic<T>::value>::type * = nullptr);
#endif
template <typename T, typename U>
int copy_to_host_from_cuda(cudaStream_t str, T *dest, const U *src, size_t n_elem,
hamr::use_bytes_copier_t<T,U> * = nullptr);

/** Copies an array from the active CUDA device.
*
Expand All @@ -188,11 +178,8 @@ int copy_to_host_from_cuda(cudaStream_t str, T *dest,
* @returns 0 if there were no errors
*/
template <typename T, typename U>
int copy_to_host_from_cuda(cudaStream_t str, T *dest, const U *src, size_t n_elem
#if !defined(HAMR_ENABLE_OBJECTS)
,typename std::enable_if<std::is_arithmetic<T>::value>::type * = nullptr
#endif
);
int copy_to_host_from_cuda(cudaStream_t str, T *dest, const U *src, size_t n_elem,
hamr::use_cons_copier_t<T,U> * = nullptr);

}

Expand Down
Loading

0 comments on commit e6f1cf8

Please sign in to comment.