diff --git a/COMMIT b/COMMIT old mode 100644 new mode 100755 index b94efbd..e8cf65e --- a/COMMIT +++ b/COMMIT @@ -1 +1 @@ -30c52a0fd155774e18cc06328a1ba83c2a6a8104 \ No newline at end of file +442d97d67dd0667a8bb11a99f6b10dbfb12fec63 \ No newline at end of file diff --git a/CONTRIBUTORS b/CONTRIBUTORS index 7571183..bc1e98f 100644 --- a/CONTRIBUTORS +++ b/CONTRIBUTORS @@ -13,3 +13,4 @@ Dmitry (dmitrygx on github.com) Florian Weimer (fweimer on github.com) Jonas Hahnfeld (hahnjo on github.com) Tom Stellard (tstellar on github.com) +Chuck Cranor (chuchcranor on github.com) diff --git a/Makefile b/Makefile index 5a31d64..bead074 100644 --- a/Makefile +++ b/Makefile @@ -129,7 +129,7 @@ INCLUDES += -I$(top_srcdir) -I$(top_srcdir)/ptl_ips -I$(OUTDIR) ifneq (x86_64,$(arch)) ifneq (i386,$(arch)) - $(error Unsupported architecture $(arch)) + anerr := $(error Unsupported architecture $(arch)) endif endif @@ -164,7 +164,7 @@ nthreads := $(shell echo $$(( `nproc` * 2 )) ) # The DISTRO variable is used subsequently for variable # behaviors of the 3 distros. -DISTRO := $(shell . /etc/os-release; if [[ "$$ID" == "sle_hpc" ]]; then ID="sles"; fi; echo $$ID) +DISTRO := $(shell . /etc/os-release; if [ "$$ID" = "sle_hpc" ]; then ID="sles"; fi; echo $$ID) # By default the following two variables have the following values: LIBPSM2_COMPAT_CONF_DIR := /etc diff --git a/buildflags.mak b/buildflags.mak index 7c3cda0..21eb729 100644 --- a/buildflags.mak +++ b/buildflags.mak @@ -118,13 +118,13 @@ ifneq (icc,${CC}) RET := $(shell echo "int main() {}" | ${CC} ${MAVX2} -E -dM -xc - 2>&1 | grep -q AVX2 ; echo $$?) else RET := $(shell echo "int main() {}" | ${CC} ${MAVX2} -E -dM -xc - 2>&1 | grep -q AVX ; echo $$?) - $(warning ***NOTE TO USER**** Disabling AVX2 will harm performance) + anerr := $(warning ***NOTE TO USER**** Disabling AVX2 will harm performance) endif ifeq (0,${RET}) BASECFLAGS += ${MAVX2} else - $(error Compiler does not support ${MAVX2} ) + anerr := $(error Compiler does not support ${MAVX2} ) endif else BASECFLAGS += ${MAVX2} @@ -138,7 +138,7 @@ ifneq (,${PSM_AVX512}) ifeq (0,${RET}) BASECFLAGS += -mavx512f else - $(error Compiler does not support AVX512 ) + anerr := $(error Compiler does not support AVX512 ) endif BASECFLAGS += -DPSM_AVX512 endif @@ -203,7 +203,7 @@ else BASECFLAGS += -funwind-tables -Wno-strict-aliasing -Wformat-security else ifneq (${CCARCH},gcc4) - $(error Unknown compiler arch "${CCARCH}") + anerr := $(error Unknown compiler arch "${CCARCH}") endif # gcc4 endif # gcc endif # icc diff --git a/psm.c b/psm.c index 7f929ce..8677910 100644 --- a/psm.c +++ b/psm.c @@ -92,8 +92,8 @@ uint32_t psmi_cpu_model; #ifdef PSM_CUDA int is_cuda_enabled; int is_gdr_copy_enabled; -int device_support_gpudirect; -int gpu_p2p_supported = 0; +int _device_support_gpudirect = -1; // -1 indicates "unset". See device_support_gpudirect(). +int _gpu_p2p_supported = -1; // -1 indicates "unset". see gpu_p2p_supported(). 
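The two sentinel globals above are paired with new PSMI_ALWAYS_INLINE helpers in psm_user.h (device_support_gpudirect() and gpu_p2p_supported(), later in this patch) that probe the CUDA devices on first call and cache the answer, instead of doing the probing unconditionally in psmi_cuda_initialize(). A minimal, self-contained sketch of that lazy-detection idiom follows; query_capability() is a hypothetical stand-in for the cuDeviceGetAttribute()/cuDeviceCanAccessPeer() probes and is not part of PSM2.

    /* Illustrative sketch (not part of the patch): cache a capability probe
     * behind a -1 "unset" sentinel, as _device_support_gpudirect and
     * _gpu_p2p_supported now do. */
    #include <stdio.h>

    static int _capability_supported = -1;   /* -1 means "not probed yet" */

    static int query_capability(void)
    {
        /* Hypothetical stand-in for the per-device CUDA attribute probes. */
        return 1;
    }

    static int capability_supported(void)
    {
        if (_capability_supported > -1)      /* already probed: reuse result */
            return _capability_supported;

        _capability_supported = query_capability();
        return _capability_supported;
    }

    int main(void)
    {
        printf("supported: %d\n", capability_supported());  /* probes once */
        printf("supported: %d\n", capability_supported());  /* cached */
        return 0;
    }

With the -1/unset convention, callers such as amsh_mq_send_inner() (which now calls gpu_p2p_supported() instead of reading the old global) pay the probing cost only on the first GPU send path they actually hit.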
int my_gpu_device = 0; int cuda_lib_version; int is_driver_gpudirect_enabled; @@ -116,6 +116,7 @@ CUresult (*psmi_cuDriverGetVersion)(int* driverVersion); CUresult (*psmi_cuDeviceGetCount)(int* count); CUresult (*psmi_cuStreamCreate)(CUstream* phStream, unsigned int Flags); CUresult (*psmi_cuStreamDestroy)(CUstream phStream); +CUresult (*psmi_cuStreamSynchronize)(CUstream phStream); CUresult (*psmi_cuEventCreate)(CUevent* phEvent, unsigned int Flags); CUresult (*psmi_cuEventDestroy)(CUevent hEvent); CUresult (*psmi_cuEventQuery)(CUevent hEvent); @@ -217,6 +218,7 @@ int psmi_cuda_lib_load() PSMI_CUDA_DLSYM(psmi_cuda_lib, cuDeviceGetCount); PSMI_CUDA_DLSYM(psmi_cuda_lib, cuStreamCreate); PSMI_CUDA_DLSYM(psmi_cuda_lib, cuStreamDestroy); + PSMI_CUDA_DLSYM(psmi_cuda_lib, cuStreamSynchronize); PSMI_CUDA_DLSYM(psmi_cuda_lib, cuEventCreate); PSMI_CUDA_DLSYM(psmi_cuda_lib, cuEventDestroy); PSMI_CUDA_DLSYM(psmi_cuda_lib, cuEventQuery); @@ -251,7 +253,6 @@ int psmi_cuda_lib_load() int psmi_cuda_initialize() { psm2_error_t err = PSM2_OK; - int num_devices, dev; PSM2_LOG_MSG("entering"); _HFI_VDBG("Enabling CUDA support.\n"); @@ -262,77 +263,6 @@ int psmi_cuda_initialize() PSMI_CUDA_CALL(cuInit, 0); - /* Check if CUDA context is available. If not, we are not allowed to - * launch any CUDA API calls */ - PSMI_CUDA_CALL(cuCtxGetCurrent, &ctxt); - if (ctxt == NULL) { - _HFI_INFO("Unable to find active CUDA context\n"); - is_cuda_enabled = 0; - err = PSM2_OK; - return err; - } - - CUdevice current_device; - CUcontext primary_ctx; - PSMI_CUDA_CALL(cuCtxGetDevice, ¤t_device); - int is_ctx_active; - unsigned ctx_flags; - PSMI_CUDA_CALL(cuDevicePrimaryCtxGetState, current_device, &ctx_flags, - &is_ctx_active); - if (!is_ctx_active) { - /* There is an issue where certain CUDA API calls create - * contexts but does not make it active which cause the - * driver API call to fail with error 709 */ - PSMI_CUDA_CALL(cuDevicePrimaryCtxRetain, &primary_ctx, - current_device); - is_cuda_primary_context_retain = 1; - } - - /* Check if all devices support Unified Virtual Addressing. 
*/ - PSMI_CUDA_CALL(cuDeviceGetCount, &num_devices); - - device_support_gpudirect = 1; - - for (dev = 0; dev < num_devices; dev++) { - CUdevice device; - PSMI_CUDA_CALL(cuDeviceGet, &device, dev); - int unifiedAddressing; - PSMI_CUDA_CALL(cuDeviceGetAttribute, - &unifiedAddressing, - CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING, - device); - - if (unifiedAddressing !=1) { - _HFI_ERROR("CUDA device %d does not support Unified Virtual Addressing.\n", dev); - goto fail; - } - - int major; - PSMI_CUDA_CALL(cuDeviceGetAttribute, - &major, - CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, - device); - if (major < 3) { - device_support_gpudirect = 0; - _HFI_INFO("CUDA device %d does not support GPUDirect RDMA (Non-fatal error)\n", dev); - } - - if (device != current_device) { - int canAccessPeer = 0; - PSMI_CUDA_CALL(cuDeviceCanAccessPeer, &canAccessPeer, - current_device, device); - - if (canAccessPeer != 1) - _HFI_DBG("CUDA device %d does not support P2P from current device (Non-fatal error)\n", dev); - else - gpu_p2p_supported |= (1 << device); - } else { - /* Always support p2p on the same GPU */ - my_gpu_device = device; - gpu_p2p_supported |= (1 << device); - } - } - union psmi_envvar_val env_enable_gdr_copy; psmi_getenv("PSM2_GDRCOPY", "Enable (set envvar to 1) for gdr copy support in PSM (Enabled by default)", diff --git a/psm_hal_gen1/psm_hal_gen1_spio.c b/psm_hal_gen1/psm_hal_gen1_spio.c index eb9d5aa..a53d060 100644 --- a/psm_hal_gen1/psm_hal_gen1_spio.c +++ b/psm_hal_gen1/psm_hal_gen1_spio.c @@ -181,10 +181,7 @@ ips_spio_init(const struct psmi_context *context, struct ptl *ptl, #ifdef PSM_CUDA - if (PSMI_IS_CUDA_ENABLED) { - PSMI_CUDA_CALL(cuMemHostAlloc, (void **) &ctrl->cuda_pio_buffer, - MAX_CUDA_MTU, CU_MEMHOSTALLOC_PORTABLE); - } + ctrl->cuda_pio_buffer = NULL; #endif _HFI_PRDBG("ips_spio_init() done\n"); @@ -195,7 +192,7 @@ ips_spio_init(const struct psmi_context *context, struct ptl *ptl, static PSMI_HAL_INLINE psm2_error_t ips_spio_fini(struct ips_spio *ctrl) { #ifdef PSM_CUDA - if (PSMI_IS_CUDA_ENABLED) + if (PSMI_IS_CUDA_ENABLED && ctrl->cuda_pio_buffer != NULL) PSMI_CUDA_CALL(cuMemFreeHost, (void *) ctrl->cuda_pio_buffer); #endif spio_report_stall(ctrl, get_cycles(), 0ULL); @@ -810,6 +807,10 @@ ips_spio_transfer_frame(struct ips_proto *proto, struct ips_flow *flow, /* Write to PIO: other blocks of payload */ #ifdef PSM_CUDA if (is_cuda_payload) { + if (ctrl->cuda_pio_buffer == NULL) { + PSMI_CUDA_CALL(cuMemHostAlloc, (void **) &ctrl->cuda_pio_buffer, + MAX_CUDA_MTU, CU_MEMHOSTALLOC_PORTABLE); + } /* Since the implementation of cuMemcpy is unknown, and the HFI specifies several conditions for how PIO writes must occur, for safety reasons we should not assume diff --git a/psm_mpool.c b/psm_mpool.c index 1f2a365..2d035e3 100644 --- a/psm_mpool.c +++ b/psm_mpool.c @@ -101,7 +101,6 @@ struct mpool { #ifdef PSM_CUDA alloc_dealloc_callback_fn_t mp_alloc_dealloc_cb; - void *mp_alloc_dealloc_cb_context; #endif }; @@ -230,7 +229,7 @@ psmi_mpool_create_for_cuda(size_t obj_size, uint32_t num_obj_per_chunk, uint32_t num_obj_max_total, int flags, psmi_memtype_t statstype, non_empty_callback_fn_t cb, void *context, - alloc_dealloc_callback_fn_t ad_cb, void *ad_context) + alloc_dealloc_callback_fn_t ad_cb) { mpool_t mp; @@ -242,7 +241,6 @@ psmi_mpool_create_for_cuda(size_t obj_size, uint32_t num_obj_per_chunk, return NULL; mp->mp_alloc_dealloc_cb = ad_cb; - mp->mp_alloc_dealloc_cb_context = ad_context; if (psmi_mpool_allocate_chunk(mp) != PSM2_OK) { psmi_mpool_destroy(mp); @@ -418,7 +416,6 @@ void 
psmi_mpool_chunk_dealloc(mpool_t mp, int idx) int j; for (j = 0; j < mp->mp_num_obj_per_chunk; j++) mp->mp_alloc_dealloc_cb(0 /* is not alloc */, - mp->mp_alloc_dealloc_cb_context, ((void *) mp->mp_elm_vector[idx]) + j * mp->mp_elm_size + sizeof(struct mpool_element)); @@ -509,7 +506,6 @@ static int psmi_mpool_allocate_chunk(mpool_t mp) #ifdef PSM_CUDA if (mp->mp_alloc_dealloc_cb) mp->mp_alloc_dealloc_cb(1 /* is alloc */, - mp->mp_alloc_dealloc_cb_context, chunk + i * mp->mp_elm_size + sizeof(struct mpool_element)); #endif diff --git a/psm_mpool.h b/psm_mpool.h index 8098f60..653d80b 100644 --- a/psm_mpool.h +++ b/psm_mpool.h @@ -70,8 +70,7 @@ typedef struct mpool *mpool_t; typedef void (*non_empty_callback_fn_t) (void *context); -typedef void (*alloc_dealloc_callback_fn_t) (int is_alloc, void *context, - void *chunk); +typedef void (*alloc_dealloc_callback_fn_t) (int is_alloc, void *chunk); mpool_t MOCKABLE(psmi_mpool_create)(size_t obj_size, uint32_t num_obj_per_chunk, @@ -84,8 +83,7 @@ mpool_t psmi_mpool_create_for_cuda(size_t obj_size, uint32_t num_obj_per_chunk, uint32_t num_obj_max_total, int flags, psmi_memtype_t statstype, non_empty_callback_fn_t cb, void *context, - alloc_dealloc_callback_fn_t ad_cb, - void *ad_context); + alloc_dealloc_callback_fn_t ad_cb); void psmi_mpool_destroy(mpool_t mp); diff --git a/psm_mq_internal.h b/psm_mq_internal.h index a1afaf8..0793d68 100644 --- a/psm_mq_internal.h +++ b/psm_mq_internal.h @@ -249,7 +249,6 @@ struct psm2_mq_req { uint32_t prefetch_send_msgoff; int cuda_hostbuf_used; CUipcMemHandle cuda_ipc_handle; - CUevent cuda_ipc_event; uint8_t cuda_ipc_handle_attached; uint32_t cuda_ipc_offset; /* diff --git a/psm_mq_utils.c b/psm_mq_utils.c index a0409db..8f58b37 100644 --- a/psm_mq_utils.c +++ b/psm_mq_utils.c @@ -114,19 +114,6 @@ psm2_mq_req_t MOCKABLE(psmi_mq_req_alloc)(psm2_mq_t mq, uint32_t type) } MOCK_DEF_EPILOGUE(psmi_mq_req_alloc); -#ifdef PSM_CUDA -void psmi_cuda_recvreq_alloc_func(int is_alloc, void* context, void* obj) { - psm2_mq_req_t recvreq = (psm2_mq_req_t)obj; - if (PSMI_IS_CUDA_ENABLED) { - if (is_alloc) - PSMI_CUDA_CALL(cuEventCreate, &recvreq->cuda_ipc_event, CU_EVENT_DEFAULT); - else - PSMI_CUDA_CALL(cuEventDestroy, recvreq->cuda_ipc_event); - } - return; -} -#endif - psm2_error_t psmi_mq_req_init(psm2_mq_t mq) { psm2_mq_req_t warmup_req; @@ -165,29 +152,6 @@ psm2_error_t psmi_mq_req_init(psm2_mq_t mq) if ((err = psmi_parse_mpool_env(mq, 0, &rlim, &maxsz, &chunksz))) goto fail; - /* Have a callback function for receive req mpool which creates - * and destroy events. 
- */ -#ifdef PSM_CUDA - if (PSMI_IS_CUDA_ENABLED) { - if ((mq->rreq_pool = - psmi_mpool_create_for_cuda(sizeof(struct psm2_mq_req), chunksz, - maxsz, 0, DESCRIPTORS, NULL, - NULL, psmi_cuda_recvreq_alloc_func, NULL)) == NULL) { - err = PSM2_NO_MEMORY; - goto fail; - } - } - else { - if ((mq->rreq_pool = - psmi_mpool_create(sizeof(struct psm2_mq_req), chunksz, - maxsz, 0, DESCRIPTORS, NULL, - NULL)) == NULL) { - err = PSM2_NO_MEMORY; - goto fail; - } - } -#else if ((mq->rreq_pool = psmi_mpool_create(sizeof(struct psm2_mq_req), chunksz, maxsz, 0, DESCRIPTORS, NULL, @@ -195,7 +159,6 @@ psm2_error_t psmi_mq_req_init(psm2_mq_t mq) err = PSM2_NO_MEMORY; goto fail; } -#endif } /* Warm up the allocators */ diff --git a/psm_user.h b/psm_user.h index 09477c5..fa38a42 100644 --- a/psm_user.h +++ b/psm_user.h @@ -296,6 +296,7 @@ void psmi_profile_reblock(int did_no_progress) __attribute__ ((weak)); #endif #ifdef PSM_CUDA + #include #include @@ -305,12 +306,12 @@ void psmi_profile_reblock(int did_no_progress) __attribute__ ((weak)); extern int is_cuda_enabled; extern int is_gdr_copy_enabled; -extern int device_support_gpudirect; -extern int gpu_p2p_supported; +extern int _device_support_gpudirect; +extern int _gpu_p2p_supported; extern int my_gpu_device; extern int cuda_lib_version; -extern CUcontext ctxt; +extern CUcontext cu_ctxt; extern void *psmi_cuda_lib; extern CUresult (*psmi_cuInit)(unsigned int Flags ); @@ -326,6 +327,7 @@ extern CUresult (*psmi_cuDriverGetVersion)(int* driverVersion); extern CUresult (*psmi_cuDeviceGetCount)(int* count); extern CUresult (*psmi_cuStreamCreate)(CUstream* phStream, unsigned int Flags); extern CUresult (*psmi_cuStreamDestroy)(CUstream phStream); +extern CUresult (*psmi_cuStreamSynchronize)(CUstream phStream); extern CUresult (*psmi_cuEventCreate)(CUevent* phEvent, unsigned int Flags); extern CUresult (*psmi_cuEventDestroy)(CUevent hEvent); extern CUresult (*psmi_cuEventQuery)(CUevent hEvent); @@ -348,14 +350,34 @@ extern CUresult (*psmi_cuDevicePrimaryCtxRetain)(CUcontext* pctx, CUdevice dev); extern CUresult (*psmi_cuCtxGetDevice)(CUdevice* device); extern CUresult (*psmi_cuDevicePrimaryCtxRelease)(CUdevice device); +static int check_set_cuda_ctxt(void) +{ + CUresult err; + CUcontext tmpctxt = {0}; + + if (!psmi_cuCtxGetCurrent || !psmi_cuCtxSetCurrent) + return 0; + + err = psmi_cuCtxGetCurrent(&tmpctxt); + if (!err) { + if (!tmpctxt && cu_ctxt) { + err = psmi_cuCtxSetCurrent(cu_ctxt); + return !!err; + } else if (tmpctxt && !cu_ctxt) { + cu_ctxt = tmpctxt; + } + } + return 0; +} + #define PSMI_CUDA_CALL(func, args...) do { \ CUresult cudaerr; \ + if (check_set_cuda_ctxt()) { \ + psmi_handle_error(PSMI_EP_NORETURN, PSM2_INTERNAL_ERR, \ + "Failed to set/synchronize CUDA context.\n"); \ + } \ cudaerr = psmi_##func(args); \ if (cudaerr != CUDA_SUCCESS) { \ - if (ctxt == NULL) \ - _HFI_ERROR( \ - "Check if CUDA is initialized" \ - "before psm2_ep_open call \n"); \ _HFI_ERROR( \ "CUDA failure: %s() (at %s:%d)" \ "returned %d\n", \ @@ -366,6 +388,92 @@ extern CUresult (*psmi_cuDevicePrimaryCtxRelease)(CUdevice device); } \ } while (0) +PSMI_ALWAYS_INLINE( +int device_support_gpudirect()) +{ + if (_device_support_gpudirect > -1) return _device_support_gpudirect; + + int num_devices, dev; + + /* Check if all devices support Unified Virtual Addressing. 
*/ + PSMI_CUDA_CALL(cuDeviceGetCount, &num_devices); + + _device_support_gpudirect = 1; + + for (dev = 0; dev < num_devices; dev++) { + CUdevice device; + PSMI_CUDA_CALL(cuDeviceGet, &device, dev); + int unifiedAddressing; + PSMI_CUDA_CALL(cuDeviceGetAttribute, + &unifiedAddressing, + CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING, + device); + + if (unifiedAddressing !=1) { + psmi_handle_error(PSMI_EP_NORETURN, PSM2_EP_DEVICE_FAILURE, + "CUDA device %d does not support Unified Virtual Addressing.\n", + dev); + } + + int major; + PSMI_CUDA_CALL(cuDeviceGetAttribute, + &major, + CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, + device); + if (major < 3) { + _device_support_gpudirect = 0; + _HFI_INFO("CUDA device %d does not support GPUDirect RDMA (Non-fatal error)\n", dev); + } + } + + return _device_support_gpudirect; +} + +#define PSMI_IS_CUDA_ENABLED likely(is_cuda_enabled) +#define PSMI_IS_CUDA_DISABLED unlikely(!is_cuda_enabled) + +PSMI_ALWAYS_INLINE( +int gpu_p2p_supported()) +{ + if (likely(_gpu_p2p_supported > -1)) return _gpu_p2p_supported; + + if (PSMI_IS_CUDA_DISABLED) { + _gpu_p2p_supported=0; + return 0; + } + + int num_devices, dev; + + /* Check which devices the current device has p2p access to. */ + CUdevice current_device; + PSMI_CUDA_CALL(cuCtxGetDevice, ¤t_device); + PSMI_CUDA_CALL(cuDeviceGetCount, &num_devices); + + _gpu_p2p_supported = 0; + + for (dev = 0; dev < num_devices; dev++) { + CUdevice device; + PSMI_CUDA_CALL(cuDeviceGet, &device, dev); + + if (device != current_device) { + int canAccessPeer = 0; + PSMI_CUDA_CALL(cuDeviceCanAccessPeer, &canAccessPeer, + current_device, device); + + if (canAccessPeer != 1) + _HFI_DBG("CUDA device %d does not support P2P from current device (Non-fatal error)\n", dev); + else + _gpu_p2p_supported |= (1 << device); + } else { + /* Always support p2p on the same GPU */ + my_gpu_device = device; + _gpu_p2p_supported |= (1 << device); + } + } + + return _gpu_p2p_supported; +} + /** * Similar to PSMI_CUDA_CALL() except does not error out * if func(args) returns CUDA_SUCCESS or except_err @@ -378,9 +486,13 @@ extern CUresult (*psmi_cuDevicePrimaryCtxRelease)(CUdevice device); * DBG level. */ #define PSMI_CUDA_CALL_EXCEPT(except_err, func, args...) do { \ + if (check_set_cuda_ctxt()) { \ + psmi_handle_error(PSMI_EP_NORETURN, PSM2_INTERNAL_ERR, \ + "Failed to set/synchronize CUDA context.\n"); \ + } \ cudaerr = psmi_##func(args); \ if (cudaerr != CUDA_SUCCESS && cudaerr != except_err) { \ - if (ctxt == NULL) \ + if (cu_ctxt == NULL) \ _HFI_ERROR( \ "Check if CUDA is initialized" \ "before psm2_ep_open call \n"); \ @@ -442,9 +554,6 @@ _psmi_is_cuda_mem(const void *ptr)) return 0; } -#define PSMI_IS_CUDA_ENABLED likely(is_cuda_enabled) -#define PSMI_IS_CUDA_DISABLED unlikely(!is_cuda_enabled) - PSMI_ALWAYS_INLINE( int _psmi_is_gdr_copy_enabled()) @@ -473,7 +582,7 @@ struct ips_cuda_hostbuf { struct ips_cuda_hostbuf_mpool_cb_context { unsigned bufsz; }; -void psmi_cuda_hostbuf_alloc_func(int is_alloc, void *context, void *obj); +void psmi_cuda_hostbuf_alloc_func(int is_alloc, void *obj); #define CUDA_HOSTBUFFER_LIMITS { \ .env = "PSM_CUDA_BOUNCEBUFFERS_MAX", \ diff --git a/ptl_am/am_cuda_memhandle_cache.c b/ptl_am/am_cuda_memhandle_cache.c index 730562d..253d261 100644 --- a/ptl_am/am_cuda_memhandle_cache.c +++ b/ptl_am/am_cuda_memhandle_cache.c @@ -168,7 +168,7 @@ static void print_cuda_memhandle_cache_stats(void) * which helps in closing all memhandles. 
*/ static void -psmi_cuda_memhandle_cache_alloc_func(int is_alloc, void* context, void* obj) +psmi_cuda_memhandle_cache_alloc_func(int is_alloc, void* obj) { cl_map_item_t* memcache_item = (cl_map_item_t*)obj; if (!is_alloc) { @@ -196,8 +196,7 @@ am_cuda_memhandle_mpool_init(uint32_t memcache_size) cuda_memhandle_cache_size, cuda_memhandle_cache_size, 0, UNDEFINED, NULL, NULL, - psmi_cuda_memhandle_cache_alloc_func, - NULL); + psmi_cuda_memhandle_cache_alloc_func); if (cuda_memhandle_mpool == NULL) { err = psmi_handle_error(PSMI_EP_NORETURN, PSM2_NO_MEMORY, "Couldn't allocate CUDA host receive buffer pool"); diff --git a/ptl_am/am_reqrep_shmem.c b/ptl_am/am_reqrep_shmem.c index 9be72f9..bcc0182 100644 --- a/ptl_am/am_reqrep_shmem.c +++ b/ptl_am/am_reqrep_shmem.c @@ -2099,7 +2099,7 @@ amsh_mq_send_inner(psm2_mq_t mq, psm2_mq_req_t req, psm2_epaddr_t epaddr, #ifdef PSM_CUDA int gpu_mem = 0; - int ep_supports_p2p = (1 << ((am_epaddr_t *) epaddr)->gpuid) & gpu_p2p_supported; + int ep_supports_p2p = (1 << ((am_epaddr_t *) epaddr)->gpuid) & gpu_p2p_supported(); if (PSMI_IS_CUDA_ENABLED && PSMI_IS_CUDA_MEM(ubuf)) { gpu_mem = 1; diff --git a/ptl_am/ptl.c b/ptl_am/ptl.c index 2e42c1b..3b3ed9f 100644 --- a/ptl_am/ptl.c +++ b/ptl_am/ptl.c @@ -96,8 +96,7 @@ ptl_handle_rtsmatch_request(psm2_mq_req_t req, int was_posted, if (req->is_buf_gpu_mem) { PSMI_CUDA_CALL(cuMemcpyDtoD, (CUdeviceptr)req->req_data.buf, cuda_ipc_dev_ptr, req->req_data.recv_msglen); - PSMI_CUDA_CALL(cuEventRecord, req->cuda_ipc_event, 0); - PSMI_CUDA_CALL(cuEventSynchronize, req->cuda_ipc_event); + PSMI_CUDA_CALL(cuStreamSynchronize, 0); } else PSMI_CUDA_CALL(cuMemcpyDtoH, req->req_data.buf, cuda_ipc_dev_ptr, req->req_data.recv_msglen); @@ -129,8 +128,7 @@ ptl_handle_rtsmatch_request(psm2_mq_req_t req, int was_posted, * copies for msg sizes less than 64k. The event record * and synchronize calls are to guarentee completion. */ - PSMI_CUDA_CALL(cuEventRecord, req->cuda_ipc_event, 0); - PSMI_CUDA_CALL(cuEventSynchronize, req->cuda_ipc_event); + PSMI_CUDA_CALL(cuStreamSynchronize, 0); psmi_free(cuda_ipc_bounce_buf); } else { /* cma can be done in handler context or not. 
*/ diff --git a/ptl_ips/ips_proto.c b/ptl_ips/ips_proto.c index dfd03e6..c28b7d4 100644 --- a/ptl_ips/ips_proto.c +++ b/ptl_ips/ips_proto.c @@ -95,22 +95,17 @@ static psm2_error_t proto_sdma_init(struct ips_proto *proto, const psmi_context_t *context); #ifdef PSM_CUDA -void psmi_cuda_hostbuf_alloc_func(int is_alloc, void *context, void *obj) +void psmi_cuda_hostbuf_alloc_func(int is_alloc, void *obj) { - struct ips_cuda_hostbuf *icb; - struct ips_cuda_hostbuf_mpool_cb_context *ctxt = - (struct ips_cuda_hostbuf_mpool_cb_context *) context; - - icb = (struct ips_cuda_hostbuf *)obj; + struct ips_cuda_hostbuf *icb = (struct ips_cuda_hostbuf *)obj; if (is_alloc) { - PSMI_CUDA_CALL(cuMemHostAlloc, - (void **) &icb->host_buf, - ctxt->bufsz, - CU_MEMHOSTALLOC_PORTABLE); - PSMI_CUDA_CALL(cuEventCreate, &icb->copy_status, CU_EVENT_DEFAULT); + icb->host_buf = NULL; + icb->copy_status = NULL; } else { - if (icb->host_buf) { + if (icb->host_buf != NULL) { PSMI_CUDA_CALL(cuMemFreeHost, icb->host_buf); + } + if (icb->copy_status != NULL) { PSMI_CUDA_CALL(cuEventDestroy, icb->copy_status); } } @@ -520,10 +515,7 @@ ips_proto_init(const psmi_context_t *context, const ptl_t *ptl, if (protoexp_flags & IPS_PROTOEXP_FLAG_ENABLED) { #ifdef PSM_CUDA - if (PSMI_IS_CUDA_ENABLED) { - PSMI_CUDA_CALL(cuStreamCreate, - &proto->cudastream_send, CU_STREAM_NON_BLOCKING); - } + proto->cudastream_send = NULL; #endif proto->scbc_rv = NULL; if ((err = ips_protoexp_init(context, proto, protoexp_flags, @@ -635,14 +627,34 @@ ips_proto_init(const psmi_context_t *context, const ptl_t *ptl, PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_UINT_FLAGS, (union psmi_envvar_val)0, /* Disabled by default */ &env_gpudirect_rdma); + /* Use GPUDirect RDMA for SDMA send? */ + union psmi_envvar_val env_gpudirect_rdma_send; + psmi_getenv("PSM2_GPUDIRECT_RDMA_SEND", + "Use GPUDirect RDMA support to allow the HFI to directly" + " read from the GPU for SDMA. Requires driver" + " support.(default is disabled i.e. 0)", + PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_UINT_FLAGS, + (union psmi_envvar_val)0, /* Disabled by default */ + &env_gpudirect_rdma_send); + + /* Use GPUDirect RDMA for recv? */ + union psmi_envvar_val env_gpudirect_rdma_recv; + psmi_getenv("PSM2_GPUDIRECT_RDMA_RECV", + "Use GPUDirect RDMA support to allow the HFI to directly" + " write into GPU. Requires driver support.(default is" + " disabled i.e. 0)", + PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_UINT_FLAGS, + (union psmi_envvar_val)0, /* Disabled by default */ + &env_gpudirect_rdma_recv); + /* The following cases need to be handled: * 1) GPU DIRECT is turned off but GDR COPY is turned on by the user or * by default - Turn off GDR COPY * 2) GPU DIRECT is on but GDR COPY is turned off by the user - Leave *. this config as it is. 
*/ - if (!env_gpudirect_rdma.e_uint) - is_gdr_copy_enabled = 0; + if (!env_gpudirect_rdma.e_uint && !env_gpudirect_rdma_send.e_uint && !env_gpudirect_rdma_recv.e_uint) + is_gdr_copy_enabled = 0; /* Default Send threshold for Gpu-direct set to 30000 */ union psmi_envvar_val env_gpudirect_send_thresh; @@ -659,7 +671,7 @@ ips_proto_init(const psmi_context_t *context, const ptl_t *ptl, (union psmi_envvar_val)UINT_MAX, &env_gpudirect_recv_thresh); gpudirect_recv_threshold = env_gpudirect_recv_thresh.e_uint; - if (env_gpudirect_rdma.e_uint && device_support_gpudirect) { + if (env_gpudirect_rdma.e_uint && device_support_gpudirect()) { if (PSMI_IS_CUDA_DISABLED || /* All pio, No SDMA*/ (proto->flags & IPS_PROTO_FLAG_SPIO) || @@ -675,16 +687,7 @@ ips_proto_init(const psmi_context_t *context, const ptl_t *ptl, * experimentation and will not be documented for any customers. */ /* Use GPUDirect RDMA for SDMA send? */ - union psmi_envvar_val env_gpudirect_rdma_send; - psmi_getenv("PSM2_GPUDIRECT_RDMA_SEND", - "Use GPUDirect RDMA support to allow the HFI to directly" - " read from the GPU for SDMA. Requires driver" - " support.(default is disabled i.e. 0)", - PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_UINT_FLAGS, - (union psmi_envvar_val)0, /* Disabled by default */ - &env_gpudirect_rdma_send); - - if (env_gpudirect_rdma_send.e_uint && device_support_gpudirect) { + if (env_gpudirect_rdma_send.e_uint && device_support_gpudirect()) { if (PSMI_IS_CUDA_DISABLED || /* All pio, No SDMA*/ (proto->flags & IPS_PROTO_FLAG_SPIO)) @@ -695,16 +698,7 @@ ips_proto_init(const psmi_context_t *context, const ptl_t *ptl, proto->flags |= IPS_PROTO_FLAG_GPUDIRECT_RDMA_SEND; } /* Use GPUDirect RDMA for recv? */ - union psmi_envvar_val env_gpudirect_rdma_recv; - psmi_getenv("PSM2_GPUDIRECT_RDMA_RECV", - "Use GPUDirect RDMA support to allow the HFI to directly" - " write into GPU. Requires driver support.(default is" - " disabled i.e. 
0)", - PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_UINT_FLAGS, - (union psmi_envvar_val)0, /* Disabled by default */ - &env_gpudirect_rdma_recv); - - if (env_gpudirect_rdma_recv.e_uint && device_support_gpudirect) { + if (env_gpudirect_rdma_recv.e_uint && device_support_gpudirect()) { if (PSMI_IS_CUDA_DISABLED || !(protoexp_flags & IPS_PROTOEXP_FLAG_ENABLED)) err = psmi_handle_error(PSMI_EP_NORETURN, @@ -734,9 +728,7 @@ ips_proto_init(const psmi_context_t *context, const ptl_t *ptl, psmi_mpool_create_for_cuda(sizeof(struct ips_cuda_hostbuf), chunksz, max_elements, 0, UNDEFINED, NULL, NULL, - psmi_cuda_hostbuf_alloc_func, - (void *) - &proto->cuda_hostbuf_send_cfg); + psmi_cuda_hostbuf_alloc_func); if (proto->cuda_hostbuf_pool_send == NULL) { err = psmi_handle_error(proto->ep, PSM2_NO_MEMORY, @@ -750,9 +742,7 @@ ips_proto_init(const psmi_context_t *context, const ptl_t *ptl, psmi_mpool_create_for_cuda(sizeof(struct ips_cuda_hostbuf), chunksz, max_elements, 0, UNDEFINED, NULL, NULL, - psmi_cuda_hostbuf_alloc_func, - (void *) - &proto->cuda_hostbuf_small_send_cfg); + psmi_cuda_hostbuf_alloc_func); if (proto->cuda_hostbuf_pool_small_send == NULL) { err = psmi_handle_error(proto->ep, PSM2_NO_MEMORY, @@ -928,7 +918,7 @@ ips_proto_fini(struct ips_proto *proto, int force, uint64_t timeout_in) #endif #ifdef PSM_CUDA - if (PSMI_IS_CUDA_ENABLED) { + if (PSMI_IS_CUDA_ENABLED && proto->cudastream_send) { PSMI_CUDA_CALL(cuStreamDestroy, proto->cudastream_send); } #endif diff --git a/ptl_ips/ips_proto_expected.c b/ptl_ips/ips_proto_expected.c index 7e7e997..a579dd6 100644 --- a/ptl_ips/ips_proto_expected.c +++ b/ptl_ips/ips_proto_expected.c @@ -370,9 +370,7 @@ MOCKABLE(ips_protoexp_init)(const psmi_context_t *context, psmi_mpool_create_for_cuda(sizeof(struct ips_cuda_hostbuf), chunksz, max_elements, 0, UNDEFINED, NULL, NULL, - psmi_cuda_hostbuf_alloc_func, - (void *) - &protoexp->cuda_hostbuf_recv_cfg); + psmi_cuda_hostbuf_alloc_func); if (protoexp->cuda_hostbuf_pool_recv == NULL) { err = psmi_handle_error(proto->ep, PSM2_NO_MEMORY, @@ -386,9 +384,7 @@ MOCKABLE(ips_protoexp_init)(const psmi_context_t *context, psmi_mpool_create_for_cuda(sizeof(struct ips_cuda_hostbuf), chunksz, max_elements, 0, UNDEFINED, NULL, NULL, - psmi_cuda_hostbuf_alloc_func, - (void *) - &protoexp->cuda_hostbuf_small_recv_cfg); + psmi_cuda_hostbuf_alloc_func); if (protoexp->cuda_hostbuf_pool_small_recv == NULL) { err = psmi_handle_error(proto->ep, PSM2_NO_MEMORY, @@ -396,9 +392,7 @@ MOCKABLE(ips_protoexp_init)(const psmi_context_t *context, goto fail; } - PSMI_CUDA_CALL(cuStreamCreate, - &protoexp->cudastream_recv, - CU_STREAM_NON_BLOCKING); + protoexp->cudastream_recv = NULL; STAILQ_INIT(&protoexp->cudapend_getreqsq); } else { protoexp->cuda_hostbuf_pool_recv = NULL; @@ -437,7 +431,9 @@ psm2_error_t ips_protoexp_fini(struct ips_protoexp *protoexp) !(protoexp->proto->flags & IPS_PROTO_FLAG_GPUDIRECT_RDMA_RECV)) { psmi_mpool_destroy(protoexp->cuda_hostbuf_pool_small_recv); psmi_mpool_destroy(protoexp->cuda_hostbuf_pool_recv); - PSMI_CUDA_CALL(cuStreamDestroy, protoexp->cudastream_recv); + if (protoexp->cudastream_recv != NULL) { + PSMI_CUDA_CALL(cuStreamDestroy, protoexp->cudastream_recv); + } } #endif psmi_mpool_destroy(protoexp->tid_getreq_pool); @@ -1094,12 +1090,17 @@ void psmi_cuda_run_prefetcher(struct ips_protoexp *protoexp, window_len = ips_cuda_next_window(tidsendc->ipsaddr->window_rv, offset, req->req_data.buf_len); - if (window_len <= CUDA_SMALLHOSTBUF_SZ) + unsigned bufsz; + if (window_len <= CUDA_SMALLHOSTBUF_SZ) { chb = 
(struct ips_cuda_hostbuf *) psmi_mpool_get( proto->cuda_hostbuf_pool_small_send); - if (chb == NULL) + bufsz = proto->cuda_hostbuf_small_send_cfg.bufsz; + } + if (chb == NULL) { chb = (struct ips_cuda_hostbuf *) psmi_mpool_get( proto->cuda_hostbuf_pool_send); + bufsz = proto->cuda_hostbuf_send_cfg.bufsz; + } /* were any buffers available for the prefetcher? */ if (chb == NULL) return; @@ -1109,6 +1110,20 @@ void psmi_cuda_run_prefetcher(struct ips_protoexp *protoexp, chb->req = req; chb->gpu_buf = (CUdeviceptr) req->req_data.buf + offset; chb->bytes_read = 0; + + if (proto->cudastream_send == NULL) { + PSMI_CUDA_CALL(cuStreamCreate, + &proto->cudastream_send, CU_STREAM_NON_BLOCKING); + } + if (chb->host_buf == NULL) { + PSMI_CUDA_CALL(cuMemHostAlloc, + (void **) &chb->host_buf, + bufsz, + CU_MEMHOSTALLOC_PORTABLE); + } + if (chb->copy_status == NULL) { + PSMI_CUDA_CALL(cuEventCreate, &chb->copy_status, CU_EVENT_DEFAULT); + } PSMI_CUDA_CALL(cuMemcpyDtoHAsync, chb->host_buf, chb->gpu_buf, window_len, @@ -1143,12 +1158,17 @@ void psmi_attach_chb_to_tidsendc(struct ips_protoexp *protoexp, window_len = ips_cuda_next_window(tidsendc->ipsaddr->window_rv, offset, req->req_data.buf_len); - if (window_len <= CUDA_SMALLHOSTBUF_SZ) + unsigned bufsz; + if (window_len <= CUDA_SMALLHOSTBUF_SZ) { chb = (struct ips_cuda_hostbuf *) psmi_mpool_get( proto->cuda_hostbuf_pool_small_send); - if (chb == NULL) + bufsz = proto->cuda_hostbuf_small_send_cfg.bufsz; + } + if (chb == NULL) { chb = (struct ips_cuda_hostbuf *) psmi_mpool_get( proto->cuda_hostbuf_pool_send); + bufsz = proto->cuda_hostbuf_send_cfg.bufsz; + } /* were any buffers available? If not force allocate */ if (chb == NULL) { @@ -1162,6 +1182,19 @@ void psmi_attach_chb_to_tidsendc(struct ips_protoexp *protoexp, chb->req = req; chb->gpu_buf = (CUdeviceptr) req->req_data.buf + offset; chb->bytes_read = 0; + if (proto->cudastream_send == NULL) { + PSMI_CUDA_CALL(cuStreamCreate, + &proto->cudastream_send, CU_STREAM_NON_BLOCKING); + } + if (chb->host_buf == NULL) { + PSMI_CUDA_CALL(cuMemHostAlloc, + (void **) &chb->host_buf, + bufsz, + CU_MEMHOSTALLOC_PORTABLE); + } + if (chb->copy_status == NULL) { + PSMI_CUDA_CALL(cuEventCreate, &chb->copy_status, CU_EVENT_DEFAULT); + } PSMI_CUDA_CALL(cuMemcpyDtoHAsync, chb->host_buf, chb->gpu_buf, window_len, @@ -2047,14 +2080,19 @@ ips_tid_recv_alloc(struct ips_protoexp *protoexp, /* 4. allocate a cuda bounce buffer, if required */ struct ips_cuda_hostbuf *chb = NULL; if (getreq->cuda_hostbuf_used) { - if (nbytes_this <= CUDA_SMALLHOSTBUF_SZ) + unsigned bufsz; + if (nbytes_this <= CUDA_SMALLHOSTBUF_SZ) { chb = (struct ips_cuda_hostbuf *) psmi_mpool_get( protoexp->cuda_hostbuf_pool_small_recv); - if (chb == NULL) + bufsz = protoexp->cuda_hostbuf_small_recv_cfg.bufsz; + } + if (chb == NULL) { chb = (struct ips_cuda_hostbuf *) psmi_mpool_get( protoexp->cuda_hostbuf_pool_recv); + bufsz = protoexp->cuda_hostbuf_recv_cfg.bufsz; + } if (chb == NULL) { /* Unable to get a cudahostbuf for TID. 
* Release the resources we're holding and reschedule.*/ @@ -2069,6 +2107,12 @@ ips_tid_recv_alloc(struct ips_protoexp *protoexp, return PSM2_EP_NO_RESOURCES; } + if (chb->host_buf == NULL) { + PSMI_CUDA_CALL(cuMemHostAlloc, + (void **) &chb->host_buf, + bufsz, + CU_MEMHOSTALLOC_PORTABLE); + } tidrecvc->cuda_hostbuf = chb; tidrecvc->buffer = chb->host_buf; chb->size = 0; @@ -2423,11 +2467,20 @@ void psmi_cudamemcpy_tid_to_device(struct ips_tid_recv_desc *tidrecvc) chb->size += tidrecvc->recv_tidbytes + tidrecvc->tid_list.tsess_unaligned_start + tidrecvc->tid_list.tsess_unaligned_end; + if (protoexp->cudastream_recv == NULL) { + PSMI_CUDA_CALL(cuStreamCreate, + &protoexp->cudastream_recv, + CU_STREAM_NON_BLOCKING); + } + PSMI_CUDA_CALL(cuMemcpyHtoDAsync, chb->gpu_buf, chb->host_buf, tidrecvc->recv_tidbytes + tidrecvc->tid_list.tsess_unaligned_start + tidrecvc->tid_list.tsess_unaligned_end, protoexp->cudastream_recv); + if (chb->copy_status == NULL) { + PSMI_CUDA_CALL(cuEventCreate, &chb->copy_status, CU_EVENT_DEFAULT); + } PSMI_CUDA_CALL(cuEventRecord, chb->copy_status, protoexp->cudastream_recv); diff --git a/ptl_ips/ips_proto_mq.c b/ptl_ips/ips_proto_mq.c index 8a047c6..1989c7a 100644 --- a/ptl_ips/ips_proto_mq.c +++ b/ptl_ips/ips_proto_mq.c @@ -486,14 +486,19 @@ ips_ptl_mq_rndv(struct ips_proto *proto, psm2_mq_req_t req, ips_cuda_next_window(ipsaddr->window_rv, offset, len); - if (window_len <= CUDA_SMALLHOSTBUF_SZ) + unsigned bufsz; + if (window_len <= CUDA_SMALLHOSTBUF_SZ) { chb = (struct ips_cuda_hostbuf *) psmi_mpool_get( proto->cuda_hostbuf_pool_small_send); - if (chb == NULL) + bufsz = proto->cuda_hostbuf_small_send_cfg.bufsz; + } + if (chb == NULL) { chb = (struct ips_cuda_hostbuf *) psmi_mpool_get( proto->cuda_hostbuf_pool_send); + bufsz = proto->cuda_hostbuf_send_cfg.bufsz; + } /* any buffers available? */ if (chb == NULL) @@ -507,6 +512,19 @@ ips_ptl_mq_rndv(struct ips_proto *proto, psm2_mq_req_t req, chb->gpu_buf = (CUdeviceptr) buf + offset; chb->bytes_read = 0; + if (proto->cudastream_send == NULL) { + PSMI_CUDA_CALL(cuStreamCreate, + &proto->cudastream_send, CU_STREAM_NON_BLOCKING); + } + if (chb->host_buf == NULL) { + PSMI_CUDA_CALL(cuMemHostAlloc, + (void **) &chb->host_buf, + bufsz, + CU_MEMHOSTALLOC_PORTABLE); + } + if (chb->copy_status == NULL) { + PSMI_CUDA_CALL(cuEventCreate, &chb->copy_status, CU_EVENT_DEFAULT); + } PSMI_CUDA_CALL(cuMemcpyDtoHAsync, chb->host_buf, chb->gpu_buf, window_len, diff --git a/ptl_ips/ips_recvhdrq.c b/ptl_ips/ips_recvhdrq.c index 6c5fd07..5edf4cc 100644 --- a/ptl_ips/ips_recvhdrq.c +++ b/ptl_ips/ips_recvhdrq.c @@ -162,8 +162,11 @@ static __inline__ void _dump_invalid_pkt(struct ips_recvhdrq_event *rcv_ev) if (hfi_debug & __HFI_PKTDBG) { ips_proto_dump_frame(rcv_ev->p_hdr, HFI_MESSAGE_HDR_SIZE, "header"); - if (paylen) + if (!payload) { + _HFI_DBG("Cannot dump frame; payload is NULL\n"); + } else if (paylen) { ips_proto_dump_frame(payload, paylen, "data"); + } } } diff --git a/ptl_ips/ptl_rcvthread.c b/ptl_ips/ptl_rcvthread.c index 4adb65a..b2d1d73 100644 --- a/ptl_ips/ptl_rcvthread.c +++ b/ptl_ips/ptl_rcvthread.c @@ -96,7 +96,7 @@ struct ptl_rcvthread { * stored to provide hints during a cuda failure * due to a null cuda context. 
*/ - CUcontext ctxt; + CUcontext cu_ctxt; #endif /* @@ -124,7 +124,7 @@ psm2_error_t ips_ptl_rcvthread_init(ptl_t *ptl_gen, struct ips_recvhdrq *recvq) #ifdef PSM_CUDA if (PSMI_IS_CUDA_ENABLED) - PSMI_CUDA_CALL(cuCtxGetCurrent, &ctxt); + PSMI_CUDA_CALL(cuCtxGetCurrent, &cu_ctxt); #endif if (psmi_hal_has_sw_status(PSM_HAL_PSMI_RUNTIME_RTS_RX_THREAD) && @@ -347,8 +347,8 @@ void *ips_ptl_pollintr(void *rcvthreadc) psm2_error_t err; #ifdef PSM_CUDA - if (PSMI_IS_CUDA_ENABLED && ctxt != NULL) - PSMI_CUDA_CALL(cuCtxSetCurrent, ctxt); + if (PSMI_IS_CUDA_ENABLED && cu_ctxt != NULL) + PSMI_CUDA_CALL(cuCtxSetCurrent, cu_ctxt); #endif PSM2_LOG_MSG("entering"); diff --git a/rpm_release_extension b/rpm_release_extension old mode 100644 new mode 100755 index 725a5ba..ae8563e --- a/rpm_release_extension +++ b/rpm_release_extension @@ -1 +1 @@ -185 +200nccl
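The ctxt-to-cu_ctxt rename above ties into the broader context-management change in this patch: ips_ptl_rcvthread_init() captures the application's CUDA context into cu_ctxt, ips_ptl_pollintr() makes the receive thread current on it, and check_set_cuda_ctxt() in psm_user.h re-checks it before every PSMI_CUDA_CALL(). The sketch below shows the same capture-then-adopt pattern in isolation, using the CUDA driver API directly rather than PSM2's dlopen'd psmi_cu* wrappers (an assumption for illustration; compile with -lcuda -lpthread).

    /* Illustrative sketch (not part of the patch): save the main thread's
     * CUDA context, then make a worker thread current on it before any
     * driver-API call, mirroring cu_ctxt in ptl_rcvthread.c. */
    #include <cuda.h>
    #include <pthread.h>
    #include <stdio.h>

    static CUcontext saved_ctx;          /* plays the role of cu_ctxt */

    static void *worker(void *arg)
    {
        size_t free_b, total_b;

        (void)arg;
        if (saved_ctx != NULL)
            cuCtxSetCurrent(saved_ctx); /* adopt the application's context */
        if (cuMemGetInfo(&free_b, &total_b) == CUDA_SUCCESS)
            printf("free=%zu total=%zu\n", free_b, total_b);
        return NULL;
    }

    int main(void)
    {
        pthread_t tid;
        CUdevice dev;

        cuInit(0);
        cuDeviceGet(&dev, 0);
        cuDevicePrimaryCtxRetain(&saved_ctx, dev);
        cuCtxSetCurrent(saved_ctx);      /* main thread's current context */
        cuCtxGetCurrent(&saved_ctx);     /* capture it, as the rcvthread does */

        pthread_create(&tid, NULL, worker, NULL);
        pthread_join(tid, NULL);
        cuDevicePrimaryCtxRelease(dev);
        return 0;
    }

The reason the worker adopts the saved context is the same as in ptl_rcvthread.c: driver-API calls issued from a thread with no current CUDA context would otherwise fail, which is the failure mode the original ctxt hint (and the new check_set_cuda_ctxt() guard) exists to avoid.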