From 9ce43bef85badb0474a66b2331a794fe2ed8d64f Mon Sep 17 00:00:00 2001
From: Michael Heinz
Date: Tue, 9 Feb 2021 15:03:19 -0500
Subject: [PATCH] PSM2 NCCL support

This patch adds all outstanding bug fixes planned for the IFS 10.11 release
and adds beta support for the NVIDIA Collective Communications Library
(NCCL).

Signed-off-by: Michael Heinz
---
COMMIT | 2 +-
CONTRIBUTORS | 1 +
Makefile | 4 +-
buildflags.mak | 8 +-
psm.c | 78 +-----------------
psm_hal_gen1/psm_hal_gen1_spio.c | 11 +--
psm_mpool.c | 6 +-
psm_mpool.h | 6 +-
psm_mq_internal.h | 1 -
psm_mq_utils.c | 37 ---------
psm_user.h | 133 ++++++++++++++++++++++++++++---
ptl_am/am_cuda_memhandle_cache.c | 5 +-
ptl_am/am_reqrep_shmem.c | 2 +-
ptl_am/ptl.c | 6 +-
ptl_ips/ips_proto.c | 82 +++++++++----------
ptl_ips/ips_proto_expected.c | 85 ++++++++++++++++----
ptl_ips/ips_proto_mq.c | 22 ++++-
ptl_ips/ips_recvhdrq.c | 5 +-
ptl_ips/ptl_rcvthread.c | 8 +-
rpm_release_extension | 2 +-
20 files changed, 281 insertions(+), 223 deletions(-)
mode change 100644 => 100755 COMMIT
mode change 100644 => 100755 rpm_release_extension

diff --git a/COMMIT b/COMMIT
old mode 100644
new mode 100755
index b94efbd..e8cf65e
--- a/COMMIT
+++ b/COMMIT
@@ -1 +1 @@
-30c52a0fd155774e18cc06328a1ba83c2a6a8104
\ No newline at end of file
+442d97d67dd0667a8bb11a99f6b10dbfb12fec63
\ No newline at end of file
diff --git a/CONTRIBUTORS b/CONTRIBUTORS
index 7571183..bc1e98f 100644
--- a/CONTRIBUTORS
+++ b/CONTRIBUTORS
@@ -13,3 +13,4 @@ Dmitry (dmitrygx on github.com)
Florian Weimer (fweimer on github.com)
Jonas Hahnfeld (hahnjo on github.com)
Tom Stellard (tstellar on github.com)
+Chuck Cranor (chuchcranor on github.com)
diff --git a/Makefile b/Makefile
index 5a31d64..bead074 100644
--- a/Makefile
+++ b/Makefile
@@ -129,7 +129,7 @@ INCLUDES += -I$(top_srcdir) -I$(top_srcdir)/ptl_ips -I$(OUTDIR)
ifneq (x86_64,$(arch))
ifneq (i386,$(arch))
- $(error Unsupported architecture $(arch))
+ anerr := $(error Unsupported architecture $(arch))
endif
endif

@@ -164,7 +164,7 @@ nthreads := $(shell echo $$(( `nproc` * 2 )) )
# The DISTRO variable is used subsequently for variable
# behaviors of the 3 distros.

-DISTRO := $(shell . /etc/os-release; if [[ "$$ID" == "sle_hpc" ]]; then ID="sles"; fi; echo $$ID)
+DISTRO := $(shell . /etc/os-release; if [ "$$ID" = "sle_hpc" ]; then ID="sles"; fi; echo $$ID)

# By default the following two variables have the following values:
LIBPSM2_COMPAT_CONF_DIR := /etc
diff --git a/buildflags.mak b/buildflags.mak
index 7c3cda0..21eb729 100644
--- a/buildflags.mak
+++ b/buildflags.mak
@@ -118,13 +118,13 @@ ifneq (icc,${CC})
RET := $(shell echo "int main() {}" | ${CC} ${MAVX2} -E -dM -xc - 2>&1 | grep -q AVX2 ; echo $$?)
else
RET := $(shell echo "int main() {}" | ${CC} ${MAVX2} -E -dM -xc - 2>&1 | grep -q AVX ; echo $$?)
- $(warning ***NOTE TO USER**** Disabling AVX2 will harm performance) + anerr := $(warning ***NOTE TO USER**** Disabling AVX2 will harm performance) endif ifeq (0,${RET}) BASECFLAGS += ${MAVX2} else - $(error Compiler does not support ${MAVX2} ) + anerr := $(error Compiler does not support ${MAVX2} ) endif else BASECFLAGS += ${MAVX2} @@ -138,7 +138,7 @@ ifneq (,${PSM_AVX512}) ifeq (0,${RET}) BASECFLAGS += -mavx512f else - $(error Compiler does not support AVX512 ) + anerr := $(error Compiler does not support AVX512 ) endif BASECFLAGS += -DPSM_AVX512 endif @@ -203,7 +203,7 @@ else BASECFLAGS += -funwind-tables -Wno-strict-aliasing -Wformat-security else ifneq (${CCARCH},gcc4) - $(error Unknown compiler arch "${CCARCH}") + anerr := $(error Unknown compiler arch "${CCARCH}") endif # gcc4 endif # gcc endif # icc diff --git a/psm.c b/psm.c index 7f929ce..8677910 100644 --- a/psm.c +++ b/psm.c @@ -92,8 +92,8 @@ uint32_t psmi_cpu_model; #ifdef PSM_CUDA int is_cuda_enabled; int is_gdr_copy_enabled; -int device_support_gpudirect; -int gpu_p2p_supported = 0; +int _device_support_gpudirect = -1; // -1 indicates "unset". See device_support_gpudirect(). +int _gpu_p2p_supported = -1; // -1 indicates "unset". see gpu_p2p_supported(). int my_gpu_device = 0; int cuda_lib_version; int is_driver_gpudirect_enabled; @@ -116,6 +116,7 @@ CUresult (*psmi_cuDriverGetVersion)(int* driverVersion); CUresult (*psmi_cuDeviceGetCount)(int* count); CUresult (*psmi_cuStreamCreate)(CUstream* phStream, unsigned int Flags); CUresult (*psmi_cuStreamDestroy)(CUstream phStream); +CUresult (*psmi_cuStreamSynchronize)(CUstream phStream); CUresult (*psmi_cuEventCreate)(CUevent* phEvent, unsigned int Flags); CUresult (*psmi_cuEventDestroy)(CUevent hEvent); CUresult (*psmi_cuEventQuery)(CUevent hEvent); @@ -217,6 +218,7 @@ int psmi_cuda_lib_load() PSMI_CUDA_DLSYM(psmi_cuda_lib, cuDeviceGetCount); PSMI_CUDA_DLSYM(psmi_cuda_lib, cuStreamCreate); PSMI_CUDA_DLSYM(psmi_cuda_lib, cuStreamDestroy); + PSMI_CUDA_DLSYM(psmi_cuda_lib, cuStreamSynchronize); PSMI_CUDA_DLSYM(psmi_cuda_lib, cuEventCreate); PSMI_CUDA_DLSYM(psmi_cuda_lib, cuEventDestroy); PSMI_CUDA_DLSYM(psmi_cuda_lib, cuEventQuery); @@ -251,7 +253,6 @@ int psmi_cuda_lib_load() int psmi_cuda_initialize() { psm2_error_t err = PSM2_OK; - int num_devices, dev; PSM2_LOG_MSG("entering"); _HFI_VDBG("Enabling CUDA support.\n"); @@ -262,77 +263,6 @@ int psmi_cuda_initialize() PSMI_CUDA_CALL(cuInit, 0); - /* Check if CUDA context is available. If not, we are not allowed to - * launch any CUDA API calls */ - PSMI_CUDA_CALL(cuCtxGetCurrent, &ctxt); - if (ctxt == NULL) { - _HFI_INFO("Unable to find active CUDA context\n"); - is_cuda_enabled = 0; - err = PSM2_OK; - return err; - } - - CUdevice current_device; - CUcontext primary_ctx; - PSMI_CUDA_CALL(cuCtxGetDevice, ¤t_device); - int is_ctx_active; - unsigned ctx_flags; - PSMI_CUDA_CALL(cuDevicePrimaryCtxGetState, current_device, &ctx_flags, - &is_ctx_active); - if (!is_ctx_active) { - /* There is an issue where certain CUDA API calls create - * contexts but does not make it active which cause the - * driver API call to fail with error 709 */ - PSMI_CUDA_CALL(cuDevicePrimaryCtxRetain, &primary_ctx, - current_device); - is_cuda_primary_context_retain = 1; - } - - /* Check if all devices support Unified Virtual Addressing. 
*/ - PSMI_CUDA_CALL(cuDeviceGetCount, &num_devices); - - device_support_gpudirect = 1; - - for (dev = 0; dev < num_devices; dev++) { - CUdevice device; - PSMI_CUDA_CALL(cuDeviceGet, &device, dev); - int unifiedAddressing; - PSMI_CUDA_CALL(cuDeviceGetAttribute, - &unifiedAddressing, - CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING, - device); - - if (unifiedAddressing !=1) { - _HFI_ERROR("CUDA device %d does not support Unified Virtual Addressing.\n", dev); - goto fail; - } - - int major; - PSMI_CUDA_CALL(cuDeviceGetAttribute, - &major, - CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, - device); - if (major < 3) { - device_support_gpudirect = 0; - _HFI_INFO("CUDA device %d does not support GPUDirect RDMA (Non-fatal error)\n", dev); - } - - if (device != current_device) { - int canAccessPeer = 0; - PSMI_CUDA_CALL(cuDeviceCanAccessPeer, &canAccessPeer, - current_device, device); - - if (canAccessPeer != 1) - _HFI_DBG("CUDA device %d does not support P2P from current device (Non-fatal error)\n", dev); - else - gpu_p2p_supported |= (1 << device); - } else { - /* Always support p2p on the same GPU */ - my_gpu_device = device; - gpu_p2p_supported |= (1 << device); - } - } - union psmi_envvar_val env_enable_gdr_copy; psmi_getenv("PSM2_GDRCOPY", "Enable (set envvar to 1) for gdr copy support in PSM (Enabled by default)", diff --git a/psm_hal_gen1/psm_hal_gen1_spio.c b/psm_hal_gen1/psm_hal_gen1_spio.c index eb9d5aa..a53d060 100644 --- a/psm_hal_gen1/psm_hal_gen1_spio.c +++ b/psm_hal_gen1/psm_hal_gen1_spio.c @@ -181,10 +181,7 @@ ips_spio_init(const struct psmi_context *context, struct ptl *ptl, #ifdef PSM_CUDA - if (PSMI_IS_CUDA_ENABLED) { - PSMI_CUDA_CALL(cuMemHostAlloc, (void **) &ctrl->cuda_pio_buffer, - MAX_CUDA_MTU, CU_MEMHOSTALLOC_PORTABLE); - } + ctrl->cuda_pio_buffer = NULL; #endif _HFI_PRDBG("ips_spio_init() done\n"); @@ -195,7 +192,7 @@ ips_spio_init(const struct psmi_context *context, struct ptl *ptl, static PSMI_HAL_INLINE psm2_error_t ips_spio_fini(struct ips_spio *ctrl) { #ifdef PSM_CUDA - if (PSMI_IS_CUDA_ENABLED) + if (PSMI_IS_CUDA_ENABLED && ctrl->cuda_pio_buffer != NULL) PSMI_CUDA_CALL(cuMemFreeHost, (void *) ctrl->cuda_pio_buffer); #endif spio_report_stall(ctrl, get_cycles(), 0ULL); @@ -810,6 +807,10 @@ ips_spio_transfer_frame(struct ips_proto *proto, struct ips_flow *flow, /* Write to PIO: other blocks of payload */ #ifdef PSM_CUDA if (is_cuda_payload) { + if (ctrl->cuda_pio_buffer == NULL) { + PSMI_CUDA_CALL(cuMemHostAlloc, (void **) &ctrl->cuda_pio_buffer, + MAX_CUDA_MTU, CU_MEMHOSTALLOC_PORTABLE); + } /* Since the implementation of cuMemcpy is unknown, and the HFI specifies several conditions for how PIO writes must occur, for safety reasons we should not assume diff --git a/psm_mpool.c b/psm_mpool.c index 1f2a365..2d035e3 100644 --- a/psm_mpool.c +++ b/psm_mpool.c @@ -101,7 +101,6 @@ struct mpool { #ifdef PSM_CUDA alloc_dealloc_callback_fn_t mp_alloc_dealloc_cb; - void *mp_alloc_dealloc_cb_context; #endif }; @@ -230,7 +229,7 @@ psmi_mpool_create_for_cuda(size_t obj_size, uint32_t num_obj_per_chunk, uint32_t num_obj_max_total, int flags, psmi_memtype_t statstype, non_empty_callback_fn_t cb, void *context, - alloc_dealloc_callback_fn_t ad_cb, void *ad_context) + alloc_dealloc_callback_fn_t ad_cb) { mpool_t mp; @@ -242,7 +241,6 @@ psmi_mpool_create_for_cuda(size_t obj_size, uint32_t num_obj_per_chunk, return NULL; mp->mp_alloc_dealloc_cb = ad_cb; - mp->mp_alloc_dealloc_cb_context = ad_context; if (psmi_mpool_allocate_chunk(mp) != PSM2_OK) { psmi_mpool_destroy(mp); @@ -418,7 +416,6 @@ void 
psmi_mpool_chunk_dealloc(mpool_t mp, int idx) int j; for (j = 0; j < mp->mp_num_obj_per_chunk; j++) mp->mp_alloc_dealloc_cb(0 /* is not alloc */, - mp->mp_alloc_dealloc_cb_context, ((void *) mp->mp_elm_vector[idx]) + j * mp->mp_elm_size + sizeof(struct mpool_element)); @@ -509,7 +506,6 @@ static int psmi_mpool_allocate_chunk(mpool_t mp) #ifdef PSM_CUDA if (mp->mp_alloc_dealloc_cb) mp->mp_alloc_dealloc_cb(1 /* is alloc */, - mp->mp_alloc_dealloc_cb_context, chunk + i * mp->mp_elm_size + sizeof(struct mpool_element)); #endif diff --git a/psm_mpool.h b/psm_mpool.h index 8098f60..653d80b 100644 --- a/psm_mpool.h +++ b/psm_mpool.h @@ -70,8 +70,7 @@ typedef struct mpool *mpool_t; typedef void (*non_empty_callback_fn_t) (void *context); -typedef void (*alloc_dealloc_callback_fn_t) (int is_alloc, void *context, - void *chunk); +typedef void (*alloc_dealloc_callback_fn_t) (int is_alloc, void *chunk); mpool_t MOCKABLE(psmi_mpool_create)(size_t obj_size, uint32_t num_obj_per_chunk, @@ -84,8 +83,7 @@ mpool_t psmi_mpool_create_for_cuda(size_t obj_size, uint32_t num_obj_per_chunk, uint32_t num_obj_max_total, int flags, psmi_memtype_t statstype, non_empty_callback_fn_t cb, void *context, - alloc_dealloc_callback_fn_t ad_cb, - void *ad_context); + alloc_dealloc_callback_fn_t ad_cb); void psmi_mpool_destroy(mpool_t mp); diff --git a/psm_mq_internal.h b/psm_mq_internal.h index a1afaf8..0793d68 100644 --- a/psm_mq_internal.h +++ b/psm_mq_internal.h @@ -249,7 +249,6 @@ struct psm2_mq_req { uint32_t prefetch_send_msgoff; int cuda_hostbuf_used; CUipcMemHandle cuda_ipc_handle; - CUevent cuda_ipc_event; uint8_t cuda_ipc_handle_attached; uint32_t cuda_ipc_offset; /* diff --git a/psm_mq_utils.c b/psm_mq_utils.c index a0409db..8f58b37 100644 --- a/psm_mq_utils.c +++ b/psm_mq_utils.c @@ -114,19 +114,6 @@ psm2_mq_req_t MOCKABLE(psmi_mq_req_alloc)(psm2_mq_t mq, uint32_t type) } MOCK_DEF_EPILOGUE(psmi_mq_req_alloc); -#ifdef PSM_CUDA -void psmi_cuda_recvreq_alloc_func(int is_alloc, void* context, void* obj) { - psm2_mq_req_t recvreq = (psm2_mq_req_t)obj; - if (PSMI_IS_CUDA_ENABLED) { - if (is_alloc) - PSMI_CUDA_CALL(cuEventCreate, &recvreq->cuda_ipc_event, CU_EVENT_DEFAULT); - else - PSMI_CUDA_CALL(cuEventDestroy, recvreq->cuda_ipc_event); - } - return; -} -#endif - psm2_error_t psmi_mq_req_init(psm2_mq_t mq) { psm2_mq_req_t warmup_req; @@ -165,29 +152,6 @@ psm2_error_t psmi_mq_req_init(psm2_mq_t mq) if ((err = psmi_parse_mpool_env(mq, 0, &rlim, &maxsz, &chunksz))) goto fail; - /* Have a callback function for receive req mpool which creates - * and destroy events. 
- */ -#ifdef PSM_CUDA - if (PSMI_IS_CUDA_ENABLED) { - if ((mq->rreq_pool = - psmi_mpool_create_for_cuda(sizeof(struct psm2_mq_req), chunksz, - maxsz, 0, DESCRIPTORS, NULL, - NULL, psmi_cuda_recvreq_alloc_func, NULL)) == NULL) { - err = PSM2_NO_MEMORY; - goto fail; - } - } - else { - if ((mq->rreq_pool = - psmi_mpool_create(sizeof(struct psm2_mq_req), chunksz, - maxsz, 0, DESCRIPTORS, NULL, - NULL)) == NULL) { - err = PSM2_NO_MEMORY; - goto fail; - } - } -#else if ((mq->rreq_pool = psmi_mpool_create(sizeof(struct psm2_mq_req), chunksz, maxsz, 0, DESCRIPTORS, NULL, @@ -195,7 +159,6 @@ psm2_error_t psmi_mq_req_init(psm2_mq_t mq) err = PSM2_NO_MEMORY; goto fail; } -#endif } /* Warm up the allocators */ diff --git a/psm_user.h b/psm_user.h index 09477c5..fa38a42 100644 --- a/psm_user.h +++ b/psm_user.h @@ -296,6 +296,7 @@ void psmi_profile_reblock(int did_no_progress) __attribute__ ((weak)); #endif #ifdef PSM_CUDA + #include #include @@ -305,12 +306,12 @@ void psmi_profile_reblock(int did_no_progress) __attribute__ ((weak)); extern int is_cuda_enabled; extern int is_gdr_copy_enabled; -extern int device_support_gpudirect; -extern int gpu_p2p_supported; +extern int _device_support_gpudirect; +extern int _gpu_p2p_supported; extern int my_gpu_device; extern int cuda_lib_version; -extern CUcontext ctxt; +extern CUcontext cu_ctxt; extern void *psmi_cuda_lib; extern CUresult (*psmi_cuInit)(unsigned int Flags ); @@ -326,6 +327,7 @@ extern CUresult (*psmi_cuDriverGetVersion)(int* driverVersion); extern CUresult (*psmi_cuDeviceGetCount)(int* count); extern CUresult (*psmi_cuStreamCreate)(CUstream* phStream, unsigned int Flags); extern CUresult (*psmi_cuStreamDestroy)(CUstream phStream); +extern CUresult (*psmi_cuStreamSynchronize)(CUstream phStream); extern CUresult (*psmi_cuEventCreate)(CUevent* phEvent, unsigned int Flags); extern CUresult (*psmi_cuEventDestroy)(CUevent hEvent); extern CUresult (*psmi_cuEventQuery)(CUevent hEvent); @@ -348,14 +350,34 @@ extern CUresult (*psmi_cuDevicePrimaryCtxRetain)(CUcontext* pctx, CUdevice dev); extern CUresult (*psmi_cuCtxGetDevice)(CUdevice* device); extern CUresult (*psmi_cuDevicePrimaryCtxRelease)(CUdevice device); +static int check_set_cuda_ctxt(void) +{ + CUresult err; + CUcontext tmpctxt = {0}; + + if (!psmi_cuCtxGetCurrent || !psmi_cuCtxSetCurrent) + return 0; + + err = psmi_cuCtxGetCurrent(&tmpctxt); + if (!err) { + if (!tmpctxt && cu_ctxt) { + err = psmi_cuCtxSetCurrent(cu_ctxt); + return !!err; + } else if (tmpctxt && !cu_ctxt) { + cu_ctxt = tmpctxt; + } + } + return 0; +} + #define PSMI_CUDA_CALL(func, args...) do { \ CUresult cudaerr; \ + if (check_set_cuda_ctxt()) { \ + psmi_handle_error(PSMI_EP_NORETURN, PSM2_INTERNAL_ERR, \ + "Failed to set/synchronize CUDA context.\n"); \ + } \ cudaerr = psmi_##func(args); \ if (cudaerr != CUDA_SUCCESS) { \ - if (ctxt == NULL) \ - _HFI_ERROR( \ - "Check if CUDA is initialized" \ - "before psm2_ep_open call \n"); \ _HFI_ERROR( \ "CUDA failure: %s() (at %s:%d)" \ "returned %d\n", \ @@ -366,6 +388,92 @@ extern CUresult (*psmi_cuDevicePrimaryCtxRelease)(CUdevice device); } \ } while (0) +PSMI_ALWAYS_INLINE( +int device_support_gpudirect()) +{ + if (_device_support_gpudirect > -1) return _device_support_gpudirect; + + int num_devices, dev; + + /* Check if all devices support Unified Virtual Addressing. 
*/ + PSMI_CUDA_CALL(cuDeviceGetCount, &num_devices); + + _device_support_gpudirect = 1; + + for (dev = 0; dev < num_devices; dev++) { + CUdevice device; + PSMI_CUDA_CALL(cuDeviceGet, &device, dev); + int unifiedAddressing; + PSMI_CUDA_CALL(cuDeviceGetAttribute, + &unifiedAddressing, + CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING, + device); + + if (unifiedAddressing !=1) { + psmi_handle_error(PSMI_EP_NORETURN, PSM2_EP_DEVICE_FAILURE, + "CUDA device %d does not support Unified Virtual Addressing.\n", + dev); + } + + int major; + PSMI_CUDA_CALL(cuDeviceGetAttribute, + &major, + CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, + device); + if (major < 3) { + _device_support_gpudirect = 0; + _HFI_INFO("CUDA device %d does not support GPUDirect RDMA (Non-fatal error)\n", dev); + } + } + + return _device_support_gpudirect; +} + +#define PSMI_IS_CUDA_ENABLED likely(is_cuda_enabled) +#define PSMI_IS_CUDA_DISABLED unlikely(!is_cuda_enabled) + +PSMI_ALWAYS_INLINE( +int gpu_p2p_supported()) +{ + if (likely(_gpu_p2p_supported > -1)) return _gpu_p2p_supported; + + if (PSMI_IS_CUDA_DISABLED) { + _gpu_p2p_supported=0; + return 0; + } + + int num_devices, dev; + + /* Check which devices the current device has p2p access to. */ + CUdevice current_device; + PSMI_CUDA_CALL(cuCtxGetDevice, ¤t_device); + PSMI_CUDA_CALL(cuDeviceGetCount, &num_devices); + + _gpu_p2p_supported = 0; + + for (dev = 0; dev < num_devices; dev++) { + CUdevice device; + PSMI_CUDA_CALL(cuDeviceGet, &device, dev); + + if (device != current_device) { + int canAccessPeer = 0; + PSMI_CUDA_CALL(cuDeviceCanAccessPeer, &canAccessPeer, + current_device, device); + + if (canAccessPeer != 1) + _HFI_DBG("CUDA device %d does not support P2P from current device (Non-fatal error)\n", dev); + else + _gpu_p2p_supported |= (1 << device); + } else { + /* Always support p2p on the same GPU */ + my_gpu_device = device; + _gpu_p2p_supported |= (1 << device); + } + } + + return _gpu_p2p_supported; +} + /** * Similar to PSMI_CUDA_CALL() except does not error out * if func(args) returns CUDA_SUCCESS or except_err @@ -378,9 +486,13 @@ extern CUresult (*psmi_cuDevicePrimaryCtxRelease)(CUdevice device); * DBG level. */ #define PSMI_CUDA_CALL_EXCEPT(except_err, func, args...) do { \ + if (check_set_cuda_ctxt()) { \ + psmi_handle_error(PSMI_EP_NORETURN, PSM2_INTERNAL_ERR, \ + "Failed to set/synchronize CUDA context.\n"); \ + } \ cudaerr = psmi_##func(args); \ if (cudaerr != CUDA_SUCCESS && cudaerr != except_err) { \ - if (ctxt == NULL) \ + if (cu_ctxt == NULL) \ _HFI_ERROR( \ "Check if CUDA is initialized" \ "before psm2_ep_open call \n"); \ @@ -442,9 +554,6 @@ _psmi_is_cuda_mem(const void *ptr)) return 0; } -#define PSMI_IS_CUDA_ENABLED likely(is_cuda_enabled) -#define PSMI_IS_CUDA_DISABLED unlikely(!is_cuda_enabled) - PSMI_ALWAYS_INLINE( int _psmi_is_gdr_copy_enabled()) @@ -473,7 +582,7 @@ struct ips_cuda_hostbuf { struct ips_cuda_hostbuf_mpool_cb_context { unsigned bufsz; }; -void psmi_cuda_hostbuf_alloc_func(int is_alloc, void *context, void *obj); +void psmi_cuda_hostbuf_alloc_func(int is_alloc, void *obj); #define CUDA_HOSTBUFFER_LIMITS { \ .env = "PSM_CUDA_BOUNCEBUFFERS_MAX", \ diff --git a/ptl_am/am_cuda_memhandle_cache.c b/ptl_am/am_cuda_memhandle_cache.c index 730562d..253d261 100644 --- a/ptl_am/am_cuda_memhandle_cache.c +++ b/ptl_am/am_cuda_memhandle_cache.c @@ -168,7 +168,7 @@ static void print_cuda_memhandle_cache_stats(void) * which helps in closing all memhandles. 
*/ static void -psmi_cuda_memhandle_cache_alloc_func(int is_alloc, void* context, void* obj) +psmi_cuda_memhandle_cache_alloc_func(int is_alloc, void* obj) { cl_map_item_t* memcache_item = (cl_map_item_t*)obj; if (!is_alloc) { @@ -196,8 +196,7 @@ am_cuda_memhandle_mpool_init(uint32_t memcache_size) cuda_memhandle_cache_size, cuda_memhandle_cache_size, 0, UNDEFINED, NULL, NULL, - psmi_cuda_memhandle_cache_alloc_func, - NULL); + psmi_cuda_memhandle_cache_alloc_func); if (cuda_memhandle_mpool == NULL) { err = psmi_handle_error(PSMI_EP_NORETURN, PSM2_NO_MEMORY, "Couldn't allocate CUDA host receive buffer pool"); diff --git a/ptl_am/am_reqrep_shmem.c b/ptl_am/am_reqrep_shmem.c index 9be72f9..bcc0182 100644 --- a/ptl_am/am_reqrep_shmem.c +++ b/ptl_am/am_reqrep_shmem.c @@ -2099,7 +2099,7 @@ amsh_mq_send_inner(psm2_mq_t mq, psm2_mq_req_t req, psm2_epaddr_t epaddr, #ifdef PSM_CUDA int gpu_mem = 0; - int ep_supports_p2p = (1 << ((am_epaddr_t *) epaddr)->gpuid) & gpu_p2p_supported; + int ep_supports_p2p = (1 << ((am_epaddr_t *) epaddr)->gpuid) & gpu_p2p_supported(); if (PSMI_IS_CUDA_ENABLED && PSMI_IS_CUDA_MEM(ubuf)) { gpu_mem = 1; diff --git a/ptl_am/ptl.c b/ptl_am/ptl.c index 2e42c1b..3b3ed9f 100644 --- a/ptl_am/ptl.c +++ b/ptl_am/ptl.c @@ -96,8 +96,7 @@ ptl_handle_rtsmatch_request(psm2_mq_req_t req, int was_posted, if (req->is_buf_gpu_mem) { PSMI_CUDA_CALL(cuMemcpyDtoD, (CUdeviceptr)req->req_data.buf, cuda_ipc_dev_ptr, req->req_data.recv_msglen); - PSMI_CUDA_CALL(cuEventRecord, req->cuda_ipc_event, 0); - PSMI_CUDA_CALL(cuEventSynchronize, req->cuda_ipc_event); + PSMI_CUDA_CALL(cuStreamSynchronize, 0); } else PSMI_CUDA_CALL(cuMemcpyDtoH, req->req_data.buf, cuda_ipc_dev_ptr, req->req_data.recv_msglen); @@ -129,8 +128,7 @@ ptl_handle_rtsmatch_request(psm2_mq_req_t req, int was_posted, * copies for msg sizes less than 64k. The event record * and synchronize calls are to guarentee completion. */ - PSMI_CUDA_CALL(cuEventRecord, req->cuda_ipc_event, 0); - PSMI_CUDA_CALL(cuEventSynchronize, req->cuda_ipc_event); + PSMI_CUDA_CALL(cuStreamSynchronize, 0); psmi_free(cuda_ipc_bounce_buf); } else { /* cma can be done in handler context or not. 
*/ diff --git a/ptl_ips/ips_proto.c b/ptl_ips/ips_proto.c index dfd03e6..c28b7d4 100644 --- a/ptl_ips/ips_proto.c +++ b/ptl_ips/ips_proto.c @@ -95,22 +95,17 @@ static psm2_error_t proto_sdma_init(struct ips_proto *proto, const psmi_context_t *context); #ifdef PSM_CUDA -void psmi_cuda_hostbuf_alloc_func(int is_alloc, void *context, void *obj) +void psmi_cuda_hostbuf_alloc_func(int is_alloc, void *obj) { - struct ips_cuda_hostbuf *icb; - struct ips_cuda_hostbuf_mpool_cb_context *ctxt = - (struct ips_cuda_hostbuf_mpool_cb_context *) context; - - icb = (struct ips_cuda_hostbuf *)obj; + struct ips_cuda_hostbuf *icb = (struct ips_cuda_hostbuf *)obj; if (is_alloc) { - PSMI_CUDA_CALL(cuMemHostAlloc, - (void **) &icb->host_buf, - ctxt->bufsz, - CU_MEMHOSTALLOC_PORTABLE); - PSMI_CUDA_CALL(cuEventCreate, &icb->copy_status, CU_EVENT_DEFAULT); + icb->host_buf = NULL; + icb->copy_status = NULL; } else { - if (icb->host_buf) { + if (icb->host_buf != NULL) { PSMI_CUDA_CALL(cuMemFreeHost, icb->host_buf); + } + if (icb->copy_status != NULL) { PSMI_CUDA_CALL(cuEventDestroy, icb->copy_status); } } @@ -520,10 +515,7 @@ ips_proto_init(const psmi_context_t *context, const ptl_t *ptl, if (protoexp_flags & IPS_PROTOEXP_FLAG_ENABLED) { #ifdef PSM_CUDA - if (PSMI_IS_CUDA_ENABLED) { - PSMI_CUDA_CALL(cuStreamCreate, - &proto->cudastream_send, CU_STREAM_NON_BLOCKING); - } + proto->cudastream_send = NULL; #endif proto->scbc_rv = NULL; if ((err = ips_protoexp_init(context, proto, protoexp_flags, @@ -635,14 +627,34 @@ ips_proto_init(const psmi_context_t *context, const ptl_t *ptl, PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_UINT_FLAGS, (union psmi_envvar_val)0, /* Disabled by default */ &env_gpudirect_rdma); + /* Use GPUDirect RDMA for SDMA send? */ + union psmi_envvar_val env_gpudirect_rdma_send; + psmi_getenv("PSM2_GPUDIRECT_RDMA_SEND", + "Use GPUDirect RDMA support to allow the HFI to directly" + " read from the GPU for SDMA. Requires driver" + " support.(default is disabled i.e. 0)", + PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_UINT_FLAGS, + (union psmi_envvar_val)0, /* Disabled by default */ + &env_gpudirect_rdma_send); + + /* Use GPUDirect RDMA for recv? */ + union psmi_envvar_val env_gpudirect_rdma_recv; + psmi_getenv("PSM2_GPUDIRECT_RDMA_RECV", + "Use GPUDirect RDMA support to allow the HFI to directly" + " write into GPU. Requires driver support.(default is" + " disabled i.e. 0)", + PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_UINT_FLAGS, + (union psmi_envvar_val)0, /* Disabled by default */ + &env_gpudirect_rdma_recv); + /* The following cases need to be handled: * 1) GPU DIRECT is turned off but GDR COPY is turned on by the user or * by default - Turn off GDR COPY * 2) GPU DIRECT is on but GDR COPY is turned off by the user - Leave *. this config as it is. 
*/ - if (!env_gpudirect_rdma.e_uint) - is_gdr_copy_enabled = 0; + if (!env_gpudirect_rdma.e_uint && !env_gpudirect_rdma_send.e_uint && !env_gpudirect_rdma_recv.e_uint) + is_gdr_copy_enabled = 0; /* Default Send threshold for Gpu-direct set to 30000 */ union psmi_envvar_val env_gpudirect_send_thresh; @@ -659,7 +671,7 @@ ips_proto_init(const psmi_context_t *context, const ptl_t *ptl, (union psmi_envvar_val)UINT_MAX, &env_gpudirect_recv_thresh); gpudirect_recv_threshold = env_gpudirect_recv_thresh.e_uint; - if (env_gpudirect_rdma.e_uint && device_support_gpudirect) { + if (env_gpudirect_rdma.e_uint && device_support_gpudirect()) { if (PSMI_IS_CUDA_DISABLED || /* All pio, No SDMA*/ (proto->flags & IPS_PROTO_FLAG_SPIO) || @@ -675,16 +687,7 @@ ips_proto_init(const psmi_context_t *context, const ptl_t *ptl, * experimentation and will not be documented for any customers. */ /* Use GPUDirect RDMA for SDMA send? */ - union psmi_envvar_val env_gpudirect_rdma_send; - psmi_getenv("PSM2_GPUDIRECT_RDMA_SEND", - "Use GPUDirect RDMA support to allow the HFI to directly" - " read from the GPU for SDMA. Requires driver" - " support.(default is disabled i.e. 0)", - PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_UINT_FLAGS, - (union psmi_envvar_val)0, /* Disabled by default */ - &env_gpudirect_rdma_send); - - if (env_gpudirect_rdma_send.e_uint && device_support_gpudirect) { + if (env_gpudirect_rdma_send.e_uint && device_support_gpudirect()) { if (PSMI_IS_CUDA_DISABLED || /* All pio, No SDMA*/ (proto->flags & IPS_PROTO_FLAG_SPIO)) @@ -695,16 +698,7 @@ ips_proto_init(const psmi_context_t *context, const ptl_t *ptl, proto->flags |= IPS_PROTO_FLAG_GPUDIRECT_RDMA_SEND; } /* Use GPUDirect RDMA for recv? */ - union psmi_envvar_val env_gpudirect_rdma_recv; - psmi_getenv("PSM2_GPUDIRECT_RDMA_RECV", - "Use GPUDirect RDMA support to allow the HFI to directly" - " write into GPU. Requires driver support.(default is" - " disabled i.e. 
0)", - PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_UINT_FLAGS, - (union psmi_envvar_val)0, /* Disabled by default */ - &env_gpudirect_rdma_recv); - - if (env_gpudirect_rdma_recv.e_uint && device_support_gpudirect) { + if (env_gpudirect_rdma_recv.e_uint && device_support_gpudirect()) { if (PSMI_IS_CUDA_DISABLED || !(protoexp_flags & IPS_PROTOEXP_FLAG_ENABLED)) err = psmi_handle_error(PSMI_EP_NORETURN, @@ -734,9 +728,7 @@ ips_proto_init(const psmi_context_t *context, const ptl_t *ptl, psmi_mpool_create_for_cuda(sizeof(struct ips_cuda_hostbuf), chunksz, max_elements, 0, UNDEFINED, NULL, NULL, - psmi_cuda_hostbuf_alloc_func, - (void *) - &proto->cuda_hostbuf_send_cfg); + psmi_cuda_hostbuf_alloc_func); if (proto->cuda_hostbuf_pool_send == NULL) { err = psmi_handle_error(proto->ep, PSM2_NO_MEMORY, @@ -750,9 +742,7 @@ ips_proto_init(const psmi_context_t *context, const ptl_t *ptl, psmi_mpool_create_for_cuda(sizeof(struct ips_cuda_hostbuf), chunksz, max_elements, 0, UNDEFINED, NULL, NULL, - psmi_cuda_hostbuf_alloc_func, - (void *) - &proto->cuda_hostbuf_small_send_cfg); + psmi_cuda_hostbuf_alloc_func); if (proto->cuda_hostbuf_pool_small_send == NULL) { err = psmi_handle_error(proto->ep, PSM2_NO_MEMORY, @@ -928,7 +918,7 @@ ips_proto_fini(struct ips_proto *proto, int force, uint64_t timeout_in) #endif #ifdef PSM_CUDA - if (PSMI_IS_CUDA_ENABLED) { + if (PSMI_IS_CUDA_ENABLED && proto->cudastream_send) { PSMI_CUDA_CALL(cuStreamDestroy, proto->cudastream_send); } #endif diff --git a/ptl_ips/ips_proto_expected.c b/ptl_ips/ips_proto_expected.c index 7e7e997..a579dd6 100644 --- a/ptl_ips/ips_proto_expected.c +++ b/ptl_ips/ips_proto_expected.c @@ -370,9 +370,7 @@ MOCKABLE(ips_protoexp_init)(const psmi_context_t *context, psmi_mpool_create_for_cuda(sizeof(struct ips_cuda_hostbuf), chunksz, max_elements, 0, UNDEFINED, NULL, NULL, - psmi_cuda_hostbuf_alloc_func, - (void *) - &protoexp->cuda_hostbuf_recv_cfg); + psmi_cuda_hostbuf_alloc_func); if (protoexp->cuda_hostbuf_pool_recv == NULL) { err = psmi_handle_error(proto->ep, PSM2_NO_MEMORY, @@ -386,9 +384,7 @@ MOCKABLE(ips_protoexp_init)(const psmi_context_t *context, psmi_mpool_create_for_cuda(sizeof(struct ips_cuda_hostbuf), chunksz, max_elements, 0, UNDEFINED, NULL, NULL, - psmi_cuda_hostbuf_alloc_func, - (void *) - &protoexp->cuda_hostbuf_small_recv_cfg); + psmi_cuda_hostbuf_alloc_func); if (protoexp->cuda_hostbuf_pool_small_recv == NULL) { err = psmi_handle_error(proto->ep, PSM2_NO_MEMORY, @@ -396,9 +392,7 @@ MOCKABLE(ips_protoexp_init)(const psmi_context_t *context, goto fail; } - PSMI_CUDA_CALL(cuStreamCreate, - &protoexp->cudastream_recv, - CU_STREAM_NON_BLOCKING); + protoexp->cudastream_recv = NULL; STAILQ_INIT(&protoexp->cudapend_getreqsq); } else { protoexp->cuda_hostbuf_pool_recv = NULL; @@ -437,7 +431,9 @@ psm2_error_t ips_protoexp_fini(struct ips_protoexp *protoexp) !(protoexp->proto->flags & IPS_PROTO_FLAG_GPUDIRECT_RDMA_RECV)) { psmi_mpool_destroy(protoexp->cuda_hostbuf_pool_small_recv); psmi_mpool_destroy(protoexp->cuda_hostbuf_pool_recv); - PSMI_CUDA_CALL(cuStreamDestroy, protoexp->cudastream_recv); + if (protoexp->cudastream_recv != NULL) { + PSMI_CUDA_CALL(cuStreamDestroy, protoexp->cudastream_recv); + } } #endif psmi_mpool_destroy(protoexp->tid_getreq_pool); @@ -1094,12 +1090,17 @@ void psmi_cuda_run_prefetcher(struct ips_protoexp *protoexp, window_len = ips_cuda_next_window(tidsendc->ipsaddr->window_rv, offset, req->req_data.buf_len); - if (window_len <= CUDA_SMALLHOSTBUF_SZ) + unsigned bufsz; + if (window_len <= CUDA_SMALLHOSTBUF_SZ) { chb = 
(struct ips_cuda_hostbuf *) psmi_mpool_get( proto->cuda_hostbuf_pool_small_send); - if (chb == NULL) + bufsz = proto->cuda_hostbuf_small_send_cfg.bufsz; + } + if (chb == NULL) { chb = (struct ips_cuda_hostbuf *) psmi_mpool_get( proto->cuda_hostbuf_pool_send); + bufsz = proto->cuda_hostbuf_send_cfg.bufsz; + } /* were any buffers available for the prefetcher? */ if (chb == NULL) return; @@ -1109,6 +1110,20 @@ void psmi_cuda_run_prefetcher(struct ips_protoexp *protoexp, chb->req = req; chb->gpu_buf = (CUdeviceptr) req->req_data.buf + offset; chb->bytes_read = 0; + + if (proto->cudastream_send == NULL) { + PSMI_CUDA_CALL(cuStreamCreate, + &proto->cudastream_send, CU_STREAM_NON_BLOCKING); + } + if (chb->host_buf == NULL) { + PSMI_CUDA_CALL(cuMemHostAlloc, + (void **) &chb->host_buf, + bufsz, + CU_MEMHOSTALLOC_PORTABLE); + } + if (chb->copy_status == NULL) { + PSMI_CUDA_CALL(cuEventCreate, &chb->copy_status, CU_EVENT_DEFAULT); + } PSMI_CUDA_CALL(cuMemcpyDtoHAsync, chb->host_buf, chb->gpu_buf, window_len, @@ -1143,12 +1158,17 @@ void psmi_attach_chb_to_tidsendc(struct ips_protoexp *protoexp, window_len = ips_cuda_next_window(tidsendc->ipsaddr->window_rv, offset, req->req_data.buf_len); - if (window_len <= CUDA_SMALLHOSTBUF_SZ) + unsigned bufsz; + if (window_len <= CUDA_SMALLHOSTBUF_SZ) { chb = (struct ips_cuda_hostbuf *) psmi_mpool_get( proto->cuda_hostbuf_pool_small_send); - if (chb == NULL) + bufsz = proto->cuda_hostbuf_small_send_cfg.bufsz; + } + if (chb == NULL) { chb = (struct ips_cuda_hostbuf *) psmi_mpool_get( proto->cuda_hostbuf_pool_send); + bufsz = proto->cuda_hostbuf_send_cfg.bufsz; + } /* were any buffers available? If not force allocate */ if (chb == NULL) { @@ -1162,6 +1182,19 @@ void psmi_attach_chb_to_tidsendc(struct ips_protoexp *protoexp, chb->req = req; chb->gpu_buf = (CUdeviceptr) req->req_data.buf + offset; chb->bytes_read = 0; + if (proto->cudastream_send == NULL) { + PSMI_CUDA_CALL(cuStreamCreate, + &proto->cudastream_send, CU_STREAM_NON_BLOCKING); + } + if (chb->host_buf == NULL) { + PSMI_CUDA_CALL(cuMemHostAlloc, + (void **) &chb->host_buf, + bufsz, + CU_MEMHOSTALLOC_PORTABLE); + } + if (chb->copy_status == NULL) { + PSMI_CUDA_CALL(cuEventCreate, &chb->copy_status, CU_EVENT_DEFAULT); + } PSMI_CUDA_CALL(cuMemcpyDtoHAsync, chb->host_buf, chb->gpu_buf, window_len, @@ -2047,14 +2080,19 @@ ips_tid_recv_alloc(struct ips_protoexp *protoexp, /* 4. allocate a cuda bounce buffer, if required */ struct ips_cuda_hostbuf *chb = NULL; if (getreq->cuda_hostbuf_used) { - if (nbytes_this <= CUDA_SMALLHOSTBUF_SZ) + unsigned bufsz; + if (nbytes_this <= CUDA_SMALLHOSTBUF_SZ) { chb = (struct ips_cuda_hostbuf *) psmi_mpool_get( protoexp->cuda_hostbuf_pool_small_recv); - if (chb == NULL) + bufsz = protoexp->cuda_hostbuf_small_recv_cfg.bufsz; + } + if (chb == NULL) { chb = (struct ips_cuda_hostbuf *) psmi_mpool_get( protoexp->cuda_hostbuf_pool_recv); + bufsz = protoexp->cuda_hostbuf_recv_cfg.bufsz; + } if (chb == NULL) { /* Unable to get a cudahostbuf for TID. 
* Release the resources we're holding and reschedule.*/ @@ -2069,6 +2107,12 @@ ips_tid_recv_alloc(struct ips_protoexp *protoexp, return PSM2_EP_NO_RESOURCES; } + if (chb->host_buf == NULL) { + PSMI_CUDA_CALL(cuMemHostAlloc, + (void **) &chb->host_buf, + bufsz, + CU_MEMHOSTALLOC_PORTABLE); + } tidrecvc->cuda_hostbuf = chb; tidrecvc->buffer = chb->host_buf; chb->size = 0; @@ -2423,11 +2467,20 @@ void psmi_cudamemcpy_tid_to_device(struct ips_tid_recv_desc *tidrecvc) chb->size += tidrecvc->recv_tidbytes + tidrecvc->tid_list.tsess_unaligned_start + tidrecvc->tid_list.tsess_unaligned_end; + if (protoexp->cudastream_recv == NULL) { + PSMI_CUDA_CALL(cuStreamCreate, + &protoexp->cudastream_recv, + CU_STREAM_NON_BLOCKING); + } + PSMI_CUDA_CALL(cuMemcpyHtoDAsync, chb->gpu_buf, chb->host_buf, tidrecvc->recv_tidbytes + tidrecvc->tid_list.tsess_unaligned_start + tidrecvc->tid_list.tsess_unaligned_end, protoexp->cudastream_recv); + if (chb->copy_status == NULL) { + PSMI_CUDA_CALL(cuEventCreate, &chb->copy_status, CU_EVENT_DEFAULT); + } PSMI_CUDA_CALL(cuEventRecord, chb->copy_status, protoexp->cudastream_recv); diff --git a/ptl_ips/ips_proto_mq.c b/ptl_ips/ips_proto_mq.c index 8a047c6..1989c7a 100644 --- a/ptl_ips/ips_proto_mq.c +++ b/ptl_ips/ips_proto_mq.c @@ -486,14 +486,19 @@ ips_ptl_mq_rndv(struct ips_proto *proto, psm2_mq_req_t req, ips_cuda_next_window(ipsaddr->window_rv, offset, len); - if (window_len <= CUDA_SMALLHOSTBUF_SZ) + unsigned bufsz; + if (window_len <= CUDA_SMALLHOSTBUF_SZ) { chb = (struct ips_cuda_hostbuf *) psmi_mpool_get( proto->cuda_hostbuf_pool_small_send); - if (chb == NULL) + bufsz = proto->cuda_hostbuf_small_send_cfg.bufsz; + } + if (chb == NULL) { chb = (struct ips_cuda_hostbuf *) psmi_mpool_get( proto->cuda_hostbuf_pool_send); + bufsz = proto->cuda_hostbuf_send_cfg.bufsz; + } /* any buffers available? */ if (chb == NULL) @@ -507,6 +512,19 @@ ips_ptl_mq_rndv(struct ips_proto *proto, psm2_mq_req_t req, chb->gpu_buf = (CUdeviceptr) buf + offset; chb->bytes_read = 0; + if (proto->cudastream_send == NULL) { + PSMI_CUDA_CALL(cuStreamCreate, + &proto->cudastream_send, CU_STREAM_NON_BLOCKING); + } + if (chb->host_buf == NULL) { + PSMI_CUDA_CALL(cuMemHostAlloc, + (void **) &chb->host_buf, + bufsz, + CU_MEMHOSTALLOC_PORTABLE); + } + if (chb->copy_status == NULL) { + PSMI_CUDA_CALL(cuEventCreate, &chb->copy_status, CU_EVENT_DEFAULT); + } PSMI_CUDA_CALL(cuMemcpyDtoHAsync, chb->host_buf, chb->gpu_buf, window_len, diff --git a/ptl_ips/ips_recvhdrq.c b/ptl_ips/ips_recvhdrq.c index 6c5fd07..5edf4cc 100644 --- a/ptl_ips/ips_recvhdrq.c +++ b/ptl_ips/ips_recvhdrq.c @@ -162,8 +162,11 @@ static __inline__ void _dump_invalid_pkt(struct ips_recvhdrq_event *rcv_ev) if (hfi_debug & __HFI_PKTDBG) { ips_proto_dump_frame(rcv_ev->p_hdr, HFI_MESSAGE_HDR_SIZE, "header"); - if (paylen) + if (!payload) { + _HFI_DBG("Cannot dump frame; payload is NULL\n"); + } else if (paylen) { ips_proto_dump_frame(payload, paylen, "data"); + } } } diff --git a/ptl_ips/ptl_rcvthread.c b/ptl_ips/ptl_rcvthread.c index 4adb65a..b2d1d73 100644 --- a/ptl_ips/ptl_rcvthread.c +++ b/ptl_ips/ptl_rcvthread.c @@ -96,7 +96,7 @@ struct ptl_rcvthread { * stored to provide hints during a cuda failure * due to a null cuda context. 
*/ - CUcontext ctxt; + CUcontext cu_ctxt; #endif /* @@ -124,7 +124,7 @@ psm2_error_t ips_ptl_rcvthread_init(ptl_t *ptl_gen, struct ips_recvhdrq *recvq) #ifdef PSM_CUDA if (PSMI_IS_CUDA_ENABLED) - PSMI_CUDA_CALL(cuCtxGetCurrent, &ctxt); + PSMI_CUDA_CALL(cuCtxGetCurrent, &cu_ctxt); #endif if (psmi_hal_has_sw_status(PSM_HAL_PSMI_RUNTIME_RTS_RX_THREAD) && @@ -347,8 +347,8 @@ void *ips_ptl_pollintr(void *rcvthreadc) psm2_error_t err; #ifdef PSM_CUDA - if (PSMI_IS_CUDA_ENABLED && ctxt != NULL) - PSMI_CUDA_CALL(cuCtxSetCurrent, ctxt); + if (PSMI_IS_CUDA_ENABLED && cu_ctxt != NULL) + PSMI_CUDA_CALL(cuCtxSetCurrent, cu_ctxt); #endif PSM2_LOG_MSG("entering"); diff --git a/rpm_release_extension b/rpm_release_extension old mode 100644 new mode 100755 index 725a5ba..ae8563e --- a/rpm_release_extension +++ b/rpm_release_extension @@ -1 +1 @@ -185 +200nccl
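
Note on the recurring pattern in this patch: every CUDA resource that used
to be created eagerly at endpoint-open time (ctrl->cuda_pio_buffer,
proto->cudastream_send, protoexp->cudastream_recv, and each hostbuf's
host_buf and copy_status event) is now initialized to NULL and allocated on
first use. That is what makes the library usable from NCCL, which may open
the PSM2 endpoint before any CUDA context is current on the calling thread;
by the time a transfer is actually issued a context exists (or
check_set_cuda_ctxt() re-binds one), so the deferred driver call can
succeed. Below is a minimal standalone sketch of the idiom, not the PSM2
code itself: CHECK() and lazy_send_stream() are hypothetical names standing
in for PSMI_CUDA_CALL() and the lazy cuStreamCreate guards above.

#include <cuda.h>
#include <stdio.h>
#include <stdlib.h>

/* Hypothetical error-check wrapper, standing in for PSMI_CUDA_CALL(). */
#define CHECK(call) do { \
	CUresult e_ = (call); \
	if (e_ != CUDA_SUCCESS) { \
		fprintf(stderr, "%s failed: %d\n", #call, (int)e_); \
		exit(1); \
	} \
} while (0)

static CUstream send_stream; /* NULL until first use, like proto->cudastream_send */

static CUstream lazy_send_stream(void)
{
	/* Deferred creation: this runs only once a CUDA context is current,
	 * so it cannot fail the way an eager create at psm2_ep_open() could. */
	if (send_stream == NULL)
		CHECK(cuStreamCreate(&send_stream, CU_STREAM_NON_BLOCKING));
	return send_stream;
}

int main(void)
{
	CUdevice dev;
	CUcontext ctx;
	CHECK(cuInit(0));
	CHECK(cuDeviceGet(&dev, 0));
	CHECK(cuCtxCreate(&ctx, 0, dev)); /* the context an app or NCCL would own */
	CUstream s = lazy_send_stream();  /* created here, not at library startup */
	CHECK(cuStreamSynchronize(s));
	CHECK(cuStreamDestroy(s));
	CHECK(cuCtxDestroy(ctx));
	return 0;
}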
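
A second behavioral change worth calling out: gpu_p2p_supported is no longer
a flag computed during psmi_cuda_initialize() but a lazily evaluated function
whose cached result is a bitmask, with bit dev set when the current device
has peer access to CUDA device dev (a device always sets its own bit). That
layout is what amsh_mq_send_inner() tests with
(1 << gpuid) & gpu_p2p_supported(). A small illustrative decoder for that
bitmask follows; print_p2p_mask() is a hypothetical helper, not part of
PSM2.

#include <stdio.h>

/* Interpret a bitmask laid out like _gpu_p2p_supported:
 * bit dev set => peer access from the current device to device dev. */
static void print_p2p_mask(int mask, int num_devices)
{
	int dev;
	for (dev = 0; dev < num_devices; dev++)
		printf("CUDA device %d: %s\n", dev,
		       (mask & (1 << dev)) ? "peer-accessible" : "no P2P path");
}

int main(void)
{
	/* Example: of 4 devices, only 0 and 2 are reachable. */
	print_p2p_mask((1 << 0) | (1 << 2), 4);
	return 0;
}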