PSM2 NCCL support

This patch adds all outstanding bug fixes planned for the IFS 10.11 release and adds beta support for the NVIDIA Collective Communications Library (NCCL). Signed-off-by: Michael Heinz <[email protected]>
cornelisnetworks · Feb 12, 2021 · 9ce43be · 9ce43be
1 parent 7a33bed
commit 9ce43be
Show file tree

Hide file tree

Showing 20 changed files with 281 additions and 223 deletions.
diff --git a/COMMIT b/COMMIT
@@ -1 +1 @@
-30c52a0fd155774e18cc06328a1ba83c2a6a8104
+442d97d67dd0667a8bb11a99f6b10dbfb12fec63
diff --git a/CONTRIBUTORS b/CONTRIBUTORS
@@ -13,3 +13,4 @@ Dmitry (dmitrygx on github.com)
 Florian Weimer (fweimer on github.com)
 Jonas Hahnfeld (hahnjo on github.com)
 Tom Stellard (tstellar on github.com)
+Chuck Cranor (chuckcranor on github.com)
diff --git a/Makefile b/Makefile
@@ -129,7 +129,7 @@ INCLUDES += -I$(top_srcdir) -I$(top_srcdir)/ptl_ips -I$(OUTDIR)
 
 ifneq (x86_64,$(arch))
    ifneq (i386,$(arch))
-      $(error Unsupported architecture $(arch))
+      anerr := $(error Unsupported architecture $(arch))
    endif
 endif
 
@@ -164,7 +164,7 @@ nthreads := $(shell echo $$(( `nproc` * 2 )) )
 # The DISTRO variable is used subsequently for variable
 # behaviors of the 3 distros.
 
-DISTRO := $(shell . /etc/os-release; if [[ "$$ID" == "sle_hpc" ]]; then ID="sles"; fi; echo $$ID)
+DISTRO := $(shell . /etc/os-release; if [ "$$ID" = "sle_hpc" ]; then ID="sles"; fi; echo $$ID)
 
 # By default the following two variables have the following values:
 LIBPSM2_COMPAT_CONF_DIR := /etc

diff --git a/buildflags.mak b/buildflags.mak
@@ -118,13 +118,13 @@ ifneq (icc,${CC})
 		RET := $(shell echo "int main() {}" | ${CC} ${MAVX2} -E -dM -xc - 2>&1 | grep -q AVX2 ; echo $$?)
 	else
 		RET := $(shell echo "int main() {}" | ${CC} ${MAVX2} -E -dM -xc - 2>&1 | grep -q AVX ; echo $$?)
-		$(warning ***NOTE TO USER**** Disabling AVX2 will harm performance)
+                anerr := $(warning ***NOTE TO USER**** Disabling AVX2 will harm performance)
 	endif
 
 	ifeq (0,${RET})
 		BASECFLAGS += ${MAVX2}
 	else
-		$(error Compiler does not support ${MAVX2} )
+		anerr := $(error Compiler does not support ${MAVX2} )
 	endif
 else
 		BASECFLAGS += ${MAVX2}
@@ -138,7 +138,7 @@ ifneq (,${PSM_AVX512})
 		ifeq (0,${RET})
 			BASECFLAGS += -mavx512f
 		else
-			$(error Compiler does not support AVX512 )
+			anerr := $(error Compiler does not support AVX512 )
 		endif
 		BASECFLAGS += -DPSM_AVX512
 	endif
@@ -203,7 +203,7 @@ else
 		BASECFLAGS += -funwind-tables -Wno-strict-aliasing -Wformat-security
 	else
 		ifneq (${CCARCH},gcc4)
-			$(error Unknown compiler arch "${CCARCH}")
+			anerr := $(error Unknown compiler arch "${CCARCH}")
 		endif # gcc4
 	endif # gcc
 endif # icc

diff --git a/psm.c b/psm.c
@@ -92,8 +92,8 @@ uint32_t psmi_cpu_model;
 #ifdef PSM_CUDA
 int is_cuda_enabled;
 int is_gdr_copy_enabled;
-int device_support_gpudirect;
-int gpu_p2p_supported = 0;
+int _device_support_gpudirect = -1; // -1 indicates "unset". See device_support_gpudirect().
+int _gpu_p2p_supported = -1; // -1 indicates "unset". See gpu_p2p_supported().
 int my_gpu_device = 0;
 int cuda_lib_version;
 int is_driver_gpudirect_enabled;
@@ -116,6 +116,7 @@ CUresult (*psmi_cuDriverGetVersion)(int* driverVersion);
 CUresult (*psmi_cuDeviceGetCount)(int* count);
 CUresult (*psmi_cuStreamCreate)(CUstream* phStream, unsigned int Flags);
 CUresult (*psmi_cuStreamDestroy)(CUstream phStream);
+CUresult (*psmi_cuStreamSynchronize)(CUstream phStream);
 CUresult (*psmi_cuEventCreate)(CUevent* phEvent, unsigned int Flags);
 CUresult (*psmi_cuEventDestroy)(CUevent hEvent);
 CUresult (*psmi_cuEventQuery)(CUevent hEvent);
@@ -217,6 +218,7 @@ int psmi_cuda_lib_load()
 	PSMI_CUDA_DLSYM(psmi_cuda_lib, cuDeviceGetCount);
 	PSMI_CUDA_DLSYM(psmi_cuda_lib, cuStreamCreate);
 	PSMI_CUDA_DLSYM(psmi_cuda_lib, cuStreamDestroy);
+	PSMI_CUDA_DLSYM(psmi_cuda_lib, cuStreamSynchronize);
 	PSMI_CUDA_DLSYM(psmi_cuda_lib, cuEventCreate);
 	PSMI_CUDA_DLSYM(psmi_cuda_lib, cuEventDestroy);
 	PSMI_CUDA_DLSYM(psmi_cuda_lib, cuEventQuery);
@@ -251,7 +253,6 @@ int psmi_cuda_lib_load()
 int psmi_cuda_initialize()
 {
 	psm2_error_t err = PSM2_OK;
-	int num_devices, dev;
 
 	PSM2_LOG_MSG("entering");
 	_HFI_VDBG("Enabling CUDA support.\n");
@@ -262,77 +263,6 @@ int psmi_cuda_initialize()
 
 	PSMI_CUDA_CALL(cuInit, 0);
 
-	/* Check if CUDA context is available. If not, we are not allowed to
-	 * launch any CUDA API calls */
-	PSMI_CUDA_CALL(cuCtxGetCurrent, &ctxt);
-	if (ctxt == NULL) {
-		_HFI_INFO("Unable to find active CUDA context\n");
-		is_cuda_enabled = 0;
-		err = PSM2_OK;
-		return err;
-	}
-
-	CUdevice current_device;
-	CUcontext primary_ctx;
-	PSMI_CUDA_CALL(cuCtxGetDevice, &current_device);
-	int is_ctx_active;
-	unsigned ctx_flags;
-	PSMI_CUDA_CALL(cuDevicePrimaryCtxGetState, current_device, &ctx_flags,
-			&is_ctx_active);
-	if (!is_ctx_active) {
-		/* There is an issue where certain CUDA API calls create
-		 * contexts but does not make it active which cause the
-		 * driver API call to fail with error 709 */
-		PSMI_CUDA_CALL(cuDevicePrimaryCtxRetain, &primary_ctx,
-				current_device);
-		is_cuda_primary_context_retain = 1;
-	}
-
-	/* Check if all devices support Unified Virtual Addressing. */
-	PSMI_CUDA_CALL(cuDeviceGetCount, &num_devices);
-
-	device_support_gpudirect = 1;
-
-	for (dev = 0; dev < num_devices; dev++) {
-		CUdevice device;
-		PSMI_CUDA_CALL(cuDeviceGet, &device, dev);
-		int unifiedAddressing;
-		PSMI_CUDA_CALL(cuDeviceGetAttribute,
-				&unifiedAddressing,
-				CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING,
-				device);
-
-		if (unifiedAddressing !=1) {
-			_HFI_ERROR("CUDA device %d does not support Unified Virtual Addressing.\n", dev);
-			goto fail;
-		}
-
-		int major;
-		PSMI_CUDA_CALL(cuDeviceGetAttribute,
-				&major,
-				CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR,
-				device);
-		if (major < 3) {
-			device_support_gpudirect = 0;
-			_HFI_INFO("CUDA device %d does not support GPUDirect RDMA (Non-fatal error)\n", dev);
-		}
-
-		if (device != current_device) {
-			int canAccessPeer = 0;
-			PSMI_CUDA_CALL(cuDeviceCanAccessPeer, &canAccessPeer,
-					current_device, device);
-
-			if (canAccessPeer != 1)
-				_HFI_DBG("CUDA device %d does not support P2P from current device (Non-fatal error)\n", dev);
-			else
-				gpu_p2p_supported |= (1 << device);
-		} else {
-			/* Always support p2p on the same GPU */
-			my_gpu_device = device;
-			gpu_p2p_supported |= (1 << device);
-		}
-	}
-
 	union psmi_envvar_val env_enable_gdr_copy;
 	psmi_getenv("PSM2_GDRCOPY",
 				"Enable (set envvar to 1) for gdr copy support in PSM (Enabled by default)",

diff --git a/psm_hal_gen1/psm_hal_gen1_spio.c b/psm_hal_gen1/psm_hal_gen1_spio.c
@@ -181,10 +181,7 @@ ips_spio_init(const struct psmi_context *context, struct ptl *ptl,
 
 
 #ifdef PSM_CUDA
-	if (PSMI_IS_CUDA_ENABLED) {
-		PSMI_CUDA_CALL(cuMemHostAlloc, (void **) &ctrl->cuda_pio_buffer,
-				MAX_CUDA_MTU, CU_MEMHOSTALLOC_PORTABLE);
-	}
+	ctrl->cuda_pio_buffer = NULL;
 #endif
 
 	_HFI_PRDBG("ips_spio_init() done\n");
@@ -195,7 +192,7 @@ ips_spio_init(const struct psmi_context *context, struct ptl *ptl,
 static PSMI_HAL_INLINE psm2_error_t ips_spio_fini(struct ips_spio *ctrl)
 {
 #ifdef PSM_CUDA
-	if (PSMI_IS_CUDA_ENABLED)
+	if (PSMI_IS_CUDA_ENABLED && ctrl->cuda_pio_buffer != NULL)
 		PSMI_CUDA_CALL(cuMemFreeHost, (void *) ctrl->cuda_pio_buffer);
 #endif
 	spio_report_stall(ctrl, get_cycles(), 0ULL);
@@ -810,6 +807,10 @@ ips_spio_transfer_frame(struct ips_proto *proto, struct ips_flow *flow,
 	/* Write to PIO: other blocks of payload */
 #ifdef PSM_CUDA
 	if (is_cuda_payload) {
+		if (ctrl->cuda_pio_buffer == NULL) {
+			PSMI_CUDA_CALL(cuMemHostAlloc, (void **) &ctrl->cuda_pio_buffer,
+							MAX_CUDA_MTU, CU_MEMHOSTALLOC_PORTABLE);
+		}
 		/* Since the implementation of cuMemcpy is unknown,
 		   and the HFI specifies several conditions for how PIO
 		   writes must occur, for safety reasons we should not assume

diff --git a/psm_mpool.c b/psm_mpool.c
@@ -101,7 +101,6 @@ struct mpool {
 
 #ifdef PSM_CUDA
 	alloc_dealloc_callback_fn_t mp_alloc_dealloc_cb;
-	void *mp_alloc_dealloc_cb_context;
 #endif
 };
 
@@ -230,7 +229,7 @@ psmi_mpool_create_for_cuda(size_t obj_size, uint32_t num_obj_per_chunk,
 			   uint32_t num_obj_max_total, int flags,
 			   psmi_memtype_t statstype,
 			   non_empty_callback_fn_t cb, void *context,
-			   alloc_dealloc_callback_fn_t ad_cb, void *ad_context)
+			   alloc_dealloc_callback_fn_t ad_cb)
 {
 	mpool_t mp;
 
@@ -242,7 +241,6 @@ psmi_mpool_create_for_cuda(size_t obj_size, uint32_t num_obj_per_chunk,
 		return NULL;
 
 	mp->mp_alloc_dealloc_cb = ad_cb;
-	mp->mp_alloc_dealloc_cb_context = ad_context;
 
 	if (psmi_mpool_allocate_chunk(mp) != PSM2_OK) {
 		psmi_mpool_destroy(mp);
@@ -418,7 +416,6 @@ void psmi_mpool_chunk_dealloc(mpool_t mp, int idx)
 	int j;
 	for (j = 0; j < mp->mp_num_obj_per_chunk; j++)
 		mp->mp_alloc_dealloc_cb(0 /* is not alloc */,
-					mp->mp_alloc_dealloc_cb_context,
 					((void *) mp->mp_elm_vector[idx]) +
 					j * mp->mp_elm_size +
 					sizeof(struct mpool_element));
@@ -509,7 +506,6 @@ static int psmi_mpool_allocate_chunk(mpool_t mp)
 #ifdef PSM_CUDA
 		if (mp->mp_alloc_dealloc_cb)
 			mp->mp_alloc_dealloc_cb(1 /* is alloc */,
-						mp->mp_alloc_dealloc_cb_context,
 						chunk + i * mp->mp_elm_size +
 						sizeof(struct mpool_element));
 #endif

diff --git a/psm_mpool.h b/psm_mpool.h
@@ -70,8 +70,7 @@
 
 typedef struct mpool *mpool_t;
 typedef void (*non_empty_callback_fn_t) (void *context);
-typedef void (*alloc_dealloc_callback_fn_t) (int is_alloc, void *context,
-					     void *chunk);
+typedef void (*alloc_dealloc_callback_fn_t) (int is_alloc, void *chunk);
 
 mpool_t
 MOCKABLE(psmi_mpool_create)(size_t obj_size, uint32_t num_obj_per_chunk,
@@ -84,8 +83,7 @@ mpool_t psmi_mpool_create_for_cuda(size_t obj_size, uint32_t num_obj_per_chunk,
 				   uint32_t num_obj_max_total, int flags,
 				   psmi_memtype_t statstype,
 				   non_empty_callback_fn_t cb, void *context,
-				   alloc_dealloc_callback_fn_t ad_cb,
-				   void *ad_context);
+				   alloc_dealloc_callback_fn_t ad_cb);
 
 void psmi_mpool_destroy(mpool_t mp);
 

diff --git a/psm_mq_internal.h b/psm_mq_internal.h
@@ -249,7 +249,6 @@ struct psm2_mq_req {
 	uint32_t prefetch_send_msgoff;
 	int cuda_hostbuf_used;
 	CUipcMemHandle cuda_ipc_handle;
-	CUevent cuda_ipc_event;
 	uint8_t cuda_ipc_handle_attached;
 	uint32_t cuda_ipc_offset;
 	/*

diff --git a/psm_mq_utils.c b/psm_mq_utils.c
@@ -114,19 +114,6 @@ psm2_mq_req_t MOCKABLE(psmi_mq_req_alloc)(psm2_mq_t mq, uint32_t type)
 }
 MOCK_DEF_EPILOGUE(psmi_mq_req_alloc);
 
-#ifdef PSM_CUDA
-void psmi_cuda_recvreq_alloc_func(int is_alloc, void* context, void* obj) {
-	psm2_mq_req_t recvreq = (psm2_mq_req_t)obj;
-	if (PSMI_IS_CUDA_ENABLED) {
-		if (is_alloc)
-			PSMI_CUDA_CALL(cuEventCreate, &recvreq->cuda_ipc_event, CU_EVENT_DEFAULT);
-		else
-			PSMI_CUDA_CALL(cuEventDestroy, recvreq->cuda_ipc_event);
-	}
-	return;
-}
-#endif
-
 psm2_error_t psmi_mq_req_init(psm2_mq_t mq)
 {
 	psm2_mq_req_t warmup_req;
@@ -165,37 +152,13 @@ psm2_error_t psmi_mq_req_init(psm2_mq_t mq)
 		if ((err =
 		     psmi_parse_mpool_env(mq, 0, &rlim, &maxsz, &chunksz)))
 			goto fail;
-		/* Have a callback function for receive req mpool which creates
-		 * and destroy events.
-		 */
-#ifdef PSM_CUDA
-		if (PSMI_IS_CUDA_ENABLED) {
-			if ((mq->rreq_pool =
-	                     psmi_mpool_create_for_cuda(sizeof(struct psm2_mq_req), chunksz,
-                                       maxsz, 0, DESCRIPTORS, NULL,
-                                       NULL, psmi_cuda_recvreq_alloc_func, NULL)) == NULL) {
-				err = PSM2_NO_MEMORY;
-				goto fail;
-			}
-		}
-		else {
-			if ((mq->rreq_pool =
-				psmi_mpool_create(sizeof(struct psm2_mq_req), chunksz,
-                                       maxsz, 0, DESCRIPTORS, NULL,
-                                       NULL)) == NULL) {
-				err = PSM2_NO_MEMORY;
-				goto fail;
-			}
-		}
-#else
 		if ((mq->rreq_pool =
 			psmi_mpool_create(sizeof(struct psm2_mq_req), chunksz,
 				       maxsz, 0, DESCRIPTORS, NULL,
 				       NULL)) == NULL) {
 			err = PSM2_NO_MEMORY;
 			goto fail;
 		}
-#endif
 	}
 
 	/* Warm up the allocators */