Skip to content

Commit

Permalink
PSM2 NCCL support
Browse files Browse the repository at this point in the history
This patch adds all outstanding bug fixes planned for the IFS 10.11 release
and adds beta support for the NVIDIA Collective Communications Library (NCCL).

Signed-off-by: Michael Heinz <[email protected]>
  • Loading branch information
Michael Heinz committed Feb 12, 2021
1 parent 7a33bed commit 9ce43be
Show file tree
Hide file tree
Showing 20 changed files with 281 additions and 223 deletions.
2 changes: 1 addition & 1 deletion COMMIT
100644 → 100755
Original file line number Diff line number Diff line change
@@ -1 +1 @@
30c52a0fd155774e18cc06328a1ba83c2a6a8104
442d97d67dd0667a8bb11a99f6b10dbfb12fec63
1 change: 1 addition & 0 deletions CONTRIBUTORS
Original file line number Diff line number Diff line change
Expand Up @@ -13,3 +13,4 @@ Dmitry (dmitrygx on github.com)
Florian Weimer (fweimer on github.com)
Jonas Hahnfeld (hahnjo on github.com)
Tom Stellard (tstellar on github.com)
Chuck Cranor (chuckcranor on github.com)
4 changes: 2 additions & 2 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -129,7 +129,7 @@ INCLUDES += -I$(top_srcdir) -I$(top_srcdir)/ptl_ips -I$(OUTDIR)
ifneq (x86_64,$(arch))
ifneq (i386,$(arch))
$(error Unsupported architecture $(arch))
anerr := $(error Unsupported architecture $(arch))
endif
endif
Expand Down Expand Up @@ -164,7 +164,7 @@ nthreads := $(shell echo $$(( `nproc` * 2 )) )
# The DISTRO variable is used subsequently for variable
# behaviors of the 3 distros.
DISTRO := $(shell . /etc/os-release; if [[ "$$ID" == "sle_hpc" ]]; then ID="sles"; fi; echo $$ID)
DISTRO := $(shell . /etc/os-release; if [ "$$ID" = "sle_hpc" ]; then ID="sles"; fi; echo $$ID)
# By default the following two variables have the following values:
LIBPSM2_COMPAT_CONF_DIR := /etc
Expand Down
8 changes: 4 additions & 4 deletions buildflags.mak
Original file line number Diff line number Diff line change
Expand Up @@ -118,13 +118,13 @@ ifneq (icc,${CC})
RET := $(shell echo "int main() {}" | ${CC} ${MAVX2} -E -dM -xc - 2>&1 | grep -q AVX2 ; echo $$?)
else
RET := $(shell echo "int main() {}" | ${CC} ${MAVX2} -E -dM -xc - 2>&1 | grep -q AVX ; echo $$?)
$(warning ***NOTE TO USER**** Disabling AVX2 will harm performance)
anerr := $(warning ***NOTE TO USER**** Disabling AVX2 will harm performance)
endif

ifeq (0,${RET})
BASECFLAGS += ${MAVX2}
else
$(error Compiler does not support ${MAVX2} )
anerr := $(error Compiler does not support ${MAVX2} )
endif
else
BASECFLAGS += ${MAVX2}
Expand All @@ -138,7 +138,7 @@ ifneq (,${PSM_AVX512})
ifeq (0,${RET})
BASECFLAGS += -mavx512f
else
$(error Compiler does not support AVX512 )
anerr := $(error Compiler does not support AVX512 )
endif
BASECFLAGS += -DPSM_AVX512
endif
Expand Down Expand Up @@ -203,7 +203,7 @@ else
BASECFLAGS += -funwind-tables -Wno-strict-aliasing -Wformat-security
else
ifneq (${CCARCH},gcc4)
$(error Unknown compiler arch "${CCARCH}")
anerr := $(error Unknown compiler arch "${CCARCH}")
endif # gcc4
endif # gcc
endif # icc
Expand Down
78 changes: 4 additions & 74 deletions psm.c
Original file line number Diff line number Diff line change
Expand Up @@ -92,8 +92,8 @@ uint32_t psmi_cpu_model;
#ifdef PSM_CUDA
int is_cuda_enabled;
int is_gdr_copy_enabled;
int device_support_gpudirect;
int gpu_p2p_supported = 0;
int _device_support_gpudirect = -1; // -1 indicates "unset". See device_support_gpudirect().
int _gpu_p2p_supported = -1; // -1 indicates "unset". see gpu_p2p_supported().
int my_gpu_device = 0;
int cuda_lib_version;
int is_driver_gpudirect_enabled;
Expand All @@ -116,6 +116,7 @@ CUresult (*psmi_cuDriverGetVersion)(int* driverVersion);
CUresult (*psmi_cuDeviceGetCount)(int* count);
CUresult (*psmi_cuStreamCreate)(CUstream* phStream, unsigned int Flags);
CUresult (*psmi_cuStreamDestroy)(CUstream phStream);
CUresult (*psmi_cuStreamSynchronize)(CUstream phStream);
CUresult (*psmi_cuEventCreate)(CUevent* phEvent, unsigned int Flags);
CUresult (*psmi_cuEventDestroy)(CUevent hEvent);
CUresult (*psmi_cuEventQuery)(CUevent hEvent);
Expand Down Expand Up @@ -217,6 +218,7 @@ int psmi_cuda_lib_load()
PSMI_CUDA_DLSYM(psmi_cuda_lib, cuDeviceGetCount);
PSMI_CUDA_DLSYM(psmi_cuda_lib, cuStreamCreate);
PSMI_CUDA_DLSYM(psmi_cuda_lib, cuStreamDestroy);
PSMI_CUDA_DLSYM(psmi_cuda_lib, cuStreamSynchronize);
PSMI_CUDA_DLSYM(psmi_cuda_lib, cuEventCreate);
PSMI_CUDA_DLSYM(psmi_cuda_lib, cuEventDestroy);
PSMI_CUDA_DLSYM(psmi_cuda_lib, cuEventQuery);
Expand Down Expand Up @@ -251,7 +253,6 @@ int psmi_cuda_lib_load()
int psmi_cuda_initialize()
{
psm2_error_t err = PSM2_OK;
int num_devices, dev;

PSM2_LOG_MSG("entering");
_HFI_VDBG("Enabling CUDA support.\n");
Expand All @@ -262,77 +263,6 @@ int psmi_cuda_initialize()

PSMI_CUDA_CALL(cuInit, 0);

/* Check if CUDA context is available. If not, we are not allowed to
* launch any CUDA API calls */
PSMI_CUDA_CALL(cuCtxGetCurrent, &ctxt);
if (ctxt == NULL) {
_HFI_INFO("Unable to find active CUDA context\n");
is_cuda_enabled = 0;
err = PSM2_OK;
return err;
}

CUdevice current_device;
CUcontext primary_ctx;
PSMI_CUDA_CALL(cuCtxGetDevice, &current_device);
int is_ctx_active;
unsigned ctx_flags;
PSMI_CUDA_CALL(cuDevicePrimaryCtxGetState, current_device, &ctx_flags,
&is_ctx_active);
if (!is_ctx_active) {
/* There is an issue where certain CUDA API calls create a
* context but do not make it active, which causes the
* driver API call to fail with error 709 */
PSMI_CUDA_CALL(cuDevicePrimaryCtxRetain, &primary_ctx,
current_device);
is_cuda_primary_context_retain = 1;
}

/* Check if all devices support Unified Virtual Addressing. */
PSMI_CUDA_CALL(cuDeviceGetCount, &num_devices);

device_support_gpudirect = 1;

for (dev = 0; dev < num_devices; dev++) {
CUdevice device;
PSMI_CUDA_CALL(cuDeviceGet, &device, dev);
int unifiedAddressing;
PSMI_CUDA_CALL(cuDeviceGetAttribute,
&unifiedAddressing,
CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING,
device);

if (unifiedAddressing !=1) {
_HFI_ERROR("CUDA device %d does not support Unified Virtual Addressing.\n", dev);
goto fail;
}

int major;
PSMI_CUDA_CALL(cuDeviceGetAttribute,
&major,
CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR,
device);
if (major < 3) {
device_support_gpudirect = 0;
_HFI_INFO("CUDA device %d does not support GPUDirect RDMA (Non-fatal error)\n", dev);
}

if (device != current_device) {
int canAccessPeer = 0;
PSMI_CUDA_CALL(cuDeviceCanAccessPeer, &canAccessPeer,
current_device, device);

if (canAccessPeer != 1)
_HFI_DBG("CUDA device %d does not support P2P from current device (Non-fatal error)\n", dev);
else
gpu_p2p_supported |= (1 << device);
} else {
/* Always support p2p on the same GPU */
my_gpu_device = device;
gpu_p2p_supported |= (1 << device);
}
}

union psmi_envvar_val env_enable_gdr_copy;
psmi_getenv("PSM2_GDRCOPY",
"Enable (set envvar to 1) for gdr copy support in PSM (Enabled by default)",
Expand Down
11 changes: 6 additions & 5 deletions psm_hal_gen1/psm_hal_gen1_spio.c
Original file line number Diff line number Diff line change
Expand Up @@ -181,10 +181,7 @@ ips_spio_init(const struct psmi_context *context, struct ptl *ptl,


#ifdef PSM_CUDA
if (PSMI_IS_CUDA_ENABLED) {
PSMI_CUDA_CALL(cuMemHostAlloc, (void **) &ctrl->cuda_pio_buffer,
MAX_CUDA_MTU, CU_MEMHOSTALLOC_PORTABLE);
}
ctrl->cuda_pio_buffer = NULL;
#endif

_HFI_PRDBG("ips_spio_init() done\n");
Expand All @@ -195,7 +192,7 @@ ips_spio_init(const struct psmi_context *context, struct ptl *ptl,
static PSMI_HAL_INLINE psm2_error_t ips_spio_fini(struct ips_spio *ctrl)
{
#ifdef PSM_CUDA
if (PSMI_IS_CUDA_ENABLED)
if (PSMI_IS_CUDA_ENABLED && ctrl->cuda_pio_buffer != NULL)
PSMI_CUDA_CALL(cuMemFreeHost, (void *) ctrl->cuda_pio_buffer);
#endif
spio_report_stall(ctrl, get_cycles(), 0ULL);
Expand Down Expand Up @@ -810,6 +807,10 @@ ips_spio_transfer_frame(struct ips_proto *proto, struct ips_flow *flow,
/* Write to PIO: other blocks of payload */
#ifdef PSM_CUDA
if (is_cuda_payload) {
if (ctrl->cuda_pio_buffer == NULL) {
PSMI_CUDA_CALL(cuMemHostAlloc, (void **) &ctrl->cuda_pio_buffer,
MAX_CUDA_MTU, CU_MEMHOSTALLOC_PORTABLE);
}
/* Since the implementation of cuMemcpy is unknown,
and the HFI specifies several conditions for how PIO
writes must occur, for safety reasons we should not assume
Expand Down
6 changes: 1 addition & 5 deletions psm_mpool.c
Original file line number Diff line number Diff line change
Expand Up @@ -101,7 +101,6 @@ struct mpool {

#ifdef PSM_CUDA
alloc_dealloc_callback_fn_t mp_alloc_dealloc_cb;
void *mp_alloc_dealloc_cb_context;
#endif
};

Expand Down Expand Up @@ -230,7 +229,7 @@ psmi_mpool_create_for_cuda(size_t obj_size, uint32_t num_obj_per_chunk,
uint32_t num_obj_max_total, int flags,
psmi_memtype_t statstype,
non_empty_callback_fn_t cb, void *context,
alloc_dealloc_callback_fn_t ad_cb, void *ad_context)
alloc_dealloc_callback_fn_t ad_cb)
{
mpool_t mp;

Expand All @@ -242,7 +241,6 @@ psmi_mpool_create_for_cuda(size_t obj_size, uint32_t num_obj_per_chunk,
return NULL;

mp->mp_alloc_dealloc_cb = ad_cb;
mp->mp_alloc_dealloc_cb_context = ad_context;

if (psmi_mpool_allocate_chunk(mp) != PSM2_OK) {
psmi_mpool_destroy(mp);
Expand Down Expand Up @@ -418,7 +416,6 @@ void psmi_mpool_chunk_dealloc(mpool_t mp, int idx)
int j;
for (j = 0; j < mp->mp_num_obj_per_chunk; j++)
mp->mp_alloc_dealloc_cb(0 /* is not alloc */,
mp->mp_alloc_dealloc_cb_context,
((void *) mp->mp_elm_vector[idx]) +
j * mp->mp_elm_size +
sizeof(struct mpool_element));
Expand Down Expand Up @@ -509,7 +506,6 @@ static int psmi_mpool_allocate_chunk(mpool_t mp)
#ifdef PSM_CUDA
if (mp->mp_alloc_dealloc_cb)
mp->mp_alloc_dealloc_cb(1 /* is alloc */,
mp->mp_alloc_dealloc_cb_context,
chunk + i * mp->mp_elm_size +
sizeof(struct mpool_element));
#endif
Expand Down
6 changes: 2 additions & 4 deletions psm_mpool.h
Original file line number Diff line number Diff line change
Expand Up @@ -70,8 +70,7 @@

typedef struct mpool *mpool_t;
typedef void (*non_empty_callback_fn_t) (void *context);
typedef void (*alloc_dealloc_callback_fn_t) (int is_alloc, void *context,
void *chunk);
typedef void (*alloc_dealloc_callback_fn_t) (int is_alloc, void *chunk);

mpool_t
MOCKABLE(psmi_mpool_create)(size_t obj_size, uint32_t num_obj_per_chunk,
Expand All @@ -84,8 +83,7 @@ mpool_t psmi_mpool_create_for_cuda(size_t obj_size, uint32_t num_obj_per_chunk,
uint32_t num_obj_max_total, int flags,
psmi_memtype_t statstype,
non_empty_callback_fn_t cb, void *context,
alloc_dealloc_callback_fn_t ad_cb,
void *ad_context);
alloc_dealloc_callback_fn_t ad_cb);

void psmi_mpool_destroy(mpool_t mp);

Expand Down
1 change: 0 additions & 1 deletion psm_mq_internal.h
Original file line number Diff line number Diff line change
Expand Up @@ -249,7 +249,6 @@ struct psm2_mq_req {
uint32_t prefetch_send_msgoff;
int cuda_hostbuf_used;
CUipcMemHandle cuda_ipc_handle;
CUevent cuda_ipc_event;
uint8_t cuda_ipc_handle_attached;
uint32_t cuda_ipc_offset;
/*
Expand Down
37 changes: 0 additions & 37 deletions psm_mq_utils.c
Original file line number Diff line number Diff line change
Expand Up @@ -114,19 +114,6 @@ psm2_mq_req_t MOCKABLE(psmi_mq_req_alloc)(psm2_mq_t mq, uint32_t type)
}
MOCK_DEF_EPILOGUE(psmi_mq_req_alloc);

#ifdef PSM_CUDA
/*
 * Alloc/dealloc callback for request objects that carry a CUDA IPC event.
 *
 * is_alloc: non-zero to create the event for this object, zero to destroy it.
 * context:  unused by this callback.
 * obj:      the object being set up or torn down; treated as a psm2_mq_req_t.
 *
 * Does nothing when CUDA support is not enabled.
 */
void psmi_cuda_recvreq_alloc_func(int is_alloc, void* context, void* obj) {
	psm2_mq_req_t req = (psm2_mq_req_t)obj;

	/* No CUDA: there is no event to create or destroy. */
	if (!(PSMI_IS_CUDA_ENABLED))
		return;

	if (is_alloc) {
		PSMI_CUDA_CALL(cuEventCreate, &req->cuda_ipc_event, CU_EVENT_DEFAULT);
	} else {
		PSMI_CUDA_CALL(cuEventDestroy, req->cuda_ipc_event);
	}
}
#endif

psm2_error_t psmi_mq_req_init(psm2_mq_t mq)
{
psm2_mq_req_t warmup_req;
Expand Down Expand Up @@ -165,37 +152,13 @@ psm2_error_t psmi_mq_req_init(psm2_mq_t mq)
if ((err =
psmi_parse_mpool_env(mq, 0, &rlim, &maxsz, &chunksz)))
goto fail;
/* Have a callback function for the receive req mpool which creates
* and destroys events.
*/
#ifdef PSM_CUDA
if (PSMI_IS_CUDA_ENABLED) {
if ((mq->rreq_pool =
psmi_mpool_create_for_cuda(sizeof(struct psm2_mq_req), chunksz,
maxsz, 0, DESCRIPTORS, NULL,
NULL, psmi_cuda_recvreq_alloc_func, NULL)) == NULL) {
err = PSM2_NO_MEMORY;
goto fail;
}
}
else {
if ((mq->rreq_pool =
psmi_mpool_create(sizeof(struct psm2_mq_req), chunksz,
maxsz, 0, DESCRIPTORS, NULL,
NULL)) == NULL) {
err = PSM2_NO_MEMORY;
goto fail;
}
}
#else
if ((mq->rreq_pool =
psmi_mpool_create(sizeof(struct psm2_mq_req), chunksz,
maxsz, 0, DESCRIPTORS, NULL,
NULL)) == NULL) {
err = PSM2_NO_MEMORY;
goto fail;
}
#endif
}

/* Warm up the allocators */
Expand Down
Loading

0 comments on commit 9ce43be

Please sign in to comment.