From cb5fec11d64f76f69e0145d1e609f38c7a1903d6 Mon Sep 17 00:00:00 2001 From: Sergey Lebedev Date: Tue, 13 Dec 2022 06:22:16 +0000 Subject: [PATCH 1/2] EC/CUDA: enable memops for executor --- src/components/ec/base/ucc_ec_base.h | 7 +- src/components/ec/cpu/ec_cpu.c | 3 - src/components/ec/cuda/Makefile.am | 13 +- src/components/ec/cuda/ec_cuda.c | 180 ------------------ src/components/ec/cuda/ec_cuda.h | 19 +- src/components/ec/cuda/ec_cuda_executor.c | 33 +++- .../ec/cuda/ec_cuda_executor_interruptible.c | 1 - .../cuda/ec_cuda_executor_persistent_wait.c | 63 ++++++ .../ec/cuda/kernel/ec_cuda_wait_kernel.cu | 30 ++- src/components/ec/ucc_ec.c | 19 -- src/components/ec/ucc_ec.h | 7 - src/core/ucc_coll.c | 6 +- tools/perf/ucc_pt_comm.cc | 9 +- 13 files changed, 124 insertions(+), 266 deletions(-) create mode 100644 src/components/ec/cuda/ec_cuda_executor_persistent_wait.c diff --git a/src/components/ec/base/ucc_ec_base.h b/src/components/ec/base/ucc_ec_base.h index 6eee48d3d6..db57f1da85 100644 --- a/src/components/ec/base/ucc_ec_base.h +++ b/src/components/ec/base/ucc_ec_base.h @@ -44,9 +44,6 @@ typedef struct ucc_ec_attr { } ucc_ec_attr_t; typedef struct ucc_ec_ops { - ucc_status_t (*task_post)(void *ee_context, void **ee_req); - ucc_status_t (*task_query)(void *ee_req); - ucc_status_t (*task_end)(void *ee_req); ucc_status_t (*create_event)(void **event); ucc_status_t (*destroy_event)(void *event); ucc_status_t (*event_post)(void *ee_context, void *event); @@ -59,7 +56,8 @@ typedef struct ucc_ee_executor { } ucc_ee_executor_t; enum ucc_ee_executor_params_field { - UCC_EE_EXECUTOR_PARAM_FIELD_TYPE = UCC_BIT(0), + UCC_EE_EXECUTOR_PARAM_FIELD_TYPE = UCC_BIT(0), + UCC_EE_EXECUTOR_PARAM_FIELD_TASK_TYPES = UCC_BIT(1), }; typedef enum ucc_ee_executor_task_type { @@ -74,6 +72,7 @@ typedef enum ucc_ee_executor_task_type { typedef struct ucc_ee_executor_params { uint64_t mask; ucc_ee_type_t ee_type; + uint64_t task_types; } ucc_ee_executor_params_t; #define UCC_EE_EXECUTOR_NUM_BUFS 9 diff --git a/src/components/ec/cpu/ec_cpu.c b/src/components/ec/cpu/ec_cpu.c index bf53d209d7..8d08d2365c 100644 --- a/src/components/ec/cpu/ec_cpu.c +++ b/src/components/ec/cpu/ec_cpu.c @@ -203,9 +203,6 @@ ucc_ec_cpu_t ucc_ec_cpu = { .table = ucc_ec_cpu_config_table, .size = sizeof(ucc_ec_cpu_config_t), }, - .super.ops.task_post = NULL, - .super.ops.task_query = NULL, - .super.ops.task_end = NULL, .super.ops.create_event = NULL, .super.ops.destroy_event = NULL, .super.ops.event_post = NULL, diff --git a/src/components/ec/cuda/Makefile.am b/src/components/ec/cuda/Makefile.am index ff74621ba7..3d7a862ef4 100644 --- a/src/components/ec/cuda/Makefile.am +++ b/src/components/ec/cuda/Makefile.am @@ -6,12 +6,13 @@ if HAVE_CUDA SUBDIRS = kernel sources = \ - ec_cuda.h \ - ec_cuda.c \ - ec_cuda_executor.h \ - ec_cuda_executor.c \ - ec_cuda_executor_interruptible.c \ - ec_cuda_executor_persistent.c + ec_cuda.h \ + ec_cuda.c \ + ec_cuda_executor.h \ + ec_cuda_executor.c \ + ec_cuda_executor_interruptible.c \ + ec_cuda_executor_persistent.c \ + ec_cuda_executor_persistent_wait.c module_LTLIBRARIES = libucc_ec_cuda.la libucc_ec_cuda_la_SOURCES = $(sources) diff --git a/src/components/ec/cuda/ec_cuda.c b/src/components/ec/cuda/ec_cuda.c index 6bb24fa15e..87ba3dcd3a 100644 --- a/src/components/ec/cuda/ec_cuda.c +++ b/src/components/ec/cuda/ec_cuda.c @@ -18,12 +18,6 @@ static const char *stream_task_modes[] = { [UCC_EC_CUDA_TASK_LAST] = NULL }; -static const char *task_stream_types[] = { - [UCC_EC_CUDA_USER_STREAM] = "user", - [UCC_EC_CUDA_INTERNAL_STREAM] = "ucc", - [UCC_EC_CUDA_TASK_STREAM_LAST] = NULL -}; - static ucc_config_field_t ucc_ec_cuda_config_table[] = { {"", "", NULL, ucc_offsetof(ucc_ec_cuda_config_t, super), UCC_CONFIG_TYPE_TABLE(ucc_ec_config_table)}, @@ -36,18 +30,6 @@ static ucc_config_field_t ucc_ec_cuda_config_table[] = { ucc_offsetof(ucc_ec_cuda_config_t, strm_task_mode), UCC_CONFIG_TYPE_ENUM(stream_task_modes)}, - {"TASK_STREAM", "user", - "Stream for cuda task\n" - "user - user stream provided in execution engine context\n" - "ucc - ucc library internal stream", - ucc_offsetof(ucc_ec_cuda_config_t, task_strm_type), - UCC_CONFIG_TYPE_ENUM(task_stream_types)}, - - {"STREAM_BLOCKING_WAIT", "1", - "Stream is blocked until collective operation is done", - ucc_offsetof(ucc_ec_cuda_config_t, stream_blocking_wait), - UCC_CONFIG_TYPE_UINT}, - {"EXEC_NUM_WORKERS", "1", "Number of thread blocks to use for cuda persistent executor", ucc_offsetof(ucc_ec_cuda_config_t, exec_num_workers), @@ -143,38 +125,6 @@ static ucc_mpool_ops_t ucc_ec_cuda_ee_executor_mpool_ops = { .obj_cleanup = ucc_ec_cuda_executor_chunk_cleanup, }; -static ucc_status_t ucc_ec_cuda_stream_req_mpool_chunk_malloc(ucc_mpool_t *mp, //NOLINT: mp is unused - size_t *size_p, - void ** chunk_p) -{ - ucc_status_t status; - - status = CUDA_FUNC(cudaHostAlloc((void**)chunk_p, *size_p, - cudaHostAllocMapped)); - return status; -} - -static void ucc_ec_cuda_stream_req_mpool_chunk_free(ucc_mpool_t *mp, //NOLINT: mp is unused - void * chunk) -{ - cudaFreeHost(chunk); -} - -static void ucc_ec_cuda_stream_req_init(ucc_mpool_t *mp, void *obj, void *chunk) //NOLINT: mp is unused -{ - ucc_ec_cuda_stream_request_t *req = (ucc_ec_cuda_stream_request_t*) obj; - - CUDA_FUNC(cudaHostGetDevicePointer( - (void**)(&req->dev_status), (void *)&req->status, 0)); -} - -static ucc_mpool_ops_t ucc_ec_cuda_stream_req_mpool_ops = { - .chunk_alloc = ucc_ec_cuda_stream_req_mpool_chunk_malloc, - .chunk_release = ucc_ec_cuda_stream_req_mpool_chunk_free, - .obj_init = ucc_ec_cuda_stream_req_init, - .obj_cleanup = NULL -}; - static void ucc_ec_cuda_event_init(ucc_mpool_t *mp, void *obj, void *chunk) //NOLINT: mp is unused { ucc_ec_cuda_event_t *base = (ucc_ec_cuda_event_t *) obj; @@ -196,28 +146,6 @@ static ucc_mpool_ops_t ucc_ec_cuda_event_mpool_ops = { .obj_cleanup = ucc_ec_cuda_event_cleanup, }; -ucc_status_t ucc_ec_cuda_post_kernel_stream_task(uint32_t *status, - int blocking_wait, - cudaStream_t stream); - -static ucc_status_t ucc_ec_cuda_post_driver_stream_task(uint32_t *status, - int blocking_wait, - cudaStream_t stream) -{ - CUdeviceptr status_ptr = (CUdeviceptr)status; - - if (blocking_wait) { - CUDADRV_FUNC(cuStreamWriteValue32(stream, status_ptr, - UCC_EC_CUDA_TASK_STARTED, 0)); - CUDADRV_FUNC(cuStreamWaitValue32(stream, status_ptr, - UCC_EC_CUDA_TASK_COMPLETED, - CU_STREAM_WAIT_VALUE_EQ)); - } - CUDADRV_FUNC(cuStreamWriteValue32(stream, status_ptr, - UCC_EC_CUDA_TASK_COMPLETED_ACK, 0)); - return UCC_OK; -} - static inline void ucc_ec_cuda_set_threads_nbr(int *nt, int maxThreadsPerBlock) { if (*nt != UCC_ULUNITS_AUTO) { @@ -303,16 +231,6 @@ static ucc_status_t ucc_ec_cuda_init(const ucc_ec_params_t *ec_params) return status; } - /* create request pool */ - status = ucc_mpool_init( - &ucc_ec_cuda.strm_reqs, 0, sizeof(ucc_ec_cuda_stream_request_t), 0, - UCC_CACHE_LINE_SIZE, 16, UINT_MAX, &ucc_ec_cuda_stream_req_mpool_ops, - UCC_THREAD_MULTIPLE, "CUDA Event Objects"); - if (status != UCC_OK) { - ec_error(&ucc_ec_cuda.super, "failed to create event pool"); - return status; - } - status = ucc_mpool_init( &ucc_ec_cuda.executors, 0, sizeof(ucc_ec_cuda_executor_t), 0, UCC_CACHE_LINE_SIZE, 16, UINT_MAX, &ucc_ec_cuda_ee_executor_mpool_ops, @@ -344,10 +262,8 @@ static ucc_status_t ucc_ec_cuda_init(const ucc_ec_params_t *ec_params) if (cfg->strm_task_mode == UCC_EC_CUDA_TASK_KERNEL) { ucc_ec_cuda.strm_task_mode = UCC_EC_CUDA_TASK_KERNEL; - ucc_ec_cuda.post_strm_task = ucc_ec_cuda_post_kernel_stream_task; } else { ucc_ec_cuda.strm_task_mode = UCC_EC_CUDA_TASK_MEM_OPS; - ucc_ec_cuda.post_strm_task = ucc_ec_cuda_post_driver_stream_task; #if CUDA_VERSION < 12000 CUresult cu_st; CUdevice cu_dev; @@ -370,7 +286,6 @@ static ucc_status_t ucc_ec_cuda_init(const ucc_ec_params_t *ec_params) ec_info(&ucc_ec_cuda.super, "CUDA MEM OPS are not supported or disabled"); ucc_ec_cuda.strm_task_mode = UCC_EC_CUDA_TASK_KERNEL; - ucc_ec_cuda.post_strm_task = ucc_ec_cuda_post_kernel_stream_task; } } else if (attr == 0) { ec_error(&ucc_ec_cuda.super, @@ -391,7 +306,6 @@ static ucc_status_t ucc_ec_cuda_init(const ucc_ec_params_t *ec_params) } } - ucc_ec_cuda.task_strm_type = cfg->task_strm_type; ucc_spinlock_init(&ucc_ec_cuda.init_spinlock, 0); return UCC_OK; } @@ -404,96 +318,6 @@ static ucc_status_t ucc_ec_cuda_get_attr(ucc_ec_attr_t *ec_attr) return UCC_OK; } -ucc_status_t ucc_ec_cuda_task_post(void *ee_stream, void **ee_req) -{ - ucc_ec_cuda_config_t *cfg = EC_CUDA_CONFIG; - ucc_ec_cuda_stream_request_t *req; - ucc_ec_cuda_event_t *cuda_event; - ucc_status_t status; - - UCC_EC_CUDA_INIT_STREAM(); - req = ucc_mpool_get(&ucc_ec_cuda.strm_reqs); - if (ucc_unlikely(!req)) { - ec_error(&ucc_ec_cuda.super, "failed to get stream request from mpool"); - return UCC_ERR_NO_MEMORY; - } - req->status = UCC_EC_CUDA_TASK_POSTED; - req->stream = (cudaStream_t)ee_stream; - - if (ucc_ec_cuda.task_strm_type == UCC_EC_CUDA_USER_STREAM) { - status = ucc_ec_cuda.post_strm_task(req->dev_status, - cfg->stream_blocking_wait, - req->stream); - if (ucc_unlikely(status != UCC_OK)) { - goto free_req; - } - } else { - cuda_event = ucc_mpool_get(&ucc_ec_cuda.events); - if (ucc_unlikely(!cuda_event)) { - ec_error(&ucc_ec_cuda.super, "failed to get event from mpool"); - status = UCC_ERR_NO_MEMORY; - goto free_req; - } - - CUDA_CHECK(cudaEventRecord(cuda_event->event, req->stream)); - CUDA_CHECK(cudaStreamWaitEvent(ucc_ec_cuda.stream, cuda_event->event, 0)); - status = ucc_ec_cuda.post_strm_task(req->dev_status, - cfg->stream_blocking_wait, - ucc_ec_cuda.stream); - if (ucc_unlikely(status != UCC_OK)) { - goto free_event; - } - CUDA_CHECK(cudaEventRecord(cuda_event->event, ucc_ec_cuda.stream)); - CUDA_CHECK(cudaStreamWaitEvent(req->stream, cuda_event->event, 0)); - ucc_mpool_put(cuda_event); - } - - *ee_req = (void *) req; - - ec_info(&ucc_ec_cuda.super, "stream task posted on \"%s\" stream. req:%p", - task_stream_types[ucc_ec_cuda.task_strm_type], req); - - return UCC_OK; - -free_event: - ucc_mpool_put(cuda_event); -free_req: - ucc_mpool_put(req); - return status; -} - -ucc_status_t ucc_ec_cuda_task_query(void *ee_req) -{ - ucc_ec_cuda_stream_request_t *req = ee_req; - - /* ee task might be only in POSTED, STARTED or COMPLETED_ACK state - COMPLETED state is used by ucc_ee_cuda_task_end function to request - stream unblock*/ - ucc_assert(req->status != UCC_EC_CUDA_TASK_COMPLETED); - if (req->status == UCC_EC_CUDA_TASK_POSTED) { - return UCC_INPROGRESS; - } - ec_info(&ucc_ec_cuda.super, "stream task started. req:%p", req); - return UCC_OK; -} - -ucc_status_t ucc_ec_cuda_task_end(void *ee_req) -{ - ucc_ec_cuda_stream_request_t *req = ee_req; - volatile ucc_ec_task_status_t *st = &req->status; - - /* can be safely ended only if it's in STARTED or COMPLETED_ACK state */ - ucc_assert((*st != UCC_EC_CUDA_TASK_POSTED) && - (*st != UCC_EC_CUDA_TASK_COMPLETED)); - if (*st == UCC_EC_CUDA_TASK_STARTED) { - *st = UCC_EC_CUDA_TASK_COMPLETED; - while(*st != UCC_EC_CUDA_TASK_COMPLETED_ACK) { } - } - ucc_mpool_put(req); - ec_info(&ucc_ec_cuda.super, "stream task done. req:%p", req); - return UCC_OK; -} - ucc_status_t ucc_ec_cuda_event_create(void **event) { ucc_ec_cuda_event_t *cuda_event; @@ -556,7 +380,6 @@ static ucc_status_t ucc_ec_cuda_finalize() } ucc_mpool_cleanup(&ucc_ec_cuda.events, 1); - ucc_mpool_cleanup(&ucc_ec_cuda.strm_reqs, 1); ucc_mpool_cleanup(&ucc_ec_cuda.executors, 1); ucc_mpool_cleanup(&ucc_ec_cuda.executor_interruptible_tasks, 1); ucc_mpool_cleanup(&ucc_ec_cuda.executor_persistent_tasks, 1); @@ -579,9 +402,6 @@ ucc_ec_cuda_t ucc_ec_cuda = { .table = ucc_ec_cuda_config_table, .size = sizeof(ucc_ec_cuda_config_t), }, - .super.ops.task_post = ucc_ec_cuda_task_post, - .super.ops.task_query = ucc_ec_cuda_task_query, - .super.ops.task_end = ucc_ec_cuda_task_end, .super.ops.create_event = ucc_ec_cuda_event_create, .super.ops.destroy_event = ucc_ec_cuda_event_destroy, .super.ops.event_post = ucc_ec_cuda_event_post, diff --git a/src/components/ec/cuda/ec_cuda.h b/src/components/ec/cuda/ec_cuda.h index 2305cf7416..677d14b9a7 100644 --- a/src/components/ec/cuda/ec_cuda.h +++ b/src/components/ec/cuda/ec_cuda.h @@ -21,19 +21,6 @@ typedef enum ucc_ec_cuda_strm_task_mode { UCC_EC_CUDA_TASK_LAST, } ucc_ec_cuda_strm_task_mode_t; -typedef enum ucc_ec_cuda_task_stream_type { - UCC_EC_CUDA_USER_STREAM, - UCC_EC_CUDA_INTERNAL_STREAM, - UCC_EC_CUDA_TASK_STREAM_LAST -} ucc_ec_cuda_task_stream_type_t; - -typedef enum ucc_ec_task_status { - UCC_EC_CUDA_TASK_COMPLETED, - UCC_EC_CUDA_TASK_POSTED, - UCC_EC_CUDA_TASK_STARTED, - UCC_EC_CUDA_TASK_COMPLETED_ACK -} ucc_ec_task_status_t; - typedef enum ucc_ec_cuda_executor_state { UCC_EC_CUDA_EXECUTOR_INITIALIZED, UCC_EC_CUDA_EXECUTOR_POSTED, @@ -54,8 +41,6 @@ typedef ucc_status_t (*ucc_ec_cuda_task_post_fn) (uint32_t *dev_status, typedef struct ucc_ec_cuda_config { ucc_ec_config_t super; ucc_ec_cuda_strm_task_mode_t strm_task_mode; - ucc_ec_cuda_task_stream_type_t task_strm_type; - int stream_blocking_wait; unsigned long exec_num_workers; unsigned long exec_num_threads; unsigned long exec_max_tasks; @@ -72,14 +57,11 @@ typedef struct ucc_ec_cuda { int exec_streams_initialized; cudaStream_t *exec_streams; ucc_mpool_t events; - ucc_mpool_t strm_reqs; ucc_mpool_t executors; ucc_mpool_t executor_interruptible_tasks; ucc_mpool_t executor_persistent_tasks; ucc_thread_mode_t thread_mode; ucc_ec_cuda_strm_task_mode_t strm_task_mode; - ucc_ec_cuda_task_stream_type_t task_strm_type; - ucc_ec_cuda_task_post_fn post_strm_task; ucc_spinlock_t init_spinlock; } ucc_ec_cuda_t; @@ -116,6 +98,7 @@ typedef struct ucc_ec_cuda_executor_task_ops { typedef struct ucc_ec_cuda_executor { ucc_ee_executor_t super; ucc_ec_cuda_executor_mode_t mode; + uint64_t requested_ops; ucc_ec_cuda_executor_task_ops_t ops; ucc_spinlock_t tasks_lock; ucc_ec_cuda_executor_state_t state; diff --git a/src/components/ec/cuda/ec_cuda_executor.c b/src/components/ec/cuda/ec_cuda_executor.c index d031e4f45a..49ae469140 100644 --- a/src/components/ec/cuda/ec_cuda_executor.c +++ b/src/components/ec/cuda/ec_cuda_executor.c @@ -6,14 +6,19 @@ #include "ec_cuda_executor.h" +ucc_status_t ucc_cuda_executor_interruptible_start(ucc_ee_executor_t *executor); + +ucc_status_t ucc_cuda_executor_interruptible_stop(ucc_ee_executor_t *executor); + ucc_status_t ucc_cuda_executor_persistent_start(ucc_ee_executor_t *executor, void *ee_context); ucc_status_t ucc_cuda_executor_persistent_stop(ucc_ee_executor_t *executor); -ucc_status_t ucc_cuda_executor_interruptible_start(ucc_ee_executor_t *executor); +ucc_status_t ucc_cuda_executor_persistent_wait_start(ucc_ee_executor_t *executor, + void *ee_context); -ucc_status_t ucc_cuda_executor_interruptible_stop(ucc_ee_executor_t *executor); +ucc_status_t ucc_cuda_executor_persistent_wait_stop(ucc_ee_executor_t *executor); ucc_status_t ucc_cuda_executor_init(const ucc_ee_executor_params_t *params, ucc_ee_executor_t **executor) @@ -25,6 +30,13 @@ ucc_status_t ucc_cuda_executor_init(const ucc_ee_executor_params_t *params, return UCC_ERR_NO_MEMORY; } + if (params->mask & UCC_EE_EXECUTOR_PARAM_FIELD_TASK_TYPES) { + eee->requested_ops = params->task_types; + } else { + /* if no task types provided assume all tasks types required */ + eee->requested_ops = 1; + } + ec_debug(&ucc_ec_cuda.super, "executor init, eee: %p", eee); eee->super.ee_type = params->ee_type; eee->state = UCC_EC_CUDA_EXECUTOR_INITIALIZED; @@ -90,10 +102,19 @@ ucc_status_t ucc_cuda_executor_task_finalize(ucc_ee_executor_task_t *task) ucc_status_t ucc_cuda_executor_start(ucc_ee_executor_t *executor, void *ee_context) { + ucc_ec_cuda_executor_t *eee = ucc_derived_of(executor, + ucc_ec_cuda_executor_t); + if (!ee_context) { return ucc_cuda_executor_interruptible_start(executor); } else { - return ucc_cuda_executor_persistent_start(executor, ee_context); + if (eee->requested_ops == 0) { + /* no operations requested, just mark stream busy */ + return ucc_cuda_executor_persistent_wait_start(executor, + ee_context); + } else { + return ucc_cuda_executor_persistent_start(executor, ee_context); + } } } @@ -104,6 +125,10 @@ ucc_status_t ucc_cuda_executor_stop(ucc_ee_executor_t *executor) if (eee->mode == UCC_EC_CUDA_EXECUTOR_MODE_INTERRUPTIBLE) { return ucc_cuda_executor_interruptible_stop(executor); } else { - return ucc_cuda_executor_persistent_stop(executor); + if (eee->requested_ops == 0) { + return ucc_cuda_executor_persistent_wait_stop(executor); + } else { + return ucc_cuda_executor_persistent_stop(executor); + } } } diff --git a/src/components/ec/cuda/ec_cuda_executor_interruptible.c b/src/components/ec/cuda/ec_cuda_executor_interruptible.c index 55fd3addf0..e4a027f4d3 100644 --- a/src/components/ec/cuda/ec_cuda_executor_interruptible.c +++ b/src/components/ec/cuda/ec_cuda_executor_interruptible.c @@ -5,7 +5,6 @@ */ #include "ec_cuda_executor.h" -#include "components/mc/ucc_mc.h" #include "utils/ucc_atomic.h" ucc_status_t ucc_cuda_executor_interruptible_get_stream(cudaStream_t *stream) diff --git a/src/components/ec/cuda/ec_cuda_executor_persistent_wait.c b/src/components/ec/cuda/ec_cuda_executor_persistent_wait.c new file mode 100644 index 0000000000..98b97f990c --- /dev/null +++ b/src/components/ec/cuda/ec_cuda_executor_persistent_wait.c @@ -0,0 +1,63 @@ +/** + * Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * + * See file LICENSE for terms. + */ + +#include "ec_cuda_executor.h" + +ucc_status_t +ucc_ec_cuda_post_kernel_stream_task(ucc_ec_cuda_executor_state_t *state, + cudaStream_t stream); + +static ucc_status_t +ucc_ec_cuda_post_driver_stream_task(ucc_ec_cuda_executor_state_t *state, + cudaStream_t stream) +{ + CUdeviceptr state_ptr = (CUdeviceptr)state; + + CUDADRV_FUNC(cuStreamWriteValue32(stream, state_ptr, + UCC_EC_CUDA_EXECUTOR_STARTED, 0)); + CUDADRV_FUNC(cuStreamWaitValue32(stream, state_ptr, + UCC_EC_CUDA_EXECUTOR_SHUTDOWN, + CU_STREAM_WAIT_VALUE_EQ)); + CUDADRV_FUNC(cuStreamWriteValue32(stream, state_ptr, + UCC_EC_CUDA_EXECUTOR_SHUTDOWN_ACK, 0)); + return UCC_OK; +} + +ucc_status_t ucc_cuda_executor_persistent_wait_start(ucc_ee_executor_t *executor, + void *ee_context) +{ + ucc_ec_cuda_executor_t *eee = ucc_derived_of(executor, + ucc_ec_cuda_executor_t); + cudaStream_t stream = (cudaStream_t)ee_context; + + eee->super.ee_context = ee_context; + eee->state = UCC_EC_CUDA_EXECUTOR_POSTED; + eee->mode = UCC_EC_CUDA_EXECUTOR_MODE_PERSISTENT; + + ucc_memory_cpu_store_fence(); + if (ucc_ec_cuda.strm_task_mode == UCC_EC_CUDA_TASK_KERNEL) { + return ucc_ec_cuda_post_kernel_stream_task(eee->dev_state, stream); + } else { + return ucc_ec_cuda_post_driver_stream_task(eee->dev_state, stream); + } +} + +ucc_status_t ucc_cuda_executor_persistent_wait_stop(ucc_ee_executor_t *executor) +{ + ucc_ec_cuda_executor_t *eee = ucc_derived_of(executor, + ucc_ec_cuda_executor_t); + volatile ucc_ec_cuda_executor_state_t *st = &eee->state; + + ec_debug(&ucc_ec_cuda.super, "executor stop, eee: %p", eee); + ucc_assert((*st != UCC_EC_CUDA_EXECUTOR_POSTED) && + (*st != UCC_EC_CUDA_EXECUTOR_SHUTDOWN)); + *st = UCC_EC_CUDA_EXECUTOR_SHUTDOWN; + while(*st != UCC_EC_CUDA_EXECUTOR_SHUTDOWN_ACK) { } + eee->super.ee_context = NULL; + eee->state = UCC_EC_CUDA_EXECUTOR_INITIALIZED; + + return UCC_OK; +} diff --git a/src/components/ec/cuda/kernel/ec_cuda_wait_kernel.cu b/src/components/ec/cuda/kernel/ec_cuda_wait_kernel.cu index cfbe7f29eb..89743b91d9 100644 --- a/src/components/ec/cuda/kernel/ec_cuda_wait_kernel.cu +++ b/src/components/ec/cuda/kernel/ec_cuda_wait_kernel.cu @@ -17,18 +17,14 @@ extern "C" { } #endif -__global__ void wait_kernel(volatile uint32_t *status) { - ucc_status_t st; - *status = UCC_EC_CUDA_TASK_STARTED; - do { - st = (ucc_status_t)*status; - } while(st != UCC_EC_CUDA_TASK_COMPLETED); - *status = UCC_EC_CUDA_TASK_COMPLETED_ACK; - return; -} +__global__ void wait_kernel(volatile ucc_ec_cuda_executor_state_t *state) { + ucc_ec_cuda_executor_state_t st; -__global__ void wait_kernel_nb(volatile uint32_t *status) { - *status = UCC_EC_CUDA_TASK_COMPLETED_ACK; + *state = UCC_EC_CUDA_EXECUTOR_STARTED; + do { + st = *state; + } while (st != UCC_EC_CUDA_EXECUTOR_SHUTDOWN); + *state = UCC_EC_CUDA_EXECUTOR_SHUTDOWN_ACK; return; } @@ -36,15 +32,11 @@ __global__ void wait_kernel_nb(volatile uint32_t *status) { extern "C" { #endif -ucc_status_t ucc_ec_cuda_post_kernel_stream_task(uint32_t *status, - int blocking_wait, - cudaStream_t stream) +ucc_status_t +ucc_ec_cuda_post_kernel_stream_task(ucc_ec_cuda_executor_state_t *state, + cudaStream_t stream) { - if (blocking_wait) { - wait_kernel<<<1, 1, 0, stream>>>(status); - } else { - wait_kernel_nb<<<1, 1, 0, stream>>>(status); - } + wait_kernel<<<1, 1, 0, stream>>>(state); CUDA_CHECK(cudaGetLastError()); return UCC_OK; } diff --git a/src/components/ec/ucc_ec.c b/src/components/ec/ucc_ec.c index 093ba381dd..f3c6dadfb2 100644 --- a/src/components/ec/ucc_ec.c +++ b/src/components/ec/ucc_ec.c @@ -119,25 +119,6 @@ ucc_status_t ucc_ec_finalize() return UCC_OK; } -ucc_status_t ucc_ec_task_post(void *ee_context, ucc_ee_type_t ee_type, - void **ee_task) -{ - UCC_CHECK_EC_AVAILABLE(ee_type); - return ec_ops[ee_type]->task_post(ee_context, ee_task); -} - -ucc_status_t ucc_ec_task_query(void *ee_task, ucc_ee_type_t ee_type) -{ - UCC_CHECK_EC_AVAILABLE(ee_type); - return ec_ops[ee_type]->task_query(ee_task); -} - -ucc_status_t ucc_ec_task_end(void *ee_task, ucc_ee_type_t ee_type) -{ - UCC_CHECK_EC_AVAILABLE(ee_type); - return ec_ops[ee_type]->task_end(ee_task); -} - ucc_status_t ucc_ec_create_event(void **event, ucc_ee_type_t ee_type) { UCC_CHECK_EC_AVAILABLE(ee_type); diff --git a/src/components/ec/ucc_ec.h b/src/components/ec/ucc_ec.h index b9cfb5334a..9d078ef00b 100644 --- a/src/components/ec/ucc_ec.h +++ b/src/components/ec/ucc_ec.h @@ -17,13 +17,6 @@ ucc_status_t ucc_ec_get_attr(ucc_ec_attr_t *attr); ucc_status_t ucc_ec_finalize(); -ucc_status_t ucc_ec_task_post(void *ee_context, ucc_ee_type_t ee_type, - void **ee_task); - -ucc_status_t ucc_ec_task_query(void *ee_task, ucc_ee_type_t ee_type); - -ucc_status_t ucc_ec_task_end(void *ee_task, ucc_ee_type_t ee_type); - ucc_status_t ucc_ec_create_event(void **event, ucc_ee_type_t ee_type); ucc_status_t ucc_ec_destroy_event(void *event, ucc_ee_type_t ee_type); diff --git a/src/core/ucc_coll.c b/src/core/ucc_coll.c index 8c6508a4f4..ad4890e442 100644 --- a/src/core/ucc_coll.c +++ b/src/core/ucc_coll.c @@ -402,8 +402,10 @@ static void ucc_trigger_test(ucc_coll_task_t *task) /* triggered task doesn't need executor, init and start executor on * trigger task */ - params.mask = UCC_EE_EXECUTOR_PARAM_FIELD_TYPE; - params.ee_type = task->ee->ee_type; + params.mask = UCC_EE_EXECUTOR_PARAM_FIELD_TYPE | + UCC_EE_EXECUTOR_PARAM_FIELD_TASK_TYPES; + params.ee_type = task->ee->ee_type; + params.task_types = 0; status = ucc_ee_executor_init(¶ms, &task->executor); if (ucc_unlikely(status != UCC_OK)) { ucc_error("error in ee executor init, %s", diff --git a/tools/perf/ucc_pt_comm.cc b/tools/perf/ucc_pt_comm.cc index 4cff62f71e..1f6a91a651 100644 --- a/tools/perf/ucc_pt_comm.cc +++ b/tools/perf/ucc_pt_comm.cc @@ -69,7 +69,8 @@ ucc_ee_h ucc_pt_comm::get_ee() throw std::runtime_error(ucc_status_string(status)); } } else { - std::cerr << "execution engine is not supported for given memory type"; + std::cerr << "execution engine is not supported for given memory type" + << std::endl; throw std::runtime_error("not supported"); } } @@ -91,7 +92,8 @@ ucc_ee_executor_t* ucc_pt_comm::get_executor() } else if (cfg.mt == UCC_MEMORY_TYPE_ROCM) { executor_params.ee_type = UCC_EE_ROCM_STREAM; } else { - std::cerr << "executor is not supported for given memory type"; + std::cerr << "executor is not supported for given memory type" + << std::endl; throw std::runtime_error("not supported"); } status = ucc_ee_executor_init(&executor_params, &executor); @@ -197,7 +199,8 @@ ucc_status_t ucc_pt_comm::finalize() if (cfg.mt == UCC_MEMORY_TYPE_CUDA) { ucc_pt_cudaStreamDestroy((cudaStream_t)stream); } else { - std::cerr << "execution engine is not supported for given memory type"; + std::cerr << "execution engine is not supported for given memory type" + << std::endl; throw std::runtime_error("not supported"); } } From 40a1c795b21001b82a4b5e1409b197eee4d98d28 Mon Sep 17 00:00:00 2001 From: Sergey Lebedev Date: Fri, 16 Dec 2022 08:32:53 +0000 Subject: [PATCH 2/2] EC/ROCM: remove unused functions --- src/components/ec/rocm/ec_rocm.c | 234 +----------------- src/components/ec/rocm/ec_rocm.h | 19 -- src/components/ec/rocm/kernel/Makefile.am | 7 +- .../ec/rocm/kernel/ec_rocm_wait_kernel.cu | 44 ---- 4 files changed, 4 insertions(+), 300 deletions(-) delete mode 100644 src/components/ec/rocm/kernel/ec_rocm_wait_kernel.cu diff --git a/src/components/ec/rocm/ec_rocm.c b/src/components/ec/rocm/ec_rocm.c index 8c86384a4d..32b31cf48e 100644 --- a/src/components/ec/rocm/ec_rocm.c +++ b/src/components/ec/rocm/ec_rocm.c @@ -13,43 +13,10 @@ #include #include -static const char *stream_task_modes[] = { - [UCC_EC_ROCM_TASK_KERNEL] = "kernel", - [UCC_EC_ROCM_TASK_MEM_OPS] = "driver", - [UCC_EC_ROCM_TASK_AUTO] = "auto", - [UCC_EC_ROCM_TASK_LAST] = NULL -}; - -static const char *task_stream_types[] = { - [UCC_EC_ROCM_USER_STREAM] = "user", - [UCC_EC_ROCM_INTERNAL_STREAM] = "ucc", - [UCC_EC_ROCM_TASK_STREAM_LAST] = NULL -}; - static ucc_config_field_t ucc_ec_rocm_config_table[] = { {"", "", NULL, ucc_offsetof(ucc_ec_rocm_config_t, super), UCC_CONFIG_TYPE_TABLE(ucc_ec_config_table)}, - {"STREAM_TASK_MODE", "auto", - "Mechanism to create stream dependency\n" - "kernel - use waiting kernel\n" - "driver - use driver MEM_OPS\n" - "auto - runtime automatically chooses best one", - ucc_offsetof(ucc_ec_rocm_config_t, strm_task_mode), - UCC_CONFIG_TYPE_ENUM(stream_task_modes)}, - - {"TASK_STREAM", "user", - "Stream for rocm task\n" - "user - user stream provided in execution engine context\n" - "ucc - ucc library internal stream", - ucc_offsetof(ucc_ec_rocm_config_t, task_strm_type), - UCC_CONFIG_TYPE_ENUM(task_stream_types)}, - - {"STREAM_BLOCKING_WAIT", "1", - "Stream is blocked until collective operation is done", - ucc_offsetof(ucc_ec_rocm_config_t, stream_blocking_wait), - UCC_CONFIG_TYPE_UINT}, - {"EXEC_NUM_WORKERS", "1", "Number of thread blocks to use for rocm executor", ucc_offsetof(ucc_ec_rocm_config_t, exec_num_workers), @@ -138,7 +105,6 @@ static void ucc_ec_rocm_executor_chunk_cleanup(ucc_mpool_t *mp, void *obj) //NOL } } - static ucc_mpool_ops_t ucc_ec_rocm_ee_executor_mpool_ops = { .chunk_alloc = ucc_ec_rocm_ee_executor_mpool_chunk_malloc, .chunk_release = ucc_ec_rocm_ee_executor_mpool_chunk_free, @@ -146,40 +112,7 @@ static ucc_mpool_ops_t ucc_ec_rocm_ee_executor_mpool_ops = { .obj_cleanup = ucc_ec_rocm_executor_chunk_cleanup, }; - -static ucc_status_t ucc_ec_rocm_stream_req_mpool_chunk_malloc(ucc_mpool_t *mp, //NOLINT: mp is unused - size_t *size_p, - void ** chunk_p) -{ - ucc_status_t status; - - status = ROCM_FUNC(hipHostMalloc((void**)chunk_p, *size_p, - hipHostMallocMapped)); - return status; -} - -static void ucc_ec_rocm_stream_req_mpool_chunk_free(ucc_mpool_t *mp, //NOLINT: mp is unused - void * chunk) -{ - hipHostFree(chunk); -} - -static void ucc_ec_rocm_stream_req_init(ucc_mpool_t *mp, void *obj, void *chunk) //NOLINT: mp is unused -{ - ucc_ec_rocm_stream_request_t *req = (ucc_ec_rocm_stream_request_t*) obj; - - ROCM_FUNC(hipHostGetDevicePointer( - (void**)(&req->dev_status), (void *)&req->status, 0)); -} - -static ucc_mpool_ops_t ucc_ec_rocm_stream_req_mpool_ops = { - .chunk_alloc = ucc_ec_rocm_stream_req_mpool_chunk_malloc, - .chunk_release = ucc_ec_rocm_stream_req_mpool_chunk_free, - .obj_init = ucc_ec_rocm_stream_req_init, - .obj_cleanup = NULL -}; - -static void ucc_ec_rocm_event_init(ucc_mpool_t *mp, void *obj, void *chunk) //NOLINT: mp is unused +static void ucc_ec_rocm_event_init(ucc_mpool_t *mp, void *obj, void *chunk) { ucc_ec_rocm_event_t *base = (ucc_ec_rocm_event_t *) obj; @@ -206,37 +139,12 @@ static ucc_mpool_ops_t ucc_ec_rocm_event_mpool_ops = { .obj_cleanup = ucc_ec_rocm_event_cleanup, }; -ucc_status_t ucc_ec_rocm_post_kernel_stream_task(uint32_t *status, - int blocking_wait, - hipStream_t stream); - -static ucc_status_t ucc_ec_rocm_post_driver_stream_task(uint32_t *status, - int blocking_wait, - hipStream_t stream) -{ - hipDeviceptr_t status_ptr = (hipDeviceptr_t)status; - - if (blocking_wait) { - ROCM_FUNC(hipStreamWriteValue32(stream, status_ptr, - UCC_EC_ROCM_TASK_STARTED, 0)); - ROCM_FUNC(hipStreamWaitValue32(stream, status_ptr, - UCC_EC_ROCM_TASK_COMPLETED, - hipStreamWaitValueEq, 0xFFFFFFFF)); - } - ROCM_FUNC(hipStreamWriteValue32(stream, status_ptr, - UCC_EC_ROCM_TASK_COMPLETED_ACK, 0)); - return UCC_OK; -} - static ucc_status_t ucc_ec_rocm_init(const ucc_ec_params_t *ec_params) { ucc_ec_rocm_config_t *cfg = EC_ROCM_CONFIG; ucc_status_t status; int device, num_devices; - int attr=0; hipError_t rocm_st; - hipDevice_t hip_dev; - const char *hip_err_st_str; hipDeviceProp_t prop; ucc_ec_rocm.stream = NULL; @@ -290,16 +198,6 @@ static ucc_status_t ucc_ec_rocm_init(const ucc_ec_params_t *ec_params) return status; } - /* create request pool */ - status = ucc_mpool_init( - &ucc_ec_rocm.strm_reqs, 0, sizeof(ucc_ec_rocm_stream_request_t), 0, - UCC_CACHE_LINE_SIZE, 16, UINT_MAX, &ucc_ec_rocm_stream_req_mpool_ops, - UCC_THREAD_MULTIPLE, "ROCM Event Objects"); - if (status != UCC_OK) { - ec_error(&ucc_ec_rocm.super, "failed to create stream pool"); - return status; - } - status = ucc_mpool_init( &ucc_ec_rocm.executors, 0, sizeof(ucc_ec_rocm_executor_t), 0, UCC_CACHE_LINE_SIZE, 16, UINT_MAX, &ucc_ec_rocm_ee_executor_mpool_ops, @@ -314,44 +212,7 @@ static ucc_status_t ucc_ec_rocm_init(const ucc_ec_params_t *ec_params) sizeof(ucc_ec_rocm_executor_interruptible_task_t), 0, UCC_CACHE_LINE_SIZE, 16, UINT_MAX, NULL, UCC_THREAD_MULTIPLE, "interruptible executor tasks"); - if (status != UCC_OK) { - ec_error(&ucc_ec_rocm.super, "failed to create interruptible tasks pool"); - return status; - } - - if (cfg->strm_task_mode == UCC_EC_ROCM_TASK_KERNEL) { - ucc_ec_rocm.strm_task_mode = UCC_EC_ROCM_TASK_KERNEL; - ucc_ec_rocm.post_strm_task = ucc_ec_rocm_post_kernel_stream_task; - } else { - ucc_ec_rocm.strm_task_mode = UCC_EC_ROCM_TASK_MEM_OPS; - ucc_ec_rocm.post_strm_task = ucc_ec_rocm_post_driver_stream_task; - - rocm_st = hipCtxGetDevice(&hip_dev); - if (rocm_st != hipSuccess){ - hip_err_st_str = hipGetErrorString(rocm_st); - ec_debug(&ucc_ec_rocm.super, "hipCtxGetDevice() failed: %s", - hip_err_st_str); - attr = 0; - } else { - ROCM_FUNC(hipDeviceGetAttribute(&attr, - hipDeviceAttributeCanUseStreamWaitValue, - hip_dev)); - } - if (cfg->strm_task_mode == UCC_EC_ROCM_TASK_AUTO) { - if (attr == 0) { - ec_info(&ucc_ec_rocm.super, - "ROCm MEM OPS are not supported or disabled"); - ucc_ec_rocm.strm_task_mode = UCC_EC_ROCM_TASK_KERNEL; - ucc_ec_rocm.post_strm_task = ucc_ec_rocm_post_kernel_stream_task; - } - } else if (attr == 0) { - ec_error(&ucc_ec_rocm.super, - "ROCm MEM OPS are not supported or disabled"); - return UCC_ERR_NOT_SUPPORTED; - } - } - ucc_ec_rocm.task_strm_type = cfg->task_strm_type; ucc_spinlock_init(&ucc_ec_rocm.init_spinlock, 0); return UCC_OK; } @@ -364,95 +225,6 @@ static ucc_status_t ucc_ec_rocm_get_attr(ucc_ec_attr_t *ec_attr) return UCC_OK; } -ucc_status_t ucc_ec_rocm_task_post(void *ee_stream, void **ee_req) -{ - ucc_ec_rocm_config_t *cfg = EC_ROCM_CONFIG; - ucc_ec_rocm_stream_request_t *req; - ucc_ec_rocm_event_t *rocm_event; - ucc_status_t status = UCC_OK; - - UCC_EC_ROCM_INIT_STREAM(); - req = ucc_mpool_get(&ucc_ec_rocm.strm_reqs); - if (ucc_unlikely(!req)) { - ec_error(&ucc_ec_rocm.super, "Failed to allocate stream request"); - return UCC_ERR_NO_MEMORY; - } - req->status = UCC_EC_ROCM_TASK_POSTED; - req->stream = (hipStream_t)ee_stream; - - if (ucc_ec_rocm.task_strm_type == UCC_EC_ROCM_USER_STREAM) { - status = ucc_ec_rocm.post_strm_task(req->dev_status, - cfg->stream_blocking_wait, - req->stream); - if (status != UCC_OK) { - goto free_req; - } - } else { - rocm_event = ucc_mpool_get(&ucc_ec_rocm.events); - if (ucc_unlikely(!rocm_event)) { - ec_error(&ucc_ec_rocm.super, "Failed to allocate rocm event"); - status = UCC_ERR_NO_MEMORY; - goto free_req; - } - ROCMCHECK(hipEventRecord(rocm_event->event, req->stream)); - ROCMCHECK(hipStreamWaitEvent(ucc_ec_rocm.stream, rocm_event->event, 0)); - status = ucc_ec_rocm.post_strm_task(req->dev_status, - cfg->stream_blocking_wait, - ucc_ec_rocm.stream); - if (ucc_unlikely(status != UCC_OK)) { - goto free_event; - } - ROCMCHECK(hipEventRecord(rocm_event->event, ucc_ec_rocm.stream)); - ROCMCHECK(hipStreamWaitEvent(req->stream, rocm_event->event, 0)); - ucc_mpool_put(rocm_event); - } - - *ee_req = (void *) req; - - ec_info(&ucc_ec_rocm.super, "ROCM stream task posted on \"%s\" stream. req:%p", - task_stream_types[ucc_ec_rocm.task_strm_type], req); - - return UCC_OK; - -free_event: - ucc_mpool_put(rocm_event); -free_req: - ucc_mpool_put(req); - return status; -} - -ucc_status_t ucc_ec_rocm_task_query(void *ee_req) -{ - ucc_ec_rocm_stream_request_t *req = ee_req; - - /* ee task might be only in POSTED, STARTED or COMPLETED_ACK state - COMPLETED state is used by ucc_ee_rocm_task_end function to request - stream unblock*/ - ucc_assert(req->status != UCC_EC_ROCM_TASK_COMPLETED); - if (req->status == UCC_EC_ROCM_TASK_POSTED) { - return UCC_INPROGRESS; - } - ec_info(&ucc_ec_rocm.super, "ROCM stream task started. req:%p", req); - return UCC_OK; -} - -ucc_status_t ucc_ec_rocm_task_end(void *ee_req) -{ - ucc_ec_rocm_stream_request_t *req = ee_req; - volatile ucc_ec_task_status_t *st = &req->status; - - /* can be safely ended only if it's in STARTED or COMPLETED_ACK state */ - ucc_assert((*st != UCC_EC_ROCM_TASK_POSTED) && - (*st != UCC_EC_ROCM_TASK_COMPLETED)); - if (*st == UCC_EC_ROCM_TASK_STARTED) { - *st = UCC_EC_ROCM_TASK_COMPLETED; - while(*st != UCC_EC_ROCM_TASK_COMPLETED_ACK) { } - } - ucc_mpool_put(req); - ec_info(&ucc_ec_rocm.super, "ROCM stream task done. req:%p", req); - return UCC_OK; -} - ucc_status_t ucc_ec_rocm_event_create(void **event) { ucc_ec_rocm_event_t *rocm_event; @@ -512,7 +284,6 @@ static ucc_status_t ucc_ec_rocm_finalize() } ucc_mpool_cleanup(&ucc_ec_rocm.events, 1); - ucc_mpool_cleanup(&ucc_ec_rocm.strm_reqs, 1); ucc_mpool_cleanup(&ucc_ec_rocm.executors, 1); ucc_free(ucc_ec_rocm.exec_streams); return UCC_OK; @@ -532,9 +303,6 @@ ucc_ec_rocm_t ucc_ec_rocm = { .table = ucc_ec_rocm_config_table, .size = sizeof(ucc_ec_rocm_config_t), }, - .super.ops.task_post = ucc_ec_rocm_task_post, - .super.ops.task_query = ucc_ec_rocm_task_query, - .super.ops.task_end = ucc_ec_rocm_task_end, .super.ops.create_event = ucc_ec_rocm_event_create, .super.ops.destroy_event = ucc_ec_rocm_event_destroy, .super.ops.event_post = ucc_ec_rocm_event_post, diff --git a/src/components/ec/rocm/ec_rocm.h b/src/components/ec/rocm/ec_rocm.h index c22370df03..26bc81ba3d 100644 --- a/src/components/ec/rocm/ec_rocm.h +++ b/src/components/ec/rocm/ec_rocm.h @@ -19,19 +19,6 @@ #include #include -typedef enum ucc_ec_rocm_strm_task_mode { - UCC_EC_ROCM_TASK_KERNEL, - UCC_EC_ROCM_TASK_MEM_OPS, - UCC_EC_ROCM_TASK_AUTO, - UCC_EC_ROCM_TASK_LAST, -} ucc_ec_rocm_strm_task_mode_t; - -typedef enum ucc_ec_rocm_task_stream_type { - UCC_EC_ROCM_USER_STREAM, - UCC_EC_ROCM_INTERNAL_STREAM, - UCC_EC_ROCM_TASK_STREAM_LAST -} ucc_ec_rocm_task_stream_type_t; - typedef enum ucc_ec_task_status { UCC_EC_ROCM_TASK_COMPLETED, UCC_EC_ROCM_TASK_POSTED, @@ -72,9 +59,6 @@ typedef ucc_status_t (*ucc_ec_rocm_task_post_fn) (uint32_t *dev_status, typedef struct ucc_ec_rocm_config { ucc_ec_config_t super; - ucc_ec_rocm_strm_task_mode_t strm_task_mode; - ucc_ec_rocm_task_stream_type_t task_strm_type; - int stream_blocking_wait; unsigned long exec_num_workers; unsigned long exec_num_threads; unsigned long exec_max_tasks; @@ -96,9 +80,6 @@ typedef struct ucc_ec_rocm { ucc_mpool_t executors; ucc_mpool_t executor_interruptible_tasks; ucc_thread_mode_t thread_mode; - ucc_ec_rocm_strm_task_mode_t strm_task_mode; - ucc_ec_rocm_task_stream_type_t task_strm_type; - ucc_ec_rocm_task_post_fn post_strm_task; ucc_spinlock_t init_spinlock; ucc_ee_executor_t *cpu_executor; } ucc_ec_rocm_t; diff --git a/src/components/ec/rocm/kernel/Makefile.am b/src/components/ec/rocm/kernel/Makefile.am index ee36579358..6f95d2b33b 100644 --- a/src/components/ec/rocm/kernel/Makefile.am +++ b/src/components/ec/rocm/kernel/Makefile.am @@ -17,15 +17,14 @@ HIPCCFLAGS = \ LINK = $(LIBTOOL) --mode=link $(CC) -o $@ .cu.o: - $(HIPCC) -c $< -o $@ $(HIPCCFLAGS) + $(HIPCC) -c $< -o $@ $(HIPCCFLAGS) .cu.lo: - /bin/bash $(top_srcdir)/cuda_lt.sh "$(LIBTOOL)" $@ $(HIPCC) -c $< $(HIPCCFLAGS) + /bin/bash $(top_srcdir)/cuda_lt.sh "$(LIBTOOL)" $@ $(HIPCC) -c $< $(HIPCCFLAGS) comp_noinst = libucc_ec_rocm_kernels.la -libucc_ec_rocm_kernels_la_SOURCES = ec_rocm_wait_kernel.cu \ - ec_rocm_executor_kernel.cu \ +libucc_ec_rocm_kernels_la_SOURCES = ec_rocm_executor_kernel.cu \ ec_rocm_reduce.cu libucc_ec_rocm_kernels_la_CPPFLAGS = diff --git a/src/components/ec/rocm/kernel/ec_rocm_wait_kernel.cu b/src/components/ec/rocm/kernel/ec_rocm_wait_kernel.cu deleted file mode 100644 index b33edff060..0000000000 --- a/src/components/ec/rocm/kernel/ec_rocm_wait_kernel.cu +++ /dev/null @@ -1,44 +0,0 @@ -/** - * Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved. - * Copyright (C) Advanced Micro Devices, Inc. 2022. ALL RIGHTS RESERVED. - * - * See file LICENSE for terms. - */ - -#include "ec_rocm.h" - -__global__ void wait_kernel(volatile uint32_t *status) { - ucc_status_t st; - *status = UCC_EC_ROCM_TASK_STARTED; - do { - st = (ucc_status_t)*status; - } while(st != (ucc_status_t)UCC_EC_ROCM_TASK_COMPLETED); - *status = UCC_EC_ROCM_TASK_COMPLETED_ACK; - return; -} - -__global__ void wait_kernel_nb(volatile uint32_t *status) { - *status = UCC_EC_ROCM_TASK_COMPLETED_ACK; - return; -} - -#ifdef __cplusplus -extern "C" { -#endif - -ucc_status_t ucc_ec_rocm_post_kernel_stream_task(uint32_t *status, - int blocking_wait, - hipStream_t stream) -{ - if (blocking_wait) { - wait_kernel<<<1, 1, 0, stream>>>(status); - } else { - wait_kernel_nb<<<1, 1, 0, stream>>>(status); - } - ROCMCHECK(hipGetLastError()); - return UCC_OK; -} - -#ifdef __cplusplus -} -#endif