From cb5fec11d64f76f69e0145d1e609f38c7a1903d6 Mon Sep 17 00:00:00 2001
From: Sergey Lebedev <sergeyle@nvidia.com>
Date: Tue, 13 Dec 2022 06:22:16 +0000
Subject: [PATCH 1/2] EC/CUDA: enable memops for executor

---
 src/components/ec/base/ucc_ec_base.h          |   7 +-
 src/components/ec/cpu/ec_cpu.c                |   3 -
 src/components/ec/cuda/Makefile.am            |  13 +-
 src/components/ec/cuda/ec_cuda.c              | 180 ------------------
 src/components/ec/cuda/ec_cuda.h              |  19 +-
 src/components/ec/cuda/ec_cuda_executor.c     |  33 +++-
 .../ec/cuda/ec_cuda_executor_interruptible.c  |   1 -
 .../cuda/ec_cuda_executor_persistent_wait.c   |  63 ++++++
 .../ec/cuda/kernel/ec_cuda_wait_kernel.cu     |  30 ++-
 src/components/ec/ucc_ec.c                    |  19 --
 src/components/ec/ucc_ec.h                    |   7 -
 src/core/ucc_coll.c                           |   6 +-
 tools/perf/ucc_pt_comm.cc                     |   9 +-
 13 files changed, 124 insertions(+), 266 deletions(-)
 create mode 100644 src/components/ec/cuda/ec_cuda_executor_persistent_wait.c

diff --git a/src/components/ec/base/ucc_ec_base.h b/src/components/ec/base/ucc_ec_base.h
index 6eee48d3d6..db57f1da85 100644
--- a/src/components/ec/base/ucc_ec_base.h
+++ b/src/components/ec/base/ucc_ec_base.h
@@ -44,9 +44,6 @@ typedef struct ucc_ec_attr {
 } ucc_ec_attr_t;
 
 typedef struct ucc_ec_ops {
-    ucc_status_t (*task_post)(void *ee_context, void **ee_req);
-    ucc_status_t (*task_query)(void *ee_req);
-    ucc_status_t (*task_end)(void *ee_req);
     ucc_status_t (*create_event)(void **event);
     ucc_status_t (*destroy_event)(void *event);
     ucc_status_t (*event_post)(void *ee_context, void *event);
@@ -59,7 +56,8 @@ typedef struct ucc_ee_executor {
 } ucc_ee_executor_t;
 
 enum ucc_ee_executor_params_field {
-    UCC_EE_EXECUTOR_PARAM_FIELD_TYPE    = UCC_BIT(0),
+    UCC_EE_EXECUTOR_PARAM_FIELD_TYPE       = UCC_BIT(0),
+    UCC_EE_EXECUTOR_PARAM_FIELD_TASK_TYPES = UCC_BIT(1),
 };
 
 typedef enum ucc_ee_executor_task_type {
@@ -74,6 +72,7 @@ typedef enum ucc_ee_executor_task_type {
 typedef struct ucc_ee_executor_params {
     uint64_t        mask;
     ucc_ee_type_t   ee_type;
+    uint64_t        task_types;
 } ucc_ee_executor_params_t;
 
 #define UCC_EE_EXECUTOR_NUM_BUFS 9
diff --git a/src/components/ec/cpu/ec_cpu.c b/src/components/ec/cpu/ec_cpu.c
index bf53d209d7..8d08d2365c 100644
--- a/src/components/ec/cpu/ec_cpu.c
+++ b/src/components/ec/cpu/ec_cpu.c
@@ -203,9 +203,6 @@ ucc_ec_cpu_t ucc_ec_cpu = {
             .table  = ucc_ec_cpu_config_table,
             .size   = sizeof(ucc_ec_cpu_config_t),
         },
-    .super.ops.task_post              = NULL,
-    .super.ops.task_query             = NULL,
-    .super.ops.task_end               = NULL,
     .super.ops.create_event           = NULL,
     .super.ops.destroy_event          = NULL,
     .super.ops.event_post             = NULL,
diff --git a/src/components/ec/cuda/Makefile.am b/src/components/ec/cuda/Makefile.am
index ff74621ba7..3d7a862ef4 100644
--- a/src/components/ec/cuda/Makefile.am
+++ b/src/components/ec/cuda/Makefile.am
@@ -6,12 +6,13 @@ if HAVE_CUDA
 SUBDIRS = kernel
 
 sources =    \
-	ec_cuda.h                        \
-	ec_cuda.c                        \
-	ec_cuda_executor.h               \
-	ec_cuda_executor.c               \
-	ec_cuda_executor_interruptible.c \
-	ec_cuda_executor_persistent.c
+	ec_cuda.h                          \
+	ec_cuda.c                          \
+	ec_cuda_executor.h                 \
+	ec_cuda_executor.c                 \
+	ec_cuda_executor_interruptible.c   \
+	ec_cuda_executor_persistent.c      \
+	ec_cuda_executor_persistent_wait.c
 
 module_LTLIBRARIES         = libucc_ec_cuda.la
 libucc_ec_cuda_la_SOURCES  = $(sources)
diff --git a/src/components/ec/cuda/ec_cuda.c b/src/components/ec/cuda/ec_cuda.c
index 6bb24fa15e..87ba3dcd3a 100644
--- a/src/components/ec/cuda/ec_cuda.c
+++ b/src/components/ec/cuda/ec_cuda.c
@@ -18,12 +18,6 @@ static const char *stream_task_modes[] = {
     [UCC_EC_CUDA_TASK_LAST]    = NULL
 };
 
-static const char *task_stream_types[] = {
-    [UCC_EC_CUDA_USER_STREAM]      = "user",
-    [UCC_EC_CUDA_INTERNAL_STREAM]  = "ucc",
-    [UCC_EC_CUDA_TASK_STREAM_LAST] = NULL
-};
-
 static ucc_config_field_t ucc_ec_cuda_config_table[] = {
     {"", "", NULL, ucc_offsetof(ucc_ec_cuda_config_t, super),
      UCC_CONFIG_TYPE_TABLE(ucc_ec_config_table)},
@@ -36,18 +30,6 @@ static ucc_config_field_t ucc_ec_cuda_config_table[] = {
      ucc_offsetof(ucc_ec_cuda_config_t, strm_task_mode),
      UCC_CONFIG_TYPE_ENUM(stream_task_modes)},
 
-    {"TASK_STREAM", "user",
-     "Stream for cuda task\n"
-     "user - user stream provided in execution engine context\n"
-     "ucc  - ucc library internal stream",
-     ucc_offsetof(ucc_ec_cuda_config_t, task_strm_type),
-     UCC_CONFIG_TYPE_ENUM(task_stream_types)},
-
-    {"STREAM_BLOCKING_WAIT", "1",
-     "Stream is blocked until collective operation is done",
-     ucc_offsetof(ucc_ec_cuda_config_t, stream_blocking_wait),
-     UCC_CONFIG_TYPE_UINT},
-
     {"EXEC_NUM_WORKERS", "1",
      "Number of thread blocks to use for cuda persistent executor",
      ucc_offsetof(ucc_ec_cuda_config_t, exec_num_workers),
@@ -143,38 +125,6 @@ static ucc_mpool_ops_t ucc_ec_cuda_ee_executor_mpool_ops = {
     .obj_cleanup   = ucc_ec_cuda_executor_chunk_cleanup,
 };
 
-static ucc_status_t ucc_ec_cuda_stream_req_mpool_chunk_malloc(ucc_mpool_t *mp, //NOLINT: mp is unused
-                                                              size_t *size_p,
-                                                              void ** chunk_p)
-{
-    ucc_status_t status;
-
-    status = CUDA_FUNC(cudaHostAlloc((void**)chunk_p, *size_p,
-                       cudaHostAllocMapped));
-    return status;
-}
-
-static void ucc_ec_cuda_stream_req_mpool_chunk_free(ucc_mpool_t *mp, //NOLINT: mp is unused
-                                                    void *       chunk)
-{
-    cudaFreeHost(chunk);
-}
-
-static void ucc_ec_cuda_stream_req_init(ucc_mpool_t *mp, void *obj, void *chunk) //NOLINT: mp is unused
-{
-    ucc_ec_cuda_stream_request_t *req = (ucc_ec_cuda_stream_request_t*) obj;
-
-    CUDA_FUNC(cudaHostGetDevicePointer(
-                  (void**)(&req->dev_status), (void *)&req->status, 0));
-}
-
-static ucc_mpool_ops_t ucc_ec_cuda_stream_req_mpool_ops = {
-    .chunk_alloc   = ucc_ec_cuda_stream_req_mpool_chunk_malloc,
-    .chunk_release = ucc_ec_cuda_stream_req_mpool_chunk_free,
-    .obj_init      = ucc_ec_cuda_stream_req_init,
-    .obj_cleanup   = NULL
-};
-
 static void ucc_ec_cuda_event_init(ucc_mpool_t *mp, void *obj, void *chunk) //NOLINT: mp is unused
 {
     ucc_ec_cuda_event_t *base = (ucc_ec_cuda_event_t *) obj;
@@ -196,28 +146,6 @@ static ucc_mpool_ops_t ucc_ec_cuda_event_mpool_ops = {
     .obj_cleanup   = ucc_ec_cuda_event_cleanup,
 };
 
-ucc_status_t ucc_ec_cuda_post_kernel_stream_task(uint32_t *status,
-                                                 int blocking_wait,
-                                                 cudaStream_t stream);
-
-static ucc_status_t ucc_ec_cuda_post_driver_stream_task(uint32_t *status,
-                                                        int blocking_wait,
-                                                        cudaStream_t stream)
-{
-    CUdeviceptr status_ptr  = (CUdeviceptr)status;
-
-    if (blocking_wait) {
-        CUDADRV_FUNC(cuStreamWriteValue32(stream, status_ptr,
-                                          UCC_EC_CUDA_TASK_STARTED, 0));
-        CUDADRV_FUNC(cuStreamWaitValue32(stream, status_ptr,
-                                         UCC_EC_CUDA_TASK_COMPLETED,
-                                         CU_STREAM_WAIT_VALUE_EQ));
-    }
-    CUDADRV_FUNC(cuStreamWriteValue32(stream, status_ptr,
-                                      UCC_EC_CUDA_TASK_COMPLETED_ACK, 0));
-    return UCC_OK;
-}
-
 static inline void ucc_ec_cuda_set_threads_nbr(int *nt, int maxThreadsPerBlock)
 {
     if (*nt != UCC_ULUNITS_AUTO) {
@@ -303,16 +231,6 @@ static ucc_status_t ucc_ec_cuda_init(const ucc_ec_params_t *ec_params)
         return status;
     }
 
-    /* create request pool */
-    status = ucc_mpool_init(
-        &ucc_ec_cuda.strm_reqs, 0, sizeof(ucc_ec_cuda_stream_request_t), 0,
-        UCC_CACHE_LINE_SIZE, 16, UINT_MAX, &ucc_ec_cuda_stream_req_mpool_ops,
-        UCC_THREAD_MULTIPLE, "CUDA Event Objects");
-    if (status != UCC_OK) {
-        ec_error(&ucc_ec_cuda.super, "failed to create event pool");
-        return status;
-    }
-
     status = ucc_mpool_init(
         &ucc_ec_cuda.executors, 0, sizeof(ucc_ec_cuda_executor_t), 0,
         UCC_CACHE_LINE_SIZE, 16, UINT_MAX, &ucc_ec_cuda_ee_executor_mpool_ops,
@@ -344,10 +262,8 @@ static ucc_status_t ucc_ec_cuda_init(const ucc_ec_params_t *ec_params)
 
     if (cfg->strm_task_mode == UCC_EC_CUDA_TASK_KERNEL) {
         ucc_ec_cuda.strm_task_mode = UCC_EC_CUDA_TASK_KERNEL;
-        ucc_ec_cuda.post_strm_task = ucc_ec_cuda_post_kernel_stream_task;
     } else {
         ucc_ec_cuda.strm_task_mode = UCC_EC_CUDA_TASK_MEM_OPS;
-        ucc_ec_cuda.post_strm_task = ucc_ec_cuda_post_driver_stream_task;
 #if CUDA_VERSION < 12000
         CUresult cu_st;
         CUdevice cu_dev;
@@ -370,7 +286,6 @@ static ucc_status_t ucc_ec_cuda_init(const ucc_ec_params_t *ec_params)
                 ec_info(&ucc_ec_cuda.super,
                         "CUDA MEM OPS are not supported or disabled");
                 ucc_ec_cuda.strm_task_mode = UCC_EC_CUDA_TASK_KERNEL;
-                ucc_ec_cuda.post_strm_task = ucc_ec_cuda_post_kernel_stream_task;
             }
         } else if (attr == 0) {
             ec_error(&ucc_ec_cuda.super,
@@ -391,7 +306,6 @@ static ucc_status_t ucc_ec_cuda_init(const ucc_ec_params_t *ec_params)
         }
     }
 
-    ucc_ec_cuda.task_strm_type = cfg->task_strm_type;
     ucc_spinlock_init(&ucc_ec_cuda.init_spinlock, 0);
     return UCC_OK;
 }
@@ -404,96 +318,6 @@ static ucc_status_t ucc_ec_cuda_get_attr(ucc_ec_attr_t *ec_attr)
     return UCC_OK;
 }
 
-ucc_status_t ucc_ec_cuda_task_post(void *ee_stream, void **ee_req)
-{
-    ucc_ec_cuda_config_t         *cfg = EC_CUDA_CONFIG;
-    ucc_ec_cuda_stream_request_t *req;
-    ucc_ec_cuda_event_t          *cuda_event;
-    ucc_status_t                  status;
-
-    UCC_EC_CUDA_INIT_STREAM();
-    req = ucc_mpool_get(&ucc_ec_cuda.strm_reqs);
-    if (ucc_unlikely(!req)) {
-        ec_error(&ucc_ec_cuda.super, "failed to get stream request from mpool");
-        return UCC_ERR_NO_MEMORY;
-    }
-    req->status = UCC_EC_CUDA_TASK_POSTED;
-    req->stream = (cudaStream_t)ee_stream;
-
-    if (ucc_ec_cuda.task_strm_type == UCC_EC_CUDA_USER_STREAM) {
-        status = ucc_ec_cuda.post_strm_task(req->dev_status,
-                                            cfg->stream_blocking_wait,
-                                            req->stream);
-        if (ucc_unlikely(status != UCC_OK)) {
-            goto free_req;
-        }
-    } else {
-        cuda_event = ucc_mpool_get(&ucc_ec_cuda.events);
-        if (ucc_unlikely(!cuda_event)) {
-            ec_error(&ucc_ec_cuda.super, "failed to get event from mpool");
-            status = UCC_ERR_NO_MEMORY;
-            goto free_req;
-        }
-
-        CUDA_CHECK(cudaEventRecord(cuda_event->event, req->stream));
-        CUDA_CHECK(cudaStreamWaitEvent(ucc_ec_cuda.stream, cuda_event->event, 0));
-        status = ucc_ec_cuda.post_strm_task(req->dev_status,
-                                            cfg->stream_blocking_wait,
-                                            ucc_ec_cuda.stream);
-        if (ucc_unlikely(status != UCC_OK)) {
-            goto free_event;
-        }
-        CUDA_CHECK(cudaEventRecord(cuda_event->event, ucc_ec_cuda.stream));
-        CUDA_CHECK(cudaStreamWaitEvent(req->stream, cuda_event->event, 0));
-        ucc_mpool_put(cuda_event);
-    }
-
-    *ee_req = (void *) req;
-
-    ec_info(&ucc_ec_cuda.super, "stream task posted on \"%s\" stream. req:%p",
-            task_stream_types[ucc_ec_cuda.task_strm_type], req);
-
-    return UCC_OK;
-
-free_event:
-    ucc_mpool_put(cuda_event);
-free_req:
-    ucc_mpool_put(req);
-    return status;
-}
-
-ucc_status_t ucc_ec_cuda_task_query(void *ee_req)
-{
-    ucc_ec_cuda_stream_request_t *req = ee_req;
-
-    /* ee task might be only in POSTED, STARTED or COMPLETED_ACK state
-       COMPLETED state is used by ucc_ee_cuda_task_end function to request
-       stream unblock*/
-    ucc_assert(req->status != UCC_EC_CUDA_TASK_COMPLETED);
-    if (req->status == UCC_EC_CUDA_TASK_POSTED) {
-        return UCC_INPROGRESS;
-    }
-    ec_info(&ucc_ec_cuda.super, "stream task started. req:%p", req);
-    return UCC_OK;
-}
-
-ucc_status_t ucc_ec_cuda_task_end(void *ee_req)
-{
-    ucc_ec_cuda_stream_request_t *req = ee_req;
-    volatile ucc_ec_task_status_t *st = &req->status;
-
-    /* can be safely ended only if it's in STARTED or COMPLETED_ACK state */
-    ucc_assert((*st != UCC_EC_CUDA_TASK_POSTED) &&
-               (*st != UCC_EC_CUDA_TASK_COMPLETED));
-    if (*st == UCC_EC_CUDA_TASK_STARTED) {
-        *st = UCC_EC_CUDA_TASK_COMPLETED;
-        while(*st != UCC_EC_CUDA_TASK_COMPLETED_ACK) { }
-    }
-    ucc_mpool_put(req);
-    ec_info(&ucc_ec_cuda.super, "stream task done. req:%p", req);
-    return UCC_OK;
-}
-
 ucc_status_t ucc_ec_cuda_event_create(void **event)
 {
     ucc_ec_cuda_event_t *cuda_event;
@@ -556,7 +380,6 @@ static ucc_status_t ucc_ec_cuda_finalize()
     }
 
     ucc_mpool_cleanup(&ucc_ec_cuda.events, 1);
-    ucc_mpool_cleanup(&ucc_ec_cuda.strm_reqs, 1);
     ucc_mpool_cleanup(&ucc_ec_cuda.executors, 1);
     ucc_mpool_cleanup(&ucc_ec_cuda.executor_interruptible_tasks, 1);
     ucc_mpool_cleanup(&ucc_ec_cuda.executor_persistent_tasks, 1);
@@ -579,9 +402,6 @@ ucc_ec_cuda_t ucc_ec_cuda = {
             .table  = ucc_ec_cuda_config_table,
             .size   = sizeof(ucc_ec_cuda_config_t),
         },
-    .super.ops.task_post              = ucc_ec_cuda_task_post,
-    .super.ops.task_query             = ucc_ec_cuda_task_query,
-    .super.ops.task_end               = ucc_ec_cuda_task_end,
     .super.ops.create_event           = ucc_ec_cuda_event_create,
     .super.ops.destroy_event          = ucc_ec_cuda_event_destroy,
     .super.ops.event_post             = ucc_ec_cuda_event_post,
diff --git a/src/components/ec/cuda/ec_cuda.h b/src/components/ec/cuda/ec_cuda.h
index 2305cf7416..677d14b9a7 100644
--- a/src/components/ec/cuda/ec_cuda.h
+++ b/src/components/ec/cuda/ec_cuda.h
@@ -21,19 +21,6 @@ typedef enum ucc_ec_cuda_strm_task_mode {
     UCC_EC_CUDA_TASK_LAST,
 } ucc_ec_cuda_strm_task_mode_t;
 
-typedef enum ucc_ec_cuda_task_stream_type {
-    UCC_EC_CUDA_USER_STREAM,
-    UCC_EC_CUDA_INTERNAL_STREAM,
-    UCC_EC_CUDA_TASK_STREAM_LAST
-} ucc_ec_cuda_task_stream_type_t;
-
-typedef enum ucc_ec_task_status {
-    UCC_EC_CUDA_TASK_COMPLETED,
-    UCC_EC_CUDA_TASK_POSTED,
-    UCC_EC_CUDA_TASK_STARTED,
-    UCC_EC_CUDA_TASK_COMPLETED_ACK
-} ucc_ec_task_status_t;
-
 typedef enum ucc_ec_cuda_executor_state {
     UCC_EC_CUDA_EXECUTOR_INITIALIZED,
     UCC_EC_CUDA_EXECUTOR_POSTED,
@@ -54,8 +41,6 @@ typedef ucc_status_t (*ucc_ec_cuda_task_post_fn) (uint32_t *dev_status,
 typedef struct ucc_ec_cuda_config {
     ucc_ec_config_t                super;
     ucc_ec_cuda_strm_task_mode_t   strm_task_mode;
-    ucc_ec_cuda_task_stream_type_t task_strm_type;
-    int                            stream_blocking_wait;
     unsigned long                  exec_num_workers;
     unsigned long                  exec_num_threads;
     unsigned long                  exec_max_tasks;
@@ -72,14 +57,11 @@ typedef struct ucc_ec_cuda {
     int                            exec_streams_initialized;
     cudaStream_t                  *exec_streams;
     ucc_mpool_t                    events;
-    ucc_mpool_t                    strm_reqs;
     ucc_mpool_t                    executors;
     ucc_mpool_t                    executor_interruptible_tasks;
     ucc_mpool_t                    executor_persistent_tasks;
     ucc_thread_mode_t              thread_mode;
     ucc_ec_cuda_strm_task_mode_t   strm_task_mode;
-    ucc_ec_cuda_task_stream_type_t task_strm_type;
-    ucc_ec_cuda_task_post_fn       post_strm_task;
     ucc_spinlock_t                 init_spinlock;
 } ucc_ec_cuda_t;
 
@@ -116,6 +98,7 @@ typedef struct ucc_ec_cuda_executor_task_ops {
 typedef struct ucc_ec_cuda_executor {
     ucc_ee_executor_t                super;
     ucc_ec_cuda_executor_mode_t      mode;
+    uint64_t                         requested_ops;
     ucc_ec_cuda_executor_task_ops_t  ops;
     ucc_spinlock_t                   tasks_lock;
     ucc_ec_cuda_executor_state_t     state;
diff --git a/src/components/ec/cuda/ec_cuda_executor.c b/src/components/ec/cuda/ec_cuda_executor.c
index d031e4f45a..49ae469140 100644
--- a/src/components/ec/cuda/ec_cuda_executor.c
+++ b/src/components/ec/cuda/ec_cuda_executor.c
@@ -6,14 +6,19 @@
 
 #include "ec_cuda_executor.h"
 
+ucc_status_t ucc_cuda_executor_interruptible_start(ucc_ee_executor_t *executor);
+
+ucc_status_t ucc_cuda_executor_interruptible_stop(ucc_ee_executor_t *executor);
+
 ucc_status_t ucc_cuda_executor_persistent_start(ucc_ee_executor_t *executor,
                                                 void *ee_context);
 
 ucc_status_t ucc_cuda_executor_persistent_stop(ucc_ee_executor_t *executor);
 
-ucc_status_t ucc_cuda_executor_interruptible_start(ucc_ee_executor_t *executor);
+ucc_status_t ucc_cuda_executor_persistent_wait_start(ucc_ee_executor_t *executor,
+                                                     void *ee_context);
 
-ucc_status_t ucc_cuda_executor_interruptible_stop(ucc_ee_executor_t *executor);
+ucc_status_t ucc_cuda_executor_persistent_wait_stop(ucc_ee_executor_t *executor);
 
 ucc_status_t ucc_cuda_executor_init(const ucc_ee_executor_params_t *params,
                                     ucc_ee_executor_t **executor)
@@ -25,6 +30,13 @@ ucc_status_t ucc_cuda_executor_init(const ucc_ee_executor_params_t *params,
         return UCC_ERR_NO_MEMORY;
     }
 
+    if (params->mask & UCC_EE_EXECUTOR_PARAM_FIELD_TASK_TYPES) {
+        eee->requested_ops = params->task_types;
+    } else {
+        /* if no task types provided assume all tasks types required */
+        eee->requested_ops = 1;
+    }
+
     ec_debug(&ucc_ec_cuda.super, "executor init, eee: %p", eee);
     eee->super.ee_type = params->ee_type;
     eee->state         = UCC_EC_CUDA_EXECUTOR_INITIALIZED;
@@ -90,10 +102,19 @@ ucc_status_t ucc_cuda_executor_task_finalize(ucc_ee_executor_task_t *task)
 ucc_status_t ucc_cuda_executor_start(ucc_ee_executor_t *executor,
                                      void *ee_context)
 {
+    ucc_ec_cuda_executor_t *eee = ucc_derived_of(executor,
+                                                 ucc_ec_cuda_executor_t);
+
     if (!ee_context) {
         return ucc_cuda_executor_interruptible_start(executor);
     } else {
-        return ucc_cuda_executor_persistent_start(executor, ee_context);
+        if (eee->requested_ops == 0) {
+            /* no operations requested, just mark stream busy */
+            return ucc_cuda_executor_persistent_wait_start(executor,
+                                                           ee_context);
+        } else {
+            return ucc_cuda_executor_persistent_start(executor, ee_context);
+        }
     }
 }
 
@@ -104,6 +125,10 @@ ucc_status_t ucc_cuda_executor_stop(ucc_ee_executor_t *executor)
     if (eee->mode == UCC_EC_CUDA_EXECUTOR_MODE_INTERRUPTIBLE) {
         return ucc_cuda_executor_interruptible_stop(executor);
     } else {
-        return ucc_cuda_executor_persistent_stop(executor);
+        if (eee->requested_ops == 0) {
+            return ucc_cuda_executor_persistent_wait_stop(executor);
+        } else {
+            return ucc_cuda_executor_persistent_stop(executor);
+        }
     }
 }
diff --git a/src/components/ec/cuda/ec_cuda_executor_interruptible.c b/src/components/ec/cuda/ec_cuda_executor_interruptible.c
index 55fd3addf0..e4a027f4d3 100644
--- a/src/components/ec/cuda/ec_cuda_executor_interruptible.c
+++ b/src/components/ec/cuda/ec_cuda_executor_interruptible.c
@@ -5,7 +5,6 @@
  */
 
 #include "ec_cuda_executor.h"
-#include "components/mc/ucc_mc.h"
 #include "utils/ucc_atomic.h"
 
 ucc_status_t ucc_cuda_executor_interruptible_get_stream(cudaStream_t *stream)
diff --git a/src/components/ec/cuda/ec_cuda_executor_persistent_wait.c b/src/components/ec/cuda/ec_cuda_executor_persistent_wait.c
new file mode 100644
index 0000000000..98b97f990c
--- /dev/null
+++ b/src/components/ec/cuda/ec_cuda_executor_persistent_wait.c
@@ -0,0 +1,63 @@
+/**
+ * Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ *
+ * See file LICENSE for terms.
+ */
+
+#include "ec_cuda_executor.h"
+
+ucc_status_t
+ucc_ec_cuda_post_kernel_stream_task(ucc_ec_cuda_executor_state_t *state,
+                                    cudaStream_t stream);
+
+static ucc_status_t
+ucc_ec_cuda_post_driver_stream_task(ucc_ec_cuda_executor_state_t *state,
+                                    cudaStream_t stream)
+{
+    CUdeviceptr state_ptr  = (CUdeviceptr)state;
+
+    CUDADRV_FUNC(cuStreamWriteValue32(stream, state_ptr,
+                                      UCC_EC_CUDA_EXECUTOR_STARTED, 0));
+    CUDADRV_FUNC(cuStreamWaitValue32(stream, state_ptr,
+                                     UCC_EC_CUDA_EXECUTOR_SHUTDOWN,
+                                     CU_STREAM_WAIT_VALUE_EQ));
+    CUDADRV_FUNC(cuStreamWriteValue32(stream, state_ptr,
+                                      UCC_EC_CUDA_EXECUTOR_SHUTDOWN_ACK, 0));
+    return UCC_OK;
+}
+
+ucc_status_t ucc_cuda_executor_persistent_wait_start(ucc_ee_executor_t *executor,
+                                                     void *ee_context)
+{
+    ucc_ec_cuda_executor_t *eee    = ucc_derived_of(executor,
+                                                    ucc_ec_cuda_executor_t);
+    cudaStream_t            stream = (cudaStream_t)ee_context;
+
+    eee->super.ee_context = ee_context;
+    eee->state            = UCC_EC_CUDA_EXECUTOR_POSTED;
+    eee->mode             = UCC_EC_CUDA_EXECUTOR_MODE_PERSISTENT;
+
+    ucc_memory_cpu_store_fence();
+    if (ucc_ec_cuda.strm_task_mode == UCC_EC_CUDA_TASK_KERNEL) {
+        return ucc_ec_cuda_post_kernel_stream_task(eee->dev_state, stream);
+    } else {
+        return ucc_ec_cuda_post_driver_stream_task(eee->dev_state, stream);
+    }
+}
+
+ucc_status_t ucc_cuda_executor_persistent_wait_stop(ucc_ee_executor_t *executor)
+{
+    ucc_ec_cuda_executor_t *eee = ucc_derived_of(executor,
+                                                 ucc_ec_cuda_executor_t);
+    volatile ucc_ec_cuda_executor_state_t *st = &eee->state;
+
+    ec_debug(&ucc_ec_cuda.super, "executor stop, eee: %p", eee);
+    ucc_assert((*st != UCC_EC_CUDA_EXECUTOR_POSTED) &&
+               (*st != UCC_EC_CUDA_EXECUTOR_SHUTDOWN));
+    *st = UCC_EC_CUDA_EXECUTOR_SHUTDOWN;
+    while(*st != UCC_EC_CUDA_EXECUTOR_SHUTDOWN_ACK) { }
+    eee->super.ee_context = NULL;
+    eee->state = UCC_EC_CUDA_EXECUTOR_INITIALIZED;
+
+    return UCC_OK;
+}
diff --git a/src/components/ec/cuda/kernel/ec_cuda_wait_kernel.cu b/src/components/ec/cuda/kernel/ec_cuda_wait_kernel.cu
index cfbe7f29eb..89743b91d9 100644
--- a/src/components/ec/cuda/kernel/ec_cuda_wait_kernel.cu
+++ b/src/components/ec/cuda/kernel/ec_cuda_wait_kernel.cu
@@ -17,18 +17,14 @@ extern "C" {
 }
 #endif
 
-__global__ void wait_kernel(volatile uint32_t *status) {
-    ucc_status_t st;
-    *status = UCC_EC_CUDA_TASK_STARTED;
-    do {
-        st = (ucc_status_t)*status;
-    } while(st != UCC_EC_CUDA_TASK_COMPLETED);
-    *status = UCC_EC_CUDA_TASK_COMPLETED_ACK;
-    return;
-}
+__global__ void wait_kernel(volatile ucc_ec_cuda_executor_state_t *state) {
+    ucc_ec_cuda_executor_state_t st;
 
-__global__ void wait_kernel_nb(volatile uint32_t *status) {
-    *status = UCC_EC_CUDA_TASK_COMPLETED_ACK;
+    *state = UCC_EC_CUDA_EXECUTOR_STARTED;
+    do {
+        st = *state;
+    } while (st != UCC_EC_CUDA_EXECUTOR_SHUTDOWN);
+    *state = UCC_EC_CUDA_EXECUTOR_SHUTDOWN_ACK;
     return;
 }
 
@@ -36,15 +32,11 @@ __global__ void wait_kernel_nb(volatile uint32_t *status) {
 extern "C" {
 #endif
 
-ucc_status_t ucc_ec_cuda_post_kernel_stream_task(uint32_t *status,
-                                                 int blocking_wait,
-                                                 cudaStream_t stream)
+ucc_status_t
+ucc_ec_cuda_post_kernel_stream_task(ucc_ec_cuda_executor_state_t *state,
+                                    cudaStream_t stream)
 {
-    if (blocking_wait) {
-        wait_kernel<<<1, 1, 0, stream>>>(status);
-    } else {
-        wait_kernel_nb<<<1, 1, 0, stream>>>(status);
-    }
+    wait_kernel<<<1, 1, 0, stream>>>(state);
     CUDA_CHECK(cudaGetLastError());
     return UCC_OK;
 }
diff --git a/src/components/ec/ucc_ec.c b/src/components/ec/ucc_ec.c
index 093ba381dd..f3c6dadfb2 100644
--- a/src/components/ec/ucc_ec.c
+++ b/src/components/ec/ucc_ec.c
@@ -119,25 +119,6 @@ ucc_status_t ucc_ec_finalize()
     return UCC_OK;
 }
 
-ucc_status_t ucc_ec_task_post(void *ee_context, ucc_ee_type_t ee_type,
-                              void **ee_task)
-{
-    UCC_CHECK_EC_AVAILABLE(ee_type);
-    return ec_ops[ee_type]->task_post(ee_context, ee_task);
-}
-
-ucc_status_t ucc_ec_task_query(void *ee_task, ucc_ee_type_t ee_type)
-{
-    UCC_CHECK_EC_AVAILABLE(ee_type);
-    return ec_ops[ee_type]->task_query(ee_task);
-}
-
-ucc_status_t ucc_ec_task_end(void *ee_task, ucc_ee_type_t ee_type)
-{
-    UCC_CHECK_EC_AVAILABLE(ee_type);
-    return ec_ops[ee_type]->task_end(ee_task);
-}
-
 ucc_status_t ucc_ec_create_event(void **event, ucc_ee_type_t ee_type)
 {
     UCC_CHECK_EC_AVAILABLE(ee_type);
diff --git a/src/components/ec/ucc_ec.h b/src/components/ec/ucc_ec.h
index b9cfb5334a..9d078ef00b 100644
--- a/src/components/ec/ucc_ec.h
+++ b/src/components/ec/ucc_ec.h
@@ -17,13 +17,6 @@ ucc_status_t ucc_ec_get_attr(ucc_ec_attr_t *attr);
 
 ucc_status_t ucc_ec_finalize();
 
-ucc_status_t ucc_ec_task_post(void *ee_context, ucc_ee_type_t ee_type,
-                              void **ee_task);
-
-ucc_status_t ucc_ec_task_query(void *ee_task, ucc_ee_type_t ee_type);
-
-ucc_status_t ucc_ec_task_end(void *ee_task, ucc_ee_type_t ee_type);
-
 ucc_status_t ucc_ec_create_event(void **event, ucc_ee_type_t ee_type);
 
 ucc_status_t ucc_ec_destroy_event(void *event, ucc_ee_type_t ee_type);
diff --git a/src/core/ucc_coll.c b/src/core/ucc_coll.c
index 8c6508a4f4..ad4890e442 100644
--- a/src/core/ucc_coll.c
+++ b/src/core/ucc_coll.c
@@ -402,8 +402,10 @@ static void ucc_trigger_test(ucc_coll_task_t *task)
             /* triggered task doesn't need executor, init and start executor on
              * trigger task
              */
-            params.mask    = UCC_EE_EXECUTOR_PARAM_FIELD_TYPE;
-            params.ee_type = task->ee->ee_type;
+            params.mask       = UCC_EE_EXECUTOR_PARAM_FIELD_TYPE |
+                                UCC_EE_EXECUTOR_PARAM_FIELD_TASK_TYPES;
+            params.ee_type    = task->ee->ee_type;
+            params.task_types = 0;
             status = ucc_ee_executor_init(&params, &task->executor);
             if (ucc_unlikely(status != UCC_OK)) {
                 ucc_error("error in ee executor init, %s",
diff --git a/tools/perf/ucc_pt_comm.cc b/tools/perf/ucc_pt_comm.cc
index 4cff62f71e..1f6a91a651 100644
--- a/tools/perf/ucc_pt_comm.cc
+++ b/tools/perf/ucc_pt_comm.cc
@@ -69,7 +69,8 @@ ucc_ee_h ucc_pt_comm::get_ee()
                 throw std::runtime_error(ucc_status_string(status));
             }
         } else {
-            std::cerr << "execution engine is not supported for given memory type";
+            std::cerr << "execution engine is not supported for given memory type"
+                      << std::endl;
             throw std::runtime_error("not supported");
         }
     }
@@ -91,7 +92,8 @@ ucc_ee_executor_t* ucc_pt_comm::get_executor()
         } else if (cfg.mt == UCC_MEMORY_TYPE_ROCM) {
             executor_params.ee_type = UCC_EE_ROCM_STREAM;
         } else {
-            std::cerr << "executor is not supported for given memory type";
+            std::cerr << "executor is not supported for given memory type"
+                      << std::endl;
             throw std::runtime_error("not supported");
         }
         status = ucc_ee_executor_init(&executor_params, &executor);
@@ -197,7 +199,8 @@ ucc_status_t ucc_pt_comm::finalize()
         if (cfg.mt == UCC_MEMORY_TYPE_CUDA) {
             ucc_pt_cudaStreamDestroy((cudaStream_t)stream);
         } else {
-            std::cerr << "execution engine is not supported for given memory type";
+            std::cerr << "execution engine is not supported for given memory type"
+                      << std::endl;
             throw std::runtime_error("not supported");
         }
     }

From 40a1c795b21001b82a4b5e1409b197eee4d98d28 Mon Sep 17 00:00:00 2001
From: Sergey Lebedev <sergeyle@nvidia.com>
Date: Fri, 16 Dec 2022 08:32:53 +0000
Subject: [PATCH 2/2] EC/ROCM: remove unused functions

---
 src/components/ec/rocm/ec_rocm.c              | 234 +-----------------
 src/components/ec/rocm/ec_rocm.h              |  19 --
 src/components/ec/rocm/kernel/Makefile.am     |   7 +-
 .../ec/rocm/kernel/ec_rocm_wait_kernel.cu     |  44 ----
 4 files changed, 4 insertions(+), 300 deletions(-)
 delete mode 100644 src/components/ec/rocm/kernel/ec_rocm_wait_kernel.cu

diff --git a/src/components/ec/rocm/ec_rocm.c b/src/components/ec/rocm/ec_rocm.c
index 8c86384a4d..32b31cf48e 100644
--- a/src/components/ec/rocm/ec_rocm.c
+++ b/src/components/ec/rocm/ec_rocm.c
@@ -13,43 +13,10 @@
 #include <hsa/hsa.h>
 #include <hsa/hsa_ext_amd.h>
 
-static const char *stream_task_modes[] = {
-    [UCC_EC_ROCM_TASK_KERNEL]  = "kernel",
-    [UCC_EC_ROCM_TASK_MEM_OPS] = "driver",
-    [UCC_EC_ROCM_TASK_AUTO]    = "auto",
-    [UCC_EC_ROCM_TASK_LAST]    = NULL
-};
-
-static const char *task_stream_types[] = {
-    [UCC_EC_ROCM_USER_STREAM]      = "user",
-    [UCC_EC_ROCM_INTERNAL_STREAM]  = "ucc",
-    [UCC_EC_ROCM_TASK_STREAM_LAST] = NULL
-};
-
 static ucc_config_field_t ucc_ec_rocm_config_table[] = {
     {"", "", NULL, ucc_offsetof(ucc_ec_rocm_config_t, super),
      UCC_CONFIG_TYPE_TABLE(ucc_ec_config_table)},
 
-    {"STREAM_TASK_MODE", "auto",
-     "Mechanism to create stream dependency\n"
-     "kernel - use waiting kernel\n"
-     "driver - use driver MEM_OPS\n"
-     "auto   - runtime automatically chooses best one",
-     ucc_offsetof(ucc_ec_rocm_config_t, strm_task_mode),
-     UCC_CONFIG_TYPE_ENUM(stream_task_modes)},
-
-    {"TASK_STREAM", "user",
-     "Stream for rocm task\n"
-     "user - user stream provided in execution engine context\n"
-     "ucc  - ucc library internal stream",
-     ucc_offsetof(ucc_ec_rocm_config_t, task_strm_type),
-     UCC_CONFIG_TYPE_ENUM(task_stream_types)},
-
-    {"STREAM_BLOCKING_WAIT", "1",
-     "Stream is blocked until collective operation is done",
-     ucc_offsetof(ucc_ec_rocm_config_t, stream_blocking_wait),
-     UCC_CONFIG_TYPE_UINT},
-
     {"EXEC_NUM_WORKERS", "1",
      "Number of thread blocks to use for rocm executor",
      ucc_offsetof(ucc_ec_rocm_config_t, exec_num_workers),
@@ -138,7 +105,6 @@ static void ucc_ec_rocm_executor_chunk_cleanup(ucc_mpool_t *mp, void *obj) //NOL
     }
 }
 
-
 static ucc_mpool_ops_t ucc_ec_rocm_ee_executor_mpool_ops = {
     .chunk_alloc   = ucc_ec_rocm_ee_executor_mpool_chunk_malloc,
     .chunk_release = ucc_ec_rocm_ee_executor_mpool_chunk_free,
@@ -146,40 +112,7 @@ static ucc_mpool_ops_t ucc_ec_rocm_ee_executor_mpool_ops = {
     .obj_cleanup   = ucc_ec_rocm_executor_chunk_cleanup,
 };
 
-
-static ucc_status_t ucc_ec_rocm_stream_req_mpool_chunk_malloc(ucc_mpool_t *mp, //NOLINT: mp is unused
-                                                              size_t *size_p,
-                                                              void ** chunk_p)
-{
-    ucc_status_t status;
-
-    status = ROCM_FUNC(hipHostMalloc((void**)chunk_p, *size_p,
-                       hipHostMallocMapped));
-    return status;
-}
-
-static void ucc_ec_rocm_stream_req_mpool_chunk_free(ucc_mpool_t *mp, //NOLINT: mp is unused
-                                                    void *       chunk)
-{
-    hipHostFree(chunk);
-}
-
-static void ucc_ec_rocm_stream_req_init(ucc_mpool_t *mp, void *obj, void *chunk) //NOLINT: mp is unused
-{
-    ucc_ec_rocm_stream_request_t *req = (ucc_ec_rocm_stream_request_t*) obj;
-
-    ROCM_FUNC(hipHostGetDevicePointer(
-                  (void**)(&req->dev_status), (void *)&req->status, 0));
-}
-
-static ucc_mpool_ops_t ucc_ec_rocm_stream_req_mpool_ops = {
-    .chunk_alloc   = ucc_ec_rocm_stream_req_mpool_chunk_malloc,
-    .chunk_release = ucc_ec_rocm_stream_req_mpool_chunk_free,
-    .obj_init      = ucc_ec_rocm_stream_req_init,
-    .obj_cleanup   = NULL
-};
-
-static void ucc_ec_rocm_event_init(ucc_mpool_t *mp, void *obj, void *chunk) //NOLINT: mp is unused
+static void ucc_ec_rocm_event_init(ucc_mpool_t *mp, void *obj, void *chunk)
 {
     ucc_ec_rocm_event_t *base = (ucc_ec_rocm_event_t *) obj;
 
@@ -206,37 +139,12 @@ static ucc_mpool_ops_t ucc_ec_rocm_event_mpool_ops = {
     .obj_cleanup   = ucc_ec_rocm_event_cleanup,
 };
 
-ucc_status_t ucc_ec_rocm_post_kernel_stream_task(uint32_t *status,
-                                                 int blocking_wait,
-                                                 hipStream_t stream);
-
-static ucc_status_t ucc_ec_rocm_post_driver_stream_task(uint32_t *status,
-                                                        int blocking_wait,
-                                                        hipStream_t stream)
-{
-    hipDeviceptr_t status_ptr  = (hipDeviceptr_t)status;
-
-    if (blocking_wait) {
-        ROCM_FUNC(hipStreamWriteValue32(stream, status_ptr,
-                                        UCC_EC_ROCM_TASK_STARTED, 0));
-        ROCM_FUNC(hipStreamWaitValue32(stream, status_ptr,
-                                       UCC_EC_ROCM_TASK_COMPLETED,
-                                       hipStreamWaitValueEq, 0xFFFFFFFF));
-    }
-    ROCM_FUNC(hipStreamWriteValue32(stream, status_ptr,
-                                    UCC_EC_ROCM_TASK_COMPLETED_ACK, 0));
-    return UCC_OK;
-}
-
 static ucc_status_t ucc_ec_rocm_init(const ucc_ec_params_t *ec_params)
 {
     ucc_ec_rocm_config_t *cfg = EC_ROCM_CONFIG;
     ucc_status_t          status;
     int                   device, num_devices;
-    int                   attr=0;
     hipError_t            rocm_st;
-    hipDevice_t           hip_dev;
-    const char           *hip_err_st_str;
     hipDeviceProp_t       prop;
 
     ucc_ec_rocm.stream                   = NULL;
@@ -290,16 +198,6 @@ static ucc_status_t ucc_ec_rocm_init(const ucc_ec_params_t *ec_params)
         return status;
     }
 
-    /* create request pool */
-    status = ucc_mpool_init(
-        &ucc_ec_rocm.strm_reqs, 0, sizeof(ucc_ec_rocm_stream_request_t), 0,
-        UCC_CACHE_LINE_SIZE, 16, UINT_MAX, &ucc_ec_rocm_stream_req_mpool_ops,
-        UCC_THREAD_MULTIPLE, "ROCM Event Objects");
-    if (status != UCC_OK) {
-        ec_error(&ucc_ec_rocm.super, "failed to create stream pool");
-        return status;
-    }
-
     status = ucc_mpool_init(
         &ucc_ec_rocm.executors, 0, sizeof(ucc_ec_rocm_executor_t), 0,
         UCC_CACHE_LINE_SIZE, 16, UINT_MAX, &ucc_ec_rocm_ee_executor_mpool_ops,
@@ -314,44 +212,7 @@ static ucc_status_t ucc_ec_rocm_init(const ucc_ec_params_t *ec_params)
         sizeof(ucc_ec_rocm_executor_interruptible_task_t), 0, UCC_CACHE_LINE_SIZE,
         16, UINT_MAX, NULL, UCC_THREAD_MULTIPLE,
         "interruptible executor tasks");
-    if (status != UCC_OK) {
-        ec_error(&ucc_ec_rocm.super, "failed to create interruptible tasks pool");
-        return status;
-    }
-    
-    if (cfg->strm_task_mode == UCC_EC_ROCM_TASK_KERNEL) {
-        ucc_ec_rocm.strm_task_mode = UCC_EC_ROCM_TASK_KERNEL;
-        ucc_ec_rocm.post_strm_task = ucc_ec_rocm_post_kernel_stream_task;
-    } else {
-        ucc_ec_rocm.strm_task_mode = UCC_EC_ROCM_TASK_MEM_OPS;
-        ucc_ec_rocm.post_strm_task = ucc_ec_rocm_post_driver_stream_task;
-
-        rocm_st = hipCtxGetDevice(&hip_dev);
-        if (rocm_st != hipSuccess){
-            hip_err_st_str = hipGetErrorString(rocm_st);
-            ec_debug(&ucc_ec_rocm.super, "hipCtxGetDevice() failed: %s",
-                     hip_err_st_str);
-            attr = 0;
-        } else {
-            ROCM_FUNC(hipDeviceGetAttribute(&attr,
-		      hipDeviceAttributeCanUseStreamWaitValue,
-                      hip_dev));
-        }
-        if (cfg->strm_task_mode == UCC_EC_ROCM_TASK_AUTO) {
-            if (attr == 0) {
-                ec_info(&ucc_ec_rocm.super,
-                        "ROCm MEM OPS are not supported or disabled");
-                ucc_ec_rocm.strm_task_mode = UCC_EC_ROCM_TASK_KERNEL;
-                ucc_ec_rocm.post_strm_task = ucc_ec_rocm_post_kernel_stream_task;
-            }
-        } else if (attr == 0) {
-            ec_error(&ucc_ec_rocm.super,
-                     "ROCm MEM OPS are not supported or disabled");
-            return UCC_ERR_NOT_SUPPORTED;
-        }
-    }
 
-    ucc_ec_rocm.task_strm_type = cfg->task_strm_type;
     ucc_spinlock_init(&ucc_ec_rocm.init_spinlock, 0);
     return UCC_OK;
 }
@@ -364,95 +225,6 @@ static ucc_status_t ucc_ec_rocm_get_attr(ucc_ec_attr_t *ec_attr)
     return UCC_OK;
 }
 
-ucc_status_t ucc_ec_rocm_task_post(void *ee_stream, void **ee_req)
-{
-    ucc_ec_rocm_config_t         *cfg = EC_ROCM_CONFIG;
-    ucc_ec_rocm_stream_request_t *req;
-    ucc_ec_rocm_event_t          *rocm_event;
-    ucc_status_t                  status = UCC_OK;
-
-    UCC_EC_ROCM_INIT_STREAM();
-    req = ucc_mpool_get(&ucc_ec_rocm.strm_reqs);
-    if (ucc_unlikely(!req)) {
-        ec_error(&ucc_ec_rocm.super, "Failed to allocate stream request");
-	return UCC_ERR_NO_MEMORY;
-    }
-    req->status = UCC_EC_ROCM_TASK_POSTED;
-    req->stream = (hipStream_t)ee_stream;
-
-    if (ucc_ec_rocm.task_strm_type == UCC_EC_ROCM_USER_STREAM) {
-        status = ucc_ec_rocm.post_strm_task(req->dev_status,
-                                            cfg->stream_blocking_wait,
-                                            req->stream);
-        if (status != UCC_OK) {
-            goto free_req;
-        }
-    } else {
-        rocm_event = ucc_mpool_get(&ucc_ec_rocm.events);
-        if (ucc_unlikely(!rocm_event)) {
-	    ec_error(&ucc_ec_rocm.super, "Failed to allocate rocm event");
-	    status = UCC_ERR_NO_MEMORY;
-	    goto free_req;
-	}
-        ROCMCHECK(hipEventRecord(rocm_event->event, req->stream));
-        ROCMCHECK(hipStreamWaitEvent(ucc_ec_rocm.stream, rocm_event->event, 0));
-        status = ucc_ec_rocm.post_strm_task(req->dev_status,
-                                            cfg->stream_blocking_wait,
-                                            ucc_ec_rocm.stream);
-        if (ucc_unlikely(status != UCC_OK)) {
-            goto free_event;
-        }
-        ROCMCHECK(hipEventRecord(rocm_event->event, ucc_ec_rocm.stream));
-        ROCMCHECK(hipStreamWaitEvent(req->stream, rocm_event->event, 0));
-        ucc_mpool_put(rocm_event);
-    }
-
-    *ee_req = (void *) req;
-
-    ec_info(&ucc_ec_rocm.super, "ROCM stream task posted on \"%s\" stream. req:%p",
-            task_stream_types[ucc_ec_rocm.task_strm_type], req);
-
-    return UCC_OK;
-
-free_event:
-    ucc_mpool_put(rocm_event);
-free_req:
-    ucc_mpool_put(req);
-    return status;
-}
-
-ucc_status_t ucc_ec_rocm_task_query(void *ee_req)
-{
-    ucc_ec_rocm_stream_request_t *req = ee_req;
-
-    /* ee task might be only in POSTED, STARTED or COMPLETED_ACK state
-       COMPLETED state is used by ucc_ee_rocm_task_end function to request
-       stream unblock*/
-    ucc_assert(req->status != UCC_EC_ROCM_TASK_COMPLETED);
-    if (req->status == UCC_EC_ROCM_TASK_POSTED) {
-        return UCC_INPROGRESS;
-    }
-    ec_info(&ucc_ec_rocm.super, "ROCM stream task started. req:%p", req);
-    return UCC_OK;
-}
-
-ucc_status_t ucc_ec_rocm_task_end(void *ee_req)
-{
-    ucc_ec_rocm_stream_request_t *req = ee_req;
-    volatile ucc_ec_task_status_t *st = &req->status;
-
-    /* can be safely ended only if it's in STARTED or COMPLETED_ACK state */
-    ucc_assert((*st != UCC_EC_ROCM_TASK_POSTED) &&
-               (*st != UCC_EC_ROCM_TASK_COMPLETED));
-    if (*st == UCC_EC_ROCM_TASK_STARTED) {
-        *st = UCC_EC_ROCM_TASK_COMPLETED;
-        while(*st != UCC_EC_ROCM_TASK_COMPLETED_ACK) { }
-    }
-    ucc_mpool_put(req);
-    ec_info(&ucc_ec_rocm.super, "ROCM stream task done. req:%p", req);
-    return UCC_OK;
-}
-
 ucc_status_t ucc_ec_rocm_event_create(void **event)
 {
     ucc_ec_rocm_event_t *rocm_event;
@@ -512,7 +284,6 @@ static ucc_status_t ucc_ec_rocm_finalize()
     }
 
     ucc_mpool_cleanup(&ucc_ec_rocm.events, 1);
-    ucc_mpool_cleanup(&ucc_ec_rocm.strm_reqs, 1);
     ucc_mpool_cleanup(&ucc_ec_rocm.executors, 1);
     ucc_free(ucc_ec_rocm.exec_streams);
     return UCC_OK;
@@ -532,9 +303,6 @@ ucc_ec_rocm_t ucc_ec_rocm = {
             .table  = ucc_ec_rocm_config_table,
             .size   = sizeof(ucc_ec_rocm_config_t),
         },
-    .super.ops.task_post              = ucc_ec_rocm_task_post,
-    .super.ops.task_query             = ucc_ec_rocm_task_query,
-    .super.ops.task_end               = ucc_ec_rocm_task_end,
     .super.ops.create_event           = ucc_ec_rocm_event_create,
     .super.ops.destroy_event          = ucc_ec_rocm_event_destroy,
     .super.ops.event_post             = ucc_ec_rocm_event_post,
diff --git a/src/components/ec/rocm/ec_rocm.h b/src/components/ec/rocm/ec_rocm.h
index c22370df03..26bc81ba3d 100644
--- a/src/components/ec/rocm/ec_rocm.h
+++ b/src/components/ec/rocm/ec_rocm.h
@@ -19,19 +19,6 @@
 #include <hip_fp16.h>
 #include <hip_bfloat16.h>
 
-typedef enum ucc_ec_rocm_strm_task_mode {
-    UCC_EC_ROCM_TASK_KERNEL,
-    UCC_EC_ROCM_TASK_MEM_OPS,
-    UCC_EC_ROCM_TASK_AUTO,
-    UCC_EC_ROCM_TASK_LAST,
-} ucc_ec_rocm_strm_task_mode_t;
-
-typedef enum ucc_ec_rocm_task_stream_type {
-    UCC_EC_ROCM_USER_STREAM,
-    UCC_EC_ROCM_INTERNAL_STREAM,
-    UCC_EC_ROCM_TASK_STREAM_LAST
-} ucc_ec_rocm_task_stream_type_t;
-
 typedef enum ucc_ec_task_status {
     UCC_EC_ROCM_TASK_COMPLETED,
     UCC_EC_ROCM_TASK_POSTED,
@@ -72,9 +59,6 @@ typedef ucc_status_t (*ucc_ec_rocm_task_post_fn) (uint32_t *dev_status,
 
 typedef struct ucc_ec_rocm_config {
     ucc_ec_config_t                super;
-    ucc_ec_rocm_strm_task_mode_t   strm_task_mode;
-    ucc_ec_rocm_task_stream_type_t task_strm_type;
-    int                            stream_blocking_wait;
     unsigned long                  exec_num_workers;
     unsigned long                  exec_num_threads;
     unsigned long                  exec_max_tasks;
@@ -96,9 +80,6 @@ typedef struct ucc_ec_rocm {
     ucc_mpool_t                    executors;
     ucc_mpool_t                    executor_interruptible_tasks;
     ucc_thread_mode_t              thread_mode;
-    ucc_ec_rocm_strm_task_mode_t   strm_task_mode;
-    ucc_ec_rocm_task_stream_type_t task_strm_type;
-    ucc_ec_rocm_task_post_fn       post_strm_task;
     ucc_spinlock_t                 init_spinlock;
     ucc_ee_executor_t             *cpu_executor;
 } ucc_ec_rocm_t;
diff --git a/src/components/ec/rocm/kernel/Makefile.am b/src/components/ec/rocm/kernel/Makefile.am
index ee36579358..6f95d2b33b 100644
--- a/src/components/ec/rocm/kernel/Makefile.am
+++ b/src/components/ec/rocm/kernel/Makefile.am
@@ -17,15 +17,14 @@ HIPCCFLAGS =                                     \
 LINK = $(LIBTOOL) --mode=link $(CC) -o $@
 
 .cu.o:
-	$(HIPCC) -c $< -o $@  $(HIPCCFLAGS) 
+	$(HIPCC) -c $< -o $@  $(HIPCCFLAGS)
 
 .cu.lo:
-	/bin/bash $(top_srcdir)/cuda_lt.sh "$(LIBTOOL)" $@ $(HIPCC) -c  $< $(HIPCCFLAGS) 
+	/bin/bash $(top_srcdir)/cuda_lt.sh "$(LIBTOOL)" $@ $(HIPCC) -c  $< $(HIPCCFLAGS)
 
 comp_noinst = libucc_ec_rocm_kernels.la
 
-libucc_ec_rocm_kernels_la_SOURCES  = ec_rocm_wait_kernel.cu     \
-                                     ec_rocm_executor_kernel.cu \
+libucc_ec_rocm_kernels_la_SOURCES  = ec_rocm_executor_kernel.cu \
                                      ec_rocm_reduce.cu
 libucc_ec_rocm_kernels_la_CPPFLAGS =
 
diff --git a/src/components/ec/rocm/kernel/ec_rocm_wait_kernel.cu b/src/components/ec/rocm/kernel/ec_rocm_wait_kernel.cu
deleted file mode 100644
index b33edff060..0000000000
--- a/src/components/ec/rocm/kernel/ec_rocm_wait_kernel.cu
+++ /dev/null
@@ -1,44 +0,0 @@
-/**
- * Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * Copyright (C) Advanced Micro Devices, Inc. 2022. ALL RIGHTS RESERVED.
- *
- * See file LICENSE for terms.
- */
-
-#include "ec_rocm.h"
-
-__global__ void wait_kernel(volatile uint32_t *status) {
-    ucc_status_t st;
-    *status = UCC_EC_ROCM_TASK_STARTED;
-    do {
-        st = (ucc_status_t)*status;
-    } while(st != (ucc_status_t)UCC_EC_ROCM_TASK_COMPLETED);
-    *status = UCC_EC_ROCM_TASK_COMPLETED_ACK;
-    return;
-}
-
-__global__ void wait_kernel_nb(volatile uint32_t *status) {
-    *status = UCC_EC_ROCM_TASK_COMPLETED_ACK;
-    return;
-}
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-ucc_status_t ucc_ec_rocm_post_kernel_stream_task(uint32_t *status,
-                                                 int blocking_wait,
-                                                 hipStream_t stream)
-{
-    if (blocking_wait) {
-        wait_kernel<<<1, 1, 0, stream>>>(status);
-    } else {
-        wait_kernel_nb<<<1, 1, 0, stream>>>(status);
-    }
-    ROCMCHECK(hipGetLastError());
-    return UCC_OK;
-}
-
-#ifdef __cplusplus
-}
-#endif