ipc4: Add cross-core binding support
Implements binding of two pipelines from different cores so that a stream can
travel cross-core.

The feature is disabled by default; set CONFIG_CROSS_CORE_STREAM=y to
enable it.

Signed-off-by: Serhiy Katsyuba <[email protected]>
serhiy-katsyuba-intel committed Oct 17, 2023
1 parent e08b2c2 commit 6281fec
Showing 4 changed files with 214 additions and 23 deletions.
25 changes: 25 additions & 0 deletions src/include/sof/schedule/ll_schedule_domain.h
@@ -44,6 +44,17 @@ struct ll_schedule_domain_ops {
struct task *task, uint32_t num_tasks);
void (*domain_enable)(struct ll_schedule_domain *domain, int core);
void (*domain_disable)(struct ll_schedule_domain *domain, int core);
#if CONFIG_CROSS_CORE_STREAM
/*
* Unlike domain_disable(), these are intended to temporarily block LL from
* starting its next cycle. Triggering (e.g., by means of a timer interrupt)
* is still enabled and registered, but execution of the next cycle is blocked.
* Once unblocked, if a trigger was registered while in the blocked state,
* the next cycle may start immediately.
*/
void (*domain_block)(struct ll_schedule_domain *domain);
void (*domain_unblock)(struct ll_schedule_domain *domain);
#endif
void (*domain_set)(struct ll_schedule_domain *domain, uint64_t start);
void (*domain_clear)(struct ll_schedule_domain *domain);
bool (*domain_is_pending)(struct ll_schedule_domain *domain,
@@ -192,6 +203,20 @@ static inline void domain_disable(struct ll_schedule_domain *domain, int core)
}
}

#if CONFIG_CROSS_CORE_STREAM
static inline void domain_block(struct ll_schedule_domain *domain)
{
if (domain->ops->domain_block)
domain->ops->domain_block(domain);
}

static inline void domain_unblock(struct ll_schedule_domain *domain)
{
if (domain->ops->domain_unblock)
domain->ops->domain_unblock(domain);
}
#endif

static inline bool domain_is_pending(struct ll_schedule_domain *domain,
struct task *task, struct comp_dev **comp)
{
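For illustration, a minimal sketch of how a caller might use the new ops through the inline wrappers above. This is a hypothetical caller, not part of this commit (the function name is made up; sof_get()->platform_timer_domain is the domain the commit itself uses); the real usage added by this commit is in src/ipc/ipc4/helper.c below.

/* Hypothetical caller sketch: keep the timer domain from starting its next
 * LL cycle while touching state shared with LL tasks on another core.
 */
static void reconfigure_with_ll_blocked(void)
{
	struct ll_schedule_domain *domain = sof_get()->platform_timer_domain;

	domain_block(domain);	/* the next LL cycle will not start until unblocked */

	/* ... modify cross-core shared state here ... */

	domain_unblock(domain);	/* a pending trigger may start the next cycle immediately */
}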
155 changes: 132 additions & 23 deletions src/ipc/ipc4/helper.c
@@ -337,10 +337,78 @@ static struct comp_buffer *ipc4_create_buffer(struct comp_dev *src, bool is_shar
ipc_buf.size = buf_size;
ipc_buf.comp.id = IPC4_COMP_ID(src_queue, dst_queue);
ipc_buf.comp.pipeline_id = src->ipc_config.pipeline_id;
ipc_buf.comp.core = src->ipc_config.core;
ipc_buf.comp.core = cpu_get_id();
return buffer_new(&ipc_buf, is_shared);
}

#if CONFIG_CROSS_CORE_STREAM
/*
* Disabling interrupts to block the next LL cycle is much faster than using
* a condition variable and mutex. Since same-core binding is the most typical
* case, the slower cond_var blocking mechanism is used only for the less
* common cross-core binding.
*
* Note that disabling interrupts to block LL does not work for the cross-core
* binding case, because the .bind() handlers are called on the corresponding
* cores via IDC tasks, and IDC requires interrupts to be enabled. Disabling
* only the timer interrupt instead of all interrupts might work. However, as
* the CPU could enter a power-down mode while waiting for the blocking IDC
* call response, it is not clear how safe it is to assume the CPU can wake up
* without the timer interrupt; that depends on the blocking IDC wait
* implementation. That is why the additional cond_var mechanism for blocking
* LL, which does not disable any interrupts, was introduced.
*/

#define ll_block(cross_core_bind) \
do { \
if (cross_core_bind) \
domain_block(sof_get()->platform_timer_domain); \
else \
irq_local_disable(flags); \
} while (0)

#define ll_unblock(cross_core_bind) \
do { \
if (cross_core_bind) \
domain_unblock(sof_get()->platform_timer_domain); \
else \
irq_local_enable(flags); \
} while (0)

/* Calling both ll_block() and ll_wait_finished_on_core() makes sure LL will not start its
* next cycle and that its current cycle on the specified core has finished.
*/
static int ll_wait_finished_on_core(struct comp_dev *dev)
{
/* To make sure (blocked) LL has finished its current cycle, it is
* enough to send any blocking IDC to the core. Since the IDC task has lower
* priority than the LL thread and cannot preempt it, execution of the IDC task
* happens when the LL thread is not active, i.e. while it waits for its next cycle.
*/

int ret;
struct ipc4_base_module_cfg dummy;

if (cpu_is_me(dev->ipc_config.core))
return 0;

/* Any blocking IDC that does not change component state could be utilized */
ret = comp_ipc4_get_attribute_remote(dev, COMP_ATTR_BASE_CONFIG, &dummy);
if (ret < 0) {
tr_err(&ipc_tr, "comp_ipc4_get_attribute_remote() failed for module %#x",
dev_comp_id(dev));
return ret;
}

return 0;
}

#else

#define ll_block(cross_core_bind) irq_local_disable(flags)
#define ll_unblock(cross_core_bind) irq_local_enable(flags)

#endif

int ipc_comp_connect(struct ipc *ipc, ipc_pipe_comp_connect *_connect)
{
struct ipc4_module_bind_unbind *bu;
@@ -364,14 +432,15 @@ int ipc_comp_connect(struct ipc *ipc, ipc_pipe_comp_connect *_connect)
return IPC4_INVALID_RESOURCE_ID;
}

bool is_shared = source->ipc_config.core != sink->ipc_config.core;
bool cross_core_bind = source->ipc_config.core != sink->ipc_config.core;

/* Pass IPC to target core if the buffer won't be shared and will be used
* on different core
/* If both components are on the same core -- process IPC on that core,
* otherwise stay on core 0
*/
if (!cpu_is_me(source->ipc_config.core) && !is_shared)
if (!cpu_is_me(source->ipc_config.core) && !cross_core_bind)
return ipc4_process_on_core(source->ipc_config.core, false);

/* these might call comp_ipc4_get_attribute_remote() if necessary */
ret = comp_get_attribute(source, COMP_ATTR_BASE_CONFIG, &source_src_cfg);
if (ret < 0) {
tr_err(&ipc_tr, "failed to get base config for module %#x", dev_comp_id(source));
@@ -397,7 +466,7 @@ int ipc_comp_connect(struct ipc *ipc, ipc_pipe_comp_connect *_connect)
else
buf_size = sink_src_cfg.ibs * 2;

buffer = ipc4_create_buffer(source, is_shared, buf_size, bu->extension.r.src_queue,
buffer = ipc4_create_buffer(source, cross_core_bind, buf_size, bu->extension.r.src_queue,
bu->extension.r.dst_queue);
if (!buffer) {
tr_err(&ipc_tr, "failed to allocate buffer to bind %d to %d", src_id, sink_id);
@@ -418,12 +487,26 @@ int ipc_comp_connect(struct ipc *ipc, ipc_pipe_comp_connect *_connect)
source_set_min_available(audio_stream_get_source(&buffer->stream), sink_src_cfg.ibs);

/*
* Connect and bind the buffer to both source and sink components with the interrupts
* disabled to prevent the IPC task getting preempted which could result in buffers being
* only half connected when a pipeline task gets executed. A spinlock isn't required
* because all connected pipelines need to be on the same core.
* Connect and bind the buffer to both source and sink components with LL processing
* blocked on the corresponding core(s) to prevent the IPC or IDC task from getting preempted,
* which could result in buffers being only half connected when a pipeline task gets executed.
*/
irq_local_disable(flags);
ll_block(cross_core_bind);

if (cross_core_bind) {
#if CONFIG_CROSS_CORE_STREAM
/* Make sure LL has finished on both cores */
if (!cpu_is_me(source->ipc_config.core))
if (ll_wait_finished_on_core(source) < 0)
goto free;
if (!cpu_is_me(sink->ipc_config.core))
if (ll_wait_finished_on_core(sink) < 0)
goto free;
#else
tr_err(&ipc_tr, "Cross-core binding is disabled");
goto free;
#endif
}

ret = comp_buffer_connect(source, source->ipc_config.core, buffer,
PPL_CONN_DIR_COMP_TO_BUFFER);
@@ -432,15 +515,14 @@ int ipc_comp_connect(struct ipc *ipc, ipc_pipe_comp_connect *_connect)
goto free;
}


ret = comp_buffer_connect(sink, sink->ipc_config.core, buffer,
PPL_CONN_DIR_BUFFER_TO_COMP);
if (ret < 0) {
tr_err(&ipc_tr, "failed to connect internal buffer to sink %d", sink_id);
goto e_sink_connect;
}


/* these might call comp_ipc4_bind_remote() if necessary */
ret = comp_bind(source, bu);
if (ret < 0)
goto e_src_bind;
@@ -461,7 +543,7 @@ int ipc_comp_connect(struct ipc *ipc, ipc_pipe_comp_connect *_connect)
source->direction_set = true;
}

irq_local_enable(flags);
ll_unblock(cross_core_bind);

return IPC4_SUCCESS;

@@ -472,7 +554,7 @@ int ipc_comp_connect(struct ipc *ipc, ipc_pipe_comp_connect *_connect)
e_sink_connect:
pipeline_disconnect(source, buffer, PPL_CONN_DIR_COMP_TO_BUFFER);
free:
irq_local_enable(flags);
ll_unblock(cross_core_bind);
buffer_free(buffer);
return IPC4_INVALID_RESOURCE_STATE;
}
@@ -491,6 +573,7 @@ int ipc_comp_disconnect(struct ipc *ipc, ipc_pipe_comp_connect *_connect)
uint32_t src_id, sink_id, buffer_id;
uint32_t flags;
int ret, ret1;
bool cross_core_unbind;

bu = (struct ipc4_module_bind_unbind *)_connect;
src_id = IPC4_COMP_ID(bu->primary.r.module_id, bu->primary.r.instance_id);
@@ -507,8 +590,12 @@ int ipc_comp_disconnect(struct ipc *ipc, ipc_pipe_comp_connect *_connect)
return 0;
}

/* Pass IPC to target core if both modules has the same target core */
if (!cpu_is_me(src->ipc_config.core) && src->ipc_config.core == sink->ipc_config.core)
cross_core_unbind = src->ipc_config.core != sink->ipc_config.core;

/* Pass IPC to target core if both modules have the same target core,
* otherwise stay on core 0
*/
if (!cpu_is_me(src->ipc_config.core) && !cross_core_unbind)
return ipc4_process_on_core(src->ipc_config.core, false);

buffer_id = IPC4_COMP_ID(bu->extension.r.src_queue, bu->extension.r.dst_queue);
@@ -527,17 +614,39 @@

/*
* Disconnect and unbind buffer from source/sink components and continue to free the buffer
* even in case of errors. Disable interrupts during disconnect and unbinding to prevent
* the IPC task getting preempted which could result in buffers being only half connected
* when a pipeline task gets executed. A spinlock isn't required because all connected
* pipelines need to be on the same core.
* even in case of errors. Block LL processing during disconnect and unbinding to prevent
* the IPC or IDC task from getting preempted, which could result in buffers being only half
* connected when a pipeline task gets executed.
*/
irq_local_disable(flags);
ll_block(cross_core_unbind);

if (cross_core_unbind) {
#if CONFIG_CROSS_CORE_STREAM
/* Make sure LL has finished on both cores */
if (!cpu_is_me(src->ipc_config.core))
if (ll_wait_finished_on_core(src) < 0) {
ll_unblock(cross_core_unbind);
return IPC4_FAILURE;
}
if (!cpu_is_me(sink->ipc_config.core))
if (ll_wait_finished_on_core(sink) < 0) {
ll_unblock(cross_core_unbind);
return IPC4_FAILURE;
}
#else
tr_err(&ipc_tr, "Cross-core binding is disabled");
ll_unblock(cross_core_unbind);
return IPC4_FAILURE;
#endif
}

pipeline_disconnect(src, buffer, PPL_CONN_DIR_COMP_TO_BUFFER);
pipeline_disconnect(sink, buffer, PPL_CONN_DIR_BUFFER_TO_COMP);
/* these might call comp_ipc4_bind_remote() if necessary */
ret = comp_unbind(src, bu);
ret1 = comp_unbind(sink, bu);
irq_local_enable(flags);

ll_unblock(cross_core_unbind);

buffer_free(buffer);

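To summarize the flow added above, here is a simplified sketch of the cross-core bind sequence in ipc_comp_connect(). The wrapper function name is hypothetical and cleanup of partially connected state is omitted; all callee names, arguments, and ordering are taken from the diff above, so this is a reading aid rather than a replacement for it.

/* Hypothetical reading aid (not part of this commit): the order of operations
 * ipc_comp_connect() follows for a cross-core bind, with the same-core fast
 * path and error cleanup omitted.
 */
static int cross_core_bind_sequence(struct comp_dev *source, struct comp_dev *sink,
				    struct comp_buffer *buffer,
				    struct ipc4_module_bind_unbind *bu)
{
	const bool cross_core_bind = true;
	uint32_t flags;	/* only touched by ll_block()/ll_unblock() in the same-core case */
	int ret;

	ll_block(cross_core_bind);		/* gate the next LL cycle on the timer domain */

	ret = ll_wait_finished_on_core(source);	/* blocking IDC: current cycle done on source core */
	if (ret < 0)
		goto out;
	ret = ll_wait_finished_on_core(sink);	/* ...and on the sink core */
	if (ret < 0)
		goto out;

	ret = comp_buffer_connect(source, source->ipc_config.core, buffer,
				  PPL_CONN_DIR_COMP_TO_BUFFER);
	if (ret < 0)
		goto out;
	ret = comp_buffer_connect(sink, sink->ipc_config.core, buffer,
				  PPL_CONN_DIR_BUFFER_TO_COMP);
	if (ret < 0)
		goto out;

	ret = comp_bind(source, bu);		/* may call comp_ipc4_bind_remote() over IDC */
	if (ret < 0)
		goto out;
	ret = comp_bind(sink, bu);

out:
	ll_unblock(cross_core_bind);		/* LL cycles resume on both cores */
	return ret;
}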
46 changes: 46 additions & 0 deletions src/schedule/zephyr_domain.c
@@ -50,6 +50,11 @@ struct zephyr_domain {
struct k_timer timer;
struct zephyr_domain_thread domain_thread[CONFIG_CORE_COUNT];
struct ll_schedule_domain *ll_domain;
#if CONFIG_CROSS_CORE_STREAM
atomic_t block;
struct k_mutex block_mutex;
struct k_condvar block_condvar;
#endif
};

/* perf measurement windows size 2^x */
@@ -67,6 +72,16 @@ static void zephyr_domain_thread_fn(void *p1, void *p2, void *p3)
/* immediately go to sleep, waiting to be woken up by the timer */
k_sem_take(&dt->sem, K_FOREVER);

#if CONFIG_CROSS_CORE_STREAM
if (atomic_get(&zephyr_domain->block)) {
k_mutex_lock(&zephyr_domain->block_mutex, K_FOREVER);
if (atomic_get(&zephyr_domain->block))
k_condvar_wait(&zephyr_domain->block_condvar,
&zephyr_domain->block_mutex, K_FOREVER);
k_mutex_unlock(&zephyr_domain->block_mutex);
}
#endif

cycles0 = k_cycle_get_32();
dt->handler(dt->arg);
cycles1 = k_cycle_get_32();
@@ -221,9 +236,34 @@ static int zephyr_domain_unregister(struct ll_schedule_domain *domain,
return 0;
}

#if CONFIG_CROSS_CORE_STREAM
static void zephyr_domain_block(struct ll_schedule_domain *domain)
{
struct zephyr_domain *zephyr_domain = ll_sch_domain_get_pdata(domain);

k_mutex_lock(&zephyr_domain->block_mutex, K_FOREVER);
atomic_set(&zephyr_domain->block, 1);
k_mutex_unlock(&zephyr_domain->block_mutex);
}

static void zephyr_domain_unblock(struct ll_schedule_domain *domain)
{
struct zephyr_domain *zephyr_domain = ll_sch_domain_get_pdata(domain);

k_mutex_lock(&zephyr_domain->block_mutex, K_FOREVER);
atomic_set(&zephyr_domain->block, 0);
k_condvar_broadcast(&zephyr_domain->block_condvar);
k_mutex_unlock(&zephyr_domain->block_mutex);
}
#endif

static const struct ll_schedule_domain_ops zephyr_domain_ops = {
.domain_register = zephyr_domain_register,
.domain_unregister = zephyr_domain_unregister,
#if CONFIG_CROSS_CORE_STREAM
.domain_block = zephyr_domain_block,
.domain_unblock = zephyr_domain_unblock,
#endif
};

struct ll_schedule_domain *zephyr_domain_init(int clk)
@@ -239,6 +279,12 @@ struct ll_schedule_domain *zephyr_domain_init(int clk)

zephyr_domain->ll_domain = domain;

#if CONFIG_CROSS_CORE_STREAM
atomic_set(&zephyr_domain->block, 0);
k_mutex_init(&zephyr_domain->block_mutex);
k_condvar_init(&zephyr_domain->block_condvar);
#endif

ll_sch_domain_set_pdata(domain, zephyr_domain);

return domain;
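The block/unblock handshake above follows the standard condition-variable gate pattern. Below is a minimal standalone sketch of the same pattern using plain Zephyr kernel APIs; all names are hypothetical and it is not part of this commit. It shows why the flag is re-checked after taking the mutex, which is the key detail in zephyr_domain_thread_fn() above.

#include <zephyr/kernel.h>

K_MUTEX_DEFINE(gate_mutex);
K_CONDVAR_DEFINE(gate_cv);
static atomic_t gate_blocked = ATOMIC_INIT(0);

/* Worker side (the LL thread): called at the top of every cycle. */
static void gate_wait_if_blocked(void)
{
	if (atomic_get(&gate_blocked)) {	/* lock-free fast path */
		k_mutex_lock(&gate_mutex, K_FOREVER);
		/* Re-check under the mutex: the controller may have cleared the
		 * flag and broadcast between the check above and taking the lock.
		 * Waiting without re-checking could miss that wakeup and stall.
		 */
		if (atomic_get(&gate_blocked))
			k_condvar_wait(&gate_cv, &gate_mutex, K_FOREVER);
		k_mutex_unlock(&gate_mutex);
	}
}

/* Controller side: flipping the flag under the same mutex guarantees the
 * worker either sees the new value before waiting or receives the broadcast.
 */
static void gate_close(void)
{
	k_mutex_lock(&gate_mutex, K_FOREVER);
	atomic_set(&gate_blocked, 1);
	k_mutex_unlock(&gate_mutex);
}

static void gate_open(void)
{
	k_mutex_lock(&gate_mutex, K_FOREVER);
	atomic_set(&gate_blocked, 0);
	k_condvar_broadcast(&gate_cv);
	k_mutex_unlock(&gate_mutex);
}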
11 changes: 11 additions & 0 deletions zephyr/Kconfig
@@ -56,4 +56,15 @@ config ZEPHYR_DP_SCHEDULER
DP modules can be located on different cores than LL pipeline modules, and may have
a different tick (e.g. 300 ms for speech recognition, etc.)

config CROSS_CORE_STREAM
bool "Enable cross-core connected pipelines"
default y if IPC_MAJOR_4
help
Enables support for connecting together pipelines that run on
different cores, so a stream can travel from one core to another.
Note that this is different from "multicore" support: in SOF,
"multicore" support means different streams can be processed on
different cores, whereas each stream is still processed entirely
on a single core.

endif
