diff --git a/.circleci/config.yml b/.circleci/config.yml index dadb14514..71c68ed0a 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -27,7 +27,7 @@ jobs: make -j2 make tests -j2 - run: - command: timeout --foreground -k 10s 2m make check + command: timeout --foreground -k 10s 2m make check || ( cat test/basics/test-suite.log && cat test/features/test-suite.log && cat test/stress/test-suite.log && exit 1 ) no_output_timeout: 180s arm_clang: @@ -58,7 +58,7 @@ jobs: make -j2 make tests -j2 - run: - command: timeout --foreground -k 10s 2m make check + command: timeout --foreground -k 10s 2m make check || ( cat test/basics/test-suite.log && cat test/features/test-suite.log && cat test/stress/test-suite.log && exit 1 ) no_output_timeout: 180s arm_sanitizers: @@ -96,7 +96,10 @@ jobs: make -j2 make tests -j2 - run: - command: timeout --foreground -k 10s 4m make check + command: | + export QTHREADS_DIR="$(pwd)" + if [[ "<< parameters.sanitizer >>" == "thread" ]]; then cd test/basics; fi + timeout --foreground -k 10s 4m make check || ( cd $QTHREADS_DIR && cat test/basics/test-suite.log && cat test/features/test-suite.log && cat test/stress/test-suite.log && exit 1 ) no_output_timeout: 120s arm_acfl: @@ -134,7 +137,7 @@ jobs: - run: command: | export PATH=$PATH:/opt/arm/arm-linux-compiler-24.04_Ubuntu-22.04/bin - timeout --foreground -k 10s 4m make check + timeout --foreground -k 10s 4m make check || ( cat test/basics/test-suite.log && cat test/features/test-suite.log && cat test/stress/test-suite.log && exit 1 ) no_output_timeout: 180s nvc: @@ -173,7 +176,7 @@ jobs: make tests -j2 - run: command: | - timeout --foreground -k 10s 4m make check + timeout --foreground -k 10s 4m make check || ( cat test/basics/test-suite.log && cat test/features/test-suite.log && cat test/stress/test-suite.log && exit 1 ) no_output_timeout: 180s musl: @@ -197,7 +200,7 @@ jobs: make -j2 make tests -j2 - run: - command: make check + command: make check || ( cat test/basics/test-suite.log && cat test/features/test-suite.log && cat test/stress/test-suite.log && exit 1 ) no_output_timeout: 180s workflows: @@ -238,6 +241,24 @@ workflows: - scheduler: distrib topology: hwloc sanitizer: memory + - scheduler: sherwood + topology: 'no' + sanitizer: thread + - scheduler: sherwood + topology: hwloc + sanitizer: thread + - scheduler: sherwood + topology: binders + sanitizer: thread + - scheduler: distrib + topology: 'no' + sanitizer: thread + - scheduler: distrib + topology: hwloc + sanitizer: thread + - scheduler: distrib + topology: binders + sanitizer: thread - arm_acfl: matrix: parameters: diff --git a/.cirrus.yml b/.cirrus.yml index 7154de16b..eb3b1620b 100644 --- a/.cirrus.yml +++ b/.cirrus.yml @@ -45,7 +45,7 @@ osx_m1_task: # commented example for how to get a backtrace from CI usign lldb on OSX: #echo "settings set target.process.stop-on-exec false" > ~/.lldbinit #QT_NUM_SHEPHERDS=2 QT_NUM_WORKERS_PER_SHEPHERD=1 lldb bash --batch --one-line 'process launch' --one-line-on-crash 'bt' --one-line-on-crash 'quit' -- test/basics/hello_world - gtimeout --foreground 3m make check + gtimeout --foreground 3m make check || ( cat test/basics/test-suite.log && cat test/features/test-suite.log && cat test/stress/test-suite.log && exit 1 ) freebsd_task: freebsd_instance: @@ -68,7 +68,7 @@ freebsd_task: make -j$CIRRUS_CPU make tests -j$CIRRUS_CPU test_script: | - gtimeout --foreground -k 10s 2m make check + gtimeout --foreground -k 10s 2m make check || ( cat test/basics/test-suite.log && cat test/features/test-suite.log && cat test/stress/test-suite.log && exit 1 ) arm_linux_task: arm_container: @@ -112,7 +112,7 @@ arm_linux_task: make -j$CIRRUS_CPU make tests -j$CIRRUS_CPU test_script: | - timeout --foreground -k 10s 2m make check + timeout --foreground -k 10s 5m make check || ( cat test/basics/test-suite.log && cat test/features/test-suite.log && cat test/stress/test-suite.log && exit 1 ) arm_linux_clang_task: arm_container: @@ -168,5 +168,5 @@ arm_linux_clang_task: make -j$CIRRUS_CPU make tests -j$CIRRUS_CPU test_script: | - timeout --foreground -k 10s 2m make check + timeout --foreground -k 10s 5m make check || ( cat test/basics/test-suite.log && cat test/features/test-suite.log && cat test/stress/test-suite.log && exit 1 ) diff --git a/.github/workflows/CI.yml b/.github/workflows/CI.yml index bf4d1baae..b0d250f4c 100644 --- a/.github/workflows/CI.yml +++ b/.github/workflows/CI.yml @@ -35,7 +35,7 @@ jobs: make -j2 make tests -j2 - name: make check - run: timeout -k 10s --foreground 3m make check + run: timeout -k 10s --foreground 3m make check || ( cat test/basics/test-suite.log && cat test/features/test-suite.log && cat test/stress/test-suite.log && exit 1 ) timeout-minutes: 4 linux-clang: @@ -77,7 +77,7 @@ jobs: make -j2 make tests -j2 - name: make check - run: timeout -k 10s --foreground 6m make check + run: timeout -k 10s --foreground 6m make check || ( cat test/basics/test-suite.log && cat test/features/test-suite.log && cat test/stress/test-suite.log && exit 1 ) timeout-minutes: 7 linux-icx: @@ -116,7 +116,7 @@ jobs: - name: make check run: | source /opt/intel/oneapi/setvars.sh - timeout -k 10s --foreground 6m make check + timeout -k 10s --foreground 6m make check || ( cat test/basics/test-suite.log && cat test/features/test-suite.log && cat test/stress/test-suite.log && exit 1 ) timeout-minutes: 7 linux-icc: @@ -157,7 +157,7 @@ jobs: - name: make check run: | source /opt/intel/oneapi/setvars.sh - timeout -k 10s --foreground 6m make check + timeout -k 10s --foreground 6m make check || ( cat test/basics/test-suite.log && cat test/features/test-suite.log && cat test/stress/test-suite.log && exit 1 ) timeout-minutes: 7 linux-aocc: @@ -192,7 +192,7 @@ jobs: make tests -j2 - name: make check run: | - timeout -k 10s --foreground 6m make check + timeout -k 10s --foreground 6m make check || ( cat test/basics/test-suite.log && cat test/features/test-suite.log && cat test/stress/test-suite.log && exit 1 ) timeout-minutes: 7 mac: @@ -228,7 +228,7 @@ jobs: # commented example for how to get a backtrace from CI usign lldb on OSX: #echo "settings set target.process.stop-on-exec false" > ~/.lldbinit #QT_NUM_SHEPHERDS=2 QT_NUM_WORKERS_PER_SHEPHERD=1 lldb bash --batch --one-line 'process launch' --one-line-on-crash 'bt' --one-line-on-crash 'quit' -- test/basics/hello_world - gtimeout -k 10s --foreground 8m make check + gtimeout -k 10s --foreground 8m make check || ( cat test/basics/test-suite.log && cat test/features/test-suite.log && cat test/stress/test-suite.log && exit 1 ) timeout-minutes: 9 sanitizers: @@ -245,6 +245,14 @@ jobs: topology: hwloc - sanitizer: memory topology: binders + - sanitizer: thread + scheduler: sherwood + - sanitizer: thread + scheduler: distrib + - sanitizer: thread + topology: hwloc + - sanitizer: thread + topology: binders env: CC: clang-19 CXX: clang++-19 @@ -276,7 +284,10 @@ jobs: make -j2 make tests -j2 - name: make check - run: timeout -k 10s --foreground 8m make check + run: | + export QTHREADS_DIR="$(pwd)" + if [[ "${{ matrix.sanitizer }}" == "thread" ]]; then cd test/basics; fi + timeout -k 10s --foreground 8m make check || ( cd $QTHREADS_DIR && cat test/basics/test-suite.log && cat test/features/test-suite.log && cat test/stress/test-suite.log && exit 1 ) timeout-minutes: 9 linux-thorough: @@ -319,7 +330,7 @@ jobs: make -j2 make tests -j2 - name: make check - run: timeout -k 10s --foreground 6m make check + run: timeout -k 10s --foreground 6m make check || ( cat test/basics/test-suite.log && cat test/features/test-suite.log && cat test/stress/test-suite.log && exit 1 ) timeout-minutes: 7 clang-format: diff --git a/include/qt_queue.h b/include/qt_queue.h index ae8812fb4..a89c86efe 100644 --- a/include/qt_queue.h +++ b/include/qt_queue.h @@ -9,24 +9,25 @@ */ typedef struct qthread_queue_node_s { - struct qthread_queue_node_s *next; - qthread_t *thread; + struct qthread_queue_node_s *_Atomic next; + qthread_t *_Atomic thread; } qthread_queue_node_t; typedef struct qthread_queue_NEMESIS_s { /* The First Cacheline */ - void *head; - void *tail; - uint8_t pad1[CACHELINE_WIDTH - (2 * sizeof(void *))]; + void *_Atomic head; + uint8_t pad1[CACHELINE_WIDTH - sizeof(void *)]; + void *_Atomic tail; + uint8_t pad2[CACHELINE_WIDTH - sizeof(void *)]; /* The Second Cacheline */ - aligned_t length; + _Atomic aligned_t length; void *shadow_head; - uint8_t pad2[CACHELINE_WIDTH - sizeof(void *) - sizeof(aligned_t)]; + uint8_t pad3[CACHELINE_WIDTH - sizeof(void *) - sizeof(aligned_t)]; } qthread_queue_NEMESIS_t; typedef struct qthread_queue_nosync_s { - qthread_queue_node_t *head; - qthread_queue_node_t *tail; + qthread_queue_node_t *_Atomic head; + qthread_queue_node_t *_Atomic tail; } qthread_queue_nosync_t; typedef struct qthread_queue_capped_s { diff --git a/include/qt_shepherd_innards.h b/include/qt_shepherd_innards.h index 19b073a0e..f2833227c 100644 --- a/include/qt_shepherd_innards.h +++ b/include/qt_shepherd_innards.h @@ -36,7 +36,7 @@ struct qthread_worker_s { qthread_shepherd_t *shepherd; struct qthread_s **nostealbuffer; struct qthread_s **stealbuffer; - qthread_t *current; + qthread_t *_Atomic current; qthread_worker_id_t unique_id; qthread_worker_id_t worker_id; qthread_worker_id_t packed_worker_id; diff --git a/src/qthread.c b/src/qthread.c index 9ede8ab69..7cf8d63db 100644 --- a/src/qthread.c +++ b/src/qthread.c @@ -334,7 +334,7 @@ static void *qthread_master(void *arg) { qt_threadqueue_t *threadqueue; qt_threadqueue_private_t *localqueue = NULL; qthread_t *t; - qthread_t **current; + qthread_t *_Atomic *current; int done = 0; assert(me != NULL); @@ -444,8 +444,9 @@ static void *qthread_master(void *arg) { #endif qthread_exec(t, &my_context); - t = *current; // necessary for direct-swap sanity - *current = NULL; // neessary for "queue sanity" + t = *current; // necessary for direct-swap sanity + atomic_store_explicit( + current, NULL, memory_order_relaxed); // neessary for "queue sanity" /* now clean up, based on the thread's state */ switch (atomic_load_explicit(&t->thread_state, memory_order_relaxed)) { @@ -1340,7 +1341,8 @@ qthread_readstate(const enum introspective_state type) { /*{{{ */ sum += qt_threadqueue_advisory_queuelen(sheps[s].ready); qthread_worker_t const *wkrs = sheps[s].workers; for (qthread_worker_id_t w = 0; w < qlib->nworkerspershep; w++) { - sum += (wkrs[w].current != NULL); + sum += (atomic_load_explicit(&wkrs[w].current, + memory_order_relaxed) != NULL); } } return sum; @@ -1351,7 +1353,8 @@ qthread_readstate(const enum introspective_state type) { /*{{{ */ for (qthread_shepherd_id_t s = 0; s < qlib->nshepherds; s++) { qthread_worker_t const *wkrs = sheps[s].workers; for (qthread_worker_id_t w = 0; w < qlib->nworkerspershep; w++) { - count += (wkrs[w].current != NULL); + count += (atomic_load_explicit(&wkrs[w].current, + memory_order_relaxed) != NULL); } } return count; diff --git a/src/queue.c b/src/queue.c index a074b423f..e4357a38b 100644 --- a/src/queue.c +++ b/src/queue.c @@ -54,7 +54,8 @@ qthread_queue_t API_FUNC qthread_queue_create(uint8_t flags, aligned_t length) { aligned_t API_FUNC qthread_queue_length(qthread_queue_t q) { assert(q); switch (q->type) { - case NEMESIS_LENGTH: return q->q.nemesis.length; + case NEMESIS_LENGTH: + return atomic_load_explicit(&q->q.nemesis.length, memory_order_relaxed); case CAPPED: return q->q.capped.membercount; default: return 0; } @@ -78,7 +79,8 @@ void INTERNAL qthread_queue_internal_enqueue(qthread_queue_t q, qthread_t *t) { break; case NEMESIS_LENGTH: qthread_queue_internal_NEMESIS_enqueue(&q->q.nemesis, t); - qthread_incr(&q->q.nemesis.length, 1); + atomic_fetch_add_explicit( + &q->q.nemesis.length, 1ull, memory_order_relaxed); break; case CAPPED: qthread_queue_internal_capped_enqueue(&q->q.capped, t); break; case MTS: QTHREAD_TRAP(); @@ -110,7 +112,8 @@ int API_FUNC qthread_queue_release_one(qthread_queue_t q) { break; case NEMESIS_LENGTH: t = qthread_queue_internal_NEMESIS_dequeue(&q->q.nemesis); - qthread_incr(&q->q.nemesis.length, (aligned_t)-1); + atomic_fetch_add_explicit( + &q->q.nemesis.length, (aligned_t)-1, memory_order_relaxed); break; case CAPPED: t = qthread_queue_internal_capped_dequeue(&q->q.capped); break; default: QTHREAD_TRAP(); @@ -142,13 +145,15 @@ int API_FUNC qthread_queue_release_all(qthread_queue_t q) { } break; case NEMESIS_LENGTH: { - aligned_t const count = q->q.nemesis.length; + aligned_t const count = + atomic_load_explicit(&q->q.nemesis.length, memory_order_relaxed); for (aligned_t c = 0; c < count; c++) { t = qthread_queue_internal_NEMESIS_dequeue(&q->q.nemesis); assert(t); if (t) { qthread_queue_internal_launch(t, shep); } } - qthread_incr(&q->q.nemesis.length, -count); + atomic_fetch_add_explicit( + &q->q.nemesis.length, -count, memory_order_relaxed); break; } case CAPPED: { @@ -198,14 +203,17 @@ void INTERNAL qthread_queue_internal_nosync_enqueue(qthread_queue_nosync_t *q, assert(q); assert(t); - node->thread = t; - node->next = NULL; - if (q->tail == NULL) { - q->head = node; + atomic_store_explicit(&node->thread, t, memory_order_relaxed); + atomic_store_explicit(&node->next, NULL, memory_order_relaxed); + if (atomic_load_explicit(&q->tail, memory_order_relaxed) == NULL) { + atomic_store_explicit(&q->head, node, memory_order_relaxed); } else { - q->tail->next = node; + atomic_store_explicit( + &atomic_load_explicit(&q->tail, memory_order_relaxed)->next, + node, + memory_order_relaxed); } - q->tail = node; + atomic_store_explicit(&q->tail, node, memory_order_relaxed); } qthread_t INTERNAL * @@ -215,10 +223,13 @@ qthread_queue_internal_nosync_dequeue(qthread_queue_nosync_t *q) { assert(q); - node = q->head; + node = atomic_load_explicit(&q->head, memory_order_relaxed); if (node) { - q->head = node->next; - t = node->thread; + atomic_store_explicit( + &q->head, + atomic_load_explicit(&node->next, memory_order_relaxed), + memory_order_relaxed); + t = atomic_load_explicit(&node->thread, memory_order_relaxed); FREE_TQNODE(node); } return t; @@ -230,41 +241,46 @@ void INTERNAL qthread_queue_internal_NEMESIS_enqueue(qthread_queue_NEMESIS_t *q, node = ALLOC_TQNODE(); assert(node != NULL); - node->thread = t; - node->next = NULL; + atomic_store_explicit(&node->thread, t, memory_order_relaxed); + atomic_store_explicit(&node->next, NULL, memory_order_relaxed); prev = qt_internal_atomic_swap_ptr((void **)&(q->tail), node); if (prev == NULL) { - q->head = node; + atomic_store_explicit(&q->head, node, memory_order_relaxed); } else { - prev->next = node; + atomic_store_explicit(&prev->next, node, memory_order_relaxed); } } qthread_t INTERNAL * qthread_queue_internal_NEMESIS_dequeue(qthread_queue_NEMESIS_t *q) { if (!q->shadow_head) { - if (!q->head) { return NULL; } - q->shadow_head = q->head; - q->head = NULL; + if (!atomic_load_explicit(&q->head, memory_order_relaxed)) { return NULL; } + q->shadow_head = atomic_load_explicit(&q->head, memory_order_relaxed); + atomic_store_explicit(&q->head, NULL, memory_order_relaxed); } qthread_queue_node_t *const dequeued = q->shadow_head; if (dequeued != NULL) { - if (dequeued->next != NULL) { - q->shadow_head = dequeued->next; - dequeued->next = NULL; + if (atomic_load_explicit(&dequeued->next, memory_order_relaxed) != NULL) { + q->shadow_head = + atomic_load_explicit(&dequeued->next, memory_order_relaxed); + atomic_store_explicit(&dequeued->next, NULL, memory_order_relaxed); } else { qthread_queue_node_t *old; q->shadow_head = NULL; - old = qthread_cas_ptr(&(q->tail), dequeued, NULL); + old = qthread_cas_ptr((void **)&(q->tail), dequeued, NULL); if (old != dequeued) { - while (dequeued->next == NULL) SPINLOCK_BODY(); - q->shadow_head = dequeued->next; - dequeued->next = NULL; + while (atomic_load_explicit(&dequeued->next, memory_order_relaxed) == + NULL) + SPINLOCK_BODY(); + q->shadow_head = + atomic_load_explicit(&dequeued->next, memory_order_relaxed); + atomic_store_explicit(&dequeued->next, NULL, memory_order_relaxed); } } - qthread_t *retval = dequeued->thread; + qthread_t *retval = + atomic_load_explicit(&dequeued->thread, memory_order_relaxed); FREE_TQNODE(dequeued); return retval; } else { diff --git a/src/threadqueues/nemesis_threadqueues.c b/src/threadqueues/nemesis_threadqueues.c index cee6ffbcd..85a27c5c0 100644 --- a/src/threadqueues/nemesis_threadqueues.c +++ b/src/threadqueues/nemesis_threadqueues.c @@ -58,7 +58,7 @@ struct _qt_threadqueue { alignas(CACHELINE_WIDTH) NEMESIS_queue q; /* the following is for estimating a queue's "busy" level, and is not * guaranteed accurate (that would be a race condition) */ - saligned_t advisory_queuelen; + _Atomic saligned_t advisory_queuelen; #ifdef QTHREAD_CONDWAIT_BLOCKING_QUEUE uint32_t frustration; QTHREAD_COND_DECL(trigger); @@ -180,7 +180,8 @@ void INTERNAL qt_threadqueue_free(qt_threadqueue_t *q) { /*{{{ */ if (node) { qthread_t *retval = node->thread; assert(atomic_load_explicit(&node->next, memory_order_relaxed) == NULL); - (void)qthread_incr(&(q->advisory_queuelen), (aligned_t)-1); + atomic_fetch_add_explicit( + &q->advisory_queuelen, (aligned_t)-1, memory_order_relaxed); FREE_TQNODE(node); qthread_thread_free(retval); } else { @@ -231,7 +232,7 @@ void INTERNAL qt_threadqueue_enqueue(qt_threadqueue_t *restrict q, } else { atomic_store_explicit(&prev->next, node, memory_order_relaxed); } - (void)qthread_incr(&(q->advisory_queuelen), 1); + atomic_fetch_add_explicit(&q->advisory_queuelen, 1, memory_order_relaxed); #ifdef QTHREAD_CONDWAIT_BLOCKING_QUEUE /* awake waiter */ /* Yes, this needs to be here, to prevent reading frustration being hoisted @@ -256,7 +257,7 @@ void INTERNAL qt_threadqueue_enqueue_yielded(qt_threadqueue_t *restrict q, ssize_t INTERNAL qt_threadqueue_advisory_queuelen(qt_threadqueue_t *q) { /*{{{ */ assert(q); - return q->advisory_queuelen; + return atomic_load_explicit(&q->advisory_queuelen, memory_order_relaxed); } /*}}} */ qthread_t INTERNAL * @@ -297,7 +298,8 @@ qt_scheduler_get_thread(qt_threadqueue_t *q, } assert(node); assert(atomic_load_explicit(&node->next, memory_order_relaxed) == NULL); - (void)qthread_incr(&(q->advisory_queuelen), (aligned_t)-1); + atomic_fetch_add_explicit( + &q->advisory_queuelen, (aligned_t)-1, memory_order_relaxed); retval = node->thread; FREE_TQNODE(node); return retval; diff --git a/test/basics/queue.c b/test/basics/queue.c index f8c459414..0cb249a62 100644 --- a/test/basics/queue.c +++ b/test/basics/queue.c @@ -1,17 +1,20 @@ -#include "argparsing.h" #include -#include +#include #include #include -aligned_t threads_in = 0; +#include "qthread/qthread.h" + +#include "argparsing.h" + +_Atomic aligned_t threads_in = 0; aligned_t awoke = 0; int THREADS_ENQUEUED = 100; qthread_queue_t the_queue; static aligned_t tobequeued(void *arg) { - qthread_incr(&threads_in, 1); + atomic_fetch_add_explicit(&threads_in, 1, memory_order_relaxed); // iprintf("\tOne thread about to join the queue...\n"); qthread_queue_join(the_queue); // iprintf("\tAwoke from the queue! %p\n", qthread_retloc()); @@ -54,7 +57,7 @@ int main(int argc, char *argv[]) { ret = qthread_readFF(NULL, &return_value); test_check(ret == QTHREAD_SUCCESS); - test_check(threads_in == 1); + test_check(atomic_load_explicit(&threads_in, memory_order_relaxed) == 1); test_check(awoke == 1); test_check(qthread_queue_length(the_queue) == 0); // This relies on approximate estimates, so it's not reliable to test here. @@ -64,7 +67,7 @@ int main(int argc, char *argv[]) { iprintf("---------------------------------------------------------\n"); iprintf("\tMULTI THREAD TEST\n\n"); - threads_in = 0; + atomic_store_explicit(&threads_in, 0, memory_order_relaxed); awoke = 0; aligned_t *retvals = malloc(sizeof(aligned_t) * THREADS_ENQUEUED); iprintf("1/6 Spawning %u threads to be queued...\n", THREADS_ENQUEUED); @@ -76,7 +79,8 @@ int main(int argc, char *argv[]) { iprintf("2/6 Waiting for %u threads to queue themselves...\n", THREADS_ENQUEUED); while (qthread_queue_length(the_queue) != THREADS_ENQUEUED) qthread_yield(); - test_check(threads_in == THREADS_ENQUEUED); + test_check(atomic_load_explicit(&threads_in, memory_order_relaxed) == + THREADS_ENQUEUED); test_check(qthread_readstate(NODE_BUSYNESS) == 1); iprintf("3/6 Releasing a single thread...\n");