Fix SIGCONT handling on threads blocked in syscalls

If a process receives a SIGSTOP, we emulate the group-stop by: * Leaving the thread which happened to receive the SIGSTOP signal ptrace-stopped * Refusing to schedule any other thread until the group-stop is over The whole group-stop is therefore emulated by rr and not actually enforced by the kernel. When a SIGCONT is received, we need to end the group-stop. However, we can't actually _know_ that a ptrace-stopped thread received a signal until we try and resume it. To work around this, we check /proc/tid/status's `SigPnd` and `ShdPnd` fields in the scheduler to detect when a thread that's in a group-stop has a pending SIGCONT, and so needs to be PTRACE_CONT'd so we can actually `wait` and receive that SIGCONT. A problem however arises in the following case: * A process has at least two threads, * One thread "A" receives a SIGSTOP, * And the other thread "B" is in a blocking system call, * And then a process-directed SIGCONT is sent to the process, * And the scheduler checks if "B" is runnable before checking if "A" is runnable. In this case, the issue is that the process-directed SIGCONT will set the bit in `ShdPnd` for _both_ threads. So `t->is_signal_pending(SIGCONT)` will be true for both thread A and B. The scheduler then tries to PTRACE_CONT thread B, but it's not actually in a ptrace-stop, so it all goes pear shaped (actually you get an assertion failure in `t->resume_execution()`). The fix is not to perform this `SigPnd`/`ShdPnd` checking at all for threads that are not actually in a ptrace-stop. They don't need this kind of special handling, because they're actually not ptrace-stopped; when we go to `try_wait` on them later on, we'll notice that they received a signal, and the handling in `RecordTask::signal_delivered` will actually run `emulate_SIGCONT` then.
rr-debugger · Nov 9, 2024 · b1e461a · b1e461a
1 parent 14cd8b9
commit b1e461a
Show file tree

Hide file tree

Showing 4 changed files with 69 additions and 5 deletions.
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -1330,6 +1330,7 @@ set(BASIC_TESTS
   sigaltstack
   sigchld_interrupt_signal
   sigcont
+  sigcont_threaded
   sigframe_grow_stack
   sighandler_bad_rsp_sigsegv
   sighandler_fork

diff --git a/src/RecordTask.cc b/src/RecordTask.cc
@@ -1169,7 +1169,7 @@ void RecordTask::emulate_SIGCONT() {
   // All threads in the process are resumed.
   for (Task* t : thread_group()->task_set()) {
     auto rt = static_cast<RecordTask*>(t);
-    LOG(debug) << "setting " << tid << " to NOT_STOPPED due to SIGCONT";
+    LOG(debug) << "setting " << rt->tid << " to NOT_STOPPED due to SIGCONT";
     rt->clear_stashed_group_stop();
     rt->emulated_stop_pending = false;
     rt->emulated_stop_type = NOT_STOPPED;

diff --git a/src/Scheduler.cc b/src/Scheduler.cc
@@ -318,10 +318,10 @@ bool Scheduler::is_task_runnable(RecordTask* t, WaitAggregator& wait_aggregator,
   }
 
   if (t->emulated_stop_type != NOT_STOPPED) {
-    if (t->is_signal_pending(SIGCONT)) {
-      // We have to do this here. RecordTask::signal_delivered can't always
-      // do it because if we don't PTRACE_CONT the task, we'll never see the
-      // SIGCONT.
+    if (t->is_stopped() && t->is_signal_pending(SIGCONT)) {
+      // We have to do this here. RecordTask::signal_delivered can't do it
+      // in the case where t->is_stopped(), because if we don't PTRACE_CONT
+      // the task, we'll never see the SIGCONT.
       t->emulate_SIGCONT();
       // We shouldn't run any user code since there is at least one signal
       // pending.

diff --git a/src/test/sigcont_threaded.c b/src/test/sigcont_threaded.c
@@ -0,0 +1,63 @@
+/* -*- Mode: C; tab-width: 8; c-basic-offset: 2; indent-tabs-mode: nil; -*- */
+
+#include "util.h"
+
+static void *child_process_extra_thread(__attribute__((unused)) void *extra_thread) {
+  int r;
+
+  // Slap in a sched_yield or two here so that the parent process is going to be
+  // blocked in pthread_join.
+  sched_yield();
+  sched_yield();
+
+  // Now, stop ourselves. We'll be unstopped by the parent process.
+  r = kill(gettid(), SIGSTOP);
+  test_assert(r == 0);
+
+  // Now allow self to exit, and the thread-group-leader can continue.
+  return NULL;
+}
+
+static void child_process(void) {
+  pthread_t extra_thread;
+  int r;
+  // Spawn an additional thread
+  r = pthread_create(&extra_thread, NULL, child_process_extra_thread, NULL);
+  test_assert(r == 0);
+
+  // Wait for the child thread we made. It will send SIGSTOP to the process.
+  r = pthread_join(extra_thread, NULL);
+  test_assert(r == 0);
+}
+
+static void parent_process(pid_t pid) {
+  int wait_status, r;
+  pid_t wpid;
+
+  // Wait for the child process to have sent itself SIGSTOP
+  wpid = waitpid(pid, &wait_status, WUNTRACED);
+  test_assert(wpid == pid);
+  test_assert(WIFSTOPPED(wait_status));
+  test_assert(WSTOPSIG(wait_status) == SIGSTOP);
+
+  // Let it continue
+  r = kill(pid, SIGCONT);
+  test_assert(r == 0);
+
+  // Now the child process should actually exit
+  wpid = waitpid(pid, &wait_status, 0);
+  test_assert(wpid == pid);
+  test_assert(WIFEXITED(wait_status));
+}
+
+int main(void) {
+  pid_t pid = fork();
+  test_assert(pid != -1);
+  if (pid == 0) {
+    child_process();
+  } else {
+    parent_process(pid);
+    atomic_puts("EXIT-SUCCESS");
+  }
+  return 0;
+}