From 6d8cba93308f8334c85871b5355679551682fbd8 Mon Sep 17 00:00:00 2001 From: Mike Uttormark Date: Sun, 10 Dec 2023 19:52:41 -0600 Subject: [PATCH] prov/util: Change uffd stop routine to use pipe Without this change, the following segfault can happen. 0 0x00001555546a66de in malloc () from /lib64/libc.so.6 1 0x00001555555351f8 in _dl_new_object () from /lib64/ld-linux-x86-64.so.2 2 0x000015555552f413 in _dl_map_object_from_fd () from /lib64/ld-linux-x86-64.so.2 3 0x000015555553252a in _dl_map_object () from /lib64/ld-linux-x86-64.so.2 4 0x000015555553d700 in dl_open_worker () from /lib64/ld-linux-x86-64.so.2 5 0x000015555476002d in _dl_catch_exception () from /lib64/libc.so.6 6 0x000015555553d28b in _dl_open () from /lib64/ld-linux-x86-64.so.2 7 0x000015555475f37d in do_dlopen () from /lib64/libc.so.6 8 0x000015555476002d in _dl_catch_exception () from /lib64/libc.so.6 9 0x00001555547600bf in _dl_catch_error () from /lib64/libc.so.6 10 0x000015555475f497 in dlerror_run () from /lib64/libc.so.6 11 0x000015555475f567 in __libc_dlopen_mode () from /lib64/libc.so.6 12 0x0000155555409fbb in pthread_cancel_init () from /lib64/libpthread.so.0 13 0x00001555554062b7 in pthread_cancel () from /lib64/libpthread.so.0 14 0x00000000005b66eb in ofi_uffd_stop (monitor=0x70f3e0 ) at prov/util/src/util_mem_monitor.c:887 15 0x00000000005b4c25 in ofi_monitors_update (monitors=0x7fffffffe070) at prov/util/src/util_mem_monitor.c:154 16 0x00000000005b5828 in ofi_monitors_del_cache (cache=0x74a428) at prov/util/src/util_mem_monitor.c:485 17 0x00000000005b9956 in ofi_mr_cache_cleanup (cache=0x74a428) at prov/util/src/util_mr_cache.c:502 18 0x00000000005e8cb1 in cxip_iomm_fini (dom=0x746250) at prov/cxi/src/cxip_iomm.c:368 19 0x0000000000617494 in cxip_domain_disable (dom=0x746250) at prov/cxi/src/cxip_dom.c:540 20 0x000000000061755f in cxip_dom_close (fid=0x746250) at prov/cxi/src/cxip_dom.c:568 21 0x000000000054da04 in fi_close (fid=0x746250) at ./include/rdma/fabric.h:632 22 0x0000000000570df9 in av_auth_key_test_rx_ep_fini () at prov/cxi/test/auth_key.c:2495 23 0x0000000000575efa in data_transfer_av_auth_key_av_user_id_source_err_auth_key_user_id_impl () at prov/cxi/test/auth_key.c:2802 24 0x0000155555017f50 in criterion_internal_test_main (fn=0x5751ad ) at ../src/core/test.c:94 25 0x00000000005751a5 in data_transfer_av_auth_key_av_user_id_source_err_auth_key_user_id_jmp () at prov/cxi/test/auth_key.c:2754 26 0x0000155555016823 in run_test_child () at ../src/core/runner_coroutine.c:230 27 0x00001555550268b1 in bxfi_main () at ../subprojects/boxfort/src/sandbox.c:57 28 0x000015555463e24d in __libc_start_main () from /lib64/libc.so.6 29 0x0000000000405bea in _start () at ../sysdeps/x86_64/start.S:120 Signed-off-by: Mike Uttormark --- include/ofi_mr.h | 1 + prov/util/src/util_mem_monitor.c | 94 ++++++++++++++++++++++++++------ 2 files changed, 79 insertions(+), 16 deletions(-) diff --git a/include/ofi_mr.h b/include/ofi_mr.h index 6f85e07eadd..12383413110 100644 --- a/include/ofi_mr.h +++ b/include/ofi_mr.h @@ -231,6 +231,7 @@ struct ofi_uffd { struct ofi_mem_monitor monitor; pthread_t thread; int fd; + int exit_pipe[2]; }; extern struct ofi_mem_monitor *uffd_monitor; diff --git a/prov/util/src/util_mem_monitor.c b/prov/util/src/util_mem_monitor.c index b725a90bc6d..2b6cc9b0ed5 100644 --- a/prov/util/src/util_mem_monitor.c +++ b/prov/util/src/util_mem_monitor.c @@ -61,6 +61,8 @@ static struct ofi_uffd uffd = { .monitor.start = ofi_uffd_start, .monitor.stop = ofi_uffd_stop, .monitor.name = "uffd", + .fd = -1, + .exit_pipe = { -1, -1 }, }; struct ofi_mem_monitor *uffd_monitor = &uffd.monitor; @@ -594,14 +596,17 @@ static void ofi_uffd_pagefault_handler(struct uffd_msg *msg); static void *ofi_uffd_handler(void *arg) { struct uffd_msg msg; - struct pollfd fds; + struct pollfd fds[2]; int ret; - fds.fd = uffd.fd; - fds.events = POLLIN; + fds[0].fd = uffd.fd; + fds[0].events = POLLIN; + fds[1].fd = uffd.exit_pipe[0]; + fds[1].events = POLLIN; + for (;;) { - ret = poll(&fds, 1, -1); - if (ret != 1) + ret = poll(fds, 2, -1); + if (ret < 0 || fds[1].revents) break; pthread_rwlock_rdlock(&mm_list_rwlock); @@ -832,24 +837,45 @@ static bool ofi_uffd_valid(struct ofi_mem_monitor *monitor, return true; } +static void ofi_uffd_close_fd(struct ofi_uffd *monitor) +{ + close(monitor->fd); + monitor->fd = -1; +} + +static void ofi_uffd_close_pipe(struct ofi_uffd *monitor) +{ + close(monitor->exit_pipe[0]); + close(monitor->exit_pipe[1]); + monitor->exit_pipe[0] = -1; + monitor->exit_pipe[1] = -1; +} + static int ofi_uffd_start(struct ofi_mem_monitor *monitor) { struct uffdio_api api; int ret; - uffd.monitor.subscribe = ofi_uffd_subscribe; - uffd.monitor.unsubscribe = ofi_uffd_unsubscribe; - uffd.monitor.valid = ofi_uffd_valid; + if (uffd.fd >= 0) + return 0; if (!num_page_sizes) return -FI_ENODATA; + ret = pipe(uffd.exit_pipe); + if (ret) { + FI_WARN(&core_prov, FI_LOG_MR, + "uffd/pipe: %s\n", strerror(errno)); + return -errno; + } + uffd.fd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK | UFFD_USER_MODE_ONLY); if (uffd.fd < 0) { FI_WARN(&core_prov, FI_LOG_MR, "syscall/userfaultfd %s\n", strerror(errno)); - return -errno; + ret = -errno; + goto close_pipe; } api.api = UFFD_API; @@ -860,13 +886,13 @@ static int ofi_uffd_start(struct ofi_mem_monitor *monitor) FI_WARN(&core_prov, FI_LOG_MR, "ioctl/uffdio: %s\n", strerror(errno)); ret = -errno; - goto closefd; + goto close_fd; } if (api.api != UFFD_API) { FI_WARN(&core_prov, FI_LOG_MR, "uffd features not supported\n"); ret = -FI_ENOSYS; - goto closefd; + goto close_fd; } ret = pthread_create(&uffd.thread, NULL, ofi_uffd_handler, &uffd); @@ -874,20 +900,56 @@ static int ofi_uffd_start(struct ofi_mem_monitor *monitor) FI_WARN(&core_prov, FI_LOG_MR, "failed to create handler thread %s\n", strerror(ret)); ret = -ret; - goto closefd; + goto close_fd; } + + uffd.monitor.subscribe = ofi_uffd_subscribe; + uffd.monitor.unsubscribe = ofi_uffd_unsubscribe; + uffd.monitor.valid = ofi_uffd_valid; + + FI_INFO(&core_prov, FI_LOG_MR, + "Memory monitor uffd started.\n"); + return 0; -closefd: - close(uffd.fd); +close_fd: + + ofi_uffd_close_fd(&uffd); + +close_pipe: + + ofi_uffd_close_pipe(&uffd); + + FI_WARN(&core_prov, FI_LOG_MR, + "Memory monitor uffd failed to start: %s.\n", + strerror(-ret)); + return ret; } static void ofi_uffd_stop(struct ofi_mem_monitor *monitor) { - pthread_cancel(uffd.thread); + ssize_t num_written; + + if (uffd.fd < 0) + return; + + /* tell the thread to exit with the exit_pipe */ + + num_written = write(uffd.exit_pipe[1], "X", 1); + if (num_written != 1) { + FI_WARN(&core_prov, FI_LOG_MR, + "uffd/close: unable to write to exit pipe: %s", + strerror(errno)); + } + pthread_join(uffd.thread, NULL); - close(uffd.fd); + + ofi_uffd_close_fd(&uffd); + ofi_uffd_close_pipe(&uffd); + + FI_INFO(&core_prov, FI_LOG_MR, + "Memory monitor uffd stopped.\n"); } #else /* HAVE_UFFD_MONITOR */