From 877221e27c2de9fd5dee0062d311e564db687d63 Mon Sep 17 00:00:00 2001 From: Mike Uttormark Date: Mon, 11 Dec 2023 15:48:35 -0600 Subject: [PATCH] prov/util: Integrate kdreg2 into libfabric kdreg2 is a Linux kernel module used to enable the libfabric MR cache for FI_HMEM_SYSTEM. Signed-off-by: Mike Uttormark Signed-off-by: Ian Ziemba --- Makefile.am | 1 + configure.ac | 51 +++- include/ofi_mr.h | 38 ++- libfabric.vcxproj | 1 + man/fi_mr.3.md | 5 +- prov/util/src/kdreg2_mem_monitor.c | 367 +++++++++++++++++++++++++++++ prov/util/src/util_mem_monitor.c | 17 +- 7 files changed, 472 insertions(+), 8 deletions(-) create mode 100644 prov/util/src/kdreg2_mem_monitor.c diff --git a/Makefile.am b/Makefile.am index 00242c7d65e..de2158c5fc1 100644 --- a/Makefile.am +++ b/Makefile.am @@ -91,6 +91,7 @@ common_srcs = \ prov/util/src/rocr_ipc_monitor.c \ prov/util/src/ze_ipc_monitor.c \ prov/util/src/xpmem_monitor.c \ + prov/util/src/kdreg2_mem_monitor.c \ prov/util/src/util_profile.c \ prov/coll/src/coll_attr.c \ prov/coll/src/coll_av.c \ diff --git a/configure.ac b/configure.ac index 7e9985991e4..2b476f7f9d2 100644 --- a/configure.ac +++ b/configure.ac @@ -598,6 +598,53 @@ AC_ARG_ENABLE([restricted_dl], AC_DEFINE_UNQUOTED([HAVE_RESTRICTED_DL], [$restricted_dl], [Define to 1 to only look for dl providers under default location if FI_PROVIDER_PATH is not set]) +dnl Check kdreg2 support +kdreg2_enabled=1 +have_kdreg2=0 +have_kdreg2_include_path=0 + +AC_ARG_ENABLE([kdreg2], + [AC_HELP_STRING([--disable-kdreg2], + [Determine whether kdreg2 memory monitor is disabled.])], + [AS_IF([test "$enable_kdreg2" = "no"], [kdreg2_enabled=0])], + []) + +AS_IF([test $kdreg2_enabled -ne 0 ], + [AC_CHECK_HEADER([linux/kdreg2.h], [have_kdreg2=1], [], []) + + AC_ARG_WITH([kdreg2], + [AS_HELP_STRING([--with-kdreg2=DIR], + [Enable KDREG2 memory monitor. 
+ Optional=.])], + [AS_CASE(["$with_kdreg2"], + ["no"], [kdreg2_enabled=0], + ["yes"], [], + [""], [], + [CPPFLAGS="$CPPFLAGS -I$with_kdreg2" + AC_CHECK_HEADER([kdreg2.h], + [have_kdreg2=1 + have_kdreg2_include_path=1], + [have_kdreg2=0], + [])]) + AS_IF([test $have_kdreg2 -eq 0 ], + [AC_MSG_ERROR([KDREG2 header not found in $with_kdreg2. Cannot enable KDREG2 memory monitor.])]) + ]) + ]) + +AS_IF([test $kdreg2_enabled -eq 0], + [AC_MSG_NOTICE([kdreg2 monitor disabled])], + [AS_IF([test $have_kdreg2 -ne 0], + [AC_MSG_NOTICE([kdreg2 present and enabled])])]) + +AC_DEFINE_UNQUOTED(HAVE_KDREG2, [$have_kdreg2], + [Define to 1 if kdreg2.h is available.]) + +AC_DEFINE_UNQUOTED(HAVE_KDREG2_INCLUDE_PATH, [$have_kdreg2_include_path], + [Define to 1 if kdreg2.h path is not .]) + +AC_DEFINE_UNQUOTED(HAVE_KDREG2_MONITOR, [$have_kdreg2], + [Define to 1 to enable kdreg2 memory monitor]) + dnl Check support to intercept syscalls AC_CHECK_HEADERS_ONCE(elf.h sys/auxv.h) @@ -888,16 +935,18 @@ AC_DEFINE_UNQUOTED(ENABLE_UFFD_MONITOR, [$enable_uffd], default_monitor="" bad_default="0" AC_ARG_WITH([default-monitor], - [AS_HELP_STRING([--with-default-monitor=], + [AS_HELP_STRING([--with-default-monitor=], [Select the default memory monitor.])], [AS_CASE([$with_default_monitor], [memhooks],[default_monitor=memhooks], [uffd],[default_monitor=uffd], + [kdreg2],[default_monitor=kdreg2], [disabled], [default_monitor=disabled], [AC_MSG_ERROR([Unknown monitor specified: $with_default_monitor. 
Choices are memhooks, uffd, or disabled.])]) AS_CASE([$default_monitor], [memhooks], [AS_IF([test "$enable_memhooks" != "1"], [bad_default=1])], [uffd], [AS_IF([test "$enable_uffd" != "1"], [bad_default=1])], + [kdreg2], [AS_IF([test "$have_kdreg2" != "1"], [bad_default=1])], []) AS_IF([test "$bad_default" != "0"], [AC_MSG_ERROR(["Default memory monitor is not available: $default_monitor."])]) diff --git a/include/ofi_mr.h b/include/ofi_mr.h index 64bae5f0755..6f85e07eadd 100644 --- a/include/ofi_mr.h +++ b/include/ofi_mr.h @@ -2,7 +2,7 @@ * Copyright (c) 2017-2019 Intel Corporation, Inc. All rights reserved. * Copyright (c) 2019-2021 Amazon.com, Inc. or its affiliates. * All rights reserved. - * (C) Copyright 2020 Hewlett Packard Enterprise Development LP + * (C) Copyright 2020-2023 Hewlett Packard Enterprise Development LP * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU @@ -40,6 +40,8 @@ # include #endif /* HAVE_CONFIG_H */ +struct ofi_mr; + #include #include @@ -48,6 +50,15 @@ #include #include #include +#include + +#if HAVE_KDREG2_MONITOR +#if HAVE_KDREG2_INCLUDE_PATH +#include "kdreg2.h" +#else +#include +#endif +#endif int ofi_open_mr_cache(uint32_t version, void *attr, size_t attr_len, uint64_t flags, struct fid **fid, void *context); @@ -128,6 +139,12 @@ struct ofi_mr_cache; union ofi_mr_hmem_info { uint64_t cuda_id; uint64_t ze_id; +#if HAVE_KDREG2_MONITOR + struct { + kdreg2_cookie_t cookie; + struct kdreg2_monitoring_params monitoring_params; + } kdreg2; +#endif }; struct ofi_mr_entry { @@ -228,6 +245,23 @@ struct ofi_memhooks { extern struct ofi_mem_monitor *memhooks_monitor; +/* + * Kdreg2 monitor + */ + +struct kdreg2_status_data; + +struct ofi_kdreg2 { + struct ofi_mem_monitor monitor; + pthread_t thread; + int fd; + int exit_pipe[2]; + const struct kdreg2_status_data *status_data; + ofi_atomic64_t next_cookie; +}; + +extern struct ofi_mem_monitor 
*kdreg2_monitor; + extern struct ofi_mem_monitor *cuda_monitor; extern struct ofi_mem_monitor *cuda_ipc_monitor; extern struct ofi_mem_monitor *rocr_monitor; @@ -368,7 +402,7 @@ struct ofi_mr_cache { struct ofi_rbmap tree; struct dlist_entry lru_list; struct dlist_entry dead_region_list; - pthread_mutex_t lock; + pthread_mutex_t lock; size_t cached_cnt; size_t cached_size; diff --git a/libfabric.vcxproj b/libfabric.vcxproj index 1bc35fb93b5..b4e8dc9cbd3 100644 --- a/libfabric.vcxproj +++ b/libfabric.vcxproj @@ -759,6 +759,7 @@ + diff --git a/man/fi_mr.3.md b/man/fi_mr.3.md index 7e13d587c47..be43f409c8e 100644 --- a/man/fi_mr.3.md +++ b/man/fi_mr.3.md @@ -1054,12 +1054,13 @@ configure registration caches. : The cache monitor is responsible for detecting system memory (FI_HMEM_SYSTEM) changes made between the virtual addresses used by an application and the underlying physical pages. Valid monitor options are: userfaultfd, memhooks, - and disabled. Selecting disabled will turn off the registration cache. + kdreg2, and disabled. Selecting disabled will turn off the registration cache. Userfaultfd is a Linux kernel feature used to report virtual to physical address mapping changes to user space. Memhooks operates by intercepting relevant memory allocation and deallocation calls which may result in the mappings changing, such as malloc, mmap, free, etc. Note that memhooks - operates at the elf linker layer, and does not use glibc memory hooks. + operates at the elf linker layer, and does not use glibc memory hooks. Kdreg2 + is supplied as a loadable Linux kernel module. 
*FI_MR_CUDA_CACHE_MONITOR_ENABLED* : The CUDA cache monitor is responsible for detecting CUDA device memory diff --git a/prov/util/src/kdreg2_mem_monitor.c b/prov/util/src/kdreg2_mem_monitor.c new file mode 100644 index 00000000000..ba7c2a21d31 --- /dev/null +++ b/prov/util/src/kdreg2_mem_monitor.c @@ -0,0 +1,367 @@ +/* + * (C) Copyright 2022-2023 Hewlett Packard Enterprise Development LP + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "ofi_mr.h" + +#if HAVE_KDREG2_MONITOR + +#include "ofi_hmem.h" + +#define EVICTOR_THREAD_ATTR NULL +#define INFINITE_TIMEOUT -1 + +static int kdreg2_monitor_subscribe(struct ofi_mem_monitor *monitor, + const void *addr, size_t len, + union ofi_mr_hmem_info *hmem_info) +{ + struct ofi_kdreg2 *kdreg2 = + container_of(monitor, struct ofi_kdreg2, monitor); + uint64_t cookie = ofi_atomic_inc64(&kdreg2->next_cookie); + struct kdreg2_ioctl_monitor ioctl_monitor = { + .addr = addr, + .length = len, + .cookie = (kdreg2_cookie_t) cookie, + }; + int ret; + + ret = ioctl(kdreg2->fd, KDREG2_IOCTL_MONITOR, &ioctl_monitor); + if (ret) + return ret; + + hmem_info->kdreg2.cookie = ioctl_monitor.cookie; + hmem_info->kdreg2.monitoring_params = ioctl_monitor.monitoring_params; + + return 0; +} + +static void kdreg2_monitor_unsubscribe(struct ofi_mem_monitor *monitor, + const void *addr, size_t len, + union ofi_mr_hmem_info *hmem_info) +{ + struct ofi_kdreg2 *kdreg2 = + container_of(monitor, struct ofi_kdreg2, monitor); + struct kdreg2_ioctl_unmonitor ioctl_unmonitor = { + .cookie = hmem_info->kdreg2.cookie, + .monitoring_params = hmem_info->kdreg2.monitoring_params, + }; + + ioctl(kdreg2->fd, KDREG2_IOCTL_UNMONITOR, &ioctl_unmonitor); +} + +static bool kdreg2_monitor_valid(struct ofi_mem_monitor *monitor, + const struct ofi_mr_info *info, + struct ofi_mr_entry *entry) +{ + struct ofi_kdreg2 *kdreg2 = + container_of(monitor, struct ofi_kdreg2, monitor); + struct kdreg2_monitoring_params *params = + &entry->hmem_info.kdreg2.monitoring_params; + + return !kdreg2_mapping_changed(kdreg2->status_data, params); +} + +static int kdreg2_read_evictions(struct ofi_kdreg2 *kdreg2) +{ + struct kdreg2_event event; + ssize_t bytes; + int err; + + while (kdreg2_read_counter(&kdreg2->status_data->pending_events) > 0) { + + /* The read should return a multiple of sizeof(event) or + * an error. There should be no partial reads. 
+ */ + + bytes = read(kdreg2->fd, &event, sizeof(event)); + if (bytes < 0) { + err = errno; + + /* EINTR means we caught a signal. */ + if (err == EINTR) + continue; + + /* Nothing left */ + if ((err == EAGAIN) || + (err == EWOULDBLOCK)) + return 0; + + /* All other errors */ + return err; + } + + switch (event.type) { + case KDREG2_EVENT_MAPPING_CHANGE: + + pthread_rwlock_rdlock(&mm_list_rwlock); + pthread_mutex_lock(&mm_lock); + + ofi_monitor_notify(&kdreg2->monitor, + event.u.mapping_change.addr, + event.u.mapping_change.len); + + pthread_mutex_unlock(&mm_lock); + pthread_rwlock_unlock(&mm_list_rwlock); + + break; + + default: + + return -ENOMSG; + } + } + + return 0; +} + +static void kdreg2_close_pipe(struct ofi_kdreg2 *kdreg2) +{ + close(kdreg2->exit_pipe[0]); + close(kdreg2->exit_pipe[1]); + kdreg2->exit_pipe[0] = -1; + kdreg2->exit_pipe[1] = -1; +} + +static void kdreg2_close_fd(struct ofi_kdreg2 *kdreg2) +{ + close(kdreg2->fd); + kdreg2->fd = -1; + kdreg2->status_data = NULL; +} + +static void *kdreg2_evictor(void *arg) +{ + struct ofi_kdreg2 *kdreg2 = (struct ofi_kdreg2 *) arg; + int ret; + struct pollfd pollfd[2] = { + { + .fd = kdreg2->fd, + .events = POLLIN, + }, + { .fd = kdreg2->exit_pipe[0], + .events = POLLIN, + }, + }; + int n; + + while (1) { + + /* wait until there are events to read */ + n = poll(pollfd, 2, INFINITE_TIMEOUT); + if (n == 0) /* timeout(?) 
*/ + continue; + + if (n < 0) { + switch (errno) { + case EINTR: /* interrupted */ + continue; + default: + ret = -errno; + goto error_ret; + } + } + + /* look for exit message on second fd */ + if (pollfd[1].revents) { + ret = 0; + goto error_ret; + } + + ret = kdreg2_read_evictions(kdreg2); + if (ret) + goto error_ret; + } + +error_ret: + + return (void *) (intptr_t) ret; +} + + +static int kdreg2_monitor_start(struct ofi_mem_monitor *monitor) +{ + struct ofi_kdreg2 *kdreg2 = + container_of(monitor, struct ofi_kdreg2, monitor); + int ret = 0; + struct kdreg2_config_data config_data; + + /* see if already started */ + if (kdreg2->fd >= 0) + return 0; + + ofi_atomic_initialize64(&kdreg2->next_cookie, 1); + + ret = pipe(kdreg2->exit_pipe); + if (ret) { + FI_WARN(&core_prov, FI_LOG_MR, + "Failed to create pipe for kdreg2: %s\n", + strerror(errno)); + return -errno; + } + + kdreg2->fd = open(KDREG2_DEVICE_NAME, O_RDWR); + if (kdreg2->fd < 0) { + FI_WARN(&core_prov, FI_LOG_MR, + "Failed to open %s for monitor kdreg2: %s.\n", + KDREG2_DEVICE_NAME, strerror(errno)); + ret = -errno; + goto close_pipe; + } + + /* configure the monitor with the maximum number of entries */ + + config_data.max_regions = cache_params.max_cnt; + if (!config_data.max_regions) { + ret = -FI_ENOSPC; + goto close_fd; + } + + ret = ioctl(kdreg2->fd, KDREG2_IOCTL_CONFIG_DATA, &config_data); + if (ret) { + FI_WARN(&core_prov, FI_LOG_MR, + "Failed to get module config data for kdreg2 monitor: %d.\n", + errno); + ret = -errno; + goto close_fd; + } + + /* Configuring the monitor allocates the status data. Save the address. 
*/ + + kdreg2->status_data = config_data.status_data; + + ret = pthread_create(&kdreg2->thread, EVICTOR_THREAD_ATTR, + kdreg2_evictor, kdreg2); + if (ret) { + FI_WARN(&core_prov, FI_LOG_MR, + "Failed to start thread for kdreg2 monitor: %d.\n", ret); + ret = -ret; + goto close_fd; + } + + FI_INFO(&core_prov, FI_LOG_MR, "Kdreg2 memory monitor started.\n"); + + return 0; + +close_fd: + + kdreg2_close_fd(kdreg2); + +close_pipe: + + kdreg2_close_pipe(kdreg2); + + FI_WARN(&core_prov, FI_LOG_MR, + "Kdreg2 memory monitor failed to start: %i.\n", ret); + + return ret; +} + +static void kdreg2_monitor_stop(struct ofi_mem_monitor *monitor) +{ + ssize_t num_written; + struct ofi_kdreg2 *kdreg2 = + container_of(monitor, struct ofi_kdreg2, monitor); + + /* see if it's really running */ + if (kdreg2->fd < 0) + return; + + num_written = write(kdreg2->exit_pipe[1], "X", 1); + if (num_written != 1) { + FI_WARN(&core_prov, FI_LOG_MR, + "Unable to write to kdreg2 exit pipe: %s\n", + strerror(errno)); + /* We could call pthread cancel here. The thread + * has probably already exited. Cancelling would be + * benign. But calling join on an exited thread is + * also legal. 
+ */ + } + + pthread_join(kdreg2->thread, NULL); + + kdreg2_close_fd(kdreg2); + kdreg2_close_pipe(kdreg2); + + FI_INFO(&core_prov, FI_LOG_MR, "Kdreg2 memory monitor stopped.\n"); +} + +#else /* !HAVE_KDREG2_MONITOR */ + +static int kdreg2_monitor_subscribe(struct ofi_mem_monitor *monitor, + const void *addr, + size_t len, + union ofi_mr_hmem_info *hmem_info) +{ + return -FI_ENOSYS; +} + +static void kdreg2_monitor_unsubscribe(struct ofi_mem_monitor *monitor, + const void *addr, size_t len, + union ofi_mr_hmem_info *hmem_info) +{ +} + +static bool kdreg2_monitor_valid(struct ofi_mem_monitor *monitor, + const struct ofi_mr_info *info, + struct ofi_mr_entry *entry) +{ + return false; +} + +static int kdreg2_monitor_start(struct ofi_mem_monitor *monitor) +{ + return -FI_ENOSYS; +} + +static void kdreg2_monitor_stop(struct ofi_mem_monitor *monitor) +{ + /* no-op */ +} + +#endif /* HAVE_KDREG2_MONITOR */ + +static struct ofi_kdreg2 kdreg2_mm = { + .monitor.iface = FI_HMEM_SYSTEM, + .monitor.init = ofi_monitor_init, + .monitor.cleanup = ofi_monitor_cleanup, + .monitor.start = kdreg2_monitor_start, + .monitor.stop = kdreg2_monitor_stop, + .monitor.subscribe = kdreg2_monitor_subscribe, + .monitor.unsubscribe = kdreg2_monitor_unsubscribe, + .monitor.valid = kdreg2_monitor_valid, + .monitor.name = "kdreg2", + .fd = -1, + .exit_pipe = { -1, -1 }, + .status_data = NULL, +}; + +struct ofi_mem_monitor *kdreg2_monitor = &kdreg2_mm.monitor; diff --git a/prov/util/src/util_mem_monitor.c b/prov/util/src/util_mem_monitor.c index 10a6b4e2795..b725a90bc6d 100644 --- a/prov/util/src/util_mem_monitor.c +++ b/prov/util/src/util_mem_monitor.c @@ -194,6 +194,7 @@ static void initialize_monitor_list() ze_monitor, ze_ipc_monitor, import_monitor, + kdreg2_monitor, }; monitor_list_size = ARRAY_SIZE(monitors); @@ -229,6 +230,13 @@ static void set_default_monitor(const char *monitor) #else FI_WARN(&core_prov, FI_LOG_MR, "memhooks monitor not available\n"); default_monitor = NULL; +#endif + } else if 
(!strcmp(monitor, "kdreg2")) { +#if HAVE_KDREG2_MONITOR + default_monitor = kdreg2_monitor; +#else + FI_WARN(&core_prov, FI_LOG_MR, "kdreg2 monitor not available\n"); + default_monitor = NULL; #endif } else if (!strcmp(monitor, "disabled")) { default_monitor = NULL; @@ -269,9 +277,10 @@ void ofi_monitors_init(void) "Define a default memory registration monitor." " The monitor checks for virtual to physical memory" " address changes. Options are: userfaultfd, memhooks" - " and disabled. Userfaultfd is a Linux kernel feature." - " Memhooks operates by intercepting memory allocation" - " and free calls." + ", kdreg2, and disabled. Userfaultfd is a Linux kernel" + " feature. Memhooks operates by intercepting memory" + " allocation and free calls. kdreg2 is supplied as a" + " loadable Linux kernel module." #if defined(HAVE_MR_CACHE_MONITOR_DEFAULT) " " HAVE_MR_CACHE_MONITOR_DEFAULT #else @@ -313,6 +322,8 @@ void ofi_monitors_init(void) default_monitor = memhooks_monitor; #elif HAVE_UFFD_MONITOR default_monitor = uffd_monitor; +#elif HAVE_KDREG2_MONITOR + default_monitor = kdreg2_monitor; #else default_monitor = NULL; #endif