diff --git a/debian/changelog b/debian/changelog index b5c6fc4b..3d7d5c5f 100644 --- a/debian/changelog +++ b/debian/changelog @@ -1,3 +1,11 @@ +pve-kernel (5.16.5-2) edge; urgency=medium + + * Synchronize kernel patches with Proxmox. + * Synchronize ZFS patches with Proxmox. + * Separate Ubuntu patches from Proxmox patches. + + -- Fabian Mastenbroek Fri, 04 Feb 2022 11:00:00 +0000 + pve-kernel (5.16.5-1) edge; urgency=medium * Update to Linux 5.16.5. diff --git a/debian/patches/pve/0006-Revert-PCI-Coalesce-host-bridge-contiguous-apertures.patch b/debian/patches/pve/0006-Revert-PCI-Coalesce-host-bridge-contiguous-apertures.patch new file mode 100644 index 00000000..d38d1a9e --- /dev/null +++ b/debian/patches/pve/0006-Revert-PCI-Coalesce-host-bridge-contiguous-apertures.patch @@ -0,0 +1,104 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Thomas Lamprecht +Date: Mon, 27 Sep 2021 11:28:39 +0200 +Subject: [PATCH] Revert "PCI: Coalesce host bridge contiguous apertures" +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +This reverts commit ab20e43b20b60f5cc8e2ea3763ffa388158469ac. + +was reverted upstream because of reports similar to + +Link: https://bugzilla.proxmox.com/show_bug.cgi?id=3552 +Link: https://lore.kernel.org/r/20210709231529.GA3270116@roeck-us.net +Signed-off-by: Fabian Grünbichler +Signed-off-by: Thomas Lamprecht +--- + drivers/pci/probe.c | 50 ++++----------------------------------------- + 1 file changed, 4 insertions(+), 46 deletions(-) + +diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c +index cb70d2605e97..258350f80f6c 100644 +--- a/drivers/pci/probe.c ++++ b/drivers/pci/probe.c +@@ -20,7 +20,6 @@ + #include + #include + #include +-#include + #include "pci.h" + + #define CARDBUS_LATENCY_TIMER 176 /* secondary latency timer */ +@@ -881,31 +880,14 @@ static void pci_set_bus_msi_domain(struct pci_bus *bus) + dev_set_msi_domain(&bus->dev, d); + } + +-static int res_cmp(void *priv, const struct list_head *a, +- const struct list_head *b) +-{ +- struct resource_entry *entry1, *entry2; +- +- entry1 = container_of(a, struct resource_entry, node); +- entry2 = container_of(b, struct resource_entry, node); +- +- if (entry1->res->flags != entry2->res->flags) +- return entry1->res->flags > entry2->res->flags; +- +- if (entry1->offset != entry2->offset) +- return entry1->offset > entry2->offset; +- +- return entry1->res->start > entry2->res->start; +-} +- + static int pci_register_host_bridge(struct pci_host_bridge *bridge) + { + struct device *parent = bridge->dev.parent; +- struct resource_entry *window, *next, *n; ++ struct resource_entry *window, *n; + struct pci_bus *bus, *b; +- resource_size_t offset, next_offset; ++ resource_size_t offset; + LIST_HEAD(resources); +- struct resource *res, *next_res; ++ struct resource *res; + char addr[64], *fmt; + const char *name; + int err; +@@ -988,35 +970,11 @@ static int pci_register_host_bridge(struct pci_host_bridge *bridge) + if (nr_node_ids > 1 && pcibus_to_node(bus) == NUMA_NO_NODE) + dev_warn(&bus->dev, "Unknown NUMA node; performance will be reduced\n"); + +- /* Sort and coalesce contiguous windows */ +- list_sort(NULL, &resources, res_cmp); +- resource_list_for_each_entry_safe(window, n, &resources) { +- if (list_is_last(&window->node, &resources)) +- break; +- +- next = list_next_entry(window, node); +- offset = window->offset; +- res = window->res; +- next_offset = next->offset; +- next_res = next->res; +- +- if (res->flags != next_res->flags || offset != next_offset) +- continue; +- +- if (res->end + 1 == next_res->start) { +- next_res->start = res->start; +- res->flags = res->start = res->end = 0; +- } +- } +- + /* Add initial resources to the bus */ + resource_list_for_each_entry_safe(window, n, &resources) { ++ list_move_tail(&window->node, &bridge->windows); + offset = window->offset; + res = window->res; +- if (!res->end) +- continue; +- +- list_move_tail(&window->node, &bridge->windows); + + if (res->flags & IORESOURCE_BUS) + pci_bus_insert_busn_res(bus, bus->number, res->end); \ No newline at end of file diff --git a/debian/patches/pve/0007-PCI-Reinstate-PCI-Coalesce-host-bridge-contiguous-ap.patch b/debian/patches/pve/0007-PCI-Reinstate-PCI-Coalesce-host-bridge-contiguous-ap.patch new file mode 100644 index 00000000..c46b7192 --- /dev/null +++ b/debian/patches/pve/0007-PCI-Reinstate-PCI-Coalesce-host-bridge-contiguous-ap.patch @@ -0,0 +1,112 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Kai-Heng Feng +Date: Tue, 13 Jul 2021 20:50:07 +0800 +Subject: [PATCH] PCI: Reinstate "PCI: Coalesce host bridge contiguous + apertures" +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +Built-in graphics on HP EliteDesk 805 G6 doesn't work because graphics +can't get the BAR it needs: + pci_bus 0000:00: root bus resource [mem 0x10020200000-0x100303fffff window] + pci_bus 0000:00: root bus resource [mem 0x10030400000-0x100401fffff window] + + pci 0000:00:08.1: bridge window [mem 0xd2000000-0xd23fffff] + pci 0000:00:08.1: bridge window [mem 0x10030000000-0x100401fffff 64bit pref] + pci 0000:00:08.1: can't claim BAR 15 [mem 0x10030000000-0x100401fffff 64bit pref]: no compatible bridge window + pci 0000:00:08.1: [mem 0x10030000000-0x100401fffff 64bit pref] clipped to [mem 0x10030000000-0x100303fffff 64bit pref] + pci 0000:00:08.1: bridge window [mem 0x10030000000-0x100303fffff 64bit pref] + pci 0000:07:00.0: can't claim BAR 0 [mem 0x10030000000-0x1003fffffff 64bit pref]: no compatible bridge window + pci 0000:07:00.0: can't claim BAR 2 [mem 0x10040000000-0x100401fffff 64bit pref]: no compatible bridge window + +However, the root bus has two contiguous apertures that can contain the +child resource requested. + +Coalesce contiguous apertures so we can allocate from the entire contiguous +region. + +This is the second take of commit 65db04053efe ("PCI: Coalesce host +bridge contiguous apertures"). The original approach sorts the apertures +by address, but that makes NVMe stop working on QEMU ppc:sam460ex: + PCI host bridge to bus 0002:00 + pci_bus 0002:00: root bus resource [io 0x0000-0xffff] + pci_bus 0002:00: root bus resource [mem 0xd80000000-0xdffffffff] (bus address [0x80000000-0xffffffff]) + pci_bus 0002:00: root bus resource [mem 0xc0ee00000-0xc0eefffff] (bus address [0x00000000-0x000fffff]) + +After the offending commit: + PCI host bridge to bus 0002:00 + pci_bus 0002:00: root bus resource [io 0x0000-0xffff] + pci_bus 0002:00: root bus resource [mem 0xc0ee00000-0xc0eefffff] (bus address [0x00000000-0x000fffff]) + pci_bus 0002:00: root bus resource [mem 0xd80000000-0xdffffffff] (bus address [0x80000000-0xffffffff]) + +Since the apertures on HP EliteDesk 805 G6 are already in ascending +order, doing a precautious sorting is not necessary. + +Remove the sorting part to avoid the regression on ppc:sam460ex. + +Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=212013 +Cc: Guenter Roeck +Suggested-by: Bjorn Helgaas +Signed-off-by: Kai-Heng Feng +Signed-off-by: Fabian Grünbichler +Signed-off-by: Thomas Lamprecht +--- + drivers/pci/probe.c | 31 +++++++++++++++++++++++++++---- + 1 file changed, 27 insertions(+), 4 deletions(-) + +diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c +index 258350f80f6c..7ff9fcec365b 100644 +--- a/drivers/pci/probe.c ++++ b/drivers/pci/probe.c +@@ -883,11 +883,11 @@ static void pci_set_bus_msi_domain(struct pci_bus *bus) + static int pci_register_host_bridge(struct pci_host_bridge *bridge) + { + struct device *parent = bridge->dev.parent; +- struct resource_entry *window, *n; ++ struct resource_entry *window, *next, *n; + struct pci_bus *bus, *b; +- resource_size_t offset; ++ resource_size_t offset, next_offset; + LIST_HEAD(resources); +- struct resource *res; ++ struct resource *res, *next_res; + char addr[64], *fmt; + const char *name; + int err; +@@ -970,11 +970,34 @@ static int pci_register_host_bridge(struct pci_host_bridge *bridge) + if (nr_node_ids > 1 && pcibus_to_node(bus) == NUMA_NO_NODE) + dev_warn(&bus->dev, "Unknown NUMA node; performance will be reduced\n"); + ++ /* Coalesce contiguous windows */ ++ resource_list_for_each_entry_safe(window, n, &resources) { ++ if (list_is_last(&window->node, &resources)) ++ break; ++ ++ next = list_next_entry(window, node); ++ offset = window->offset; ++ res = window->res; ++ next_offset = next->offset; ++ next_res = next->res; ++ ++ if (res->flags != next_res->flags || offset != next_offset) ++ continue; ++ ++ if (res->end + 1 == next_res->start) { ++ next_res->start = res->start; ++ res->flags = res->start = res->end = 0; ++ } ++ } ++ + /* Add initial resources to the bus */ + resource_list_for_each_entry_safe(window, n, &resources) { +- list_move_tail(&window->node, &bridge->windows); + offset = window->offset; + res = window->res; ++ if (!res->end) ++ continue; ++ ++ list_move_tail(&window->node, &bridge->windows); + + if (res->flags & IORESOURCE_BUS) + pci_bus_insert_busn_res(bus, bus->number, res->end); \ No newline at end of file diff --git a/debian/patches/pve/0006-disable-split-btf.patch b/debian/patches/pve/0008-do-not-generate-split-BTF-type-info-per-default.patch similarity index 100% rename from debian/patches/pve/0006-disable-split-btf.patch rename to debian/patches/pve/0008-do-not-generate-split-BTF-type-info-per-default.patch diff --git a/debian/patches/pve/0009-blk-cgroup-always-terminate-io.stat-lines.patch b/debian/patches/pve/0009-blk-cgroup-always-terminate-io.stat-lines.patch new file mode 100644 index 00000000..c4221f78 --- /dev/null +++ b/debian/patches/pve/0009-blk-cgroup-always-terminate-io.stat-lines.patch @@ -0,0 +1,147 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Wolfgang Bumiller +Date: Tue, 11 Jan 2022 09:31:59 +0100 +Subject: [PATCH] blk-cgroup: always terminate io.stat lines + +With the removal of seq_get_buf in blkcg_print_one_stat, we +cannot make adding the newline conditional on there being +relevant stats because the name was already written out +unconditionally. +Otherwise we may end up with multiple device names in one +line which is confusing and doesn't follow the nested-keyed +file format. + +Signed-off-by: Wolfgang Bumiller +Fixes: 252c651a4c85 ("blk-cgroup: stop using seq_get_buf") +Signed-off-by: Thomas Lamprecht +--- + block/blk-cgroup.c | 9 ++------- + block/blk-iocost.c | 5 ++--- + block/blk-iolatency.c | 8 +++----- + include/linux/blk-cgroup.h | 2 +- + 4 files changed, 8 insertions(+), 16 deletions(-) + +diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c +index 0eec59e4df65..38c62a44905a 100644 +--- a/block/blk-cgroup.c ++++ b/block/blk-cgroup.c +@@ -887,7 +887,6 @@ static void blkcg_print_one_stat(struct blkcg_gq *blkg, struct seq_file *s) + { + struct blkg_iostat_set *bis = &blkg->iostat; + u64 rbytes, wbytes, rios, wios, dbytes, dios; +- bool has_stats = false; + const char *dname; + unsigned seq; + int i; +@@ -913,14 +912,12 @@ static void blkcg_print_one_stat(struct blkcg_gq *blkg, struct seq_file *s) + } while (u64_stats_fetch_retry(&bis->sync, seq)); + + if (rbytes || wbytes || rios || wios) { +- has_stats = true; + seq_printf(s, "rbytes=%llu wbytes=%llu rios=%llu wios=%llu dbytes=%llu dios=%llu", + rbytes, wbytes, rios, wios, + dbytes, dios); + } + + if (blkcg_debug_stats && atomic_read(&blkg->use_delay)) { +- has_stats = true; + seq_printf(s, " use_delay=%d delay_nsec=%llu", + atomic_read(&blkg->use_delay), + atomic64_read(&blkg->delay_nsec)); +@@ -932,12 +929,10 @@ static void blkcg_print_one_stat(struct blkcg_gq *blkg, struct seq_file *s) + if (!blkg->pd[i] || !pol->pd_stat_fn) + continue; + +- if (pol->pd_stat_fn(blkg->pd[i], s)) +- has_stats = true; ++ pol->pd_stat_fn(blkg->pd[i], s); + } + +- if (has_stats) +- seq_printf(s, "\n"); ++ seq_puts(s, "\n"); + } + + static int blkcg_print_stat(struct seq_file *sf, void *v) +diff --git a/block/blk-iocost.c b/block/blk-iocost.c +index eb7b0d6bd11f..381c28f9561e 100644 +--- a/block/blk-iocost.c ++++ b/block/blk-iocost.c +@@ -2995,13 +2995,13 @@ static void ioc_pd_free(struct blkg_policy_data *pd) + kfree(iocg); + } + +-static bool ioc_pd_stat(struct blkg_policy_data *pd, struct seq_file *s) ++static void ioc_pd_stat(struct blkg_policy_data *pd, struct seq_file *s) + { + struct ioc_gq *iocg = pd_to_iocg(pd); + struct ioc *ioc = iocg->ioc; + + if (!ioc->enabled) +- return false; ++ return; + + if (iocg->level == 0) { + unsigned vp10k = DIV64_U64_ROUND_CLOSEST( +@@ -3017,7 +3017,6 @@ static bool ioc_pd_stat(struct blkg_policy_data *pd, struct seq_file *s) + iocg->last_stat.wait_us, + iocg->last_stat.indebt_us, + iocg->last_stat.indelay_us); +- return true; + } + + static u64 ioc_weight_prfill(struct seq_file *sf, struct blkg_policy_data *pd, +diff --git a/block/blk-iolatency.c b/block/blk-iolatency.c +index c0545f9da549..d33460f3d43d 100644 +--- a/block/blk-iolatency.c ++++ b/block/blk-iolatency.c +@@ -890,7 +890,7 @@ static int iolatency_print_limit(struct seq_file *sf, void *v) + return 0; + } + +-static bool iolatency_ssd_stat(struct iolatency_grp *iolat, struct seq_file *s) ++static void iolatency_ssd_stat(struct iolatency_grp *iolat, struct seq_file *s) + { + struct latency_stat stat; + int cpu; +@@ -913,17 +913,16 @@ static bool iolatency_ssd_stat(struct iolatency_grp *iolat, struct seq_file *s) + (unsigned long long)stat.ps.missed, + (unsigned long long)stat.ps.total, + iolat->rq_depth.max_depth); +- return true; + } + +-static bool iolatency_pd_stat(struct blkg_policy_data *pd, struct seq_file *s) ++static void iolatency_pd_stat(struct blkg_policy_data *pd, struct seq_file *s) + { + struct iolatency_grp *iolat = pd_to_lat(pd); + unsigned long long avg_lat; + unsigned long long cur_win; + + if (!blkcg_debug_stats) +- return false; ++ return; + + if (iolat->ssd) + return iolatency_ssd_stat(iolat, s); +@@ -936,7 +935,6 @@ static bool iolatency_pd_stat(struct blkg_policy_data *pd, struct seq_file *s) + else + seq_printf(s, " depth=%u avg_lat=%llu win=%llu", + iolat->rq_depth.max_depth, avg_lat, cur_win); +- return true; + } + + static struct blkg_policy_data *iolatency_pd_alloc(gfp_t gfp, +diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h +index b4de2010fba5..132e05ed6935 100644 +--- a/include/linux/blk-cgroup.h ++++ b/include/linux/blk-cgroup.h +@@ -152,7 +152,7 @@ typedef void (blkcg_pol_online_pd_fn)(struct blkg_policy_data *pd); + typedef void (blkcg_pol_offline_pd_fn)(struct blkg_policy_data *pd); + typedef void (blkcg_pol_free_pd_fn)(struct blkg_policy_data *pd); + typedef void (blkcg_pol_reset_pd_stats_fn)(struct blkg_policy_data *pd); +-typedef bool (blkcg_pol_stat_pd_fn)(struct blkg_policy_data *pd, ++typedef void (blkcg_pol_stat_pd_fn)(struct blkg_policy_data *pd, + struct seq_file *s); + + struct blkcg_policy { \ No newline at end of file diff --git a/debian/patches/series.linux b/debian/patches/series.linux index 64f0feb1..9d565a61 100644 --- a/debian/patches/series.linux +++ b/debian/patches/series.linux @@ -1,10 +1,14 @@ +ubuntu/0001-apparmor-compatibility-v2.x-net-rules.patch +ubuntu/0002-apparmor-af_unix-mediation.patch +ubuntu/0003-apparmor-fix-apparmor-mediating-locking-non-fs-unix-sockets.patch +ubuntu/0004-apparmor-fix-use-after-free-in-sk_peer_label.patch + pve/0001-Make-mkcompile_h-accept-an-alternate-timestamp-strin.patch pve/0002-bridge-keep-MAC-of-first-assigned-port.patch pve/0003-pci-Enable-overrides-for-missing-ACS-capabilities-4..patch pve/0004-kvm-disable-default-dynamic-halt-polling-growth.patch pve/0005-net-core-downgrade-unregister_netdevice-refcount-lea.patch -pve/0006-disable-split-btf.patch -pve/0007-apparmor-compatibility-v2.x-net-rules.patch -pve/0008-apparmor-af_unix-mediation.patch -pve/0009-apparmor-fix-apparmor-mediating-locking-non-fs-unix-sockets.patch -pve/0010-apparmor-fix-use-after-free-in-sk_peer_label.patch +# pve/0006-Revert-PCI-Coalesce-host-bridge-contiguous-apertures.patch +# pve/0007-PCI-Reinstate-PCI-Coalesce-host-bridge-contiguous-ap.patch +pve/0008-do-not-generate-split-BTF-type-info-per-default.patch +pve/0009-blk-cgroup-always-terminate-io.stat-lines.patch \ No newline at end of file diff --git a/debian/patches/series.zfs b/debian/patches/series.zfs index 67a08e21..25667b65 100644 --- a/debian/patches/series.zfs +++ b/debian/patches/series.zfs @@ -7,3 +7,6 @@ zfs/0006-dont-symlink-zed-scripts.patch zfs/0007-Use-installed-python3.patch zfs/0008-Add-systemd-unit-for-importing-specific-pools.patch zfs/0009-Patch-move-manpage-arcstat-1-to-arcstat-8.patch +zfs/0010-arcstat-Fix-integer-division-with-python3.patch +zfs/0011-arc-stat-summary-guard-access-to-l2arc-MFU-MRU-stats.patch +zfs/0012-Fix-zvol_open-lock-inversion.patch diff --git a/debian/patches/pve/0007-apparmor-compatibility-v2.x-net-rules.patch b/debian/patches/ubuntu/0001-apparmor-compatibility-v2.x-net-rules.patch similarity index 100% rename from debian/patches/pve/0007-apparmor-compatibility-v2.x-net-rules.patch rename to debian/patches/ubuntu/0001-apparmor-compatibility-v2.x-net-rules.patch diff --git a/debian/patches/pve/0008-apparmor-af_unix-mediation.patch b/debian/patches/ubuntu/0002-apparmor-af_unix-mediation.patch similarity index 100% rename from debian/patches/pve/0008-apparmor-af_unix-mediation.patch rename to debian/patches/ubuntu/0002-apparmor-af_unix-mediation.patch diff --git a/debian/patches/pve/0009-apparmor-fix-apparmor-mediating-locking-non-fs-unix-sockets.patch b/debian/patches/ubuntu/0003-apparmor-fix-apparmor-mediating-locking-non-fs-unix-sockets.patch similarity index 100% rename from debian/patches/pve/0009-apparmor-fix-apparmor-mediating-locking-non-fs-unix-sockets.patch rename to debian/patches/ubuntu/0003-apparmor-fix-apparmor-mediating-locking-non-fs-unix-sockets.patch diff --git a/debian/patches/pve/0010-apparmor-fix-use-after-free-in-sk_peer_label.patch b/debian/patches/ubuntu/0004-apparmor-fix-use-after-free-in-sk_peer_label.patch similarity index 100% rename from debian/patches/pve/0010-apparmor-fix-use-after-free-in-sk_peer_label.patch rename to debian/patches/ubuntu/0004-apparmor-fix-use-after-free-in-sk_peer_label.patch diff --git a/debian/patches/zfs/0010-arcstat-Fix-integer-division-with-python3.patch b/debian/patches/zfs/0010-arcstat-Fix-integer-division-with-python3.patch new file mode 100644 index 00000000..086347f8 --- /dev/null +++ b/debian/patches/zfs/0010-arcstat-Fix-integer-division-with-python3.patch @@ -0,0 +1,134 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Valmiky Arquissandas +Date: Fri, 8 Oct 2021 16:32:27 +0100 +Subject: [PATCH] arcstat: Fix integer division with python3 + +The arcstat script requests compatibility with python2 and python3, but +PEP 238 modified the / operator and results in erroneous output when +run under python3. + +This commit replaces instances of / with //, yielding the expected +result in both versions of Python. + +Reviewed-by: Brian Behlendorf +Reviewed-by: John Kennedy +Reviewed-by: Ryan Moeller +Signed-off-by: Valmiky Arquissandas +Closes #12603 +(cherry picked from commit 2d02bba23d83ae8fede8d281edc255f01ccd28e9) +Signed-off-by: Thomas Lamprecht +--- + cmd/arcstat/arcstat.in | 66 +++++++++++++++++++++--------------------- + 1 file changed, 33 insertions(+), 33 deletions(-) + +diff --git a/cmd/arcstat/arcstat.in b/cmd/arcstat/arcstat.in +index 9e7c52a6c..cd9a803a2 100755 +--- a/cmd/arcstat/arcstat.in ++++ b/cmd/arcstat/arcstat.in +@@ -441,73 +441,73 @@ def calculate(): + + v = dict() + v["time"] = time.strftime("%H:%M:%S", time.localtime()) +- v["hits"] = d["hits"] / sint +- v["miss"] = d["misses"] / sint ++ v["hits"] = d["hits"] // sint ++ v["miss"] = d["misses"] // sint + v["read"] = v["hits"] + v["miss"] +- v["hit%"] = 100 * v["hits"] / v["read"] if v["read"] > 0 else 0 ++ v["hit%"] = 100 * v["hits"] // v["read"] if v["read"] > 0 else 0 + v["miss%"] = 100 - v["hit%"] if v["read"] > 0 else 0 + +- v["dhit"] = (d["demand_data_hits"] + d["demand_metadata_hits"]) / sint +- v["dmis"] = (d["demand_data_misses"] + d["demand_metadata_misses"]) / sint ++ v["dhit"] = (d["demand_data_hits"] + d["demand_metadata_hits"]) // sint ++ v["dmis"] = (d["demand_data_misses"] + d["demand_metadata_misses"]) // sint + + v["dread"] = v["dhit"] + v["dmis"] +- v["dh%"] = 100 * v["dhit"] / v["dread"] if v["dread"] > 0 else 0 ++ v["dh%"] = 100 * v["dhit"] // v["dread"] if v["dread"] > 0 else 0 + v["dm%"] = 100 - v["dh%"] if v["dread"] > 0 else 0 + +- v["phit"] = (d["prefetch_data_hits"] + d["prefetch_metadata_hits"]) / sint ++ v["phit"] = (d["prefetch_data_hits"] + d["prefetch_metadata_hits"]) // sint + v["pmis"] = (d["prefetch_data_misses"] + +- d["prefetch_metadata_misses"]) / sint ++ d["prefetch_metadata_misses"]) // sint + + v["pread"] = v["phit"] + v["pmis"] +- v["ph%"] = 100 * v["phit"] / v["pread"] if v["pread"] > 0 else 0 ++ v["ph%"] = 100 * v["phit"] // v["pread"] if v["pread"] > 0 else 0 + v["pm%"] = 100 - v["ph%"] if v["pread"] > 0 else 0 + + v["mhit"] = (d["prefetch_metadata_hits"] + +- d["demand_metadata_hits"]) / sint ++ d["demand_metadata_hits"]) // sint + v["mmis"] = (d["prefetch_metadata_misses"] + +- d["demand_metadata_misses"]) / sint ++ d["demand_metadata_misses"]) // sint + + v["mread"] = v["mhit"] + v["mmis"] +- v["mh%"] = 100 * v["mhit"] / v["mread"] if v["mread"] > 0 else 0 ++ v["mh%"] = 100 * v["mhit"] // v["mread"] if v["mread"] > 0 else 0 + v["mm%"] = 100 - v["mh%"] if v["mread"] > 0 else 0 + + v["arcsz"] = cur["size"] + v["size"] = cur["size"] + v["c"] = cur["c"] +- v["mfu"] = d["mfu_hits"] / sint +- v["mru"] = d["mru_hits"] / sint +- v["mrug"] = d["mru_ghost_hits"] / sint +- v["mfug"] = d["mfu_ghost_hits"] / sint +- v["eskip"] = d["evict_skip"] / sint +- v["el2skip"] = d["evict_l2_skip"] / sint +- v["el2cach"] = d["evict_l2_cached"] / sint +- v["el2el"] = d["evict_l2_eligible"] / sint +- v["el2mfu"] = d["evict_l2_eligible_mfu"] / sint +- v["el2mru"] = d["evict_l2_eligible_mru"] / sint +- v["el2inel"] = d["evict_l2_ineligible"] / sint +- v["mtxmis"] = d["mutex_miss"] / sint ++ v["mfu"] = d["mfu_hits"] // sint ++ v["mru"] = d["mru_hits"] // sint ++ v["mrug"] = d["mru_ghost_hits"] // sint ++ v["mfug"] = d["mfu_ghost_hits"] // sint ++ v["eskip"] = d["evict_skip"] // sint ++ v["el2skip"] = d["evict_l2_skip"] // sint ++ v["el2cach"] = d["evict_l2_cached"] // sint ++ v["el2el"] = d["evict_l2_eligible"] // sint ++ v["el2mfu"] = d["evict_l2_eligible_mfu"] // sint ++ v["el2mru"] = d["evict_l2_eligible_mru"] // sint ++ v["el2inel"] = d["evict_l2_ineligible"] // sint ++ v["mtxmis"] = d["mutex_miss"] // sint + + if l2exist: +- v["l2hits"] = d["l2_hits"] / sint +- v["l2miss"] = d["l2_misses"] / sint ++ v["l2hits"] = d["l2_hits"] // sint ++ v["l2miss"] = d["l2_misses"] // sint + v["l2read"] = v["l2hits"] + v["l2miss"] +- v["l2hit%"] = 100 * v["l2hits"] / v["l2read"] if v["l2read"] > 0 else 0 ++ v["l2hit%"] = 100 * v["l2hits"] // v["l2read"] if v["l2read"] > 0 else 0 + + v["l2miss%"] = 100 - v["l2hit%"] if v["l2read"] > 0 else 0 + v["l2asize"] = cur["l2_asize"] + v["l2size"] = cur["l2_size"] +- v["l2bytes"] = d["l2_read_bytes"] / sint ++ v["l2bytes"] = d["l2_read_bytes"] // sint + + v["l2pref"] = cur["l2_prefetch_asize"] + v["l2mfu"] = cur["l2_mfu_asize"] + v["l2mru"] = cur["l2_mru_asize"] + v["l2data"] = cur["l2_bufc_data_asize"] + v["l2meta"] = cur["l2_bufc_metadata_asize"] +- v["l2pref%"] = 100 * v["l2pref"] / v["l2asize"] +- v["l2mfu%"] = 100 * v["l2mfu"] / v["l2asize"] +- v["l2mru%"] = 100 * v["l2mru"] / v["l2asize"] +- v["l2data%"] = 100 * v["l2data"] / v["l2asize"] +- v["l2meta%"] = 100 * v["l2meta"] / v["l2asize"] ++ v["l2pref%"] = 100 * v["l2pref"] // v["l2asize"] ++ v["l2mfu%"] = 100 * v["l2mfu"] // v["l2asize"] ++ v["l2mru%"] = 100 * v["l2mru"] // v["l2asize"] ++ v["l2data%"] = 100 * v["l2data"] // v["l2asize"] ++ v["l2meta%"] = 100 * v["l2meta"] // v["l2asize"] + + v["grow"] = 0 if cur["arc_no_grow"] else 1 + v["need"] = cur["arc_need_free"] diff --git a/debian/patches/zfs/0011-arc-stat-summary-guard-access-to-l2arc-MFU-MRU-stats.patch b/debian/patches/zfs/0011-arc-stat-summary-guard-access-to-l2arc-MFU-MRU-stats.patch new file mode 100644 index 00000000..8de5df9c --- /dev/null +++ b/debian/patches/zfs/0011-arc-stat-summary-guard-access-to-l2arc-MFU-MRU-stats.patch @@ -0,0 +1,112 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Thomas Lamprecht +Date: Wed, 10 Nov 2021 09:29:47 +0100 +Subject: [PATCH] arc stat/summary: guard access to l2arc MFU/MRU stats + +commit 085321621e79a75bea41c2b6511da6ebfbf2ba0a added printing MFU +and MRU stats for 2.1 user space tools, but those keys are not +available in the 2.0 module. That means it may break the arcstat and +arc_summary tools after upgrade to 2.1 (user space), before a reboot +to the new 2.1 ZFS kernel-module happened, due to python raising a +KeyError on the dict access then. + +Move those two keys to a .get accessor with `0` as fallback, as it +should be better to show some possible wrong data for new stat-keys +than throwing an exception. + +Signed-off-by: Thomas Lamprecht + +also move l2_mfu_asize l2_mru_asize l2_prefetch_asize +l2_bufc_data_asize l2_bufc_metadata_asize to .get accessor +(these are only present with a cache device in the pool) +Signed-off-by: Stoiko Ivanov +--- + cmd/arc_summary/arc_summary3 | 28 ++++++++++++++-------------- + cmd/arcstat/arcstat.in | 14 +++++++------- + 2 files changed, 21 insertions(+), 21 deletions(-) + +diff --git a/cmd/arc_summary/arc_summary3 b/cmd/arc_summary/arc_summary3 +index 7b28012ed..fe6a6d9e2 100755 +--- a/cmd/arc_summary/arc_summary3 ++++ b/cmd/arc_summary/arc_summary3 +@@ -617,13 +617,13 @@ def section_arc(kstats_dict): + prt_i1('L2 cached evictions:', f_bytes(arc_stats['evict_l2_cached'])) + prt_i1('L2 eligible evictions:', f_bytes(arc_stats['evict_l2_eligible'])) + prt_i2('L2 eligible MFU evictions:', +- f_perc(arc_stats['evict_l2_eligible_mfu'], ++ f_perc(arc_stats.get('evict_l2_eligible_mfu', 0), # 2.0 module compat + arc_stats['evict_l2_eligible']), +- f_bytes(arc_stats['evict_l2_eligible_mfu'])) ++ f_bytes(arc_stats.get('evict_l2_eligible_mfu', 0))) + prt_i2('L2 eligible MRU evictions:', +- f_perc(arc_stats['evict_l2_eligible_mru'], ++ f_perc(arc_stats.get('evict_l2_eligible_mru', 0), # 2.0 module compat + arc_stats['evict_l2_eligible']), +- f_bytes(arc_stats['evict_l2_eligible_mru'])) ++ f_bytes(arc_stats.get('evict_l2_eligible_mru', 0))) + prt_i1('L2 ineligible evictions:', + f_bytes(arc_stats['evict_l2_ineligible'])) + print() +@@ -765,20 +765,20 @@ def section_l2arc(kstats_dict): + f_perc(arc_stats['l2_hdr_size'], arc_stats['l2_size']), + f_bytes(arc_stats['l2_hdr_size'])) + prt_i2('MFU allocated size:', +- f_perc(arc_stats['l2_mfu_asize'], arc_stats['l2_asize']), +- f_bytes(arc_stats['l2_mfu_asize'])) ++ f_perc(arc_stats.get('l2_mfu_asize', 0), arc_stats['l2_asize']), ++ f_bytes(arc_stats.get('l2_mfu_asize', 0))) # 2.0 module compat + prt_i2('MRU allocated size:', +- f_perc(arc_stats['l2_mru_asize'], arc_stats['l2_asize']), +- f_bytes(arc_stats['l2_mru_asize'])) ++ f_perc(arc_stats.get('l2_mru_asize', 0), arc_stats['l2_asize']), ++ f_bytes(arc_stats.get('l2_mru_asize', 0))) # 2.0 module compat + prt_i2('Prefetch allocated size:', +- f_perc(arc_stats['l2_prefetch_asize'], arc_stats['l2_asize']), +- f_bytes(arc_stats['l2_prefetch_asize'])) ++ f_perc(arc_stats.get('l2_prefetch_asize', 0), arc_stats['l2_asize']), ++ f_bytes(arc_stats.get('l2_prefetch_asize',0))) # 2.0 module compat + prt_i2('Data (buffer content) allocated size:', +- f_perc(arc_stats['l2_bufc_data_asize'], arc_stats['l2_asize']), +- f_bytes(arc_stats['l2_bufc_data_asize'])) ++ f_perc(arc_stats.get('l2_bufc_data_asize', 0), arc_stats['l2_asize']), ++ f_bytes(arc_stats.get('l2_bufc_data_asize', 0))) # 2.0 module compat + prt_i2('Metadata (buffer content) allocated size:', +- f_perc(arc_stats['l2_bufc_metadata_asize'], arc_stats['l2_asize']), +- f_bytes(arc_stats['l2_bufc_metadata_asize'])) ++ f_perc(arc_stats.get('l2_bufc_metadata_asize', 0), arc_stats['l2_asize']), ++ f_bytes(arc_stats.get('l2_bufc_metadata_asize', 0))) # 2.0 module compat + + print() + prt_1('L2ARC breakdown:', f_hits(l2_access_total)) +diff --git a/cmd/arcstat/arcstat.in b/cmd/arcstat/arcstat.in +index cd9a803a2..ea45dc602 100755 +--- a/cmd/arcstat/arcstat.in ++++ b/cmd/arcstat/arcstat.in +@@ -482,8 +482,8 @@ def calculate(): + v["el2skip"] = d["evict_l2_skip"] // sint + v["el2cach"] = d["evict_l2_cached"] // sint + v["el2el"] = d["evict_l2_eligible"] // sint +- v["el2mfu"] = d["evict_l2_eligible_mfu"] // sint +- v["el2mru"] = d["evict_l2_eligible_mru"] // sint ++ v["el2mfu"] = d.get("evict_l2_eligible_mfu", 0) // sint ++ v["el2mru"] = d.get("evict_l2_eligible_mru", 0) // sint + v["el2inel"] = d["evict_l2_ineligible"] // sint + v["mtxmis"] = d["mutex_miss"] // sint + +@@ -498,11 +498,11 @@ def calculate(): + v["l2size"] = cur["l2_size"] + v["l2bytes"] = d["l2_read_bytes"] // sint + +- v["l2pref"] = cur["l2_prefetch_asize"] +- v["l2mfu"] = cur["l2_mfu_asize"] +- v["l2mru"] = cur["l2_mru_asize"] +- v["l2data"] = cur["l2_bufc_data_asize"] +- v["l2meta"] = cur["l2_bufc_metadata_asize"] ++ v["l2pref"] = cur.get("l2_prefetch_asize", 0) ++ v["l2mfu"] = cur.get("l2_mfu_asize", 0) ++ v["l2mru"] = cur.get("l2_mru_asize", 0) ++ v["l2data"] = cur.get("l2_bufc_data_asize", 0) ++ v["l2meta"] = cur.get("l2_bufc_metadata_asize", 0) + v["l2pref%"] = 100 * v["l2pref"] // v["l2asize"] + v["l2mfu%"] = 100 * v["l2mfu"] // v["l2asize"] + v["l2mru%"] = 100 * v["l2mru"] // v["l2asize"] \ No newline at end of file diff --git a/debian/patches/zfs/0012-Fix-zvol_open-lock-inversion.patch b/debian/patches/zfs/0012-Fix-zvol_open-lock-inversion.patch new file mode 100644 index 00000000..eb74550f --- /dev/null +++ b/debian/patches/zfs/0012-Fix-zvol_open-lock-inversion.patch @@ -0,0 +1,212 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Brian Behlendorf +Date: Fri, 17 Dec 2021 09:52:13 -0800 +Subject: [PATCH] Fix zvol_open() lock inversion + +When restructuring the zvol_open() logic for the Linux 5.13 kernel +a lock inversion was accidentally introduced. In the updated code +the spa_namespace_lock is now taken before the zv_suspend_lock +allowing the following scenario to occur: + + down_read <=== waiting for zv_suspend_lock + zvol_open <=== holds spa_namespace_lock + __blkdev_get + blkdev_get_by_dev + blkdev_open + ... + + mutex_lock <== waiting for spa_namespace_lock + spa_open_common + spa_open + dsl_pool_hold + dmu_objset_hold_flags + dmu_objset_hold + dsl_prop_get + dsl_prop_get_integer + zvol_create_minor + dmu_recv_end + zfs_ioc_recv_impl <=== holds zv_suspend_lock via zvol_suspend() + zfs_ioc_recv + ... + +This commit resolves the issue by moving the acquisition of the +spa_namespace_lock back to after the zv_suspend_lock which restores +the original ordering. + +Additionally, as part of this change the error exit paths were +simplified where possible. + +Reviewed-by: Tony Hutter +Reviewed-by: Rich Ercolani +Signed-off-by: Brian Behlendorf +Closes #12863 +(cherry picked from commit 8a02d01e85556bbe3a1c6947bc11b8ef028d4023) +Signed-off-by: Stoiko Ivanov +--- + module/os/linux/zfs/zvol_os.c | 121 ++++++++++++++++------------------ + 1 file changed, 58 insertions(+), 63 deletions(-) + +diff --git a/module/os/linux/zfs/zvol_os.c b/module/os/linux/zfs/zvol_os.c +index 44caadd58..69479b3f7 100644 +--- a/module/os/linux/zfs/zvol_os.c ++++ b/module/os/linux/zfs/zvol_os.c +@@ -496,8 +496,7 @@ zvol_open(struct block_device *bdev, fmode_t flag) + { + zvol_state_t *zv; + int error = 0; +- boolean_t drop_suspend = B_TRUE; +- boolean_t drop_namespace = B_FALSE; ++ boolean_t drop_suspend = B_FALSE; + #ifndef HAVE_BLKDEV_GET_ERESTARTSYS + hrtime_t timeout = MSEC2NSEC(zvol_open_timeout_ms); + hrtime_t start = gethrtime(); +@@ -517,7 +516,36 @@ retry: + return (SET_ERROR(-ENXIO)); + } + +- if (zv->zv_open_count == 0 && !mutex_owned(&spa_namespace_lock)) { ++ mutex_enter(&zv->zv_state_lock); ++ /* ++ * Make sure zvol is not suspended during first open ++ * (hold zv_suspend_lock) and respect proper lock acquisition ++ * ordering - zv_suspend_lock before zv_state_lock ++ */ ++ if (zv->zv_open_count == 0) { ++ if (!rw_tryenter(&zv->zv_suspend_lock, RW_READER)) { ++ mutex_exit(&zv->zv_state_lock); ++ rw_enter(&zv->zv_suspend_lock, RW_READER); ++ mutex_enter(&zv->zv_state_lock); ++ /* check to see if zv_suspend_lock is needed */ ++ if (zv->zv_open_count != 0) { ++ rw_exit(&zv->zv_suspend_lock); ++ } else { ++ drop_suspend = B_TRUE; ++ } ++ } else { ++ drop_suspend = B_TRUE; ++ } ++ } ++ rw_exit(&zvol_state_lock); ++ ++ ASSERT(MUTEX_HELD(&zv->zv_state_lock)); ++ ++ if (zv->zv_open_count == 0) { ++ boolean_t drop_namespace = B_FALSE; ++ ++ ASSERT(RW_READ_HELD(&zv->zv_suspend_lock)); ++ + /* + * In all other call paths the spa_namespace_lock is taken + * before the bdev->bd_mutex lock. However, on open(2) +@@ -542,84 +570,51 @@ retry: + * the kernel so the only option is to return the error for + * the caller to handle it. + */ +- if (!mutex_tryenter(&spa_namespace_lock)) { +- rw_exit(&zvol_state_lock); ++ if (!mutex_owned(&spa_namespace_lock)) { ++ if (!mutex_tryenter(&spa_namespace_lock)) { ++ mutex_exit(&zv->zv_state_lock); ++ rw_exit(&zv->zv_suspend_lock); + + #ifdef HAVE_BLKDEV_GET_ERESTARTSYS +- schedule(); +- return (SET_ERROR(-ERESTARTSYS)); +-#else +- if ((gethrtime() - start) > timeout) ++ schedule(); + return (SET_ERROR(-ERESTARTSYS)); ++#else ++ if ((gethrtime() - start) > timeout) ++ return (SET_ERROR(-ERESTARTSYS)); + +- schedule_timeout(MSEC_TO_TICK(10)); +- goto retry; ++ schedule_timeout(MSEC_TO_TICK(10)); ++ goto retry; + #endif +- } else { +- drop_namespace = B_TRUE; +- } +- } +- +- mutex_enter(&zv->zv_state_lock); +- /* +- * make sure zvol is not suspended during first open +- * (hold zv_suspend_lock) and respect proper lock acquisition +- * ordering - zv_suspend_lock before zv_state_lock +- */ +- if (zv->zv_open_count == 0) { +- if (!rw_tryenter(&zv->zv_suspend_lock, RW_READER)) { +- mutex_exit(&zv->zv_state_lock); +- rw_enter(&zv->zv_suspend_lock, RW_READER); +- mutex_enter(&zv->zv_state_lock); +- /* check to see if zv_suspend_lock is needed */ +- if (zv->zv_open_count != 0) { +- rw_exit(&zv->zv_suspend_lock); +- drop_suspend = B_FALSE; ++ } else { ++ drop_namespace = B_TRUE; + } + } +- } else { +- drop_suspend = B_FALSE; +- } +- rw_exit(&zvol_state_lock); +- +- ASSERT(MUTEX_HELD(&zv->zv_state_lock)); + +- if (zv->zv_open_count == 0) { +- ASSERT(RW_READ_HELD(&zv->zv_suspend_lock)); + error = -zvol_first_open(zv, !(flag & FMODE_WRITE)); +- if (error) +- goto out_mutex; +- } + +- if ((flag & FMODE_WRITE) && (zv->zv_flags & ZVOL_RDONLY)) { +- error = -EROFS; +- goto out_open_count; ++ if (drop_namespace) ++ mutex_exit(&spa_namespace_lock); + } + +- zv->zv_open_count++; +- +- mutex_exit(&zv->zv_state_lock); +- if (drop_namespace) +- mutex_exit(&spa_namespace_lock); +- if (drop_suspend) +- rw_exit(&zv->zv_suspend_lock); +- +- zfs_check_media_change(bdev); +- +- return (0); ++ if (error == 0) { ++ if ((flag & FMODE_WRITE) && (zv->zv_flags & ZVOL_RDONLY)) { ++ if (zv->zv_open_count == 0) ++ zvol_last_close(zv); + +-out_open_count: +- if (zv->zv_open_count == 0) +- zvol_last_close(zv); ++ error = SET_ERROR(-EROFS); ++ } else { ++ zv->zv_open_count++; ++ } ++ } + +-out_mutex: + mutex_exit(&zv->zv_state_lock); +- if (drop_namespace) +- mutex_exit(&spa_namespace_lock); + if (drop_suspend) + rw_exit(&zv->zv_suspend_lock); + +- return (SET_ERROR(error)); ++ if (error == 0) ++ zfs_check_media_change(bdev); ++ ++ return (error); + } + + static void