From 43a7346b84815a3003157e05e15f2171d2433dfa Mon Sep 17 00:00:00 2001 From: Fabian Mastenbroek Date: Thu, 3 Feb 2022 17:13:04 +0100 Subject: [PATCH 1/3] Update ZFS patches This change adds new patches from Proxmox for the ZFS project. --- debian/patches/series.zfs | 3 + ...at-Fix-integer-division-with-python3.patch | 134 +++++++++++ ...-guard-access-to-l2arc-MFU-MRU-stats.patch | 112 +++++++++ .../0012-Fix-zvol_open-lock-inversion.patch | 212 ++++++++++++++++++ 4 files changed, 461 insertions(+) create mode 100644 debian/patches/zfs/0010-arcstat-Fix-integer-division-with-python3.patch create mode 100644 debian/patches/zfs/0011-arc-stat-summary-guard-access-to-l2arc-MFU-MRU-stats.patch create mode 100644 debian/patches/zfs/0012-Fix-zvol_open-lock-inversion.patch diff --git a/debian/patches/series.zfs b/debian/patches/series.zfs index 67a08e21..25667b65 100644 --- a/debian/patches/series.zfs +++ b/debian/patches/series.zfs @@ -7,3 +7,6 @@ zfs/0006-dont-symlink-zed-scripts.patch zfs/0007-Use-installed-python3.patch zfs/0008-Add-systemd-unit-for-importing-specific-pools.patch zfs/0009-Patch-move-manpage-arcstat-1-to-arcstat-8.patch +zfs/0010-arcstat-Fix-integer-division-with-python3.patch +zfs/0011-arc-stat-summary-guard-access-to-l2arc-MFU-MRU-stats.patch +zfs/0012-Fix-zvol_open-lock-inversion.patch diff --git a/debian/patches/zfs/0010-arcstat-Fix-integer-division-with-python3.patch b/debian/patches/zfs/0010-arcstat-Fix-integer-division-with-python3.patch new file mode 100644 index 00000000..086347f8 --- /dev/null +++ b/debian/patches/zfs/0010-arcstat-Fix-integer-division-with-python3.patch @@ -0,0 +1,134 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Valmiky Arquissandas +Date: Fri, 8 Oct 2021 16:32:27 +0100 +Subject: [PATCH] arcstat: Fix integer division with python3 + +The arcstat script requests compatibility with python2 and python3, but +PEP 238 modified the / operator and results in erroneous output when +run under python3. + +This commit replaces instances of / with //, yielding the expected +result in both versions of Python. + +Reviewed-by: Brian Behlendorf +Reviewed-by: John Kennedy +Reviewed-by: Ryan Moeller +Signed-off-by: Valmiky Arquissandas +Closes #12603 +(cherry picked from commit 2d02bba23d83ae8fede8d281edc255f01ccd28e9) +Signed-off-by: Thomas Lamprecht +--- + cmd/arcstat/arcstat.in | 66 +++++++++++++++++++++--------------------- + 1 file changed, 33 insertions(+), 33 deletions(-) + +diff --git a/cmd/arcstat/arcstat.in b/cmd/arcstat/arcstat.in +index 9e7c52a6c..cd9a803a2 100755 +--- a/cmd/arcstat/arcstat.in ++++ b/cmd/arcstat/arcstat.in +@@ -441,73 +441,73 @@ def calculate(): + + v = dict() + v["time"] = time.strftime("%H:%M:%S", time.localtime()) +- v["hits"] = d["hits"] / sint +- v["miss"] = d["misses"] / sint ++ v["hits"] = d["hits"] // sint ++ v["miss"] = d["misses"] // sint + v["read"] = v["hits"] + v["miss"] +- v["hit%"] = 100 * v["hits"] / v["read"] if v["read"] > 0 else 0 ++ v["hit%"] = 100 * v["hits"] // v["read"] if v["read"] > 0 else 0 + v["miss%"] = 100 - v["hit%"] if v["read"] > 0 else 0 + +- v["dhit"] = (d["demand_data_hits"] + d["demand_metadata_hits"]) / sint +- v["dmis"] = (d["demand_data_misses"] + d["demand_metadata_misses"]) / sint ++ v["dhit"] = (d["demand_data_hits"] + d["demand_metadata_hits"]) // sint ++ v["dmis"] = (d["demand_data_misses"] + d["demand_metadata_misses"]) // sint + + v["dread"] = v["dhit"] + v["dmis"] +- v["dh%"] = 100 * v["dhit"] / v["dread"] if v["dread"] > 0 else 0 ++ v["dh%"] = 100 * v["dhit"] // v["dread"] if v["dread"] > 0 else 0 + v["dm%"] = 100 - v["dh%"] if v["dread"] > 0 else 0 + +- v["phit"] = (d["prefetch_data_hits"] + d["prefetch_metadata_hits"]) / sint ++ v["phit"] = (d["prefetch_data_hits"] + d["prefetch_metadata_hits"]) // sint + v["pmis"] = (d["prefetch_data_misses"] + +- d["prefetch_metadata_misses"]) / sint ++ d["prefetch_metadata_misses"]) // sint + + v["pread"] = v["phit"] + v["pmis"] +- v["ph%"] = 100 * v["phit"] / v["pread"] if v["pread"] > 0 else 0 ++ v["ph%"] = 100 * v["phit"] // v["pread"] if v["pread"] > 0 else 0 + v["pm%"] = 100 - v["ph%"] if v["pread"] > 0 else 0 + + v["mhit"] = (d["prefetch_metadata_hits"] + +- d["demand_metadata_hits"]) / sint ++ d["demand_metadata_hits"]) // sint + v["mmis"] = (d["prefetch_metadata_misses"] + +- d["demand_metadata_misses"]) / sint ++ d["demand_metadata_misses"]) // sint + + v["mread"] = v["mhit"] + v["mmis"] +- v["mh%"] = 100 * v["mhit"] / v["mread"] if v["mread"] > 0 else 0 ++ v["mh%"] = 100 * v["mhit"] // v["mread"] if v["mread"] > 0 else 0 + v["mm%"] = 100 - v["mh%"] if v["mread"] > 0 else 0 + + v["arcsz"] = cur["size"] + v["size"] = cur["size"] + v["c"] = cur["c"] +- v["mfu"] = d["mfu_hits"] / sint +- v["mru"] = d["mru_hits"] / sint +- v["mrug"] = d["mru_ghost_hits"] / sint +- v["mfug"] = d["mfu_ghost_hits"] / sint +- v["eskip"] = d["evict_skip"] / sint +- v["el2skip"] = d["evict_l2_skip"] / sint +- v["el2cach"] = d["evict_l2_cached"] / sint +- v["el2el"] = d["evict_l2_eligible"] / sint +- v["el2mfu"] = d["evict_l2_eligible_mfu"] / sint +- v["el2mru"] = d["evict_l2_eligible_mru"] / sint +- v["el2inel"] = d["evict_l2_ineligible"] / sint +- v["mtxmis"] = d["mutex_miss"] / sint ++ v["mfu"] = d["mfu_hits"] // sint ++ v["mru"] = d["mru_hits"] // sint ++ v["mrug"] = d["mru_ghost_hits"] // sint ++ v["mfug"] = d["mfu_ghost_hits"] // sint ++ v["eskip"] = d["evict_skip"] // sint ++ v["el2skip"] = d["evict_l2_skip"] // sint ++ v["el2cach"] = d["evict_l2_cached"] // sint ++ v["el2el"] = d["evict_l2_eligible"] // sint ++ v["el2mfu"] = d["evict_l2_eligible_mfu"] // sint ++ v["el2mru"] = d["evict_l2_eligible_mru"] // sint ++ v["el2inel"] = d["evict_l2_ineligible"] // sint ++ v["mtxmis"] = d["mutex_miss"] // sint + + if l2exist: +- v["l2hits"] = d["l2_hits"] / sint +- v["l2miss"] = d["l2_misses"] / sint ++ v["l2hits"] = d["l2_hits"] // sint ++ v["l2miss"] = d["l2_misses"] // sint + v["l2read"] = v["l2hits"] + v["l2miss"] +- v["l2hit%"] = 100 * v["l2hits"] / v["l2read"] if v["l2read"] > 0 else 0 ++ v["l2hit%"] = 100 * v["l2hits"] // v["l2read"] if v["l2read"] > 0 else 0 + + v["l2miss%"] = 100 - v["l2hit%"] if v["l2read"] > 0 else 0 + v["l2asize"] = cur["l2_asize"] + v["l2size"] = cur["l2_size"] +- v["l2bytes"] = d["l2_read_bytes"] / sint ++ v["l2bytes"] = d["l2_read_bytes"] // sint + + v["l2pref"] = cur["l2_prefetch_asize"] + v["l2mfu"] = cur["l2_mfu_asize"] + v["l2mru"] = cur["l2_mru_asize"] + v["l2data"] = cur["l2_bufc_data_asize"] + v["l2meta"] = cur["l2_bufc_metadata_asize"] +- v["l2pref%"] = 100 * v["l2pref"] / v["l2asize"] +- v["l2mfu%"] = 100 * v["l2mfu"] / v["l2asize"] +- v["l2mru%"] = 100 * v["l2mru"] / v["l2asize"] +- v["l2data%"] = 100 * v["l2data"] / v["l2asize"] +- v["l2meta%"] = 100 * v["l2meta"] / v["l2asize"] ++ v["l2pref%"] = 100 * v["l2pref"] // v["l2asize"] ++ v["l2mfu%"] = 100 * v["l2mfu"] // v["l2asize"] ++ v["l2mru%"] = 100 * v["l2mru"] // v["l2asize"] ++ v["l2data%"] = 100 * v["l2data"] // v["l2asize"] ++ v["l2meta%"] = 100 * v["l2meta"] // v["l2asize"] + + v["grow"] = 0 if cur["arc_no_grow"] else 1 + v["need"] = cur["arc_need_free"] diff --git a/debian/patches/zfs/0011-arc-stat-summary-guard-access-to-l2arc-MFU-MRU-stats.patch b/debian/patches/zfs/0011-arc-stat-summary-guard-access-to-l2arc-MFU-MRU-stats.patch new file mode 100644 index 00000000..8de5df9c --- /dev/null +++ b/debian/patches/zfs/0011-arc-stat-summary-guard-access-to-l2arc-MFU-MRU-stats.patch @@ -0,0 +1,112 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Thomas Lamprecht +Date: Wed, 10 Nov 2021 09:29:47 +0100 +Subject: [PATCH] arc stat/summary: guard access to l2arc MFU/MRU stats + +commit 085321621e79a75bea41c2b6511da6ebfbf2ba0a added printing MFU +and MRU stats for 2.1 user space tools, but those keys are not +available in the 2.0 module. That means it may break the arcstat and +arc_summary tools after upgrade to 2.1 (user space), before a reboot +to the new 2.1 ZFS kernel-module happened, due to python raising a +KeyError on the dict access then. + +Move those two keys to a .get accessor with `0` as fallback, as it +should be better to show some possible wrong data for new stat-keys +than throwing an exception. + +Signed-off-by: Thomas Lamprecht + +also move l2_mfu_asize l2_mru_asize l2_prefetch_asize +l2_bufc_data_asize l2_bufc_metadata_asize to .get accessor +(these are only present with a cache device in the pool) +Signed-off-by: Stoiko Ivanov +--- + cmd/arc_summary/arc_summary3 | 28 ++++++++++++++-------------- + cmd/arcstat/arcstat.in | 14 +++++++------- + 2 files changed, 21 insertions(+), 21 deletions(-) + +diff --git a/cmd/arc_summary/arc_summary3 b/cmd/arc_summary/arc_summary3 +index 7b28012ed..fe6a6d9e2 100755 +--- a/cmd/arc_summary/arc_summary3 ++++ b/cmd/arc_summary/arc_summary3 +@@ -617,13 +617,13 @@ def section_arc(kstats_dict): + prt_i1('L2 cached evictions:', f_bytes(arc_stats['evict_l2_cached'])) + prt_i1('L2 eligible evictions:', f_bytes(arc_stats['evict_l2_eligible'])) + prt_i2('L2 eligible MFU evictions:', +- f_perc(arc_stats['evict_l2_eligible_mfu'], ++ f_perc(arc_stats.get('evict_l2_eligible_mfu', 0), # 2.0 module compat + arc_stats['evict_l2_eligible']), +- f_bytes(arc_stats['evict_l2_eligible_mfu'])) ++ f_bytes(arc_stats.get('evict_l2_eligible_mfu', 0))) + prt_i2('L2 eligible MRU evictions:', +- f_perc(arc_stats['evict_l2_eligible_mru'], ++ f_perc(arc_stats.get('evict_l2_eligible_mru', 0), # 2.0 module compat + arc_stats['evict_l2_eligible']), +- f_bytes(arc_stats['evict_l2_eligible_mru'])) ++ f_bytes(arc_stats.get('evict_l2_eligible_mru', 0))) + prt_i1('L2 ineligible evictions:', + f_bytes(arc_stats['evict_l2_ineligible'])) + print() +@@ -765,20 +765,20 @@ def section_l2arc(kstats_dict): + f_perc(arc_stats['l2_hdr_size'], arc_stats['l2_size']), + f_bytes(arc_stats['l2_hdr_size'])) + prt_i2('MFU allocated size:', +- f_perc(arc_stats['l2_mfu_asize'], arc_stats['l2_asize']), +- f_bytes(arc_stats['l2_mfu_asize'])) ++ f_perc(arc_stats.get('l2_mfu_asize', 0), arc_stats['l2_asize']), ++ f_bytes(arc_stats.get('l2_mfu_asize', 0))) # 2.0 module compat + prt_i2('MRU allocated size:', +- f_perc(arc_stats['l2_mru_asize'], arc_stats['l2_asize']), +- f_bytes(arc_stats['l2_mru_asize'])) ++ f_perc(arc_stats.get('l2_mru_asize', 0), arc_stats['l2_asize']), ++ f_bytes(arc_stats.get('l2_mru_asize', 0))) # 2.0 module compat + prt_i2('Prefetch allocated size:', +- f_perc(arc_stats['l2_prefetch_asize'], arc_stats['l2_asize']), +- f_bytes(arc_stats['l2_prefetch_asize'])) ++ f_perc(arc_stats.get('l2_prefetch_asize', 0), arc_stats['l2_asize']), ++ f_bytes(arc_stats.get('l2_prefetch_asize',0))) # 2.0 module compat + prt_i2('Data (buffer content) allocated size:', +- f_perc(arc_stats['l2_bufc_data_asize'], arc_stats['l2_asize']), +- f_bytes(arc_stats['l2_bufc_data_asize'])) ++ f_perc(arc_stats.get('l2_bufc_data_asize', 0), arc_stats['l2_asize']), ++ f_bytes(arc_stats.get('l2_bufc_data_asize', 0))) # 2.0 module compat + prt_i2('Metadata (buffer content) allocated size:', +- f_perc(arc_stats['l2_bufc_metadata_asize'], arc_stats['l2_asize']), +- f_bytes(arc_stats['l2_bufc_metadata_asize'])) ++ f_perc(arc_stats.get('l2_bufc_metadata_asize', 0), arc_stats['l2_asize']), ++ f_bytes(arc_stats.get('l2_bufc_metadata_asize', 0))) # 2.0 module compat + + print() + prt_1('L2ARC breakdown:', f_hits(l2_access_total)) +diff --git a/cmd/arcstat/arcstat.in b/cmd/arcstat/arcstat.in +index cd9a803a2..ea45dc602 100755 +--- a/cmd/arcstat/arcstat.in ++++ b/cmd/arcstat/arcstat.in +@@ -482,8 +482,8 @@ def calculate(): + v["el2skip"] = d["evict_l2_skip"] // sint + v["el2cach"] = d["evict_l2_cached"] // sint + v["el2el"] = d["evict_l2_eligible"] // sint +- v["el2mfu"] = d["evict_l2_eligible_mfu"] // sint +- v["el2mru"] = d["evict_l2_eligible_mru"] // sint ++ v["el2mfu"] = d.get("evict_l2_eligible_mfu", 0) // sint ++ v["el2mru"] = d.get("evict_l2_eligible_mru", 0) // sint + v["el2inel"] = d["evict_l2_ineligible"] // sint + v["mtxmis"] = d["mutex_miss"] // sint + +@@ -498,11 +498,11 @@ def calculate(): + v["l2size"] = cur["l2_size"] + v["l2bytes"] = d["l2_read_bytes"] // sint + +- v["l2pref"] = cur["l2_prefetch_asize"] +- v["l2mfu"] = cur["l2_mfu_asize"] +- v["l2mru"] = cur["l2_mru_asize"] +- v["l2data"] = cur["l2_bufc_data_asize"] +- v["l2meta"] = cur["l2_bufc_metadata_asize"] ++ v["l2pref"] = cur.get("l2_prefetch_asize", 0) ++ v["l2mfu"] = cur.get("l2_mfu_asize", 0) ++ v["l2mru"] = cur.get("l2_mru_asize", 0) ++ v["l2data"] = cur.get("l2_bufc_data_asize", 0) ++ v["l2meta"] = cur.get("l2_bufc_metadata_asize", 0) + v["l2pref%"] = 100 * v["l2pref"] // v["l2asize"] + v["l2mfu%"] = 100 * v["l2mfu"] // v["l2asize"] + v["l2mru%"] = 100 * v["l2mru"] // v["l2asize"] \ No newline at end of file diff --git a/debian/patches/zfs/0012-Fix-zvol_open-lock-inversion.patch b/debian/patches/zfs/0012-Fix-zvol_open-lock-inversion.patch new file mode 100644 index 00000000..eb74550f --- /dev/null +++ b/debian/patches/zfs/0012-Fix-zvol_open-lock-inversion.patch @@ -0,0 +1,212 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Brian Behlendorf +Date: Fri, 17 Dec 2021 09:52:13 -0800 +Subject: [PATCH] Fix zvol_open() lock inversion + +When restructuring the zvol_open() logic for the Linux 5.13 kernel +a lock inversion was accidentally introduced. In the updated code +the spa_namespace_lock is now taken before the zv_suspend_lock +allowing the following scenario to occur: + + down_read <=== waiting for zv_suspend_lock + zvol_open <=== holds spa_namespace_lock + __blkdev_get + blkdev_get_by_dev + blkdev_open + ... + + mutex_lock <== waiting for spa_namespace_lock + spa_open_common + spa_open + dsl_pool_hold + dmu_objset_hold_flags + dmu_objset_hold + dsl_prop_get + dsl_prop_get_integer + zvol_create_minor + dmu_recv_end + zfs_ioc_recv_impl <=== holds zv_suspend_lock via zvol_suspend() + zfs_ioc_recv + ... + +This commit resolves the issue by moving the acquisition of the +spa_namespace_lock back to after the zv_suspend_lock which restores +the original ordering. + +Additionally, as part of this change the error exit paths were +simplified where possible. + +Reviewed-by: Tony Hutter +Reviewed-by: Rich Ercolani +Signed-off-by: Brian Behlendorf +Closes #12863 +(cherry picked from commit 8a02d01e85556bbe3a1c6947bc11b8ef028d4023) +Signed-off-by: Stoiko Ivanov +--- + module/os/linux/zfs/zvol_os.c | 121 ++++++++++++++++------------------ + 1 file changed, 58 insertions(+), 63 deletions(-) + +diff --git a/module/os/linux/zfs/zvol_os.c b/module/os/linux/zfs/zvol_os.c +index 44caadd58..69479b3f7 100644 +--- a/module/os/linux/zfs/zvol_os.c ++++ b/module/os/linux/zfs/zvol_os.c +@@ -496,8 +496,7 @@ zvol_open(struct block_device *bdev, fmode_t flag) + { + zvol_state_t *zv; + int error = 0; +- boolean_t drop_suspend = B_TRUE; +- boolean_t drop_namespace = B_FALSE; ++ boolean_t drop_suspend = B_FALSE; + #ifndef HAVE_BLKDEV_GET_ERESTARTSYS + hrtime_t timeout = MSEC2NSEC(zvol_open_timeout_ms); + hrtime_t start = gethrtime(); +@@ -517,7 +516,36 @@ retry: + return (SET_ERROR(-ENXIO)); + } + +- if (zv->zv_open_count == 0 && !mutex_owned(&spa_namespace_lock)) { ++ mutex_enter(&zv->zv_state_lock); ++ /* ++ * Make sure zvol is not suspended during first open ++ * (hold zv_suspend_lock) and respect proper lock acquisition ++ * ordering - zv_suspend_lock before zv_state_lock ++ */ ++ if (zv->zv_open_count == 0) { ++ if (!rw_tryenter(&zv->zv_suspend_lock, RW_READER)) { ++ mutex_exit(&zv->zv_state_lock); ++ rw_enter(&zv->zv_suspend_lock, RW_READER); ++ mutex_enter(&zv->zv_state_lock); ++ /* check to see if zv_suspend_lock is needed */ ++ if (zv->zv_open_count != 0) { ++ rw_exit(&zv->zv_suspend_lock); ++ } else { ++ drop_suspend = B_TRUE; ++ } ++ } else { ++ drop_suspend = B_TRUE; ++ } ++ } ++ rw_exit(&zvol_state_lock); ++ ++ ASSERT(MUTEX_HELD(&zv->zv_state_lock)); ++ ++ if (zv->zv_open_count == 0) { ++ boolean_t drop_namespace = B_FALSE; ++ ++ ASSERT(RW_READ_HELD(&zv->zv_suspend_lock)); ++ + /* + * In all other call paths the spa_namespace_lock is taken + * before the bdev->bd_mutex lock. However, on open(2) +@@ -542,84 +570,51 @@ retry: + * the kernel so the only option is to return the error for + * the caller to handle it. + */ +- if (!mutex_tryenter(&spa_namespace_lock)) { +- rw_exit(&zvol_state_lock); ++ if (!mutex_owned(&spa_namespace_lock)) { ++ if (!mutex_tryenter(&spa_namespace_lock)) { ++ mutex_exit(&zv->zv_state_lock); ++ rw_exit(&zv->zv_suspend_lock); + + #ifdef HAVE_BLKDEV_GET_ERESTARTSYS +- schedule(); +- return (SET_ERROR(-ERESTARTSYS)); +-#else +- if ((gethrtime() - start) > timeout) ++ schedule(); + return (SET_ERROR(-ERESTARTSYS)); ++#else ++ if ((gethrtime() - start) > timeout) ++ return (SET_ERROR(-ERESTARTSYS)); + +- schedule_timeout(MSEC_TO_TICK(10)); +- goto retry; ++ schedule_timeout(MSEC_TO_TICK(10)); ++ goto retry; + #endif +- } else { +- drop_namespace = B_TRUE; +- } +- } +- +- mutex_enter(&zv->zv_state_lock); +- /* +- * make sure zvol is not suspended during first open +- * (hold zv_suspend_lock) and respect proper lock acquisition +- * ordering - zv_suspend_lock before zv_state_lock +- */ +- if (zv->zv_open_count == 0) { +- if (!rw_tryenter(&zv->zv_suspend_lock, RW_READER)) { +- mutex_exit(&zv->zv_state_lock); +- rw_enter(&zv->zv_suspend_lock, RW_READER); +- mutex_enter(&zv->zv_state_lock); +- /* check to see if zv_suspend_lock is needed */ +- if (zv->zv_open_count != 0) { +- rw_exit(&zv->zv_suspend_lock); +- drop_suspend = B_FALSE; ++ } else { ++ drop_namespace = B_TRUE; + } + } +- } else { +- drop_suspend = B_FALSE; +- } +- rw_exit(&zvol_state_lock); +- +- ASSERT(MUTEX_HELD(&zv->zv_state_lock)); + +- if (zv->zv_open_count == 0) { +- ASSERT(RW_READ_HELD(&zv->zv_suspend_lock)); + error = -zvol_first_open(zv, !(flag & FMODE_WRITE)); +- if (error) +- goto out_mutex; +- } + +- if ((flag & FMODE_WRITE) && (zv->zv_flags & ZVOL_RDONLY)) { +- error = -EROFS; +- goto out_open_count; ++ if (drop_namespace) ++ mutex_exit(&spa_namespace_lock); + } + +- zv->zv_open_count++; +- +- mutex_exit(&zv->zv_state_lock); +- if (drop_namespace) +- mutex_exit(&spa_namespace_lock); +- if (drop_suspend) +- rw_exit(&zv->zv_suspend_lock); +- +- zfs_check_media_change(bdev); +- +- return (0); ++ if (error == 0) { ++ if ((flag & FMODE_WRITE) && (zv->zv_flags & ZVOL_RDONLY)) { ++ if (zv->zv_open_count == 0) ++ zvol_last_close(zv); + +-out_open_count: +- if (zv->zv_open_count == 0) +- zvol_last_close(zv); ++ error = SET_ERROR(-EROFS); ++ } else { ++ zv->zv_open_count++; ++ } ++ } + +-out_mutex: + mutex_exit(&zv->zv_state_lock); +- if (drop_namespace) +- mutex_exit(&spa_namespace_lock); + if (drop_suspend) + rw_exit(&zv->zv_suspend_lock); + +- return (SET_ERROR(error)); ++ if (error == 0) ++ zfs_check_media_change(bdev); ++ ++ return (error); + } + + static void From 8963a3b71da1ca3e8c67284a42ff43044cd652eb Mon Sep 17 00:00:00 2001 From: Fabian Mastenbroek Date: Fri, 4 Feb 2022 10:44:55 +0100 Subject: [PATCH 2/3] Synchronize kernel patches with upstream This change updates the kernel patches with the patches used by Proxmox. Furthermore, we split the patches from Proxmox and Ubuntu, so that we don't have to re-order them. --- ...sce-host-bridge-contiguous-apertures.patch | 104 +++++++++++++ ...I-Coalesce-host-bridge-contiguous-ap.patch | 112 +++++++++++++ ...ate-split-BTF-type-info-per-default.patch} | 0 ...group-always-terminate-io.stat-lines.patch | 147 ++++++++++++++++++ debian/patches/series.linux | 14 +- ...parmor-compatibility-v2.x-net-rules.patch} | 0 .../0002-apparmor-af_unix-mediation.patch} | 0 ...diating-locking-non-fs-unix-sockets.patch} | 0 ...fix-use-after-free-in-sk_peer_label.patch} | 0 9 files changed, 372 insertions(+), 5 deletions(-) create mode 100644 debian/patches/pve/0006-Revert-PCI-Coalesce-host-bridge-contiguous-apertures.patch create mode 100644 debian/patches/pve/0007-PCI-Reinstate-PCI-Coalesce-host-bridge-contiguous-ap.patch rename debian/patches/pve/{0006-disable-split-btf.patch => 0008-do-not-generate-split-BTF-type-info-per-default.patch} (100%) create mode 100644 debian/patches/pve/0009-blk-cgroup-always-terminate-io.stat-lines.patch rename debian/patches/{pve/0007-apparmor-compatibility-v2.x-net-rules.patch => ubuntu/0001-apparmor-compatibility-v2.x-net-rules.patch} (100%) rename debian/patches/{pve/0008-apparmor-af_unix-mediation.patch => ubuntu/0002-apparmor-af_unix-mediation.patch} (100%) rename debian/patches/{pve/0009-apparmor-fix-apparmor-mediating-locking-non-fs-unix-sockets.patch => ubuntu/0003-apparmor-fix-apparmor-mediating-locking-non-fs-unix-sockets.patch} (100%) rename debian/patches/{pve/0010-apparmor-fix-use-after-free-in-sk_peer_label.patch => ubuntu/0004-apparmor-fix-use-after-free-in-sk_peer_label.patch} (100%) diff --git a/debian/patches/pve/0006-Revert-PCI-Coalesce-host-bridge-contiguous-apertures.patch b/debian/patches/pve/0006-Revert-PCI-Coalesce-host-bridge-contiguous-apertures.patch new file mode 100644 index 00000000..d38d1a9e --- /dev/null +++ b/debian/patches/pve/0006-Revert-PCI-Coalesce-host-bridge-contiguous-apertures.patch @@ -0,0 +1,104 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Thomas Lamprecht +Date: Mon, 27 Sep 2021 11:28:39 +0200 +Subject: [PATCH] Revert "PCI: Coalesce host bridge contiguous apertures" +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +This reverts commit ab20e43b20b60f5cc8e2ea3763ffa388158469ac. + +was reverted upstream because of reports similar to + +Link: https://bugzilla.proxmox.com/show_bug.cgi?id=3552 +Link: https://lore.kernel.org/r/20210709231529.GA3270116@roeck-us.net +Signed-off-by: Fabian Grünbichler +Signed-off-by: Thomas Lamprecht +--- + drivers/pci/probe.c | 50 ++++----------------------------------------- + 1 file changed, 4 insertions(+), 46 deletions(-) + +diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c +index cb70d2605e97..258350f80f6c 100644 +--- a/drivers/pci/probe.c ++++ b/drivers/pci/probe.c +@@ -20,7 +20,6 @@ + #include + #include + #include +-#include + #include "pci.h" + + #define CARDBUS_LATENCY_TIMER 176 /* secondary latency timer */ +@@ -881,31 +880,14 @@ static void pci_set_bus_msi_domain(struct pci_bus *bus) + dev_set_msi_domain(&bus->dev, d); + } + +-static int res_cmp(void *priv, const struct list_head *a, +- const struct list_head *b) +-{ +- struct resource_entry *entry1, *entry2; +- +- entry1 = container_of(a, struct resource_entry, node); +- entry2 = container_of(b, struct resource_entry, node); +- +- if (entry1->res->flags != entry2->res->flags) +- return entry1->res->flags > entry2->res->flags; +- +- if (entry1->offset != entry2->offset) +- return entry1->offset > entry2->offset; +- +- return entry1->res->start > entry2->res->start; +-} +- + static int pci_register_host_bridge(struct pci_host_bridge *bridge) + { + struct device *parent = bridge->dev.parent; +- struct resource_entry *window, *next, *n; ++ struct resource_entry *window, *n; + struct pci_bus *bus, *b; +- resource_size_t offset, next_offset; ++ resource_size_t offset; + LIST_HEAD(resources); +- struct resource *res, *next_res; ++ struct resource *res; + char addr[64], *fmt; + const char *name; + int err; +@@ -988,35 +970,11 @@ static int pci_register_host_bridge(struct pci_host_bridge *bridge) + if (nr_node_ids > 1 && pcibus_to_node(bus) == NUMA_NO_NODE) + dev_warn(&bus->dev, "Unknown NUMA node; performance will be reduced\n"); + +- /* Sort and coalesce contiguous windows */ +- list_sort(NULL, &resources, res_cmp); +- resource_list_for_each_entry_safe(window, n, &resources) { +- if (list_is_last(&window->node, &resources)) +- break; +- +- next = list_next_entry(window, node); +- offset = window->offset; +- res = window->res; +- next_offset = next->offset; +- next_res = next->res; +- +- if (res->flags != next_res->flags || offset != next_offset) +- continue; +- +- if (res->end + 1 == next_res->start) { +- next_res->start = res->start; +- res->flags = res->start = res->end = 0; +- } +- } +- + /* Add initial resources to the bus */ + resource_list_for_each_entry_safe(window, n, &resources) { ++ list_move_tail(&window->node, &bridge->windows); + offset = window->offset; + res = window->res; +- if (!res->end) +- continue; +- +- list_move_tail(&window->node, &bridge->windows); + + if (res->flags & IORESOURCE_BUS) + pci_bus_insert_busn_res(bus, bus->number, res->end); \ No newline at end of file diff --git a/debian/patches/pve/0007-PCI-Reinstate-PCI-Coalesce-host-bridge-contiguous-ap.patch b/debian/patches/pve/0007-PCI-Reinstate-PCI-Coalesce-host-bridge-contiguous-ap.patch new file mode 100644 index 00000000..c46b7192 --- /dev/null +++ b/debian/patches/pve/0007-PCI-Reinstate-PCI-Coalesce-host-bridge-contiguous-ap.patch @@ -0,0 +1,112 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Kai-Heng Feng +Date: Tue, 13 Jul 2021 20:50:07 +0800 +Subject: [PATCH] PCI: Reinstate "PCI: Coalesce host bridge contiguous + apertures" +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +Built-in graphics on HP EliteDesk 805 G6 doesn't work because graphics +can't get the BAR it needs: + pci_bus 0000:00: root bus resource [mem 0x10020200000-0x100303fffff window] + pci_bus 0000:00: root bus resource [mem 0x10030400000-0x100401fffff window] + + pci 0000:00:08.1: bridge window [mem 0xd2000000-0xd23fffff] + pci 0000:00:08.1: bridge window [mem 0x10030000000-0x100401fffff 64bit pref] + pci 0000:00:08.1: can't claim BAR 15 [mem 0x10030000000-0x100401fffff 64bit pref]: no compatible bridge window + pci 0000:00:08.1: [mem 0x10030000000-0x100401fffff 64bit pref] clipped to [mem 0x10030000000-0x100303fffff 64bit pref] + pci 0000:00:08.1: bridge window [mem 0x10030000000-0x100303fffff 64bit pref] + pci 0000:07:00.0: can't claim BAR 0 [mem 0x10030000000-0x1003fffffff 64bit pref]: no compatible bridge window + pci 0000:07:00.0: can't claim BAR 2 [mem 0x10040000000-0x100401fffff 64bit pref]: no compatible bridge window + +However, the root bus has two contiguous apertures that can contain the +child resource requested. + +Coalesce contiguous apertures so we can allocate from the entire contiguous +region. + +This is the second take of commit 65db04053efe ("PCI: Coalesce host +bridge contiguous apertures"). The original approach sorts the apertures +by address, but that makes NVMe stop working on QEMU ppc:sam460ex: + PCI host bridge to bus 0002:00 + pci_bus 0002:00: root bus resource [io 0x0000-0xffff] + pci_bus 0002:00: root bus resource [mem 0xd80000000-0xdffffffff] (bus address [0x80000000-0xffffffff]) + pci_bus 0002:00: root bus resource [mem 0xc0ee00000-0xc0eefffff] (bus address [0x00000000-0x000fffff]) + +After the offending commit: + PCI host bridge to bus 0002:00 + pci_bus 0002:00: root bus resource [io 0x0000-0xffff] + pci_bus 0002:00: root bus resource [mem 0xc0ee00000-0xc0eefffff] (bus address [0x00000000-0x000fffff]) + pci_bus 0002:00: root bus resource [mem 0xd80000000-0xdffffffff] (bus address [0x80000000-0xffffffff]) + +Since the apertures on HP EliteDesk 805 G6 are already in ascending +order, doing a precautious sorting is not necessary. + +Remove the sorting part to avoid the regression on ppc:sam460ex. + +Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=212013 +Cc: Guenter Roeck +Suggested-by: Bjorn Helgaas +Signed-off-by: Kai-Heng Feng +Signed-off-by: Fabian Grünbichler +Signed-off-by: Thomas Lamprecht +--- + drivers/pci/probe.c | 31 +++++++++++++++++++++++++++---- + 1 file changed, 27 insertions(+), 4 deletions(-) + +diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c +index 258350f80f6c..7ff9fcec365b 100644 +--- a/drivers/pci/probe.c ++++ b/drivers/pci/probe.c +@@ -883,11 +883,11 @@ static void pci_set_bus_msi_domain(struct pci_bus *bus) + static int pci_register_host_bridge(struct pci_host_bridge *bridge) + { + struct device *parent = bridge->dev.parent; +- struct resource_entry *window, *n; ++ struct resource_entry *window, *next, *n; + struct pci_bus *bus, *b; +- resource_size_t offset; ++ resource_size_t offset, next_offset; + LIST_HEAD(resources); +- struct resource *res; ++ struct resource *res, *next_res; + char addr[64], *fmt; + const char *name; + int err; +@@ -970,11 +970,34 @@ static int pci_register_host_bridge(struct pci_host_bridge *bridge) + if (nr_node_ids > 1 && pcibus_to_node(bus) == NUMA_NO_NODE) + dev_warn(&bus->dev, "Unknown NUMA node; performance will be reduced\n"); + ++ /* Coalesce contiguous windows */ ++ resource_list_for_each_entry_safe(window, n, &resources) { ++ if (list_is_last(&window->node, &resources)) ++ break; ++ ++ next = list_next_entry(window, node); ++ offset = window->offset; ++ res = window->res; ++ next_offset = next->offset; ++ next_res = next->res; ++ ++ if (res->flags != next_res->flags || offset != next_offset) ++ continue; ++ ++ if (res->end + 1 == next_res->start) { ++ next_res->start = res->start; ++ res->flags = res->start = res->end = 0; ++ } ++ } ++ + /* Add initial resources to the bus */ + resource_list_for_each_entry_safe(window, n, &resources) { +- list_move_tail(&window->node, &bridge->windows); + offset = window->offset; + res = window->res; ++ if (!res->end) ++ continue; ++ ++ list_move_tail(&window->node, &bridge->windows); + + if (res->flags & IORESOURCE_BUS) + pci_bus_insert_busn_res(bus, bus->number, res->end); \ No newline at end of file diff --git a/debian/patches/pve/0006-disable-split-btf.patch b/debian/patches/pve/0008-do-not-generate-split-BTF-type-info-per-default.patch similarity index 100% rename from debian/patches/pve/0006-disable-split-btf.patch rename to debian/patches/pve/0008-do-not-generate-split-BTF-type-info-per-default.patch diff --git a/debian/patches/pve/0009-blk-cgroup-always-terminate-io.stat-lines.patch b/debian/patches/pve/0009-blk-cgroup-always-terminate-io.stat-lines.patch new file mode 100644 index 00000000..c4221f78 --- /dev/null +++ b/debian/patches/pve/0009-blk-cgroup-always-terminate-io.stat-lines.patch @@ -0,0 +1,147 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Wolfgang Bumiller +Date: Tue, 11 Jan 2022 09:31:59 +0100 +Subject: [PATCH] blk-cgroup: always terminate io.stat lines + +With the removal of seq_get_buf in blkcg_print_one_stat, we +cannot make adding the newline conditional on there being +relevant stats because the name was already written out +unconditionally. +Otherwise we may end up with multiple device names in one +line which is confusing and doesn't follow the nested-keyed +file format. + +Signed-off-by: Wolfgang Bumiller +Fixes: 252c651a4c85 ("blk-cgroup: stop using seq_get_buf") +Signed-off-by: Thomas Lamprecht +--- + block/blk-cgroup.c | 9 ++------- + block/blk-iocost.c | 5 ++--- + block/blk-iolatency.c | 8 +++----- + include/linux/blk-cgroup.h | 2 +- + 4 files changed, 8 insertions(+), 16 deletions(-) + +diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c +index 0eec59e4df65..38c62a44905a 100644 +--- a/block/blk-cgroup.c ++++ b/block/blk-cgroup.c +@@ -887,7 +887,6 @@ static void blkcg_print_one_stat(struct blkcg_gq *blkg, struct seq_file *s) + { + struct blkg_iostat_set *bis = &blkg->iostat; + u64 rbytes, wbytes, rios, wios, dbytes, dios; +- bool has_stats = false; + const char *dname; + unsigned seq; + int i; +@@ -913,14 +912,12 @@ static void blkcg_print_one_stat(struct blkcg_gq *blkg, struct seq_file *s) + } while (u64_stats_fetch_retry(&bis->sync, seq)); + + if (rbytes || wbytes || rios || wios) { +- has_stats = true; + seq_printf(s, "rbytes=%llu wbytes=%llu rios=%llu wios=%llu dbytes=%llu dios=%llu", + rbytes, wbytes, rios, wios, + dbytes, dios); + } + + if (blkcg_debug_stats && atomic_read(&blkg->use_delay)) { +- has_stats = true; + seq_printf(s, " use_delay=%d delay_nsec=%llu", + atomic_read(&blkg->use_delay), + atomic64_read(&blkg->delay_nsec)); +@@ -932,12 +929,10 @@ static void blkcg_print_one_stat(struct blkcg_gq *blkg, struct seq_file *s) + if (!blkg->pd[i] || !pol->pd_stat_fn) + continue; + +- if (pol->pd_stat_fn(blkg->pd[i], s)) +- has_stats = true; ++ pol->pd_stat_fn(blkg->pd[i], s); + } + +- if (has_stats) +- seq_printf(s, "\n"); ++ seq_puts(s, "\n"); + } + + static int blkcg_print_stat(struct seq_file *sf, void *v) +diff --git a/block/blk-iocost.c b/block/blk-iocost.c +index eb7b0d6bd11f..381c28f9561e 100644 +--- a/block/blk-iocost.c ++++ b/block/blk-iocost.c +@@ -2995,13 +2995,13 @@ static void ioc_pd_free(struct blkg_policy_data *pd) + kfree(iocg); + } + +-static bool ioc_pd_stat(struct blkg_policy_data *pd, struct seq_file *s) ++static void ioc_pd_stat(struct blkg_policy_data *pd, struct seq_file *s) + { + struct ioc_gq *iocg = pd_to_iocg(pd); + struct ioc *ioc = iocg->ioc; + + if (!ioc->enabled) +- return false; ++ return; + + if (iocg->level == 0) { + unsigned vp10k = DIV64_U64_ROUND_CLOSEST( +@@ -3017,7 +3017,6 @@ static bool ioc_pd_stat(struct blkg_policy_data *pd, struct seq_file *s) + iocg->last_stat.wait_us, + iocg->last_stat.indebt_us, + iocg->last_stat.indelay_us); +- return true; + } + + static u64 ioc_weight_prfill(struct seq_file *sf, struct blkg_policy_data *pd, +diff --git a/block/blk-iolatency.c b/block/blk-iolatency.c +index c0545f9da549..d33460f3d43d 100644 +--- a/block/blk-iolatency.c ++++ b/block/blk-iolatency.c +@@ -890,7 +890,7 @@ static int iolatency_print_limit(struct seq_file *sf, void *v) + return 0; + } + +-static bool iolatency_ssd_stat(struct iolatency_grp *iolat, struct seq_file *s) ++static void iolatency_ssd_stat(struct iolatency_grp *iolat, struct seq_file *s) + { + struct latency_stat stat; + int cpu; +@@ -913,17 +913,16 @@ static bool iolatency_ssd_stat(struct iolatency_grp *iolat, struct seq_file *s) + (unsigned long long)stat.ps.missed, + (unsigned long long)stat.ps.total, + iolat->rq_depth.max_depth); +- return true; + } + +-static bool iolatency_pd_stat(struct blkg_policy_data *pd, struct seq_file *s) ++static void iolatency_pd_stat(struct blkg_policy_data *pd, struct seq_file *s) + { + struct iolatency_grp *iolat = pd_to_lat(pd); + unsigned long long avg_lat; + unsigned long long cur_win; + + if (!blkcg_debug_stats) +- return false; ++ return; + + if (iolat->ssd) + return iolatency_ssd_stat(iolat, s); +@@ -936,7 +935,6 @@ static bool iolatency_pd_stat(struct blkg_policy_data *pd, struct seq_file *s) + else + seq_printf(s, " depth=%u avg_lat=%llu win=%llu", + iolat->rq_depth.max_depth, avg_lat, cur_win); +- return true; + } + + static struct blkg_policy_data *iolatency_pd_alloc(gfp_t gfp, +diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h +index b4de2010fba5..132e05ed6935 100644 +--- a/include/linux/blk-cgroup.h ++++ b/include/linux/blk-cgroup.h +@@ -152,7 +152,7 @@ typedef void (blkcg_pol_online_pd_fn)(struct blkg_policy_data *pd); + typedef void (blkcg_pol_offline_pd_fn)(struct blkg_policy_data *pd); + typedef void (blkcg_pol_free_pd_fn)(struct blkg_policy_data *pd); + typedef void (blkcg_pol_reset_pd_stats_fn)(struct blkg_policy_data *pd); +-typedef bool (blkcg_pol_stat_pd_fn)(struct blkg_policy_data *pd, ++typedef void (blkcg_pol_stat_pd_fn)(struct blkg_policy_data *pd, + struct seq_file *s); + + struct blkcg_policy { \ No newline at end of file diff --git a/debian/patches/series.linux b/debian/patches/series.linux index 64f0feb1..9d565a61 100644 --- a/debian/patches/series.linux +++ b/debian/patches/series.linux @@ -1,10 +1,14 @@ +ubuntu/0001-apparmor-compatibility-v2.x-net-rules.patch +ubuntu/0002-apparmor-af_unix-mediation.patch +ubuntu/0003-apparmor-fix-apparmor-mediating-locking-non-fs-unix-sockets.patch +ubuntu/0004-apparmor-fix-use-after-free-in-sk_peer_label.patch + pve/0001-Make-mkcompile_h-accept-an-alternate-timestamp-strin.patch pve/0002-bridge-keep-MAC-of-first-assigned-port.patch pve/0003-pci-Enable-overrides-for-missing-ACS-capabilities-4..patch pve/0004-kvm-disable-default-dynamic-halt-polling-growth.patch pve/0005-net-core-downgrade-unregister_netdevice-refcount-lea.patch -pve/0006-disable-split-btf.patch -pve/0007-apparmor-compatibility-v2.x-net-rules.patch -pve/0008-apparmor-af_unix-mediation.patch -pve/0009-apparmor-fix-apparmor-mediating-locking-non-fs-unix-sockets.patch -pve/0010-apparmor-fix-use-after-free-in-sk_peer_label.patch +# pve/0006-Revert-PCI-Coalesce-host-bridge-contiguous-apertures.patch +# pve/0007-PCI-Reinstate-PCI-Coalesce-host-bridge-contiguous-ap.patch +pve/0008-do-not-generate-split-BTF-type-info-per-default.patch +pve/0009-blk-cgroup-always-terminate-io.stat-lines.patch \ No newline at end of file diff --git a/debian/patches/pve/0007-apparmor-compatibility-v2.x-net-rules.patch b/debian/patches/ubuntu/0001-apparmor-compatibility-v2.x-net-rules.patch similarity index 100% rename from debian/patches/pve/0007-apparmor-compatibility-v2.x-net-rules.patch rename to debian/patches/ubuntu/0001-apparmor-compatibility-v2.x-net-rules.patch diff --git a/debian/patches/pve/0008-apparmor-af_unix-mediation.patch b/debian/patches/ubuntu/0002-apparmor-af_unix-mediation.patch similarity index 100% rename from debian/patches/pve/0008-apparmor-af_unix-mediation.patch rename to debian/patches/ubuntu/0002-apparmor-af_unix-mediation.patch diff --git a/debian/patches/pve/0009-apparmor-fix-apparmor-mediating-locking-non-fs-unix-sockets.patch b/debian/patches/ubuntu/0003-apparmor-fix-apparmor-mediating-locking-non-fs-unix-sockets.patch similarity index 100% rename from debian/patches/pve/0009-apparmor-fix-apparmor-mediating-locking-non-fs-unix-sockets.patch rename to debian/patches/ubuntu/0003-apparmor-fix-apparmor-mediating-locking-non-fs-unix-sockets.patch diff --git a/debian/patches/pve/0010-apparmor-fix-use-after-free-in-sk_peer_label.patch b/debian/patches/ubuntu/0004-apparmor-fix-use-after-free-in-sk_peer_label.patch similarity index 100% rename from debian/patches/pve/0010-apparmor-fix-use-after-free-in-sk_peer_label.patch rename to debian/patches/ubuntu/0004-apparmor-fix-use-after-free-in-sk_peer_label.patch From c254a6c138557aa931ca94c47b6c42d49b044c9b Mon Sep 17 00:00:00 2001 From: Fabian Mastenbroek Date: Fri, 4 Feb 2022 10:52:11 +0100 Subject: [PATCH 3/3] Add release to include new patches This change adds a new kernel release that includes the latest patches from Proxmox. --- debian/changelog | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/debian/changelog b/debian/changelog index b5c6fc4b..3d7d5c5f 100644 --- a/debian/changelog +++ b/debian/changelog @@ -1,3 +1,11 @@ +pve-kernel (5.16.5-2) edge; urgency=medium + + * Synchronize kernel patches with Proxmox. + * Synchronize ZFS patches with Proxmox. + * Separate Ubuntu patches from Proxmox patches. + + -- Fabian Mastenbroek Fri, 04 Feb 2022 11:00:00 +0000 + pve-kernel (5.16.5-1) edge; urgency=medium * Update to Linux 5.16.5.