diff --git a/.github/workflows/trivy.yml b/.github/workflows/trivy.yml index 8f5524d4513..89152cb1afa 100644 --- a/.github/workflows/trivy.yml +++ b/.github/workflows/trivy.yml @@ -1,7 +1,12 @@ +# SPDX-License-Identifier: BSD-2-Clause-Patent +# Copyright (c) 2024 Intel Corporation. + name: Trivy scan on: workflow_dispatch: + schedule: + - cron: '0 0 * * *' push: branches: ["master", "release/**"] pull_request: @@ -11,15 +16,17 @@ on: permissions: {} jobs: - build: - name: Build - runs-on: ubuntu-20.04 + scan: + name: Scan with Trivy + runs-on: ubuntu-latest + permissions: + security-events: write steps: - name: Checkout code uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1 - - name: Run Trivy vulnerability scanner in repo mode - uses: aquasecurity/trivy-action@6e7b7d1fd3e4fef0c5fa8cce1229c54b2c9bd0d8 # 0.24.0 + - name: Run Trivy vulnerability scanner in filesystem mode (table format) + uses: aquasecurity/trivy-action@915b19bbe73b92a6cf82a1bc12b087c9a19a5fe2 # 0.28.0 with: scan-type: 'fs' scan-ref: '.' @@ -43,8 +50,8 @@ jobs: utils/trivy/trivy.yaml sed -i 's/format: template/format: sarif/g' utils/trivy/trivy.yaml - - name: Run Trivy vulnerability scanner in repo mode - uses: aquasecurity/trivy-action@6e7b7d1fd3e4fef0c5fa8cce1229c54b2c9bd0d8 # 0.24.0 + - name: Run Trivy vulnerability scanner in filesystem mode (sarif format) + uses: aquasecurity/trivy-action@915b19bbe73b92a6cf82a1bc12b087c9a19a5fe2 # 0.28.0 with: scan-type: 'fs' scan-ref: '.' @@ -62,8 +69,8 @@ jobs: sed -i 's/format: sarif/format: table/g' utils/trivy/trivy.yaml sed -i 's/exit-code: 0/exit-code: 1/g' utils/trivy/trivy.yaml - - name: Run Trivy vulnerability scanner in repo mode - uses: aquasecurity/trivy-action@6e7b7d1fd3e4fef0c5fa8cce1229c54b2c9bd0d8 # 0.24.0 + - name: Run Trivy vulnerability scanner in filesystem mode (human readable format) + uses: aquasecurity/trivy-action@915b19bbe73b92a6cf82a1bc12b087c9a19a5fe2 # 0.28.0 with: scan-type: 'fs' scan-ref: '.' 
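For reviewers who want to reproduce the three workflow passes locally, a rough sketch follows. It assumes a locally installed Trivy CLI and that `utils/trivy/trivy.yaml` carries the same `format:` and `exit-code:` keys the workflow rewrites with sed; the exact invocation is an illustrative assumption, not part of this change.

```bash
# Hypothetical local equivalent of the workflow's three scan passes (illustrative only).
# Pass 1: scan the repository filesystem with the in-repo config as checked in.
trivy fs --config utils/trivy/trivy.yaml .

# Pass 2: switch the config to SARIF output, as the workflow does, and rescan.
sed -i 's/format: template/format: sarif/g' utils/trivy/trivy.yaml
trivy fs --config utils/trivy/trivy.yaml .

# Pass 3: back to human-readable table output, now failing the run on findings.
sed -i 's/format: sarif/format: table/g' utils/trivy/trivy.yaml
sed -i 's/exit-code: 0/exit-code: 1/g' utils/trivy/trivy.yaml
trivy fs --config utils/trivy/trivy.yaml .
```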
diff --git a/Jenkinsfile b/Jenkinsfile index 18451cd9c56..87416ffdf98 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -876,7 +876,7 @@ pipeline { } steps { job_step_update( - unitTest(timeout_time: 60, + unitTest(timeout_time: 180, unstash_opt: true, ignore_failure: true, inst_repos: prRepos(), @@ -1167,6 +1167,7 @@ pipeline { 'Functional Hardware Medium': getFunctionalTestStage( name: 'Functional Hardware Medium', pragma_suffix: '-hw-medium', + base_branch: 'master', label: params.FUNCTIONAL_HARDWARE_MEDIUM_LABEL, next_version: next_version, stage_tags: 'hw,medium,-provider', @@ -1179,6 +1180,7 @@ pipeline { 'Functional Hardware Medium MD on SSD': getFunctionalTestStage( name: 'Functional Hardware Medium MD on SSD', pragma_suffix: '-hw-medium-md-on-ssd', + base_branch: 'master', label: params.FUNCTIONAL_HARDWARE_MEDIUM_LABEL, next_version: next_version, stage_tags: 'hw,medium,-provider', @@ -1192,6 +1194,7 @@ pipeline { 'Functional Hardware Medium VMD': getFunctionalTestStage( name: 'Functional Hardware Medium VMD', pragma_suffix: '-hw-medium-vmd', + base_branch: 'master', label: params.FUNCTIONAL_HARDWARE_MEDIUM_VMD_LABEL, next_version: next_version, stage_tags: 'hw_vmd,medium', @@ -1205,6 +1208,7 @@ pipeline { 'Functional Hardware Medium Verbs Provider': getFunctionalTestStage( name: 'Functional Hardware Medium Verbs Provider', pragma_suffix: '-hw-medium-verbs-provider', + base_branch: 'master', label: params.FUNCTIONAL_HARDWARE_MEDIUM_VERBS_PROVIDER_LABEL, next_version: next_version, stage_tags: 'hw,medium,provider', @@ -1218,6 +1222,7 @@ pipeline { 'Functional Hardware Medium Verbs Provider MD on SSD': getFunctionalTestStage( name: 'Functional Hardware Medium Verbs Provider MD on SSD', pragma_suffix: '-hw-medium-verbs-provider-md-on-ssd', + base_branch: 'master', label: params.FUNCTIONAL_HARDWARE_MEDIUM_VERBS_PROVIDER_LABEL, next_version: next_version, stage_tags: 'hw,medium,provider', @@ -1232,6 +1237,7 @@ pipeline { 'Functional Hardware Medium UCX Provider': getFunctionalTestStage( name: 'Functional Hardware Medium UCX Provider', pragma_suffix: '-hw-medium-ucx-provider', + base_branch: 'master', label: params.FUNCTIONAL_HARDWARE_MEDIUM_UCX_PROVIDER_LABEL, next_version: next_version, stage_tags: 'hw,medium,provider', @@ -1245,6 +1251,7 @@ pipeline { 'Functional Hardware Large': getFunctionalTestStage( name: 'Functional Hardware Large', pragma_suffix: '-hw-large', + base_branch: 'master', label: params.FUNCTIONAL_HARDWARE_LARGE_LABEL, next_version: next_version, stage_tags: 'hw,large', @@ -1257,6 +1264,7 @@ pipeline { 'Functional Hardware Large MD on SSD': getFunctionalTestStage( name: 'Functional Hardware Large MD on SSD', pragma_suffix: '-hw-large-md-on-ssd', + base_branch: 'master', label: params.FUNCTIONAL_HARDWARE_LARGE_LABEL, next_version: next_version, stage_tags: 'hw,large', diff --git a/README.md b/README.md index 0bd1915919e..35fd647b185 100644 --- a/README.md +++ b/README.md @@ -5,6 +5,7 @@ [![Build](https://github.com/daos-stack/daos/actions/workflows/ci2.yml/badge.svg)](https://github.com/daos-stack/daos/actions/workflows/ci2.yml) [![Codespell](https://github.com/daos-stack/daos/actions/workflows/spelling.yml/badge.svg)](https://github.com/daos-stack/daos/actions/workflows/spelling.yml) [![Doxygen](https://github.com/daos-stack/daos/actions/workflows/doxygen.yml/badge.svg)](https://github.com/daos-stack/daos/actions/workflows/doxygen.yml) +[![Trivy 
scan](https://github.com/daos-stack/daos/actions/workflows/trivy.yml/badge.svg)](https://github.com/daos-stack/daos/actions/workflows/trivy.yml) diff --git a/debian/changelog b/debian/changelog index f65c13eeb9f..a77e2e130b9 100644 --- a/debian/changelog +++ b/debian/changelog @@ -1,3 +1,10 @@ +daos (2.7.100-10) unstable; urgency=medium + + [ Sherin T George ] + * Add DAV v2 lib + + -- Sherin T George Fri, 1 Nov 2024 11:54:00 +0530 + daos (2.7.100-9) unstable; urgency=medium [ Brian J. Murrell ] * Remove Build-Depends: for UCX as they were obsoleted as of e01970d @@ -130,6 +137,7 @@ daos (2.5.100-12) unstable; urgency=medium -- Tomasz Gromadzki Fri, 17 Nov 2023 12:52:00 -0400 +daos (2.5.100-11) unstable; urgency=medium [ Jerome Soumagne ] * Bump mercury min version to 2.3.1 diff --git a/debian/daos-server.install b/debian/daos-server.install index fb1e8af9a67..99d344327f4 100644 --- a/debian/daos-server.install +++ b/debian/daos-server.install @@ -28,6 +28,7 @@ usr/lib64/daos_srv/libbio.so usr/lib64/daos_srv/libplacement.so usr/lib64/daos_srv/libpipeline.so usr/lib64/libdaos_common_pmem.so +usr/lib64/libdav_v2.so usr/share/daos/control/setup_spdk.sh usr/lib/systemd/system/daos_server.service usr/lib/sysctl.d/10-daos_server.conf diff --git a/docs/admin/pool_operations.md b/docs/admin/pool_operations.md index 36907a2e31f..8c3db202c4b 100644 --- a/docs/admin/pool_operations.md +++ b/docs/admin/pool_operations.md @@ -26,6 +26,7 @@ Its subcommands can be grouped into the following areas: * An upgrade command to upgrade a pool's format version after a DAOS software upgrade. + ### Creating a Pool A DAOS pool can be created through the `dmg pool create` command. @@ -170,6 +171,195 @@ on pool size, but also on number of targets, target size, object class, storage redundancy factor, etc. +#### Creating a pool in MD-on-SSD mode + +In MD-on-SSD mode, a pool is made up of a single component in memory (RAM-disk +associated with each engine) and three components on storage (NVMe SSD). The +components in storage are related to "roles" WAL, META and DATA and roles are +assigned to hardware devices in the +[server configuration file](https://docs.daos.io/v2.6/admin/deployment/#server-configuration-file). + +In MD-on-SSD mode pools are by default created with equal allocations for +metadata-in-memory and metadata-on-SSD but it is possible to change this. To +create a pool with a metadata-on-SSD allocation size that is double what is +allocated in memory, set `dmg pool create --mem-ratio` option to `50%`. This +implies that the ratio of metadata on memory and on storage should be 0.5 and +therefore metadata-on-SSD allocation is twice that of metadata-in-memory. + +A MD-on-SSD pool created with a `--mem-ratio` between 0 and 100 percent is +said to be operating in "phase-2" mode. + +#### MD-on-SSD phase-2 pool create examples + +These examples cover the recommended way to create a pool in MD-on-SSD phase-2 +mode using the `--size` percentage option. + +The following example is run on a single host with dual engines where bdev +roles META and DATA are not shared. Two pools are created with VOS index file +size equal to half the meta-blob size (`--mem-ratio 50%`). Both pools use +roughly half the original capacity available (first using 50% and the second +100% of the remainder). + +Rough calculations: `dmg storage scan` shows that for each rank, one 800GB SSD +is assigned for each tier (first: WAL+META, second: DATA). `df -h /mnt/daos*` +reports usable ramdisk capacity for each rank is 66GiB. 
+- Expected Data storage would then be 400GB for a 50% capacity first pool and + 100% capacity second pool per-rank. +- Expected Meta storage at 50% mem-ratio would be `66GiB*2 = 132GiB == 141GB` + giving ~70GB for 50% first and 100% second pools. +- Expected Memory file size (aggregated) is `66GiB/2 = 35GB` for 50% first and + 100% second pools. + +```bash +$ dmg pool create bob --size 50% --mem-ratio 50% + +Pool created with 14.86%,85.14% storage tier ratio +-------------------------------------------------- + UUID : 47060d94-c689-4981-8c89-011beb063f8f + Service Leader : 0 + Service Ranks : [0-1] + Storage Ranks : [0-1] + Total Size : 940 GB + Metadata Storage : 140 GB (70 GB / rank) + Data Storage : 800 GB (400 GB / rank) + Memory File Size : 70 GB (35 GB / rank) + +$ dmg pool create bob2 --size 100% --mem-ratio 50% + +Pool created with 14.47%,85.53% storage tier ratio +-------------------------------------------------- + UUID : bdbef091-f0f8-411d-8995-f91c4efc690f + Service Leader : 1 + Service Ranks : [0-1] + Storage Ranks : [0-1] + Total Size : 935 GB + Metadata Storage : 135 GB (68 GB / rank) + Data Storage : 800 GB (400 GB / rank) + Memory File Size : 68 GB (34 GB / rank) + +$ dmg pool query bob + +Pool 47060d94-c689-4981-8c89-011beb063f8f, ntarget=32, disabled=0, leader=0, version=1, state=Ready +Pool health info: +- Rebuild idle, 0 objs, 0 recs +Pool space info: +- Target count:32 +- Total memory-file size: 70 GB +- Metadata storage: + Total size: 140 GB + Free: 131 GB, min:4.1 GB, max:4.1 GB, mean:4.1 GB +- Data storage: + Total size: 800 GB + Free: 799 GB, min:25 GB, max:25 GB, mean:25 GB + +$ dmg pool query bob2 + +Pool bdbef091-f0f8-411d-8995-f91c4efc690f, ntarget=32, disabled=0, leader=1, version=1, state=Ready +Pool health info: +- Rebuild idle, 0 objs, 0 recs +Pool space info: +- Target count:32 +- Total memory-file size: 68 GB +- Metadata storage: + Total size: 135 GB + Free: 127 GB, min:4.0 GB, max:4.0 GB, mean:4.0 GB +- Data storage: + Total size: 800 GB + Free: 799 GB, min:25 GB, max:25 GB, mean:25 GB +``` + +The following examples are with a single host with dual engines where bdev +roles WAL, META and DATA are shared. + +Single pool with VOS index file size equal to the meta-blob size (`--mem-ratio +100%`). + +```bash +$ dmg pool create bob --size 100% --mem-ratio 100% + +Pool created with 5.93%,94.07% storage tier ratio +------------------------------------------------- + UUID : bad54f1d-8976-428b-a5dd-243372dfa65c + Service Leader : 1 + Service Ranks : [0-1] + Storage Ranks : [0-1] + Total Size : 2.4 TB + Metadata Storage : 140 GB (70 GB / rank) + Data Storage : 2.2 TB (1.1 TB / rank) + Memory File Size : 140 GB (70 GB / rank) + +``` + +Rough calculations: 1.2TB of usable space is returned from storage scan and +because roles are shared required META (70GB) is reserved so only 1.1TB is +provided for data. 
+ +Logging shows: +```bash +DEBUG 2024/09/24 15:44:38.554431 pool.go:1139: added smd device c7da7391-9077-4eb6-9f4a-a3d656166236 (rank 1, ctrlr 0000:d8:00.0, roles "data,meta,wal") as usable: device state="NORMAL", smd-size 623 GB (623307128832), ctrlr-total-free 623 GB (623307128832) +DEBUG 2024/09/24 15:44:38.554516 pool.go:1139: added smd device 18c7bf45-7586-49ba-93c0-cbc08caed901 (rank 1, ctrlr 0000:d9:00.0, roles "data,meta,wal") as usable: device state="NORMAL", smd-size 554 GB (554050781184), ctrlr-total-free 1.2 TB (1177357910016) +DEBUG 2024/09/24 15:44:38.554603 pool.go:1246: based on minimum available ramdisk capacity of 70 GB and mem-ratio 1.00 with 70 GB of reserved metadata capacity, the maximum per-rank sizes for a pool are META=70 GB (69792169984 B) DATA=1.1 TB (1107565740032 B) +``` + +Now the same as above but with a single pool with VOS index file size equal to +a quarter of the meta-blob size (`--mem-ratio 25%`). + +```bash +$ dmg pool create bob --size 100% --mem-ratio 25% + +Pool created with 23.71%,76.29% storage tier ratio +-------------------------------------------------- + UUID : 999ecf55-474e-4476-9f90-0b4c754d4619 + Service Leader : 0 + Service Ranks : [0-1] + Storage Ranks : [0-1] + Total Size : 2.4 TB + Metadata Storage : 558 GB (279 GB / rank) + Data Storage : 1.8 TB (898 GB / rank) + Memory File Size : 140 GB (70 GB / rank) + +``` + +Rough calculations: 1.2TB of usable space is returned from storage scan and +because roles are shared required META (279GB) is reserved so only ~900GB is +provided for data. + +Logging shows: +```bash +DEBUG 2024/09/24 16:16:00.172719 pool.go:1246: based on minimum available ramdisk capacity of 70 GB and mem-ratio 0.25 with 279 GB of reserved metadata capacity, the maximum per-rank sizes for a pool are META=279 GB (279168679936 B) DATA=898 GB (898189230080 B) +``` + +Now with 6 ranks and a single pool with VOS index file size equal to a half of +the meta-blob size (`--mem-ratio 50%`). + +```bash +$ dmg pool create bob --size 100% --mem-ratio 50% + +Pool created with 11.86%,88.14% storage tier ratio +-------------------------------------------------- + UUID : 4fa38199-23a9-4b4d-aa9a-8b9838cad1d6 + Service Leader : 1 + Service Ranks : [0-2,4-5] + Storage Ranks : [0-5] + Total Size : 7.1 TB + Metadata Storage : 838 GB (140 GB / rank) + Data Storage : 6.2 TB (1.0 TB / rank) + Memory File Size : 419 GB (70 GB / rank) + +``` + +Rough calculations: 1177 GB of usable space is returned from storage scan and +because roles are shared required META (140 GB) is reserved so only 1037 GB is +provided for data (per-rank). 
+ +Logging shows: +```bash +DEBUG 2024/09/24 16:40:41.570331 pool.go:1139: added smd device c921c7b9-5f5c-4332-a878-0ebb8191c160 (rank 1, ctrlr 0000:d8:00.0, roles "data,meta,wal") as usable: device state="NORMAL", smd-size 623 GB (623307128832), ctrlr-total-free 623 GB (623307128832) +DEBUG 2024/09/24 16:40:41.570447 pool.go:1139: added smd device a071c3cf-5de1-4911-8549-8c5e8f550554 (rank 1, ctrlr 0000:d9:00.0, roles "data,meta,wal") as usable: device state="NORMAL", smd-size 554 GB (554050781184), ctrlr-total-free 1.2 TB (1177357910016) +DEBUG 2024/09/24 16:40:41.570549 pool.go:1246: based on minimum available ramdisk capacity of 70 GB and mem-ratio 0.50 with 140 GB of reserved metadata capacity, the maximum per-rank sizes for a pool are META=140 GB (139584339968 B) DATA=1.0 TB (1037773570048 B) +``` + + ### Listing Pools To see a list of the pools in the DAOS system: diff --git a/docs/user/filesystem.md b/docs/user/filesystem.md index 048dcfd04f7..f1a3398df87 100644 --- a/docs/user/filesystem.md +++ b/docs/user/filesystem.md @@ -228,7 +228,6 @@ Additionally, there are several optional command-line options: | --container= | container label or uuid to open | | --sys-name= | DAOS system name | | --foreground | run in foreground | -| --singlethreaded | run single threaded | | --thread-count= | Number of threads to use | | --multi-user | Run in multi user mode | | --read-only | Mount in read-only mode | diff --git a/src/bio/bio_context.c b/src/bio/bio_context.c index 297694c6e6a..c450a25f0af 100644 --- a/src/bio/bio_context.c +++ b/src/bio/bio_context.c @@ -1,5 +1,5 @@ /** - * (C) Copyright 2018-2023 Intel Corporation. + * (C) Copyright 2018-2024 Intel Corporation. * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -457,7 +457,8 @@ int bio_mc_destroy(struct bio_xs_context *xs_ctxt, uuid_t pool_id, enum bio_mc_f static int bio_blob_create(uuid_t uuid, struct bio_xs_context *xs_ctxt, uint64_t blob_sz, - enum smd_dev_type st, enum bio_mc_flags flags, spdk_blob_id *blob_id) + enum smd_dev_type st, enum bio_mc_flags flags, spdk_blob_id *blob_id, + uint64_t scm_sz) { struct blob_msg_arg bma = { 0 }; struct blob_cp_arg *ba = &bma.bma_cp_arg; @@ -541,9 +542,10 @@ bio_blob_create(uuid_t uuid, struct bio_xs_context *xs_ctxt, uint64_t blob_sz, blob_sz); else rc = smd_pool_add_tgt(uuid, xs_ctxt->bxc_tgt_id, ba->bca_id, st, - blob_sz); + blob_sz, scm_sz); } else { - rc = smd_pool_add_tgt(uuid, xs_ctxt->bxc_tgt_id, ba->bca_id, st, blob_sz); + rc = smd_pool_add_tgt(uuid, xs_ctxt->bxc_tgt_id, ba->bca_id, st, blob_sz, + 0); } if (rc != 0) { @@ -611,14 +613,14 @@ __bio_ioctxt_open(struct bio_io_context **pctxt, struct bio_xs_context *xs_ctxt, /* * Calculate a reasonable WAL size based on following assumptions: * - Single target update IOPS can be up to 65k; - * - Each TX consumes 2 WAL blocks in average; + * - Each TX consumes 2 WAL blocks on average; * - Checkpointing interval is 5 seconds, and the WAL should have at least * half free space before next checkpoint; */ uint64_t default_wal_sz(uint64_t meta_sz) { - uint64_t wal_sz = (6ULL << 30); /* 6GB */ + uint64_t wal_sz = (6ULL << 30); /* 6GiB */ /* The WAL size could be larger than meta size for tiny pool */ if ((meta_sz * 2) <= wal_sz) @@ -627,8 +629,8 @@ default_wal_sz(uint64_t meta_sz) return wal_sz; } -int bio_mc_create(struct bio_xs_context *xs_ctxt, uuid_t pool_id, uint64_t meta_sz, - uint64_t wal_sz, uint64_t data_sz, enum bio_mc_flags flags) +int bio_mc_create(struct bio_xs_context *xs_ctxt, uuid_t pool_id, uint64_t scm_sz, uint64_t meta_sz, + 
uint64_t wal_sz, uint64_t data_sz, enum bio_mc_flags flags, uint8_t backend_type) { int rc = 0, rc1; spdk_blob_id data_blobid = SPDK_BLOBID_INVALID; @@ -637,12 +639,13 @@ int bio_mc_create(struct bio_xs_context *xs_ctxt, uuid_t pool_id, uint64_t meta_ struct bio_meta_context *mc = NULL; struct meta_fmt_info *fi = NULL; struct bio_xs_blobstore *bxb; + uint32_t meta_flags = 0; D_ASSERT(xs_ctxt != NULL); if (data_sz > 0 && bio_nvme_configured(SMD_DEV_TYPE_DATA)) { D_ASSERT(!(flags & BIO_MC_FL_RDB)); rc = bio_blob_create(pool_id, xs_ctxt, data_sz, SMD_DEV_TYPE_DATA, flags, - &data_blobid); + &data_blobid, 0); if (rc) return rc; } @@ -656,9 +659,28 @@ int bio_mc_create(struct bio_xs_context *xs_ctxt, uuid_t pool_id, uint64_t meta_ meta_sz, default_cluster_sz()); rc = -DER_INVAL; goto delete_data; + } else if (meta_sz < scm_sz) { + D_ERROR("Meta blob size("DF_U64") is less than scm size("DF_U64")\n", + meta_sz, scm_sz); + rc = -DER_INVAL; + goto delete_data; + } else if (scm_sz == meta_sz) { + scm_sz = 0; + } + + /* scm_sz < meta_sz case */ + if (scm_sz != 0) { + if (flags & BIO_MC_FL_RDB) { + D_ERROR("RDB doesn't allow scm_sz("DF_U64") != meta_sz("DF_U64")\n", + scm_sz, meta_sz); + rc = -DER_INVAL; + goto delete_data; + } + meta_flags |= META_HDR_FL_EVICTABLE; } - rc = bio_blob_create(pool_id, xs_ctxt, meta_sz, SMD_DEV_TYPE_META, flags, &meta_blobid); + rc = bio_blob_create(pool_id, xs_ctxt, meta_sz, SMD_DEV_TYPE_META, flags, &meta_blobid, + scm_sz); if (rc) goto delete_data; @@ -671,7 +693,7 @@ int bio_mc_create(struct bio_xs_context *xs_ctxt, uuid_t pool_id, uint64_t meta_ if (wal_sz == 0 || wal_sz < default_cluster_sz()) wal_sz = default_wal_sz(meta_sz); - rc = bio_blob_create(pool_id, xs_ctxt, wal_sz, SMD_DEV_TYPE_WAL, flags, &wal_blobid); + rc = bio_blob_create(pool_id, xs_ctxt, wal_sz, SMD_DEV_TYPE_WAL, flags, &wal_blobid, 0); if (rc) goto delete_meta; @@ -717,8 +739,9 @@ int bio_mc_create(struct bio_xs_context *xs_ctxt, uuid_t pool_id, uint64_t meta_ fi->fi_wal_size = wal_sz; fi->fi_data_size = data_sz; fi->fi_vos_id = xs_ctxt->bxc_tgt_id; + fi->fi_backend_type = backend_type; - rc = meta_format(mc, fi, true); + rc = meta_format(mc, fi, meta_flags, true); if (rc) D_ERROR("Unable to format newly created blob for xs:%p pool:"DF_UUID"\n", xs_ctxt, DP_UUID(pool_id)); diff --git a/src/bio/bio_wal.c b/src/bio/bio_wal.c index 6c99a203966..1caa538eb5e 100644 --- a/src/bio/bio_wal.c +++ b/src/bio/bio_wal.c @@ -1861,13 +1861,15 @@ bio_wal_checkpoint(struct bio_meta_context *mc, uint64_t tx_id, uint64_t *purged void bio_meta_get_attr(struct bio_meta_context *mc, uint64_t *capacity, uint32_t *blk_sz, - uint32_t *hdr_blks) + uint32_t *hdr_blks, uint8_t *backend_type, bool *evictable) { /* The mc could be NULL when md on SSD not enabled & data blob not existing */ if (mc != NULL) { *blk_sz = mc->mc_meta_hdr.mh_blk_bytes; *capacity = mc->mc_meta_hdr.mh_tot_blks * (*blk_sz); *hdr_blks = mc->mc_meta_hdr.mh_hdr_blks; + *backend_type = mc->mc_meta_hdr.mh_backend_type; + *evictable = mc->mc_meta_hdr.mh_flags & META_HDR_FL_EVICTABLE; } } @@ -2022,7 +2024,7 @@ get_wal_gen(uuid_t pool_id, uint32_t tgt_id) } int -meta_format(struct bio_meta_context *mc, struct meta_fmt_info *fi, bool force) +meta_format(struct bio_meta_context *mc, struct meta_fmt_info *fi, uint32_t flags, bool force) { struct meta_header *meta_hdr = &mc->mc_meta_hdr; struct wal_super_info *si = &mc->mc_wal_info; @@ -2068,7 +2070,8 @@ meta_format(struct bio_meta_context *mc, struct meta_fmt_info *fi, bool force) meta_hdr->mh_hdr_blks = 
META_HDR_BLKS; meta_hdr->mh_tot_blks = (fi->fi_meta_size / META_BLK_SZ) - META_HDR_BLKS; meta_hdr->mh_vos_id = fi->fi_vos_id; - meta_hdr->mh_flags = META_HDR_FL_EMPTY; + meta_hdr->mh_flags = (flags | META_HDR_FL_EMPTY); + meta_hdr->mh_backend_type = fi->fi_backend_type; rc = write_header(mc, mc->mc_meta, meta_hdr, sizeof(*meta_hdr), &meta_hdr->mh_csum); if (rc) { diff --git a/src/bio/bio_wal.h b/src/bio/bio_wal.h index 6eb187c61e6..1f15a7d94ef 100644 --- a/src/bio/bio_wal.h +++ b/src/bio/bio_wal.h @@ -11,6 +11,7 @@ enum meta_hdr_flags { META_HDR_FL_EMPTY = (1UL << 0), + META_HDR_FL_EVICTABLE = (1UL << 1), }; /* Meta blob header */ @@ -28,7 +29,10 @@ struct meta_header { uint64_t mh_tot_blks; /* Meta blob capacity, in blocks */ uint32_t mh_vos_id; /* Associated per-engine target ID */ uint32_t mh_flags; /* Meta header flags */ - uint32_t mh_padding[5]; /* Reserved */ + uint8_t mh_backend_type; /* Backend allocator type */ + uint8_t mh_padding1; /* Reserved */ + uint16_t mh_padding2; /* Reserved */ + uint32_t mh_padding[4]; /* Reserved */ uint32_t mh_csum; /* Checksum of this header */ }; @@ -124,9 +128,10 @@ struct meta_fmt_info { uint64_t fi_wal_size; /* WAL blob size in bytes */ uint64_t fi_data_size; /* Data blob size in bytes */ uint32_t fi_vos_id; /* Associated per-engine target ID */ + uint8_t fi_backend_type; /* Backend allocator type */ }; -int meta_format(struct bio_meta_context *mc, struct meta_fmt_info *fi, bool force); +int meta_format(struct bio_meta_context *mc, struct meta_fmt_info *fi, uint32_t flags, bool force); int meta_open(struct bio_meta_context *mc); void meta_close(struct bio_meta_context *mc); int wal_open(struct bio_meta_context *mc); diff --git a/src/bio/smd/smd_internal.h b/src/bio/smd/smd_internal.h index 0b641cddb61..4195581e40a 100644 --- a/src/bio/smd/smd_internal.h +++ b/src/bio/smd/smd_internal.h @@ -27,6 +27,8 @@ extern char TABLE_TGTS[SMD_DEV_TYPE_MAX][SMD_DEV_NAME_MAX]; extern char TABLE_POOLS[SMD_DEV_TYPE_MAX][SMD_DEV_NAME_MAX]; +extern char TABLE_POOLS_EX[SMD_DEV_TYPE_MAX][SMD_DEV_NAME_MAX]; + #define SMD_MAX_TGT_CNT 64 /** callback parameter for smd_db_traverse */ diff --git a/src/bio/smd/smd_pool.c b/src/bio/smd/smd_pool.c index c9d9572c556..84c25a9863f 100644 --- a/src/bio/smd/smd_pool.c +++ b/src/bio/smd/smd_pool.c @@ -1,5 +1,5 @@ /** - * (C) Copyright 2018-2023 Intel Corporation. + * (C) Copyright 2018-2024 Intel Corporation. 
* * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -35,6 +35,17 @@ struct smd_pool { uint64_t sp_blobs[SMD_MAX_TGT_CNT]; }; +char TABLE_POOLS_EX[SMD_DEV_TYPE_MAX][SMD_DEV_NAME_MAX] = { + "ex_data_pool", + "ex_meta_pool", + "ex_wal_pool", +}; + +struct smd_pool_meta { + uint64_t spm_scm_sz; + uint64_t spm_reserved[3]; +}; + static int smd_pool_find_tgt(struct smd_pool *pool, int tgt_id) { @@ -56,7 +67,6 @@ pool_add_tgt(uuid_t pool_id, uint32_t tgt_id, uint64_t blob_id, char *table_name uuid_copy(id.uuid, pool_id); - smd_db_lock(); /* Fetch pool if it's already existing */ rc = smd_db_fetch(table_name, &id, sizeof(id), &pool, sizeof(pool)); if (rc == 0) { @@ -65,23 +75,20 @@ pool_add_tgt(uuid_t pool_id, uint32_t tgt_id, uint64_t blob_id, char *table_name ""DF_U64" != "DF_U64"\n", DP_UUID(&id.uuid), pool.sp_blob_sz, blob_sz); - rc = -DER_INVAL; - goto out; + return -DER_INVAL; } if (pool.sp_tgt_cnt >= SMD_MAX_TGT_CNT) { D_ERROR("Pool "DF_UUID" is assigned to too many " "targets (%d)\n", DP_UUID(&id.uuid), pool.sp_tgt_cnt); - rc = -DER_OVERFLOW; - goto out; + return -DER_OVERFLOW; } rc = smd_pool_find_tgt(&pool, tgt_id); if (rc >= 0) { D_ERROR("Dup target %d, idx: %d\n", tgt_id, rc); - rc = -DER_EXIST; - goto out; + return -DER_EXIST; } pool.sp_tgts[pool.sp_tgt_cnt] = tgt_id; @@ -102,32 +109,69 @@ pool_add_tgt(uuid_t pool_id, uint32_t tgt_id, uint64_t blob_id, char *table_name } else { D_ERROR("Fetch pool "DF_UUID" failed. "DF_RC"\n", DP_UUID(&id.uuid), DP_RC(rc)); - goto out; + return rc; } rc = smd_db_upsert(table_name, &id, sizeof(id), &pool, sizeof(pool)); - if (rc) { + if (rc) D_ERROR("Update pool "DF_UUID" failed. "DF_RC"\n", DP_UUID(&id.uuid), DP_RC(rc)); - goto out; - } -out: - smd_db_unlock(); + return rc; } int smd_pool_add_tgt(uuid_t pool_id, uint32_t tgt_id, uint64_t blob_id, - enum smd_dev_type st, uint64_t blob_sz) + enum smd_dev_type st, uint64_t blob_sz, uint64_t scm_sz) { - return pool_add_tgt(pool_id, tgt_id, blob_id, TABLE_POOLS[st], blob_sz); + struct smd_pool_meta meta = { 0 }; + struct d_uuid id; + int rc; + + smd_db_lock(); + + rc = pool_add_tgt(pool_id, tgt_id, blob_id, TABLE_POOLS[st], blob_sz); + if (rc || scm_sz == 0) { + smd_db_unlock(); + return rc; + } + + D_ASSERTF(scm_sz < blob_sz, "scm_sz("DF_U64") >= blob_sz("DF_U64")\n", scm_sz, blob_sz); + D_ASSERT(st == SMD_DEV_TYPE_META); + + uuid_copy(id.uuid, pool_id); + /* Fetch pool_meta_ex to see if it's already existing */ + rc = smd_db_fetch(TABLE_POOLS_EX[st], &id, sizeof(id), &meta, sizeof(meta)); + if (rc == 0) { + if (meta.spm_scm_sz != scm_sz) { + D_ERROR("Pool "DF_UUID" meta size mismatch. 
"DF_U64" != "DF_U64"\n", + DP_UUID(&id.uuid), meta.spm_scm_sz, scm_sz); + rc = -DER_INVAL; + } + } else if (rc == -DER_NONEXIST) { + meta.spm_scm_sz = scm_sz; + rc = smd_db_upsert(TABLE_POOLS_EX[st], &id, sizeof(id), &meta, sizeof(meta)); + if (rc) + DL_ERROR(rc, "Update pool_meta "DF_UUID" failed.", DP_UUID(&id.uuid)); + } else { + DL_ERROR(rc, "Fetch pool_meta "DF_UUID" failed.", DP_UUID(&id.uuid)); + } + + smd_db_unlock(); + return rc; } int smd_rdb_add_tgt(uuid_t pool_id, uint32_t tgt_id, uint64_t blob_id, enum smd_dev_type st, uint64_t blob_sz) { - return pool_add_tgt(pool_id, tgt_id, blob_id, TABLE_RDBS[st], blob_sz); + int rc; + + smd_db_lock(); + rc = pool_add_tgt(pool_id, tgt_id, blob_id, TABLE_RDBS[st], blob_sz); + smd_db_unlock(); + + return rc; } static int @@ -140,20 +184,18 @@ pool_del_tgt(uuid_t pool_id, uint32_t tgt_id, char *table_name) uuid_copy(id.uuid, pool_id); - smd_db_lock(); rc = smd_db_fetch(table_name, &id, sizeof(id), &pool, sizeof(pool)); if (rc) { D_ERROR("Fetch pool "DF_UUID" failed. "DF_RC"\n", DP_UUID(id.uuid), DP_RC(rc)); - goto out; + return rc; } rc = smd_pool_find_tgt(&pool, tgt_id); if (rc < 0) { D_ERROR("Pool "DF_UUID" target %d not found.\n", DP_UUID(id.uuid), tgt_id); - rc = -DER_NONEXIST; - goto out; + return -DER_NONEXIST; } for (i = rc; i < pool.sp_tgt_cnt - 1; i++) { @@ -168,35 +210,69 @@ pool_del_tgt(uuid_t pool_id, uint32_t tgt_id, char *table_name) if (rc) { D_ERROR("Update pool "DF_UUID" failed: "DF_RC"\n", DP_UUID(&id.uuid), DP_RC(rc)); - goto out; + return rc; } } else { rc = smd_db_delete(table_name, &id, sizeof(id)); if (rc) { D_ERROR("Delete pool "DF_UUID" failed: "DF_RC"\n", DP_UUID(&id.uuid), DP_RC(rc)); - goto out; + return rc; } + rc = 1; /* Inform caller that last target is deleted */ } -out: - smd_db_unlock(); + return rc; } int smd_pool_del_tgt(uuid_t pool_id, uint32_t tgt_id, enum smd_dev_type st) { - return pool_del_tgt(pool_id, tgt_id, TABLE_POOLS[st]); + struct smd_pool_meta meta = { 0 }; + struct d_uuid id; + int rc; + + smd_db_lock(); + rc = pool_del_tgt(pool_id, tgt_id, TABLE_POOLS[st]); + if (rc <= 0) + goto out; + + rc = 0; + if (st == SMD_DEV_TYPE_META) { + uuid_copy(id.uuid, pool_id); + + rc = smd_db_fetch(TABLE_POOLS_EX[st], &id, sizeof(id), &meta, sizeof(meta)); + if (rc == -DER_NONEXIST) { + rc = 0; + goto out; + } else if (rc) { + DL_ERROR(rc, "Fetch pool_meta "DF_UUID" failed.", DP_UUID(&id.uuid)); + goto out; + } + + rc = smd_db_delete(TABLE_POOLS_EX[st], &id, sizeof(id)); + if (rc) + DL_ERROR(rc, "Delete pool_meta "DF_UUID" failed.", DP_UUID(&id.uuid)); + } +out: + smd_db_unlock(); + return rc; } int smd_rdb_del_tgt(uuid_t pool_id, uint32_t tgt_id, enum smd_dev_type st) { - return pool_del_tgt(pool_id, tgt_id, TABLE_RDBS[st]); + int rc; + + smd_db_lock(); + rc = pool_del_tgt(pool_id, tgt_id, TABLE_RDBS[st]); + smd_db_unlock(); + + return rc < 0 ? 
rc : 0; } static struct smd_pool_info * -smd_pool_alloc_info(struct d_uuid *id, struct smd_pool *pools) +smd_pool_alloc_info(struct d_uuid *id, struct smd_pool *pools, uint64_t scm_sz) { struct smd_pool_info *info; enum smd_dev_type st; @@ -206,6 +282,7 @@ smd_pool_alloc_info(struct d_uuid *id, struct smd_pool *pools) if (info == NULL) return NULL; + info->spi_scm_sz = scm_sz; for (st = SMD_DEV_TYPE_DATA; st < SMD_DEV_TYPE_MAX; st++) { D_ALLOC_ARRAY(info->spi_tgts[st], SMD_MAX_TGT_CNT); if (info->spi_tgts[st] == NULL) { @@ -237,6 +314,7 @@ smd_pool_get_info(uuid_t pool_id, struct smd_pool_info **pool_info) { struct smd_pool_info *info; struct smd_pool pools[SMD_DEV_TYPE_MAX]; + struct smd_pool_meta meta = { 0 }; enum smd_dev_type st; struct d_uuid id; int rc; @@ -255,7 +333,16 @@ smd_pool_get_info(uuid_t pool_id, struct smd_pool_info **pool_info) } } - info = smd_pool_alloc_info(&id, pools); + rc = smd_db_fetch(TABLE_POOLS_EX[SMD_DEV_TYPE_META], &id, sizeof(id), &meta, sizeof(meta)); + if (rc == -DER_NONEXIST) { + meta.spm_scm_sz = pools[SMD_DEV_TYPE_META].sp_blob_sz; + rc = 0; + } else if (rc) { + DL_ERROR(rc, "Fetch pool_meta "DF_UUID" failed.", DP_UUID(&id.uuid)); + goto out; + } + + info = smd_pool_alloc_info(&id, pools, meta.spm_scm_sz); if (info == NULL) { rc = -DER_NOMEM; goto out; @@ -338,6 +425,7 @@ smd_pool_list_cb(struct sys_db *db, char *table, d_iov_t *key, void *args) struct smd_trav_data *td = args; struct smd_pool_info *info; struct smd_pool pools[SMD_DEV_TYPE_MAX]; + struct smd_pool_meta meta = { 0 }; enum smd_dev_type st; struct d_uuid id; int rc; @@ -363,7 +451,16 @@ smd_pool_list_cb(struct sys_db *db, char *table, d_iov_t *key, void *args) return rc; } - info = smd_pool_alloc_info(&id, pools); + rc = smd_db_fetch(TABLE_POOLS_EX[SMD_DEV_TYPE_META], &id, sizeof(id), &meta, sizeof(meta)); + if (rc == -DER_NONEXIST) { + meta.spm_scm_sz = pools[SMD_DEV_TYPE_META].sp_blob_sz; + rc = 0; + } else if (rc) { + DL_ERROR(rc, "Fetch pool_meta "DF_UUID" failed.", DP_UUID(&id.uuid)); + return rc; + } + + info = smd_pool_alloc_info(&id, pools, meta.spm_scm_sz); if (!info) return -DER_NOMEM; diff --git a/src/bio/smd/tests/smd_ut.c b/src/bio/smd/tests/smd_ut.c index 129db9acf0d..bb2fcb6107a 100644 --- a/src/bio/smd/tests/smd_ut.c +++ b/src/bio/smd/tests/smd_ut.c @@ -21,7 +21,7 @@ #include #define SMD_STORAGE_PATH "/mnt/daos" -#define DB_LIST_NR (SMD_DEV_TYPE_MAX * 2 + 1) +#define DB_LIST_NR (SMD_DEV_TYPE_MAX * 2 + 2) struct ut_db { struct sys_db ud_db; @@ -46,11 +46,14 @@ db_name2list(struct sys_db *db, char *name) if (!strcmp(name, TABLE_DEV)) return &ud->ud_lists[0]; + if (!strcmp(name, TABLE_POOLS_EX[SMD_DEV_TYPE_META])) + return &ud->ud_lists[1]; + for (st = SMD_DEV_TYPE_DATA; st < SMD_DEV_TYPE_MAX; st++) { if (!strcmp(name, TABLE_TGTS[st])) - return &ud->ud_lists[st + 1]; + return &ud->ud_lists[st + 2]; if (!strcmp(name, TABLE_POOLS[st])) - return &ud->ud_lists[st + SMD_DEV_TYPE_MAX + 1]; + return &ud->ud_lists[st + SMD_DEV_TYPE_MAX + 2]; } D_ASSERT(0); return NULL; @@ -325,12 +328,13 @@ ut_device(void **state) } static void -verify_pool(struct smd_pool_info *pool_info, uuid_t id, int shift) +verify_pool(struct smd_pool_info *pool_info, uuid_t id, int shift, uint64_t scm_sz) { enum smd_dev_type st; int i, j; assert_int_equal(uuid_compare(pool_info->spi_id, id), 0); + assert_int_equal(pool_info->spi_scm_sz, scm_sz); assert_int_equal(pool_info->spi_tgt_cnt[SMD_DEV_TYPE_DATA], 4); assert_int_equal(pool_info->spi_tgt_cnt[SMD_DEV_TYPE_META], 1); 
assert_int_equal(pool_info->spi_tgt_cnt[SMD_DEV_TYPE_WAL], 1); @@ -359,35 +363,42 @@ ut_pool(void **state) for (i = 0; i < 6; i++) { st = (i < 4) ? SMD_DEV_TYPE_DATA : SMD_DEV_TYPE_DATA + i - 3; - rc = smd_pool_add_tgt(id1, i, i << 10, st, 100); + rc = smd_pool_add_tgt(id1, i, i << 10, st, 100, 0); assert_rc_equal(rc, 0); - rc = smd_pool_add_tgt(id2, i, i << 20, st, 200); + if (st == SMD_DEV_TYPE_META) + rc = smd_pool_add_tgt(id2, i, i << 20, st, 200, 50); + else + rc = smd_pool_add_tgt(id2, i, i << 20, st, 200, 0); assert_rc_equal(rc, 0); } - rc = smd_pool_add_tgt(id1, 0, 5000, SMD_DEV_TYPE_DATA, 100); + rc = smd_pool_add_tgt(id1, 0, 5000, SMD_DEV_TYPE_DATA, 100, 0); assert_rc_equal(rc, -DER_EXIST); - rc = smd_pool_add_tgt(id1, 4, 4 << 10, SMD_DEV_TYPE_DATA, 200); + rc = smd_pool_add_tgt(id1, 4, 4 << 10, SMD_DEV_TYPE_DATA, 200, 0); assert_rc_equal(rc, -DER_INVAL); - rc = smd_pool_add_tgt(id1, 4, 5000, SMD_DEV_TYPE_META, 100); + rc = smd_pool_add_tgt(id1, 4, 5000, SMD_DEV_TYPE_META, 100, 0); assert_rc_equal(rc, -DER_EXIST); - rc = smd_pool_add_tgt(id1, 0, 4 << 10, SMD_DEV_TYPE_META, 200); + rc = smd_pool_add_tgt(id1, 0, 4 << 10, SMD_DEV_TYPE_META, 200, 0); assert_rc_equal(rc, -DER_INVAL); - rc = smd_pool_add_tgt(id1, 5, 5000, SMD_DEV_TYPE_WAL, 100); + rc = smd_pool_add_tgt(id1, 5, 5000, SMD_DEV_TYPE_WAL, 100, 0); assert_rc_equal(rc, -DER_EXIST); - rc = smd_pool_add_tgt(id1, 0, 4 << 10, SMD_DEV_TYPE_WAL, 200); + rc = smd_pool_add_tgt(id1, 0, 4 << 10, SMD_DEV_TYPE_WAL, 200, 0); assert_rc_equal(rc, -DER_INVAL); rc = smd_pool_get_info(id1, &pool_info); assert_rc_equal(rc, 0); - verify_pool(pool_info, id1, 10); + verify_pool(pool_info, id1, 10, 100); + smd_pool_free_info(pool_info); + rc = smd_pool_get_info(id2, &pool_info); + assert_rc_equal(rc, 0); + verify_pool(pool_info, id2, 20, 50); smd_pool_free_info(pool_info); rc = smd_pool_get_info(id3, &pool_info); @@ -416,9 +427,9 @@ ut_pool(void **state) d_list_for_each_entry_safe(pool_info, tmp, &pool_list, spi_link) { if (uuid_compare(pool_info->spi_id, id1) == 0) - verify_pool(pool_info, id1, 10); + verify_pool(pool_info, id1, 10, 100); else if (uuid_compare(pool_info->spi_id, id2) == 0) - verify_pool(pool_info, id2, 20); + verify_pool(pool_info, id2, 20, 50); else assert_true(false); diff --git a/src/client/dfuse/dfuse.h b/src/client/dfuse/dfuse.h index e3b3c0d7d0e..9d162810db6 100644 --- a/src/client/dfuse/dfuse.h +++ b/src/client/dfuse/dfuse.h @@ -29,7 +29,6 @@ struct dfuse_info { char *di_mountpoint; int32_t di_thread_count; uint32_t di_eq_count; - bool di_threaded; bool di_foreground; bool di_caching; bool di_multi_user; diff --git a/src/client/dfuse/dfuse_core.c b/src/client/dfuse/dfuse_core.c index 4f654fa3209..6397b283e97 100644 --- a/src/client/dfuse/dfuse_core.c +++ b/src/client/dfuse/dfuse_core.c @@ -53,7 +53,7 @@ dfuse_progress_thread(void *arg) return NULL; } - rc = daos_eq_poll(eqt->de_eq, 1, DAOS_EQ_WAIT, 128, &dev[0]); + rc = daos_eq_poll(eqt->de_eq, 1, DAOS_EQ_NOWAIT, 128, &dev[0]); if (rc >= 1) { for (i = 0; i < rc; i++) { struct dfuse_event *ev; diff --git a/src/client/dfuse/dfuse_main.c b/src/client/dfuse/dfuse_main.c index d75656121a5..02db62cc4e9 100644 --- a/src/client/dfuse/dfuse_main.c +++ b/src/client/dfuse/dfuse_main.c @@ -166,6 +166,7 @@ dfuse_bg(struct dfuse_info *dfuse_info) * * Should be called from the post_start plugin callback and creates * a filesystem. + * Returns a DAOS error code. * Returns true on success, false on failure. 
*/ int @@ -204,18 +205,17 @@ dfuse_launch_fuse(struct dfuse_info *dfuse_info, struct fuse_args *args) DFUSE_TRA_ERROR(dfuse_info, "Error sending signal to fg: "DF_RC, DP_RC(rc)); /* Blocking */ - if (dfuse_info->di_threaded) - rc = dfuse_loop(dfuse_info); - else - rc = fuse_session_loop(dfuse_info->di_session); - if (rc != 0) + rc = dfuse_loop(dfuse_info); + if (rc != 0) { DHS_ERROR(dfuse_info, rc, "Fuse loop exited"); + rc = daos_errno2der(rc); + } umount: fuse_session_unmount(dfuse_info->di_session); - return daos_errno2der(rc); + return rc; } #define DF_POOL_PREFIX "pool=" @@ -279,7 +279,6 @@ show_help(char *name) " --path= Path to load UNS pool/container data\n" " --sys-name=STR DAOS system name context for servers\n" "\n" - " -S --singlethread Single threaded (deprecated)\n" " -t --thread-count=count Total number of threads to use\n" " -e --eq-count=count Number of event queues to use\n" " -f --foreground Run in foreground\n" @@ -423,7 +422,6 @@ main(int argc, char **argv) {"pool", required_argument, 0, 'p'}, {"container", required_argument, 0, 'c'}, {"sys-name", required_argument, 0, 'G'}, - {"singlethread", no_argument, 0, 'S'}, {"thread-count", required_argument, 0, 't'}, {"eq-count", required_argument, 0, 'e'}, {"foreground", no_argument, 0, 'f'}, @@ -447,13 +445,12 @@ main(int argc, char **argv) if (dfuse_info == NULL) D_GOTO(out_debug, rc = -DER_NOMEM); - dfuse_info->di_threaded = true; dfuse_info->di_caching = true; dfuse_info->di_wb_cache = true; dfuse_info->di_eq_count = 1; while (1) { - c = getopt_long(argc, argv, "Mm:St:o:fhe:v", long_options, NULL); + c = getopt_long(argc, argv, "Mm:t:o:fhe:v", long_options, NULL); if (c == -1) break; @@ -491,13 +488,6 @@ main(int argc, char **argv) case 'P': path = optarg; break; - case 'S': - /* Set it to be single threaded, but allow an extra one - * for the event queue processing - */ - dfuse_info->di_threaded = false; - dfuse_info->di_thread_count = 2; - break; case 'e': dfuse_info->di_eq_count = atoi(optarg); break; @@ -564,7 +554,7 @@ main(int argc, char **argv) * check CPU binding. If bound to a number of cores then launch that number of threads, * if not bound them limit to 16. */ - if (dfuse_info->di_threaded && !have_thread_count) { + if (!have_thread_count) { struct hwloc_topology *hwt; hwloc_const_cpuset_t hw; int total; diff --git a/src/client/dfuse/pil4dfs/hook.c b/src/client/dfuse/pil4dfs/hook.c index 4af38d885db..09a54e7351c 100644 --- a/src/client/dfuse/pil4dfs/hook.c +++ b/src/client/dfuse/pil4dfs/hook.c @@ -42,6 +42,7 @@ #include #include #include +#include #include "hook.h" #include "hook_int.h" @@ -89,10 +90,15 @@ static uint64_t lib_base_addr[MAX_NUM_LIB]; /* List of names of loaded libraries */ static char **lib_name_list; +/* libc version number in current process. e.g., 2.28 */ +static float libc_version; +static char *libc_version_str; + /* end to compile list of memory blocks in /proc/pid/maps */ static char *path_ld; static char *path_libc; +static char *path_libdl; static char *path_libpthread; /* This holds the path of libpil4dfs.so. It is needed when we want to * force child processes append libpil4dfs.so to env LD_PRELOAD. 
*/ @@ -213,7 +219,7 @@ determine_lib_path(void) { int path_offset = 0, read_size, i, rc; char *read_buff_map = NULL; - char *pos, *start, *end, lib_ver_str[32] = "", *lib_dir_str = NULL; + char *pos, *start, *end, *lib_dir_str = NULL; read_size = read_map_file(&read_buff_map); @@ -290,19 +296,17 @@ determine_lib_path(void) goto err; path_libc[end - start] = 0; - pos = strstr(path_libc, "libc-2."); - if (pos) { - /* containing version in name. example, 2.17 */ - memcpy(lib_ver_str, pos + 5, 4); - lib_ver_str[4] = 0; + if (libc_version_str == NULL) { + libc_version_str = (char *)gnu_get_libc_version(); + if (libc_version_str == NULL) { + DS_ERROR(errno, "Failed to determine libc version"); + goto err; + } + libc_version = atof(libc_version_str); } - if (lib_ver_str[0]) { - /* with version in name */ - rc = asprintf(&path_libpthread, "%s/libpthread-%s.so", lib_dir_str, lib_ver_str); - } else { - rc = asprintf(&path_libpthread, "%s/libpthread.so.0", lib_dir_str); - } + /* with version in name */ + rc = asprintf(&path_libpthread, "%s/libpthread-%s.so", lib_dir_str, libc_version_str); if (rc < 0) { DS_ERROR(ENOMEM, "Failed to allocate memory for path_libpthread"); goto err_1; @@ -312,7 +316,18 @@ determine_lib_path(void) path_libpthread = NULL; DS_ERROR(ENAMETOOLONG, "path_libpthread is too long"); goto err_1; - } + } + rc = asprintf(&path_libdl, "%s/libdl-%s.so", lib_dir_str, libc_version_str); + if (rc < 0) { + DS_ERROR(ENOMEM, "Failed to allocate memory for path_libdl"); + goto err_1; + } + if (rc >= PATH_MAX) { + free(path_libdl); + path_libdl = NULL; + DS_ERROR(ENAMETOOLONG, "path_libdl is too long"); + goto err_1; + } D_FREE(lib_dir_str); if (strstr(read_buff_map, "libioil.so")) { @@ -354,6 +369,11 @@ query_pil4dfs_path(void) return path_libpil4dfs; } +float +query_libc_version(void) +{ + return libc_version; +} /* * query_func_addr - Determine the addresses and code sizes of functions in func_name_list[]. 
@@ -760,6 +780,7 @@ free_memory_in_hook(void) D_FREE(path_ld); D_FREE(path_libc); D_FREE(module_list); + free(path_libdl); free(path_libpthread); if (lib_name_list) { @@ -1040,6 +1061,8 @@ register_a_hook(const char *module_name, const char *func_name, const void *new_ module_name_local = path_ld; else if (strncmp(module_name, "libc", 5) == 0) module_name_local = path_libc; + else if (strncmp(module_name, "libdl", 6) == 0) + module_name_local = path_libdl; else if (strncmp(module_name, "libpthread", 11) == 0) module_name_local = path_libpthread; else diff --git a/src/client/dfuse/pil4dfs/hook.h b/src/client/dfuse/pil4dfs/hook.h index 7742faaff53..b686d99ce4e 100644 --- a/src/client/dfuse/pil4dfs/hook.h +++ b/src/client/dfuse/pil4dfs/hook.h @@ -60,4 +60,10 @@ free_memory_in_hook(void); char * query_pil4dfs_path(void); +/** + * return glibc version in current process + */ +float +query_libc_version(void); + #endif diff --git a/src/client/dfuse/pil4dfs/int_dfs.c b/src/client/dfuse/pil4dfs/int_dfs.c index bfd3dcd0ff1..0c7db5d9563 100644 --- a/src/client/dfuse/pil4dfs/int_dfs.c +++ b/src/client/dfuse/pil4dfs/int_dfs.c @@ -159,6 +159,7 @@ static long int page_size; #define DAOS_INIT_RUNNING 1 static _Atomic uint64_t mpi_init_count; +static _Atomic int64_t zeInit_count; static long int daos_initing; _Atomic bool d_daos_inited; @@ -488,6 +489,9 @@ static int (*next_tcgetattr)(int fd, void *termios_p); static int (*next_mpi_init)(int *argc, char ***argv); static int (*next_pmpi_init)(int *argc, char ***argv); +static int (*next_ze_init)(int flags); +static void *(*next_dlsym)(void *handle, const char *symbol); +static void *(*new_dlsym)(void *handle, const char *symbol); /* to do!! */ /** @@ -1074,6 +1078,143 @@ PMPI_Init(int *argc, char ***argv) return rc; } +int +zeInit(int flags) +{ + int rc; + + if (next_ze_init == NULL) { + if (d_hook_enabled) + next_ze_init = next_dlsym(RTLD_NEXT, "zeInit"); + else + next_ze_init = dlsym(RTLD_NEXT, "zeInit"); + } + D_ASSERT(next_ze_init != NULL); + atomic_fetch_add_relaxed(&zeInit_count, 1); + rc = next_ze_init(flags); + atomic_fetch_add_relaxed(&zeInit_count, -1); + return rc; +} + +#if defined(__x86_64__) +/* This is used to work around compiling warning and limitations of using asm function. */ +static void * +query_new_dlsym_addr(void *addr) +{ + int i; + + /* assume little endian */ + for (i = 0; i < 64; i++) { + /* 0x56579090 is corresponding to the first four instructions at new_dlsym_asm. + * 0x90 - nop, 0x90 - nop, 0x57 - push %rdi, 0x56 - push %rsi + */ + if (*((int *)(addr + i)) == 0x56579090) { + /* two nop are added for easier positioning. offset +2 here to skip two + * nop and start from the real entry. + */ + return ((void *)(addr + i + 2)); + } + } + return NULL; +} + +_Pragma("GCC diagnostic push") +_Pragma("GCC diagnostic ignored \"-Wunused-function\"") +_Pragma("GCC diagnostic ignored \"-Wunused-variable\"") + +_Pragma("GCC push_options") +_Pragma("GCC optimize(\"-O0\")") +static char str_zeinit[] = "zeInit"; + +static int +is_hook_enabled(void) +{ + return (d_hook_enabled ? 
(1) : (0)); +} + +/* This wrapper function is introduced to avoid compiling issue with Intel-C on Leap 15.5 */ +static int +my_strcmp(const char *s1, const char *s2) +{ + return strcmp(s1, s2); +} + +static void * +get_zeinit_addr(void) +{ + return (void *)zeInit; +} + +__attribute__((aligned(16))) static void +new_dlsym_marker(void) +{ +} + +__asm__( + "new_dlsym_asm:\n" + "nop\n" + "nop\n" + "push %rdi\n" + "push %rsi\n" + + "call is_hook_enabled\n" + "test %eax,%eax\n" + "je org_dlsym\n" + + "mov %rsi, %rdi\n" + "lea str_zeinit(%rip), %rsi\n" + "call my_strcmp\n" + "test %eax,%eax\n" + "jne org_dlsym\n" + + "pop %rsi\n" + "pop %rdi\n" + "call *next_dlsym(%rip)\n" + "mov %rax, next_ze_init(%rip)\n" + + "test %eax,%eax\n" + "jne found\n" + "ret\n" + + "found:\n" + "call get_zeinit_addr\n" + "ret\n" + + "org_dlsym:\n" + "pop %rsi\n" + "pop %rdi\n" + "jmp *next_dlsym(%rip)\n" +); +_Pragma("GCC pop_options") +_Pragma("GCC diagnostic pop") + +#else +/* c code for other architecture. caller info could be wrong inside libc dlsym() when handle is set + * RTLD_NEXT. Assembly version implementation similar to above is needed to fix the issue by using + * jump instead of call instruction. + */ +static void * +new_dlsym_c(void *handle, const char *symbol) +{ + if (!d_hook_enabled) + goto org_dlsym; + printf("Inside my dlsym().\n"); + if (strcmp(symbol, "zeInit") != 0) + goto org_dlsym; + + next_ze_init = next_dlsym(handle, symbol); + if (next_ze_init) + /* dlsym() finished successfully, then intercept zeInit() */ + return zeInit; + else + return next_ze_init; + +org_dlsym: + /* Ideally we need to adjust stack and jump to next_dlsym(). */ + return next_dlsym(handle, symbol); +} +#endif + /** determine whether a path (both relative and absolute) is on DAOS or not. If yes, * returns parent object, item name, full path of parent dir, full absolute path, and * the pointer to struct dfs_mt. @@ -1180,6 +1321,15 @@ query_path(const char *szInput, int *is_target_path, struct dcache_rec **parent, goto out_normal; } + /* Check whether zeInit() is running. If yes, pass to the original + * libc functions. Avoid possible zeInit reentrancy/nested call. + */ + + if (atomic_load_relaxed(&zeInit_count) > 0) { + *is_target_path = 0; + goto out_normal; + } + /* daos_init() is expensive to call. We call it only when necessary. */ /* Check whether daos_init() is running. If yes, pass to the original @@ -2051,6 +2201,7 @@ open_common(int (*real_open)(const char *pathname, int oflags, ...), const char if (!is_target_path) goto org_func; + atomic_fetch_add_relaxed(&num_open, 1); if (oflags & O_CREAT && (oflags & O_DIRECTORY || oflags & O_PATH)) { /* Create a dir is not supported. */ errno = ENOENT; @@ -2078,7 +2229,6 @@ open_common(int (*real_open)(const char *pathname, int oflags, ...), const char } /* Need to create a fake fd and associate with fd_kernel */ - atomic_fetch_add_relaxed(&num_open, 1); dfs_get_mode(dfs_obj, &mode_query); /* regular file */ @@ -2254,7 +2404,6 @@ open_common(int (*real_open)(const char *pathname, int oflags, ...), const char return (idx_dirfd + FD_DIR_BASE); } - atomic_fetch_add_relaxed(&num_open, 1); rc = find_next_available_fd(NULL, &idx_fd); if (rc) @@ -6092,7 +6241,7 @@ ioctl(int fd, unsigned long request, ...) va_list arg; void *param; struct dfuse_user_reply *reply; - int fd_directed; + int fd_directed = fd; va_start(arg, request); param = va_arg(arg, void *); @@ -6118,12 +6267,11 @@ ioctl(int fd, unsigned long request, ...) 
return next_ioctl(fd, request, param); fd_directed = d_get_fd_redirected(fd); - if (fd_directed < FD_FILE_BASE) + if ((fd_directed < FD_FILE_BASE) || (fd_directed >= (FD_DIR_BASE + MAX_OPENED_DIR))) return next_ioctl(fd, request, param); errno = ENOTSUP; - - return -1; + return (-1); } int @@ -6936,6 +7084,18 @@ check_bypasslist(void) return; } +#define SMALL_DIFF (0.0001) +static int +libc_ver_cmp(float ver_a, float ver_b) +{ + if ((ver_a + SMALL_DIFF) < ver_b) + return (-1); + else if (ver_a > (ver_b + SMALL_DIFF)) + return (1); + else + return (0); +} + static __attribute__((constructor)) void init_myhook(void) { @@ -6944,6 +7104,7 @@ init_myhook(void) char *env_no_bypass; int rc; uint64_t eq_count_loc = 0; + float libc_version; /* D_IL_NO_BYPASS is ONLY for testing. It always keeps function interception enabled in * current process and children processes. This is needed to thoroughly test interception @@ -7118,6 +7279,18 @@ init_myhook(void) register_a_hook("libc", "dup3", (void *)new_dup3, (long int *)(&libc_dup3)); register_a_hook("libc", "readlink", (void *)new_readlink, (long int *)(&libc_readlink)); +#if defined(__x86_64__) + new_dlsym = query_new_dlsym_addr(new_dlsym_marker); +#else + new_dlsym = new_dlsym_c; +#endif + D_ASSERT(new_dlsym != NULL); + libc_version = query_libc_version(); + if (libc_ver_cmp(libc_version, 2.34) < 0) + register_a_hook("libdl", "dlsym", (void *)new_dlsym, (long int *)(&next_dlsym)); + else + register_a_hook("libc", "dlsym", (void *)new_dlsym, (long int *)(&next_dlsym)); + init_fd_dup2_list(); if (is_bash && no_dcache_in_bash) @@ -7127,6 +7300,10 @@ init_myhook(void) dcache_rec_timeout = 0; install_hook(); + + /* Check it here to minimize the work in function new_dlsym() written in assembly */ + D_ASSERT(next_dlsym != NULL); + d_hook_enabled = 1; hook_enabled_bak = d_hook_enabled; } diff --git a/src/client/dfuse/pil4dfs/pil4dfs_int.h b/src/client/dfuse/pil4dfs/pil4dfs_int.h index a9c54b55555..0693123b51f 100644 --- a/src/client/dfuse/pil4dfs/pil4dfs_int.h +++ b/src/client/dfuse/pil4dfs/pil4dfs_int.h @@ -30,7 +30,7 @@ /* FD_FILE_BASE - The base number of the file descriptor for a directory. * The fd allocate from this lib is always larger than FD_FILE_BASE. 
*/ -#define FD_DIR_BASE (0x40000000) +#define FD_DIR_BASE (FD_FILE_BASE + MAX_OPENED_FILE) /* structure allocated for a FD for a file */ struct file_obj { diff --git a/src/common/SConscript b/src/common/SConscript index 0eec057198d..9d4c522536e 100644 --- a/src/common/SConscript +++ b/src/common/SConscript @@ -30,7 +30,7 @@ def build_daos_common(denv, client): 'dav/ravl_interval.c', 'dav/recycler.c', 'dav/stats.c', 'dav/tx.c', 'dav/ulog.c', 'dav/util.c', 'dav/wal_tx.c'] ad_mem_files = ['ad_mem.c', 'ad_tx.c'] - common_libs.extend(['pmemobj', 'abt']) + common_libs.extend(['pmemobj', 'abt', 'dav_v2']) benv.AppendUnique(RPATH_FULL=['$PREFIX/lib64/daos_srv']) benv.Append(CPPDEFINES=['-DDAOS_PMEM_BUILD']) benv.Append(OBJPREFIX="v_") @@ -51,6 +51,7 @@ def scons(): """Execute build""" Import('env', 'base_env', 'prereqs') + SConscript('dav_v2/SConscript') env.AppendUnique(LIBPATH=[Dir('.')]) base_env.AppendUnique(LIBPATH=[Dir('.')]) base_env.d_add_build_rpath() diff --git a/src/common/ad_tx.c b/src/common/ad_tx.c index a68ac18eb0e..9ca51db969e 100644 --- a/src/common/ad_tx.c +++ b/src/common/ad_tx.c @@ -1147,8 +1147,8 @@ umo_tx_free(struct umem_instance *umm, umem_off_t umoff) } static umem_off_t -umo_tx_alloc(struct umem_instance *umm, size_t size, uint64_t flags, - unsigned int type_num) +umo_tx_alloc(struct umem_instance *umm, size_t size, uint64_t flags, unsigned int type_num, + unsigned int mbkt_id) { struct ad_tx *tx = tx_get(); struct ad_blob_handle bh = umm2ad_blob_hdl(umm); @@ -1242,7 +1242,8 @@ umo_tx_add_ptr(struct umem_instance *umm, void *ptr, size_t size) } static umem_off_t -umo_reserve(struct umem_instance *umm, void *act, size_t size, unsigned int type_num) +umo_reserve(struct umem_instance *umm, void *act, size_t size, unsigned int type_num, + unsigned int mbkt_id) { struct ad_blob_handle bh = umm2ad_blob_hdl(umm); struct ad_reserv_act *ract = act; @@ -1330,9 +1331,10 @@ umo_atomic_copy(struct umem_instance *umm, void *dest, const void *src, size_t l } static umem_off_t -umo_atomic_alloc(struct umem_instance *umm, size_t size, unsigned int type_num) +umo_atomic_alloc(struct umem_instance *umm, size_t size, unsigned int type_num, + unsigned int mbkt_id) { - return umo_tx_alloc(umm, size, 0, type_num); + return umo_tx_alloc(umm, size, 0, type_num, mbkt_id); } static int diff --git a/src/common/btree.c b/src/common/btree.c index 6bf1bdb2b15..579b921d768 100644 --- a/src/common/btree.c +++ b/src/common/btree.c @@ -945,8 +945,12 @@ btr_root_alloc(struct btr_context *tcx) struct btr_instance *tins = &tcx->tc_tins; struct btr_root *root; - tins->ti_root_off = umem_zalloc(btr_umm(tcx), - sizeof(struct btr_root)); + if (btr_ops(tcx)->to_node_alloc != NULL) + tins->ti_root_off = btr_ops(tcx)->to_node_alloc(&tcx->tc_tins, + sizeof(struct btr_root)); + else + tins->ti_root_off = umem_zalloc(btr_umm(tcx), sizeof(struct btr_root)); + if (UMOFF_IS_NULL(tins->ti_root_off)) return btr_umm(tcx)->umm_nospc_rc; diff --git a/src/common/dav/bucket.c b/src/common/dav/bucket.c index 8df41288a13..55e72b45ce8 100644 --- a/src/common/dav/bucket.c +++ b/src/common/dav/bucket.c @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: BSD-3-Clause */ -/* Copyright 2015-2022, Intel Corporation */ +/* Copyright 2015-2023, Intel Corporation */ /* * bucket.c -- bucket implementation diff --git a/src/common/dav/bucket.h b/src/common/dav/bucket.h index aadc6e714fc..8f5754324f5 100644 --- a/src/common/dav/bucket.h +++ b/src/common/dav/bucket.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: BSD-3-Clause */ -/* Copyright 2015-2021, 
Intel Corporation */ +/* Copyright 2015-2023, Intel Corporation */ /* * bucket.h -- internal definitions for bucket diff --git a/src/common/dav/dav.h b/src/common/dav/dav.h index 72f836c937b..1c1840a9bb3 100644 --- a/src/common/dav/dav.h +++ b/src/common/dav/dav.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: BSD-3-Clause */ -/* Copyright 2015-2022, Intel Corporation */ +/* Copyright 2015-2024, Intel Corporation */ /* * dav_flags.h -- Interfaces exported by DAOS internal Allocator for VOS (DAV) @@ -23,17 +23,23 @@ #define DAV_FLAG_TX_NO_ABORT (((uint64_t)1) << 4) #define DAV_CLASS_ID(id) (((uint64_t)(id)) << 48) -#define DAV_ARENA_ID(id) (((uint64_t)(id)) << 32) +#ifdef DAV_V2_BUILD +#define DAV_EZONE_ID(id) (((uint64_t)(id)) << 16) +#endif /* DAV_V2_BUILD */ #define DAV_XALLOC_CLASS_MASK ((((uint64_t)1 << 16) - 1) << 48) -#define DAV_XALLOC_ARENA_MASK ((((uint64_t)1 << 16) - 1) << 32) +#ifdef DAV_V2_BUILD +#define DAV_XALLOC_EZONE_MASK ((((uint64_t)1 << 32) - 1) << 16) +#else /* DAV_V2_BUILD */ +#define DAV_XALLOC_EZONE_MASK 0 +#endif /* DAV_V2_BUILD */ #define DAV_XALLOC_ZERO DAV_FLAG_ZERO #define DAV_XALLOC_NO_FLUSH DAV_FLAG_NO_FLUSH #define DAV_XALLOC_NO_ABORT DAV_FLAG_TX_NO_ABORT #define DAV_TX_XALLOC_VALID_FLAGS (DAV_XALLOC_ZERO |\ DAV_XALLOC_NO_FLUSH |\ - DAV_XALLOC_ARENA_MASK |\ + DAV_XALLOC_EZONE_MASK |\ DAV_XALLOC_CLASS_MASK |\ DAV_XALLOC_NO_ABORT) diff --git a/src/common/dav/dav_iface.c b/src/common/dav/dav_iface.c index 7d0efa14b4b..4c8448c4b19 100644 --- a/src/common/dav/dav_iface.c +++ b/src/common/dav/dav_iface.c @@ -1,5 +1,5 @@ /** - * (C) Copyright 2015-2023 Intel Corporation. + * (C) Copyright 2015-2024 Intel Corporation. * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -94,7 +94,9 @@ dav_obj_open_internal(int fd, int flags, size_t sz, const char *path, struct ume if (hdl->do_store->stor_priv == NULL) { D_ERROR("meta context not defined. 
WAL commit disabled for %s\n", path); } else { - rc = umem_cache_alloc(store, 0); + num_pages = (sz + UMEM_CACHE_PAGE_SZ - 1) >> UMEM_CACHE_PAGE_SZ_SHIFT; + rc = umem_cache_alloc(store, UMEM_CACHE_PAGE_SZ, num_pages, 0, 0, 0, base, NULL, + NULL, NULL); if (rc != 0) { D_ERROR("Could not allocate page cache: rc=" DF_RC "\n", DP_RC(rc)); err = rc; @@ -104,14 +106,6 @@ dav_obj_open_internal(int fd, int flags, size_t sz, const char *path, struct ume D_STRNDUP(hdl->do_path, path, strlen(path)); - num_pages = (sz + UMEM_CACHE_PAGE_SZ - 1) >> UMEM_CACHE_PAGE_SZ_SHIFT; - rc = umem_cache_map_range(hdl->do_store, 0, base, num_pages); - if (rc != 0) { - D_ERROR("Could not allocate page cache: rc=" DF_RC "\n", DP_RC(rc)); - err = rc; - goto out2; - } - if (flags & DAV_HEAP_INIT) { setup_dav_phdr(hdl); heap_base = (char *)hdl->do_base + hdl->do_phdr->dp_heap_offset; @@ -135,7 +129,7 @@ dav_obj_open_internal(int fd, int flags, size_t sz, const char *path, struct ume D_ASSERT(store != NULL); - rc = store->stor_ops->so_load(store, hdl->do_base); + rc = store->stor_ops->so_load(store, hdl->do_base, 0, store->stor_size); if (rc) { D_ERROR("Failed to read blob to vos file %s, rc = %d\n", path, rc); goto out2; diff --git a/src/common/dav/dav_internal.h b/src/common/dav/dav_internal.h index 0f8ddff5916..ae6150c2748 100644 --- a/src/common/dav/dav_internal.h +++ b/src/common/dav/dav_internal.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: BSD-3-Clause */ -/* Copyright 2015-2022, Intel Corporation */ +/* Copyright 2015-2023, Intel Corporation */ /* * dav_flags.h -- Interfaces exported by DAOS internal Allocator for VOS (DAV) diff --git a/src/common/dav/heap.c b/src/common/dav/heap.c index 4384fe40f8c..ee2feca85a1 100644 --- a/src/common/dav/heap.c +++ b/src/common/dav/heap.c @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: BSD-3-Clause */ -/* Copyright 2015-2022, Intel Corporation */ +/* Copyright 2015-2023, Intel Corporation */ /* * heap.c -- heap implementation diff --git a/src/common/dav/heap.h b/src/common/dav/heap.h index d3e2bba4cdf..2b3f86e2fff 100644 --- a/src/common/dav/heap.h +++ b/src/common/dav/heap.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: BSD-3-Clause */ -/* Copyright 2015-2021, Intel Corporation */ +/* Copyright 2015-2023, Intel Corporation */ /* * heap.h -- internal definitions for heap diff --git a/src/common/dav/obj.h b/src/common/dav/obj.h index 3140235d105..e85c0d317e8 100644 --- a/src/common/dav/obj.h +++ b/src/common/dav/obj.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: BSD-3-Clause */ -/* Copyright 2014-2021, Intel Corporation */ +/* Copyright 2014-2023, Intel Corporation */ /* * obj.h -- internal definitions for obj module diff --git a/src/common/dav/palloc.c b/src/common/dav/palloc.c index a7b5424576f..59b4d1833f0 100644 --- a/src/common/dav/palloc.c +++ b/src/common/dav/palloc.c @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: BSD-3-Clause */ -/* Copyright 2015-2022, Intel Corporation */ +/* Copyright 2015-2023, Intel Corporation */ /* * palloc.c -- implementation of pmalloc POSIX-like API diff --git a/src/common/dav/palloc.h b/src/common/dav/palloc.h index 9c7560f1aaa..047bee47424 100644 --- a/src/common/dav/palloc.h +++ b/src/common/dav/palloc.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: BSD-3-Clause */ -/* Copyright 2015-2020, Intel Corporation */ +/* Copyright 2015-2023, Intel Corporation */ /* * palloc.h -- internal definitions for persistent allocator diff --git a/src/common/dav/recycler.c b/src/common/dav/recycler.c index 07537a44bd4..392610985a5 100644 --- a/src/common/dav/recycler.c 
+++ b/src/common/dav/recycler.c @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: BSD-3-Clause */ -/* Copyright 2016-2022, Intel Corporation */ +/* Copyright 2016-2023, Intel Corporation */ /* * recycler.c -- implementation of run recycler diff --git a/src/common/dav/recycler.h b/src/common/dav/recycler.h index 2d68d8d70fc..e89720f8871 100644 --- a/src/common/dav/recycler.h +++ b/src/common/dav/recycler.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: BSD-3-Clause */ -/* Copyright 2016-2021, Intel Corporation */ +/* Copyright 2016-2023, Intel Corporation */ /* * recycler.h -- internal definitions of run recycler diff --git a/src/common/dav/tx.c b/src/common/dav/tx.c index 45b3daba73c..6d1efe0b8e7 100644 --- a/src/common/dav/tx.c +++ b/src/common/dav/tx.c @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: BSD-3-Clause */ -/* Copyright 2015-2022, Intel Corporation */ +/* Copyright 2015-2023, Intel Corporation */ /* * tx.c -- transactions implementation diff --git a/src/common/dav_v2/README.md b/src/common/dav_v2/README.md new file mode 100644 index 00000000000..008b3202327 --- /dev/null +++ b/src/common/dav_v2/README.md @@ -0,0 +1,6 @@ +# DAOS Allocator for VOS + +The DAV allocator for md_on_ssd phase 2 now supports evictable zones. This introduces change in the +layout of heap and is not compatible with the DAV allocator of phase 1. In order to support both +layouts the new allocator is packaged as a different library and linked to daos_common_pmem +library. diff --git a/src/common/dav_v2/SConscript b/src/common/dav_v2/SConscript new file mode 100644 index 00000000000..fe69cb34697 --- /dev/null +++ b/src/common/dav_v2/SConscript @@ -0,0 +1,31 @@ +"""Build dav_v2 libraries""" + + +SRC = ['alloc_class.c', 'bucket.c', 'container_ravl.c', 'container_seglists.c', 'critnib.c', + 'dav_clogs.c', 'dav_iface.c', 'heap.c', 'memblock.c', 'memops.c', 'meta_io.c', + 'palloc.c', 'ravl.c', 'ravl_interval.c', 'recycler.c', 'stats.c', 'tx.c', 'ulog.c', + 'util.c', 'wal_tx.c'] + + +def scons(): + """Scons function""" + + Import('env', 'base_env') + + env.AppendUnique(LIBPATH=[Dir('.')]) + base_env.AppendUnique(LIBPATH=[Dir('.')]) + base_env.d_add_build_rpath() + env.d_add_build_rpath() + + denv = env.Clone() + + denv.AppendUnique(LIBS=['pthread', 'gurt']) + denv.Append(CPPDEFINES=['-DDAOS_PMEM_BUILD', '-DDAV_V2_BUILD']) + denv.AppendUnique(CFLAGS=['-fvisibility=hidden']) + + dav_v2 = denv.d_library('dav_v2', SRC) + denv.Install('$PREFIX/lib64/', dav_v2) + + +if __name__ == "SCons.Script": + scons() diff --git a/src/common/dav_v2/alloc_class.c b/src/common/dav_v2/alloc_class.c new file mode 100644 index 00000000000..02c968c2d4f --- /dev/null +++ b/src/common/dav_v2/alloc_class.c @@ -0,0 +1,647 @@ +/* SPDX-License-Identifier: BSD-3-Clause */ +/* Copyright 2016-2024, Intel Corporation */ + +/* + * alloc_class.c -- implementation of allocation classes + */ + +#include +#include + +#include "alloc_class.h" +#include "heap_layout.h" +#include "util.h" +#include "out.h" +#include "bucket.h" +#include "critnib.h" + +#define RUN_CLASS_KEY_PACK(map_idx_s, flags_s, size_idx_s)\ +((uint64_t)(map_idx_s) << 32 |\ +(uint64_t)(flags_s) << 16 |\ +(uint64_t)(size_idx_s)) + +/* + * Value used to mark a reserved spot in the bucket array. + */ +#define ACLASS_RESERVED ((void *)0xFFFFFFFFULL) + +/* + * The last size that is handled by runs. + */ +#define MAX_RUN_SIZE (CHUNKSIZE * 10) + +/* + * Maximum number of bytes the allocation class generation algorithm can decide + * to waste in a single run chunk. 
+ */ +#define MAX_RUN_WASTED_BYTES 1024 + +/* + * Allocation categories are used for allocation classes generation. Each one + * defines the biggest handled size (in bytes) and step pct of the generation + * process. The step percentage defines maximum allowed external fragmentation + * for the category. + */ +#define MAX_ALLOC_CATEGORIES 9 + +/* + * The first size (in byes) which is actually used in the allocation + * class generation algorithm. All smaller sizes use the first predefined bucket + * with the smallest run unit size. + */ +#define FIRST_GENERATED_CLASS_SIZE 128 + +/* + * The granularity of the allocation class generation algorithm. + */ +#define ALLOC_BLOCK_SIZE_GEN 64 + +/* + * The first predefined allocation class size + */ +#define MIN_UNIT_SIZE 128 + +static const struct { + size_t size; + float step; +} categories[MAX_ALLOC_CATEGORIES] = { + /* dummy category - the first allocation class is predefined */ + {FIRST_GENERATED_CLASS_SIZE, 0.05f}, + {1024, 0.05f}, + {2048, 0.05f}, + {4096, 0.05f}, + {8192, 0.05f}, + {16384, 0.05f}, + {32768, 0.05f}, + {131072, 0.05f}, + {393216, 0.05f}, +}; + +#define RUN_UNIT_MAX_ALLOC 8U + +/* + * Every allocation has to be a multiple of at least 8 because we need to + * ensure proper alignment of every persistent structure. + */ +#define ALLOC_BLOCK_SIZE 16 + +/* + * Converts size (in bytes) to number of allocation blocks. + */ +#define SIZE_TO_CLASS_MAP_INDEX(_s, _g) (1 + (((_s) - 1) / (_g))) + +/* + * Target number of allocations per run instance. + */ +#define RUN_MIN_NALLOCS 200 + +/* + * Hard limit of chunks per single run. + */ +#define RUN_SIZE_IDX_CAP (16) + +#define ALLOC_CLASS_DEFAULT_FLAGS CHUNK_FLAG_FLEX_BITMAP + +struct alloc_class_collection { + size_t granularity; + + struct alloc_class *aclasses[MAX_ALLOCATION_CLASSES]; + + /* + * The last size (in bytes) that is handled by runs, everything bigger + * uses the default class. + */ + size_t last_run_max_size; + + /* maps allocation classes to allocation sizes, excluding the header! */ + uint8_t *class_map_by_alloc_size; + + /* maps allocation classes to run unit sizes */ + struct critnib *class_map_by_unit_size; + + int fail_on_missing_class; + int autogenerate_on_missing_class; +}; + +/* + * alloc_class_find_first_free_slot -- searches for the + * first available allocation class slot + * + * This function must be thread-safe because allocation classes can be created + * at runtime. + */ +int +alloc_class_find_first_free_slot(struct alloc_class_collection *ac, + uint8_t *slot) +{ + for (int n = 0; n < MAX_ALLOCATION_CLASSES; ++n) { + if (util_bool_compare_and_swap64(&ac->aclasses[n], + NULL, ACLASS_RESERVED)) { + *slot = (uint8_t)n; + return 0; + } + } + + return -1; +} + +/* + * alloc_class_reserve -- reserve the specified class id + */ +int +alloc_class_reserve(struct alloc_class_collection *ac, uint8_t id) +{ + return util_bool_compare_and_swap64(&ac->aclasses[id], + NULL, ACLASS_RESERVED) ? 
0 : -1; +} + +/* + * alloc_class_reservation_clear -- removes the reservation on class id + */ +static void +alloc_class_reservation_clear(struct alloc_class_collection *ac, int id) +{ + int ret = util_bool_compare_and_swap64(&ac->aclasses[id], + ACLASS_RESERVED, NULL); + ASSERT(ret); +} + +/* + * alloc_class_new -- creates a new allocation class + */ +struct alloc_class * +alloc_class_new(int id, struct alloc_class_collection *ac, + enum alloc_class_type type, enum header_type htype, + size_t unit_size, size_t alignment, + uint32_t size_idx) +{ + DAV_DBG("alloc_class_new id:%d\n", + (type == CLASS_HUGE) ? DEFAULT_ALLOC_CLASS_ID : id); + + struct alloc_class *c; + + D_ALLOC_PTR_NZ(c); + + if (c == NULL) + goto error_class_alloc; + + c->unit_size = unit_size; + c->header_type = htype; + c->type = type; + c->flags = (uint16_t) + (header_type_to_flag[c->header_type] | + (alignment ? CHUNK_FLAG_ALIGNED : 0)) | + ALLOC_CLASS_DEFAULT_FLAGS; + + switch (type) { + case CLASS_HUGE: + id = DEFAULT_ALLOC_CLASS_ID; + break; + case CLASS_RUN: + c->rdsc.alignment = alignment; + memblock_run_bitmap(&size_idx, c->flags, unit_size, + alignment, NULL, &c->rdsc.bitmap); + c->rdsc.nallocs = c->rdsc.bitmap.nbits; + c->rdsc.size_idx = size_idx; + + /* these two fields are duplicated from class */ + c->rdsc.unit_size = c->unit_size; + c->rdsc.flags = c->flags; + + uint8_t slot = (uint8_t)id; + + if (id < 0 && alloc_class_find_first_free_slot(ac, + &slot) != 0) + goto error_map_insert; + id = slot; + + size_t map_idx = SIZE_TO_CLASS_MAP_INDEX(c->unit_size, + ac->granularity); + ASSERT(map_idx <= UINT32_MAX); + uint32_t map_idx_s = (uint32_t)map_idx; + uint16_t size_idx_s = (uint16_t)size_idx; + uint16_t flags_s = (uint16_t)c->flags; + uint64_t k = RUN_CLASS_KEY_PACK(map_idx_s, + flags_s, size_idx_s); + + if (critnib_insert(ac->class_map_by_unit_size, + k, c) != 0) { + ERR("unable to register allocation class"); + goto error_map_insert; + } + + break; + default: + ASSERT(0); + } + + c->id = (uint8_t)id; + ac->aclasses[c->id] = c; + return c; + +error_map_insert: + D_FREE(c); +error_class_alloc: + if (id >= 0) + alloc_class_reservation_clear(ac, id); + + D_CRIT("alloc_class_new failed\n"); + return NULL; +} + +/* + * alloc_class_delete -- (internal) deletes an allocation class + */ +void +alloc_class_delete(struct alloc_class_collection *ac, + struct alloc_class *c) +{ + DAV_DBG("alloc_class_delete: %d\n", c->id); + + ac->aclasses[c->id] = NULL; + D_FREE(c); +} + +/* + * alloc_class_find_or_create -- (internal) searches for the + * biggest allocation class for which unit_size is evenly divisible by n. + * If no such class exists, create one. 
+ */ +static struct alloc_class * +alloc_class_find_or_create(struct alloc_class_collection *ac, size_t n) +{ + COMPILE_ERROR_ON(MAX_ALLOCATION_CLASSES > UINT8_MAX); + uint64_t required_size_bytes = n * RUN_MIN_NALLOCS; + uint32_t required_size_idx = 1; + + if (required_size_bytes > RUN_DEFAULT_SIZE) { + required_size_bytes -= RUN_DEFAULT_SIZE; + required_size_idx += + CALC_SIZE_IDX(CHUNKSIZE, required_size_bytes); + if (required_size_idx > RUN_SIZE_IDX_CAP) + required_size_idx = RUN_SIZE_IDX_CAP; + } + + for (int i = MAX_ALLOCATION_CLASSES - 1; i >= 0; --i) { + struct alloc_class *c = ac->aclasses[i]; + + if (c == NULL || c->type == CLASS_HUGE || + c->rdsc.size_idx < required_size_idx) + continue; + + if (n % c->unit_size == 0 && + n / c->unit_size <= RUN_UNIT_MAX_ALLOC) + return c; + } + + /* + * In order to minimize the wasted space at the end of the run the + * run data size must be divisible by the allocation class unit size + * with the smallest possible remainder, preferably 0. + */ + struct run_bitmap b; + size_t runsize_bytes = 0; + + do { + if (runsize_bytes != 0) /* don't increase on first iteration */ + n += ALLOC_BLOCK_SIZE_GEN; + + uint32_t size_idx = required_size_idx; + + memblock_run_bitmap(&size_idx, ALLOC_CLASS_DEFAULT_FLAGS, n, 0, + NULL, &b); + + runsize_bytes = RUN_CONTENT_SIZE_BYTES(size_idx) - b.size; + } while ((runsize_bytes % n) > MAX_RUN_WASTED_BYTES); + + /* + * Now that the desired unit size is found the existing classes need + * to be searched for possible duplicates. If a class that can handle + * the calculated size already exists, simply return that. + */ + for (int i = 1; i < MAX_ALLOCATION_CLASSES; ++i) { + struct alloc_class *c = ac->aclasses[i]; + + if (c == NULL || c->type == CLASS_HUGE) + continue; + if (n / c->unit_size <= RUN_UNIT_MAX_ALLOC && + n % c->unit_size == 0) + return c; + if (c->unit_size == n) + return c; + } + + return alloc_class_new(-1, ac, CLASS_RUN, HEADER_COMPACT, n, 0, + required_size_idx); +} + +/* + * alloc_class_find_min_frag -- searches for an existing allocation + * class that will provide the smallest internal fragmentation for the given + * size. + */ +static struct alloc_class * +alloc_class_find_min_frag(struct alloc_class_collection *ac, size_t n) +{ + struct alloc_class *best_c = NULL; + size_t lowest_waste = SIZE_MAX; + + ASSERTne(n, 0); + + /* + * Start from the largest buckets in order to minimize unit size of + * allocated memory blocks. + */ + for (int i = MAX_ALLOCATION_CLASSES - 1; i >= 0; --i) { + struct alloc_class *c = ac->aclasses[i]; + + /* can't use alloc classes /w no headers by default */ + if (c == NULL || c->header_type == HEADER_NONE) + continue; + + size_t real_size = n + header_type_to_size[c->header_type]; + + size_t units = CALC_SIZE_IDX(c->unit_size, real_size); + + /* can't exceed the maximum allowed run unit max */ + if (c->type == CLASS_RUN && units > RUN_UNIT_MAX_ALLOC) + continue; + + if (c->unit_size * units == real_size) + return c; + + size_t waste = (c->unit_size * units) - real_size; + + /* + * If we assume that the allocation class is only ever going to + * be used with exactly one size, the effective internal + * fragmentation would be increased by the leftover + * memory at the end of the run. 
+ */ + if (c->type == CLASS_RUN) { + size_t wasted_units = c->rdsc.nallocs % units; + size_t wasted_bytes = wasted_units * c->unit_size; + size_t waste_avg_per_unit = wasted_bytes / + c->rdsc.nallocs; + + waste += waste_avg_per_unit; + } + + if (best_c == NULL || lowest_waste > waste) { + best_c = c; + lowest_waste = waste; + } + } + + ASSERTne(best_c, NULL); + return best_c; +} + +/* + * alloc_class_collection_new -- creates a new collection of allocation classes + */ +struct alloc_class_collection * +alloc_class_collection_new() +{ + struct alloc_class_collection *ac; + + D_ALLOC_PTR(ac); + if (ac == NULL) + return NULL; + + ac->granularity = ALLOC_BLOCK_SIZE; + ac->last_run_max_size = MAX_RUN_SIZE; + ac->fail_on_missing_class = 0; + ac->autogenerate_on_missing_class = 1; + + size_t maps_size = (MAX_RUN_SIZE / ac->granularity) + 1; + + D_ALLOC_NZ(ac->class_map_by_alloc_size, maps_size); + if (ac->class_map_by_alloc_size == NULL) + goto error; + ac->class_map_by_unit_size = critnib_new(); + if (ac->class_map_by_unit_size == NULL) + goto error; + + memset(ac->class_map_by_alloc_size, 0xFF, maps_size); + + if (alloc_class_new(-1, ac, CLASS_HUGE, HEADER_COMPACT, + CHUNKSIZE, 0, 1) == NULL) + goto error; + + struct alloc_class *predefined_class = + alloc_class_new(-1, ac, CLASS_RUN, HEADER_COMPACT, + MIN_UNIT_SIZE, 0, 1); + if (predefined_class == NULL) + goto error; + + for (size_t i = 0; i < FIRST_GENERATED_CLASS_SIZE / ac->granularity; + ++i) { + ac->class_map_by_alloc_size[i] = predefined_class->id; + } + + /* + * Based on the defined categories, a set of allocation classes is + * created. The unit size of those classes is depended on the category + * initial size and step. + */ + size_t granularity_mask = ALLOC_BLOCK_SIZE_GEN - 1; + + for (int c = 1; c < MAX_ALLOC_CATEGORIES; ++c) { + size_t n = categories[c - 1].size + ALLOC_BLOCK_SIZE_GEN; + + do { + if (alloc_class_find_or_create(ac, n) == NULL) + goto error; + + float stepf = (float)n * categories[c].step; + size_t stepi = (size_t)stepf; + + stepi = (stepf - (float)stepi < FLT_EPSILON) ? + stepi : stepi + 1; + + n += (stepi + (granularity_mask)) & ~granularity_mask; + } while (n <= categories[c].size); + } + + /* + * Find the largest alloc class and use it's unit size as run allocation + * threshold. + */ + uint8_t largest_aclass_slot; + + for (largest_aclass_slot = MAX_ALLOCATION_CLASSES - 1; + largest_aclass_slot > 0 && + ac->aclasses[largest_aclass_slot] == NULL; + --largest_aclass_slot) { + /* intentional NOP */ + } + + struct alloc_class *c = ac->aclasses[largest_aclass_slot]; + + /* + * The actual run might contain less unit blocks than the theoretical + * unit max variable. This may be the case for very large unit sizes. + */ + size_t real_unit_max = (c->rdsc.nallocs < RUN_UNIT_MAX_ALLOC) ? + c->rdsc.nallocs : RUN_UNIT_MAX_ALLOC; + + size_t theoretical_run_max_size = c->unit_size * real_unit_max; + + ac->last_run_max_size = theoretical_run_max_size <= MAX_RUN_SIZE ? + theoretical_run_max_size : MAX_RUN_SIZE; + +#ifdef DAV_EXTRA_DEBUG + /* + * Verify that each bucket's unit size points back to the bucket by the + * bucket map. This must be true for the default allocation classes, + * otherwise duplicate buckets will be created. 
+ */ + for (size_t i = 0; i < MAX_ALLOCATION_CLASSES; ++i) { + struct alloc_class *cl = ac->aclasses[i]; + + if (cl != NULL && cl->type == CLASS_RUN) { + ASSERTeq(i, cl->id); + ASSERTeq(alloc_class_by_run(ac, cl->unit_size, + cl->flags, cl->rdsc.size_idx), cl); + } + } +#endif + + return ac; + +error: + alloc_class_collection_delete(ac); + + return NULL; +} + +/* + * alloc_class_collection_delete -- deletes the allocation class collection and + * all of the classes within it + */ +void +alloc_class_collection_delete(struct alloc_class_collection *ac) +{ + for (size_t i = 0; i < MAX_ALLOCATION_CLASSES; ++i) { + struct alloc_class *c = ac->aclasses[i]; + + if (c != NULL) + alloc_class_delete(ac, c); + } + + if (ac->class_map_by_unit_size) + critnib_delete(ac->class_map_by_unit_size); + D_FREE(ac->class_map_by_alloc_size); + D_FREE(ac); +} + +/* + * alloc_class_assign_by_size -- (internal) chooses the allocation class that + * best approximates the provided size + */ +static struct alloc_class * +alloc_class_assign_by_size(struct alloc_class_collection *ac, + size_t size) +{ + size_t class_map_index = SIZE_TO_CLASS_MAP_INDEX(size, + ac->granularity); + struct alloc_class *c = alloc_class_find_min_frag(ac, + class_map_index * ac->granularity); + + ASSERTne(c, NULL); + + /* + * We don't lock this array because locking this section here and then + * bailing out if someone else was faster would be still slower than + * just calculating the class and failing to assign the variable. + * We are using a compare and swap so that helgrind/drd don't complain. + */ + util_bool_compare_and_swap64( + &ac->class_map_by_alloc_size[class_map_index], + MAX_ALLOCATION_CLASSES, c->id); + + DAV_DBG("alloc_class_assign_by_size: %zu id:%d", + size, c->id); + + return c; +} + +/* + * alloc_class_by_alloc_size -- returns allocation class that is assigned + * to handle an allocation of the provided size + */ +struct alloc_class * +alloc_class_by_alloc_size(struct alloc_class_collection *ac, size_t size) +{ + if (size < ac->last_run_max_size) { + uint8_t class_id = ac->class_map_by_alloc_size[ + SIZE_TO_CLASS_MAP_INDEX(size, ac->granularity)]; + + if (class_id == MAX_ALLOCATION_CLASSES) { + if (ac->fail_on_missing_class) + return NULL; + else if (ac->autogenerate_on_missing_class) + return alloc_class_assign_by_size(ac, size); + else + return ac->aclasses[DEFAULT_ALLOC_CLASS_ID]; + } + + return ac->aclasses[class_id]; + } else { + return ac->aclasses[DEFAULT_ALLOC_CLASS_ID]; + } +} + +/* + * alloc_class_by_run -- returns the allocation class that has the given + * unit size + */ +struct alloc_class * +alloc_class_by_run(struct alloc_class_collection *ac, + size_t unit_size, uint16_t flags, uint32_t size_idx) +{ + size_t map_idx = SIZE_TO_CLASS_MAP_INDEX(unit_size, ac->granularity); + + ASSERT(map_idx <= UINT32_MAX); + + uint32_t map_idx_s = (uint32_t)map_idx; + + ASSERT(size_idx <= MAX_CHUNK); + + uint16_t size_idx_s = (uint16_t)size_idx; + uint16_t flags_s = (uint16_t)flags; + + return critnib_get(ac->class_map_by_unit_size, + RUN_CLASS_KEY_PACK(map_idx_s, flags_s, size_idx_s)); +} + +/* + * alloc_class_by_id -- returns the allocation class with an id + */ +struct alloc_class * +alloc_class_by_id(struct alloc_class_collection *ac, uint8_t id) +{ + return ac->aclasses[id]; +} + +/* + * alloc_class_calc_size_idx -- calculates how many units does the size require + */ +ssize_t +alloc_class_calc_size_idx(struct alloc_class *c, size_t size) +{ + uint32_t size_idx = CALC_SIZE_IDX(c->unit_size, + size + 
header_type_to_size[c->header_type]); + + if (c->type == CLASS_RUN) { + if (c->header_type == HEADER_NONE && size_idx != 1) + return -1; + else if (size_idx > RUN_UNIT_MAX) + return -1; + else if (size_idx > c->rdsc.nallocs) + return -1; + } + + return size_idx; +} diff --git a/src/common/dav_v2/alloc_class.h b/src/common/dav_v2/alloc_class.h new file mode 100644 index 00000000000..48ffd815e26 --- /dev/null +++ b/src/common/dav_v2/alloc_class.h @@ -0,0 +1,71 @@ +/* SPDX-License-Identifier: BSD-3-Clause */ +/* Copyright 2016-2023, Intel Corporation */ + +/* + * alloc_class.h -- internal definitions for allocation classes + */ + +#ifndef __DAOS_COMMON_ALLOC_CLASS_H +#define __DAOS_COMMON_ALLOC_CLASS_H 1 + +#include +#include +#include +#include "heap_layout.h" +#include "memblock.h" + +#define MAX_ALLOCATION_CLASSES (UINT8_MAX) +#define DEFAULT_ALLOC_CLASS_ID (0) +#define RUN_UNIT_MAX RUN_BITS_PER_VALUE + +struct alloc_class_collection; + +enum alloc_class_type { + CLASS_UNKNOWN, + CLASS_HUGE, + CLASS_RUN, + + MAX_ALLOC_CLASS_TYPES +}; + +struct alloc_class { + uint8_t id; + uint16_t flags; + + size_t unit_size; + + enum header_type header_type; + enum alloc_class_type type; + + /* run-specific data */ + struct run_descriptor rdsc; +}; + +struct alloc_class_collection *alloc_class_collection_new(void); +void alloc_class_collection_delete(struct alloc_class_collection *ac); + +struct alloc_class *alloc_class_by_run( + struct alloc_class_collection *ac, + size_t unit_size, uint16_t flags, uint32_t size_idx); +struct alloc_class *alloc_class_by_alloc_size( + struct alloc_class_collection *ac, size_t size); +struct alloc_class *alloc_class_by_id( + struct alloc_class_collection *ac, uint8_t id); + +int alloc_class_reserve(struct alloc_class_collection *ac, uint8_t id); +int alloc_class_find_first_free_slot(struct alloc_class_collection *ac, + uint8_t *slot); + +ssize_t +alloc_class_calc_size_idx(struct alloc_class *c, size_t size); + +struct alloc_class * +alloc_class_new(int id, struct alloc_class_collection *ac, + enum alloc_class_type type, enum header_type htype, + size_t unit_size, size_t alignment, + uint32_t size_idx); + +void alloc_class_delete(struct alloc_class_collection *ac, + struct alloc_class *c); + +#endif /* __DAOS_COMMON_ALLOC_CLASS_H */ diff --git a/src/common/dav_v2/bucket.c b/src/common/dav_v2/bucket.c new file mode 100644 index 00000000000..ab86f94ee6d --- /dev/null +++ b/src/common/dav_v2/bucket.c @@ -0,0 +1,275 @@ +/* SPDX-License-Identifier: BSD-3-Clause */ +/* Copyright 2015-2024, Intel Corporation */ + +/* + * bucket.c -- bucket implementation + * + * Buckets manage volatile state of the heap. They are the abstraction layer + * between the heap-managed chunks/runs and memory allocations. + * + * Each bucket instance can have a different underlying container that is + * responsible for selecting blocks - which means that whether the allocator + * serves memory blocks in best/first/next -fit manner is decided during bucket + * creation. 
+ */ + +#include "alloc_class.h" +#include "bucket.h" +#include "heap.h" +#include "memblock.h" +#include "out.h" +#include "sys_util.h" +#include "valgrind_internal.h" + +struct bucket { + /* this struct is both the lock guard and the locked state */ + struct bucket_locked *locked; + struct alloc_class *aclass; + struct block_container *container; + const struct block_container_ops *c_ops; + struct memory_block_reserved *active_memory_block; + struct mbrt *mb; + int is_active; +}; + +struct bucket_locked { + struct bucket bucket; + pthread_mutex_t lock; +}; + +/* + * bucket_init -- initializes the bucket's runtime state + */ +static int +bucket_init(struct bucket *b, struct block_container *c, + struct alloc_class *aclass) +{ + b->container = c; + b->c_ops = c->c_ops; + + b->is_active = 0; + b->active_memory_block = NULL; + if (aclass && aclass->type == CLASS_RUN) { + D_ALLOC_PTR(b->active_memory_block); + + if (b->active_memory_block == NULL) + return -1; + } + b->aclass = aclass; + + return 0; +} + +/* + * bucket_fini -- destroys the bucket's runtime state + */ +static void +bucket_fini(struct bucket *b) +{ + if (b->active_memory_block) + D_FREE(b->active_memory_block); + b->c_ops->destroy(b->container); +} + +/* + * bucket_locked_new -- creates a new locked bucket instance + */ +struct bucket_locked * +bucket_locked_new(struct block_container *c, struct alloc_class *aclass, struct mbrt *mb) +{ + ASSERTne(c, NULL); + + struct bucket_locked *b; + + D_ALLOC_PTR_NZ(b); + if (b == NULL) + return NULL; + + if (bucket_init(&b->bucket, c, aclass) != 0) + goto err_bucket_init; + + util_mutex_init(&b->lock); + b->bucket.locked = b; + b->bucket.mb = mb; + + return b; + +err_bucket_init: + D_FREE(b); + return NULL; +} + +/* + * bucket_locked_delete -- cleanups and deallocates locked bucket instance + */ +void +bucket_locked_delete(struct bucket_locked *b) +{ + bucket_fini(&b->bucket); + util_mutex_destroy(&b->lock); + D_FREE(b); +} + +/* + * bucket_acquire -- acquires a usable bucket struct + */ +struct bucket * +bucket_acquire(struct bucket_locked *b) +{ + util_mutex_lock(&b->lock); + return &b->bucket; +} + +/* + * bucket_release -- releases a bucket struct + */ +void +bucket_release(struct bucket *b) +{ + util_mutex_unlock(&b->locked->lock); +} + +/* + * bucket_try_insert_attached_block -- tries to return a previously allocated + * memory block back to the original bucket + */ +void +bucket_try_insert_attached_block(struct bucket *b, const struct memory_block *m) +{ + struct memory_block *active = &b->active_memory_block->m; + + if (b->is_active && + m->chunk_id == active->chunk_id && + m->zone_id == active->zone_id) { + bucket_insert_block(b, m); + } +} + +/* + * bucket_alloc_class -- returns the bucket's alloc class + */ +struct alloc_class * +bucket_alloc_class(struct bucket *b) +{ + return b->aclass; +} + +/* + * bucket_insert_block -- inserts a block into the bucket + */ +int +bucket_insert_block(struct bucket *b, const struct memory_block *m) +{ +#if VG_MEMCHECK_ENABLED || VG_HELGRIND_ENABLED || VG_DRD_ENABLED + if (On_memcheck || On_drd_or_hg) { + size_t size = m->m_ops->get_real_size(m); + void *data = m->m_ops->get_real_data(m); + + VALGRIND_DO_MAKE_MEM_NOACCESS(data, size); + VALGRIND_ANNOTATE_NEW_MEMORY(data, size); + } +#endif + return b->c_ops->insert(b->container, m); +} + +/* + * bucket_remove_block -- removes an exact block from the bucket + */ +int +bucket_remove_block(struct bucket *b, const struct memory_block *m) +{ + return b->c_ops->get_rm_exact(b->container, m); +} + +/* 
+ * bucket_alloc_block -- allocates a block from the bucket + */ +int +bucket_alloc_block(struct bucket *b, struct memory_block *m_out) +{ + return b->c_ops->get_rm_bestfit(b->container, m_out); +} + +/* + * bucket_memblock_insert_block -- (internal) bucket insert wrapper + * for callbacks + */ +static int +bucket_memblock_insert_block(const struct memory_block *m, void *b) +{ + return bucket_insert_block(b, m); +} + +/* + * bucket_attach_run - attaches a run to a bucket, making it active + */ +int +bucket_attach_run(struct bucket *b, const struct memory_block *m) +{ + pthread_mutex_t *lock = m->m_ops->get_lock(m); + + util_mutex_lock(lock); + + int ret = m->m_ops->iterate_free(m, bucket_memblock_insert_block, b); + + util_mutex_unlock(lock); + + if (ret == 0) { + b->active_memory_block->m = *m; + b->active_memory_block->bucket = b->locked; + b->is_active = 1; + util_fetch_and_add64(&b->active_memory_block->nresv, 1); + } else { + b->c_ops->rm_all(b->container); + } + return 0; +} + +/* + * bucket_detach_run - gets rid of the active block in the bucket + */ +int +bucket_detach_run(struct bucket *b, struct memory_block *m_out, int *empty) +{ + *empty = 0; + + struct memory_block_reserved **active = &b->active_memory_block; + + if (b->is_active) { + b->c_ops->rm_all(b->container); + if (util_fetch_and_sub64(&(*active)->nresv, 1) == 1) { + *m_out = (*active)->m; + *empty = 1; + + VALGRIND_ANNOTATE_HAPPENS_AFTER(&(*active)->nresv); + (*active)->m = MEMORY_BLOCK_NONE; + } else { + VALGRIND_ANNOTATE_HAPPENS_BEFORE(&(*active)->nresv); + *active = NULL; + } + b->is_active = 0; + } + + if (*active == NULL) { + D_ALLOC_PTR(*active); + if (*active == NULL) + return -1; + } + + return 0; +} + +/* + * bucket_active_block -- returns the bucket active block + */ +struct memory_block_reserved * +bucket_active_block(struct bucket *b) +{ + return b->is_active ? b->active_memory_block : NULL; +} + +struct mbrt * +bucket_get_mbrt(struct bucket *b) +{ + return b->mb; +} diff --git a/src/common/dav_v2/bucket.h b/src/common/dav_v2/bucket.h new file mode 100644 index 00000000000..af2d5be6410 --- /dev/null +++ b/src/common/dav_v2/bucket.h @@ -0,0 +1,47 @@ +/* SPDX-License-Identifier: BSD-3-Clause */ +/* Copyright 2015-2024, Intel Corporation */ + +/* + * bucket.h -- internal definitions for bucket + */ + +#ifndef __DAOS_COMMON_BUCKET_H +#define __DAOS_COMMON_BUCKET_H 1 + +#include +#include + +#include "alloc_class.h" +#include "container.h" +#include "memblock.h" + +#define CALC_SIZE_IDX(_unit_size, _size)\ + ((_size) == 0 ? 
0 : (uint32_t)((((_size)-1) / (_unit_size)) + 1)) + +struct bucket_locked; +struct bucket; + +struct bucket_locked * +bucket_locked_new(struct block_container *c, struct alloc_class *aclass, struct mbrt *mb); + +struct bucket *bucket_acquire(struct bucket_locked *b); +void bucket_release(struct bucket *b); + +struct alloc_class *bucket_alloc_class(struct bucket *b); +int bucket_insert_block(struct bucket *b, const struct memory_block *m); +void bucket_try_insert_attached_block(struct bucket *b, + const struct memory_block *m); +int bucket_remove_block(struct bucket *b, const struct memory_block *m); +int bucket_alloc_block(struct bucket *b, struct memory_block *m_out); + +int bucket_attach_run(struct bucket *b, const struct memory_block *m); +int bucket_detach_run(struct bucket *b, + struct memory_block *m_out, int *empty); + +struct memory_block_reserved *bucket_active_block(struct bucket *b); + +void bucket_locked_delete(struct bucket_locked *b); +struct mbrt * +bucket_get_mbrt(struct bucket *b); + +#endif /* __DAOS_COMMON_BUCKET_H */ diff --git a/src/common/dav_v2/container.h b/src/common/dav_v2/container.h new file mode 100644 index 00000000000..5d2c247e248 --- /dev/null +++ b/src/common/dav_v2/container.h @@ -0,0 +1,44 @@ +/* SPDX-License-Identifier: BSD-3-Clause */ +/* Copyright 2015-2023, Intel Corporation */ + +/* + * container.h -- internal definitions for block containers + */ + +#ifndef __DAOS_COMMON_CONTAINER_H +#define __DAOS_COMMON_CONTAINER_H 1 + +#include "memblock.h" + +struct block_container { + const struct block_container_ops *c_ops; + struct palloc_heap *heap; +}; + +struct block_container_ops { + /* inserts a new memory block into the container */ + int (*insert)(struct block_container *c, const struct memory_block *m); + + /* removes exact match memory block */ + int (*get_rm_exact)(struct block_container *c, + const struct memory_block *m); + + /* removes and returns the best-fit memory block for size */ + int (*get_rm_bestfit)(struct block_container *c, + struct memory_block *m); + + /* checks whether the container is empty */ + int (*is_empty)(struct block_container *c); + + /* removes all elements from the container */ + void (*rm_all)(struct block_container *c); + + /* deletes the container */ + void (*destroy)(struct block_container *c); +}; + +struct palloc_heap; +struct block_container *container_new_ravl(struct palloc_heap *heap); +struct block_container *container_new_seglists(struct palloc_heap *heap); + +#endif /* __DAOS_COMMON_CONTAINER_H */ diff --git a/src/common/dav_v2/container_ravl.c b/src/common/dav_v2/container_ravl.c new file mode 100644 index 00000000000..af542c3c744 --- /dev/null +++ b/src/common/dav_v2/container_ravl.c @@ -0,0 +1,199 @@ +/* SPDX-License-Identifier: BSD-3-Clause */ +/* Copyright 2018-2023, Intel Corporation */ + +/* + * container_ravl.c -- implementation of ravl-based block container + */ + +#include "container.h" +#include "ravl.h" +#include "out.h" +#include "sys_util.h" + +struct block_container_ravl { + struct block_container super; + struct memory_block m; + struct ravl *tree; +}; + +/* + * container_compare_memblocks -- (internal) compares two memory blocks + */ +static int +container_compare_memblocks(const void *lhs, const void *rhs) +{ + const struct memory_block *l = lhs; + const struct memory_block *r = rhs; + + int64_t diff = (int64_t)l->size_idx - (int64_t)r->size_idx; + + if (diff != 0) + return diff > 0 ? 1 : -1; + + diff = (int64_t)l->zone_id - (int64_t)r->zone_id; + if (diff != 0) + return diff > 0 ? 
1 : -1; + + diff = (int64_t)l->chunk_id - (int64_t)r->chunk_id; + if (diff != 0) + return diff > 0 ? 1 : -1; + + diff = (int64_t)l->block_off - (int64_t)r->block_off; + if (diff != 0) + return diff > 0 ? 1 : -1; + + return 0; +} + +/* + * container_ravl_insert_block -- (internal) inserts a new memory block + * into the container + */ +static int +container_ravl_insert_block(struct block_container *bc, + const struct memory_block *m) +{ + struct block_container_ravl *c = + (struct block_container_ravl *)bc; + + ASSERT(m->chunk_id < MAX_CHUNK); + ASSERT(m->zone_id < UINT32_MAX); + + c->m = *m; + + return ravl_emplace_copy(c->tree, m); +} + +/* + * container_ravl_get_rm_block_bestfit -- (internal) removes and returns the + * best-fit memory block for size + */ +static int +container_ravl_get_rm_block_bestfit(struct block_container *bc, + struct memory_block *m) +{ + struct block_container_ravl *c = + (struct block_container_ravl *)bc; + + struct ravl_node *n = ravl_find(c->tree, m, + RAVL_PREDICATE_GREATER_EQUAL); + + if (n == NULL) + return ENOMEM; + + struct memory_block *e = ravl_data(n); + *m = c->m; + m->zone_id = e->zone_id; + m->chunk_id = e->chunk_id; + m->size_idx = e->size_idx; + m->block_off = e->block_off; + /* Rest of the fields in e should not be accessed. */ + + ravl_remove(c->tree, n); + + return 0; +} + +/* + * container_ravl_get_rm_block_exact -- + * (internal) removes exact match memory block + */ +static int +container_ravl_get_rm_block_exact(struct block_container *bc, + const struct memory_block *m) +{ + struct block_container_ravl *c = + (struct block_container_ravl *)bc; + + struct ravl_node *n = ravl_find(c->tree, m, RAVL_PREDICATE_EQUAL); + + if (n == NULL) + return ENOMEM; + + ravl_remove(c->tree, n); + + return 0; +} + +/* + * container_ravl_is_empty -- (internal) checks whether the container is empty + */ +static int +container_ravl_is_empty(struct block_container *bc) +{ + struct block_container_ravl *c = + (struct block_container_ravl *)bc; + + return ravl_empty(c->tree); +} + +/* + * container_ravl_rm_all -- (internal) removes all elements from the tree + */ +static void +container_ravl_rm_all(struct block_container *bc) +{ + struct block_container_ravl *c = + (struct block_container_ravl *)bc; + + ravl_clear(c->tree); +} + +/* + * container_ravl_delete -- (internal) deletes the container + */ +static void +container_ravl_destroy(struct block_container *bc) +{ + struct block_container_ravl *c = + (struct block_container_ravl *)bc; + + ravl_delete(c->tree); + + D_FREE(bc); +} + +/* + * Tree-based block container used to provide best-fit functionality to the + * bucket. The time complexity for this particular container is O(k) where k is + * the length of the key. + * + * The get methods also guarantee that the block with lowest possible address + * that best matches the requirements is provided. 
+ */ +static const struct block_container_ops container_ravl_ops = { + .insert = container_ravl_insert_block, + .get_rm_exact = container_ravl_get_rm_block_exact, + .get_rm_bestfit = container_ravl_get_rm_block_bestfit, + .is_empty = container_ravl_is_empty, + .rm_all = container_ravl_rm_all, + .destroy = container_ravl_destroy, +}; + +/* + * container_new_ravl -- allocates and initializes a ravl container + */ +struct block_container * +container_new_ravl(struct palloc_heap *heap) +{ + struct block_container_ravl *bc; + + D_ALLOC_PTR_NZ(bc); + if (bc == NULL) + goto error_container_malloc; + + bc->super.heap = heap; + bc->super.c_ops = &container_ravl_ops; + bc->tree = + ravl_new_sized(container_compare_memblocks, offsetof(struct memory_block, m_ops)); + if (bc->tree == NULL) + goto error_ravl_new; + + return (struct block_container *)&bc->super; + +error_ravl_new: + D_FREE(bc); + +error_container_malloc: + return NULL; +} diff --git a/src/common/dav_v2/container_seglists.c b/src/common/dav_v2/container_seglists.c new file mode 100644 index 00000000000..3ec18df0b3f --- /dev/null +++ b/src/common/dav_v2/container_seglists.c @@ -0,0 +1,175 @@ +/* SPDX-License-Identifier: BSD-3-Clause */ +/* Copyright 2015-2024, Intel Corporation */ + +/* + * container_seglists.c -- implementation of segregated lists block container + * + * This container is constructed from N (up to 64) intrusive lists and a + * single 8 byte bitmap that stores the information whether a given list is + * empty or not. + */ + +#include "container.h" +#include "out.h" +#include "sys_util.h" +#include "util.h" +#include "valgrind_internal.h" +#include "vecq.h" + +#define SEGLIST_BLOCK_LISTS 64U + +struct block_container_seglists { + struct block_container super; + struct memory_block m; + + VECQ(, uint32_t) blocks[SEGLIST_BLOCK_LISTS]; + uint64_t nonempty_lists; +}; + +/* + * container_seglists_insert_block -- (internal) inserts a new memory block + * into the container + */ +static int +container_seglists_insert_block(struct block_container *bc, + const struct memory_block *m) +{ + ASSERT(m->chunk_id < MAX_CHUNK); + ASSERT(m->zone_id < UINT32_MAX); + ASSERTne(m->size_idx, 0); + + struct block_container_seglists *c = + (struct block_container_seglists *)bc; + + if (c->nonempty_lists == 0) + c->m = *m; + + ASSERT(m->size_idx <= SEGLIST_BLOCK_LISTS); + ASSERT(m->chunk_id == c->m.chunk_id); + ASSERT(m->zone_id == c->m.zone_id); + + if (VECQ_ENQUEUE(&c->blocks[m->size_idx - 1], m->block_off) != 0) + return -1; + + /* marks the list as nonempty */ + c->nonempty_lists |= 1ULL << (m->size_idx - 1); + + return 0; +} + +/* + * container_seglists_get_rm_block_bestfit -- (internal) removes and returns the + * best-fit memory block for size + */ +static int +container_seglists_get_rm_block_bestfit(struct block_container *bc, + struct memory_block *m) +{ + struct block_container_seglists *c = + (struct block_container_seglists *)bc; + + ASSERT(m->size_idx <= SEGLIST_BLOCK_LISTS); + uint32_t i = 0; + + /* applicable lists */ + uint64_t size_mask = (1ULL << (m->size_idx - 1)) - 1; + uint64_t v = c->nonempty_lists & ~size_mask; + + if (v == 0) + return ENOMEM; + + /* finds the list that serves the smallest applicable size */ + i = util_lssb_index64(v); + + uint32_t block_offset = VECQ_DEQUEUE(&c->blocks[i]); + + if (VECQ_SIZE(&c->blocks[i]) == 0) /* marks the list as empty */ + c->nonempty_lists &= ~(1ULL << (i)); + + *m = c->m; + m->block_off = block_offset; + m->size_idx = i + 1; + + return 0; +} + +/* + * container_seglists_is_empty -- 
(internal) checks whether the container is + * empty + */ +static int +container_seglists_is_empty(struct block_container *bc) +{ + struct block_container_seglists *c = + (struct block_container_seglists *)bc; + + return c->nonempty_lists == 0; +} + +/* + * container_seglists_rm_all -- (internal) removes all elements from the tree + */ +static void +container_seglists_rm_all(struct block_container *bc) +{ + struct block_container_seglists *c = + (struct block_container_seglists *)bc; + + for (unsigned i = 0; i < SEGLIST_BLOCK_LISTS; ++i) + VECQ_CLEAR(&c->blocks[i]); + + c->nonempty_lists = 0; +} + +/* + * container_seglists_delete -- (internal) deletes the container + */ +static void +container_seglists_destroy(struct block_container *bc) +{ + struct block_container_seglists *c = + (struct block_container_seglists *)bc; + + for (unsigned i = 0; i < SEGLIST_BLOCK_LISTS; ++i) + VECQ_DELETE(&c->blocks[i]); + + D_FREE(c); +} + +/* + * This container does not support retrieval of exact memory blocks, but other + * than provides best-fit in O(1) time for unit sizes that do not exceed 64. + */ +static const struct block_container_ops container_seglists_ops = { + .insert = container_seglists_insert_block, + .get_rm_exact = NULL, + .get_rm_bestfit = container_seglists_get_rm_block_bestfit, + .is_empty = container_seglists_is_empty, + .rm_all = container_seglists_rm_all, + .destroy = container_seglists_destroy, +}; + +/* + * container_new_seglists -- allocates and initializes a seglists container + */ +struct block_container * +container_new_seglists(struct palloc_heap *heap) +{ + struct block_container_seglists *bc; + + D_ALLOC_PTR_NZ(bc); + if (bc == NULL) + goto error_container_malloc; + + bc->super.heap = heap; + bc->super.c_ops = &container_seglists_ops; + + for (unsigned i = 0; i < SEGLIST_BLOCK_LISTS; ++i) + VECQ_INIT(&bc->blocks[i]); + bc->nonempty_lists = 0; + + return (struct block_container *)&bc->super; + +error_container_malloc: + return NULL; +} diff --git a/src/common/dav_v2/critnib.c b/src/common/dav_v2/critnib.c new file mode 100644 index 00000000000..304d568ca8e --- /dev/null +++ b/src/common/dav_v2/critnib.c @@ -0,0 +1,678 @@ +/* SPDX-License-Identifier: BSD-3-Clause */ +/* Copyright 2018-2023, Intel Corporation */ + +/* + * critnib.c -- implementation of critnib tree + * + * It offers identity lookup (like a hashmap) and <= lookup (like a search + * tree). Unlike some hashing algorithms (cuckoo hash, perfect hashing) the + * complexity isn't constant, but for data sizes we expect it's several + * times as fast as cuckoo, and has no "stop the world" cases that would + * cause latency (ie, better worst case behavior). + */ + +/* + * STRUCTURE DESCRIPTION + * + * Critnib is a hybrid between a radix tree and DJ Bernstein's critbit: + * it skips nodes for uninteresting radix nodes (ie, ones that would have + * exactly one child), this requires adding to every node a field that + * describes the slice (4-bit in our case) that this radix level is for. + * + * This implementation also stores each node's path (ie, bits that are + * common to every key in that subtree) -- this doesn't help with lookups + * at all (unused in == match, could be reconstructed at no cost in <= + * after first dive) but simplifies inserts and removes. If we ever want + * that piece of memory it's easy to trim it down. 
+ */ + +/* + * CONCURRENCY ISSUES + * + * Reads are completely lock-free sync-free, but only almost wait-free: + * if for some reason a read thread gets pathologically stalled, it will + * notice the data being stale and restart the work. In usual cases, + * the structure having been modified does _not_ cause a restart. + * + * Writes could be easily made lock-free as well (with only a cmpxchg + * sync), but this leads to problems with removes. A possible solution + * would be doing removes by overwriting by NULL w/o freeing -- yet this + * would lead to the structure growing without bounds. Complex per-node + * locks would increase concurrency but they slow down individual writes + * enough that in practice a simple global write lock works faster. + * + * Removes are the only operation that can break reads. The structure + * can do local RCU well -- the problem being knowing when it's safe to + * free. Any synchronization with reads would kill their speed, thus + * instead we have a remove count. The grace period is DELETED_LIFE, + * after which any read will notice staleness and restart its work. + */ +#include +#include + +#include "critnib.h" +#include "out.h" +#include "sys_util.h" +#include "valgrind_internal.h" +#include "util.h" + +/* + * A node that has been deleted is left untouched for this many delete + * cycles. Reads have guaranteed correctness if they took no longer than + * DELETED_LIFE concurrent deletes, otherwise they notice something is + * wrong and restart. The memory of deleted nodes is never freed to + * malloc nor their pointers lead anywhere wrong, thus a stale read will + * (temporarily) get a wrong answer but won't crash. + * + * There's no need to count writes as they never interfere with reads. + * + * Allowing stale reads (of arbitrarily old writes or of deletes less than + * DELETED_LIFE old) might sound counterintuitive, but it doesn't affect + * semantics in any way: the thread could have been stalled just after + * returning from our code. Thus, the guarantee is: the result of get() or + * find_le() is a value that was current at any point between the call + * start and end. + */ +#define DELETED_LIFE 16 + +#define SLICE 4 +#define NIB ((1ULL << SLICE) - 1) +#define SLNODES (1 << SLICE) + +typedef unsigned char sh_t; + +struct critnib_node { + /* + * path is the part of a tree that's already traversed (be it through + * explicit nodes or collapsed links) -- ie, any subtree below has all + * those bits set to this value. + * + * nib is a 4-bit slice that's an index into the node's children. + * + * shift is the length (in bits) of the part of the key below this node. 
+ * + * nib + * |XXXXXXXXXX|?|*****| + * path ^ + * +-----+ + * shift + */ + struct critnib_node *child[SLNODES]; + uint64_t path; + sh_t shift; +}; + +struct critnib_leaf { + uint64_t key; + void *value; +}; + +struct critnib { + struct critnib_node *root; + + /* pool of freed nodes: singly linked list, next at child[0] */ + struct critnib_node *deleted_node; + struct critnib_leaf *deleted_leaf; + + /* nodes removed but not yet eligible for reuse */ + struct critnib_node *pending_del_nodes[DELETED_LIFE]; + struct critnib_leaf *pending_del_leaves[DELETED_LIFE]; + + uint64_t remove_count; + + pthread_mutex_t mutex; /* writes/removes */ +}; + +/* + * atomic load + */ +static void +load(void *src, void *dst) +{ + util_atomic_load_explicit64((uint64_t *)src, (uint64_t *)dst, + memory_order_acquire); +} + +/* + * atomic store + */ +static void +store(void *dst, void *src) +{ + util_atomic_store_explicit64((uint64_t *)dst, (uint64_t)src, + memory_order_release); +} + +/* + * internal: is_leaf -- check tagged pointer for leafness + */ +static inline bool +is_leaf(struct critnib_node *n) +{ + return (uint64_t)n & 1; +} + +/* + * internal: to_leaf -- untag a leaf pointer + */ +static inline struct critnib_leaf * +to_leaf(struct critnib_node *n) +{ + return (void *)((uint64_t)n & ~1ULL); +} + +/* + * internal: path_mask -- return bit mask of a path above a subtree [shift] + * bits tall + */ +static inline uint64_t +path_mask(sh_t shift) +{ + return ~NIB << shift; +} + +/* + * internal: slice_index -- return index of child at the given nib + */ +static inline unsigned +slice_index(uint64_t key, sh_t shift) +{ + return (unsigned)((key >> shift) & NIB); +} + +/* + * critnib_new -- allocates a new critnib structure + */ +struct critnib * +critnib_new(void) +{ + struct critnib *c; + + D_ALLOC_PTR(c); + if (!c) + return NULL; + + util_mutex_init(&c->mutex); + + VALGRIND_HG_DRD_DISABLE_CHECKING(&c->root, sizeof(c->root)); + VALGRIND_HG_DRD_DISABLE_CHECKING(&c->remove_count, + sizeof(c->remove_count)); + + return c; +} + +/* + * internal: delete_node -- recursively free (to malloc) a subtree + */ +static void +delete_node(struct critnib_node *__restrict n) +{ + if (!is_leaf(n)) { + for (int i = 0; i < SLNODES; i++) { + if (n->child[i]) + delete_node(n->child[i]); + } + + D_FREE(n); + } else { + void *ptr; + + ptr = (void *)to_leaf(n); + D_FREE(ptr); + } +} + +/* + * critnib_delete -- destroy and free a critnib struct + */ +void +critnib_delete(struct critnib *c) +{ + if (c->root) + delete_node(c->root); + + util_mutex_destroy(&c->mutex); + + for (struct critnib_node *m = c->deleted_node; m; ) { + struct critnib_node *mm = m->child[0]; + + D_FREE(m); + m = mm; + } + + for (struct critnib_leaf *k = c->deleted_leaf; k; ) { + struct critnib_leaf *kk = k->value; + + D_FREE(k); + k = kk; + } + + for (int i = 0; i < DELETED_LIFE; i++) { + D_FREE(c->pending_del_nodes[i]); + D_FREE(c->pending_del_leaves[i]); + } + + D_FREE(c); +} + +/* + * internal: free_node -- free (to internal pool, not malloc) a node. + * + * We cannot free them to malloc as a stalled reader thread may still walk + * through such nodes; it will notice the result being bogus but only after + * completing the walk, thus we need to ensure any freed nodes still point + * to within the critnib structure. 
+ */ +static void +free_node(struct critnib *__restrict c, struct critnib_node *__restrict n) +{ + if (!n) + return; + + ASSERT(!is_leaf(n)); + n->child[0] = c->deleted_node; + c->deleted_node = n; +} + +/* + * internal: alloc_node -- allocate a node from our pool or from malloc + */ +static struct critnib_node * +alloc_node(struct critnib *__restrict c) +{ + if (!c->deleted_node) { + struct critnib_node *n; + + D_ALLOC_PTR_NZ(n); + if (n == NULL) + D_CRIT("Malloc!\n"); + + return n; + } + + struct critnib_node *n = c->deleted_node; + + c->deleted_node = n->child[0]; + VALGRIND_ANNOTATE_NEW_MEMORY(n, sizeof(*n)); + + return n; +} + +/* + * internal: free_leaf -- free (to internal pool, not malloc) a leaf. + * + * See free_node(). + */ +static void +free_leaf(struct critnib *__restrict c, struct critnib_leaf *__restrict k) +{ + if (!k) + return; + + k->value = c->deleted_leaf; + c->deleted_leaf = k; +} + +/* + * internal: alloc_leaf -- allocate a leaf from our pool or from malloc + */ +static struct critnib_leaf * +alloc_leaf(struct critnib *__restrict c) +{ + if (!c->deleted_leaf) { + struct critnib_leaf *k; + + D_ALLOC_PTR_NZ(k); + if (k == NULL) + D_CRIT("Malloc!\n"); + + return k; + } + + struct critnib_leaf *k = c->deleted_leaf; + + c->deleted_leaf = k->value; + VALGRIND_ANNOTATE_NEW_MEMORY(k, sizeof(*k)); + + return k; +} + +/* + * critnib_insert -- write a key:value pair to the critnib structure + * + * Returns: + * - 0 on success + * - EEXIST if such a key already exists + * - ENOMEM if we're out of memory + * + * Takes a global write lock but doesn't stall any readers. + */ +int +critnib_insert(struct critnib *c, uint64_t key, void *value) +{ + util_mutex_lock(&c->mutex); + + struct critnib_leaf *k = alloc_leaf(c); + + if (!k) { + util_mutex_unlock(&c->mutex); + + return ENOMEM; + } + + VALGRIND_HG_DRD_DISABLE_CHECKING(k, sizeof(struct critnib_leaf)); + + k->key = key; + k->value = value; + + struct critnib_node *kn = (void *)((uint64_t)k | 1); + + struct critnib_node *n = c->root; + + if (!n) { + c->root = kn; + + util_mutex_unlock(&c->mutex); + + return 0; + } + + struct critnib_node **parent = &c->root; + struct critnib_node *prev = c->root; + + while (n && !is_leaf(n) && (key & path_mask(n->shift)) == n->path) { + prev = n; + parent = &n->child[slice_index(key, n->shift)]; + n = *parent; + } + + if (!n) { + n = prev; + store(&n->child[slice_index(key, n->shift)], kn); + + util_mutex_unlock(&c->mutex); + + return 0; + } + + uint64_t path = is_leaf(n) ? to_leaf(n)->key : n->path; + /* Find where the path differs from our key. */ + uint64_t at = path ^ key; + + if (!at) { + ASSERT(is_leaf(n)); + free_leaf(c, to_leaf(kn)); + /* fail instead of replacing */ + + util_mutex_unlock(&c->mutex); + + return EEXIST; + } + + /* and convert that to an index. 
*/ + sh_t sh = util_mssb_index64(at) & (sh_t)~(SLICE - 1); + + struct critnib_node *m = alloc_node(c); + + if (!m) { + free_leaf(c, to_leaf(kn)); + + util_mutex_unlock(&c->mutex); + + return ENOMEM; + } + VALGRIND_HG_DRD_DISABLE_CHECKING(m, sizeof(struct critnib_node)); + + for (int i = 0; i < SLNODES; i++) + m->child[i] = NULL; + + m->child[slice_index(key, sh)] = kn; + m->child[slice_index(path, sh)] = n; + m->shift = sh; + m->path = key & path_mask(sh); + store(parent, m); + + util_mutex_unlock(&c->mutex); + + return 0; +} + +/* + * critnib_remove -- delete a key from the critnib structure, return its value + */ +void * +critnib_remove(struct critnib *c, uint64_t key) +{ + struct critnib_leaf *k; + void *value = NULL; + + util_mutex_lock(&c->mutex); + + struct critnib_node *n = c->root; + + if (!n) + goto not_found; + + uint64_t del = util_fetch_and_add64(&c->remove_count, 1) % DELETED_LIFE; + + free_node(c, c->pending_del_nodes[del]); + free_leaf(c, c->pending_del_leaves[del]); + c->pending_del_nodes[del] = NULL; + c->pending_del_leaves[del] = NULL; + + if (is_leaf(n)) { + k = to_leaf(n); + if (k->key == key) { + store(&c->root, NULL); + goto del_leaf; + } + + goto not_found; + } + /* + * n and k are a parent:child pair (after the first iteration); k is the + * leaf that holds the key we're deleting. + */ + struct critnib_node **k_parent = &c->root; + struct critnib_node **n_parent = &c->root; + struct critnib_node *kn = n; + + while (!is_leaf(kn)) { + n_parent = k_parent; + n = kn; + k_parent = &kn->child[slice_index(key, kn->shift)]; + kn = *k_parent; + + if (!kn) + goto not_found; + } + + k = to_leaf(kn); + if (k->key != key) + goto not_found; + + store(&n->child[slice_index(key, n->shift)], NULL); + + /* Remove the node if there's only one remaining child. */ + int ochild = -1; + + for (int i = 0; i < SLNODES; i++) { + if (n->child[i]) { + if (ochild != -1) + goto del_leaf; + + ochild = i; + } + } + + ASSERTne(ochild, -1); + + store(n_parent, n->child[ochild]); + c->pending_del_nodes[del] = n; + +del_leaf: + value = k->value; + c->pending_del_leaves[del] = k; + +not_found: + util_mutex_unlock(&c->mutex); + return value; +} + +/* + * critnib_get -- query for a key ("==" match), returns value or NULL + * + * Doesn't need a lock but if many deletes happened while our thread was + * somehow stalled the query is restarted (as freed nodes remain unused only + * for a grace period). + * + * Counterintuitively, it's pointless to return the most current answer, + * we need only one that was valid at any point after the call started. + */ +void * +critnib_get(struct critnib *c, uint64_t key) +{ + uint64_t wrs1, wrs2; + void *res; + + do { + struct critnib_node *n; + + load(&c->remove_count, &wrs1); + load(&c->root, &n); + + /* + * critbit algorithm: dive into the tree, looking at nothing but + * each node's critical bit^H^H^Hnibble. This means we risk + * going wrong way if our path is missing, but that's ok... + */ + while (n && !is_leaf(n)) + load(&n->child[slice_index(key, n->shift)], &n); + + /* ... as we check it at the end. */ + struct critnib_leaf *k = to_leaf(n); + + res = (n && k->key == key) ? 
k->value : NULL; + load(&c->remove_count, &wrs2); + } while (wrs1 + DELETED_LIFE <= wrs2); + + return res; +} + +/* + * internal: find_successor -- return the rightmost non-null node in a subtree + */ +static void * +find_successor(struct critnib_node *__restrict n) +{ + while (1) { + int nib; + + for (nib = NIB; nib >= 0; nib--) + if (n->child[nib]) + break; + + if (nib < 0) + return NULL; + + n = n->child[nib]; + if (is_leaf(n)) + return to_leaf(n)->value; + } +} + +/* + * internal: find_le -- recursively search <= in a subtree + */ +static void * +find_le(struct critnib_node *__restrict n, uint64_t key) +{ + if (!n) + return NULL; + + if (is_leaf(n)) { + struct critnib_leaf *k = to_leaf(n); + + return (k->key <= key) ? k->value : NULL; + } + + /* + * is our key outside the subtree we're in? + * + * If we're inside, all bits above the nib will be identical; note + * that shift points at the nib's lower rather than upper edge, so it + * needs to be masked away as well. + */ + if ((key ^ n->path) >> (n->shift) & ~NIB) { + /* + * subtree is too far to the left? + * -> its rightmost value is good + */ + if (n->path < key) + return find_successor(n); + + /* + * subtree is too far to the right? + * -> it has nothing of interest to us + */ + return NULL; + } + + unsigned nib = slice_index(key, n->shift); + + /* recursive call: follow the path */ + { + struct critnib_node *m; + + load(&n->child[nib], &m); + + void *value = find_le(m, key); + + if (value) + return value; + } + + /* + * nothing in that subtree? We strayed from the path at this point, + * thus need to search every subtree to our left in this node. No + * need to dive into any but the first non-null, though. + */ + for (; nib > 0; nib--) { + struct critnib_node *m; + + load(&n->child[nib - 1], &m); + if (m) { + n = m; + if (is_leaf(n)) + return to_leaf(n)->value; + + return find_successor(n); + } + } + + return NULL; +} + +/* + * critnib_find_le -- query for a key ("<=" match), returns value or NULL + * + * Same guarantees as critnib_get(). + */ +void * +critnib_find_le(struct critnib *c, uint64_t key) +{ + uint64_t wrs1, wrs2; + void *res; + + do { + load(&c->remove_count, &wrs1); + + struct critnib_node *n; /* avoid a subtle TOCTOU */ + + load(&c->root, &n); + res = n ? find_le(n, key) : NULL; + load(&c->remove_count, &wrs2); + } while (wrs1 + DELETED_LIFE <= wrs2); + + return res; +} diff --git a/src/common/dav_v2/critnib.h b/src/common/dav_v2/critnib.h new file mode 100644 index 00000000000..8e6d07f1c5d --- /dev/null +++ b/src/common/dav_v2/critnib.h @@ -0,0 +1,23 @@ +/* SPDX-License-Identifier: BSD-3-Clause */ +/* Copyright 2018-2023, Intel Corporation */ + +/* + * critnib.h -- internal definitions for critnib tree + */ + +#ifndef __DAOS_COMMON_CRITNIB_H +#define __DAOS_COMMON_CRITNIB_H 1 + +#include + +struct critnib; + +struct critnib *critnib_new(void); +void critnib_delete(struct critnib *c); + +int critnib_insert(struct critnib *c, uint64_t key, void *value); +void *critnib_remove(struct critnib *c, uint64_t key); +void *critnib_get(struct critnib *c, uint64_t key); +void *critnib_find_le(struct critnib *c, uint64_t key); + +#endif /* __DAOS_COMMON_CRITNIB_H */ diff --git a/src/common/dav_v2/dav_clogs.c b/src/common/dav_v2/dav_clogs.c new file mode 100644 index 00000000000..1603e14dd88 --- /dev/null +++ b/src/common/dav_v2/dav_clogs.c @@ -0,0 +1,104 @@ +/** + * (C) Copyright 2015-2023 Intel Corporation. 
+ *
+ * SPDX-License-Identifier: BSD-2-Clause-Patent
+ */
+
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include "dav_internal.h"
+#include "memops.h"
+#include "tx.h"
+
+static void
+clogs_extend_free(struct ulog *redo)
+{
+	D_FREE(redo);
+}
+
+static int
+clogs_extend_redo(struct ulog **redo, uint64_t gen_num)
+{
+	size_t size = SIZEOF_ALIGNED_ULOG(LANE_REDO_EXTERNAL_SIZE);
+
+	D_ALIGNED_ALLOC_NZ(*redo, CACHELINE_SIZE, size);
+	if (*redo == NULL)
+		return -1;
+
+	size_t capacity = ALIGN_DOWN(size - sizeof(struct ulog), CACHELINE_SIZE);
+
+	ulog_construct_new(*redo, capacity, gen_num, 0);
+	return 0;
+}
+
+static int
+clogs_extend_undo(struct ulog **undo, uint64_t gen_num)
+{
+	size_t size = TX_DEFAULT_RANGE_CACHE_SIZE;
+
+	D_ALIGNED_ALLOC_NZ(*undo, CACHELINE_SIZE, size);
+	if (*undo == NULL)
+		return -1;
+
+	size_t capacity = ALIGN_DOWN(size - sizeof(struct ulog), CACHELINE_SIZE);
+
+	ulog_construct_new(*undo, capacity, gen_num, 0);
+	return 0;
+}
+
+int
+dav_create_clogs(dav_obj_t *hdl)
+{
+
+	ulog_construct_new((struct ulog *)&hdl->clogs.external,
+			   LANE_REDO_EXTERNAL_SIZE, 0, 0);
+	ulog_construct_new((struct ulog *)&hdl->clogs.undo,
+			   LANE_UNDO_SIZE, 0, 0);
+
+	hdl->external = operation_new((struct ulog *)&hdl->clogs.external,
+				      LANE_REDO_EXTERNAL_SIZE, clogs_extend_redo, clogs_extend_free,
+				      &hdl->p_ops, LOG_TYPE_REDO);
+	if (hdl->external == NULL)
+		return -1;
+	hdl->undo = operation_new((struct ulog *)&hdl->clogs.undo,
+				  LANE_UNDO_SIZE, clogs_extend_undo, clogs_extend_free,
+				  &hdl->p_ops, LOG_TYPE_UNDO);
+	if (hdl->undo == NULL) {
+		operation_delete(hdl->external);
+		return -1;
+	}
+	return 0;
+}
+
+void
+dav_destroy_clogs(dav_obj_t *hdl)
+{
+	operation_free_logs(hdl->external);
+	operation_delete(hdl->external);
+	operation_free_logs(hdl->undo);
+	operation_delete(hdl->undo);
+}
+
+int
+dav_hold_clogs(dav_obj_t *hdl)
+{
+	if (hdl->nested_tx++ == 0) {
+		operation_init(hdl->external);
+		operation_init(hdl->undo);
+	}
+	return 0;
+}
+
+int
+dav_release_clogs(dav_obj_t *hdl)
+{
+	if (hdl->nested_tx == 0)
+		FATAL("release clogs");
+	--hdl->nested_tx;
+	return 0;
+}
diff --git a/src/common/dav_v2/dav_clogs.h b/src/common/dav_v2/dav_clogs.h
new file mode 100644
index 00000000000..b2565a949ac
--- /dev/null
+++ b/src/common/dav_v2/dav_clogs.h
@@ -0,0 +1,56 @@
+/* SPDX-License-Identifier: BSD-3-Clause */
+/* Copyright 2015-2023, Intel Corporation */
+
+/*
+ * dav_clogs.h -- combined redo/undo log (clogs) interfaces of the DAOS internal Allocator for VOS (DAV)
+ */
+
+#ifndef __DAOS_COMMON_DAV_CLOGS_H
+#define __DAOS_COMMON_DAV_CLOGS_H 1
+
+#include
+#include
+#include "ulog.h"
+
+#define LANE_TOTAL_SIZE (3072) /* 3 * 1024 (sum of 3 old lane sections) */
+/*
+ * We have 3 kilobytes to be split between the transactional redo
+ * and undo logs.
+ * Since by far the most space consuming operations are transactional
+ * snapshots, most of the space, 2304 bytes, is assigned to the undo log.
+ * After that, the remainder, 640 bytes, or 40 ulog entries, is left for the
+ * transactional redo logs.
+ * Thanks to this distribution, all small and medium transactions should be
+ * entirely performed without allocating any additional metadata.
+ *
+ * These values must be cacheline-size aligned to be used for ulogs. They are
+ * therefore parametrized, as the size of struct ulog changes between
+ * platforms.
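As a quick sanity check of the size split described above, the standalone sketch below recomputes the redo/undo budgets for the 64-byte struct ulog case mentioned in the comments; ALIGN_UP here is a local stand-in for the project macro, and the 64-byte cacheline/ulog sizes are assumptions taken from the "for 64B ulog" notes in the definitions that follow.

#include <assert.h>
#include <stdio.h>

#define ALIGN_UP(x, a)  (((x) + (a) - 1) & ~((a) - 1))  /* local stand-in */

#define TOY_CACHELINE   64u    /* assumed CACHELINE_SIZE */
#define TOY_ULOG_HDR    64u    /* assumed sizeof(struct ulog) */
#define TOY_LANE_TOTAL  3072u  /* LANE_TOTAL_SIZE */

int main(void)
{
	unsigned redo = ALIGN_UP(704u - TOY_ULOG_HDR, TOY_CACHELINE);  /* 640 */
	unsigned undo = TOY_LANE_TOTAL - redo - 2u * TOY_ULOG_HDR;     /* 2304 */

	/* matches the 640/2304 split quoted in the comment above */
	assert(redo == 640u && undo == 2304u);
	assert(redo % TOY_CACHELINE == 0 && undo % TOY_CACHELINE == 0);
	printf("redo=%u undo=%u accounted=%u\n",
	       redo, undo, redo + undo + 2u * TOY_ULOG_HDR);           /* 3072 */
	return 0;
}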
+ */ +#define LANE_UNDO_SIZE (LANE_TOTAL_SIZE \ + - LANE_REDO_EXTERNAL_SIZE \ + - 2 * sizeof(struct ulog)) /* 2304 for 64B ulog */ +#define LANE_REDO_EXTERNAL_SIZE ALIGN_UP(704 - sizeof(struct ulog), \ + CACHELINE_SIZE) /* 640 for 64B ulog */ + +struct dav_clogs { + /* + * Redo log for large operations/transactions. + * Can be extended by the use of internal ulog. + */ + struct ULOG(LANE_REDO_EXTERNAL_SIZE) external; + /* + * Undo log for snapshots done in a transaction. + * Can be extended/shrunk by the use of internal ulog. + */ + struct ULOG(LANE_UNDO_SIZE) undo; +}; + +typedef struct dav_obj dav_obj_t; + +int dav_create_clogs(dav_obj_t *hdl); +void dav_destroy_clogs(dav_obj_t *hdl); +int dav_hold_clogs(dav_obj_t *hdl); +int dav_release_clogs(dav_obj_t *hdl); + +#endif /* __DAOS_COMMON_DAV_CLOGS_H */ diff --git a/src/common/dav_v2/dav_iface.c b/src/common/dav_v2/dav_iface.c new file mode 100644 index 00000000000..ede29fafc56 --- /dev/null +++ b/src/common/dav_v2/dav_iface.c @@ -0,0 +1,480 @@ +/** + * (C) Copyright 2015-2024 Intel Corporation. + * + * SPDX-License-Identifier: BSD-2-Clause-Patent + */ + +#include +#include +#include +#include +#include +#include +#include + +#include +#include "dav_internal.h" +#include "heap.h" +#include "palloc.h" +#include "mo_wal.h" +#include "obj.h" +#include "tx.h" + +#define DAV_HEAP_INIT 0x1 +#define MEGABYTE ((uintptr_t)1 << 20) + +static bool +is_zone_evictable(void *arg, uint32_t zid) +{ + struct dav_obj *hdl = (struct dav_obj *)arg; + + return heap_mbrt_ismb_evictable(hdl->do_heap, zid); +} + +static int +dav_uc_callback(int evt_type, void *arg, uint32_t zid) +{ + struct dav_obj *hdl = (struct dav_obj *)arg; + struct zone *z = ZID_TO_ZONE(&hdl->do_heap->layout_info, zid); + + switch (evt_type) { + case UMEM_CACHE_EVENT_PGLOAD: + if (hdl->do_booted) { + VALGRIND_DO_CREATE_MEMPOOL(z, 0, 0); +#if VG_MEMCHECK_ENABLED + if (On_memcheck) + palloc_heap_vg_zone_open(hdl->do_heap, zid, 1); +#endif + D_ASSERT(z->header.flags & ZONE_EVICTABLE_MB); + heap_mbrt_setmb_usage(hdl->do_heap, zid, z->header.sp_usage); + } + break; + case UMEM_CACHE_EVENT_PGEVICT: + if (hdl->do_booted) { + VALGRIND_DO_DESTROY_MEMPOOL(z); + } + break; + default: + D_ERROR("Unknown umem cache event type in callback"); + } + return 0; +} + +static dav_obj_t * +dav_obj_open_internal(int fd, int flags, size_t scm_sz, const char *path, struct umem_store *store) +{ + dav_obj_t *hdl = NULL; + void *mmap_base; + int err = 0; + int rc; + struct heap_zone_limits hzl; + struct zone *z0; + + hzl = heap_get_zone_limits(store->stor_size, scm_sz, 100); + + if (hzl.nzones_heap == 0) { + ERR("Insufficient heap size."); + errno = EINVAL; + return NULL; + } + + if ((hzl.nzones_cache < 2) && (hzl.nzones_heap > hzl.nzones_cache)) { + ERR("Insufficient scm size."); + errno = EINVAL; + return NULL; + } + + if (hzl.nzones_cache * ZONE_MAX_SIZE != scm_sz) + D_WARN("scm size %lu is not aligned to zone size %lu, some scm will be unused", + scm_sz, ZONE_MAX_SIZE); + + if (hzl.nzones_heap < hzl.nzones_cache) + D_WARN("scm size %lu exceeds metablob size %lu, some scm will be unused", scm_sz, + store->stor_size); + + mmap_base = mmap(NULL, scm_sz, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); + if (mmap_base == MAP_FAILED) + return NULL; + + D_ALIGNED_ALLOC(hdl, CACHELINE_SIZE, sizeof(dav_obj_t)); + if (hdl == NULL) { + err = ENOMEM; + goto out0; + } + + hdl->do_fd = fd; + hdl->do_base = mmap_base; + hdl->do_size_mem = scm_sz; + hdl->do_size_mem_usable = hzl.nzones_cache * ZONE_MAX_SIZE; + hdl->do_size_meta = 
store->stor_size; + hdl->p_ops.base = hdl; + hdl->do_store = store; + hdl->p_ops.umem_store = store; + + if (hdl->do_store->stor_priv == NULL) { + D_ERROR("Missing backing store for the heap"); + err = EINVAL; + goto out1; + } + + if (flags & DAV_HEAP_INIT) { + rc = heap_init(mmap_base, scm_sz, store); + if (rc) { + err = errno; + goto out1; + } + } + + D_STRNDUP(hdl->do_path, path, strlen(path)); + D_ALLOC_PTR(hdl->do_heap); + if (hdl->do_heap == NULL) { + err = ENOMEM; + goto out2; + } + + hdl->do_stats = stats_new(hdl); + if (hdl->do_stats == NULL) + goto out2; + + rc = heap_boot(hdl->do_heap, hdl->do_base, hdl->do_store->stor_size, scm_sz, &hdl->p_ops, + hdl->do_stats); + if (rc) { + err = rc; + goto out2; + } + + heap_set_root_ptrs(hdl->do_heap, &hdl->do_root_offsetp, &hdl->do_root_sizep); + heap_set_stats_ptr(hdl->do_heap, &hdl->do_stats->persistent); + + rc = umem_cache_alloc(store, ZONE_MAX_SIZE, hzl.nzones_heap, hzl.nzones_cache, + heap_get_max_nemb(hdl->do_heap), 4096, mmap_base, is_zone_evictable, + dav_uc_callback, hdl); + if (rc != 0) { + D_ERROR("Could not allocate page cache: rc=" DF_RC "\n", DP_RC(rc)); + err = daos_der2errno(rc); + goto out3; + } + + if (!(flags & DAV_HEAP_INIT)) { + rc = heap_zone_load(hdl->do_heap, 0); + if (rc) { + err = rc; + goto out4; + } + D_ASSERT(store != NULL); + rc = hdl->do_store->stor_ops->so_wal_replay(hdl->do_store, dav_wal_replay_cb, hdl); + if (rc) { + err = daos_der2errno(rc); + goto out4; + } + } + + rc = dav_create_clogs(hdl); + if (rc) { + err = rc; + goto out4; + } + + rc = lw_tx_begin(hdl); + if (rc) { + D_ERROR("lw_tx_begin failed with err %d\n", rc); + err = ENOMEM; + goto out5; + } + rc = heap_ensure_zone0_initialized(hdl->do_heap); + if (rc) { + lw_tx_end(hdl, NULL); + D_ERROR("Failed to initialize zone0, rc = %d", daos_errno2der(rc)); + goto out5; + } + lw_tx_end(hdl, NULL); + + z0 = ZID_TO_ZONE(&hdl->do_heap->layout_info, 0); + if (z0->header.zone0_zinfo_off) { + D_ASSERT(z0->header.zone0_zinfo_size); + D_ASSERT(OFFSET_TO_ZID(z0->header.zone0_zinfo_off) == 0); + + rc = heap_update_mbrt_zinfo(hdl->do_heap, false); + if (rc) { + D_ERROR("Failed to update mbrt with zinfo errno = %d", rc); + err = rc; + goto out5; + } + + rc = heap_load_nonevictable_zones(hdl->do_heap); + if (rc) { + D_ERROR("Failed to load required zones during boot, errno= %d", rc); + err = rc; + goto out5; + } + } else { + D_ASSERT(z0->header.zone0_zinfo_size == 0); + rc = lw_tx_begin(hdl); + if (rc) { + D_ERROR("lw_tx_begin failed with err %d\n", rc); + err = ENOMEM; + goto out5; + } + rc = obj_realloc(hdl, &z0->header.zone0_zinfo_off, &z0->header.zone0_zinfo_size, + heap_zinfo_get_size(hzl.nzones_heap)); + if (rc != 0) { + lw_tx_end(hdl, NULL); + D_ERROR("Failed to setup zinfo"); + goto out5; + } + rc = heap_update_mbrt_zinfo(hdl->do_heap, true); + if (rc) { + D_ERROR("Failed to update mbrt with zinfo errno = %d", rc); + err = rc; + goto out5; + } + lw_tx_end(hdl, NULL); + } + umem_cache_post_replay(hdl->do_store); + +#if VG_MEMCHECK_ENABLED + if (On_memcheck) + palloc_heap_vg_open(hdl->do_heap, 1); +#endif + + hdl->do_booted = 1; + + return hdl; +out5: + dav_destroy_clogs(hdl); +out4: + umem_cache_free(hdl->do_store); +out3: + heap_cleanup(hdl->do_heap); +out2: + if (hdl->do_stats) + stats_delete(hdl, hdl->do_stats); + if (hdl->do_heap) + D_FREE(hdl->do_heap); + if (hdl->do_utx) { + dav_umem_wtx_cleanup(hdl->do_utx); + D_FREE(hdl->do_utx); + } + D_FREE(hdl->do_path); +out1: + D_FREE(hdl); +out0: + munmap(mmap_base, scm_sz); + errno = err; + return NULL; + +} 
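dav_obj_open_internal() above unwinds partially acquired resources through its out0..out5 labels. The sketch below shows the same staged goto-unwind idiom in isolation; the resources are plain malloc() stand-ins, not the real mmap/heap/cache objects.

#include <errno.h>
#include <stdlib.h>

struct toy_ctx { void *mapping; void *heap; void *cache; };

static int
toy_ctx_open(struct toy_ctx *c)
{
	int err;

	c->mapping = malloc(64);          /* stands in for the mmap() step     */
	if (c->mapping == NULL)
		return ENOMEM;

	c->heap = malloc(64);             /* stands in for heap_boot()         */
	if (c->heap == NULL) {
		err = ENOMEM;
		goto out_mapping;
	}

	c->cache = malloc(64);            /* stands in for umem_cache_alloc()  */
	if (c->cache == NULL) {
		err = ENOMEM;
		goto out_heap;
	}
	return 0;                         /* fully constructed                 */

out_heap:                                 /* release in reverse acquisition order */
	free(c->heap);
out_mapping:
	free(c->mapping);
	return err;
}

A failure at any step releases exactly the resources acquired before it, which is why the labels in dav_obj_open_internal() are ordered as the mirror image of the setup sequence.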
+ +DAV_FUNC_EXPORT dav_obj_t * +dav_obj_create_v2(const char *path, int flags, size_t sz, mode_t mode, struct umem_store *store) +{ + int fd; + dav_obj_t *hdl; + struct stat statbuf; + int create = 0; + + SUPPRESS_UNUSED(flags); + + if (sz == 0) { + /* Open the file and obtain the size */ + fd = open(path, O_RDWR|O_CLOEXEC); + if (fd == -1) { + DS_ERROR(errno, "obj_create_v2 open %s to fetch size", path); + return NULL; + } + + if (fstat(fd, &statbuf) != 0) + goto out; + sz = statbuf.st_size; + } else { + fd = open(path, O_CREAT|O_EXCL|O_RDWR|O_CLOEXEC, mode); + if (fd == -1) { + DS_ERROR(errno, "obj_create_v2 open %s to alloc", path); + return NULL; + } + + if (fallocate(fd, 0, 0, (off_t)sz) == -1) { + errno = ENOSPC; + goto out; + } + create = 1; + } + + hdl = dav_obj_open_internal(fd, DAV_HEAP_INIT, sz, path, store); + if (hdl == NULL) + goto out; + + DAV_DBG("pool %s created, size="DF_U64"", hdl->do_path, sz); + return hdl; + +out: + close(fd); + if (create) + unlink(path); + return NULL; +} + +DAV_FUNC_EXPORT dav_obj_t * +dav_obj_open_v2(const char *path, int flags, struct umem_store *store) +{ + size_t size; + int fd; + dav_obj_t *hdl; + struct stat statbuf; + + SUPPRESS_UNUSED(flags); + + fd = open(path, O_RDWR|O_CLOEXEC); + if (fd == -1) { + DS_ERROR(errno, "obj_create_v2 open %s", path); + return NULL; + } + + if (fstat(fd, &statbuf) != 0) { + close(fd); + return NULL; + } + size = (size_t)statbuf.st_size; + + hdl = dav_obj_open_internal(fd, 0, size, path, store); + if (hdl == NULL) { + close(fd); + return NULL; + } + DAV_DBG("pool %s is open, size="DF_U64"", hdl->do_path, size); + return hdl; +} + +DAV_FUNC_EXPORT void +dav_obj_close_v2(dav_obj_t *hdl) +{ + + if (hdl == NULL) { + ERR("NULL handle"); + return; + } + dav_destroy_clogs(hdl); + heap_cleanup(hdl->do_heap); + D_FREE(hdl->do_heap); + + stats_delete(hdl, hdl->do_stats); + + munmap(hdl->do_base, hdl->do_size_mem); + close(hdl->do_fd); + if (hdl->do_utx) { + dav_umem_wtx_cleanup(hdl->do_utx); + D_FREE(hdl->do_utx); + } + umem_cache_free(hdl->do_store); + DAV_DBG("pool %s is closed", hdl->do_path); + D_FREE(hdl->do_path); + D_FREE(hdl); +} + +DAV_FUNC_EXPORT void * +dav_get_base_ptr_v2(dav_obj_t *hdl) +{ + return hdl->do_heap->layout_info.zone0; +} + +DAV_FUNC_EXPORT int +dav_class_register_v2(dav_obj_t *pop, struct dav_alloc_class_desc *p) +{ + uint8_t id = (uint8_t)p->class_id; + struct alloc_class_collection *ac = heap_alloc_classes(pop->do_heap); + enum header_type lib_htype = MAX_HEADER_TYPES; + size_t runsize_bytes; + uint32_t size_idx; + struct alloc_class *c; + + if (p->unit_size <= 0 || p->unit_size > DAV_MAX_ALLOC_SIZE || + p->units_per_block <= 0) { + errno = EINVAL; + return -1; + } + + if (p->alignment != 0 && p->unit_size % p->alignment != 0) { + ERR("unit size must be evenly divisible by alignment"); + errno = EINVAL; + return -1; + } + + if (p->alignment > (MEGABYTE * 2)) { + ERR("alignment cannot be larger than 2 megabytes"); + errno = EINVAL; + return -1; + } + + if (p->class_id >= MAX_ALLOCATION_CLASSES) { + ERR("class id outside of the allowed range"); + errno = ERANGE; + return -1; + } + + switch (p->header_type) { + case DAV_HEADER_LEGACY: + lib_htype = HEADER_LEGACY; + break; + case DAV_HEADER_COMPACT: + lib_htype = HEADER_COMPACT; + break; + case DAV_HEADER_NONE: + lib_htype = HEADER_NONE; + break; + case MAX_DAV_HEADER_TYPES: + default: + ERR("invalid header type"); + errno = EINVAL; + return -1; + } + + if (id == 0) { + if (alloc_class_find_first_free_slot(ac, &id) != 0) { + ERR("no available free 
allocation class identifier"); + errno = EINVAL; + return -1; + } + } else { + if (alloc_class_reserve(ac, id) != 0) { + ERR("attempted to overwrite an allocation class"); + errno = EEXIST; + return -1; + } + } + + runsize_bytes = CHUNKSIZE; + while (((p->units_per_block * p->unit_size) + RUN_BASE_METADATA_SIZE) > runsize_bytes) + runsize_bytes += CHUNKSIZE; + + /* aligning the buffer might require up-to to 'alignment' bytes */ + if (p->alignment != 0) + runsize_bytes += p->alignment; + + size_idx = (uint32_t)(runsize_bytes / CHUNKSIZE); + + if (size_idx > MAX_CHUNK) + size_idx = MAX_CHUNK; + + c = alloc_class_new(id, heap_alloc_classes(pop->do_heap), CLASS_RUN, lib_htype, + p->unit_size, p->alignment, size_idx); + if (c == NULL) { + errno = EINVAL; + return -1; + } + + if (heap_create_alloc_class_buckets(pop->do_heap, c) != 0) { + alloc_class_delete(ac, c); + return -1; + } + + p->class_id = c->id; + p->units_per_block = c->rdsc.nallocs; + + return 0; +} + +DAV_FUNC_EXPORT size_t +dav_obj_pgsz_v2() +{ + return ZONE_MAX_SIZE; +} diff --git a/src/common/dav_v2/dav_internal.h b/src/common/dav_v2/dav_internal.h new file mode 100644 index 00000000000..bc13e2eabc3 --- /dev/null +++ b/src/common/dav_v2/dav_internal.h @@ -0,0 +1,74 @@ +/* SPDX-License-Identifier: BSD-3-Clause */ +/* Copyright 2015-2024, Intel Corporation */ + +/* + * dav_flags.h -- Interfaces exported by DAOS internal Allocator for VOS (DAV) + */ + +#ifndef __DAOS_COMMON_DAV_INTERNAL_H +#define __DAOS_COMMON_DAV_INTERNAL_H 1 + +#include "dav_v2.h" +#include "dav_clogs.h" +#include "heap.h" +#include "mo_wal.h" +#include "wal_tx.h" + +#define DAV_FUNC_EXPORT __attribute__ ((visibility ("default"))) + +#define DAV_MAX_ALLOC_SIZE ((size_t)0x3FFDFFFC0) + +enum dav_tx_failure_behavior { + DAV_TX_FAILURE_ABORT, + DAV_TX_FAILURE_RETURN, +}; + +enum dav_stats_enabled { + DAV_STATS_ENABLED_TRANSIENT, + DAV_STATS_ENABLED_BOTH, + DAV_STATS_ENABLED_PERSISTENT, + DAV_STATS_DISABLED, +}; + +#define DAV_PHDR_SIZE 4096 + +/* DAV object handle */ +typedef struct dav_obj { + char *do_path; + uint64_t do_size_meta; + uint64_t do_size_mem; + uint64_t do_size_mem_usable; + void *do_base; + uint64_t *do_root_offsetp; + uint64_t *do_root_sizep; + struct palloc_heap *do_heap; + struct operation_context *external; + struct operation_context *undo; + struct mo_ops p_ops; /* REVISIT */ + struct stats *do_stats; + int do_fd; + int nested_tx; + struct umem_wal_tx *do_utx; + struct umem_store *do_store; + int do_booted; + + struct dav_clogs clogs __attribute__ ((__aligned__(CACHELINE_SIZE))); +} dav_obj_t; + +static inline +struct dav_tx *utx2wtx(struct umem_wal_tx *utx) +{ + return (struct dav_tx *)&utx->utx_private; +} + +static inline +struct umem_wal_tx *wtx2utx(struct dav_tx *wtx) +{ + return (struct umem_wal_tx *)((void *)wtx + - (ptrdiff_t)offsetof(struct umem_wal_tx, utx_private)); +} + +int lw_tx_begin(dav_obj_t *pop); +int lw_tx_end(dav_obj_t *pop, void *data); + +#endif /* __DAOS_COMMON_DAV_INTERNAL_H */ diff --git a/src/common/dav_v2/dav_v2.h b/src/common/dav_v2/dav_v2.h new file mode 100644 index 00000000000..6147d33ba4e --- /dev/null +++ b/src/common/dav_v2/dav_v2.h @@ -0,0 +1,322 @@ +/* SPDX-License-Identifier: BSD-3-Clause */ +/* Copyright 2015-2024, Intel Corporation */ + +/* + * dav_flags.h -- Interfaces exported by DAOS internal Allocator for VOS (DAV) + */ + +#ifndef __DAOS_COMMON_DAV_V2_H +#define __DAOS_COMMON_DAV_V2_H 1 + +#include +#include +#include +#include +#include "../dav/dav.h" + +typedef struct dav_obj dav_obj_t; +struct 
umem_store; + +/** + * Create and initialize a DAV object and return its handle. + * + * \param[in] path Path of the vos file. + * + * \param[in] flags additional flags (Future). + * + * \param[in] sz size of the file/heap. + * + * \param[in] mode permission to use while creating the file. + * + * \param[in] store backing umem store. + * + * \return Returns the pointer to the object handle. Upon failure, + * it returns NULL with errno set appropriately. + */ +dav_obj_t * +dav_obj_create_v2(const char *path, int flags, size_t sz, mode_t mode, struct umem_store *store); + +/** + * Open and initialize a DAV object and return its handle. + * + * \param[in] path Path of the vos file. + * + * \param[in] flags additional flags (Future). + * + * \param[in] store backing umem store. + * + * \return Returns the pointer to the object handle. Upon failure, + * it returns NULL with errno set appropriately. + */ +dav_obj_t * +dav_obj_open_v2(const char *path, int flags, struct umem_store *store); + +/** + * Close the DAV object + * + * \param[in] hdl DAV handle + */ +void +dav_obj_close_v2(dav_obj_t *hdl); + +/** + * Return the pointer to the base of the heap. + * + * \param[in] hdl DAV handle + * + * \return Returns the pointer to the base of the heap pointed to + * by hdl. + */ +void * +dav_get_base_ptr_v2(dav_obj_t *hdl); + +typedef int (*dav_constr)(dav_obj_t *pop, void *ptr, void *arg); + +/* + * Allocates a new object from the pool and calls a constructor function before + * returning. It is guaranteed that allocated object is either properly + * initialized, or if it's interrupted before the constructor completes, the + * memory reserved for the object is automatically reclaimed. + */ +int +dav_alloc_v2(dav_obj_t *pop, uint64_t *offp, size_t size, uint64_t type_num, uint64_t flags, + dav_constr constructor, void *arg); + +/** + * Frees the memory at specified offset within the DAV object pointed to by hdl. + * + * \param[in] hdl DAV handle. + * + * \param[in] off offset to the memory location. off should correspond + * to the offset returned by previous call to dav_malloc(). + */ +void +dav_free_v2(dav_obj_t *pop, uint64_t off); + +/* + * DAV version of memcpy. Data copied is made persistent in blob. + */ +void * +dav_memcpy_persist_v2(dav_obj_t *pop, void *dest, const void *src, size_t len); + +/* + * If called for the first time on a newly created dav heap, the root object + * of given size is allocated. Otherwise, it returns the existing root object. + * In such case, the size must be not less than the actual root object size + * stored in the pool. If it's larger, the root object is automatically + * resized. + * + * This function is currently *not* thread-safe. + */ +uint64_t +dav_root_v2(dav_obj_t *pop, size_t size); + +/* + * Starts a new transaction in the current thread. + * If called within an open transaction, starts a nested transaction. + * + * If successful, transaction stage changes to TX_STAGE_WORK and function + * returns zero. Otherwise, stage changes to TX_STAGE_ONABORT and an error + * number is returned. + */ +int +dav_tx_begin_v2(dav_obj_t *pop, jmp_buf env, ...); + +/* + * Aborts current transaction + * + * Causes transition to TX_STAGE_ONABORT. + * + * This function must be called during TX_STAGE_WORK. + */ +void +dav_tx_abort_v2(int errnum); + +/* + * Commits current transaction + * + * This function must be called during TX_STAGE_WORK. + */ +void +dav_tx_commit_v2(void); + +/* + * Cleanups current transaction. 
Must always be called after dav_tx_begin, + * even if starting the transaction failed. + * + * If called during TX_STAGE_NONE, has no effect. + * + * Always causes transition to TX_STAGE_NONE. + * + * If transaction was successful, returns 0. Otherwise returns error code set + * by dav_tx_abort. + * + * This function must *not* be called during TX_STAGE_WORK. + */ +int +dav_tx_end_v2(void *data); + +/* + * Returns the current stage of the transaction. + */ +enum dav_tx_stage +dav_tx_stage_v2(void); + +/* + * Returns last transaction error code. + */ +int +dav_tx_errno_v2(void); + +/* + * Transactionally allocates a new object. + * + * If successful, returns offset of the object in the heap. + * Otherwise, stage changes to TX_STAGE_ONABORT and an zero is returned. + * 'Flags' is a bitmask of the following values: + * - POBJ_XALLOC_ZERO - zero the allocated object + * - POBJ_XALLOC_NO_FLUSH - skip flush on commit + * - POBJ_XALLOC_NO_ABORT - if the function does not end successfully, + * - DAV_CLASS_ID(id) - id of allocation class to use. + * - DAV_EZONE_ID(id) - id of zone to use. + * do not abort the transaction and return the error number. + * + * This function must be called during TX_STAGE_WORK. + */ +uint64_t +dav_tx_alloc_v2(size_t size, uint64_t type_num, uint64_t flags); + +/* + * Transactionally frees an existing object. + * + * If successful, returns zero. + * Otherwise, stage changes to TX_STAGE_ONABORT and an error number is returned. + * + * This function must be called during TX_STAGE_WORK. + */ +int +dav_tx_free_v2(uint64_t off); + +/* + * Takes a "snapshot" of the memory block of given size and located at given + * offset 'off' in the object 'oid' and saves it in the undo log. + * The application is then free to directly modify the object in that memory + * range. In case of failure or abort, all the changes within this range will + * be rolled-back automatically. + * + * If successful, returns zero. + * Otherwise, stage changes to TX_STAGE_ONABORT and an error number is returned. + * + * This function must be called during TX_STAGE_WORK. + */ +int +dav_tx_add_range_v2(uint64_t off, size_t size); + +/* + * Takes a "snapshot" of the given memory region and saves it in the undo log. + * The application is then free to directly modify the object in that memory + * range. In case of failure or abort, all the changes within this range will + * be rolled-back automatically. The supplied block of memory has to be within + * the given pool. + * + * If successful, returns zero. + * Otherwise, stage changes to TX_STAGE_ONABORT and an error number is returned. + * + * This function must be called during TX_STAGE_WORK. + */ +int +dav_tx_add_range_direct_v2(const void *ptr, size_t size); + +/* + * Behaves exactly the same as dav_tx_add_range when 'flags' equals 0. + * 'Flags' is a bitmask of the following values: + * - POBJ_XADD_NO_FLUSH - skips flush on commit + * - POBJ_XADD_NO_SNAPSHOT - added range will not be snapshotted + * - POBJ_XADD_ASSUME_INITIALIZED - added range is assumed to be initialized + * - POBJ_XADD_NO_ABORT - if the function does not end successfully, + * do not abort the transaction and return the error number. + */ +int +dav_tx_xadd_range_v2(uint64_t off, size_t size, uint64_t flags); + +/* + * Behaves exactly the same as dav_tx_add_range_direct when 'flags' equals + * 0. 
'Flags' is a bitmask of the following values: + * - POBJ_XADD_NO_FLUSH - skips flush on commit + * - POBJ_XADD_NO_SNAPSHOT - added range will not be snapshotted + * - POBJ_XADD_ASSUME_INITIALIZED - added range is assumed to be initialized + * - POBJ_XADD_NO_ABORT - if the function does not end successfully, + * do not abort the transaction and return the error number. + */ +int +dav_tx_xadd_range_direct_v2(const void *ptr, size_t size, uint64_t flags); + +#define DAV_ACTION_XRESERVE_VALID_FLAGS \ + (DAV_XALLOC_CLASS_MASK | DAV_XALLOC_EZONE_MASK | DAV_XALLOC_ZERO) + +struct dav_action; +uint64_t +dav_reserve_v2(dav_obj_t *pop, struct dav_action *act, size_t size, uint64_t type_num, + uint64_t flags); +void +dav_defer_free_v2(dav_obj_t *pop, uint64_t off, struct dav_action *act); +void +dav_cancel_v2(dav_obj_t *pop, struct dav_action *actv, size_t actvcnt); +int +dav_tx_publish_v2(struct dav_action *actv, size_t actvcnt); + +struct dav_alloc_class_desc; +/* + * Registers an allocation class handle with the DAV object. + */ +int +dav_class_register_v2(dav_obj_t *pop, struct dav_alloc_class_desc *p); + +struct dav_heap_stats; +/* + * Returns the heap allocation statistics associated with the + * DAV object. + */ +int +dav_get_heap_stats_v2(dav_obj_t *pop, struct dav_heap_stats *st); + +struct dav_heap_mb_stats { + uint64_t dhms_allocated; + uint64_t dhms_maxsz; +}; + +/** + * Returns the usage statistics of a memory bucket. Note that usage + * stats for evictable MBs will be approximate values if they are not + * yet loaded on to the umem cache. + * + * \param[in] pop pool handle + * \param[in] mb_id memory bucket id + * \param[out] st mb stats + * + * \return 0, success + * < 0, error and errno is set to appropriate value. + */ +int +dav_get_heap_mb_stats_v2(dav_obj_t *pop, uint32_t mb_id, struct dav_heap_mb_stats *st); + +/** + * Allot an evictable memory bucket for tasks like new object creation + * + * \param[in] pop pool handle + * \param[in] flags zone selection criteria. + * + * \return id > 0, mbid of evictable memory bucket. + * id = 0, no evictable memory bucket is available + * use non-evictable memory bucket. + */ +uint32_t +dav_allot_mb_evictable_v2(dav_obj_t *pop, int flags); + +/* + * Return the page size for dav_v2. 
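A hedged usage sketch of the reserve/publish/cancel action API declared earlier in this header (not part of the patch): 'pop' is assumed to come from dav_obj_open_v2(), the type number and flags value are illustrative, and a zero offset is assumed to signal a failed reservation, as documented for dav_tx_alloc_v2.

#include <stdint.h>

#include "dav_v2.h"

static uint64_t
try_reserve_4k(dav_obj_t *pop)
{
	struct dav_action act;    /* full definition assumed to come via dav.h */
	uint64_t          off;

	/* a zero offset is assumed to mean failure, as for dav_tx_alloc_v2() */
	off = dav_reserve_v2(pop, &act, 4096, 0 /* type_num */, 0 /* flags */);
	if (off == 0)
		return 0;

	/*
	 * The reservation is not yet persistent; a real caller would publish
	 * it from inside an open transaction with dav_tx_publish_v2(&act, 1).
	 * This sketch simply backs out.
	 */
	dav_cancel_v2(pop, &act, 1);
	return off;
}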
+ */ +size_t +dav_obj_pgsz_v2(); + +#endif /* __DAOS_COMMON_DAV_V2_H */ diff --git a/src/common/dav_v2/heap.c b/src/common/dav_v2/heap.c new file mode 100644 index 00000000000..d730fed7bc4 --- /dev/null +++ b/src/common/dav_v2/heap.c @@ -0,0 +1,2195 @@ +/* SPDX-License-Identifier: BSD-3-Clause */ +/* Copyright 2015-2024, Intel Corporation */ + +/* + * heap.c -- heap implementation + */ + +#include +#include +#include +#include +#include + +#include "bucket.h" +#include "dav_internal.h" +#include "memblock.h" +#include "queue.h" +#include "heap.h" +#include "out.h" +#include "util.h" +#include "sys_util.h" +#include "valgrind_internal.h" +#include "recycler.h" +#include "container.h" +#include "alloc_class.h" +#include "meta_io.h" + +#define HEAP_NEMB_PCT_DEFAULT 80 + +static void +heap_reclaim_zone_garbage(struct palloc_heap *heap, struct bucket *bucket, uint32_t zone_id); + +#define MAX_RUN_LOCKS MAX_CHUNK +#define MAX_RUN_LOCKS_VG MAX_CHUNK /* avoid perf issues /w drd */ + +#define ZINFO_VERSION 0x1 + +struct zinfo_element { + unsigned char z_allotted : 1; + unsigned char z_evictable : 1; + unsigned char z_usage_hint : 3; +}; + +struct zinfo_vec { + uint32_t version; + uint32_t num_elems; + struct zinfo_element z[]; +}; + +TAILQ_HEAD(mbrt_q, mbrt); + +/* + * Memory Bucket Runtime. + */ +struct mbrt { + TAILQ_ENTRY(mbrt) mb_link; + struct mbrt_q *qptr; + uint32_t mb_id; + uint32_t garbage_reclaimed; + uint64_t space_usage; + uint64_t prev_usage; + struct palloc *heap; + struct bucket_locked *default_bucket; /* bucket for free chunks */ + struct bucket_locked *buckets[MAX_ALLOCATION_CLASSES]; + struct recycler *recyclers[MAX_ALLOCATION_CLASSES]; + bool laf[MAX_ALLOCATION_CLASSES]; /* last allocation failed? */ + bool laf_updated; +}; + +enum mb_usage_hint { + MB_U0_HINT = 0, + MB_U30_HINT = 1, + MB_U75_HINT = 2, + MB_U90_HINT = 3, + MB_UMAX_HINT = 4, +}; + +#define MB_U90 (ZONE_MAX_SIZE * 9 / 10) +#define MB_U75 (ZONE_MAX_SIZE * 75 / 100) +#define MB_U30 (ZONE_MAX_SIZE * 3 / 10) +#define MB_USAGE_DELTA (ZONE_MAX_SIZE / 20) + +size_t mb_usage_byhint[MB_UMAX_HINT] = {0, MB_U30 + 1, MB_U75 + 1, MB_U90 + 1}; + +struct heap_rt { + struct alloc_class_collection *alloc_classes; + pthread_mutex_t run_locks[MAX_RUN_LOCKS]; + unsigned nlocks; + unsigned nzones; + unsigned nzones_e; + unsigned nzones_ne; + unsigned zones_exhausted; + unsigned zones_exhausted_e; + unsigned zones_exhausted_ne; + unsigned zones_ne_gc; + unsigned zones_lastne_gc; + unsigned zones_unused_first; + unsigned zinfo_vec_size; + unsigned mb_create_waiters; + unsigned mb_pressure; + unsigned nemb_pct; + void *mb_create_wq; + struct zinfo_vec *zinfo_vec; + struct mbrt *default_mb; + struct mbrt **evictable_mbs; + struct mbrt *active_evictable_mb; + struct mbrt_q mb_u90; + struct mbrt_q mb_u75; + struct mbrt_q mb_u30; + struct mbrt_q mb_u0; +}; + +#define MBRT_NON_EVICTABLE ((struct mbrt *)(-1UL)) + +static inline void +heap_zinfo_set(struct palloc_heap *heap, uint32_t zid, bool allotted, bool evictable) +{ + struct zinfo_element *ze = heap->rt->zinfo_vec->z; + + ze[zid].z_allotted = allotted; + ze[zid].z_evictable = evictable; + mo_wal_persist(&heap->p_ops, &ze[zid], sizeof(ze[zid])); +} + +static inline void +heap_zinfo_get(struct palloc_heap *heap, uint32_t zid, bool *allotted, bool *evictable) +{ + struct zinfo_element *ze = heap->rt->zinfo_vec->z; + + *allotted = ze[zid].z_allotted; + *evictable = ze[zid].z_evictable; +} + +static inline void +heap_zinfo_set_usage(struct palloc_heap *heap, uint32_t zid, enum mb_usage_hint 
val) +{ + struct zinfo_element *ze = heap->rt->zinfo_vec->z; + + D_ASSERT(ze[zid].z_allotted && ze[zid].z_evictable && val < MB_UMAX_HINT); + ze[zid].z_usage_hint = val; + mo_wal_persist(&heap->p_ops, &ze[zid], sizeof(ze[zid])); +} + +static inline void +heap_zinfo_get_usage(struct palloc_heap *heap, uint32_t zid, enum mb_usage_hint *val) +{ + struct zinfo_element *ze = heap->rt->zinfo_vec->z; + + D_ASSERT(ze[zid].z_allotted && ze[zid].z_evictable && ze[zid].z_usage_hint < MB_UMAX_HINT); + *val = ze[zid].z_usage_hint; +} + +size_t +heap_zinfo_get_size(uint32_t nzones) +{ + return (sizeof(struct zinfo_vec) + sizeof(struct zinfo_element) * nzones); +} + +static inline void +heap_zinfo_init(struct palloc_heap *heap) +{ + struct zinfo_vec *z = heap->rt->zinfo_vec; + + D_ASSERT(heap->layout_info.zone0->header.zone0_zinfo_size >= + heap_zinfo_get_size(heap->rt->nzones)); + + z->version = ZINFO_VERSION; + z->num_elems = heap->rt->nzones; + mo_wal_persist(&heap->p_ops, z, sizeof(*z)); + heap_zinfo_set(heap, 0, 1, false); +} + +static void +mbrt_set_laf(struct mbrt *mb, int c_id) +{ + if (mb->mb_id == 0) + return; + D_ASSERT(c_id < MAX_ALLOCATION_CLASSES); + + mb->laf[c_id] = true; + mb->laf_updated = true; +} + +static void +mbrt_clear_laf(struct mbrt *mb) +{ + if (mb->mb_id == 0) + return; + if (mb->laf_updated) { + memset(mb->laf, 0, MAX_ALLOCATION_CLASSES); + mb->laf_updated = false; + } +} + +static bool +mbrt_is_laf(struct mbrt *mb, int c_id) +{ + D_ASSERT(c_id < MAX_ALLOCATION_CLASSES); + return mb->laf[c_id]; +} + +void +heap_mbrt_setmb_nonevictable(struct palloc_heap *heap, uint32_t zid) +{ + D_ASSERT(zid < heap->rt->nzones); + heap->rt->evictable_mbs[zid] = MBRT_NON_EVICTABLE; +} + +void +heap_mbrt_setmb_evictable(struct palloc_heap *heap, struct mbrt *mb) +{ + D_ASSERT((mb->mb_id != 0) && (mb->mb_id < heap->rt->nzones)); + heap->rt->evictable_mbs[mb->mb_id] = mb; +} + +bool +heap_mbrt_ismb_evictable(struct palloc_heap *heap, uint32_t zid) +{ + D_ASSERT(zid < heap->rt->nzones); + return (heap->rt->evictable_mbs[zid] != MBRT_NON_EVICTABLE); +} + +bool +heap_mbrt_ismb_initialized(struct palloc_heap *heap, uint32_t zid) +{ + D_ASSERT(zid < heap->rt->nzones); + return (heap->rt->evictable_mbs[zid] != 0); +} + +/* + * mbrt_bucket_acquire -- fetches by mbrt or by id a bucket exclusive + * for the thread until mbrt_bucket_release is called + */ +struct bucket * +mbrt_bucket_acquire(struct mbrt *mb, uint8_t class_id) +{ + struct bucket_locked *b; + + D_ASSERT(mb != NULL); + + if (class_id == DEFAULT_ALLOC_CLASS_ID) + b = mb->default_bucket; + else + b = mb->buckets[class_id]; + + return bucket_acquire(b); +} + +/* + * mbrt_bucket_release -- puts the bucket back into the heap + */ +void +mbrt_bucket_release(struct bucket *b) +{ + bucket_release(b); +} + +/* + * heap_mbrt_setup_mb -- (internal) create and initializes a Memory Bucket runtime. 
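The acquire/release discipline described above is used throughout heap.c (for example in heap_mbrt_mb_reclaim_garbage() further below). A minimal sketch, assuming the internal heap/bucket headers and the DEFAULT_ALLOC_CLASS_ID macro are visible as they are in heap.c:

#include <stdint.h>

#include "bucket.h"
#include "heap.h"

static void
with_default_bucket(struct palloc_heap *heap, uint32_t zone_id)
{
	struct mbrt   *mb = heap_mbrt_get_mb(heap, zone_id);
	struct bucket *b  = mbrt_bucket_acquire(mb, DEFAULT_ALLOC_CLASS_ID);

	/* ... the bucket is exclusive to this thread until released ... */

	mbrt_bucket_release(b);
}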
+ */ +struct mbrt * +heap_mbrt_setup_mb(struct palloc_heap *heap, uint32_t zid) +{ + struct heap_rt *rt = heap->rt; + struct mbrt *mb; + struct alloc_class *c; + uint8_t i; + + D_ALLOC_PTR(mb); + if (mb == NULL) { + errno = ENOMEM; + return NULL; + } + + mb->mb_id = zid; + + for (i = 0; i < MAX_ALLOCATION_CLASSES; ++i) { + c = alloc_class_by_id(rt->alloc_classes, i); + + if (c == NULL) + continue; + + mb->buckets[c->id] = bucket_locked_new(container_new_seglists(heap), c, mb); + if (mb->buckets[c->id] == NULL) + goto error_bucket_create; + } + + mb->default_bucket = + bucket_locked_new(container_new_ravl(heap), + alloc_class_by_id(rt->alloc_classes, DEFAULT_ALLOC_CLASS_ID), mb); + + if (mb->default_bucket == NULL) + goto error_bucket_create; + + return mb; + +error_bucket_create: + for (i = 0; i < MAX_ALLOCATION_CLASSES; ++i) { + c = alloc_class_by_id(rt->alloc_classes, i); + if (c != NULL) { + if (mb->buckets[c->id] != NULL) + bucket_locked_delete(mb->buckets[c->id]); + } + } + D_FREE(mb); + errno = ENOMEM; + return NULL; +} + +static void +heap_mbrt_cleanup_mb(struct mbrt *mb) +{ + uint8_t i; + + if (mb == NULL) + return; + + for (i = 0; i < MAX_ALLOCATION_CLASSES; ++i) { + if (mb->buckets[i] == NULL) + continue; + bucket_locked_delete(mb->buckets[i]); + } + bucket_locked_delete(mb->default_bucket); + + for (i = 0; i < MAX_ALLOCATION_CLASSES; ++i) { + if (mb->recyclers[i] == NULL) + continue; + recycler_delete(mb->recyclers[i]); + } + D_DEBUG(DB_TRACE, "MB %u utilization = %lu\n", mb->mb_id, mb->space_usage); + D_FREE(mb); +} + +int +heap_mbrt_update_alloc_class_buckets(struct palloc_heap *heap, struct mbrt *mb, + struct alloc_class *c) +{ + uint8_t c_id = c->id; + + if ((heap->rt->default_mb == mb) || (mb->buckets[c_id] != NULL)) + return 0; + + /* Allocation class created post creation/loading of the memory bucket runtime */ + if (heap->rt->default_mb->buckets[c_id]) { + mb->buckets[c_id] = bucket_locked_new(container_new_seglists(heap), c, mb); + if (!mb->buckets[c_id]) + return ENOMEM; + } + return 0; +} + +static inline int +heap_mbrt_init(struct palloc_heap *heap) +{ + struct heap_rt *rt = heap->rt; + int ret = 0; + struct umem_store *store = heap->layout_info.store; + + rt->default_mb = NULL; + rt->active_evictable_mb = NULL; + rt->mb_create_waiters = 0; + rt->mb_create_wq = NULL; + rt->mb_pressure = 0; + ret = store->stor_ops->so_waitqueue_create(&rt->mb_create_wq); + if (ret) { + ret = daos_der2errno(ret); + goto error; + } + + D_ALLOC_ARRAY(rt->evictable_mbs, rt->nzones); + if (rt->evictable_mbs == NULL) { + ret = ENOMEM; + goto error; + } + + TAILQ_INIT(&rt->mb_u90); + TAILQ_INIT(&rt->mb_u75); + TAILQ_INIT(&rt->mb_u30); + TAILQ_INIT(&rt->mb_u0); + + rt->default_mb = heap_mbrt_setup_mb(heap, 0); + if (rt->default_mb == NULL) { + ret = ENOMEM; + goto error_default_mb_setup; + } + heap_mbrt_setmb_nonevictable(heap, 0); + return 0; + +error_default_mb_setup: + D_FREE(rt->evictable_mbs); +error: + return ret; +} + +static inline void +heap_mbrt_fini(struct palloc_heap *heap) +{ + struct heap_rt *rt = heap->rt; + int i; + struct umem_store *store = heap->layout_info.store; + + for (i = 0; i < rt->zones_exhausted; i++) { + if (heap_mbrt_ismb_evictable(heap, i)) + heap_mbrt_cleanup_mb(rt->evictable_mbs[i]); + } + heap_mbrt_cleanup_mb(rt->default_mb); + + D_FREE(rt->evictable_mbs); + rt->default_mb = NULL; + rt->active_evictable_mb = NULL; + rt->evictable_mbs = NULL; + D_ASSERT(rt->mb_create_waiters == 0); + if (rt->mb_create_wq != NULL) + 
store->stor_ops->so_waitqueue_destroy(rt->mb_create_wq); + rt->mb_create_wq = NULL; +} + +/* + * heap_mbrt_get_mb - returns the reference to the mb runtime given + * zone_id or mb_id. + */ +struct mbrt * +heap_mbrt_get_mb(struct palloc_heap *heap, uint32_t zone_id) +{ + if (!heap_mbrt_ismb_evictable(heap, zone_id)) + return heap->rt->default_mb; + + D_ASSERT(heap->rt->evictable_mbs[zone_id] != NULL); + return heap->rt->evictable_mbs[zone_id]; +} + +void +heap_mbrt_log_alloc_failure(struct palloc_heap *heap, uint32_t zone_id) +{ + struct mbrt *mb = heap->rt->active_evictable_mb; + + if (mb && (mb->mb_id == zone_id)) { + TAILQ_INSERT_TAIL(&heap->rt->mb_u90, mb, mb_link); + mb->qptr = &heap->rt->mb_u90; + mb->prev_usage = mb->space_usage; + heap->rt->active_evictable_mb = NULL; + heap_zinfo_set_usage(heap, zone_id, MB_U90_HINT); + } +} + +void +heap_mbrt_setmb_usage(struct palloc_heap *heap, uint32_t zone_id, uint64_t usage) +{ + struct mbrt *mb = heap->rt->evictable_mbs[zone_id]; + + D_ASSERT(zone_id < heap->rt->nzones); + if (zone_id == 0) { + heap->rt->default_mb->space_usage = usage; + return; + } + if (mb == (struct mbrt *)(-1UL)) + return; + + mb->space_usage = usage; + + if ((heap->rt->active_evictable_mb == mb) || (mb->qptr)) + return; + + if (mb->space_usage > MB_U90) { + TAILQ_INSERT_TAIL(&heap->rt->mb_u90, mb, mb_link); + mb->qptr = &heap->rt->mb_u90; + } else if (mb->space_usage > MB_U75) { + TAILQ_INSERT_TAIL(&heap->rt->mb_u75, mb, mb_link); + mb->qptr = &heap->rt->mb_u75; + } else if (mb->space_usage > MB_U30) { + TAILQ_INSERT_TAIL(&heap->rt->mb_u30, mb, mb_link); + mb->qptr = &heap->rt->mb_u30; + heap->rt->mb_pressure = 0; + } else { + TAILQ_INSERT_TAIL(&heap->rt->mb_u0, mb, mb_link); + mb->qptr = &heap->rt->mb_u0; + heap->rt->mb_pressure = 0; + } + mb->prev_usage = mb->space_usage; +} + +int +heap_mbrt_getmb_usage(struct palloc_heap *heap, uint32_t zone_id, uint64_t *allotted, + uint64_t *maxsz) +{ + struct mbrt *mb; + + if (zone_id == 0) { + *maxsz = heap->rt->nzones_ne * ZONE_MAX_SIZE; + *allotted = heap->rt->default_mb->space_usage; + } else { + if (zone_id >= heap->rt->nzones) { + errno = EINVAL; + return -1; + } + mb = heap->rt->evictable_mbs[zone_id]; + if (!mb || (mb == (struct mbrt *)(-1UL))) { + errno = EINVAL; + return -1; + } + *maxsz = ZONE_MAX_SIZE; + *allotted = mb->space_usage; + } + return 0; +} + +void +heap_mbrt_incrmb_usage(struct palloc_heap *heap, uint32_t zone_id, int size) +{ + struct mbrt *mb = heap->rt->evictable_mbs[zone_id]; + + if (mb == (struct mbrt *)(-1UL)) { + heap->rt->default_mb->space_usage += size; + return; + } + + mb->space_usage += size; + if ((heap->rt->active_evictable_mb == mb) || + (labs((int64_t)(mb->space_usage - mb->prev_usage)) < MB_USAGE_DELTA)) + return; + + if (mb->space_usage > MB_U90) { + if (mb->qptr != &heap->rt->mb_u90) { + TAILQ_REMOVE(mb->qptr, mb, mb_link); + TAILQ_INSERT_TAIL(&heap->rt->mb_u90, mb, mb_link); + mb->qptr = &heap->rt->mb_u90; + heap_zinfo_set_usage(heap, zone_id, MB_U90_HINT); + } + } else if (mb->space_usage > MB_U75) { + if (mb->qptr != &heap->rt->mb_u75) { + TAILQ_REMOVE(mb->qptr, mb, mb_link); + TAILQ_INSERT_TAIL(&heap->rt->mb_u75, mb, mb_link); + mb->qptr = &heap->rt->mb_u75; + heap_zinfo_set_usage(heap, zone_id, MB_U75_HINT); + } + } else if (mb->space_usage > MB_U30) { + if (mb->qptr != &heap->rt->mb_u30) { + TAILQ_REMOVE(mb->qptr, mb, mb_link); + TAILQ_INSERT_TAIL(&heap->rt->mb_u30, mb, mb_link); + mb->qptr = &heap->rt->mb_u30; + heap_zinfo_set_usage(heap, zone_id, MB_U30_HINT); + 
heap->rt->mb_pressure = 0; + } + } else if (mb->qptr != &heap->rt->mb_u0) { + TAILQ_REMOVE(mb->qptr, mb, mb_link); + TAILQ_INSERT_TAIL(&heap->rt->mb_u0, mb, mb_link); + mb->qptr = &heap->rt->mb_u0; + heap_zinfo_set_usage(heap, zone_id, MB_U0_HINT); + heap->rt->mb_pressure = 0; + } + mb->prev_usage = mb->space_usage; +} + +int +heap_mbrt_mb_reclaim_garbage(struct palloc_heap *heap, uint32_t zid) +{ + struct mbrt *mb; + struct bucket *b; + + mb = heap_mbrt_get_mb(heap, zid); + + if ((mb->mb_id != 0) && (mb->garbage_reclaimed)) + return 0; + + b = mbrt_bucket_acquire(mb, DEFAULT_ALLOC_CLASS_ID); + heap_reclaim_zone_garbage(heap, b, zid); + mbrt_bucket_release(b); + + if (mb->mb_id != 0) + mb->garbage_reclaimed = 1; + + return 0; +} + +void +heap_set_root_ptrs(struct palloc_heap *heap, uint64_t **offp, uint64_t **sizep) +{ + *offp = &heap->layout_info.zone0->header.reserved[0]; + *sizep = &heap->layout_info.zone0->header.reserved[1]; +} + +void +heap_set_stats_ptr(struct palloc_heap *heap, struct stats_persistent **sp) +{ + D_CASSERT(sizeof(struct stats_persistent) == sizeof(uint64_t)); + *sp = (struct stats_persistent *)&heap->layout_info.zone0->header.sp_usage_glob; + VALGRIND_ADD_TO_GLOBAL_TX_IGNORE(*sp, sizeof(*sp)); +} + +/* + * heap_get_recycler - (internal) retrieves the recycler instance from the mbrt with + * the corresponding class id. Initializes the recycler if needed. + */ +static struct recycler * +heap_get_recycler(struct palloc_heap *heap, struct mbrt *mb, size_t id, size_t nallocs) +{ + struct recycler *r; + + D_ASSERT(mb != NULL); + util_atomic_load_explicit64(&mb->recyclers[id], &r, memory_order_acquire); + if (r != NULL) + return r; + + r = recycler_new(heap, nallocs, mb); + if (r && !util_bool_compare_and_swap64(&mb->recyclers[id], NULL, r)) { + /* + * If a different thread succeeded in assigning the recycler + * first, the recycler this thread created needs to be deleted. + */ + recycler_delete(r); + + return heap_get_recycler(heap, mb, id, nallocs); + } + + return r; +} + +/* + * heap_alloc_classes -- returns the allocation classes collection + */ +struct alloc_class_collection * +heap_alloc_classes(struct palloc_heap *heap) +{ + return heap->rt ? heap->rt->alloc_classes : NULL; +} + +/* + * heap_get_best_class -- returns the alloc class that best fits the + * requested size + */ +struct alloc_class * +heap_get_best_class(struct palloc_heap *heap, size_t size) +{ + return alloc_class_by_alloc_size(heap->rt->alloc_classes, size); +} + +/* + * heap_get_run_lock -- returns the lock associated with memory block + */ +pthread_mutex_t * +heap_get_run_lock(struct palloc_heap *heap, uint32_t chunk_id) +{ + return &heap->rt->run_locks[chunk_id % heap->rt->nlocks]; +} + +/* + * heap_max_zone -- (internal) calculates how many zones can the heap fit + */ +static unsigned +heap_max_zone(size_t size) +{ + unsigned max_zone = 0; + + size -= sizeof(struct heap_header); + + while (size >= ZONE_MIN_SIZE) { + max_zone++; + size -= size <= ZONE_MAX_SIZE ? 
size : ZONE_MAX_SIZE; + } + + return max_zone; +} + +/* + * zone_calc_size_idx -- (internal) calculates zone size index + */ +static uint32_t +zone_calc_size_idx(uint32_t zone_id, unsigned max_zone, size_t heap_size) +{ + ASSERT(max_zone > 0); + if (zone_id < max_zone - 1) + return MAX_CHUNK; + + ASSERT(heap_size >= zone_id * ZONE_MAX_SIZE); + size_t zone_raw_size = heap_size - zone_id * ZONE_MAX_SIZE; + + ASSERT(zone_raw_size >= (sizeof(struct zone_header) + + sizeof(struct chunk_header) * MAX_CHUNK) + + sizeof(struct heap_header)); + zone_raw_size -= sizeof(struct zone_header) + + sizeof(struct chunk_header) * MAX_CHUNK + + sizeof(struct heap_header); + + size_t zone_size_idx = zone_raw_size / CHUNKSIZE; + + ASSERT(zone_size_idx <= MAX_CHUNK); + + return (uint32_t)zone_size_idx; +} + +/* + * heap_zone_init -- (internal) writes zone's first chunk and header + */ +static void +heap_zone_init(struct palloc_heap *heap, uint32_t zone_id, uint32_t first_chunk_id, + bool is_evictable) +{ + struct zone *z = ZID_TO_ZONE(&heap->layout_info, zone_id); + uint32_t size_idx = zone_calc_size_idx(zone_id, heap->rt->nzones, heap->size); + + ASSERT(size_idx > first_chunk_id); + + struct zone_header nhdr = { + .size_idx = size_idx, + .magic = ZONE_HEADER_MAGIC, + }; + + z->header = nhdr; /* write the entire header at once */ + if (is_evictable) + z->header.flags |= ZONE_EVICTABLE_MB; + mo_wal_persist(&heap->p_ops, &z->header, sizeof(z->header)); + + memblock_huge_init(heap, first_chunk_id, zone_id, size_idx - first_chunk_id); +} + +/* + * heap_get_adjacent_free_block -- locates adjacent free memory block in heap + */ +static int +heap_get_adjacent_free_block(struct palloc_heap *heap, + const struct memory_block *in, struct memory_block *out, int prev) +{ + struct zone *z = ZID_TO_ZONE(&heap->layout_info, in->zone_id); + struct chunk_header *hdr = &z->chunk_headers[in->chunk_id]; + + out->zone_id = in->zone_id; + + if (prev) { + if (in->chunk_id == 0) + return ENOENT; + + struct chunk_header *prev_hdr = + &z->chunk_headers[in->chunk_id - 1]; + out->chunk_id = in->chunk_id - prev_hdr->size_idx; + + if (z->chunk_headers[out->chunk_id].type != CHUNK_TYPE_FREE) + return ENOENT; + + out->size_idx = z->chunk_headers[out->chunk_id].size_idx; + } else { /* next */ + if (in->chunk_id + hdr->size_idx == z->header.size_idx) + return ENOENT; + + out->chunk_id = in->chunk_id + hdr->size_idx; + + if (z->chunk_headers[out->chunk_id].type != CHUNK_TYPE_FREE) + return ENOENT; + + out->size_idx = z->chunk_headers[out->chunk_id].size_idx; + } + memblock_rebuild_state(heap, out); + + return 0; +} + +/* + * heap_coalesce -- (internal) merges adjacent memory blocks + */ +static struct memory_block +heap_coalesce(struct palloc_heap *heap, + const struct memory_block *blocks[], int n) +{ + struct memory_block ret = MEMORY_BLOCK_NONE; + + const struct memory_block *b = NULL; + + ret.size_idx = 0; + for (int i = 0; i < n; ++i) { + if (blocks[i] == NULL) + continue; + b = b ? 
b : blocks[i]; + ret.size_idx += blocks[i]->size_idx; + } + + ASSERTne(b, NULL); + + ret.chunk_id = b->chunk_id; + ret.zone_id = b->zone_id; + ret.block_off = b->block_off; + memblock_rebuild_state(heap, &ret); + + return ret; +} + +/* + * heap_coalesce_huge -- finds neighbors of a huge block, removes them from the + * volatile state and returns the resulting block + */ +static struct memory_block +heap_coalesce_huge(struct palloc_heap *heap, struct bucket *b, + const struct memory_block *m) +{ + const struct memory_block *blocks[3] = {NULL, m, NULL}; + + struct memory_block prev = MEMORY_BLOCK_NONE; + + if (heap_get_adjacent_free_block(heap, m, &prev, 1) == 0 && + bucket_remove_block(b, &prev) == 0) { + blocks[0] = &prev; + } + + struct memory_block next = MEMORY_BLOCK_NONE; + + if (heap_get_adjacent_free_block(heap, m, &next, 0) == 0 && + bucket_remove_block(b, &next) == 0) { + blocks[2] = &next; + } + + return heap_coalesce(heap, blocks, 3); +} + +/* + * heap_free_chunk_reuse -- reuses existing free chunk + */ +int +heap_free_chunk_reuse(struct palloc_heap *heap, + struct bucket *bucket, + struct memory_block *m) +{ + /* + * Perform coalescing just in case there + * are any neighboring free chunks. + */ + struct memory_block nm = heap_coalesce_huge(heap, bucket, m); + + if (nm.size_idx != m->size_idx) + m->m_ops->prep_hdr(&nm, MEMBLOCK_FREE, NULL); + + *m = nm; + + return bucket_insert_block(bucket, m); +} + +/* + * heap_run_into_free_chunk -- (internal) creates a new free chunk in place of + * a run. + */ +static void +heap_run_into_free_chunk(struct palloc_heap *heap, + struct bucket *bucket, + struct memory_block *m) +{ + struct chunk_header *hdr = heap_get_chunk_hdr(heap, m); + + m->block_off = 0; + m->size_idx = hdr->size_idx; + + STATS_SUB(heap->stats, transient, heap_run_active, + m->size_idx * CHUNKSIZE); + + /* + * The only thing this could race with is heap_memblock_on_free() + * because that function is called after processing the operation, + * which means that a different thread might immediately call this + * function if the free() made the run empty. + * We could forgo this lock if it weren't for helgrind which needs it + * to establish happens-before relation for the chunk metadata. + */ + pthread_mutex_t *lock = m->m_ops->get_lock(m); + + util_mutex_lock(lock); + + *m = memblock_huge_init(heap, m->chunk_id, m->zone_id, m->size_idx); + + heap_free_chunk_reuse(heap, bucket, m); + + util_mutex_unlock(lock); +} + +/* + * heap_reclaim_run -- checks the run for available memory if unclaimed. + * + * Returns 1 if reclaimed chunk, 0 otherwise. 
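To illustrate the coalescing rule implemented by heap_coalesce() above, here is a toy, self-contained version: the merged block keeps the leftmost (first non-NULL) chunk id and the summed size indices. The struct is a pared-down stand-in, not the real struct memory_block.

#include <assert.h>
#include <stdint.h>

struct toy_block { uint32_t chunk_id; uint32_t size_idx; };

static struct toy_block
toy_coalesce(const struct toy_block *prev, const struct toy_block *mid,
	     const struct toy_block *next)
{
	struct toy_block out = { .chunk_id = UINT32_MAX, .size_idx = 0 };
	const struct toy_block *parts[] = { prev, mid, next };

	for (int i = 0; i < 3; i++) {
		if (parts[i] == NULL)
			continue;
		if (out.chunk_id == UINT32_MAX)
			out.chunk_id = parts[i]->chunk_id;  /* leftmost wins */
		out.size_idx += parts[i]->size_idx;
	}
	return out;
}

static void
toy_coalesce_example(void)
{
	/* free chunks 4-5 and 7-9 around a freed chunk 6 */
	struct toy_block prev = { 4, 2 }, mid = { 6, 1 }, next = { 7, 3 };
	struct toy_block all = toy_coalesce(&prev, &mid, &next);

	assert(all.chunk_id == 4 && all.size_idx == 6);  /* one block: chunks 4-9 */
}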
+ */ +static int +heap_reclaim_run(struct palloc_heap *heap, struct memory_block *m, int startup) +{ + struct chunk_run *run = heap_get_chunk_run(heap, m); + struct chunk_header *hdr = heap_get_chunk_hdr(heap, m); + struct mbrt *mb = heap_mbrt_get_mb(heap, m->zone_id); + + struct alloc_class *c = alloc_class_by_run( + heap->rt->alloc_classes, + run->hdr.block_size, hdr->flags, m->size_idx); + + struct recycler_element e = recycler_element_new(heap, m); + + if (c == NULL) { + uint32_t size_idx = m->size_idx; + struct run_bitmap b; + + m->m_ops->get_bitmap(m, &b); + + ASSERTeq(size_idx, m->size_idx); + + return e.free_space == b.nbits; + } + + if (e.free_space == c->rdsc.nallocs) + return 1; + + if (startup) { + STATS_INC(heap->stats, transient, heap_run_active, + m->size_idx * CHUNKSIZE); + STATS_INC(heap->stats, transient, heap_run_allocated, + (c->rdsc.nallocs - e.free_space) * run->hdr.block_size); + } + struct recycler *recycler = heap_get_recycler(heap, mb, c->id, c->rdsc.nallocs); + + if (recycler == NULL || recycler_put(recycler, e) < 0) + ERR("lost runtime tracking info of %u run due to OOM", c->id); + + return 0; +} + +/* + * heap_reclaim_zone_garbage -- (internal) creates volatile state of unused runs + */ +static void +heap_reclaim_zone_garbage(struct palloc_heap *heap, struct bucket *bucket, + uint32_t zone_id) +{ + struct zone *z = ZID_TO_ZONE(&heap->layout_info, zone_id); + + for (uint32_t i = 0; i < z->header.size_idx; ) { + struct chunk_header *hdr = &z->chunk_headers[i]; + + ASSERT(hdr->size_idx != 0); + + struct memory_block m = MEMORY_BLOCK_NONE; + + m.zone_id = zone_id; + m.chunk_id = i; + m.size_idx = hdr->size_idx; + + memblock_rebuild_state(heap, &m); + m.m_ops->reinit_chunk(&m); + + switch (hdr->type) { + case CHUNK_TYPE_RUN: + if (heap_reclaim_run(heap, &m, 1) != 0) + heap_run_into_free_chunk(heap, bucket, &m); + break; + case CHUNK_TYPE_FREE: + heap_free_chunk_reuse(heap, bucket, &m); + break; + case CHUNK_TYPE_USED: + break; + default: + ASSERT(0); + } + + i = m.chunk_id + m.size_idx; /* hdr might have changed */ + } +} + +static int +heap_getnext_ne_zone(struct palloc_heap *heap, uint32_t *zone_id) +{ + bool allotted, evictable; + int i; + struct heap_rt *h = heap->rt; + + if (h->zones_ne_gc == h->zones_exhausted_ne) + return -1; + + i = h->zones_ne_gc ? h->zones_lastne_gc + 1 : 0; + + for (; i < h->zones_exhausted; i++) { + heap_zinfo_get(heap, i, &allotted, &evictable); + if (!allotted) + break; + if (!evictable) { + *zone_id = i; + return 0; + } + } + return -1; +} + +/* + * heap_populate_bucket -- (internal) creates volatile state of memory blocks + */ +static int +heap_populate_bucket(struct palloc_heap *heap, struct bucket *bucket) +{ + struct heap_rt *h = heap->rt; + struct mbrt *mb = bucket_get_mbrt(bucket); + struct umem_cache_range rg = {0}; + int rc; + uint32_t zone_id; + + if (mb->mb_id != 0) { + if (!mb->garbage_reclaimed) { + heap_reclaim_zone_garbage(heap, bucket, mb->mb_id); + mb->garbage_reclaimed = 1; + return 0; + } + return ENOMEM; + } + + rc = heap_getnext_ne_zone(heap, &zone_id); + if (!rc) + goto reclaim_garbage; + + /* at this point we are sure that there's no more memory in the heap */ + if (h->zones_exhausted_ne == h->nzones_ne) + return ENOMEM; + + zone_id = h->zones_exhausted++; + /* Create a umem cache map for the new zone */ + rg.cr_off = GET_ZONE_OFFSET(zone_id); + rg.cr_size = + ((heap->size - rg.cr_off) > ZONE_MAX_SIZE) ? 
ZONE_MAX_SIZE : heap->size - rg.cr_off; + heap_mbrt_setmb_nonevictable(heap, zone_id); + rc = umem_cache_map(heap->layout_info.store, &rg, 1); + if (rc != 0) { + rc = daos_der2errno(rc); + ERR("Failed to map zone %d to umem cache rc=%d\n", zone_id, rc); + h->zones_exhausted--; + return rc; + } + h->zones_exhausted_ne++; + + struct zone *z = ZID_TO_ZONE(&heap->layout_info, zone_id); + + VALGRIND_DO_MAKE_MEM_UNDEFINED(ZID_TO_ZONE(&heap->layout_info, zone_id), rg.cr_size); + if (rg.cr_size != ZONE_MAX_SIZE) + VALGRIND_DO_MAKE_MEM_NOACCESS(ZID_TO_ZONE(&heap->layout_info, zone_id) + rg.cr_size, + (ZONE_MAX_SIZE - rg.cr_size)); + + /* + * umem_cache_map() does not return a zeroed page. + * Explicitly memset the page. + */ + memset(z, 0, rg.cr_size); + + /* ignore zone and chunk headers */ + VALGRIND_ADD_TO_GLOBAL_TX_IGNORE(z, sizeof(z->header) + + sizeof(z->chunk_headers)); + + heap_zone_init(heap, zone_id, 0, false); + if (zone_id) + heap_zinfo_set(heap, zone_id, true, false); + +reclaim_garbage: + heap_reclaim_zone_garbage(heap, bucket, zone_id); + h->zones_lastne_gc = zone_id; + h->zones_ne_gc++; + + /* + * It doesn't matter that this function might not have found any + * free blocks because there is still potential that subsequent calls + * will find something in later zones. + */ + return 0; +} + +/* + * heap_recycle_unused -- recalculate scores in the recycler and turn any + * empty runs into free chunks + * + * If force is not set, this function might effectively be a noop if not enough + * of space was freed. + */ +static int +heap_recycle_unused(struct palloc_heap *heap, struct recycler *recycler, + struct bucket *defb, int force) +{ + struct mbrt *mb; + struct memory_block *nm; + struct empty_runs r = recycler_recalc(recycler, force); + struct bucket *nb; + + if (VEC_SIZE(&r) == 0) + return ENOMEM; + + mb = recycler_get_mbrt(recycler); + D_ASSERT(mb != NULL); + + nb = defb == NULL ? mbrt_bucket_acquire(mb, DEFAULT_ALLOC_CLASS_ID) : NULL; + + ASSERT(defb != NULL || nb != NULL); + + VEC_FOREACH_BY_PTR(nm, &r) { + heap_run_into_free_chunk(heap, defb ? defb : nb, nm); + } + + if (nb != NULL) + mbrt_bucket_release(nb); + + VEC_DELETE(&r); + + return 0; +} + +/* + * heap_reclaim_garbage -- (internal) creates volatile state of unused runs + */ +static int +heap_reclaim_garbage(struct palloc_heap *heap, struct bucket *bucket) +{ + int ret = ENOMEM; + struct recycler *r; + struct mbrt *mb = bucket_get_mbrt(bucket); + + for (size_t i = 0; i < MAX_ALLOCATION_CLASSES; ++i) { + r = mb->recyclers[i]; + if (r == NULL) + continue; + + if (heap_recycle_unused(heap, r, bucket, 1) == 0) + ret = 0; + } + + return ret; +} + +/* + * heap_ensure_huge_bucket_filled -- + * (internal) refills the default bucket if needed + */ +static int +heap_ensure_huge_bucket_filled(struct palloc_heap *heap, + struct bucket *bucket) +{ + if (heap_reclaim_garbage(heap, bucket) == 0) + return 0; + + if (heap_populate_bucket(heap, bucket) == 0) + return 0; + + return ENOMEM; +} + +/* + * heap_discard_run -- puts the memory block back into the global heap. 
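heap_populate_bucket() above maps every zone at ZONE_MAX_SIZE except possibly the last one, which only gets what remains of the heap. A small numeric sketch of that sizing rule, with a stand-in zone size and a simplified offset formula in place of GET_ZONE_OFFSET():

#include <assert.h>
#include <stdint.h>

#define TOY_ZONE_MAX  (16ULL << 20)   /* stand-in for ZONE_MAX_SIZE */

static uint64_t
toy_zone_map_size(uint64_t heap_size, uint32_t zone_id)
{
	uint64_t off  = (uint64_t)zone_id * TOY_ZONE_MAX; /* simplified offset */
	uint64_t left = heap_size - off;

	return left > TOY_ZONE_MAX ? TOY_ZONE_MAX : left;
}

static void
toy_zone_map_example(void)
{
	uint64_t heap = 40ULL << 20;                 /* 40 MiB heap, 16 MiB zones */

	assert(toy_zone_map_size(heap, 0) == TOY_ZONE_MAX);
	assert(toy_zone_map_size(heap, 2) == (8ULL << 20)); /* trailing partial zone */
}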
+ */ +void +heap_discard_run(struct palloc_heap *heap, struct memory_block *m) +{ + struct mbrt *mb = heap_mbrt_get_mb(heap, m->zone_id); + + D_ASSERT(mb != NULL); + if (heap_reclaim_run(heap, m, 0)) { + struct bucket *b = mbrt_bucket_acquire(mb, DEFAULT_ALLOC_CLASS_ID); + + heap_run_into_free_chunk(heap, b, m); + + mbrt_bucket_release(b); + } +} + +/* + * heap_detach_and_try_discard_run -- detaches the active from a bucket and + * tries to discard the run if it is completely empty (has no allocations) + */ +static int +heap_detach_and_try_discard_run(struct palloc_heap *heap, struct bucket *b) +{ + int empty = 0; + struct memory_block m; + + if (bucket_detach_run(b, &m, &empty) != 0) + return -1; + + if (empty) + heap_discard_run(heap, &m); + + return 0; +} + +/* + * heap_reuse_from_recycler -- (internal) try reusing runs that are currently + * in the recycler + */ +static int +heap_reuse_from_recycler(struct palloc_heap *heap, + struct bucket *b, uint32_t units, int force) +{ + struct mbrt *mb = bucket_get_mbrt(b); + struct memory_block m = MEMORY_BLOCK_NONE; + + m.size_idx = units; + + struct alloc_class *aclass = bucket_alloc_class(b); + + struct recycler *recycler = heap_get_recycler(heap, mb, aclass->id, aclass->rdsc.nallocs); + + if (recycler == NULL) { + ERR("lost runtime tracking info of %u run due to OOM", + aclass->id); + return 0; + } + + if (!force && recycler_get(recycler, &m) == 0) + return bucket_attach_run(b, &m); + + heap_recycle_unused(heap, recycler, NULL, force); + + if (recycler_get(recycler, &m) == 0) + return bucket_attach_run(b, &m); + + return ENOMEM; +} + +/* + * heap_run_create -- (internal) initializes a new run on an existing free chunk + */ +static int +heap_run_create(struct palloc_heap *heap, struct bucket *b, + struct memory_block *m) +{ + struct alloc_class *aclass = bucket_alloc_class(b); + *m = memblock_run_init(heap, m->chunk_id, m->zone_id, &aclass->rdsc); + + bucket_attach_run(b, m); + + STATS_INC(heap->stats, transient, heap_run_active, + m->size_idx * CHUNKSIZE); + + return 0; +} + +/* + * heap_ensure_run_bucket_filled -- (internal) refills the bucket if needed + */ +static int +heap_ensure_run_bucket_filled(struct palloc_heap *heap, struct bucket *b, + uint32_t units) +{ + int ret = 0; + struct alloc_class *aclass = bucket_alloc_class(b); + struct mbrt *mb = bucket_get_mbrt(b); + struct memory_block m; + struct bucket *defb; + + D_ASSERT(mb != NULL); + ASSERTeq(aclass->type, CLASS_RUN); + + if (mbrt_is_laf(mb, aclass->id)) + return ENOMEM; + + if (heap_detach_and_try_discard_run(heap, b) != 0) + return ENOMEM; + + if (heap_reuse_from_recycler(heap, b, units, 0) == 0) + goto out; + + m = MEMORY_BLOCK_NONE; + + m.size_idx = aclass->rdsc.size_idx; + + defb = mbrt_bucket_acquire(mb, DEFAULT_ALLOC_CLASS_ID); + + /* cannot reuse an existing run, create a new one */ + if (heap_get_bestfit_block(heap, defb, &m) == 0) { + ASSERTeq(m.block_off, 0); + if (heap_run_create(heap, b, &m) != 0) { + mbrt_bucket_release(defb); + return ENOMEM; + } + mbrt_bucket_release(defb); + goto out; + } + mbrt_bucket_release(defb); + + if (heap_reuse_from_recycler(heap, b, units, 1) == 0) + goto out; + + mbrt_set_laf(mb, aclass->id); + ret = ENOMEM; +out: + return ret; +} + +/* + * heap_memblock_on_free -- bookkeeping actions executed at every free of a + * block + */ +void +heap_memblock_on_free(struct palloc_heap *heap, const struct memory_block *m) +{ + struct mbrt *mb = heap_mbrt_get_mb(heap, m->zone_id); + + if (m->type != MEMORY_BLOCK_RUN) + return; + + struct 
chunk_header *hdr = heap_get_chunk_hdr(heap, m); + struct chunk_run *run = heap_get_chunk_run(heap, m); + + ASSERTeq(hdr->type, CHUNK_TYPE_RUN); + + struct alloc_class *c = alloc_class_by_run( + heap->rt->alloc_classes, + run->hdr.block_size, hdr->flags, hdr->size_idx); + + if (c == NULL) + return; + + struct recycler *recycler = heap_get_recycler(heap, mb, c->id, c->rdsc.nallocs); + + if (recycler == NULL) { + ERR("lost runtime tracking info of %u run due to OOM", + c->id); + } else { + recycler_inc_unaccounted(recycler, m); + mbrt_clear_laf(mb); + } +} + +/* + * heap_split_block -- (internal) splits unused part of the memory block + */ +static void +heap_split_block(struct palloc_heap *heap, struct bucket *b, + struct memory_block *m, uint32_t units) +{ + struct alloc_class *aclass = bucket_alloc_class(b); + + ASSERT(units <= MAX_CHUNK); + ASSERT(units > 0); + + if (aclass->type == CLASS_RUN) { + ASSERT((uint64_t)m->block_off + (uint64_t)units <= UINT32_MAX); + struct memory_block r = {m->chunk_id, m->zone_id, + m->size_idx - units, (uint32_t)(m->block_off + units), + NULL, NULL, 0, 0, NULL}; + memblock_rebuild_state(heap, &r); + if (bucket_insert_block(b, &r) != 0) + D_CRIT("failed to allocate memory block runtime tracking info\n"); + } else { + uint32_t new_chunk_id = m->chunk_id + units; + uint32_t new_size_idx = m->size_idx - units; + + struct memory_block n = memblock_huge_init(heap, + new_chunk_id, m->zone_id, new_size_idx); + + *m = memblock_huge_init(heap, m->chunk_id, m->zone_id, units); + + if (bucket_insert_block(b, &n) != 0) + D_CRIT("failed to allocate memory block runtime tracking info\n"); + } + + m->size_idx = units; +} + +/* + * heap_get_bestfit_block -- + * extracts a memory block of equal size index + */ +int +heap_get_bestfit_block(struct palloc_heap *heap, struct bucket *b, + struct memory_block *m) +{ + struct alloc_class *aclass = bucket_alloc_class(b); + uint32_t units = m->size_idx; + + while (bucket_alloc_block(b, m) != 0) { + if (aclass->type == CLASS_HUGE) { + if (heap_ensure_huge_bucket_filled(heap, b) != 0) + return ENOMEM; + } else { + if (heap_ensure_run_bucket_filled(heap, b, units) != 0) + return ENOMEM; + } + } + + ASSERT(m->size_idx >= units); + + if (units != m->size_idx) + heap_split_block(heap, b, m, units); + + m->m_ops->ensure_header_type(m, aclass->header_type); + m->header_type = aclass->header_type; + + return 0; +} + +/* + * heap_create_alloc_class_buckets -- allocates all cache bucket + * instances of the specified type + */ +int +heap_create_alloc_class_buckets(struct palloc_heap *heap, struct alloc_class *c) +{ + struct mbrt *default_mb = heap->rt->default_mb; + + if (default_mb->buckets[c->id] == NULL) { + default_mb->buckets[c->id] = + bucket_locked_new(container_new_seglists(heap), c, default_mb); + if (default_mb->buckets[c->id] == NULL) + return -1; + } + + return 0; +} + +/* + * heap_write_header -- (internal) creates a clean header + */ +static int +heap_write_header(struct umem_store *store, size_t heap_size, size_t umem_cache_size, + uint32_t nemb_pct) +{ + struct heap_header *newhdr; + int rc; + + D_ALLOC_PTR(newhdr); + if (!newhdr) + return -1; + + strncpy(newhdr->signature, HEAP_SIGNATURE, HEAP_SIGNATURE_LEN); + newhdr->major = HEAP_MAJOR; + newhdr->minor = HEAP_MINOR; + newhdr->heap_size = heap_size; + newhdr->cache_size = umem_cache_size; + newhdr->heap_hdr_size = sizeof(struct heap_header); + newhdr->chunksize = CHUNKSIZE; + newhdr->chunks_per_zone = MAX_CHUNK; + newhdr->nemb_pct = (uint8_t)nemb_pct; + newhdr->checksum = 0; + 
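+	/*
+	 * The checksum is computed over the whole 4k header (the checksum
+	 * field itself is zeroed above) and stored in place before the
+	 * header is persisted through meta_update().
+	 */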
+ util_checksum(newhdr, sizeof(*newhdr), &newhdr->checksum, 1, 0); + rc = meta_update(store, newhdr, 0, sizeof(*newhdr)); + D_FREE(newhdr); + + return rc; +} + +/* + * heap_cleanup -- cleanups the volatile heap state + */ +void +heap_cleanup(struct palloc_heap *heap) +{ + struct heap_rt *rt = heap->rt; + unsigned i; + + alloc_class_collection_delete(rt->alloc_classes); + + for (i = 0; i < rt->nlocks; ++i) + util_mutex_destroy(&rt->run_locks[i]); + +#if VG_MEMCHECK_ENABLED + VALGRIND_DO_DESTROY_MEMPOOL(heap->layout_info.zone0); + if (On_memcheck) { + for (i = 0; i < heap->rt->zones_exhausted; i++) { + if (!heap_mbrt_ismb_initialized(heap, i) || + !heap_mbrt_ismb_evictable(heap, i)) + continue; + if (umem_cache_offisloaded(heap->layout_info.store, GET_ZONE_OFFSET(i))) + VALGRIND_DO_DESTROY_MEMPOOL(ZID_TO_ZONE(&heap->layout_info, i)); + } + } +#endif + heap_mbrt_fini(heap); + + D_FREE(rt); + heap->rt = NULL; +} + +/* + * heap_verify_header -- (internal) verifies if the heap header is consistent + */ +static int +heap_verify_header(struct heap_header *hdr, size_t heap_size, size_t cache_size) +{ + if (util_checksum(hdr, sizeof(*hdr), &hdr->checksum, 0, 0) != 1) { + D_CRIT("heap: invalid header's checksum\n"); + return -1; + } + + if ((hdr->major != HEAP_MAJOR) || (hdr->minor > HEAP_MINOR)) { + D_ERROR("Version mismatch of heap layout\n"); + return -1; + } + + if (hdr->heap_size != heap_size) { + D_ERROR("Metadata store size mismatch, created with %lu , opened with %lu\n", + hdr->heap_size, heap_size); + return -1; + } + + if (hdr->cache_size != cache_size) { + D_ERROR("umem cache size mismatch, created with %lu , opened with %lu\n", + hdr->cache_size, cache_size); + return -1; + } + + if (hdr->nemb_pct > 100) { + D_ERROR("nemb pct value (%d) in heap header is incorrect\n", hdr->nemb_pct); + return -1; + } + + if ((hdr->heap_hdr_size != sizeof(struct heap_header)) || (hdr->chunksize != CHUNKSIZE) || + (hdr->chunks_per_zone != MAX_CHUNK)) { + D_ERROR("incompatible heap layout: hdr_sz=%lu, chunk_sz=%lu, max_chunks=%lu\n", + hdr->heap_hdr_size, hdr->chunksize, hdr->chunks_per_zone); + return -1; + } + + return 0; +} + +int +heap_zone_load(struct palloc_heap *heap, uint32_t zid) +{ + struct umem_cache_range rg = {0}; + struct umem_store *store = heap->layout_info.store; + int rc; + + D_ASSERT(heap->rt->nzones > zid); + + rg.cr_off = GET_ZONE_OFFSET(zid); + rg.cr_size = ((store->stor_size - rg.cr_off) > ZONE_MAX_SIZE) + ? 
ZONE_MAX_SIZE + : (store->stor_size - rg.cr_off); + rc = umem_cache_load(store, &rg, 1, 0); + if (rc) { + D_ERROR("Failed to load pages to umem cache"); + return daos_der2errno(rc); + } + return 0; +} + +int +heap_ensure_zone0_initialized(struct palloc_heap *heap) +{ + struct mbrt *mb; + struct bucket *b; + int rc = 0; + + heap_mbrt_setmb_nonevictable(heap, 0); + if (heap->layout_info.zone0->header.magic != ZONE_HEADER_MAGIC) { + /* If not magic the content should be zero, indicating new file */ + D_ASSERT(heap->layout_info.zone0->header.magic == 0); + mb = heap_mbrt_get_mb(heap, 0); + b = mbrt_bucket_acquire(mb, DEFAULT_ALLOC_CLASS_ID); + rc = heap_populate_bucket(heap, b); + mbrt_bucket_release(b); + } +#if VG_MEMCHECK_ENABLED + else { + if (On_memcheck) + palloc_heap_vg_zone_open(heap, 0, 1); + } +#endif + heap_mbrt_setmb_usage(heap, 0, heap->layout_info.zone0->header.sp_usage); + return rc; +} + +D_CASSERT(sizeof(struct zone) == 4096); +D_CASSERT(sizeof(struct heap_header) == 4096); + +#define MAX_HEADER_FETCH 4 + +/* + * heap_boot -- opens the heap region of the dav_obj pool + * + * If successful function returns zero. Otherwise an error number is returned. + */ +int +heap_boot(struct palloc_heap *heap, void *mmap_base, uint64_t heap_size, uint64_t cache_size, + struct mo_ops *p_ops, struct stats *stats) +{ + struct heap_rt *h; + struct heap_header *newhdr; + int err; + struct heap_zone_limits hzl; + uint32_t nemb_pct = HEAP_NEMB_PCT_DEFAULT; + + D_ALLOC_PTR(newhdr); + if (!newhdr) + return ENOMEM; + + err = meta_fetch(p_ops->umem_store, newhdr, 0, sizeof(*newhdr)); + if (err) { + ERR("failed to read the heap header"); + D_FREE(newhdr); + return err; + } + err = heap_verify_header(newhdr, heap_size, cache_size); + if (err) { + ERR("incompatible heap detected"); + D_FREE(newhdr); + return EINVAL; + } + if (newhdr->nemb_pct) + nemb_pct = newhdr->nemb_pct; + D_FREE(newhdr); + + D_ALLOC_PTR_NZ(h); + if (h == NULL) { + err = ENOMEM; + goto error_heap_malloc; + } + + h->alloc_classes = alloc_class_collection_new(); + if (h->alloc_classes == NULL) { + err = ENOMEM; + goto error_alloc_classes_new; + } + + hzl = heap_get_zone_limits(heap_size, cache_size, nemb_pct); + + h->nzones = hzl.nzones_heap; + h->nzones_ne = hzl.nzones_ne_max; + h->nzones_e = hzl.nzones_e_max; + h->zones_exhausted = 0; + h->zones_exhausted_e = 0; + h->zones_exhausted_ne = 0; + h->zones_ne_gc = 0; + h->zones_lastne_gc = 0; + h->zones_unused_first = 0; + + h->nlocks = On_valgrind ? 
MAX_RUN_LOCKS_VG : MAX_RUN_LOCKS; + for (unsigned i = 0; i < h->nlocks; ++i) + util_mutex_init(&h->run_locks[i]); + heap->rt = h; + + heap->p_ops = *p_ops; + heap->layout_info.store = p_ops->umem_store; + heap->layout_info.zone0 = mmap_base; + heap->size = heap_size; + heap->base = mmap_base; + heap->stats = stats; + heap->alloc_pattern = PALLOC_CTL_DEBUG_NO_PATTERN; + VALGRIND_DO_CREATE_MEMPOOL(heap->layout_info.zone0, 0, 0); + + err = heap_mbrt_init(heap); + if (err) + goto error_mbrt_init; + + return 0; + +error_mbrt_init: + alloc_class_collection_delete(h->alloc_classes); +error_alloc_classes_new: + D_FREE(h); + heap->rt = NULL; +error_heap_malloc: + return err; +} + +static unsigned int +heap_get_nemb_pct() +{ + unsigned int nemb_pct; + + nemb_pct = HEAP_NEMB_PCT_DEFAULT; + d_getenv_uint("DAOS_MD_ON_SSD_NEMB_PCT", &nemb_pct); + if ((nemb_pct > 100) || (nemb_pct == 0)) { + D_ERROR("Invalid value %d for tunable DAOS_MD_ON_SSD_NEMB_PCT", nemb_pct); + nemb_pct = HEAP_NEMB_PCT_DEFAULT; + } + D_INFO("DAOS_MD_ON_SSD_NEMB_PCT set to %d", nemb_pct); + + return nemb_pct; +} + +int +heap_get_max_nemb(struct palloc_heap *heap) +{ + return heap->rt->nzones_ne; +} + +/* + * heap_init -- initializes the heap + * + * If successful function returns zero. Otherwise an error number is returned. + */ +int +heap_init(void *heap_start, uint64_t umem_cache_size, struct umem_store *store) +{ + int nzones; + uint32_t nemb_pct = heap_get_nemb_pct(); + uint64_t heap_size = store->stor_size; + + if (heap_size < HEAP_MIN_SIZE) + return EINVAL; + + D_ASSERT(store->stor_priv != NULL); + + nzones = heap_max_zone(heap_size); + meta_clear_pages(store, sizeof(struct heap_header), 4096, ZONE_MAX_SIZE, nzones); + + if (heap_write_header(store, heap_size, umem_cache_size, nemb_pct)) + return ENOMEM; + + return 0; +} + +static inline int +heap_create_evictable_mb(struct palloc_heap *heap, uint32_t *mb_id) +{ + uint32_t zone_id; + struct mbrt *mb; + struct umem_cache_range rg = {0}; + int rc; + struct zone *z; + struct umem_pin_handle *pin_handle = NULL; + struct umem_store *store = heap->layout_info.store; + + D_ASSERT(heap->rt->active_evictable_mb == NULL); + + if (heap->rt->zones_exhausted_e >= heap->rt->nzones_e) + return -1; + + heap->rt->mb_create_waiters++; + if (heap->rt->mb_create_waiters > 1) { + D_ASSERT(store->stor_ops->so_waitqueue_wait != NULL); + store->stor_ops->so_waitqueue_wait(heap->rt->mb_create_wq, false); + D_ASSERT((int)heap->rt->mb_create_waiters >= 0); + rc = 1; + errno = EBUSY; + goto out; + } + + for (zone_id = heap->rt->zones_unused_first; zone_id < heap->rt->nzones; zone_id++) { + if (!heap_mbrt_ismb_initialized(heap, zone_id)) + break; + } + + D_ASSERT(zone_id < heap->rt->nzones); + mb = heap_mbrt_setup_mb(heap, zone_id); + if (mb == NULL) { + ERR("Failed to setup mbrt for zone %u\n", zone_id); + rc = -1; + goto out; + } + + heap->rt->zones_unused_first = zone_id + 1; + if (heap->rt->zones_exhausted < heap->rt->zones_unused_first) + heap->rt->zones_exhausted = heap->rt->zones_unused_first; + heap->rt->zones_exhausted_e++; + heap_mbrt_setmb_evictable(heap, mb); + + /* Create a umem cache map for the new zone */ + rg.cr_off = GET_ZONE_OFFSET(zone_id); + rg.cr_size = + ((heap->size - rg.cr_off) > ZONE_MAX_SIZE) ? 
ZONE_MAX_SIZE : heap->size - rg.cr_off; + + rc = umem_cache_map(heap->layout_info.store, &rg, 1); + if (rc != 0) { + ERR("Failed to map zone %u to umem cache\n", zone_id); + errno = daos_der2errno(rc); + goto error; + } + + D_DEBUG(DB_TRACE, "Creating evictable zone %d\n", zone_id); + + z = ZID_TO_ZONE(&heap->layout_info, zone_id); + VALGRIND_DO_CREATE_MEMPOOL(z, 0, 0); + VALGRIND_DO_MAKE_MEM_UNDEFINED(z, rg.cr_size); + if (rg.cr_size != ZONE_MAX_SIZE) + VALGRIND_DO_MAKE_MEM_NOACCESS(z + rg.cr_size, (ZONE_MAX_SIZE - rg.cr_size)); + + memset(z, 0, rg.cr_size); + + rc = umem_cache_pin(heap->layout_info.store, &rg, 1, false, &pin_handle); + if (rc) { + errno = daos_der2errno(rc); + goto error; + } + + /* ignore zone and chunk headers */ + VALGRIND_ADD_TO_GLOBAL_TX_IGNORE(z, sizeof(z->header) + sizeof(z->chunk_headers)); + + rc = lw_tx_begin(heap->p_ops.base); + if (rc) + goto error; + + heap_zone_init(heap, zone_id, 0, true); + rc = heap_mbrt_mb_reclaim_garbage(heap, zone_id); + if (rc) { + ERR("Failed to initialize evictable zone %u", zone_id); + lw_tx_end(heap->p_ops.base, NULL); + goto error; + } + heap_zinfo_set(heap, zone_id, true, true); + lw_tx_end(heap->p_ops.base, NULL); + umem_cache_unpin(heap->layout_info.store, pin_handle); + + *mb_id = zone_id; + rc = 0; + goto out; + +error: + if (pin_handle) + umem_cache_unpin(heap->layout_info.store, pin_handle); + heap_mbrt_cleanup_mb(mb); + heap->rt->evictable_mbs[zone_id] = NULL; + heap->rt->zones_exhausted_e--; + if (heap->rt->zones_unused_first > zone_id) + heap->rt->zones_unused_first = zone_id; + rc = -1; + +out: + heap->rt->mb_create_waiters--; + D_ASSERT((int)heap->rt->mb_create_waiters >= 0); + if (heap->rt->mb_create_waiters) { + D_ASSERT(store->stor_ops->so_waitqueue_wakeup != NULL); + store->stor_ops->so_waitqueue_wakeup(heap->rt->mb_create_wq, false); + } + return rc; +} + +int +heap_get_evictable_mb(struct palloc_heap *heap, uint32_t *mb_id) +{ + struct mbrt *mb; + int ret; + +retry: + if (heap->rt->active_evictable_mb != NULL) { + if ((heap->rt->mb_pressure) || + (heap->rt->active_evictable_mb->space_usage <= MB_U75)) { + *mb_id = heap->rt->active_evictable_mb->mb_id; + return 0; + } + mb = heap->rt->active_evictable_mb; + heap->rt->active_evictable_mb = NULL; + heap_mbrt_setmb_usage(heap, mb->mb_id, mb->space_usage); + } + heap->rt->mb_pressure = 0; + + if ((mb = TAILQ_FIRST(&heap->rt->mb_u30)) != NULL) + TAILQ_REMOVE(&heap->rt->mb_u30, mb, mb_link); + else if ((mb = TAILQ_FIRST(&heap->rt->mb_u0)) != NULL) + TAILQ_REMOVE(&heap->rt->mb_u0, mb, mb_link); + else if ((ret = heap_create_evictable_mb(heap, mb_id)) >= 0) { + if (ret) + goto retry; + mb = heap_mbrt_get_mb(heap, *mb_id); + D_ASSERT(mb != NULL); + if (heap->rt->active_evictable_mb) { + TAILQ_INSERT_HEAD(&heap->rt->mb_u0, mb, mb_link); + mb->qptr = &heap->rt->mb_u0; + *mb_id = heap->rt->active_evictable_mb->mb_id; + return 0; + } + } else if ((mb = TAILQ_FIRST(&heap->rt->mb_u75)) != NULL) { + TAILQ_REMOVE(&heap->rt->mb_u75, mb, mb_link); + heap->rt->mb_pressure = 1; + } else if ((mb = TAILQ_FIRST(&heap->rt->mb_u90)) != NULL) { + TAILQ_REMOVE(&heap->rt->mb_u90, mb, mb_link); + heap->rt->mb_pressure = 1; + } else { + D_ERROR("Failed to get an evictable MB"); + *mb_id = 0; + return 0; + } + heap->rt->active_evictable_mb = mb; + mb->qptr = NULL; + *mb_id = mb->mb_id; + return 0; +} + +uint32_t +heap_off2mbid(struct palloc_heap *heap, uint64_t offset) +{ + struct memory_block m = memblock_from_offset_opt(heap, offset, 0); + + if (heap_mbrt_ismb_evictable(heap, m.zone_id)) + 
return m.zone_id; + else + return 0; +} + +int +heap_update_mbrt_zinfo(struct palloc_heap *heap, bool init) +{ + bool allotted, evictable; + struct zone *z0 = heap->layout_info.zone0; + int nemb_cnt = 1, emb_cnt = 0, i; + struct mbrt *mb; + struct zone *z; + enum mb_usage_hint usage_hint; + int last_allocated = 0; + + heap->rt->zinfo_vec = HEAP_OFF_TO_PTR(heap, z0->header.zone0_zinfo_off); + heap->rt->zinfo_vec_size = z0->header.zone0_zinfo_size; + + if (init) + heap_zinfo_init(heap); + else { + D_ASSERT(heap->rt->zinfo_vec->num_elems == heap->rt->nzones); + heap_zinfo_get(heap, 0, &allotted, &evictable); + D_ASSERT((evictable == false) && (allotted == true)); + } + + for (i = 1; i < heap->rt->nzones; i++) { + heap_zinfo_get(heap, i, &allotted, &evictable); + if (!allotted) { + if (!heap->rt->zones_unused_first) + heap->rt->zones_unused_first = i; + continue; + } + if (!evictable) { + heap_mbrt_setmb_nonevictable(heap, i); + nemb_cnt++; + } else { + mb = heap_mbrt_setup_mb(heap, i); + if (mb == NULL) + return ENOMEM; + heap_mbrt_setmb_evictable(heap, mb); + if (umem_cache_offisloaded(heap->layout_info.store, GET_ZONE_OFFSET(i))) { + z = ZID_TO_ZONE(&heap->layout_info, i); + D_ASSERT(z->header.flags & ZONE_EVICTABLE_MB); + heap_mbrt_setmb_usage(heap, i, z->header.sp_usage); + } else { + heap_zinfo_get_usage(heap, i, &usage_hint); + heap_mbrt_setmb_usage(heap, i, mb_usage_byhint[(int)usage_hint]); + } + emb_cnt++; + } + last_allocated = i; + } + heap->rt->zones_exhausted = last_allocated + 1; + heap->rt->zones_exhausted_ne = nemb_cnt; + heap->rt->zones_exhausted_e = emb_cnt; + + D_ASSERT(heap->rt->nzones_e >= heap->rt->zones_exhausted_e); + D_ASSERT(heap->rt->nzones_ne >= heap->rt->zones_exhausted_ne); + return 0; +} + +/* + * heap_load_nonevictable_zones() -> Populate the heap with non-evictable MBs. 
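+ *
+ * Walks the zinfo entries of every exhausted zone (zone 0 is handled
+ * separately by heap_ensure_zone0_initialized()) and loads each allotted
+ * non-evictable zone into the umem cache via heap_zone_load().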
+ */ +int +heap_load_nonevictable_zones(struct palloc_heap *heap) +{ + int i, rc; + bool allotted, evictable; + + for (i = 1; i < heap->rt->zones_exhausted; i++) { + heap_zinfo_get(heap, i, &allotted, &evictable); + D_ASSERT(allotted); + if (!evictable) { + rc = heap_zone_load(heap, i); + if (rc) + return rc; + } + } + return 0; +} + +#if 0 +/* + * heap_verify_zone_header -- + * (internal) verifies if the zone header is consistent + */ +static int +heap_verify_zone_header(struct zone_header *hdr) +{ + if (hdr->magic != ZONE_HEADER_MAGIC) /* not initialized */ + return 0; + + if (hdr->size_idx == 0) { + D_CRIT("heap: invalid zone size\n"); + return -1; + } + + return 0; +} + +/* + * heap_verify_chunk_header -- + * (internal) verifies if the chunk header is consistent + */ +static int +heap_verify_chunk_header(struct chunk_header *hdr) +{ + if (hdr->type == CHUNK_TYPE_UNKNOWN) { + D_CRIT("heap: invalid chunk type\n"); + return -1; + } + + if (hdr->type >= MAX_CHUNK_TYPE) { + D_CRIT("heap: unknown chunk type\n"); + return -1; + } + + if (hdr->flags & ~CHUNK_FLAGS_ALL_VALID) { + D_CRIT("heap: invalid chunk flags\n"); + return -1; + } + + return 0; +} + +/* + * heap_verify_zone -- (internal) verifies if the zone is consistent + */ +static int +heap_verify_zone(struct zone *zone) +{ + if (zone->header.magic == 0) + return 0; /* not initialized, and that is OK */ + + if (zone->header.magic != ZONE_HEADER_MAGIC) { + D_CRIT("heap: invalid zone magic\n"); + return -1; + } + + if (heap_verify_zone_header(&zone->header)) + return -1; + + uint32_t i; + + for (i = 0; i < zone->header.size_idx; ) { + if (heap_verify_chunk_header(&zone->chunk_headers[i])) + return -1; + + i += zone->chunk_headers[i].size_idx; + } + + if (i != zone->header.size_idx) { + D_CRIT("heap: chunk sizes mismatch\n"); + return -1; + } + + return 0; +} + +/* + * heap_check -- verifies if the heap is consistent and can be opened properly + * + * If successful function returns zero. Otherwise an error number is returned. 
+ */ +int +heap_check(void *heap_start, uint64_t heap_size) +{ + if (heap_size < HEAP_MIN_SIZE) { + D_CRIT("heap: invalid heap size\n"); + return -1; + } + + struct heap_layout *layout = heap_start; + + if (heap_verify_header(&layout->header, heap_size)) + return -1; + + for (unsigned i = 0; i < heap_max_zone(heap_size); ++i) { + if (heap_verify_zone(ZID_TO_ZONE(layout, i))) + return -1; + } + + return 0; +} +#endif + +/* + * heap_zone_foreach_object -- (internal) iterates through objects in a zone + */ +static int +heap_zone_foreach_object(struct palloc_heap *heap, object_callback cb, + void *arg, struct memory_block *m) +{ + struct zone *zone = ZID_TO_ZONE(&heap->layout_info, m->zone_id); + + if (zone->header.magic == 0) + return 0; + + for (; m->chunk_id < zone->header.size_idx; ) { + struct chunk_header *hdr = heap_get_chunk_hdr(heap, m); + + memblock_rebuild_state(heap, m); + m->size_idx = hdr->size_idx; + + if (m->m_ops->iterate_used(m, cb, arg) != 0) + return 1; + + m->chunk_id += m->size_idx; + m->block_off = 0; + } + + return 0; +} + +/* + * heap_foreach_object -- (internal) iterates through objects in the heap + */ +void +heap_foreach_object(struct palloc_heap *heap, object_callback cb, void *arg, + struct memory_block m) +{ + for (; m.zone_id < heap->rt->nzones; ++m.zone_id) { + if (heap_zone_foreach_object(heap, cb, arg, &m) != 0) + break; + + m.chunk_id = 0; + } +} + +struct heap_zone_limits +heap_get_zone_limits(uint64_t heap_size, uint64_t cache_size, uint32_t nemb_pct) +{ + struct heap_zone_limits zd = {0}; + + D_ASSERT(nemb_pct <= 100); + + if (heap_size < sizeof(struct heap_header)) + zd.nzones_heap = 0; + else + zd.nzones_heap = heap_max_zone(heap_size); + + zd.nzones_cache = cache_size / ZONE_MAX_SIZE; + if (zd.nzones_cache <= UMEM_CACHE_MIN_EVICTABLE_PAGES) + return zd; + + if (zd.nzones_heap > zd.nzones_cache) { + if (zd.nzones_heap < (zd.nzones_cache + UMEM_CACHE_MIN_EVICTABLE_PAGES)) + zd.nzones_ne_max = zd.nzones_cache - UMEM_CACHE_MIN_EVICTABLE_PAGES; + else + zd.nzones_ne_max = ((unsigned long)zd.nzones_cache * nemb_pct) / 100; + if (zd.nzones_cache < (zd.nzones_ne_max + UMEM_CACHE_MIN_EVICTABLE_PAGES)) + zd.nzones_ne_max = zd.nzones_cache - UMEM_CACHE_MIN_EVICTABLE_PAGES; + } else + zd.nzones_ne_max = zd.nzones_heap; + + zd.nzones_e_max = zd.nzones_heap - zd.nzones_ne_max; + + return zd; +} + +#if VG_MEMCHECK_ENABLED +void +heap_vg_zone_open(struct palloc_heap *heap, uint32_t zone_id, object_callback cb, void *args, + int objects) +{ + struct memory_block m = MEMORY_BLOCK_NONE; + uint32_t chunks; + struct chunk_header *hdr; + struct zone *z = ZID_TO_ZONE(&heap->layout_info, zone_id); + uint32_t c; + + m.zone_id = zone_id; + m.chunk_id = 0; + + VALGRIND_DO_MAKE_MEM_UNDEFINED(z, ZONE_MAX_SIZE); + + VALGRIND_DO_MAKE_MEM_DEFINED(&z->header, sizeof(z->header)); + + D_ASSERT(z->header.magic == ZONE_HEADER_MAGIC); + + chunks = z->header.size_idx; + + for (c = 0; c < chunks;) { + hdr = &z->chunk_headers[c]; + + /* define the header before rebuilding state */ + VALGRIND_DO_MAKE_MEM_DEFINED(hdr, sizeof(*hdr)); + + m.chunk_id = c; + m.size_idx = hdr->size_idx; + + memblock_rebuild_state(heap, &m); + + m.m_ops->vg_init(&m, objects, cb, args); + m.block_off = 0; + + ASSERT(hdr->size_idx > 0); + + c += hdr->size_idx; + } + + /* mark all unused chunk headers after last as not accessible */ + VALGRIND_DO_MAKE_MEM_NOACCESS(&z->chunk_headers[chunks], + (MAX_CHUNK - chunks) * sizeof(struct chunk_header)); +} + +/* + * heap_vg_open -- notifies Valgrind about heap layout + */ +void 
+heap_vg_open(struct palloc_heap *heap, object_callback cb, void *arg, int objects) +{ + unsigned zones = heap_max_zone(heap->size); + + ASSERTne(cb, NULL); + + for (unsigned i = 1; i < zones; ++i) { + if (!umem_cache_offisloaded(heap->layout_info.store, GET_ZONE_OFFSET(i))) + continue; + + if (heap_mbrt_ismb_evictable(heap, i)) + VALGRIND_DO_CREATE_MEMPOOL(ZID_TO_ZONE(&heap->layout_info, i), 0, 0); + + heap_vg_zone_open(heap, i, cb, arg, objects); + } +} +#endif diff --git a/src/common/dav_v2/heap.h b/src/common/dav_v2/heap.h new file mode 100644 index 00000000000..8ceeff9a5cd --- /dev/null +++ b/src/common/dav_v2/heap.h @@ -0,0 +1,154 @@ +/* SPDX-License-Identifier: BSD-3-Clause */ +/* Copyright 2015-2024, Intel Corporation */ + +/* + * heap.h -- internal definitions for heap + */ + +#ifndef __DAOS_COMMON_HEAP_H +#define __DAOS_COMMON_HEAP_H 1 + +#include +#include + +#include "memblock.h" +#include "bucket.h" +#include "memops.h" +#include "palloc.h" +#include "dav_internal.h" +#include + +#define HEAP_OFF_TO_PTR(heap, off) umem_cache_off2ptr(heap->layout_info.store, off) +#define HEAP_PTR_TO_OFF(heap, ptr) umem_cache_ptr2off(heap->layout_info.store, ptr) + +#define BIT_IS_CLR(a, i) (!((a) & (1ULL << (i)))) +#define HEAP_ARENA_PER_THREAD (0) + +struct mbrt; + +int +heap_boot(struct palloc_heap *heap, void *mmap_base, uint64_t heap_size, uint64_t cache_size, + struct mo_ops *p_ops, struct stats *stats); +int +heap_init(void *heap_start, uint64_t umem_cache_size, struct umem_store *store); +void +heap_cleanup(struct palloc_heap *heap); +int +heap_check(void *heap_start, uint64_t heap_size); +int +heap_get_max_nemb(struct palloc_heap *heap); +int +heap_create_alloc_class_buckets(struct palloc_heap *heap, struct alloc_class *c); +int +heap_mbrt_update_alloc_class_buckets(struct palloc_heap *heap, struct mbrt *mb, + struct alloc_class *c); +int +heap_extend(struct palloc_heap *heap, struct bucket *defb, size_t size); +void +heap_mbrt_setmb_evictable(struct palloc_heap *heap, struct mbrt *mb); +bool +heap_mbrt_ismb_initialized(struct palloc_heap *heap, uint32_t zone_id); +bool +heap_mbrt_ismb_evictable(struct palloc_heap *heap, uint32_t zone_id); +void +heap_mbrt_setmb_nonevictable(struct palloc_heap *heap, uint32_t zone_id); +void +heap_mbrt_setmb_usage(struct palloc_heap *heap, uint32_t zone_id, uint64_t usage); +int +heap_mbrt_getmb_usage(struct palloc_heap *heap, uint32_t zone_id, uint64_t *allotted, + uint64_t *maxsz); +void +heap_mbrt_incrmb_usage(struct palloc_heap *heap, uint32_t zone_id, int size); +struct mbrt * +heap_mbrt_setup_mb(struct palloc_heap *heap, uint32_t zone_id); +int +heap_mbrt_mb_reclaim_garbage(struct palloc_heap *heap, uint32_t zid); +int +heap_ensure_zone0_initialized(struct palloc_heap *heap); +int +heap_zone_load(struct palloc_heap *heap, uint32_t zid); +int +heap_load_nonevictable_zones(struct palloc_heap *heap); +int +heap_update_mbrt_zinfo(struct palloc_heap *heap, bool init); +size_t +heap_zinfo_get_size(uint32_t nzones); + +struct alloc_class * +heap_get_best_class(struct palloc_heap *heap, size_t size); +struct bucket * +mbrt_bucket_acquire(struct mbrt *mb, uint8_t class_id); +void +mbrt_bucket_release(struct bucket *b); +void +heap_set_root_ptrs(struct palloc_heap *heap, uint64_t **offp, uint64_t **sizep); +void +heap_set_stats_ptr(struct palloc_heap *heap, struct stats_persistent **sp); + +int +heap_get_bestfit_block(struct palloc_heap *heap, struct bucket *b, struct memory_block *m); +pthread_mutex_t * +heap_get_run_lock(struct palloc_heap *heap, 
uint32_t chunk_id); + +void +heap_discard_run(struct palloc_heap *heap, struct memory_block *m); + +void +heap_memblock_on_free(struct palloc_heap *heap, const struct memory_block *m); + +int +heap_free_chunk_reuse(struct palloc_heap *heap, struct bucket *bucket, struct memory_block *m); + +void +heap_foreach_object(struct palloc_heap *heap, object_callback cb, void *arg, + struct memory_block start); + +struct alloc_class_collection * +heap_alloc_classes(struct palloc_heap *heap); + +void +heap_vg_open(struct palloc_heap *heap, object_callback cb, void *arg, int objects); +void +heap_vg_zone_open(struct palloc_heap *heap, uint32_t zone_id, object_callback cb, void *arg, + int objects); + +static inline struct chunk_header * +heap_get_chunk_hdr(struct palloc_heap *heap, const struct memory_block *m) +{ + return GET_CHUNK_HDR(&heap->layout_info, m->zone_id, m->chunk_id); +} + +static inline struct chunk * +heap_get_chunk(struct palloc_heap *heap, const struct memory_block *m) +{ + return GET_CHUNK(&heap->layout_info, m->zone_id, m->chunk_id); +} + +static inline struct chunk_run * +heap_get_chunk_run(struct palloc_heap *heap, const struct memory_block *m) +{ + return GET_CHUNK_RUN(&heap->layout_info, m->zone_id, m->chunk_id); +} + +struct mbrt * +heap_mbrt_get_mb(struct palloc_heap *heap, uint32_t zone_id); + +void +heap_mbrt_log_alloc_failure(struct palloc_heap *heap, uint32_t zone_id); + +int +heap_get_evictable_mb(struct palloc_heap *heap, uint32_t *zone_id); + +struct heap_zone_limits { + unsigned nzones_heap; + unsigned nzones_cache; + unsigned nzones_ne_max; + unsigned nzones_e_max; +}; + +uint32_t +heap_off2mbid(struct palloc_heap *heap, uint64_t offset); + +struct heap_zone_limits +heap_get_zone_limits(uint64_t heap_size, uint64_t cache_size, uint32_t nemb_pct); +#endif /* __DAOS_COMMON_HEAP_H */ diff --git a/src/common/dav_v2/heap_layout.h b/src/common/dav_v2/heap_layout.h new file mode 100644 index 00000000000..fa65846921e --- /dev/null +++ b/src/common/dav_v2/heap_layout.h @@ -0,0 +1,229 @@ +/* SPDX-License-Identifier: BSD-3-Clause */ +/* Copyright 2015-2024, Intel Corporation */ + +/* + * heap_layout.h -- internal definitions for heap layout + */ + +#ifndef __DAOS_COMMON_HEAP_LAYOUT_H +#define __DAOS_COMMON_HEAP_LAYOUT_H 1 + +#include +#include +#include + +#define HEAP_MAJOR 1 +#define HEAP_MINOR 0 + +#define MAX_CHUNK 63 +#define CHUNK_BASE_ALIGNMENT 1024 +#define CHUNKSIZE ((size_t)1024 * 260) /* 260 kilobytes */ +#define MAX_MEMORY_BLOCK_SIZE (MAX_CHUNK * CHUNKSIZE) +#define HEAP_SIGNATURE_LEN 16 +#define HEAP_SIGNATURE "MEMORY_HEAP_HDR\0" +#define ZONE_HEADER_MAGIC 0xC3F0A2D2 +#define ZONE_MIN_SIZE (sizeof(struct zone) + sizeof(struct chunk)) +#define ZONE_MAX_SIZE (sizeof(struct zone) + sizeof(struct chunk) * MAX_CHUNK) +#define HEAP_MIN_SIZE (sizeof(struct heap_header) + ZONE_MIN_SIZE) + +/* Base bitmap values, relevant for both normal and flexible bitmaps */ +#define RUN_BITS_PER_VALUE 64U +#define RUN_BASE_METADATA_VALUES\ + ((unsigned)(sizeof(struct chunk_run_header) / sizeof(uint64_t))) +#define RUN_BASE_METADATA_SIZE (sizeof(struct chunk_run_header)) + +#define RUN_CONTENT_SIZE (CHUNKSIZE - RUN_BASE_METADATA_SIZE) + +/* + * Calculates the size in bytes of a single run instance, including bitmap + */ +#define RUN_CONTENT_SIZE_BYTES(size_idx)\ +(RUN_CONTENT_SIZE + (((size_idx) - 1) * CHUNKSIZE)) + +/* Default bitmap values, specific for old, non-flexible, bitmaps */ +#define RUN_DEFAULT_METADATA_VALUES 40 /* in 8 byte words, 320 bytes total */ +#define 
RUN_DEFAULT_BITMAP_VALUES \ + (RUN_DEFAULT_METADATA_VALUES - RUN_BASE_METADATA_VALUES) +#define RUN_DEFAULT_BITMAP_SIZE (sizeof(uint64_t) * RUN_DEFAULT_BITMAP_VALUES) +#define RUN_DEFAULT_BITMAP_NBITS\ + (RUN_BITS_PER_VALUE * RUN_DEFAULT_BITMAP_VALUES) +#define RUN_DEFAULT_SIZE \ + (CHUNKSIZE - RUN_BASE_METADATA_SIZE - RUN_DEFAULT_BITMAP_SIZE) + +/* + * Calculates the size in bytes of a single run instance, without bitmap, + * but only for the default fixed-bitmap algorithm + */ +#define RUN_DEFAULT_SIZE_BYTES(size_idx)\ +(RUN_DEFAULT_SIZE + (((size_idx) - 1) * CHUNKSIZE)) + +enum chunk_flags { + CHUNK_FLAG_COMPACT_HEADER = 0x0001, + CHUNK_FLAG_HEADER_NONE = 0x0002, + CHUNK_FLAG_ALIGNED = 0x0004, + CHUNK_FLAG_FLEX_BITMAP = 0x0008, +}; + +#define CHUNK_FLAGS_ALL_VALID (\ + CHUNK_FLAG_COMPACT_HEADER |\ + CHUNK_FLAG_HEADER_NONE |\ + CHUNK_FLAG_ALIGNED |\ + CHUNK_FLAG_FLEX_BITMAP\ +) + +enum chunk_type { + CHUNK_TYPE_UNKNOWN, + CHUNK_TYPE_FOOTER, /* not actual chunk type */ + CHUNK_TYPE_FREE, + CHUNK_TYPE_USED, + CHUNK_TYPE_RUN, + CHUNK_TYPE_RUN_DATA, + + MAX_CHUNK_TYPE +}; + +/* zone header flags */ +#define ZONE_EVICTABLE_MB 0x0001 + +struct chunk { + uint8_t data[CHUNKSIZE]; +}; + +struct chunk_run_header { + uint64_t block_size; + uint64_t alignment; /* valid only /w CHUNK_FLAG_ALIGNED */ +}; + +struct chunk_run { + struct chunk_run_header hdr; + uint8_t content[RUN_CONTENT_SIZE]; /* bitmap + data */ +}; + +struct chunk_header { + uint16_t type; + uint16_t flags; + uint32_t size_idx; +}; + +struct zone_header { + uint32_t magic; + uint32_t size_idx; + uint32_t flags; + uint32_t spare1; + uint64_t zone0_zinfo_size; + uint64_t zone0_zinfo_off; + uint64_t reserved[2]; + uint64_t sp_usage; + uint64_t sp_usage_glob; + uint8_t spare[3528]; +}; + +struct zone { + struct zone_header header; + struct chunk_header chunk_headers[MAX_CHUNK]; + struct chunk chunks[]; +}; + +struct heap_header { + char signature[HEAP_SIGNATURE_LEN]; + uint64_t major; + uint64_t minor; + uint64_t heap_size; + uint64_t cache_size; + uint64_t heap_hdr_size; + uint64_t chunksize; + uint64_t chunks_per_zone; + uint8_t nemb_pct; + uint8_t reserved[4015]; + uint64_t checksum; +}; + +struct heap_layout_info { + struct heap_header header; + struct zone *zone0; /* Address of the zone0 in umem_cache */ + struct umem_store *store; +}; + +#define ALLOC_HDR_SIZE_SHIFT (48ULL) +#define ALLOC_HDR_FLAGS_MASK (((1ULL) << ALLOC_HDR_SIZE_SHIFT) - 1) + +struct allocation_header_legacy { + uint8_t unused[8]; + uint64_t size; + uint8_t unused2[32]; + uint64_t root_size; + uint64_t type_num; +}; + +#define ALLOC_HDR_COMPACT_SIZE sizeof(struct allocation_header_compact) + +struct allocation_header_compact { + uint64_t size; + uint64_t extra; +}; + +enum header_type { + HEADER_LEGACY, + HEADER_COMPACT, + HEADER_NONE, + + MAX_HEADER_TYPES +}; + +static const size_t header_type_to_size[MAX_HEADER_TYPES] = { + sizeof(struct allocation_header_legacy), + sizeof(struct allocation_header_compact), + 0 +}; + +static const enum chunk_flags header_type_to_flag[MAX_HEADER_TYPES] = { + (enum chunk_flags)0, + CHUNK_FLAG_COMPACT_HEADER, + CHUNK_FLAG_HEADER_NONE +}; + +static inline struct zone * +ZID_TO_ZONE(struct heap_layout_info *layout_info, size_t zone_id) +{ + uint64_t zoff = sizeof(struct heap_header) + ZONE_MAX_SIZE * zone_id; + + return umem_cache_off2ptr(layout_info->store, zoff); +} + +static inline struct chunk_header * +GET_CHUNK_HDR(struct heap_layout_info *layout_info, size_t zone_id, unsigned chunk_id) +{ + return &ZID_TO_ZONE(layout_info, 
zone_id)->chunk_headers[chunk_id]; +} + +static inline struct chunk * +GET_CHUNK(struct heap_layout_info *layout_info, size_t zone_id, unsigned chunk_id) +{ + return &ZID_TO_ZONE(layout_info, zone_id)->chunks[chunk_id]; +} + +static inline struct chunk_run * +GET_CHUNK_RUN(struct heap_layout_info *layout_info, size_t zone_id, unsigned chunk_id) +{ + return (struct chunk_run *)GET_CHUNK(layout_info, zone_id, chunk_id); +} + +static inline uint64_t +GET_ZONE_OFFSET(uint32_t zid) +{ + return sizeof(struct heap_header) + ZONE_MAX_SIZE * zid; +} + +static inline bool +IS_ZONE_HDR_OFFSET(uint64_t off) +{ + return (((off - sizeof(struct heap_header)) % ZONE_MAX_SIZE) == 0); +} + +static inline uint32_t +OFFSET_TO_ZID(uint64_t off) +{ + return (off - sizeof(struct heap_header)) / ZONE_MAX_SIZE; +} + +#endif /* __DAOS_COMMON_HEAP_LAYOUT_H */ diff --git a/src/common/dav_v2/memblock.c b/src/common/dav_v2/memblock.c new file mode 100644 index 00000000000..d66682d5f5a --- /dev/null +++ b/src/common/dav_v2/memblock.c @@ -0,0 +1,1615 @@ +/* SPDX-License-Identifier: BSD-3-Clause */ +/* Copyright 2016-2024, Intel Corporation */ + +/* + * memblock.c -- implementation of memory block + * + * Memory block is a representation of persistent object that resides in the + * heap. A valid memory block must be either a huge (free or used) chunk or a + * block inside a run. + * + * Huge blocks are 1:1 correlated with the chunk headers in the zone whereas + * run blocks are represented by bits in corresponding chunk bitmap. + * + * This file contains implementations of abstract operations on memory blocks. + * Instead of storing the mbops structure inside each memory block the correct + * method implementation is chosen at runtime. + */ + +#include + +#include "obj.h" +#include "heap.h" +#include "memblock.h" +#include "out.h" +#include "valgrind_internal.h" +#include "alloc_class.h" + +/* calculates the size of the entire run, including any additional chunks */ +#define SIZEOF_RUN(runp, size_idx)\ + (sizeof(*(runp)) + (((size_idx) - 1) * CHUNKSIZE)) + +/* + * memblock_header_type -- determines the memory block's header type + */ +static enum header_type +memblock_header_type(const struct memory_block *m) +{ + struct chunk_header *hdr = heap_get_chunk_hdr(m->heap, m); + + if (hdr->flags & CHUNK_FLAG_COMPACT_HEADER) + return HEADER_COMPACT; + + if (hdr->flags & CHUNK_FLAG_HEADER_NONE) + return HEADER_NONE; + + return HEADER_LEGACY; +} + +/* + * memblock_header_legacy_get_size -- + * (internal) returns the size stored in a legacy header + */ +static size_t +memblock_header_legacy_get_size(const struct memory_block *m) +{ + struct allocation_header_legacy *hdr = m->m_ops->get_real_data(m); + + return hdr->size; +} + +/* + * memblock_header_compact_get_size -- + * (internal) returns the size stored in a compact header + */ +static size_t +memblock_header_compact_get_size(const struct memory_block *m) +{ + struct allocation_header_compact *hdr = m->m_ops->get_real_data(m); + + return hdr->size & ALLOC_HDR_FLAGS_MASK; +} + +/* + * memblock_header_none_get_size -- + * (internal) determines the sizes of an object without a header + */ +static size_t +memblock_header_none_get_size(const struct memory_block *m) +{ + return m->m_ops->block_size(m); +} + +/* + * memblock_header_legacy_get_extra -- + * (internal) returns the extra field stored in a legacy header + */ +static uint64_t +memblock_header_legacy_get_extra(const struct memory_block *m) +{ + struct allocation_header_legacy *hdr = m->m_ops->get_real_data(m); + + return 
hdr->type_num; +} + +/* + * memblock_header_compact_get_extra -- + * (internal) returns the extra field stored in a compact header + */ +static uint64_t +memblock_header_compact_get_extra(const struct memory_block *m) +{ + struct allocation_header_compact *hdr = m->m_ops->get_real_data(m); + + return hdr->extra; +} + +/* + * memblock_header_none_get_extra -- + * (internal) objects without a header don't have an extra field + */ +static uint64_t +memblock_header_none_get_extra(const struct memory_block *m) +{ + /* suppress unused-parameter errors */ + SUPPRESS_UNUSED(m); + + return 0; +} + +/* + * memblock_header_legacy_get_flags -- + * (internal) returns the flags stored in a legacy header + */ +static uint16_t +memblock_header_legacy_get_flags(const struct memory_block *m) +{ + struct allocation_header_legacy *hdr = m->m_ops->get_real_data(m); + + return (uint16_t)(hdr->root_size >> ALLOC_HDR_SIZE_SHIFT); +} + +/* + * memblock_header_compact_get_flags -- + * (internal) returns the flags stored in a compact header + */ +static uint16_t +memblock_header_compact_get_flags(const struct memory_block *m) +{ + struct allocation_header_compact *hdr = m->m_ops->get_real_data(m); + + return (uint16_t)(hdr->size >> ALLOC_HDR_SIZE_SHIFT); +} + +/* + * memblock_header_none_get_flags -- + * (internal) objects without a header do not support flags + */ +static uint16_t +memblock_header_none_get_flags(const struct memory_block *m) +{ + /* suppress unused-parameter errors */ + SUPPRESS_UNUSED(m); + + return 0; +} + +/* + * memblock_header_legacy_write -- + * (internal) writes a legacy header of an object + */ +static void +memblock_header_legacy_write(const struct memory_block *m, + size_t size, uint64_t extra, uint16_t flags) +{ + struct allocation_header_legacy hdr; + + hdr.size = size; + hdr.type_num = extra; + hdr.root_size = ((uint64_t)flags << ALLOC_HDR_SIZE_SHIFT); + + struct allocation_header_legacy *hdrp = m->m_ops->get_real_data(m); + + VALGRIND_DO_MAKE_MEM_UNDEFINED(hdrp, sizeof(*hdrp)); + + VALGRIND_ADD_TO_TX(hdrp, sizeof(*hdrp)); + memcpy(hdrp, &hdr, sizeof(hdr)); /* legacy header is 64 bytes in size */ + VALGRIND_REMOVE_FROM_TX(hdrp, sizeof(*hdrp)); + + /* unused fields of the legacy headers are used as a red zone */ + VALGRIND_DO_MAKE_MEM_NOACCESS(hdrp->unused, sizeof(hdrp->unused)); +} + +/* + * memblock_header_compact_write -- + * (internal) writes a compact header of an object + */ +static void +memblock_header_compact_write(const struct memory_block *m, + size_t size, uint64_t extra, uint16_t flags) +{ + COMPILE_ERROR_ON(ALLOC_HDR_COMPACT_SIZE > CACHELINE_SIZE); + + struct { + struct allocation_header_compact hdr; + uint8_t padding[CACHELINE_SIZE - ALLOC_HDR_COMPACT_SIZE]; + } padded; + + /* + * REVISIT: + * Below memset is added to prevent valgrind propagating the + * cleared V-Bits of the padding field all the way till DMA buffer + * as part of logging by WAL. + * This code needs to be revisited when valgrind macros are + * enabled within DAV. + */ + padded.hdr.size = size | ((uint64_t)flags << ALLOC_HDR_SIZE_SHIFT); + padded.hdr.extra = extra; + + struct allocation_header_compact *hdrp = m->m_ops->get_real_data(m); + + VALGRIND_DO_MAKE_MEM_UNDEFINED(hdrp, sizeof(*hdrp)); + + /* + * If possible write the entire header with a single memcpy, this allows + * the copy implementation to avoid a cache miss on a partial cache line + * write. 
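+	 *
+	 * When the header is cacheline aligned and the allocation is large
+	 * enough to hold the padded struct, a full cacheline is copied; the
+	 * bytes past the real header are then re-marked as undefined for
+	 * valgrind.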
+ */ + size_t hdr_size = ALLOC_HDR_COMPACT_SIZE; + + if ((uintptr_t)hdrp % CACHELINE_SIZE == 0 && size >= sizeof(padded)) + hdr_size = sizeof(padded); + + VALGRIND_ADD_TO_TX(hdrp, hdr_size); + + memcpy(hdrp, &padded, hdr_size); + VALGRIND_DO_MAKE_MEM_UNDEFINED((char *)hdrp + ALLOC_HDR_COMPACT_SIZE, + hdr_size - ALLOC_HDR_COMPACT_SIZE); + + VALGRIND_REMOVE_FROM_TX(hdrp, hdr_size); +} + +/* + * memblock_header_none_write -- + * (internal) nothing to write + */ +static void +memblock_header_none_write(const struct memory_block *m, + size_t size, uint64_t extra, uint16_t flags) +{ + /* suppress unused-parameter errors */ + SUPPRESS_UNUSED(m, size, extra, flags); + + /* NOP */ +} + +/* + * memblock_header_legacy_invalidate -- + * (internal) invalidates a legacy header + */ +static void +memblock_header_legacy_invalidate(const struct memory_block *m) +{ + struct allocation_header_legacy *hdr = m->m_ops->get_real_data(m); + + VALGRIND_SET_CLEAN(hdr, sizeof(*hdr)); +} + +/* + * memblock_header_compact_invalidate -- + * (internal) invalidates a compact header + */ +static void +memblock_header_compact_invalidate(const struct memory_block *m) +{ + struct allocation_header_compact *hdr = m->m_ops->get_real_data(m); + + VALGRIND_SET_CLEAN(hdr, sizeof(*hdr)); +} + +/* + * memblock_no_header_invalidate -- + * (internal) nothing to invalidate + */ +static void +memblock_header_none_invalidate(const struct memory_block *m) +{ + /* suppress unused-parameter errors */ + SUPPRESS_UNUSED(m); + + /* NOP */ +} + +/* + * memblock_header_legacy_reinit -- + * (internal) reinitializes a legacy header after a heap restart + */ +static void +memblock_header_legacy_reinit(const struct memory_block *m) +{ + struct allocation_header_legacy *hdr = m->m_ops->get_real_data(m); + + VALGRIND_DO_MAKE_MEM_DEFINED(hdr, sizeof(*hdr)); + + /* unused fields of the legacy headers are used as a red zone */ + VALGRIND_DO_MAKE_MEM_NOACCESS(hdr->unused, sizeof(hdr->unused)); +} + +/* + * memblock_header_compact_reinit -- + * (internal) reinitializes a compact header after a heap restart + */ +static void +memblock_header_compact_reinit(const struct memory_block *m) +{ + struct allocation_header_compact *hdr = m->m_ops->get_real_data(m); + + VALGRIND_DO_MAKE_MEM_DEFINED(hdr, sizeof(*hdr)); +} + +/* + * memblock_header_none_reinit -- + * (internal) nothing to reinitialize + */ +static void +memblock_header_none_reinit(const struct memory_block *m) +{ + /* suppress unused-parameter errors */ + SUPPRESS_UNUSED(m); + + /* NOP */ +} + +static const struct { + /* determines the sizes of an object */ + size_t (*get_size)(const struct memory_block *m); + + /* returns the extra field (if available, 0 if not) */ + uint64_t (*get_extra)(const struct memory_block *m); + + /* returns the flags stored in a header (if available, 0 if not) */ + uint16_t (*get_flags)(const struct memory_block *m); + + /* + * Stores size, extra info and flags in header of an object + * (if available, does nothing otherwise). + */ + void (*write)(const struct memory_block *m, + size_t size, uint64_t extra, uint16_t flags); + void (*invalidate)(const struct memory_block *m); + + /* + * Reinitializes a header after a heap restart (if available, does + * nothing otherwise) (VG). 
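+	 *
+	 * The table below is indexed by enum header_type; callers select the
+	 * implementation at runtime via memblock_header_ops[m->header_type]
+	 * instead of storing the ops in each memory block.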
+ */ + void (*reinit)(const struct memory_block *m); +} memblock_header_ops[MAX_HEADER_TYPES] = { + [HEADER_LEGACY] = { + memblock_header_legacy_get_size, + memblock_header_legacy_get_extra, + memblock_header_legacy_get_flags, + memblock_header_legacy_write, + memblock_header_legacy_invalidate, + memblock_header_legacy_reinit, + }, + [HEADER_COMPACT] = { + memblock_header_compact_get_size, + memblock_header_compact_get_extra, + memblock_header_compact_get_flags, + memblock_header_compact_write, + memblock_header_compact_invalidate, + memblock_header_compact_reinit, + }, + [HEADER_NONE] = { + memblock_header_none_get_size, + memblock_header_none_get_extra, + memblock_header_none_get_flags, + memblock_header_none_write, + memblock_header_none_invalidate, + memblock_header_none_reinit, + } +}; + +/* + * memblock_run_default_nallocs -- returns the number of memory blocks + * available in the in a run with given parameters using the default + * fixed-bitmap algorithm + */ +static unsigned +memblock_run_default_nallocs(uint32_t *size_idx, uint16_t flags, + uint64_t unit_size, uint64_t alignment) +{ + /* suppress unused-parameter errors */ + SUPPRESS_UNUSED(flags); + + unsigned nallocs = (unsigned) + (RUN_DEFAULT_SIZE_BYTES(*size_idx) / unit_size); + + while (nallocs > RUN_DEFAULT_BITMAP_NBITS) { + /* trying to create a run with number of units exceeding the bitmap size */ + DAV_DBG("run:%lu number of units %u exceeds bitmap size (%u)", + unit_size, nallocs, RUN_DEFAULT_BITMAP_NBITS); + if (*size_idx > 1) { + *size_idx -= 1; + /* recalculate the number of allocations */ + nallocs = (uint32_t) + (RUN_DEFAULT_SIZE_BYTES(*size_idx) / unit_size); + /* run was constructed with fewer chunks (minus one) */ + D_INFO("run:%lu constructed with fewer chunks:%u\n", + unit_size, *size_idx); + } else { + /* + * run was constructed with fewer units than optimal, + * this might lead to inefficient memory utilization! + */ + D_INFO("run:%lu constructed with fewer units:%u than optimal:%u\n", + unit_size, RUN_DEFAULT_BITMAP_NBITS, nallocs); + + nallocs = RUN_DEFAULT_BITMAP_NBITS; + } + } + + return nallocs - (alignment ? 1 : 0); +} + +/* + * memblock_run_bitmap -- calculate bitmap parameters for given arguments + */ +void +memblock_run_bitmap(uint32_t *size_idx, uint16_t flags, + uint64_t unit_size, uint64_t alignment, void *content, + struct run_bitmap *b) +{ + ASSERTne(*size_idx, 0); + + /* + * Flexible bitmaps have a variably sized values array. The size varies + * depending on: + * alignment - initial run alignment might require up-to a unit + * size idx - the larger the run, the more units it carries + * unit_size - the smaller the unit size, the more units per run + * + * The size of the bitmap also has to be calculated in such a way that + * the beginning of allocations data is cacheline aligned. This is + * required to perform many optimizations throughout the codebase. + * This alignment requirement means that some of the bitmap values might + * remain unused and will serve only as a padding for data. + */ + if (flags & CHUNK_FLAG_FLEX_BITMAP) { + /* + * First calculate the number of values without accounting for + * the bitmap size. + */ + size_t content_size = RUN_CONTENT_SIZE_BYTES(*size_idx); + + b->nbits = (unsigned)(content_size / unit_size); + b->nvalues = util_div_ceil(b->nbits, RUN_BITS_PER_VALUE); + + /* + * Then, align the number of values up, so that the cacheline + * alignment is preserved. 
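+		 *
+		 * For example, assuming a 64-byte cacheline (8 values per
+		 * line): a run that needs 5 bitmap values carries 5 + 2
+		 * metadata values, which rounds up to 8, so 6 values are
+		 * reserved for the bitmap and the allocation data starts on
+		 * a cacheline boundary.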
+ */ + b->nvalues = ALIGN_UP(b->nvalues + RUN_BASE_METADATA_VALUES, + (unsigned)(CACHELINE_SIZE / sizeof(*b->values))) + - RUN_BASE_METADATA_VALUES; + + /* + * This is the total number of bytes needed for the bitmap AND + * padding. + */ + b->size = b->nvalues * sizeof(*b->values); + + /* + * Calculate the number of allocations again, but this time + * accounting for the bitmap/padding. + */ + b->nbits = (unsigned)((content_size - b->size) / unit_size) + - (alignment ? 1U : 0U); + + /* + * The last step is to calculate how much of the padding + * is left at the end of the bitmap. + */ + unsigned unused_bits = (b->nvalues * RUN_BITS_PER_VALUE) + - b->nbits; + unsigned unused_values = unused_bits / RUN_BITS_PER_VALUE; + + b->nvalues -= unused_values; + b->values = (uint64_t *)content; + + return; + } + + b->size = RUN_DEFAULT_BITMAP_SIZE; + b->nbits = memblock_run_default_nallocs(size_idx, flags, + unit_size, alignment); + + unsigned unused_bits = RUN_DEFAULT_BITMAP_NBITS - b->nbits; + unsigned unused_values = unused_bits / RUN_BITS_PER_VALUE; + + b->nvalues = RUN_DEFAULT_BITMAP_VALUES - unused_values; + b->values = (uint64_t *)content; +} + +/* + * run_get_bitmap -- initializes run bitmap information + */ +static void +run_get_bitmap(const struct memory_block *m, struct run_bitmap *b) +{ + struct chunk_run *run = heap_get_chunk_run(m->heap, m); + + if (m->cached_bitmap != NULL) { + *b = *m->cached_bitmap; + b->values = (uint64_t *)run->content; + } else { + struct chunk_header *hdr = heap_get_chunk_hdr(m->heap, m); + uint32_t size_idx = hdr->size_idx; + + memblock_run_bitmap(&size_idx, hdr->flags, run->hdr.block_size, + run->hdr.alignment, run->content, b); + ASSERTeq(size_idx, hdr->size_idx); + } +} + +/* + * huge_block_size -- returns the compile-time constant which defines the + * huge memory block size. + */ +static size_t +huge_block_size(const struct memory_block *m) +{ + /* suppress unused-parameter errors */ + SUPPRESS_UNUSED(m); + + return CHUNKSIZE; +} + +/* + * run_block_size -- looks for the right chunk and returns the block size + * information that is attached to the run block metadata. + */ +static size_t +run_block_size(const struct memory_block *m) +{ + struct chunk_run *run = heap_get_chunk_run(m->heap, m); + + return run->hdr.block_size; +} + +/* + * huge_get_real_data -- returns pointer to the beginning data of a huge block + */ +static void * +huge_get_real_data(const struct memory_block *m) +{ + return heap_get_chunk(m->heap, m)->data; +} + +/* + * run_get_data_start -- (internal) returns the pointer to the beginning of + * allocations in a run + */ +static char * +run_get_data_start(const struct memory_block *m) +{ + struct chunk_header *hdr = heap_get_chunk_hdr(m->heap, m); + struct chunk_run *run = heap_get_chunk_run(m->heap, m); + struct run_bitmap b; + + run_get_bitmap(m, &b); + + if (hdr->flags & CHUNK_FLAG_ALIGNED) { + /* + * Alignment is property of user data in allocations. And + * since objects have headers, we need to take them into + * account when calculating the address. 
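+		 *
+		 * The base (content + bitmap size + header size) is rounded up
+		 * to the run's alignment and the header size is subtracted
+		 * again, so the user pointer (base + header) lands exactly on
+		 * the requested alignment boundary.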
+ */ + uintptr_t hsize = header_type_to_size[m->header_type]; + uintptr_t base = (uintptr_t)run->content + + b.size + hsize; + return (char *)(ALIGN_UP(base, run->hdr.alignment) - hsize); + } else { + return (char *)&run->content + b.size; + } +} + +/* + * run_get_data_offset -- (internal) returns the number of bytes between + * run base metadata and data + */ +static size_t +run_get_data_offset(const struct memory_block *m) +{ + struct chunk_run *run = heap_get_chunk_run(m->heap, m); + + return (size_t)run_get_data_start(m) - (size_t)&run->content; +} + +/* + * run_get_real_data -- returns pointer to the beginning data of a run block + */ +static void * +run_get_real_data(const struct memory_block *m) +{ + struct chunk_run *run = heap_get_chunk_run(m->heap, m); + + ASSERT(run->hdr.block_size != 0); + + return run_get_data_start(m) + (run->hdr.block_size * m->block_off); +} + +/* + * block_get_user_data -- returns pointer to the data of a block + */ +static void * +block_get_user_data(const struct memory_block *m) +{ + return (char *)m->m_ops->get_real_data(m) + + header_type_to_size[m->header_type]; +} + +/* + * chunk_get_chunk_hdr_value -- (internal) get value of a header for redo log + */ +static uint64_t +chunk_get_chunk_hdr_value(uint16_t type, uint16_t flags, uint32_t size_idx) +{ + uint64_t val; + struct chunk_header hdr; + + COMPILE_ERROR_ON(sizeof(struct chunk_header) != sizeof(uint64_t)); + + hdr.type = type; + hdr.flags = flags; + hdr.size_idx = size_idx; + memcpy(&val, &hdr, sizeof(val)); + + return val; +} + +/* + * huge_prep_operation_hdr -- prepares the new value of a chunk header that will + * be set after the operation concludes. + */ +static void +huge_prep_operation_hdr(const struct memory_block *m, enum memblock_state op, + struct operation_context *ctx) +{ + struct chunk_header *hdr = heap_get_chunk_hdr(m->heap, m); + + /* + * Depending on the operation that needs to be performed a new chunk + * header needs to be prepared with the new chunk state. + */ + uint64_t val = chunk_get_chunk_hdr_value( + op == MEMBLOCK_ALLOCATED ? CHUNK_TYPE_USED : CHUNK_TYPE_FREE, + hdr->flags, + m->size_idx); + + if (ctx == NULL) { + util_atomic_store_explicit64((uint64_t *)hdr, val, + memory_order_relaxed); + mo_wal_persist(&m->heap->p_ops, hdr, sizeof(*hdr)); + } else { + operation_add_entry(ctx, hdr, val, ULOG_OPERATION_SET); + } + + VALGRIND_DO_MAKE_MEM_NOACCESS(hdr + 1, + (hdr->size_idx - 1) * sizeof(struct chunk_header)); + + /* + * In the case of chunks larger than one unit the footer must be + * created immediately AFTER the persistent state is safely updated. + */ + if (m->size_idx == 1) + return; + + struct chunk_header *footer = hdr + m->size_idx - 1; + + VALGRIND_DO_MAKE_MEM_UNDEFINED(footer, sizeof(*footer)); + + val = chunk_get_chunk_hdr_value(CHUNK_TYPE_FOOTER, 0, m->size_idx); + + /* + * It's only safe to write the footer AFTER the persistent part of + * the operation have been successfully processed because the footer + * pointer might point to a currently valid persistent state + * of a different chunk. + * The footer entry change is updated as transient because it will + * be recreated at heap boot regardless - it's just needed for runtime + * operations. 
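+	 *
+	 * With no operation context the footer is stored directly with a
+	 * relaxed atomic store; inside a transaction it is queued as a
+	 * LOG_TRANSIENT typed entry rather than a persistent one.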
+ */ + if (ctx == NULL) { + util_atomic_store_explicit64((uint64_t *)footer, val, + memory_order_relaxed); + VALGRIND_SET_CLEAN(footer, sizeof(*footer)); + } else { + operation_add_typed_entry(ctx, + footer, val, ULOG_OPERATION_SET, LOG_TRANSIENT); + } +} + +/* + * run_prep_operation_hdr -- prepares the new value for a select few bytes of + * a run bitmap that will be set after the operation concludes. + * + * It's VERY important to keep in mind that the particular value of the + * bitmap this method is modifying must not be changed after this function + * is called and before the operation is processed. + */ +static void +run_prep_operation_hdr(const struct memory_block *m, enum memblock_state op, + struct operation_context *ctx) +{ + ASSERT(m->size_idx <= RUN_BITS_PER_VALUE); + ASSERT(m->size_idx > 0); + + /* + * Free blocks are represented by clear bits and used blocks by set + * bits - which is the reverse of the commonly used scheme. + * + * Here a bit mask is prepared that flips the bits that represent the + * memory block provided by the caller - because both the size index and + * the block offset are tied 1:1 to the bitmap this operation is + * relatively simple. + */ + uint64_t bmask; + +#ifdef WAL_SUPPORTS_AND_OR_OPS + if (m->size_idx == RUN_BITS_PER_VALUE) { + ASSERTeq(m->block_off % RUN_BITS_PER_VALUE, 0); + bmask = UINT64_MAX; + } else { + bmask = ((1ULL << m->size_idx) - 1ULL) << + (m->block_off % RUN_BITS_PER_VALUE); + } +#else + uint16_t num = m->size_idx; + uint32_t pos = m->block_off % RUN_BITS_PER_VALUE; + + ASSERT_rt(num > 0 && num <= RUN_BITS_PER_VALUE); + bmask = ULOG_ENTRY_TO_VAL(pos, num); +#endif + + /* + * The run bitmap is composed of several 8 byte values, so a proper + * element of the bitmap array must be selected. + */ + unsigned bpos = m->block_off / RUN_BITS_PER_VALUE; + struct run_bitmap b; + + run_get_bitmap(m, &b); + + /* the bit mask is applied immediately by the add entry operations */ + if (op == MEMBLOCK_ALLOCATED) { +#ifdef WAL_SUPPORTS_AND_OR_OPS + operation_add_entry(ctx, &b.values[bpos], + bmask, ULOG_OPERATION_OR); +#else + operation_add_entry(ctx, &b.values[bpos], + bmask, ULOG_OPERATION_SET_BITS); +#endif + } else if (op == MEMBLOCK_FREE) { +#ifdef WAL_SUPPORTS_AND_OR_OPS + operation_add_entry(ctx, &b.values[bpos], + ~bmask, ULOG_OPERATION_AND); +#else + operation_add_entry(ctx, &b.values[bpos], + bmask, ULOG_OPERATION_CLR_BITS); +#endif + } else { + ASSERT(0); + } +} + +/* + * huge_get_lock -- because huge memory blocks are always allocated from a + * single bucket there's no reason to lock them - the bucket itself is + * protected. + */ +static pthread_mutex_t * +huge_get_lock(const struct memory_block *m) +{ + /* suppress unused-parameter errors */ + SUPPRESS_UNUSED(m); + + return NULL; +} + +/* + * run_get_lock -- gets the runtime mutex from the heap. 
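+ *
+ * The mutex comes from the heap's run_locks pool (sized at boot to
+ * MAX_RUN_LOCKS, or MAX_RUN_LOCKS_VG under valgrind) and is selected by
+ * chunk id, so unrelated runs may end up sharing a lock.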
+ */ +static pthread_mutex_t * +run_get_lock(const struct memory_block *m) +{ + return heap_get_run_lock(m->heap, m->chunk_id); +} + +/* + * huge_get_state -- returns whether a huge block is allocated or not + */ +static enum memblock_state +huge_get_state(const struct memory_block *m) +{ + struct chunk_header *hdr = heap_get_chunk_hdr(m->heap, m); + + if (hdr->type == CHUNK_TYPE_USED) + return MEMBLOCK_ALLOCATED; + + if (hdr->type == CHUNK_TYPE_FREE) + return MEMBLOCK_FREE; + + return MEMBLOCK_STATE_UNKNOWN; +} + +/* + * huge_get_state -- returns whether a block from a run is allocated or not + */ +static enum memblock_state +run_get_state(const struct memory_block *m) +{ + struct run_bitmap b; + + run_get_bitmap(m, &b); + + unsigned v = m->block_off / RUN_BITS_PER_VALUE; + uint64_t bitmap = b.values[v]; + unsigned bit = m->block_off % RUN_BITS_PER_VALUE; + + unsigned bit_last = bit + m->size_idx; + + ASSERT(bit_last <= RUN_BITS_PER_VALUE); + + for (unsigned i = bit; i < bit_last; ++i) { + if (!BIT_IS_CLR(bitmap, i)) + return MEMBLOCK_ALLOCATED; + } + + return MEMBLOCK_FREE; +} + +/* + * huge_ensure_header_type -- checks the header type of a chunk and modifies + * it if necessary. This is fail-safe atomic. + */ +static void +huge_ensure_header_type(const struct memory_block *m, + enum header_type t) +{ + struct chunk_header *hdr = heap_get_chunk_hdr(m->heap, m); + + ASSERTeq(hdr->type, CHUNK_TYPE_FREE); + + if ((hdr->flags & header_type_to_flag[t]) == 0) { + VALGRIND_ADD_TO_TX(hdr, sizeof(*hdr)); + uint16_t f = ((uint16_t)header_type_to_flag[t]); + uint64_t nhdr = chunk_get_chunk_hdr_value(hdr->type, + hdr->flags | f, hdr->size_idx); + util_atomic_store_explicit64((uint64_t *)hdr, + nhdr, memory_order_relaxed); + mo_wal_persist(&m->heap->p_ops, hdr, sizeof(*hdr)); + VALGRIND_REMOVE_FROM_TX(hdr, sizeof(*hdr)); + } +} + +/* + * run_ensure_header_type -- runs must be created with appropriate header type. + */ +static void +run_ensure_header_type(const struct memory_block *m, + enum header_type t) +{ + /* suppress unused-parameter errors */ + SUPPRESS_UNUSED(m, t); + +#ifdef DAV_EXTRA_DEBUG + struct chunk_header *hdr = heap_get_chunk_hdr(m->heap, m); + + ASSERTeq(hdr->type, CHUNK_TYPE_RUN); + ASSERT((hdr->flags & header_type_to_flag[t]) == header_type_to_flag[t]); +#endif +} + +/* + * block_get_real_size -- returns the size of a memory block that includes all + * of the overhead (headers) + */ +static size_t +block_get_real_size(const struct memory_block *m) +{ + /* + * There are two valid ways to get a size. If the memory block + * initialized properly and the size index is set, the chunk unit size + * can be simply multiplied by that index, otherwise we need to look at + * the allocation header. + */ + if (m->size_idx != 0) + return m->m_ops->block_size(m) * m->size_idx; + else + return memblock_header_ops[m->header_type].get_size(m); +} + +/* + * block_get_user_size -- returns the size of a memory block without overheads, + * this is the size of a data block that can be used. 
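+ *
+ * In other words, the real size minus the allocation header size for the
+ * block's header type (zero for HEADER_NONE).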
+ */ +static size_t +block_get_user_size(const struct memory_block *m) +{ + return block_get_real_size(m) - header_type_to_size[m->header_type]; +} + +/* + * block_write_header -- writes a header of an allocation + */ +static void +block_write_header(const struct memory_block *m, + uint64_t extra_field, uint16_t flags) +{ + memblock_header_ops[m->header_type].write(m, + block_get_real_size(m), extra_field, flags); +} + +/* + * block_invalidate -- invalidates allocation data and header + */ +static void +block_invalidate(const struct memory_block *m) +{ + void *data = m->m_ops->get_user_data(m); + size_t size = m->m_ops->get_user_size(m); + + VALGRIND_SET_CLEAN(data, size); + + memblock_header_ops[m->header_type].invalidate(m); +} + +/* + * block_reinit_header -- reinitializes a block after a heap restart + */ +static void +block_reinit_header(const struct memory_block *m) +{ + memblock_header_ops[m->header_type].reinit(m); +} + +/* + * block_get_extra -- returns the extra field of an allocation + */ +static uint64_t +block_get_extra(const struct memory_block *m) +{ + return memblock_header_ops[m->header_type].get_extra(m); +} + +/* + * block_get_flags -- returns the flags of an allocation + */ +static uint16_t +block_get_flags(const struct memory_block *m) +{ + return memblock_header_ops[m->header_type].get_flags(m); +} + +/* + * heap_run_process_bitmap_value -- (internal) looks for unset bits in the + * value, creates a valid memory block out of them and inserts that + * block into the given bucket. + */ +static int +run_process_bitmap_value(const struct memory_block *m, + uint64_t value, uint32_t base_offset, object_callback cb, void *arg) +{ + int ret = 0; + uint64_t shift = 0; /* already processed bits */ + struct memory_block s = *m; + + do { + /* + * Shift the value so that the next memory block starts on the + * least significant position: + * ..............0 (free block) + * or ..............1 (used block) + */ + uint64_t shifted = value >> shift; + + /* all clear or set bits indicate the end of traversal */ + if (shifted == 0) { + /* + * Insert the remaining blocks as free. Remember that + * unsigned values are always zero-filled, so we must + * take the current shift into account. + */ + s.block_off = (uint32_t)(base_offset + shift); + s.size_idx = (uint32_t)(RUN_BITS_PER_VALUE - shift); + + ret = cb(&s, arg); + if (ret != 0) + return ret; + + break; + } else if (shifted == UINT64_MAX) { + break; + } + + /* + * Offset and size of the next free block, either of these + * can be zero depending on where the free block is located + * in the value. 
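+ * + * A worked example (illustrative only): suppose the low bits of 'shifted', starting from the least significant one, are 1 1 0 0 0 1... Used blocks are set bits, so off = lssb(~shifted) = 2 and size = lssb(shifted) = 0; the loop merely skips the two used blocks (shift += 2). On the next pass the low bits are 0 0 0 1..., so off = 0 and size = 3, and a free block of three units starting at the first clear bit is handed to the callback.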
+ */ + unsigned off = (unsigned)util_lssb_index64(~shifted); + unsigned size = (unsigned)util_lssb_index64(shifted); + + shift += off + size; + + if (size != 0) { /* zero size means skip to the next value */ + s.block_off = (uint32_t)(base_offset + (shift - size)); + s.size_idx = (uint32_t)(size); + + memblock_rebuild_state(m->heap, &s); + ret = cb(&s, arg); + if (ret != 0) + return ret; + } + } while (shift != RUN_BITS_PER_VALUE); + + return 0; +} + +/* + * run_iterate_free -- iterates over free blocks in a run + */ +static int +run_iterate_free(const struct memory_block *m, object_callback cb, void *arg) +{ + int ret = 0; + uint32_t block_off = 0; + struct run_bitmap b; + + run_get_bitmap(m, &b); + + struct memory_block nm = *m; + + for (unsigned i = 0; i < b.nvalues; ++i) { + uint64_t v = b.values[i]; + + ASSERT((uint64_t)RUN_BITS_PER_VALUE * (uint64_t)i + <= UINT32_MAX); + block_off = RUN_BITS_PER_VALUE * i; + ret = run_process_bitmap_value(&nm, v, block_off, cb, arg); + if (ret != 0) + return ret; + } + + return 0; +} + +/* + * run_iterate_used -- iterates over used blocks in a run + */ +static int +run_iterate_used(const struct memory_block *m, object_callback cb, void *arg) +{ + uint32_t i = m->block_off / RUN_BITS_PER_VALUE; + uint32_t block_start = m->block_off % RUN_BITS_PER_VALUE; + uint32_t block_off; + + struct chunk_run *run = heap_get_chunk_run(m->heap, m); + + struct memory_block iter = *m; + struct run_bitmap b; + + run_get_bitmap(m, &b); + + for (; i < b.nvalues; ++i) { + uint64_t v = b.values[i]; + + block_off = (uint32_t)(RUN_BITS_PER_VALUE * i); + + for (uint32_t j = block_start; j < RUN_BITS_PER_VALUE; ) { + if (block_off + j >= (uint32_t)b.nbits) + break; + + if (!BIT_IS_CLR(v, j)) { + iter.block_off = (uint32_t)(block_off + j); + + /* + * The size index of this memory block cannot be + * retrieved at this time because the header + * might not be initialized in valgrind yet. + */ + iter.size_idx = 0; + + if (cb(&iter, arg) != 0) + return 1; + + iter.size_idx = CALC_SIZE_IDX( + run->hdr.block_size, + iter.m_ops->get_real_size(&iter)); + j = (uint32_t)(j + iter.size_idx); + } else { + ++j; + } + } + block_start = 0; + } + + return 0; +} + +/* + * huge_iterate_free -- calls cb on memory block if it's free + */ +static int +huge_iterate_free(const struct memory_block *m, object_callback cb, void *arg) +{ + struct chunk_header *hdr = heap_get_chunk_hdr(m->heap, m); + + return hdr->type == CHUNK_TYPE_FREE ? cb(m, arg) : 0; +} + +/* + * huge_iterate_used -- calls cb on memory block if it's used + */ +static int +huge_iterate_used(const struct memory_block *m, object_callback cb, void *arg) +{ + struct chunk_header *hdr = heap_get_chunk_hdr(m->heap, m); + + return hdr->type == CHUNK_TYPE_USED ? cb(m, arg) : 0; +} + +/* + * huge_vg_init -- initializes chunk metadata in memcheck state + */ +static void +huge_vg_init(const struct memory_block *m, int objects, + object_callback cb, void *arg) +{ + struct zone *z = ZID_TO_ZONE(&m->heap->layout_info, m->zone_id); + struct chunk_header *hdr = heap_get_chunk_hdr(m->heap, m); + struct chunk *chunk = heap_get_chunk(m->heap, m); + + VALGRIND_DO_MAKE_MEM_DEFINED(hdr, sizeof(*hdr)); + + /* + * Mark unused chunk headers as not accessible.
+ */ + VALGRIND_DO_MAKE_MEM_NOACCESS( + &z->chunk_headers[m->chunk_id + 1], + (m->size_idx - 1) * + sizeof(struct chunk_header)); + + size_t size = block_get_real_size(m); + + VALGRIND_DO_MAKE_MEM_NOACCESS(chunk, size); + + if (objects && huge_get_state(m) == MEMBLOCK_ALLOCATED) { + if (cb(m, arg) != 0) + FATAL("failed to initialize valgrind state"); + } +} + +/* + * run_vg_init -- initializes run metadata in memcheck state + */ +static void +run_vg_init(const struct memory_block *m, int objects, + object_callback cb, void *arg) +{ + struct zone *z = ZID_TO_ZONE(&m->heap->layout_info, m->zone_id); + struct chunk_header *hdr = heap_get_chunk_hdr(m->heap, m); + struct chunk_run *run = heap_get_chunk_run(m->heap, m); + + VALGRIND_DO_MAKE_MEM_DEFINED(hdr, sizeof(*hdr)); + + /* set the run metadata as defined */ + VALGRIND_DO_MAKE_MEM_DEFINED(run, RUN_BASE_METADATA_SIZE); + + struct run_bitmap b; + + run_get_bitmap(m, &b); + + /* + * Mark run data headers as defined. + */ + for (unsigned j = 1; j < m->size_idx; ++j) { + struct chunk_header *data_hdr = + &z->chunk_headers[m->chunk_id + j]; + VALGRIND_DO_MAKE_MEM_DEFINED(data_hdr, + sizeof(struct chunk_header)); + ASSERTeq(data_hdr->type, CHUNK_TYPE_RUN_DATA); + } + + VALGRIND_DO_MAKE_MEM_NOACCESS(run, SIZEOF_RUN(run, m->size_idx)); + + /* set the run bitmap as defined */ + VALGRIND_DO_MAKE_MEM_DEFINED(run, b.size + RUN_BASE_METADATA_SIZE); + + if (objects) { + if (run_iterate_used(m, cb, arg) != 0) + FATAL("failed to initialize valgrind state"); + } +} + +/* + * run_reinit_chunk -- run reinitialization on first zone traversal + */ +static void +run_reinit_chunk(const struct memory_block *m) +{ + /* suppress unused-parameter errors */ + SUPPRESS_UNUSED(m); + + /* noop */ +} + +/* + * huge_write_footer -- (internal) writes a chunk footer + */ +static void +huge_write_footer(struct chunk_header *hdr, uint32_t size_idx) +{ + if (size_idx == 1) /* that would overwrite the header */ + return; + + VALGRIND_DO_MAKE_MEM_UNDEFINED(hdr + size_idx - 1, sizeof(*hdr)); + + struct chunk_header f = *hdr; + + f.type = CHUNK_TYPE_FOOTER; + f.size_idx = size_idx; + *(hdr + size_idx - 1) = f; + /* no need to persist, footers are recreated in heap_populate_buckets */ + VALGRIND_SET_CLEAN(hdr + size_idx - 1, sizeof(f)); +} + +/* + * huge_reinit_chunk -- chunk reinitialization on first zone traversal + */ +static void +huge_reinit_chunk(const struct memory_block *m) +{ + struct chunk_header *hdr = heap_get_chunk_hdr(m->heap, m); + + if (hdr->type == CHUNK_TYPE_USED) + huge_write_footer(hdr, hdr->size_idx); +} + +/* + * run_calc_free -- calculates the number of free units in a run + */ +static void +run_calc_free(const struct memory_block *m, + uint32_t *free_space, uint32_t *max_free_block) +{ + struct run_bitmap b; + + run_get_bitmap(m, &b); + for (unsigned i = 0; i < b.nvalues; ++i) { + uint64_t value = ~b.values[i]; + + if (value == 0) + continue; + + uint32_t free_in_value = util_popcount64(value); + + *free_space = *free_space + free_in_value; + + /* + * If this value has less free blocks than already found max, + * there's no point in calculating. + */ + if (free_in_value < *max_free_block) + continue; + + /* if the entire value is empty, no point in calculating */ + if (free_in_value == RUN_BITS_PER_VALUE) { + *max_free_block = RUN_BITS_PER_VALUE; + continue; + } + + /* if already at max, no point in calculating */ + if (*max_free_block == RUN_BITS_PER_VALUE) + continue; + + /* + * Calculate the biggest free block in the bitmap. 
+ * This algorithm is not the most clever imaginable, but it's + * easy to implement and fast enough. + */ + uint16_t n = 0; + + while (value != 0) { + value &= (value << 1ULL); + n++; + } + + if (n > *max_free_block) + *max_free_block = n; + } +} + +/* + * huge_fill_pct -- huge blocks by definition use the entirety of a chunk + */ +static unsigned +huge_fill_pct(const struct memory_block *m) +{ + /* suppress unused-parameter errors */ + SUPPRESS_UNUSED(m); + + return 100; +} + +/* + * run_fill_pct -- calculates the percentage of allocated units inside of a run + */ +static unsigned +run_fill_pct(const struct memory_block *m) +{ + struct run_bitmap b; + unsigned clearbits = 0; + + run_get_bitmap(m, &b); + for (unsigned i = 0; i < b.nvalues; ++i) { + uint64_t value = ~b.values[i]; + + if (value == 0) + continue; + + clearbits += util_popcount64(value); + } + ASSERT(b.nbits >= clearbits); + unsigned setbits = b.nbits - clearbits; + + return (100 * setbits) / b.nbits; +} + +static const struct memory_block_ops mb_ops[MAX_MEMORY_BLOCK] = { + [MEMORY_BLOCK_HUGE] = { + .block_size = huge_block_size, + .prep_hdr = huge_prep_operation_hdr, + .get_lock = huge_get_lock, + .get_state = huge_get_state, + .get_user_data = block_get_user_data, + .get_real_data = huge_get_real_data, + .get_user_size = block_get_user_size, + .get_real_size = block_get_real_size, + .write_header = block_write_header, + .invalidate = block_invalidate, + .ensure_header_type = huge_ensure_header_type, + .reinit_header = block_reinit_header, + .vg_init = huge_vg_init, + .get_extra = block_get_extra, + .get_flags = block_get_flags, + .iterate_free = huge_iterate_free, + .iterate_used = huge_iterate_used, + .reinit_chunk = huge_reinit_chunk, + .calc_free = NULL, + .get_bitmap = NULL, + .fill_pct = huge_fill_pct, + }, + [MEMORY_BLOCK_RUN] = { + .block_size = run_block_size, + .prep_hdr = run_prep_operation_hdr, + .get_lock = run_get_lock, + .get_state = run_get_state, + .get_user_data = block_get_user_data, + .get_real_data = run_get_real_data, + .get_user_size = block_get_user_size, + .get_real_size = block_get_real_size, + .write_header = block_write_header, + .invalidate = block_invalidate, + .ensure_header_type = run_ensure_header_type, + .reinit_header = block_reinit_header, + .vg_init = run_vg_init, + .get_extra = block_get_extra, + .get_flags = block_get_flags, + .iterate_free = run_iterate_free, + .iterate_used = run_iterate_used, + .reinit_chunk = run_reinit_chunk, + .calc_free = run_calc_free, + .get_bitmap = run_get_bitmap, + .fill_pct = run_fill_pct, + } +}; + +/* + * memblock_huge_init -- initializes a new huge memory block + */ +struct memory_block +memblock_huge_init(struct palloc_heap *heap, + uint32_t chunk_id, uint32_t zone_id, uint32_t size_idx) +{ + struct memory_block m = MEMORY_BLOCK_NONE; + + m.chunk_id = chunk_id; + m.zone_id = zone_id; + m.size_idx = size_idx; + m.heap = heap; + + struct chunk_header *hdr = heap_get_chunk_hdr(heap, &m); + + VALGRIND_DO_MAKE_MEM_UNDEFINED(hdr, sizeof(*hdr)); + VALGRIND_ANNOTATE_NEW_MEMORY(hdr, sizeof(*hdr)); + + uint64_t nhdr = chunk_get_chunk_hdr_value(CHUNK_TYPE_FREE, + 0, size_idx); + util_atomic_store_explicit64((uint64_t *)hdr, + nhdr, memory_order_relaxed); + + mo_wal_persist(&heap->p_ops, hdr, sizeof(*hdr)); + + huge_write_footer(hdr, size_idx); + + memblock_rebuild_state(heap, &m); + + return m; +} + +/* + * memblock_run_init -- initializes a new run memory block + */ +struct memory_block +memblock_run_init(struct palloc_heap *heap, + uint32_t chunk_id, uint32_t 
zone_id, struct run_descriptor *rdsc) +{ + uint32_t size_idx = rdsc->size_idx; + + ASSERTne(size_idx, 0); + + struct memory_block m = MEMORY_BLOCK_NONE; + + m.chunk_id = chunk_id; + m.zone_id = zone_id; + m.size_idx = size_idx; + m.heap = heap; + + struct zone *z = ZID_TO_ZONE(&heap->layout_info, zone_id); + struct chunk_run *run = heap_get_chunk_run(heap, &m); + size_t runsize = SIZEOF_RUN(run, size_idx); + + VALGRIND_DO_MAKE_MEM_UNDEFINED(run, runsize); + + /* add/remove chunk_run and chunk_header to valgrind transaction */ + VALGRIND_ADD_TO_TX(run, runsize); + run->hdr.block_size = rdsc->unit_size; + run->hdr.alignment = rdsc->alignment; + + struct run_bitmap b = rdsc->bitmap; + + b.values = (uint64_t *)run->content; + + size_t bitmap_size = b.size; + + /* set all the bits */ + memset(b.values, 0xFF, bitmap_size); + + /* clear only the bits available for allocations from this bucket */ + memset(b.values, 0, sizeof(*b.values) * (b.nvalues - 1)); + + unsigned trailing_bits = b.nbits % RUN_BITS_PER_VALUE; + uint64_t last_value = UINT64_MAX << trailing_bits; + + b.values[b.nvalues - 1] = last_value; + + VALGRIND_REMOVE_FROM_TX(run, runsize); + + mo_wal_flush(&heap->p_ops, run, + sizeof(struct chunk_run_header) + + bitmap_size, 0); + + struct chunk_header run_data_hdr; + + run_data_hdr.type = CHUNK_TYPE_RUN_DATA; + run_data_hdr.flags = 0; + + VALGRIND_ADD_TO_TX(&z->chunk_headers[chunk_id], + sizeof(struct chunk_header) * size_idx); + + struct chunk_header *data_hdr; + + for (unsigned i = 1; i < size_idx; ++i) { + data_hdr = &z->chunk_headers[chunk_id + i]; + VALGRIND_DO_MAKE_MEM_UNDEFINED(data_hdr, sizeof(*data_hdr)); + VALGRIND_ANNOTATE_NEW_MEMORY(data_hdr, sizeof(*data_hdr)); + run_data_hdr.size_idx = i; + *data_hdr = run_data_hdr; + } + mo_wal_persist(&heap->p_ops, + &z->chunk_headers[chunk_id + 1], + sizeof(struct chunk_header) * (size_idx - 1)); + + struct chunk_header *hdr = &z->chunk_headers[chunk_id]; + + ASSERT(hdr->type == CHUNK_TYPE_FREE); + + VALGRIND_ANNOTATE_NEW_MEMORY(hdr, sizeof(*hdr)); + + uint64_t run_hdr = chunk_get_chunk_hdr_value(CHUNK_TYPE_RUN, + rdsc->flags, hdr->size_idx); + util_atomic_store_explicit64((uint64_t *)hdr, + run_hdr, memory_order_relaxed); + mo_wal_persist(&heap->p_ops, hdr, sizeof(*hdr)); + + VALGRIND_REMOVE_FROM_TX(&z->chunk_headers[chunk_id], + sizeof(struct chunk_header) * size_idx); + + memblock_rebuild_state(heap, &m); + m.cached_bitmap = &rdsc->bitmap; + + return m; +} + +/* + * memblock_detect_type -- looks for the corresponding chunk header and + * depending on the chunks type returns the right memory block type + */ +static enum memory_block_type +memblock_detect_type(struct palloc_heap *heap, const struct memory_block *m) +{ + enum memory_block_type ret = MEMORY_BLOCK_HUGE; + + switch (heap_get_chunk_hdr(heap, m)->type) { + case CHUNK_TYPE_RUN: + case CHUNK_TYPE_RUN_DATA: + ret = MEMORY_BLOCK_RUN; + break; + case CHUNK_TYPE_FREE: + case CHUNK_TYPE_USED: + case CHUNK_TYPE_FOOTER: + ret = MEMORY_BLOCK_HUGE; + break; + default: + /* unreachable */ + FATAL("possible zone chunks metadata corruption"); + } + return ret; +} + +/* + * memblock_from_offset -- resolves a memory block data from an offset that + * originates from the heap + */ +struct memory_block +memblock_from_offset_opt(struct palloc_heap *heap, uint64_t off, int size) +{ + struct memory_block m = MEMORY_BLOCK_NONE; + + m.heap = heap; + + off -= HEAP_PTR_TO_OFF(heap, heap->layout_info.zone0); + m.zone_id = (uint32_t)(off / ZONE_MAX_SIZE); + + off -= (ZONE_MAX_SIZE * m.zone_id) + 
sizeof(struct zone); + m.chunk_id = (uint32_t)(off / CHUNKSIZE); + + struct chunk_header *hdr = heap_get_chunk_hdr(heap, &m); + + if (hdr->type == CHUNK_TYPE_RUN_DATA) + m.chunk_id -= hdr->size_idx; + + off -= CHUNKSIZE * m.chunk_id; + + m.header_type = memblock_header_type(&m); + + off -= header_type_to_size[m.header_type]; + + m.type = off != 0 ? MEMORY_BLOCK_RUN : MEMORY_BLOCK_HUGE; + ASSERTeq(memblock_detect_type(heap, &m), m.type); + + m.m_ops = &mb_ops[m.type]; + + uint64_t unit_size = m.m_ops->block_size(&m); + + if (off != 0) { /* run */ + off -= run_get_data_offset(&m); + off -= RUN_BASE_METADATA_SIZE; + m.block_off = (uint16_t)(off / unit_size); + off -= m.block_off * unit_size; + } + + struct alloc_class_collection *acc = heap_alloc_classes(heap); + + if (acc != NULL) { + struct alloc_class *ac = alloc_class_by_run(acc, + unit_size, hdr->flags, hdr->size_idx); + if (ac != NULL) + m.cached_bitmap = &ac->rdsc.bitmap; + } + + m.size_idx = !size ? 0 : CALC_SIZE_IDX(unit_size, + memblock_header_ops[m.header_type].get_size(&m)); + + ASSERTeq(off, 0); + + return m; +} + +/* + * memblock_from_offset -- returns memory block with size + */ +struct memory_block +memblock_from_offset(struct palloc_heap *heap, uint64_t off) +{ + return memblock_from_offset_opt(heap, off, 1); +} + +/* + * memblock_rebuild_state -- fills in the runtime-state related fields of a + * memory block structure + * + * This function must be called on all memory blocks that were created by hand + * (as opposed to retrieved from memblock_from_offset function). + */ +void +memblock_rebuild_state(struct palloc_heap *heap, struct memory_block *m) +{ + m->heap = heap; + m->header_type = memblock_header_type(m); + m->type = memblock_detect_type(heap, m); + m->m_ops = &mb_ops[m->type]; + m->cached_bitmap = NULL; +} diff --git a/src/common/dav_v2/memblock.h b/src/common/dav_v2/memblock.h new file mode 100644 index 00000000000..0dd133647c3 --- /dev/null +++ b/src/common/dav_v2/memblock.h @@ -0,0 +1,297 @@ +/* SPDX-License-Identifier: BSD-3-Clause */ +/* Copyright 2016-2023, Intel Corporation */ + +/* + * memblock.h -- internal definitions for memory block + */ + +#ifndef __DAOS_COMMON_MEMBLOCK_H +#define __DAOS_COMMON_MEMBLOCK_H 1 + +#include +#include + +#include "heap_layout.h" +#include "memops.h" +#include "palloc.h" + +#define MEMORY_BLOCK_NONE \ +(struct memory_block)\ +{0, 0, 0, 0, NULL, NULL, MAX_HEADER_TYPES, MAX_MEMORY_BLOCK, NULL} + +#define MEMORY_BLOCK_IS_NONE(_m)\ +((_m).heap == NULL) + +#define MEMORY_BLOCK_EQUALS(lhs, rhs)\ +((lhs).zone_id == (rhs).zone_id && (lhs).chunk_id == (rhs).chunk_id &&\ +(lhs).block_off == (rhs).block_off && (lhs).heap == (rhs).heap) + +enum memory_block_type { + /* + * Huge memory blocks are directly backed by memory chunks. A single + * huge block can consist of several chunks. + * The persistent representation of huge memory blocks can be thought + * of as a doubly linked list with variable length elements. + * That list is stored in the chunk headers array where one element + * directly corresponds to one chunk. + * + * U - used, F - free, R - footer, . - empty + * |U| represents a used chunk with a size index of 1, with type + * information (CHUNK_TYPE_USED) stored in the corresponding header + * array element - chunk_headers[chunk_id]. + * + * |F...R| represents a free chunk with size index of 5. The empty + * chunk headers have undefined values and shouldn't be used. 
All + * chunks with size larger than 1 must have a footer in the last + * corresponding header array element - chunk_headers[chunk_id + size_idx - 1]. + * + * The above representation of chunks will be used to describe the + * way fail-safety is achieved during heap operations. + * + * Allocation of a huge memory block with size index 5: + * Initial heap state: |U| <> |F..R| <> |U| <> |F......R| + * + * The only block that matches that size is at the very end of the chunks + * list: |F......R| + * + * As the request was for a memory block of size 5, and this one's size is + * 7, there's a need to first split the chunk in two. + * 1) The last chunk header of the new allocation is marked as footer + * and the block after that one is marked as free: |F...RF.R| + * This is allowed and has no impact on the heap because this + * modification is to a chunk header that is otherwise unused; in + * other words the linked list didn't change. + * + * 2) The size index of the first header is changed from previous value + * of 7 to 5: |F...R||F.R| + * This is a single fail-safe atomic operation and this is the + * first change that is noticeable by the heap operations. + * A single linked list element is split into two new ones. + * + * 3) The allocation process either uses redo log or changes directly + * the chunk header type from free to used: |U...R| <> |F.R| + * + * In a similar fashion the reverse operation, free, is performed: + * Initial heap state: |U| <> |F..R| <> |F| <> |U...R| <> |F.R| + * + * This is the heap after the previous example with the single chunk + * in between changed from used to free. + * + * 1) Determine the neighbors of the memory block which is being + * freed. + * + * 2) Update the footer (if needed) information of the last chunk which + * is the memory block being freed or its neighbor to the right. + * |F| <> |U...R| <> |F.R << this one| + * + * 3) Update the size index and type of the left-most chunk header. + * And so this: |F << this one| <> |U...R| <> |F.R| + * becomes this: |F.......R| + * The entire chunk header can be updated in a single fail-safe + * atomic operation because its size is only 64 bits. + */ + MEMORY_BLOCK_HUGE, + /* + * Run memory blocks are chunks with CHUNK_TYPE_RUN and size index of 1. + * The entire chunk is subdivided into smaller blocks and has an + * additional metadata attached in the form of a bitmap - each bit + * corresponds to a single block. + * In this case there's no need to perform any coalescing or splitting + * on the persistent metadata. + * The bitmap is stored on a variable number of 64 bit values and + * because of the requirement of allocation fail-safe atomicity the + * maximum size index of a memory block from a run is 64 - since that's + * the limit of atomic write guarantee. + * + * The allocation/deallocation process is a single 8 byte write that + * sets/clears the corresponding bits. Depending on the user choice + * it can either be made atomically or using redo-log when grouped with + * other operations. + * It's also important to note that in the case of a realloc it might so + * happen that a single 8 byte bitmap value has its bits both set and + * cleared - that's why the run memory block metadata changes operate + * on AND'ing or OR'ing a bitmask instead of directly setting the value.
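+ * + * As a concrete (purely illustrative) example of the bitmask approach used by run_prep_operation_hdr: for a block with block_off 67 and size_idx 3, the affected value is values[67 / 64] = values[1] and the mask is ((1ULL << 3) - 1) << (67 % 64), i.e. 0x38. An allocation OR's that mask into the value (or logs a SET_BITS entry covering bits 3..5), while a free AND's its complement (or logs a CLR_BITS entry), leaving the other 61 bits of the value untouched.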
+ */ + MEMORY_BLOCK_RUN, + + MAX_MEMORY_BLOCK +}; + +enum memblock_state { + MEMBLOCK_STATE_UNKNOWN, + MEMBLOCK_ALLOCATED, + MEMBLOCK_FREE, + + MAX_MEMBLOCK_STATE, +}; + +/* runtime bitmap information for a run */ +struct run_bitmap { + unsigned nvalues; /* number of 8 byte values - size of values array */ + unsigned nbits; /* number of valid bits */ + + size_t size; /* total size of the bitmap in bytes */ + + uint64_t *values; /* pointer to the bitmap's values array */ +}; + +/* runtime information necessary to create a run */ +struct run_descriptor { + uint16_t flags; /* chunk flags for the run */ + size_t unit_size; /* the size of a single unit in a run */ + uint32_t size_idx; /* size index of a single run instance */ + size_t alignment; /* required alignment of objects */ + unsigned nallocs; /* number of allocs per run */ + struct run_bitmap bitmap; +}; + +struct memory_block_ops { + /* returns memory block size */ + size_t (*block_size)(const struct memory_block *m); + + /* prepares header modification operation */ + void (*prep_hdr)(const struct memory_block *m, + enum memblock_state dest_state, struct operation_context *ctx); + + /* returns lock associated with memory block */ + pthread_mutex_t *(*get_lock)(const struct memory_block *m); + + /* returns whether a block is allocated or not */ + enum memblock_state (*get_state)(const struct memory_block *m); + + /* returns pointer to the data of a block */ + void *(*get_user_data)(const struct memory_block *m); + + /* + * Returns the size of a memory block without overhead. + * This is the size of a data block that can be used. + */ + size_t (*get_user_size)(const struct memory_block *m); + + /* returns pointer to the beginning of data of a run block */ + void *(*get_real_data)(const struct memory_block *m); + + /* returns the size of a memory block, including headers */ + size_t (*get_real_size)(const struct memory_block *m); + + /* writes a header of an allocation */ + void (*write_header)(const struct memory_block *m, + uint64_t extra_field, uint16_t flags); + void (*invalidate)(const struct memory_block *m); + + /* + * Checks the header type of a chunk matches the expected type and + * modifies it if necessary. This is fail-safe atomic. + */ + void (*ensure_header_type)(const struct memory_block *m, + enum header_type t); + + /* + * Reinitializes a block after a heap restart. + * This is called for EVERY allocation, but *only* under Valgrind. + */ + void (*reinit_header)(const struct memory_block *m); + + /* returns the extra field of an allocation */ + uint64_t (*get_extra)(const struct memory_block *m); + + /* returns the flags of an allocation */ + uint16_t (*get_flags)(const struct memory_block *m); + + /* initializes memblock in valgrind */ + void (*vg_init)(const struct memory_block *m, int objects, + object_callback cb, void *arg); + + /* iterates over every free block */ + int (*iterate_free)(const struct memory_block *m, + object_callback cb, void *arg); + + /* iterates over every used block */ + int (*iterate_used)(const struct memory_block *m, + object_callback cb, void *arg); + + /* calculates number of free units, valid only for runs */ + void (*calc_free)(const struct memory_block *m, + uint32_t *free_space, uint32_t *max_free_block); + + /* this is called exactly once for every existing chunk */ + void (*reinit_chunk)(const struct memory_block *m); + + /* + * Initializes bitmap data for a run. 
+ * Do *not* use this function unless absolutely necessary, it breaks + * the abstraction layer by exposing implementation details. + */ + void (*get_bitmap)(const struct memory_block *m, struct run_bitmap *b); + + /* calculates the ratio between occupied and unoccupied space */ + unsigned (*fill_pct)(const struct memory_block *m); +}; + +struct memory_block { + uint32_t chunk_id; /* index of the memory block in its zone */ + uint32_t zone_id; /* index of this block zone in the heap */ + + /* + * Size index of the memory block represented in either multiple of + * CHUNKSIZE in the case of a huge chunk or in multiple of a run + * block size. + */ + uint32_t size_idx; + + /* + * Used only for run chunks, must be zeroed for huge. + * Number of preceding blocks in the chunk. In other words, the + * position of this memory block in run bitmap. + */ + uint32_t block_off; + + /* + * The variables below are associated with the memory block and are + * stored here for convenience. Those fields are filled by either the + * memblock_from_offset or memblock_rebuild_state, and they should not + * be modified manually. + */ + const struct memory_block_ops *m_ops; + struct palloc_heap *heap; + enum header_type header_type; + enum memory_block_type type; + struct run_bitmap *cached_bitmap; +}; + +/* + * This is a representation of a run memory block that is active in a bucket or + * is on a pending list in the recycler. + * This structure should never be passed around by value because the address of + * the nresv variable can be in reservations made through palloc_reserve(). Only + * if the number of reservations equals 0 the structure can be moved/freed. + */ +struct memory_block_reserved { + struct memory_block m; + + struct bucket_locked *bucket; + /* + * Number of reservations made from this run, the pointer to this value + * is stored in a user facing pobj_action structure. Decremented once + * the reservation is published or canceled. + */ + int nresv; +}; + +struct memory_block memblock_from_offset(struct palloc_heap *heap, + uint64_t off); +struct memory_block memblock_from_offset_opt(struct palloc_heap *heap, + uint64_t off, int size); +void memblock_rebuild_state(struct palloc_heap *heap, struct memory_block *m); + +struct memory_block memblock_huge_init(struct palloc_heap *heap, + uint32_t chunk_id, uint32_t zone_id, uint32_t size_idx); + +struct memory_block memblock_run_init(struct palloc_heap *heap, + uint32_t chunk_id, uint32_t zone_id, struct run_descriptor *rdsc); + +void memblock_run_bitmap(uint32_t *size_idx, uint16_t flags, + uint64_t unit_size, uint64_t alignment, void *content, + struct run_bitmap *b); + +#endif /* __DAOS_COMMON_MEMBLOCK_H */ diff --git a/src/common/dav_v2/memops.c b/src/common/dav_v2/memops.c new file mode 100644 index 00000000000..c550ce34e39 --- /dev/null +++ b/src/common/dav_v2/memops.c @@ -0,0 +1,678 @@ +/* SPDX-License-Identifier: BSD-3-Clause */ +/* Copyright 2016-2024, Intel Corporation */ + +/* + * memops.c -- aggregated memory operations helper implementation + * + * The operation collects all of the required memory modifications that + * need to happen in an atomic way (all of them or none), and abstracts + * away the storage type (transient/persistent) and the underlying + * implementation of how it's actually performed - in some cases using + * the redo log is unnecessary and the allocation process can be sped up + * a bit by completely omitting that whole machinery. + * + * The modifications are not visible until the context is processed. 
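+ * + * A minimal usage sketch pieced together from the declarations in memops.h (the ulog, callback and mo_ops arguments are placeholders, not values prescribed by this file): + * + *   ctx = operation_new(ulog, base_nbytes, extend_fn, free_fn, &p_ops, LOG_TYPE_REDO); + *   operation_start(ctx); + *   operation_add_entry(ctx, &obj->field, new_value, ULOG_OPERATION_SET); + *   operation_process(ctx);   (the logged SET becomes visible here) + *   operation_finish(ctx, 0); + *   operation_delete(ctx);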
+ */ + +#include "memops.h" +#include "obj.h" +#include "out.h" +#include "ravl.h" +#include "valgrind_internal.h" +#include "vecq.h" +#include "sys_util.h" +#include "dav_internal.h" +#include "tx.h" + +static inline int +OBJ_OFF_IS_VALID_FROM_CTX(void *ctx, uint64_t offset) +{ + dav_obj_t *dav_hdl = (dav_obj_t *)ctx; + + return OBJ_OFF_IS_VALID(dav_hdl, offset); +} + +#define ULOG_BASE_SIZE 1024 +#define OP_MERGE_SEARCH 64 + +enum operation_state { + OPERATION_IDLE, + OPERATION_IN_PROGRESS, + OPERATION_CLEANUP, +}; + +struct operation_log { + size_t capacity; /* capacity of the ulog log */ + size_t offset; /* data offset inside of the log */ + struct ulog *ulog; /* DRAM allocated log of modifications */ +}; + +/* + * operation_context -- context of an ongoing palloc operation + */ +struct operation_context { + enum log_type type; + + ulog_extend_fn extend; /* function to allocate next ulog */ + ulog_free_fn ulog_free; /* function to free next ulogs */ + + const struct mo_ops *p_ops; + struct mo_ops t_ops; /* used for transient data processing */ + struct mo_ops s_ops; /* used for shadow copy data processing */ + + size_t ulog_curr_offset; /* offset in the log for buffer stores */ + size_t ulog_curr_capacity; /* capacity of the current log */ + size_t ulog_curr_gen_num; /* transaction counter in the current log */ + struct ulog *ulog_curr; /* current persistent log */ + size_t total_logged; /* total amount of buffer stores in the logs */ + + struct ulog *ulog; /* pointer to the ulog used by context for undo ops */ + size_t ulog_base_nbytes; /* available bytes in initial ulog log */ + size_t ulog_capacity; /* sum of capacity, incl all next ulog logs */ + int ulog_auto_reserve; /* allow or do not to auto ulog reservation */ + + struct ulog_next next; /* vector of 'next' fields of persistent ulog */ + + enum operation_state state; /* operation sanity check */ + + struct operation_log pshadow_ops; /* used by context for redo ops */ + struct operation_log transient_ops; /* log of transient changes */ + + /* collection used to look for potential merge candidates */ + VECQ(, struct ulog_entry_val *) merge_entries; +}; + +/* + * operation_log_transient_init -- (internal) initialize operation log + * containing transient memory resident changes + */ +static int +operation_log_transient_init(struct operation_log *log) +{ + struct ulog *src; + + log->capacity = ULOG_BASE_SIZE; + log->offset = 0; + + D_ALLOC(src, (sizeof(struct ulog) + ULOG_BASE_SIZE)); + if (src == NULL) { + D_CRIT("Zalloc!\n"); + return -1; + } + + /* initialize underlying redo log structure */ + src->capacity = ULOG_BASE_SIZE; + + log->ulog = src; + + return 0; +} + +/* + * operation_log_persistent_init -- (internal) initialize operation log + * containing persistent memory resident changes + */ +static int +operation_log_persistent_init(struct operation_log *log, + size_t ulog_base_nbytes) +{ + struct ulog *src; + + log->capacity = ULOG_BASE_SIZE; + log->offset = 0; + + D_ALLOC(src, (sizeof(struct ulog) + ULOG_BASE_SIZE)); + if (src == NULL) { + D_CRIT("Zalloc!\n"); + return -1; + } + + /* initialize underlying redo log structure */ + src->capacity = ULOG_BASE_SIZE; + memset(src->unused, 0, sizeof(src->unused)); + + log->ulog = src; + + return 0; +} + +/* + * operation_transient_clean -- cleans pmemcheck address state + */ +static int +operation_transient_clean(void *base, const void *addr, size_t len, + unsigned flags) +{ + /* suppress unused-parameter errors */ + SUPPRESS_UNUSED(base, flags); + + VALGRIND_SET_CLEAN(addr, len); 
+ + return 0; +} + +/* + * operation_transient_drain -- noop + */ +static void +operation_transient_drain(void *base) +{ + /* suppress unused-parameter errors */ + SUPPRESS_UNUSED(base); +} + +/* + * operation_transient_memcpy -- transient memcpy wrapper + */ +static void * +operation_transient_memcpy(void *base, void *dest, const void *src, size_t len, + unsigned flags) +{ + /* suppress unused-parameter errors */ + SUPPRESS_UNUSED(base, flags); + + return memcpy(dest, src, len); +} + +/* + * operation_new -- creates new operation context + */ +struct operation_context * +operation_new(struct ulog *ulog, size_t ulog_base_nbytes, + ulog_extend_fn extend, ulog_free_fn ulog_free, + const struct mo_ops *p_ops, enum log_type type) +{ + + SUPPRESS_UNUSED(p_ops); + + struct operation_context *ctx; + + D_ALLOC_PTR(ctx); + if (ctx == NULL) { + D_CRIT("Zalloc!\n"); + goto error_ctx_alloc; + } + + ctx->ulog = ulog; + ctx->ulog_base_nbytes = ulog_base_nbytes; + ctx->ulog_capacity = ulog_capacity(ulog, + ulog_base_nbytes); + ctx->extend = extend; + ctx->ulog_free = ulog_free; + ctx->state = OPERATION_IDLE; + VEC_INIT(&ctx->next); + ulog_rebuild_next_vec(ulog, &ctx->next); + ctx->p_ops = p_ops; + ctx->type = type; + + ctx->ulog_curr_offset = 0; + ctx->ulog_curr_capacity = 0; + ctx->ulog_curr = NULL; + + ctx->t_ops.base = NULL; + ctx->t_ops.flush = operation_transient_clean; + ctx->t_ops.memcpy = operation_transient_memcpy; + ctx->t_ops.drain = operation_transient_drain; + + ctx->s_ops.base = p_ops->base; + ctx->s_ops.flush = operation_transient_clean; + ctx->s_ops.memcpy = operation_transient_memcpy; + ctx->s_ops.drain = operation_transient_drain; + ctx->s_ops.umem_store = p_ops->umem_store; + + VECQ_INIT(&ctx->merge_entries); + + if (operation_log_transient_init(&ctx->transient_ops) != 0) + goto error_ulog_alloc; + + if (operation_log_persistent_init(&ctx->pshadow_ops, + ulog_base_nbytes) != 0) + goto error_ulog_alloc; + + return ctx; + +error_ulog_alloc: + operation_delete(ctx); +error_ctx_alloc: + return NULL; +} + +/* + * operation_delete -- deletes operation context + */ +void +operation_delete(struct operation_context *ctx) +{ + VECQ_DELETE(&ctx->merge_entries); + VEC_DELETE(&ctx->next); + D_FREE(ctx->pshadow_ops.ulog); + D_FREE(ctx->transient_ops.ulog); + D_FREE(ctx); +} + +/* + * operation_free_logs -- free all logs except first + */ +void +operation_free_logs(struct operation_context *ctx) +{ + int freed = ulog_free_next(ctx->ulog, ctx->ulog_free); + + if (freed) { + ctx->ulog_capacity = ulog_capacity(ctx->ulog, + ctx->ulog_base_nbytes); + VEC_CLEAR(&ctx->next); + ulog_rebuild_next_vec(ctx->ulog, &ctx->next); + } + + ASSERTeq(VEC_SIZE(&ctx->next), 0); +} + +/* + * operation_merge -- (internal) performs operation on a field + */ +static inline int +operation_merge(struct ulog_entry_base *entry, uint64_t value, + ulog_operation_type type) +{ + struct ulog_entry_val *e = (struct ulog_entry_val *)entry; + uint16_t num, num1, num2; + uint32_t pos, pos1, pos2; + + switch (type) { +#ifdef WAL_SUPPORTS_AND_OR_OPS + case ULOG_OPERATION_AND: + e->value &= value; + break; + case ULOG_OPERATION_OR: + e->value |= value; + break; +#else + case ULOG_OPERATION_SET_BITS: + case ULOG_OPERATION_CLR_BITS: + num1 = ULOG_ENTRY_VAL_TO_BITS(e->value); + pos1 = ULOG_ENTRY_VAL_TO_POS(e->value); + num2 = ULOG_ENTRY_VAL_TO_BITS(value); + pos2 = ULOG_ENTRY_VAL_TO_POS(value); + + if ((pos2 > pos1 + num1) || (pos1 > pos2 + num2)) + return 0; /* there is a gap, no merge */ + + pos = MIN(pos1, pos2); + num = MAX(pos1 + num1, 
pos2 + num2) - pos; + + e->value = ULOG_ENTRY_TO_VAL(pos, num); + break; +#endif + case ULOG_OPERATION_SET: + e->value = value; + break; + default: + ASSERT(0); /* unreachable */ + } + return 1; +} + +/* + * operation_try_merge_entry -- tries to merge the incoming log entry with + * existing entries + * + * Because this requires a reverse foreach, it cannot be implemented using + * the on-media ulog log structure since there's no way to find what's + * the previous entry in the log. Instead, the last N entries are stored + * in a collection and traversed backwards. + */ +static int +operation_try_merge_entry(struct operation_context *ctx, + void *ptr, uint64_t value, ulog_operation_type type) +{ + int ret = 0; + uint64_t offset = OBJ_PTR_TO_OFF(ctx->p_ops->base, ptr); + + struct ulog_entry_val *e; + + VECQ_FOREACH_REVERSE(e, &ctx->merge_entries) { + if (ulog_entry_offset(&e->base) == offset) { + if (ulog_entry_type(&e->base) == type) { + if (operation_merge(&e->base, value, type)) + return 1; + } + break; + } + } + + return ret; +} + +/* + * operation_merge_entry_add -- adds a new entry to the merge collection, + * keeps capacity at OP_MERGE_SEARCH. Removes old entries in FIFO fashion. + */ +static void +operation_merge_entry_add(struct operation_context *ctx, + struct ulog_entry_val *entry) +{ + if (VECQ_SIZE(&ctx->merge_entries) == OP_MERGE_SEARCH) + (void) VECQ_DEQUEUE(&ctx->merge_entries); + + if (VECQ_ENQUEUE(&ctx->merge_entries, entry) != 0) { + /* this is fine, only runtime perf will get slower */ + D_CRIT("out of memory - unable to track entries\n"); + } +} + +/* + * operation_add_typed_entry -- adds a new entry to the current operation. If a + * persistent entry for the same address and operation type already exists, + * the new value is merged into that entry instead of appending a new one. + */ +int +operation_add_typed_entry(struct operation_context *ctx, + void *ptr, uint64_t value, + ulog_operation_type type, enum operation_log_type log_type) +{ + struct operation_log *oplog = log_type == LOG_PERSISTENT ? + &ctx->pshadow_ops : &ctx->transient_ops; + + /* + * Always make sure to have one extra spare cacheline so that the + * ulog log entry creation has enough room for zeroing. + */ + if (oplog->offset + CACHELINE_SIZE == oplog->capacity) { + size_t ncapacity = oplog->capacity + ULOG_BASE_SIZE; + struct ulog *ulog; + + D_REALLOC_NZ(ulog, oplog->ulog, SIZEOF_ULOG(ncapacity)); + if (ulog == NULL) + return -1; + oplog->capacity += ULOG_BASE_SIZE; + oplog->ulog = ulog; + oplog->ulog->capacity = oplog->capacity; + + /* + * Realloc invalidated the ulog entries that are inside of this + * vector, need to clear it to avoid use after free. + */ + VECQ_CLEAR(&ctx->merge_entries); + } + + if (log_type == LOG_PERSISTENT && + operation_try_merge_entry(ctx, ptr, value, type) != 0) + return 0; + + struct ulog_entry_val *entry = ulog_entry_val_create( + oplog->ulog, oplog->offset, ptr, value, type, + log_type == LOG_TRANSIENT ?
&ctx->t_ops : &ctx->s_ops); + + if (log_type == LOG_PERSISTENT) + operation_merge_entry_add(ctx, entry); + + oplog->offset += ulog_entry_size(&entry->base); + + return 0; +} + + +/* + * operation_add_value -- adds new entry to the current operation with + * entry type autodetected based on the memory location + */ +int +operation_add_entry(struct operation_context *ctx, void *ptr, uint64_t value, + ulog_operation_type type) +{ + const struct mo_ops *p_ops = ctx->p_ops; + dav_obj_t *pop = (dav_obj_t *)p_ops->base; + + int from_pool = OBJ_PTR_IS_VALID(pop, ptr); + + return operation_add_typed_entry(ctx, ptr, value, type, + from_pool ? LOG_PERSISTENT : LOG_TRANSIENT); +} + +/* + * operation_add_buffer -- adds a buffer operation to the log + */ +int +operation_add_buffer(struct operation_context *ctx, + void *dest, void *src, size_t size, ulog_operation_type type) +{ + size_t real_size = size + sizeof(struct ulog_entry_buf); + + /* if there's no space left in the log, reserve some more */ + if (ctx->ulog_curr_capacity == 0) { + ctx->ulog_curr_gen_num = ctx->ulog->gen_num; + if (operation_reserve(ctx, ctx->total_logged + real_size) != 0) + return -1; + + ctx->ulog_curr = ctx->ulog_curr == NULL ? ctx->ulog : + ulog_next(ctx->ulog_curr); + ASSERTne(ctx->ulog_curr, NULL); + ctx->ulog_curr_offset = 0; + ctx->ulog_curr_capacity = ctx->ulog_curr->capacity; + } + + size_t curr_size = MIN(real_size, ctx->ulog_curr_capacity); + size_t data_size = curr_size - sizeof(struct ulog_entry_buf); + size_t entry_size = ALIGN_UP(curr_size, CACHELINE_SIZE); + + /* + * To make sure that the log is consistent and contiguous, we need + * make sure that the header of the entry that would be located + * immediately after this one is zeroed. + */ + struct ulog_entry_base *next_entry = NULL; + + if (entry_size == ctx->ulog_curr_capacity) { + struct ulog *u = ulog_next(ctx->ulog_curr); + + if (u != NULL) + next_entry = (struct ulog_entry_base *)u->data; + } else { + size_t next_entry_offset = ctx->ulog_curr_offset + entry_size; + + next_entry = (struct ulog_entry_base *)(ctx->ulog_curr->data + + next_entry_offset); + } + if (next_entry != NULL) + ulog_clobber_entry(next_entry); + + /* create a persistent log entry */ + struct ulog_entry_buf *e = ulog_entry_buf_create(ctx->ulog_curr, + ctx->ulog_curr_offset, + ctx->ulog_curr_gen_num, + dest, src, data_size, + type, ctx->p_ops); + ASSERT(entry_size == ulog_entry_size(&e->base)); + ASSERT(entry_size <= ctx->ulog_curr_capacity); + + ctx->total_logged += entry_size; + ctx->ulog_curr_offset += entry_size; + ctx->ulog_curr_capacity -= entry_size; + + /* + * Recursively add the data to the log until the entire buffer is + * processed. + */ + return size - data_size == 0 ? 
0 : operation_add_buffer(ctx, + (char *)dest + data_size, + (char *)src + data_size, + size - data_size, type); +} + +/* + * operation_set_auto_reserve -- set auto reserve value for context + */ +void +operation_set_auto_reserve(struct operation_context *ctx, int auto_reserve) +{ + ctx->ulog_auto_reserve = auto_reserve; +} + +/* + * operation_process_persistent_redo -- (internal) process using ulog + */ +static void +operation_process_persistent_redo(struct operation_context *ctx) +{ + ASSERTeq(ctx->pshadow_ops.capacity % CACHELINE_SIZE, 0); + + /* Copy the redo log to wal redo */ + ulog_foreach_entry(ctx->pshadow_ops.ulog, tx_create_wal_entry, + NULL, ctx->p_ops); + + ulog_process(ctx->pshadow_ops.ulog, OBJ_OFF_IS_VALID_FROM_CTX, + ctx->p_ops); + + ulog_clobber(ctx->ulog, &ctx->next); +} + +/* + * operation_reserve -- (internal) reserves new capacity in persistent ulog log + */ +int +operation_reserve(struct operation_context *ctx, size_t new_capacity) +{ + if ((ctx->type == LOG_TYPE_UNDO) && (new_capacity > ctx->ulog_capacity)) { + if (ctx->extend == NULL) { + ERR("no extend function present"); + return -1; + } + + if (ulog_reserve(ctx->ulog, + ctx->ulog_base_nbytes, + ctx->ulog_curr_gen_num, + ctx->ulog_auto_reserve, + &new_capacity, ctx->extend, + &ctx->next) != 0) + return -1; + ctx->ulog_capacity = new_capacity; + } + + return 0; +} + +/* + * operation_init -- initializes runtime state of an operation + */ +void +operation_init(struct operation_context *ctx) +{ + struct operation_log *plog = &ctx->pshadow_ops; + struct operation_log *tlog = &ctx->transient_ops; + + VALGRIND_ANNOTATE_NEW_MEMORY(ctx, sizeof(*ctx)); + VALGRIND_ANNOTATE_NEW_MEMORY(tlog->ulog, sizeof(struct ulog) + + tlog->capacity); + VALGRIND_ANNOTATE_NEW_MEMORY(plog->ulog, sizeof(struct ulog) + + plog->capacity); + tlog->offset = 0; + plog->offset = 0; + VECQ_REINIT(&ctx->merge_entries); + + ctx->ulog_curr_offset = 0; + ctx->ulog_curr_capacity = 0; + ctx->ulog_curr_gen_num = 0; + ctx->ulog_curr = NULL; + ctx->total_logged = 0; + ctx->ulog_auto_reserve = 1; +} + +/* + * operation_start -- initializes and starts a new operation + */ +void +operation_start(struct operation_context *ctx) +{ + operation_init(ctx); + ASSERTeq(ctx->state, OPERATION_IDLE); + ctx->state = OPERATION_IN_PROGRESS; +} + +/* + * operation_cancel -- cancels a running operation + */ +void +operation_cancel(struct operation_context *ctx) +{ + ASSERTeq(ctx->state, OPERATION_IN_PROGRESS); + ctx->state = OPERATION_IDLE; +} + +/* + * operation_process -- processes registered operations + * + * The order of processing is important: persistent, transient. + * This is because the transient entries that reside on persistent memory might + * require write to a location that is currently occupied by a valid persistent + * state but becomes a transient state after operation is processed. + */ +void +operation_process(struct operation_context *ctx) +{ + /* + * If there's exactly one persistent entry there's no need to involve + * the redo log. We can simply assign the value, the operation will be + * atomic. 
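+ * + * For example, a lone 8-byte bitmap update (for instance a SET_BITS or CLR_BITS entry logged by run_prep_operation_hdr) can take this path: the single entry is turned into a WAL record and applied directly, and the full redo-log processing below is skipped.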
+ */ + int redo_process = ctx->type == LOG_TYPE_REDO && + ctx->pshadow_ops.offset != 0; + if (redo_process && + ctx->pshadow_ops.offset == sizeof(struct ulog_entry_val)) { + struct ulog_entry_base *e = (struct ulog_entry_base *) + ctx->pshadow_ops.ulog->data; + ulog_operation_type t = ulog_entry_type(e); + + if ((t == ULOG_OPERATION_SET) || ULOG_ENTRY_IS_BIT_OP(t)) { + tx_create_wal_entry(e, NULL, ctx->p_ops); + ulog_entry_apply(e, 1, ctx->p_ops); + redo_process = 0; + } + } + + if (redo_process) { + operation_process_persistent_redo(ctx); + ctx->state = OPERATION_CLEANUP; + } + D_ASSERT(ctx->type != LOG_TYPE_UNDO); + + /* process transient entries with transient memory ops */ + if (ctx->transient_ops.offset != 0) + ulog_process(ctx->transient_ops.ulog, NULL, &ctx->t_ops); +} + +/* + * operation_finish -- finalizes the operation + */ +void +operation_finish(struct operation_context *ctx, unsigned flags) +{ + ASSERTne(ctx->state, OPERATION_IDLE); + + if (ctx->type == LOG_TYPE_UNDO && ctx->total_logged != 0) + ctx->state = OPERATION_CLEANUP; + + if (ctx->state != OPERATION_CLEANUP) + goto out; + + if (ctx->type == LOG_TYPE_UNDO) { + int ret = ulog_clobber_data(ctx->ulog, + &ctx->next, ctx->ulog_free, flags); + + if (ret == 0) + goto out; + } else if (ctx->type == LOG_TYPE_REDO) { + int ret = ulog_free_next(ctx->ulog, ctx->ulog_free); + + if (ret == 0) + goto out; + } + + /* clobbering shrunk the ulog */ + ctx->ulog_capacity = ulog_capacity(ctx->ulog, + ctx->ulog_base_nbytes); + VEC_CLEAR(&ctx->next); + ulog_rebuild_next_vec(ctx->ulog, &ctx->next); + +out: + ctx->state = OPERATION_IDLE; +} diff --git a/src/common/dav_v2/memops.h b/src/common/dav_v2/memops.h new file mode 100644 index 00000000000..23e5d531cde --- /dev/null +++ b/src/common/dav_v2/memops.h @@ -0,0 +1,66 @@ +/* SPDX-License-Identifier: BSD-3-Clause */ +/* Copyright 2016-2023, Intel Corporation */ + +/* + * memops.h -- aggregated memory operations helper definitions + */ + +#ifndef __DAOS_COMMON_MEMOPS_H +#define __DAOS_COMMON_MEMOPS_H 1 + +#include +#include + +#include "vec.h" +#include "mo_wal.h" +#include "ulog.h" + +enum operation_log_type { + LOG_PERSISTENT, /* log of persistent modifications */ + LOG_TRANSIENT, /* log of transient memory modifications */ + + MAX_OPERATION_LOG_TYPE +}; + +enum log_type { + LOG_TYPE_UNDO, + LOG_TYPE_REDO, + + MAX_LOG_TYPE, +}; + +struct user_buffer_def { + void *addr; + size_t size; +}; + +struct operation_context; + +struct operation_context * +operation_new(struct ulog *redo, size_t ulog_base_nbytes, + ulog_extend_fn extend, ulog_free_fn ulog_free, + const struct mo_ops *p_ops, enum log_type type); + +void operation_init(struct operation_context *ctx); +void operation_start(struct operation_context *ctx); + +void operation_delete(struct operation_context *ctx); +void operation_free_logs(struct operation_context *ctx); + +int operation_add_buffer(struct operation_context *ctx, + void *dest, void *src, size_t size, ulog_operation_type type); + +int operation_add_entry(struct operation_context *ctx, + void *ptr, uint64_t value, ulog_operation_type type); +int operation_add_typed_entry(struct operation_context *ctx, + void *ptr, uint64_t value, + ulog_operation_type type, enum operation_log_type log_type); +void operation_set_auto_reserve(struct operation_context *ctx, + int auto_reserve); + +int operation_reserve(struct operation_context *ctx, size_t new_capacity); +void operation_process(struct operation_context *ctx); +void operation_finish(struct operation_context *ctx, unsigned flags); 
+void operation_cancel(struct operation_context *ctx); + +#endif /* __DAOS_COMMON_MEMOPS_H */ diff --git a/src/common/dav_v2/meta_io.c b/src/common/dav_v2/meta_io.c new file mode 100644 index 00000000000..2e4b044aaa7 --- /dev/null +++ b/src/common/dav_v2/meta_io.c @@ -0,0 +1,164 @@ +/* SPDX-License-Identifier: BSD-3-Clause */ +/* Copyright 2017-2024, Intel Corporation */ + +/* + * meta_io.c -- IO to/from meta blob bypassing WAL. + */ + +#include +#include + +/** Maximum number of sets of pages in-flight at a time */ +#define MAX_INFLIGHT_SETS 4 + +int +meta_clear_pages(struct umem_store *store, daos_off_t start_off, daos_size_t size, + daos_size_t hop_dist, int cnt) +{ + struct umem_store_iod iod; + struct umem_store_region iod_region[MAX_INFLIGHT_SETS]; + d_sg_list_t sgl; + d_iov_t sg_iov[MAX_INFLIGHT_SETS]; + char *src; + int rc; + int i; + + D_ASSERT((size % 4096) == 0); + D_ASSERT(hop_dist != 0); + + D_ALLOC(src, size); + if (src == NULL) + return ENOMEM; + + sgl.sg_iovs = sg_iov; + for (i = 0; i < MAX_INFLIGHT_SETS; i++) + d_iov_set(&sg_iov[i], src, size); + do { + iod.io_nr = (cnt > MAX_INFLIGHT_SETS) ? MAX_INFLIGHT_SETS : cnt; + sgl.sg_nr = iod.io_nr; + sgl.sg_nr_out = iod.io_nr; + + for (i = 0; i < iod.io_nr; i++) { + iod_region[i].sr_addr = start_off; + iod_region[i].sr_size = size; + start_off += hop_dist; + } + iod.io_regions = iod_region; + + rc = store->stor_ops->so_write(store, &iod, &sgl); + D_ASSERT(rc == 0); + + cnt -= iod.io_nr; + } while (cnt > 0); + + D_FREE(src); + return 0; +} + +/* + * meta_update -- Write size bytes from addr src to meta blob at offset off. + */ +int +meta_update(struct umem_store *store, void *src, daos_off_t off, daos_size_t size) +{ + struct umem_store_iod iod; + d_sg_list_t sgl; + d_iov_t sg_iov; + int rc; + + iod.io_nr = 1; + iod.io_region.sr_addr = off; + iod.io_region.sr_size = size; + iod.io_regions = &iod.io_region; + sgl.sg_nr = 1; + sgl.sg_nr_out = 1; + sgl.sg_iovs = &sg_iov; + d_iov_set(&sg_iov, src, size); + + D_ASSERT(store != NULL); + if (store->stor_ops->so_write == NULL) + return 0; + + rc = store->stor_ops->so_write(store, &iod, &sgl); + if (rc != 0) { + D_ERROR("Failed to write to meta at offset %lu, size %lu, rc = %d\n", off, size, + rc); + return EFAULT; + } + return 0; +} + +/* + * meta_fetch -- Fetch size bytes from offset off in the meta blob to addr dest. + */ +int +meta_fetch(struct umem_store *store, void *dest, daos_off_t off, daos_size_t size) +{ + struct umem_store_iod iod; + d_sg_list_t sgl; + d_iov_t sg_iov; + int rc; + + iod.io_nr = 1; + iod.io_region.sr_addr = off; + iod.io_region.sr_size = size; + iod.io_regions = &iod.io_region; + sgl.sg_nr = 1; + sgl.sg_nr_out = 1; + sgl.sg_iovs = &sg_iov; + d_iov_set(&sg_iov, dest, size); + + D_ASSERT(store != NULL); + if (store->stor_ops->so_write == NULL) + return 0; + + rc = store->stor_ops->so_read(store, &iod, &sgl); + if (rc != 0) { + D_ERROR("Failed to read from meta at offset %lu, size %lu, rc = %d\n", off, size, + rc); + return EFAULT; + } + return 0; +} + +/* + * meta_fetch_batch -- Fetch nelems of elem_size bytes starting from metablob offset start_off and + * hop distance of hop_dist to the buffer dest. 
+ */ +int +meta_fetch_batch(struct umem_store *store, void *dest, daos_off_t start_off, daos_size_t elem_size, + daos_size_t hop_dist, int nelems) +{ + struct umem_store_iod iod; + struct umem_store_region iod_region[MAX_INFLIGHT_SETS]; + d_sg_list_t sgl; + d_iov_t sg_iov[MAX_INFLIGHT_SETS]; + int rc; + int i; + + D_ASSERT((elem_size % 4096) == 0); + D_ASSERT(hop_dist != 0); + + sgl.sg_iovs = sg_iov; + while (nelems > 0) { + iod.io_nr = (nelems > MAX_INFLIGHT_SETS) ? MAX_INFLIGHT_SETS : nelems; + sgl.sg_nr = iod.io_nr; + sgl.sg_nr_out = iod.io_nr; + + for (i = 0; i < iod.io_nr; i++) { + d_iov_set(&sg_iov[i], dest, elem_size); + iod_region[i].sr_addr = start_off; + iod_region[i].sr_size = elem_size; + start_off += hop_dist; + dest += elem_size; + } + iod.io_regions = iod_region; + + rc = store->stor_ops->so_read(store, &iod, &sgl); + if (rc) + return -1; + + nelems -= iod.io_nr; + } + return 0; +} diff --git a/src/common/dav_v2/meta_io.h b/src/common/dav_v2/meta_io.h new file mode 100644 index 00000000000..3193df364fb --- /dev/null +++ b/src/common/dav_v2/meta_io.h @@ -0,0 +1,41 @@ +/* SPDX-License-Identifier: BSD-3-Clause */ +/* Copyright 2024, Intel Corporation */ + +/* + * meta_io.h -- definitions of statistics + */ + +#ifndef __DAOS_COMMON_META_IO_H +#define __DAOS_COMMON_META_IO_H 1 + +#include + +struct umem_store; +/* + * meta_clear_pages - fill zeros at various offsets in the meta blob. + */ +int +meta_clear_pages(struct umem_store *store, daos_off_t start_off, daos_size_t size, + daos_size_t hop_dist, int cnt); + +/* + * meta_update -- Write size bytes from addr src to meta blob at offset off. + */ +int +meta_update(struct umem_store *store, void *src, daos_off_t off, daos_size_t size); + +/* + * meta_fetch -- Fetch size bytes from offset off in the meta blob to addr dest. + */ +int +meta_fetch(struct umem_store *store, void *dest, daos_off_t off, daos_size_t size); + +/* + * meta_fetch_batch -- Fetch nelems of elem_size bytes starting from metablob offset + * start_off and hop distance of hop_dist to the buffer dest. 
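+ * + * Illustrative call (the names and sizes are hypothetical): fetching one 4 KiB header from each of 16 regions laid out zone_size bytes apart would be meta_fetch_batch(store, buf, first_hdr_off, 4096, zone_size, 16); note that elem_size must be a multiple of 4096, as asserted by the implementation.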
+ */ +int +meta_fetch_batch(struct umem_store *store, void *dest, daos_off_t start_off, daos_size_t elem_size, + daos_size_t hop_dist, int nelems); + +#endif /* __DAOS_COMMON_META_IO_H */ diff --git a/src/common/dav_v2/mo_wal.h b/src/common/dav_v2/mo_wal.h new file mode 100644 index 00000000000..5ff7b8a71ac --- /dev/null +++ b/src/common/dav_v2/mo_wal.h @@ -0,0 +1,91 @@ +/* SPDX-License-Identifier: BSD-3-Clause */ +/* Copyright 2016-2024, Intel Corporation */ + +#ifndef __DAOS_COMMON_MO_WAL_H +#define __DAOS_COMMON_MO_WAL_H 1 + +#include +#include +#include +#include + +#include "out.h" +#include "wal_tx.h" + +typedef int (*persist_fn)(void *base, const void *, size_t, unsigned); +typedef int (*flush_fn)(void *base, const void *, size_t, unsigned); +typedef void (*drain_fn)(void *base); + +typedef void *(*memcpy_fn)(void *base, void *dest, const void *src, size_t len, + unsigned flags); +typedef void *(*memmove_fn)(void *base, void *dest, const void *src, size_t len, + unsigned flags); +typedef void *(*memset_fn)(void *base, void *dest, int c, size_t len, + unsigned flags); + +typedef int (*remote_read_fn)(void *ctx, uintptr_t base, void *dest, void *addr, + size_t length); + +struct umem_store; + +struct mo_ops { + /* for 'master' replica: with or without data replication */ + persist_fn persist; /* persist function */ + flush_fn flush; /* flush function */ + drain_fn drain; /* drain function */ + memcpy_fn memcpy; /* persistent memcpy function */ + memmove_fn memmove; /* persistent memmove function */ + memset_fn memset; /* persistent memset function */ + void *base; + struct umem_store *umem_store; +}; + +static force_inline void +mo_wal_persist(const struct mo_ops *p_ops, void *d, size_t s) +{ + dav_wal_tx_snap(p_ops->base, d, s, d, 0); +} + +static force_inline void +mo_wal_flush(const struct mo_ops *p_ops, void *d, size_t s, int flags) +{ + dav_wal_tx_snap(p_ops->base, d, s, d, flags); +} + +static force_inline void +mo_wal_drain(const struct mo_ops *p_ops) +{ + SUPPRESS_UNUSED(p_ops); +} + +static force_inline void * +mo_wal_memcpy(const struct mo_ops *p_ops, void *dest, + const void *src, size_t len, unsigned flags) +{ + SUPPRESS_UNUSED(p_ops); + memcpy(dest, src, len); + mo_wal_flush(p_ops, dest, len, 0); + return dest; +} + +static force_inline void * +mo_wal_memmove(const struct mo_ops *p_ops, void *dest, + const void *src, size_t len, unsigned flags) +{ + SUPPRESS_UNUSED(p_ops); + memmove(dest, src, len); + mo_wal_flush(p_ops, dest, len, 0); + return dest; +} + +static force_inline void * +mo_wal_memset(const struct mo_ops *p_ops, void *dest, int c, + size_t len, unsigned flags) +{ + SUPPRESS_UNUSED(p_ops); + memset(dest, c, len); + dav_wal_tx_set(p_ops->base, dest, c, len); + return dest; +} + +#endif /* __DAOS_COMMON_MO_WAL_H */ diff --git a/src/common/dav_v2/obj.h b/src/common/dav_v2/obj.h new file mode 100644 index 00000000000..3182077bbfd --- /dev/null +++ b/src/common/dav_v2/obj.h @@ -0,0 +1,42 @@ +/* SPDX-License-Identifier: BSD-3-Clause */ +/* Copyright 2014-2024, Intel Corporation */ + +/* + * obj.h -- internal definitions for obj module + */ + +#ifndef __DAOS_COMMON_OBJ_H +#define __DAOS_COMMON_OBJ_H 1 + +#include +#include + +#include "dav_internal.h" +#include "stats.h" +#include "daos/mem.h" + +#define OBJ_OFF_TO_PTR(pop, off) umem_cache_off2ptr(((dav_obj_t *)pop)->do_store, off) +#define OBJ_PTR_TO_OFF(pop, ptr) umem_cache_ptr2off(((dav_obj_t *)pop)->do_store, ptr) +#define OBJ_OFF_FROM_HEAP(pop, off) \ + (((off) >= (ALIGN_UP(sizeof(struct heap_header), 4096))) && 
\ + ((off) < ((dav_obj_t *)(pop))->do_size_meta)) + +#define OBJ_OFF_IS_VALID(pop, off) OBJ_OFF_FROM_HEAP(pop, off) + +#define OBJ_PTR_FROM_POOL(pop, ptr) \ + ((uintptr_t)(ptr) >= (uintptr_t)(((dav_obj_t *)pop)->do_base) && \ + (uintptr_t)(ptr) < \ + (uintptr_t)(((dav_obj_t *)pop)->do_base) + (((dav_obj_t *)pop)->do_size_mem_usable)) + +#define OBJ_PTR_IS_VALID(pop, ptr) OBJ_PTR_FROM_POOL(pop, ptr) + +#define OBJ_OFFRANGE_FROM_HEAP(pop, start, end) \ + (((start) >= (ALIGN_UP(sizeof(struct heap_header), 4096))) && \ + ((end) <= (((dav_obj_t *)pop)->do_size_meta))) + +typedef uint64_t type_num_t; + +#define CLASS_ID_FROM_FLAG(flag) ((uint16_t)((flag) >> 48)) +#define EZONE_ID_FROM_FLAG(flag) ((uint32_t)((flag) >> 16)) + +#endif /* __DAOS_COMMON_OBJ_H */ diff --git a/src/common/dav_v2/out.h b/src/common/dav_v2/out.h new file mode 100644 index 00000000000..9c5cc8516b9 --- /dev/null +++ b/src/common/dav_v2/out.h @@ -0,0 +1,104 @@ +/* SPDX-License-Identifier: BSD-3-Clause */ +/* Copyright 2014-2023, Intel Corporation */ + +/* + * out.h -- definitions for "out" module + */ + +#ifndef __DAOS_COMMON_OUT_H +#define __DAOS_COMMON_OUT_H 1 + +#include +#include "util.h" + +#define DAV_LOG_FAC DB_TRACE + +/* enable extra debug messages and extra checks */ +/*#define DAV_EXTRA_DEBUG*/ + +#ifndef EVALUATE_DBG_EXPRESSIONS +#if defined(DAV_EXTRA_DEBUG) || defined(__clang_analyzer__) || defined(__COVERITY__) ||\ + defined(__KLOCWORK__) +#define EVALUATE_DBG_EXPRESSIONS 1 +#else +#define EVALUATE_DBG_EXPRESSIONS 0 +#endif +#endif + +#define TEST_ALWAYS_TRUE_EXPR(cnd) do { \ + if (__builtin_constant_p(cnd)) \ + COMPILE_ERROR_ON(cnd); \ +} while (0) +#define TEST_ALWAYS_EQ_EXPR(lhs, rhs) do { \ + if (__builtin_constant_p(lhs) && __builtin_constant_p(rhs)) \ + COMPILE_ERROR_ON((lhs) == (rhs)); \ +} while (0) +#define TEST_ALWAYS_NE_EXPR(lhs, rhs) do { \ + if (__builtin_constant_p(lhs) && __builtin_constant_p(rhs)) \ + COMPILE_ERROR_ON((lhs) != (rhs)); \ +} while (0) + +/* produce debug/trace output */ +#if defined(DAV_EXTRA_DEBUG) +#define DAV_DBG(fmt, ...) D_DEBUG(DAV_LOG_FAC, fmt "\n", ##__VA_ARGS__) +#else +#define DAV_DBG(fmt, ...) SUPPRESS_UNUSED(__VA_ARGS__) +#endif + +/* produce output and exit */ +#define FATAL(fmt, ...) \ + D_ASSERTF(0, fmt "\n", ## __VA_ARGS__) + +/* assert a condition is true at runtime */ +#define ASSERT_rt(cnd) do { \ + if (!EVALUATE_DBG_EXPRESSIONS || (cnd)) \ + break; \ + D_ASSERT(cnd); \ +} while (0) + +/* assert two integer values are equal at runtime */ +#define ASSERTeq_rt(lhs, rhs) do { \ + if (!EVALUATE_DBG_EXPRESSIONS || ((lhs) == (rhs)))\ + break; \ + D_ASSERTF(((lhs) == (rhs)), \ + "assertion failure: %s (0x%llx) == %s (0x%llx)", #lhs,\ + (unsigned long long)(lhs), #rhs, (unsigned long long)(rhs)); \ +} while (0) + +/* assert two integer values are not equal at runtime */ +#define ASSERTne_rt(lhs, rhs) do { \ + if (!EVALUATE_DBG_EXPRESSIONS || ((lhs) != (rhs)))\ + break; \ + D_ASSERTF(((lhs) != (rhs)), \ + "assertion failure: %s (0x%llx) != %s (0x%llx)", #lhs,\ + (unsigned long long)(lhs), #rhs, (unsigned long long)(rhs)); \ +} while (0) + +/* + * Detect useless asserts on always true expression. Please use + * COMPILE_ERROR_ON(!cnd) or ASSERT_rt(cnd) in such cases. + */ +/* assert a condition is true */ +#define ASSERT(cnd) do {\ + TEST_ALWAYS_TRUE_EXPR(cnd);\ + ASSERT_rt(cnd);\ + } while (0) + +/* assert two integer values are equal */ +#define ASSERTeq(lhs, rhs) do {\ + /* See comment in ASSERT. 
*/\ + TEST_ALWAYS_EQ_EXPR(lhs, rhs);\ + ASSERTeq_rt(lhs, rhs);\ + } while (0) + +/* assert two integer values are not equal */ +#define ASSERTne(lhs, rhs) do {\ + /* See comment in ASSERT. */\ + TEST_ALWAYS_NE_EXPR(lhs, rhs);\ + ASSERTne_rt(lhs, rhs);\ + } while (0) + +#define ERR(fmt, ...)\ + D_ERROR(fmt "\n", ## __VA_ARGS__) + +#endif /* __DAOS_COMMON_OUT_H */ diff --git a/src/common/dav_v2/palloc.c b/src/common/dav_v2/palloc.c new file mode 100644 index 00000000000..3b929583e9a --- /dev/null +++ b/src/common/dav_v2/palloc.c @@ -0,0 +1,982 @@ +/* SPDX-License-Identifier: BSD-3-Clause */ +/* Copyright 2015-2024, Intel Corporation */ + +/* + * palloc.c -- implementation of pmalloc POSIX-like API + * + * This is the front-end part of the persistent memory allocator. It uses both + * transient and persistent representation of the heap to provide memory blocks + * in a reasonable time and with an acceptable common-case fragmentation. + * + * Lock ordering in the entirety of the allocator is simple, but might be hard + * to follow at times because locks are, by necessity, externalized. + * There are two sets of locks that need to be taken into account: + * - runtime state locks, represented by buckets. + * - persistent state locks, represented by memory block mutexes. + * + * To properly use them, follow these rules: + * - When nesting, always lock runtime state first. + * Doing the reverse might cause deadlocks in other parts of the code. + * + * - When introducing functions that would require runtime state locks, + * always try to move the lock acquiring to the upper most layer. This + * usually means that the functions will simply take "struct bucket" as + * their argument. By doing so most of the locking can happen in + * the frontend part of the allocator and it's easier to follow the first + * rule because all functions in the backend can safely use the persistent + * state locks - the runtime lock, if it is needed, will be already taken + * by the upper layer. + * + * General lock ordering: + * 1. arenas.lock + * 2. buckets (sorted by ID) + * 3. memory blocks (sorted by lock address) + */ + +#include "bucket.h" +#include "valgrind_internal.h" +#include "heap_layout.h" +#include "heap.h" +#include "alloc_class.h" +#include "out.h" +#include "sys_util.h" +#include "palloc.h" +#include "ravl.h" +#include "vec.h" + +struct dav_action_internal { + /* type of operation (alloc/free vs set) */ + enum dav_action_type type; + + uint32_t padding; + + /* + * Action-specific lock that needs to be taken for the duration of + * an action. 
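+	 * For heap actions this is the lock returned by the memory block's
+	 * m_ops->get_lock(); palloc_exec_actions() sorts actions by lock
+	 * address and holds the lock across exec and on_process.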
+ */ + pthread_mutex_t *lock; + + /* action-specific data */ + union { + /* valid only when type == DAV_ACTION_TYPE_HEAP */ + struct { + uint64_t offset; + uint64_t usable_size; + enum memblock_state new_state; + struct memory_block m; + struct memory_block_reserved *mresv; + }; + + /* valid only when type == DAV_ACTION_TYPE_MEM */ + struct { + uint64_t *ptr; + uint64_t value; + }; + + /* padding, not used */ + uint64_t data2[14]; + }; +}; +D_CASSERT(offsetof(struct dav_action_internal, data2) == offsetof(struct dav_action, data2), + "struct dav_action misaligned!"); + +/* + * palloc_set_value -- creates a new set memory action + */ +void +palloc_set_value(struct palloc_heap *heap, struct dav_action *act, + uint64_t *ptr, uint64_t value) +{ + /* suppress unused-parameter errors */ + SUPPRESS_UNUSED(heap); + + act->type = DAV_ACTION_TYPE_MEM; + + struct dav_action_internal *actp = (struct dav_action_internal *)act; + + actp->ptr = ptr; + actp->value = value; + actp->lock = NULL; +} + +static void * +zone_get_base_address(struct palloc_heap *heap, void *ptr) +{ + uint64_t off = HEAP_PTR_TO_OFF(heap, ptr); + uint32_t zid = heap_off2mbid(heap, off); + + if (zid) + return ZID_TO_ZONE(&heap->layout_info, zid); + + return heap->layout_info.zone0; +} + +/* + * alloc_prep_block -- (internal) prepares a memory block for allocation + * + * Once the block is fully reserved and it's guaranteed that no one else will + * be able to write to this memory region it is safe to write the allocation + * header and call the object construction function. + * + * Because the memory block at this stage is only reserved in transient state + * there's no need to worry about fail-safety of this method because in case + * of a crash the memory will be back in the free blocks collection. + */ +static int +alloc_prep_block(struct palloc_heap *heap, const struct memory_block *m, + palloc_constr constructor, void *arg, + uint64_t extra_field, uint16_t object_flags, + struct dav_action_internal *out) +{ + void *uptr = m->m_ops->get_user_data(m); + size_t usize = m->m_ops->get_user_size(m); + + VALGRIND_DO_MEMPOOL_ALLOC(zone_get_base_address(heap, uptr), uptr, usize); + VALGRIND_DO_MAKE_MEM_UNDEFINED(uptr, usize); + VALGRIND_ANNOTATE_NEW_MEMORY(uptr, usize); + + m->m_ops->write_header(m, extra_field, object_flags); + + /* + * Set allocated memory with pattern, if debug.heap.alloc_pattern CTL + * parameter had been set. + */ + if (unlikely(heap->alloc_pattern > PALLOC_CTL_DEBUG_NO_PATTERN)) { + mo_wal_memset(&heap->p_ops, uptr, heap->alloc_pattern, + usize, 0); + VALGRIND_DO_MAKE_MEM_UNDEFINED(uptr, usize); + } + + int ret; + + if (constructor != NULL) { + ret = constructor(heap->p_ops.base, uptr, usize, arg); + if (ret != 0) { + /* + * If canceled, revert the block back to the free + * state in vg machinery. + */ + VALGRIND_DO_MEMPOOL_FREE(zone_get_base_address(heap, uptr), uptr); + return ret; + } + } + + /* + * To avoid determining the user data pointer twice this method is also + * responsible for calculating the offset of the object in the pool that + * will be used to set the offset destination pointer provided by the + * caller. + */ + out->offset = HEAP_PTR_TO_OFF(heap, uptr); + out->usable_size = usize; + + return 0; +} + +/* + * palloc_reservation_create -- creates a volatile reservation of a + * memory block. + * + * The first step in the allocation of a new block is reserving it in + * the transient heap - which is represented by the bucket abstraction. 
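+ * Here the bucket comes from the memory bucket runtime (mbrt) selected by
+ * mb_id - see heap_mbrt_get_mb() and mbrt_bucket_acquire() in the body below.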
+ * + * To provide optimal scaling for multi-threaded applications and reduce + * fragmentation the appropriate bucket is chosen depending on the + * current thread context and to which allocation class the requested + * size falls into. + * + * Once the bucket is selected, just enough memory is reserved for the + * requested size. The underlying block allocation algorithm + * (best-fit, next-fit, ...) varies depending on the bucket container. + */ +static int +palloc_reservation_create(struct palloc_heap *heap, size_t size, palloc_constr constructor, + void *arg, uint64_t extra_field, uint16_t object_flags, uint16_t class_id, + uint32_t mb_id, struct dav_action_internal *out) +{ + int err = 0; + struct memory_block *new_block = &out->m; + struct mbrt *mb; + + out->type = DAV_ACTION_TYPE_HEAP; + + ASSERT(class_id < UINT8_MAX); + struct alloc_class *c = class_id == 0 ? + heap_get_best_class(heap, size) : + alloc_class_by_id(heap_alloc_classes(heap), + (uint8_t)class_id); + + if (c == NULL) { + ERR("no allocation class for size %lu bytes", size); + errno = EINVAL; + return -1; + } + +retry: + mb = heap_mbrt_get_mb(heap, mb_id); + if (mb == NULL) { + errno = EINVAL; + return -1; + } + + /* + * The caller provided size in bytes, but buckets operate in + * 'size indexes' which are multiples of the block size in the + * bucket. + * + * For example, to allocate 500 bytes from a bucket that + * provides 256 byte blocks two memory 'units' are required. + */ + ssize_t size_idx = alloc_class_calc_size_idx(c, size); + + if (size_idx < 0) { + ERR("allocation class not suitable for size %lu bytes", + size); + errno = EINVAL; + return -1; + } + ASSERT(size_idx <= UINT32_MAX); + *new_block = MEMORY_BLOCK_NONE; + new_block->size_idx = (uint32_t)size_idx; + + err = heap_mbrt_update_alloc_class_buckets(heap, mb, c); + if (err != 0) { + errno = err; + return -1; + } + + struct bucket *b = mbrt_bucket_acquire(mb, c->id); + + err = heap_get_bestfit_block(heap, b, new_block); + if (err != 0) + goto out; + + if (alloc_prep_block(heap, new_block, constructor, arg, + extra_field, object_flags, out) != 0) { + /* + * Constructor returned non-zero value which means + * the memory block reservation has to be rolled back. + */ + if (new_block->type == MEMORY_BLOCK_HUGE) + bucket_insert_block(b, new_block); + err = ECANCELED; + goto out; + } + + /* + * Each as of yet unfulfilled reservation needs to be tracked in the + * runtime state. + * The memory block cannot be put back into the global state unless + * there are no active reservations. + */ + out->mresv = bucket_active_block(b); + if (out->mresv != NULL) + util_fetch_and_add64(&out->mresv->nresv, 1); + + out->lock = new_block->m_ops->get_lock(new_block); + out->new_state = MEMBLOCK_ALLOCATED; + +out: + mbrt_bucket_release(b); + + if (err == 0) + return 0; + + /* + * If there is no memory in evictable zone then do the allocation + * from non-evictable zone. 
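+	 * mb_id 0 denotes the non-evictable zone, hence the retry below with
+	 * mb_id reset to 0 after logging the allocation failure.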
+ */ + if ((mb_id != 0) && (err == ENOMEM)) { + heap_mbrt_log_alloc_failure(heap, mb_id); + mb_id = 0; + goto retry; + } + + errno = err; + return -1; +} + +/* + * palloc_heap_action_exec -- executes a single heap action (alloc, free) + */ +static void +palloc_heap_action_exec(struct palloc_heap *heap, + const struct dav_action_internal *act, + struct operation_context *ctx) +{ + struct zone *zone; +#ifdef DAV_EXTRA_DEBUG + if (act->m.m_ops->get_state(&act->m) == act->new_state) { + D_CRIT("invalid operation or heap corruption\n"); + ASSERT(0); + } +#endif + + /* + * The actual required metadata modifications are chunk-type + * dependent, but it always is a modification of a single 8 byte + * value - either modification of few bits in a bitmap or + * changing a chunk type from free to used or vice versa. + */ + act->m.m_ops->prep_hdr(&act->m, act->new_state, ctx); + + /* + * Update the memory bucket utilization info. + */ + if (heap_mbrt_ismb_evictable(heap, act->m.zone_id)) + zone = ZID_TO_ZONE(&heap->layout_info, act->m.zone_id); + else + zone = heap->layout_info.zone0; + + if (act->new_state == MEMBLOCK_FREE) + zone->header.sp_usage -= act->m.m_ops->get_real_size(&act->m); + else + zone->header.sp_usage += act->m.m_ops->get_real_size(&act->m); + operation_add_entry(ctx, &zone->header.sp_usage, zone->header.sp_usage, ULOG_OPERATION_SET); +} + +/* + * palloc_restore_free_chunk_state -- updates the runtime state of a free chunk. + * + * This function also takes care of coalescing of huge chunks. + */ +static void +palloc_restore_free_chunk_state(struct palloc_heap *heap, + struct memory_block *m) +{ + struct mbrt *mb = heap_mbrt_get_mb(heap, m->zone_id); + + if (m->type == MEMORY_BLOCK_HUGE) { + struct bucket *b = mbrt_bucket_acquire(mb, DEFAULT_ALLOC_CLASS_ID); + + if (heap_free_chunk_reuse(heap, b, m) != 0) { + if (errno == EEXIST) + FATAL("duplicate runtime chunk state, possible double free"); + else + D_CRIT("unable to track runtime chunk state\n"); + } + mbrt_bucket_release(b); + } +} + +/* + * palloc_mem_action_noop -- empty handler for unused memory action funcs + */ +static void +palloc_mem_action_noop(struct palloc_heap *heap, + struct dav_action_internal *act) +{ + /* suppress unused-parameter errors */ + SUPPRESS_UNUSED(heap, act); +} + +/* + * palloc_reservation_clear -- clears the reservation state of the block, + * discards the associated memory block if possible + */ +static void +palloc_reservation_clear(struct palloc_heap *heap, + struct dav_action_internal *act, int publish) +{ + if (act->mresv == NULL) + return; + + struct memory_block_reserved *mresv = act->mresv; + struct bucket_locked *locked = mresv->bucket; + + if (!publish) { + /* + * If a memory block used for the action is the currently active + * memory block of the bucket it can be returned back to the + * bucket. This way it will be available for future allocation + * requests, improving performance. + */ + struct bucket *b = bucket_acquire(locked); + + bucket_try_insert_attached_block(b, &act->m); + bucket_release(b); + } + + if (util_fetch_and_sub64(&mresv->nresv, 1) == 1) { + VALGRIND_ANNOTATE_HAPPENS_AFTER(&mresv->nresv); + /* + * If the memory block used for the action is not currently used + * in any bucket nor action it can be discarded (given back to + * the heap). 
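+		 * The fetch-and-sub above yields the previous count, so seeing
+		 * 1 here means this thread dropped the last outstanding
+		 * reservation and owns the cleanup below.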
+ */ + heap_discard_run(heap, &mresv->m); + D_FREE(mresv); + } else { + VALGRIND_ANNOTATE_HAPPENS_BEFORE(&mresv->nresv); + } +} + +/* + * palloc_heap_action_on_cancel -- restores the state of the heap + */ +static void +palloc_heap_action_on_cancel(struct palloc_heap *heap, + struct dav_action_internal *act) +{ + void *uptr; + + if (act->new_state == MEMBLOCK_FREE) + return; + + uptr = act->m.m_ops->get_user_data(&act->m); + VALGRIND_DO_MEMPOOL_FREE(zone_get_base_address(heap, uptr), uptr); + + act->m.m_ops->invalidate(&act->m); + palloc_restore_free_chunk_state(heap, &act->m); + + palloc_reservation_clear(heap, act, 0 /* publish */); +} + +/* + * palloc_heap_action_on_process -- performs finalization steps under a lock + * on the persistent state + */ +static void +palloc_heap_action_on_process(struct palloc_heap *heap, + struct dav_action_internal *act) +{ + if (act->new_state == MEMBLOCK_ALLOCATED) { + STATS_INC(heap->stats, persistent, heap_curr_allocated, + act->m.m_ops->get_real_size(&act->m)); + if (act->m.type == MEMORY_BLOCK_RUN) { + STATS_INC(heap->stats, transient, heap_run_allocated, + act->m.m_ops->get_real_size(&act->m)); + } + heap_mbrt_incrmb_usage(heap, act->m.zone_id, act->m.m_ops->get_real_size(&act->m)); + } else if (act->new_state == MEMBLOCK_FREE) { + if (On_memcheck) { + void *ptr = act->m.m_ops->get_user_data(&act->m); + + VALGRIND_DO_MEMPOOL_FREE(zone_get_base_address(heap, ptr), ptr); + } + + STATS_SUB(heap->stats, persistent, heap_curr_allocated, + act->m.m_ops->get_real_size(&act->m)); + if (act->m.type == MEMORY_BLOCK_RUN) { + STATS_SUB(heap->stats, transient, heap_run_allocated, + act->m.m_ops->get_real_size(&act->m)); + } + heap_memblock_on_free(heap, &act->m); + heap_mbrt_incrmb_usage(heap, act->m.zone_id, + -(act->m.m_ops->get_real_size(&act->m))); + } +} + +/* + * palloc_heap_action_on_unlock -- performs finalization steps that need to be + * performed without a lock on persistent state + */ +static void +palloc_heap_action_on_unlock(struct palloc_heap *heap, + struct dav_action_internal *act) +{ + if (act->new_state == MEMBLOCK_ALLOCATED) + palloc_reservation_clear(heap, act, 1 /* publish */); + else if (act->new_state == MEMBLOCK_FREE) + palloc_restore_free_chunk_state(heap, &act->m); +} + +/* + * palloc_mem_action_exec -- executes a single memory action (set, and, or) + */ +static void +palloc_mem_action_exec(struct palloc_heap *heap, + const struct dav_action_internal *act, + struct operation_context *ctx) +{ + /* suppress unused-parameter errors */ + SUPPRESS_UNUSED(heap); + + operation_add_entry(ctx, act->ptr, act->value, ULOG_OPERATION_SET); +} + +static const struct { + /* + * Translate action into some number of operation_entry'ies. + */ + void (*exec)(struct palloc_heap *heap, + const struct dav_action_internal *act, + struct operation_context *ctx); + + /* + * Cancel any runtime state changes. Can be called only when action has + * not been translated to persistent operation yet. + */ + void (*on_cancel)(struct palloc_heap *heap, + struct dav_action_internal *act); + + /* + * Final steps after persistent state has been modified. Performed + * under action-specific lock. + */ + void (*on_process)(struct palloc_heap *heap, + struct dav_action_internal *act); + + /* + * Final steps after persistent state has been modified. Performed + * after action-specific lock has been dropped. 
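+	 * (DAV_ACTION_TYPE_MEM needs no such finalization, so its on_cancel,
+	 * on_process and on_unlock slots below point at palloc_mem_action_noop.)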
+ */ + void (*on_unlock)(struct palloc_heap *heap, + struct dav_action_internal *act); +} action_funcs[DAV_MAX_ACTION_TYPE] = { + [DAV_ACTION_TYPE_HEAP] = { + .exec = palloc_heap_action_exec, + .on_cancel = palloc_heap_action_on_cancel, + .on_process = palloc_heap_action_on_process, + .on_unlock = palloc_heap_action_on_unlock, + }, + [DAV_ACTION_TYPE_MEM] = { + .exec = palloc_mem_action_exec, + .on_cancel = palloc_mem_action_noop, + .on_process = palloc_mem_action_noop, + .on_unlock = palloc_mem_action_noop, + } +}; + +/* + * palloc_action_compare -- compares two actions based on lock address + */ +static int +palloc_action_compare(const void *lhs, const void *rhs) +{ + const struct dav_action_internal *mlhs = lhs; + const struct dav_action_internal *mrhs = rhs; + uintptr_t vlhs = (uintptr_t)(mlhs->lock); + uintptr_t vrhs = (uintptr_t)(mrhs->lock); + + if (vlhs < vrhs) + return -1; + if (vlhs > vrhs) + return 1; + + return 0; +} + +/* + * palloc_exec_actions -- perform the provided free/alloc operations + */ +static void +palloc_exec_actions(struct palloc_heap *heap, + struct operation_context *ctx, + struct dav_action_internal *actv, + size_t actvcnt) +{ + /* + * The operations array is sorted so that proper lock ordering is + * ensured. + */ + if (actv) + qsort(actv, actvcnt, sizeof(struct dav_action_internal), + palloc_action_compare); + else + ASSERTeq(actvcnt, 0); + + struct dav_action_internal *act; + + for (size_t i = 0; i < actvcnt; ++i) { + act = &actv[i]; + + /* + * This lock must be held for the duration between the creation + * of the allocation metadata updates in the operation context + * and the operation processing. This is because a different + * thread might operate on the same 8-byte value of the run + * bitmap and override allocation performed by this thread. + */ + if (i == 0 || act->lock != actv[i - 1].lock) { + if (act->lock) + util_mutex_lock(act->lock); + } + + /* translate action to some number of operation_entry'ies */ + action_funcs[act->type].exec(heap, act, ctx); + } + + /* wait for all allocated object headers to be persistent */ + mo_wal_drain(&heap->p_ops); + + /* perform all persistent memory operations */ + operation_process(ctx); + + for (size_t i = 0; i < actvcnt; ++i) { + act = &actv[i]; + + action_funcs[act->type].on_process(heap, act); + + if (i == actvcnt - 1 || act->lock != actv[i + 1].lock) { + if (act->lock) + util_mutex_unlock(act->lock); + } + } + + for (size_t i = 0; i < actvcnt; ++i) { + act = &actv[i]; + + action_funcs[act->type].on_unlock(heap, act); + } + + operation_finish(ctx, 0); +} + +/* + * palloc_reserve -- creates a single reservation + */ +int +palloc_reserve(struct palloc_heap *heap, size_t size, palloc_constr constructor, void *arg, + uint64_t extra_field, uint16_t object_flags, uint16_t class_id, uint32_t mb_id, + struct dav_action *act) +{ + COMPILE_ERROR_ON(sizeof(struct dav_action) != + sizeof(struct dav_action_internal)); + + return palloc_reservation_create(heap, size, constructor, arg, extra_field, object_flags, + class_id, mb_id, (struct dav_action_internal *)act); +} + +/* + * palloc_action_isalloc - action is a heap reservation + * created by palloc_reserve(). 
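+ *
+ * Typical reservation flow (illustrative sketch only; error handling is
+ * elided and 'heap', 'ctx', 'size' and 'commit' are assumed to be provided
+ * by the caller):
+ *
+ *	struct dav_action act;
+ *
+ *	if (palloc_reserve(heap, size, NULL, NULL, 0, 0, 0, 0, &act) == 0) {
+ *		if (commit)
+ *			palloc_publish(heap, &act, 1, ctx);
+ *		else
+ *			palloc_cancel(heap, &act, 1);
+ *	}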
+ */ +int +palloc_action_isalloc(struct dav_action *act) +{ + struct dav_action_internal *actp = (struct dav_action_internal *)act; + + return ((actp->type == DAV_ACTION_TYPE_HEAP) && + (actp->new_state == MEMBLOCK_ALLOCATED)); +} + +uint64_t +palloc_get_realoffset(struct palloc_heap *heap, uint64_t off) +{ + struct memory_block m = memblock_from_offset(heap, off); + + return HEAP_PTR_TO_OFF(m.heap, m.m_ops->get_real_data(&m)); +} + +/* + * palloc_get_prange -- get the start offset and size of allocated memory that + * needs to be persisted. + * + * persist_udata - if true, persist the user data. + */ +void +palloc_get_prange(struct dav_action *act, uint64_t *const offp, uint64_t *const sizep, + int persist_udata) +{ + struct dav_action_internal *act_in = (struct dav_action_internal *)act; + + D_ASSERT(act_in->type == DAV_ACTION_TYPE_HEAP); + /* we need to persist the header if present */ + *offp = HEAP_PTR_TO_OFF(act_in->m.heap, act_in->m.m_ops->get_real_data(&act_in->m)); + *sizep = header_type_to_size[act_in->m.header_type]; + + D_ASSERT(act_in->offset == *offp + header_type_to_size[act_in->m.header_type]); + /* persist the user data */ + if (persist_udata) + *sizep += act_in->usable_size; +} + +/* + * palloc_defer_free -- creates an internal deferred free action + */ +static void +palloc_defer_free_create(struct palloc_heap *heap, uint64_t off, + struct dav_action_internal *out) +{ + COMPILE_ERROR_ON(sizeof(struct dav_action) != + sizeof(struct dav_action_internal)); + + out->type = DAV_ACTION_TYPE_HEAP; + out->offset = off; + out->m = memblock_from_offset(heap, off); + + /* + * For the duration of free we may need to protect surrounding + * metadata from being modified. + */ + out->lock = out->m.m_ops->get_lock(&out->m); + out->mresv = NULL; + out->new_state = MEMBLOCK_FREE; +} + +/* + * palloc_defer_free -- creates a deferred free action + */ +void +palloc_defer_free(struct palloc_heap *heap, uint64_t off, struct dav_action *act) +{ + COMPILE_ERROR_ON(sizeof(struct dav_action) != + sizeof(struct dav_action_internal)); + + palloc_defer_free_create(heap, off, (struct dav_action_internal *)act); +} + +/* + * palloc_cancel -- cancels all reservations in the array + */ +void +palloc_cancel(struct palloc_heap *heap, struct dav_action *actv, size_t actvcnt) +{ + struct dav_action_internal *act; + + for (size_t i = 0; i < actvcnt; ++i) { + act = (struct dav_action_internal *)&actv[i]; + action_funcs[act->type].on_cancel(heap, act); + } +} + +/* + * palloc_publish -- publishes all reservations in the array + */ +void +palloc_publish(struct palloc_heap *heap, struct dav_action *actv, size_t actvcnt, + struct operation_context *ctx) +{ + palloc_exec_actions(heap, ctx, + (struct dav_action_internal *)actv, actvcnt); +} + +/* + * palloc_operation -- persistent memory operation. Takes a NULL pointer + * or an existing memory block and modifies it to occupy, at least, 'size' + * number of bytes. + * + * The malloc, free and realloc routines are implemented in the context of this + * common operation which encompasses all of the functionality usually done + * separately in those methods. + * + * The first thing that needs to be done is determining which memory blocks + * will be affected by the operation - this varies depending on the whether the + * operation will need to modify or free an existing block and/or allocate + * a new one. 
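+ *
+ * In terms of the classic calls, the checks on 'off' and 'size' in the body
+ * below select the path taken:
+ * - off == 0, size != 0: plain allocation
+ * - off != 0, size == 0: plain free
+ * - off != 0, size != 0: reallocation (reserve new block, copy, free old)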
+ * + * Simplified allocation process flow is as follows: + * - reserve a new block in the transient heap + * - prepare the new block + * - create redo log of required modifications + * - chunk metadata + * - offset of the new object + * - commit and process the redo log + * + * And similarly, the deallocation process: + * - create redo log of required modifications + * - reverse the chunk metadata back to the 'free' state + * - set the destination of the object offset to zero + * - commit and process the redo log + * There's an important distinction in the deallocation process - it does not + * return the memory block to the transient container. That is done once no more + * memory is available. + * + * Reallocation is a combination of the above, with one additional step + * of copying the old content. + */ +int +palloc_operation(struct palloc_heap *heap, uint64_t off, uint64_t *dest_off, size_t size, + palloc_constr constructor, void *arg, uint64_t extra_field, uint16_t object_flags, + uint16_t class_id, uint32_t mb_id, struct operation_context *ctx) +{ + size_t user_size = 0; + + size_t nops = 0; + uint64_t aoff; + uint64_t asize; + struct dav_action_internal ops[2]; + struct dav_action_internal *alloc = NULL; + struct dav_action_internal *dealloc = NULL; + + /* + * The offset of an existing block can be nonzero which means this + * operation is either free or a realloc - either way the offset of the + * object needs to be translated into memory block, which is a structure + * that all of the heap methods expect. + */ + if (off != 0) { + dealloc = &ops[nops++]; + palloc_defer_free_create(heap, off, dealloc); + user_size = dealloc->m.m_ops->get_user_size(&dealloc->m); + if (user_size == size) { + operation_cancel(ctx); + return 0; + } + } + + /* alloc or realloc */ + if (size != 0) { + alloc = &ops[nops++]; + if (palloc_reservation_create(heap, size, constructor, arg, extra_field, + object_flags, class_id, mb_id, alloc) != 0) { + operation_cancel(ctx); + return -1; + } + + palloc_get_prange((struct dav_action *)alloc, &aoff, &asize, 0); + if (asize) /* != CHUNK_FLAG_HEADER_NONE */ + dav_wal_tx_snap(heap->p_ops.base, HEAP_OFF_TO_PTR(heap, aoff), + asize, HEAP_OFF_TO_PTR(heap, aoff), 0); + } + + /* realloc */ + if (alloc != NULL && dealloc != NULL) { + /* copy data to newly allocated memory */ + size_t old_size = user_size; + size_t to_cpy = old_size > size ? size : old_size; + + VALGRIND_ADD_TO_TX( + HEAP_OFF_TO_PTR(heap, alloc->offset), + to_cpy); + mo_wal_memcpy(&heap->p_ops, + HEAP_OFF_TO_PTR(heap, alloc->offset), + HEAP_OFF_TO_PTR(heap, off), + to_cpy, + 0); + VALGRIND_REMOVE_FROM_TX( + HEAP_OFF_TO_PTR(heap, alloc->offset), + to_cpy); + } + + /* + * If the caller provided a destination value to update, it needs to be + * modified atomically alongside the heap metadata, and so the operation + * context must be used. + */ + if (dest_off) { + operation_add_entry(ctx, dest_off, + alloc ? alloc->offset : 0, ULOG_OPERATION_SET); + } + + /* and now actually perform the requested operation! 
*/ + palloc_exec_actions(heap, ctx, ops, nops); + + return 0; +} + +/* + * palloc_usable_size -- returns the number of bytes in the memory block + */ +size_t +palloc_usable_size(struct palloc_heap *heap, uint64_t off) +{ + struct memory_block m = memblock_from_offset(heap, off); + + return m.m_ops->get_user_size(&m); +} + +/* + * palloc_extra -- returns allocation extra field + */ +uint64_t +palloc_extra(struct palloc_heap *heap, uint64_t off) +{ + struct memory_block m = memblock_from_offset(heap, off); + + return m.m_ops->get_extra(&m); +} + +/* + * palloc_flags -- returns allocation flags + */ +uint16_t +palloc_flags(struct palloc_heap *heap, uint64_t off) +{ + struct memory_block m = memblock_from_offset(heap, off); + + return m.m_ops->get_flags(&m); +} + +/* + * pmalloc_search_cb -- (internal) foreach callback. + */ +static int +pmalloc_search_cb(const struct memory_block *m, void *arg) +{ + struct memory_block *out = arg; + + if (MEMORY_BLOCK_EQUALS(*m, *out)) + return 0; /* skip the same object */ + + *out = *m; + + return 1; +} + +/* + * palloc_first -- returns the first object from the heap. + */ +uint64_t +palloc_first(struct palloc_heap *heap) +{ + struct memory_block search = MEMORY_BLOCK_NONE; + + heap_foreach_object(heap, pmalloc_search_cb, + &search, MEMORY_BLOCK_NONE); + + if (MEMORY_BLOCK_IS_NONE(search)) + return 0; + + void *uptr = search.m_ops->get_user_data(&search); + + return HEAP_PTR_TO_OFF(heap, uptr); +} + +/* + * palloc_next -- returns the next object relative to 'off'. + */ +uint64_t +palloc_next(struct palloc_heap *heap, uint64_t off) +{ + struct memory_block m = memblock_from_offset(heap, off); + struct memory_block search = m; + + heap_foreach_object(heap, pmalloc_search_cb, &search, m); + + if (MEMORY_BLOCK_IS_NONE(search) || + MEMORY_BLOCK_EQUALS(search, m)) + return 0; + + void *uptr = search.m_ops->get_user_data(&search); + + return HEAP_PTR_TO_OFF(heap, uptr); +} + +#if VG_MEMCHECK_ENABLED +/* + * palloc_vg_register_alloc -- (internal) registers allocation header + * in Valgrind + */ +static int +palloc_vg_register_alloc(const struct memory_block *m, void *arg) +{ + struct palloc_heap *heap = arg; + + m->m_ops->reinit_header(m); + + void *uptr = m->m_ops->get_user_data(m); + size_t usize = m->m_ops->get_user_size(m); + + VALGRIND_DO_MEMPOOL_ALLOC(zone_get_base_address(heap, uptr), uptr, usize); + VALGRIND_DO_MAKE_MEM_DEFINED(uptr, usize); + + return 0; +} + +/* + * palloc_heap_vg_open -- notifies Valgrind about heap layout + */ +void +palloc_heap_vg_open(struct palloc_heap *heap, int objects) +{ + heap_vg_open(heap, palloc_vg_register_alloc, heap, objects); +} + +void +palloc_heap_vg_zone_open(struct palloc_heap *heap, uint32_t zid, int objects) +{ + heap_vg_zone_open(heap, zid, palloc_vg_register_alloc, heap, objects); +} +#endif diff --git a/src/common/dav_v2/palloc.h b/src/common/dav_v2/palloc.h new file mode 100644 index 00000000000..027fb94667b --- /dev/null +++ b/src/common/dav_v2/palloc.h @@ -0,0 +1,90 @@ +/* SPDX-License-Identifier: BSD-3-Clause */ +/* Copyright 2015-2024, Intel Corporation */ + +/* + * palloc.h -- internal definitions for persistent allocator + */ + +#ifndef __DAOS_COMMON_PALLOC_H +#define __DAOS_COMMON_PALLOC_H 1 + +#include +#include + +#include "memops.h" +#include "ulog.h" +#include "valgrind_internal.h" +#include "stats.h" +#include "dav_v2.h" + +#define PALLOC_CTL_DEBUG_NO_PATTERN (-1) + +struct palloc_heap { + struct mo_ops p_ops; + struct heap_layout_info layout_info; + struct heap_rt *rt; + uint64_t size; + struct stats 
*stats; + void *base; + int alloc_pattern; +}; + +struct memory_block; +struct mbrt; + +typedef int (*palloc_constr)(void *base, void *ptr, size_t usable_size, void *arg); + +int +palloc_operation(struct palloc_heap *heap, uint64_t off, uint64_t *dest_off, size_t size, + palloc_constr constructor, void *arg, uint64_t extra_field, uint16_t object_flags, + uint16_t class_id, uint32_t zset_id, struct operation_context *ctx); + +int +palloc_reserve(struct palloc_heap *heap, size_t size, palloc_constr constructor, void *arg, + uint64_t extra_field, uint16_t object_flags, uint16_t class_id, uint32_t zset_id, + struct dav_action *act); + +int +palloc_action_isalloc(struct dav_action *act); +void +palloc_get_prange(struct dav_action *act, uint64_t *const off, uint64_t *const size, + int persist_udata); +uint64_t +palloc_get_realoffset(struct palloc_heap *heap, uint64_t off); + +void +palloc_defer_free(struct palloc_heap *heap, uint64_t off, struct dav_action *act); + +void +palloc_cancel(struct palloc_heap *heap, struct dav_action *actv, size_t actvcnt); + +void +palloc_publish(struct palloc_heap *heap, struct dav_action *actv, size_t actvcnt, + struct operation_context *ctx); + +void +palloc_set_value(struct palloc_heap *heap, struct dav_action *act, uint64_t *ptr, uint64_t value); + +uint64_t +palloc_first(struct palloc_heap *heap); +uint64_t +palloc_next(struct palloc_heap *heap, uint64_t off); + +size_t +palloc_usable_size(struct palloc_heap *heap, uint64_t off); +uint64_t +palloc_extra(struct palloc_heap *heap, uint64_t off); +uint16_t +palloc_flags(struct palloc_heap *heap, uint64_t off); + +/* foreach callback, terminates iteration if return value is non-zero */ +typedef int (*object_callback)(const struct memory_block *m, void *arg); + +#if VG_MEMCHECK_ENABLED +void +palloc_heap_vg_open(struct palloc_heap *heap, int objects); +void +palloc_heap_vg_zone_open(struct palloc_heap *heap, uint32_t zid, int objects); +#endif + +#endif /* __DAOS_COMMON_PALLOC_H */ diff --git a/src/common/dav_v2/queue.h b/src/common/dav_v2/queue.h new file mode 100644 index 00000000000..654c60cec9b --- /dev/null +++ b/src/common/dav_v2/queue.h @@ -0,0 +1,112 @@ +/* + * Source: glibc 2.24 (git://sourceware.org/glibc.git /misc/sys/queue.h) + * + * Copyright (c) 1991, 1993 + * The Regents of the University of California. All rights reserved. + * Copyright (c) 2016, Microsoft Corporation. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)queue.h 8.5 (Berkeley) 8/20/94 + */ + +#ifndef __DAOS_COMMON_QUEUE_H_ +#define __DAOS_COMMON_QUEUE_H_ + +/* + * This file defines five types of data structures: singly-linked lists, + * lists, simple queues, tail queues, and circular queues. + * + * A singly-linked list is headed by a single forward pointer. The + * elements are singly linked for minimum space and pointer manipulation + * overhead at the expense of O(n) removal for arbitrary elements. New + * elements can be added to the list after an existing element or at the + * head of the list. Elements being removed from the head of the list + * should use the explicit macro for this purpose for optimum + * efficiency. A singly-linked list may only be traversed in the forward + * direction. Singly-linked lists are ideal for applications with large + * datasets and few or no removals or for implementing a LIFO queue. + * + * For details on the use of these macros, see the queue(3) manual page. + */ + +/* + * Singly-linked List definitions. + */ +#define DAV_SLIST_HEAD(name, type) \ +struct name { \ + struct type *slh_first; /* first element */ \ +} + +#define DAV_SLIST_HEAD_INITIALIZER(head) \ + { NULL } + +#define DAV_SLIST_ENTRY(type) \ +struct { \ + struct type *sle_next; /* next element */ \ +} + +/* + * Singly-linked List functions. + */ +#define DAV_SLIST_INIT(head) ((head)->slh_first = NULL) + +#define DAV_SLIST_INSERT_AFTER(slistelm, elm, field) do { \ + (elm)->field.sle_next = (slistelm)->field.sle_next; \ + (slistelm)->field.sle_next = (elm); \ +} while (/*CONSTCOND*/0) + +#define DAV_SLIST_INSERT_HEAD(head, elm, field) do { \ + (elm)->field.sle_next = (head)->slh_first; \ + (head)->slh_first = (elm); \ +} while (/*CONSTCOND*/0) + +#define DAV_SLIST_REMOVE_HEAD(head, field) \ + ((head)->slh_first = (head)->slh_first->field.sle_next) + +#define DAV_SLIST_REMOVE(head, elm, type, field) do { \ + if ((head)->slh_first == (elm)) { \ + DAV_SLIST_REMOVE_HEAD((head), field); \ + } \ + else { \ + struct type *curelm = (head)->slh_first; \ + while (curelm->field.sle_next != (elm)) \ + curelm = curelm->field.sle_next; \ + curelm->field.sle_next = \ + curelm->field.sle_next->field.sle_next; \ + } \ +} while (/*CONSTCOND*/0) + +#define DAV_SLIST_FOREACH(var, head, field) \ + for ((var) = (head)->slh_first; (var); (var) = (var)->field.sle_next) + +/* + * Singly-linked List access methods. 
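+ *
+ * Minimal usage sketch (illustrative only; 'struct foo', its 'link' entry
+ * member, the 'head' variable and process() are placeholders, not part of
+ * this header):
+ *
+ *	struct foo { int value; DAV_SLIST_ENTRY(foo) link; } a;
+ *	DAV_SLIST_HEAD(foo_list, foo) head = DAV_SLIST_HEAD_INITIALIZER(head);
+ *	struct foo *it;
+ *
+ *	DAV_SLIST_INSERT_HEAD(&head, &a, link);
+ *	DAV_SLIST_FOREACH(it, &head, link)
+ *		process(it);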
+ */ +#define DAV_SLIST_EMPTY(head) ((head)->slh_first == NULL) +#define DAV_SLIST_FIRST(head) ((head)->slh_first) +#define DAV_SLIST_NEXT(elm, field) ((elm)->field.sle_next) + +#endif /* __DAOS_COMMON_QUEUE_H_ */ diff --git a/src/common/dav_v2/ravl.c b/src/common/dav_v2/ravl.c new file mode 100644 index 00000000000..9a9639b367a --- /dev/null +++ b/src/common/dav_v2/ravl.c @@ -0,0 +1,613 @@ +/* SPDX-License-Identifier: BSD-3-Clause */ +/* Copyright 2018-2023, Intel Corporation */ + +/* + * ravl.c -- implementation of a RAVL tree + * https://sidsen.azurewebsites.net//papers/ravl-trees-journal.pdf + */ + +#include +#include +#include + +#include "out.h" +#include "ravl.h" +#include "util.h" + +#define RAVL_DEFAULT_DATA_SIZE (sizeof(void *)) + +enum ravl_slot_type { + RAVL_LEFT, + RAVL_RIGHT, + + MAX_SLOTS, + + RAVL_ROOT +}; + +struct ravl_node { + struct ravl_node *parent; + struct ravl_node *slots[MAX_SLOTS]; + int32_t rank; /* cannot be greater than height of the subtree */ + int32_t pointer_based; + char data[]; +}; + +struct ravl { + struct ravl_node *root; + ravl_compare *compare; + size_t data_size; +}; + +/* + * ravl_new -- creates a new ravl tree instance + */ +struct ravl * +ravl_new_sized(ravl_compare *compare, size_t data_size) +{ + struct ravl *r; + + D_ALLOC_PTR_NZ(r); + if (r == NULL) { + D_CRIT("Malloc!\n"); + return r; + } + + r->compare = compare; + r->root = NULL; + r->data_size = data_size; + + return r; +} + +/* + * ravl_new -- creates a new tree that stores data pointers + */ +struct ravl * +ravl_new(ravl_compare *compare) +{ + return ravl_new_sized(compare, RAVL_DEFAULT_DATA_SIZE); +} + +/* + * ravl_clear_node -- (internal) recursively clears the given subtree, + * calls callback in an in-order fashion. Optionally frees the given node. 
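+ * Since the traversal is in-order, ravl_foreach() visits entries in
+ * ascending order with respect to the tree's compare function.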
+ */ +static void +ravl_foreach_node(struct ravl_node *n, ravl_cb cb, void *arg, int free_node) +{ + if (n == NULL) + return; + + ravl_foreach_node(n->slots[RAVL_LEFT], cb, arg, free_node); + if (cb) + cb((void *)n->data, arg); + ravl_foreach_node(n->slots[RAVL_RIGHT], cb, arg, free_node); + + if (free_node) + D_FREE(n); +} + +/* + * ravl_clear -- clears the entire tree, starting from the root + */ +void +ravl_clear(struct ravl *ravl) +{ + ravl_foreach_node(ravl->root, NULL, NULL, 1); + ravl->root = NULL; +} + +/* + * ravl_delete_cb -- clears and deletes the given ravl instance, calls callback + */ +void +ravl_delete_cb(struct ravl *ravl, ravl_cb cb, void *arg) +{ + ravl_foreach_node(ravl->root, cb, arg, 1); + D_FREE(ravl); +} + +/* + * ravl_delete -- clears and deletes the given ravl instance + */ +void +ravl_delete(struct ravl *ravl) +{ + ravl_delete_cb(ravl, NULL, NULL); +} + +/* + * ravl_foreach -- traverses the entire tree, calling callback for every node + */ +void +ravl_foreach(struct ravl *ravl, ravl_cb cb, void *arg) +{ + ravl_foreach_node(ravl->root, cb, arg, 0); +} + +/* + * ravl_empty -- checks whether the given tree is empty + */ +int +ravl_empty(struct ravl *ravl) +{ + return ravl->root == NULL; +} + +/* + * ravl_node_insert_constructor -- node data constructor for ravl_insert + */ +static void +ravl_node_insert_constructor(void *data, size_t data_size, const void *arg) +{ + /* suppress unused-parameter errors */ + SUPPRESS_UNUSED(data_size); + + /* copy only the 'arg' pointer */ + memcpy(data, &arg, sizeof(arg)); +} + +/* + * ravl_node_copy_constructor -- node data constructor for ravl_emplace_copy + */ +static void +ravl_node_copy_constructor(void *data, size_t data_size, const void *arg) +{ + memcpy(data, arg, data_size); +} + +/* + * ravl_new_node -- (internal) allocates and initializes a new node + */ +static struct ravl_node * +ravl_new_node(struct ravl *ravl, ravl_constr constr, const void *arg) +{ + struct ravl_node *n; + + D_ALLOC_NZ(n, (sizeof(*n) + ravl->data_size)); + if (n == NULL) { + D_CRIT("Malloc!\n"); + return n; + } + + n->parent = NULL; + n->slots[RAVL_LEFT] = NULL; + n->slots[RAVL_RIGHT] = NULL; + n->rank = 0; + n->pointer_based = constr == ravl_node_insert_constructor; + constr(n->data, ravl->data_size, arg); + + return n; +} + +/* + * ravl_slot_opposite -- (internal) returns the opposite slot type, cannot be + * called for root type + */ +static enum ravl_slot_type +ravl_slot_opposite(enum ravl_slot_type t) +{ + ASSERTne(t, RAVL_ROOT); + + return t == RAVL_LEFT ? RAVL_RIGHT : RAVL_LEFT; +} + +/* + * ravl_node_slot_type -- (internal) returns the type of the given node: + * left child, right child or root + */ +static enum ravl_slot_type +ravl_node_slot_type(struct ravl_node *n) +{ + if (n->parent == NULL) + return RAVL_ROOT; + + return n->parent->slots[RAVL_LEFT] == n ? RAVL_LEFT : RAVL_RIGHT; +} + +/* + * ravl_node_sibling -- (internal) returns the sibling of the given node, + * NULL if the node is root (has no parent) + */ +static struct ravl_node * +ravl_node_sibling(struct ravl_node *n) +{ + enum ravl_slot_type t = ravl_node_slot_type(n); + + if (t == RAVL_ROOT) + return NULL; + + return n->parent->slots[t == RAVL_LEFT ? RAVL_RIGHT : RAVL_LEFT]; +} + +/* + * ravl_node_ref -- (internal) returns the pointer to the memory location in + * which the given node resides + */ +static struct ravl_node ** +ravl_node_ref(struct ravl *ravl, struct ravl_node *n) +{ + enum ravl_slot_type t = ravl_node_slot_type(n); + + return t == RAVL_ROOT ? 
&ravl->root : &n->parent->slots[t]; +} + +/* + * ravl_rotate -- (internal) performs a rotation around a given node + * + * The node n swaps place with its parent. If n is right child, parent becomes + * the left child of n, otherwise parent becomes right child of n. + */ +static void +ravl_rotate(struct ravl *ravl, struct ravl_node *n) +{ + ASSERTne(n->parent, NULL); + struct ravl_node *p = n->parent; + struct ravl_node **pref = ravl_node_ref(ravl, p); + + enum ravl_slot_type t = ravl_node_slot_type(n); + enum ravl_slot_type t_opposite = ravl_slot_opposite(t); + + n->parent = p->parent; + p->parent = n; + *pref = n; + + p->slots[t] = n->slots[t_opposite]; + if (p->slots[t] != NULL) + p->slots[t]->parent = p; + n->slots[t_opposite] = p; +} + +/* + * ravl_node_rank -- (internal) returns the rank of the node + * + * For the purpose of balancing, NULL nodes have rank -1. + */ +static int +ravl_node_rank(struct ravl_node *n) +{ + return n == NULL ? -1 : n->rank; +} + +/* + * ravl_node_rank_difference_parent -- (internal) returns the rank different + * between parent node p and its child n + * + * Every rank difference must be positive. + * + * Either of these can be NULL. + */ +static int +ravl_node_rank_difference_parent(struct ravl_node *p, struct ravl_node *n) +{ + return ravl_node_rank(p) - ravl_node_rank(n); +} + +/* + * ravl_node_rank_differenced - (internal) returns the rank difference between + * parent and its child + * + * Can be used to check if a given node is an i-child. + */ +static int +ravl_node_rank_difference(struct ravl_node *n) +{ + return ravl_node_rank_difference_parent(n->parent, n); +} + +/* + * ravl_node_is_i_j -- (internal) checks if a given node is strictly i,j-node + */ +static int +ravl_node_is_i_j(struct ravl_node *n, int i, int j) +{ + return (ravl_node_rank_difference_parent(n, n->slots[RAVL_LEFT]) == i && + ravl_node_rank_difference_parent(n, n->slots[RAVL_RIGHT]) == j); +} + +/* + * ravl_node_is -- (internal) checks if a given node is i,j-node or j,i-node + */ +static int +ravl_node_is(struct ravl_node *n, int i, int j) +{ + return ravl_node_is_i_j(n, i, j) || ravl_node_is_i_j(n, j, i); +} + +/* + * ravl_node_promote -- promotes a given node by increasing its rank + */ +static void +ravl_node_promote(struct ravl_node *n) +{ + n->rank += 1; +} + +/* + * ravl_node_promote -- demotes a given node by increasing its rank + */ +static void +ravl_node_demote(struct ravl_node *n) +{ + ASSERT(n->rank > 0); + n->rank -= 1; +} + +/* + * ravl_balance -- balances the tree after insert + * + * This function must restore the invariant that every rank + * difference is positive. + */ +static void +ravl_balance(struct ravl *ravl, struct ravl_node *n) +{ + /* walk up the tree, promoting nodes */ + while (n->parent && ravl_node_is(n->parent, 0, 1)) { + ravl_node_promote(n->parent); + n = n->parent; + } + + /* + * Either the rank rule holds or n is a 0-child whose sibling is an + * i-child with i > 1. 
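+	 * In the latter case the code below restores the rank rule with either
+	 * a single rotation around n (plus a demotion of the old parent) or a
+	 * double rotation around n's child z, depending on z's rank difference.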
+ */ + struct ravl_node *s = ravl_node_sibling(n); + + if (!(ravl_node_rank_difference(n) == 0 && + ravl_node_rank_difference_parent(n->parent, s) > 1)) + return; + + struct ravl_node *y = n->parent; + /* if n is a left child, let z be n's right child and vice versa */ + enum ravl_slot_type t = ravl_slot_opposite(ravl_node_slot_type(n)); + struct ravl_node *z = n->slots[t]; + + if (z == NULL || ravl_node_rank_difference(z) == 2) { + ravl_rotate(ravl, n); + ravl_node_demote(y); + } else if (ravl_node_rank_difference(z) == 1) { + ravl_rotate(ravl, z); + ravl_rotate(ravl, z); + ravl_node_promote(z); + ravl_node_demote(n); + ravl_node_demote(y); + } +} + +/* + * ravl_insert -- insert data into the tree + */ +int +ravl_insert(struct ravl *ravl, const void *data) +{ + return ravl_emplace(ravl, ravl_node_insert_constructor, data); +} + +/* + * ravl_insert -- copy construct data inside of a new tree node + */ +int +ravl_emplace_copy(struct ravl *ravl, const void *data) +{ + return ravl_emplace(ravl, ravl_node_copy_constructor, data); +} + +/* + * ravl_emplace -- construct data inside of a new tree node + */ +int +ravl_emplace(struct ravl *ravl, ravl_constr constr, const void *arg) +{ + struct ravl_node *n = ravl_new_node(ravl, constr, arg); + + if (n == NULL) + return -1; + + /* walk down the tree and insert the new node into a missing slot */ + struct ravl_node **dstp = &ravl->root; + struct ravl_node *dst = NULL; + + while (*dstp != NULL) { + dst = (*dstp); + int cmp_result = ravl->compare(ravl_data(n), ravl_data(dst)); + + if (cmp_result == 0) + goto error_duplicate; + + dstp = &dst->slots[cmp_result > 0]; + } + n->parent = dst; + *dstp = n; + + ravl_balance(ravl, n); + + return 0; + +error_duplicate: + errno = EEXIST; + D_FREE(n); + return -1; +} + +/* + * ravl_node_type_most -- (internal) returns left-most or right-most node in + * the subtree + */ +static struct ravl_node * +ravl_node_type_most(struct ravl_node *n, enum ravl_slot_type t) +{ + while (n->slots[t] != NULL) + n = n->slots[t]; + + return n; +} + +/* + * ravl_node_cessor -- (internal) returns the successor or predecessor of the + * node + */ +static struct ravl_node * +ravl_node_cessor(struct ravl_node *n, enum ravl_slot_type t) +{ + /* + * If t child is present, we are looking for t-opposite-most node + * in t child subtree + */ + if (n->slots[t]) + return ravl_node_type_most(n->slots[t], ravl_slot_opposite(t)); + + /* otherwise get the first parent on the t path */ + while (n->parent != NULL && n == n->parent->slots[t]) + n = n->parent; + + return n->parent; +} + +/* + * ravl_node_successor -- (internal) returns node's successor + * + * It's the first node larger than n. + */ +static struct ravl_node * +ravl_node_successor(struct ravl_node *n) +{ + return ravl_node_cessor(n, RAVL_RIGHT); +} + +/* + * ravl_node_successor -- (internal) returns node's successor + * + * It's the first node smaller than n. + */ +static struct ravl_node * +ravl_node_predecessor(struct ravl_node *n) +{ + return ravl_node_cessor(n, RAVL_LEFT); +} + +/* + * ravl_predicate_holds -- (internal) verifies the given predicate for + * the current node in the search path + * + * If the predicate holds for the given node or a node that can be directly + * derived from it, returns 1. Otherwise returns 0. 
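+ *
+ * For example, with RAVL_PREDICATE_GREATER an exact match resolves to the
+ * node's successor, while RAVL_PREDICATE_GREATER_EQUAL returns the matching
+ * node itself; a strictly greater node is only remembered as the best
+ * candidate so far and the search continues toward the key.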
+ */ +static int +ravl_predicate_holds(int result, struct ravl_node **ret, + struct ravl_node *n, enum ravl_predicate flags) +{ + if (flags & RAVL_PREDICATE_EQUAL) { + if (result == 0) { + *ret = n; + return 1; + } + } + if (flags & RAVL_PREDICATE_GREATER) { + if (result < 0) { /* data < n->data */ + *ret = n; + return 0; + } else if (result == 0) { + *ret = ravl_node_successor(n); + return 1; + } + } + if (flags & RAVL_PREDICATE_LESS) { + if (result > 0) { /* data > n->data */ + *ret = n; + return 0; + } else if (result == 0) { + *ret = ravl_node_predecessor(n); + return 1; + } + } + + return 0; +} + +/* + * ravl_find -- searches for the node in the tree + */ +struct ravl_node * +ravl_find(struct ravl *ravl, const void *data, enum ravl_predicate flags) +{ + struct ravl_node *r = NULL; + struct ravl_node *n = ravl->root; + + while (n) { + int result = ravl->compare(data, ravl_data(n)); + + if (ravl_predicate_holds(result, &r, n, flags)) + return r; + + n = n->slots[result > 0]; + } + + return r; +} + +/* + * ravl_remove -- removes the given node from the tree + */ +void +ravl_remove(struct ravl *ravl, struct ravl_node *n) +{ + if (n->slots[RAVL_LEFT] != NULL && n->slots[RAVL_RIGHT] != NULL) { + /* if both children are present, remove the successor instead */ + struct ravl_node *s = ravl_node_successor(n); + + memcpy(n->data, s->data, ravl->data_size); + ravl_remove(ravl, s); + } else { + /* swap n with the child that may exist */ + struct ravl_node *r = n->slots[RAVL_LEFT] ? + n->slots[RAVL_LEFT] : n->slots[RAVL_RIGHT]; + + if (r != NULL) + r->parent = n->parent; + + *ravl_node_ref(ravl, n) = r; + D_FREE(n); + } +} + +/* + * ravl_data -- returns the data contained within the node + */ +void * +ravl_data(struct ravl_node *node) +{ + if (node->pointer_based) { + void *data; + + memcpy(&data, node->data, sizeof(void *)); + return data; + } else { + return (void *)node->data; + } +} + +/* + * ravl_first -- returns first (left-most) node in the tree + */ +struct ravl_node * +ravl_first(struct ravl *ravl) +{ + if (ravl->root) + return ravl_node_type_most(ravl->root, RAVL_LEFT); + + return NULL; +} + +/* + * ravl_last -- returns last (right-most) node in the tree + */ +struct ravl_node * +ravl_last(struct ravl *ravl) +{ + if (ravl->root) + return ravl_node_type_most(ravl->root, RAVL_RIGHT); + + return NULL; +} diff --git a/src/common/dav_v2/ravl.h b/src/common/dav_v2/ravl.h new file mode 100644 index 00000000000..d1d69ec91b6 --- /dev/null +++ b/src/common/dav_v2/ravl.h @@ -0,0 +1,48 @@ +/* SPDX-License-Identifier: BSD-3-Clause */ +/* Copyright 2018-2023, Intel Corporation */ + +/* + * ravl.h -- internal definitions for ravl tree + */ + +#ifndef __DAOS_COMMON_RAVL_H +#define __DAOS_COMMON_RAVL_H 1 + +#include + +struct ravl; +struct ravl_node; + +enum ravl_predicate { + RAVL_PREDICATE_EQUAL = 1 << 0, + RAVL_PREDICATE_GREATER = 1 << 1, + RAVL_PREDICATE_LESS = 1 << 2, + RAVL_PREDICATE_LESS_EQUAL = + RAVL_PREDICATE_EQUAL | RAVL_PREDICATE_LESS, + RAVL_PREDICATE_GREATER_EQUAL = + RAVL_PREDICATE_EQUAL | RAVL_PREDICATE_GREATER, +}; + +typedef int ravl_compare(const void *lhs, const void *rhs); +typedef void ravl_cb(void *data, void *arg); +typedef void ravl_constr(void *data, size_t data_size, const void *arg); + +struct ravl *ravl_new(ravl_compare *compare); +struct ravl *ravl_new_sized(ravl_compare *compare, size_t data_size); +void ravl_delete(struct ravl *ravl); +void ravl_delete_cb(struct ravl *ravl, ravl_cb cb, void *arg); +void ravl_foreach(struct ravl *ravl, ravl_cb cb, void *arg); +int 
ravl_empty(struct ravl *ravl); +void ravl_clear(struct ravl *ravl); +int ravl_insert(struct ravl *ravl, const void *data); +int ravl_emplace(struct ravl *ravl, ravl_constr constr, const void *arg); +int ravl_emplace_copy(struct ravl *ravl, const void *data); + +struct ravl_node *ravl_find(struct ravl *ravl, const void *data, + enum ravl_predicate predicate_flags); +struct ravl_node *ravl_first(struct ravl *ravl); +struct ravl_node *ravl_last(struct ravl *ravl); +void *ravl_data(struct ravl_node *node); +void ravl_remove(struct ravl *ravl, struct ravl_node *node); + +#endif /* __DAOS_COMMON_RAVL_H */ diff --git a/src/common/dav_v2/ravl_interval.c b/src/common/dav_v2/ravl_interval.c new file mode 100644 index 00000000000..e493b031bba --- /dev/null +++ b/src/common/dav_v2/ravl_interval.c @@ -0,0 +1,344 @@ +/* SPDX-License-Identifier: BSD-3-Clause */ +/* Copyright 2020-2023, Intel Corporation */ + +/* + * ravl_interval.c -- ravl_interval implementation + */ + +#include + +#include "ravl_interval.h" +#include "sys_util.h" +#include "ravl.h" + +/* + * ravl_interval - structure representing two points + * on the number line + */ +struct ravl_interval { + struct ravl *tree; + ravl_interval_min *get_min; + ravl_interval_max *get_max; +}; + +/* + * ravl_interval_node - structure holding min, max functions and address + */ +struct ravl_interval_node { + void *addr; + ravl_interval_min *get_min; + ravl_interval_max *get_max; + bool overlap; +}; + +/* + * ravl_interval_compare -- compare intervals by its boundaries + */ +static int +ravl_interval_compare(const void *lhs, const void *rhs) +{ + const struct ravl_interval_node *left = lhs; + const struct ravl_interval_node *right = rhs; + + /* + * when searching, comparing should return the + * earliest overlapped record + */ + if (left->overlap) { + if (left->get_min(left->addr) >= right->get_max(right->addr)) + return 1; + if (left->get_min(left->addr) == right->get_min(right->addr)) + return 0; + return -1; + } + + /* when inserting, comparing shouldn't allow overlapping intervals */ + if (left->get_min(left->addr) >= right->get_max(right->addr)) + return 1; + if (left->get_max(left->addr) <= right->get_min(right->addr)) + return -1; + return 0; +} + +/* + * ravl_interval_delete - finalize the ravl interval module + */ +void +ravl_interval_delete(struct ravl_interval *ri) +{ + ravl_delete(ri->tree); + ri->tree = NULL; + D_FREE(ri); +} + +/* + * ravl_interval_delete_cb - finalize the ravl interval module with entries + * and execute provided callback function for each entry. 
+ */ +void +ravl_interval_delete_cb(struct ravl_interval *ri, ravl_cb cb, void *arg) +{ + ravl_delete_cb(ri->tree, cb, arg); + ri->tree = NULL; + D_FREE(ri); +} + +/* + * ravl_interval_new -- initialize the ravl interval module + */ +struct ravl_interval * +ravl_interval_new(ravl_interval_min *get_min, ravl_interval_max *get_max) +{ + struct ravl_interval *interval; + + D_ALLOC_PTR_NZ(interval); + if (!interval) + return NULL; + + interval->tree = ravl_new_sized(ravl_interval_compare, + sizeof(struct ravl_interval_node)); + if (!(interval->tree)) + goto free_alloc; + + interval->get_min = get_min; + interval->get_max = get_max; + + return interval; + +free_alloc: + D_FREE(interval); + return NULL; +} + +/* + * ravl_interval_insert -- insert interval entry into the tree + */ +int +ravl_interval_insert(struct ravl_interval *ri, void *addr) +{ + struct ravl_interval_node rin; + + rin.addr = addr; + rin.get_min = ri->get_min; + rin.get_max = ri->get_max; + rin.overlap = false; + + int ret = ravl_emplace_copy(ri->tree, &rin); + + if (ret && errno) + return -errno; + + return ret; +} + +/* + * ravl_interval_remove -- remove interval entry from the tree + */ +int +ravl_interval_remove(struct ravl_interval *ri, struct ravl_interval_node *rin) +{ + struct ravl_node *node = ravl_find(ri->tree, rin, + RAVL_PREDICATE_EQUAL); + if (!node) + return -ENOENT; + + ravl_remove(ri->tree, node); + + return 0; +} + +/* + * ravl_interval_find_prior -- find overlapping interval starting prior to + * the current one + */ +static struct ravl_interval_node * +ravl_interval_find_prior(struct ravl *tree, struct ravl_interval_node *rin) +{ + struct ravl_node *node; + struct ravl_interval_node *cur; + + node = ravl_find(tree, rin, RAVL_PREDICATE_LESS); + if (!node) + return NULL; + + cur = ravl_data(node); + /* + * If the end of the found interval is below the searched boundary, then + * those intervals are not overlapping. 
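+	 * (Endpoints that merely touch do not count as overlap - the
+	 * comparisons treat intervals as half-open [min, max) ranges.)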
+ */ + if (cur->get_max(cur->addr) <= rin->get_min(rin->addr)) + return NULL; + + return cur; +} + +/* + * ravl_interval_find_eq -- find overlapping interval starting neither prior or + * lather than the current one + */ +static struct ravl_interval_node * +ravl_interval_find_eq(struct ravl *tree, struct ravl_interval_node *rin) +{ + struct ravl_node *node; + + node = ravl_find(tree, rin, RAVL_PREDICATE_EQUAL); + if (!node) + return NULL; + + return ravl_data(node); +} + +/* + * ravl_interval_find_later -- find overlapping interval starting later than + * the current one + */ +static struct ravl_interval_node * +ravl_interval_find_later(struct ravl *tree, struct ravl_interval_node *rin) +{ + struct ravl_node *node; + struct ravl_interval_node *cur; + + node = ravl_find(tree, rin, RAVL_PREDICATE_GREATER); + if (!node) + return NULL; + + cur = ravl_data(node); + + /* + * If the beginning of the found interval is above the end of + * the searched range, then those interval are not overlapping + */ + if (cur->get_min(cur->addr) >= rin->get_max(rin->addr)) + return NULL; + + return cur; +} + +/* + * ravl_interval_find_equal -- find the interval with exact (min, max) range + */ +struct ravl_interval_node * +ravl_interval_find_equal(struct ravl_interval *ri, void *addr) +{ + struct ravl_interval_node range; + + range.addr = addr; + range.get_min = ri->get_min; + range.get_max = ri->get_max; + range.overlap = true; + + struct ravl_node *node; + + node = ravl_find(ri->tree, &range, RAVL_PREDICATE_EQUAL); + if (!node) + return NULL; + + return ravl_data(node); +} + +/* + * ravl_interval_find -- find the earliest interval within (min, max) range + */ +struct ravl_interval_node * +ravl_interval_find(struct ravl_interval *ri, void *addr) +{ + struct ravl_interval_node range; + + range.addr = addr; + range.get_min = ri->get_min; + range.get_max = ri->get_max; + range.overlap = true; + + struct ravl_interval_node *cur; + + cur = ravl_interval_find_prior(ri->tree, &range); + if (!cur) + cur = ravl_interval_find_eq(ri->tree, &range); + if (!cur) + cur = ravl_interval_find_later(ri->tree, &range); + + return cur; +} + +/* + * ravl_interval_data -- returns the data contained within an interval node + */ +void * +ravl_interval_data(struct ravl_interval_node *rin) +{ + return (void *)rin->addr; +} + +/* + * ravl_interval_find_first -- returns first interval in the tree + */ +struct ravl_interval_node * +ravl_interval_find_first(struct ravl_interval *ri) +{ + struct ravl_node *first; + + first = ravl_first(ri->tree); + if (first) + return ravl_data(first); + + return NULL; +} + +/* + * ravl_interval_find_last -- returns last interval in the tree + */ +struct ravl_interval_node * +ravl_interval_find_last(struct ravl_interval *ri) +{ + struct ravl_node *last; + + last = ravl_last(ri->tree); + if (last) + return ravl_data(last); + + return NULL; +} + +/* + * ravl_interval_find_next -- returns interval succeeding the one provided + */ +struct ravl_interval_node * +ravl_interval_find_next(struct ravl_interval *ri, void *addr) +{ + struct ravl_interval_node range; + + range.addr = addr; + range.get_min = ri->get_min; + range.get_max = ri->get_max; + range.overlap = true; + + struct ravl_node *next = NULL; + + next = ravl_find(ri->tree, &range, RAVL_PREDICATE_GREATER); + if (next) + return ravl_data(next); + + return NULL; +} + +/* + * ravl_interval_find_prev -- returns interval preceding the one provided + */ +struct ravl_interval_node * +ravl_interval_find_prev(struct ravl_interval *ri, void *addr) +{ + struct 
ravl_interval_node range; + + range.addr = addr; + range.get_min = ri->get_min; + range.get_max = ri->get_max; + range.overlap = true; + + struct ravl_node *prev = NULL; + + prev = ravl_find(ri->tree, &range, RAVL_PREDICATE_LESS); + if (prev) + return ravl_data(prev); + + return NULL; +} diff --git a/src/common/dav_v2/ravl_interval.h b/src/common/dav_v2/ravl_interval.h new file mode 100644 index 00000000000..6b106fc4bfe --- /dev/null +++ b/src/common/dav_v2/ravl_interval.h @@ -0,0 +1,37 @@ +/* SPDX-License-Identifier: BSD-3-Clause */ +/* Copyright 2020-2023, Intel Corporation */ + +/* + * ravl_interval.h -- internal definitions for ravl_interval + */ + +#ifndef RAVL_INTERVAL_H +#define RAVL_INTERVAL_H + +#include "ravl.h" + +struct ravl_interval; +struct ravl_interval_node; + +typedef size_t ravl_interval_min(void *addr); +typedef size_t ravl_interval_max(void *addr); + +struct ravl_interval *ravl_interval_new(ravl_interval_min *min, + ravl_interval_min *max); +void ravl_interval_delete(struct ravl_interval *ri); +void ravl_interval_delete_cb(struct ravl_interval *ri, ravl_cb cb, void *arg); +int ravl_interval_insert(struct ravl_interval *ri, void *addr); +int ravl_interval_remove(struct ravl_interval *ri, + struct ravl_interval_node *rin); +struct ravl_interval_node *ravl_interval_find_equal(struct ravl_interval *ri, + void *addr); +struct ravl_interval_node *ravl_interval_find(struct ravl_interval *ri, + void *addr); +struct ravl_interval_node *ravl_interval_find_first(struct ravl_interval *ri); +struct ravl_interval_node *ravl_interval_find_last(struct ravl_interval *ri); +struct ravl_interval_node *ravl_interval_find_next(struct ravl_interval *ri, + void *addr); +struct ravl_interval_node *ravl_interval_find_prev(struct ravl_interval *ri, + void *addr); +void *ravl_interval_data(struct ravl_interval_node *rin); +#endif diff --git a/src/common/dav_v2/recycler.c b/src/common/dav_v2/recycler.c new file mode 100644 index 00000000000..5680735b341 --- /dev/null +++ b/src/common/dav_v2/recycler.c @@ -0,0 +1,323 @@ +/* SPDX-License-Identifier: BSD-3-Clause */ +/* Copyright 2016-2024, Intel Corporation */ + +/* + * recycler.c -- implementation of run recycler + */ + +#include "heap.h" +#include "recycler.h" +#include "vec.h" +#include "out.h" +#include "util.h" +#include "sys_util.h" +#include "ravl.h" +#include "valgrind_internal.h" + +#define THRESHOLD_MUL 4 + +/* + * recycler_element_cmp -- compares two recycler elements + */ +static int +recycler_element_cmp(const void *lhs, const void *rhs) +{ + const struct recycler_element *l = lhs; + const struct recycler_element *r = rhs; + + int64_t diff = (int64_t)l->max_free_block - (int64_t)r->max_free_block; + + if (diff != 0) + return diff > 0 ? 1 : -1; + + diff = (int64_t)l->free_space - (int64_t)r->free_space; + if (diff != 0) + return diff > 0 ? 1 : -1; + + diff = (int64_t)l->zone_id - (int64_t)r->zone_id; + if (diff != 0) + return diff > 0 ? 1 : -1; + + diff = (int64_t)l->chunk_id - (int64_t)r->chunk_id; + if (diff != 0) + return diff > 0 ? 1 : -1; + + return 0; +} + +struct recycler { + struct ravl *runs; + struct palloc_heap *heap; + struct mbrt *mb; + + /* + * How many unaccounted units there *might* be inside of the memory + * blocks stored in the recycler. + * The value is not meant to be accurate, but rather a rough measure on + * how often should the memory block scores be recalculated. + * + * Per-chunk unaccounted units are shared for all zones, which might + * lead to some unnecessary recalculations. 
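+ *
+ * As a rough guide, recycler_recalc() only rescans once the total
+ * crosses THRESHOLD_MUL * nallocs unaccounted units, unless it is
+ * forced.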
+ */ + size_t unaccounted_units[MAX_CHUNK]; + size_t unaccounted_total; + size_t nallocs; + + VEC(, struct recycler_element) recalc; + + pthread_mutex_t lock; +}; + +/* + * recycler_new -- creates new recycler instance + */ +struct recycler * +recycler_new(struct palloc_heap *heap, size_t nallocs, struct mbrt *mb) +{ + struct recycler *r; + + D_ALLOC_PTR_NZ(r); + if (r == NULL) + goto error_alloc_recycler; + + r->runs = ravl_new_sized(recycler_element_cmp, + sizeof(struct recycler_element)); + if (r->runs == NULL) + goto error_alloc_tree; + + r->heap = heap; + r->nallocs = nallocs; + r->mb = mb; + r->unaccounted_total = 0; + memset(&r->unaccounted_units, 0, sizeof(r->unaccounted_units)); + + VEC_INIT(&r->recalc); + + util_mutex_init(&r->lock); + + return r; + +error_alloc_tree: + D_FREE(r); +error_alloc_recycler: + return NULL; +} + +/* + * recycler_delete -- deletes recycler instance + */ +void +recycler_delete(struct recycler *r) +{ + VEC_DELETE(&r->recalc); + + util_mutex_destroy(&r->lock); + ravl_delete(r->runs); + D_FREE(r); +} + +/* + * recycler_element_new -- calculates how many free bytes does a run have and + * what's the largest request that the run can handle, returns that as + * recycler element struct + */ +struct recycler_element +recycler_element_new(struct palloc_heap *heap, const struct memory_block *m) +{ + /* suppress unused-parameter errors */ + SUPPRESS_UNUSED(heap); + + /* + * Counting of the clear bits can race with a concurrent deallocation + * that operates on the same run. This race is benign and has absolutely + * no effect on the correctness of this algorithm. Ideally, we would + * avoid grabbing the lock, but helgrind gets very confused if we + * try to disable reporting for this function. + */ + pthread_mutex_t *lock = m->m_ops->get_lock(m); + + util_mutex_lock(lock); + + struct recycler_element e = { + .free_space = 0, + .max_free_block = 0, + .chunk_id = m->chunk_id, + .zone_id = m->zone_id, + }; + m->m_ops->calc_free(m, &e.free_space, &e.max_free_block); + + util_mutex_unlock(lock); + + return e; +} + +/* + * recycler_put -- inserts new run into the recycler + */ +int +recycler_put(struct recycler *r, struct recycler_element element) +{ + int ret = 0; + + util_mutex_lock(&r->lock); + + ret = ravl_emplace_copy(r->runs, &element); + + util_mutex_unlock(&r->lock); + + return ret; +} + +/* + * recycler_get -- retrieves a chunk from the recycler + */ +int +recycler_get(struct recycler *r, struct memory_block *m) +{ + int ret = 0; + + util_mutex_lock(&r->lock); + + struct recycler_element e = { .max_free_block = m->size_idx, 0, 0, 0}; + struct ravl_node *n = ravl_find(r->runs, &e, + RAVL_PREDICATE_GREATER_EQUAL); + if (n == NULL) { + ret = ENOMEM; + goto out; + } + + struct recycler_element *ne = ravl_data(n); + + m->chunk_id = ne->chunk_id; + m->zone_id = ne->zone_id; + + ravl_remove(r->runs, n); + + struct chunk_header *hdr = heap_get_chunk_hdr(r->heap, m); + + m->size_idx = hdr->size_idx; + + memblock_rebuild_state(r->heap, m); + +out: + util_mutex_unlock(&r->lock); + + return ret; +} + +/* + * recycler_recalc -- recalculates the scores of runs in the recycler to match + * the updated persistent state + */ +struct empty_runs +recycler_recalc(struct recycler *r, int force) +{ + struct empty_runs runs; + + VEC_INIT(&runs); + + uint64_t units = r->unaccounted_total; + + uint64_t recalc_threshold = THRESHOLD_MUL * r->nallocs; + + if (!force && units < recalc_threshold) + return runs; + + if (util_mutex_trylock(&r->lock) != 0) + return runs; + + /* If the search is 
forced, recalculate everything */ + uint64_t search_limit = force ? UINT64_MAX : units; + + uint64_t found_units = 0; + struct memory_block nm = MEMORY_BLOCK_NONE; + struct ravl_node *n; + struct recycler_element next = {0, 0, 0, 0}; + enum ravl_predicate p = RAVL_PREDICATE_GREATER_EQUAL; + + do { + n = ravl_find(r->runs, &next, p); + if (n == NULL) + break; + + p = RAVL_PREDICATE_GREATER; + + struct recycler_element *ne = ravl_data(n); + + next = *ne; + + uint64_t chunk_units = r->unaccounted_units[ne->chunk_id]; + + if (!force && chunk_units == 0) + continue; + + uint32_t existing_free_space = ne->free_space; + + nm.chunk_id = ne->chunk_id; + nm.zone_id = ne->zone_id; + memblock_rebuild_state(r->heap, &nm); + + struct recycler_element e = recycler_element_new(r->heap, &nm); + + ASSERT(e.free_space >= existing_free_space); + uint64_t free_space_diff = e.free_space - existing_free_space; + + found_units += free_space_diff; + + if (free_space_diff == 0) + continue; + + /* + * Decrease the per chunk_id counter by the number of nallocs + * found, increased by the blocks potentially freed in the + * active memory block. Cap the sub value to prevent overflow. + */ + util_fetch_and_sub64(&r->unaccounted_units[nm.chunk_id], + MIN(chunk_units, free_space_diff + r->nallocs)); + + ravl_remove(r->runs, n); + + if (e.free_space == r->nallocs) { + memblock_rebuild_state(r->heap, &nm); + if (VEC_PUSH_BACK(&runs, nm) != 0) + ASSERT(0); /* XXX: fix after refactoring */ + } else { + VEC_PUSH_BACK(&r->recalc, e); + } + } while (found_units < search_limit); + + struct recycler_element *e; + + VEC_FOREACH_BY_PTR(e, &r->recalc) { + ravl_emplace_copy(r->runs, e); + } + + VEC_CLEAR(&r->recalc); + + util_mutex_unlock(&r->lock); + + util_fetch_and_sub64(&r->unaccounted_total, units); + + return runs; +} + +/* + * recycler_inc_unaccounted -- increases the number of unaccounted units in the + * recycler + */ +void +recycler_inc_unaccounted(struct recycler *r, const struct memory_block *m) +{ + util_fetch_and_add64(&r->unaccounted_total, m->size_idx); + util_fetch_and_add64(&r->unaccounted_units[m->chunk_id], + m->size_idx); +} + +/* + * Return the Memory Bucket runtime associated with the recycler. + */ +struct mbrt * +recycler_get_mbrt(struct recycler *r) +{ + return r->mb; +} diff --git a/src/common/dav_v2/recycler.h b/src/common/dav_v2/recycler.h new file mode 100644 index 00000000000..769ce4a4c4a --- /dev/null +++ b/src/common/dav_v2/recycler.h @@ -0,0 +1,46 @@ +/* SPDX-License-Identifier: BSD-3-Clause */ +/* Copyright 2016-2024, Intel Corporation */ + +/* + * recycler.h -- internal definitions of run recycler + * + * This is a container that stores runs that are currently not used by any of + * the buckets. 
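+ *
+ * Rough usage sketch (illustrative only; use_run() and needed_blocks
+ * are stand-ins for caller code, and heap/memory-block setup is
+ * omitted):
+ *
+ *	struct recycler *r = recycler_new(heap, nallocs, mb);
+ *
+ *	recycler_put(r, recycler_element_new(heap, &m));
+ *	...
+ *	m.size_idx = needed_blocks;
+ *	if (recycler_get(r, &m) == 0)
+ *		use_run(&m);
+ *	recycler_delete(r);
+ *
+ * recycler_get() returns ENOMEM when no stored run is large enough.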
+ */ + +#ifndef __DAOS_COMMON_RECYCLER_H +#define __DAOS_COMMON_RECYCLER_H 1 + +#include "memblock.h" +#include "vec.h" + +struct recycler; +VEC(empty_runs, struct memory_block); + +struct recycler_element { + uint32_t max_free_block; + uint32_t free_space; + + uint32_t chunk_id; + uint32_t zone_id; +}; + +struct recycler * +recycler_new(struct palloc_heap *layout, size_t nallocs, struct mbrt *mb); +void recycler_delete(struct recycler *r); +struct recycler_element recycler_element_new(struct palloc_heap *heap, + const struct memory_block *m); + +int recycler_put(struct recycler *r, struct recycler_element element); + +int recycler_get(struct recycler *r, struct memory_block *m); + +struct empty_runs recycler_recalc(struct recycler *r, int force); + +void recycler_inc_unaccounted(struct recycler *r, + const struct memory_block *m); + +struct mbrt * +recycler_get_mbrt(struct recycler *r); + +#endif /* __DAOS_COMMON_RECYCLER_H */ diff --git a/src/common/dav_v2/stats.c b/src/common/dav_v2/stats.c new file mode 100644 index 00000000000..173b8bb1bab --- /dev/null +++ b/src/common/dav_v2/stats.c @@ -0,0 +1,83 @@ +/* SPDX-License-Identifier: BSD-3-Clause */ +/* Copyright 2017-2024, Intel Corporation */ + +/* + * stats.c -- implementation of statistics + */ + +#include + +#include "dav_internal.h" +#include "obj.h" +#include "stats.h" +#include "heap.h" + +/* + * stats_new -- allocates and initializes statistics instance + */ +struct stats * +stats_new(dav_obj_t *pop) +{ + struct stats *s; + + D_ALLOC_PTR_NZ(s); + if (s == NULL) { + D_CRIT("Malloc\n"); + return NULL; + } + + D_ALLOC_PTR(s->transient); + if (s->transient == NULL) + goto error_transient_alloc; + + return s; + +error_transient_alloc: + D_FREE(s); + return NULL; +} + +/* + * stats_delete -- deletes statistics instance + */ +void +stats_delete(dav_obj_t *pop, struct stats *s) +{ + D_FREE(s->transient); + D_FREE(s); +} + +/* + * stats_persist -- save the persistent statistics to wal + */ +void +stats_persist(dav_obj_t *pop, struct stats *s) +{ + if (s->transient->heap_prev_pval != + s->persistent->heap_curr_allocated) { + mo_wal_persist(&pop->p_ops, s->persistent, + sizeof(struct stats_persistent)); + s->transient->heap_prev_pval = + s->persistent->heap_curr_allocated; + } +} + +DAV_FUNC_EXPORT int +dav_get_heap_stats_v2(dav_obj_t *pop, struct dav_heap_stats *st) +{ + if ((pop == NULL) || (st == NULL)) { + errno = EINVAL; + return -1; + } + + st->curr_allocated = pop->do_stats->persistent->heap_curr_allocated; + st->run_allocated = pop->do_stats->transient->heap_run_allocated; + st->run_active = pop->do_stats->transient->heap_run_active; + return 0; +} + +DAV_FUNC_EXPORT int +dav_get_heap_mb_stats_v2(dav_obj_t *pop, uint32_t mb_id, struct dav_heap_mb_stats *st) +{ + return heap_mbrt_getmb_usage(pop->do_heap, mb_id, &st->dhms_allocated, &st->dhms_maxsz); +} diff --git a/src/common/dav_v2/stats.h b/src/common/dav_v2/stats.h new file mode 100644 index 00000000000..a295563ec5f --- /dev/null +++ b/src/common/dav_v2/stats.h @@ -0,0 +1,61 @@ +/* SPDX-License-Identifier: BSD-3-Clause */ +/* Copyright 2017-2023, Intel Corporation */ + +/* + * stats.h -- definitions of statistics + */ + +#ifndef __DAOS_COMMON_STATS_H +#define __DAOS_COMMON_STATS_H 1 + +struct stats_transient { + uint64_t heap_run_allocated; + uint64_t heap_run_active; + uint64_t heap_prev_pval; /* previous persisted value of curr allocated */ +}; + +struct stats_persistent { + uint64_t heap_curr_allocated; +}; + +struct stats { + struct stats_transient *transient; + struct 
stats_persistent *persistent; +}; + +#define STATS_INC(stats, type, name, value) \ + STATS_INC_##type(stats, name, value) + +#define STATS_INC_transient(stats, name, value)\ + util_fetch_and_add64((&(stats)->transient->name), (value)) + +#define STATS_INC_persistent(stats, name, value)\ + util_fetch_and_add64((&(stats)->persistent->name), (value)) + +#define STATS_SUB(stats, type, name, value)\ + STATS_SUB_##type(stats, name, value) + +#define STATS_SUB_transient(stats, name, value)\ + util_fetch_and_sub64((&(stats)->transient->name), (value)) + +#define STATS_SUB_persistent(stats, name, value)\ + util_fetch_and_sub64((&(stats)->persistent->name), (value)) + +#define STATS_SET(stats, type, name, value)\ + STATS_SET_##type(stats, name, value) + +#define STATS_SET_transient(stats, name, value)\ + util_atomic_store_explicit64((&(stats)->transient->name),\ + (value), memory_order_release)\ + +#define STATS_SET_persistent(stats, name, value)\ + util_atomic_store_explicit64((&(stats)->persistent->name),\ + (value), memory_order_release)\ + +struct dav_obj; + +struct stats *stats_new(struct dav_obj *pop); +void stats_delete(struct dav_obj *pop, struct stats *stats); +void stats_persist(struct dav_obj *pop, struct stats *s); + +#endif /* __DAOS_COMMON_STATS_H */ diff --git a/src/common/dav_v2/sys_util.h b/src/common/dav_v2/sys_util.h new file mode 100644 index 00000000000..3730f60c0ce --- /dev/null +++ b/src/common/dav_v2/sys_util.h @@ -0,0 +1,83 @@ +/* SPDX-License-Identifier: BSD-3-Clause */ +/* Copyright 2016-2023, Intel Corporation */ + +/* + * sys_util.h -- internal utility wrappers around system functions + */ + +#ifndef __DAOS_COMMON_SYS_UTIL_H +#define __DAOS_COMMON_SYS_UTIL_H 1 + +#include + +#include +#include "out.h" + +/* + * util_mutex_init -- os_mutex_init variant that never fails from + * caller perspective. If os_mutex_init failed, this function aborts + * the program. + */ +static inline void +util_mutex_init(pthread_mutex_t *m) +{ + int tmp = D_MUTEX_INIT(m, NULL); + + D_ASSERTF(tmp == 0, "!os_mutex_init"); +} + +/* + * util_mutex_destroy -- os_mutex_destroy variant that never fails from + * caller perspective. If os_mutex_destroy failed, this function aborts + * the program. + */ +static inline void +util_mutex_destroy(pthread_mutex_t *m) +{ + int tmp = D_MUTEX_DESTROY(m); + + D_ASSERTF(tmp == 0, "!os_mutex_destroy"); +} + +/* + * util_mutex_lock -- os_mutex_lock variant that never fails from + * caller perspective. If os_mutex_lock failed, this function aborts + * the program. + */ +static inline void +util_mutex_lock(pthread_mutex_t *m) +{ + int tmp = D_MUTEX_LOCK(m); + + D_ASSERTF(tmp == 0, "!os_mutex_destroy"); +} + +/* + * util_mutex_trylock -- os_mutex_trylock variant that never fails from + * caller perspective (other than EBUSY). If util_mutex_trylock failed, this + * function aborts the program. + * Returns 0 if locked successfully, otherwise returns EBUSY. + */ +static inline int +util_mutex_trylock(pthread_mutex_t *m) +{ + int tmp = D_MUTEX_TRYLOCK(m); + + D_ASSERTF((!tmp || (tmp == -DER_BUSY)), "!os_mutex_trylock"); + return tmp?EBUSY:0; +} + +/* + * util_mutex_unlock -- os_mutex_unlock variant that never fails from + * caller perspective. If os_mutex_unlock failed, this function aborts + * the program. 
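+ *
+ * With these wrappers the callers skip error handling entirely, e.g.:
+ *
+ *	util_mutex_lock(&r->lock);
+ *	... critical section ...
+ *	util_mutex_unlock(&r->lock);
+ *
+ * util_mutex_trylock() is the one exception: it returns EBUSY when the
+ * lock is already held (see its use in recycler_recalc()).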
+ */ +static inline void +util_mutex_unlock(pthread_mutex_t *m) +{ + int tmp = D_MUTEX_UNLOCK(m); + + D_ASSERTF(tmp == 0, "!os_mutex_unlock"); +} + +#endif /* __DAOS_COMMON_SYS_UTIL_H */ diff --git a/src/common/dav_v2/tx.c b/src/common/dav_v2/tx.c new file mode 100644 index 00000000000..d50c3f52299 --- /dev/null +++ b/src/common/dav_v2/tx.c @@ -0,0 +1,1895 @@ +/* SPDX-License-Identifier: BSD-3-Clause */ +/* Copyright 2015-2024, Intel Corporation */ + +/* + * tx.c -- transactions implementation + */ + +#include +#include +#include + +#include "queue.h" +#include "ravl.h" +#include "obj.h" +#include "out.h" +#include "tx.h" +#include "valgrind_internal.h" +#include "memops.h" +#include "dav_internal.h" + +struct tx_data { + DAV_SLIST_ENTRY(tx_data) tx_entry; + jmp_buf env; + enum dav_tx_failure_behavior failure_behavior; +}; + +struct tx { + dav_obj_t *pop; + enum dav_tx_stage stage; + int last_errnum; + + DAV_SLIST_HEAD(txd, tx_data) tx_entries; + + struct ravl *ranges; + + VEC(, struct dav_action) actions; + + dav_tx_callback stage_callback; + void *stage_callback_arg; + + int first_snapshot; +}; + +/* + * get_tx -- returns current transaction + * + * This function should be used only in high-level functions. + */ +static struct tx * +get_tx() +{ + static __thread struct tx tx; + + return &tx; +} + +struct tx_alloc_args { + uint64_t flags; + const void *copy_ptr; + size_t copy_size; +}; + +#define ALLOC_ARGS(flags)\ +(struct tx_alloc_args){flags, NULL, 0} + +struct tx_range_def { + uint64_t offset; + uint64_t size; + uint64_t flags; +}; + +/* + * tx_range_def_cmp -- compares two snapshot ranges + */ +static int +tx_range_def_cmp(const void *lhs, const void *rhs) +{ + const struct tx_range_def *l = lhs; + const struct tx_range_def *r = rhs; + + if (l->offset > r->offset) + return 1; + else if (l->offset < r->offset) + return -1; + + return 0; +} + +static void +obj_tx_abort(int errnum, int user); + +/* + * obj_tx_fail_err -- (internal) dav_tx_abort variant that returns + * error code + */ +static inline int +obj_tx_fail_err(int errnum, uint64_t flags) +{ + if ((flags & DAV_FLAG_TX_NO_ABORT) == 0) + obj_tx_abort(errnum, 0); + errno = errnum; + return errnum; +} + +/* + * obj_tx_fail_null -- (internal) dav_tx_abort variant that returns + * null PMEMoid + */ +static inline uint64_t +obj_tx_fail_null(int errnum, uint64_t flags) +{ + if ((flags & DAV_FLAG_TX_NO_ABORT) == 0) + obj_tx_abort(errnum, 0); + errno = errnum; + return 0; +} + +/* ASSERT_IN_TX -- checks whether there's open transaction */ +#define ASSERT_IN_TX(tx) do {\ + if ((tx)->stage == DAV_TX_STAGE_NONE)\ + FATAL("%s called outside of transaction", __func__);\ +} while (0) + +/* ASSERT_TX_STAGE_WORK -- checks whether current transaction stage is WORK */ +#define ASSERT_TX_STAGE_WORK(tx) do {\ + if ((tx)->stage != DAV_TX_STAGE_WORK)\ + FATAL("%s called in invalid stage %d", __func__, (tx)->stage);\ +} while (0) + +/* + * tx_action_reserve -- (internal) reserve space for the given number of actions + */ +static int +tx_action_reserve(struct tx *tx, size_t n) +{ + size_t entries_size = (VEC_SIZE(&tx->actions) + n) * + sizeof(struct ulog_entry_val); + + if (operation_reserve(tx->pop->external, entries_size) != 0) + return -1; + + return 0; +} + +/* + * tx_action_add -- (internal) reserve space and add a new tx action + */ +static struct dav_action * +tx_action_add(struct tx *tx) +{ + if (tx_action_reserve(tx, 1) != 0) + return NULL; + + VEC_INC_BACK(&tx->actions); + + return &VEC_BACK(&tx->actions); +} + +/* + * tx_action_remove -- 
(internal) remove last tx action + */ +static void +tx_action_remove(struct tx *tx) +{ + VEC_POP_BACK(&tx->actions); +} + +/* + * constructor_tx_alloc -- (internal) constructor for normal alloc + */ +static int +constructor_tx_alloc(void *ctx, void *ptr, size_t usable_size, void *arg) +{ + ASSERTne(ptr, NULL); + ASSERTne(arg, NULL); + + struct tx_alloc_args *args = arg; + + /* do not report changes to the new object */ + VALGRIND_ADD_TO_TX(ptr, usable_size); + + if (args->flags & DAV_FLAG_ZERO) + memset(ptr, 0, usable_size); + + if (args->copy_ptr && args->copy_size != 0) { + FATAL("dav xalloc does not support copy_ptr\n"); + memcpy(ptr, args->copy_ptr, args->copy_size); + } + + return 0; +} + +/* + * tx_restore_range -- (internal) restore a single range from undo log + */ +static void +tx_restore_range(dav_obj_t *pop, struct ulog_entry_buf *range) +{ + void *begin, *end; + size_t size = range->size; + uint64_t range_offset = ulog_entry_offset(&range->base); + + begin = OBJ_OFF_TO_PTR(pop, range_offset); + end = (char *)begin + size; + ASSERT((char *)end >= (char *)begin); + + memcpy(begin, range->data, size); +} + +/* + * tx_undo_entry_apply -- applies modifications of a single ulog entry + */ +static int +tx_undo_entry_apply(struct ulog_entry_base *e, void *arg, + const struct mo_ops *p_ops) +{ + /* suppress unused-parameter errors */ + SUPPRESS_UNUSED(arg); + + struct ulog_entry_buf *eb; + + switch (ulog_entry_type(e)) { + case ULOG_OPERATION_BUF_CPY: + eb = (struct ulog_entry_buf *)e; + + tx_restore_range(p_ops->base, eb); + break; +#ifdef WAL_SUPPORTS_AND_OR_OPS + case ULOG_OPERATION_AND: + case ULOG_OPERATION_OR: +#else + case ULOG_OPERATION_CLR_BITS: + case ULOG_OPERATION_SET_BITS: +#endif + case ULOG_OPERATION_SET: + case ULOG_OPERATION_BUF_SET: + default: + ASSERT(0); + } + + return 0; +} + +/* + * tx_abort_set -- (internal) abort all set operations + */ +static void +tx_abort_set(dav_obj_t *pop) +{ + ulog_foreach_entry((struct ulog *)&pop->clogs.undo, + tx_undo_entry_apply, NULL, &pop->p_ops); + operation_finish(pop->undo, ULOG_INC_FIRST_GEN_NUM); +} + +/* + * tx_flush_range -- (internal) flush one range + */ +static void +tx_flush_range(void *data, void *ctx) +{ + dav_obj_t *pop = ctx; + struct tx_range_def *range = data; + + if (!(range->flags & DAV_FLAG_NO_FLUSH)) { + mo_wal_flush(&pop->p_ops, OBJ_OFF_TO_PTR(pop, range->offset), + range->size, range->flags & DAV_XADD_WAL_CPTR); + } + VALGRIND_REMOVE_FROM_TX(OBJ_OFF_TO_PTR(pop, range->offset), + range->size); +} + +/* + * tx_clean_range -- (internal) clean one range + */ +static void +tx_clean_range(void *data, void *ctx) +{ + dav_obj_t *pop = ctx; + struct tx_range_def *range = data; + + VALGRIND_REMOVE_FROM_TX(OBJ_OFF_TO_PTR(pop, range->offset), + range->size); + VALGRIND_SET_CLEAN(OBJ_OFF_TO_PTR(pop, range->offset), range->size); +} + +/* + * tx_pre_commit -- (internal) do pre-commit operations + */ +static void +tx_pre_commit(struct tx *tx) +{ + /* Flush all regions and destroy the whole tree. 
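+ * tx_flush_range() skips the WAL flush for ranges added with
+ * DAV_FLAG_NO_FLUSH and only drops them from Valgrind's tracking.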
*/ + ravl_delete_cb(tx->ranges, tx_flush_range, tx->pop); + tx->ranges = NULL; +} + +/* + * tx_abort -- (internal) abort all allocated objects + */ +static void +tx_abort(dav_obj_t *pop) +{ + struct tx *tx = get_tx(); + + tx_abort_set(pop); + + ravl_delete_cb(tx->ranges, tx_clean_range, pop); + palloc_cancel(pop->do_heap, + VEC_ARR(&tx->actions), VEC_SIZE(&tx->actions)); + tx->ranges = NULL; +} + +/* + * tx_ranges_insert_def -- (internal) allocates and inserts a new range + * definition into the ranges tree + */ +static int +tx_ranges_insert_def(dav_obj_t *pop, struct tx *tx, + const struct tx_range_def *rdef) +{ + /* suppress unused-parameter errors */ + SUPPRESS_UNUSED(pop); + + DAV_DBG("(%lu,%lu) size=%zu", + rdef->offset / 4096, rdef->offset % 4096, rdef->size); + + int ret = ravl_emplace_copy(tx->ranges, rdef); + + if (ret && errno == EEXIST) + FATAL("invalid state of ranges tree"); + return ret; +} + +/* + * tx_alloc_common -- (internal) common function for alloc and zalloc + */ +static uint64_t +tx_alloc_common(struct tx *tx, size_t size, type_num_t type_num, + palloc_constr constructor, struct tx_alloc_args args) +{ + const struct tx_range_def *r; + uint64_t off; + + if (size > DAV_MAX_ALLOC_SIZE) { + ERR("requested size too large"); + return obj_tx_fail_null(ENOMEM, args.flags); + } + + dav_obj_t *pop = tx->pop; + + struct dav_action *action = tx_action_add(tx); + + if (action == NULL) + return obj_tx_fail_null(ENOMEM, args.flags); + + if (palloc_reserve(pop->do_heap, size, constructor, &args, type_num, 0, + CLASS_ID_FROM_FLAG(args.flags), EZONE_ID_FROM_FLAG(args.flags), + action) != 0) + goto err_oom; + + palloc_get_prange(action, &off, &size, 1); + r = &(struct tx_range_def){off, size, args.flags}; + if (tx_ranges_insert_def(pop, tx, r) != 0) + goto err_oom; + + return action->heap.offset; + +err_oom: + tx_action_remove(tx); + D_CRIT("out of memory\n"); + return obj_tx_fail_null(ENOMEM, args.flags); +} + +/* + * tx_create_wal_entry -- convert to WAL a single ulog UNDO entry + */ +int +tx_create_wal_entry(struct ulog_entry_base *e, void *arg, + const struct mo_ops *p_ops) +{ + /* suppress unused-parameter errors */ + SUPPRESS_UNUSED(arg); + + int rc = 0; + uint64_t offset = ulog_entry_offset(e); + daos_size_t dst_size = sizeof(uint64_t); + struct ulog_entry_val *ev; + struct ulog_entry_buf *eb; + uint64_t v; + uint64_t *dst; + + D_ASSERT(p_ops->base != NULL); + dst = umem_cache_off2ptr(p_ops->umem_store, offset); + + switch (ulog_entry_type(e)) { +#ifdef WAL_SUPPORTS_AND_OR_OPS + case ULOG_OPERATION_AND: + ev = (struct ulog_entry_val *)e; + v = ev->value; + + rc = dav_wal_tx_and(p_ops->base, dst, v); + break; + case ULOG_OPERATION_OR: + ev = (struct ulog_entry_val *)e; + v = ev->value; + + rc = dav_wal_tx_or(p_ops->base, dst, v); + break; +#else + case ULOG_OPERATION_CLR_BITS: + ev = (struct ulog_entry_val *)e; + v = ev->value; + + rc = dav_wal_tx_clr_bits(p_ops->base, dst, ULOG_ENTRY_VAL_TO_POS(v), + ULOG_ENTRY_VAL_TO_BITS(v)); + break; + case ULOG_OPERATION_SET_BITS: + ev = (struct ulog_entry_val *)e; + v = ev->value; + + rc = dav_wal_tx_set_bits(p_ops->base, dst, ULOG_ENTRY_VAL_TO_POS(v), + ULOG_ENTRY_VAL_TO_BITS(v)); + break; +#endif + case ULOG_OPERATION_SET: + ev = (struct ulog_entry_val *)e; + + rc = dav_wal_tx_snap(p_ops->base, dst, dst_size, (void *)&ev->value, 0); + break; + case ULOG_OPERATION_BUF_SET: + eb = (struct ulog_entry_buf *)e; + + dst_size = eb->size; + rc = dav_wal_tx_set(p_ops->base, dst, 0, dst_size); + break; + case ULOG_OPERATION_BUF_CPY: + eb = (struct 
ulog_entry_buf *)e; + + dst_size = eb->size; + /* The only undo entry from dav that needs to be + * transformed into redo + */ + rc = dav_wal_tx_snap(p_ops->base, dst, dst_size, dst, 0); + break; + default: + ASSERT(0); + } + + return rc; +} + +int +lw_tx_begin(dav_obj_t *pop) +{ + struct umem_wal_tx *utx = NULL; + int rc; + uint64_t wal_id; + + rc = umem_cache_reserve(pop->do_store); + if (rc) { + D_ERROR("umem_cache_reserve failed, " DF_RC "\n", DP_RC(rc)); + return rc; + } + rc = dav_wal_tx_reserve(pop, &wal_id); + if (rc) { + D_ERROR("so_wal_reserv failed, "DF_RC"\n", DP_RC(rc)); + return rc; + } + if (pop->do_utx == NULL) { + utx = dav_umem_wtx_new(pop); + if (utx == NULL) { + D_ERROR("dav_umem_wtx_new failed\n"); + return ENOMEM; + } + } + pop->do_utx->utx_id = wal_id; + return rc; +} + +int +lw_tx_end(dav_obj_t *pop, void *data) +{ + struct umem_wal_tx *utx; + int rc; + + /* Persist the frequently updated persistent globals */ + stats_persist(pop, pop->do_stats); + + utx = pop->do_utx; + D_ASSERT(utx != NULL); + pop->do_utx = NULL; + + rc = dav_wal_tx_commit(pop, utx, data); + D_FREE(utx); + return rc; +} + +/* + * dav_tx_begin -- initializes new transaction + */ +DAV_FUNC_EXPORT int +dav_tx_begin_v2(dav_obj_t *pop, jmp_buf env, ...) +{ + int err = 0; + struct tx *tx = get_tx(); + uint64_t wal_id; + + enum dav_tx_failure_behavior failure_behavior = DAV_TX_FAILURE_ABORT; + + if (tx->stage == DAV_TX_STAGE_WORK) { + if (tx->pop != pop) { + ERR("nested transaction for different pool"); + return obj_tx_fail_err(EINVAL, 0); + } + + /* inherits this value from the parent transaction */ + struct tx_data *txd = DAV_SLIST_FIRST(&tx->tx_entries); + + failure_behavior = txd->failure_behavior; + + VALGRIND_START_TX; + } else if (tx->stage == DAV_TX_STAGE_NONE) { + struct umem_wal_tx *utx = NULL; + + DAV_DBG(""); + err = umem_cache_reserve(pop->do_store); + if (err) { + D_ERROR("umem_cache_reserve failed, " DF_RC "\n", DP_RC(err)); + err = daos_der2errno(err); + goto err_abort; + } + + err = dav_wal_tx_reserve(pop, &wal_id); + if (err) { + D_ERROR("so_wal_reserv failed, "DF_RC"\n", DP_RC(err)); + goto err_abort; + } + + if (pop->do_utx == NULL) { + utx = dav_umem_wtx_new(pop); + if (utx == NULL) { + err = ENOMEM; + goto err_abort; + } + } + pop->do_utx->utx_id = wal_id; + + tx = get_tx(); + + VALGRIND_START_TX; + + dav_hold_clogs(pop); + operation_start(pop->undo); + + VEC_INIT(&tx->actions); + DAV_SLIST_INIT(&tx->tx_entries); + + tx->ranges = ravl_new_sized(tx_range_def_cmp, + sizeof(struct tx_range_def)); + tx->first_snapshot = 1; + tx->pop = pop; + } else { + FATAL("Invalid stage %d to begin new transaction", tx->stage); + } + + struct tx_data *txd; + + D_ALLOC_PTR_NZ(txd); + if (txd == NULL) { + err = errno; + D_CRIT("Malloc!\n"); + goto err_abort; + } + + tx->last_errnum = 0; + ASSERT(env == NULL); + if (env != NULL) + memcpy(txd->env, env, sizeof(jmp_buf)); + else + memset(txd->env, 0, sizeof(jmp_buf)); + + txd->failure_behavior = failure_behavior; + + DAV_SLIST_INSERT_HEAD(&tx->tx_entries, txd, tx_entry); + + tx->stage = DAV_TX_STAGE_WORK; + + /* handle locks */ + va_list argp; + + va_start(argp, env); + + enum dav_tx_param param_type; + + while ((param_type = va_arg(argp, enum dav_tx_param)) != + DAV_TX_PARAM_NONE) { + if (param_type == DAV_TX_PARAM_CB) { + dav_tx_callback cb = + va_arg(argp, dav_tx_callback); + void *arg = va_arg(argp, void *); + + if (tx->stage_callback && + (tx->stage_callback != cb || + tx->stage_callback_arg != arg)) { + FATAL( + "transaction callback is already set, 
old %p new %p old_arg %p new_arg %p", + tx->stage_callback, cb, + tx->stage_callback_arg, arg); + } + + tx->stage_callback = cb; + tx->stage_callback_arg = arg; + } else { + ASSERT(param_type == DAV_TX_PARAM_CB); + } + } + va_end(argp); + + ASSERT(err == 0); + return 0; + +err_abort: + if (tx->stage == DAV_TX_STAGE_WORK) + obj_tx_abort(err, 0); + else + tx->stage = DAV_TX_STAGE_ONABORT; + return err; +} + +/* + * tx_abort_on_failure_flag -- (internal) return 0 or DAV_FLAG_TX_NO_ABORT + * based on transaction setting + */ +static uint64_t +tx_abort_on_failure_flag(struct tx *tx) +{ + struct tx_data *txd = DAV_SLIST_FIRST(&tx->tx_entries); + + if (txd->failure_behavior == DAV_TX_FAILURE_RETURN) + return DAV_FLAG_TX_NO_ABORT; + return 0; +} + +/* + * obj_tx_callback -- (internal) executes callback associated with current stage + */ +static void +obj_tx_callback(struct tx *tx) +{ + if (!tx->stage_callback) + return; + + struct tx_data *txd = DAV_SLIST_FIRST(&tx->tx_entries); + + /* is this the outermost transaction? */ + if (DAV_SLIST_NEXT(txd, tx_entry) == NULL) + tx->stage_callback(tx->pop, tx->stage, tx->stage_callback_arg); +} + +/* + * dav_tx_stage -- returns current transaction stage + */ +DAV_FUNC_EXPORT enum dav_tx_stage +dav_tx_stage_v2(void) +{ + return get_tx()->stage; +} + +/* + * obj_tx_abort -- aborts current transaction + */ +static void +obj_tx_abort(int errnum, int user) +{ + struct tx *tx = get_tx(); + + ASSERT_IN_TX(tx); + ASSERT_TX_STAGE_WORK(tx); + ASSERT(tx->pop != NULL); + + if (errnum == 0) + errnum = ECANCELED; + + tx->stage = DAV_TX_STAGE_ONABORT; + struct tx_data *txd = DAV_SLIST_FIRST(&tx->tx_entries); + + if (DAV_SLIST_NEXT(txd, tx_entry) == NULL) { + /* this is the outermost transaction */ + + /* process the undo log */ + tx_abort(tx->pop); + + dav_release_clogs(tx->pop); + } + + tx->last_errnum = errnum; + errno = errnum; + if (user) { + DAV_DBG("!explicit transaction abort"); + } + + /* ONABORT */ + obj_tx_callback(tx); + + if (!util_is_zeroed(txd->env, sizeof(jmp_buf))) + longjmp(txd->env, errnum); +} + +/* + * dav_tx_abort -- aborts current transaction + * + * Note: this function should not be called from inside of dav. 
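+ *
+ * Caller-side sketch (illustrative; hoff, size, update_ok and rc are
+ * hypothetical caller variables, env is expected to be NULL as the
+ * code asserts, and the vararg list must end with DAV_TX_PARAM_NONE):
+ *
+ *	if (dav_tx_begin_v2(pop, NULL, DAV_TX_PARAM_NONE) == 0) {
+ *		if (dav_tx_add_range_v2(hoff, size) == 0 && update_ok)
+ *			dav_tx_commit_v2();
+ *		else if (dav_tx_stage_v2() == DAV_TX_STAGE_WORK)
+ *			dav_tx_abort_v2(ECANCELED);
+ *		rc = dav_tx_end_v2(NULL);
+ *	}
+ *
+ * A failed dav_tx_add_range_v2() already aborts the transaction under
+ * the default DAV_TX_FAILURE_ABORT behavior, hence the stage check
+ * before the explicit abort.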
+ */ +DAV_FUNC_EXPORT void +dav_tx_abort_v2(int errnum) +{ + DAV_API_START(); + DAV_DBG(""); + obj_tx_abort(errnum, 1); + DAV_API_END(); +} + +/* + * dav_tx_errno -- returns last transaction error code + */ +DAV_FUNC_EXPORT int +dav_tx_errno_v2(void) +{ + DAV_DBG("err:%d", get_tx()->last_errnum); + + return get_tx()->last_errnum; +} + +static void +tx_post_commit(struct tx *tx) +{ + operation_finish(tx->pop->undo, 0); +} + +/* + * dav_tx_commit -- commits current transaction + */ +DAV_FUNC_EXPORT void +dav_tx_commit_v2(void) +{ + DAV_API_START(); + struct tx *tx = get_tx(); + + ASSERT_IN_TX(tx); + ASSERT_TX_STAGE_WORK(tx); + ASSERT(tx->pop); + DAV_DBG(""); + + /* WORK */ + obj_tx_callback(tx); + dav_obj_t *pop = tx->pop; + + struct tx_data *txd = DAV_SLIST_FIRST(&tx->tx_entries); + + if (DAV_SLIST_NEXT(txd, tx_entry) == NULL) { + /* this is the outermost transaction */ + + /* pre-commit phase */ + tx_pre_commit(tx); + + mo_wal_drain(&pop->p_ops); + + operation_start(pop->external); + + palloc_publish(pop->do_heap, VEC_ARR(&tx->actions), + VEC_SIZE(&tx->actions), pop->external); + + tx_post_commit(tx); + + dav_release_clogs(pop); + } + + tx->stage = DAV_TX_STAGE_ONCOMMIT; + + /* ONCOMMIT */ + obj_tx_callback(tx); + DAV_API_END(); +} + +/* + * dav_tx_end -- ends current transaction + */ +DAV_FUNC_EXPORT int +dav_tx_end_v2(void *data) +{ + struct tx *tx = get_tx(); + + if (tx->stage == DAV_TX_STAGE_WORK) + FATAL("dav_tx_end called without dav_tx_commit"); + + if (tx->pop == NULL) + FATAL("dav_tx_end called without dav_tx_begin"); + + if (tx->stage_callback && + (tx->stage == DAV_TX_STAGE_ONCOMMIT || + tx->stage == DAV_TX_STAGE_ONABORT)) { + tx->stage = DAV_TX_STAGE_FINALLY; + obj_tx_callback(tx); + } + + struct tx_data *txd = DAV_SLIST_FIRST(&tx->tx_entries); + + DAV_SLIST_REMOVE_HEAD(&tx->tx_entries, tx_entry); + + D_FREE(txd); + + VALGRIND_END_TX; + int ret = tx->last_errnum; + + if (DAV_SLIST_EMPTY(&tx->tx_entries)) { + dav_obj_t *pop = tx->pop; + dav_tx_callback cb = tx->stage_callback; + void *arg = tx->stage_callback_arg; + int rc; + + DAV_DBG(""); + ASSERT(pop); + tx->pop = NULL; + tx->stage = DAV_TX_STAGE_NONE; + tx->stage_callback = NULL; + tx->stage_callback_arg = NULL; + + VEC_DELETE(&tx->actions); + /* tx should not be accessed after this */ + + /* commit to WAL */ + rc = lw_tx_end(pop, data); + /* TODO: Handle WAL commit errors */ + D_ASSERT(rc == 0); + + if (cb) + cb(pop, DAV_TX_STAGE_NONE, arg); + } else { + /* resume the next transaction */ + tx->stage = DAV_TX_STAGE_WORK; + + /* abort called within inner transaction, waterfall the error */ + if (tx->last_errnum) + obj_tx_abort(tx->last_errnum, 0); + } + + return ret; +} + +/* + * vg_verify_initialized -- when executed under Valgrind verifies that + * the buffer has been initialized; explicit check at snapshotting time, + * because Valgrind may find it much later when it's impossible to tell + * for which snapshot it triggered + */ +static void +vg_verify_initialized(dav_obj_t *pop, const struct tx_range_def *def) +{ + /* suppress unused-parameter errors */ + SUPPRESS_UNUSED(pop, def); +#if VG_MEMCHECK_ENABLED + if (!On_memcheck) + return; + + VALGRIND_DO_DISABLE_ERROR_REPORTING; + char *start = OBJ_OFF_TO_PTR(pop, def->offset); + char *uninit = (char *)VALGRIND_CHECK_MEM_IS_DEFINED(start, def->size); + + if (uninit) { + VALGRIND_PRINTF( + "Snapshotting uninitialized data in range <%p,%p> ()\n", + start, start + def->size, def->offset, def->size); + + if (uninit != start) + VALGRIND_PRINTF("Uninitialized data starts at: %p\n", 
+ uninit); + + VALGRIND_DO_ENABLE_ERROR_REPORTING; + VALGRIND_CHECK_MEM_IS_DEFINED(start, def->size); + } else { + VALGRIND_DO_ENABLE_ERROR_REPORTING; + } +#endif +} + +/* + * dav_tx_add_snapshot -- (internal) creates a variably sized snapshot + */ +static int +dav_tx_add_snapshot(struct tx *tx, struct tx_range_def *snapshot) +{ + /* + * Depending on the size of the block, either allocate an + * entire new object or use cache. + */ + void *ptr = OBJ_OFF_TO_PTR(tx->pop, snapshot->offset); + + VALGRIND_ADD_TO_TX(ptr, snapshot->size); + + /* do nothing */ + if (snapshot->flags & DAV_XADD_NO_SNAPSHOT) + return 0; + + if (!(snapshot->flags & DAV_XADD_ASSUME_INITIALIZED)) + vg_verify_initialized(tx->pop, snapshot); + + /* + * If we are creating the first snapshot, setup a redo log action to + * increment counter in the undo log, so that the log becomes + * invalid once the redo log is processed. + */ + if (tx->first_snapshot) { + struct dav_action *action = tx_action_add(tx); + + if (action == NULL) + return -1; + + uint64_t *n = &tx->pop->clogs.undo.gen_num; + + palloc_set_value(tx->pop->do_heap, action, + n, *n + 1); + + tx->first_snapshot = 0; + } + + return operation_add_buffer(tx->pop->undo, ptr, ptr, snapshot->size, + ULOG_OPERATION_BUF_CPY); +} + +/* + * dav_tx_merge_flags -- (internal) common code for merging flags between + * two ranges to ensure resultant behavior is correct + */ +static void +dav_tx_merge_flags(struct tx_range_def *dest, struct tx_range_def *merged) +{ + /* + * DAV_XADD_NO_FLUSH should only be set in merged range if set in + * both ranges + */ + if ((dest->flags & DAV_XADD_NO_FLUSH) && + !(merged->flags & DAV_XADD_NO_FLUSH)) { + dest->flags = dest->flags & (~DAV_XADD_NO_FLUSH); + } + + /* + * Extend DAV_XADD_WAL_CPTR when merged. + * REVISIT: Ideally merge should happen only if address ranges + * overlap. Current code merges adjacent ranges even if only one + * of them has this flag set. Fix this before closing DAOS-11049. + */ + if (merged->flags & DAV_XADD_WAL_CPTR) + dest->flags = dest->flags | DAV_XADD_WAL_CPTR; +} + +/* + * dav_tx_add_common -- (internal) common code for adding persistent memory + * into the transaction + */ +static int +dav_tx_add_common(struct tx *tx, struct tx_range_def *args) +{ + if (args->size > DAV_MAX_ALLOC_SIZE) { + ERR("snapshot size too large"); + return obj_tx_fail_err(EINVAL, args->flags); + } + + if (!OBJ_OFFRANGE_FROM_HEAP(tx->pop, args->offset, (args->offset + args->size))) { + ERR("object outside of heap"); + return obj_tx_fail_err(EINVAL, args->flags); + } + + int ret = 0; + + /* + * Search existing ranges backwards starting from the end of the + * snapshot. + */ + struct tx_range_def r = *args; + + DAV_DBG("(%lu,%lu) size=%zu", r.offset / 4096, r.offset % 4096, r.size); + struct tx_range_def search = {0, 0, 0}; + /* + * If the range is directly adjacent to an existing one, + * they can be merged, so search for less or equal elements. + */ + enum ravl_predicate p = RAVL_PREDICATE_LESS_EQUAL; + struct ravl_node *nprev = NULL; + + while (r.size != 0) { + search.offset = r.offset + r.size; + struct ravl_node *n = ravl_find(tx->ranges, &search, p); + /* + * We have to skip searching for LESS_EQUAL because + * the snapshot we would find is the one that was just + * created. + */ + p = RAVL_PREDICATE_LESS; + + struct tx_range_def *f = n ? ravl_data(n) : NULL; + + size_t fend = f == NULL ? 
0 : f->offset + f->size; + size_t rend = r.offset + r.size; + + if (fend == 0 || fend < r.offset) { + /* + * If found no range or the found range is not + * overlapping or adjacent on the left side, we can just + * create the entire r.offset + r.size snapshot. + * + * Snapshot: + * --+- + * Existing ranges: + * ---- (no ranges) + * or +--- (no overlap) + * or ---+ (adjacent on on right side) + */ + if (nprev != NULL) { + /* + * But, if we have an existing adjacent snapshot + * on the right side, we can just extend it to + * include the desired range. + */ + struct tx_range_def *fprev = ravl_data(nprev); + + ASSERTeq(rend, fprev->offset); + fprev->offset -= r.size; + fprev->size += r.size; + } else { + /* + * If we don't have anything adjacent, create + * a new range in the tree. + */ + ret = tx_ranges_insert_def(tx->pop, + tx, &r); + if (ret != 0) + break; + } + ret = dav_tx_add_snapshot(tx, &r); + break; + } else if (fend <= rend) { + /* + * If found range has its end inside of the desired + * snapshot range, we can extend the found range by the + * size leftover on the left side. + * + * Snapshot: + * --+++-- + * Existing ranges: + * +++---- (overlap on left) + * or ---+--- (found snapshot is inside) + * or ---+-++ (inside, and adjacent on the right) + * or +++++-- (desired snapshot is inside) + * + */ + struct tx_range_def snapshot = *args; + + snapshot.offset = fend; + /* the side not yet covered by an existing snapshot */ + snapshot.size = rend - fend; + + /* the number of bytes intersecting in both ranges */ + size_t intersection = fend - MAX(f->offset, r.offset); + + r.size -= intersection + snapshot.size; + f->size += snapshot.size; + dav_tx_merge_flags(f, args); + + if (snapshot.size != 0) { + ret = dav_tx_add_snapshot(tx, &snapshot); + if (ret != 0) + break; + } + + /* + * If there's a snapshot adjacent on right side, merge + * the two ranges together. + */ + if (nprev != NULL) { + struct tx_range_def *fprev = ravl_data(nprev); + + ASSERTeq(rend, fprev->offset); + f->size += fprev->size; + dav_tx_merge_flags(f, fprev); + ravl_remove(tx->ranges, nprev); + } + } else if (fend >= r.offset) { + /* + * If found range has its end extending beyond the + * desired snapshot. + * + * Snapshot: + * --+++-- + * Existing ranges: + * -----++ (adjacent on the right) + * or ----++- (overlapping on the right) + * or ----+++ (overlapping and adjacent on the right) + * or --+++++ (desired snapshot is inside) + * + * Notice that we cannot create a snapshot based solely + * on this information without risking overwriting an + * existing one. We have to continue iterating, but we + * keep the information about adjacent snapshots in the + * nprev variable. 
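+ *
+ * Worked example (illustrative numbers): a request of [100, 160)
+ * against an existing range [140, 180) trims the 20-byte overlap from
+ * the right, continues the loop with r = [100, 140), and remembers
+ * this node in nprev for a possible later merge.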
+ */ + size_t overlap = rend - MAX(f->offset, r.offset); + + r.size -= overlap; + dav_tx_merge_flags(f, args); + } else { + ASSERT(0); + } + + nprev = n; + } + + if (ret != 0) { + DAV_DBG("out of memory\n"); + return obj_tx_fail_err(ENOMEM, args->flags); + } + + return 0; +} + +/* + * dav_tx_add_range_direct -- adds persistent memory range into the + * transaction + */ +DAV_FUNC_EXPORT int +dav_tx_add_range_direct_v2(const void *ptr, size_t size) +{ + DAV_API_START(); + struct tx *tx = get_tx(); + + ASSERT_IN_TX(tx); + ASSERT_TX_STAGE_WORK(tx); + ASSERT(tx->pop != NULL); + + int ret; + + uint64_t flags = tx_abort_on_failure_flag(tx); + + if (!OBJ_PTR_FROM_POOL(tx->pop, ptr)) { + ERR("object outside of pool"); + ret = obj_tx_fail_err(EINVAL, flags); + DAV_API_END(); + return ret; + } + + struct tx_range_def args = { + .offset = OBJ_PTR_TO_OFF(tx->pop, ptr), + .size = size, + .flags = flags, + }; + + ret = dav_tx_add_common(tx, &args); + + DAV_API_END(); + return ret; +} + +/* + * dav_tx_xadd_range_direct -- adds persistent memory range into the + * transaction + */ +DAV_FUNC_EXPORT int +dav_tx_xadd_range_direct_v2(const void *ptr, size_t size, uint64_t flags) +{ + + DAV_API_START(); + struct tx *tx = get_tx(); + + ASSERT_IN_TX(tx); + ASSERT_TX_STAGE_WORK(tx); + + int ret; + uint64_t off; + + flags |= tx_abort_on_failure_flag(tx); + + if (flags & ~DAV_XADD_VALID_FLAGS) { + ERR("unknown flags 0x%" PRIx64, flags + & ~DAV_XADD_VALID_FLAGS); + ret = obj_tx_fail_err(EINVAL, flags); + DAV_API_END(); + return ret; + } + + if (!OBJ_PTR_FROM_POOL(tx->pop, ptr)) { + ERR("object outside of pool"); + ret = obj_tx_fail_err(EINVAL, flags); + DAV_API_END(); + return ret; + } + + off = OBJ_PTR_TO_OFF(tx->pop, ptr); + struct tx_range_def args = { + .offset = off, + .size = size, + .flags = flags, + }; + + ret = dav_tx_add_common(tx, &args); + + DAV_API_END(); + return ret; +} + +/* + * dav_tx_add_range -- adds persistent memory range into the transaction + */ +DAV_FUNC_EXPORT int +dav_tx_add_range_v2(uint64_t hoff, size_t size) +{ + DAV_API_START(); + struct tx *tx = get_tx(); + + ASSERT_IN_TX(tx); + ASSERT_TX_STAGE_WORK(tx); + + int ret; + + uint64_t flags = tx_abort_on_failure_flag(tx); + + ASSERT(OBJ_OFF_IS_VALID(tx->pop, hoff)); + + struct tx_range_def args = { + .offset = hoff, + .size = size, + .flags = flags, + }; + + ret = dav_tx_add_common(tx, &args); + + DAV_API_END(); + return ret; +} + +/* + * dav_tx_xadd_range -- adds persistent memory range into the transaction + */ +DAV_FUNC_EXPORT int +dav_tx_xadd_range_v2(uint64_t hoff, size_t size, uint64_t flags) +{ + DAV_API_START(); + struct tx *tx = get_tx(); + + ASSERT_IN_TX(tx); + ASSERT_TX_STAGE_WORK(tx); + + int ret; + + flags |= tx_abort_on_failure_flag(tx); + + if (flags & ~DAV_XADD_VALID_FLAGS) { + ERR("unknown flags 0x%" PRIx64, flags + & ~DAV_XADD_VALID_FLAGS); + ret = obj_tx_fail_err(EINVAL, flags); + DAV_API_END(); + return ret; + } + + ASSERT(OBJ_OFF_IS_VALID(tx->pop, hoff)); + + struct tx_range_def args = { + .offset = hoff, + .size = size, + .flags = flags, + }; + + ret = dav_tx_add_common(tx, &args); + + DAV_API_END(); + return ret; +} + +/* + * dav_tx_alloc -- allocates a new object + */ +DAV_FUNC_EXPORT uint64_t +dav_tx_alloc_v2(size_t size, uint64_t type_num, uint64_t flags) +{ + uint64_t off; + struct tx *tx = get_tx(); + + ASSERT_IN_TX(tx); + ASSERT_TX_STAGE_WORK(tx); + + flags |= tx_abort_on_failure_flag(tx); + + DAV_API_START(); + + if (size == 0) { + ERR("allocation with size 0"); + off = obj_tx_fail_null(EINVAL, flags); + 
DAV_API_END(); + return off; + } + + if (flags & ~DAV_TX_XALLOC_VALID_FLAGS) { + ERR("unknown flags 0x%" PRIx64, flags + & ~(DAV_TX_XALLOC_VALID_FLAGS)); + off = obj_tx_fail_null(EINVAL, flags); + DAV_API_END(); + return off; + } + + off = tx_alloc_common(tx, size, (type_num_t)type_num, + constructor_tx_alloc, ALLOC_ARGS(flags)); + + DAV_API_END(); + return off; +} + +/* + * dav_tx_xfree -- frees an existing object, with no_abort option + */ +static int +dav_tx_xfree(uint64_t off, uint64_t flags) +{ + struct tx *tx = get_tx(); + + ASSERT_IN_TX(tx); + ASSERT_TX_STAGE_WORK(tx); + + flags |= tx_abort_on_failure_flag(tx); + + if (flags & ~DAV_XFREE_VALID_FLAGS) { + ERR("unknown flags 0x%" PRIx64, + flags & ~DAV_XFREE_VALID_FLAGS); + return obj_tx_fail_err(EINVAL, flags); + } + + if (off == 0) + return 0; + + dav_obj_t *pop = tx->pop; + + ASSERT(pop != NULL); + ASSERT(OBJ_OFF_IS_VALID(pop, off)); + + DAV_API_START(); + + struct dav_action *action; + uint64_t roff = palloc_get_realoffset(pop->do_heap, off); + + struct tx_range_def range = {roff, 0, 0}; + struct ravl_node *n = ravl_find(tx->ranges, &range, + RAVL_PREDICATE_LESS_EQUAL); + + /* + * If attempting to free an object allocated within the same + * transaction, simply cancel the alloc and remove it from the actions. + */ + if (n != NULL) { + struct tx_range_def *r = ravl_data(n); + + if ((r->offset + r->size) < roff) + goto out; + + VEC_FOREACH_BY_PTR(action, &tx->actions) { + if (action->type == DAV_ACTION_TYPE_HEAP && + action->heap.offset == off) { + void *ptr = OBJ_OFF_TO_PTR(pop, roff); + uint64_t toff, usize; + + palloc_get_prange(action, &toff, &usize, 1); + D_ASSERT(usize <= r->size); + if ((r->offset == roff) && (r->size == usize)) { + /* Exact match. */ + ravl_remove(tx->ranges, n); + } else if (r->offset == roff) { + /* Retain the right portion. */ + r->offset += usize; + r->size -= usize; + } else { + /* Retain the left portion. */ + uint64_t osize = r->size; + + r->size = roff - r->offset; + + /* Still data after range remove. 
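+ * e.g. a tracked range [0, 100) with a freed allocation at
+ * [40, 60) keeps [0, 40) here and re-inserts [60, 100) below.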
*/ + osize -= (r->size + usize); + if (osize) { + struct tx_range_def *r1 = + &(struct tx_range_def) + {roff + usize, osize, r->flags}; + + tx_ranges_insert_def(pop, tx, r1); + } + } + + VALGRIND_SET_CLEAN(ptr, usize); + VALGRIND_REMOVE_FROM_TX(ptr, usize); + palloc_cancel(pop->do_heap, action, 1); + VEC_ERASE_BY_PTR(&tx->actions, action); + DAV_API_END(); + return 0; + } + } + } + +out: + action = tx_action_add(tx); + if (action == NULL) { + int ret = obj_tx_fail_err(errno, flags); + + DAV_API_END(); + return ret; + } + + palloc_defer_free(pop->do_heap, off, action); + + DAV_API_END(); + return 0; +} + +/* + * dav_tx_free -- frees an existing object + */ +DAV_FUNC_EXPORT int +dav_tx_free_v2(uint64_t off) +{ + return dav_tx_xfree(off, 0); +} + +/* arguments for constructor_alloc */ +struct constr_args { + int zero_init; + dav_constr constructor; + void *arg; +}; + +/* arguments for constructor_alloc_root */ +struct carg_root { + size_t size; + dav_constr constructor; + void *arg; +}; + +/* arguments for constructor_realloc and constructor_zrealloc */ +struct carg_realloc { + void *ptr; + size_t old_size; + size_t new_size; + int zero_init; + type_num_t user_type; + dav_constr constructor; + void *arg; +}; + +/* + * constructor_zrealloc_root -- (internal) constructor for dav_root + */ +static int +constructor_zrealloc_root(void *ctx, void *ptr, size_t usable_size, void *arg) +{ + dav_obj_t *pop = ctx; + + DAV_DBG("pop %p ptr %p arg %p", pop, ptr, arg); + + ASSERTne(ptr, NULL); + ASSERTne(arg, NULL); + + VALGRIND_ADD_TO_TX(ptr, usable_size); + + struct carg_realloc *carg = arg; + + if (usable_size > carg->old_size) { + size_t grow_len = usable_size - carg->old_size; + void *new_data_ptr = (void *)((uintptr_t)ptr + carg->old_size); + + mo_wal_memset(&pop->p_ops, new_data_ptr, 0, grow_len, 0); + } + int ret = 0; + + if (carg->constructor) + ret = carg->constructor(pop, ptr, carg->arg); + + VALGRIND_REMOVE_FROM_TX(ptr, usable_size); + + return ret; +} + +/* + * obj_realloc_root -- (internal) reallocate root object + */ +static int +obj_alloc_root(dav_obj_t *pop, size_t size) +{ + struct operation_context *ctx; + struct carg_realloc carg; + int ret; + + DAV_DBG("pop %p size %zu", pop, size); + + carg.ptr = (*pop->do_root_offsetp == 0) ? 
0 : OBJ_OFF_TO_PTR(pop, *pop->do_root_offsetp); + carg.old_size = *pop->do_root_sizep; + carg.new_size = size; + carg.user_type = 0; + carg.constructor = NULL; + carg.zero_init = 1; + carg.arg = NULL; + + ret = lw_tx_begin(pop); + if (ret) + return ret; + + ctx = pop->external; + operation_start(ctx); + + operation_add_entry(ctx, pop->do_root_sizep, size, ULOG_OPERATION_SET); + + ret = palloc_operation(pop->do_heap, *pop->do_root_offsetp, pop->do_root_offsetp, size, + constructor_zrealloc_root, &carg, 0, 0, 0, 0, + ctx); /* REVISIT: object_flags and type num ignored*/ + + lw_tx_end(pop, NULL); + return ret; +} + +/* + * dav_root_construct -- returns root object + */ +DAV_FUNC_EXPORT uint64_t +dav_root_v2(dav_obj_t *pop, size_t size) +{ + DAV_DBG("pop %p size %zu", pop, size); + + DAV_API_START(); + if (size > DAV_MAX_ALLOC_SIZE) { + ERR("requested size too large"); + errno = ENOMEM; + DAV_API_END(); + return 0; + } + + if (size == 0 && *pop->do_root_offsetp == 0) { + ERR("requested size cannot equals zero"); + errno = EINVAL; + DAV_API_END(); + return 0; + } + + if (size > *pop->do_root_sizep && obj_alloc_root(pop, size)) { + ERR("dav_root failed"); + errno = ENOMEM; + DAV_API_END(); + return 0; + } + + DAV_API_END(); + return *pop->do_root_offsetp; +} + +/* + * constructor_alloc -- (internal) constructor for obj_alloc_construct + */ +static int +constructor_alloc(void *ctx, void *ptr, size_t usable_size, void *arg) +{ + dav_obj_t *pop = ctx; + + struct mo_ops *p_ops = &pop->p_ops; + + DAV_DBG("pop %p ptr %p arg %p", pop, ptr, arg); + + ASSERTne(ptr, NULL); + ASSERTne(arg, NULL); + + struct constr_args *carg = arg; + + if (carg->zero_init) + mo_wal_memset(p_ops, ptr, 0, usable_size, 0); + + int ret = 0; + + if (carg->constructor) + ret = carg->constructor(pop, ptr, carg->arg); + + return ret; +} + +/* + * obj_alloc_construct -- (internal) allocates a new object with constructor + */ +static int +obj_alloc_construct(dav_obj_t *pop, uint64_t *offp, size_t size, + type_num_t type_num, uint64_t flags, + dav_constr constructor, void *arg) +{ + struct operation_context *ctx; + struct constr_args carg; + int ret; + + if (size > DAV_MAX_ALLOC_SIZE) { + ERR("requested size too large"); + errno = ENOMEM; + return -1; + } + + carg.zero_init = flags & DAV_FLAG_ZERO; + carg.constructor = constructor; + carg.arg = arg; + + ret = lw_tx_begin(pop); + if (ret) + return ret; + ctx = pop->external; + operation_start(ctx); + + ret = palloc_operation(pop->do_heap, 0, offp, size, constructor_alloc, &carg, type_num, 0, + CLASS_ID_FROM_FLAG(flags), EZONE_ID_FROM_FLAG(flags), ctx); + + lw_tx_end(pop, NULL); + return ret; +} + +/* + * dav_alloc -- allocates a new object + */ +DAV_FUNC_EXPORT int +dav_alloc_v2(dav_obj_t *pop, uint64_t *offp, size_t size, uint64_t type_num, uint64_t flags, + dav_constr constructor, void *arg) +{ + DAV_DBG(3, "pop %p offp %p size %zu type_num %llx flags %llx constructor %p arg %p", pop, + offp, size, (unsigned long long)type_num, (unsigned long long)flags, constructor, + arg); + + if (size == 0) { + ERR("allocation with size 0"); + errno = EINVAL; + return -1; + } + + if (flags & ~DAV_TX_XALLOC_VALID_FLAGS) { + ERR("unknown flags 0x%" PRIx64, flags & ~DAV_TX_XALLOC_VALID_FLAGS); + errno = EINVAL; + return -1; + } + + DAV_API_START(); + int ret = obj_alloc_construct(pop, offp, size, type_num, flags, constructor, arg); + if (ret) { + errno = ret; + ret = -1; + } + + DAV_API_END(); + return ret; +} + +/* + * dav_free -- frees an existing object + */ +DAV_FUNC_EXPORT void 
+dav_free_v2(dav_obj_t *pop, uint64_t off) +{ + struct operation_context *ctx; + int rc; + + DAV_DBG("oid.off 0x%016" PRIx64, off); + + if (off == 0) + return; + + DAV_API_START(); + + ASSERTne(pop, NULL); + ASSERT(OBJ_OFF_IS_VALID(pop, off)); + rc = lw_tx_begin(pop); + D_ASSERT(rc == 0); + ctx = pop->external; + operation_start(ctx); + + palloc_operation(pop->do_heap, off, NULL, 0, NULL, NULL, + 0, 0, 0, 0, ctx); + + lw_tx_end(pop, NULL); + DAV_API_END(); +} + +/* + * dav_memcpy_persist -- dav version of memcpy + */ +DAV_FUNC_EXPORT void * +dav_memcpy_persist_v2(dav_obj_t *pop, void *dest, const void *src, + size_t len) +{ + int rc; + + DAV_DBG("pop %p dest %p src %p len %zu", pop, dest, src, len); + D_ASSERT((dav_tx_stage_v2() == DAV_TX_STAGE_NONE)); + + DAV_API_START(); + rc = lw_tx_begin(pop); + D_ASSERT(rc == 0); + + void *ptr = mo_wal_memcpy(&pop->p_ops, dest, src, len, 0); + + lw_tx_end(pop, NULL); + DAV_API_END(); + return ptr; +} + +/* + * dav_reserve -- reserves a single object + */ +DAV_FUNC_EXPORT uint64_t +dav_reserve_v2(dav_obj_t *pop, struct dav_action *act, size_t size, uint64_t type_num, + uint64_t flags) +{ + struct constr_args carg; + int tx_inprogress = 0; + int rc; + + DAV_DBG(3, "pop %p act %p size %zu type_num %llx flags %llx", pop, act, size, + (unsigned long long)type_num, (unsigned long long)flags); + + if (flags & ~DAV_ACTION_XRESERVE_VALID_FLAGS) { + ERR("unknown flags 0x%" PRIx64, flags & ~DAV_ACTION_XRESERVE_VALID_FLAGS); + errno = EINVAL; + return 0; + } + + if (get_tx()->stage != DAV_TX_STAGE_NONE) + tx_inprogress = 1; + + DAV_API_START(); + if (!tx_inprogress) { + rc = lw_tx_begin(pop); + if (rc) + return 0; + } + + carg.zero_init = flags & DAV_FLAG_ZERO; + carg.constructor = NULL; + carg.arg = NULL; + + if (palloc_reserve(pop->do_heap, size, constructor_alloc, &carg, type_num, 0, + CLASS_ID_FROM_FLAG(flags), EZONE_ID_FROM_FLAG(flags), act) != 0) { + DAV_API_END(); + return 0; + } + + if (!tx_inprogress) + lw_tx_end(pop, NULL); + DAV_API_END(); + return act->heap.offset; +} + +/* + * dav_defer_free -- creates a deferred free action + */ +DAV_FUNC_EXPORT void +dav_defer_free_v2(dav_obj_t *pop, uint64_t off, struct dav_action *act) +{ + ASSERT(off != 0); + ASSERT(OBJ_OFF_IS_VALID(pop, off)); + palloc_defer_free(pop->do_heap, off, act); +} + +#if 0 +/* + * dav_publish -- publishes a collection of actions + */ +int +dav_publish(dav_obj_t *pop, struct dav_action *actv, size_t actvcnt) +{ + DAV_API_START(); + struct operation_context *ctx = pmalloc_operation_hold(pop); + + size_t entries_size = actvcnt * sizeof(struct ulog_entry_val); + + if (operation_reserve(ctx, entries_size) != 0) { + DAV_API_END(); + return -1; + } + + palloc_publish(&pop->do_heap, actv, actvcnt, ctx); + + pmalloc_operation_release(pop); + + DAV_API_END(); + return 0; +} +#endif + +/* + * dav_cancel -- cancels collection of actions + */ +DAV_FUNC_EXPORT void +dav_cancel_v2(dav_obj_t *pop, struct dav_action *actv, size_t actvcnt) +{ + DAV_DBG("actvcnt=%zu", actvcnt); + DAV_API_START(); + palloc_cancel(pop->do_heap, actv, actvcnt); + DAV_API_END(); +} + +/* + * dav_tx_publish -- publishes actions inside of a transaction, + * with no_abort option + */ +DAV_FUNC_EXPORT int +dav_tx_publish_v2(struct dav_action *actv, size_t actvcnt) +{ + struct tx *tx = get_tx(); + uint64_t flags = 0; + uint64_t off, size; + int ret; + + ASSERT_IN_TX(tx); + ASSERT_TX_STAGE_WORK(tx); + + flags |= tx_abort_on_failure_flag(tx); + + DAV_API_START(); + + if (tx_action_reserve(tx, actvcnt) != 0) { + ret = 
obj_tx_fail_err(ENOMEM, flags); + + DAV_API_END(); + return ret; + } + + for (size_t i = 0; i < actvcnt; ++i) { + VEC_PUSH_BACK(&tx->actions, actv[i]); + if (palloc_action_isalloc(&actv[i])) { + palloc_get_prange(&actv[i], &off, &size, 1); + struct tx_range_def r = {off, size, + DAV_XADD_NO_SNAPSHOT | DAV_XADD_WAL_CPTR}; + + ret = dav_tx_add_common(tx, &r); + D_ASSERT(ret == 0); + } + } + + DAV_API_END(); + return 0; +} + +/* + * dav_allot_zone_evictable -- Returns an evictable memory bucket id that can be used + * for allocations. If there are no evictable zone with sufficient free space then + * zero is returned which maps to non-evictable memory bucket. + */ +DAV_FUNC_EXPORT uint32_t +dav_allot_mb_evictable_v2(dav_obj_t *pop, int flags) +{ + uint32_t mb_id; + int err; + + D_ASSERT(flags == 0); + D_ASSERT((dav_tx_stage_v2() == DAV_TX_STAGE_NONE)); + + err = heap_get_evictable_mb(pop->do_heap, &mb_id); + if (err) { + D_ERROR("failed to get evictable mb, error = %d", err); + return 0; + } + + return mb_id; +} + +/* + * obj_realloc -- (internal) reallocate zinfo object + */ +int +obj_realloc(dav_obj_t *pop, uint64_t *offp, size_t *sizep, size_t size) +{ + struct operation_context *ctx; + struct carg_realloc carg; + int ret; + + DAV_DBG("pop %p size %zu", pop, size); + + carg.ptr = (*offp == 0) ? 0 : OBJ_OFF_TO_PTR(pop, *offp); + carg.old_size = *sizep; + carg.new_size = size; + carg.user_type = 0; + carg.constructor = NULL; + carg.zero_init = 1; + carg.arg = NULL; + + ctx = pop->external; + operation_start(ctx); + + operation_add_entry(ctx, sizep, size, ULOG_OPERATION_SET); + + ret = palloc_operation(pop->do_heap, *offp, offp, size, constructor_zrealloc_root, &carg, 0, + 0, 0, 0, ctx); + + return ret; +} diff --git a/src/common/dav_v2/tx.h b/src/common/dav_v2/tx.h new file mode 100644 index 00000000000..f3906f65465 --- /dev/null +++ b/src/common/dav_v2/tx.h @@ -0,0 +1,25 @@ +/* SPDX-License-Identifier: BSD-3-Clause */ +/* Copyright 2016-2023, Intel Corporation */ + +/* + * tx.h -- internal definitions for transactions + */ + +#ifndef __DAOS_COMMON_INTERNAL_TX_H +#define __DAOS_COMMON_INTERNAL_TX_H 1 + +#include + +#define TX_DEFAULT_RANGE_CACHE_SIZE (1 << 15) + +struct ulog_entry_base; +struct mo_ops; +/* + * tx_create_wal_entry -- convert to WAL a single ulog UNDO entry + */ +int tx_create_wal_entry(struct ulog_entry_base *e, void *arg, const struct mo_ops *p_ops); + +int +obj_realloc(dav_obj_t *pop, uint64_t *offp, size_t *sizep, size_t size); + +#endif diff --git a/src/common/dav_v2/ulog.c b/src/common/dav_v2/ulog.c new file mode 100644 index 00000000000..282ab6ae9fd --- /dev/null +++ b/src/common/dav_v2/ulog.c @@ -0,0 +1,691 @@ +/* SPDX-License-Identifier: BSD-3-Clause */ +/* Copyright 2015-2024, Intel Corporation */ + +/* + * ulog.c -- unified log implementation + */ + +#include +#include + +#include "dav_internal.h" +#include "mo_wal.h" +#include "ulog.h" +#include "obj.h" +#include "out.h" +#include "valgrind_internal.h" + +/* + * Operation flag at the three most significant bits + */ +#define ULOG_OPERATION(op) ((uint64_t)(op)) +#define ULOG_OPERATION_MASK ((uint64_t)(0b111ULL << 61ULL)) +#define ULOG_OPERATION_FROM_OFFSET(off) \ + ((ulog_operation_type) ((off) & ULOG_OPERATION_MASK)) +#define ULOG_OFFSET_MASK (~(ULOG_OPERATION_MASK)) + +#define CACHELINE_ALIGN(size) ALIGN_UP(size, CACHELINE_SIZE) +#define IS_CACHELINE_ALIGNED(ptr)\ + (((uintptr_t)(ptr) & (CACHELINE_SIZE - 1)) == 0) + +/* + * ulog_next -- retrieves the pointer to the next ulog + */ +struct ulog * +ulog_next(struct 
ulog *ulog) +{ + return ulog->next; +} + +/* + * ulog_operation -- returns the type of entry operation + */ +ulog_operation_type +ulog_entry_type(const struct ulog_entry_base *entry) +{ + return ULOG_OPERATION_FROM_OFFSET(entry->offset); +} + +/* + * ulog_offset -- returns offset + */ +uint64_t +ulog_entry_offset(const struct ulog_entry_base *entry) +{ + return entry->offset & ULOG_OFFSET_MASK; +} + +/* + * ulog_entry_size -- returns the size of a ulog entry + */ +size_t +ulog_entry_size(const struct ulog_entry_base *entry) +{ + struct ulog_entry_buf *eb; + + switch (ulog_entry_type(entry)) { +#ifdef WAL_SUPPORTS_AND_OR_OPS + case ULOG_OPERATION_AND: + case ULOG_OPERATION_OR: +#else + case ULOG_OPERATION_CLR_BITS: + case ULOG_OPERATION_SET_BITS: +#endif + case ULOG_OPERATION_SET: + return sizeof(struct ulog_entry_val); + case ULOG_OPERATION_BUF_SET: + case ULOG_OPERATION_BUF_CPY: + eb = (struct ulog_entry_buf *)entry; + return CACHELINE_ALIGN( + sizeof(struct ulog_entry_buf) + eb->size); + default: + ASSERT(0); + } + + return 0; +} + +/* + * ulog_entry_valid -- (internal) checks if a ulog entry is valid + * Returns 1 if the range is valid, otherwise 0 is returned. + */ +static int +ulog_entry_valid(struct ulog *ulog, const struct ulog_entry_base *entry) +{ + if (entry->offset == 0) + return 0; + + size_t size; + struct ulog_entry_buf *b; + + switch (ulog_entry_type(entry)) { + case ULOG_OPERATION_BUF_CPY: + case ULOG_OPERATION_BUF_SET: + size = ulog_entry_size(entry); + b = (struct ulog_entry_buf *)entry; + + uint64_t csum = util_checksum_compute(b, size, + &b->checksum, 0); + csum = util_checksum_seq(&ulog->gen_num, + sizeof(ulog->gen_num), csum); + + if (b->checksum != csum) + return 0; + break; + default: + break; + } + + return 1; +} + +/* + * ulog_construct -- initializes the ulog structure + */ +void +ulog_construct_new(struct ulog *ulog, size_t capacity, uint64_t gen_num, uint64_t flags) +{ + ASSERTne(ulog, NULL); + + ulog->capacity = capacity; + ulog->checksum = 0; + ulog->next = 0; + ulog->gen_num = gen_num; + ulog->flags = flags; + memset(ulog->unused, 0, sizeof(ulog->unused)); + + /* we only need to zero out the header of ulog's first entry */ + size_t zeroed_data = CACHELINE_ALIGN(sizeof(struct ulog_entry_base)); + /* + * We want to avoid replicating zeroes for every ulog of every + * lane, to do that, we need to use plain old memset. 
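A ulog entry stores its operation code in the three most significant bits of the same 64-bit word that carries the target offset, which is what ulog_entry_type() and ulog_entry_offset() unpack above. A minimal standalone sketch of that packing, with the two masks restated locally so it builds on its own:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* local restatement of ULOG_OPERATION_MASK / ULOG_OFFSET_MASK */
#define OP_MASK      ((uint64_t)(0b111ULL << 61ULL))
#define OFF_MASK     (~OP_MASK)
#define OP_BUF_CPY   (0b110ULL << 61ULL)

int main(void)
{
        uint64_t offset = 0x12345678ULL;        /* where the data lives */
        uint64_t word   = offset | OP_BUF_CPY;  /* packed entry->offset */

        assert((word & OFF_MASK) == offset);    /* ulog_entry_offset() */
        assert((word & OP_MASK) == OP_BUF_CPY); /* ulog_entry_type()   */
        printf("off=0x%llx op=0x%llx\n",
               (unsigned long long)(word & OFF_MASK),
               (unsigned long long)(word & OP_MASK));
        return 0;
}

The scheme assumes offsets fit in the low 61 bits of the word.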
+ */ + memset(ulog->data, 0, zeroed_data); +} + +/* + * ulog_foreach_entry -- iterates over every existing entry in the ulog + */ +int +ulog_foreach_entry(struct ulog *ulog, ulog_entry_cb cb, void *arg, const struct mo_ops *ops) +{ + struct ulog_entry_base *e; + int ret = 0; + + for (struct ulog *r = ulog; r != NULL; r = ulog_next(r)) { + for (size_t offset = 0; offset < r->capacity; ) { + e = (struct ulog_entry_base *)(r->data + offset); + if (!ulog_entry_valid(ulog, e)) + return ret; + + ret = cb(e, arg, ops); + if (ret != 0) + return ret; + + offset += ulog_entry_size(e); + } + } + + return ret; +} + +/* + * ulog_capacity -- (internal) returns the total capacity of the ulog + */ +size_t +ulog_capacity(struct ulog *ulog, size_t ulog_base_bytes) +{ + size_t capacity = ulog_base_bytes; + + ulog = ulog_next(ulog); + /* skip the first one, we count it in 'ulog_base_bytes' */ + while (ulog != NULL) { + capacity += ulog->capacity; + ulog = ulog_next(ulog); + } + + return capacity; +} + +/* + * ulog_rebuild_next_vec -- rebuilds the vector of next entries + */ +void +ulog_rebuild_next_vec(struct ulog *ulog, struct ulog_next *next) +{ + do { + if (ulog->next != 0) + VEC_PUSH_BACK(next, ulog->next); + } while ((ulog = ulog_next(ulog)) != NULL); +} + +/* + * ulog_reserve -- reserves new capacity in the ulog + */ +int +ulog_reserve(struct ulog *ulog, + size_t ulog_base_nbytes, size_t gen_num, + int auto_reserve, size_t *new_capacity, + ulog_extend_fn extend, struct ulog_next *next) +{ + if (!auto_reserve) { + D_CRIT("cannot auto reserve next ulog\n"); + return -1; + } + + size_t capacity = ulog_base_nbytes; + + VEC_FOREACH(ulog, next) { + ASSERTne(ulog, NULL); + capacity += ulog->capacity; + } + + while (capacity < *new_capacity) { + if (extend(&ulog->next, gen_num) != 0) + return -1; + VEC_PUSH_BACK(next, ulog->next); + ulog = ulog_next(ulog); + ASSERTne(ulog, NULL); + + capacity += ulog->capacity; + } + *new_capacity = capacity; + + return 0; +} + +/* + * ulog_checksum -- (internal) calculates ulog checksum + */ +static int +ulog_checksum(struct ulog *ulog, size_t ulog_base_bytes, int insert) +{ + return util_checksum(ulog, SIZEOF_ULOG(ulog_base_bytes), + &ulog->checksum, insert, 0); +} + +/* + * ulog_entry_val_create -- creates a new log value entry in the ulog + * + * This function requires at least a cacheline of space to be available in the + * ulog. + */ +struct ulog_entry_val * +ulog_entry_val_create(struct ulog *ulog, size_t offset, uint64_t *dest, + uint64_t value, ulog_operation_type type, const struct mo_ops *p_ops) +{ + struct ulog_entry_val *e = + (struct ulog_entry_val *)(ulog->data + offset); + + struct { + struct ulog_entry_val v; + struct ulog_entry_base zeroes; + } data; + COMPILE_ERROR_ON(sizeof(data) != sizeof(data.v) + sizeof(data.zeroes)); + + /* + * Write a little bit more to the buffer so that the next entry that + * resides in the log is erased. This will prevent leftovers from + * a previous, clobbered, log from being incorrectly applied. + */ + data.zeroes.offset = 0; + data.v.base.offset = + p_ops->base ? 
umem_cache_ptr2off(p_ops->umem_store, dest) : (uint64_t)dest; + data.v.base.offset |= ULOG_OPERATION(type); + data.v.value = value; + + memcpy(e, &data, sizeof(data)); + + return e; +} + +/* + * ulog_clobber_entry -- zeroes out a single log entry header + */ +void +ulog_clobber_entry(const struct ulog_entry_base *e) +{ + static const size_t aligned_entry_size = + CACHELINE_ALIGN(sizeof(struct ulog_entry_base)); + + memset((char *)e, 0, aligned_entry_size); +} + +/* + * ulog_entry_buf_create -- atomically creates a buffer entry in the log + */ +struct ulog_entry_buf * +ulog_entry_buf_create(struct ulog *ulog, size_t offset, uint64_t gen_num, + uint64_t *dest, const void *src, uint64_t size, + ulog_operation_type type, const struct mo_ops *p_ops) +{ + struct ulog_entry_buf *e = + (struct ulog_entry_buf *)(ulog->data + offset); + + /* + * Depending on the size of the source buffer, we might need to perform + * up to three separate copies: + * 1. The first cacheline, 24b of metadata and 40b of data + * If there's still data to be logged: + * 2. The entire remainder of data data aligned down to cacheline, + * for example, if there's 150b left, this step will copy only + * 128b. + * Now, we are left with between 0 to 63 bytes. If nonzero: + * 3. Create a stack allocated cacheline-sized buffer, fill in the + * remainder of the data, and copy the entire cacheline. + * + * This is done so that we avoid a cache-miss on misaligned writes. + */ + + struct ulog_entry_buf *b = alloca(CACHELINE_SIZE); + + ASSERT(p_ops->base != NULL); + b->base.offset = umem_cache_ptr2off(p_ops->umem_store, dest); + b->base.offset |= ULOG_OPERATION(type); + b->size = size; + b->checksum = 0; + + size_t bdatasize = CACHELINE_SIZE - sizeof(struct ulog_entry_buf); + size_t ncopy = MIN(size, bdatasize); + + memcpy(b->data, src, ncopy); + memset(b->data + ncopy, 0, bdatasize - ncopy); + + size_t remaining_size = ncopy > size ? 0 : size - ncopy; + + char *srcof = (char *)src + ncopy; + size_t rcopy = ALIGN_DOWN(remaining_size, CACHELINE_SIZE); + size_t lcopy = remaining_size - rcopy; + + uint8_t last_cacheline[CACHELINE_SIZE]; + + if (lcopy != 0) { + memcpy(last_cacheline, srcof + rcopy, lcopy); + memset(last_cacheline + lcopy, 0, CACHELINE_SIZE - lcopy); + } + + if (rcopy != 0) { + void *rdest = e->data + ncopy; + + ASSERT(IS_CACHELINE_ALIGNED(rdest)); + memcpy(rdest, srcof, rcopy); + } + + if (lcopy != 0) { + void *ldest = e->data + ncopy + rcopy; + + ASSERT(IS_CACHELINE_ALIGNED(ldest)); + + memcpy(ldest, last_cacheline, CACHELINE_SIZE); + } + + b->checksum = util_checksum_seq(b, CACHELINE_SIZE, 0); + if (rcopy != 0) + b->checksum = util_checksum_seq(srcof, rcopy, b->checksum); + if (lcopy != 0) + b->checksum = util_checksum_seq(last_cacheline, + CACHELINE_SIZE, b->checksum); + + b->checksum = util_checksum_seq(&gen_num, sizeof(gen_num), + b->checksum); + + ASSERT(IS_CACHELINE_ALIGNED(e)); + + memcpy(e, b, CACHELINE_SIZE); + + /* + * Allow having uninitialized data in the buffer - this requires marking + * data as defined so that comparing checksums is not reported as an + * error by memcheck. 
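ulog_entry_buf_create() above splits the payload copy into up to three cacheline-friendly pieces, as its comment describes. A standalone sketch of that split, assuming a 64-byte cacheline and a 24-byte entry header, with plain memcpy standing in for the WAL-aware copy and checksum steps:

#include <assert.h>
#include <stdint.h>
#include <string.h>

#define CL  64u  /* assumed cacheline size */
#define HDR 24u  /* assumed header bytes in the first cacheline */

/* Copy 'size' bytes from src into dst with the same three-step split:
 * the tail of the first cacheline, then the cacheline-aligned middle,
 * then one padded stack-built line for the remainder. */
static void
split_copy(uint8_t *dst, const uint8_t *src, size_t size)
{
        size_t first = CL - HDR;                  /* room next to the header */
        size_t ncopy = size < first ? size : first;

        memcpy(dst, src, ncopy);

        size_t remaining = size - ncopy;
        size_t bulk = remaining & ~((size_t)CL - 1);   /* ALIGN_DOWN */
        size_t tail = remaining - bulk;

        if (bulk)
                memcpy(dst + ncopy, src + ncopy, bulk);

        if (tail) {
                uint8_t last[CL];

                memcpy(last, src + ncopy + bulk, tail);
                memset(last + tail, 0, CL - tail);     /* pad the line */
                memcpy(dst + ncopy + bulk, last, CL);  /* one full line */
        }
}

int main(void)
{
        uint8_t src[150], dst[256] = {0};

        for (size_t i = 0; i < sizeof(src); i++)
                src[i] = (uint8_t)i;

        split_copy(dst, src, sizeof(src));
        assert(memcmp(dst, src, sizeof(src)) == 0);
        return 0;
}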
+ */ + VALGRIND_DO_MAKE_MEM_DEFINED(e->data, ncopy + rcopy + lcopy); + VALGRIND_DO_MAKE_MEM_DEFINED(&e->checksum, sizeof(e->checksum)); + + ASSERT(ulog_entry_valid(ulog, &e->base)); + + return e; +} + +/* + * ulog_entry_apply -- applies modifications of a single ulog entry + */ +void +ulog_entry_apply(const struct ulog_entry_base *e, int persist, + const struct mo_ops *p_ops) +{ + ulog_operation_type t = ulog_entry_type(e); + uint64_t offset = ulog_entry_offset(e); + size_t dst_size = sizeof(uint64_t); + struct ulog_entry_val *ev; + struct ulog_entry_buf *eb; + uint16_t nbits; + uint32_t pos; + uint64_t bmask; + uint64_t *dst; + + dst = p_ops->base ? umem_cache_off2ptr(p_ops->umem_store, offset) : (uint64_t *)offset; + + SUPPRESS_UNUSED(persist); + + switch (t) { +#ifdef WAL_SUPPORTS_AND_OR_OPS + case ULOG_OPERATION_AND: + ev = (struct ulog_entry_val *)e; + + VALGRIND_ADD_TO_TX(dst, dst_size); + *dst &= ev->value; + break; + case ULOG_OPERATION_OR: + ev = (struct ulog_entry_val *)e; + + VALGRIND_ADD_TO_TX(dst, dst_size); + *dst |= ev->value; + break; +#else + case ULOG_OPERATION_CLR_BITS: + ev = (struct ulog_entry_val *)e; + pos = ULOG_ENTRY_VAL_TO_POS(ev->value); + nbits = ULOG_ENTRY_VAL_TO_BITS(ev->value); + if (nbits == RUN_BITS_PER_VALUE) + bmask = UINT64_MAX; + else + bmask = ((1ULL << nbits) - 1ULL) << pos; + + VALGRIND_ADD_TO_TX(dst, dst_size); + *dst &= ~bmask; + break; + case ULOG_OPERATION_SET_BITS: + ev = (struct ulog_entry_val *)e; + pos = ULOG_ENTRY_VAL_TO_POS(ev->value); + nbits = ULOG_ENTRY_VAL_TO_BITS(ev->value); + if (nbits == RUN_BITS_PER_VALUE) + bmask = UINT64_MAX; + else + bmask = ((1ULL << nbits) - 1ULL) << pos; + + VALGRIND_ADD_TO_TX(dst, dst_size); + *dst |= bmask; + break; +#endif + case ULOG_OPERATION_SET: + ev = (struct ulog_entry_val *)e; + + VALGRIND_ADD_TO_TX(dst, dst_size); + *dst = ev->value; + break; + case ULOG_OPERATION_BUF_CPY: + eb = (struct ulog_entry_buf *)e; + + dst_size = eb->size; + VALGRIND_ADD_TO_TX(dst, dst_size); + mo_wal_memcpy(p_ops, dst, eb->data, eb->size, 0); + break; + case ULOG_OPERATION_BUF_SET: + default: + ASSERT(0); + } + VALGRIND_REMOVE_FROM_TX(dst, dst_size); +} + +/* + * ulog_process_entry -- (internal) processes a single ulog entry + */ +static int +ulog_process_entry(struct ulog_entry_base *e, void *arg, + const struct mo_ops *p_ops) +{ + /* suppress unused-parameter errors */ + SUPPRESS_UNUSED(arg); + + ulog_entry_apply(e, 0, p_ops); + + return 0; +} +/* + * ulog_inc_gen_num -- (internal) increments gen num in the ulog + */ +static void +ulog_inc_gen_num(struct ulog *ulog) +{ + ulog->gen_num++; +} + +/* + * ulog_free_next -- free all ulogs starting from the indicated one. + * Function returns 1 if any ulog have been freed or unpinned, 0 otherwise. 
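For the SET_BITS/CLR_BITS cases above, the entry value packs the starting bit position and the bit count into one 64-bit word, and ulog_entry_apply() turns that into a mask. A small self-contained restatement of the encoding and of the mask construction (the full-width RUN_BITS_PER_VALUE special case is left out here):

#include <assert.h>
#include <stdint.h>

/* bit count in the low 16 bits, start position above them */
#define OPS_POS 16
#define TO_VAL(pos, nbits) (((uint64_t)(nbits)) | ((uint64_t)(pos) << OPS_POS))
#define VAL_TO_BITS(v)     ((v) & ((1ULL << OPS_POS) - 1))
#define VAL_TO_POS(v)      ((v) >> OPS_POS)

int main(void)
{
        uint64_t word  = 0;
        uint64_t value = TO_VAL(4, 3);      /* touch 3 bits starting at bit 4 */
        uint32_t pos   = (uint32_t)VAL_TO_POS(value);
        uint16_t nbits = (uint16_t)VAL_TO_BITS(value);

        /* same mask construction as ulog_entry_apply(); nbits < 64 here,
         * so the shift is well defined */
        uint64_t bmask = ((1ULL << nbits) - 1ULL) << pos;

        word |= bmask;                      /* ULOG_OPERATION_SET_BITS */
        assert(word == 0x70);               /* bits 4, 5 and 6 */

        word &= ~bmask;                     /* ULOG_OPERATION_CLR_BITS */
        assert(word == 0);
        return 0;
}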
+ */ +int +ulog_free_next(struct ulog *u, ulog_free_fn ulog_free) +{ + int ret = 0; + + if (u == NULL) + return ret; + + VEC(, struct ulog **) ulogs_internal_except_first; + VEC_INIT(&ulogs_internal_except_first); + + while (u->next != 0) { + if (VEC_PUSH_BACK(&ulogs_internal_except_first, + &u->next) != 0) { + /* this is fine, it will just use more memory */ + DAV_DBG("unable to free transaction logs memory"); + goto out; + } + u = u->next; + } + + /* free non-user defined logs */ + struct ulog **ulog_ptr; + + VEC_FOREACH_REVERSE(ulog_ptr, &ulogs_internal_except_first) { + ulog_free(*ulog_ptr); + *ulog_ptr = NULL; + ret = 1; + } + +out: + VEC_DELETE(&ulogs_internal_except_first); + return ret; +} + +/* + * ulog_clobber -- zeroes the metadata of the ulog + */ +void +ulog_clobber(struct ulog *dest, struct ulog_next *next) +{ + struct ulog empty; + + memset(&empty, 0, sizeof(empty)); + + if (next != NULL) + empty.next = VEC_SIZE(next) == 0 ? 0 : VEC_FRONT(next); + else + empty.next = dest->next; + + memcpy(dest, &empty, sizeof(empty)); +} + +/* + * ulog_clobber_data -- zeroes out 'nbytes' of data in the logs + */ +int +ulog_clobber_data(struct ulog *ulog_first, + struct ulog_next *next, ulog_free_fn ulog_free, + unsigned flags) +{ + ASSERTne(ulog_first, NULL); + + /* In case of abort we need to increment counter in the first ulog. */ + if (flags & ULOG_INC_FIRST_GEN_NUM) + ulog_inc_gen_num(ulog_first); + + /* + * In the case of abort or commit, we are not going to free all ulogs, + * but rather increment the generation number to be consistent in the + * first two ulogs. + */ + struct ulog *ulog_second = VEC_SIZE(next) == 0 ? 0 : *VEC_GET(next, 0); + + if (ulog_second && !(flags & ULOG_FREE_AFTER_FIRST)) + /* + * We want to keep gen_nums consistent between ulogs. + * If the transaction will commit successfully we'll reuse the + * second buffer (third and next ones will be freed anyway). + * If the application will crash we'll free 2nd ulog on + * recovery, which means we'll never read gen_num of the + * second ulog in case of an ungraceful shutdown. + */ + ulog_inc_gen_num(ulog_second); + + struct ulog *u; + + /* + * To make sure that transaction logs do not occupy too + * much of space, all of them, expect for the first one, + * are freed at the end of the operation. The reasoning for + * this is that pmalloc() is a relatively cheap operation for + * transactions where many hundreds of kilobytes are being + * snapshot, and so, allocating and freeing the buffer for + * each transaction is an acceptable overhead for the average + * case. 
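ulog_free_next() walks the chain once to collect the follow-on logs and then releases them in reverse, so the first, preallocated log always survives. A much-simplified sketch of that idea, using a plain singly linked list, a fixed array in place of VEC(), and free() in place of the ulog_free callback:

#include <assert.h>
#include <stdlib.h>

struct log {
        struct log *next;   /* only the link matters for this sketch */
};

/* free every log after the first one; returns 1 if anything was freed */
static int
free_after_first(struct log *first)
{
        struct log *chain[16];
        size_t n = 0;
        int freed = 0;

        for (struct log *u = first; u && u->next && n < 16; u = u->next)
                chain[n++] = u->next;

        while (n > 0) {                 /* release in reverse order */
                free(chain[--n]);
                freed = 1;
        }
        first->next = NULL;
        return freed;
}

int main(void)
{
        struct log *a = calloc(1, sizeof(*a));  /* preallocated first log */
        struct log *b = calloc(1, sizeof(*b));
        struct log *c = calloc(1, sizeof(*c));

        a->next = b;
        b->next = c;

        assert(free_after_first(a) == 1);       /* b and c released */
        assert(a->next == NULL);
        free(a);
        return 0;
}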
+ */ + if (flags & ULOG_FREE_AFTER_FIRST) + u = ulog_first; + else + u = ulog_second; + + if (u == NULL) + return 0; + + return ulog_free_next(u, ulog_free); +} + +/* + * ulog_process -- process ulog entries + */ +void +ulog_process(struct ulog *ulog, ulog_check_offset_fn check, + const struct mo_ops *p_ops) +{ + /* suppress unused-parameter errors */ + SUPPRESS_UNUSED(check); + +#ifdef DAV_EXTRA_DEBUG + if (check) + ulog_check(ulog, check, p_ops); +#endif + + ulog_foreach_entry(ulog, ulog_process_entry, NULL, p_ops); + mo_wal_drain(p_ops); +} + +/* + * ulog_base_nbytes -- (internal) counts the actual of number of bytes + * occupied by the ulog + */ +size_t +ulog_base_nbytes(struct ulog *ulog) +{ + size_t offset = 0; + struct ulog_entry_base *e; + + for (offset = 0; offset < ulog->capacity; ) { + e = (struct ulog_entry_base *)(ulog->data + offset); + if (!ulog_entry_valid(ulog, e)) + break; + + offset += ulog_entry_size(e); + } + + return offset; +} + +/* + * ulog_recovery_needed -- checks if the logs needs recovery + */ +int +ulog_recovery_needed(struct ulog *ulog, int verify_checksum) +{ + size_t nbytes = MIN(ulog_base_nbytes(ulog), ulog->capacity); + + if (nbytes == 0) + return 0; + + if (verify_checksum && !ulog_checksum(ulog, nbytes, 0)) + return 0; + + return 1; +} + +/* + * ulog_check_entry -- + * (internal) checks consistency of a single ulog entry + */ +static int +ulog_check_entry(struct ulog_entry_base *e, void *arg, const struct mo_ops *p_ops) +{ + uint64_t offset = ulog_entry_offset(e); + ulog_check_offset_fn check = arg; + + if (!check(p_ops->base, offset)) { + DAV_DBG("ulog %p invalid offset %" PRIu64, + e, e->offset); + return -1; + } + + return offset == 0 ? -1 : 0; +} + +/* + * ulog_check -- (internal) check consistency of ulog entries + */ +int +ulog_check(struct ulog *ulog, ulog_check_offset_fn check, const struct mo_ops *p_ops) +{ + DAV_DBG("ulog %p", ulog); + + return ulog_foreach_entry(ulog, + ulog_check_entry, check, p_ops); +} diff --git a/src/common/dav_v2/ulog.h b/src/common/dav_v2/ulog.h new file mode 100644 index 00000000000..6be0cd9b3ed --- /dev/null +++ b/src/common/dav_v2/ulog.h @@ -0,0 +1,167 @@ +/* SPDX-License-Identifier: BSD-3-Clause */ +/* Copyright 2015-2023, Intel Corporation */ + +/* + * ulog.h -- unified log public interface + */ + +#ifndef __DAOS_COMMON_ULOG_H +#define __DAOS_COMMON_ULOG_H 1 + +#include +#include + +#include "util.h" +#include "vec.h" +#include "mo_wal.h" + +struct ulog_entry_base { + uint64_t offset; /* offset with operation type flag */ +}; + +/* + * ulog_entry_val -- log entry + */ +struct ulog_entry_val { + struct ulog_entry_base base; + uint64_t value; /* value to be applied */ +}; + +/* + * ulog_entry_buf - ulog buffer entry + */ +struct ulog_entry_buf { + struct ulog_entry_base base; /* offset with operation type flag */ + uint64_t checksum; /* checksum of the entire log entry */ + uint64_t size; /* size of the buffer to be modified */ + uint8_t data[]; /* content to fill in */ +}; + +#define ULOG_UNUSED ((CACHELINE_SIZE - 40) / 8) +/* + * This structure *must* be located at a cacheline boundary. To achieve this, + * the next field is always allocated with extra padding, and then the offset + * is additionally aligned. 
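The ULOG_UNUSED padding above is sized so that the fixed part of the ulog ends exactly on a cacheline before the data[] payload starts. A standalone compile-time check of that layout, with the header restated locally and a plain pointer standing in for the next link (the 64-byte total holds with either 4- or 8-byte pointers because of alignment padding):

#include <stdint.h>

#define CL         64
#define HDR_FIELDS 40                       /* 5 x 8-byte fields */
#define UNUSED_QW  ((CL - HDR_FIELDS) / 8)  /* == ULOG_UNUSED */

struct ulog_hdr {
        uint64_t checksum;
        void    *next;
        uint64_t capacity;
        uint64_t gen_num;
        uint64_t flags;
        uint64_t unused[UNUSED_QW];
};

_Static_assert(sizeof(struct ulog_hdr) == CL,
               "ulog header must occupy exactly one cacheline");

int main(void) { return 0; }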
+ */ +#define ULOG(capacity_bytes) {\ + /* 64 bytes of metadata */\ + uint64_t checksum; /* checksum of ulog header and its entries */\ + struct ulog *next; /* offset of ulog extension */\ + uint64_t capacity; /* capacity of this ulog in bytes */\ + uint64_t gen_num; /* generation counter */\ + uint64_t flags; /* ulog flags */\ + uint64_t unused[ULOG_UNUSED]; /* must be 0 */\ + uint8_t data[capacity_bytes]; /* N bytes of data */\ +} + +#define SIZEOF_ULOG(base_capacity)\ +(sizeof(struct ulog) + base_capacity) + +/* + * Ulog buffer allocated by the user must be marked by this flag. + * It is important to not free it at the end: + * what user has allocated - user should free himself. + */ +#define ULOG_USER_OWNED (1U << 0) + +/* use this for allocations of aligned ulog extensions */ +#define SIZEOF_ALIGNED_ULOG(base_capacity)\ +ALIGN_UP(SIZEOF_ULOG(base_capacity + (2 * CACHELINE_SIZE)), CACHELINE_SIZE) + +struct ulog ULOG(0); + +VEC(ulog_next, struct ulog *); + +typedef uint64_t ulog_operation_type; + +#define ULOG_OPERATION_SET (0b000ULL << 61ULL) +#ifdef WAL_SUPPORTS_AND_OR_OPS +#define ULOG_OPERATION_AND (0b001ULL << 61ULL) +#define ULOG_OPERATION_OR (0b010ULL << 61ULL) +#else +#define ULOG_OPERATION_CLR_BITS (0b001ULL << 61ULL) +#define ULOG_OPERATION_SET_BITS (0b010ULL << 61ULL) +#endif +#define ULOG_OPERATION_BUF_SET (0b101ULL << 61ULL) +#define ULOG_OPERATION_BUF_CPY (0b110ULL << 61ULL) + +#ifndef WAL_SUPPORTS_AND_OR_OPS +#endif + +#ifdef WAL_SUPPORTS_AND_OR_OPS +#define ULOG_ENTRY_IS_BIT_OP(opc) ((opc == ULOG_OPERATION_AND) || \ + (opc == ULOG_OPERATION_OR)) +#else +#define ULOG_ENTRY_IS_BIT_OP(opc) ((opc == ULOG_OPERATION_CLR_BITS) || \ + (opc == ULOG_OPERATION_SET_BITS)) +#define ULOG_ENTRY_OPS_POS 16 /* bits' pos at value:16 */ +#define ULOG_ENTRY_OPS_BITS_MASK ((1ULL << ULOG_ENTRY_OPS_POS) - 1) +#define ULOG_ENTRY_VAL_TO_BITS(val) ((val) & ULOG_ENTRY_OPS_BITS_MASK) +#define ULOG_ENTRY_VAL_TO_POS(val) ((val) >> ULOG_ENTRY_OPS_POS) +#define ULOG_ENTRY_OPS_POS_MASK (RUN_BITS_PER_VALUE - 1ULL) +#define ULOG_ENTRY_TO_VAL(pos, nbits) (((uint64_t)(nbits) & ULOG_ENTRY_OPS_BITS_MASK) | \ + ((pos) & ULOG_ENTRY_OPS_POS_MASK) << ULOG_ENTRY_OPS_POS) +#endif + +/* immediately frees all associated ulog structures */ +#define ULOG_FREE_AFTER_FIRST (1U << 0) +/* increments gen_num of the first, preallocated, ulog */ +#define ULOG_INC_FIRST_GEN_NUM (1U << 1) + +typedef int (*ulog_check_offset_fn)(void *ctx, uint64_t offset); +typedef int (*ulog_extend_fn)(struct ulog **, uint64_t); +typedef int (*ulog_entry_cb)(struct ulog_entry_base *e, void *arg, + const struct mo_ops *p_ops); +typedef void (*ulog_free_fn)(struct ulog *ptr); + +struct ulog *ulog_next(struct ulog *ulog); + +void ulog_construct(uint64_t offset, size_t capacity, uint64_t gen_num, + int flush, uint64_t flags, const struct mo_ops *p_ops); +void ulog_construct_new(struct ulog *ulog, size_t capacity, uint64_t gen_num, + uint64_t flags); + +size_t ulog_capacity(struct ulog *ulog, size_t ulog_base_bytes); +void ulog_rebuild_next_vec(struct ulog *ulog, struct ulog_next *next); + +int ulog_foreach_entry(struct ulog *ulog, + ulog_entry_cb cb, void *arg, const struct mo_ops *ops); + +int ulog_reserve(struct ulog *ulog, + size_t ulog_base_nbytes, size_t gen_num, + int auto_reserve, size_t *new_capacity_bytes, + ulog_extend_fn extend, struct ulog_next *next); + +int ulog_free_next(struct ulog *u, ulog_free_fn ulog_free); +void ulog_clobber(struct ulog *dest, struct ulog_next *next); +int ulog_clobber_data(struct ulog *dest, + struct ulog_next 
*next, ulog_free_fn ulog_free, unsigned flags); +void ulog_clobber_entry(const struct ulog_entry_base *e); + +void ulog_process(struct ulog *ulog, ulog_check_offset_fn check, + const struct mo_ops *p_ops); + +size_t ulog_base_nbytes(struct ulog *ulog); +int ulog_recovery_needed(struct ulog *ulog, int verify_checksum); + +uint64_t ulog_entry_offset(const struct ulog_entry_base *entry); +ulog_operation_type ulog_entry_type(const struct ulog_entry_base *entry); + +struct ulog_entry_val * +ulog_entry_val_create(struct ulog *ulog, size_t offset, uint64_t *dest, uint64_t value, + ulog_operation_type type, const struct mo_ops *p_ops); + +struct ulog_entry_buf * +ulog_entry_buf_create(struct ulog *ulog, size_t offset, uint64_t gen_num, + uint64_t *dest, const void *src, uint64_t size, + ulog_operation_type type, const struct mo_ops *p_ops); + +void ulog_entry_apply(const struct ulog_entry_base *e, int persist, + const struct mo_ops *p_ops); + +size_t ulog_entry_size(const struct ulog_entry_base *entry); + +int ulog_check(struct ulog *ulog, ulog_check_offset_fn check, + const struct mo_ops *p_ops); + +#endif /* __DAOS_COMMON_ULOG_H */ diff --git a/src/common/dav_v2/util.c b/src/common/dav_v2/util.c new file mode 100644 index 00000000000..f3f6850997a --- /dev/null +++ b/src/common/dav_v2/util.c @@ -0,0 +1,223 @@ +/* SPDX-License-Identifier: BSD-3-Clause */ +/* Copyright 2014-2023, Intel Corporation */ + +/* + * util.c -- very basic utilities + */ + +#include +#include +#include + +#include "util.h" +#include "valgrind_internal.h" + + +#if ANY_VG_TOOL_ENABLED +/* Initialized to true if the process is running inside Valgrind. */ +unsigned _On_valgrind; +#endif + +#if VG_HELGRIND_ENABLED +/* Initialized to true if the process is running inside Valgrind helgrind. */ +unsigned _On_helgrind; +#endif + +#if VG_DRD_ENABLED +/* Initialized to true if the process is running inside Valgrind drd. */ +unsigned _On_drd; +#endif + +#if VG_HELGRIND_ENABLED || VG_DRD_ENABLED +/* Initialized to true if the process is running inside Valgrind drd or hg. */ +unsigned _On_drd_or_hg; +#endif + +#if VG_MEMCHECK_ENABLED +/* Initialized to true if the process is running inside Valgrind memcheck. */ +unsigned _On_memcheck; +#endif + +#if VG_TXINFO_ENABLED +/* true if DAV API and TX-related messages has to be enabled in Valgrind log. */ +int _Vg_txinfo_emit; +#endif /* VG_TXINFO_ENABLED */ + +/* + * util_is_zeroed -- check if given memory range is all zero + */ +int +util_is_zeroed(const void *addr, size_t len) +{ + const char *a = addr; + + if (len == 0) + return 1; + + if (a[0] == 0 && memcmp(a, a + 1, len - 1) == 0) + return 1; + + return 0; +} + +/* + * util_checksum_compute -- compute Fletcher64-like checksum + * + * csump points to where the checksum lives, so that location + * is treated as zeros while calculating the checksum. The + * checksummed data is assumed to be in little endian order. 
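A minimal standalone version of the sequential form of this Fletcher64-like checksum folds the buffer 32 bits at a time into a running (lo32, hi32) pair; it also demonstrates the chaining property that ulog_entry_buf_create() relies on when it feeds the header, the bulk data and the padded tail through util_checksum_seq() in separate calls:

#include <assert.h>
#include <endian.h>
#include <stdint.h>
#include <string.h>

/* length must be a multiple of 4, as in util_checksum_seq() */
static uint64_t
fletcher64_seq(const void *addr, size_t len, uint64_t csum)
{
        const uint32_t *p = addr;
        const uint32_t *end = (const uint32_t *)((const char *)addr + len);
        uint32_t lo = (uint32_t)csum;
        uint32_t hi = (uint32_t)(csum >> 32);

        assert(len % 4 == 0);
        while (p < end) {
                lo += le32toh(*p++);
                hi += lo;
        }
        return (uint64_t)hi << 32 | lo;
}

int main(void)
{
        uint32_t buf[4];

        memset(buf, 0xab, sizeof(buf));

        /* one pass and two chained passes must produce the same value */
        uint64_t whole = fletcher64_seq(buf, sizeof(buf), 0);
        uint64_t split = fletcher64_seq((const char *)buf + 8, 8,
                                        fletcher64_seq(buf, 8, 0));

        assert(whole == split);
        return 0;
}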
+ */ +uint64_t +util_checksum_compute(void *addr, size_t len, uint64_t *csump, size_t skip_off) +{ + if (len % 4 != 0) + abort(); + + uint32_t *p32 = addr; + uint32_t *p32end = (uint32_t *)((char *)addr + len); + uint32_t *skip; + uint32_t lo32 = 0; + uint32_t hi32 = 0; + + if (skip_off) + skip = (uint32_t *)((char *)addr + skip_off); + else + skip = (uint32_t *)((char *)addr + len); + + while (p32 < p32end) + if (p32 == (uint32_t *)csump || p32 >= skip) { + /* lo32 += 0; treat first 32-bits as zero */ + p32++; + hi32 += lo32; + /* lo32 += 0; treat second 32-bits as zero */ + p32++; + hi32 += lo32; + } else { + lo32 += le32toh(*p32); + ++p32; + hi32 += lo32; + } + + return (uint64_t)hi32 << 32 | lo32; +} + +/* + * util_checksum -- compute Fletcher64-like checksum + * + * csump points to where the checksum lives, so that location + * is treated as zeros while calculating the checksum. + * If insert is true, the calculated checksum is inserted into + * the range at *csump. Otherwise the calculated checksum is + * checked against *csump and the result returned (true means + * the range checksummed correctly). + */ +int +util_checksum(void *addr, size_t len, uint64_t *csump, + int insert, size_t skip_off) +{ + uint64_t csum = util_checksum_compute(addr, len, csump, skip_off); + + if (insert) { + *csump = htole64(csum); + return 1; + } + + return *csump == htole64(csum); +} + +/* + * util_checksum_seq -- compute sequential Fletcher64-like checksum + * + * Merges checksum from the old buffer with checksum for current buffer. + */ +uint64_t +util_checksum_seq(const void *addr, size_t len, uint64_t csum) +{ + if (len % 4 != 0) + abort(); + const uint32_t *p32 = addr; + const uint32_t *p32end = (const uint32_t *)((const char *)addr + len); + uint32_t lo32 = (uint32_t)csum; + uint32_t hi32 = (uint32_t)(csum >> 32); + + while (p32 < p32end) { + lo32 += le32toh(*p32); + ++p32; + hi32 += lo32; + } + return (uint64_t)hi32 << 32 | lo32; +} + +/* + * util_init -- initialize the utils + * + * This is called from the library initialization code. + */ +#if ANY_VG_TOOL_ENABLED +__attribute__((constructor)) +static void +_util_init(void) +{ + util_init(); +} +#endif + +void +util_init(void) +{ +#if ANY_VG_TOOL_ENABLED + _On_valgrind = RUNNING_ON_VALGRIND; +#endif + +#if VG_MEMCHECK_ENABLED + if (_On_valgrind) { + unsigned tmp; + unsigned result; + unsigned res = VALGRIND_GET_VBITS(&tmp, &result, sizeof(tmp)); + + _On_memcheck = res ? 1 : 0; + } else { + _On_memcheck = 0; + } +#endif + +#if VG_DRD_ENABLED + if (_On_valgrind) + _On_drd = DRD_GET_DRD_THREADID ? 1 : 0; + else + _On_drd = 0; +#endif + +#if VG_HELGRIND_ENABLED + if (_On_valgrind) { + unsigned tmp; + unsigned result; + /* + * As of now (pmem-3.15) VALGRIND_HG_GET_ABITS is broken on + * the upstream version of Helgrind headers. It generates + * a sign-conversion error and actually returns UINT32_MAX-1 + * when not running under Helgrind. + */ + long res = VALGRIND_HG_GET_ABITS(&tmp, &result, sizeof(tmp)); + + _On_helgrind = res != -2 ? 
1 : 0; + } else { + _On_helgrind = 0; + } +#endif + +#if VG_DRD_ENABLED || VG_HELGRIND_ENABLED + _On_drd_or_hg = (unsigned)(On_helgrind + On_drd); +#endif + +#if VG_TXINFO_ENABLED + if (_On_valgrind) { + char *txinfo_env = secure_getenv("D_DAV_VG_TXINFO"); + + if (txinfo_env) + _Vg_txinfo_emit = atoi(txinfo_env); + } else { + _Vg_txinfo_emit = 0; + } +#endif +} diff --git a/src/common/dav_v2/util.h b/src/common/dav_v2/util.h new file mode 100644 index 00000000000..537898edd64 --- /dev/null +++ b/src/common/dav_v2/util.h @@ -0,0 +1,202 @@ +/* SPDX-License-Identifier: BSD-3-Clause */ +/* Copyright 2014-2023, Intel Corporation */ +/* + * Copyright (c) 2016-2020, Microsoft Corporation. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + * * Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +/* + * util.h -- internal definitions for util module + */ + +#ifndef __DAOS_COMMON_UTIL_H +#define __DAOS_COMMON_UTIL_H 1 + +#include +#include +#include +#include +#include +#include +#include + +#if defined(__x86_64) || defined(_M_X64) || defined(__aarch64__) || \ + defined(__riscv) +#define PAGESIZE 4096 +#elif defined(__PPC64__) +#define PAGESIZE 65536 +#else +#error unable to recognize ISA at compile time +#endif + +#if defined(__x86_64) || defined(_M_X64) || defined(__aarch64__) || \ + defined(__riscv) +#define CACHELINE_SIZE 64ULL +#elif defined(__PPC64__) +#define CACHELINE_SIZE 128ULL +#else +#error unable to recognize architecture at compile time +#endif + +#define ALIGN_UP(size, align) (((size) + (align) - 1) & ~((align) - 1)) +#define ALIGN_DOWN(size, align) ((size) & ~((align) - 1)) + +void util_init(void); +int util_is_zeroed(const void *addr, size_t len); +uint64_t util_checksum_compute(void *addr, size_t len, uint64_t *csump, + size_t skip_off); +int util_checksum(void *addr, size_t len, uint64_t *csump, + int insert, size_t skip_off); +uint64_t util_checksum_seq(const void *addr, size_t len, uint64_t csum); + +#define force_inline __attribute__((always_inline)) inline + +typedef uint64_t ua_uint64_t __attribute__((aligned(1))); +typedef uint32_t ua_uint32_t __attribute__((aligned(1))); +typedef uint16_t ua_uint16_t __attribute__((aligned(1))); + +/* + * util_div_ceil -- divides a by b and rounds up the result + */ +static force_inline unsigned +util_div_ceil(unsigned a, unsigned b) +{ + return (unsigned)(((unsigned long)a + b - 1) / b); +} + +/* + * util_bool_compare_and_swap -- perform an atomic compare and swap + * util_fetch_and_* -- perform an operation atomically, return old value + * util_popcount -- count number of set bits + * util_lssb_index -- return index of least significant set bit, + * undefined on zero + * util_mssb_index -- return index of most significant set bit + * undefined on zero + * + * XXX assertions needed on (value != 0) in both versions of bitscans + * + */ + +/* + * ISO C11 -- 7.17.7.2 The atomic_load generic functions + * Integer width specific versions as supplement for: + * + * + * #include + * C atomic_load(volatile A *object); + * C atomic_load_explicit(volatile A *object, memory_order order); + * + * The atomic_load interface doesn't return the loaded value, but instead + * copies it to a specified address. + * + * void util_atomic_load64(volatile A *object, A *destination); + * void util_atomic_load_explicit32(volatile A *object, A *destination, + * memory_order order); + * void util_atomic_load_explicit64(volatile A *object, A *destination, + * memory_order order); + * Also, instead of generic functions, two versions are available: + * for 32 bit fundamental integers, and for 64 bit ones. 
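The ALIGN_UP/ALIGN_DOWN macros and util_div_ceil() defined above are used throughout this code for cacheline and page rounding; they assume the alignment is a power of two. A few self-contained checks of how they behave:

#include <assert.h>
#include <stddef.h>

#define ALIGN_UP(size, align)   (((size) + (align) - 1) & ~((align) - 1))
#define ALIGN_DOWN(size, align) ((size) & ~((align) - 1))

static unsigned
div_ceil(unsigned a, unsigned b)
{
        return (unsigned)(((unsigned long)a + b - 1) / b);
}

int main(void)
{
        assert(ALIGN_UP(100, 64) == 128);
        assert(ALIGN_UP(128, 64) == 128);   /* already aligned: unchanged */
        assert(ALIGN_DOWN(100, 64) == 64);
        assert(div_ceil(100, 64) == 2);
        return 0;
}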
+ */ + +#define util_atomic_load_explicit32 __atomic_load +#define util_atomic_load_explicit64 __atomic_load + +/* ISO C11 -- 7.17.7.1 The atomic_store generic functions */ +/* + * ISO C11 -- 7.17.7.1 The atomic_store generic functions + * Integer width specific versions as supplement for: + * + * #include + * void atomic_store(volatile A *object, C desired); + * void atomic_store_explicit(volatile A *object, C desired, + * memory_order order); + */ +#define util_atomic_store_explicit32 __atomic_store_n +#define util_atomic_store_explicit64 __atomic_store_n + +/* + * https://gcc.gnu.org/onlinedocs/gcc/_005f_005fsync-Builtins.html + * https://gcc.gnu.org/onlinedocs/gcc/Other-Builtins.html + * https://clang.llvm.org/docs/LanguageExtensions.html#builtin-functions + */ +#define util_bool_compare_and_swap64 __sync_bool_compare_and_swap +#define util_fetch_and_add64 __sync_fetch_and_add +#define util_fetch_and_sub64 __sync_fetch_and_sub +#define util_popcount64(value) ((unsigned char)__builtin_popcountll(value)) + +#define util_lssb_index64(value) ((unsigned char)__builtin_ctzll(value)) +#define util_mssb_index64(value) ((unsigned char)(63 - __builtin_clzll(value))) + +/* ISO C11 -- 7.17.7 Operations on atomic types */ +#define util_atomic_load64(object, dest)\ + util_atomic_load_explicit64(object, dest, memory_order_seq_cst) + +#define COMPILE_ERROR_ON(cond) ((void)sizeof(char[(cond) ? -1 : 1])) + +/* macro for counting the number of varargs (up to 9) */ +#define COUNT(...)\ + COUNT_11TH(_, ##__VA_ARGS__, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0) +#define COUNT_11TH(_11, _10, _9, _8, _7, _6, _5, _4, _3, _2, X, ...) X + +/* concatenation macro */ +#define GLUE(A, B) GLUE_I(A, B) +#define GLUE_I(A, B) A##B + +/* macro for suppressing errors from unused variables (zero to 9) */ +#define SUPPRESS_UNUSED(...)\ + GLUE(SUPPRESS_ARG_, COUNT(__VA_ARGS__))(__VA_ARGS__) +#define SUPPRESS_ARG_0(X) +#define SUPPRESS_ARG_1(X) ((void)(X)) +#define SUPPRESS_ARG_2(X, ...) do {\ + SUPPRESS_ARG_1(X); SUPPRESS_ARG_1(__VA_ARGS__);\ +} while (0) +#define SUPPRESS_ARG_3(X, ...) do {\ + SUPPRESS_ARG_1(X); SUPPRESS_ARG_2(__VA_ARGS__);\ +} while (0) +#define SUPPRESS_ARG_4(X, ...) do {\ + SUPPRESS_ARG_1(X); SUPPRESS_ARG_3(__VA_ARGS__);\ +} while (0) +#define SUPPRESS_ARG_5(X, ...) do {\ + SUPPRESS_ARG_1(X); SUPPRESS_ARG_4(__VA_ARGS__);\ +} while (0) +#define SUPPRESS_ARG_6(X, ...) do {\ + SUPPRESS_ARG_1(X); SUPPRESS_ARG_5(__VA_ARGS__);\ +} while (0) +#define SUPPRESS_ARG_7(X, ...) do {\ + SUPPRESS_ARG_1(X); SUPPRESS_ARG_6(__VA_ARGS__);\ +} while (0) +#define SUPPRESS_ARG_8(X, ...) do {\ + SUPPRESS_ARG_1(X); SUPPRESS_ARG_7(__VA_ARGS__);\ +} while (0) +#define SUPPRESS_ARG_9(X, ...) 
do {\ + SUPPRESS_ARG_1(X); SUPPRESS_ARG_8(__VA_ARGS__);\ +} while (0) + +#endif /* __DAOS_COMMON_UTIL_H */ diff --git a/src/common/dav_v2/valgrind_internal.h b/src/common/dav_v2/valgrind_internal.h new file mode 100644 index 00000000000..86fe9d47a19 --- /dev/null +++ b/src/common/dav_v2/valgrind_internal.h @@ -0,0 +1,293 @@ +/* SPDX-License-Identifier: BSD-3-Clause */ +/* Copyright 2015-2023, Intel Corporation */ + +/* + * valgrind_internal.h -- internal definitions for valgrind macros + */ + +#ifndef __DAOS_COMMON_VALGRIND_INTERNAL_H +#define __DAOS_COMMON_VALGRIND_INTERNAL_H 1 + +#ifdef D_HAS_VALGRIND +#if !defined(_WIN32) && !defined(__FreeBSD__) && !defined(__riscv) +#define VG_TXINFO_ENABLED 1 +#define VG_HELGRIND_ENABLED 1 +#define VG_MEMCHECK_ENABLED 1 +#define VG_DRD_ENABLED 1 +#endif +#endif + +#if VG_TXINFO_ENABLED || VG_HELGRIND_ENABLED || VG_MEMCHECK_ENABLED || \ + VG_DRD_ENABLED +#define ANY_VG_TOOL_ENABLED 1 +#else +#define ANY_VG_TOOL_ENABLED 0 +#endif + +#if ANY_VG_TOOL_ENABLED +extern unsigned _On_valgrind; +#define On_valgrind __builtin_expect(_On_valgrind, 0) +#include "valgrind/valgrind.h" +#else +#define On_valgrind (0) +#endif + +#if VG_HELGRIND_ENABLED +extern unsigned _On_helgrind; +#define On_helgrind __builtin_expect(_On_helgrind, 0) +#include "valgrind/helgrind.h" +#else +#define On_helgrind (0) +#endif + +#if VG_DRD_ENABLED +extern unsigned _On_drd; +#define On_drd __builtin_expect(_On_drd, 0) +#include "valgrind/drd.h" +#else +#define On_drd (0) +#endif + +#if VG_HELGRIND_ENABLED || VG_DRD_ENABLED + +extern unsigned _On_drd_or_hg; +#define On_drd_or_hg __builtin_expect(_On_drd_or_hg, 0) + +#define VALGRIND_ANNOTATE_HAPPENS_BEFORE(obj) do {\ + if (On_drd_or_hg) \ + ANNOTATE_HAPPENS_BEFORE((obj));\ +} while (0) + +#define VALGRIND_ANNOTATE_HAPPENS_AFTER(obj) do {\ + if (On_drd_or_hg) \ + ANNOTATE_HAPPENS_AFTER((obj));\ +} while (0) + +#define VALGRIND_ANNOTATE_NEW_MEMORY(addr, size) do {\ + if (On_drd_or_hg) \ + ANNOTATE_NEW_MEMORY((addr), (size));\ +} while (0) + +#define VALGRIND_ANNOTATE_IGNORE_READS_BEGIN() do {\ + if (On_drd_or_hg) \ + ANNOTATE_IGNORE_READS_BEGIN();\ +} while (0) + +#define VALGRIND_ANNOTATE_IGNORE_READS_END() do {\ + if (On_drd_or_hg) \ + ANNOTATE_IGNORE_READS_END();\ +} while (0) + +#define VALGRIND_ANNOTATE_IGNORE_WRITES_BEGIN() do {\ + if (On_drd_or_hg) \ + ANNOTATE_IGNORE_WRITES_BEGIN();\ +} while (0) + +#define VALGRIND_ANNOTATE_IGNORE_WRITES_END() do {\ + if (On_drd_or_hg) \ + ANNOTATE_IGNORE_WRITES_END();\ +} while (0) + +/* Supported by both helgrind and drd. 
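Every VALGRIND_ANNOTATE_* wrapper in this header follows the same pattern: a compile-time gate selects either a runtime-checked call into the tool or a no-op that still consumes its arguments, so call sites never need their own #ifdefs. A small sketch of that pattern with a hypothetical TOOL_ENABLED gate and a printf standing in for the real client request:

#include <stdio.h>

#ifdef TOOL_ENABLED
extern unsigned on_tool;                     /* runtime detection flag */
#define ANNOTATE_NEW(addr, size) do {\
        if (on_tool)\
                printf("annotate %p %zu\n", (void *)(addr), (size_t)(size));\
} while (0)
#else
/* compiled-out variant still evaluates nothing but keeps the call legal */
#define ANNOTATE_NEW(addr, size) do {\
        (void)(addr);\
        (void)(size);\
} while (0)
#endif

int main(void)
{
        char buf[32];

        /* identical call whether or not the tool headers are available */
        ANNOTATE_NEW(buf, sizeof(buf));
        return 0;
}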
*/ +#define VALGRIND_HG_DRD_DISABLE_CHECKING(addr, size) do {\ + if (On_drd_or_hg) \ + VALGRIND_HG_DISABLE_CHECKING((addr), (size));\ +} while (0) + +#else + +#define On_drd_or_hg (0) + +#define VALGRIND_ANNOTATE_HAPPENS_BEFORE(obj) { (void)(obj); } + +#define VALGRIND_ANNOTATE_HAPPENS_AFTER(obj) { (void)(obj); } + +#define VALGRIND_ANNOTATE_NEW_MEMORY(addr, size) do {\ + (void) (addr);\ + (void) (size);\ +} while (0) + +#define VALGRIND_ANNOTATE_IGNORE_READS_BEGIN() do {} while (0) + +#define VALGRIND_ANNOTATE_IGNORE_READS_END() do {} while (0) + +#define VALGRIND_ANNOTATE_IGNORE_WRITES_BEGIN() do {} while (0) + +#define VALGRIND_ANNOTATE_IGNORE_WRITES_END() do {} while (0) + +#define VALGRIND_HG_DRD_DISABLE_CHECKING(addr, size) do {\ + (void) (addr);\ + (void) (size);\ +} while (0) + +#endif + +#if VG_TXINFO_ENABLED + +extern int _Vg_txinfo_emit; +#define VG_txinfo_emit __builtin_expect(_Vg_txinfo_emit, 0) + +void util_emit_log(const char *func, int order); + +#define VALGRIND_SET_CLEAN(addr, len) do {\ + (void)(addr);\ + (void)(len);\ +} while (0) + +#define VALGRIND_START_TX do {} while (0) + +#define VALGRIND_END_TX do {} while (0) + +#define VALGRIND_ADD_TO_TX(addr, len) do {\ + (void) (addr);\ + (void) (len);\ +} while (0) + +#define VALGRIND_REMOVE_FROM_TX(addr, len) do {\ + (void) (addr);\ + (void) (len);\ +} while (0) + +#define VALGRIND_ADD_TO_GLOBAL_TX_IGNORE(addr, len) do {\ + (void) (addr);\ + (void) (len);\ +} while (0) + +/* + * Logs library and function name with proper suffix + * to VG log file. + */ +#define DAV_API_START() do {\ + if (VG_txinfo_emit)\ + VALGRIND_PRINTF("%s BEGIN\n", __func__);\ +} while (0) +#define DAV_API_END() do {\ + if (VG_txinfo_emit)\ + VALGRIND_PRINTF("%s END\n", __func__);\ +} while (0) + +#else /* VG_TXINFO_ENABLED */ + +#define VG_txinfo_emit (0) + +#define VALGRIND_SET_CLEAN(addr, len) do {\ + (void) (addr);\ + (void) (len);\ +} while (0) + +#define VALGRIND_START_TX do {} while (0) + +#define VALGRIND_END_TX do {} while (0) + +#define VALGRIND_ADD_TO_TX(addr, len) do {\ + (void) (addr);\ + (void) (len);\ +} while (0) + +#define VALGRIND_REMOVE_FROM_TX(addr, len) do {\ + (void) (addr);\ + (void) (len);\ +} while (0) + +#define VALGRIND_ADD_TO_GLOBAL_TX_IGNORE(addr, len) do {\ + (void) (addr);\ + (void) (len);\ +} while (0) + +#define DAV_API_START() do {} while (0) + +#define DAV_API_END() do {} while (0) + +#endif /* VG_TXINFO_ENABLED */ + +#if VG_MEMCHECK_ENABLED + +extern unsigned _On_memcheck; +#define On_memcheck __builtin_expect(_On_memcheck, 0) + +#include "valgrind/memcheck.h" + +#define VALGRIND_DO_DISABLE_ERROR_REPORTING do {\ + if (On_valgrind)\ + VALGRIND_DISABLE_ERROR_REPORTING;\ +} while (0) + +#define VALGRIND_DO_ENABLE_ERROR_REPORTING do {\ + if (On_valgrind)\ + VALGRIND_ENABLE_ERROR_REPORTING;\ +} while (0) + +#define VALGRIND_DO_CREATE_MEMPOOL(heap, rzB, is_zeroed) do {\ + if (On_memcheck)\ + VALGRIND_CREATE_MEMPOOL(heap, rzB, is_zeroed);\ +} while (0) + +#define VALGRIND_DO_DESTROY_MEMPOOL(heap) do {\ + if (On_memcheck)\ + VALGRIND_DESTROY_MEMPOOL(heap);\ +} while (0) + +#define VALGRIND_DO_MEMPOOL_ALLOC(heap, addr, size) do {\ + if (On_memcheck)\ + VALGRIND_MEMPOOL_ALLOC(heap, addr, size);\ +} while (0) + +#define VALGRIND_DO_MEMPOOL_FREE(heap, addr) do {\ + if (On_memcheck)\ + VALGRIND_MEMPOOL_FREE(heap, addr);\ +} while (0) + +#define VALGRIND_DO_MAKE_MEM_DEFINED(addr, len) do {\ + if (On_memcheck)\ + VALGRIND_MAKE_MEM_DEFINED(addr, len);\ +} while (0) + +#define VALGRIND_DO_MAKE_MEM_UNDEFINED(addr, len) do {\ + if 
(On_memcheck)\ + VALGRIND_MAKE_MEM_UNDEFINED(addr, len);\ +} while (0) + +#define VALGRIND_DO_MAKE_MEM_NOACCESS(addr, len) do {\ + if (On_memcheck)\ + VALGRIND_MAKE_MEM_NOACCESS(addr, len);\ +} while (0) + +#define VALGRIND_DO_CHECK_MEM_IS_ADDRESSABLE(addr, len) do {\ + if (On_memcheck)\ + VALGRIND_CHECK_MEM_IS_ADDRESSABLE(addr, len);\ +} while (0) + +#else /* VG_MEMCHECK_ENABLED */ + +#define On_memcheck (0) + +#define VALGRIND_DO_DISABLE_ERROR_REPORTING do {} while (0) + +#define VALGRIND_DO_ENABLE_ERROR_REPORTING do {} while (0) + +#define VALGRIND_DO_CREATE_MEMPOOL(heap, rzB, is_zeroed)\ + do { (void) (heap); (void) (rzB); (void) (is_zeroed); } while (0) + +#define VALGRIND_DO_DESTROY_MEMPOOL(heap) { (void) (heap); } + +#define VALGRIND_DO_MEMPOOL_ALLOC(heap, addr, size)\ + do { (void) (heap); (void) (addr); (void) (size); } while (0) + +#define VALGRIND_DO_MEMPOOL_FREE(heap, addr)\ + do { (void) (heap); (void) (addr); } while (0) + +#define VALGRIND_DO_MAKE_MEM_DEFINED(addr, len)\ + do { (void) (addr); (void) (len); } while (0) + +#define VALGRIND_DO_MAKE_MEM_UNDEFINED(addr, len)\ + do { (void) (addr); (void) (len); } while (0) + +#define VALGRIND_DO_MAKE_MEM_NOACCESS(addr, len)\ + do { (void) (addr); (void) (len); } while (0) + +#define VALGRIND_DO_CHECK_MEM_IS_ADDRESSABLE(addr, len)\ + do { (void) (addr); (void) (len); } while (0) + +#endif /* VG_MEMCHECK_ENABLED */ + +#endif /* __DAOS_COMMON_VALGRIND_INTERNAL_H */ diff --git a/src/common/dav_v2/vec.h b/src/common/dav_v2/vec.h new file mode 100644 index 00000000000..5d527cb9746 --- /dev/null +++ b/src/common/dav_v2/vec.h @@ -0,0 +1,145 @@ +/* SPDX-License-Identifier: BSD-3-Clause */ +/* Copyright 2017-2023, Intel Corporation */ + +/* + * vec.h -- vector interface + */ + +#ifndef __DAOS_COMMON_VEC_H +#define __DAOS_COMMON_VEC_H 1 + +#include +#include "valgrind_internal.h" +#include "util.h" +#include "out.h" + +#define VEC_INIT_SIZE (64) + +#define VEC(name, type)\ +struct name {\ + type *buffer;\ + size_t size;\ + size_t capacity;\ +} + +#define VEC_INITIALIZER {NULL, 0, 0} + +#define VEC_INIT(vec) do {\ + (vec)->buffer = NULL;\ + (vec)->size = 0;\ + (vec)->capacity = 0;\ +} while (0) + +#define VEC_MOVE(vecl, vecr) do {\ + D_FREE((vecl)->buffer);\ + (vecl)->buffer = (vecr)->buffer;\ + (vecl)->size = (vecr)->size;\ + (vecl)->capacity = (vecr)->capacity;\ + (vecr)->buffer = NULL;\ + (vecr)->size = 0;\ + (vecr)->capacity = 0;\ +} while (0) + +#define VEC_REINIT(vec) do {\ + VALGRIND_ANNOTATE_NEW_MEMORY((vec), sizeof(*vec));\ + VALGRIND_ANNOTATE_NEW_MEMORY((vec)->buffer,\ + (sizeof(*(vec)->buffer) * ((vec)->capacity)));\ + (vec)->size = 0;\ +} while (0) + +static inline int +vec_reserve(void *vec, size_t ncapacity, size_t s) +{ + void *tbuf; + size_t ncap = ncapacity == 0 ? 
VEC_INIT_SIZE : ncapacity; + + VEC(vvec, void) *vecp = (struct vvec *)vec; + + D_REALLOC_NZ(tbuf, vecp->buffer, s * ncap); + if (tbuf == NULL) { + D_CRIT("Realloc!\n"); + return -1; + } + vecp->buffer = tbuf; + vecp->capacity = ncap; + return 0; +} + +#define VEC_RESERVE(vec, ncapacity)\ +(((vec)->size == 0 || (ncapacity) > (vec)->size) ?\ + vec_reserve((void *)vec, ncapacity, sizeof(*(vec)->buffer)) :\ + 0) + +#define VEC_POP_BACK(vec) ((vec)->size -= 1) + +#define VEC_FRONT(vec) ((vec)->buffer[0]) + +#define VEC_BACK(vec) ((vec)->buffer[(vec)->size - 1]) + +#define VEC_ERASE_BY_POS(vec, pos) do {\ + if ((pos) != ((vec)->size - 1))\ + (vec)->buffer[(pos)] = VEC_BACK(vec);\ + VEC_POP_BACK(vec);\ +} while (0) + +#define VEC_ERASE_BY_PTR(vec, element) do {\ + if ((element) != &VEC_BACK(vec))\ + *(element) = VEC_BACK(vec);\ + VEC_POP_BACK(vec);\ +} while (0) + +#define VEC_INSERT(vec, element)\ +((vec)->buffer[(vec)->size - 1] = (element), 0) + +#define VEC_INC_SIZE(vec)\ +(((vec)->size++), 0) + +#define VEC_INC_BACK(vec)\ +((vec)->capacity == (vec)->size ? \ + (VEC_RESERVE((vec), ((vec)->capacity * 2)) == 0 ? \ + VEC_INC_SIZE(vec) : -1) : \ + VEC_INC_SIZE(vec)) + +#define VEC_PUSH_BACK(vec, element)\ +(VEC_INC_BACK(vec) == 0 ? VEC_INSERT(vec, element) : -1) + +#define VEC_FOREACH(el, vec)\ +for (size_t _vec_i = 0;\ + _vec_i < (vec)->size && (((el) = (vec)->buffer[_vec_i]), 1);\ + ++_vec_i) + +#define VEC_FOREACH_REVERSE(el, vec)\ +for (size_t _vec_i = ((vec)->size);\ + _vec_i != 0 && (((el) = (vec)->buffer[_vec_i - 1]), 1);\ + --_vec_i) + +#define VEC_FOREACH_BY_POS(elpos, vec)\ +for ((elpos) = 0; (elpos) < (vec)->size; ++(elpos)) + +#define VEC_FOREACH_BY_PTR(el, vec)\ +for (size_t _vec_i = 0;\ + _vec_i < (vec)->size && (((el) = &(vec)->buffer[_vec_i]), 1);\ + ++_vec_i) + +#define VEC_SIZE(vec)\ +((vec)->size) + +#define VEC_CAPACITY(vec)\ +((vec)->capacity) + +#define VEC_ARR(vec)\ +((vec)->buffer) + +#define VEC_GET(vec, id)\ +(&(vec)->buffer[id]) + +#define VEC_CLEAR(vec) ((vec)->size = 0) + +#define VEC_DELETE(vec) do {\ + D_FREE((vec)->buffer);\ + (vec)->buffer = NULL;\ + (vec)->size = 0;\ + (vec)->capacity = 0;\ +} while (0) + +#endif /* __DAOS_COMMON_VEC_H */ diff --git a/src/common/dav_v2/vecq.h b/src/common/dav_v2/vecq.h new file mode 100644 index 00000000000..a9618862b39 --- /dev/null +++ b/src/common/dav_v2/vecq.h @@ -0,0 +1,121 @@ +/* SPDX-License-Identifier: BSD-3-Clause */ +/* Copyright 2018-2023, Intel Corporation */ + +/* + * vecq.h -- vector queue (FIFO) interface + */ + +#ifndef __DAOS_COMMON_VECQ_H +#define __DAOS_COMMON_VECQ_H 1 + +#include + +#include "util.h" +#include "out.h" + +#define VECQ_INIT_SIZE (64) + +#define VECQ(name, type)\ +struct name {\ + type *buffer;\ + size_t capacity;\ + size_t front;\ + size_t back;\ +} + +#define VECQ_INIT(vec) do {\ + (vec)->buffer = NULL;\ + (vec)->capacity = 0;\ + (vec)->front = 0;\ + (vec)->back = 0;\ +} while (0) + +#define VECQ_REINIT(vec) do {\ + VALGRIND_ANNOTATE_NEW_MEMORY((vec), sizeof(*vec));\ + VALGRIND_ANNOTATE_NEW_MEMORY((vec)->buffer,\ + (sizeof(*(vec)->buffer) * ((vec)->capacity)));\ + (vec)->front = 0;\ + (vec)->back = 0;\ +} while (0) + +#define VECQ_FRONT_POS(vec)\ +((vec)->front & ((vec)->capacity - 1)) + +#define VECQ_BACK_POS(vec)\ +((vec)->back & ((vec)->capacity - 1)) + +#define VECQ_FRONT(vec)\ +((vec)->buffer[VECQ_FRONT_POS(vec)]) + +#define VECQ_BACK(vec) ((vec)->buffer[VECQ_BACK_POS(vec)]) + +#define VECQ_DEQUEUE(vec)\ +((vec)->buffer[(((vec)->front++) & ((vec)->capacity - 1))]) + +#define 
VECQ_SIZE(vec)\ +((vec)->back - (vec)->front) + +static inline int +realloc_set(void **buf, size_t s) +{ + void *tbuf; + + D_REALLOC_NZ(tbuf, *buf, s); + if (tbuf == NULL) { + D_CRIT("Realloc!\n"); + return -1; + } + *buf = tbuf; + return 0; +} + +#define VECQ_NCAPACITY(vec)\ +((vec)->capacity == 0 ? VECQ_INIT_SIZE : (vec)->capacity * 2) +#define VECQ_GROW(vec)\ +(realloc_set((void **)&(vec)->buffer,\ + VECQ_NCAPACITY(vec) * sizeof(*(vec)->buffer)) ? -1 :\ + (memcpy((vec)->buffer + (vec)->capacity, (vec)->buffer,\ + VECQ_FRONT_POS(vec) * sizeof(*(vec)->buffer)),\ + (vec)->front = VECQ_FRONT_POS(vec),\ + (vec)->back = (vec)->front + (vec)->capacity,\ + (vec)->capacity = VECQ_NCAPACITY(vec),\ + 0\ +)) + +#define VECQ_INSERT(vec, element)\ +(VECQ_BACK(vec) = element, (vec)->back += 1, 0) + +#define VECQ_ENQUEUE(vec, element)\ +((vec)->capacity == VECQ_SIZE(vec) ?\ + (VECQ_GROW(vec) == 0 ? VECQ_INSERT(vec, element) : -1) :\ +VECQ_INSERT(vec, element)) + +#define VECQ_CAPACITY(vec)\ +((vec)->capacity) + +#define VECQ_FOREACH(el, vec)\ +for (size_t _vec_i = 0;\ + _vec_i < VECQ_SIZE(vec) &&\ + (((el) = (vec)->buffer[_vec_i & ((vec)->capacity - 1)]), 1);\ + ++_vec_i) + +#define VECQ_FOREACH_REVERSE(el, vec)\ +for (size_t _vec_i = VECQ_SIZE(vec);\ + _vec_i > 0 &&\ + (((el) = (vec)->buffer[(_vec_i - 1) & ((vec)->capacity - 1)]), 1);\ + --_vec_i) + +#define VECQ_CLEAR(vec) do {\ + (vec)->front = 0;\ + (vec)->back = 0;\ +} while (0) + +#define VECQ_DELETE(vec) do {\ + D_FREE((vec)->buffer);\ + (vec)->buffer = NULL;\ + (vec)->capacity = 0;\ + (vec)->front = 0;\ + (vec)->back = 0;\ +} while (0) + +#endif /* __DAOS_COMMON_VECQ_H */ diff --git a/src/common/dav_v2/wal_tx.c b/src/common/dav_v2/wal_tx.c new file mode 100644 index 00000000000..9cd5d55d4ac --- /dev/null +++ b/src/common/dav_v2/wal_tx.c @@ -0,0 +1,546 @@ +/** + * (C) Copyright 2022-2024 Intel Corporation. 
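VECQ above keeps two monotonically increasing counters and derives slot positions by masking them with capacity - 1, which is why the capacity must stay a power of two and why VECQ_SIZE() is simply back - front. A standalone illustration of that indexing scheme:

#include <assert.h>
#include <stddef.h>

#define CAP 8u                          /* must stay a power of two */

int main(void)
{
        int buf[CAP];
        size_t front = 0, back = 0;

        /* enqueue 5, dequeue 3, enqueue 5 more: counters only grow */
        for (int i = 0; i < 5; i++)
                buf[back++ & (CAP - 1)] = i;
        for (int i = 0; i < 3; i++)
                assert(buf[front++ & (CAP - 1)] == i);
        for (int i = 5; i < 10; i++)
                buf[back++ & (CAP - 1)] = i;

        assert(back - front == 7);              /* VECQ_SIZE() */
        assert(buf[front & (CAP - 1)] == 3);    /* VECQ_FRONT() */
        return 0;
}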
+ * + * SPDX-License-Identifier: BSD-2-Clause-Patent + */ + +#include +#include "dav_internal.h" +#include "wal_tx.h" +#include "util.h" +#include "heap.h" + +struct umem_wal_tx_ops dav_wal_tx_ops; + +static inline uint64_t +mdblob_addr2offset(struct dav_obj *hdl, void *addr) +{ + return umem_cache_ptr2off(hdl->do_store, addr); +} + +#define AD_TX_ACT_ADD(tx, wa) \ + do { \ + d_list_add_tail(&(wa)->wa_link, &(tx)->wt_redo); \ + (tx)->wt_redo_cnt++; \ + if ((wa)->wa_act.ac_opc == UMEM_ACT_COPY || \ + (wa)->wa_act.ac_opc == UMEM_ACT_COPY_PTR) { \ + (tx)->wt_redo_payload_len += (wa)->wa_act.ac_copy.size; \ + } else if ((wa)->wa_act.ac_opc == UMEM_ACT_MOVE) { \ + /* ac_move src addr is playload after wal_trans_entry */\ + (tx)->wt_redo_payload_len += sizeof(uint64_t); \ + } \ + } while (0) + +/** allocate wal_action, if success the wa_link and wa_act.ac_opc will be init-ed */ +#define D_ALLOC_ACT(wa, opc, size) \ + do { \ + if (opc == UMEM_ACT_COPY) \ + D_ALLOC(wa, offsetof(struct wal_action, \ + wa_act.ac_copy.payload[size])); \ + else \ + D_ALLOC_PTR(wa); \ + if (likely(wa != NULL)) { \ + D_INIT_LIST_HEAD(&wa->wa_link); \ + wa->wa_act.ac_opc = opc; \ + } \ + } while (0) + +static inline void +act_copy_payload(struct umem_action *act, void *addr, daos_size_t size) +{ + char *dst = (char *)&act->ac_copy.payload[0]; + + if (size > 0) + memcpy(dst, addr, size); +} + +static void +dav_wal_tx_init(struct umem_wal_tx *utx, struct dav_obj *dav_hdl) +{ + struct dav_tx *tx = utx2wtx(utx); + + D_INIT_LIST_HEAD(&tx->wt_redo); + tx->wt_redo_cnt = 0; + tx->wt_redo_payload_len = 0; + tx->wt_redo_act_pos = NULL; + tx->wt_dav_hdl = dav_hdl; +} + +struct umem_wal_tx * +dav_umem_wtx_new(struct dav_obj *dav_hdl) +{ + struct umem_wal_tx *umem_wtx; + + D_ASSERT(dav_hdl->do_utx == NULL); + D_ALLOC_PTR(umem_wtx); + if (umem_wtx == NULL) + return NULL; + + umem_wtx->utx_ops = &dav_wal_tx_ops; + umem_wtx->utx_id = ULLONG_MAX; + dav_wal_tx_init(umem_wtx, dav_hdl); + dav_hdl->do_utx = umem_wtx; + return umem_wtx; +} + +void +dav_umem_wtx_cleanup(struct umem_wal_tx *utx) +{ + struct dav_tx *tx = utx2wtx(utx); + d_list_t *list = &tx->wt_redo; + struct wal_action *wa, *next; + + d_list_for_each_entry_safe(wa, next, list, wa_link) { + d_list_del(&wa->wa_link); + D_FREE(wa); + } +} + +static int +dav_wal_tx_submit(struct dav_obj *dav_hdl, struct umem_wal_tx *utx, void *data) +{ + struct wal_action *wa, *next; + struct umem_action *ua; + struct umem_store *store = dav_hdl->do_store; + struct dav_tx *tx = utx2wtx(utx); + d_list_t *redo_list = &tx->wt_redo; + + char *pathname = basename(dav_hdl->do_path); + uint64_t id = utx->utx_id; + int rc; + + if (wal_tx_act_nr(utx) == 0) + return 0; + + d_list_for_each_entry_safe(wa, next, redo_list, wa_link) { + ua = &wa->wa_act; + switch (ua->ac_opc) { + case UMEM_ACT_COPY: + D_DEBUG(DB_TRACE, + "%s: ACT_COPY txid=%lu, (p,o)=%lu,%lu size=%lu\n", + pathname, id, + ua->ac_copy.addr / PAGESIZE, ua->ac_copy.addr % PAGESIZE, + ua->ac_copy.size); + break; + case UMEM_ACT_COPY_PTR: + D_DEBUG(DB_TRACE, + "%s: ACT_COPY_PTR txid=%lu, (p,o)=%lu,%lu size=%lu ptr=0x%lx\n", + pathname, id, + ua->ac_copy_ptr.addr / PAGESIZE, ua->ac_copy_ptr.addr % PAGESIZE, + ua->ac_copy_ptr.size, ua->ac_copy_ptr.ptr); + break; + case UMEM_ACT_ASSIGN: + D_DEBUG(DB_TRACE, + "%s: ACT_ASSIGN txid=%lu, (p,o)=%lu,%lu size=%u\n", + pathname, id, + ua->ac_assign.addr / PAGESIZE, ua->ac_assign.addr % PAGESIZE, + ua->ac_assign.size); + break; + case UMEM_ACT_SET: + D_DEBUG(DB_TRACE, + "%s: ACT_SET txid=%lu, (p,o)=%lu,%lu 
size=%u val=%u\n", + pathname, id, + ua->ac_set.addr / PAGESIZE, ua->ac_set.addr % PAGESIZE, + ua->ac_set.size, ua->ac_set.val); + break; + case UMEM_ACT_SET_BITS: + D_DEBUG(DB_TRACE, + "%s: ACT_SET_BITS txid=%lu, (p,o)=%lu,%lu bit_pos=%u num_bits=%u\n", + pathname, id, + ua->ac_op_bits.addr / PAGESIZE, ua->ac_op_bits.addr % PAGESIZE, + ua->ac_op_bits.pos, ua->ac_op_bits.num); + break; + case UMEM_ACT_CLR_BITS: + D_DEBUG(DB_TRACE, + "%s: ACT_CLR_BITS txid=%lu, (p,o)=%lu,%lu bit_pos=%u num_bits=%u\n", + pathname, id, + ua->ac_op_bits.addr / PAGESIZE, ua->ac_op_bits.addr % PAGESIZE, + ua->ac_op_bits.pos, ua->ac_op_bits.num); + break; + default: + D_ERROR("%s: unknown opc %d\n", dav_hdl->do_path, ua->ac_opc); + ASSERT(0); + } + } + DAV_DBG("tx_id:%lu submitting to WAL: %u bytes in %u actions", + id, tx->wt_redo_payload_len, tx->wt_redo_cnt); + rc = store->stor_ops->so_wal_submit(store, utx, data); + return rc; +} + +/** complete the wl transaction */ +int +dav_wal_tx_commit(struct dav_obj *hdl, struct umem_wal_tx *utx, void *data) +{ + int rc; + + /* write actions in redo list to WAL */ + rc = dav_wal_tx_submit(hdl, utx, data); + + /* FAIL the engine if commit fails */ + D_ASSERT(rc == 0); + dav_umem_wtx_cleanup(utx); + return 0; +} + +int +dav_wal_tx_reserve(struct dav_obj *hdl, uint64_t *id) +{ + int rc; + + rc = hdl->do_store->stor_ops->so_wal_reserv(hdl->do_store, id); + /* REVISIT: + * Remove this assert once callers of dav_free() and dav_memcpy_persist() + * are modified to handle failures. + */ + D_ASSERT(rc == 0); + return rc; +} + +/** + * snapshot data from src to either wal redo log. + */ +int +dav_wal_tx_snap(void *hdl, void *addr, daos_size_t size, void *src, uint32_t flags) +{ + struct dav_obj *dav_hdl = (struct dav_obj *)hdl; + struct dav_tx *tx = utx2wtx(dav_hdl->do_utx); + struct wal_action *wa_redo; + int rc; + + D_ASSERT(hdl != NULL); + + if (addr == NULL || size == 0 || size > UMEM_ACT_PAYLOAD_MAX_LEN) + return -DER_INVAL; + + rc = umem_cache_touch(dav_hdl->do_store, dav_hdl->do_utx->utx_id, + mdblob_addr2offset(tx->wt_dav_hdl, addr), size); + if (rc != 0) + return rc; + + if (flags & DAV_XADD_WAL_CPTR) { + D_ALLOC_ACT(wa_redo, UMEM_ACT_COPY_PTR, size); + if (wa_redo == NULL) + return -DER_NOMEM; + wa_redo->wa_act.ac_copy_ptr.ptr = (uintptr_t)src; + wa_redo->wa_act.ac_copy_ptr.addr = mdblob_addr2offset(tx->wt_dav_hdl, addr); + wa_redo->wa_act.ac_copy_ptr.size = size; + } else { + D_ALLOC_ACT(wa_redo, UMEM_ACT_COPY, size); + if (wa_redo == NULL) + return -DER_NOMEM; + act_copy_payload(&wa_redo->wa_act, src, size); + wa_redo->wa_act.ac_copy.addr = mdblob_addr2offset(tx->wt_dav_hdl, addr); + wa_redo->wa_act.ac_copy.size = size; + } + AD_TX_ACT_ADD(tx, wa_redo); + return 0; +} + +/** assign uint64_t value to @addr */ +int +dav_wal_tx_assign(void *hdl, void *addr, uint64_t val) +{ + struct dav_obj *dav_hdl = (struct dav_obj *)hdl; + struct dav_tx *tx = utx2wtx(dav_hdl->do_utx); + struct wal_action *wa_redo; + int rc; + + D_ASSERT(hdl != NULL); + if (addr == NULL) + return -DER_INVAL; + + rc = umem_cache_touch(dav_hdl->do_store, dav_hdl->do_utx->utx_id, + mdblob_addr2offset(tx->wt_dav_hdl, addr), sizeof(uint64_t)); + if (rc != 0) + return rc; + + D_ALLOC_ACT(wa_redo, UMEM_ACT_ASSIGN, sizeof(uint64_t)); + if (wa_redo == NULL) + return -DER_NOMEM; + wa_redo->wa_act.ac_assign.addr = mdblob_addr2offset(tx->wt_dav_hdl, addr); + wa_redo->wa_act.ac_assign.size = 8; + wa_redo->wa_act.ac_assign.val = val; + AD_TX_ACT_ADD(tx, wa_redo); + + return 0; +} + +/** Set bits starting from pos */ 
+int +dav_wal_tx_set_bits(void *hdl, void *addr, uint32_t pos, uint16_t num_bits) +{ + struct dav_obj *dav_hdl = (struct dav_obj *)hdl; + struct dav_tx *tx = utx2wtx(dav_hdl->do_utx); + struct wal_action *wa_redo; + int rc; + + D_ASSERT(hdl != NULL); + if (addr == NULL) + return -DER_INVAL; + + rc = umem_cache_touch(dav_hdl->do_store, dav_hdl->do_utx->utx_id, + mdblob_addr2offset(tx->wt_dav_hdl, addr), sizeof(uint64_t)); + if (rc != 0) + return rc; + + D_ALLOC_ACT(wa_redo, UMEM_ACT_SET_BITS, sizeof(uint64_t)); + if (wa_redo == NULL) + return -DER_NOMEM; + wa_redo->wa_act.ac_op_bits.addr = mdblob_addr2offset(tx->wt_dav_hdl, addr); + wa_redo->wa_act.ac_op_bits.num = num_bits; + wa_redo->wa_act.ac_op_bits.pos = pos; + AD_TX_ACT_ADD(tx, wa_redo); + + return 0; +} + +/** Clr bits starting from pos */ +int +dav_wal_tx_clr_bits(void *hdl, void *addr, uint32_t pos, uint16_t num_bits) +{ + struct dav_obj *dav_hdl = (struct dav_obj *)hdl; + struct dav_tx *tx = utx2wtx(dav_hdl->do_utx); + struct wal_action *wa_redo; + int rc; + + D_ASSERT(hdl != NULL); + if (addr == NULL) + return -DER_INVAL; + + rc = umem_cache_touch(dav_hdl->do_store, dav_hdl->do_utx->utx_id, + mdblob_addr2offset(tx->wt_dav_hdl, addr), sizeof(uint64_t)); + if (rc != 0) + return rc; + + D_ALLOC_ACT(wa_redo, UMEM_ACT_CLR_BITS, sizeof(uint64_t)); + if (wa_redo == NULL) + return -DER_NOMEM; + wa_redo->wa_act.ac_op_bits.addr = mdblob_addr2offset(tx->wt_dav_hdl, addr); + wa_redo->wa_act.ac_op_bits.num = num_bits; + wa_redo->wa_act.ac_op_bits.pos = pos; + AD_TX_ACT_ADD(tx, wa_redo); + + return 0; +} + +/** + * memset a storage region, save the operation for redo + */ +int +dav_wal_tx_set(void *hdl, void *addr, char c, daos_size_t size) +{ + struct dav_obj *dav_hdl = (struct dav_obj *)hdl; + struct dav_tx *tx = utx2wtx(dav_hdl->do_utx); + struct wal_action *wa_redo; + int rc; + + D_ASSERT(hdl != NULL); + + if (addr == NULL || size == 0 || size > UMEM_ACT_PAYLOAD_MAX_LEN) + return -DER_INVAL; + + rc = umem_cache_touch(dav_hdl->do_store, dav_hdl->do_utx->utx_id, + mdblob_addr2offset(tx->wt_dav_hdl, addr), size); + if (rc != 0) + return rc; + + D_ALLOC_ACT(wa_redo, UMEM_ACT_SET, size); + if (wa_redo == NULL) + return -DER_NOMEM; + + wa_redo->wa_act.ac_set.addr = mdblob_addr2offset(tx->wt_dav_hdl, addr); + wa_redo->wa_act.ac_set.size = size; + wa_redo->wa_act.ac_set.val = c; + AD_TX_ACT_ADD(tx, wa_redo); + return 0; +} + +/** + * query action number in redo list. + */ +uint32_t +wal_tx_act_nr(struct umem_wal_tx *utx) +{ + struct dav_tx *tx = utx2wtx(utx); + + return tx->wt_redo_cnt; +} + +/** + * query payload length in redo list. + */ +uint32_t +wal_tx_payload_len(struct umem_wal_tx *utx) +{ + struct dav_tx *tx = utx2wtx(utx); + + return tx->wt_redo_payload_len; +} + +/** + * get first action pointer, NULL for list empty. + */ +struct umem_action * +wal_tx_act_first(struct umem_wal_tx *utx) +{ + struct dav_tx *tx = utx2wtx(utx); + + if (d_list_empty(&tx->wt_redo)) { + tx->wt_redo_act_pos = NULL; + return NULL; + } + + tx->wt_redo_act_pos = dav_action_get_next(tx->wt_redo); + return &tx->wt_redo_act_pos->wa_act; +} + +/** + * get next action pointer, NULL for done or list empty. 
+ */ +struct umem_action * +wal_tx_act_next(struct umem_wal_tx *utx) +{ + struct dav_tx *tx = utx2wtx(utx); + + if (tx->wt_redo_act_pos == NULL) { + if (d_list_empty(&tx->wt_redo)) + return NULL; + tx->wt_redo_act_pos = dav_action_get_next(tx->wt_redo); + return &tx->wt_redo_act_pos->wa_act; + } + + D_ASSERT(!d_list_empty(&tx->wt_redo)); + tx->wt_redo_act_pos = dav_action_get_next(tx->wt_redo_act_pos->wa_link); + if (&tx->wt_redo_act_pos->wa_link == &tx->wt_redo) { + tx->wt_redo_act_pos = NULL; + return NULL; + } + return &tx->wt_redo_act_pos->wa_act; +} + +struct umem_wal_tx_ops dav_wal_tx_ops = { + .wtx_act_nr = wal_tx_act_nr, + .wtx_payload_sz = wal_tx_payload_len, + .wtx_act_first = wal_tx_act_first, + .wtx_act_next = wal_tx_act_next, +}; + +static inline void * +dav_wal_replay_heap_off2ptr(dav_obj_t *dav_hdl, uint64_t off) +{ + uint32_t z_id = OFFSET_TO_ZID(off); + struct umem_cache_range rg = {0}; + int rc; + struct umem_store *store = dav_hdl->do_store; + + rg.cr_off = GET_ZONE_OFFSET(z_id); + rg.cr_size = ((store->stor_size - rg.cr_off) > ZONE_MAX_SIZE) + ? ZONE_MAX_SIZE + : (store->stor_size - rg.cr_off); + rc = umem_cache_load(store, &rg, 1, 0); + if (rc) { + D_ERROR("Failed to load pages to umem cache"); + errno = daos_der2errno(rc); + return NULL; + } + return umem_cache_off2ptr(store, off); +} + +int +dav_wal_replay_cb(uint64_t tx_id, struct umem_action *act, void *arg) +{ + void *src, *dst; + ptrdiff_t off; + uint64_t *p, mask; + daos_size_t size; + int pos, num, val; + int rc = 0; + dav_obj_t *dav_hdl = arg; + struct umem_store *store = dav_hdl->do_store; + + umem_cache_commit(store, tx_id); + switch (act->ac_opc) { + case UMEM_ACT_COPY: + D_DEBUG(DB_TRACE, + "ACT_COPY txid=%lu, (p,o)=%lu,%lu size=%lu\n", + tx_id, + act->ac_copy.addr / PAGESIZE, act->ac_copy.addr % PAGESIZE, + act->ac_copy.size); + off = act->ac_copy.addr; + src = (void *)&act->ac_copy.payload; + size = act->ac_copy.size; + dst = dav_wal_replay_heap_off2ptr(dav_hdl, off); + if (dst == NULL) { + rc = daos_errno2der(errno); + goto out; + } + memcpy(dst, src, size); + break; + case UMEM_ACT_ASSIGN: + D_DEBUG(DB_TRACE, + "ACT_ASSIGN txid=%lu, (p,o)=%lu,%lu size=%u\n", + tx_id, + act->ac_assign.addr / PAGESIZE, act->ac_assign.addr % PAGESIZE, + act->ac_assign.size); + off = act->ac_assign.addr; + dst = dav_wal_replay_heap_off2ptr(dav_hdl, off); + if (dst == NULL) { + rc = daos_errno2der(errno); + goto out; + } + size = act->ac_assign.size; + ASSERT_rt(size == 1 || size == 2 || size == 4); + src = &act->ac_assign.val; + memcpy(dst, src, size); + break; + case UMEM_ACT_SET: + D_DEBUG(DB_TRACE, + "ACT_SET txid=%lu, (p,o)=%lu,%lu size=%u val=%u\n", + tx_id, + act->ac_set.addr / PAGESIZE, act->ac_set.addr % PAGESIZE, + act->ac_set.size, act->ac_set.val); + off = act->ac_set.addr; + dst = dav_wal_replay_heap_off2ptr(dav_hdl, off); + if (dst == NULL) { + rc = daos_errno2der(errno); + goto out; + } + size = act->ac_set.size; + val = act->ac_set.val; + memset(dst, val, size); + break; + case UMEM_ACT_SET_BITS: + case UMEM_ACT_CLR_BITS: + D_DEBUG(DB_TRACE, + "ACT_CLR_BITS txid=%lu, (p,o)=%lu,%lu bit_pos=%u num_bits=%u\n", + tx_id, + act->ac_op_bits.addr / PAGESIZE, act->ac_op_bits.addr % PAGESIZE, + act->ac_op_bits.pos, act->ac_op_bits.num); + off = act->ac_op_bits.addr; + size = sizeof(uint64_t); + p = dav_wal_replay_heap_off2ptr(dav_hdl, off); + if (p == NULL) { + rc = daos_errno2der(errno); + goto out; + } + num = act->ac_op_bits.num; + pos = act->ac_op_bits.pos; + ASSERT_rt((pos >= 0) && (pos + num) <= 64); + mask = 
((1ULL << num) - 1) << pos; + if (act->ac_opc == UMEM_ACT_SET_BITS) + *p |= mask; + else + *p &= ~mask; + break; + default: + D_ASSERT(0); + break; + } + + if (rc == 0) + rc = umem_cache_touch(store, tx_id, off, size); + +out: + return rc; +} diff --git a/src/common/dav_v2/wal_tx.h b/src/common/dav_v2/wal_tx.h new file mode 100644 index 00000000000..1a7e06c2fed --- /dev/null +++ b/src/common/dav_v2/wal_tx.h @@ -0,0 +1,44 @@ +/** + * (C) Copyright 2021-2024 Intel Corporation. + * + * SPDX-License-Identifier: BSD-2-Clause-Patent + */ + +#ifndef __DAOS_COMMON_DAV_WAL_TX_ +#define __DAOS_COMMON_DAV_WAL_TX_ + +#include +#include +#include + +struct dav_obj; + +struct wal_action { + d_list_t wa_link; + struct umem_action wa_act; +}; + +struct dav_tx { + struct dav_obj *wt_dav_hdl; + d_list_t wt_redo; + uint32_t wt_redo_cnt; + uint32_t wt_redo_payload_len; + struct wal_action *wt_redo_act_pos; +}; +D_CASSERT(sizeof(struct dav_tx) <= UTX_PRIV_SIZE, + "Size of struct dav_tx is too big!"); + +#define dav_action_get_next(it) d_list_entry(it.next, struct wal_action, wa_link) + +struct umem_wal_tx *dav_umem_wtx_new(struct dav_obj *dav_hdl); +void dav_umem_wtx_cleanup(struct umem_wal_tx *utx); +int dav_wal_tx_reserve(struct dav_obj *hdl, uint64_t *id); +int dav_wal_tx_commit(struct dav_obj *hdl, struct umem_wal_tx *utx, void *data); +int dav_wal_tx_snap(void *hdl, void *addr, daos_size_t size, void *src, uint32_t flags); +int dav_wal_tx_assign(void *hdl, void *addr, uint64_t val); +int dav_wal_tx_clr_bits(void *hdl, void *addr, uint32_t pos, uint16_t num_bits); +int dav_wal_tx_set_bits(void *hdl, void *addr, uint32_t pos, uint16_t num_bits); +int dav_wal_tx_set(void *hdl, void *addr, char c, daos_size_t size); +int dav_wal_replay_cb(uint64_t tx_id, struct umem_action *act, void *arg); + +#endif /*__DAOS_COMMON_DAV_WAL_TX_*/ diff --git a/src/common/mem.c b/src/common/mem.c index 0ee9bcb07b2..beccab45266 100644 --- a/src/common/mem.c +++ b/src/common/mem.c @@ -17,7 +17,9 @@ #ifdef DAOS_PMEM_BUILD #include #include +#define DAV_V2_BUILD #include "dav/dav.h" +#include "dav_v2/dav_v2.h" #endif #define UMEM_TX_DATA_MAGIC (0xc01df00d) @@ -34,7 +36,8 @@ struct umem_tx_stage_item { #ifdef DAOS_PMEM_BUILD -static int daos_md_backend = DAOS_MD_PMEM; +static int daos_md_backend = DAOS_MD_PMEM; +static bool daos_disable_bmem_v2 = false; #define UMM_SLABS_CNT 16 /** Initializes global settings for the pmem objects. 
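
The UMEM_ACT_SET_BITS/UMEM_ACT_CLR_BITS branch of dav_wal_replay_cb() in wal_tx.c above reduces to a single 64-bit read-modify-write. A minimal standalone sketch of that mask arithmetic, assuming 0 < num < 64 and pos + num <= 64 (the replay path asserts the latter; the former is an added assumption to keep the shift well defined):

#include <stdint.h>

/*
 * Apply a SET_BITS/CLR_BITS style action to a single 64-bit word: build a
 * mask of `num` contiguous one-bits starting at `pos`, then set or clear
 * them, mirroring the mask arithmetic in dav_wal_replay_cb().
 */
static inline void
bits_apply(uint64_t *word, unsigned int pos, unsigned int num, int set)
{
	uint64_t mask = ((1ULL << num) - 1) << pos;	/* requires num < 64 */

	if (set)
		*word |= mask;	/* UMEM_ACT_SET_BITS */
	else
		*word &= ~mask;	/* UMEM_ACT_CLR_BITS */
}
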
@@ -49,6 +52,7 @@ umempobj_settings_init(bool md_on_ssd) int rc; enum pobj_arenas_assignment_type atype; unsigned int md_mode = DAOS_MD_BMEM; + unsigned int md_disable_bmem_v2 = 0; if (!md_on_ssd) { daos_md_backend = DAOS_MD_PMEM; @@ -70,22 +74,39 @@ umempobj_settings_init(bool md_on_ssd) case DAOS_MD_ADMEM: D_INFO("UMEM will use AD-hoc Memory as the metadata backend interface\n"); break; + case DAOS_MD_BMEM_V2: + D_INFO("UMEM will use Blob Backed Memory v2 as the metadata backend interface\n"); + break; default: D_ERROR("DAOS_MD_ON_SSD_MODE=%d envar invalid, use %d for BMEM or %d for ADMEM\n", md_mode, DAOS_MD_BMEM, DAOS_MD_ADMEM); return -DER_INVAL; }; + d_getenv_uint("DAOS_MD_DISABLE_BMEM_V2", &md_disable_bmem_v2); + if (md_disable_bmem_v2 && (md_mode != DAOS_MD_BMEM)) + D_INFO("Ignoring DAOS_MD_DISABLE_BMEM_V2 tunable"); + else + daos_disable_bmem_v2 = md_disable_bmem_v2; + daos_md_backend = md_mode; return 0; } -int umempobj_get_backend_type(void) +int +umempobj_get_backend_type(void) { return daos_md_backend; } -int umempobj_backend_type2class_id(int backend) +bool +umempobj_allow_md_bmem_v2() +{ + return !daos_disable_bmem_v2; +} + +int +umempobj_backend_type2class_id(int backend) { switch (backend) { case DAOS_MD_PMEM: @@ -94,6 +115,8 @@ int umempobj_backend_type2class_id(int backend) return UMEM_CLASS_BMEM; case DAOS_MD_ADMEM: return UMEM_CLASS_ADMEM; + case DAOS_MD_BMEM_V2: + return UMEM_CLASS_BMEM_V2; default: D_ASSERTF(0, "bad daos_md_backend %d\n", backend); @@ -101,6 +124,15 @@ int umempobj_backend_type2class_id(int backend) } } +size_t +umempobj_pgsz(int backend) +{ + if (backend == DAOS_MD_BMEM_V2) + return dav_obj_pgsz_v2(); + else + return (1UL << 12); +} + /** Define common slabs. We can refine this for 2.4 pools but that is for next patch */ static const int slab_map[] = { 0, /* 32 bytes */ @@ -161,6 +193,16 @@ set_slab_desc(struct umem_pool *ph_p, struct umem_slab_desc *slab) /* update with the new slab id */ slab->class_id = davslab.class_id; break; + case DAOS_MD_BMEM_V2: + davslab.unit_size = slab->unit_size; + davslab.alignment = 0; + davslab.units_per_block = 1000; + davslab.header_type = DAV_HEADER_NONE; + davslab.class_id = slab->class_id; + rc = dav_class_register_v2((dav_obj_t *)ph_p->up_priv, &davslab); + /* update with the new slab id */ + slab->class_id = davslab.class_id; + break; case DAOS_MD_ADMEM: /* NOOP for ADMEM now */ slab->class_id = class_id++; @@ -325,6 +367,15 @@ umempobj_create(const char *path, const char *layout_name, int flags, } umm_pool->up_priv = dav_hdl; break; + case DAOS_MD_BMEM_V2: + dav_hdl = dav_obj_create_v2(path, 0, poolsize, mode, &umm_pool->up_store); + if (!dav_hdl) { + D_ERROR("Failed to create pool %s, size="DF_U64": errno = %d\n", + path, poolsize, errno); + goto error; + } + umm_pool->up_priv = dav_hdl; + break; case DAOS_MD_ADMEM: rc = ad_blob_create(path, 0, store, &bh); if (rc) { @@ -408,6 +459,16 @@ umempobj_open(const char *path, const char *layout_name, int flags, struct umem_ goto error; } + umm_pool->up_priv = dav_hdl; + break; + case DAOS_MD_BMEM_V2: + dav_hdl = dav_obj_open_v2(path, 0, &umm_pool->up_store); + if (!dav_hdl) { + D_ERROR("Error in opening the pool %s: errno =%d\n", + path, errno); + goto error; + } + umm_pool->up_priv = dav_hdl; break; case DAOS_MD_ADMEM: @@ -452,6 +513,9 @@ umempobj_close(struct umem_pool *ph_p) case DAOS_MD_BMEM: dav_obj_close((dav_obj_t *)ph_p->up_priv); break; + case DAOS_MD_BMEM_V2: + dav_obj_close_v2((dav_obj_t *)ph_p->up_priv); + break; case DAOS_MD_ADMEM: bh.bh_blob = (struct 
ad_blob *)ph_p->up_priv; ad_blob_close(bh); @@ -491,6 +555,9 @@ umempobj_get_rootptr(struct umem_pool *ph_p, size_t size) case DAOS_MD_BMEM: off = dav_root((dav_obj_t *)ph_p->up_priv, size); return (char *)dav_get_base_ptr((dav_obj_t *)ph_p->up_priv) + off; + case DAOS_MD_BMEM_V2: + off = dav_root_v2((dav_obj_t *)ph_p->up_priv, size); + return (char *)umem_cache_off2ptr(&ph_p->up_store, off); case DAOS_MD_ADMEM: bh.bh_blob = (struct ad_blob *)ph_p->up_priv; return ad_root(bh, size); @@ -528,6 +595,11 @@ umempobj_get_heapusage(struct umem_pool *ph_p, daos_size_t *curr_allocated) if (rc == 0) *curr_allocated = st.curr_allocated; break; + case DAOS_MD_BMEM_V2: + rc = dav_get_heap_stats_v2((dav_obj_t *)ph_p->up_priv, &st); + if (rc == 0) + *curr_allocated = st.curr_allocated; + break; case DAOS_MD_ADMEM: *curr_allocated = 40960; /* TODO */ break; @@ -539,6 +611,46 @@ umempobj_get_heapusage(struct umem_pool *ph_p, daos_size_t *curr_allocated) return rc; } +/** Obtain the usage statistics for the memory bucket. Note that the usage + * statistics for an evictable memory bucket can be approximate value if + * memory bucket is not yet loaded on to the umem cache. + * + * \param pool[IN] Pointer to the persistent object. + * \param mb_id[IN] memory bucket id. + * \param curr_allocated[IN|OUT] Total bytes currently allocated + * \param maxsz[IN|OUT] Max size the memory bucket can grow. + * + * \return zero on success and non-zero on failure. + */ +int +umempobj_get_mbusage(struct umem_pool *ph_p, uint32_t mb_id, daos_size_t *curr_allocated, + daos_size_t *maxsz) +{ + struct dav_heap_mb_stats st; + int rc = 0; + + switch (ph_p->up_store.store_type) { + case DAOS_MD_PMEM: + case DAOS_MD_BMEM: + case DAOS_MD_ADMEM: + rc = -DER_INVAL; + break; + case DAOS_MD_BMEM_V2: + rc = dav_get_heap_mb_stats_v2((dav_obj_t *)ph_p->up_priv, mb_id, &st); + if (rc == 0) { + *curr_allocated = st.dhms_allocated; + *maxsz = st.dhms_maxsz; + } else + rc = daos_errno2der(errno); + break; + default: + D_ASSERTF(0, "bad daos_md_backend %d\n", ph_p->up_store.store_type); + break; + } + + return rc; +} + /** Log fragmentation related info for the pool. * * \param pool[IN] Pointer to the persistent object. 
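
umempobj_get_mbusage() added above only returns real statistics for the DAOS_MD_BMEM_V2 backend, and its doc comment notes the numbers can be approximate for an evictable bucket that is not resident in the umem cache. A hedged sketch of a hypothetical caller; the pool handle, bucket id, and header paths are assumptions rather than anything defined by this patch:

#include <daos/debug.h>	/* assumed header for D_INFO/DF_U64 */
#include <daos/mem.h>	/* assumed header for the umempobj API */

/*
 * Hypothetical caller: report usage for one memory bucket. `pool` and `mb_id`
 * are assumed to be obtained elsewhere; backends other than DAOS_MD_BMEM_V2
 * return -DER_INVAL from umempobj_get_mbusage().
 */
static int
report_mb_usage(struct umem_pool *pool, uint32_t mb_id)
{
	daos_size_t used = 0;
	daos_size_t maxsz = 0;
	int         rc;

	rc = umempobj_get_mbusage(pool, mb_id, &used, &maxsz);
	if (rc != 0)
		return rc;

	D_INFO("MB %u: "DF_U64" of "DF_U64" bytes allocated\n", mb_id, used, maxsz);
	return 0;
}
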
@@ -567,6 +679,12 @@ umempobj_log_fraginfo(struct umem_pool *ph_p) DF_U64", run_active: "DF_U64"\n", st.run_allocated, st.run_active); break; + case DAOS_MD_BMEM_V2: + dav_get_heap_stats_v2((dav_obj_t *)ph_p->up_priv, &st); + D_ERROR("Fragmentation info, run_allocated: " + DF_U64", run_active: "DF_U64"\n", + st.run_allocated, st.run_active); + break; case DAOS_MD_ADMEM: /* TODO */ D_ERROR("Fragmentation info, not implemented in ADMEM yet.\n"); @@ -658,7 +776,8 @@ pmem_tx_free(struct umem_instance *umm, umem_off_t umoff) } static umem_off_t -pmem_tx_alloc(struct umem_instance *umm, size_t size, uint64_t flags, unsigned int type_num) +pmem_tx_alloc(struct umem_instance *umm, size_t size, uint64_t flags, unsigned int type_num, + unsigned int unused) { uint64_t pflags = 0; @@ -866,7 +985,8 @@ pmem_tx_stage(void) } static umem_off_t -pmem_reserve(struct umem_instance *umm, void *act, size_t size, unsigned int type_num) +pmem_reserve(struct umem_instance *umm, void *act, size_t size, unsigned int type_num, + unsigned int unused) { PMEMobjpool *pop = (PMEMobjpool *)umm->umm_pool->up_priv; @@ -900,8 +1020,8 @@ pmem_atomic_copy(struct umem_instance *umm, void *dest, const void *src, } static umem_off_t -pmem_atomic_alloc(struct umem_instance *umm, size_t size, - unsigned int type_num) +pmem_atomic_alloc(struct umem_instance *umm, size_t size, unsigned int type_num, + unsigned int unused) { PMEMoid oid; PMEMobjpool *pop = (PMEMobjpool *)umm->umm_pool->up_priv; @@ -1049,7 +1169,8 @@ bmem_tx_free(struct umem_instance *umm, umem_off_t umoff) } static umem_off_t -bmem_tx_alloc(struct umem_instance *umm, size_t size, uint64_t flags, unsigned int type_num) +bmem_tx_alloc(struct umem_instance *umm, size_t size, uint64_t flags, unsigned int type_num, + unsigned int mbkt_id) { uint64_t pflags = 0; @@ -1162,7 +1283,8 @@ bmem_defer_free(struct umem_instance *umm, umem_off_t off, void *act) } static umem_off_t -bmem_reserve(struct umem_instance *umm, void *act, size_t size, unsigned int type_num) +bmem_reserve(struct umem_instance *umm, void *act, size_t size, unsigned int type_num, + unsigned int mbkt_id) { dav_obj_t *pop = (dav_obj_t *)umm->umm_pool->up_priv; @@ -1201,8 +1323,8 @@ bmem_atomic_copy(struct umem_instance *umm, void *dest, const void *src, } static umem_off_t -bmem_atomic_alloc(struct umem_instance *umm, size_t size, - unsigned int type_num) +bmem_atomic_alloc(struct umem_instance *umm, size_t size, unsigned int type_num, + unsigned int mbkt_id) { uint64_t off; dav_obj_t *pop = (dav_obj_t *)umm->umm_pool->up_priv; @@ -1255,6 +1377,255 @@ static umem_ops_t bmem_ops = { .mo_tx_add_callback = umem_tx_add_cb, }; +/** BMEM v2 operations (depends on dav) */ + +static int +bmem_tx_free_v2(struct umem_instance *umm, umem_off_t umoff) +{ + /* + * This free call could be on error cleanup code path where + * the transaction is already aborted due to previous failed + * pmemobj_tx call. Let's just skip it in this case. + * + * The reason we don't fix caller to avoid calling tx_free() + * in an aborted transaction is that the caller code could be + * shared by both transactional and non-transactional (where + * UMEM_CLASS_VMEM is used, see btree code) interfaces, and + * the explicit umem_free() on error cleanup is necessary for + * non-transactional case. + */ + if (dav_tx_stage_v2() == DAV_TX_STAGE_ONABORT) + return 0; + + if (!UMOFF_IS_NULL(umoff)) { + int rc; + + rc = dav_tx_free_v2(umem_off2offset(umoff)); + return rc ? 
umem_tx_errno(rc) : 0; + } + + return 0; +} + +static umem_off_t +bmem_tx_alloc_v2(struct umem_instance *umm, size_t size, uint64_t flags, unsigned int type_num, + unsigned int mbkt_id) +{ + uint64_t pflags = 0; + + get_slab(umm, &pflags, &size); + + if (flags & UMEM_FLAG_ZERO) + pflags |= DAV_FLAG_ZERO; + if (flags & UMEM_FLAG_NO_FLUSH) + pflags |= DAV_FLAG_NO_FLUSH; + if (mbkt_id != 0) + pflags |= DAV_EZONE_ID(mbkt_id); + return dav_tx_alloc_v2(size, type_num, pflags); +} + +static int +bmem_tx_add_v2(struct umem_instance *umm, umem_off_t umoff, + uint64_t offset, size_t size) +{ + int rc; + + rc = dav_tx_add_range_v2(umem_off2offset(umoff), size); + return rc ? umem_tx_errno(rc) : 0; +} + +static int +bmem_tx_xadd_v2(struct umem_instance *umm, umem_off_t umoff, uint64_t offset, + size_t size, uint64_t flags) +{ + int rc; + uint64_t pflags = 0; + + if (flags & UMEM_XADD_NO_SNAPSHOT) + pflags |= DAV_XADD_NO_SNAPSHOT; + + rc = dav_tx_xadd_range_v2(umem_off2offset(umoff), size, pflags); + return rc ? umem_tx_errno(rc) : 0; +} + + +static int +bmem_tx_add_ptr_v2(struct umem_instance *umm, void *ptr, size_t size) +{ + int rc; + + rc = dav_tx_add_range_direct_v2(ptr, size); + return rc ? umem_tx_errno(rc) : 0; +} + +static int +bmem_tx_abort_v2(struct umem_instance *umm, int err) +{ + /* + * obj_tx_abort() may have already been called in the error + * handling code of pmemobj APIs. + */ + if (dav_tx_stage_v2() != DAV_TX_STAGE_ONABORT) + dav_tx_abort_v2(err); + + err = dav_tx_end_v2(NULL); + return err ? umem_tx_errno(err) : 0; +} + +static int +bmem_tx_begin_v2(struct umem_instance *umm, struct umem_tx_stage_data *txd) +{ + int rc; + dav_obj_t *pop = (dav_obj_t *)umm->umm_pool->up_priv; + + if (txd != NULL) { + D_ASSERT(txd->txd_magic == UMEM_TX_DATA_MAGIC); + rc = dav_tx_begin_v2(pop, NULL, DAV_TX_PARAM_CB, pmem_stage_callback, + txd, DAV_TX_PARAM_NONE); + } else { + rc = dav_tx_begin_v2(pop, NULL, DAV_TX_PARAM_NONE); + } + + if (rc != 0) { + /* + * dav_tx_end() needs be called to re-initialize the + * tx state when dav_tx_begin() failed. + */ + rc = dav_tx_end_v2(NULL); + return rc ? umem_tx_errno(rc) : 0; + } + return 0; +} + +static int +bmem_tx_commit_v2(struct umem_instance *umm, void *data) +{ + int rc; + + dav_tx_commit_v2(); + rc = dav_tx_end_v2(data); + + return rc ? umem_tx_errno(rc) : 0; +} + +static int +bmem_tx_stage_v2(void) +{ + return dav_tx_stage_v2(); +} + +static void +bmem_defer_free_v2(struct umem_instance *umm, umem_off_t off, void *act) +{ + dav_obj_t *pop = (dav_obj_t *)umm->umm_pool->up_priv; + + dav_defer_free_v2(pop, umem_off2offset(off), + (struct dav_action *)act); +} + +static umem_off_t +bmem_reserve_v2(struct umem_instance *umm, void *act, size_t size, unsigned int type_num, + unsigned int mbkt_id) +{ + dav_obj_t *pop = (dav_obj_t *)umm->umm_pool->up_priv; + uint64_t flags = DAV_EZONE_ID(mbkt_id); + + return dav_reserve_v2(pop, (struct dav_action *)act, size, type_num, flags); +} + +static void +bmem_cancel_v2(struct umem_instance *umm, void *actv, int actv_cnt) +{ + dav_obj_t *pop = (dav_obj_t *)umm->umm_pool->up_priv; + + dav_cancel_v2(pop, (struct dav_action *)actv, actv_cnt); +} + +static int +bmem_tx_publish_v2(struct umem_instance *umm, void *actv, int actv_cnt) +{ + int rc; + + rc = dav_tx_publish_v2((struct dav_action *)actv, actv_cnt); + return rc ? 
umem_tx_errno(rc) : 0; +} + +static void * +bmem_atomic_copy_v2(struct umem_instance *umm, void *dest, const void *src, + size_t len, enum acopy_hint hint) +{ + dav_obj_t *pop = (dav_obj_t *)umm->umm_pool->up_priv; + + if (hint == UMEM_RESERVED_MEM) { + memcpy(dest, src, len); + return dest; + } else { /* UMEM_COMMIT_IMMEDIATE */ + return dav_memcpy_persist_v2(pop, dest, src, len); + } +} + +static umem_off_t +bmem_atomic_alloc_v2(struct umem_instance *umm, size_t size, unsigned int type_num, + unsigned int mbkt_id) +{ + uint64_t off; + dav_obj_t *pop = (dav_obj_t *)umm->umm_pool->up_priv; + int rc; + uint64_t flags = DAV_EZONE_ID(mbkt_id); + + rc = dav_alloc_v2(pop, &off, size, type_num, flags, NULL, NULL); + if (rc) + return UMOFF_NULL; + return off; +} + +static int +bmem_atomic_free_v2(struct umem_instance *umm, umem_off_t umoff) +{ + if (!UMOFF_IS_NULL(umoff)) { + uint64_t off = umem_off2offset(umoff); + + dav_free_v2((dav_obj_t *)umm->umm_pool->up_priv, off); + } + return 0; +} + +static void +bmem_atomic_flush_v2(struct umem_instance *umm, void *addr, size_t len) +{ + /* NOP */ +} + +static uint32_t +bmem_allot_mb_evictable_v2(struct umem_instance *umm, int flags) +{ + dav_obj_t *pop = (dav_obj_t *)umm->umm_pool->up_priv; + + return dav_allot_mb_evictable_v2(pop, flags); +} + +static umem_ops_t bmem_v2_ops = { + .mo_tx_free = bmem_tx_free_v2, + .mo_tx_alloc = bmem_tx_alloc_v2, + .mo_tx_add = bmem_tx_add_v2, + .mo_tx_xadd = bmem_tx_xadd_v2, + .mo_tx_add_ptr = bmem_tx_add_ptr_v2, + .mo_tx_abort = bmem_tx_abort_v2, + .mo_tx_begin = bmem_tx_begin_v2, + .mo_tx_commit = bmem_tx_commit_v2, + .mo_tx_stage = bmem_tx_stage_v2, + .mo_reserve = bmem_reserve_v2, + .mo_defer_free = bmem_defer_free_v2, + .mo_cancel = bmem_cancel_v2, + .mo_tx_publish = bmem_tx_publish_v2, + .mo_atomic_copy = bmem_atomic_copy_v2, + .mo_atomic_alloc = bmem_atomic_alloc_v2, + .mo_atomic_free = bmem_atomic_free_v2, + .mo_atomic_flush = bmem_atomic_flush_v2, + .mo_allot_evictable_mb = bmem_allot_mb_evictable_v2, + .mo_tx_add_callback = umem_tx_add_cb, +}; + int umem_tx_errno(int err) { @@ -1283,7 +1654,8 @@ vmem_free(struct umem_instance *umm, umem_off_t umoff) } umem_off_t -vmem_alloc(struct umem_instance *umm, size_t size, uint64_t flags, unsigned int type_num) +vmem_alloc(struct umem_instance *umm, size_t size, uint64_t flags, unsigned int type_num, + unsigned int unused) { return (uint64_t)((flags & UMEM_FLAG_ZERO) ? 
calloc(1, size) : malloc(size)); @@ -1343,6 +1715,11 @@ static struct umem_class umem_class_defined[] = { .umc_ops = &bmem_ops, .umc_name = "bmem", }, + { + .umc_id = UMEM_CLASS_BMEM_V2, + .umc_ops = &bmem_v2_ops, + .umc_name = "bmem_v2", + }, { .umc_id = UMEM_CLASS_ADMEM, .umc_ops = &ad_mem_ops, @@ -1392,6 +1769,11 @@ set_offsets(struct umem_instance *umm) umm->umm_base = (uint64_t)dav_get_base_ptr(dav_pop); break; + case UMEM_CLASS_BMEM_V2: + dav_pop = (dav_obj_t *)umm->umm_pool->up_priv; + + umm->umm_base = (uint64_t)dav_get_base_ptr_v2(dav_pop); + break; case UMEM_CLASS_ADMEM: bh.bh_blob = (struct ad_blob *)umm->umm_pool->up_priv; umm->umm_base = (uint64_t)ad_base(bh); @@ -1537,6 +1919,7 @@ umem_rsrvd_item_size(struct umem_instance *umm) case UMEM_CLASS_ADMEM: return sizeof(struct ad_reserv_act); case UMEM_CLASS_BMEM: + case UMEM_CLASS_BMEM_V2: return sizeof(struct dav_action); default: D_ERROR("bad umm_id %d\n", umm->umm_id); @@ -1601,8 +1984,8 @@ umem_rsrvd_act_free(struct umem_rsrvd_act **rsrvd_act) } umem_off_t -umem_reserve(struct umem_instance *umm, struct umem_rsrvd_act *rsrvd_act, - size_t size) +umem_reserve_common(struct umem_instance *umm, struct umem_rsrvd_act *rsrvd_act, size_t size, + unsigned int mbkt_id) { if (umm->umm_ops->mo_reserve) { void *act; @@ -1613,8 +1996,7 @@ umem_reserve(struct umem_instance *umm, struct umem_rsrvd_act *rsrvd_act, D_ASSERT(rsrvd_act->rs_actv_cnt > rsrvd_act->rs_actv_at); act = rsrvd_act->rs_actv + act_size * rsrvd_act->rs_actv_at; - off = umm->umm_ops->mo_reserve(umm, act, size, - UMEM_TYPE_ANY); + off = umm->umm_ops->mo_reserve(umm, act, size, UMEM_TYPE_ANY, mbkt_id); if (!UMOFF_IS_NULL(off)) rsrvd_act->rs_actv_at++; D_ASSERTF(umem_off2flags(off) == 0, @@ -1680,12 +2062,18 @@ umem_tx_publish(struct umem_instance *umm, struct umem_rsrvd_act *rsrvd_act) return rc; } +/* Memory page */ struct umem_page_info { - /** Back pointer to page */ - struct umem_page *pi_page; + /** Mapped MD page ID */ + uint32_t pi_pg_id; + /** Reference count */ + uint32_t pi_ref; /** Page flags */ - uint64_t pi_waiting : 1, /** Page is copied, but waiting for commit */ - pi_copying : 1; /** Page is being copied. Blocks writes. */ + uint64_t pi_io : 1, /** Page is being flushed/loaded to/from MD-blob */ + pi_copying : 1, /** Page is being copied. Blocks writes. */ + pi_mapped : 1, /** Page is mapped to a MD page */ + pi_sys : 1, /** Page is brought to cache by system internal access */ + pi_loaded : 1; /** Page is loaded */ /** Highest transaction ID checkpointed. This is set before the page is copied. The * checkpoint will not be executed until the last committed ID is greater than or * equal to this value. 
If that's not the case immediately, the waiting flag is set @@ -1694,243 +2082,593 @@ struct umem_page_info { uint64_t pi_last_checkpoint; /** Highest transaction ID of writes to the page */ uint64_t pi_last_inflight; - /** link chain on global dirty list, LRU list, or free info list */ - d_list_t pi_link; + /** link to global LRU lists, or global free page list, or global pinned list */ + d_list_t pi_lru_link; + /** link to global dirty page list, or wait commit list, or temporary list for flushing */ + d_list_t pi_dirty_link; + /** link to global flushing page list */ + d_list_t pi_flush_link; + /** Waitqueue for page loading/flushing */ + void *pi_io_wq; + /** Waitqueue for page committing */ + void *pi_commit_wq; /** page memory address */ uint8_t *pi_addr; /** Information about in-flight checkpoint */ void *pi_chkpt_data; - /** bitmap for each dirty 16K unit */ - uint64_t pi_bmap[UMEM_CACHE_BMAP_SZ]; + /** bitmap for each dirty 4K unit */ + uint64_t *pi_bmap; }; -int -umem_cache_alloc(struct umem_store *store, uint64_t max_mapped) +/* Convert page ID to MD-blob offset */ +static inline umem_off_t +cache_id2off(struct umem_cache *cache, uint32_t pg_id) { - struct umem_cache *cache; - struct umem_page_info *pinfo; - uint64_t num_pages; - int rc = 0; - int idx; + return ((umem_off_t)pg_id << cache->ca_page_shift) + cache->ca_base_off; +} - D_ASSERT(store != NULL); +/* Convert MD-blob offset to page ID */ +static inline uint32_t +cache_off2id(struct umem_cache *cache, umem_off_t offset) +{ + D_ASSERT(offset >= cache->ca_base_off); + return (offset - cache->ca_base_off) >> cache->ca_page_shift; +} - num_pages = (store->stor_size + UMEM_CACHE_PAGE_SZ - 1) >> UMEM_CACHE_PAGE_SZ_SHIFT; +/* Convert MD-blob offset to MD page */ +static inline struct umem_page * +cache_off2page(struct umem_cache *cache, umem_off_t offset) +{ + uint32_t idx = cache_off2id(cache, offset); - if (max_mapped != 0) { - D_ERROR("Setting max_mapped is unsupported at present\n"); - return -DER_NOTSUPPORTED; - } + D_ASSERTF(idx < cache->ca_md_pages, "offset=" DF_U64 ", md_pages=%u, idx=%u\n", + offset, cache->ca_md_pages, idx); - max_mapped = num_pages; + return &cache->ca_pages[idx]; +} - D_ALLOC(cache, sizeof(*cache) + sizeof(cache->ca_pages[0]) * num_pages + - sizeof(cache->ca_pages[0].pg_info[0]) * max_mapped); - if (cache == NULL) - D_GOTO(error, rc = -DER_NOMEM); +/* Convert memory pointer to memory page */ +static inline struct umem_page_info * +cache_ptr2pinfo(struct umem_cache *cache, const void *ptr) +{ + struct umem_page_info *pinfo; + uint32_t idx; - D_DEBUG(DB_IO, - "Allocated page cache for stor->stor_size=" DF_U64 ", " DF_U64 " pages at %p\n", - store->stor_size, num_pages, cache); + D_ASSERT(ptr >= cache->ca_base); + idx = (ptr - cache->ca_base) >> cache->ca_page_shift; - cache->ca_store = store; - cache->ca_num_pages = num_pages; - cache->ca_max_mapped = num_pages; + D_ASSERTF(idx < cache->ca_mem_pages, "ptr=%p, md_pages=%u, idx=%u\n", + ptr, cache->ca_mem_pages, idx); + pinfo = (struct umem_page_info *)&cache->ca_pages[cache->ca_md_pages]; - D_INIT_LIST_HEAD(&cache->ca_pgs_dirty); - D_INIT_LIST_HEAD(&cache->ca_pgs_copying); - D_INIT_LIST_HEAD(&cache->ca_pgs_lru); - D_INIT_LIST_HEAD(&cache->ca_pi_free); + return &pinfo[idx]; +} - for (idx = 0; idx < num_pages; idx++) - cache->ca_pages[idx].pg_id = idx; +/* Convert MD-blob offset to page offset */ +static inline uint32_t +cache_off2pg_off(struct umem_cache *cache, umem_off_t offset) +{ + D_ASSERT(offset >= cache->ca_base_off); + return (offset - 
cache->ca_base_off) & cache->ca_page_mask; +} - pinfo = (struct umem_page_info *)&cache->ca_pages[idx]; +bool +umem_cache_offisloaded(struct umem_store *store, umem_off_t offset) +{ + struct umem_cache *cache = store->cache; + struct umem_page *page = cache_off2page(cache, offset); - for (idx = 0; idx < max_mapped; idx++) { - d_list_add_tail(&pinfo->pi_link, &cache->ca_pi_free); - pinfo++; - } + return ((page->pg_info != NULL) && page->pg_info->pi_loaded); +} - store->cache = cache; +/* Convert MD-blob offset to memory pointer */ +void * +umem_cache_off2ptr(struct umem_store *store, umem_off_t offset) +{ + struct umem_cache *cache = store->cache; + struct umem_page *page = cache_off2page(cache, offset); - return 0; + /* The page must be mapped */ + D_ASSERT(page->pg_info != NULL); + return (void *)(page->pg_info->pi_addr + cache_off2pg_off(cache, offset)); +} -error: - D_FREE(cache); - return rc; +/* Convert memory pointer to MD-blob offset */ +umem_off_t +umem_cache_ptr2off(struct umem_store *store, const void *ptr) +{ + struct umem_cache *cache = store->cache; + struct umem_page_info *pinfo = cache_ptr2pinfo(cache, ptr); + umem_off_t offset; + + /* The page must be mapped */ + D_ASSERT(pinfo->pi_mapped); + offset = cache_id2off(cache, pinfo->pi_pg_id); + offset += (ptr - cache->ca_base) & cache->ca_page_mask; + + return offset; } -int -umem_cache_free(struct umem_store *store) +static int +page_waitqueue_create(struct umem_cache *cache, struct umem_page_info *pinfo) { - /** XXX: check reference counts? */ - D_FREE(store->cache); + struct umem_store *store = cache->ca_store; + int rc; + + D_ASSERT(store->stor_ops->so_waitqueue_create != NULL); + if (pinfo->pi_io_wq == NULL) { + rc = store->stor_ops->so_waitqueue_create(&pinfo->pi_io_wq); + if (rc) + return rc; + } + if (pinfo->pi_commit_wq == NULL) { + rc = store->stor_ops->so_waitqueue_create(&pinfo->pi_commit_wq); + if (rc) + return rc; + } + return 0; } -int -umem_cache_check(struct umem_store *store, uint64_t num_pages) +static void +page_waitqueue_destroy(struct umem_cache *cache, struct umem_page_info *pinfo) { - struct umem_cache *cache = store->cache; + struct umem_store *store = cache->ca_store; - D_ASSERT(num_pages + cache->ca_mapped <= cache->ca_num_pages); + if (pinfo->pi_io_wq != NULL) { + store->stor_ops->so_waitqueue_destroy(pinfo->pi_io_wq); + pinfo->pi_io_wq = NULL; + } + if (pinfo->pi_commit_wq != NULL) { + store->stor_ops->so_waitqueue_destroy(pinfo->pi_commit_wq); + pinfo->pi_commit_wq = NULL; + } +} - if (num_pages > cache->ca_max_mapped - cache->ca_mapped) - return num_pages - (cache->ca_max_mapped - cache->ca_mapped); +static inline void +verify_inactive_page(struct umem_page_info *pinfo) +{ + D_ASSERT(d_list_empty(&pinfo->pi_flush_link)); + D_ASSERT(pinfo->pi_ref == 0); + D_ASSERT(pinfo->pi_io == 0); + D_ASSERT(pinfo->pi_copying == 0); +} - return 0; +static inline void +verify_clean_page(struct umem_page_info *pinfo, int mapped) +{ + D_ASSERT(d_list_empty(&pinfo->pi_lru_link)); + D_ASSERT(d_list_empty(&pinfo->pi_dirty_link)); + D_ASSERT(pinfo->pi_mapped == mapped); + verify_inactive_page(pinfo); } int -umem_cache_evict(struct umem_store *store, uint64_t num_pages) +umem_cache_free(struct umem_store *store) { - /** XXX: Not yet implemented */ + struct umem_cache *cache = store->cache; + struct umem_page_info *pinfo; + int i; + + if (cache == NULL) + return 0; + + D_ASSERT(d_list_empty(&cache->ca_pgs_flushing)); + D_ASSERT(d_list_empty(&cache->ca_pgs_wait_commit)); + D_ASSERT(d_list_empty(&cache->ca_pgs_pinned)); + 
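
The umem_cache_off2ptr()/umem_cache_ptr2off() helpers above are plain shift-and-mask arithmetic over the cache geometry. A minimal standalone sketch using a simplified stand-in for the cache structure (field names mirror the patch, everything else is omitted), assuming the 1:1 page mapping of phase-1 mode:

#include <stdint.h>

/* Simplified stand-in for the cache geometry fields used by the helpers;
 * the real struct umem_cache also tracks per-page mappings, stats and lists.
 */
struct toy_cache {
	char        *ca_base;		/* start of the in-memory page area */
	uint64_t     ca_base_off;	/* MD-blob offset corresponding to page 0 */
	unsigned int ca_page_shift;	/* log2(page size) */
	uint64_t     ca_page_mask;	/* page size - 1 */
};

/* MD-blob offset -> page index (mirrors cache_off2id()) */
static inline uint32_t
toy_off2id(const struct toy_cache *c, uint64_t off)
{
	return (uint32_t)((off - c->ca_base_off) >> c->ca_page_shift);
}

/* MD-blob offset -> pointer, assuming the page holding `off` is mapped at the
 * same index in memory (true in phase-1 mode where MD pages are mapped 1:1).
 */
static inline void *
toy_off2ptr(const struct toy_cache *c, uint64_t off)
{
	return c->ca_base + ((uint64_t)toy_off2id(c, off) << c->ca_page_shift) +
	       ((off - c->ca_base_off) & c->ca_page_mask);
}
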
D_ASSERT(cache->ca_pgs_stats[UMEM_PG_STATS_PINNED] == 0); + D_ASSERT(cache->ca_reserve_waiters == 0); + + pinfo = (struct umem_page_info *)&cache->ca_pages[cache->ca_md_pages]; + for (i = 0; i < cache->ca_mem_pages; i++) { + verify_inactive_page(pinfo); + + page_waitqueue_destroy(store->cache, pinfo); + pinfo++; + } + + if (cache->ca_reserve_wq != NULL) { + store->stor_ops->so_waitqueue_destroy(cache->ca_reserve_wq); + cache->ca_reserve_wq = NULL; + + } + + D_FREE(store->cache); return 0; } -int -umem_cache_map_range(struct umem_store *store, umem_off_t offset, void *start_addr, - uint64_t num_pages) +/* 1: phase I mode; 2: phase II mode; */ +static inline unsigned int +cache_mode(struct umem_cache *cache) { - struct umem_cache *cache = store->cache; - struct umem_page *page; - struct umem_page_info *pinfo; - struct umem_page *end_page; - uint64_t current_addr = (uint64_t)start_addr; + return cache->ca_mode; +} - if (store->cache == NULL) - return 0; /* TODO: When SMD is supported outside VOS, this will be an error */ +static inline struct umem_page_info * +cache_pop_free_page(struct umem_cache *cache) +{ + struct umem_page_info *pinfo; - page = umem_cache_off2page(cache, offset); - end_page = page + num_pages; + pinfo = d_list_pop_entry(&cache->ca_pgs_free, struct umem_page_info, pi_lru_link); + if (pinfo != NULL) { + D_ASSERT(cache->ca_pgs_stats[UMEM_PG_STATS_FREE] > 0); + cache->ca_pgs_stats[UMEM_PG_STATS_FREE] -= 1; + } + return pinfo; +} + +#define UMEM_CHUNK_IDX_SHIFT 6 +#define UMEM_CHUNK_IDX_BITS (1 << UMEM_CHUNK_IDX_SHIFT) +#define UMEM_CHUNK_IDX_MASK (UMEM_CHUNK_IDX_BITS - 1) - D_ASSERTF(page->pg_id + num_pages <= cache->ca_num_pages, - "pg_id=%d, num_pages=" DF_U64 ", cache pages=" DF_U64 "\n", page->pg_id, - num_pages, cache->ca_num_pages); +#define UMEM_CACHE_PAGE_SHIFT_MAX 27 /* 128MB */ +#define UMEM_CACHE_BMAP_SZ_MAX (1 << (UMEM_CACHE_PAGE_SHIFT_MAX - \ + UMEM_CACHE_CHUNK_SZ_SHIFT - UMEM_CHUNK_IDX_SHIFT)) +#define UMEM_CACHE_RSRVD_PAGES 4 - while (page != end_page) { - D_ASSERT(page->pg_info == NULL); +int +umem_cache_alloc(struct umem_store *store, uint32_t page_sz, uint32_t md_pgs, uint32_t mem_pgs, + uint32_t max_ne_pgs, uint32_t base_off, void *base, + bool (*is_evictable_fn)(void *arg, uint32_t pg_id), + int (*evtcb_fn)(int evt_type, void *arg, uint32_t pg_id), void *fn_arg) +{ + struct umem_cache *cache; + struct umem_page_info *pinfo; + struct umem_page *page; + unsigned int page_shift, bmap_sz; + uint64_t *bmap; + void *cur_addr = base; + int idx, cmode = 1, rc = 0; + + D_ASSERT(store != NULL); + D_ASSERT(base != NULL); + + page_shift = __builtin_ctz(page_sz); + if (page_sz != (1 << page_shift)) { + D_ERROR("Page size (%u) isn't aligned.\n", page_sz); + return -DER_INVAL; + } else if (page_shift > UMEM_CACHE_PAGE_SHIFT_MAX) { + D_ERROR("Page size (%u) > Max page size (%u).\n", + page_sz, 1 << UMEM_CACHE_PAGE_SHIFT_MAX); + return -DER_INVAL; + } else if (page_shift <= (UMEM_CACHE_CHUNK_SZ_SHIFT + UMEM_CHUNK_IDX_SHIFT)) { + D_ERROR("Page size (%u) <= Min page size (%u)\n", + page_sz, 1 << (UMEM_CACHE_CHUNK_SZ_SHIFT + UMEM_CHUNK_IDX_SHIFT)); + return -DER_INVAL; + } + + D_ASSERT(md_pgs > 0 && md_pgs >= mem_pgs); + if (mem_pgs == 0) { /* Phase 1 mode */ + mem_pgs = md_pgs; + max_ne_pgs = md_pgs; + } else + cmode = 2; + + bmap_sz = (1 << (page_shift - UMEM_CACHE_CHUNK_SZ_SHIFT - UMEM_CHUNK_IDX_SHIFT)); + + D_ALLOC(cache, sizeof(*cache) + sizeof(cache->ca_pages[0]) * md_pgs + + sizeof(cache->ca_pages[0].pg_info[0]) * mem_pgs + + bmap_sz * sizeof(uint64_t) * mem_pgs); + if 
(cache == NULL) + return -DER_NOMEM; + + D_DEBUG(DB_IO, "Allocated page cache, md-pages(%u), mem-pages(%u), max-ne-pages(%u) %p\n", + md_pgs, mem_pgs, max_ne_pgs, cache); + + cache->ca_store = store; + cache->ca_base = base; + cache->ca_base_off = base_off; + cache->ca_md_pages = md_pgs; + cache->ca_mem_pages = mem_pgs; + cache->ca_max_ne_pages = max_ne_pgs; + cache->ca_page_sz = page_sz; + cache->ca_page_shift = page_shift; + cache->ca_page_mask = page_sz - 1; + cache->ca_bmap_sz = bmap_sz; + cache->ca_evictable_fn = is_evictable_fn; + cache->ca_evtcb_fn = evtcb_fn; + cache->ca_fn_arg = fn_arg; + cache->ca_mode = cmode; + + D_INIT_LIST_HEAD(&cache->ca_pgs_free); + D_INIT_LIST_HEAD(&cache->ca_pgs_dirty); + D_INIT_LIST_HEAD(&cache->ca_pgs_lru[0]); + D_INIT_LIST_HEAD(&cache->ca_pgs_lru[1]); + D_INIT_LIST_HEAD(&cache->ca_pgs_flushing); + D_INIT_LIST_HEAD(&cache->ca_pgs_wait_commit); + D_INIT_LIST_HEAD(&cache->ca_pgs_pinned); + + pinfo = (struct umem_page_info *)&cache->ca_pages[md_pgs]; + bmap = (uint64_t *)&pinfo[mem_pgs]; + + /* Initialize memory page array */ + for (idx = 0; idx < mem_pgs; idx++) { + pinfo->pi_bmap = bmap; + pinfo->pi_addr = (void *)cur_addr; + D_INIT_LIST_HEAD(&pinfo->pi_dirty_link); + D_INIT_LIST_HEAD(&pinfo->pi_flush_link); + d_list_add_tail(&pinfo->pi_lru_link, &cache->ca_pgs_free); + cache->ca_pgs_stats[UMEM_PG_STATS_FREE] += 1; + + pinfo++; + bmap += bmap_sz; + cur_addr += page_sz; + } + store->cache = cache; + + /* Phase 2 mode */ + if (cache_mode(cache) != 1) { + D_ASSERT(store->stor_ops->so_waitqueue_create != NULL); + rc = store->stor_ops->so_waitqueue_create(&cache->ca_reserve_wq); + if (rc) + goto error; + + pinfo = (struct umem_page_info *)&cache->ca_pages[cache->ca_md_pages]; + for (idx = 0; idx < cache->ca_mem_pages; idx++) { + rc = page_waitqueue_create(cache, pinfo); + if (rc) + goto error; + pinfo++; + } + return 0; + } - pinfo = d_list_pop_entry(&cache->ca_pi_free, struct umem_page_info, pi_link); + /* Map all MD pages to memory pages for phase 1 mode */ + for (idx = 0; idx < md_pgs; idx++) { + pinfo = cache_pop_free_page(cache); D_ASSERT(pinfo != NULL); + D_ASSERT(pinfo->pi_addr == (base + (uint64_t)idx * page_sz)); + pinfo->pi_pg_id = idx; + pinfo->pi_mapped = 1; + pinfo->pi_loaded = 1; + + page = &cache->ca_pages[idx]; + D_ASSERT(page->pg_info == NULL); page->pg_info = pinfo; - pinfo->pi_page = page; - pinfo->pi_addr = (void *)current_addr; - current_addr += UMEM_CACHE_PAGE_SZ; - d_list_add_tail(&pinfo->pi_link, &cache->ca_pgs_lru); - page++; + /* Add to non-evictable LRU */ + cache->ca_pgs_stats[UMEM_PG_STATS_NONEVICTABLE] += 1; + d_list_add_tail(&pinfo->pi_lru_link, &cache->ca_pgs_lru[0]); } - cache->ca_mapped += num_pages; - return 0; +error: + umem_cache_free(store); + return rc; } -int -umem_cache_pin(struct umem_store *store, umem_off_t addr, daos_size_t size) +static inline bool +is_id_evictable(struct umem_cache *cache, uint32_t pg_id) +{ + return cache->ca_evictable_fn && cache->ca_evictable_fn(cache->ca_fn_arg, pg_id); +} + +static inline void +cache_push_free_page(struct umem_cache *cache, struct umem_page_info *pinfo) { - struct umem_cache *cache = store->cache; - struct umem_page *page = umem_cache_off2page(cache, addr); - struct umem_page *end_page = umem_cache_off2page(cache, addr + size - 1) + 1; + d_list_add_tail(&pinfo->pi_lru_link, &cache->ca_pgs_free); + cache->ca_pgs_stats[UMEM_PG_STATS_FREE] += 1; +} - while (page != end_page) { - page->pg_ref++; - page++; +static inline void +cache_unmap_page(struct umem_cache *cache, struct 
umem_page_info *pinfo) +{ + verify_clean_page(pinfo, 1); + D_ASSERT(pinfo->pi_pg_id < cache->ca_md_pages); + D_ASSERT(cache->ca_pages[pinfo->pi_pg_id].pg_info == pinfo); + + pinfo->pi_mapped = 0; + pinfo->pi_loaded = 0; + pinfo->pi_last_inflight = 0; + pinfo->pi_last_checkpoint = 0; + cache->ca_pages[pinfo->pi_pg_id].pg_info = NULL; + + cache_push_free_page(cache, pinfo); + + if (!is_id_evictable(cache, pinfo->pi_pg_id)) { + D_ASSERT(cache->ca_pgs_stats[UMEM_PG_STATS_NONEVICTABLE] > 0); + cache->ca_pgs_stats[UMEM_PG_STATS_NONEVICTABLE] -= 1; } +} + +static inline void +cache_map_page(struct umem_cache *cache, struct umem_page_info *pinfo, unsigned int pg_id) +{ + verify_clean_page(pinfo, 0); + D_ASSERT(pinfo->pi_loaded == 0); + + pinfo->pi_mapped = 1; + pinfo->pi_pg_id = pg_id; + cache->ca_pages[pg_id].pg_info = pinfo; + if (!is_id_evictable(cache, pg_id)) + cache->ca_pgs_stats[UMEM_PG_STATS_NONEVICTABLE] += 1; - return 0; } -int -umem_cache_unpin(struct umem_store *store, umem_off_t addr, daos_size_t size) +static inline void +cache_add2lru(struct umem_cache *cache, struct umem_page_info *pinfo) +{ + D_ASSERT(d_list_empty(&pinfo->pi_lru_link)); + D_ASSERT(pinfo->pi_ref == 0); + + if (is_id_evictable(cache, pinfo->pi_pg_id)) + d_list_add_tail(&pinfo->pi_lru_link, &cache->ca_pgs_lru[1]); + else + d_list_add_tail(&pinfo->pi_lru_link, &cache->ca_pgs_lru[0]); +} + +static inline void +cache_unpin_page(struct umem_cache *cache, struct umem_page_info *pinfo) { - struct umem_cache *cache = store->cache; - struct umem_page *page = umem_cache_off2page(cache, addr); - struct umem_page *end_page = umem_cache_off2page(cache, addr + size - 1) + 1; + D_ASSERT(pinfo->pi_ref > 0); + pinfo->pi_ref--; - while (page != end_page) { - D_ASSERT(page->pg_ref >= 1); - page->pg_ref--; - page++; + if (pinfo->pi_ref == 0) { + d_list_del_init(&pinfo->pi_lru_link); + cache_add2lru(cache, pinfo); + if (is_id_evictable(cache, pinfo->pi_pg_id)) { + D_ASSERT(cache->ca_pgs_stats[UMEM_PG_STATS_PINNED] > 0); + cache->ca_pgs_stats[UMEM_PG_STATS_PINNED] -= 1; + } } +} - return 0; +static inline void +cache_pin_page(struct umem_cache *cache, struct umem_page_info *pinfo) +{ + pinfo->pi_ref++; + if (pinfo->pi_ref == 1) { + d_list_del_init(&pinfo->pi_lru_link); + d_list_add_tail(&pinfo->pi_lru_link, &cache->ca_pgs_pinned); + if (is_id_evictable(cache, pinfo->pi_pg_id)) + cache->ca_pgs_stats[UMEM_PG_STATS_PINNED] += 1; + } } -#define UMEM_CHUNK_IDX_SHIFT 6 -#define UMEM_CHUNK_IDX_BITS (1 << UMEM_CHUNK_IDX_SHIFT) -#define UMEM_CHUNK_IDX_MASK (UMEM_CHUNK_IDX_BITS - 1) +static inline void +page_wait_io(struct umem_cache *cache, struct umem_page_info *pinfo) +{ + struct umem_store *store = cache->ca_store; + + D_ASSERT(pinfo->pi_io == 1); + if (store->stor_ops->so_waitqueue_create == NULL) + return; + + D_ASSERT(store->stor_ops->so_waitqueue_wait != NULL); + D_ASSERT(pinfo->pi_io_wq != NULL); + store->stor_ops->so_waitqueue_wait(pinfo->pi_io_wq, false); +} + +static inline void +page_wait_committed(struct umem_cache *cache, struct umem_page_info *pinfo, bool yield_only) +{ + struct umem_store *store = cache->ca_store; + + /* The page is must in flushing */ + D_ASSERT(pinfo->pi_io == 1); + if (store->stor_ops->so_waitqueue_create == NULL) + return; + + D_ASSERT(store->stor_ops->so_waitqueue_wait != NULL); + D_ASSERT(pinfo->pi_commit_wq != NULL); + store->stor_ops->so_waitqueue_wait(pinfo->pi_commit_wq, yield_only); +} + +static inline void +page_wakeup_io(struct umem_cache *cache, struct umem_page_info *pinfo) +{ + struct umem_store 
*store = cache->ca_store; + + D_ASSERT(pinfo->pi_io == 0); + if (store->stor_ops->so_waitqueue_create == NULL) + return; + + if (cache_mode(cache) == 1) + return; + + D_ASSERT(store->stor_ops->so_waitqueue_wakeup != NULL); + D_ASSERT(pinfo->pi_io_wq != NULL); + store->stor_ops->so_waitqueue_wakeup(pinfo->pi_io_wq, true); +} + +static inline void +page_wakeup_commit(struct umem_cache *cache, struct umem_page_info *pinfo) +{ + struct umem_store *store = cache->ca_store; + + /* The page is must in flushing */ + D_ASSERT(pinfo->pi_io == 1); + if (store->stor_ops->so_waitqueue_create == NULL) + return; + + D_ASSERT(store->stor_ops->so_waitqueue_wakeup != NULL); + D_ASSERT(pinfo->pi_commit_wq != NULL); + store->stor_ops->so_waitqueue_wakeup(pinfo->pi_commit_wq, true); +} + +static inline bool +is_page_dirty(struct umem_page_info *pinfo) +{ + return (pinfo->pi_last_inflight != pinfo->pi_last_checkpoint); +} static inline void touch_page(struct umem_store *store, struct umem_page_info *pinfo, uint64_t wr_tx, umem_off_t first_byte, umem_off_t last_byte) { struct umem_cache *cache = store->cache; - uint64_t start_bit = (first_byte & UMEM_CACHE_PAGE_SZ_MASK) >> UMEM_CACHE_CHUNK_SZ_SHIFT; - uint64_t end_bit = (last_byte & UMEM_CACHE_PAGE_SZ_MASK) >> UMEM_CACHE_CHUNK_SZ_SHIFT; + uint64_t start_bit = (first_byte & cache->ca_page_mask) >> UMEM_CACHE_CHUNK_SZ_SHIFT; + uint64_t end_bit = (last_byte & cache->ca_page_mask) >> UMEM_CACHE_CHUNK_SZ_SHIFT; uint64_t bit_nr; uint64_t bit; uint64_t idx; + D_ASSERT(wr_tx != -1ULL); + D_ASSERTF(store->stor_ops->so_wal_id_cmp(store, wr_tx, pinfo->pi_last_inflight) >= 0, + "cur_tx:"DF_U64" < last_inflight:"DF_U64"\n", wr_tx, pinfo->pi_last_inflight); + D_ASSERTF(pinfo->pi_last_checkpoint == 0 || + store->stor_ops->so_wal_id_cmp(store, wr_tx, pinfo->pi_last_checkpoint) > 0, + "cur_tx:"DF_U64" <= last_checkpoint:"DF_U64"\n", + wr_tx, pinfo->pi_last_checkpoint); + for (bit_nr = start_bit; bit_nr <= end_bit; bit_nr++) { idx = bit_nr >> UMEM_CHUNK_IDX_SHIFT; /** uint64_t index */ bit = bit_nr & UMEM_CHUNK_IDX_MASK; pinfo->pi_bmap[idx] |= 1ULL << bit; } - if (!pinfo->pi_waiting && pinfo->pi_last_checkpoint == pinfo->pi_last_inflight) { - /** Keep the page in the waiting list if it's waiting for a transaction to - * be committed to the WAL before it can be flushed. 
- */ - d_list_del(&pinfo->pi_link); - d_list_add_tail(&pinfo->pi_link, &cache->ca_pgs_dirty); - } + D_ASSERT(pinfo->pi_loaded == 1); + pinfo->pi_last_inflight = wr_tx; - if (store->stor_ops->so_wal_id_cmp(store, wr_tx, pinfo->pi_last_inflight) <= 0 || - wr_tx == -1ULL) + /* Don't change the pi_dirty_link while the page is being flushed */ + if (!d_list_empty(&pinfo->pi_flush_link)) return; - pinfo->pi_last_inflight = wr_tx; + D_ASSERT(pinfo->pi_io == 0); + if (d_list_empty(&pinfo->pi_dirty_link)) + d_list_add_tail(&pinfo->pi_dirty_link, &cache->ca_pgs_dirty); } +/* Convert MD-blob offset to memory page */ static inline struct umem_page_info * -off2pinfo(struct umem_cache *cache, umem_off_t addr) +cache_off2pinfo(struct umem_cache *cache, umem_off_t addr) { - struct umem_page *page = umem_cache_off2page(cache, addr); + struct umem_page *page = cache_off2page(cache, addr); + D_ASSERT(page->pg_info != NULL); return page->pg_info; } int umem_cache_touch(struct umem_store *store, uint64_t wr_tx, umem_off_t addr, daos_size_t size) { - struct umem_cache *cache = store->cache; - struct umem_page_info *pinfo; - umem_off_t end_addr = addr + size - 1; - struct umem_page_info *end_pinfo; - umem_off_t start_addr; + struct umem_cache *cache = store->cache; + struct umem_page_info *pinfo; + umem_off_t start_addr, end_addr = addr + size - 1; + struct umem_page_info *end_pinfo; if (cache == NULL) return 0; /* TODO: When SMD is supported outside VOS, this will be an error */ - D_ASSERTF(size <= UMEM_CACHE_PAGE_SZ, "size=" DF_U64 "\n", size); - pinfo = off2pinfo(cache, addr); - end_pinfo = off2pinfo(cache, end_addr); + D_ASSERTF(size <= cache->ca_page_sz, "size=" DF_U64 "\n", size); + pinfo = cache_off2pinfo(cache, addr); + end_pinfo = cache_off2pinfo(cache, end_addr); if (pinfo->pi_copying) return -DER_CHKPT_BUSY; + /* Convert the MD-blob offset to umem cache offset (exclude the allocator header) */ + D_ASSERT(addr >= cache->ca_base_off); + addr -= cache->ca_base_off; + end_addr -= cache->ca_base_off; + if (pinfo != end_pinfo) { - /** Eventually, we can just assert equal here. But until we have a guarantee that - * no allocation will span a page boundary, we have to handle this case. We should - * never have to span multiple pages though. 
- */ + D_ASSERT(cache_mode(cache) == 1); + if (end_pinfo->pi_copying) return -DER_CHKPT_BUSY; - start_addr = end_addr & ~UMEM_CACHE_PAGE_SZ_MASK; + start_addr = end_addr & ~cache->ca_page_mask; touch_page(store, end_pinfo, wr_tx, start_addr, end_addr); end_addr = start_addr - 1; } @@ -1947,7 +2685,7 @@ umem_cache_touch(struct umem_store *store, uint64_t wr_tx, umem_off_t addr, daos /** Maximum number of pages that can be in one set */ #define MAX_PAGES_PER_SET 10 /** Maximum number of ranges that can be in one page */ -#define MAX_IOD_PER_PAGE ((UMEM_CACHE_BMAP_SZ << 6) / 2) +#define MAX_IOD_PER_PAGE ((UMEM_CACHE_BMAP_SZ_MAX << UMEM_CHUNK_IDX_SHIFT) / 2) /** Maximum number of IODs a set can handle */ #define MAX_IOD_PER_SET (2 * MAX_IOD_PER_PAGE) @@ -1978,13 +2716,14 @@ static void page2chkpt(struct umem_store *store, struct umem_page_info *pinfo, struct umem_checkpoint_data *chkpt_data) { - uint64_t *bits = &pinfo->pi_bmap[0]; + struct umem_cache *cache = store->cache; + uint64_t *bits = pinfo->pi_bmap; struct umem_store_iod *store_iod = &chkpt_data->cd_store_iod; d_sg_list_t *sgl = &chkpt_data->cd_sg_list; uint64_t bmap; int i; uint64_t first_bit_shift; - uint64_t offset = (uint64_t)pinfo->pi_page->pg_id << UMEM_CACHE_PAGE_SZ_SHIFT; + uint64_t offset = cache_id2off(cache, pinfo->pi_pg_id); uint64_t map_offset; uint8_t *page_addr = pinfo->pi_addr; int nr = sgl->sg_nr_out; @@ -1998,7 +2737,7 @@ page2chkpt(struct umem_store *store, struct umem_page_info *pinfo, 0) chkpt_data->cd_max_tx = pinfo->pi_last_inflight; - for (i = 0; i < UMEM_CACHE_BMAP_SZ; i++) { + for (i = 0; i < cache->ca_bmap_sz; i++) { if (bits[i] == 0) goto next_bmap; @@ -2062,66 +2801,73 @@ chkpt_insert_sorted(struct umem_store *store, struct umem_checkpoint_data *chkpt d_list_add_tail(&chkpt_data->cd_link, list); } -int -umem_cache_checkpoint(struct umem_store *store, umem_cache_wait_cb_t wait_cb, void *arg, - uint64_t *out_id, struct umem_cache_chkpt_stats *stats) +static void +page_flush_completion(struct umem_cache *cache, struct umem_page_info *pinfo) { - struct umem_cache *cache = store->cache; - struct umem_page_info *pinfo = NULL; - struct umem_checkpoint_data *chkpt_data_all; - struct umem_checkpoint_data *chkpt_data; - uint64_t committed_tx = 0; - uint64_t chkpt_id = *out_id; - d_list_t free_list; - d_list_t waiting_list; - int i; - int rc = 0; - int inflight = 0; - int pages_scanned = 0; - int dchunks_copied = 0; - int iovs_used = 0; - int nr_copying_pgs = 0; + D_ASSERT(d_list_empty(&pinfo->pi_dirty_link)); + D_ASSERT(pinfo->pi_io == 1); + pinfo->pi_io = 0; + D_ASSERT(!d_list_empty(&pinfo->pi_flush_link)); + d_list_del_init(&pinfo->pi_flush_link); - if (cache == NULL) - return 0; /* TODO: When SMD is supported outside VOS, this will be an error */ + if (is_page_dirty(pinfo)) + d_list_add_tail(&pinfo->pi_dirty_link, &cache->ca_pgs_dirty); - if (d_list_empty(&cache->ca_pgs_dirty)) - return 0; + page_wakeup_io(cache, pinfo); +} + +static int +cache_flush_pages(struct umem_cache *cache, d_list_t *dirty_list, + struct umem_checkpoint_data *chkpt_data_all, int chkpt_nr, + umem_cache_wait_cb_t wait_commit_cb, void *arg, uint64_t *chkpt_id, + struct umem_cache_chkpt_stats *stats) +{ + struct umem_store *store = cache->ca_store; + struct umem_checkpoint_data *chkpt_data; + struct umem_page_info *pinfo; + d_list_t free_list; + d_list_t waiting_list; + uint64_t committed_tx = 0; + unsigned int max_iod_per_page; + unsigned int tot_pgs = 0, flushed_pgs = 0; + int inflight = 0; + int i, rc = 0; D_ASSERT(store != NULL); + 
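
touch_page() above records dirtiness per chunk in a per-page bitmap; the chunk index selects a uint64_t word and a bit within it. A minimal sketch of that index arithmetic, assuming a 4 KiB chunk (shift of 12) per the "dirty 4K unit" comment and 64 chunk bits per bitmap word:

#include <stdint.h>

#define CHUNK_SZ_SHIFT  12				/* 4 KiB dirty-tracking granularity (assumed) */
#define CHUNK_IDX_SHIFT 6				/* 64 chunk bits per uint64_t word */
#define CHUNK_IDX_MASK  ((1U << CHUNK_IDX_SHIFT) - 1)

/*
 * Mark every chunk overlapping [first_byte, last_byte] (page-relative byte
 * offsets) as dirty in the page bitmap, mirroring the loop in touch_page().
 */
static void
mark_dirty(uint64_t *bmap, uint64_t first_byte, uint64_t last_byte)
{
	uint64_t start_bit = first_byte >> CHUNK_SZ_SHIFT;
	uint64_t end_bit   = last_byte >> CHUNK_SZ_SHIFT;
	uint64_t bit_nr;

	for (bit_nr = start_bit; bit_nr <= end_bit; bit_nr++)
		bmap[bit_nr >> CHUNK_IDX_SHIFT] |= 1ULL << (bit_nr & CHUNK_IDX_MASK);
}
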
D_ASSERT(!d_list_empty(dirty_list)); + max_iod_per_page = ((cache->ca_bmap_sz << UMEM_CHUNK_IDX_SHIFT) / 2); D_INIT_LIST_HEAD(&free_list); D_INIT_LIST_HEAD(&waiting_list); - D_ALLOC_ARRAY(chkpt_data_all, MAX_INFLIGHT_SETS); - if (chkpt_data_all == NULL) - return -DER_NOMEM; /** Setup the in-flight IODs */ - for (i = 0; i < MAX_INFLIGHT_SETS; i++) { + for (i = 0; i < chkpt_nr; i++) { chkpt_data = &chkpt_data_all[i]; d_list_add_tail(&chkpt_data->cd_link, &free_list); chkpt_data->cd_store_iod.io_regions = &chkpt_data->cd_regions[0]; chkpt_data->cd_sg_list.sg_iovs = &chkpt_data->cd_iovs[0]; } - d_list_splice_init(&cache->ca_pgs_dirty, &cache->ca_pgs_copying); - /** First mark all pages in the new list so they won't be moved by an I/O thread. This * will enable us to continue the algorithm in relative isolation from I/O threads. */ - d_list_for_each_entry(pinfo, &cache->ca_pgs_copying, pi_link) { + d_list_for_each_entry(pinfo, dirty_list, pi_dirty_link) { /** Mark all pages in copying list first. Marking them as waiting will prevent * them from being moved to another list by an I/O operation. */ - pinfo->pi_waiting = 1; - if (store->stor_ops->so_wal_id_cmp(store, pinfo->pi_last_inflight, chkpt_id) > 0) - chkpt_id = pinfo->pi_last_inflight; - nr_copying_pgs++; + D_ASSERT(pinfo->pi_io == 0); + pinfo->pi_io = 1; + D_ASSERT(d_list_empty(&pinfo->pi_flush_link)); + d_list_add_tail(&pinfo->pi_flush_link, &cache->ca_pgs_flushing); + tot_pgs++; + + if (store->stor_ops->so_wal_id_cmp(store, pinfo->pi_last_inflight, *chkpt_id) > 0) + *chkpt_id = pinfo->pi_last_inflight; } do { /** first try to add up to MAX_INFLIGHT_SETS to the waiting queue */ - while (inflight < MAX_INFLIGHT_SETS && !d_list_empty(&cache->ca_pgs_copying)) { + while (inflight < MAX_INFLIGHT_SETS && !d_list_empty(dirty_list)) { chkpt_data = d_list_pop_entry(&free_list, struct umem_checkpoint_data, cd_link); @@ -2134,9 +2880,9 @@ umem_cache_checkpoint(struct umem_store *store, umem_cache_wait_cb_t wait_cb, vo chkpt_data->cd_nr_dchunks = 0; while (chkpt_data->cd_nr_pages < MAX_PAGES_PER_SET && - chkpt_data->cd_store_iod.io_nr <= MAX_IOD_PER_PAGE && - (pinfo = d_list_pop_entry(&cache->ca_pgs_copying, - struct umem_page_info, pi_link)) != NULL) { + chkpt_data->cd_store_iod.io_nr <= max_iod_per_page && + (pinfo = d_list_pop_entry(dirty_list, struct umem_page_info, + pi_dirty_link)) != NULL) { D_ASSERT(chkpt_data != NULL); page2chkpt(store, pinfo, chkpt_data); } @@ -2148,7 +2894,7 @@ umem_cache_checkpoint(struct umem_store *store, umem_cache_wait_cb_t wait_cb, vo for (i = 0; i < chkpt_data->cd_nr_pages; i++) { pinfo = chkpt_data->cd_pages[i]; pinfo->pi_copying = 0; - d_list_add(&pinfo->pi_link, &cache->ca_pgs_copying); + d_list_add(&pinfo->pi_dirty_link, dirty_list); } d_list_add(&chkpt_data->cd_link, &free_list); rc = 0; @@ -2189,7 +2935,7 @@ umem_cache_checkpoint(struct umem_store *store, umem_cache_wait_cb_t wait_cb, vo for (i = 0; i < chkpt_data->cd_nr_pages; i++) { pinfo = chkpt_data->cd_pages[i]; pinfo->pi_copying = 0; - memset(&pinfo->pi_bmap[0], 0, sizeof(pinfo->pi_bmap)); + memset(pinfo->pi_bmap, 0, sizeof(uint64_t) * cache->ca_bmap_sz); } chkpt_insert_sorted(store, chkpt_data, &waiting_list); @@ -2200,7 +2946,7 @@ umem_cache_checkpoint(struct umem_store *store, umem_cache_wait_cb_t wait_cb, vo chkpt_data = d_list_pop_entry(&waiting_list, struct umem_checkpoint_data, cd_link); /* Wait for in-flight transactions committed, or yield to make progress */ - wait_cb(arg, chkpt_data ? 
chkpt_data->cd_max_tx : 0, &committed_tx); + wait_commit_cb(arg, chkpt_data ? chkpt_data->cd_max_tx : 0, &committed_tx); /* The so_flush_prep() could fail when the DMA buffer is under pressure */ if (chkpt_data == NULL) @@ -2222,36 +2968,739 @@ umem_cache_checkpoint(struct umem_store *store, umem_cache_wait_cb_t wait_cb, vo rc = store->stor_ops->so_flush_post(chkpt_data->cd_fh, rc); for (i = 0; i < chkpt_data->cd_nr_pages; i++) { pinfo = chkpt_data->cd_pages[i]; - if (pinfo->pi_last_inflight != pinfo->pi_last_checkpoint) - d_list_add_tail(&pinfo->pi_link, &cache->ca_pgs_dirty); - else - d_list_add_tail(&pinfo->pi_link, &cache->ca_pgs_lru); - pinfo->pi_waiting = 0; + page_flush_completion(cache, pinfo); } inflight--; - pages_scanned += chkpt_data->cd_nr_pages; - dchunks_copied += chkpt_data->cd_nr_dchunks; - iovs_used += chkpt_data->cd_sg_list.sg_nr_out; + + flushed_pgs += chkpt_data->cd_nr_pages; + if (stats) { + stats->uccs_nr_pages += chkpt_data->cd_nr_pages; + stats->uccs_nr_dchunks += chkpt_data->cd_nr_dchunks; + stats->uccs_nr_iovs += chkpt_data->cd_sg_list.sg_nr_out; + } d_list_add(&chkpt_data->cd_link, &free_list); if (rc != 0 || (DAOS_FAIL_CHECK(DAOS_MEM_FAIL_CHECKPOINT) && - pages_scanned >= nr_copying_pgs / 2)) { - d_list_move(&cache->ca_pgs_copying, &cache->ca_pgs_dirty); + flushed_pgs >= tot_pgs / 2)) { rc = -DER_AGAIN; break; } - } while (inflight != 0 || !d_list_empty(&cache->ca_pgs_copying)); + } while (inflight != 0 || !d_list_empty(dirty_list)); + + return rc; +} + +int +umem_cache_checkpoint(struct umem_store *store, umem_cache_wait_cb_t wait_cb, void *arg, + uint64_t *out_id, struct umem_cache_chkpt_stats *stats) +{ + struct umem_cache *cache; + struct umem_page_info *pinfo; + struct umem_checkpoint_data *chkpt_data_all; + d_list_t dirty_list; + uint64_t chkpt_id = *out_id; + int rc = 0; + + D_ASSERT(store != NULL); + cache = store->cache; + + if (cache == NULL) + return 0; /* TODO: When SMD is supported outside VOS, this will be an error */ + + if (d_list_empty(&cache->ca_pgs_dirty)) + goto wait; + + D_ALLOC_ARRAY(chkpt_data_all, MAX_INFLIGHT_SETS); + if (chkpt_data_all == NULL) + return -DER_NOMEM; + + D_INIT_LIST_HEAD(&dirty_list); + d_list_splice_init(&cache->ca_pgs_dirty, &dirty_list); + + rc = cache_flush_pages(cache, &dirty_list, chkpt_data_all, MAX_INFLIGHT_SETS, wait_cb, arg, + &chkpt_id, stats); D_FREE(chkpt_data_all); + if (!d_list_empty(&dirty_list)) { + D_ASSERT(rc != 0); + d_list_move(&dirty_list, &cache->ca_pgs_dirty); + } +wait: + /* Wait for the evicting pages (if any) with lower checkpoint id */ + d_list_for_each_entry(pinfo, &cache->ca_pgs_flushing, pi_flush_link) { + D_ASSERT(pinfo->pi_io == 1); + if (store->stor_ops->so_wal_id_cmp(store, chkpt_id, pinfo->pi_last_checkpoint) < 0) + continue; + page_wait_io(cache, pinfo); + goto wait; + } *out_id = chkpt_id; - if (stats) { - stats->uccs_nr_pages = pages_scanned; - stats->uccs_nr_dchunks = dchunks_copied; - stats->uccs_nr_iovs = iovs_used; + + return rc; +} + +static inline void +inc_cache_stats(struct umem_cache *cache, unsigned int op) +{ + cache->ca_cache_stats[op] += 1; +} + +static int +cache_load_page(struct umem_cache *cache, struct umem_page_info *pinfo) +{ + struct umem_store *store = cache->ca_store; + uint64_t offset; + daos_size_t len; + int rc; + + D_ASSERT(pinfo->pi_mapped == 1); + + if (pinfo->pi_io == 1) { + page_wait_io(cache, pinfo); + return pinfo->pi_loaded ? 
0 : -DER_IO; + } + + offset = cache_id2off(cache, pinfo->pi_pg_id); + D_ASSERT(offset < store->stor_size); + len = min(cache->ca_page_sz, store->stor_size - offset); + pinfo->pi_io = 1; + + if (DAOS_ON_VALGRIND) + VALGRIND_DISABLE_ADDR_ERROR_REPORTING_IN_RANGE((char *)pinfo->pi_addr, len); + rc = store->stor_ops->so_load(store, (char *)pinfo->pi_addr, offset, len); + if (DAOS_ON_VALGRIND) + VALGRIND_ENABLE_ADDR_ERROR_REPORTING_IN_RANGE((char *)pinfo->pi_addr, len); + pinfo->pi_io = 0; + if (rc) { + DL_ERROR(rc, "Read MD blob failed."); + page_wakeup_io(cache, pinfo); + return rc; + } else if (cache->ca_evtcb_fn) { + rc = cache->ca_evtcb_fn(UMEM_CACHE_EVENT_PGLOAD, cache->ca_fn_arg, pinfo->pi_pg_id); + if (rc) { + DL_ERROR(rc, "Pageload callback failed."); + page_wakeup_io(cache, pinfo); + return rc; + } + } + + pinfo->pi_loaded = 1; + /* Add to LRU when it's unpinned */ + if (pinfo->pi_ref == 0) + cache_add2lru(cache, pinfo); + + page_wakeup_io(cache, pinfo); + inc_cache_stats(cache, UMEM_CACHE_STATS_LOAD); + + return rc; +} + +void +umem_cache_commit(struct umem_store *store, uint64_t commit_id) +{ + struct umem_cache *cache = store->cache; + struct umem_page_info *pinfo, *tmp; + + D_ASSERT(store->stor_ops->so_wal_id_cmp(store, cache->ca_commit_id, commit_id) <= 0); + cache->ca_commit_id = commit_id; + + d_list_for_each_entry_safe(pinfo, tmp, &cache->ca_pgs_wait_commit, pi_dirty_link) { + if (store->stor_ops->so_wal_id_cmp(store, pinfo->pi_last_checkpoint, + commit_id) <= 0) { + d_list_del_init(&pinfo->pi_dirty_link); + page_wakeup_commit(cache, pinfo); + } + } +} + +struct wait_page_commit_arg { + struct umem_cache *wca_cache; + struct umem_page_info *wca_pinfo; +}; + +static void +wait_page_commit_cb(void *arg, uint64_t wait_tx, uint64_t *committed_tx) +{ + struct wait_page_commit_arg *wca = arg; + struct umem_cache *cache = wca->wca_cache; + struct umem_store *store = cache->ca_store; + struct umem_page_info *pinfo = wca->wca_pinfo; + + /* Special case, needs to yield to allow progress */ + if (wait_tx == 0) { + page_wait_committed(cache, pinfo, true); + *committed_tx = cache->ca_commit_id; + return; + } + + D_ASSERT(wait_tx == pinfo->pi_last_checkpoint); + /* Page is committed */ + if (store->stor_ops->so_wal_id_cmp(store, cache->ca_commit_id, wait_tx) >= 0) { + *committed_tx = cache->ca_commit_id; + return; + } + + D_ASSERT(d_list_empty(&pinfo->pi_dirty_link)); + d_list_add_tail(&pinfo->pi_dirty_link, &cache->ca_pgs_wait_commit); + page_wait_committed(cache, pinfo, false); + *committed_tx = cache->ca_commit_id; +} + +static int +cache_flush_page(struct umem_cache *cache, struct umem_page_info *pinfo) +{ + struct wait_page_commit_arg arg; + struct umem_checkpoint_data *chkpt_data_all; + d_list_t dirty_list; + uint64_t chkpt_id = 0; + int rc; + + D_ALLOC_ARRAY(chkpt_data_all, 1); + if (chkpt_data_all == NULL) + return -DER_NOMEM; + + D_INIT_LIST_HEAD(&dirty_list); + d_list_del_init(&pinfo->pi_dirty_link); + d_list_add_tail(&pinfo->pi_dirty_link, &dirty_list); + + /* + * Bump the last checkpoint ID beforehand, since cache_flush_pages() could yield before + * bumping the last checkpoint ID. 
+ */ + D_ASSERT(is_page_dirty(pinfo)); + pinfo->pi_last_checkpoint = pinfo->pi_last_inflight; + + arg.wca_cache = cache; + arg.wca_pinfo = pinfo; + + rc = cache_flush_pages(cache, &dirty_list, chkpt_data_all, 1, wait_page_commit_cb, &arg, + &chkpt_id, NULL); + D_FREE(chkpt_data_all); + D_ASSERT(d_list_empty(&dirty_list)); + inc_cache_stats(cache, UMEM_CACHE_STATS_FLUSH); + + return rc; +} + +static int +cache_evict_page(struct umem_cache *cache, bool for_sys) +{ + struct umem_page_info *pinfo; + d_list_t *pg_list = &cache->ca_pgs_lru[1]; + int rc; + + if (cache->ca_pgs_stats[UMEM_PG_STATS_NONEVICTABLE] == cache->ca_mem_pages) { + D_ERROR("No evictable page.\n"); + return -DER_INVAL; + } else if (d_list_empty(pg_list)) { + D_ERROR("All evictable pages are pinned.\n"); + return -DER_BUSY; + } + + /* Try the most recent used page if it was used for sys */ + if (for_sys) { + pinfo = d_list_entry(pg_list->prev, struct umem_page_info, pi_lru_link); + if (pinfo->pi_sys == 1) + goto evict; + } + + /* Try evictable pages in LRU order */ + pinfo = d_list_entry(pg_list->next, struct umem_page_info, pi_lru_link); +evict: + D_ASSERT(pinfo->pi_ref == 0); + + /* + * To minimize page eviction, let's evict page one by one for this moment, we + * may consider to allow N concurrent pages eviction in the future. + */ + if (pinfo->pi_io == 1) { + D_ASSERT(!d_list_empty(&pinfo->pi_flush_link)); + page_wait_io(cache, pinfo); + return -DER_AGAIN; + } + + if (is_page_dirty(pinfo)) { + rc = cache_flush_page(cache, pinfo); + if (rc) { + DL_ERROR(rc, "Flush page failed."); + return rc; + } + + /* The page is referenced by others while flushing */ + if ((pinfo->pi_ref > 0) || is_page_dirty(pinfo) || pinfo->pi_io == 1) + return -DER_AGAIN; + } + + if (cache->ca_evtcb_fn) { + rc = cache->ca_evtcb_fn(UMEM_CACHE_EVENT_PGEVICT, cache->ca_fn_arg, + pinfo->pi_pg_id); + if (rc) + DL_ERROR(rc, "Page evict callback failed."); + } + d_list_del_init(&pinfo->pi_lru_link); + cache_unmap_page(cache, pinfo); + inc_cache_stats(cache, UMEM_CACHE_STATS_EVICT); + + return 0; +} + +static inline bool +need_reserve(struct umem_cache *cache, uint32_t extra_pgs) +{ + uint32_t page_nr = 0; + + if (cache->ca_replay_done) { + /* Few free pages are always reserved for potential non-evictable zone grow */ + D_ASSERT(cache->ca_pgs_stats[UMEM_PG_STATS_NONEVICTABLE] <= cache->ca_max_ne_pages); + page_nr = cache->ca_max_ne_pages - cache->ca_pgs_stats[UMEM_PG_STATS_NONEVICTABLE]; + if (page_nr > UMEM_CACHE_RSRVD_PAGES) + page_nr = UMEM_CACHE_RSRVD_PAGES; + } + page_nr += extra_pgs; + + if (page_nr == 0) + return false; + + return cache->ca_pgs_stats[UMEM_PG_STATS_FREE] < page_nr ? 
true : false; +} + +static inline bool +need_evict(struct umem_cache *cache) +{ + if (d_list_empty(&cache->ca_pgs_free)) + return true; + + return need_reserve(cache, 1); +} + +static int +cache_get_free_page(struct umem_cache *cache, struct umem_page_info **ret_pinfo, int pinned_nr, + bool for_sys) +{ + struct umem_page_info *pinfo; + int rc, retry_cnt = 0; + + while (need_evict(cache)) { + rc = cache_evict_page(cache, for_sys); + if (rc && rc != -DER_AGAIN && rc != -DER_BUSY) { + DL_ERROR(rc, "Evict page failed."); + return rc; + } + + /* All pinned pages are from current caller */ + if (rc == -DER_BUSY && pinned_nr == cache->ca_pgs_stats[UMEM_PG_STATS_PINNED]) { + D_ERROR("Not enough evictable pages.\n"); + return -DER_INVAL; + } + + D_CDEBUG(retry_cnt == 10, DLOG_ERR, DB_TRACE, + "Retry get free page, %d times\n", retry_cnt); + retry_cnt++; + } + + pinfo = cache_pop_free_page(cache); + D_ASSERT(pinfo != NULL); + *ret_pinfo = pinfo; + + return 0; +} + +/* + * Only allow map empty pages. It could yield when mapping an evictable page, + * so when caller tries to map non-evictable page, the page_nr must be 1. + */ +static int +cache_map_pages(struct umem_cache *cache, uint32_t *pages, int page_nr) +{ + struct umem_page_info *pinfo, *free_pinfo = NULL; + uint32_t pg_id; + int i, rc = 0; + + for (i = 0; i < page_nr; i++) { + pg_id = pages[i]; + + if (is_id_evictable(cache, pg_id) && page_nr != 1) { + D_ERROR("Can only map single evictable page.\n"); + return -DER_INVAL; + } +retry: + pinfo = cache->ca_pages[pg_id].pg_info; + /* The page is already mapped */ + if (pinfo != NULL) { + D_ASSERT(pinfo->pi_pg_id == pg_id); + D_ASSERT(pinfo->pi_mapped == 1); + D_ASSERT(pinfo->pi_loaded == 1); + if (free_pinfo != NULL) { + cache_push_free_page(cache, free_pinfo); + free_pinfo = NULL; + } + continue; + } + + if (is_id_evictable(cache, pg_id)) { + if (free_pinfo == NULL) { + rc = cache_get_free_page(cache, &free_pinfo, 0, false); + if (rc) { + DL_ERROR(rc, "Failed to get free page."); + break; + } + goto retry; + } else { + pinfo = free_pinfo; + free_pinfo = NULL; + } + } else { + pinfo = cache_pop_free_page(cache); + if (pinfo == NULL) { + D_ERROR("No free pages.\n"); + rc = -DER_BUSY; + break; + } + } + + cache_map_page(cache, pinfo, pg_id); + cache_add2lru(cache, pinfo); + /* Map an empty page, doesn't need to load page */ + pinfo->pi_loaded = 1; + } + + return rc; +} + +static int +cache_pin_pages(struct umem_cache *cache, uint32_t *pages, int page_nr, bool for_sys) +{ + struct umem_page_info *pinfo, *free_pinfo = NULL; + uint32_t pg_id; + int i, processed = 0, pinned = 0, rc = 0; + + for (i = 0; i < page_nr; i++) { + pg_id = pages[i]; +retry: + pinfo = cache->ca_pages[pg_id].pg_info; + /* The page is already mapped */ + if (pinfo != NULL) { + D_ASSERT(pinfo->pi_pg_id == pg_id); + D_ASSERT(pinfo->pi_mapped == 1); + inc_cache_stats(cache, UMEM_CACHE_STATS_HIT); + if (free_pinfo != NULL) { + cache_push_free_page(cache, free_pinfo); + free_pinfo = NULL; + } + goto next; + } + + if (free_pinfo == NULL) { + rc = cache_get_free_page(cache, &free_pinfo, pinned, for_sys); + if (rc) + goto error; + /* Above cache_get_free_page() could yield, need re-check mapped status */ + goto retry; + } else { + pinfo = free_pinfo; + free_pinfo = NULL; + } + + inc_cache_stats(cache, UMEM_CACHE_STATS_MISS); + cache_map_page(cache, pinfo, pg_id); +next: + cache_pin_page(cache, pinfo); + processed++; + if (is_id_evictable(cache, pinfo->pi_pg_id)) + pinned++; + } + + for (i = 0; i < page_nr; i++) { + pg_id = pages[i]; + 
pinfo = cache->ca_pages[pg_id].pg_info; + + D_ASSERT(pinfo != NULL); + if (pinfo->pi_loaded == 0) { + rc = cache_load_page(cache, pinfo); + if (rc) + goto error; + } + pinfo->pi_sys = for_sys; + } + + return 0; +error: + for (i = 0; i < processed; i++) { + pg_id = pages[i]; + pinfo = cache->ca_pages[pg_id].pg_info; + + D_ASSERT(pinfo != NULL); + cache_unpin_page(cache, pinfo); + + } + return rc; +} + +#define DF_RANGE \ + DF_U64", "DF_U64 +#define DP_RANGE(range) \ + (range)->cr_off, (range)->cr_size + +static int +cache_rgs2pgs(struct umem_cache *cache, struct umem_cache_range *ranges, int range_nr, + uint32_t *in_pages, int *page_nr, uint32_t **out_pages) +{ + struct umem_cache_range range; + uint32_t page_id, *pages = in_pages, *old_pages = NULL, len = 0; + int rc = 0, i, page_idx = 0, tot_pages = *page_nr; + + for (i = 0; i < range_nr; i++) { + range = ranges[i]; + /* Assume the ranges are sorted & no overlapping */ + if (i > 0) { + if (range.cr_off < ranges[i - 1].cr_off + ranges[i - 1].cr_size) { + D_ERROR("Invalid ranges ["DF_RANGE"], ["DF_RANGE"]\n", + DP_RANGE(&ranges[i - 1]), DP_RANGE(&range)); + rc = -DER_INVAL; + goto error; + } + } + + D_ASSERT(range.cr_size > 0); + while (range.cr_size > 0) { + page_id = cache_off2id(cache, range.cr_off); + + if (len != 0 && page_id != pages[page_idx]) { + page_idx++; + if (page_idx == tot_pages) { + D_REALLOC_ARRAY(pages, old_pages, tot_pages, tot_pages * 2); + if (pages == NULL) { + D_ERROR("Alloc array(%d) failed.\n", tot_pages * 2); + rc = -DER_NOMEM; + goto error; + } + old_pages = pages; + tot_pages = tot_pages * 2; + } + } + + pages[page_idx] = page_id; + len = cache->ca_page_sz - cache_off2pg_off(cache, range.cr_off); + range.cr_off += len; + if (range.cr_size >= len) + range.cr_size -= len; + else + range.cr_size = 0; + } + } + + D_ASSERT(page_idx < tot_pages); + *out_pages = pages; + *page_nr = page_idx + 1; + + return 0; +error: + if (old_pages) + D_FREE(old_pages); + return rc; +} + +#define UMEM_PAGES_ON_STACK 16 + +void +umem_cache_post_replay(struct umem_store *store) +{ + struct umem_cache *cache = store->cache; + int cnt = 0; + int idx; + struct umem_page_info *pinfo; + + pinfo = (struct umem_page_info *)&cache->ca_pages[cache->ca_md_pages]; + for (idx = 0; idx < cache->ca_mem_pages; idx++) { + if (pinfo[idx].pi_loaded == 0) + continue; + + if (!is_id_evictable(cache, pinfo[idx].pi_pg_id)) { + d_list_del_init(&pinfo[idx].pi_lru_link); + d_list_add_tail(&pinfo[idx].pi_lru_link, &cache->ca_pgs_lru[0]); + cnt++; + } + } + cache->ca_pgs_stats[UMEM_PG_STATS_NONEVICTABLE] = cnt; + cache->ca_replay_done = 1; +} + +int +umem_cache_map(struct umem_store *store, struct umem_cache_range *ranges, int range_nr) +{ + struct umem_cache *cache = store->cache; + uint32_t in_pages[UMEM_PAGES_ON_STACK], *out_pages; + int rc, page_nr = UMEM_PAGES_ON_STACK; + + rc = cache_rgs2pgs(cache, ranges, range_nr, &in_pages[0], &page_nr, &out_pages); + if (rc) + return rc; + + rc = cache_map_pages(cache, out_pages, page_nr); + if (rc) + DL_ERROR(rc, "Map page failed."); + + if (out_pages != &in_pages[0]) + D_FREE(out_pages); + + return rc; +} + +int +umem_cache_load(struct umem_store *store, struct umem_cache_range *ranges, int range_nr, + bool for_sys) +{ + struct umem_cache *cache = store->cache; + struct umem_page_info *pinfo; + uint32_t in_pages[UMEM_PAGES_ON_STACK], *out_pages; + int i, rc, page_nr = UMEM_PAGES_ON_STACK; + + rc = cache_rgs2pgs(cache, ranges, range_nr, &in_pages[0], &page_nr, &out_pages); + if (rc) + return rc; + + rc = 
cache_pin_pages(cache, out_pages, page_nr, for_sys); + if (rc) { + DL_ERROR(rc, "Load page failed."); + } else { + for (i = 0; i < page_nr; i++) { + uint32_t pg_id = out_pages[i]; + + pinfo = cache->ca_pages[pg_id].pg_info; + D_ASSERT(pinfo != NULL); + cache_unpin_page(cache, pinfo); + } + } + + if (out_pages != &in_pages[0]) + D_FREE(out_pages); + + return rc; +} + +struct umem_pin_handle { + uint32_t ph_page_nr; + uint32_t ph_pages[0]; +}; + +int +umem_cache_pin(struct umem_store *store, struct umem_cache_range *ranges, int range_nr, + bool for_sys, struct umem_pin_handle **pin_handle) +{ + struct umem_cache *cache = store->cache; + struct umem_pin_handle *handle; + uint32_t in_pages[UMEM_PAGES_ON_STACK], *out_pages; + int rc, page_nr = UMEM_PAGES_ON_STACK; + + rc = cache_rgs2pgs(cache, ranges, range_nr, &in_pages[0], &page_nr, &out_pages); + if (rc) + return rc; + + rc = cache_pin_pages(cache, out_pages, page_nr, for_sys); + if (rc) { + DL_ERROR(rc, "Load page failed."); + goto out; + } + + D_ALLOC(handle, sizeof(struct umem_pin_handle) + sizeof(uint32_t) * page_nr); + if (handle == NULL) { + rc = -DER_NOMEM; + goto out; + } + handle->ph_page_nr = page_nr; + memcpy(&handle->ph_pages[0], out_pages, sizeof(uint32_t) * page_nr); + *pin_handle = handle; +out: + if (out_pages != &in_pages[0]) + D_FREE(out_pages); + + return rc; +} + +void +umem_cache_unpin(struct umem_store *store, struct umem_pin_handle *pin_handle) +{ + struct umem_cache *cache = store->cache; + struct umem_page_info *pinfo; + int i; + + D_ASSERT(pin_handle != NULL); + D_ASSERT(pin_handle->ph_page_nr > 0); + + for (i = 0; i < pin_handle->ph_page_nr; i++) { + uint32_t pg_id = pin_handle->ph_pages[i]; + + pinfo = cache->ca_pages[pg_id].pg_info; + D_ASSERT(pinfo != NULL); + cache_unpin_page(cache, pinfo); + } + + D_FREE(pin_handle); +} + +int +umem_cache_reserve(struct umem_store *store) +{ + struct umem_cache *cache = store->cache; + int rc = 0, retry_cnt = 0; + + if (cache_mode(cache) == 1) + return rc; + + /* MUST ensure the FIFO order */ + if (!need_reserve(cache, 0) && !cache->ca_reserve_waiters) + return rc; + + D_ASSERT(cache->ca_reserve_wq != NULL); + cache->ca_reserve_waiters++; + if (cache->ca_reserve_waiters > 1) { + D_ASSERT(store->stor_ops->so_waitqueue_wait != NULL); + store->stor_ops->so_waitqueue_wait(cache->ca_reserve_wq, false); + } + + while (need_reserve(cache, 0)) { + rc = cache_evict_page(cache, false); + if (rc && rc != -DER_AGAIN && rc != -DER_BUSY) { + DL_ERROR(rc, "Evict page failed."); + break; + } + rc = 0; + + D_CDEBUG(retry_cnt == 10, DLOG_ERR, DB_TRACE, + "Retry reserve free page, %d times\n", retry_cnt); + retry_cnt++; + } + + D_ASSERT(cache->ca_reserve_waiters > 0); + cache->ca_reserve_waiters--; + if (cache->ca_reserve_waiters > 0) { + D_ASSERT(store->stor_ops->so_waitqueue_wakeup != NULL); + store->stor_ops->so_waitqueue_wakeup(cache->ca_reserve_wq, false); } return rc; } + +uint32_t +umem_get_mb_from_offset(struct umem_instance *umm, umem_off_t off) +{ + uint32_t page_id; + struct umem_cache *cache = umm->umm_pool->up_store.cache; + + page_id = cache_off2id(cache, off); + if (is_id_evictable(cache, page_id)) + return page_id; + return 0; +} + +umem_off_t +umem_get_mb_base_offset(struct umem_instance *umm, uint32_t id) +{ + struct umem_cache *cache = umm->umm_pool->up_store.cache; + + return cache_id2off(cache, id); +} + #endif diff --git a/src/common/tests/umem_test.c b/src/common/tests/umem_test.c index 8c192f7e892..6080843f51c 100644 --- a/src/common/tests/umem_test.c +++ 
b/src/common/tests/umem_test.c @@ -1,5 +1,5 @@ /** - * (C) Copyright 2019-2023 Intel Corporation. + * (C) Copyright 2019-2024 Intel Corporation. * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -58,16 +58,17 @@ reset_arg(struct test_arg *arg) static void touch_mem(struct test_arg *arg, uint64_t tx_id, uint64_t offset, uint64_t size) { + struct umem_cache *cache = arg->ta_store.cache; struct chunk *prep = &arg->ta_chunks[arg->ta_chunk_nr++]; struct chunk *flush = &arg->ta_chunks[arg->ta_chunk_nr++]; d_list_t *prep_list = &arg->ta_prep_list; d_list_t *flush_list = &arg->ta_flush_list; int rc; - rc = umem_cache_touch(&arg->ta_store, tx_id, offset, size); + rc = umem_cache_touch(&arg->ta_store, tx_id, offset + cache->ca_base_off, size); assert_int_equal(rc, 0); - prep->ch_off = offset; + prep->ch_off = offset + cache->ca_base_off; prep->ch_size = size; d_list_add_tail(&prep->ch_link, prep_list); @@ -140,7 +141,7 @@ check_io_region(struct test_arg *arg, struct umem_store_region *region) static void check_iov(struct test_arg *arg, d_iov_t *iov) { - find_expected(arg, "io_region", &arg->ta_flush_list, (uint64_t)iov->iov_buf, + find_expected(arg, "io_iov", &arg->ta_flush_list, (uint64_t)iov->iov_buf, (uint64_t)iov->iov_buf + iov->iov_len); } @@ -239,10 +240,18 @@ static int global_setup(void **state) { struct test_arg *arg; + int rc; + + rc = daos_debug_init(DAOS_LOG_DEFAULT); + if (rc) { + print_message("Failed to init debug\n"); + return 1; + } D_ALLOC_PTR(arg); if (arg == NULL) { print_message("Failed to allocate test struct\n"); + daos_debug_fini(); return 1; } @@ -259,6 +268,7 @@ global_teardown(void **state) umem_cache_free(&arg->ta_store); D_FREE(arg); + daos_debug_fini(); return 0; } @@ -419,16 +429,14 @@ test_page_cache(void **state) arg->ta_store.stor_ops = &stor_ops; arg->ta_store.store_type = DAOS_MD_BMEM; - rc = umem_cache_alloc(&arg->ta_store, 0); + rc = umem_cache_alloc(&arg->ta_store, UMEM_CACHE_PAGE_SZ, 3, 0, 0, 0, + (void *)(UMEM_CACHE_PAGE_SZ), NULL, NULL, NULL); assert_rc_equal(rc, 0); cache = arg->ta_store.cache; assert_non_null(cache); - assert_int_equal(cache->ca_num_pages, 3); - assert_int_equal(cache->ca_max_mapped, 3); - - rc = umem_cache_map_range(&arg->ta_store, 0, (void *)(UMEM_CACHE_PAGE_SZ), 3); - assert_rc_equal(rc, 0); + assert_int_equal(cache->ca_md_pages, 3); + assert_int_equal(cache->ca_mem_pages, 3); reset_arg(arg); /** touch multiple chunks */ @@ -486,16 +494,14 @@ test_many_pages(void **state) /** In case prior test failed */ umem_cache_free(&arg->ta_store); - rc = umem_cache_alloc(&arg->ta_store, 0); + rc = umem_cache_alloc(&arg->ta_store, UMEM_CACHE_PAGE_SZ, LARGE_NUM_PAGES, 0, 0, 0, + (void *)(UMEM_CACHE_PAGE_SZ), NULL, NULL, NULL); assert_rc_equal(rc, 0); cache = arg->ta_store.cache; assert_non_null(cache); - assert_int_equal(cache->ca_num_pages, LARGE_NUM_PAGES); - assert_int_equal(cache->ca_max_mapped, LARGE_NUM_PAGES); - - rc = umem_cache_map_range(&arg->ta_store, 0, (void *)(UMEM_CACHE_PAGE_SZ), LARGE_NUM_PAGES); - assert_rc_equal(rc, 0); + assert_int_equal(cache->ca_md_pages, LARGE_NUM_PAGES); + assert_int_equal(cache->ca_mem_pages, LARGE_NUM_PAGES); /** Touch all pages, more than can fit in a single set */ reset_arg(arg); @@ -532,16 +538,14 @@ test_many_writes(void **state) /** In case prior test failed */ umem_cache_free(&arg->ta_store); - rc = umem_cache_alloc(&arg->ta_store, 0); + rc = umem_cache_alloc(&arg->ta_store, UMEM_CACHE_PAGE_SZ, LARGE_NUM_PAGES, 0, 0, 0, + (void *)(UMEM_CACHE_PAGE_SZ), NULL, NULL, NULL); assert_rc_equal(rc, 0); 
cache = arg->ta_store.cache; assert_non_null(cache); - assert_int_equal(cache->ca_num_pages, LARGE_NUM_PAGES); - assert_int_equal(cache->ca_max_mapped, LARGE_NUM_PAGES); - - rc = umem_cache_map_range(&arg->ta_store, 0, (void *)(UMEM_CACHE_PAGE_SZ), LARGE_NUM_PAGES); - assert_rc_equal(rc, 0); + assert_int_equal(cache->ca_md_pages, LARGE_NUM_PAGES); + assert_int_equal(cache->ca_mem_pages, LARGE_NUM_PAGES); /** Touch all pages, more than can fit in a single set */ reset_arg(arg); @@ -559,6 +563,192 @@ test_many_writes(void **state) umem_cache_free(&arg->ta_store); } +static int +waitqueue_create(void **wq) +{ + *wq = (void *)(UINT64_MAX); + return 0; +} + +static void +waitqueue_destroy(void *wq) +{ +} + +static void +waitqueue_wait(void *wq, bool yield_only) +{ +} + +static void +waitqueue_wakeup(void *wq, bool wakeup_all) +{ +} + +static int +store_load(struct umem_store *store, char *start_addr, daos_off_t offset, daos_size_t len) +{ + return 0; +} + +static struct umem_store_ops p2_ops = { + .so_waitqueue_create = waitqueue_create, + .so_waitqueue_destroy = waitqueue_destroy, + .so_waitqueue_wait = waitqueue_wait, + .so_waitqueue_wakeup = waitqueue_wakeup, + .so_load = store_load, + .so_flush_prep = flush_prep, + .so_flush_copy = flush_copy, + .so_flush_post = flush_post, + .so_wal_id_cmp = wal_id_cmp, +}; + +#define PAGE_NUM_MD 20 +#define PAGE_NUM_MEM 10 +#define PAGE_NUM_MAX_NE 5 + +static bool +is_evictable_fn(void *arg, uint32_t page_id) +{ + return page_id >= PAGE_NUM_MAX_NE; +} + +static int +pagevnt_fn(int event_type, void *arg, uint32_t page_id) +{ + return 0; +} + +static void +test_p2_basic(void **state) +{ + struct test_arg *arg = *state; + struct umem_cache *cache; + struct umem_cache_range rg = { 0 }; + struct umem_pin_handle *pin_hdl; + int rc; + + arg->ta_store.stor_size = UMEM_CACHE_PAGE_SZ * PAGE_NUM_MD; + arg->ta_store.stor_ops = &p2_ops; + arg->ta_store.store_type = DAOS_MD_BMEM; + + rc = umem_cache_alloc(&arg->ta_store, UMEM_CACHE_PAGE_SZ, PAGE_NUM_MD, PAGE_NUM_MEM, + PAGE_NUM_MAX_NE, 4096, (void *)(UMEM_CACHE_PAGE_SZ), is_evictable_fn, + pagevnt_fn, NULL); + assert_rc_equal(rc, 0); + + cache = arg->ta_store.cache; + assert_non_null(cache); + + reset_arg(arg); + + /* Load single page */ + rg.cr_off = cache->ca_base_off; + rg.cr_size = UMEM_CACHE_PAGE_SZ; + rc = umem_cache_load(&arg->ta_store, &rg, 1, false); + assert_rc_equal(rc, 0); + assert_int_equal(cache->ca_pgs_stats[UMEM_PG_STATS_NONEVICTABLE], 1); + assert_ptr_equal(umem_cache_off2ptr(&arg->ta_store, cache->ca_base_off), cache->ca_base); + + /* Map single non-evictable page */ + rg.cr_off = cache->ca_base_off + 1 * UMEM_CACHE_PAGE_SZ; + rg.cr_size = UMEM_CACHE_PAGE_SZ; + rc = umem_cache_map(&arg->ta_store, &rg, 1); + assert_rc_equal(rc, 0); + assert_int_equal(cache->ca_pgs_stats[UMEM_PG_STATS_NONEVICTABLE], 2); + assert_ptr_equal(umem_cache_off2ptr(&arg->ta_store, + cache->ca_base_off + UMEM_CACHE_PAGE_SZ), + cache->ca_base + UMEM_CACHE_PAGE_SZ); + + /* Load multiple pages */ + rg.cr_off = cache->ca_base_off + (PAGE_NUM_MAX_NE - 1) * UMEM_CACHE_PAGE_SZ; + rg.cr_size = 3 * UMEM_CACHE_PAGE_SZ; + rc = umem_cache_load(&arg->ta_store, &rg, 1, false); + assert_rc_equal(rc, 0); + assert_int_equal(cache->ca_pgs_stats[UMEM_PG_STATS_NONEVICTABLE], 3); + + /* Pin multiple pages */ + rg.cr_off = cache->ca_base_off + (PAGE_NUM_MAX_NE - 1) * UMEM_CACHE_PAGE_SZ; + rg.cr_size = 2 * UMEM_CACHE_PAGE_SZ; + rc = umem_cache_pin(&arg->ta_store, &rg, 1, false, &pin_hdl); + assert_rc_equal(rc, 0); + assert_non_null(pin_hdl); + 
assert_int_equal(cache->ca_pgs_stats[UMEM_PG_STATS_PINNED], 1); + + /* Unpin the pinned pages */ + umem_cache_unpin(&arg->ta_store, pin_hdl); + assert_int_equal(cache->ca_pgs_stats[UMEM_PG_STATS_PINNED], 0); + + /* Reserve free pages */ + rc = umem_cache_reserve(&arg->ta_store); + assert_rc_equal(rc, 0); + + umem_cache_free(&arg->ta_store); +} + +static void +test_p2_evict(void **state) +{ + struct test_arg *arg = *state; + struct umem_cache *cache; + struct umem_cache_range rg = { 0 }; + struct umem_pin_handle *pin_hdl; + uint64_t id; + int i, rc; + + arg->ta_store.stor_size = UMEM_CACHE_PAGE_SZ * PAGE_NUM_MD; + arg->ta_store.stor_ops = &p2_ops; + arg->ta_store.store_type = DAOS_MD_BMEM; + + rc = umem_cache_alloc(&arg->ta_store, UMEM_CACHE_PAGE_SZ, PAGE_NUM_MD, PAGE_NUM_MEM, + PAGE_NUM_MAX_NE, 4096, (void *)(UMEM_CACHE_PAGE_SZ), is_evictable_fn, + pagevnt_fn, NULL); + assert_rc_equal(rc, 0); + + cache = arg->ta_store.cache; + assert_non_null(cache); + + reset_arg(arg); + + /* Load all non-evictable pages */ + rg.cr_off = cache->ca_base_off; + rg.cr_size = PAGE_NUM_MAX_NE * UMEM_CACHE_PAGE_SZ; + rc = umem_cache_load(&arg->ta_store, &rg, 1, false); + assert_rc_equal(rc, 0); + assert_int_equal(cache->ca_pgs_stats[UMEM_PG_STATS_NONEVICTABLE], PAGE_NUM_MAX_NE); + + /* Load more pages to fill the cache */ + rg.cr_off = cache->ca_base_off + PAGE_NUM_MAX_NE * UMEM_CACHE_PAGE_SZ; + rg.cr_size = (PAGE_NUM_MEM - PAGE_NUM_MAX_NE) * UMEM_CACHE_PAGE_SZ; + rc = umem_cache_load(&arg->ta_store, &rg, 1, false); + assert_rc_equal(rc, 0); + assert_int_equal(cache->ca_pgs_stats[UMEM_PG_STATS_NONEVICTABLE], PAGE_NUM_MAX_NE); + + /* Dirty all pages */ + for (i = 0; i < PAGE_NUM_MEM; i++) { + touch_mem(arg, i + 1, i * UMEM_CACHE_PAGE_SZ, UMEM_CACHE_CHUNK_SZ); + umem_cache_commit(&arg->ta_store, i + 1); + } + id = PAGE_NUM_MEM; + + /* Pin an unmapped page to trigger eviction */ + rg.cr_off = cache->ca_base_off + PAGE_NUM_MEM * UMEM_CACHE_PAGE_SZ; + rg.cr_size = 100; + rc = umem_cache_pin(&arg->ta_store, &rg, 1, false, &pin_hdl); + assert_rc_equal(rc, 0); + assert_int_equal(cache->ca_pgs_stats[UMEM_PG_STATS_PINNED], 1); + + umem_cache_unpin(&arg->ta_store, pin_hdl); + assert_int_equal(cache->ca_pgs_stats[UMEM_PG_STATS_PINNED], 0); + + rc = umem_cache_checkpoint(&arg->ta_store, wait_cb, NULL, &id, NULL); + assert_rc_equal(rc, 0); + assert_int_equal(id, PAGE_NUM_MEM); + check_lists_empty(arg); + + umem_cache_free(&arg->ta_store); +} + int main(int argc, char **argv) { @@ -570,6 +760,8 @@ main(int argc, char **argv) {"UMEM005: Test page cache", test_page_cache, NULL, NULL}, {"UMEM006: Test page cache many pages", test_many_pages, NULL, NULL}, {"UMEM007: Test page cache many writes", test_many_writes, NULL, NULL}, + {"UMEM008: Test phase2 APIs", test_p2_basic, NULL, NULL}, + {"UMEM009: Test phase2 eviction", test_p2_evict, NULL, NULL}, {NULL, NULL, NULL, NULL}}; d_register_alt_assert(mock_assert); diff --git a/src/common/tests/umem_test_bmem.c b/src/common/tests/umem_test_bmem.c index 07f4a112b36..cd745c48dc8 100644 --- a/src/common/tests/umem_test_bmem.c +++ b/src/common/tests/umem_test_bmem.c @@ -1,6 +1,6 @@ /** - * (C) Copyright 2019-2023 Intel Corporation. - * (C) Copyright 2023 Hewlett Packard Enterprise Development LP. + * (C) Copyright 2019-2024 Intel Corporation. + * (C) Copyright 2023-2024 Hewlett Packard Enterprise Development LP. 
* * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -28,7 +28,9 @@ #include #include "utest_common.h" -#define POOL_SIZE ((1024 * 1024 * 1024ULL)) +#define POOL_SIZE ((256 * 1024 * 1024ULL)) +#define NEMB_RATIO (0.8) +#define MB_SIZE (16 * 1024 * 1024) struct test_arg { struct utest_context *ta_utx; @@ -58,7 +60,7 @@ validate_persist_activity(uint64_t persist_reserv_incr, uint64_t persist_submit_ static int _persist_reserv(struct umem_store *store, uint64_t *id) { - persist_reserv_cnt++; + *id = persist_reserv_cnt++; return 0; } @@ -76,6 +78,116 @@ struct umem_store_ops _store_ops = { struct umem_store ustore = { .stor_size = POOL_SIZE, .stor_ops = &_store_ops, .store_type = DAOS_MD_BMEM }; +static int +waitqueue_create(void **wq) +{ + *wq = (void *)(UINT64_MAX); + return 0; +} + +static void +waitqueue_destroy(void *wq) +{ +} + +static void +waitqueue_wait(void *wq, bool yield_only) +{ +} + +static void +waitqueue_wakeup(void *wq, bool wakeup_all) +{ +} + +static int +store_load(struct umem_store *store, char *start_addr, daos_off_t offset, daos_size_t len) +{ + memset(start_addr, 0, len); + D_ASSERTF(0, "Test is not suppose to do a store_load"); +} + +char store_buf[4096]; + +static int +store_read(struct umem_store *store, struct umem_store_iod *iod, d_sg_list_t *sgl) +{ + /* Fake Heap header read write */ + D_ASSERT(sgl->sg_iovs->iov_len <= 4096); + memcpy(sgl->sg_iovs->iov_buf, store_buf, sgl->sg_iovs->iov_len); + return 0; +} + +static int +store_write(struct umem_store *store, struct umem_store_iod *iod, d_sg_list_t *sgl) +{ + /* Fake Heap header read write */ + D_ASSERT(sgl->sg_iovs->iov_len <= 4096); + memcpy(store_buf, sgl->sg_iovs->iov_buf, sgl->sg_iovs->iov_len); + return 0; +} + +static int +store_flush_prep(struct umem_store *store, struct umem_store_iod *iod, daos_handle_t *fh) +{ + D_ASSERTF(0, "Test is not suppose to do a store_flush_prep"); + return 0; +} + +static int +store_flush_copy(daos_handle_t fh, d_sg_list_t *sgl) +{ + D_ASSERTF(0, "Test is not suppose to do a store_flush_copy"); + return 0; +} + +static int +store_flush_post(daos_handle_t fh, int err) +{ + D_ASSERTF(0, "Test is not suppose to do a store_flush_post"); + return 0; +} + +static int +wal_id_cmp(struct umem_store *store, uint64_t id1, uint64_t id2) +{ + if (id1 > id2) + return 1; + if (id1 < id2) + return -1; + return 0; +} + +static int +wal_replay(struct umem_store *store, + int (*replay_cb)(uint64_t tx_id, struct umem_action *act, void *arg), void *arg) +{ + D_ASSERTF(0, "Test is not suppose to do a store_flush_post"); + return 0; +} + +struct umem_store_ops _store_ops_v2 = { + .so_waitqueue_create = waitqueue_create, + .so_waitqueue_destroy = waitqueue_destroy, + .so_waitqueue_wait = waitqueue_wait, + .so_waitqueue_wakeup = waitqueue_wakeup, + .so_load = store_load, + .so_read = store_read, + .so_write = store_write, + .so_flush_prep = store_flush_prep, + .so_flush_copy = store_flush_copy, + .so_flush_post = store_flush_post, + .so_wal_reserv = _persist_reserv, + .so_wal_submit = _persist_submit, + .so_wal_replay = wal_replay, + .so_wal_id_cmp = wal_id_cmp, +}; + +struct umem_store ustore_v2 = {.stor_size = POOL_SIZE * 2, + .stor_ops = &_store_ops_v2, + .store_type = DAOS_MD_BMEM_V2, + .stor_priv = (void *)(UINT64_MAX)}; + int teardown_pmem(void **state) { @@ -94,8 +206,8 @@ teardown_pmem(void **state) return rc; } -int -setup_pmem(void **state) +static int +setup_pmem_internal(void **state, struct umem_store *store) { struct test_arg *arg = *state; static int tnum; @@ -107,8 +219,8 @@ 
setup_pmem(void **state) return 1; } - rc = utest_pmem_create(arg->ta_pool_name, POOL_SIZE, - sizeof(*arg->ta_root), &ustore, &arg->ta_utx); + rc = utest_pmem_create(arg->ta_pool_name, POOL_SIZE, sizeof(*arg->ta_root), store, + &arg->ta_utx); if (rc != 0) { perror("Could not create pmem context"); rc = 1; @@ -123,6 +235,18 @@ setup_pmem(void **state) return rc; } +static int +setup_pmem(void **state) +{ + return setup_pmem_internal(state, &ustore); +} + +static int +setup_pmem_v2(void **state) +{ + return setup_pmem_internal(state, &ustore_v2); +} + static int global_setup(void **state) { @@ -132,6 +256,7 @@ global_setup(void **state) print_message("Failed to set the md_on_ssd tunable\n"); return 1; } + ustore.store_type = umempobj_get_backend_type(); D_ALLOC_PTR(arg); if (arg == NULL) { @@ -204,6 +329,57 @@ test_atomic_alloc(void **state) assert_true(cur_mem_used == initial_mem_used); } +static void +test_atomic_alloc_from_bucket(void **state) +{ + struct test_arg *arg = *state; + struct umem_instance *umm = utest_utx2umm(arg->ta_utx); + uint64_t off, size, off_arr[16]; + int i, rc; + uint64_t initial_mem_used, cur_mem_used; + uint64_t total_size = 0; + + utest_get_scm_used_space(arg->ta_utx, &initial_mem_used); + snap_persist_activity(); + off = umem_atomic_alloc_from_bucket(umm, 1024, UMEM_TYPE_ANY, UMEM_DEFAULT_MBKT_ID); + assert_false(UMOFF_IS_NULL(off)); + validate_persist_activity(1, 1); + + rc = umem_atomic_free(umm, off); + assert_int_equal(rc, 0); + validate_persist_activity(2, 2); + utest_get_scm_used_space(arg->ta_utx, &cur_mem_used); + assert_true(cur_mem_used == initial_mem_used); + + /* Negative test: Incorrect size test */ + snap_persist_activity(); + off = umem_atomic_alloc_from_bucket(umm, 0, UMEM_TYPE_ANY, UMEM_DEFAULT_MBKT_ID); + assert_true(UMOFF_IS_NULL(off)); + validate_persist_activity(0, 0); + + /* Validate allocation of various sizes */ + snap_persist_activity(); + for (i = 1; i < 16; i++) { + size = (1ul<ta_utx, &cur_mem_used); + assert_true(cur_mem_used >= initial_mem_used+total_size); + + snap_persist_activity(); + for (i = 15; i > 0; i--) { + rc = umem_atomic_free(umm, off_arr[i]); + assert_int_equal(rc, 0); + } + validate_persist_activity(15, 15); + utest_get_scm_used_space(arg->ta_utx, &cur_mem_used); + assert_true(cur_mem_used == initial_mem_used); +} + static void test_atomic_copy(void **state) { @@ -769,29 +945,41 @@ test_alloc(void **state) int rc; rc = utest_tx_begin(arg->ta_utx); - if (rc != 0) - goto done; + assert_int_equal(rc, 0); umoff = umem_zalloc(umm, 4); - if (UMOFF_IS_NULL(umoff)) { - print_message("umoff unexpectedly NULL\n"); - rc = 1; - goto end; - } + assert_false(UMOFF_IS_NULL(umoff)); value1 = umem_off2ptr(umm, umoff); + assert_true(*value1 == 0); - if (*value1 != 0) { - print_message("Bad value for allocated umoff\n"); - rc = 1; - goto end; - } + rc = umem_free(umm, umoff); + assert_int_equal(rc, 0); + utest_tx_end(arg->ta_utx, rc); +} + +static void +test_alloc_from_bucket(void **state) +{ + struct test_arg *arg = *state; + struct umem_instance *umm = utest_utx2umm(arg->ta_utx); + int *value1; + umem_off_t umoff = 0; + int rc; + + rc = utest_tx_begin(arg->ta_utx); + assert_int_equal(rc, 0); + + umoff = umem_zalloc_from_bucket(umm, 4, UMEM_DEFAULT_MBKT_ID); + assert_false(UMOFF_IS_NULL(umoff)); + + value1 = umem_off2ptr(umm, umoff); + + assert_true(*value1 == 0); rc = umem_free(umm, umoff); -end: - rc = utest_tx_end(arg->ta_utx, rc); -done: assert_int_equal(rc, 0); + utest_tx_end(arg->ta_utx, rc); } static void @@ -923,106 +1111,234 @@ 
test_tx_alloc(void **state) } static void -test_tx_add(void **state) +test_tx_alloc_from_bucket(void **state) { struct test_arg *arg = *state; struct umem_instance *umm = utest_utx2umm(arg->ta_utx); int rc; - umem_off_t umoff; - char *start_ptr, *tmp_ptr; - char local_buf[2048]; - - /* Setup */ - umoff = umem_atomic_alloc(umm, 2048, UMEM_TYPE_ANY); - assert_false(UMOFF_IS_NULL(umoff)); - start_ptr = umem_off2ptr(umm, umoff); - memset(local_buf, 0, 2048); - tmp_ptr = umem_atomic_copy(umm, start_ptr, local_buf, 2048, UMEM_COMMIT_IMMEDIATE); - assert_true(tmp_ptr == start_ptr); - - /* Negative tests */ - expect_assert_failure(umem_tx_add(umm, umoff, 128)); + daos_size_t allotted_size = 0; + uint64_t initial_mem_used, cur_mem_used; + int *value1, *value2; + umem_off_t umoff1 = 0, umoff2 = 0; - /* Normal operation */ + /* Test umem_zalloc */ snap_persist_activity(); + utest_get_scm_used_space(arg->ta_utx, &initial_mem_used); rc = umem_tx_begin(umm, NULL); assert_int_equal(rc, 0); - rc = umem_tx_add(umm, umoff, 128); - assert_int_equal(rc, 0); - start_ptr = umem_off2ptr(umm, umoff); - memset(start_ptr, 'a', 128); - memset(local_buf, 'a', 128); - rc = umem_tx_end(umm, 0); + + umoff1 = umem_zalloc_from_bucket(umm, 4, UMEM_DEFAULT_MBKT_ID); + assert_false(UMOFF_IS_NULL(umoff1)); + allotted_size += 4; + + value1 = umem_off2ptr(umm, umoff1); + + assert_true(*value1 == 0); + + rc = umem_tx_commit(umm); assert_int_equal(rc, 0); validate_persist_activity(1, 1); - assert_false(strncmp(local_buf, start_ptr, 128)); + utest_get_scm_used_space(arg->ta_utx, &cur_mem_used); + assert_true(cur_mem_used >= (initial_mem_used + allotted_size)); - /* Abort a transaction after tx add */ + /* Test umem_alloc */ snap_persist_activity(); rc = umem_tx_begin(umm, NULL); assert_int_equal(rc, 0); - rc = umem_tx_add(umm, umoff+128, 128); - assert_int_equal(rc, 0); - tmp_ptr = umem_off2ptr(umm, umoff+128); - memset(tmp_ptr, 'b', 128); - rc = umem_tx_abort(umm, 1); - assert_true(rc != 0); - validate_persist_activity(1, 0); - assert_false(strncmp(local_buf, start_ptr, 256)); - /* Invalid offset */ - snap_persist_activity(); - rc = umem_tx_begin(umm, NULL); - assert_int_equal(rc, 0); - rc = umem_tx_add(umm, POOL_SIZE+4096, 128); - assert_true(rc != 0); - assert_true(umem_tx_stage(umm) == UMEM_STAGE_ONABORT); - rc = umem_tx_end(umm, rc); - assert_true(rc != 0); - validate_persist_activity(1, 0); -} + umoff2 = umem_alloc_from_bucket(umm, 4, UMEM_DEFAULT_MBKT_ID); + allotted_size += 4; + assert_false(UMOFF_IS_NULL(umoff2)); -static void -test_tx_add_ptr(void **state) -{ - struct test_arg *arg = *state; - struct umem_instance *umm = utest_utx2umm(arg->ta_utx); - int rc; - umem_off_t umoff; - char *start_ptr, *tmp_ptr; - char local_buf[2048]; + value2 = umem_off2ptr(umm, umoff2); + *value2 = 100; - /* Setup */ - umoff = umem_atomic_alloc(umm, 2048, UMEM_TYPE_ANY); - assert_false(UMOFF_IS_NULL(umoff)); - start_ptr = umem_off2ptr(umm, umoff); - memset(local_buf, 0, 2048); - tmp_ptr = umem_atomic_copy(umm, start_ptr, local_buf, 2048, UMEM_COMMIT_IMMEDIATE); - assert_true(tmp_ptr == start_ptr); + rc = umem_tx_commit(umm); + assert_int_equal(rc, 0); - /* Negative tests */ - expect_assert_failure(umem_tx_add_ptr(umm, start_ptr, 128)); + validate_persist_activity(1, 1); + utest_get_scm_used_space(arg->ta_utx, &cur_mem_used); + assert_true(cur_mem_used >= (initial_mem_used + allotted_size)); - /* Normal operation */ + /* Test umem_free */ snap_persist_activity(); rc = umem_tx_begin(umm, NULL); assert_int_equal(rc, 0); - start_ptr = 
umem_off2ptr(umm, umoff); - rc = umem_tx_add_ptr(umm, start_ptr, 128); - assert_int_equal(rc, 0); - memset(start_ptr, 'a', 128); - memset(local_buf, 'a', 128); - rc = umem_tx_end(umm, 0); + + rc = umem_free(umm, umoff2); assert_int_equal(rc, 0); - validate_persist_activity(1, 1); - assert_false(strncmp(local_buf, start_ptr, 128)); + allotted_size -= 4; - /* Abort a transaction after tx add */ - snap_persist_activity(); - rc = umem_tx_begin(umm, NULL); + rc = umem_free(umm, umoff1); assert_int_equal(rc, 0); - tmp_ptr = umem_off2ptr(umm, umoff+128); - rc = umem_tx_add_ptr(umm, tmp_ptr, 128); + allotted_size -= 4; + + rc = umem_tx_commit(umm); + assert_int_equal(rc, 0); + validate_persist_activity(1, 1); + utest_get_scm_used_space(arg->ta_utx, &cur_mem_used); + assert_true(allotted_size == 0); + assert_true(cur_mem_used == initial_mem_used); + + /* Negative Tests */ + /* Outside of TX */ + expect_assert_failure(umem_alloc_from_bucket(umm, 100, UMEM_DEFAULT_MBKT_ID)); + expect_assert_failure(umem_zalloc_from_bucket(umm, 100, UMEM_DEFAULT_MBKT_ID)); + + /* alloc of size zero */ + snap_persist_activity(); + utest_get_scm_used_space(arg->ta_utx, &initial_mem_used); + rc = umem_tx_begin(umm, NULL); + assert_int_equal(rc, 0); + umoff1 = umem_alloc_from_bucket(umm, 0, UMEM_DEFAULT_MBKT_ID); + assert_true(UMOFF_IS_NULL(umoff1)); + assert_true(umem_tx_stage(umm) == UMEM_STAGE_ONABORT); + rc = umem_tx_end(umm, 1); + assert_false(rc == 0); + validate_persist_activity(1, 0); + utest_get_scm_used_space(arg->ta_utx, &cur_mem_used); + assert_true(initial_mem_used == cur_mem_used); + + snap_persist_activity(); + utest_get_scm_used_space(arg->ta_utx, &initial_mem_used); + rc = umem_tx_begin(umm, NULL); + assert_int_equal(rc, 0); + umoff1 = umem_zalloc_from_bucket(umm, 0, UMEM_DEFAULT_MBKT_ID); + assert_true(UMOFF_IS_NULL(umoff1)); + assert_true(umem_tx_stage(umm) == UMEM_STAGE_ONABORT); + rc = umem_tx_end(umm, 1); + assert_false(rc == 0); + validate_persist_activity(1, 0); + utest_get_scm_used_space(arg->ta_utx, &cur_mem_used); + assert_true(initial_mem_used == cur_mem_used); + + /* free outside of tx */ + rc = umem_tx_begin(umm, NULL); + assert_int_equal(rc, 0); + umoff1 = umem_zalloc_from_bucket(umm, 4, UMEM_DEFAULT_MBKT_ID); + assert_false(UMOFF_IS_NULL(umoff1)); + rc = umem_tx_end(umm, 0); + assert_int_equal(rc, 0); + expect_assert_failure(umem_free(umm, umoff1)); + + /* abort after alloc and used memory should not increase */ + snap_persist_activity(); + utest_get_scm_used_space(arg->ta_utx, &initial_mem_used); + rc = umem_tx_begin(umm, NULL); + assert_int_equal(rc, 0); + umoff1 = umem_alloc_from_bucket(umm, 16, UMEM_DEFAULT_MBKT_ID); + assert_false(UMOFF_IS_NULL(umoff1)); + umoff1 = umem_zalloc_from_bucket(umm, 32, UMEM_DEFAULT_MBKT_ID); + assert_false(UMOFF_IS_NULL(umoff2)); + rc = umem_tx_abort(umm, 1); + assert_false(rc == 0); + validate_persist_activity(1, 0); + utest_get_scm_used_space(arg->ta_utx, &cur_mem_used); + assert_true(initial_mem_used == cur_mem_used); + +} + +static void +test_tx_add(void **state) +{ + struct test_arg *arg = *state; + struct umem_instance *umm = utest_utx2umm(arg->ta_utx); + int rc; + umem_off_t umoff; + char *start_ptr, *tmp_ptr; + char local_buf[2048]; + + /* Setup */ + umoff = umem_atomic_alloc(umm, 2048, UMEM_TYPE_ANY); + assert_false(UMOFF_IS_NULL(umoff)); + start_ptr = umem_off2ptr(umm, umoff); + memset(local_buf, 0, 2048); + tmp_ptr = umem_atomic_copy(umm, start_ptr, local_buf, 2048, UMEM_COMMIT_IMMEDIATE); + assert_true(tmp_ptr == start_ptr); + + /* Negative 
tests */ + expect_assert_failure(umem_tx_add(umm, umoff, 128)); + + /* Normal operation */ + snap_persist_activity(); + rc = umem_tx_begin(umm, NULL); + assert_int_equal(rc, 0); + rc = umem_tx_add(umm, umoff, 128); + assert_int_equal(rc, 0); + start_ptr = umem_off2ptr(umm, umoff); + memset(start_ptr, 'a', 128); + memset(local_buf, 'a', 128); + rc = umem_tx_end(umm, 0); + assert_int_equal(rc, 0); + validate_persist_activity(1, 1); + assert_false(strncmp(local_buf, start_ptr, 128)); + + /* Abort a transaction after tx add */ + snap_persist_activity(); + rc = umem_tx_begin(umm, NULL); + assert_int_equal(rc, 0); + rc = umem_tx_add(umm, umoff+128, 128); + assert_int_equal(rc, 0); + tmp_ptr = umem_off2ptr(umm, umoff+128); + memset(tmp_ptr, 'b', 128); + rc = umem_tx_abort(umm, 1); + assert_true(rc != 0); + validate_persist_activity(1, 0); + assert_false(strncmp(local_buf, start_ptr, 256)); + + /* Invalid offset */ + snap_persist_activity(); + rc = umem_tx_begin(umm, NULL); + assert_int_equal(rc, 0); + rc = umem_tx_add(umm, umm->umm_pool->up_store.stor_size + 4096, 128); + assert_true(rc != 0); + assert_true(umem_tx_stage(umm) == UMEM_STAGE_ONABORT); + rc = umem_tx_end(umm, rc); + assert_true(rc != 0); + validate_persist_activity(1, 0); +} + +static void +test_tx_add_ptr(void **state) +{ + struct test_arg *arg = *state; + struct umem_instance *umm = utest_utx2umm(arg->ta_utx); + int rc; + umem_off_t umoff; + char *start_ptr, *tmp_ptr; + char local_buf[2048]; + + /* Setup */ + umoff = umem_atomic_alloc(umm, 2048, UMEM_TYPE_ANY); + assert_false(UMOFF_IS_NULL(umoff)); + start_ptr = umem_off2ptr(umm, umoff); + memset(local_buf, 0, 2048); + tmp_ptr = umem_atomic_copy(umm, start_ptr, local_buf, 2048, UMEM_COMMIT_IMMEDIATE); + assert_true(tmp_ptr == start_ptr); + + /* Negative tests */ + expect_assert_failure(umem_tx_add_ptr(umm, start_ptr, 128)); + + /* Normal operation */ + snap_persist_activity(); + rc = umem_tx_begin(umm, NULL); + assert_int_equal(rc, 0); + start_ptr = umem_off2ptr(umm, umoff); + rc = umem_tx_add_ptr(umm, start_ptr, 128); + assert_int_equal(rc, 0); + memset(start_ptr, 'a', 128); + memset(local_buf, 'a', 128); + rc = umem_tx_end(umm, 0); + assert_int_equal(rc, 0); + validate_persist_activity(1, 1); + assert_false(strncmp(local_buf, start_ptr, 128)); + + /* Abort a transaction after tx add */ + snap_persist_activity(); + rc = umem_tx_begin(umm, NULL); + assert_int_equal(rc, 0); + tmp_ptr = umem_off2ptr(umm, umoff+128); + rc = umem_tx_add_ptr(umm, tmp_ptr, 128); assert_int_equal(rc, 0); memset(tmp_ptr, 'b', 128); rc = umem_tx_abort(umm, 1); @@ -1094,17 +1410,6 @@ test_tx_xadd_ptr(void **state) assert_true(rc != 0); validate_persist_activity(1, 0); assert_false(strncmp(local_buf, start_ptr, 512)); - - /* Invalid pointer */ - snap_persist_activity(); - rc = umem_tx_begin(umm, NULL); - assert_int_equal(rc, 0); - rc = umem_tx_xadd_ptr(umm, local_buf, 128, UMEM_XADD_NO_SNAPSHOT); - assert_true(rc != 0); - assert_true(umem_tx_stage(umm) == UMEM_STAGE_ONABORT); - rc = umem_tx_end(umm, rc); - assert_true(rc != 0); - validate_persist_activity(1, 0); } static void @@ -1238,6 +1543,137 @@ test_tx_reserve_publish_cancel(void **state) umem_rsrvd_act_free(&rsrvd_act); } +static void +test_tx_bucket_reserve_publish_cancel(void **state) +{ + struct test_arg *arg = *state; + struct umem_instance *umm = utest_utx2umm(arg->ta_utx); + int rc; + struct umem_rsrvd_act *rsrvd_act; + umem_off_t umoff; + char *rsrv_ptr1, *rsrv_ptr2, *rsrv_ptr3, *rsrv_ptr4; + char *data = "Test Program test_tx_xadd_ptr"; + char 
local_buf[980]; + uint64_t initial_mem_used, cur_mem_used; + uint64_t allotted_mem = 0; + char addon_buf[128]; + + /* Reserve/Publish */ + rc = umem_rsrvd_act_alloc(umm, &rsrvd_act, 2); + assert_int_equal(rc, 0); + umoff = umem_reserve_from_bucket(umm, rsrvd_act, 980, UMEM_DEFAULT_MBKT_ID); + assert_false(UMOFF_IS_NULL(umoff)); + rsrv_ptr1 = umem_off2ptr(umm, umoff); + memset(rsrv_ptr1, 0, 980); + memset(local_buf, 0, 980); + memcpy(rsrv_ptr1+128, data, strlen(data)); + memcpy(local_buf+128, data, strlen(data)); + + umoff = umem_reserve_from_bucket(umm, rsrvd_act, 128, UMEM_DEFAULT_MBKT_ID); + assert_false(UMOFF_IS_NULL(umoff)); + rsrv_ptr2 = umem_off2ptr(umm, umoff); + memset(rsrv_ptr2, 0, 128); + memset(addon_buf, 0, 128); + memcpy(rsrv_ptr2, data, strlen(data)); + memcpy(addon_buf, data, strlen(data)); + + + utest_get_scm_used_space(arg->ta_utx, &initial_mem_used); + snap_persist_activity(); + rc = umem_tx_begin(umm, NULL); + assert_int_equal(rc, 0); + rc = umem_tx_add_ptr(umm, rsrv_ptr1, 128); + assert_int_equal(rc, 0); + strcpy(rsrv_ptr1, "header"); + strcpy(local_buf, "header"); + rc = umem_tx_publish(umm, rsrvd_act); + assert_int_equal(rc, 0); + allotted_mem = 980 + 128; + rc = umem_tx_commit(umm); + assert_int_equal(rc, 0); + validate_persist_activity(1, 1); + utest_get_scm_used_space(arg->ta_utx, &cur_mem_used); + assert_true(cur_mem_used >= initial_mem_used + allotted_mem); + assert_int_equal(memcmp(rsrv_ptr1, local_buf, 980), 0); + assert_int_equal(memcmp(rsrv_ptr2, addon_buf, 128), 0); + umem_rsrvd_act_free(&rsrvd_act); + + + /* Reserve/Cancel */ + rc = umem_rsrvd_act_alloc(umm, &rsrvd_act, 2); + assert_int_equal(rc, 0); + umoff = umem_reserve_from_bucket(umm, rsrvd_act, 980, UMEM_DEFAULT_MBKT_ID); + assert_false(UMOFF_IS_NULL(umoff)); + rsrv_ptr1 = umem_off2ptr(umm, umoff); + memset(rsrv_ptr1, 1, 980); + memset(local_buf, 1, 980); + + umoff = umem_reserve_from_bucket(umm, rsrvd_act, 128, UMEM_DEFAULT_MBKT_ID); + assert_false(UMOFF_IS_NULL(umoff)); + rsrv_ptr2 = umem_off2ptr(umm, umoff); + memset(rsrv_ptr2, 1, 128); + + + utest_get_scm_used_space(arg->ta_utx, &initial_mem_used); + snap_persist_activity(); + rc = umem_tx_begin(umm, NULL); + assert_int_equal(rc, 0); + rc = umem_tx_add_ptr(umm, rsrv_ptr1, 128); + assert_int_equal(rc, 0); + strcpy(rsrv_ptr1, "header"); + rc = umem_tx_add_ptr(umm, rsrv_ptr2, 128); + assert_int_equal(rc, 0); + strcpy(rsrv_ptr2, "leader"); + rc = umem_tx_abort(umm, 1); + assert_false(rc == 0); + assert_int_equal(memcmp(rsrv_ptr1, local_buf, 980), 0); + assert_int_equal(memcmp(rsrv_ptr2, local_buf, 128), 0); + umem_cancel(umm, rsrvd_act); + validate_persist_activity(1, 0); + utest_get_scm_used_space(arg->ta_utx, &cur_mem_used); + assert_true(cur_mem_used >= initial_mem_used); + umoff = umem_atomic_alloc_from_bucket(umm, 980, UMEM_TYPE_ANY, UMEM_DEFAULT_MBKT_ID); + assert_false(UMOFF_IS_NULL(umoff)); + rsrv_ptr3 = umem_off2ptr(umm, umoff); + assert_ptr_equal(rsrv_ptr1, rsrv_ptr3); + umoff = umem_atomic_alloc_from_bucket(umm, 128, UMEM_TYPE_ANY, UMEM_DEFAULT_MBKT_ID); + assert_false(UMOFF_IS_NULL(umoff)); + rsrv_ptr4 = umem_off2ptr(umm, umoff); + assert_ptr_equal(rsrv_ptr2, rsrv_ptr4); + umem_rsrvd_act_free(&rsrvd_act); + + /* reserve - atomic_copy - cancel */ + rc = umem_rsrvd_act_alloc(umm, &rsrvd_act, 2); + assert_int_equal(rc, 0); + umoff = umem_reserve_from_bucket(umm, rsrvd_act, 980, UMEM_DEFAULT_MBKT_ID); + assert_false(UMOFF_IS_NULL(umoff)); + rsrv_ptr1 = umem_off2ptr(umm, umoff); + memset(local_buf, 1, 980); + memcpy(local_buf+128, data, 
strlen(data)); + snap_persist_activity(); + umem_atomic_copy(umm, rsrv_ptr1, local_buf, 980, UMEM_COMMIT_IMMEDIATE); + validate_persist_activity(1, 1); + + utest_get_scm_used_space(arg->ta_utx, &initial_mem_used); + snap_persist_activity(); + rc = umem_tx_begin(umm, NULL); + assert_int_equal(rc, 0); + rc = umem_tx_add_ptr(umm, rsrv_ptr1, 128); + assert_int_equal(rc, 0); + strcpy(rsrv_ptr1, "header"); + strcpy(local_buf, "header"); + rc = umem_tx_publish(umm, rsrvd_act); + assert_int_equal(rc, 0); + allotted_mem = 980; + rc = umem_tx_commit(umm); + assert_int_equal(rc, 0); + validate_persist_activity(1, 1); + utest_get_scm_used_space(arg->ta_utx, &cur_mem_used); + assert_true(cur_mem_used >= initial_mem_used + allotted_mem); + assert_int_equal(memcmp(rsrv_ptr1, local_buf, 980), 0); + umem_rsrvd_act_free(&rsrvd_act); +} + static void test_tx_dfree_publish_cancel(void **state) { @@ -1295,123 +1731,980 @@ test_tx_dfree_publish_cancel(void **state) umem_rsrvd_act_free(&rsrvd_act); } -#if 0 -/** This test is removed because the umempobj_set_slab_desc APIs are removed. Testing the - * underlying dav or pmem APIs should probably be handled elsewhere. - */ static void -test_tx_alloc_withslabs(void **state) +test_tx_bucket_dfree_publish_cancel(void **state) { struct test_arg *arg = *state; struct umem_instance *umm = utest_utx2umm(arg->ta_utx); - struct umem_slab_desc slab[5]; - int rc, i; - umem_off_t ummoff_exact1[5], ummoff_less[5], ummoff_exact2[5], ummoff_greater; - size_t size_exact, size_less, size_greater; - size_t initial_mem_used, cur_mem_used, total_allotted; - - /* Negative tests for allocation class */ - slab[0].unit_size = ULONG_MAX; - slab[0].class_id = 0; - rc = umempobj_set_slab_desc(umm->umm_pool, &slab[0]); - assert_int_not_equal(rc, 0); - slab[0].unit_size = 344; - slab[0].class_id = UINT8_MAX; - rc = umempobj_set_slab_desc(umm->umm_pool, &slab[0]); - assert_int_not_equal(rc, 0); - - /* Valid slab creation */ - for (i = 0; i < 5; i++) { - slab[i].unit_size = (1<<(i*2)) + 200 + i*16; - slab[i].class_id = 0; - rc = umempobj_set_slab_desc(umm->umm_pool, &slab[i]); - assert_int_equal(rc, 0); - assert_int_not_equal(slab[i].class_id, 0); + int rc; + struct umem_rsrvd_act *rsrvd_act; + umem_off_t umoff1, umoff2; + uint64_t freed_mem = 0; + uint64_t initial_mem_used, cur_mem_used; - umm->umm_slabs[i] = slab[i]; - } + /* Defer Free/Publish */ + umoff1 = umem_atomic_alloc_from_bucket(umm, 2048, UMEM_TYPE_ANY, UMEM_DEFAULT_MBKT_ID); + assert_false(UMOFF_IS_NULL(umoff1)); + umoff2 = umem_atomic_alloc_from_bucket(umm, 1024, UMEM_TYPE_ANY, UMEM_DEFAULT_MBKT_ID); + assert_false(UMOFF_IS_NULL(umoff2)); + + rc = umem_rsrvd_act_alloc(umm, &rsrvd_act, 2); + assert_int_equal(rc, 0); + + umem_defer_free(umm, umoff1, rsrvd_act); + umem_defer_free(umm, umoff2, rsrvd_act); utest_get_scm_used_space(arg->ta_utx, &initial_mem_used); + snap_persist_activity(); rc = umem_tx_begin(umm, NULL); assert_int_equal(rc, 0); - total_allotted = 0; - for (i = 0; i < 5; i++) { - size_exact = (1<<(i*2)) + 200 + i*16; - ummoff_exact1[i] = umem_alloc_verb(umm, i, UMEM_FLAG_ZERO, size_exact); - assert_false(UMOFF_IS_NULL(ummoff_exact1[i])); - size_less = 200; - ummoff_less[i] = umem_alloc_verb(umm, i, UMEM_FLAG_ZERO, size_less); - assert_false(UMOFF_IS_NULL(ummoff_less[i])); - assert_true(ummoff_exact1[i] + size_exact == ummoff_less[i]); - ummoff_exact2[i] = umem_alloc_verb(umm, i, UMEM_FLAG_ZERO, size_exact); - assert_false(UMOFF_IS_NULL(ummoff_exact2[i])); - assert_true(ummoff_less[i] + size_exact == ummoff_exact2[i]); - 
total_allotted += size_exact*3; - } + rc = umem_tx_publish(umm, rsrvd_act); + assert_int_equal(rc, 0); + freed_mem = 2048 + 1024; rc = umem_tx_commit(umm); assert_int_equal(rc, 0); + validate_persist_activity(1, 1); utest_get_scm_used_space(arg->ta_utx, &cur_mem_used); - assert_true(initial_mem_used + total_allotted == cur_mem_used); + assert_true(initial_mem_used >= cur_mem_used + freed_mem); + umem_rsrvd_act_free(&rsrvd_act); - for (i = 0; i < 5; i++) { - size_greater = (1<<(i*2)) + 200 + i*16 + 100; - rc = umem_tx_begin(umm, NULL); - assert_int_equal(rc, 0); - ummoff_greater = umem_alloc_verb(umm, i, UMEM_FLAG_ZERO, size_greater); - assert_true(UMOFF_IS_NULL(ummoff_greater)); - rc = umem_tx_end(umm, 1); - assert_int_equal(rc, umem_tx_errno(ENOMEM)); + + /* Defer Free/Cancel */ + umoff1 = umem_atomic_alloc_from_bucket(umm, 2048, UMEM_TYPE_ANY, UMEM_DEFAULT_MBKT_ID); + assert_false(UMOFF_IS_NULL(umoff1)); + umoff2 = umem_atomic_alloc_from_bucket(umm, 1024, UMEM_TYPE_ANY, UMEM_DEFAULT_MBKT_ID); + assert_false(UMOFF_IS_NULL(umoff2)); + + rc = umem_rsrvd_act_alloc(umm, &rsrvd_act, 2); + assert_int_equal(rc, 0); + + umem_defer_free(umm, umoff1, rsrvd_act); + umem_defer_free(umm, umoff2, rsrvd_act); + + utest_get_scm_used_space(arg->ta_utx, &initial_mem_used); + umem_cancel(umm, rsrvd_act); + utest_get_scm_used_space(arg->ta_utx, &cur_mem_used); + assert_true(cur_mem_used >= initial_mem_used); + umem_rsrvd_act_free(&rsrvd_act); +} + +static void +test_atomic_alloc_mb(void **state) +{ + struct test_arg *arg = *state; + struct umem_instance *umm = utest_utx2umm(arg->ta_utx); + umem_off_t umoff, umoff1, umoff2, umoff3, umoff4; + uint32_t mb_id; + int found = 0, i; + + mb_id = umem_allot_mb_evictable(umm, 0); + assert_int_not_equal(mb_id, 0); /* zero maps to non-evictable memory bucket */ + + /* Allocate objects from the memory bucket */ + umoff1 = umem_atomic_alloc_from_bucket(umm, 2048, UMEM_TYPE_ANY, mb_id); + assert_false(UMOFF_IS_NULL(umoff1)); + assert_true(umem_get_mb_from_offset(umm, umoff1) == mb_id); + umoff2 = umem_atomic_alloc_from_bucket(umm, 1024, UMEM_TYPE_ANY, mb_id); + assert_false(UMOFF_IS_NULL(umoff2)); + assert_true(umem_get_mb_from_offset(umm, umoff2) == mb_id); + + /* Allocate from non-evictable memory bucket */ + umoff3 = umem_atomic_alloc_from_bucket(umm, 2048, UMEM_TYPE_ANY, UMEM_DEFAULT_MBKT_ID); + assert_false(UMOFF_IS_NULL(umoff3)); + assert_true(umem_get_mb_from_offset(umm, umoff3) == UMEM_DEFAULT_MBKT_ID); + umoff4 = umem_atomic_alloc_from_bucket(umm, 1024, UMEM_TYPE_ANY, UMEM_DEFAULT_MBKT_ID); + assert_false(UMOFF_IS_NULL(umoff4)); + assert_true(umem_get_mb_from_offset(umm, umoff4) == UMEM_DEFAULT_MBKT_ID); + + /* Free allocated objects */ + umem_atomic_free(umm, umoff1); + umem_atomic_free(umm, umoff2); + umem_atomic_free(umm, umoff3); + umem_atomic_free(umm, umoff4); + + /* + * Validate whether those freed objects are in the free list of respective + * Memory buckets. We do many allocations and free to ensure that the objects + * in recycler bin are moved back for reallocation. 
+ */ + + found = 0; + for (i = 0; i < 16 * 1024; i++) { + umoff = umem_atomic_alloc_from_bucket(umm, 2048, UMEM_TYPE_ANY, mb_id); + assert_false(UMOFF_IS_NULL(umoff)); + assert_true(umem_get_mb_from_offset(umm, umoff) == mb_id); + umem_atomic_free(umm, umoff); + if (umoff == umoff1) { + found = 1; + break; + } + } + assert_int_equal(found, 1); + + found = 0; + for (i = 0; i < 16 * 1024; i++) { + umoff = + umem_atomic_alloc_from_bucket(umm, 2048, UMEM_TYPE_ANY, UMEM_DEFAULT_MBKT_ID); + assert_false(UMOFF_IS_NULL(umoff)); + assert_true(umem_get_mb_from_offset(umm, umoff) == UMEM_DEFAULT_MBKT_ID); + umem_atomic_free(umm, umoff); + if (umoff == umoff3) { + found = 1; + break; + } + } + assert_int_equal(found, 1); + + found = 0; + for (i = 0; i < 16 * 1024; i++) { + umoff = umem_atomic_alloc_from_bucket(umm, 1024, UMEM_TYPE_ANY, mb_id); + assert_false(UMOFF_IS_NULL(umoff)); + assert_true(umem_get_mb_from_offset(umm, umoff) == mb_id); + umem_atomic_free(umm, umoff); + if (umoff == umoff2) { + found = 1; + break; + } + } + assert_int_equal(found, 1); + + found = 0; + for (i = 0; i < 16 * 1024; i++) { + umoff = + umem_atomic_alloc_from_bucket(umm, 1024, UMEM_TYPE_ANY, UMEM_DEFAULT_MBKT_ID); + assert_false(UMOFF_IS_NULL(umoff)); + assert_true(umem_get_mb_from_offset(umm, umoff) == UMEM_DEFAULT_MBKT_ID); + umem_atomic_free(umm, umoff); + if (umoff == umoff4) { + found = 1; + break; + } } + assert_int_equal(found, 1); +} + +static void +test_atomic_alloc_overflow_mb(void **state) +{ + struct test_arg *arg = *state; + struct umem_instance *umm = utest_utx2umm(arg->ta_utx); + umem_off_t umoff, umoff_prev; + umem_off_t umoff1 = UMOFF_NULL, umoff2 = UMOFF_NULL, umoff3 = UMOFF_NULL; + uint32_t mb_id, ret_id; + int hit = 0; + uint64_t allocated_size = 0; + + mb_id = umem_allot_mb_evictable(umm, 0); + assert_int_not_equal(mb_id, 0); /* zero maps to non-evictable memory bucket */ + + do { + hit = 0; + /* Allocate objects from the memory bucket */ + umoff_prev = umoff1; + umoff1 = umem_atomic_alloc_from_bucket(umm, 2048, UMEM_TYPE_ANY, mb_id); + assert_false(UMOFF_IS_NULL(umoff1)); + ret_id = umem_get_mb_from_offset(umm, umoff1); + if (ret_id == mb_id) + allocated_size += 2048; + else if (ret_id == 0) { + umem_atomic_free(umm, umoff1); + umoff1 = umoff_prev; + hit++; + } else + assert_true(ret_id == mb_id); + umoff_prev = umoff2; + umoff2 = umem_atomic_alloc_from_bucket(umm, 1024, UMEM_TYPE_ANY, mb_id); + assert_false(UMOFF_IS_NULL(umoff2)); + ret_id = umem_get_mb_from_offset(umm, umoff2); + if (ret_id == mb_id) + allocated_size += 1024; + else if (ret_id == 0) { + umem_atomic_free(umm, umoff2); + umoff2 = umoff_prev; + hit++; + } else + assert_true(ret_id == mb_id); + umoff_prev = umoff3; + umoff3 = umem_atomic_alloc_from_bucket(umm, 128, UMEM_TYPE_ANY, mb_id); + assert_false(UMOFF_IS_NULL(umoff3)); + ret_id = umem_get_mb_from_offset(umm, umoff3); + if (ret_id == mb_id) + allocated_size += 128; + else if (ret_id == 0) { + umem_atomic_free(umm, umoff3); + umoff3 = umoff_prev; + hit++; + } else + assert_true(ret_id == mb_id); + } while (hit != 3); + print_message("Total allocated size from mb %lu\n", allocated_size); + + umem_atomic_free(umm, umoff1); + umem_atomic_free(umm, umoff2); + umem_atomic_free(umm, umoff3); + + /* + * The only free memory in the MB is that of the offsets freed above. + * Subsequent allocation from the same MB should return the same offsets. 
+ */ + umoff = umem_atomic_alloc_from_bucket(umm, 2048, UMEM_TYPE_ANY, mb_id); + assert_false(UMOFF_IS_NULL(umoff)); + assert_true(umem_get_mb_from_offset(umm, umoff) == mb_id); + assert_true(umoff == umoff1); + umoff = umem_atomic_alloc_from_bucket(umm, 1024, UMEM_TYPE_ANY, mb_id); + assert_false(UMOFF_IS_NULL(umoff)); + assert_true(umem_get_mb_from_offset(umm, umoff) == mb_id); + assert_true(umoff == umoff2); + umoff = umem_atomic_alloc_from_bucket(umm, 128, UMEM_TYPE_ANY, mb_id); + assert_false(UMOFF_IS_NULL(umoff)); + assert_true(umem_get_mb_from_offset(umm, umoff) == mb_id); + assert_true(umoff == umoff3); +} + +static void +test_reserve_from_mb(void **state) +{ + struct test_arg *arg = *state; + struct umem_instance *umm = utest_utx2umm(arg->ta_utx); + umem_off_t umoff, umoff1; + uint32_t mb_id; + struct umem_rsrvd_act *rsrvd_act; + size_t rsrv_size = 1032; + int found = 0, i, rc; + + mb_id = umem_allot_mb_evictable(umm, 0); + assert_int_not_equal(mb_id, 0); /* zero maps to non-evictable memory bucket */ + + /* Reserve an object and then cancel the allocation */ + rc = umem_rsrvd_act_alloc(umm, &rsrvd_act, 1); + assert_int_equal(rc, 0); + umoff = umem_reserve_from_bucket(umm, rsrvd_act, rsrv_size, mb_id); + assert_false(UMOFF_IS_NULL(umoff)); + /* Validate that the object is from the memory bucket of interest. */ + assert_true(umem_get_mb_from_offset(umm, umoff) == mb_id); + umem_cancel(umm, rsrvd_act); + umem_rsrvd_act_free(&rsrvd_act); + /* Validate that the object is really freed */ + rc = umem_rsrvd_act_alloc(umm, &rsrvd_act, 1); + assert_int_equal(rc, 0); + umoff1 = umem_reserve_from_bucket(umm, rsrvd_act, rsrv_size, mb_id); + assert_false(UMOFF_IS_NULL(umoff)); + assert_true(umoff1 == umoff); + umem_cancel(umm, rsrvd_act); + umem_rsrvd_act_free(&rsrvd_act); + + /* Reserve an object and publish it within a transaction. */ + rc = umem_rsrvd_act_alloc(umm, &rsrvd_act, 1); + assert_int_equal(rc, 0); + umoff = umem_reserve_from_bucket(umm, rsrvd_act, rsrv_size, mb_id); + assert_false(UMOFF_IS_NULL(umoff)); + /* Validate that the object is from the memory bucket of interest. */ + assert_true(umem_get_mb_from_offset(umm, umoff) == mb_id); + rc = umem_tx_begin(umm, NULL); + assert_int_equal(rc, 0); + rc = umem_tx_publish(umm, rsrvd_act); + assert_int_equal(rc, 0); + rc = umem_tx_commit(umm); + assert_int_equal(rc, 0); + umem_rsrvd_act_free(&rsrvd_act); + /* + * Make sure that the above allocated object is never returned by + * subsequent allocation. + */ + for (i = 0; i < 32 * 1024; i++) { + umoff1 = umem_atomic_alloc_from_bucket(umm, rsrv_size, UMEM_TYPE_ANY, mb_id); + assert_false(UMOFF_IS_NULL(umoff1)); + assert_true(umem_get_mb_from_offset(umm, umoff1) == mb_id); + umem_atomic_free(umm, umoff1); + assert_false(umoff == umoff1); + } + + /* Defer free an object and cancel it subsequently */ + rc = umem_rsrvd_act_alloc(umm, &rsrvd_act, 1); + assert_int_equal(rc, 0); + umem_defer_free(umm, umoff, rsrvd_act); + assert_int_equal(rc, 0); + umem_cancel(umm, rsrvd_act); + umem_rsrvd_act_free(&rsrvd_act); + /* Validate that the object is not really freed */ + for (i = 0; i < 32 * 1024; i++) { + umoff1 = umem_atomic_alloc_from_bucket(umm, rsrv_size, UMEM_TYPE_ANY, mb_id); + assert_false(UMOFF_IS_NULL(umoff1)); + assert_true(umem_get_mb_from_offset(umm, umoff1) == mb_id); + umem_atomic_free(umm, umoff1); + assert_false(umoff == umoff1); + } + + /* Defer free an object and publish it within a transaction. 
*/ + rc = umem_rsrvd_act_alloc(umm, &rsrvd_act, 1); + assert_int_equal(rc, 0); + umem_defer_free(umm, umoff, rsrvd_act); + assert_int_equal(rc, 0); + rc = umem_tx_begin(umm, NULL); + assert_int_equal(rc, 0); + rc = umem_tx_publish(umm, rsrvd_act); + assert_int_equal(rc, 0); + rc = umem_tx_commit(umm); + assert_int_equal(rc, 0); + umem_rsrvd_act_free(&rsrvd_act); + /* Validate that the object is returned in subsequent allocation */ + found = 0; + for (i = 0; i < 32 * 1024; i++) { + umoff1 = umem_atomic_alloc_from_bucket(umm, rsrv_size, UMEM_TYPE_ANY, mb_id); + assert_false(UMOFF_IS_NULL(umoff1)); + assert_true(umem_get_mb_from_offset(umm, umoff1) == mb_id); + umem_atomic_free(umm, umoff1); + if (umoff == umoff1) { + found = 1; + break; + } + } + assert_int_equal(found, 1); +} + +static void +test_tx_alloc_from_mb(void **state) +{ + struct test_arg *arg = *state; + struct umem_instance *umm = utest_utx2umm(arg->ta_utx); + umem_off_t umoff = UINT64_MAX, umoff1 = UINT64_MAX; + uint32_t mb_id; + size_t alloc_size = 1024; + int found = 0, i, rc; + + mb_id = umem_allot_mb_evictable(umm, 0); + assert_int_not_equal(mb_id, 0); /* zero maps to non-evictable memory bucket */ + + /* Do a tx alloc and fail the transaction. */ + rc = umem_tx_begin(umm, NULL); + assert_int_equal(rc, 0); + umoff = umem_alloc_from_bucket(umm, alloc_size, mb_id); + assert_false(UMOFF_IS_NULL(umoff)); + assert_true(umem_get_mb_from_offset(umm, umoff) == mb_id); + rc = umem_tx_end(umm, 1); + assert_true(rc == umem_tx_errno(1)); + found = 0; + for (i = 0; i < 32 * 1024; i++) { + umoff1 = umem_atomic_alloc_from_bucket(umm, alloc_size, UMEM_TYPE_ANY, mb_id); + assert_false(UMOFF_IS_NULL(umoff1)); + assert_true(umem_get_mb_from_offset(umm, umoff1) == mb_id); + umem_atomic_free(umm, umoff1); + if (umoff == umoff1) { + found = 1; + break; + } + } + assert_int_equal(found, 1); + + /* Do a tx alloc and pass the transaction. */ + rc = umem_tx_begin(umm, NULL); + assert_int_equal(rc, 0); + umoff = umem_alloc_from_bucket(umm, alloc_size, mb_id); + assert_false(UMOFF_IS_NULL(umoff)); + assert_true(umem_get_mb_from_offset(umm, umoff) == mb_id); + rc = umem_tx_end(umm, 0); + assert_int_equal(rc, 0); + for (i = 0; i < 32 * 1024; i++) { + umoff1 = umem_atomic_alloc_from_bucket(umm, alloc_size, UMEM_TYPE_ANY, mb_id); + assert_false(UMOFF_IS_NULL(umoff1)); + assert_true(umem_get_mb_from_offset(umm, umoff1) == mb_id); + umem_atomic_free(umm, umoff1); + assert_false(umoff == umoff1); + } + + /* Do a tx free and fail the transaction. */ + rc = umem_tx_begin(umm, NULL); + assert_int_equal(rc, 0); + umem_free(umm, umoff); + rc = umem_tx_end(umm, 1); + assert_true(rc == umem_tx_errno(1)); + for (i = 0; i < 32 * 1024; i++) { + umoff1 = umem_atomic_alloc_from_bucket(umm, alloc_size, UMEM_TYPE_ANY, mb_id); + assert_false(UMOFF_IS_NULL(umoff1)); + assert_true(umem_get_mb_from_offset(umm, umoff1) == mb_id); + umem_atomic_free(umm, umoff1); + assert_false(umoff == umoff1); + } + + /* Do a tx free and pass the transaction. 
*/ + rc = umem_tx_begin(umm, NULL); + assert_int_equal(rc, 0); + umem_free(umm, umoff); + rc = umem_tx_end(umm, 0); + assert_int_equal(rc, 0); + found = 0; + for (i = 0; i < 32 * 1024; i++) { + umoff1 = umem_atomic_alloc_from_bucket(umm, alloc_size, UMEM_TYPE_ANY, mb_id); + assert_false(UMOFF_IS_NULL(umoff1)); + assert_true(umem_get_mb_from_offset(umm, umoff1) == mb_id); + umem_atomic_free(umm, umoff1); + if (umoff == umoff1) { + found = 1; + break; + } + } + assert_int_equal(found, 1); +} + +struct bucket_alloc_info { + umem_off_t start_umoff; + uint32_t num_allocs; + uint32_t mb_id; +}; + +void +alloc_bucket_to_full(struct umem_instance *umm, struct bucket_alloc_info *ainfo) +{ + umem_off_t umoff, prev_umoff; + size_t alloc_size = 128; + umem_off_t *ptr; + struct umem_cache_range rg = {0}; + struct umem_pin_handle *p_hdl; + uint32_t id = ainfo->mb_id; + + if (UMOFF_IS_NULL(ainfo->start_umoff)) { + ainfo->start_umoff = + umem_atomic_alloc_from_bucket(umm, alloc_size, UMEM_TYPE_ANY, id); + assert_false(UMOFF_IS_NULL(ainfo->start_umoff)); + ainfo->num_allocs++; + assert_true(umem_get_mb_from_offset(umm, ainfo->start_umoff) == id); + } + prev_umoff = ainfo->start_umoff; + rg.cr_off = umem_get_mb_base_offset(umm, id); + rg.cr_size = 1; + assert_true(umem_cache_pin(&umm->umm_pool->up_store, &rg, 1, 0, &p_hdl) == 0); + + while (1) { + umoff = umem_atomic_alloc_from_bucket(umm, alloc_size, UMEM_TYPE_ANY, id); + assert_false(UMOFF_IS_NULL(umoff)); + if (umem_get_mb_from_offset(umm, umoff) != id) { + umem_atomic_free(umm, umoff); + break; + } + ptr = (umem_off_t *)umem_off2ptr(umm, prev_umoff); + *ptr = umoff; + ptr = (umem_off_t *)umem_off2ptr(umm, umoff); + *ptr = UMOFF_NULL; + prev_umoff = umoff; + ainfo->num_allocs++; + } + umem_cache_unpin(&umm->umm_pool->up_store, p_hdl); + print_message("Bulk Alloc: Bucket %d, start off %lu num_allocation %d\n", ainfo->mb_id, + ainfo->start_umoff, ainfo->num_allocs); +} + +void +free_bucket_by_pct(struct umem_instance *umm, struct bucket_alloc_info *ainfo, int pct) +{ + int num_free = (ainfo->num_allocs * pct) / 100; + umem_off_t umoff, *ptr, next_umoff; + struct umem_pin_handle *p_hdl; + struct umem_cache_range rg = {0}; + int i, rc; + + assert_true((pct >= 0) && (pct <= 100)); + + if (UMOFF_IS_NULL(ainfo->start_umoff)) + return; + print_message("Bulk Free BEFORE: Bucket %d, start off %lu num_allocation %d\n", + ainfo->mb_id, ainfo->start_umoff, ainfo->num_allocs); + + rg.cr_off = umem_get_mb_base_offset(umm, ainfo->mb_id); + rg.cr_size = 1; + rc = umem_cache_pin(&umm->umm_pool->up_store, &rg, 1, 0, &p_hdl); + assert_true(rc == 0); + + umoff = ainfo->start_umoff; + for (i = 0; i < num_free; i++) { + ptr = (umem_off_t *)umem_off2ptr(umm, umoff); + next_umoff = *ptr; + umem_atomic_free(umm, umoff); + umoff = next_umoff; + ainfo->num_allocs--; + if (UMOFF_IS_NULL(umoff)) + break; + } + ainfo->start_umoff = umoff; + umem_cache_unpin(&umm->umm_pool->up_store, p_hdl); + print_message("Bulk Free AFTER: Bucket %d, start off %lu num_allocation %d\n", ainfo->mb_id, + ainfo->start_umoff, ainfo->num_allocs); +} + +static void +test_tx_alloc_from_multimb(void **state) +{ + struct test_arg *arg = *state; + struct umem_instance *umm = utest_utx2umm(arg->ta_utx); + struct bucket_alloc_info ainfo[10]; + uint32_t id; + int i; + + for (i = 0; i < 10; i++) { + /* Create an MB and fill it with allocs */ + ainfo[i].mb_id = umem_allot_mb_evictable(umm, 0); + ainfo[i].num_allocs = 0; + ainfo[i].start_umoff = UMOFF_NULL; + assert_true(ainfo[i].mb_id != 0); + 
alloc_bucket_to_full(umm, &ainfo[i]); + } + + /* Free 5% of space for MB 2 */ + free_bucket_by_pct(umm, &ainfo[2], 5); /* 90+ */ + /* Free 30% of space for MB 3 */ + free_bucket_by_pct(umm, &ainfo[3], 30); /* 30-75 */ + /* Free 80% of space for MB 4 */ + free_bucket_by_pct(umm, &ainfo[4], 80); /* 0-30 */ + /* Free 20% of space for MB 5 */ + free_bucket_by_pct(umm, &ainfo[5], 20); /* 75-90 */ + /* Free 18% of space for MB 6 */ + free_bucket_by_pct(umm, &ainfo[6], 18); /* 75-90 */ + /* Free 50% of space for MB 7 */ + free_bucket_by_pct(umm, &ainfo[7], 50); /* 30-75 */ + /* Free 90% of space for MB 8 */ + free_bucket_by_pct(umm, &ainfo[8], 90); /* 0-30 */ + + /* The allocator should return an MB with utilization in the 30%-75% band */ + id = umem_allot_mb_evictable(umm, 0); + print_message("obtained id %d, expected is %d\n", id, ainfo[3].mb_id); + assert_true(id == ainfo[3].mb_id); + alloc_bucket_to_full(umm, &ainfo[3]); + id = umem_allot_mb_evictable(umm, 0); + print_message("obtained id %d, expected is %d\n", id, ainfo[7].mb_id); + assert_true(id == ainfo[7].mb_id); + alloc_bucket_to_full(umm, &ainfo[7]); + + /* Next preference should be the 0%-30% band */ + id = umem_allot_mb_evictable(umm, 0); + print_message("obtained id %d, expected is %d\n", id, ainfo[4].mb_id); + assert_true(id == ainfo[4].mb_id); + alloc_bucket_to_full(umm, &ainfo[4]); + id = umem_allot_mb_evictable(umm, 0); + print_message("obtained id %d, expected is %d\n", id, ainfo[8].mb_id); + assert_true(id == ainfo[8].mb_id); + alloc_bucket_to_full(umm, &ainfo[8]); + + /* Next, the allocator should create a new memory bucket. */ + id = umem_allot_mb_evictable(umm, 0); + for (i = 0; i < 10; i++) + assert_true(id != ainfo[i].mb_id); + print_message("obtained id %d\n", id); + + /* The 75-90% and 90%+ utilization bands cannot be tested without eviction support. + * TBD: add that coverage once this test environment supports eviction.
+ */ +} + +static void +test_umempobj_create_smallsize(void **state) +{ + int num = 0; + char *name; + uint32_t id; + struct umem_store ustore_tmp = {.stor_size = POOL_SIZE, + .stor_ops = &_store_ops_v2, + .store_type = DAOS_MD_BMEM_V2, + .stor_priv = (void *)(UINT64_MAX)}; + struct umem_attr uma; + struct umem_instance umm; + + uma.uma_id = umempobj_backend_type2class_id(ustore_tmp.store_type); + + /* umempobj_create with zero scm size */ + D_ASPRINTF(name, "/mnt/daos/umem-test-tmp-%d", num++); + assert_true(name != NULL); + uma.uma_pool = + umempobj_create(name, "invalid_pool", UMEMPOBJ_ENABLE_STATS, 0, 0666, &ustore_tmp); + assert_ptr_equal(uma.uma_pool, NULL); + unlink(name); + D_FREE(name); + + /* umempobj_create with zero metablob size */ + D_ASPRINTF(name, "/mnt/daos/umem-test-tmp-%d", num++); + assert_true(name != NULL); + ustore_tmp.stor_size = 0; + uma.uma_pool = umempobj_create(name, "invalid_pool", UMEMPOBJ_ENABLE_STATS, POOL_SIZE, 0666, + &ustore_tmp); + assert_ptr_equal(uma.uma_pool, NULL); + ustore_tmp.stor_size = POOL_SIZE; + unlink(name); + D_FREE(name); + + /* umempobj_create with scm size less than 32MB */ + D_ASPRINTF(name, "/mnt/daos/umem-test-tmp-%d", num++); + assert_true(name != NULL); + uma.uma_pool = umempobj_create(name, "invalid_pool", UMEMPOBJ_ENABLE_STATS, + 24 * 1024 * 1024, 0666, &ustore_tmp); + assert_ptr_equal(uma.uma_pool, NULL); + unlink(name); + D_FREE(name); + + /* umempobj_create with scm size set to 112MB */ + D_ASPRINTF(name, "/mnt/daos/umem-test-tmp-%d", num++); + assert_true(name != NULL); + uma.uma_pool = umempobj_create(name, "invalid_pool", UMEMPOBJ_ENABLE_STATS, + 112 * 1024 * 1024, 0666, &ustore_tmp); + assert_ptr_not_equal(uma.uma_pool, NULL); + umempobj_close(uma.uma_pool); + unlink(name); + D_FREE(name); + + /* umempobj_create with scm and metablob size set to 112MB */ + D_ASPRINTF(name, "/mnt/daos/umem-test-tmp-%d", num++); + assert_true(name != NULL); + ustore_tmp.stor_size = 112 * 1024 * 1024; + uma.uma_pool = umempobj_create(name, "invalid_pool", UMEMPOBJ_ENABLE_STATS, + 112 * 1024 * 1024, 0666, &ustore_tmp); + umem_class_init(&uma, &umm); + id = umem_allot_mb_evictable(&umm, 0); + print_message("with scm == metablob, evictable id returned is %d\n", id); + assert_true(id == 0); + ustore_tmp.stor_size = POOL_SIZE; + assert_ptr_not_equal(uma.uma_pool, NULL); + umempobj_close(uma.uma_pool); + unlink(name); + D_FREE(name); + + /* umempobj_create with scm size greater than metablob size*/ + D_ASPRINTF(name, "/mnt/daos/umem-test-tmp-%d", num++); + assert_true(name != NULL); + ustore_tmp.stor_size = 224 * 1024 * 1024; + uma.uma_pool = umempobj_create(name, "invalid_pool", UMEMPOBJ_ENABLE_STATS, + 112 * 1024 * 1024, 0666, &ustore_tmp); + umem_class_init(&uma, &umm); + id = umem_allot_mb_evictable(&umm, 0); + print_message("with metablob > scm, evictable id returned is %d\n", id); + assert_true(id != 0); + ustore_tmp.stor_size = POOL_SIZE; + assert_ptr_not_equal(uma.uma_pool, NULL); + umempobj_close(uma.uma_pool); + unlink(name); + D_FREE(name); +} + +static void +test_umempobj_nemb_usage(void **state) +{ + int num = 0; + char *name; + struct umem_store ustore_tmp = {.stor_size = 256 * 1024 * 1024, + .stor_ops = &_store_ops_v2, + .store_type = DAOS_MD_BMEM_V2, + .stor_priv = (void *)(UINT64_MAX)}; + struct umem_attr uma; + struct umem_instance umm; + umem_off_t umoff, *ptr = NULL, prev_umoff = UMOFF_NULL; + size_t alloc_size = (10 * 1024 * 1024); + + uma.uma_id = umempobj_backend_type2class_id(ustore_tmp.store_type); + /* Create a heap and 
cache of size 256MB and 249MB (16 & 15 zones) respectively */ + D_ASPRINTF(name, "/mnt/daos/umem-test-tmp-%d", 0); + assert_true(name != NULL); + uma.uma_pool = umempobj_create(name, "invalid_pool", UMEMPOBJ_ENABLE_STATS, + 240 * 1024 * 1024, 0666, &ustore_tmp); + assert_ptr_not_equal(uma.uma_pool, NULL); + + umem_class_init(&uma, &umm); + + /* Do allocation and verify that only 13 zones allotted to non evictable MBs */ + for (num = 0;; num++) { + /* do an allocation that takes more than half the zone size */ + umoff = umem_atomic_alloc(&umm, alloc_size, UMEM_TYPE_ANY); + if (UMOFF_IS_NULL(umoff)) + break; + ptr = (umem_off_t *)umem_off2ptr(&umm, umoff); + *ptr = prev_umoff; + prev_umoff = umoff; + } + /* 80% nemb when heap size greater than cache size */ + assert_int_equal(num, 13); + print_message("Number of allocations is %d\n", num); + + for (--num;; num--) { + umoff = *ptr; + umem_atomic_free(&umm, prev_umoff); + if (UMOFF_IS_NULL(umoff)) + break; + ptr = (umem_off_t *)umem_off2ptr(&umm, umoff); + prev_umoff = umoff; + } + assert_int_equal(num, 0); + umempobj_close(uma.uma_pool); + unlink(name); + D_FREE(name); + + prev_umoff = UMOFF_NULL; + /* Create a heap and cache of size 256MB (16 zones) each */ + D_ASPRINTF(name, "/mnt/daos/umem-test-tmp-%d", 1); + assert_true(name != NULL); + uma.uma_pool = umempobj_create(name, "invalid_pool", UMEMPOBJ_ENABLE_STATS, + 256 * 1024 * 1024, 0666, &ustore_tmp); + assert_ptr_not_equal(uma.uma_pool, NULL); + + umem_class_init(&uma, &umm); + + /* Do allocation and verify that all 16 zones are allotted to non evictable MBs */ + for (num = 0;; num++) { + /* do an allocation that takes more than half the zone size */ + umoff = umem_atomic_alloc(&umm, alloc_size, UMEM_TYPE_ANY); + if (UMOFF_IS_NULL(umoff)) + break; + ptr = (umem_off_t *)umem_off2ptr(&umm, umoff); + *ptr = prev_umoff; + prev_umoff = umoff; + } + assert_int_equal(num, 16); + print_message("Number of allocations is %d\n", num); + + for (--num;; num--) { + umoff = *ptr; + umem_atomic_free(&umm, prev_umoff); + if (UMOFF_IS_NULL(umoff)) + break; + ptr = (umem_off_t *)umem_off2ptr(&umm, umoff); + prev_umoff = umoff; + } + assert_int_equal(num, 0); + umempobj_close(uma.uma_pool); + unlink(name); + D_FREE(name); +} + +static void +test_umempobj_heap_mb_stats(void **state) +{ + int num = 0, count, rc; + char *name; + uint64_t scm_size = 128 * 1024 * 1024; + uint64_t meta_size = 256 * 1024 * 1024; + struct umem_store ustore_tmp = {.stor_size = meta_size, + .stor_ops = &_store_ops_v2, + .store_type = DAOS_MD_BMEM_V2, + .stor_priv = (void *)(UINT64_MAX)}; + struct umem_attr uma; + struct umem_instance umm; + umem_off_t umoff, *ptr = NULL, prev_umoff = UMOFF_NULL; + size_t alloc_size = 128; + uint64_t allocated, allocated0, allocated1, maxsz, maxsz_exp; + uint32_t mb_id; + + uma.uma_id = umempobj_backend_type2class_id(ustore_tmp.store_type); + /* Create a heap and cache of size 256MB and 128MB (16 & 8 zones) respectively */ + D_ASPRINTF(name, "/mnt/daos/umem-test-tmp-%d", 0); + assert_true(name != NULL); + uma.uma_pool = umempobj_create(name, "invalid_pool", UMEMPOBJ_ENABLE_STATS, scm_size, 0666, + &ustore_tmp); + assert_ptr_not_equal(uma.uma_pool, NULL); + maxsz_exp = (uint64_t)(scm_size / MB_SIZE * NEMB_RATIO) * MB_SIZE; + + umem_class_init(&uma, &umm); + + rc = umempobj_get_mbusage(umm.umm_pool, 0, &allocated0, &maxsz); + print_message("NE usage max_size = %lu exp_max_size = %lu allocated = %lu\n", maxsz, + maxsz_exp, allocated0); + assert_int_equal(rc, 0); + assert_int_equal(maxsz, maxsz_exp); + + 
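The expected non-evictable capacity checked just above comes from maxsz_exp = (uint64)(scm_size / MB_SIZE * NEMB_RATIO) * MB_SIZE. A minimal standalone sketch of that arithmetic follows; the 16 MiB bucket size and 0.8 ratio used in main() are assumptions inferred from the zone counts and the "80% nemb" comment in the previous test, not values taken from the headers.

package main

import "fmt"

// nembMaxSize mirrors the C expression above: integer division down to whole
// buckets, scale by the non-evictable ratio, truncate, then convert back to
// bytes.
func nembMaxSize(scmSize, mbSize uint64, nembRatio float64) uint64 {
	return uint64(float64(scmSize/mbSize)*nembRatio) * mbSize
}

func main() {
	const mib = uint64(1) << 20
	// Assumed constants: 16 MiB memory buckets, 80% of the cache kept for
	// non-evictable buckets when the heap is larger than the cache.
	fmt.Println(nembMaxSize(128*mib, 16*mib, 0.8) / mib) // prints 96
}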
/* allocate and consume all of the space */ + for (num = 0;; num++) { + umoff = umem_atomic_alloc(&umm, alloc_size, UMEM_TYPE_ANY); + if (UMOFF_IS_NULL(umoff)) + break; + ptr = (umem_off_t *)umem_off2ptr(&umm, umoff); + *ptr = prev_umoff; + prev_umoff = umoff; + } + rc = umempobj_get_mbusage(umm.umm_pool, 0, &allocated1, &maxsz); + print_message("NE usage max_size = %lu allocated = %lu\n", maxsz, allocated1); + assert_int_equal(rc, 0); + assert_true(allocated1 * 100 / maxsz >= 99); + assert_int_equal(maxsz, maxsz_exp); + + for (count = num; count > num / 2; count--) { + umoff = *ptr; + umem_atomic_free(&umm, prev_umoff); + if (UMOFF_IS_NULL(umoff)) + break; + ptr = (umem_off_t *)umem_off2ptr(&umm, umoff); + prev_umoff = umoff; + } + rc = umempobj_get_mbusage(umm.umm_pool, 0, &allocated, &maxsz); + print_message("NE usage max_size = %lu allocated = %lu\n", maxsz, allocated); + assert_int_equal(rc, 0); + assert_true(allocated < allocated1 / 2); + assert_int_equal(maxsz, maxsz_exp); + for (;;) { + umoff = *ptr; + umem_atomic_free(&umm, prev_umoff); + if (UMOFF_IS_NULL(umoff)) + break; + ptr = (umem_off_t *)umem_off2ptr(&umm, umoff); + prev_umoff = umoff; + } + rc = umempobj_get_mbusage(umm.umm_pool, 0, &allocated, &maxsz); + print_message("NE usage max_size = %lu allocated = %lu\n", maxsz, allocated); + assert_int_equal(rc, 0); + assert_int_equal(allocated, allocated0); + assert_int_equal(maxsz, maxsz_exp); + + /* Now Test an evictable MB */ + mb_id = umem_allot_mb_evictable(&umm, 0); + assert_true(mb_id > 0); + maxsz_exp = MB_SIZE; + + rc = umempobj_get_mbusage(umm.umm_pool, mb_id, &allocated0, &maxsz); + print_message("E usage max_size = %lu exp_max_size = %lu allocated = %lu\n", maxsz, + maxsz_exp, allocated0); + assert_int_equal(rc, 0); + assert_int_equal(maxsz, maxsz_exp); + + prev_umoff = UMOFF_NULL; + ptr = NULL; + /* allocate and consume all of the space */ + for (num = 0;; num++) { + umoff = umem_atomic_alloc_from_bucket(&umm, alloc_size, UMEM_TYPE_ANY, mb_id); + if (umem_get_mb_from_offset(&umm, umoff) != mb_id) { + umem_atomic_free(&umm, umoff); + break; + } + ptr = (umem_off_t *)umem_off2ptr(&umm, umoff); + *ptr = prev_umoff; + prev_umoff = umoff; + } + rc = umempobj_get_mbusage(umm.umm_pool, mb_id, &allocated1, &maxsz); + print_message("E usage max_size = %lu allocated = %lu\n", maxsz, allocated1); + assert_int_equal(rc, 0); + assert_true(allocated1 * 100 / maxsz >= 99); + assert_int_equal(maxsz, maxsz_exp); + + for (count = num; count > num / 2; count--) { + umoff = *ptr; + umem_atomic_free(&umm, prev_umoff); + if (UMOFF_IS_NULL(umoff)) + break; + ptr = (umem_off_t *)umem_off2ptr(&umm, umoff); + prev_umoff = umoff; + } + rc = umempobj_get_mbusage(umm.umm_pool, mb_id, &allocated, &maxsz); + print_message("E usage max_size = %lu allocated = %lu\n", maxsz, allocated); + assert_int_equal(rc, 0); + assert_true(allocated < allocated1 / 2); + assert_int_equal(maxsz, maxsz_exp); + for (;;) { + umoff = *ptr; + umem_atomic_free(&umm, prev_umoff); + if (UMOFF_IS_NULL(umoff)) + break; + ptr = (umem_off_t *)umem_off2ptr(&umm, umoff); + prev_umoff = umoff; + } + rc = umempobj_get_mbusage(umm.umm_pool, mb_id, &allocated, &maxsz); + print_message("E usage max_size = %lu allocated = %lu\n", maxsz, allocated); + assert_int_equal(rc, 0); + assert_int_equal(allocated, allocated0); + assert_int_equal(maxsz, maxsz_exp); + + /* Testing invalid mb_ids */ + rc = umempobj_get_mbusage(umm.umm_pool, mb_id - 1, &allocated, &maxsz); + assert_int_equal(rc, -DER_INVAL); + rc = 
umempobj_get_mbusage(umm.umm_pool, mb_id + 1, &allocated, &maxsz); + assert_int_equal(rc, -DER_INVAL); + rc = umempobj_get_mbusage(umm.umm_pool, 50, &allocated, &maxsz); + assert_int_equal(rc, -DER_INVAL); + + umempobj_close(uma.uma_pool); + unlink(name); + D_FREE(name); } -#endif int main(int argc, char **argv) { - int rc; - - static const struct CMUnitTest umem_tests[] = { - { "BMEM001: Test atomic alloc/free", test_atomic_alloc, - setup_pmem, teardown_pmem}, - { "BMEM002: Test null flags pmem", test_invalid_flags, - setup_pmem, teardown_pmem}, - { "BMEM003: Test alloc pmem", test_alloc, - setup_pmem, teardown_pmem}, - { "BMEM004: Test atomic copy", test_atomic_copy, - setup_pmem, teardown_pmem}, - { "BMEM005: Test simple commit tx", test_simple_commit_tx, - setup_pmem, teardown_pmem}, - { "BMEM006: Test simple abort tx", test_simple_abort_tx, - setup_pmem, teardown_pmem}, - { "BMEM007: Test nested commit tx", test_nested_commit_tx, - setup_pmem, teardown_pmem}, - { "BMEM008: Test nested outer abort tx", test_nested_outer_abort_tx, - setup_pmem, teardown_pmem}, - { "BMEM009: Test nested inner abort tx", test_nested_inner_abort_tx, - setup_pmem, teardown_pmem}, - { "BMEM010: Test tx alloc/free", test_tx_alloc, - setup_pmem, teardown_pmem}, - { "BMEM011: Test tx add range", test_tx_add, - setup_pmem, teardown_pmem}, - { "BMEM012: Test tx add ptr", test_tx_add_ptr, - setup_pmem, teardown_pmem}, - { "BMEM013: Test tx xadd ptr", test_tx_xadd_ptr, - setup_pmem, teardown_pmem}, - { "BMEM014: Test tx reserve publish/cancel", test_tx_reserve_publish_cancel, - setup_pmem, teardown_pmem}, - { "BMEM015: Test tx defer free publish/cancel", test_tx_dfree_publish_cancel, - setup_pmem, teardown_pmem}, - { NULL, NULL, NULL, NULL } - }; + int rc = 0; + + static const struct CMUnitTest v1_tests[] = { + {"BMEM001: Test atomic alloc/free", test_atomic_alloc, setup_pmem, teardown_pmem}, + {"BMEM001a: Test atomic alloc/free", test_atomic_alloc_from_bucket, setup_pmem, + teardown_pmem}, + {"BMEM002: Test null flags pmem", test_invalid_flags, setup_pmem, teardown_pmem}, + {"BMEM003: Test alloc pmem", test_alloc, setup_pmem, teardown_pmem}, + {"BMEM003a: Test alloc pmem", test_alloc_from_bucket, setup_pmem, teardown_pmem}, + {"BMEM004a: Test atomic copy", test_atomic_copy, setup_pmem, teardown_pmem}, + {"BMEM005: Test simple commit tx", test_simple_commit_tx, setup_pmem, teardown_pmem}, + {"BMEM006: Test simple abort tx", test_simple_abort_tx, setup_pmem, teardown_pmem}, + {"BMEM007: Test nested commit tx", test_nested_commit_tx, setup_pmem, teardown_pmem}, + {"BMEM008: Test nested outer abort tx", test_nested_outer_abort_tx, setup_pmem, + teardown_pmem}, + {"BMEM009: Test nested inner abort tx", test_nested_inner_abort_tx, setup_pmem, + teardown_pmem}, + {"BMEM010: Test tx alloc/free", test_tx_alloc, setup_pmem, teardown_pmem}, + {"BMEM010a: Test tx alloc/free", test_tx_alloc_from_bucket, setup_pmem, teardown_pmem}, + {"BMEM011: Test tx add range", test_tx_add, setup_pmem, teardown_pmem}, + {"BMEM012: Test tx add ptr", test_tx_add_ptr, setup_pmem, teardown_pmem}, + {"BMEM013: Test tx xadd ptr", test_tx_xadd_ptr, setup_pmem, teardown_pmem}, + {"BMEM014: Test tx reserve publish/cancel", test_tx_reserve_publish_cancel, setup_pmem, + teardown_pmem}, + {"BMEM014a: Test tx reserve publish/cancel", test_tx_bucket_reserve_publish_cancel, + setup_pmem, teardown_pmem}, + {"BMEM015: Test tx defer free publish/cancel", test_tx_dfree_publish_cancel, setup_pmem, + teardown_pmem}, + {"BMEM015a: Test tx defer free 
publish/cancel", test_tx_bucket_dfree_publish_cancel, + setup_pmem, teardown_pmem}, + {NULL, NULL, NULL, NULL}}; + + static const struct CMUnitTest v2_tests[] = { + {"BMEM001: Test atomic alloc/free", test_atomic_alloc, setup_pmem_v2, teardown_pmem}, + {"BMEM001a: Test atomic alloc/free", test_atomic_alloc_from_bucket, setup_pmem_v2, + teardown_pmem}, + {"BMEM002: Test null flags pmem", test_invalid_flags, setup_pmem_v2, teardown_pmem}, + {"BMEM003: Test alloc pmem", test_alloc, setup_pmem_v2, teardown_pmem}, + {"BMEM003a: Test alloc pmem", test_alloc_from_bucket, setup_pmem_v2, teardown_pmem}, + {"BMEM004a: Test atomic copy", test_atomic_copy, setup_pmem_v2, teardown_pmem}, + {"BMEM005: Test simple commit tx", test_simple_commit_tx, setup_pmem_v2, teardown_pmem}, + {"BMEM006: Test simple abort tx", test_simple_abort_tx, setup_pmem_v2, teardown_pmem}, + {"BMEM007: Test nested commit tx", test_nested_commit_tx, setup_pmem_v2, teardown_pmem}, + {"BMEM008: Test nested outer abort tx", test_nested_outer_abort_tx, setup_pmem_v2, + teardown_pmem}, + {"BMEM009: Test nested inner abort tx", test_nested_inner_abort_tx, setup_pmem_v2, + teardown_pmem}, + {"BMEM010: Test tx alloc/free", test_tx_alloc, setup_pmem_v2, teardown_pmem}, + {"BMEM010a: Test tx alloc/free", test_tx_alloc_from_bucket, setup_pmem_v2, + teardown_pmem}, + {"BMEM011: Test tx add range", test_tx_add, setup_pmem_v2, teardown_pmem}, + {"BMEM012: Test tx add ptr", test_tx_add_ptr, setup_pmem_v2, teardown_pmem}, + {"BMEM013: Test tx xadd ptr", test_tx_xadd_ptr, setup_pmem_v2, teardown_pmem}, + {"BMEM014: Test tx reserve publish/cancel", test_tx_reserve_publish_cancel, + setup_pmem_v2, teardown_pmem}, + {"BMEM014a: Test tx reserve publish/cancel", test_tx_bucket_reserve_publish_cancel, + setup_pmem_v2, teardown_pmem}, + {"BMEM015: Test tx defer free publish/cancel", test_tx_dfree_publish_cancel, + setup_pmem_v2, teardown_pmem}, + {"BMEM015a: Test tx defer free publish/cancel", test_tx_bucket_dfree_publish_cancel, + setup_pmem_v2, teardown_pmem}, + {"BMEM016: Test atomic allocs within a memory bucket", test_atomic_alloc_mb, + setup_pmem_v2, teardown_pmem}, + {"BMEM017: Test atomic allocs overflow a memory bucket", test_atomic_alloc_overflow_mb, + setup_pmem_v2, teardown_pmem}, + {"BMEM018: Test reserve/defer_free from a memory bucket", test_reserve_from_mb, + setup_pmem_v2, teardown_pmem}, + {"BMEM019: Test tx alloc/free from a memory bucket", test_tx_alloc_from_mb, + setup_pmem_v2, teardown_pmem}, + {"BMEM020: Test tx alloc/free from multiple memory buckets", test_tx_alloc_from_multimb, + setup_pmem_v2, teardown_pmem}, + {"BMEM021: Test umempobj create small size", test_umempobj_create_smallsize, NULL, + NULL}, + {"BMEM022: Test umempobj non_evictable MB usage", test_umempobj_nemb_usage, NULL, NULL}, + {"BMEM023: Test umempobj get MB stats", test_umempobj_heap_mb_stats, NULL, NULL}, + {NULL, NULL, NULL, NULL}}; rc = daos_debug_init(DAOS_LOG_DEFAULT); if (rc != 0) return rc; - d_register_alt_assert(mock_assert); - rc = cmocka_run_group_tests_name("umem tests", umem_tests, global_setup, global_teardown); + rc = cmocka_run_group_tests_name("bmem v1 tests", v1_tests, global_setup, global_teardown); + + rc += cmocka_run_group_tests_name("bmem v2 tests", v2_tests, global_setup, global_teardown); daos_debug_fini(); return rc; diff --git a/src/control/SConscript b/src/control/SConscript index 06410fee53a..17b654f162d 100644 --- a/src/control/SConscript +++ b/src/control/SConscript @@ -150,6 +150,7 @@ def scons(): dbenv = denv.Clone() dblibs = 
dbenv.subst("-L$BUILD_DIR/src/gurt " "-L$BUILD_DIR/src/cart " + "-L$BUILD_DIR/src/common/dav_v2 " "-L$BUILD_DIR/src/common " "-L$BUILD_DIR/src/client/dfs " "-L$BUILD_DIR/src/utils " @@ -184,6 +185,7 @@ def scons(): cgolibdirs = aenv.subst("-L$BUILD_DIR/src/control/lib/spdk " "-L$BUILD_DIR/src/gurt " "-L$BUILD_DIR/src/cart " + "-L$BUILD_DIR/src/common/dav_v2 " "-L$BUILD_DIR/src/common " "-L$BUILD_DIR/src/utils/ddb " "-L$SPDK_PREFIX/lib " @@ -210,7 +212,7 @@ def scons(): ddb_env.d_add_rpaths(None, True, True) # Add vos and dependent libs for ddb - ddb_env.AppendENVPath("CGO_LDFLAGS", " -lvos -ldaos_common_pmem -lpmem " + ddb_env.AppendENVPath("CGO_LDFLAGS", " -lvos -ldav_v2 -ldaos_common_pmem -lpmem " "-labt -lgurt -luuid -lbio -lcart", sep=" ") install_go_bin(ddb_env, "ddb", ['ddb']) diff --git a/src/control/cmd/daos/pretty/pool.go b/src/control/cmd/daos/pretty/pool.go index a9f685b536f..f1a0b4525a7 100644 --- a/src/control/cmd/daos/pretty/pool.go +++ b/src/control/cmd/daos/pretty/pool.go @@ -9,6 +9,7 @@ package pretty import ( "fmt" "io" + "strings" "github.com/dustin/go-humanize" "github.com/pkg/errors" @@ -19,14 +20,36 @@ import ( const msgNoPools = "No pools in system" -func getTierNameText(tierIdx int) string { - switch tierIdx { - case int(daos.StorageMediaTypeScm): - return fmt.Sprintf("- Storage tier %d (SCM):", tierIdx) - case int(daos.StorageMediaTypeNvme): - return fmt.Sprintf("- Storage tier %d (NVMe):", tierIdx) - default: - return fmt.Sprintf("- Storage tier %d (unknown):", tierIdx) +func printPoolTiers(memFileBytes uint64, suss []*daos.StorageUsageStats, w *txtfmt.ErrWriter, fullStats bool) { + mdOnSSD := memFileBytes != 0 + for tierIdx, tierStats := range suss { + if mdOnSSD { + if tierIdx == 0 { + if fullStats { + fmt.Fprintf(w, "- Total memory-file size: %s\n", + humanize.Bytes(memFileBytes)) + } + fmt.Fprintf(w, "- Metadata storage:\n") + } else { + fmt.Fprintf(w, "- Data storage:\n") + } + } else { + if tierIdx >= int(daos.StorageMediaTypeMax) { + // Print unknown type tiers. + tierStats.MediaType = daos.StorageMediaTypeMax + } + fmt.Fprintf(w, "- Storage tier %d (%s):\n", tierIdx, + strings.ToUpper(tierStats.MediaType.String())) + } + + fmt.Fprintf(w, " Total size: %s\n", humanize.Bytes(tierStats.Total)) + if fullStats { + fmt.Fprintf(w, " Free: %s, min:%s, max:%s, mean:%s\n", + humanize.Bytes(tierStats.Free), humanize.Bytes(tierStats.Min), + humanize.Bytes(tierStats.Max), humanize.Bytes(tierStats.Mean)) + } else { + fmt.Fprintf(w, " Free: %s\n", humanize.Bytes(tierStats.Free)) + } } } @@ -66,14 +89,8 @@ func PrintPoolInfo(pi *daos.PoolInfo, out io.Writer) error { if pi.QueryMask.HasOption(daos.PoolQueryOptionSpace) && pi.TierStats != nil { fmt.Fprintln(w, "Pool space info:") - fmt.Fprintf(w, "- Target(VOS) count:%d\n", pi.ActiveTargets) - for tierIdx, tierStats := range pi.TierStats { - fmt.Fprintln(w, getTierNameText(tierIdx)) - fmt.Fprintf(w, " Total size: %s\n", humanize.Bytes(tierStats.Total)) - fmt.Fprintf(w, " Free: %s, min:%s, max:%s, mean:%s\n", - humanize.Bytes(tierStats.Free), humanize.Bytes(tierStats.Min), - humanize.Bytes(tierStats.Max), humanize.Bytes(tierStats.Mean)) - } + fmt.Fprintf(w, "- Target count:%d\n", pi.ActiveTargets) + printPoolTiers(pi.MemFileBytes, pi.TierStats, w, true) } return w.Err } @@ -89,11 +106,7 @@ func PrintPoolQueryTargetInfo(pqti *daos.PoolQueryTargetInfo, out io.Writer) err // Maintain output compatibility with the `daos pool query-targets` output. 
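printPoolTiers above keys its headings off MemFileBytes: a non-zero memory-file size switches the output from the PMem-style "Storage tier N (SCM/NVME)" lines to the MD-on-SSD "Metadata storage"/"Data storage" lines. A minimal standalone sketch of that heading choice is below; the tier struct is a reduced stand-in for daos.StorageUsageStats, and the media-type strings are assumed to match its String() output.

package main

import (
	"fmt"
	"strings"
)

// tierStat is a reduced stand-in for daos.StorageUsageStats.
type tierStat struct {
	mediaType string // "scm" or "nvme"
}

// tierHeading mirrors the label selection in printPoolTiers: MD-on-SSD pools
// (memFileBytes != 0) report metadata/data storage, PMem pools report
// numbered tiers with the media type upper-cased.
func tierHeading(memFileBytes uint64, tierIdx int, ts tierStat) string {
	if memFileBytes != 0 {
		if tierIdx == 0 {
			return "- Metadata storage:"
		}
		return "- Data storage:"
	}
	return fmt.Sprintf("- Storage tier %d (%s):", tierIdx, strings.ToUpper(ts.mediaType))
}

func main() {
	tiers := []tierStat{{mediaType: "scm"}, {mediaType: "nvme"}}
	for i, ts := range tiers {
		fmt.Println(tierHeading(0, i, ts)) // PMem-style headings
	}
	for i, ts := range tiers {
		fmt.Println(tierHeading(1, i, ts)) // MD-on-SSD headings
	}
}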
fmt.Fprintf(w, "Target: type %s, state %s\n", pqti.Type, pqti.State) if pqti.Space != nil { - for tierIdx, tierUsage := range pqti.Space { - fmt.Fprintln(w, getTierNameText(tierIdx)) - fmt.Fprintf(w, " Total size: %s\n", humanize.Bytes(tierUsage.Total)) - fmt.Fprintf(w, " Free: %s\n", humanize.Bytes(tierUsage.Free)) - } + printPoolTiers(pqti.MemFileBytes, pqti.Space, w, false) } return w.Err diff --git a/src/control/cmd/daos/pretty/pool_test.go b/src/control/cmd/daos/pretty/pool_test.go index 3a1724e1dda..938b73d0c86 100644 --- a/src/control/cmd/daos/pretty/pool_test.go +++ b/src/control/cmd/daos/pretty/pool_test.go @@ -55,12 +55,14 @@ Pool health info: }, TierStats: []*daos.StorageUsageStats{ { - Total: 2, - Free: 1, + Total: 2, + Free: 1, + MediaType: daos.StorageMediaTypeScm, }, { - Total: 2, - Free: 1, + Total: 2, + Free: 1, + MediaType: daos.StorageMediaTypeNvme, }, }, }, @@ -70,11 +72,11 @@ Pool layout out of date (1 < 2) -- see `+backtickStr+` for details. Pool health info: - Rebuild busy, 42 objs, 21 recs Pool space info: -- Target(VOS) count:1 +- Target count:1 - Storage tier 0 (SCM): Total size: 2 B Free: 1 B, min:0 B, max:0 B, mean:0 B -- Storage tier 1 (NVMe): +- Storage tier 1 (NVME): Total size: 2 B Free: 1 B, min:0 B, max:0 B, mean:0 B `, poolUUID.String()), @@ -99,12 +101,14 @@ Pool space info: }, TierStats: []*daos.StorageUsageStats{ { - Total: 2, - Free: 1, + Total: 2, + Free: 1, + MediaType: daos.StorageMediaTypeScm, }, { - Total: 2, - Free: 1, + Total: 2, + Free: 1, + MediaType: daos.StorageMediaTypeNvme, }, }, }, @@ -115,11 +119,11 @@ Pool health info: - Enabled ranks: 0-2 - Rebuild busy, 42 objs, 21 recs Pool space info: -- Target(VOS) count:1 +- Target count:1 - Storage tier 0 (SCM): Total size: 2 B Free: 1 B, min:0 B, max:0 B, mean:0 B -- Storage tier 1 (NVMe): +- Storage tier 1 (NVME): Total size: 2 B Free: 1 B, min:0 B, max:0 B, mean:0 B `, poolUUID.String()), @@ -144,12 +148,14 @@ Pool space info: }, TierStats: []*daos.StorageUsageStats{ { - Total: 2, - Free: 1, + Total: 2, + Free: 1, + MediaType: daos.StorageMediaTypeScm, }, { - Total: 2, - Free: 1, + Total: 2, + Free: 1, + MediaType: daos.StorageMediaTypeNvme, }, }, }, @@ -160,11 +166,11 @@ Pool health info: - Disabled ranks: 0-1,3 - Rebuild busy, 42 objs, 21 recs Pool space info: -- Target(VOS) count:1 +- Target count:1 - Storage tier 0 (SCM): Total size: 2 B Free: 1 B, min:0 B, max:0 B, mean:0 B -- Storage tier 1 (NVMe): +- Storage tier 1 (NVME): Total size: 2 B Free: 1 B, min:0 B, max:0 B, mean:0 B `, poolUUID.String()), @@ -189,12 +195,14 @@ Pool space info: }, TierStats: []*daos.StorageUsageStats{ { - Total: 2, - Free: 1, + Total: 2, + Free: 1, + MediaType: daos.StorageMediaTypeScm, }, { - Total: 2, - Free: 1, + Total: 2, + Free: 1, + MediaType: daos.StorageMediaTypeNvme, }, }, }, @@ -205,11 +213,11 @@ Pool health info: - Disabled ranks: 0-1,3 - Rebuild unknown, 42 objs, 21 recs Pool space info: -- Target(VOS) count:1 +- Target count:1 - Storage tier 0 (SCM): Total size: 2 B Free: 1 B, min:0 B, max:0 B, mean:0 B -- Storage tier 1 (NVMe): +- Storage tier 1 (NVME): Total size: 2 B Free: 1 B, min:0 B, max:0 B, mean:0 B `, poolUUID.String()), @@ -234,12 +242,14 @@ Pool space info: }, TierStats: []*daos.StorageUsageStats{ { - Total: 2, - Free: 1, + Total: 2, + Free: 1, + MediaType: daos.StorageMediaTypeScm, }, { - Total: 2, - Free: 1, + Total: 2, + Free: 1, + MediaType: daos.StorageMediaTypeNvme, }, }, }, @@ -249,13 +259,60 @@ Pool layout out of date (1 < 2) -- see `+backtickStr+` for details. 
Pool health info: - Rebuild failed, status=2 Pool space info: -- Target(VOS) count:1 +- Target count:1 - Storage tier 0 (SCM): Total size: 2 B Free: 1 B, min:0 B, max:0 B, mean:0 B -- Storage tier 1 (NVMe): +- Storage tier 1 (NVME): Total size: 2 B Free: 1 B, min:0 B, max:0 B, mean:0 B +`, poolUUID.String()), + }, + "normal response: MD-on-SSD": { + pi: &daos.PoolInfo{ + QueryMask: daos.DefaultPoolQueryMask, + State: daos.PoolServiceStateDegraded, + UUID: poolUUID, + TotalTargets: 2, + DisabledTargets: 1, + ActiveTargets: 1, + ServiceLeader: 42, + Version: 100, + PoolLayoutVer: 1, + UpgradeLayoutVer: 2, + Rebuild: &daos.PoolRebuildStatus{ + State: daos.PoolRebuildStateBusy, + Objects: 42, + Records: 21, + }, + TierStats: []*daos.StorageUsageStats{ + { + Total: 2, + Free: 1, + MediaType: daos.StorageMediaTypeScm, + }, + { + Total: 4, + Free: 2, + MediaType: daos.StorageMediaTypeNvme, + }, + }, + MemFileBytes: 1, + }, + expPrintStr: fmt.Sprintf(` +Pool %s, ntarget=2, disabled=1, leader=42, version=100, state=Degraded +Pool layout out of date (1 < 2) -- see `+backtickStr+` for details. +Pool health info: +- Rebuild busy, 42 objs, 21 recs +Pool space info: +- Target count:1 +- Total memory-file size: 1 B +- Metadata storage: + Total size: 2 B + Free: 1 B, min:0 B, max:0 B, mean:0 B +- Data storage: + Total size: 4 B + Free: 2 B, min:0 B, max:0 B, mean:0 B `, poolUUID.String()), }, } { @@ -287,12 +344,14 @@ func TestPretty_PrintPoolQueryTarget(t *testing.T) { State: daos.PoolTargetStateDownOut, Space: []*daos.StorageUsageStats{ { - Total: 6000000000, - Free: 5000000000, + Total: 6000000000, + Free: 5000000000, + MediaType: daos.StorageMediaTypeScm, }, { - Total: 100000000000, - Free: 90000000000, + Total: 100000000000, + Free: 90000000000, + MediaType: daos.StorageMediaTypeNvme, }, }, }, @@ -301,7 +360,7 @@ Target: type unknown, state down_out - Storage tier 0 (SCM): Total size: 6.0 GB Free: 5.0 GB -- Storage tier 1 (NVMe): +- Storage tier 1 (NVME): Total size: 100 GB Free: 90 GB `, @@ -312,12 +371,14 @@ Target: type unknown, state down_out State: daos.PoolTargetStateDown, Space: []*daos.StorageUsageStats{ { - Total: 6000000000, - Free: 5000000000, + Total: 6000000000, + Free: 5000000000, + MediaType: daos.StorageMediaTypeScm, }, { - Total: 100000000000, - Free: 90000000000, + Total: 100000000000, + Free: 90000000000, + MediaType: daos.StorageMediaTypeNvme, }, }, }, @@ -326,7 +387,7 @@ Target: type unknown, state down - Storage tier 0 (SCM): Total size: 6.0 GB Free: 5.0 GB -- Storage tier 1 (NVMe): +- Storage tier 1 (NVME): Total size: 100 GB Free: 90 GB `, @@ -337,12 +398,14 @@ Target: type unknown, state down State: daos.PoolTargetStateUp, Space: []*daos.StorageUsageStats{ { - Total: 6000000000, - Free: 5000000000, + Total: 6000000000, + Free: 5000000000, + MediaType: daos.StorageMediaTypeScm, }, { - Total: 100000000000, - Free: 90000000000, + Total: 100000000000, + Free: 90000000000, + MediaType: daos.StorageMediaTypeNvme, }, }, }, @@ -351,7 +414,7 @@ Target: type unknown, state up - Storage tier 0 (SCM): Total size: 6.0 GB Free: 5.0 GB -- Storage tier 1 (NVMe): +- Storage tier 1 (NVME): Total size: 100 GB Free: 90 GB `, @@ -362,12 +425,14 @@ Target: type unknown, state up State: daos.PoolTargetStateUpIn, Space: []*daos.StorageUsageStats{ { - Total: 6000000000, - Free: 5000000000, + Total: 6000000000, + Free: 5000000000, + MediaType: daos.StorageMediaTypeScm, }, { - Total: 100000000000, - Free: 90000000000, + Total: 100000000000, + Free: 90000000000, + MediaType: daos.StorageMediaTypeNvme, 
}, }, }, @@ -376,7 +441,7 @@ Target: type unknown, state up_in - Storage tier 0 (SCM): Total size: 6.0 GB Free: 5.0 GB -- Storage tier 1 (NVMe): +- Storage tier 1 (NVME): Total size: 100 GB Free: 90 GB `, @@ -387,12 +452,14 @@ Target: type unknown, state up_in State: daos.PoolTargetStateNew, Space: []*daos.StorageUsageStats{ { - Total: 6000000000, - Free: 5000000000, + Total: 6000000000, + Free: 5000000000, + MediaType: daos.StorageMediaTypeScm, }, { - Total: 100000000000, - Free: 90000000000, + Total: 100000000000, + Free: 90000000000, + MediaType: daos.StorageMediaTypeNvme, }, }, }, @@ -401,7 +468,7 @@ Target: type unknown, state new - Storage tier 0 (SCM): Total size: 6.0 GB Free: 5.0 GB -- Storage tier 1 (NVMe): +- Storage tier 1 (NVME): Total size: 100 GB Free: 90 GB `, @@ -412,12 +479,14 @@ Target: type unknown, state new State: daos.PoolTargetStateDrain, Space: []*daos.StorageUsageStats{ { - Total: 6000000000, - Free: 5000000000, + Total: 6000000000, + Free: 5000000000, + MediaType: daos.StorageMediaTypeScm, }, { - Total: 100000000000, - Free: 90000000000, + Total: 100000000000, + Free: 90000000000, + MediaType: daos.StorageMediaTypeNvme, }, }, }, @@ -426,7 +495,35 @@ Target: type unknown, state drain - Storage tier 0 (SCM): Total size: 6.0 GB Free: 5.0 GB -- Storage tier 1 (NVMe): +- Storage tier 1 (NVME): + Total size: 100 GB + Free: 90 GB +`, + }, + "valid: single target (unknown, down_out): MD-on-SSD": { + pqti: &daos.PoolQueryTargetInfo{ + Type: 0, + State: daos.PoolTargetStateDownOut, + Space: []*daos.StorageUsageStats{ + { + Total: 6000000000, + Free: 5000000000, + MediaType: daos.StorageMediaTypeScm, + }, + { + Total: 100000000000, + Free: 90000000000, + MediaType: daos.StorageMediaTypeNvme, + }, + }, + MemFileBytes: 3000000000, + }, + expPrintStr: ` +Target: type unknown, state down_out +- Metadata storage: + Total size: 6.0 GB + Free: 5.0 GB +- Data storage: Total size: 100 GB Free: 90 GB `, diff --git a/src/control/cmd/dmg/pool.go b/src/control/cmd/dmg/pool.go index 1127ab2b6f6..dbb9f50e495 100644 --- a/src/control/cmd/dmg/pool.go +++ b/src/control/cmd/dmg/pool.go @@ -51,7 +51,9 @@ type PoolCmd struct { var ( // Default to 6% SCM:94% NVMe - defaultTierRatios = []float64{0.06, 0.94} + defaultTierRatios = []float64{0.06, 0.94} + errPoolCreateIncompatOpts = errors.New("unsupported option combination, use (--scm-size and " + + "--nvme-size) or (--meta-size and --data-size) or (--size)") ) type tierRatioFlag struct { @@ -176,6 +178,9 @@ type PoolCreateCmd struct { NumSvcReps uint32 `short:"v" long:"nsvc" description:"Number of pool service replicas"` ScmSize ui.ByteSizeFlag `short:"s" long:"scm-size" description:"Per-engine SCM allocation for DAOS pool (manual)"` NVMeSize ui.ByteSizeFlag `short:"n" long:"nvme-size" description:"Per-engine NVMe allocation for DAOS pool (manual)"` + MetaSize ui.ByteSizeFlag `long:"meta-size" description:"Per-engine Metadata-on-SSD allocation for DAOS pool (manual). Only valid in MD-on-SSD mode"` + DataSize ui.ByteSizeFlag `long:"data-size" description:"Per-engine Data-on-SSD allocation for DAOS pool (manual). Only valid in MD-on-SSD mode"` + MemRatio tierRatioFlag `long:"mem-ratio" description:"Percentage of the pool metadata storage size (on SSD) that should be used as the memory file size (on ram-disk). 
Default value is 100% and only valid in MD-on-SSD mode"` RankList ui.RankSetFlag `short:"r" long:"ranks" description:"Storage engine unique identifiers (ranks) for DAOS pool"` Args struct { @@ -183,18 +188,6 @@ type PoolCreateCmd struct { } `positional-args:"yes"` } -func (cmd *PoolCreateCmd) checkSizeArgs() error { - if cmd.Size.IsSet() { - if cmd.ScmSize.IsSet() || cmd.NVMeSize.IsSet() { - return errIncompatFlags("size", "scm-size", "nvme-size") - } - } else if !cmd.ScmSize.IsSet() { - return errors.New("either --size or --scm-size must be set") - } - - return nil -} - func ratio2Percentage(log logging.Logger, scm, nvme float64) (p float64) { p = 100.00 min := storage.MinScmToNVMeRatio * p @@ -212,6 +205,23 @@ func ratio2Percentage(log logging.Logger, scm, nvme float64) (p float64) { return } +// MemRatio can be supplied as two fractions that make up 1 or a single fraction less than 1. +// Supply only the first fraction in request and if not set then use the default. +func (cmd *PoolCreateCmd) setMemRatio(req *control.PoolCreateReq, defVal float32) error { + if cmd.MemRatio.IsSet() { + f, err := ratiosToSingleFraction(cmd.MemRatio.Ratios()) + if err != nil { + return errors.Wrap(err, "md-on-ssd mode pool create unexpected mem-ratio") + } + + req.MemRatio = f + return nil + } + + req.MemRatio = defVal + return nil +} + func (cmd *PoolCreateCmd) storageAutoPercentage(ctx context.Context, req *control.PoolCreateReq) error { if cmd.NumRanks > 0 { return errIncompatFlags("size", "nranks") @@ -224,6 +234,11 @@ func (cmd *PoolCreateCmd) storageAutoPercentage(ctx context.Context, req *contro availFrac := float64(cmd.Size.availRatio) / 100.0 req.TierRatio = []float64{availFrac, availFrac} + // Pass --mem-ratio or zero if unset. + if err := cmd.setMemRatio(req, 0.0); err != nil { + return err + } + return nil } @@ -236,6 +251,11 @@ func (cmd *PoolCreateCmd) storageAutoTotal(req *control.PoolCreateReq) error { req.TierRatio = cmd.TierRatio.Ratios() req.TotalBytes = cmd.Size.Bytes + // Pass --mem-ratio or zero if unset. + if err := cmd.setMemRatio(req, 0.0); err != nil { + return err + } + scmPercentage := ratio2Percentage(cmd.Logger, req.TierRatio[0], req.TierRatio[1]) msg := fmt.Sprintf("Creating DAOS pool with automatic storage allocation: "+ "%s total, %0.2f%% ratio", humanize.Bytes(req.TotalBytes), scmPercentage) @@ -247,12 +267,40 @@ func (cmd *PoolCreateCmd) storageAutoTotal(req *control.PoolCreateReq) error { return nil } +func (cmd *PoolCreateCmd) storageManualMdOnSsd(req *control.PoolCreateReq) error { + metaBytes := cmd.MetaSize.Bytes + dataBytes := cmd.DataSize.Bytes + req.TierBytes = []uint64{metaBytes, dataBytes} + + // Explicitly set mem-ratio non-zero, this will prevent MD-on-SSD syntax being used if the + // mode is not enabled by providing indication of which syntax type was used. 
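setMemRatio above carries only a single fraction in the request. Going by the pool_test.go expectations later in this patch (--mem-ratio 25.5 and 25.5,74.5 both yield MemRatio 0.255, 100 yields 1, three values are rejected), the conversion can be sketched roughly as follows; this is a standalone approximation written against those test cases, not the real ratiosToSingleFraction helper, and the error text is illustrative only.

package main

import (
	"errors"
	"fmt"
	"math"
)

// memRatioFraction models the expected behaviour: the flag's percentages are
// already parsed into fractions, a single fraction is used as-is, two
// fractions must sum to 1 and only the first is kept, anything else fails.
func memRatioFraction(ratios []float64) (float32, error) {
	switch len(ratios) {
	case 1:
		return float32(ratios[0]), nil
	case 2:
		if math.Abs(ratios[0]+ratios[1]-1) > 1e-6 {
			return 0, errors.New("mem-ratio values must sum to 100%")
		}
		return float32(ratios[0]), nil
	default:
		return 0, errors.New("unexpected mem-ratio: expected one or two values")
	}
}

func main() {
	for _, in := range [][]float64{{0.255}, {0.255, 0.745}, {1}, {0.255, 0.255, 0.49}} {
		f, err := memRatioFraction(in)
		fmt.Println(in, f, err)
	}
}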
+ if err := cmd.setMemRatio(req, storage.DefaultMemoryFileRatio); err != nil { + return err + } + + msg := fmt.Sprintf("Creating DAOS pool in MD-on-SSD mode with manual per-engine storage "+ + "allocation: %s metadata, %s data (%0.2f%% storage ratio) and %0.2f%% "+ + "memory-file:meta-blob size ratio", humanize.Bytes(metaBytes), + humanize.Bytes(dataBytes), 100.00*(float64(metaBytes)/float64(dataBytes)), + 100.00*req.MemRatio) + cmd.Info(msg) + + return nil +} + func (cmd *PoolCreateCmd) storageManual(req *control.PoolCreateReq) error { - if cmd.NumRanks > 0 { + switch { + case cmd.NumRanks > 0: return errIncompatFlags("nranks", "scm-size") - } - if cmd.TierRatio.IsSet() { + case cmd.TierRatio.IsSet(): return errIncompatFlags("tier-ratio", "scm-size") + case cmd.MetaSize.IsSet() || cmd.DataSize.IsSet(): + cmd.Tracef("md-on-ssd options detected for pool create: %+v", cmd) + return cmd.storageManualMdOnSsd(req) + case cmd.MemRatio.IsSet(): + return errIncompatFlags("mem-ratio", "scm-size", "nvme-size") + case cmd.NVMeSize.IsSet() && !cmd.ScmSize.IsSet(): + return errors.New("--nvme-size cannot be set without --scm-size") } scmBytes := cmd.ScmSize.Bytes @@ -270,10 +318,6 @@ func (cmd *PoolCreateCmd) storageManual(req *control.PoolCreateReq) error { // Execute is run when PoolCreateCmd subcommand is activated func (cmd *PoolCreateCmd) Execute(args []string) error { - if err := cmd.checkSizeArgs(); err != nil { - return err - } - if cmd.Args.PoolLabel != "" { for _, prop := range cmd.Properties.ToSet { if prop.Name == "label" { @@ -302,6 +346,20 @@ func (cmd *PoolCreateCmd) Execute(args []string) error { } } + // Refuse unsupported input value combinations. + + pmemParams := cmd.ScmSize.IsSet() || cmd.NVMeSize.IsSet() + mdParams := cmd.MetaSize.IsSet() || cmd.DataSize.IsSet() + + switch { + case (pmemParams || mdParams) && cmd.Size.IsSet(): + return errPoolCreateIncompatOpts + case pmemParams && mdParams: + return errPoolCreateIncompatOpts + case !pmemParams && !mdParams && !cmd.Size.IsSet(): + return errPoolCreateIncompatOpts + } + // Validate supported input values and set request fields. 
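The "Refuse unsupported input value combinations" switch above boils down to: exactly one of the three sizing styles (--scm-size/--nvme-size, --meta-size/--data-size, or --size) may be used. A compact standalone restatement of that rule, with plain booleans standing in for the IsSet() calls:

package main

import "fmt"

// sizingStyleValid restates the errPoolCreateIncompatOpts check: the PMem
// flags, the MD-on-SSD flags and --size are mutually exclusive, and at least
// one of the three styles must be supplied.
func sizingStyleValid(pmemParams, mdParams, sizeSet bool) bool {
	styles := 0
	for _, set := range []bool{pmemParams, mdParams, sizeSet} {
		if set {
			styles++
		}
	}
	return styles == 1
}

func main() {
	fmt.Println(sizingStyleValid(true, false, false))  // --scm-size only: accepted
	fmt.Println(sizingStyleValid(false, true, true))   // --meta-size mixed with --size: rejected
	fmt.Println(sizingStyleValid(false, false, false)) // no sizing flags at all: rejected
}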
switch { diff --git a/src/control/cmd/dmg/pool_test.go b/src/control/cmd/dmg/pool_test.go index b1270b0f19f..5d30ec2dfb1 100644 --- a/src/control/cmd/dmg/pool_test.go +++ b/src/control/cmd/dmg/pool_test.go @@ -15,6 +15,7 @@ import ( "strings" "testing" + "github.com/dustin/go-humanize" "github.com/google/go-cmp/cmp" "github.com/google/go-cmp/cmp/cmpopts" "github.com/pkg/errors" @@ -232,7 +233,7 @@ func TestPoolCommands(t *testing.T) { "Create pool with missing size", "pool create label", "", - errors.New("must be set"), + errPoolCreateIncompatOpts, }, { "Create pool with missing label", @@ -244,13 +245,13 @@ func TestPoolCommands(t *testing.T) { "Create pool with incompatible arguments (auto nvme-size)", fmt.Sprintf("pool create label --size %s --nvme-size %s", testSizeStr, testSizeStr), "", - errors.New("may not be mixed"), + errPoolCreateIncompatOpts, }, { "Create pool with incompatible arguments (auto scm-size)", fmt.Sprintf("pool create label --size %s --scm-size %s", testSizeStr, testSizeStr), "", - errors.New("may not be mixed"), + errPoolCreateIncompatOpts, }, { "Create pool with incompatible arguments (% size nranks)", @@ -282,6 +283,24 @@ func TestPoolCommands(t *testing.T) { "", errors.New("may not be mixed"), }, + { + "Create pool with incompatible arguments (auto with meta-size)", + fmt.Sprintf("pool create label --size %s --meta-size 32G", testSizeStr), + "", + errPoolCreateIncompatOpts, + }, + { + "Create pool with incompatible arguments (scm-size with meta-size)", + fmt.Sprintf("pool create label --scm-size %s --meta-size 32G", testSizeStr), + "", + errPoolCreateIncompatOpts, + }, + { + "Create pool with incompatible arguments (scm-size with data-size)", + fmt.Sprintf("pool create label --scm-size %s --data-size 32G", testSizeStr), + "", + errPoolCreateIncompatOpts, + }, { "Create pool with too-large tier-ratio (auto)", fmt.Sprintf("pool create label --size %s --tier-ratio 200", testSizeStr), @@ -361,7 +380,7 @@ func TestPoolCommands(t *testing.T) { "Create pool with incompatible arguments (-n without -s)", fmt.Sprintf("pool create label --nvme-size %s", testSizeStr), "", - errors.New("must be set"), + errors.New("cannot be set without --scm-size"), }, { "Create pool with minimal arguments", @@ -380,6 +399,104 @@ func TestPoolCommands(t *testing.T) { }, " "), nil, }, + { + "Create pool with manual memory file ratio; legacy syntax", + fmt.Sprintf("pool create label --scm-size %s --mem-ratio 0.25", + testSizeStr), + "", + errors.New("may not be mixed"), + }, + { + "Create pool with default memory file ratio; MD-on-SSD syntax", + fmt.Sprintf("pool create label --meta-size %s --data-size 1024G", + testSizeStr), + strings.Join([]string{ + printRequest(t, &control.PoolCreateReq{ + User: eUsr.Username + "@", + UserGroup: eGrp.Name + "@", + Ranks: []ranklist.Rank{}, + TierBytes: []uint64{ + uint64(testSize), + 1024 * humanize.GByte, + }, + MemRatio: 1, + Properties: []*daos.PoolProperty{ + propWithVal("label", "label"), + }, + }), + }, " "), + nil, + }, + { + "Create pool with manual memory file ratio; MD-on-SSD syntax; single value", + fmt.Sprintf("pool create label --meta-size %s --data-size 1024G --mem-ratio 25.5", + testSizeStr), + strings.Join([]string{ + printRequest(t, &control.PoolCreateReq{ + User: eUsr.Username + "@", + UserGroup: eGrp.Name + "@", + Ranks: []ranklist.Rank{}, + TierBytes: []uint64{ + uint64(testSize), + 1024 * humanize.GByte, + }, + MemRatio: 0.255, + Properties: []*daos.PoolProperty{ + propWithVal("label", "label"), + }, + }), + }, " "), + nil, + }, + { + 
"Create pool with manual memory file ratio; MD-on-SSD syntax; both tiers", + fmt.Sprintf("pool create label --meta-size %s --data-size 1024G --mem-ratio 25.5,74.5", + testSizeStr), + strings.Join([]string{ + printRequest(t, &control.PoolCreateReq{ + User: eUsr.Username + "@", + UserGroup: eGrp.Name + "@", + Ranks: []ranklist.Rank{}, + TierBytes: []uint64{ + uint64(testSize), + 1024 * humanize.GByte, + }, + MemRatio: 0.255, + Properties: []*daos.PoolProperty{ + propWithVal("label", "label"), + }, + }), + }, " "), + nil, + }, + { + "Create pool with manual memory file ratio; MD-on-SSD syntax; three tiers", + fmt.Sprintf("pool create label --meta-size %s --data-size 1024G --mem-ratio 25.5,25.5,49", + testSizeStr), + "", + errors.New("unexpected mem-ratio"), + }, + { + "Create pool with manual memory file ratio; MD-on-SSD syntax; 100% tier", + fmt.Sprintf("pool create label --meta-size %s --data-size 1024G --mem-ratio 100", + testSizeStr), + strings.Join([]string{ + printRequest(t, &control.PoolCreateReq{ + User: eUsr.Username + "@", + UserGroup: eGrp.Name + "@", + Ranks: []ranklist.Rank{}, + TierBytes: []uint64{ + uint64(testSize), + 1024 * humanize.GByte, + }, + MemRatio: 1, + Properties: []*daos.PoolProperty{ + propWithVal("label", "label"), + }, + }), + }, " "), + nil, + }, { "Create pool with manual ranks", fmt.Sprintf("pool create label --size %s --ranks 1,2", testSizeStr), diff --git a/src/control/cmd/dmg/pretty/pool.go b/src/control/cmd/dmg/pretty/pool.go index 518502172cf..d28cc2f8061 100644 --- a/src/control/cmd/dmg/pretty/pool.go +++ b/src/control/cmd/dmg/pretty/pool.go @@ -21,17 +21,6 @@ import ( const msgNoPools = "No pools in system" -func getTierNameText(tierIdx int) string { - switch tierIdx { - case int(daos.StorageMediaTypeScm): - return fmt.Sprintf("- Storage tier %d (SCM):", tierIdx) - case int(daos.StorageMediaTypeNvme): - return fmt.Sprintf("- Storage tier %d (NVMe):", tierIdx) - default: - return fmt.Sprintf("- Storage tier %d (unknown):", tierIdx) - } -} - // PrintPoolQueryResponse generates a human-readable representation of the supplied // PoolQueryResp struct and writes it to the supplied io.Writer. func PrintPoolQueryResponse(pqr *control.PoolQueryResp, out io.Writer, opts ...PrintConfigOption) error { @@ -60,6 +49,42 @@ func PrintTierRatio(ratio float64) string { return fmt.Sprintf("%.2f%%", ratio*100) } +func printTierBytesRow(fmtName string, tierBytes uint64, numRanks int) txtfmt.TableRow { + return txtfmt.TableRow{ + fmtName: fmt.Sprintf("%s (%s / rank)", + humanize.Bytes(tierBytes*uint64(numRanks)), + humanize.Bytes(tierBytes)), + } +} + +func getPoolCreateRespRows(mdOnSSD bool, tierBytes []uint64, tierRatios []float64, numRanks int) (title string, rows []txtfmt.TableRow) { + title = "Pool created with " + tierName := "SCM" + if mdOnSSD { + tierName = "Metadata" + } + + for tierIdx, tierRatio := range tierRatios { + if tierIdx > 0 { + title += "," + tierName = "NVMe" + if mdOnSSD { + tierName = "Data" + } + } + + title += PrintTierRatio(tierRatio) + fmtName := fmt.Sprintf("Storage tier %d (%s)", tierIdx, tierName) + if mdOnSSD { + fmtName = tierName + " Storage" + } + rows = append(rows, printTierBytesRow(fmtName, tierBytes[tierIdx], numRanks)) + } + title += " storage tier ratio" + + return title, rows +} + // PrintPoolCreateResponse generates a human-readable representation of the pool create // response and prints it to the supplied io.Writer. 
func PrintPoolCreateResponse(pcr *control.PoolCreateResp, out io.Writer, opts ...PrintConfigOption) error { @@ -87,27 +112,28 @@ func PrintPoolCreateResponse(pcr *control.PoolCreateResp, out io.Writer, opts .. return errors.New("create response had 0 target ranks") } - numRanks := uint64(len(pcr.TgtRanks)) + numRanks := len(pcr.TgtRanks) fmtArgs := make([]txtfmt.TableRow, 0, 6) fmtArgs = append(fmtArgs, txtfmt.TableRow{"UUID": pcr.UUID}) fmtArgs = append(fmtArgs, txtfmt.TableRow{"Service Leader": fmt.Sprintf("%d", pcr.Leader)}) fmtArgs = append(fmtArgs, txtfmt.TableRow{"Service Ranks": pretty.PrintRanks(pcr.SvcReps)}) fmtArgs = append(fmtArgs, txtfmt.TableRow{"Storage Ranks": pretty.PrintRanks(pcr.TgtRanks)}) - fmtArgs = append(fmtArgs, txtfmt.TableRow{"Total Size": humanize.Bytes(totalSize * numRanks)}) + fmtArgs = append(fmtArgs, txtfmt.TableRow{ + "Total Size": humanize.Bytes(totalSize * uint64(numRanks)), + }) - title := "Pool created with " - tierName := "SCM" - for tierIdx, tierRatio := range tierRatios { - if tierIdx > 0 { - title += "," - tierName = "NVMe" - } + mdOnSsdEnabled := pcr.MemFileBytes > 0 - title += PrintTierRatio(tierRatio) - fmtName := fmt.Sprintf("Storage tier %d (%s)", tierIdx, tierName) - fmtArgs = append(fmtArgs, txtfmt.TableRow{fmtName: fmt.Sprintf("%s (%s / rank)", humanize.Bytes(pcr.TierBytes[tierIdx]*numRanks), humanize.Bytes(pcr.TierBytes[tierIdx]))}) + title, tierRows := getPoolCreateRespRows(mdOnSsdEnabled, pcr.TierBytes, tierRatios, + numRanks) + + // Print memory-file to meta-blob ratio for MD-on-SSD. + if mdOnSsdEnabled { + tierRows = append(tierRows, printTierBytesRow("Memory File Size", + pcr.MemFileBytes, numRanks)) } - title += " storage tier ratio" + + fmtArgs = append(fmtArgs, tierRows...) _, err := fmt.Fprintln(out, txtfmt.FormatEntity(title, fmtArgs)) return err diff --git a/src/control/cmd/dmg/pretty/pool_test.go b/src/control/cmd/dmg/pretty/pool_test.go index 720d0bf7e41..bbc880f5b82 100644 --- a/src/control/cmd/dmg/pretty/pool_test.go +++ b/src/control/cmd/dmg/pretty/pool_test.go @@ -22,7 +22,18 @@ import ( "github.com/daos-stack/daos/src/control/lib/ranklist" ) -func TestPretty_PrintPoolQueryTargetResp(t *testing.T) { +func TestPretty_PrintPoolQueryTargetResponse(t *testing.T) { + tier0 := &daos.StorageUsageStats{ + Total: 6000000000, + Free: 5000000000, + MediaType: daos.StorageMediaTypeScm, + } + tier1 := &daos.StorageUsageStats{ + Total: 100000000000, + Free: 90000000000, + MediaType: daos.StorageMediaTypeNvme, + } + for name, tc := range map[string]struct { pqtr *control.PoolQueryTargetResp expPrintStr string @@ -44,58 +55,22 @@ func TestPretty_PrintPoolQueryTargetResp(t *testing.T) { { Type: 0, State: daos.PoolTargetStateDown, - Space: []*daos.StorageUsageStats{ - { - Total: 6000000000, - Free: 5000000000, - }, - { - Total: 100000000000, - Free: 90000000000, - }, - }, + Space: []*daos.StorageUsageStats{tier0, tier1}, }, { Type: 0, State: daos.PoolTargetStateUpIn, - Space: []*daos.StorageUsageStats{ - { - Total: 6000000000, - Free: 5000000000, - }, - { - Total: 100000000000, - Free: 90000000000, - }, - }, + Space: []*daos.StorageUsageStats{tier0, tier1}, }, { Type: 0, State: daos.PoolTargetStateDownOut, - Space: []*daos.StorageUsageStats{ - { - Total: 6000000000, - Free: 5000000000, - }, - { - Total: 100000000000, - Free: 90000000000, - }, - }, + Space: []*daos.StorageUsageStats{tier0, tier1}, }, { Type: 0, State: daos.PoolTargetStateUpIn, - Space: []*daos.StorageUsageStats{ - { - Total: 6000000000, - Free: 5000000000, - }, - { - Total: 
100000000000, - Free: 90000000000, - }, - }, + Space: []*daos.StorageUsageStats{tier0, tier1}, }, }, }, @@ -104,28 +79,28 @@ Target: type unknown, state down - Storage tier 0 (SCM): Total size: 6.0 GB Free: 5.0 GB -- Storage tier 1 (NVMe): +- Storage tier 1 (NVME): Total size: 100 GB Free: 90 GB Target: type unknown, state up_in - Storage tier 0 (SCM): Total size: 6.0 GB Free: 5.0 GB -- Storage tier 1 (NVMe): +- Storage tier 1 (NVME): Total size: 100 GB Free: 90 GB Target: type unknown, state down_out - Storage tier 0 (SCM): Total size: 6.0 GB Free: 5.0 GB -- Storage tier 1 (NVMe): +- Storage tier 1 (NVME): Total size: 100 GB Free: 90 GB Target: type unknown, state up_in - Storage tier 0 (SCM): Total size: 6.0 GB Free: 5.0 GB -- Storage tier 1 (NVMe): +- Storage tier 1 (NVME): Total size: 100 GB Free: 90 GB `, @@ -137,58 +112,22 @@ Target: type unknown, state up_in { Type: 0, State: 42, - Space: []*daos.StorageUsageStats{ - { - Total: 6000000000, - Free: 5000000000, - }, - { - Total: 100000000000, - Free: 90000000000, - }, - }, + Space: []*daos.StorageUsageStats{tier0, tier1}, }, { Type: 0, State: daos.PoolTargetStateUpIn, - Space: []*daos.StorageUsageStats{ - { - Total: 6000000000, - Free: 5000000000, - }, - { - Total: 100000000000, - Free: 90000000000, - }, - }, + Space: []*daos.StorageUsageStats{tier0, tier1}, }, { Type: 0, State: daos.PoolTargetStateDownOut, - Space: []*daos.StorageUsageStats{ - { - Total: 6000000000, - Free: 5000000000, - }, - { - Total: 100000000000, - Free: 90000000000, - }, - }, + Space: []*daos.StorageUsageStats{tier0, tier1}, }, { Type: 0, State: daos.PoolTargetStateUpIn, - Space: []*daos.StorageUsageStats{ - { - Total: 6000000000, - Free: 5000000000, - }, - { - Total: 100000000000, - Free: 90000000000, - }, - }, + Space: []*daos.StorageUsageStats{tier0, tier1}, }, }, }, @@ -197,28 +136,28 @@ Target: type unknown, state invalid - Storage tier 0 (SCM): Total size: 6.0 GB Free: 5.0 GB -- Storage tier 1 (NVMe): +- Storage tier 1 (NVME): Total size: 100 GB Free: 90 GB Target: type unknown, state up_in - Storage tier 0 (SCM): Total size: 6.0 GB Free: 5.0 GB -- Storage tier 1 (NVMe): +- Storage tier 1 (NVME): Total size: 100 GB Free: 90 GB Target: type unknown, state down_out - Storage tier 0 (SCM): Total size: 6.0 GB Free: 5.0 GB -- Storage tier 1 (NVMe): +- Storage tier 1 (NVME): Total size: 100 GB Free: 90 GB Target: type unknown, state up_in - Storage tier 0 (SCM): Total size: 6.0 GB Free: 5.0 GB -- Storage tier 1 (NVMe): +- Storage tier 1 (NVME): Total size: 100 GB Free: 90 GB `, @@ -230,58 +169,22 @@ Target: type unknown, state up_in { Type: 42, State: daos.PoolTargetStateDown, - Space: []*daos.StorageUsageStats{ - { - Total: 6000000000, - Free: 5000000000, - }, - { - Total: 100000000000, - Free: 90000000000, - }, - }, + Space: []*daos.StorageUsageStats{tier0, tier1}, }, { Type: 0, State: daos.PoolTargetStateUpIn, - Space: []*daos.StorageUsageStats{ - { - Total: 6000000000, - Free: 5000000000, - }, - { - Total: 100000000000, - Free: 90000000000, - }, - }, + Space: []*daos.StorageUsageStats{tier0, tier1}, }, { Type: 0, State: daos.PoolTargetStateDownOut, - Space: []*daos.StorageUsageStats{ - { - Total: 6000000000, - Free: 5000000000, - }, - { - Total: 100000000000, - Free: 90000000000, - }, - }, + Space: []*daos.StorageUsageStats{tier0, tier1}, }, { Type: 0, State: daos.PoolTargetStateUpIn, - Space: []*daos.StorageUsageStats{ - { - Total: 6000000000, - Free: 5000000000, - }, - { - Total: 100000000000, - Free: 90000000000, - }, - }, + Space: 
[]*daos.StorageUsageStats{tier0, tier1}, }, }, }, @@ -290,28 +193,28 @@ Target: type invalid, state down - Storage tier 0 (SCM): Total size: 6.0 GB Free: 5.0 GB -- Storage tier 1 (NVMe): +- Storage tier 1 (NVME): Total size: 100 GB Free: 90 GB Target: type unknown, state up_in - Storage tier 0 (SCM): Total size: 6.0 GB Free: 5.0 GB -- Storage tier 1 (NVMe): +- Storage tier 1 (NVME): Total size: 100 GB Free: 90 GB Target: type unknown, state down_out - Storage tier 0 (SCM): Total size: 6.0 GB Free: 5.0 GB -- Storage tier 1 (NVMe): +- Storage tier 1 (NVME): Total size: 100 GB Free: 90 GB Target: type unknown, state up_in - Storage tier 0 (SCM): Total size: 6.0 GB Free: 5.0 GB -- Storage tier 1 (NVMe): +- Storage tier 1 (NVME): Total size: 100 GB Free: 90 GB `, @@ -324,14 +227,7 @@ Target: type unknown, state up_in Type: 0, State: daos.PoolTargetStateDown, Space: []*daos.StorageUsageStats{ - { - Total: 6000000000, - Free: 5000000000, - }, - { - Total: 100000000000, - Free: 90000000000, - }, + tier0, tier1, { Total: 800000000000, Free: 200000000000, @@ -342,14 +238,7 @@ Target: type unknown, state up_in Type: 0, State: daos.PoolTargetStateUpIn, Space: []*daos.StorageUsageStats{ - { - Total: 6000000000, - Free: 5000000000, - }, - { - Total: 100000000000, - Free: 90000000000, - }, + tier0, tier1, { Total: 800000000000, Free: 200000000000, @@ -360,17 +249,11 @@ Target: type unknown, state up_in Type: 0, State: daos.PoolTargetStateDownOut, Space: []*daos.StorageUsageStats{ + tier0, tier1, { - Total: 6000000000, - Free: 5000000000, - }, - { - Total: 100000000000, - Free: 90000000000, - }, - { - Total: 800000000000, - Free: 200000000000, + Total: 800000000000, + Free: 200000000000, + MediaType: daos.StorageMediaType(3), }, }, }, @@ -378,14 +261,7 @@ Target: type unknown, state up_in Type: 0, State: daos.PoolTargetStateUpIn, Space: []*daos.StorageUsageStats{ - { - Total: 6000000000, - Free: 5000000000, - }, - { - Total: 100000000000, - Free: 90000000000, - }, + tier0, tier1, { Total: 800000000000, Free: 200000000000, @@ -399,40 +275,40 @@ Target: type unknown, state down - Storage tier 0 (SCM): Total size: 6.0 GB Free: 5.0 GB -- Storage tier 1 (NVMe): +- Storage tier 1 (NVME): Total size: 100 GB Free: 90 GB -- Storage tier 2 (unknown): +- Storage tier 2 (UNKNOWN): Total size: 800 GB Free: 200 GB Target: type unknown, state up_in - Storage tier 0 (SCM): Total size: 6.0 GB Free: 5.0 GB -- Storage tier 1 (NVMe): +- Storage tier 1 (NVME): Total size: 100 GB Free: 90 GB -- Storage tier 2 (unknown): +- Storage tier 2 (UNKNOWN): Total size: 800 GB Free: 200 GB Target: type unknown, state down_out - Storage tier 0 (SCM): Total size: 6.0 GB Free: 5.0 GB -- Storage tier 1 (NVMe): +- Storage tier 1 (NVME): Total size: 100 GB Free: 90 GB -- Storage tier 2 (unknown): +- Storage tier 2 (UNKNOWN): Total size: 800 GB Free: 200 GB Target: type unknown, state up_in - Storage tier 0 (SCM): Total size: 6.0 GB Free: 5.0 GB -- Storage tier 1 (NVMe): +- Storage tier 1 (NVME): Total size: 100 GB Free: 90 GB -- Storage tier 2 (unknown): +- Storage tier 2 (UNKNOWN): Total size: 800 GB Free: 200 GB `, @@ -489,6 +365,31 @@ Pool created with 5.66%%,94.34%% storage tier ratio Storage tier 0 (SCM) : 2.4 GB (600 MB / rank) Storage tier 1 (NVMe): 40 GB (10 GB / rank) +`, test.MockPoolUUID()), + }, + "basic; md-on-ssd": { + pcr: &control.PoolCreateResp{ + UUID: test.MockUUID(), + SvcReps: mockRanks(0, 1, 2), + TgtRanks: mockRanks(0, 1, 2, 3), + TierBytes: []uint64{ + 600 * humanize.MByte, + 10 * humanize.GByte, + }, + MemFileBytes: 
300 * humanize.MByte, // Non-zero indicates MD-on-SSD. + }, + expPrintStr: fmt.Sprintf(` +Pool created with 5.66%%,94.34%% storage tier ratio +------------------------------------------------- + UUID : %s + Service Leader : 0 + Service Ranks : [0-2] + Storage Ranks : [0-3] + Total Size : 42 GB + Metadata Storage : 2.4 GB (600 MB / rank) + Data Storage : 40 GB (10 GB / rank) + Memory File Size : 1.2 GB (300 MB / rank) + `, test.MockPoolUUID()), }, "no nvme": { @@ -681,6 +582,33 @@ one 6.0 TB Ready 83%% 16%% 0/16 verbose: true, expPrintStr: msgNoPools + "\n", }, + "verbose, two pools": { + resp: &control.ListPoolsResp{ + Pools: []*daos.PoolInfo{ + { + UUID: test.MockPoolUUID(1), + TierStats: exampleTierStats, + TotalTargets: 16, + ActiveTargets: 16, + DisabledTargets: 0, + State: daos.PoolServiceStateReady, + PoolLayoutVer: 1, + UpgradeLayoutVer: 2, + Rebuild: &daos.PoolRebuildStatus{ + State: daos.PoolRebuildStateIdle, + }, + QueryMask: daos.DefaultPoolQueryMask, + }, + }, + }, + verbose: true, + expPrintStr: ` +Label UUID State SvcReps SCM Size SCM Used SCM Imbalance NVME Size NVME Used NVME Imbalance Disabled UpgradeNeeded? Rebuild State +----- ---- ----- ------- -------- -------- ------------- --------- --------- -------------- -------- -------------- ------------- +- 00000001-0001-0001-0001-000000000001 Ready N/A 100 GB 80 GB 16% 6.0 TB 5.0 TB 8% 0/16 1->2 idle + +`, + }, } { t.Run(name, func(t *testing.T) { var bld strings.Builder diff --git a/src/control/cmd/dmg/pretty/storage_nvme.go b/src/control/cmd/dmg/pretty/storage_nvme.go index a65c3c050b9..8920094f9d1 100644 --- a/src/control/cmd/dmg/pretty/storage_nvme.go +++ b/src/control/cmd/dmg/pretty/storage_nvme.go @@ -213,6 +213,22 @@ func printNvmeFormatResults(inCtrlrs storage.NvmeControllers, out io.Writer, opt return nil } +func rolesRankFromSmd(ctrlr *storage.NvmeController) (string, string) { + rolesStr := "NA" + roles := ctrlr.Roles() + if !roles.IsEmpty() { + rolesStr = roles.String() + } + + rankStr := "None" + rank := ctrlr.Rank() + if rank != ranklist.NilRank { + rankStr = rank.String() + } + + return rolesStr, rankStr +} + // PrintNvmeControllers displays controller details in a verbose table. func PrintNvmeControllers(controllers storage.NvmeControllers, out io.Writer, opts ...PrintConfigOption) error { w := txtfmt.NewErrWriter(out) @@ -245,18 +261,7 @@ func PrintNvmeControllers(controllers storage.NvmeControllers, out io.Writer, op row[fwTitle] = ctrlr.FwRev row[socketTitle] = fmt.Sprint(ctrlr.SocketID) row[capacityTitle] = humanize.Bytes(ctrlr.Capacity()) - roles := "NA" - rank := "None" - // Assumes that all SMD devices on a controller have the same roles and rank. 
- if len(ctrlr.SmdDevices) > 0 { - sd := ctrlr.SmdDevices[0] - roles = sd.Roles.String() - if sd.Rank != ranklist.NilRank { - rank = sd.Rank.String() - } - } - row[rolesTitle] = roles - row[rankTitle] = rank + row[rolesTitle], row[rankTitle] = rolesRankFromSmd(ctrlr) table = append(table, row) } @@ -276,7 +281,7 @@ func PrintNvmeHealthMap(hsm control.HostStorageMap, out io.Writer, opts ...Print lineBreak := strings.Repeat("-", len(hosts)) fmt.Fprintf(out, "%s\n%s\n%s\n", lineBreak, hosts, lineBreak) - if len(hss.HostStorage.NvmeDevices) == 0 { + if hss.HostStorage.NvmeDevices.Len() == 0 { fmt.Fprintln(out, " No NVMe devices detected") continue } diff --git a/src/control/cmd/dmg/utils.go b/src/control/cmd/dmg/utils.go index b8b97e43ff8..c29c74a628f 100644 --- a/src/control/cmd/dmg/utils.go +++ b/src/control/cmd/dmg/utils.go @@ -55,3 +55,17 @@ func errIncompatFlags(key string, incompat ...string) error { return errors.Errorf("%s with --%s", base, strings.Join(incompat, " or --")) } + +// Convert a pair of ratios to a single fraction. +func ratiosToSingleFraction(ratios []float64) (float32, error) { + nrRatios := len(ratios) + + // Most validation is already performed by the tierRatioFlag type; this just rejects + // incomplete or over-allocated tier combinations and restricts input to 1 or 2 tiers. + if nrRatios != 2 && ratios[0] < 1 { + return 0, errors.Errorf("want 2 ratio values got %d", nrRatios) + } + + // Precision loss deemed acceptable with conversion from float64 to float32. + return float32(ratios[0]), nil +} diff --git a/src/control/common/proto/ctl/storage_nvme.pb.go b/src/control/common/proto/ctl/storage_nvme.pb.go index cb2dc5099d4..ee0f6a92717 100644 --- a/src/control/common/proto/ctl/storage_nvme.pb.go +++ b/src/control/common/proto/ctl/storage_nvme.pb.go @@ -1,5 +1,5 @@ // -// (C) Copyright 2019-2023 Intel Corporation. +// (C) Copyright 2019-2024 Intel Corporation.
// // SPDX-License-Identifier: BSD-2-Clause-Patent // @@ -95,12 +95,13 @@ type ScanNvmeReq struct { sizeCache protoimpl.SizeCache unknownFields protoimpl.UnknownFields - Health bool `protobuf:"varint,1,opt,name=Health,proto3" json:"Health,omitempty"` // Retrieve NVMe device health statistics - Meta bool `protobuf:"varint,2,opt,name=Meta,proto3" json:"Meta,omitempty"` // Retrieve metadata relating to NVMe device - Basic bool `protobuf:"varint,3,opt,name=Basic,proto3" json:"Basic,omitempty"` // Strip NVMe device details to only basic - MetaSize uint64 `protobuf:"varint,4,opt,name=MetaSize,proto3" json:"MetaSize,omitempty"` // Size of the metadata blob - RdbSize uint64 `protobuf:"varint,5,opt,name=RdbSize,proto3" json:"RdbSize,omitempty"` // Size of the RDB blob - LinkStats bool `protobuf:"varint,6,opt,name=LinkStats,proto3" json:"LinkStats,omitempty"` // Populate PCIe link info in health statistics + Health bool `protobuf:"varint,1,opt,name=Health,proto3" json:"Health,omitempty"` // Retrieve NVMe device health statistics + Meta bool `protobuf:"varint,2,opt,name=Meta,proto3" json:"Meta,omitempty"` // Retrieve metadata relating to NVMe device + Basic bool `protobuf:"varint,3,opt,name=Basic,proto3" json:"Basic,omitempty"` // Strip NVMe device details to only basic + MetaSize uint64 `protobuf:"varint,4,opt,name=MetaSize,proto3" json:"MetaSize,omitempty"` // Size of the metadata blob + RdbSize uint64 `protobuf:"varint,5,opt,name=RdbSize,proto3" json:"RdbSize,omitempty"` // Size of the RDB blob + MemRatio float32 `protobuf:"fixed32,6,opt,name=MemRatio,proto3" json:"MemRatio,omitempty"` // Ratio of VOS-file:meta-blob sizes + LinkStats bool `protobuf:"varint,7,opt,name=LinkStats,proto3" json:"LinkStats,omitempty"` // Populate PCIe link info in health statistics } func (x *ScanNvmeReq) Reset() { @@ -170,6 +171,13 @@ func (x *ScanNvmeReq) GetRdbSize() uint64 { return 0 } +func (x *ScanNvmeReq) GetMemRatio() float32 { + if x != nil { + return x.MemRatio + } + return 0 +} + func (x *ScanNvmeReq) GetLinkStats() bool { if x != nil { return x.LinkStats @@ -284,7 +292,7 @@ var file_ctl_storage_nvme_proto_rawDesc = []byte{ 0x32, 0x12, 0x2e, 0x63, 0x74, 0x6c, 0x2e, 0x52, 0x65, 0x73, 0x70, 0x6f, 0x6e, 0x73, 0x65, 0x53, 0x74, 0x61, 0x74, 0x65, 0x52, 0x05, 0x73, 0x74, 0x61, 0x74, 0x65, 0x12, 0x1b, 0x0a, 0x09, 0x72, 0x6f, 0x6c, 0x65, 0x5f, 0x62, 0x69, 0x74, 0x73, 0x18, 0x03, 0x20, 0x01, 0x28, 0x0d, 0x52, 0x08, - 0x72, 0x6f, 0x6c, 0x65, 0x42, 0x69, 0x74, 0x73, 0x22, 0xa3, 0x01, 0x0a, 0x0b, 0x53, 0x63, 0x61, + 0x72, 0x6f, 0x6c, 0x65, 0x42, 0x69, 0x74, 0x73, 0x22, 0xbf, 0x01, 0x0a, 0x0b, 0x53, 0x63, 0x61, 0x6e, 0x4e, 0x76, 0x6d, 0x65, 0x52, 0x65, 0x71, 0x12, 0x16, 0x0a, 0x06, 0x48, 0x65, 0x61, 0x6c, 0x74, 0x68, 0x18, 0x01, 0x20, 0x01, 0x28, 0x08, 0x52, 0x06, 0x48, 0x65, 0x61, 0x6c, 0x74, 0x68, 0x12, 0x12, 0x0a, 0x04, 0x4d, 0x65, 0x74, 0x61, 0x18, 0x02, 0x20, 0x01, 0x28, 0x08, 0x52, 0x04, @@ -293,20 +301,22 @@ var file_ctl_storage_nvme_proto_rawDesc = []byte{ 0x74, 0x61, 0x53, 0x69, 0x7a, 0x65, 0x18, 0x04, 0x20, 0x01, 0x28, 0x04, 0x52, 0x08, 0x4d, 0x65, 0x74, 0x61, 0x53, 0x69, 0x7a, 0x65, 0x12, 0x18, 0x0a, 0x07, 0x52, 0x64, 0x62, 0x53, 0x69, 0x7a, 0x65, 0x18, 0x05, 0x20, 0x01, 0x28, 0x04, 0x52, 0x07, 0x52, 0x64, 0x62, 0x53, 0x69, 0x7a, 0x65, - 0x12, 0x1c, 0x0a, 0x09, 0x4c, 0x69, 0x6e, 0x6b, 0x53, 0x74, 0x61, 0x74, 0x73, 0x18, 0x06, 0x20, - 0x01, 0x28, 0x08, 0x52, 0x09, 0x4c, 0x69, 0x6e, 0x6b, 0x53, 0x74, 0x61, 0x74, 0x73, 0x22, 0x65, - 0x0a, 0x0c, 0x53, 0x63, 0x61, 0x6e, 0x4e, 0x76, 0x6d, 0x65, 0x52, 0x65, 0x73, 
0x70, 0x12, 0x2b, - 0x0a, 0x06, 0x63, 0x74, 0x72, 0x6c, 0x72, 0x73, 0x18, 0x01, 0x20, 0x03, 0x28, 0x0b, 0x32, 0x13, - 0x2e, 0x63, 0x74, 0x6c, 0x2e, 0x4e, 0x76, 0x6d, 0x65, 0x43, 0x6f, 0x6e, 0x74, 0x72, 0x6f, 0x6c, - 0x6c, 0x65, 0x72, 0x52, 0x06, 0x63, 0x74, 0x72, 0x6c, 0x72, 0x73, 0x12, 0x28, 0x0a, 0x05, 0x73, - 0x74, 0x61, 0x74, 0x65, 0x18, 0x02, 0x20, 0x01, 0x28, 0x0b, 0x32, 0x12, 0x2e, 0x63, 0x74, 0x6c, - 0x2e, 0x52, 0x65, 0x73, 0x70, 0x6f, 0x6e, 0x73, 0x65, 0x53, 0x74, 0x61, 0x74, 0x65, 0x52, 0x05, - 0x73, 0x74, 0x61, 0x74, 0x65, 0x22, 0x0f, 0x0a, 0x0d, 0x46, 0x6f, 0x72, 0x6d, 0x61, 0x74, 0x4e, - 0x76, 0x6d, 0x65, 0x52, 0x65, 0x71, 0x42, 0x39, 0x5a, 0x37, 0x67, 0x69, 0x74, 0x68, 0x75, 0x62, - 0x2e, 0x63, 0x6f, 0x6d, 0x2f, 0x64, 0x61, 0x6f, 0x73, 0x2d, 0x73, 0x74, 0x61, 0x63, 0x6b, 0x2f, - 0x64, 0x61, 0x6f, 0x73, 0x2f, 0x73, 0x72, 0x63, 0x2f, 0x63, 0x6f, 0x6e, 0x74, 0x72, 0x6f, 0x6c, - 0x2f, 0x63, 0x6f, 0x6d, 0x6d, 0x6f, 0x6e, 0x2f, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x2f, 0x63, 0x74, - 0x6c, 0x62, 0x06, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x33, + 0x12, 0x1a, 0x0a, 0x08, 0x4d, 0x65, 0x6d, 0x52, 0x61, 0x74, 0x69, 0x6f, 0x18, 0x06, 0x20, 0x01, + 0x28, 0x02, 0x52, 0x08, 0x4d, 0x65, 0x6d, 0x52, 0x61, 0x74, 0x69, 0x6f, 0x12, 0x1c, 0x0a, 0x09, + 0x4c, 0x69, 0x6e, 0x6b, 0x53, 0x74, 0x61, 0x74, 0x73, 0x18, 0x07, 0x20, 0x01, 0x28, 0x08, 0x52, + 0x09, 0x4c, 0x69, 0x6e, 0x6b, 0x53, 0x74, 0x61, 0x74, 0x73, 0x22, 0x65, 0x0a, 0x0c, 0x53, 0x63, + 0x61, 0x6e, 0x4e, 0x76, 0x6d, 0x65, 0x52, 0x65, 0x73, 0x70, 0x12, 0x2b, 0x0a, 0x06, 0x63, 0x74, + 0x72, 0x6c, 0x72, 0x73, 0x18, 0x01, 0x20, 0x03, 0x28, 0x0b, 0x32, 0x13, 0x2e, 0x63, 0x74, 0x6c, + 0x2e, 0x4e, 0x76, 0x6d, 0x65, 0x43, 0x6f, 0x6e, 0x74, 0x72, 0x6f, 0x6c, 0x6c, 0x65, 0x72, 0x52, + 0x06, 0x63, 0x74, 0x72, 0x6c, 0x72, 0x73, 0x12, 0x28, 0x0a, 0x05, 0x73, 0x74, 0x61, 0x74, 0x65, + 0x18, 0x02, 0x20, 0x01, 0x28, 0x0b, 0x32, 0x12, 0x2e, 0x63, 0x74, 0x6c, 0x2e, 0x52, 0x65, 0x73, + 0x70, 0x6f, 0x6e, 0x73, 0x65, 0x53, 0x74, 0x61, 0x74, 0x65, 0x52, 0x05, 0x73, 0x74, 0x61, 0x74, + 0x65, 0x22, 0x0f, 0x0a, 0x0d, 0x46, 0x6f, 0x72, 0x6d, 0x61, 0x74, 0x4e, 0x76, 0x6d, 0x65, 0x52, + 0x65, 0x71, 0x42, 0x39, 0x5a, 0x37, 0x67, 0x69, 0x74, 0x68, 0x75, 0x62, 0x2e, 0x63, 0x6f, 0x6d, + 0x2f, 0x64, 0x61, 0x6f, 0x73, 0x2d, 0x73, 0x74, 0x61, 0x63, 0x6b, 0x2f, 0x64, 0x61, 0x6f, 0x73, + 0x2f, 0x73, 0x72, 0x63, 0x2f, 0x63, 0x6f, 0x6e, 0x74, 0x72, 0x6f, 0x6c, 0x2f, 0x63, 0x6f, 0x6d, + 0x6d, 0x6f, 0x6e, 0x2f, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x2f, 0x63, 0x74, 0x6c, 0x62, 0x06, 0x70, + 0x72, 0x6f, 0x74, 0x6f, 0x33, } var ( diff --git a/src/control/common/proto/logging.go b/src/control/common/proto/logging.go index 033b235669d..5de759ce865 100644 --- a/src/control/common/proto/logging.go +++ b/src/control/common/proto/logging.go @@ -10,6 +10,7 @@ import ( "fmt" "strings" + "github.com/dustin/go-humanize" "google.golang.org/protobuf/proto" grpcpb "github.com/Jille/raft-grpc-transport/proto" @@ -96,6 +97,7 @@ func Debug(msg proto.Message) string { fmt.Fprintf(&bld, "(%.02f%%) ", m.TierRatio[i]) } } + fmt.Fprintf(&bld, "mem-ratio: %.02f ", m.MemRatio) case *mgmtpb.PoolCreateResp: fmt.Fprintf(&bld, "%T svc_ldr:%d ", m, m.SvcLdr) ranks := &ranklist.RankSet{} @@ -112,6 +114,7 @@ func Debug(msg proto.Message) string { for i, b := range m.TierBytes { fmt.Fprintf(&bld, "%d:%d ", i, b) } + fmt.Fprintf(&bld, "meta-file-size:%s", humanize.Bytes(m.MemFileBytes)) case *mgmtpb.PoolEvictReq: fmt.Fprintf(&bld, "%T pool:%s", m, m.Id) if len(m.Handles) > 0 { diff --git a/src/control/common/proto/mgmt/pool.pb.go 
b/src/control/common/proto/mgmt/pool.pb.go index d514bfa6c43..4c1103520d1 100644 --- a/src/control/common/proto/mgmt/pool.pb.go +++ b/src/control/common/proto/mgmt/pool.pb.go @@ -315,6 +315,7 @@ type PoolCreateReq struct { NumRanks uint32 `protobuf:"varint,11,opt,name=num_ranks,json=numRanks,proto3" json:"num_ranks,omitempty"` // Number of target ranks to use Ranks []uint32 `protobuf:"varint,12,rep,packed,name=ranks,proto3" json:"ranks,omitempty"` // target ranks TierBytes []uint64 `protobuf:"varint,13,rep,packed,name=tier_bytes,json=tierBytes,proto3" json:"tier_bytes,omitempty"` // Size in bytes of storage tier + MemRatio float32 `protobuf:"fixed32,14,opt,name=mem_ratio,json=memRatio,proto3" json:"mem_ratio,omitempty"` // Fraction of meta-blob-sz to use as mem-file-sz } func (x *PoolCreateReq) Reset() { @@ -440,17 +441,25 @@ func (x *PoolCreateReq) GetTierBytes() []uint64 { return nil } +func (x *PoolCreateReq) GetMemRatio() float32 { + if x != nil { + return x.MemRatio + } + return 0 +} + // PoolCreateResp returns created pool uuid and ranks. type PoolCreateResp struct { state protoimpl.MessageState sizeCache protoimpl.SizeCache unknownFields protoimpl.UnknownFields - Status int32 `protobuf:"varint,1,opt,name=status,proto3" json:"status,omitempty"` // DAOS error code - SvcLdr uint32 `protobuf:"varint,2,opt,name=svc_ldr,json=svcLdr,proto3" json:"svc_ldr,omitempty"` // Current service leader rank - SvcReps []uint32 `protobuf:"varint,3,rep,packed,name=svc_reps,json=svcReps,proto3" json:"svc_reps,omitempty"` // pool service replica ranks - TgtRanks []uint32 `protobuf:"varint,4,rep,packed,name=tgt_ranks,json=tgtRanks,proto3" json:"tgt_ranks,omitempty"` // pool target ranks - TierBytes []uint64 `protobuf:"varint,5,rep,packed,name=tier_bytes,json=tierBytes,proto3" json:"tier_bytes,omitempty"` // storage tiers allocated to pool + Status int32 `protobuf:"varint,1,opt,name=status,proto3" json:"status,omitempty"` // DAOS error code + SvcLdr uint32 `protobuf:"varint,2,opt,name=svc_ldr,json=svcLdr,proto3" json:"svc_ldr,omitempty"` // Current service leader rank + SvcReps []uint32 `protobuf:"varint,3,rep,packed,name=svc_reps,json=svcReps,proto3" json:"svc_reps,omitempty"` // pool service replica ranks + TgtRanks []uint32 `protobuf:"varint,4,rep,packed,name=tgt_ranks,json=tgtRanks,proto3" json:"tgt_ranks,omitempty"` // pool target ranks + TierBytes []uint64 `protobuf:"varint,5,rep,packed,name=tier_bytes,json=tierBytes,proto3" json:"tier_bytes,omitempty"` // per-rank storage tier sizes allocated in pool + MemFileBytes uint64 `protobuf:"varint,6,opt,name=mem_file_bytes,json=memFileBytes,proto3" json:"mem_file_bytes,omitempty"` // per-rank accumulated value of memory file sizes } func (x *PoolCreateResp) Reset() { @@ -520,6 +529,13 @@ func (x *PoolCreateResp) GetTierBytes() []uint64 { return nil } +func (x *PoolCreateResp) GetMemFileBytes() uint64 { + if x != nil { + return x.MemFileBytes + } + return 0 +} + // PoolDestroyReq supplies pool identifier and force flag. 
type PoolDestroyReq struct { state protoimpl.MessageState @@ -1150,8 +1166,9 @@ type PoolExtendResp struct { sizeCache protoimpl.SizeCache unknownFields protoimpl.UnknownFields - Status int32 `protobuf:"varint,1,opt,name=status,proto3" json:"status,omitempty"` // DAOS error code - TierBytes []uint64 `protobuf:"varint,2,rep,packed,name=tier_bytes,json=tierBytes,proto3" json:"tier_bytes,omitempty"` // storage tiers allocated to pool + Status int32 `protobuf:"varint,1,opt,name=status,proto3" json:"status,omitempty"` // DAOS error code + TierBytes []uint64 `protobuf:"varint,2,rep,packed,name=tier_bytes,json=tierBytes,proto3" json:"tier_bytes,omitempty"` // storage tiers allocated to pool + MetaBlobBytes uint32 `protobuf:"varint,3,opt,name=meta_blob_bytes,json=metaBlobBytes,proto3" json:"meta_blob_bytes,omitempty"` // Size in bytes of metadata blob on SSD } func (x *PoolExtendResp) Reset() { @@ -1200,6 +1217,13 @@ func (x *PoolExtendResp) GetTierBytes() []uint64 { return nil } +func (x *PoolExtendResp) GetMetaBlobBytes() uint32 { + if x != nil { + return x.MetaBlobBytes + } + return 0 +} + // PoolReintegrateReq supplies pool identifier, rank, and target_idxs. type PoolReintegrateReq struct { state protoimpl.MessageState @@ -1826,6 +1850,7 @@ type PoolQueryResp struct { SvcLdr uint32 `protobuf:"varint,18,opt,name=svc_ldr,json=svcLdr,proto3" json:"svc_ldr,omitempty"` // current raft leader (2.6+) SvcReps []uint32 `protobuf:"varint,19,rep,packed,name=svc_reps,json=svcReps,proto3" json:"svc_reps,omitempty"` // service replica ranks QueryMask uint64 `protobuf:"varint,20,opt,name=query_mask,json=queryMask,proto3" json:"query_mask,omitempty"` // Bitmask of pool query options used + MemFileBytes uint64 `protobuf:"varint,21,opt,name=mem_file_bytes,json=memFileBytes,proto3" json:"mem_file_bytes,omitempty"` // per-pool accumulated value of memory file sizes } func (x *PoolQueryResp) Reset() { @@ -1993,6 +2018,13 @@ func (x *PoolQueryResp) GetQueryMask() uint64 { return 0 } +func (x *PoolQueryResp) GetMemFileBytes() uint64 { + if x != nil { + return x.MemFileBytes + } + return 0 +} + type PoolProperty struct { state protoimpl.MessageState sizeCache protoimpl.SizeCache @@ -2597,7 +2629,8 @@ type PoolQueryTargetInfo struct { Type PoolQueryTargetInfo_TargetType `protobuf:"varint,1,opt,name=type,proto3,enum=mgmt.PoolQueryTargetInfo_TargetType" json:"type,omitempty"` // Target type jsee enum daos_target_type_t State PoolQueryTargetInfo_TargetState `protobuf:"varint,2,opt,name=state,proto3,enum=mgmt.PoolQueryTargetInfo_TargetState" json:"state,omitempty"` // target state see enum daos_target_state_t // TODO: target performance data - Space []*StorageTargetUsage `protobuf:"bytes,3,rep,name=space,proto3" json:"space,omitempty"` // this target's usage per storage tier + Space []*StorageTargetUsage `protobuf:"bytes,3,rep,name=space,proto3" json:"space,omitempty"` // this target's usage per storage tier + MemFileBytes uint64 `protobuf:"varint,4,opt,name=mem_file_bytes,json=memFileBytes,proto3" json:"mem_file_bytes,omitempty"` // per-target value of memory file size } func (x *PoolQueryTargetInfo) Reset() { @@ -2653,6 +2686,13 @@ func (x *PoolQueryTargetInfo) GetSpace() []*StorageTargetUsage { return nil } +func (x *PoolQueryTargetInfo) GetMemFileBytes() uint64 { + if x != nil { + return x.MemFileBytes + } + return 0 +} + // PoolQueryTargetResp represents a pool target query response type PoolQueryTargetResp struct { state protoimpl.MessageState @@ -2839,7 +2879,7 @@ var File_mgmt_pool_proto 
protoreflect.FileDescriptor var file_mgmt_pool_proto_rawDesc = []byte{ 0x0a, 0x0f, 0x6d, 0x67, 0x6d, 0x74, 0x2f, 0x70, 0x6f, 0x6f, 0x6c, 0x2e, 0x70, 0x72, 0x6f, 0x74, - 0x6f, 0x12, 0x04, 0x6d, 0x67, 0x6d, 0x74, 0x22, 0x87, 0x03, 0x0a, 0x0d, 0x50, 0x6f, 0x6f, 0x6c, + 0x6f, 0x12, 0x04, 0x6d, 0x67, 0x6d, 0x74, 0x22, 0xa4, 0x03, 0x0a, 0x0d, 0x50, 0x6f, 0x6f, 0x6c, 0x43, 0x72, 0x65, 0x61, 0x74, 0x65, 0x52, 0x65, 0x71, 0x12, 0x12, 0x0a, 0x04, 0x75, 0x75, 0x69, 0x64, 0x18, 0x01, 0x20, 0x01, 0x28, 0x09, 0x52, 0x04, 0x75, 0x75, 0x69, 0x64, 0x12, 0x10, 0x0a, 0x03, 0x73, 0x79, 0x73, 0x18, 0x02, 0x20, 0x01, 0x28, 0x09, 0x52, 0x03, 0x73, 0x79, 0x73, 0x12, @@ -2864,217 +2904,215 @@ var file_mgmt_pool_proto_rawDesc = []byte{ 0x72, 0x61, 0x6e, 0x6b, 0x73, 0x18, 0x0c, 0x20, 0x03, 0x28, 0x0d, 0x52, 0x05, 0x72, 0x61, 0x6e, 0x6b, 0x73, 0x12, 0x1d, 0x0a, 0x0a, 0x74, 0x69, 0x65, 0x72, 0x5f, 0x62, 0x79, 0x74, 0x65, 0x73, 0x18, 0x0d, 0x20, 0x03, 0x28, 0x04, 0x52, 0x09, 0x74, 0x69, 0x65, 0x72, 0x42, 0x79, 0x74, 0x65, - 0x73, 0x22, 0x98, 0x01, 0x0a, 0x0e, 0x50, 0x6f, 0x6f, 0x6c, 0x43, 0x72, 0x65, 0x61, 0x74, 0x65, - 0x52, 0x65, 0x73, 0x70, 0x12, 0x16, 0x0a, 0x06, 0x73, 0x74, 0x61, 0x74, 0x75, 0x73, 0x18, 0x01, - 0x20, 0x01, 0x28, 0x05, 0x52, 0x06, 0x73, 0x74, 0x61, 0x74, 0x75, 0x73, 0x12, 0x17, 0x0a, 0x07, - 0x73, 0x76, 0x63, 0x5f, 0x6c, 0x64, 0x72, 0x18, 0x02, 0x20, 0x01, 0x28, 0x0d, 0x52, 0x06, 0x73, - 0x76, 0x63, 0x4c, 0x64, 0x72, 0x12, 0x19, 0x0a, 0x08, 0x73, 0x76, 0x63, 0x5f, 0x72, 0x65, 0x70, - 0x73, 0x18, 0x03, 0x20, 0x03, 0x28, 0x0d, 0x52, 0x07, 0x73, 0x76, 0x63, 0x52, 0x65, 0x70, 0x73, - 0x12, 0x1b, 0x0a, 0x09, 0x74, 0x67, 0x74, 0x5f, 0x72, 0x61, 0x6e, 0x6b, 0x73, 0x18, 0x04, 0x20, - 0x03, 0x28, 0x0d, 0x52, 0x08, 0x74, 0x67, 0x74, 0x52, 0x61, 0x6e, 0x6b, 0x73, 0x12, 0x1d, 0x0a, - 0x0a, 0x74, 0x69, 0x65, 0x72, 0x5f, 0x62, 0x79, 0x74, 0x65, 0x73, 0x18, 0x05, 0x20, 0x03, 0x28, - 0x04, 0x52, 0x09, 0x74, 0x69, 0x65, 0x72, 0x42, 0x79, 0x74, 0x65, 0x73, 0x22, 0x83, 0x01, 0x0a, - 0x0e, 0x50, 0x6f, 0x6f, 0x6c, 0x44, 0x65, 0x73, 0x74, 0x72, 0x6f, 0x79, 0x52, 0x65, 0x71, 0x12, - 0x10, 0x0a, 0x03, 0x73, 0x79, 0x73, 0x18, 0x01, 0x20, 0x01, 0x28, 0x09, 0x52, 0x03, 0x73, 0x79, - 0x73, 0x12, 0x0e, 0x0a, 0x02, 0x69, 0x64, 0x18, 0x02, 0x20, 0x01, 0x28, 0x09, 0x52, 0x02, 0x69, - 0x64, 0x12, 0x14, 0x0a, 0x05, 0x66, 0x6f, 0x72, 0x63, 0x65, 0x18, 0x03, 0x20, 0x01, 0x28, 0x08, - 0x52, 0x05, 0x66, 0x6f, 0x72, 0x63, 0x65, 0x12, 0x1b, 0x0a, 0x09, 0x73, 0x76, 0x63, 0x5f, 0x72, - 0x61, 0x6e, 0x6b, 0x73, 0x18, 0x04, 0x20, 0x03, 0x28, 0x0d, 0x52, 0x08, 0x73, 0x76, 0x63, 0x52, - 0x61, 0x6e, 0x6b, 0x73, 0x12, 0x1c, 0x0a, 0x09, 0x72, 0x65, 0x63, 0x75, 0x72, 0x73, 0x69, 0x76, - 0x65, 0x18, 0x05, 0x20, 0x01, 0x28, 0x08, 0x52, 0x09, 0x72, 0x65, 0x63, 0x75, 0x72, 0x73, 0x69, - 0x76, 0x65, 0x22, 0x29, 0x0a, 0x0f, 0x50, 0x6f, 0x6f, 0x6c, 0x44, 0x65, 0x73, 0x74, 0x72, 0x6f, - 0x79, 0x52, 0x65, 0x73, 0x70, 0x12, 0x16, 0x0a, 0x06, 0x73, 0x74, 0x61, 0x74, 0x75, 0x73, 0x18, - 0x01, 0x20, 0x01, 0x28, 0x05, 0x52, 0x06, 0x73, 0x74, 0x61, 0x74, 0x75, 0x73, 0x22, 0xc0, 0x01, - 0x0a, 0x0c, 0x50, 0x6f, 0x6f, 0x6c, 0x45, 0x76, 0x69, 0x63, 0x74, 0x52, 0x65, 0x71, 0x12, 0x10, - 0x0a, 0x03, 0x73, 0x79, 0x73, 0x18, 0x01, 0x20, 0x01, 0x28, 0x09, 0x52, 0x03, 0x73, 0x79, 0x73, - 0x12, 0x0e, 0x0a, 0x02, 0x69, 0x64, 0x18, 0x02, 0x20, 0x01, 0x28, 0x09, 0x52, 0x02, 0x69, 0x64, - 0x12, 0x1b, 0x0a, 0x09, 0x73, 0x76, 0x63, 0x5f, 0x72, 0x61, 0x6e, 0x6b, 0x73, 0x18, 0x03, 0x20, - 0x03, 0x28, 0x0d, 0x52, 0x08, 0x73, 0x76, 0x63, 0x52, 0x61, 0x6e, 0x6b, 0x73, 0x12, 0x18, 
0x0a, - 0x07, 0x68, 0x61, 0x6e, 0x64, 0x6c, 0x65, 0x73, 0x18, 0x04, 0x20, 0x03, 0x28, 0x09, 0x52, 0x07, - 0x68, 0x61, 0x6e, 0x64, 0x6c, 0x65, 0x73, 0x12, 0x18, 0x0a, 0x07, 0x64, 0x65, 0x73, 0x74, 0x72, - 0x6f, 0x79, 0x18, 0x05, 0x20, 0x01, 0x28, 0x08, 0x52, 0x07, 0x64, 0x65, 0x73, 0x74, 0x72, 0x6f, - 0x79, 0x12, 0x23, 0x0a, 0x0d, 0x66, 0x6f, 0x72, 0x63, 0x65, 0x5f, 0x64, 0x65, 0x73, 0x74, 0x72, - 0x6f, 0x79, 0x18, 0x06, 0x20, 0x01, 0x28, 0x08, 0x52, 0x0c, 0x66, 0x6f, 0x72, 0x63, 0x65, 0x44, - 0x65, 0x73, 0x74, 0x72, 0x6f, 0x79, 0x12, 0x18, 0x0a, 0x07, 0x6d, 0x61, 0x63, 0x68, 0x69, 0x6e, - 0x65, 0x18, 0x07, 0x20, 0x01, 0x28, 0x09, 0x52, 0x07, 0x6d, 0x61, 0x63, 0x68, 0x69, 0x6e, 0x65, - 0x22, 0x3d, 0x0a, 0x0d, 0x50, 0x6f, 0x6f, 0x6c, 0x45, 0x76, 0x69, 0x63, 0x74, 0x52, 0x65, 0x73, + 0x73, 0x12, 0x1b, 0x0a, 0x09, 0x6d, 0x65, 0x6d, 0x5f, 0x72, 0x61, 0x74, 0x69, 0x6f, 0x18, 0x0e, + 0x20, 0x01, 0x28, 0x02, 0x52, 0x08, 0x6d, 0x65, 0x6d, 0x52, 0x61, 0x74, 0x69, 0x6f, 0x22, 0xbe, + 0x01, 0x0a, 0x0e, 0x50, 0x6f, 0x6f, 0x6c, 0x43, 0x72, 0x65, 0x61, 0x74, 0x65, 0x52, 0x65, 0x73, 0x70, 0x12, 0x16, 0x0a, 0x06, 0x73, 0x74, 0x61, 0x74, 0x75, 0x73, 0x18, 0x01, 0x20, 0x01, 0x28, - 0x05, 0x52, 0x06, 0x73, 0x74, 0x61, 0x74, 0x75, 0x73, 0x12, 0x14, 0x0a, 0x05, 0x63, 0x6f, 0x75, - 0x6e, 0x74, 0x18, 0x02, 0x20, 0x01, 0x28, 0x05, 0x52, 0x05, 0x63, 0x6f, 0x75, 0x6e, 0x74, 0x22, - 0x82, 0x01, 0x0a, 0x0e, 0x50, 0x6f, 0x6f, 0x6c, 0x45, 0x78, 0x63, 0x6c, 0x75, 0x64, 0x65, 0x52, + 0x05, 0x52, 0x06, 0x73, 0x74, 0x61, 0x74, 0x75, 0x73, 0x12, 0x17, 0x0a, 0x07, 0x73, 0x76, 0x63, + 0x5f, 0x6c, 0x64, 0x72, 0x18, 0x02, 0x20, 0x01, 0x28, 0x0d, 0x52, 0x06, 0x73, 0x76, 0x63, 0x4c, + 0x64, 0x72, 0x12, 0x19, 0x0a, 0x08, 0x73, 0x76, 0x63, 0x5f, 0x72, 0x65, 0x70, 0x73, 0x18, 0x03, + 0x20, 0x03, 0x28, 0x0d, 0x52, 0x07, 0x73, 0x76, 0x63, 0x52, 0x65, 0x70, 0x73, 0x12, 0x1b, 0x0a, + 0x09, 0x74, 0x67, 0x74, 0x5f, 0x72, 0x61, 0x6e, 0x6b, 0x73, 0x18, 0x04, 0x20, 0x03, 0x28, 0x0d, + 0x52, 0x08, 0x74, 0x67, 0x74, 0x52, 0x61, 0x6e, 0x6b, 0x73, 0x12, 0x1d, 0x0a, 0x0a, 0x74, 0x69, + 0x65, 0x72, 0x5f, 0x62, 0x79, 0x74, 0x65, 0x73, 0x18, 0x05, 0x20, 0x03, 0x28, 0x04, 0x52, 0x09, + 0x74, 0x69, 0x65, 0x72, 0x42, 0x79, 0x74, 0x65, 0x73, 0x12, 0x24, 0x0a, 0x0e, 0x6d, 0x65, 0x6d, + 0x5f, 0x66, 0x69, 0x6c, 0x65, 0x5f, 0x62, 0x79, 0x74, 0x65, 0x73, 0x18, 0x06, 0x20, 0x01, 0x28, + 0x04, 0x52, 0x0c, 0x6d, 0x65, 0x6d, 0x46, 0x69, 0x6c, 0x65, 0x42, 0x79, 0x74, 0x65, 0x73, 0x22, + 0x83, 0x01, 0x0a, 0x0e, 0x50, 0x6f, 0x6f, 0x6c, 0x44, 0x65, 0x73, 0x74, 0x72, 0x6f, 0x79, 0x52, 0x65, 0x71, 0x12, 0x10, 0x0a, 0x03, 0x73, 0x79, 0x73, 0x18, 0x01, 0x20, 0x01, 0x28, 0x09, 0x52, 0x03, 0x73, 0x79, 0x73, 0x12, 0x0e, 0x0a, 0x02, 0x69, 0x64, 0x18, 0x02, 0x20, 0x01, 0x28, 0x09, - 0x52, 0x02, 0x69, 0x64, 0x12, 0x12, 0x0a, 0x04, 0x72, 0x61, 0x6e, 0x6b, 0x18, 0x03, 0x20, 0x01, - 0x28, 0x0d, 0x52, 0x04, 0x72, 0x61, 0x6e, 0x6b, 0x12, 0x1d, 0x0a, 0x0a, 0x74, 0x61, 0x72, 0x67, - 0x65, 0x74, 0x5f, 0x69, 0x64, 0x78, 0x18, 0x04, 0x20, 0x03, 0x28, 0x0d, 0x52, 0x09, 0x74, 0x61, - 0x72, 0x67, 0x65, 0x74, 0x49, 0x64, 0x78, 0x12, 0x1b, 0x0a, 0x09, 0x73, 0x76, 0x63, 0x5f, 0x72, - 0x61, 0x6e, 0x6b, 0x73, 0x18, 0x05, 0x20, 0x03, 0x28, 0x0d, 0x52, 0x08, 0x73, 0x76, 0x63, 0x52, - 0x61, 0x6e, 0x6b, 0x73, 0x22, 0x29, 0x0a, 0x0f, 0x50, 0x6f, 0x6f, 0x6c, 0x45, 0x78, 0x63, 0x6c, - 0x75, 0x64, 0x65, 0x52, 0x65, 0x73, 0x70, 0x12, 0x16, 0x0a, 0x06, 0x73, 0x74, 0x61, 0x74, 0x75, - 0x73, 0x18, 0x01, 0x20, 0x01, 0x28, 0x05, 0x52, 0x06, 0x73, 0x74, 0x61, 0x74, 0x75, 0x73, 0x22, - 0x80, 0x01, 0x0a, 0x0c, 
0x50, 0x6f, 0x6f, 0x6c, 0x44, 0x72, 0x61, 0x69, 0x6e, 0x52, 0x65, 0x71, - 0x12, 0x10, 0x0a, 0x03, 0x73, 0x79, 0x73, 0x18, 0x01, 0x20, 0x01, 0x28, 0x09, 0x52, 0x03, 0x73, - 0x79, 0x73, 0x12, 0x0e, 0x0a, 0x02, 0x69, 0x64, 0x18, 0x02, 0x20, 0x01, 0x28, 0x09, 0x52, 0x02, - 0x69, 0x64, 0x12, 0x12, 0x0a, 0x04, 0x72, 0x61, 0x6e, 0x6b, 0x18, 0x03, 0x20, 0x01, 0x28, 0x0d, - 0x52, 0x04, 0x72, 0x61, 0x6e, 0x6b, 0x12, 0x1d, 0x0a, 0x0a, 0x74, 0x61, 0x72, 0x67, 0x65, 0x74, - 0x5f, 0x69, 0x64, 0x78, 0x18, 0x04, 0x20, 0x03, 0x28, 0x0d, 0x52, 0x09, 0x74, 0x61, 0x72, 0x67, - 0x65, 0x74, 0x49, 0x64, 0x78, 0x12, 0x1b, 0x0a, 0x09, 0x73, 0x76, 0x63, 0x5f, 0x72, 0x61, 0x6e, - 0x6b, 0x73, 0x18, 0x05, 0x20, 0x03, 0x28, 0x0d, 0x52, 0x08, 0x73, 0x76, 0x63, 0x52, 0x61, 0x6e, - 0x6b, 0x73, 0x22, 0x27, 0x0a, 0x0d, 0x50, 0x6f, 0x6f, 0x6c, 0x44, 0x72, 0x61, 0x69, 0x6e, 0x52, - 0x65, 0x73, 0x70, 0x12, 0x16, 0x0a, 0x06, 0x73, 0x74, 0x61, 0x74, 0x75, 0x73, 0x18, 0x01, 0x20, - 0x01, 0x28, 0x05, 0x52, 0x06, 0x73, 0x74, 0x61, 0x74, 0x75, 0x73, 0x22, 0xa8, 0x01, 0x0a, 0x0d, - 0x50, 0x6f, 0x6f, 0x6c, 0x45, 0x78, 0x74, 0x65, 0x6e, 0x64, 0x52, 0x65, 0x71, 0x12, 0x10, 0x0a, - 0x03, 0x73, 0x79, 0x73, 0x18, 0x01, 0x20, 0x01, 0x28, 0x09, 0x52, 0x03, 0x73, 0x79, 0x73, 0x12, - 0x0e, 0x0a, 0x02, 0x69, 0x64, 0x18, 0x02, 0x20, 0x01, 0x28, 0x09, 0x52, 0x02, 0x69, 0x64, 0x12, - 0x14, 0x0a, 0x05, 0x72, 0x61, 0x6e, 0x6b, 0x73, 0x18, 0x03, 0x20, 0x03, 0x28, 0x0d, 0x52, 0x05, - 0x72, 0x61, 0x6e, 0x6b, 0x73, 0x12, 0x1b, 0x0a, 0x09, 0x73, 0x76, 0x63, 0x5f, 0x72, 0x61, 0x6e, - 0x6b, 0x73, 0x18, 0x04, 0x20, 0x03, 0x28, 0x0d, 0x52, 0x08, 0x73, 0x76, 0x63, 0x52, 0x61, 0x6e, - 0x6b, 0x73, 0x12, 0x1d, 0x0a, 0x0a, 0x74, 0x69, 0x65, 0x72, 0x5f, 0x62, 0x79, 0x74, 0x65, 0x73, - 0x18, 0x05, 0x20, 0x03, 0x28, 0x04, 0x52, 0x09, 0x74, 0x69, 0x65, 0x72, 0x42, 0x79, 0x74, 0x65, - 0x73, 0x12, 0x23, 0x0a, 0x0d, 0x66, 0x61, 0x75, 0x6c, 0x74, 0x5f, 0x64, 0x6f, 0x6d, 0x61, 0x69, - 0x6e, 0x73, 0x18, 0x06, 0x20, 0x03, 0x28, 0x0d, 0x52, 0x0c, 0x66, 0x61, 0x75, 0x6c, 0x74, 0x44, - 0x6f, 0x6d, 0x61, 0x69, 0x6e, 0x73, 0x22, 0x47, 0x0a, 0x0e, 0x50, 0x6f, 0x6f, 0x6c, 0x45, 0x78, - 0x74, 0x65, 0x6e, 0x64, 0x52, 0x65, 0x73, 0x70, 0x12, 0x16, 0x0a, 0x06, 0x73, 0x74, 0x61, 0x74, + 0x52, 0x02, 0x69, 0x64, 0x12, 0x14, 0x0a, 0x05, 0x66, 0x6f, 0x72, 0x63, 0x65, 0x18, 0x03, 0x20, + 0x01, 0x28, 0x08, 0x52, 0x05, 0x66, 0x6f, 0x72, 0x63, 0x65, 0x12, 0x1b, 0x0a, 0x09, 0x73, 0x76, + 0x63, 0x5f, 0x72, 0x61, 0x6e, 0x6b, 0x73, 0x18, 0x04, 0x20, 0x03, 0x28, 0x0d, 0x52, 0x08, 0x73, + 0x76, 0x63, 0x52, 0x61, 0x6e, 0x6b, 0x73, 0x12, 0x1c, 0x0a, 0x09, 0x72, 0x65, 0x63, 0x75, 0x72, + 0x73, 0x69, 0x76, 0x65, 0x18, 0x05, 0x20, 0x01, 0x28, 0x08, 0x52, 0x09, 0x72, 0x65, 0x63, 0x75, + 0x72, 0x73, 0x69, 0x76, 0x65, 0x22, 0x29, 0x0a, 0x0f, 0x50, 0x6f, 0x6f, 0x6c, 0x44, 0x65, 0x73, + 0x74, 0x72, 0x6f, 0x79, 0x52, 0x65, 0x73, 0x70, 0x12, 0x16, 0x0a, 0x06, 0x73, 0x74, 0x61, 0x74, 0x75, 0x73, 0x18, 0x01, 0x20, 0x01, 0x28, 0x05, 0x52, 0x06, 0x73, 0x74, 0x61, 0x74, 0x75, 0x73, - 0x12, 0x1d, 0x0a, 0x0a, 0x74, 0x69, 0x65, 0x72, 0x5f, 0x62, 0x79, 0x74, 0x65, 0x73, 0x18, 0x02, - 0x20, 0x03, 0x28, 0x04, 0x52, 0x09, 0x74, 0x69, 0x65, 0x72, 0x42, 0x79, 0x74, 0x65, 0x73, 0x22, - 0xa5, 0x01, 0x0a, 0x12, 0x50, 0x6f, 0x6f, 0x6c, 0x52, 0x65, 0x69, 0x6e, 0x74, 0x65, 0x67, 0x72, - 0x61, 0x74, 0x65, 0x52, 0x65, 0x71, 0x12, 0x10, 0x0a, 0x03, 0x73, 0x79, 0x73, 0x18, 0x01, 0x20, - 0x01, 0x28, 0x09, 0x52, 0x03, 0x73, 0x79, 0x73, 0x12, 0x0e, 0x0a, 0x02, 0x69, 0x64, 0x18, 0x02, - 0x20, 0x01, 0x28, 0x09, 0x52, 0x02, 0x69, 0x64, 
0x12, 0x12, 0x0a, 0x04, 0x72, 0x61, 0x6e, 0x6b, - 0x18, 0x03, 0x20, 0x01, 0x28, 0x0d, 0x52, 0x04, 0x72, 0x61, 0x6e, 0x6b, 0x12, 0x1d, 0x0a, 0x0a, - 0x74, 0x61, 0x72, 0x67, 0x65, 0x74, 0x5f, 0x69, 0x64, 0x78, 0x18, 0x04, 0x20, 0x03, 0x28, 0x0d, - 0x52, 0x09, 0x74, 0x61, 0x72, 0x67, 0x65, 0x74, 0x49, 0x64, 0x78, 0x12, 0x1b, 0x0a, 0x09, 0x73, - 0x76, 0x63, 0x5f, 0x72, 0x61, 0x6e, 0x6b, 0x73, 0x18, 0x05, 0x20, 0x03, 0x28, 0x0d, 0x52, 0x08, - 0x73, 0x76, 0x63, 0x52, 0x61, 0x6e, 0x6b, 0x73, 0x12, 0x1d, 0x0a, 0x0a, 0x74, 0x69, 0x65, 0x72, - 0x5f, 0x62, 0x79, 0x74, 0x65, 0x73, 0x18, 0x06, 0x20, 0x03, 0x28, 0x04, 0x52, 0x09, 0x74, 0x69, - 0x65, 0x72, 0x42, 0x79, 0x74, 0x65, 0x73, 0x22, 0x2d, 0x0a, 0x13, 0x50, 0x6f, 0x6f, 0x6c, 0x52, - 0x65, 0x69, 0x6e, 0x74, 0x65, 0x67, 0x72, 0x61, 0x74, 0x65, 0x52, 0x65, 0x73, 0x70, 0x12, 0x16, - 0x0a, 0x06, 0x73, 0x74, 0x61, 0x74, 0x75, 0x73, 0x18, 0x01, 0x20, 0x01, 0x28, 0x05, 0x52, 0x06, - 0x73, 0x74, 0x61, 0x74, 0x75, 0x73, 0x22, 0x20, 0x0a, 0x0c, 0x4c, 0x69, 0x73, 0x74, 0x50, 0x6f, - 0x6f, 0x6c, 0x73, 0x52, 0x65, 0x71, 0x12, 0x10, 0x0a, 0x03, 0x73, 0x79, 0x73, 0x18, 0x01, 0x20, - 0x01, 0x28, 0x09, 0x52, 0x03, 0x73, 0x79, 0x73, 0x22, 0x83, 0x02, 0x0a, 0x0d, 0x4c, 0x69, 0x73, - 0x74, 0x50, 0x6f, 0x6f, 0x6c, 0x73, 0x52, 0x65, 0x73, 0x70, 0x12, 0x16, 0x0a, 0x06, 0x73, 0x74, + 0x22, 0xc0, 0x01, 0x0a, 0x0c, 0x50, 0x6f, 0x6f, 0x6c, 0x45, 0x76, 0x69, 0x63, 0x74, 0x52, 0x65, + 0x71, 0x12, 0x10, 0x0a, 0x03, 0x73, 0x79, 0x73, 0x18, 0x01, 0x20, 0x01, 0x28, 0x09, 0x52, 0x03, + 0x73, 0x79, 0x73, 0x12, 0x0e, 0x0a, 0x02, 0x69, 0x64, 0x18, 0x02, 0x20, 0x01, 0x28, 0x09, 0x52, + 0x02, 0x69, 0x64, 0x12, 0x1b, 0x0a, 0x09, 0x73, 0x76, 0x63, 0x5f, 0x72, 0x61, 0x6e, 0x6b, 0x73, + 0x18, 0x03, 0x20, 0x03, 0x28, 0x0d, 0x52, 0x08, 0x73, 0x76, 0x63, 0x52, 0x61, 0x6e, 0x6b, 0x73, + 0x12, 0x18, 0x0a, 0x07, 0x68, 0x61, 0x6e, 0x64, 0x6c, 0x65, 0x73, 0x18, 0x04, 0x20, 0x03, 0x28, + 0x09, 0x52, 0x07, 0x68, 0x61, 0x6e, 0x64, 0x6c, 0x65, 0x73, 0x12, 0x18, 0x0a, 0x07, 0x64, 0x65, + 0x73, 0x74, 0x72, 0x6f, 0x79, 0x18, 0x05, 0x20, 0x01, 0x28, 0x08, 0x52, 0x07, 0x64, 0x65, 0x73, + 0x74, 0x72, 0x6f, 0x79, 0x12, 0x23, 0x0a, 0x0d, 0x66, 0x6f, 0x72, 0x63, 0x65, 0x5f, 0x64, 0x65, + 0x73, 0x74, 0x72, 0x6f, 0x79, 0x18, 0x06, 0x20, 0x01, 0x28, 0x08, 0x52, 0x0c, 0x66, 0x6f, 0x72, + 0x63, 0x65, 0x44, 0x65, 0x73, 0x74, 0x72, 0x6f, 0x79, 0x12, 0x18, 0x0a, 0x07, 0x6d, 0x61, 0x63, + 0x68, 0x69, 0x6e, 0x65, 0x18, 0x07, 0x20, 0x01, 0x28, 0x09, 0x52, 0x07, 0x6d, 0x61, 0x63, 0x68, + 0x69, 0x6e, 0x65, 0x22, 0x3d, 0x0a, 0x0d, 0x50, 0x6f, 0x6f, 0x6c, 0x45, 0x76, 0x69, 0x63, 0x74, + 0x52, 0x65, 0x73, 0x70, 0x12, 0x16, 0x0a, 0x06, 0x73, 0x74, 0x61, 0x74, 0x75, 0x73, 0x18, 0x01, + 0x20, 0x01, 0x28, 0x05, 0x52, 0x06, 0x73, 0x74, 0x61, 0x74, 0x75, 0x73, 0x12, 0x14, 0x0a, 0x05, + 0x63, 0x6f, 0x75, 0x6e, 0x74, 0x18, 0x02, 0x20, 0x01, 0x28, 0x05, 0x52, 0x05, 0x63, 0x6f, 0x75, + 0x6e, 0x74, 0x22, 0x82, 0x01, 0x0a, 0x0e, 0x50, 0x6f, 0x6f, 0x6c, 0x45, 0x78, 0x63, 0x6c, 0x75, + 0x64, 0x65, 0x52, 0x65, 0x71, 0x12, 0x10, 0x0a, 0x03, 0x73, 0x79, 0x73, 0x18, 0x01, 0x20, 0x01, + 0x28, 0x09, 0x52, 0x03, 0x73, 0x79, 0x73, 0x12, 0x0e, 0x0a, 0x02, 0x69, 0x64, 0x18, 0x02, 0x20, + 0x01, 0x28, 0x09, 0x52, 0x02, 0x69, 0x64, 0x12, 0x12, 0x0a, 0x04, 0x72, 0x61, 0x6e, 0x6b, 0x18, + 0x03, 0x20, 0x01, 0x28, 0x0d, 0x52, 0x04, 0x72, 0x61, 0x6e, 0x6b, 0x12, 0x1d, 0x0a, 0x0a, 0x74, + 0x61, 0x72, 0x67, 0x65, 0x74, 0x5f, 0x69, 0x64, 0x78, 0x18, 0x04, 0x20, 0x03, 0x28, 0x0d, 0x52, + 0x09, 0x74, 0x61, 0x72, 0x67, 0x65, 0x74, 0x49, 0x64, 0x78, 0x12, 0x1b, 
0x0a, 0x09, 0x73, 0x76, + 0x63, 0x5f, 0x72, 0x61, 0x6e, 0x6b, 0x73, 0x18, 0x05, 0x20, 0x03, 0x28, 0x0d, 0x52, 0x08, 0x73, + 0x76, 0x63, 0x52, 0x61, 0x6e, 0x6b, 0x73, 0x22, 0x29, 0x0a, 0x0f, 0x50, 0x6f, 0x6f, 0x6c, 0x45, + 0x78, 0x63, 0x6c, 0x75, 0x64, 0x65, 0x52, 0x65, 0x73, 0x70, 0x12, 0x16, 0x0a, 0x06, 0x73, 0x74, 0x61, 0x74, 0x75, 0x73, 0x18, 0x01, 0x20, 0x01, 0x28, 0x05, 0x52, 0x06, 0x73, 0x74, 0x61, 0x74, - 0x75, 0x73, 0x12, 0x2e, 0x0a, 0x05, 0x70, 0x6f, 0x6f, 0x6c, 0x73, 0x18, 0x02, 0x20, 0x03, 0x28, - 0x0b, 0x32, 0x18, 0x2e, 0x6d, 0x67, 0x6d, 0x74, 0x2e, 0x4c, 0x69, 0x73, 0x74, 0x50, 0x6f, 0x6f, - 0x6c, 0x73, 0x52, 0x65, 0x73, 0x70, 0x2e, 0x50, 0x6f, 0x6f, 0x6c, 0x52, 0x05, 0x70, 0x6f, 0x6f, - 0x6c, 0x73, 0x12, 0x21, 0x0a, 0x0c, 0x64, 0x61, 0x74, 0x61, 0x5f, 0x76, 0x65, 0x72, 0x73, 0x69, - 0x6f, 0x6e, 0x18, 0x03, 0x20, 0x01, 0x28, 0x04, 0x52, 0x0b, 0x64, 0x61, 0x74, 0x61, 0x56, 0x65, - 0x72, 0x73, 0x69, 0x6f, 0x6e, 0x1a, 0x86, 0x01, 0x0a, 0x04, 0x50, 0x6f, 0x6f, 0x6c, 0x12, 0x12, - 0x0a, 0x04, 0x75, 0x75, 0x69, 0x64, 0x18, 0x01, 0x20, 0x01, 0x28, 0x09, 0x52, 0x04, 0x75, 0x75, - 0x69, 0x64, 0x12, 0x14, 0x0a, 0x05, 0x6c, 0x61, 0x62, 0x65, 0x6c, 0x18, 0x02, 0x20, 0x01, 0x28, - 0x09, 0x52, 0x05, 0x6c, 0x61, 0x62, 0x65, 0x6c, 0x12, 0x19, 0x0a, 0x08, 0x73, 0x76, 0x63, 0x5f, - 0x72, 0x65, 0x70, 0x73, 0x18, 0x03, 0x20, 0x03, 0x28, 0x0d, 0x52, 0x07, 0x73, 0x76, 0x63, 0x52, - 0x65, 0x70, 0x73, 0x12, 0x14, 0x0a, 0x05, 0x73, 0x74, 0x61, 0x74, 0x65, 0x18, 0x04, 0x20, 0x01, - 0x28, 0x09, 0x52, 0x05, 0x73, 0x74, 0x61, 0x74, 0x65, 0x12, 0x23, 0x0a, 0x0d, 0x72, 0x65, 0x62, - 0x75, 0x69, 0x6c, 0x64, 0x5f, 0x73, 0x74, 0x61, 0x74, 0x65, 0x18, 0x05, 0x20, 0x01, 0x28, 0x09, - 0x52, 0x0c, 0x72, 0x65, 0x62, 0x75, 0x69, 0x6c, 0x64, 0x53, 0x74, 0x61, 0x74, 0x65, 0x22, 0x4c, - 0x0a, 0x0b, 0x4c, 0x69, 0x73, 0x74, 0x43, 0x6f, 0x6e, 0x74, 0x52, 0x65, 0x71, 0x12, 0x10, 0x0a, - 0x03, 0x73, 0x79, 0x73, 0x18, 0x01, 0x20, 0x01, 0x28, 0x09, 0x52, 0x03, 0x73, 0x79, 0x73, 0x12, - 0x0e, 0x0a, 0x02, 0x69, 0x64, 0x18, 0x02, 0x20, 0x01, 0x28, 0x09, 0x52, 0x02, 0x69, 0x64, 0x12, - 0x1b, 0x0a, 0x09, 0x73, 0x76, 0x63, 0x5f, 0x72, 0x61, 0x6e, 0x6b, 0x73, 0x18, 0x03, 0x20, 0x03, - 0x28, 0x0d, 0x52, 0x08, 0x73, 0x76, 0x63, 0x52, 0x61, 0x6e, 0x6b, 0x73, 0x22, 0x7b, 0x0a, 0x0c, - 0x4c, 0x69, 0x73, 0x74, 0x43, 0x6f, 0x6e, 0x74, 0x52, 0x65, 0x73, 0x70, 0x12, 0x16, 0x0a, 0x06, - 0x73, 0x74, 0x61, 0x74, 0x75, 0x73, 0x18, 0x01, 0x20, 0x01, 0x28, 0x05, 0x52, 0x06, 0x73, 0x74, - 0x61, 0x74, 0x75, 0x73, 0x12, 0x37, 0x0a, 0x0a, 0x63, 0x6f, 0x6e, 0x74, 0x61, 0x69, 0x6e, 0x65, - 0x72, 0x73, 0x18, 0x02, 0x20, 0x03, 0x28, 0x0b, 0x32, 0x17, 0x2e, 0x6d, 0x67, 0x6d, 0x74, 0x2e, - 0x4c, 0x69, 0x73, 0x74, 0x43, 0x6f, 0x6e, 0x74, 0x52, 0x65, 0x73, 0x70, 0x2e, 0x43, 0x6f, 0x6e, - 0x74, 0x52, 0x0a, 0x63, 0x6f, 0x6e, 0x74, 0x61, 0x69, 0x6e, 0x65, 0x72, 0x73, 0x1a, 0x1a, 0x0a, - 0x04, 0x43, 0x6f, 0x6e, 0x74, 0x12, 0x12, 0x0a, 0x04, 0x75, 0x75, 0x69, 0x64, 0x18, 0x01, 0x20, - 0x01, 0x28, 0x09, 0x52, 0x04, 0x75, 0x75, 0x69, 0x64, 0x22, 0x6c, 0x0a, 0x0c, 0x50, 0x6f, 0x6f, - 0x6c, 0x51, 0x75, 0x65, 0x72, 0x79, 0x52, 0x65, 0x71, 0x12, 0x10, 0x0a, 0x03, 0x73, 0x79, 0x73, - 0x18, 0x01, 0x20, 0x01, 0x28, 0x09, 0x52, 0x03, 0x73, 0x79, 0x73, 0x12, 0x0e, 0x0a, 0x02, 0x69, - 0x64, 0x18, 0x02, 0x20, 0x01, 0x28, 0x09, 0x52, 0x02, 0x69, 0x64, 0x12, 0x1b, 0x0a, 0x09, 0x73, - 0x76, 0x63, 0x5f, 0x72, 0x61, 0x6e, 0x6b, 0x73, 0x18, 0x03, 0x20, 0x03, 0x28, 0x0d, 0x52, 0x08, - 0x73, 0x76, 0x63, 0x52, 0x61, 0x6e, 0x6b, 0x73, 0x12, 0x1d, 0x0a, 0x0a, 0x71, 0x75, 0x65, 0x72, - 
0x79, 0x5f, 0x6d, 0x61, 0x73, 0x6b, 0x18, 0x04, 0x20, 0x01, 0x28, 0x04, 0x52, 0x09, 0x71, 0x75, - 0x65, 0x72, 0x79, 0x4d, 0x61, 0x73, 0x6b, 0x22, 0xac, 0x01, 0x0a, 0x11, 0x53, 0x74, 0x6f, 0x72, - 0x61, 0x67, 0x65, 0x55, 0x73, 0x61, 0x67, 0x65, 0x53, 0x74, 0x61, 0x74, 0x73, 0x12, 0x14, 0x0a, - 0x05, 0x74, 0x6f, 0x74, 0x61, 0x6c, 0x18, 0x01, 0x20, 0x01, 0x28, 0x04, 0x52, 0x05, 0x74, 0x6f, - 0x74, 0x61, 0x6c, 0x12, 0x12, 0x0a, 0x04, 0x66, 0x72, 0x65, 0x65, 0x18, 0x02, 0x20, 0x01, 0x28, - 0x04, 0x52, 0x04, 0x66, 0x72, 0x65, 0x65, 0x12, 0x10, 0x0a, 0x03, 0x6d, 0x69, 0x6e, 0x18, 0x03, - 0x20, 0x01, 0x28, 0x04, 0x52, 0x03, 0x6d, 0x69, 0x6e, 0x12, 0x10, 0x0a, 0x03, 0x6d, 0x61, 0x78, - 0x18, 0x04, 0x20, 0x01, 0x28, 0x04, 0x52, 0x03, 0x6d, 0x61, 0x78, 0x12, 0x12, 0x0a, 0x04, 0x6d, - 0x65, 0x61, 0x6e, 0x18, 0x05, 0x20, 0x01, 0x28, 0x04, 0x52, 0x04, 0x6d, 0x65, 0x61, 0x6e, 0x12, - 0x35, 0x0a, 0x0a, 0x6d, 0x65, 0x64, 0x69, 0x61, 0x5f, 0x74, 0x79, 0x70, 0x65, 0x18, 0x06, 0x20, - 0x01, 0x28, 0x0e, 0x32, 0x16, 0x2e, 0x6d, 0x67, 0x6d, 0x74, 0x2e, 0x53, 0x74, 0x6f, 0x72, 0x61, - 0x67, 0x65, 0x4d, 0x65, 0x64, 0x69, 0x61, 0x54, 0x79, 0x70, 0x65, 0x52, 0x09, 0x6d, 0x65, 0x64, - 0x69, 0x61, 0x54, 0x79, 0x70, 0x65, 0x22, 0xbb, 0x01, 0x0a, 0x11, 0x50, 0x6f, 0x6f, 0x6c, 0x52, - 0x65, 0x62, 0x75, 0x69, 0x6c, 0x64, 0x53, 0x74, 0x61, 0x74, 0x75, 0x73, 0x12, 0x16, 0x0a, 0x06, - 0x73, 0x74, 0x61, 0x74, 0x75, 0x73, 0x18, 0x01, 0x20, 0x01, 0x28, 0x05, 0x52, 0x06, 0x73, 0x74, - 0x61, 0x74, 0x75, 0x73, 0x12, 0x33, 0x0a, 0x05, 0x73, 0x74, 0x61, 0x74, 0x65, 0x18, 0x02, 0x20, - 0x01, 0x28, 0x0e, 0x32, 0x1d, 0x2e, 0x6d, 0x67, 0x6d, 0x74, 0x2e, 0x50, 0x6f, 0x6f, 0x6c, 0x52, - 0x65, 0x62, 0x75, 0x69, 0x6c, 0x64, 0x53, 0x74, 0x61, 0x74, 0x75, 0x73, 0x2e, 0x53, 0x74, 0x61, - 0x74, 0x65, 0x52, 0x05, 0x73, 0x74, 0x61, 0x74, 0x65, 0x12, 0x18, 0x0a, 0x07, 0x6f, 0x62, 0x6a, - 0x65, 0x63, 0x74, 0x73, 0x18, 0x03, 0x20, 0x01, 0x28, 0x04, 0x52, 0x07, 0x6f, 0x62, 0x6a, 0x65, - 0x63, 0x74, 0x73, 0x12, 0x18, 0x0a, 0x07, 0x72, 0x65, 0x63, 0x6f, 0x72, 0x64, 0x73, 0x18, 0x04, - 0x20, 0x01, 0x28, 0x04, 0x52, 0x07, 0x72, 0x65, 0x63, 0x6f, 0x72, 0x64, 0x73, 0x22, 0x25, 0x0a, - 0x05, 0x53, 0x74, 0x61, 0x74, 0x65, 0x12, 0x08, 0x0a, 0x04, 0x49, 0x44, 0x4c, 0x45, 0x10, 0x00, - 0x12, 0x08, 0x0a, 0x04, 0x44, 0x4f, 0x4e, 0x45, 0x10, 0x01, 0x12, 0x08, 0x0a, 0x04, 0x42, 0x55, - 0x53, 0x59, 0x10, 0x02, 0x22, 0xc0, 0x05, 0x0a, 0x0d, 0x50, 0x6f, 0x6f, 0x6c, 0x51, 0x75, 0x65, - 0x72, 0x79, 0x52, 0x65, 0x73, 0x70, 0x12, 0x16, 0x0a, 0x06, 0x73, 0x74, 0x61, 0x74, 0x75, 0x73, - 0x18, 0x01, 0x20, 0x01, 0x28, 0x05, 0x52, 0x06, 0x73, 0x74, 0x61, 0x74, 0x75, 0x73, 0x12, 0x12, - 0x0a, 0x04, 0x75, 0x75, 0x69, 0x64, 0x18, 0x02, 0x20, 0x01, 0x28, 0x09, 0x52, 0x04, 0x75, 0x75, - 0x69, 0x64, 0x12, 0x14, 0x0a, 0x05, 0x6c, 0x61, 0x62, 0x65, 0x6c, 0x18, 0x03, 0x20, 0x01, 0x28, - 0x09, 0x52, 0x05, 0x6c, 0x61, 0x62, 0x65, 0x6c, 0x12, 0x23, 0x0a, 0x0d, 0x74, 0x6f, 0x74, 0x61, - 0x6c, 0x5f, 0x74, 0x61, 0x72, 0x67, 0x65, 0x74, 0x73, 0x18, 0x04, 0x20, 0x01, 0x28, 0x0d, 0x52, - 0x0c, 0x74, 0x6f, 0x74, 0x61, 0x6c, 0x54, 0x61, 0x72, 0x67, 0x65, 0x74, 0x73, 0x12, 0x25, 0x0a, - 0x0e, 0x61, 0x63, 0x74, 0x69, 0x76, 0x65, 0x5f, 0x74, 0x61, 0x72, 0x67, 0x65, 0x74, 0x73, 0x18, - 0x05, 0x20, 0x01, 0x28, 0x0d, 0x52, 0x0d, 0x61, 0x63, 0x74, 0x69, 0x76, 0x65, 0x54, 0x61, 0x72, - 0x67, 0x65, 0x74, 0x73, 0x12, 0x29, 0x0a, 0x10, 0x64, 0x69, 0x73, 0x61, 0x62, 0x6c, 0x65, 0x64, - 0x5f, 0x74, 0x61, 0x72, 0x67, 0x65, 0x74, 0x73, 0x18, 0x06, 0x20, 0x01, 0x28, 0x0d, 0x52, 0x0f, - 0x64, 0x69, 0x73, 0x61, 
0x62, 0x6c, 0x65, 0x64, 0x54, 0x61, 0x72, 0x67, 0x65, 0x74, 0x73, 0x12, - 0x31, 0x0a, 0x07, 0x72, 0x65, 0x62, 0x75, 0x69, 0x6c, 0x64, 0x18, 0x07, 0x20, 0x01, 0x28, 0x0b, - 0x32, 0x17, 0x2e, 0x6d, 0x67, 0x6d, 0x74, 0x2e, 0x50, 0x6f, 0x6f, 0x6c, 0x52, 0x65, 0x62, 0x75, - 0x69, 0x6c, 0x64, 0x53, 0x74, 0x61, 0x74, 0x75, 0x73, 0x52, 0x07, 0x72, 0x65, 0x62, 0x75, 0x69, - 0x6c, 0x64, 0x12, 0x36, 0x0a, 0x0a, 0x74, 0x69, 0x65, 0x72, 0x5f, 0x73, 0x74, 0x61, 0x74, 0x73, - 0x18, 0x08, 0x20, 0x03, 0x28, 0x0b, 0x32, 0x17, 0x2e, 0x6d, 0x67, 0x6d, 0x74, 0x2e, 0x53, 0x74, - 0x6f, 0x72, 0x61, 0x67, 0x65, 0x55, 0x73, 0x61, 0x67, 0x65, 0x53, 0x74, 0x61, 0x74, 0x73, 0x52, - 0x09, 0x74, 0x69, 0x65, 0x72, 0x53, 0x74, 0x61, 0x74, 0x73, 0x12, 0x18, 0x0a, 0x07, 0x76, 0x65, - 0x72, 0x73, 0x69, 0x6f, 0x6e, 0x18, 0x0a, 0x20, 0x01, 0x28, 0x0d, 0x52, 0x07, 0x76, 0x65, 0x72, - 0x73, 0x69, 0x6f, 0x6e, 0x12, 0x16, 0x0a, 0x06, 0x6c, 0x65, 0x61, 0x64, 0x65, 0x72, 0x18, 0x0b, - 0x20, 0x01, 0x28, 0x0d, 0x52, 0x06, 0x6c, 0x65, 0x61, 0x64, 0x65, 0x72, 0x12, 0x23, 0x0a, 0x0d, - 0x65, 0x6e, 0x61, 0x62, 0x6c, 0x65, 0x64, 0x5f, 0x72, 0x61, 0x6e, 0x6b, 0x73, 0x18, 0x0c, 0x20, - 0x01, 0x28, 0x09, 0x52, 0x0c, 0x65, 0x6e, 0x61, 0x62, 0x6c, 0x65, 0x64, 0x52, 0x61, 0x6e, 0x6b, - 0x73, 0x12, 0x25, 0x0a, 0x0e, 0x64, 0x69, 0x73, 0x61, 0x62, 0x6c, 0x65, 0x64, 0x5f, 0x72, 0x61, - 0x6e, 0x6b, 0x73, 0x18, 0x0d, 0x20, 0x01, 0x28, 0x09, 0x52, 0x0d, 0x64, 0x69, 0x73, 0x61, 0x62, - 0x6c, 0x65, 0x64, 0x52, 0x61, 0x6e, 0x6b, 0x73, 0x12, 0x23, 0x0a, 0x0d, 0x74, 0x6f, 0x74, 0x61, - 0x6c, 0x5f, 0x65, 0x6e, 0x67, 0x69, 0x6e, 0x65, 0x73, 0x18, 0x0e, 0x20, 0x01, 0x28, 0x0d, 0x52, - 0x0c, 0x74, 0x6f, 0x74, 0x61, 0x6c, 0x45, 0x6e, 0x67, 0x69, 0x6e, 0x65, 0x73, 0x12, 0x26, 0x0a, - 0x0f, 0x70, 0x6f, 0x6f, 0x6c, 0x5f, 0x6c, 0x61, 0x79, 0x6f, 0x75, 0x74, 0x5f, 0x76, 0x65, 0x72, - 0x18, 0x0f, 0x20, 0x01, 0x28, 0x0d, 0x52, 0x0d, 0x70, 0x6f, 0x6f, 0x6c, 0x4c, 0x61, 0x79, 0x6f, - 0x75, 0x74, 0x56, 0x65, 0x72, 0x12, 0x2c, 0x0a, 0x12, 0x75, 0x70, 0x67, 0x72, 0x61, 0x64, 0x65, - 0x5f, 0x6c, 0x61, 0x79, 0x6f, 0x75, 0x74, 0x5f, 0x76, 0x65, 0x72, 0x18, 0x10, 0x20, 0x01, 0x28, - 0x0d, 0x52, 0x10, 0x75, 0x70, 0x67, 0x72, 0x61, 0x64, 0x65, 0x4c, 0x61, 0x79, 0x6f, 0x75, 0x74, - 0x56, 0x65, 0x72, 0x12, 0x2c, 0x0a, 0x05, 0x73, 0x74, 0x61, 0x74, 0x65, 0x18, 0x11, 0x20, 0x01, - 0x28, 0x0e, 0x32, 0x16, 0x2e, 0x6d, 0x67, 0x6d, 0x74, 0x2e, 0x50, 0x6f, 0x6f, 0x6c, 0x53, 0x65, - 0x72, 0x76, 0x69, 0x63, 0x65, 0x53, 0x74, 0x61, 0x74, 0x65, 0x52, 0x05, 0x73, 0x74, 0x61, 0x74, - 0x65, 0x12, 0x17, 0x0a, 0x07, 0x73, 0x76, 0x63, 0x5f, 0x6c, 0x64, 0x72, 0x18, 0x12, 0x20, 0x01, - 0x28, 0x0d, 0x52, 0x06, 0x73, 0x76, 0x63, 0x4c, 0x64, 0x72, 0x12, 0x19, 0x0a, 0x08, 0x73, 0x76, - 0x63, 0x5f, 0x72, 0x65, 0x70, 0x73, 0x18, 0x13, 0x20, 0x03, 0x28, 0x0d, 0x52, 0x07, 0x73, 0x76, - 0x63, 0x52, 0x65, 0x70, 0x73, 0x12, 0x1d, 0x0a, 0x0a, 0x71, 0x75, 0x65, 0x72, 0x79, 0x5f, 0x6d, - 0x61, 0x73, 0x6b, 0x18, 0x14, 0x20, 0x01, 0x28, 0x04, 0x52, 0x09, 0x71, 0x75, 0x65, 0x72, 0x79, - 0x4d, 0x61, 0x73, 0x6b, 0x4a, 0x04, 0x08, 0x09, 0x10, 0x0a, 0x52, 0x0b, 0x74, 0x6f, 0x74, 0x61, - 0x6c, 0x5f, 0x6e, 0x6f, 0x64, 0x65, 0x73, 0x22, 0x63, 0x0a, 0x0c, 0x50, 0x6f, 0x6f, 0x6c, 0x50, - 0x72, 0x6f, 0x70, 0x65, 0x72, 0x74, 0x79, 0x12, 0x16, 0x0a, 0x06, 0x6e, 0x75, 0x6d, 0x62, 0x65, - 0x72, 0x18, 0x01, 0x20, 0x01, 0x28, 0x0d, 0x52, 0x06, 0x6e, 0x75, 0x6d, 0x62, 0x65, 0x72, 0x12, - 0x18, 0x0a, 0x06, 0x73, 0x74, 0x72, 0x76, 0x61, 0x6c, 0x18, 0x02, 0x20, 0x01, 0x28, 0x09, 0x48, - 0x00, 0x52, 0x06, 0x73, 0x74, 0x72, 0x76, 0x61, 
0x6c, 0x12, 0x18, 0x0a, 0x06, 0x6e, 0x75, 0x6d, - 0x76, 0x61, 0x6c, 0x18, 0x03, 0x20, 0x01, 0x28, 0x04, 0x48, 0x00, 0x52, 0x06, 0x6e, 0x75, 0x6d, - 0x76, 0x61, 0x6c, 0x42, 0x07, 0x0a, 0x05, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x22, 0x83, 0x01, 0x0a, - 0x0e, 0x50, 0x6f, 0x6f, 0x6c, 0x53, 0x65, 0x74, 0x50, 0x72, 0x6f, 0x70, 0x52, 0x65, 0x71, 0x12, - 0x10, 0x0a, 0x03, 0x73, 0x79, 0x73, 0x18, 0x01, 0x20, 0x01, 0x28, 0x09, 0x52, 0x03, 0x73, 0x79, - 0x73, 0x12, 0x0e, 0x0a, 0x02, 0x69, 0x64, 0x18, 0x02, 0x20, 0x01, 0x28, 0x09, 0x52, 0x02, 0x69, - 0x64, 0x12, 0x32, 0x0a, 0x0a, 0x70, 0x72, 0x6f, 0x70, 0x65, 0x72, 0x74, 0x69, 0x65, 0x73, 0x18, - 0x03, 0x20, 0x03, 0x28, 0x0b, 0x32, 0x12, 0x2e, 0x6d, 0x67, 0x6d, 0x74, 0x2e, 0x50, 0x6f, 0x6f, - 0x6c, 0x50, 0x72, 0x6f, 0x70, 0x65, 0x72, 0x74, 0x79, 0x52, 0x0a, 0x70, 0x72, 0x6f, 0x70, 0x65, - 0x72, 0x74, 0x69, 0x65, 0x73, 0x12, 0x1b, 0x0a, 0x09, 0x73, 0x76, 0x63, 0x5f, 0x72, 0x61, 0x6e, - 0x6b, 0x73, 0x18, 0x04, 0x20, 0x03, 0x28, 0x0d, 0x52, 0x08, 0x73, 0x76, 0x63, 0x52, 0x61, 0x6e, - 0x6b, 0x73, 0x22, 0x29, 0x0a, 0x0f, 0x50, 0x6f, 0x6f, 0x6c, 0x53, 0x65, 0x74, 0x50, 0x72, 0x6f, - 0x70, 0x52, 0x65, 0x73, 0x70, 0x12, 0x16, 0x0a, 0x06, 0x73, 0x74, 0x61, 0x74, 0x75, 0x73, 0x18, - 0x01, 0x20, 0x01, 0x28, 0x05, 0x52, 0x06, 0x73, 0x74, 0x61, 0x74, 0x75, 0x73, 0x22, 0x83, 0x01, - 0x0a, 0x0e, 0x50, 0x6f, 0x6f, 0x6c, 0x47, 0x65, 0x74, 0x50, 0x72, 0x6f, 0x70, 0x52, 0x65, 0x71, + 0x75, 0x73, 0x22, 0x80, 0x01, 0x0a, 0x0c, 0x50, 0x6f, 0x6f, 0x6c, 0x44, 0x72, 0x61, 0x69, 0x6e, + 0x52, 0x65, 0x71, 0x12, 0x10, 0x0a, 0x03, 0x73, 0x79, 0x73, 0x18, 0x01, 0x20, 0x01, 0x28, 0x09, + 0x52, 0x03, 0x73, 0x79, 0x73, 0x12, 0x0e, 0x0a, 0x02, 0x69, 0x64, 0x18, 0x02, 0x20, 0x01, 0x28, + 0x09, 0x52, 0x02, 0x69, 0x64, 0x12, 0x12, 0x0a, 0x04, 0x72, 0x61, 0x6e, 0x6b, 0x18, 0x03, 0x20, + 0x01, 0x28, 0x0d, 0x52, 0x04, 0x72, 0x61, 0x6e, 0x6b, 0x12, 0x1d, 0x0a, 0x0a, 0x74, 0x61, 0x72, + 0x67, 0x65, 0x74, 0x5f, 0x69, 0x64, 0x78, 0x18, 0x04, 0x20, 0x03, 0x28, 0x0d, 0x52, 0x09, 0x74, + 0x61, 0x72, 0x67, 0x65, 0x74, 0x49, 0x64, 0x78, 0x12, 0x1b, 0x0a, 0x09, 0x73, 0x76, 0x63, 0x5f, + 0x72, 0x61, 0x6e, 0x6b, 0x73, 0x18, 0x05, 0x20, 0x03, 0x28, 0x0d, 0x52, 0x08, 0x73, 0x76, 0x63, + 0x52, 0x61, 0x6e, 0x6b, 0x73, 0x22, 0x27, 0x0a, 0x0d, 0x50, 0x6f, 0x6f, 0x6c, 0x44, 0x72, 0x61, + 0x69, 0x6e, 0x52, 0x65, 0x73, 0x70, 0x12, 0x16, 0x0a, 0x06, 0x73, 0x74, 0x61, 0x74, 0x75, 0x73, + 0x18, 0x01, 0x20, 0x01, 0x28, 0x05, 0x52, 0x06, 0x73, 0x74, 0x61, 0x74, 0x75, 0x73, 0x22, 0xa8, + 0x01, 0x0a, 0x0d, 0x50, 0x6f, 0x6f, 0x6c, 0x45, 0x78, 0x74, 0x65, 0x6e, 0x64, 0x52, 0x65, 0x71, + 0x12, 0x10, 0x0a, 0x03, 0x73, 0x79, 0x73, 0x18, 0x01, 0x20, 0x01, 0x28, 0x09, 0x52, 0x03, 0x73, + 0x79, 0x73, 0x12, 0x0e, 0x0a, 0x02, 0x69, 0x64, 0x18, 0x02, 0x20, 0x01, 0x28, 0x09, 0x52, 0x02, + 0x69, 0x64, 0x12, 0x14, 0x0a, 0x05, 0x72, 0x61, 0x6e, 0x6b, 0x73, 0x18, 0x03, 0x20, 0x03, 0x28, + 0x0d, 0x52, 0x05, 0x72, 0x61, 0x6e, 0x6b, 0x73, 0x12, 0x1b, 0x0a, 0x09, 0x73, 0x76, 0x63, 0x5f, + 0x72, 0x61, 0x6e, 0x6b, 0x73, 0x18, 0x04, 0x20, 0x03, 0x28, 0x0d, 0x52, 0x08, 0x73, 0x76, 0x63, + 0x52, 0x61, 0x6e, 0x6b, 0x73, 0x12, 0x1d, 0x0a, 0x0a, 0x74, 0x69, 0x65, 0x72, 0x5f, 0x62, 0x79, + 0x74, 0x65, 0x73, 0x18, 0x05, 0x20, 0x03, 0x28, 0x04, 0x52, 0x09, 0x74, 0x69, 0x65, 0x72, 0x42, + 0x79, 0x74, 0x65, 0x73, 0x12, 0x23, 0x0a, 0x0d, 0x66, 0x61, 0x75, 0x6c, 0x74, 0x5f, 0x64, 0x6f, + 0x6d, 0x61, 0x69, 0x6e, 0x73, 0x18, 0x06, 0x20, 0x03, 0x28, 0x0d, 0x52, 0x0c, 0x66, 0x61, 0x75, + 0x6c, 0x74, 0x44, 0x6f, 0x6d, 0x61, 0x69, 0x6e, 0x73, 0x22, 0x6f, 0x0a, 
0x0e, 0x50, 0x6f, 0x6f, + 0x6c, 0x45, 0x78, 0x74, 0x65, 0x6e, 0x64, 0x52, 0x65, 0x73, 0x70, 0x12, 0x16, 0x0a, 0x06, 0x73, + 0x74, 0x61, 0x74, 0x75, 0x73, 0x18, 0x01, 0x20, 0x01, 0x28, 0x05, 0x52, 0x06, 0x73, 0x74, 0x61, + 0x74, 0x75, 0x73, 0x12, 0x1d, 0x0a, 0x0a, 0x74, 0x69, 0x65, 0x72, 0x5f, 0x62, 0x79, 0x74, 0x65, + 0x73, 0x18, 0x02, 0x20, 0x03, 0x28, 0x04, 0x52, 0x09, 0x74, 0x69, 0x65, 0x72, 0x42, 0x79, 0x74, + 0x65, 0x73, 0x12, 0x26, 0x0a, 0x0f, 0x6d, 0x65, 0x74, 0x61, 0x5f, 0x62, 0x6c, 0x6f, 0x62, 0x5f, + 0x62, 0x79, 0x74, 0x65, 0x73, 0x18, 0x03, 0x20, 0x01, 0x28, 0x0d, 0x52, 0x0d, 0x6d, 0x65, 0x74, + 0x61, 0x42, 0x6c, 0x6f, 0x62, 0x42, 0x79, 0x74, 0x65, 0x73, 0x22, 0xa5, 0x01, 0x0a, 0x12, 0x50, + 0x6f, 0x6f, 0x6c, 0x52, 0x65, 0x69, 0x6e, 0x74, 0x65, 0x67, 0x72, 0x61, 0x74, 0x65, 0x52, 0x65, + 0x71, 0x12, 0x10, 0x0a, 0x03, 0x73, 0x79, 0x73, 0x18, 0x01, 0x20, 0x01, 0x28, 0x09, 0x52, 0x03, + 0x73, 0x79, 0x73, 0x12, 0x0e, 0x0a, 0x02, 0x69, 0x64, 0x18, 0x02, 0x20, 0x01, 0x28, 0x09, 0x52, + 0x02, 0x69, 0x64, 0x12, 0x12, 0x0a, 0x04, 0x72, 0x61, 0x6e, 0x6b, 0x18, 0x03, 0x20, 0x01, 0x28, + 0x0d, 0x52, 0x04, 0x72, 0x61, 0x6e, 0x6b, 0x12, 0x1d, 0x0a, 0x0a, 0x74, 0x61, 0x72, 0x67, 0x65, + 0x74, 0x5f, 0x69, 0x64, 0x78, 0x18, 0x04, 0x20, 0x03, 0x28, 0x0d, 0x52, 0x09, 0x74, 0x61, 0x72, + 0x67, 0x65, 0x74, 0x49, 0x64, 0x78, 0x12, 0x1b, 0x0a, 0x09, 0x73, 0x76, 0x63, 0x5f, 0x72, 0x61, + 0x6e, 0x6b, 0x73, 0x18, 0x05, 0x20, 0x03, 0x28, 0x0d, 0x52, 0x08, 0x73, 0x76, 0x63, 0x52, 0x61, + 0x6e, 0x6b, 0x73, 0x12, 0x1d, 0x0a, 0x0a, 0x74, 0x69, 0x65, 0x72, 0x5f, 0x62, 0x79, 0x74, 0x65, + 0x73, 0x18, 0x06, 0x20, 0x03, 0x28, 0x04, 0x52, 0x09, 0x74, 0x69, 0x65, 0x72, 0x42, 0x79, 0x74, + 0x65, 0x73, 0x22, 0x2d, 0x0a, 0x13, 0x50, 0x6f, 0x6f, 0x6c, 0x52, 0x65, 0x69, 0x6e, 0x74, 0x65, + 0x67, 0x72, 0x61, 0x74, 0x65, 0x52, 0x65, 0x73, 0x70, 0x12, 0x16, 0x0a, 0x06, 0x73, 0x74, 0x61, + 0x74, 0x75, 0x73, 0x18, 0x01, 0x20, 0x01, 0x28, 0x05, 0x52, 0x06, 0x73, 0x74, 0x61, 0x74, 0x75, + 0x73, 0x22, 0x20, 0x0a, 0x0c, 0x4c, 0x69, 0x73, 0x74, 0x50, 0x6f, 0x6f, 0x6c, 0x73, 0x52, 0x65, + 0x71, 0x12, 0x10, 0x0a, 0x03, 0x73, 0x79, 0x73, 0x18, 0x01, 0x20, 0x01, 0x28, 0x09, 0x52, 0x03, + 0x73, 0x79, 0x73, 0x22, 0x83, 0x02, 0x0a, 0x0d, 0x4c, 0x69, 0x73, 0x74, 0x50, 0x6f, 0x6f, 0x6c, + 0x73, 0x52, 0x65, 0x73, 0x70, 0x12, 0x16, 0x0a, 0x06, 0x73, 0x74, 0x61, 0x74, 0x75, 0x73, 0x18, + 0x01, 0x20, 0x01, 0x28, 0x05, 0x52, 0x06, 0x73, 0x74, 0x61, 0x74, 0x75, 0x73, 0x12, 0x2e, 0x0a, + 0x05, 0x70, 0x6f, 0x6f, 0x6c, 0x73, 0x18, 0x02, 0x20, 0x03, 0x28, 0x0b, 0x32, 0x18, 0x2e, 0x6d, + 0x67, 0x6d, 0x74, 0x2e, 0x4c, 0x69, 0x73, 0x74, 0x50, 0x6f, 0x6f, 0x6c, 0x73, 0x52, 0x65, 0x73, + 0x70, 0x2e, 0x50, 0x6f, 0x6f, 0x6c, 0x52, 0x05, 0x70, 0x6f, 0x6f, 0x6c, 0x73, 0x12, 0x21, 0x0a, + 0x0c, 0x64, 0x61, 0x74, 0x61, 0x5f, 0x76, 0x65, 0x72, 0x73, 0x69, 0x6f, 0x6e, 0x18, 0x03, 0x20, + 0x01, 0x28, 0x04, 0x52, 0x0b, 0x64, 0x61, 0x74, 0x61, 0x56, 0x65, 0x72, 0x73, 0x69, 0x6f, 0x6e, + 0x1a, 0x86, 0x01, 0x0a, 0x04, 0x50, 0x6f, 0x6f, 0x6c, 0x12, 0x12, 0x0a, 0x04, 0x75, 0x75, 0x69, + 0x64, 0x18, 0x01, 0x20, 0x01, 0x28, 0x09, 0x52, 0x04, 0x75, 0x75, 0x69, 0x64, 0x12, 0x14, 0x0a, + 0x05, 0x6c, 0x61, 0x62, 0x65, 0x6c, 0x18, 0x02, 0x20, 0x01, 0x28, 0x09, 0x52, 0x05, 0x6c, 0x61, + 0x62, 0x65, 0x6c, 0x12, 0x19, 0x0a, 0x08, 0x73, 0x76, 0x63, 0x5f, 0x72, 0x65, 0x70, 0x73, 0x18, + 0x03, 0x20, 0x03, 0x28, 0x0d, 0x52, 0x07, 0x73, 0x76, 0x63, 0x52, 0x65, 0x70, 0x73, 0x12, 0x14, + 0x0a, 0x05, 0x73, 0x74, 0x61, 0x74, 0x65, 0x18, 0x04, 0x20, 0x01, 0x28, 0x09, 0x52, 0x05, 0x73, + 
0x74, 0x61, 0x74, 0x65, 0x12, 0x23, 0x0a, 0x0d, 0x72, 0x65, 0x62, 0x75, 0x69, 0x6c, 0x64, 0x5f, + 0x73, 0x74, 0x61, 0x74, 0x65, 0x18, 0x05, 0x20, 0x01, 0x28, 0x09, 0x52, 0x0c, 0x72, 0x65, 0x62, + 0x75, 0x69, 0x6c, 0x64, 0x53, 0x74, 0x61, 0x74, 0x65, 0x22, 0x4c, 0x0a, 0x0b, 0x4c, 0x69, 0x73, + 0x74, 0x43, 0x6f, 0x6e, 0x74, 0x52, 0x65, 0x71, 0x12, 0x10, 0x0a, 0x03, 0x73, 0x79, 0x73, 0x18, + 0x01, 0x20, 0x01, 0x28, 0x09, 0x52, 0x03, 0x73, 0x79, 0x73, 0x12, 0x0e, 0x0a, 0x02, 0x69, 0x64, + 0x18, 0x02, 0x20, 0x01, 0x28, 0x09, 0x52, 0x02, 0x69, 0x64, 0x12, 0x1b, 0x0a, 0x09, 0x73, 0x76, + 0x63, 0x5f, 0x72, 0x61, 0x6e, 0x6b, 0x73, 0x18, 0x03, 0x20, 0x03, 0x28, 0x0d, 0x52, 0x08, 0x73, + 0x76, 0x63, 0x52, 0x61, 0x6e, 0x6b, 0x73, 0x22, 0x7b, 0x0a, 0x0c, 0x4c, 0x69, 0x73, 0x74, 0x43, + 0x6f, 0x6e, 0x74, 0x52, 0x65, 0x73, 0x70, 0x12, 0x16, 0x0a, 0x06, 0x73, 0x74, 0x61, 0x74, 0x75, + 0x73, 0x18, 0x01, 0x20, 0x01, 0x28, 0x05, 0x52, 0x06, 0x73, 0x74, 0x61, 0x74, 0x75, 0x73, 0x12, + 0x37, 0x0a, 0x0a, 0x63, 0x6f, 0x6e, 0x74, 0x61, 0x69, 0x6e, 0x65, 0x72, 0x73, 0x18, 0x02, 0x20, + 0x03, 0x28, 0x0b, 0x32, 0x17, 0x2e, 0x6d, 0x67, 0x6d, 0x74, 0x2e, 0x4c, 0x69, 0x73, 0x74, 0x43, + 0x6f, 0x6e, 0x74, 0x52, 0x65, 0x73, 0x70, 0x2e, 0x43, 0x6f, 0x6e, 0x74, 0x52, 0x0a, 0x63, 0x6f, + 0x6e, 0x74, 0x61, 0x69, 0x6e, 0x65, 0x72, 0x73, 0x1a, 0x1a, 0x0a, 0x04, 0x43, 0x6f, 0x6e, 0x74, + 0x12, 0x12, 0x0a, 0x04, 0x75, 0x75, 0x69, 0x64, 0x18, 0x01, 0x20, 0x01, 0x28, 0x09, 0x52, 0x04, + 0x75, 0x75, 0x69, 0x64, 0x22, 0x6c, 0x0a, 0x0c, 0x50, 0x6f, 0x6f, 0x6c, 0x51, 0x75, 0x65, 0x72, + 0x79, 0x52, 0x65, 0x71, 0x12, 0x10, 0x0a, 0x03, 0x73, 0x79, 0x73, 0x18, 0x01, 0x20, 0x01, 0x28, + 0x09, 0x52, 0x03, 0x73, 0x79, 0x73, 0x12, 0x0e, 0x0a, 0x02, 0x69, 0x64, 0x18, 0x02, 0x20, 0x01, + 0x28, 0x09, 0x52, 0x02, 0x69, 0x64, 0x12, 0x1b, 0x0a, 0x09, 0x73, 0x76, 0x63, 0x5f, 0x72, 0x61, + 0x6e, 0x6b, 0x73, 0x18, 0x03, 0x20, 0x03, 0x28, 0x0d, 0x52, 0x08, 0x73, 0x76, 0x63, 0x52, 0x61, + 0x6e, 0x6b, 0x73, 0x12, 0x1d, 0x0a, 0x0a, 0x71, 0x75, 0x65, 0x72, 0x79, 0x5f, 0x6d, 0x61, 0x73, + 0x6b, 0x18, 0x04, 0x20, 0x01, 0x28, 0x04, 0x52, 0x09, 0x71, 0x75, 0x65, 0x72, 0x79, 0x4d, 0x61, + 0x73, 0x6b, 0x22, 0xac, 0x01, 0x0a, 0x11, 0x53, 0x74, 0x6f, 0x72, 0x61, 0x67, 0x65, 0x55, 0x73, + 0x61, 0x67, 0x65, 0x53, 0x74, 0x61, 0x74, 0x73, 0x12, 0x14, 0x0a, 0x05, 0x74, 0x6f, 0x74, 0x61, + 0x6c, 0x18, 0x01, 0x20, 0x01, 0x28, 0x04, 0x52, 0x05, 0x74, 0x6f, 0x74, 0x61, 0x6c, 0x12, 0x12, + 0x0a, 0x04, 0x66, 0x72, 0x65, 0x65, 0x18, 0x02, 0x20, 0x01, 0x28, 0x04, 0x52, 0x04, 0x66, 0x72, + 0x65, 0x65, 0x12, 0x10, 0x0a, 0x03, 0x6d, 0x69, 0x6e, 0x18, 0x03, 0x20, 0x01, 0x28, 0x04, 0x52, + 0x03, 0x6d, 0x69, 0x6e, 0x12, 0x10, 0x0a, 0x03, 0x6d, 0x61, 0x78, 0x18, 0x04, 0x20, 0x01, 0x28, + 0x04, 0x52, 0x03, 0x6d, 0x61, 0x78, 0x12, 0x12, 0x0a, 0x04, 0x6d, 0x65, 0x61, 0x6e, 0x18, 0x05, + 0x20, 0x01, 0x28, 0x04, 0x52, 0x04, 0x6d, 0x65, 0x61, 0x6e, 0x12, 0x35, 0x0a, 0x0a, 0x6d, 0x65, + 0x64, 0x69, 0x61, 0x5f, 0x74, 0x79, 0x70, 0x65, 0x18, 0x06, 0x20, 0x01, 0x28, 0x0e, 0x32, 0x16, + 0x2e, 0x6d, 0x67, 0x6d, 0x74, 0x2e, 0x53, 0x74, 0x6f, 0x72, 0x61, 0x67, 0x65, 0x4d, 0x65, 0x64, + 0x69, 0x61, 0x54, 0x79, 0x70, 0x65, 0x52, 0x09, 0x6d, 0x65, 0x64, 0x69, 0x61, 0x54, 0x79, 0x70, + 0x65, 0x22, 0xbb, 0x01, 0x0a, 0x11, 0x50, 0x6f, 0x6f, 0x6c, 0x52, 0x65, 0x62, 0x75, 0x69, 0x6c, + 0x64, 0x53, 0x74, 0x61, 0x74, 0x75, 0x73, 0x12, 0x16, 0x0a, 0x06, 0x73, 0x74, 0x61, 0x74, 0x75, + 0x73, 0x18, 0x01, 0x20, 0x01, 0x28, 0x05, 0x52, 0x06, 0x73, 0x74, 0x61, 0x74, 0x75, 0x73, 0x12, + 0x33, 0x0a, 0x05, 0x73, 
0x74, 0x61, 0x74, 0x65, 0x18, 0x02, 0x20, 0x01, 0x28, 0x0e, 0x32, 0x1d, + 0x2e, 0x6d, 0x67, 0x6d, 0x74, 0x2e, 0x50, 0x6f, 0x6f, 0x6c, 0x52, 0x65, 0x62, 0x75, 0x69, 0x6c, + 0x64, 0x53, 0x74, 0x61, 0x74, 0x75, 0x73, 0x2e, 0x53, 0x74, 0x61, 0x74, 0x65, 0x52, 0x05, 0x73, + 0x74, 0x61, 0x74, 0x65, 0x12, 0x18, 0x0a, 0x07, 0x6f, 0x62, 0x6a, 0x65, 0x63, 0x74, 0x73, 0x18, + 0x03, 0x20, 0x01, 0x28, 0x04, 0x52, 0x07, 0x6f, 0x62, 0x6a, 0x65, 0x63, 0x74, 0x73, 0x12, 0x18, + 0x0a, 0x07, 0x72, 0x65, 0x63, 0x6f, 0x72, 0x64, 0x73, 0x18, 0x04, 0x20, 0x01, 0x28, 0x04, 0x52, + 0x07, 0x72, 0x65, 0x63, 0x6f, 0x72, 0x64, 0x73, 0x22, 0x25, 0x0a, 0x05, 0x53, 0x74, 0x61, 0x74, + 0x65, 0x12, 0x08, 0x0a, 0x04, 0x49, 0x44, 0x4c, 0x45, 0x10, 0x00, 0x12, 0x08, 0x0a, 0x04, 0x44, + 0x4f, 0x4e, 0x45, 0x10, 0x01, 0x12, 0x08, 0x0a, 0x04, 0x42, 0x55, 0x53, 0x59, 0x10, 0x02, 0x22, + 0xe6, 0x05, 0x0a, 0x0d, 0x50, 0x6f, 0x6f, 0x6c, 0x51, 0x75, 0x65, 0x72, 0x79, 0x52, 0x65, 0x73, + 0x70, 0x12, 0x16, 0x0a, 0x06, 0x73, 0x74, 0x61, 0x74, 0x75, 0x73, 0x18, 0x01, 0x20, 0x01, 0x28, + 0x05, 0x52, 0x06, 0x73, 0x74, 0x61, 0x74, 0x75, 0x73, 0x12, 0x12, 0x0a, 0x04, 0x75, 0x75, 0x69, + 0x64, 0x18, 0x02, 0x20, 0x01, 0x28, 0x09, 0x52, 0x04, 0x75, 0x75, 0x69, 0x64, 0x12, 0x14, 0x0a, + 0x05, 0x6c, 0x61, 0x62, 0x65, 0x6c, 0x18, 0x03, 0x20, 0x01, 0x28, 0x09, 0x52, 0x05, 0x6c, 0x61, + 0x62, 0x65, 0x6c, 0x12, 0x23, 0x0a, 0x0d, 0x74, 0x6f, 0x74, 0x61, 0x6c, 0x5f, 0x74, 0x61, 0x72, + 0x67, 0x65, 0x74, 0x73, 0x18, 0x04, 0x20, 0x01, 0x28, 0x0d, 0x52, 0x0c, 0x74, 0x6f, 0x74, 0x61, + 0x6c, 0x54, 0x61, 0x72, 0x67, 0x65, 0x74, 0x73, 0x12, 0x25, 0x0a, 0x0e, 0x61, 0x63, 0x74, 0x69, + 0x76, 0x65, 0x5f, 0x74, 0x61, 0x72, 0x67, 0x65, 0x74, 0x73, 0x18, 0x05, 0x20, 0x01, 0x28, 0x0d, + 0x52, 0x0d, 0x61, 0x63, 0x74, 0x69, 0x76, 0x65, 0x54, 0x61, 0x72, 0x67, 0x65, 0x74, 0x73, 0x12, + 0x29, 0x0a, 0x10, 0x64, 0x69, 0x73, 0x61, 0x62, 0x6c, 0x65, 0x64, 0x5f, 0x74, 0x61, 0x72, 0x67, + 0x65, 0x74, 0x73, 0x18, 0x06, 0x20, 0x01, 0x28, 0x0d, 0x52, 0x0f, 0x64, 0x69, 0x73, 0x61, 0x62, + 0x6c, 0x65, 0x64, 0x54, 0x61, 0x72, 0x67, 0x65, 0x74, 0x73, 0x12, 0x31, 0x0a, 0x07, 0x72, 0x65, + 0x62, 0x75, 0x69, 0x6c, 0x64, 0x18, 0x07, 0x20, 0x01, 0x28, 0x0b, 0x32, 0x17, 0x2e, 0x6d, 0x67, + 0x6d, 0x74, 0x2e, 0x50, 0x6f, 0x6f, 0x6c, 0x52, 0x65, 0x62, 0x75, 0x69, 0x6c, 0x64, 0x53, 0x74, + 0x61, 0x74, 0x75, 0x73, 0x52, 0x07, 0x72, 0x65, 0x62, 0x75, 0x69, 0x6c, 0x64, 0x12, 0x36, 0x0a, + 0x0a, 0x74, 0x69, 0x65, 0x72, 0x5f, 0x73, 0x74, 0x61, 0x74, 0x73, 0x18, 0x08, 0x20, 0x03, 0x28, + 0x0b, 0x32, 0x17, 0x2e, 0x6d, 0x67, 0x6d, 0x74, 0x2e, 0x53, 0x74, 0x6f, 0x72, 0x61, 0x67, 0x65, + 0x55, 0x73, 0x61, 0x67, 0x65, 0x53, 0x74, 0x61, 0x74, 0x73, 0x52, 0x09, 0x74, 0x69, 0x65, 0x72, + 0x53, 0x74, 0x61, 0x74, 0x73, 0x12, 0x18, 0x0a, 0x07, 0x76, 0x65, 0x72, 0x73, 0x69, 0x6f, 0x6e, + 0x18, 0x0a, 0x20, 0x01, 0x28, 0x0d, 0x52, 0x07, 0x76, 0x65, 0x72, 0x73, 0x69, 0x6f, 0x6e, 0x12, + 0x16, 0x0a, 0x06, 0x6c, 0x65, 0x61, 0x64, 0x65, 0x72, 0x18, 0x0b, 0x20, 0x01, 0x28, 0x0d, 0x52, + 0x06, 0x6c, 0x65, 0x61, 0x64, 0x65, 0x72, 0x12, 0x23, 0x0a, 0x0d, 0x65, 0x6e, 0x61, 0x62, 0x6c, + 0x65, 0x64, 0x5f, 0x72, 0x61, 0x6e, 0x6b, 0x73, 0x18, 0x0c, 0x20, 0x01, 0x28, 0x09, 0x52, 0x0c, + 0x65, 0x6e, 0x61, 0x62, 0x6c, 0x65, 0x64, 0x52, 0x61, 0x6e, 0x6b, 0x73, 0x12, 0x25, 0x0a, 0x0e, + 0x64, 0x69, 0x73, 0x61, 0x62, 0x6c, 0x65, 0x64, 0x5f, 0x72, 0x61, 0x6e, 0x6b, 0x73, 0x18, 0x0d, + 0x20, 0x01, 0x28, 0x09, 0x52, 0x0d, 0x64, 0x69, 0x73, 0x61, 0x62, 0x6c, 0x65, 0x64, 0x52, 0x61, + 0x6e, 0x6b, 0x73, 0x12, 0x23, 0x0a, 0x0d, 0x74, 
0x6f, 0x74, 0x61, 0x6c, 0x5f, 0x65, 0x6e, 0x67, + 0x69, 0x6e, 0x65, 0x73, 0x18, 0x0e, 0x20, 0x01, 0x28, 0x0d, 0x52, 0x0c, 0x74, 0x6f, 0x74, 0x61, + 0x6c, 0x45, 0x6e, 0x67, 0x69, 0x6e, 0x65, 0x73, 0x12, 0x26, 0x0a, 0x0f, 0x70, 0x6f, 0x6f, 0x6c, + 0x5f, 0x6c, 0x61, 0x79, 0x6f, 0x75, 0x74, 0x5f, 0x76, 0x65, 0x72, 0x18, 0x0f, 0x20, 0x01, 0x28, + 0x0d, 0x52, 0x0d, 0x70, 0x6f, 0x6f, 0x6c, 0x4c, 0x61, 0x79, 0x6f, 0x75, 0x74, 0x56, 0x65, 0x72, + 0x12, 0x2c, 0x0a, 0x12, 0x75, 0x70, 0x67, 0x72, 0x61, 0x64, 0x65, 0x5f, 0x6c, 0x61, 0x79, 0x6f, + 0x75, 0x74, 0x5f, 0x76, 0x65, 0x72, 0x18, 0x10, 0x20, 0x01, 0x28, 0x0d, 0x52, 0x10, 0x75, 0x70, + 0x67, 0x72, 0x61, 0x64, 0x65, 0x4c, 0x61, 0x79, 0x6f, 0x75, 0x74, 0x56, 0x65, 0x72, 0x12, 0x2c, + 0x0a, 0x05, 0x73, 0x74, 0x61, 0x74, 0x65, 0x18, 0x11, 0x20, 0x01, 0x28, 0x0e, 0x32, 0x16, 0x2e, + 0x6d, 0x67, 0x6d, 0x74, 0x2e, 0x50, 0x6f, 0x6f, 0x6c, 0x53, 0x65, 0x72, 0x76, 0x69, 0x63, 0x65, + 0x53, 0x74, 0x61, 0x74, 0x65, 0x52, 0x05, 0x73, 0x74, 0x61, 0x74, 0x65, 0x12, 0x17, 0x0a, 0x07, + 0x73, 0x76, 0x63, 0x5f, 0x6c, 0x64, 0x72, 0x18, 0x12, 0x20, 0x01, 0x28, 0x0d, 0x52, 0x06, 0x73, + 0x76, 0x63, 0x4c, 0x64, 0x72, 0x12, 0x19, 0x0a, 0x08, 0x73, 0x76, 0x63, 0x5f, 0x72, 0x65, 0x70, + 0x73, 0x18, 0x13, 0x20, 0x03, 0x28, 0x0d, 0x52, 0x07, 0x73, 0x76, 0x63, 0x52, 0x65, 0x70, 0x73, + 0x12, 0x1d, 0x0a, 0x0a, 0x71, 0x75, 0x65, 0x72, 0x79, 0x5f, 0x6d, 0x61, 0x73, 0x6b, 0x18, 0x14, + 0x20, 0x01, 0x28, 0x04, 0x52, 0x09, 0x71, 0x75, 0x65, 0x72, 0x79, 0x4d, 0x61, 0x73, 0x6b, 0x12, + 0x24, 0x0a, 0x0e, 0x6d, 0x65, 0x6d, 0x5f, 0x66, 0x69, 0x6c, 0x65, 0x5f, 0x62, 0x79, 0x74, 0x65, + 0x73, 0x18, 0x15, 0x20, 0x01, 0x28, 0x04, 0x52, 0x0c, 0x6d, 0x65, 0x6d, 0x46, 0x69, 0x6c, 0x65, + 0x42, 0x79, 0x74, 0x65, 0x73, 0x4a, 0x04, 0x08, 0x09, 0x10, 0x0a, 0x52, 0x0b, 0x74, 0x6f, 0x74, + 0x61, 0x6c, 0x5f, 0x6e, 0x6f, 0x64, 0x65, 0x73, 0x22, 0x63, 0x0a, 0x0c, 0x50, 0x6f, 0x6f, 0x6c, + 0x50, 0x72, 0x6f, 0x70, 0x65, 0x72, 0x74, 0x79, 0x12, 0x16, 0x0a, 0x06, 0x6e, 0x75, 0x6d, 0x62, + 0x65, 0x72, 0x18, 0x01, 0x20, 0x01, 0x28, 0x0d, 0x52, 0x06, 0x6e, 0x75, 0x6d, 0x62, 0x65, 0x72, + 0x12, 0x18, 0x0a, 0x06, 0x73, 0x74, 0x72, 0x76, 0x61, 0x6c, 0x18, 0x02, 0x20, 0x01, 0x28, 0x09, + 0x48, 0x00, 0x52, 0x06, 0x73, 0x74, 0x72, 0x76, 0x61, 0x6c, 0x12, 0x18, 0x0a, 0x06, 0x6e, 0x75, + 0x6d, 0x76, 0x61, 0x6c, 0x18, 0x03, 0x20, 0x01, 0x28, 0x04, 0x48, 0x00, 0x52, 0x06, 0x6e, 0x75, + 0x6d, 0x76, 0x61, 0x6c, 0x42, 0x07, 0x0a, 0x05, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x22, 0x83, 0x01, + 0x0a, 0x0e, 0x50, 0x6f, 0x6f, 0x6c, 0x53, 0x65, 0x74, 0x50, 0x72, 0x6f, 0x70, 0x52, 0x65, 0x71, 0x12, 0x10, 0x0a, 0x03, 0x73, 0x79, 0x73, 0x18, 0x01, 0x20, 0x01, 0x28, 0x09, 0x52, 0x03, 0x73, 0x79, 0x73, 0x12, 0x0e, 0x0a, 0x02, 0x69, 0x64, 0x18, 0x02, 0x20, 0x01, 0x28, 0x09, 0x52, 0x02, 0x69, 0x64, 0x12, 0x32, 0x0a, 0x0a, 0x70, 0x72, 0x6f, 0x70, 0x65, 0x72, 0x74, 0x69, 0x65, 0x73, @@ -3082,76 +3120,89 @@ var file_mgmt_pool_proto_rawDesc = []byte{ 0x6f, 0x6c, 0x50, 0x72, 0x6f, 0x70, 0x65, 0x72, 0x74, 0x79, 0x52, 0x0a, 0x70, 0x72, 0x6f, 0x70, 0x65, 0x72, 0x74, 0x69, 0x65, 0x73, 0x12, 0x1b, 0x0a, 0x09, 0x73, 0x76, 0x63, 0x5f, 0x72, 0x61, 0x6e, 0x6b, 0x73, 0x18, 0x04, 0x20, 0x03, 0x28, 0x0d, 0x52, 0x08, 0x73, 0x76, 0x63, 0x52, 0x61, - 0x6e, 0x6b, 0x73, 0x22, 0x5d, 0x0a, 0x0f, 0x50, 0x6f, 0x6f, 0x6c, 0x47, 0x65, 0x74, 0x50, 0x72, + 0x6e, 0x6b, 0x73, 0x22, 0x29, 0x0a, 0x0f, 0x50, 0x6f, 0x6f, 0x6c, 0x53, 0x65, 0x74, 0x50, 0x72, 0x6f, 0x70, 0x52, 0x65, 0x73, 0x70, 0x12, 0x16, 0x0a, 0x06, 0x73, 0x74, 0x61, 0x74, 0x75, 0x73, - 0x18, 0x01, 0x20, 
0x01, 0x28, 0x05, 0x52, 0x06, 0x73, 0x74, 0x61, 0x74, 0x75, 0x73, 0x12, 0x32, - 0x0a, 0x0a, 0x70, 0x72, 0x6f, 0x70, 0x65, 0x72, 0x74, 0x69, 0x65, 0x73, 0x18, 0x02, 0x20, 0x03, - 0x28, 0x0b, 0x32, 0x12, 0x2e, 0x6d, 0x67, 0x6d, 0x74, 0x2e, 0x50, 0x6f, 0x6f, 0x6c, 0x50, 0x72, - 0x6f, 0x70, 0x65, 0x72, 0x74, 0x79, 0x52, 0x0a, 0x70, 0x72, 0x6f, 0x70, 0x65, 0x72, 0x74, 0x69, - 0x65, 0x73, 0x22, 0x4f, 0x0a, 0x0e, 0x50, 0x6f, 0x6f, 0x6c, 0x55, 0x70, 0x67, 0x72, 0x61, 0x64, - 0x65, 0x52, 0x65, 0x71, 0x12, 0x10, 0x0a, 0x03, 0x73, 0x79, 0x73, 0x18, 0x01, 0x20, 0x01, 0x28, - 0x09, 0x52, 0x03, 0x73, 0x79, 0x73, 0x12, 0x0e, 0x0a, 0x02, 0x69, 0x64, 0x18, 0x02, 0x20, 0x01, - 0x28, 0x09, 0x52, 0x02, 0x69, 0x64, 0x12, 0x1b, 0x0a, 0x09, 0x73, 0x76, 0x63, 0x5f, 0x72, 0x61, - 0x6e, 0x6b, 0x73, 0x18, 0x03, 0x20, 0x03, 0x28, 0x0d, 0x52, 0x08, 0x73, 0x76, 0x63, 0x52, 0x61, - 0x6e, 0x6b, 0x73, 0x22, 0x29, 0x0a, 0x0f, 0x50, 0x6f, 0x6f, 0x6c, 0x55, 0x70, 0x67, 0x72, 0x61, - 0x64, 0x65, 0x52, 0x65, 0x73, 0x70, 0x12, 0x16, 0x0a, 0x06, 0x73, 0x74, 0x61, 0x74, 0x75, 0x73, - 0x18, 0x01, 0x20, 0x01, 0x28, 0x05, 0x52, 0x06, 0x73, 0x74, 0x61, 0x74, 0x75, 0x73, 0x22, 0x81, - 0x01, 0x0a, 0x12, 0x50, 0x6f, 0x6f, 0x6c, 0x51, 0x75, 0x65, 0x72, 0x79, 0x54, 0x61, 0x72, 0x67, - 0x65, 0x74, 0x52, 0x65, 0x71, 0x12, 0x10, 0x0a, 0x03, 0x73, 0x79, 0x73, 0x18, 0x01, 0x20, 0x01, + 0x18, 0x01, 0x20, 0x01, 0x28, 0x05, 0x52, 0x06, 0x73, 0x74, 0x61, 0x74, 0x75, 0x73, 0x22, 0x83, + 0x01, 0x0a, 0x0e, 0x50, 0x6f, 0x6f, 0x6c, 0x47, 0x65, 0x74, 0x50, 0x72, 0x6f, 0x70, 0x52, 0x65, + 0x71, 0x12, 0x10, 0x0a, 0x03, 0x73, 0x79, 0x73, 0x18, 0x01, 0x20, 0x01, 0x28, 0x09, 0x52, 0x03, + 0x73, 0x79, 0x73, 0x12, 0x0e, 0x0a, 0x02, 0x69, 0x64, 0x18, 0x02, 0x20, 0x01, 0x28, 0x09, 0x52, + 0x02, 0x69, 0x64, 0x12, 0x32, 0x0a, 0x0a, 0x70, 0x72, 0x6f, 0x70, 0x65, 0x72, 0x74, 0x69, 0x65, + 0x73, 0x18, 0x03, 0x20, 0x03, 0x28, 0x0b, 0x32, 0x12, 0x2e, 0x6d, 0x67, 0x6d, 0x74, 0x2e, 0x50, + 0x6f, 0x6f, 0x6c, 0x50, 0x72, 0x6f, 0x70, 0x65, 0x72, 0x74, 0x79, 0x52, 0x0a, 0x70, 0x72, 0x6f, + 0x70, 0x65, 0x72, 0x74, 0x69, 0x65, 0x73, 0x12, 0x1b, 0x0a, 0x09, 0x73, 0x76, 0x63, 0x5f, 0x72, + 0x61, 0x6e, 0x6b, 0x73, 0x18, 0x04, 0x20, 0x03, 0x28, 0x0d, 0x52, 0x08, 0x73, 0x76, 0x63, 0x52, + 0x61, 0x6e, 0x6b, 0x73, 0x22, 0x5d, 0x0a, 0x0f, 0x50, 0x6f, 0x6f, 0x6c, 0x47, 0x65, 0x74, 0x50, + 0x72, 0x6f, 0x70, 0x52, 0x65, 0x73, 0x70, 0x12, 0x16, 0x0a, 0x06, 0x73, 0x74, 0x61, 0x74, 0x75, + 0x73, 0x18, 0x01, 0x20, 0x01, 0x28, 0x05, 0x52, 0x06, 0x73, 0x74, 0x61, 0x74, 0x75, 0x73, 0x12, + 0x32, 0x0a, 0x0a, 0x70, 0x72, 0x6f, 0x70, 0x65, 0x72, 0x74, 0x69, 0x65, 0x73, 0x18, 0x02, 0x20, + 0x03, 0x28, 0x0b, 0x32, 0x12, 0x2e, 0x6d, 0x67, 0x6d, 0x74, 0x2e, 0x50, 0x6f, 0x6f, 0x6c, 0x50, + 0x72, 0x6f, 0x70, 0x65, 0x72, 0x74, 0x79, 0x52, 0x0a, 0x70, 0x72, 0x6f, 0x70, 0x65, 0x72, 0x74, + 0x69, 0x65, 0x73, 0x22, 0x4f, 0x0a, 0x0e, 0x50, 0x6f, 0x6f, 0x6c, 0x55, 0x70, 0x67, 0x72, 0x61, + 0x64, 0x65, 0x52, 0x65, 0x71, 0x12, 0x10, 0x0a, 0x03, 0x73, 0x79, 0x73, 0x18, 0x01, 0x20, 0x01, 0x28, 0x09, 0x52, 0x03, 0x73, 0x79, 0x73, 0x12, 0x0e, 0x0a, 0x02, 0x69, 0x64, 0x18, 0x02, 0x20, - 0x01, 0x28, 0x09, 0x52, 0x02, 0x69, 0x64, 0x12, 0x12, 0x0a, 0x04, 0x72, 0x61, 0x6e, 0x6b, 0x18, - 0x03, 0x20, 0x01, 0x28, 0x0d, 0x52, 0x04, 0x72, 0x61, 0x6e, 0x6b, 0x12, 0x18, 0x0a, 0x07, 0x74, - 0x61, 0x72, 0x67, 0x65, 0x74, 0x73, 0x18, 0x04, 0x20, 0x03, 0x28, 0x0d, 0x52, 0x07, 0x74, 0x61, - 0x72, 0x67, 0x65, 0x74, 0x73, 0x12, 0x1b, 0x0a, 0x09, 0x73, 0x76, 0x63, 0x5f, 0x72, 0x61, 0x6e, - 0x6b, 0x73, 0x18, 0x05, 0x20, 0x03, 0x28, 
0x0d, 0x52, 0x08, 0x73, 0x76, 0x63, 0x52, 0x61, 0x6e, - 0x6b, 0x73, 0x22, 0x75, 0x0a, 0x12, 0x53, 0x74, 0x6f, 0x72, 0x61, 0x67, 0x65, 0x54, 0x61, 0x72, - 0x67, 0x65, 0x74, 0x55, 0x73, 0x61, 0x67, 0x65, 0x12, 0x14, 0x0a, 0x05, 0x74, 0x6f, 0x74, 0x61, - 0x6c, 0x18, 0x01, 0x20, 0x01, 0x28, 0x04, 0x52, 0x05, 0x74, 0x6f, 0x74, 0x61, 0x6c, 0x12, 0x12, - 0x0a, 0x04, 0x66, 0x72, 0x65, 0x65, 0x18, 0x02, 0x20, 0x01, 0x28, 0x04, 0x52, 0x04, 0x66, 0x72, - 0x65, 0x65, 0x12, 0x35, 0x0a, 0x0a, 0x6d, 0x65, 0x64, 0x69, 0x61, 0x5f, 0x74, 0x79, 0x70, 0x65, - 0x18, 0x03, 0x20, 0x01, 0x28, 0x0e, 0x32, 0x16, 0x2e, 0x6d, 0x67, 0x6d, 0x74, 0x2e, 0x53, 0x74, - 0x6f, 0x72, 0x61, 0x67, 0x65, 0x4d, 0x65, 0x64, 0x69, 0x61, 0x54, 0x79, 0x70, 0x65, 0x52, 0x09, - 0x6d, 0x65, 0x64, 0x69, 0x61, 0x54, 0x79, 0x70, 0x65, 0x22, 0xda, 0x02, 0x0a, 0x13, 0x50, 0x6f, - 0x6f, 0x6c, 0x51, 0x75, 0x65, 0x72, 0x79, 0x54, 0x61, 0x72, 0x67, 0x65, 0x74, 0x49, 0x6e, 0x66, - 0x6f, 0x12, 0x38, 0x0a, 0x04, 0x74, 0x79, 0x70, 0x65, 0x18, 0x01, 0x20, 0x01, 0x28, 0x0e, 0x32, - 0x24, 0x2e, 0x6d, 0x67, 0x6d, 0x74, 0x2e, 0x50, 0x6f, 0x6f, 0x6c, 0x51, 0x75, 0x65, 0x72, 0x79, - 0x54, 0x61, 0x72, 0x67, 0x65, 0x74, 0x49, 0x6e, 0x66, 0x6f, 0x2e, 0x54, 0x61, 0x72, 0x67, 0x65, - 0x74, 0x54, 0x79, 0x70, 0x65, 0x52, 0x04, 0x74, 0x79, 0x70, 0x65, 0x12, 0x3b, 0x0a, 0x05, 0x73, - 0x74, 0x61, 0x74, 0x65, 0x18, 0x02, 0x20, 0x01, 0x28, 0x0e, 0x32, 0x25, 0x2e, 0x6d, 0x67, 0x6d, - 0x74, 0x2e, 0x50, 0x6f, 0x6f, 0x6c, 0x51, 0x75, 0x65, 0x72, 0x79, 0x54, 0x61, 0x72, 0x67, 0x65, - 0x74, 0x49, 0x6e, 0x66, 0x6f, 0x2e, 0x54, 0x61, 0x72, 0x67, 0x65, 0x74, 0x53, 0x74, 0x61, 0x74, - 0x65, 0x52, 0x05, 0x73, 0x74, 0x61, 0x74, 0x65, 0x12, 0x2e, 0x0a, 0x05, 0x73, 0x70, 0x61, 0x63, - 0x65, 0x18, 0x03, 0x20, 0x03, 0x28, 0x0b, 0x32, 0x18, 0x2e, 0x6d, 0x67, 0x6d, 0x74, 0x2e, 0x53, - 0x74, 0x6f, 0x72, 0x61, 0x67, 0x65, 0x54, 0x61, 0x72, 0x67, 0x65, 0x74, 0x55, 0x73, 0x61, 0x67, - 0x65, 0x52, 0x05, 0x73, 0x70, 0x61, 0x63, 0x65, 0x22, 0x3b, 0x0a, 0x0a, 0x54, 0x61, 0x72, 0x67, - 0x65, 0x74, 0x54, 0x79, 0x70, 0x65, 0x12, 0x0b, 0x0a, 0x07, 0x55, 0x4e, 0x4b, 0x4e, 0x4f, 0x57, - 0x4e, 0x10, 0x00, 0x12, 0x07, 0x0a, 0x03, 0x48, 0x44, 0x44, 0x10, 0x01, 0x12, 0x07, 0x0a, 0x03, - 0x53, 0x53, 0x44, 0x10, 0x02, 0x12, 0x06, 0x0a, 0x02, 0x50, 0x4d, 0x10, 0x03, 0x12, 0x06, 0x0a, - 0x02, 0x56, 0x4d, 0x10, 0x04, 0x22, 0x5f, 0x0a, 0x0b, 0x54, 0x61, 0x72, 0x67, 0x65, 0x74, 0x53, - 0x74, 0x61, 0x74, 0x65, 0x12, 0x11, 0x0a, 0x0d, 0x53, 0x54, 0x41, 0x54, 0x45, 0x5f, 0x55, 0x4e, - 0x4b, 0x4e, 0x4f, 0x57, 0x4e, 0x10, 0x00, 0x12, 0x0c, 0x0a, 0x08, 0x44, 0x4f, 0x57, 0x4e, 0x5f, - 0x4f, 0x55, 0x54, 0x10, 0x01, 0x12, 0x08, 0x0a, 0x04, 0x44, 0x4f, 0x57, 0x4e, 0x10, 0x02, 0x12, - 0x06, 0x0a, 0x02, 0x55, 0x50, 0x10, 0x03, 0x12, 0x09, 0x0a, 0x05, 0x55, 0x50, 0x5f, 0x49, 0x4e, - 0x10, 0x04, 0x12, 0x07, 0x0a, 0x03, 0x4e, 0x45, 0x57, 0x10, 0x05, 0x12, 0x09, 0x0a, 0x05, 0x44, - 0x52, 0x41, 0x49, 0x4e, 0x10, 0x06, 0x22, 0x5e, 0x0a, 0x13, 0x50, 0x6f, 0x6f, 0x6c, 0x51, 0x75, - 0x65, 0x72, 0x79, 0x54, 0x61, 0x72, 0x67, 0x65, 0x74, 0x52, 0x65, 0x73, 0x70, 0x12, 0x16, 0x0a, - 0x06, 0x73, 0x74, 0x61, 0x74, 0x75, 0x73, 0x18, 0x01, 0x20, 0x01, 0x28, 0x05, 0x52, 0x06, 0x73, - 0x74, 0x61, 0x74, 0x75, 0x73, 0x12, 0x2f, 0x0a, 0x05, 0x69, 0x6e, 0x66, 0x6f, 0x73, 0x18, 0x02, - 0x20, 0x03, 0x28, 0x0b, 0x32, 0x19, 0x2e, 0x6d, 0x67, 0x6d, 0x74, 0x2e, 0x50, 0x6f, 0x6f, 0x6c, - 0x51, 0x75, 0x65, 0x72, 0x79, 0x54, 0x61, 0x72, 0x67, 0x65, 0x74, 0x49, 0x6e, 0x66, 0x6f, 0x52, - 0x05, 0x69, 0x6e, 0x66, 0x6f, 0x73, 0x2a, 0x25, 0x0a, 0x10, 0x53, 
0x74, 0x6f, 0x72, 0x61, 0x67, - 0x65, 0x4d, 0x65, 0x64, 0x69, 0x61, 0x54, 0x79, 0x70, 0x65, 0x12, 0x07, 0x0a, 0x03, 0x53, 0x43, - 0x4d, 0x10, 0x00, 0x12, 0x08, 0x0a, 0x04, 0x4e, 0x56, 0x4d, 0x45, 0x10, 0x01, 0x2a, 0x56, 0x0a, - 0x10, 0x50, 0x6f, 0x6f, 0x6c, 0x53, 0x65, 0x72, 0x76, 0x69, 0x63, 0x65, 0x53, 0x74, 0x61, 0x74, - 0x65, 0x12, 0x0c, 0x0a, 0x08, 0x43, 0x72, 0x65, 0x61, 0x74, 0x69, 0x6e, 0x67, 0x10, 0x00, 0x12, - 0x09, 0x0a, 0x05, 0x52, 0x65, 0x61, 0x64, 0x79, 0x10, 0x01, 0x12, 0x0e, 0x0a, 0x0a, 0x44, 0x65, - 0x73, 0x74, 0x72, 0x6f, 0x79, 0x69, 0x6e, 0x67, 0x10, 0x02, 0x12, 0x0c, 0x0a, 0x08, 0x44, 0x65, - 0x67, 0x72, 0x61, 0x64, 0x65, 0x64, 0x10, 0x03, 0x12, 0x0b, 0x0a, 0x07, 0x55, 0x6e, 0x6b, 0x6e, - 0x6f, 0x77, 0x6e, 0x10, 0x04, 0x42, 0x3a, 0x5a, 0x38, 0x67, 0x69, 0x74, 0x68, 0x75, 0x62, 0x2e, - 0x63, 0x6f, 0x6d, 0x2f, 0x64, 0x61, 0x6f, 0x73, 0x2d, 0x73, 0x74, 0x61, 0x63, 0x6b, 0x2f, 0x64, - 0x61, 0x6f, 0x73, 0x2f, 0x73, 0x72, 0x63, 0x2f, 0x63, 0x6f, 0x6e, 0x74, 0x72, 0x6f, 0x6c, 0x2f, - 0x63, 0x6f, 0x6d, 0x6d, 0x6f, 0x6e, 0x2f, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x2f, 0x6d, 0x67, 0x6d, - 0x74, 0x62, 0x06, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x33, + 0x01, 0x28, 0x09, 0x52, 0x02, 0x69, 0x64, 0x12, 0x1b, 0x0a, 0x09, 0x73, 0x76, 0x63, 0x5f, 0x72, + 0x61, 0x6e, 0x6b, 0x73, 0x18, 0x03, 0x20, 0x03, 0x28, 0x0d, 0x52, 0x08, 0x73, 0x76, 0x63, 0x52, + 0x61, 0x6e, 0x6b, 0x73, 0x22, 0x29, 0x0a, 0x0f, 0x50, 0x6f, 0x6f, 0x6c, 0x55, 0x70, 0x67, 0x72, + 0x61, 0x64, 0x65, 0x52, 0x65, 0x73, 0x70, 0x12, 0x16, 0x0a, 0x06, 0x73, 0x74, 0x61, 0x74, 0x75, + 0x73, 0x18, 0x01, 0x20, 0x01, 0x28, 0x05, 0x52, 0x06, 0x73, 0x74, 0x61, 0x74, 0x75, 0x73, 0x22, + 0x81, 0x01, 0x0a, 0x12, 0x50, 0x6f, 0x6f, 0x6c, 0x51, 0x75, 0x65, 0x72, 0x79, 0x54, 0x61, 0x72, + 0x67, 0x65, 0x74, 0x52, 0x65, 0x71, 0x12, 0x10, 0x0a, 0x03, 0x73, 0x79, 0x73, 0x18, 0x01, 0x20, + 0x01, 0x28, 0x09, 0x52, 0x03, 0x73, 0x79, 0x73, 0x12, 0x0e, 0x0a, 0x02, 0x69, 0x64, 0x18, 0x02, + 0x20, 0x01, 0x28, 0x09, 0x52, 0x02, 0x69, 0x64, 0x12, 0x12, 0x0a, 0x04, 0x72, 0x61, 0x6e, 0x6b, + 0x18, 0x03, 0x20, 0x01, 0x28, 0x0d, 0x52, 0x04, 0x72, 0x61, 0x6e, 0x6b, 0x12, 0x18, 0x0a, 0x07, + 0x74, 0x61, 0x72, 0x67, 0x65, 0x74, 0x73, 0x18, 0x04, 0x20, 0x03, 0x28, 0x0d, 0x52, 0x07, 0x74, + 0x61, 0x72, 0x67, 0x65, 0x74, 0x73, 0x12, 0x1b, 0x0a, 0x09, 0x73, 0x76, 0x63, 0x5f, 0x72, 0x61, + 0x6e, 0x6b, 0x73, 0x18, 0x05, 0x20, 0x03, 0x28, 0x0d, 0x52, 0x08, 0x73, 0x76, 0x63, 0x52, 0x61, + 0x6e, 0x6b, 0x73, 0x22, 0x75, 0x0a, 0x12, 0x53, 0x74, 0x6f, 0x72, 0x61, 0x67, 0x65, 0x54, 0x61, + 0x72, 0x67, 0x65, 0x74, 0x55, 0x73, 0x61, 0x67, 0x65, 0x12, 0x14, 0x0a, 0x05, 0x74, 0x6f, 0x74, + 0x61, 0x6c, 0x18, 0x01, 0x20, 0x01, 0x28, 0x04, 0x52, 0x05, 0x74, 0x6f, 0x74, 0x61, 0x6c, 0x12, + 0x12, 0x0a, 0x04, 0x66, 0x72, 0x65, 0x65, 0x18, 0x02, 0x20, 0x01, 0x28, 0x04, 0x52, 0x04, 0x66, + 0x72, 0x65, 0x65, 0x12, 0x35, 0x0a, 0x0a, 0x6d, 0x65, 0x64, 0x69, 0x61, 0x5f, 0x74, 0x79, 0x70, + 0x65, 0x18, 0x03, 0x20, 0x01, 0x28, 0x0e, 0x32, 0x16, 0x2e, 0x6d, 0x67, 0x6d, 0x74, 0x2e, 0x53, + 0x74, 0x6f, 0x72, 0x61, 0x67, 0x65, 0x4d, 0x65, 0x64, 0x69, 0x61, 0x54, 0x79, 0x70, 0x65, 0x52, + 0x09, 0x6d, 0x65, 0x64, 0x69, 0x61, 0x54, 0x79, 0x70, 0x65, 0x22, 0x80, 0x03, 0x0a, 0x13, 0x50, + 0x6f, 0x6f, 0x6c, 0x51, 0x75, 0x65, 0x72, 0x79, 0x54, 0x61, 0x72, 0x67, 0x65, 0x74, 0x49, 0x6e, + 0x66, 0x6f, 0x12, 0x38, 0x0a, 0x04, 0x74, 0x79, 0x70, 0x65, 0x18, 0x01, 0x20, 0x01, 0x28, 0x0e, + 0x32, 0x24, 0x2e, 0x6d, 0x67, 0x6d, 0x74, 0x2e, 0x50, 0x6f, 0x6f, 0x6c, 0x51, 0x75, 0x65, 0x72, + 0x79, 0x54, 0x61, 0x72, 0x67, 0x65, 
0x74, 0x49, 0x6e, 0x66, 0x6f, 0x2e, 0x54, 0x61, 0x72, 0x67, + 0x65, 0x74, 0x54, 0x79, 0x70, 0x65, 0x52, 0x04, 0x74, 0x79, 0x70, 0x65, 0x12, 0x3b, 0x0a, 0x05, + 0x73, 0x74, 0x61, 0x74, 0x65, 0x18, 0x02, 0x20, 0x01, 0x28, 0x0e, 0x32, 0x25, 0x2e, 0x6d, 0x67, + 0x6d, 0x74, 0x2e, 0x50, 0x6f, 0x6f, 0x6c, 0x51, 0x75, 0x65, 0x72, 0x79, 0x54, 0x61, 0x72, 0x67, + 0x65, 0x74, 0x49, 0x6e, 0x66, 0x6f, 0x2e, 0x54, 0x61, 0x72, 0x67, 0x65, 0x74, 0x53, 0x74, 0x61, + 0x74, 0x65, 0x52, 0x05, 0x73, 0x74, 0x61, 0x74, 0x65, 0x12, 0x2e, 0x0a, 0x05, 0x73, 0x70, 0x61, + 0x63, 0x65, 0x18, 0x03, 0x20, 0x03, 0x28, 0x0b, 0x32, 0x18, 0x2e, 0x6d, 0x67, 0x6d, 0x74, 0x2e, + 0x53, 0x74, 0x6f, 0x72, 0x61, 0x67, 0x65, 0x54, 0x61, 0x72, 0x67, 0x65, 0x74, 0x55, 0x73, 0x61, + 0x67, 0x65, 0x52, 0x05, 0x73, 0x70, 0x61, 0x63, 0x65, 0x12, 0x24, 0x0a, 0x0e, 0x6d, 0x65, 0x6d, + 0x5f, 0x66, 0x69, 0x6c, 0x65, 0x5f, 0x62, 0x79, 0x74, 0x65, 0x73, 0x18, 0x04, 0x20, 0x01, 0x28, + 0x04, 0x52, 0x0c, 0x6d, 0x65, 0x6d, 0x46, 0x69, 0x6c, 0x65, 0x42, 0x79, 0x74, 0x65, 0x73, 0x22, + 0x3b, 0x0a, 0x0a, 0x54, 0x61, 0x72, 0x67, 0x65, 0x74, 0x54, 0x79, 0x70, 0x65, 0x12, 0x0b, 0x0a, + 0x07, 0x55, 0x4e, 0x4b, 0x4e, 0x4f, 0x57, 0x4e, 0x10, 0x00, 0x12, 0x07, 0x0a, 0x03, 0x48, 0x44, + 0x44, 0x10, 0x01, 0x12, 0x07, 0x0a, 0x03, 0x53, 0x53, 0x44, 0x10, 0x02, 0x12, 0x06, 0x0a, 0x02, + 0x50, 0x4d, 0x10, 0x03, 0x12, 0x06, 0x0a, 0x02, 0x56, 0x4d, 0x10, 0x04, 0x22, 0x5f, 0x0a, 0x0b, + 0x54, 0x61, 0x72, 0x67, 0x65, 0x74, 0x53, 0x74, 0x61, 0x74, 0x65, 0x12, 0x11, 0x0a, 0x0d, 0x53, + 0x54, 0x41, 0x54, 0x45, 0x5f, 0x55, 0x4e, 0x4b, 0x4e, 0x4f, 0x57, 0x4e, 0x10, 0x00, 0x12, 0x0c, + 0x0a, 0x08, 0x44, 0x4f, 0x57, 0x4e, 0x5f, 0x4f, 0x55, 0x54, 0x10, 0x01, 0x12, 0x08, 0x0a, 0x04, + 0x44, 0x4f, 0x57, 0x4e, 0x10, 0x02, 0x12, 0x06, 0x0a, 0x02, 0x55, 0x50, 0x10, 0x03, 0x12, 0x09, + 0x0a, 0x05, 0x55, 0x50, 0x5f, 0x49, 0x4e, 0x10, 0x04, 0x12, 0x07, 0x0a, 0x03, 0x4e, 0x45, 0x57, + 0x10, 0x05, 0x12, 0x09, 0x0a, 0x05, 0x44, 0x52, 0x41, 0x49, 0x4e, 0x10, 0x06, 0x22, 0x5e, 0x0a, + 0x13, 0x50, 0x6f, 0x6f, 0x6c, 0x51, 0x75, 0x65, 0x72, 0x79, 0x54, 0x61, 0x72, 0x67, 0x65, 0x74, + 0x52, 0x65, 0x73, 0x70, 0x12, 0x16, 0x0a, 0x06, 0x73, 0x74, 0x61, 0x74, 0x75, 0x73, 0x18, 0x01, + 0x20, 0x01, 0x28, 0x05, 0x52, 0x06, 0x73, 0x74, 0x61, 0x74, 0x75, 0x73, 0x12, 0x2f, 0x0a, 0x05, + 0x69, 0x6e, 0x66, 0x6f, 0x73, 0x18, 0x02, 0x20, 0x03, 0x28, 0x0b, 0x32, 0x19, 0x2e, 0x6d, 0x67, + 0x6d, 0x74, 0x2e, 0x50, 0x6f, 0x6f, 0x6c, 0x51, 0x75, 0x65, 0x72, 0x79, 0x54, 0x61, 0x72, 0x67, + 0x65, 0x74, 0x49, 0x6e, 0x66, 0x6f, 0x52, 0x05, 0x69, 0x6e, 0x66, 0x6f, 0x73, 0x2a, 0x25, 0x0a, + 0x10, 0x53, 0x74, 0x6f, 0x72, 0x61, 0x67, 0x65, 0x4d, 0x65, 0x64, 0x69, 0x61, 0x54, 0x79, 0x70, + 0x65, 0x12, 0x07, 0x0a, 0x03, 0x53, 0x43, 0x4d, 0x10, 0x00, 0x12, 0x08, 0x0a, 0x04, 0x4e, 0x56, + 0x4d, 0x45, 0x10, 0x01, 0x2a, 0x56, 0x0a, 0x10, 0x50, 0x6f, 0x6f, 0x6c, 0x53, 0x65, 0x72, 0x76, + 0x69, 0x63, 0x65, 0x53, 0x74, 0x61, 0x74, 0x65, 0x12, 0x0c, 0x0a, 0x08, 0x43, 0x72, 0x65, 0x61, + 0x74, 0x69, 0x6e, 0x67, 0x10, 0x00, 0x12, 0x09, 0x0a, 0x05, 0x52, 0x65, 0x61, 0x64, 0x79, 0x10, + 0x01, 0x12, 0x0e, 0x0a, 0x0a, 0x44, 0x65, 0x73, 0x74, 0x72, 0x6f, 0x79, 0x69, 0x6e, 0x67, 0x10, + 0x02, 0x12, 0x0c, 0x0a, 0x08, 0x44, 0x65, 0x67, 0x72, 0x61, 0x64, 0x65, 0x64, 0x10, 0x03, 0x12, + 0x0b, 0x0a, 0x07, 0x55, 0x6e, 0x6b, 0x6e, 0x6f, 0x77, 0x6e, 0x10, 0x04, 0x42, 0x3a, 0x5a, 0x38, + 0x67, 0x69, 0x74, 0x68, 0x75, 0x62, 0x2e, 0x63, 0x6f, 0x6d, 0x2f, 0x64, 0x61, 0x6f, 0x73, 0x2d, + 0x73, 0x74, 0x61, 0x63, 0x6b, 0x2f, 0x64, 0x61, 0x6f, 0x73, 
0x2f, 0x73, 0x72, 0x63, 0x2f, 0x63, + 0x6f, 0x6e, 0x74, 0x72, 0x6f, 0x6c, 0x2f, 0x63, 0x6f, 0x6d, 0x6d, 0x6f, 0x6e, 0x2f, 0x70, 0x72, + 0x6f, 0x74, 0x6f, 0x2f, 0x6d, 0x67, 0x6d, 0x74, 0x62, 0x06, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x33, } var ( diff --git a/src/control/fault/code/codes.go b/src/control/fault/code/codes.go index e9d78fd6588..4d045f7cfce 100644 --- a/src/control/fault/code/codes.go +++ b/src/control/fault/code/codes.go @@ -154,6 +154,7 @@ const ( ServerNoCompatibilityInsecure ServerPoolHasContainers ServerHugepagesDisabled + ServerPoolMemRatioNoRoles ServerBadFaultDomainLabels ) diff --git a/src/control/lib/control/pool.go b/src/control/lib/control/pool.go index 4f29cb696e9..65b042ad406 100644 --- a/src/control/lib/control/pool.go +++ b/src/control/lib/control/pool.go @@ -41,6 +41,12 @@ const ( DefaultPoolTimeout = 5 * time.Minute ) +// Pool create error conditions. +var ( + errPoolCreateFirstTierZeroBytes = errors.New("can't create pool with 0 byte first tier") + errPoolCreateFirstTierRatioZero = errors.New("can't create pool with 0.0 first tier ratio") +) + // checkUUID is a helper function for validating that the supplied // UUID string parses as a valid UUID. func checkUUID(uuidStr string) error { @@ -217,19 +223,21 @@ type ( NumRanks uint32 `json:"num_ranks"` // Auto-sizing param Ranks []ranklist.Rank `json:"ranks"` // Manual-sizing param TierBytes []uint64 `json:"tier_bytes"` // Per-rank values + MemRatio float32 `json:"mem_ratio"` // mem_file_size:meta_blob_size } // PoolCreateResp contains the response from a pool create request. PoolCreateResp struct { - UUID string `json:"uuid"` - Leader uint32 `json:"svc_ldr"` - SvcReps []uint32 `json:"svc_reps"` - TgtRanks []uint32 `json:"tgt_ranks"` - TierBytes []uint64 `json:"tier_bytes"` // Per-rank storage tier sizes + UUID string `json:"uuid"` + Leader uint32 `json:"svc_ldr"` + SvcReps []uint32 `json:"svc_reps"` + TgtRanks []uint32 `json:"tgt_ranks"` + TierBytes []uint64 `json:"tier_bytes"` // Per-rank storage tier sizes. + MemFileBytes uint64 `json:"mem_file_bytes"` // Per-rank. MD-on-SSD mode only. } ) -type maxPoolSizeGetter func() (uint64, uint64, error) +type maxPoolSizeGetter func(*PoolCreateReq) (uint64, uint64, error) func poolCreateReqChkSizes(log debugLogger, getMaxPoolSz maxPoolSizeGetter, req *PoolCreateReq) error { hasTotBytes := req.TotalBytes > 0 @@ -241,14 +249,14 @@ func poolCreateReqChkSizes(log debugLogger, getMaxPoolSz maxPoolSizeGetter, req switch { case hasTierBytes && hasNoTierRatio && !hasTotBytes: if req.TierBytes[0] == 0 { - return errors.New("can't create pool with 0 SCM") + return errPoolCreateFirstTierZeroBytes } // Storage sizes have been written to TierBytes in request (manual-size). log.Debugf("manual-size pool create mode: %+v", req) case hasNoTierBytes && hasTierRatio && hasTotBytes: if req.TierRatio[0] == 0 { - return errors.New("can't create pool with 0.0 SCM ratio") + return errPoolCreateFirstTierRatioZero } // Storage tier ratios and total pool size given, distribution of space across // ranks to be calculated on the server side (auto-total-size). 
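// Illustrative sketch only (not the DAOS implementation): the three pool-create
// sizing modes implied by the poolCreateReqChkSizes changes above. Names such as
// sizeReq and pickSizingMode are hypothetical; the real request type is
// PoolCreateReq and the real checks also normalize tier ratios and query
// available capacity through the maxPoolSizeGetter callback.
package main

import (
	"errors"
	"fmt"
)

type sizeReq struct {
	TierBytes  []uint64  // Per-rank, per-tier byte counts (manual sizing).
	TierRatio  []float64 // Fractions of total/available space (auto sizing).
	TotalBytes uint64    // Whole-pool size (auto-total sizing).
}

// pickSizingMode mirrors the switch in poolCreateReqChkSizes: exactly one of
// the three parameter combinations is accepted, anything else is rejected.
func pickSizingMode(r sizeReq) (string, error) {
	hasBytes := len(r.TierBytes) > 0
	hasRatio := len(r.TierRatio) > 0
	hasTotal := r.TotalBytes > 0

	switch {
	case hasBytes && !hasRatio && !hasTotal:
		return "manual-size", nil // Sizes given explicitly per tier.
	case !hasBytes && hasRatio && hasTotal:
		return "auto-total-size", nil // Server splits TotalBytes by ratio.
	case !hasBytes && hasRatio && !hasTotal:
		return "auto-percentage-size", nil // Fraction of available capacity.
	default:
		return "", errors.New("unsupported combination of size parameters")
	}
}

func main() {
	mode, err := pickSizingMode(sizeReq{TierRatio: []float64{0.06, 0.94}})
	fmt.Println(mode, err) // auto-percentage-size <nil>
}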
@@ -256,7 +264,7 @@ func poolCreateReqChkSizes(log debugLogger, getMaxPoolSz maxPoolSizeGetter, req case hasNoTierBytes && hasTierRatio && !hasTotBytes: if req.TierRatio[0] == 0 { - return errors.New("can't create pool with 0.0 SCM ratio") + return errPoolCreateFirstTierRatioZero } availRatio := req.TierRatio[0] if req.TierRatio[1] != availRatio { @@ -265,7 +273,7 @@ func poolCreateReqChkSizes(log debugLogger, getMaxPoolSz maxPoolSizeGetter, req req.TierRatio = nil // Storage tier ratios specified without a total size, use specified fraction of // available space (auto-percentage-size). - scmBytes, nvmeBytes, err := getMaxPoolSz() + scmBytes, nvmeBytes, err := getMaxPoolSz(req) if err != nil { return err } @@ -294,8 +302,8 @@ func poolCreateGenPBReq(ctx context.Context, rpcClient UnaryInvoker, in *PoolCre return } - getMaxPoolSz := func() (uint64, uint64, error) { - return getMaxPoolSize(ctx, rpcClient, ranklist.RankList(in.Ranks)) + getMaxPoolSz := func(createReq *PoolCreateReq) (uint64, uint64, error) { + return getMaxPoolSize(ctx, rpcClient, createReq) } if err = poolCreateReqChkSizes(rpcClient, getMaxPoolSz, in); err != nil { @@ -594,14 +602,14 @@ func convertPoolTargetInfo(pbInfo *mgmtpb.PoolQueryTargetInfo) (*daos.PoolQueryT pqti.State = daos.PoolQueryTargetState(pbInfo.State) pqti.Space = []*daos.StorageUsageStats{ { - Total: uint64(pbInfo.Space[daos.StorageMediaTypeScm].Total), - Free: uint64(pbInfo.Space[daos.StorageMediaTypeScm].Free), - MediaType: daos.StorageMediaTypeScm, + Total: uint64(pbInfo.Space[0].Total), + Free: uint64(pbInfo.Space[0].Free), + MediaType: daos.StorageMediaType(pbInfo.Space[0].MediaType), }, { - Total: uint64(pbInfo.Space[daos.StorageMediaTypeNvme].Total), - Free: uint64(pbInfo.Space[daos.StorageMediaTypeNvme].Free), - MediaType: daos.StorageMediaTypeNvme, + Total: uint64(pbInfo.Space[1].Total), + Free: uint64(pbInfo.Space[1].Free), + MediaType: daos.StorageMediaType(pbInfo.Space[1].MediaType), }, } @@ -1040,6 +1048,7 @@ func newFilterRankFunc(ranks ranklist.RankList) filterRankFn { func processSCMSpaceStats(log debugLogger, filterRank filterRankFn, scmNamespaces storage.ScmNamespaces, rankNVMeFreeSpace rankFreeSpaceMap) (uint64, error) { scmBytes := uint64(math.MaxUint64) + // Realistically there should only be one-per-rank but handle the case for multiple anyway. 
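// Hedged sketch (not the production loop that follows): judging by the test
// expectations in pool_test.go further down, each rank is expected to expose a
// single mounted SCM namespace, and duplicates or unmounted namespaces are
// treated as errors. The types below are hypothetical stand-ins for
// storage.ScmNamespace and the per-rank accounting done by processSCMSpaceStats.
package main

import "fmt"

type scmNamespace struct {
	Rank        uint32
	Mounted     bool
	UsableBytes uint64
}

// usableSCMPerRank records each rank's usable SCM size, rejecting unmounted
// namespaces and ranks that report more than one namespace.
func usableSCMPerRank(namespaces []scmNamespace) (map[uint32]uint64, error) {
	out := make(map[uint32]uint64)
	for _, ns := range namespaces {
		if !ns.Mounted {
			return nil, fmt.Errorf("SCM namespace on rank %d is not mounted", ns.Rank)
		}
		if _, dup := out[ns.Rank]; dup {
			return nil, fmt.Errorf("multiple SCM devices found for rank %d", ns.Rank)
		}
		out[ns.Rank] = ns.UsableBytes
	}
	return out, nil
}

func main() {
	sizes, err := usableSCMPerRank([]scmNamespace{
		{Rank: 0, Mounted: true, UsableBytes: 100 << 30},
		{Rank: 1, Mounted: true, UsableBytes: 50 << 30},
	})
	fmt.Println(sizes, err)
}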
for _, scmNamespace := range scmNamespaces { if scmNamespace.Mount == nil { return 0, errors.Errorf("SCM device %s (bdev %s, name %s) is not mounted", @@ -1075,12 +1084,17 @@ func processSCMSpaceStats(log debugLogger, filterRank filterRankFn, scmNamespace func processNVMeSpaceStats(log debugLogger, filterRank filterRankFn, nvmeControllers storage.NvmeControllers, rankNVMeFreeSpace rankFreeSpaceMap) error { for _, controller := range nvmeControllers { for _, smdDevice := range controller.SmdDevices { - msgDev := fmt.Sprintf("SMD device %s (rank %d, ctrlr %s)", smdDevice.UUID, + msgDev := fmt.Sprintf("SMD device %s (rank %d, ctrlr %s", smdDevice.UUID, smdDevice.Rank, controller.PciAddr) - if !smdDevice.Roles.IsEmpty() && (smdDevice.Roles.OptionBits&storage.BdevRoleData) == 0 { - log.Debugf("Skipping %s, not used for storing data", msgDev) - continue + if smdDevice.Roles.IsEmpty() { + msgDev += ")" + } else { + msgDev += fmt.Sprintf(", roles %q)", smdDevice.Roles.String()) + if !smdDevice.Roles.HasData() { + log.Debugf("skipping %s, not used for storing data", msgDev) + continue + } } if controller.NvmeState == storage.NvmeStateNew { @@ -1114,31 +1128,59 @@ func processNVMeSpaceStats(log debugLogger, filterRank filterRankFn, nvmeControl } // Return the maximal SCM and NVMe size of a pool which could be created with all the storage nodes. -func getMaxPoolSize(ctx context.Context, rpcClient UnaryInvoker, ranks ranklist.RankList) (uint64, uint64, error) { +func getMaxPoolSize(ctx context.Context, rpcClient UnaryInvoker, createReq *PoolCreateReq) (uint64, uint64, error) { + isMdOnSsdEnabled := func(log debugLogger, hsm HostStorageMap) bool { + for _, hss := range hsm { + hs := hss.HostStorage + if hs == nil { + continue + } + nvme := hs.NvmeDevices + if nvme.Len() > 0 && !nvme[0].Roles().IsEmpty() { + log.Debugf("fetch max pool size in md-on-size mode") + return true + } + } + + return false + } + + if createReq.MemRatio < 0 { + return 0, 0, errors.New("invalid mem-ratio, should be greater than zero") + } + if createReq.MemRatio > 1 { + return 0, 0, errors.New("invalid mem-ratio, should not be greater than one") + } + // Verify that the DAOS system is ready before attempting to query storage. if _, err := SystemQuery(ctx, rpcClient, &SystemQueryReq{}); err != nil { return 0, 0, err } - resp, err := StorageScan(ctx, rpcClient, &StorageScanReq{Usage: true}) + scanReq := &StorageScanReq{ + Usage: true, + MemRatio: createReq.MemRatio, + } + + scanResp, err := StorageScan(ctx, rpcClient, scanReq) if err != nil { return 0, 0, err } - if len(resp.HostStorage) == 0 { + if len(scanResp.HostStorage) == 0 { return 0, 0, errors.New("Empty host storage response from StorageScan") } // Generate function to verify a rank is in the provided rank slice. 
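// Hedged sketch of the rank-filter idea referenced in the comment above: a
// closure built once from the requested rank list, returning true for every
// rank when no list is given. This mirrors newFilterRankFunc in spirit only;
// the real helper operates on ranklist.RankList values.
package main

import "fmt"

type filterRankFn func(rank uint32) bool

func newRankFilter(ranks []uint32) filterRankFn {
	if len(ranks) == 0 {
		return func(uint32) bool { return true } // No list given: accept all ranks.
	}
	allowed := make(map[uint32]struct{}, len(ranks))
	for _, r := range ranks {
		allowed[r] = struct{}{}
	}
	return func(rank uint32) bool {
		_, ok := allowed[rank]
		return ok
	}
}

func main() {
	filter := newRankFilter([]uint32{0, 1, 2, 4})
	fmt.Println(filter(2), filter(3)) // true false
}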
- filterRank := newFilterRankFunc(ranks) + filterRank := newFilterRankFunc(ranklist.RankList(createReq.Ranks)) rankNVMeFreeSpace := make(rankFreeSpaceMap) scmBytes := uint64(math.MaxUint64) - for _, key := range resp.HostStorage.Keys() { - hostStorage := resp.HostStorage[key].HostStorage + for _, key := range scanResp.HostStorage.Keys() { + hostStorage := scanResp.HostStorage[key].HostStorage if hostStorage.ScmNamespaces.Usable() == 0 { return 0, 0, errors.Errorf("Host without SCM storage: hostname=%s", - resp.HostStorage[key].HostSet.String()) + scanResp.HostStorage[key].HostSet.String()) } sb, err := processSCMSpaceStats(rpcClient, filterRank, hostStorage.ScmNamespaces, rankNVMeFreeSpace) @@ -1156,7 +1198,8 @@ func getMaxPoolSize(ctx context.Context, rpcClient UnaryInvoker, ranks ranklist. } if scmBytes == math.MaxUint64 { - return 0, 0, errors.Errorf("No SCM storage space available with rank list %s", ranks) + return 0, 0, errors.Errorf("No SCM storage space available with rank list %q", + createReq.Ranks) } nvmeBytes := uint64(math.MaxUint64) @@ -1166,8 +1209,27 @@ func getMaxPoolSize(ctx context.Context, rpcClient UnaryInvoker, ranks ranklist. } } - rpcClient.Debugf("Maximal size of a pool: scmBytes=%s (%d B) nvmeBytes=%s (%d B)", - humanize.Bytes(scmBytes), scmBytes, humanize.Bytes(nvmeBytes), nvmeBytes) + if !isMdOnSsdEnabled(rpcClient, scanResp.HostStorage) { + rpcClient.Debugf("Maximal size of a pool: scmBytes=%s (%d B) nvmeBytes=%s (%d B)", + humanize.Bytes(scmBytes), scmBytes, humanize.Bytes(nvmeBytes), nvmeBytes) + + return scmBytes, nvmeBytes, nil + } + + // In MD-on-SSD mode calculate metaBytes based on the minimum ramdisk (called scm here) + // availability across ranks. NVMe sizes returned in StorageScan response at the beginning + // of this function have been adjusted based on SSD bdev roles and MemRatio passed in the + // scan request. The rationale behind deriving pool sizes from ramdisk availability is that + // this is more likely to be the limiting factor than SSD usage. 
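// Worked sketch of the MD-on-SSD META derivation performed just below: the
// minimum per-rank ramdisk capacity is scaled up by the requested
// mem-file:meta-blob ratio, with 0 treated as 1.0 (phase-1 behaviour) and
// values outside (0, 1] rejected. maxMetaBytes is a hypothetical helper, not
// part of the control API.
package main

import (
	"errors"
	"fmt"
)

func maxMetaBytes(ramdiskBytes uint64, memRatio float64) (uint64, error) {
	switch {
	case memRatio < 0:
		return 0, errors.New("invalid mem-ratio, should be greater than zero")
	case memRatio > 1:
		return 0, errors.New("invalid mem-ratio, should not be greater than one")
	case memRatio == 0:
		memRatio = 1 // Default: mem-file size equals meta-blob size.
	}
	return uint64(float64(ramdiskBytes) / memRatio), nil
}

func main() {
	// 100 GB of ramdisk with a 0.5 mem-ratio allows a 200 GB META component,
	// matching the "phase-2 mode" expectation in the tests further down.
	meta, err := maxMetaBytes(100_000_000_000, 0.5)
	fmt.Println(meta, err) // 200000000000 <nil>
}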
+ if createReq.MemRatio == 0 { + createReq.MemRatio = 1 + } + metaBytes := uint64(float64(scmBytes) / float64(createReq.MemRatio)) + + rpcClient.Debugf("With minimum available ramdisk capacity of %s and mem-ratio %.2f,"+ + " the maximum per-rank sizes for a pool are META=%s (%d B) and DATA=%s (%d B)", + humanize.Bytes(scmBytes), createReq.MemRatio, humanize.Bytes(metaBytes), + metaBytes, humanize.Bytes(nvmeBytes), nvmeBytes) - return scmBytes, nvmeBytes, nil + return metaBytes, nvmeBytes, nil } diff --git a/src/control/lib/control/pool_test.go b/src/control/lib/control/pool_test.go index 7e342d95be8..9e0d557c490 100644 --- a/src/control/lib/control/pool_test.go +++ b/src/control/lib/control/pool_test.go @@ -368,6 +368,7 @@ func TestControl_PoolCreateReq_Convert(t *testing.T) { NumRanks: 3, Ranks: []ranklist.Rank{1, 2, 3}, TierBytes: []uint64{humanize.GiByte, 10 * humanize.GiByte}, + MemRatio: 0.55, Properties: []*daos.PoolProperty{ { Name: "label", @@ -389,6 +390,7 @@ func TestControl_PoolCreateReq_Convert(t *testing.T) { NumRanks: 3, Ranks: []uint32{1, 2, 3}, TierBytes: []uint64{humanize.GiByte, 10 * humanize.GiByte}, + MemRatio: 0.55, Properties: []*mgmtpb.PoolProperty{ {Number: 1, Value: &mgmtpb.PoolProperty_Strval{"foo"}}, }, @@ -481,7 +483,7 @@ func TestControl_poolCreateReqChkSizes(t *testing.T) { defer test.ShowBufferOnFailure(t, buf) nrGetMaxCalls := 0 - getMaxPoolSz := func() (uint64, uint64, error) { + getMaxPoolSz := func(createReq *PoolCreateReq) (uint64, uint64, error) { nrGetMaxCalls++ return tc.getMaxScm, tc.getMaxNvme, tc.getMaxErr } @@ -840,7 +842,7 @@ func TestControl_PoolQueryResp_MarshalJSON(t *testing.T) { UpgradeLayoutVer: 8, }, }, - exp: `{"query_mask":"disabled_engines,rebuild,space","state":"Ready","uuid":"` + poolUUID.String() + `","total_targets":1,"active_targets":2,"total_engines":3,"disabled_targets":4,"version":5,"svc_ldr":6,"svc_reps":[0,1,2],"rebuild":null,"tier_stats":null,"pool_layout_ver":7,"upgrade_layout_ver":8,"status":42}`, + exp: `{"query_mask":"disabled_engines,rebuild,space","state":"Ready","uuid":"` + poolUUID.String() + `","total_targets":1,"active_targets":2,"total_engines":3,"disabled_targets":4,"version":5,"svc_ldr":6,"svc_reps":[0,1,2],"rebuild":null,"tier_stats":null,"pool_layout_ver":7,"upgrade_layout_ver":8,"mem_file_bytes":0,"status":42}`, }, "valid rankset": { pqr: &PoolQueryResp{ @@ -860,9 +862,10 @@ func TestControl_PoolQueryResp_MarshalJSON(t *testing.T) { DisabledRanks: &ranklist.RankSet{}, PoolLayoutVer: 7, UpgradeLayoutVer: 8, + MemFileBytes: 1000, }, }, - exp: `{"query_mask":"disabled_engines,rebuild,space","state":"Ready","uuid":"` + poolUUID.String() + `","total_targets":1,"active_targets":2,"total_engines":3,"disabled_targets":4,"version":5,"svc_ldr":6,"svc_reps":[0,1,2],"rebuild":null,"tier_stats":null,"enabled_ranks":[0,1,2,3,5],"disabled_ranks":[],"pool_layout_ver":7,"upgrade_layout_ver":8,"status":42}`, + exp: `{"query_mask":"disabled_engines,rebuild,space","state":"Ready","uuid":"` + poolUUID.String() + `","total_targets":1,"active_targets":2,"total_engines":3,"disabled_targets":4,"version":5,"svc_ldr":6,"svc_reps":[0,1,2],"rebuild":null,"tier_stats":null,"enabled_ranks":[0,1,2,3,5],"disabled_ranks":[],"pool_layout_ver":7,"upgrade_layout_ver":8,"mem_file_bytes":1000,"status":42}`, }, } { t.Run(name, func(t *testing.T) { @@ -904,7 +907,7 @@ func TestControl_PoolQueryResp_UnmarshalJSON(t *testing.T) { }, }, "valid rankset": { - data: `{"enabled_ranks":"[0,1-3,5]","disabled_ranks":"[]","status":0,"uuid":"` + 
poolUUID.String() + `","total_targets":1,"active_targets":2,"total_engines":3,"disabled_targets":4,"version":5,"svc_ldr":6,"svc_reps":null,"rebuild":null,"tier_stats":null,"pool_layout_ver":7,"upgrade_layout_ver":8}`, + data: `{"enabled_ranks":"[0,1-3,5]","disabled_ranks":"[]","status":0,"uuid":"` + poolUUID.String() + `","total_targets":1,"active_targets":2,"total_engines":3,"disabled_targets":4,"version":5,"svc_ldr":6,"svc_reps":null,"rebuild":null,"tier_stats":null,"pool_layout_ver":7,"upgrade_layout_ver":8,"mem_file_bytes":1000}`, expResp: PoolQueryResp{ Status: 0, PoolInfo: daos.PoolInfo{ @@ -919,6 +922,7 @@ func TestControl_PoolQueryResp_UnmarshalJSON(t *testing.T) { DisabledRanks: &ranklist.RankSet{}, PoolLayoutVer: 7, UpgradeLayoutVer: 8, + MemFileBytes: 1000, }, }, }, @@ -2064,205 +2068,196 @@ func TestControl_ListPools(t *testing.T) { } } +// Helper to generate typical SCM configs with rank and optional size params. +func newScmCfg(rank int, size ...uint64) MockScmConfig { + sz := uint64(100) * humanize.GByte + if len(size) > 0 { + sz = size[0] + } + return MockScmConfig{ + MockStorageConfig: MockStorageConfig{ + TotalBytes: sz, + AvailBytes: sz, + UsableBytes: sz, + }, + Rank: ranklist.Rank(rank), + } +} + +// Helper to generate typical NVMe configs with rank, roles and optional size params. +func newNvmeCfg(rank int, roles storage.OptionBits, size ...uint64) MockNvmeConfig { + sz := uint64(humanize.TByte) + if len(size) > 0 { + sz = size[0] + } + return MockNvmeConfig{ + MockStorageConfig: MockStorageConfig{ + TotalBytes: sz, + AvailBytes: sz, + UsableBytes: sz, + NvmeRole: &storage.BdevRoles{OptionBits: roles}, + }, + Rank: ranklist.Rank(rank), + } +} + func TestControl_getMaxPoolSize(t *testing.T) { devStateFaulty := storage.NvmeStateFaulty devStateNew := storage.NvmeStateNew - type ExpectedOutput struct { - ScmBytes uint64 - NvmeBytes uint64 - Error error - QueryError error - Debug string - } for name, tc := range map[string]struct { - HostsConfigArray []MockHostStorageConfig - TgtRanks []ranklist.Rank - ExpectedOutput ExpectedOutput + hostsConfigArray []MockHostStorageConfig + tgtRanks []ranklist.Rank + memRatio float32 + queryError error + expScmBytes uint64 + expNvmeBytes uint64 + expError error + expDebug string }{ "single server": { - HostsConfigArray: []MockHostStorageConfig{ + hostsConfigArray: []MockHostStorageConfig{ { - HostName: "foo", - ScmConfig: []MockScmConfig{ - { - MockStorageConfig: MockStorageConfig{ - TotalBytes: 100 * humanize.GByte, - AvailBytes: 100 * humanize.GByte, - UsableBytes: 100 * humanize.GByte, - }, - Rank: 0, - }, - }, + HostName: "foo", + ScmConfig: []MockScmConfig{newScmCfg(0)}, + NvmeConfig: []MockNvmeConfig{newNvmeCfg(0, 0)}, + }, + }, + expScmBytes: 100 * humanize.GByte, + expNvmeBytes: humanize.TByte, + }, + "single MD-on-SSD server; no mem-ratio specified; defaults to 1.0": { + hostsConfigArray: []MockHostStorageConfig{ + { + HostName: "foo", + ScmConfig: []MockScmConfig{newScmCfg(0)}, NvmeConfig: []MockNvmeConfig{ - { - MockStorageConfig: MockStorageConfig{ - TotalBytes: 1 * humanize.TByte, - AvailBytes: 1 * humanize.TByte, - UsableBytes: 1 * humanize.TByte, - }, - Rank: 0, - }, + newNvmeCfg(0, storage.BdevRoleData), + newNvmeCfg(0, + storage.BdevRoleWAL|storage.BdevRoleMeta, + 2*humanize.TByte), }, }, }, - ExpectedOutput: ExpectedOutput{ - ScmBytes: 100 * humanize.GByte, - NvmeBytes: 1 * humanize.TByte, - }, + expScmBytes: 100 * humanize.GByte, + expNvmeBytes: humanize.TByte, }, - "single MD-on-SSD server": { - HostsConfigArray: 
[]MockHostStorageConfig{ + "single MD-on-SSD server; invalid mem-ratio; high": { + hostsConfigArray: []MockHostStorageConfig{ { - HostName: "foo", - ScmConfig: []MockScmConfig{ - { - MockStorageConfig: MockStorageConfig{ - TotalBytes: 100 * humanize.GByte, - AvailBytes: 100 * humanize.GByte, - UsableBytes: 100 * humanize.GByte, - }, - Rank: 0, - }, - }, + HostName: "foo", + ScmConfig: []MockScmConfig{newScmCfg(0)}, NvmeConfig: []MockNvmeConfig{ - { - MockStorageConfig: MockStorageConfig{ - TotalBytes: 1 * humanize.TByte, - AvailBytes: 1 * humanize.TByte, - UsableBytes: 1 * humanize.TByte, - NvmeRole: &storage.BdevRoles{ - storage.OptionBits(storage.BdevRoleData), - }, - }, - Rank: 0, - }, - { - MockStorageConfig: MockStorageConfig{ - TotalBytes: 2 * humanize.TByte, - AvailBytes: 2 * humanize.TByte, - UsableBytes: 2 * humanize.TByte, - NvmeRole: &storage.BdevRoles{ - storage.OptionBits(storage.BdevRoleWAL | storage.BdevRoleMeta), - }, - }, - Rank: 0, - }, + newNvmeCfg(0, storage.BdevRoleData), + newNvmeCfg(0, + storage.BdevRoleWAL|storage.BdevRoleMeta, + 2*humanize.TByte), }, }, }, - ExpectedOutput: ExpectedOutput{ - ScmBytes: 100 * humanize.GByte, - NvmeBytes: 1 * humanize.TByte, + memRatio: 1.1, + expError: errors.New("invalid mem-ratio"), + }, + "single MD-on-SSD server; invalid mem-ratio; low": { + hostsConfigArray: []MockHostStorageConfig{ + { + HostName: "foo", + ScmConfig: []MockScmConfig{newScmCfg(0)}, + NvmeConfig: []MockNvmeConfig{ + newNvmeCfg(0, storage.BdevRoleData), + newNvmeCfg(0, + storage.BdevRoleWAL|storage.BdevRoleMeta, + 2*humanize.TByte), + }, + }, }, + memRatio: -1.1, + expError: errors.New("invalid mem-ratio"), }, - "single Ephemeral server": { - HostsConfigArray: []MockHostStorageConfig{ + "single MD-on-SSD server; phase-1 mode (mem-file-sz == meta-blob-sz)": { + hostsConfigArray: []MockHostStorageConfig{ { - HostName: "foo", - ScmConfig: []MockScmConfig{ - { - MockStorageConfig: MockStorageConfig{ - TotalBytes: 100 * humanize.GByte, - AvailBytes: 100 * humanize.GByte, - UsableBytes: 100 * humanize.GByte, - }, - Rank: 0, - }, + HostName: "foo", + ScmConfig: []MockScmConfig{newScmCfg(0)}, + NvmeConfig: []MockNvmeConfig{ + newNvmeCfg(0, storage.BdevRoleData), + newNvmeCfg(0, + storage.BdevRoleWAL|storage.BdevRoleMeta, + 2*humanize.TByte), }, + }, + }, + memRatio: 1, + expScmBytes: 100 * humanize.GByte, + expNvmeBytes: humanize.TByte, + }, + "single MD-on-SSD server; phase-2 mode (mem-file-sz < meta-blob-sz)": { + hostsConfigArray: []MockHostStorageConfig{ + { + HostName: "foo", + ScmConfig: []MockScmConfig{newScmCfg(0)}, NvmeConfig: []MockNvmeConfig{ - { - MockStorageConfig: MockStorageConfig{ - TotalBytes: 1 * humanize.TByte, - AvailBytes: 1 * humanize.TByte, - UsableBytes: 1 * humanize.TByte, - NvmeRole: &storage.BdevRoles{storage.OptionBits(0)}, - }, - Rank: 0, - }, + newNvmeCfg(0, storage.BdevRoleData), + newNvmeCfg(0, + storage.BdevRoleWAL|storage.BdevRoleMeta, + 2*humanize.TByte), }, }, }, - ExpectedOutput: ExpectedOutput{ - ScmBytes: 100 * humanize.GByte, - NvmeBytes: 1 * humanize.TByte, + memRatio: 0.5, + expScmBytes: 200 * humanize.GByte, // Double meta-blob-sz due to mem-ratio. 
+ expNvmeBytes: humanize.TByte, + }, + "single ephemeral server": { + hostsConfigArray: []MockHostStorageConfig{ + { + HostName: "foo", + ScmConfig: []MockScmConfig{newScmCfg(0)}, + NvmeConfig: []MockNvmeConfig{newNvmeCfg(0, 0)}, + }, }, + expScmBytes: 100 * humanize.GByte, + expNvmeBytes: humanize.TByte, }, "double server": { - HostsConfigArray: []MockHostStorageConfig{ + hostsConfigArray: []MockHostStorageConfig{ { - HostName: "foo", - ScmConfig: []MockScmConfig{ - { - MockStorageConfig: MockStorageConfig{ - TotalBytes: 100 * humanize.GByte, - AvailBytes: 100 * humanize.GByte, - UsableBytes: 100 * humanize.GByte, - }, - Rank: 0, - }, - }, - NvmeConfig: []MockNvmeConfig{ - { - MockStorageConfig: MockStorageConfig{ - TotalBytes: 1 * humanize.TByte, - AvailBytes: 1 * humanize.TByte, - UsableBytes: 1 * humanize.TByte, - }, - Rank: 0, - }, - }, + HostName: "foo", + ScmConfig: []MockScmConfig{newScmCfg(0)}, + NvmeConfig: []MockNvmeConfig{newNvmeCfg(0, 0)}, }, { HostName: "bar[1,3]", ScmConfig: []MockScmConfig{ + newScmCfg(1, humanize.TByte), + newScmCfg(2), { MockStorageConfig: MockStorageConfig{ - TotalBytes: 1 * humanize.TByte, - AvailBytes: 1 * humanize.TByte, - UsableBytes: 1 * humanize.TByte, - }, - Rank: 1, - }, - { - MockStorageConfig: MockStorageConfig{ - TotalBytes: 100 * humanize.GByte, - AvailBytes: 100 * humanize.GByte, - UsableBytes: 100 * humanize.GByte, - }, - Rank: 2, - }, - { - MockStorageConfig: MockStorageConfig{ - TotalBytes: 1 * humanize.TByte, + TotalBytes: humanize.TByte, AvailBytes: 50 * humanize.GByte, UsableBytes: 50 * humanize.GByte, + NvmeRole: &storage.BdevRoles{}, }, Rank: 3, }, }, NvmeConfig: []MockNvmeConfig{ + newNvmeCfg(1, 0), { MockStorageConfig: MockStorageConfig{ - TotalBytes: 1 * humanize.TByte, - AvailBytes: 1 * humanize.TByte, - UsableBytes: 1 * humanize.TByte, - }, - Rank: 1, - }, - { - MockStorageConfig: MockStorageConfig{ - TotalBytes: 1 * humanize.TByte, + TotalBytes: humanize.TByte, AvailBytes: 400 * humanize.GByte, UsableBytes: 400 * humanize.GByte, + NvmeRole: &storage.BdevRoles{}, }, Rank: 2, }, { MockStorageConfig: MockStorageConfig{ - TotalBytes: 1 * humanize.TByte, + TotalBytes: humanize.TByte, AvailBytes: 300 * humanize.GByte, UsableBytes: 300 * humanize.GByte, + NvmeRole: &storage.BdevRoles{}, }, Rank: 2, }, @@ -2271,124 +2266,67 @@ func TestControl_getMaxPoolSize(t *testing.T) { TotalBytes: 3 * humanize.TByte, AvailBytes: 2 * humanize.TByte, UsableBytes: 2 * humanize.TByte, + NvmeRole: &storage.BdevRoles{}, }, Rank: 3, }, }, }, }, - ExpectedOutput: ExpectedOutput{ - ScmBytes: 50 * humanize.GByte, - NvmeBytes: 700 * humanize.GByte, - }, + expScmBytes: 50 * humanize.GByte, + expNvmeBytes: 700 * humanize.GByte, }, "double server; rank filter": { - HostsConfigArray: []MockHostStorageConfig{ + hostsConfigArray: []MockHostStorageConfig{ { - HostName: "foo", - ScmConfig: []MockScmConfig{ - { - MockStorageConfig: MockStorageConfig{ - TotalBytes: 100 * humanize.GByte, - AvailBytes: 100 * humanize.GByte, - UsableBytes: 100 * humanize.GByte, - }, - Rank: 0, - }, - }, - NvmeConfig: []MockNvmeConfig{ - { - MockStorageConfig: MockStorageConfig{ - TotalBytes: 1 * humanize.TByte, - AvailBytes: 1 * humanize.TByte, - UsableBytes: 1 * humanize.TByte, - }, - Rank: 0, - }, - }, + HostName: "foo", + ScmConfig: []MockScmConfig{newScmCfg(0)}, + NvmeConfig: []MockNvmeConfig{newNvmeCfg(0, 0)}, }, { HostName: "bar[1,3]", ScmConfig: []MockScmConfig{ + newScmCfg(1, humanize.TByte), + newScmCfg(2, humanize.TByte), + newScmCfg(3, humanize.GByte), { MockStorageConfig: 
MockStorageConfig{ - TotalBytes: 1 * humanize.TByte, - AvailBytes: 1 * humanize.TByte, - UsableBytes: 1 * humanize.TByte, - }, - Rank: 1, - }, - { - MockStorageConfig: MockStorageConfig{ - TotalBytes: 1 * humanize.TByte, - AvailBytes: 1 * humanize.TByte, - UsableBytes: 1 * humanize.TByte, - }, - Rank: 2, - }, - { - MockStorageConfig: MockStorageConfig{ - TotalBytes: 1 * humanize.GByte, - AvailBytes: 1 * humanize.GByte, - UsableBytes: 1 * humanize.GByte, - }, - Rank: 3, - }, - { - MockStorageConfig: MockStorageConfig{ - TotalBytes: 1 * humanize.TByte, + TotalBytes: humanize.TByte, AvailBytes: 50 * humanize.GByte, UsableBytes: 50 * humanize.GByte, + NvmeRole: &storage.BdevRoles{}, }, Rank: 4, }, - { - MockStorageConfig: MockStorageConfig{ - TotalBytes: 1 * humanize.GByte, - AvailBytes: 1 * humanize.GByte, - UsableBytes: 1 * humanize.GByte, - }, - Rank: 5, - }, + newScmCfg(5, humanize.GByte), }, NvmeConfig: []MockNvmeConfig{ + newNvmeCfg(1, 0), { MockStorageConfig: MockStorageConfig{ - TotalBytes: 1 * humanize.TByte, - AvailBytes: 1 * humanize.TByte, - UsableBytes: 1 * humanize.TByte, - }, - Rank: 1, - }, - { - MockStorageConfig: MockStorageConfig{ - TotalBytes: 1 * humanize.TByte, + TotalBytes: humanize.TByte, AvailBytes: 400 * humanize.GByte, UsableBytes: 400 * humanize.GByte, + NvmeRole: &storage.BdevRoles{}, }, Rank: 2, }, { MockStorageConfig: MockStorageConfig{ - TotalBytes: 1 * humanize.TByte, + TotalBytes: humanize.TByte, AvailBytes: 300 * humanize.GByte, UsableBytes: 300 * humanize.GByte, + NvmeRole: &storage.BdevRoles{}, }, Rank: 2, }, - { - MockStorageConfig: MockStorageConfig{ - TotalBytes: 1 * humanize.GByte, - AvailBytes: 1 * humanize.GByte, - UsableBytes: 1 * humanize.GByte, - }, - Rank: 3, - }, + newNvmeCfg(3, 0, humanize.GByte), { MockStorageConfig: MockStorageConfig{ TotalBytes: 3 * humanize.TByte, AvailBytes: 2 * humanize.TByte, UsableBytes: 2 * humanize.TByte, + NvmeRole: &storage.BdevRoles{}, }, Rank: 4, }, @@ -2397,139 +2335,78 @@ func TestControl_getMaxPoolSize(t *testing.T) { TotalBytes: 3 * humanize.TByte, AvailBytes: 2 * humanize.TByte, UsableBytes: 2 * humanize.TByte, + NvmeRole: &storage.BdevRoles{}, }, Rank: 5, }, }, }, }, - TgtRanks: []ranklist.Rank{0, 1, 2, 4}, - ExpectedOutput: ExpectedOutput{ - ScmBytes: 50 * humanize.GByte, - NvmeBytes: 700 * humanize.GByte, - }, + tgtRanks: []ranklist.Rank{0, 1, 2, 4}, + expScmBytes: 50 * humanize.GByte, + expNvmeBytes: 700 * humanize.GByte, }, "No NVMe; single server": { - HostsConfigArray: []MockHostStorageConfig{ + hostsConfigArray: []MockHostStorageConfig{ { - HostName: "foo", - ScmConfig: []MockScmConfig{ - { - MockStorageConfig: MockStorageConfig{ - TotalBytes: 100 * humanize.GByte, - AvailBytes: 100 * humanize.GByte, - UsableBytes: 100 * humanize.GByte, - }, - Rank: 0, - }, - }, + HostName: "foo", + ScmConfig: []MockScmConfig{newScmCfg(0)}, NvmeConfig: []MockNvmeConfig{}, }, }, - ExpectedOutput: ExpectedOutput{ - ScmBytes: 100 * humanize.GByte, - NvmeBytes: uint64(0), - }, + expScmBytes: 100 * humanize.GByte, + expNvmeBytes: uint64(0), }, "No NVMe; double server": { - HostsConfigArray: []MockHostStorageConfig{ + hostsConfigArray: []MockHostStorageConfig{ { - HostName: "foo", - ScmConfig: []MockScmConfig{ - { - MockStorageConfig: MockStorageConfig{ - TotalBytes: 100 * humanize.GByte, - AvailBytes: 100 * humanize.GByte, - UsableBytes: 100 * humanize.GByte, - }, - Rank: 0, - }, - }, - NvmeConfig: []MockNvmeConfig{ - { - MockStorageConfig: MockStorageConfig{ - TotalBytes: 1 * humanize.TByte, - AvailBytes: 1 * 
humanize.TByte, - UsableBytes: 1 * humanize.TByte, - }, - Rank: 0, - }, - }, + HostName: "foo", + ScmConfig: []MockScmConfig{newScmCfg(0)}, + NvmeConfig: []MockNvmeConfig{newNvmeCfg(0, 0)}, }, { HostName: "bar", ScmConfig: []MockScmConfig{ + newScmCfg(1, humanize.TByte), + newScmCfg(2, humanize.TByte), + newScmCfg(3, humanize.GByte), { MockStorageConfig: MockStorageConfig{ - TotalBytes: 1 * humanize.TByte, - AvailBytes: 1 * humanize.TByte, - UsableBytes: 1 * humanize.TByte, - }, - Rank: 1, - }, - { - MockStorageConfig: MockStorageConfig{ - TotalBytes: 1 * humanize.TByte, - AvailBytes: 1 * humanize.TByte, - UsableBytes: 1 * humanize.TByte, - }, - Rank: 2, - }, - { - MockStorageConfig: MockStorageConfig{ - TotalBytes: 1 * humanize.GByte, - AvailBytes: 1 * humanize.GByte, - UsableBytes: 1 * humanize.GByte, - }, - Rank: 3, - }, - { - MockStorageConfig: MockStorageConfig{ - TotalBytes: 1 * humanize.TByte, + TotalBytes: humanize.TByte, AvailBytes: 50 * humanize.GByte, UsableBytes: 50 * humanize.GByte, + NvmeRole: &storage.BdevRoles{}, }, Rank: 4, }, - { - MockStorageConfig: MockStorageConfig{ - TotalBytes: 1 * humanize.GByte, - AvailBytes: 1 * humanize.GByte, - UsableBytes: 1 * humanize.GByte, - }, - Rank: 5, - }, + newScmCfg(5, humanize.GByte), }, NvmeConfig: []MockNvmeConfig{ { MockStorageConfig: MockStorageConfig{ - TotalBytes: 1 * humanize.TByte, + TotalBytes: humanize.TByte, AvailBytes: 400 * humanize.GByte, UsableBytes: 400 * humanize.GByte, + NvmeRole: &storage.BdevRoles{}, }, Rank: 2, }, { MockStorageConfig: MockStorageConfig{ - TotalBytes: 1 * humanize.TByte, + TotalBytes: humanize.TByte, AvailBytes: 300 * humanize.GByte, UsableBytes: 300 * humanize.GByte, + NvmeRole: &storage.BdevRoles{}, }, Rank: 2, }, - { - MockStorageConfig: MockStorageConfig{ - TotalBytes: 1 * humanize.GByte, - AvailBytes: 1 * humanize.GByte, - UsableBytes: 1 * humanize.GByte, - }, - Rank: 3, - }, + newNvmeCfg(3, 0, humanize.GByte), { MockStorageConfig: MockStorageConfig{ TotalBytes: 3 * humanize.TByte, AvailBytes: 2 * humanize.TByte, UsableBytes: 2 * humanize.TByte, + NvmeRole: &storage.BdevRoles{}, }, Rank: 4, }, @@ -2538,226 +2415,132 @@ func TestControl_getMaxPoolSize(t *testing.T) { TotalBytes: 3 * humanize.TByte, AvailBytes: 2 * humanize.TByte, UsableBytes: 2 * humanize.TByte, + NvmeRole: &storage.BdevRoles{}, }, Rank: 5, }, }, }, }, - TgtRanks: []ranklist.Rank{0, 1, 2, 4}, - ExpectedOutput: ExpectedOutput{ - ScmBytes: 50 * humanize.GByte, - NvmeBytes: uint64(0), - }, + tgtRanks: []ranklist.Rank{0, 1, 2, 4}, + expScmBytes: 50 * humanize.GByte, + expNvmeBytes: uint64(0), }, "SCM:NVMe ratio": { - HostsConfigArray: []MockHostStorageConfig{ + hostsConfigArray: []MockHostStorageConfig{ { - HostName: "foo", - ScmConfig: []MockScmConfig{ - { - MockStorageConfig: MockStorageConfig{ - TotalBytes: 100 * humanize.GByte, - AvailBytes: 100 * humanize.GByte, - UsableBytes: 100 * humanize.GByte, - }, - Rank: 0, - }, - }, + HostName: "foo", + ScmConfig: []MockScmConfig{newScmCfg(0)}, NvmeConfig: []MockNvmeConfig{ - { - MockStorageConfig: MockStorageConfig{ - TotalBytes: 100 * humanize.TByte, - AvailBytes: 100 * humanize.TByte, - UsableBytes: 100 * humanize.TByte, - }, - Rank: 0, - }, + newNvmeCfg(0, 0, 100*humanize.TByte), }, }, }, - ExpectedOutput: ExpectedOutput{ - ScmBytes: 100 * humanize.GByte, - NvmeBytes: 100 * humanize.TByte, - }, + expScmBytes: 100 * humanize.GByte, + expNvmeBytes: 100 * humanize.TByte, }, - "Invalid response message": { - HostsConfigArray: []MockHostStorageConfig{{}}, - ExpectedOutput: 
ExpectedOutput{ - Error: errors.New("unable to unpack message"), - }, + "invalid response message": { + hostsConfigArray: []MockHostStorageConfig{{}}, + expError: errors.New("unable to unpack message"), }, "empty response": { - HostsConfigArray: []MockHostStorageConfig{}, - ExpectedOutput: ExpectedOutput{ - Error: errors.New("host storage response"), - }, - }, - "query fails": { - HostsConfigArray: []MockHostStorageConfig{}, - ExpectedOutput: ExpectedOutput{ - QueryError: errors.New("query whoops"), - Error: errors.New("query whoops"), - }, + hostsConfigArray: []MockHostStorageConfig{}, + expError: errors.New("host storage response"), }, - "No SCM storage": { - HostsConfigArray: []MockHostStorageConfig{ - { - HostName: "foo", - ScmConfig: []MockScmConfig{}, - NvmeConfig: []MockNvmeConfig{}, - }, - }, - ExpectedOutput: ExpectedOutput{ - Error: errors.New("Host without SCM storage"), - }, - }, - "Engine with two SCM storage": { - HostsConfigArray: []MockHostStorageConfig{ - { - HostName: "foo", - ScmConfig: []MockScmConfig{ - { - MockStorageConfig: MockStorageConfig{ - TotalBytes: 1 * humanize.TByte, - AvailBytes: 1 * humanize.TByte, - UsableBytes: 1 * humanize.TByte, - }, - Rank: 0, - }, - { - MockStorageConfig: MockStorageConfig{ - TotalBytes: 1 * humanize.TByte, - AvailBytes: 1 * humanize.TByte, - UsableBytes: 1 * humanize.TByte, - }, - Rank: 0, - }, - }, - NvmeConfig: []MockNvmeConfig{}, - }, - }, - ExpectedOutput: ExpectedOutput{ - Error: errors.New("Multiple SCM devices found for rank"), - }, - }, - "Unusable NVMe device": { - HostsConfigArray: []MockHostStorageConfig{ - { - HostName: "foo", - ScmConfig: []MockScmConfig{ - { - MockStorageConfig: MockStorageConfig{ - TotalBytes: 100 * humanize.GByte, - AvailBytes: 100 * humanize.GByte, - UsableBytes: 100 * humanize.GByte, - }, - Rank: 0, - }, - }, - NvmeConfig: []MockNvmeConfig{ - { - MockStorageConfig: MockStorageConfig{ - TotalBytes: 1 * humanize.TByte, - AvailBytes: 1 * humanize.TByte, - UsableBytes: 1 * humanize.TByte, - NvmeState: &devStateFaulty, - }, - Rank: 0, - }, - }, - }, - }, - ExpectedOutput: ExpectedOutput{ - Error: errors.New("not usable"), - }, - }, - "New NVMe device": { - HostsConfigArray: []MockHostStorageConfig{ - { - HostName: "foo", - ScmConfig: []MockScmConfig{ - { - MockStorageConfig: MockStorageConfig{ - TotalBytes: 100 * humanize.GByte, - AvailBytes: 100 * humanize.GByte, - UsableBytes: 100 * humanize.GByte, - }, - Rank: 0, - }, - }, - NvmeConfig: []MockNvmeConfig{ - { - MockStorageConfig: MockStorageConfig{ - TotalBytes: 1 * humanize.TByte, - AvailBytes: 1 * humanize.TByte, - UsableBytes: 1 * humanize.TByte, - NvmeState: &devStateNew, - }, - Rank: 0, - }, - }, + "query fails": { + hostsConfigArray: []MockHostStorageConfig{}, + queryError: errors.New("query whoops"), + expError: errors.New("query whoops"), + }, + "no SCM storage": { + hostsConfigArray: []MockHostStorageConfig{ + { + HostName: "foo", + ScmConfig: []MockScmConfig{}, + NvmeConfig: []MockNvmeConfig{}, }, }, - ExpectedOutput: ExpectedOutput{ - ScmBytes: 100 * humanize.GByte, - NvmeBytes: uint64(0), - }, + expError: errors.New("Host without SCM storage"), }, - "Unmounted SCM device": { - HostsConfigArray: []MockHostStorageConfig{ + "engine with two SCM storage": { + hostsConfigArray: []MockHostStorageConfig{ { HostName: "foo", ScmConfig: []MockScmConfig{ + newScmCfg(0, humanize.TByte), + newScmCfg(0, humanize.TByte), + }, + NvmeConfig: []MockNvmeConfig{}, + }, + }, + expError: errors.New("Multiple SCM devices found for rank"), + }, + "unusable NVMe 
device": { + hostsConfigArray: []MockHostStorageConfig{ + { + HostName: "foo", + ScmConfig: []MockScmConfig{newScmCfg(0)}, + NvmeConfig: []MockNvmeConfig{ { MockStorageConfig: MockStorageConfig{ - TotalBytes: 100 * humanize.GByte, - AvailBytes: 100 * humanize.GByte, - UsableBytes: 100 * humanize.GByte, + TotalBytes: humanize.TByte, + AvailBytes: humanize.TByte, + UsableBytes: humanize.TByte, + NvmeState: &devStateFaulty, + NvmeRole: &storage.BdevRoles{}, }, Rank: 0, }, }, + }, + }, + expError: errors.New("not usable"), + }, + "new NVMe device": { + hostsConfigArray: []MockHostStorageConfig{ + { + HostName: "foo", + ScmConfig: []MockScmConfig{newScmCfg(0)}, NvmeConfig: []MockNvmeConfig{ { MockStorageConfig: MockStorageConfig{ - TotalBytes: 1 * humanize.TByte, - AvailBytes: 1 * humanize.TByte, - UsableBytes: 1 * humanize.TByte, + TotalBytes: humanize.TByte, + AvailBytes: humanize.TByte, + UsableBytes: humanize.TByte, + NvmeState: &devStateNew, + NvmeRole: &storage.BdevRoles{}, }, Rank: 0, }, }, }, + }, + expScmBytes: 100 * humanize.GByte, + expNvmeBytes: uint64(0), + }, + "unmounted SCM device": { + hostsConfigArray: []MockHostStorageConfig{ + { + HostName: "foo", + ScmConfig: []MockScmConfig{newScmCfg(0)}, + NvmeConfig: []MockNvmeConfig{newNvmeCfg(0, 0)}, + }, { HostName: "bar[1,3]", ScmConfig: []MockScmConfig{ - { - MockStorageConfig: MockStorageConfig{ - TotalBytes: 1 * humanize.TByte, - AvailBytes: 1 * humanize.TByte, - UsableBytes: 1 * humanize.TByte, - }, - Rank: 1, - }, + newScmCfg(1, humanize.TByte), { MockStorageConfig: MockStorageConfig{ TotalBytes: uint64(0), AvailBytes: uint64(0), UsableBytes: uint64(0), + NvmeRole: &storage.BdevRoles{}, }, }, + newScmCfg(2), { MockStorageConfig: MockStorageConfig{ - TotalBytes: 100 * humanize.GByte, - AvailBytes: 100 * humanize.GByte, - UsableBytes: 100 * humanize.GByte, - }, - Rank: 2, - }, - { - MockStorageConfig: MockStorageConfig{ - TotalBytes: 1 * humanize.TByte, + TotalBytes: humanize.TByte, AvailBytes: 50 * humanize.GByte, UsableBytes: 50 * humanize.GByte, }, @@ -2765,17 +2548,10 @@ func TestControl_getMaxPoolSize(t *testing.T) { }, }, NvmeConfig: []MockNvmeConfig{ + newNvmeCfg(1, 0), { MockStorageConfig: MockStorageConfig{ - TotalBytes: 1 * humanize.TByte, - AvailBytes: 1 * humanize.TByte, - UsableBytes: 1 * humanize.TByte, - }, - Rank: 1, - }, - { - MockStorageConfig: MockStorageConfig{ - TotalBytes: 1 * humanize.TByte, + TotalBytes: humanize.TByte, AvailBytes: 400 * humanize.GByte, UsableBytes: 400 * humanize.GByte, }, @@ -2783,7 +2559,7 @@ func TestControl_getMaxPoolSize(t *testing.T) { }, { MockStorageConfig: MockStorageConfig{ - TotalBytes: 1 * humanize.TByte, + TotalBytes: humanize.TByte, AvailBytes: 300 * humanize.GByte, UsableBytes: 300 * humanize.GByte, }, @@ -2800,70 +2576,28 @@ func TestControl_getMaxPoolSize(t *testing.T) { }, }, }, - ExpectedOutput: ExpectedOutput{ - Error: errors.New("is not mounted"), - }, + expError: errors.New("is not mounted"), }, "SMD without SCM": { - HostsConfigArray: []MockHostStorageConfig{ + hostsConfigArray: []MockHostStorageConfig{ { - HostName: "foo", - ScmConfig: []MockScmConfig{ - { - MockStorageConfig: MockStorageConfig{ - TotalBytes: 100 * humanize.GByte, - AvailBytes: 100 * humanize.GByte, - UsableBytes: 100 * humanize.GByte, - }, - Rank: 0, - }, - }, - NvmeConfig: []MockNvmeConfig{ - { - MockStorageConfig: MockStorageConfig{ - TotalBytes: 1 * humanize.TByte, - AvailBytes: 1 * humanize.TByte, - UsableBytes: 1 * humanize.TByte, - }, - Rank: 1, - }, - }, + HostName: "foo", + ScmConfig: 
[]MockScmConfig{newScmCfg(0)}, + NvmeConfig: []MockNvmeConfig{newNvmeCfg(1, 0)}, }, }, - ExpectedOutput: ExpectedOutput{ - Error: errors.New("without SCM device and at least one SMD device"), - }, + expError: errors.New("without SCM device and at least one SMD device"), }, "no SCM": { - HostsConfigArray: []MockHostStorageConfig{ + hostsConfigArray: []MockHostStorageConfig{ { - HostName: "foo", - ScmConfig: []MockScmConfig{ - { - MockStorageConfig: MockStorageConfig{ - TotalBytes: 100 * humanize.GByte, - AvailBytes: 100 * humanize.GByte, - UsableBytes: 100 * humanize.GByte, - }, - Rank: 0, - }, - }, - NvmeConfig: []MockNvmeConfig{ - { - MockStorageConfig: MockStorageConfig{ - TotalBytes: 1 * humanize.TByte, - AvailBytes: 1 * humanize.TByte, - UsableBytes: 1 * humanize.TByte, - }, - Rank: 0, - }, - }, + HostName: "foo", + ScmConfig: []MockScmConfig{newScmCfg(0)}, + NvmeConfig: []MockNvmeConfig{newNvmeCfg(0, 0)}, }, }, - TgtRanks: []ranklist.Rank{1}, - ExpectedOutput: ExpectedOutput{ - Error: errors.New("No SCM storage space available"), - }, + tgtRanks: []ranklist.Rank{1}, + expError: errors.New("No SCM storage space available"), }, } { t.Run(name, func(t *testing.T) { @@ -2877,7 +2611,7 @@ func TestControl_getMaxPoolSize(t *testing.T) { { Addr: "foo", Message: &mgmtpb.SystemQueryResp{}, - Error: tc.ExpectedOutput.QueryError, + Error: tc.queryError, }, }, }, @@ -2886,7 +2620,8 @@ func TestControl_getMaxPoolSize(t *testing.T) { }, }, } - for _, hostStorageConfig := range tc.HostsConfigArray { + + for _, hostStorageConfig := range tc.hostsConfigArray { var hostResponse *HostResponse if hostStorageConfig.HostName == "" { hostResponse = new(HostResponse) @@ -2904,32 +2639,26 @@ func TestControl_getMaxPoolSize(t *testing.T) { } mockInvoker := NewMockInvoker(log, mockInvokerConfig) - scmBytes, nvmeBytes, err := getMaxPoolSize(test.Context(t), mockInvoker, tc.TgtRanks) + createReq := &PoolCreateReq{Ranks: tc.tgtRanks, MemRatio: tc.memRatio} + scmBytes, nvmeBytes, gotErr := getMaxPoolSize(test.Context(t), mockInvoker, + createReq) - if tc.ExpectedOutput.Error != nil { - test.AssertTrue(t, err != nil, "Expected error") - test.CmpErr(t, tc.ExpectedOutput.Error, err) + test.CmpErr(t, tc.expError, gotErr) + if gotErr != nil { return } - test.AssertTrue(t, err == nil, - fmt.Sprintf("Expected no error: err=%q", err)) - test.AssertEqual(t, - tc.ExpectedOutput.ScmBytes, - scmBytes, - fmt.Sprintf("Invalid SCM pool size: expected=%d got=%d", - tc.ExpectedOutput.ScmBytes, - scmBytes)) + test.AssertEqual(t, tc.expScmBytes, scmBytes, + fmt.Sprintf("Invalid SCM pool size, want %s got %s", + humanize.Bytes(tc.expScmBytes), humanize.Bytes(scmBytes))) - test.AssertEqual(t, - tc.ExpectedOutput.NvmeBytes, - nvmeBytes, - fmt.Sprintf("Invalid NVMe pool size: expected=%d got=%d", - tc.ExpectedOutput.NvmeBytes, - nvmeBytes)) - if tc.ExpectedOutput.Debug != "" { - test.AssertTrue(t, strings.Contains(buf.String(), tc.ExpectedOutput.Debug), - "Missing log message: "+tc.ExpectedOutput.Debug) + test.AssertEqual(t, tc.expNvmeBytes, nvmeBytes, + fmt.Sprintf("Invalid NVMe pool size, want %s got %s", + humanize.Bytes(tc.expNvmeBytes), humanize.Bytes(nvmeBytes))) + + if tc.expDebug != "" { + test.AssertTrue(t, strings.Contains(buf.String(), tc.expDebug), + "Missing log message: "+tc.expDebug) } }) } @@ -2946,135 +2675,59 @@ func (invoker *MockRequestsRecorderInvoker) InvokeUnaryRPC(context context.Conte } func TestControl_PoolCreateAllCmd(t *testing.T) { - type ExpectedOutput struct { - PoolConfig MockPoolRespConfig - WarningMsg 
string - Error error - } - for name, tc := range map[string]struct { - StorageRatio float64 - HostsConfigArray []MockHostStorageConfig - TgtRanks string - ExpectedOutput ExpectedOutput + hostsConfigArray []MockHostStorageConfig + storageRatio float64 + tgtRanks string + expPoolConfig MockPoolRespConfig + expError error + expWarning string }{ "single server": { - StorageRatio: 1, - HostsConfigArray: []MockHostStorageConfig{ + storageRatio: 1, + hostsConfigArray: []MockHostStorageConfig{ { - HostName: "foo", - ScmConfig: []MockScmConfig{ - { - MockStorageConfig: MockStorageConfig{ - TotalBytes: 100 * humanize.GByte, - AvailBytes: 100 * humanize.GByte, - UsableBytes: 100 * humanize.GByte, - }, - Rank: 0, - }, - }, - NvmeConfig: []MockNvmeConfig{ - { - MockStorageConfig: MockStorageConfig{ - TotalBytes: 1 * humanize.TByte, - AvailBytes: 1 * humanize.TByte, - UsableBytes: 1 * humanize.TByte, - }, - Rank: 0, - }, - }, + HostName: "foo", + ScmConfig: []MockScmConfig{newScmCfg(0)}, + NvmeConfig: []MockNvmeConfig{newNvmeCfg(0, 0)}, }, }, - ExpectedOutput: ExpectedOutput{ - PoolConfig: MockPoolRespConfig{ - HostName: "foo", - Ranks: "0", - ScmBytes: 100 * humanize.GByte, - NvmeBytes: 1 * humanize.TByte, - }, + expPoolConfig: MockPoolRespConfig{ + HostName: "foo", + Ranks: "0", + ScmBytes: 100 * humanize.GByte, + NvmeBytes: 1 * humanize.TByte, }, }, "single server 30%": { - StorageRatio: 0.3, - HostsConfigArray: []MockHostStorageConfig{ + storageRatio: 0.3, + hostsConfigArray: []MockHostStorageConfig{ { - HostName: "foo", - ScmConfig: []MockScmConfig{ - { - MockStorageConfig: MockStorageConfig{ - TotalBytes: 100 * humanize.GByte, - AvailBytes: 100 * humanize.GByte, - UsableBytes: 100 * humanize.GByte, - }, - Rank: 0, - }, - }, - NvmeConfig: []MockNvmeConfig{ - { - MockStorageConfig: MockStorageConfig{ - TotalBytes: 1 * humanize.TByte, - AvailBytes: 1 * humanize.TByte, - UsableBytes: 1 * humanize.TByte, - }, - Rank: 0, - }, - }, + HostName: "foo", + ScmConfig: []MockScmConfig{newScmCfg(0)}, + NvmeConfig: []MockNvmeConfig{newNvmeCfg(0, 0)}, }, }, - ExpectedOutput: ExpectedOutput{ - PoolConfig: MockPoolRespConfig{ - HostName: "foo", - Ranks: "0", - ScmBytes: 30 * humanize.GByte, - NvmeBytes: 300 * humanize.GByte, - }, + expPoolConfig: MockPoolRespConfig{ + HostName: "foo", + Ranks: "0", + ScmBytes: 30 * humanize.GByte, + NvmeBytes: 300 * humanize.GByte, }, }, "double server": { - StorageRatio: 1, - HostsConfigArray: []MockHostStorageConfig{ + storageRatio: 1, + hostsConfigArray: []MockHostStorageConfig{ { - HostName: "foo", - ScmConfig: []MockScmConfig{ - { - MockStorageConfig: MockStorageConfig{ - TotalBytes: 100 * humanize.GByte, - AvailBytes: 100 * humanize.GByte, - UsableBytes: 100 * humanize.GByte, - }, - Rank: 0, - }, - }, - NvmeConfig: []MockNvmeConfig{ - { - MockStorageConfig: MockStorageConfig{ - TotalBytes: 1 * humanize.TByte, - AvailBytes: 1 * humanize.TByte, - UsableBytes: 1 * humanize.TByte, - }, - Rank: 0, - }, - }, + HostName: "foo", + ScmConfig: []MockScmConfig{newScmCfg(0)}, + NvmeConfig: []MockNvmeConfig{newNvmeCfg(0, 0)}, }, { HostName: "bar", ScmConfig: []MockScmConfig{ - { - MockStorageConfig: MockStorageConfig{ - TotalBytes: 1 * humanize.TByte, - AvailBytes: 1 * humanize.TByte, - UsableBytes: 1 * humanize.TByte, - }, - Rank: 1, - }, - { - MockStorageConfig: MockStorageConfig{ - TotalBytes: 100 * humanize.GByte, - AvailBytes: 100 * humanize.GByte, - UsableBytes: 100 * humanize.GByte, - }, - Rank: 2, - }, + newScmCfg(1, humanize.TByte), + newScmCfg(2), { MockStorageConfig: 
MockStorageConfig{ TotalBytes: 1 * humanize.TByte, @@ -3085,19 +2738,13 @@ func TestControl_PoolCreateAllCmd(t *testing.T) { }, }, NvmeConfig: []MockNvmeConfig{ - { - MockStorageConfig: MockStorageConfig{ - TotalBytes: 1 * humanize.TByte, - AvailBytes: 1 * humanize.TByte, - UsableBytes: 1 * humanize.TByte, - }, - Rank: 1, - }, + newNvmeCfg(1, 0), { MockStorageConfig: MockStorageConfig{ TotalBytes: 1 * humanize.TByte, AvailBytes: 400 * humanize.GByte, UsableBytes: 400 * humanize.GByte, + NvmeRole: &storage.BdevRoles{}, }, Rank: 2, }, @@ -3106,6 +2753,7 @@ func TestControl_PoolCreateAllCmd(t *testing.T) { TotalBytes: 1 * humanize.TByte, AvailBytes: 300 * humanize.GByte, UsableBytes: 300 * humanize.GByte, + NvmeRole: &storage.BdevRoles{}, }, Rank: 2, }, @@ -3114,66 +2762,33 @@ func TestControl_PoolCreateAllCmd(t *testing.T) { TotalBytes: 3 * humanize.TByte, AvailBytes: 2 * humanize.TByte, UsableBytes: 2 * humanize.TByte, + NvmeRole: &storage.BdevRoles{}, }, Rank: 3, }, }, }, }, - ExpectedOutput: ExpectedOutput{ - PoolConfig: MockPoolRespConfig{ - HostName: "foo", - Ranks: "0,1,2,3", - ScmBytes: 50 * humanize.GByte, - NvmeBytes: 700 * humanize.GByte, - }, + expPoolConfig: MockPoolRespConfig{ + HostName: "foo", + Ranks: "0,1,2,3", + ScmBytes: 50 * humanize.GByte, + NvmeBytes: 700 * humanize.GByte, }, }, - "double server;rank filter": { - StorageRatio: 1, - HostsConfigArray: []MockHostStorageConfig{ + "double server; rank filter": { + storageRatio: 1, + hostsConfigArray: []MockHostStorageConfig{ { - HostName: "foo", - ScmConfig: []MockScmConfig{ - { - MockStorageConfig: MockStorageConfig{ - TotalBytes: 100 * humanize.GByte, - AvailBytes: 100 * humanize.GByte, - UsableBytes: 100 * humanize.GByte, - }, - Rank: 0, - }, - }, - NvmeConfig: []MockNvmeConfig{ - { - MockStorageConfig: MockStorageConfig{ - TotalBytes: 1 * humanize.TByte, - AvailBytes: 1 * humanize.TByte, - UsableBytes: 1 * humanize.TByte, - }, - Rank: 0, - }, - }, + HostName: "foo", + ScmConfig: []MockScmConfig{newScmCfg(0)}, + NvmeConfig: []MockNvmeConfig{newNvmeCfg(0, 0)}, }, { HostName: "bar", ScmConfig: []MockScmConfig{ - { - MockStorageConfig: MockStorageConfig{ - TotalBytes: 1 * humanize.TByte, - AvailBytes: 1 * humanize.TByte, - UsableBytes: 1 * humanize.TByte, - }, - Rank: 1, - }, - { - MockStorageConfig: MockStorageConfig{ - TotalBytes: 100 * humanize.GByte, - AvailBytes: 100 * humanize.GByte, - UsableBytes: 100 * humanize.GByte, - }, - Rank: 2, - }, + newScmCfg(1, humanize.TByte), + newScmCfg(2), { MockStorageConfig: MockStorageConfig{ TotalBytes: 1 * humanize.TByte, @@ -3192,19 +2807,13 @@ func TestControl_PoolCreateAllCmd(t *testing.T) { }, }, NvmeConfig: []MockNvmeConfig{ - { - MockStorageConfig: MockStorageConfig{ - TotalBytes: 1 * humanize.TByte, - AvailBytes: 1 * humanize.TByte, - UsableBytes: 1 * humanize.TByte, - }, - Rank: 1, - }, + newNvmeCfg(1, 0), { MockStorageConfig: MockStorageConfig{ TotalBytes: 1 * humanize.TByte, AvailBytes: 400 * humanize.GByte, UsableBytes: 400 * humanize.GByte, + NvmeRole: &storage.BdevRoles{}, }, Rank: 2, }, @@ -3213,6 +2822,7 @@ func TestControl_PoolCreateAllCmd(t *testing.T) { TotalBytes: 1 * humanize.TByte, AvailBytes: 300 * humanize.GByte, UsableBytes: 300 * humanize.GByte, + NvmeRole: &storage.BdevRoles{}, }, Rank: 2, }, @@ -3221,6 +2831,7 @@ func TestControl_PoolCreateAllCmd(t *testing.T) { TotalBytes: 3 * humanize.TByte, AvailBytes: 2 * humanize.TByte, UsableBytes: 2 * humanize.TByte, + NvmeRole: &storage.BdevRoles{}, }, Rank: 3, }, @@ -3229,90 +2840,60 @@ func 
TestControl_PoolCreateAllCmd(t *testing.T) { TotalBytes: 3 * humanize.TByte, AvailBytes: 1 * humanize.GByte, UsableBytes: 1 * humanize.GByte, + NvmeRole: &storage.BdevRoles{}, }, Rank: 4, }, }, }, }, - TgtRanks: "0,1,2,3", - ExpectedOutput: ExpectedOutput{ - PoolConfig: MockPoolRespConfig{ - HostName: "foo", - Ranks: "0,1,2,3", - ScmBytes: 50 * humanize.GByte, - NvmeBytes: 700 * humanize.GByte, - }, + tgtRanks: "0,1,2,3", + expPoolConfig: MockPoolRespConfig{ + HostName: "foo", + Ranks: "0,1,2,3", + ScmBytes: 50 * humanize.GByte, + NvmeBytes: 700 * humanize.GByte, }, }, "No NVME": { - StorageRatio: 1, - HostsConfigArray: []MockHostStorageConfig{ + storageRatio: 1, + hostsConfigArray: []MockHostStorageConfig{ { - HostName: "foo", - ScmConfig: []MockScmConfig{ - { - MockStorageConfig: MockStorageConfig{ - TotalBytes: 100 * humanize.GByte, - AvailBytes: 100 * humanize.GByte, - UsableBytes: 100 * humanize.GByte, - }, - Rank: 0, - }, - }, + HostName: "foo", + ScmConfig: []MockScmConfig{newScmCfg(0)}, NvmeConfig: []MockNvmeConfig{}, }, }, - ExpectedOutput: ExpectedOutput{ - PoolConfig: MockPoolRespConfig{ - HostName: "foo", - Ranks: "0", - ScmBytes: 100 * humanize.GByte, - NvmeBytes: uint64(0), - }, - WarningMsg: "Creating DAOS pool without NVME storage", + expPoolConfig: MockPoolRespConfig{ + HostName: "foo", + Ranks: "0", + ScmBytes: 100 * humanize.GByte, + NvmeBytes: uint64(0), }, + expWarning: "Creating DAOS pool without NVME storage", }, "SCM:NVME ratio": { - StorageRatio: 1, - HostsConfigArray: []MockHostStorageConfig{ + storageRatio: 1, + hostsConfigArray: []MockHostStorageConfig{ { - HostName: "foo", - ScmConfig: []MockScmConfig{ - { - MockStorageConfig: MockStorageConfig{ - TotalBytes: 100 * humanize.GByte, - AvailBytes: 100 * humanize.GByte, - UsableBytes: 100 * humanize.GByte, - }, - Rank: 0, - }, - }, + HostName: "foo", + ScmConfig: []MockScmConfig{newScmCfg(0)}, NvmeConfig: []MockNvmeConfig{ - { - MockStorageConfig: MockStorageConfig{ - TotalBytes: 100 * humanize.TByte, - AvailBytes: 100 * humanize.TByte, - UsableBytes: 100 * humanize.TByte, - }, - Rank: 0, - }, + newNvmeCfg(0, 0, 100*humanize.TByte), }, }, }, - ExpectedOutput: ExpectedOutput{ - PoolConfig: MockPoolRespConfig{ - HostName: "foo", - Ranks: "0", - ScmBytes: 100 * humanize.GByte, - NvmeBytes: 100 * humanize.TByte, - }, - WarningMsg: "SCM:NVMe ratio is less than", + expPoolConfig: MockPoolRespConfig{ + HostName: "foo", + Ranks: "0", + ScmBytes: 100 * humanize.GByte, + NvmeBytes: 100 * humanize.TByte, }, + expWarning: "SCM:NVMe ratio is less than", }, "single server error 1%": { - StorageRatio: 0.01, - HostsConfigArray: []MockHostStorageConfig{ + storageRatio: 0.01, + hostsConfigArray: []MockHostStorageConfig{ { HostName: "foo", ScmConfig: []MockScmConfig{ @@ -3325,21 +2906,10 @@ func TestControl_PoolCreateAllCmd(t *testing.T) { Rank: 0, }, }, - NvmeConfig: []MockNvmeConfig{ - { - MockStorageConfig: MockStorageConfig{ - TotalBytes: 1 * humanize.TByte, - AvailBytes: 1 * humanize.TByte, - UsableBytes: 1 * humanize.TByte, - }, - Rank: 0, - }, - }, + NvmeConfig: []MockNvmeConfig{newNvmeCfg(0, 0)}, }, }, - ExpectedOutput: ExpectedOutput{ - Error: errors.New("Not enough SCM storage available with ratio 1%"), - }, + expError: errors.New("Not enough SCM storage available with ratio 1%"), }, } { t.Run(name, func(t *testing.T) { @@ -3360,7 +2930,7 @@ func TestControl_PoolCreateAllCmd(t *testing.T) { } unaryResponse := new(UnaryResponse) - for _, hostStorageConfig := range tc.HostsConfigArray { + for _, hostStorageConfig := range 
tc.hostsConfigArray { storageScanResp := MockStorageScanResp(t, hostStorageConfig.ScmConfig, hostStorageConfig.NvmeConfig) @@ -3372,10 +2942,10 @@ func TestControl_PoolCreateAllCmd(t *testing.T) { } mockInvokerConfig.UnaryResponseSet = append(mockInvokerConfig.UnaryResponseSet, unaryResponse) - if tc.ExpectedOutput.PoolConfig.Ranks != "" { - poolCreateResp := MockPoolCreateResp(t, &tc.ExpectedOutput.PoolConfig) + if tc.expPoolConfig.Ranks != "" { + poolCreateResp := MockPoolCreateResp(t, &tc.expPoolConfig) hostResponse := &HostResponse{ - Addr: tc.ExpectedOutput.PoolConfig.HostName, + Addr: tc.expPoolConfig.HostName, Message: poolCreateResp, } unaryResponse = new(UnaryResponse) @@ -3390,15 +2960,15 @@ func TestControl_PoolCreateAllCmd(t *testing.T) { req := &PoolCreateReq{} - if tc.StorageRatio != 0 { - req.TierRatio = []float64{tc.StorageRatio, tc.StorageRatio} + if tc.storageRatio != 0 { + req.TierRatio = []float64{tc.storageRatio, tc.storageRatio} } - if tc.TgtRanks != "" { - req.Ranks = ranklist.RanksFromUint32(mockRanks(tc.TgtRanks)) + if tc.tgtRanks != "" { + req.Ranks = ranklist.RanksFromUint32(mockRanks(tc.tgtRanks)) } _, gotErr := PoolCreate(context.Background(), mockInvoker, req) - test.CmpErr(t, tc.ExpectedOutput.Error, gotErr) + test.CmpErr(t, tc.expError, gotErr) if gotErr != nil { return } @@ -3419,20 +2989,20 @@ func TestControl_PoolCreateAllCmd(t *testing.T) { poolCreateRequest := mockInvoker.Requests[2].(*PoolCreateReq) test.AssertEqual(t, poolCreateRequest.TierBytes[0], - tc.ExpectedOutput.PoolConfig.ScmBytes, + tc.expPoolConfig.ScmBytes, "Invalid size of allocated SCM") test.AssertEqual(t, poolCreateRequest.TierBytes[1], - tc.ExpectedOutput.PoolConfig.NvmeBytes, + tc.expPoolConfig.NvmeBytes, "Invalid size of allocated NVME") test.AssertEqual(t, poolCreateRequest.TotalBytes, uint64(0), "Invalid size of TotalBytes attribute: disabled with manual allocation") - if tc.TgtRanks != "" { + if tc.tgtRanks != "" { test.AssertEqual(t, ranklist.RankList(poolCreateRequest.Ranks).String(), - tc.ExpectedOutput.PoolConfig.Ranks, + tc.expPoolConfig.Ranks, "Invalid list of Ranks") } else { test.AssertEqual(t, diff --git a/src/control/lib/control/storage.go b/src/control/lib/control/storage.go index 9d5fe470de6..fb649a06ce1 100644 --- a/src/control/lib/control/storage.go +++ b/src/control/lib/control/storage.go @@ -160,6 +160,7 @@ type ( Usage bool NvmeHealth bool NvmeBasic bool + MemRatio float32 } // StorageScanResp contains the response from a storage scan request. @@ -256,8 +257,9 @@ func StorageScan(ctx context.Context, rpcClient UnaryInvoker, req *StorageScanRe Nvme: &ctlpb.ScanNvmeReq{ Basic: req.NvmeBasic, // Health and meta details required to populate usage statistics. - Health: req.NvmeHealth || req.Usage, - Meta: req.Usage, + Health: req.NvmeHealth || req.Usage, + Meta: req.Usage, + MemRatio: req.MemRatio, // Only request link stats if health explicitly requested. 
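The refactored test cases above lean on two small constructors, newScmCfg and newNvmeCfg, that are defined elsewhere in the test file and not shown in this diff. A minimal sketch of what they might look like, with signatures, defaults (100 GByte SCM, 1 TByte NVMe) and field types inferred only from the call sites here; the real helpers may differ:

// Hypothetical sketches of the test helpers used above; not taken from the source.
func newScmCfg(rank ranklist.Rank, size ...uint64) MockScmConfig {
	sz := uint64(100 * humanize.GByte) // default seen at call sites without a size
	if len(size) > 0 {
		sz = size[0]
	}
	return MockScmConfig{
		MockStorageConfig: MockStorageConfig{
			TotalBytes:  sz,
			AvailBytes:  sz,
			UsableBytes: sz,
			NvmeRole:    &storage.BdevRoles{},
		},
		Rank: rank,
	}
}

func newNvmeCfg(rank ranklist.Rank, roles uint32, size ...uint64) MockNvmeConfig {
	// roles is always zero in the calls above; the real helper presumably maps it
	// onto bdev role bits, so this sketch simply leaves NvmeRole empty.
	sz := uint64(humanize.TByte) // default seen at call sites without a size
	if len(size) > 0 {
		sz = size[0]
	}
	return MockNvmeConfig{
		MockStorageConfig: MockStorageConfig{
			TotalBytes:  sz,
			AvailBytes:  sz,
			UsableBytes: sz,
			NvmeRole:    &storage.BdevRoles{},
		},
		Rank: rank,
	}
}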
LinkStats: req.NvmeHealth, }, diff --git a/src/control/lib/daos/pool.go b/src/control/lib/daos/pool.go index e47e6e2b23d..0792c46c4ea 100644 --- a/src/control/lib/daos/pool.go +++ b/src/control/lib/daos/pool.go @@ -79,6 +79,7 @@ type ( DisabledRanks *ranklist.RankSet `json:"disabled_ranks,omitempty"` PoolLayoutVer uint32 `json:"pool_layout_ver"` UpgradeLayoutVer uint32 `json:"upgrade_layout_ver"` + MemFileBytes uint64 `json:"mem_file_bytes"` } PoolQueryTargetType int32 @@ -86,9 +87,10 @@ type ( // PoolQueryTargetInfo contains information about a single target PoolQueryTargetInfo struct { - Type PoolQueryTargetType `json:"target_type"` - State PoolQueryTargetState `json:"target_state"` - Space []*StorageUsageStats `json:"space"` + Type PoolQueryTargetType `json:"target_type"` + State PoolQueryTargetState `json:"target_state"` + Space []*StorageUsageStats `json:"space"` + MemFileBytes uint64 `json:"mem_file_bytes"` } // StorageTargetUsage represents DAOS target storage usage @@ -351,6 +353,8 @@ const ( StorageMediaTypeScm = StorageMediaType(mgmtpb.StorageMediaType_SCM) // StorageMediaTypeNvme indicates that the media is NVMe SSD StorageMediaTypeNvme = StorageMediaType(mgmtpb.StorageMediaType_NVME) + // StorageMediaTypeMax indicates the end of the StorageMediaType array + StorageMediaTypeMax = StorageMediaType(StorageMediaTypeNvme + 1) ) func (smt StorageMediaType) String() string { diff --git a/src/control/server/ctl_storage_rpc.go b/src/control/server/ctl_storage_rpc.go index 71339918876..90a46495ae0 100644 --- a/src/control/server/ctl_storage_rpc.go +++ b/src/control/server/ctl_storage_rpc.go @@ -70,9 +70,8 @@ func newResponseState(inErr error, badStatus ctlpb.ResponseStatus, infoMsg strin // Package-local function variables for mocking in unit tests. var ( - scanBdevs = bdevScan // StorageScan() unit tests - scanEngineBdevs = bdevScanEngine // bdevScan() unit tests - computeMetaRdbSz = metaRdbComputeSz // TODO unit tests + scanBdevs = bdevScan // StorageScan() unit tests + scanEngineBdevs = bdevScanEngine // bdevScan() unit tests ) type scanBdevsFn func(storage.BdevScanRequest) (*storage.BdevScanResponse, error) @@ -161,7 +160,7 @@ func bdevScanEngines(ctx context.Context, cs *ControlService, req *ctlpb.ScanNvm eReq := new(ctlpb.ScanNvmeReq) *eReq = *req if req.Meta { - ms, rs, err := computeMetaRdbSz(cs, engine, nsps) + ms, rs, err := metaRdbComputeSz(cs, engine, nsps, req.MemRatio) if err != nil { return nil, errors.Wrap(err, "computing meta and rdb size") } @@ -169,7 +168,7 @@ func bdevScanEngines(ctx context.Context, cs *ControlService, req *ctlpb.ScanNvm } // If partial number of engines return results, indicate errors for non-ready - // engines whilst returning successful scanmresults. + // engines whilst returning successful scan results. respEng, err := scanEngineBdevs(ctx, engine, eReq) if err != nil { err = errors.Wrapf(err, "instance %d", engine.Index()) @@ -287,7 +286,7 @@ func bdevScan(ctx context.Context, cs *ControlService, req *ctlpb.ScanNvmeReq, n // Retry once if harness scan returns unexpected number of controllers in case engines // claimed devices between when started state was checked and scan was executed. 
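MemRatio now travels end to end: the client sets it on StorageScanReq, StorageScan copies it into ctlpb.ScanNvmeReq (previous hunk), and bdevScanEngines above forwards it to metaRdbComputeSz so the per-engine MetaSize and RdbSize reflect the ratio. A hedged client-side sketch; the function name is invented for illustration, ctx handling and the 0.5 value are assumptions, and only the request fields come from this diff:

// Request usage statistics with an MD-on-SSD phase-2 memory ratio (sketch only).
func scanWithMemRatio(ctx context.Context, rpcClient control.UnaryInvoker) (*control.StorageScanResp, error) {
	return control.StorageScan(ctx, rpcClient, &control.StorageScanReq{
		Usage:    true, // Meta/Health flags on the proto request are derived from Usage
		MemRatio: 0.5,  // projected meta blob = VOS-index size / 0.5
	})
}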
if !hasStarted { - cs.log.Debugf("retrying harness bdev scan as unexpected nr returned, want %d got %d", + cs.log.Debugf("retrying harness bdev scan as unexpected nr ctrlrs returned, want %d got %d", nrCfgBdevs, nrScannedBdevs) resp, err = bdevScanAssigned(ctx, cs, req, nsps, &hasStarted, bdevCfgs) @@ -304,7 +303,7 @@ func bdevScan(ctx context.Context, cs *ControlService, req *ctlpb.ScanNvmeReq, n } } - cs.log.Noticef("harness bdev scan returned unexpected nr, want %d got %d", nrCfgBdevs, + cs.log.Noticef("harness bdev scan returned unexpected nr ctrlrs, want %d got %d", nrCfgBdevs, nrScannedBdevs) return bdevScanTrimResults(req, resp), nil @@ -418,39 +417,65 @@ func (cs *ControlService) getRdbSize(engineCfg *engine.Config) (uint64, error) { // Compute the maximal size of the metadata to allow the engine to fill the WallMeta field // response. The maximal metadata (i.e. VOS index file) size should be equal to the SCM available -// size divided by the number of targets of the engine. -func metaRdbComputeSz(cs *ControlService, ei Engine, nsps []*ctlpb.ScmNamespace) (md_size, rdb_size uint64, errOut error) { +// size divided by the number of targets of the engine. Sizes returned are per-target values. +func metaRdbComputeSz(cs *ControlService, ei Engine, nsps []*ctlpb.ScmNamespace, memRatio float32) (uint64, uint64, error) { + msg := fmt.Sprintf("computing meta/rdb sizes with %d scm namespaces", len(nsps)) + + var metaBytes, rdbBytes uint64 for _, nsp := range nsps { + msg += fmt.Sprintf(", scm-ns: %+v", nsp) + mp := nsp.GetMount() if mp == nil { + cs.log.Tracef("%s: skip (no mount)", msg) + continue + } + msg += fmt.Sprintf(", mount: %+v", mp) + + r, err := ei.GetRank() + if err != nil { + cs.log.Tracef("%s: skip (get rank err: %s)", msg, err.Error()) continue } - if r, err := ei.GetRank(); err != nil || uint32(r) != mp.GetRank() { + if uint32(r) != mp.Rank { + cs.log.Tracef("%s: skip (wrong rank, want %d got %d)", msg, r, mp.Rank) continue } + msg += fmt.Sprintf(", rank %d", r) - // NOTE DAOS-14223: This metadata size calculation won't necessarily match - // the meta blob size on SSD if --meta-size is specified in - // pool create command. - md_size = mp.GetUsableBytes() / uint64(ei.GetTargetCount()) + if ei.GetTargetCount() == 0 { + return 0, 0, errors.Errorf("%s: engine with zero tgts is invalid", msg) + } + metaBytes = mp.GetUsableBytes() / uint64(ei.GetTargetCount()) + + // Divide VOS index file size by memRatio fraction, if nonzero, to project the + // effective meta-blob size. In MD-on-SSD phase-2, meta-blob > VOS-file size. + if memRatio > 0 { + msg += fmt.Sprintf(", using %.2f mem-ratio", memRatio) + metaBytes = uint64(float64(metaBytes) / float64(memRatio)) + } engineCfg, err := cs.getEngineCfgFromScmNsp(nsp) if err != nil { - errOut = errors.Wrap(err, "Engine with invalid configuration") - return + return 0, 0, errors.Wrapf(err, "%s: engine with invalid configuration", msg) } - rdb_size, errOut = cs.getRdbSize(engineCfg) - if errOut != nil { - return + rdbBytes, err = cs.getRdbSize(engineCfg) + if err != nil { + return 0, 0, errors.Wrapf(err, "%s: get rdb size with engine cfg %+v", msg, + engineCfg) } - break + + break // Just use first namespace. 
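The per-target VOS-index size computed just above is the SCM mount's usable bytes divided by the engine target count, then divided by memRatio (when non-zero) to project the on-SSD meta-blob size. With the defaults used by TestServer_bdevScan later in this diff (10 GiB usable, 16 targets) that is 640 MiB per target, or 1280 MiB once a 0.5 mem-ratio is applied. A minimal restatement of the same arithmetic, for illustration only:

// Mirrors the metaBytes calculation above; not a copy of the real function.
func projectedMetaBytes(usableBytes uint64, tgtCount int, memRatio float32) uint64 {
	meta := usableBytes / uint64(tgtCount)
	if memRatio > 0 {
		meta = uint64(float64(meta) / float64(memRatio))
	}
	return meta
}

// projectedMetaBytes(10*humanize.GiByte, 16, 0)   == 640 * humanize.MiByte
// projectedMetaBytes(10*humanize.GiByte, 16, 0.5) == 1280 * humanize.MiByte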
} - if md_size == 0 { + if metaBytes == 0 { cs.log.Noticef("instance %d: no SCM space available for metadata", ei.Index) + rdbBytes = 0 } + cs.log.Tracef("%s: computed meta sz %s and rdb sz %s", msg, humanize.IBytes(metaBytes), + humanize.IBytes(rdbBytes)) - return + return metaBytes, rdbBytes, nil } type deviceToAdjust struct { @@ -464,8 +489,20 @@ type deviceSizeStat struct { devs []*deviceToAdjust } +// Dedupe and remove sysXS target ID from slice before counting IDs. See +// storage.SmdDevice.UnmarshalJSON() for tgtID sanitization. +func getSmdTgtCount(log logging.Logger, sd *ctlpb.SmdDevice) int { + var sdOut storage.SmdDevice + if err := convert.Types(sd, &sdOut); err != nil { + log.Errorf("could not retrieve target count for smd %s", sd.GetUuid()) + return 0 + } + + return len(sdOut.TargetIDs) +} + // Add a device to the input map of device to which the usable size have to be adjusted -func (cs *ControlService) addDeviceToAdjust(devsStat map[uint32]*deviceSizeStat, devToAdjust *deviceToAdjust, dataClusterCount uint64) { +func (cs *ControlService) addDeviceToAdjust(devsStat map[uint32]*deviceSizeStat, devToAdjust *deviceToAdjust, dataClusterCount uint64, devTgtCount int) { dev := devToAdjust.ctlr.GetSmdDevices()[devToAdjust.idx] if devsStat[devToAdjust.rank] == nil { devsStat[devToAdjust.rank] = &deviceSizeStat{ @@ -473,44 +510,45 @@ func (cs *ControlService) addDeviceToAdjust(devsStat map[uint32]*deviceSizeStat, } } devsStat[devToAdjust.rank].devs = append(devsStat[devToAdjust.rank].devs, devToAdjust) - targetCount := uint64(len(dev.GetTgtIds())) - clusterPerTarget := dataClusterCount / targetCount + clusterPerTarget := dataClusterCount / uint64(devTgtCount) cs.log.Tracef("SMD device %s (rank %d, ctlr %s) added to the list of device to adjust", dev.GetUuid(), devToAdjust.rank, devToAdjust.ctlr.GetPciAddr()) if clusterPerTarget < devsStat[devToAdjust.rank].clusterPerTarget { - cs.log.Tracef("Updating number of clusters per target of rank %d: old=%d new=%d", - devToAdjust.rank, devsStat[devToAdjust.rank].clusterPerTarget, clusterPerTarget) + cs.log.Tracef("Updating number of clusters per target (%d/%d) of rank %d: old=%d "+ + "new=%d", dataClusterCount, devTgtCount, devToAdjust.rank, + devsStat[devToAdjust.rank].clusterPerTarget, clusterPerTarget) devsStat[devToAdjust.rank].clusterPerTarget = clusterPerTarget } } // For a given size in bytes, returns the total number of SPDK clusters needed for a given number of targets -func getClusterCount(sizeBytes uint64, targetNb uint64, clusterSize uint64) uint64 { +func getClusterCount(sizeBytes uint64, tgtCount int, clusterSize uint64) uint64 { clusterCount := sizeBytes / clusterSize if sizeBytes%clusterSize != 0 { clusterCount += 1 } - return clusterCount * targetNb + + return clusterCount * uint64(tgtCount) } func (cs *ControlService) getMetaClusterCount(engineCfg *engine.Config, devToAdjust deviceToAdjust) (subtrClusterCount uint64) { dev := devToAdjust.ctlr.GetSmdDevices()[devToAdjust.idx] clusterSize := uint64(dev.GetClusterSize()) - engineTargetNb := uint64(engineCfg.TargetCount) + // Calculate MD cluster overhead based on the number of targets allocated to the device + // as per-target blobs will be striped across all of a given role's SSDs. + devTgtCount := getSmdTgtCount(cs.log, dev) if dev.GetRoleBits()&storage.BdevRoleMeta != 0 { - // TODO DAOS-14223: GetMetaSize() should reflect custom values set through pool - // create --meta-size option. 
- clusterCount := getClusterCount(dev.GetMetaSize(), engineTargetNb, clusterSize) - cs.log.Tracef("Removing %d Metadata clusters (cluster size: %d) from the usable size of the SMD device %s (rank %d, ctlr %s): ", - clusterCount, clusterSize, dev.GetUuid(), devToAdjust.rank, devToAdjust.ctlr.GetPciAddr()) + clusterCount := getClusterCount(dev.GetMetaSize(), devTgtCount, clusterSize) + cs.log.Tracef("Removing %d Metadata clusters (cluster size: %d, dev tgts: %d) from the usable size of the SMD device %s (rank %d, ctlr %s): ", + clusterCount, clusterSize, devTgtCount, dev.GetUuid(), devToAdjust.rank, devToAdjust.ctlr.GetPciAddr()) subtrClusterCount += clusterCount } if dev.GetRoleBits()&storage.BdevRoleWAL != 0 { - clusterCount := getClusterCount(dev.GetMetaWalSize(), engineTargetNb, clusterSize) - cs.log.Tracef("Removing %d Metadata WAL clusters (cluster size: %d) from the usable size of the SMD device %s (rank %d, ctlr %s): ", - clusterCount, clusterSize, dev.GetUuid(), devToAdjust.rank, devToAdjust.ctlr.GetPciAddr()) + clusterCount := getClusterCount(dev.GetMetaWalSize(), devTgtCount, clusterSize) + cs.log.Tracef("Removing %d Metadata WAL clusters (cluster size: %d, dev tgts: %d) from the usable size of the SMD device %s (rank %d, ctlr %s): ", + clusterCount, clusterSize, devTgtCount, dev.GetUuid(), devToAdjust.rank, devToAdjust.ctlr.GetPciAddr()) subtrClusterCount += clusterCount } @@ -520,7 +558,7 @@ func (cs *ControlService) getMetaClusterCount(engineCfg *engine.Config, devToAdj if dev.GetRoleBits()&storage.BdevRoleMeta != 0 { clusterCount := getClusterCount(dev.GetRdbSize(), 1, clusterSize) - cs.log.Tracef("Removing %d RDB clusters (cluster size: %d) the usable size of the SMD device %s (rank %d, ctlr %s)", + cs.log.Tracef("Removing %d RDB clusters (cluster size: %d) from the usable size of the SMD device %s (rank %d, ctlr %s)", clusterCount, clusterSize, dev.GetUuid(), devToAdjust.rank, devToAdjust.ctlr.GetPciAddr()) subtrClusterCount += clusterCount } @@ -535,7 +573,8 @@ func (cs *ControlService) getMetaClusterCount(engineCfg *engine.Config, devToAdj return } -// Adjust the NVME available size to its real usable size. +// Estimate the NVME size available to store pool data after metadata overheads have been +// accounted for. 
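As a concrete illustration of the overhead this adjustment subtracts, take the SMD device mocked in TestServer_bdevScan later in this diff: 32 MiB clusters, four targets, a 640 MiB per-target meta size, and 1 GiB WAL and RDB sizes (the RDB WAL subtraction is assumed to behave the same way in the part of this function not shown here). Rounding each size up to whole clusters, as getClusterCount does, gives 20*4 = 80 meta, 32*4 = 128 WAL and 32 + 32 = 64 RDB clusters, the 272 clusters quoted in the test expectations. A sketch of that arithmetic with assumed example names:

// clusterCount mirrors getClusterCount above: round the size up to whole
// clusters, then multiply by the number of targets striping over the device.
func clusterCount(sizeBytes uint64, tgtCount int, clusterSize uint64) uint64 {
	n := sizeBytes / clusterSize
	if sizeBytes%clusterSize != 0 {
		n++
	}
	return n * uint64(tgtCount)
}

const exampleClusterSize = 32 * humanize.MiByte

var exampleOverhead = clusterCount(640*humanize.MiByte, 4, exampleClusterSize) + // 80 meta
	clusterCount(humanize.GiByte, 4, exampleClusterSize) + // 128 WAL
	clusterCount(humanize.GiByte, 1, exampleClusterSize) + // 32 RDB
	clusterCount(humanize.GiByte, 1, exampleClusterSize) // 32 RDB WAL (assumed)

// exampleOverhead == 272, so a 32 GiB device (1024 clusters) is left with
// (1024-272)*32 MiB of usable data space, matching the test expectation.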
func (cs *ControlService) adjustNvmeSize(resp *ctlpb.ScanNvmeResp) { devsStat := make(map[uint32]*deviceSizeStat, 0) for _, ctlr := range resp.GetCtrlrs() { @@ -547,6 +586,7 @@ func (cs *ControlService) adjustNvmeSize(resp *ctlpb.ScanNvmeResp) { for idx, dev := range ctlr.GetSmdDevices() { rank := dev.GetRank() + devTgtCount := getSmdTgtCount(cs.log, dev) if dev.GetRoleBits() != 0 && (dev.GetRoleBits()&storage.BdevRoleData) == 0 { cs.log.Debugf("SMD device %s (rank %d, ctlr %s) not used to store data (Role bits 0x%X)", @@ -565,7 +605,7 @@ func (cs *ControlService) adjustNvmeSize(resp *ctlpb.ScanNvmeResp) { continue } - if dev.GetClusterSize() == 0 || len(dev.GetTgtIds()) == 0 { + if dev.GetClusterSize() == 0 || devTgtCount == 0 { cs.log.Noticef("SMD device %s (rank %d, ctlr %s) not usable: missing storage info", dev.GetUuid(), rank, ctlr.GetPciAddr()) dev.AvailBytes = 0 @@ -579,7 +619,7 @@ func (cs *ControlService) adjustNvmeSize(resp *ctlpb.ScanNvmeResp) { clusterSize := uint64(dev.GetClusterSize()) availBytes := (dev.GetAvailBytes() / clusterSize) * clusterSize if dev.GetAvailBytes() != availBytes { - cs.log.Tracef("Adjusting available size of SMD device %s (rank %d, ctlr %s): from %s (%d Bytes) to %s (%d bytes)", + cs.log.Tracef("Rounding available size of SMD device %s based on cluster size (rank %d, ctlr %s): from %s (%d Bytes) to %s (%d bytes)", dev.GetUuid(), rank, ctlr.GetPciAddr(), humanize.Bytes(dev.GetAvailBytes()), dev.GetAvailBytes(), humanize.Bytes(availBytes), availBytes) @@ -595,7 +635,8 @@ func (cs *ControlService) adjustNvmeSize(resp *ctlpb.ScanNvmeResp) { if dev.GetRoleBits() == 0 { cs.log.Tracef("No meta-data stored on SMD device %s (rank %d, ctlr %s)", dev.GetUuid(), rank, ctlr.GetPciAddr()) - cs.addDeviceToAdjust(devsStat, &devToAdjust, dataClusterCount) + cs.addDeviceToAdjust(devsStat, &devToAdjust, dataClusterCount, + devTgtCount) continue } @@ -606,17 +647,19 @@ func (cs *ControlService) adjustNvmeSize(resp *ctlpb.ScanNvmeResp) { dev.UsableBytes = 0 continue } + cs.log.Tracef("Removing %d metadata clusters from %d total", + subtrClusterCount, dataClusterCount) dataClusterCount -= subtrClusterCount - cs.addDeviceToAdjust(devsStat, &devToAdjust, dataClusterCount) + cs.addDeviceToAdjust(devsStat, &devToAdjust, dataClusterCount, devTgtCount) } } for rank, item := range devsStat { for _, dev := range item.devs { smdDev := dev.ctlr.GetSmdDevices()[dev.idx] - targetCount := uint64(len(smdDev.GetTgtIds())) - smdDev.UsableBytes = targetCount * item.clusterPerTarget * smdDev.GetClusterSize() - cs.log.Debugf("Defining usable size of the SMD device %s (rank %d, ctlr %s) to %s (%d bytes)", + clusters := uint64(getSmdTgtCount(cs.log, smdDev)) * item.clusterPerTarget + smdDev.UsableBytes = clusters * smdDev.GetClusterSize() + cs.log.Debugf("Defining usable size of the SMD device %s (rank %d, ctlr %s) as %s (%d bytes)", smdDev.GetUuid(), rank, dev.ctlr.GetPciAddr(), humanize.Bytes(smdDev.GetUsableBytes()), smdDev.GetUsableBytes()) } @@ -680,13 +723,10 @@ func (cs *ControlService) adjustScmSize(resp *ctlpb.ScanScmResp) { } cmdPath := engineCfg.Storage.ControlMetadata.Path - if hasPrefix, err := common.HasPrefixPath(mountPath, cmdPath); hasPrefix || err != nil { - if err != nil { - cs.log.Noticef("Invalid SCM mount path or Control Metadata path: %q", err.Error()) - } - if hasPrefix { - removeControlPlaneMetadata(mnt) - } + if hasPrefix, err := common.HasPrefixPath(mountPath, cmdPath); err != nil { + cs.log.Noticef("Invalid SCM mount path or Control Metadata path: %q", err.Error()) + 
} else if hasPrefix { + removeControlPlaneMetadata(mnt) } } diff --git a/src/control/server/ctl_storage_rpc_test.go b/src/control/server/ctl_storage_rpc_test.go index bf2d7ee43b5..c1b795b8551 100644 --- a/src/control/server/ctl_storage_rpc_test.go +++ b/src/control/server/ctl_storage_rpc_test.go @@ -58,18 +58,44 @@ var ( ) func TestServer_bdevScan(t *testing.T) { + defTgtCount := 16 + defScmMountPt := "/mnt/daos0" + defScmDev := "/dev/pmem0" + defMountAvail := uint64(12) * humanize.GiByte + defMountUsable := uint64(10) * humanize.GiByte + defMetaSize := defMountUsable / uint64(defTgtCount) + defRdbSize := uint64(humanize.GiByte) + + mockSmd := func(roles uint32) *ctlpb.SmdDevice { + return &ctlpb.SmdDevice{ + Rank: uint32(0), + TgtIds: []int32{1, 2, 3, 4}, + // Avoid rounding + AvailBytes: 32 * humanize.GiByte, + ClusterSize: 32 * humanize.MiByte, + RoleBits: roles, + MetaSize: defMetaSize, + MetaWalSize: humanize.GiByte, + RdbSize: uint64(defRdbSize), + RdbWalSize: humanize.GiByte, + } + } + for name, tc := range map[string]struct { req *ctlpb.ScanNvmeReq disableHPs bool provRes *storage.BdevScanResponse provErr error + engTgtCount int engTierCfgs []storage.TierConfigs // one per-engine engStopped []bool // one per-engine (all false if unset) + scmNamespaces []*ctlpb.ScmNamespace // one per-engine engRes []ctlpb.ScanNvmeResp // one per-engine engErr []error // one per-engine expResp *ctlpb.ScanNvmeResp expErr error expBackendScanCalls []storage.BdevScanRequest + expRemoteScanCalls []*ctlpb.ScanNvmeReq }{ "nil request": { expErr: errNilReq, @@ -257,10 +283,77 @@ func TestServer_bdevScan(t *testing.T) { }, }, }, + "scan remote; bdevs in config; missing mount in config": { + req: &ctlpb.ScanNvmeReq{Health: true, Meta: true}, + engTierCfgs: []storage.TierConfigs{ + { + storage.NewTierConfig(). + WithStorageClass(storage.ClassNvme.String()). + WithBdevDeviceList(test.MockPCIAddr(1), + test.MockPCIAddr(2)), + }, + }, + engStopped: []bool{false}, + engErr: []error{nil}, + expErr: errors.New("unknown SCM mount point"), + }, + "scan remote; bdevs in config; adjustment skipped as no meta flag in req": { + req: &ctlpb.ScanNvmeReq{Health: true}, + engTierCfgs: []storage.TierConfigs{ + { + storage.NewTierConfig(). + WithStorageClass(storage.ClassNvme.String()). + WithBdevDeviceList(test.MockPCIAddr(1), + test.MockPCIAddr(2)), + }, + }, + engStopped: []bool{false}, + engErr: []error{nil}, + expResp: &ctlpb.ScanNvmeResp{ + Ctrlrs: proto.NvmeControllers{ + proto.MockNvmeController(2), + }, + State: new(ctlpb.ResponseState), + }, + expRemoteScanCalls: []*ctlpb.ScanNvmeReq{ + {Health: true}, + }, + }, + "scan remote; bdevs in config; zero namespaces": { + req: &ctlpb.ScanNvmeReq{Health: true, Meta: true}, + scmNamespaces: []*ctlpb.ScmNamespace{}, + engTierCfgs: []storage.TierConfigs{ + { + storage.NewTierConfig(). + WithStorageClass(storage.ClassDcpm.String()). + WithScmDeviceList(defScmDev). + WithScmMountPoint(defScmMountPt), + storage.NewTierConfig(). + WithStorageClass(storage.ClassNvme.String()). + WithBdevDeviceList(test.MockPCIAddr(1), + test.MockPCIAddr(2)), + }, + }, + engStopped: []bool{false}, + engErr: []error{nil}, + expResp: &ctlpb.ScanNvmeResp{ + Ctrlrs: proto.NvmeControllers{ + proto.MockNvmeController(2), + }, + State: new(ctlpb.ResponseState), + }, + expRemoteScanCalls: []*ctlpb.ScanNvmeReq{ + {Health: true, Meta: true}, + }, + }, "scan remote; bdevs in config": { req: &ctlpb.ScanNvmeReq{Health: true, Meta: true}, engTierCfgs: []storage.TierConfigs{ { + storage.NewTierConfig(). 
+ WithStorageClass(storage.ClassDcpm.String()). + WithScmDeviceList(defScmDev). + WithScmMountPoint(defScmMountPt), storage.NewTierConfig(). WithStorageClass(storage.ClassNvme.String()). WithBdevDeviceList(test.MockPCIAddr(1), @@ -275,23 +368,454 @@ func TestServer_bdevScan(t *testing.T) { }, State: new(ctlpb.ResponseState), }, + expRemoteScanCalls: []*ctlpb.ScanNvmeReq{ + {Health: true, Meta: true, MetaSize: defMetaSize, RdbSize: defRdbSize}, + }, + }, + "scan remote; bdev with md-on-ssd roles in config; no request flags; adjustments skipped": { + req: &ctlpb.ScanNvmeReq{}, + engTierCfgs: []storage.TierConfigs{ + { + storage.NewTierConfig(). + WithStorageClass(storage.ClassRam.String()). + WithScmMountPoint(defScmMountPt), + storage.NewTierConfig(). + WithStorageClass(storage.ClassNvme.String()). + WithBdevDeviceList(test.MockPCIAddr(1)). + WithBdevDeviceRoles(storage.BdevRoleAll), + }, + }, + engStopped: []bool{false}, + engErr: []error{nil}, + engRes: []ctlpb.ScanNvmeResp{ + ctlpb.ScanNvmeResp{ + Ctrlrs: proto.NvmeControllers{ + func() *ctlpb.NvmeController { + nc := proto.MockNvmeController(1) + nc.SmdDevices = []*ctlpb.SmdDevice{ + mockSmd(storage.BdevRoleAll), + } + return nc + }(), + }, + }, + }, + expResp: &ctlpb.ScanNvmeResp{ + Ctrlrs: proto.NvmeControllers{ + func() *ctlpb.NvmeController { + nc := proto.MockNvmeController(1) + nc.HealthStats = nil + nc.SmdDevices = []*ctlpb.SmdDevice{ + mockSmd(storage.BdevRoleAll), + } + return nc + }(), + }, + State: new(ctlpb.ResponseState), + }, + expRemoteScanCalls: []*ctlpb.ScanNvmeReq{{}}, + }, + "scan remote; bdev with md-on-ssd roles in config; no meta flag": { + req: &ctlpb.ScanNvmeReq{Health: true}, + engTierCfgs: []storage.TierConfigs{ + { + storage.NewTierConfig(). + WithStorageClass(storage.ClassRam.String()). + WithScmMountPoint(defScmMountPt), + storage.NewTierConfig(). + WithStorageClass(storage.ClassNvme.String()). + WithBdevDeviceList(test.MockPCIAddr(1)). + WithBdevDeviceRoles(storage.BdevRoleAll), + }, + }, + engStopped: []bool{false}, + engErr: []error{nil}, + engRes: []ctlpb.ScanNvmeResp{ + ctlpb.ScanNvmeResp{ + Ctrlrs: proto.NvmeControllers{ + func() *ctlpb.NvmeController { + nc := proto.MockNvmeController(1) + nc.SmdDevices = []*ctlpb.SmdDevice{ + mockSmd(storage.BdevRoleAll), + } + return nc + }(), + }, + }, + }, + expResp: &ctlpb.ScanNvmeResp{ + Ctrlrs: proto.NvmeControllers{ + func() *ctlpb.NvmeController { + nc := proto.MockNvmeController(1) + nc.SmdDevices = []*ctlpb.SmdDevice{ + mockSmd(storage.BdevRoleAll), + } + return nc + }(), + }, + State: new(ctlpb.ResponseState), + }, + expRemoteScanCalls: []*ctlpb.ScanNvmeReq{ + {Health: true}, + }, + }, + "scan remote; bdev with md-on-ssd roles in config; no md info in smd devs": { + req: &ctlpb.ScanNvmeReq{Health: true, Meta: true}, + engTierCfgs: []storage.TierConfigs{ + { + storage.NewTierConfig(). + WithStorageClass(storage.ClassRam.String()). + WithScmMountPoint(defScmMountPt), + storage.NewTierConfig(). + WithStorageClass(storage.ClassNvme.String()). + WithBdevDeviceList(test.MockPCIAddr(1)). 
+ WithBdevDeviceRoles(storage.BdevRoleAll), + }, + }, + engStopped: []bool{false}, + engErr: []error{nil}, + engRes: []ctlpb.ScanNvmeResp{ + ctlpb.ScanNvmeResp{ + Ctrlrs: proto.NvmeControllers{ + func() *ctlpb.NvmeController { + nc := proto.MockNvmeController(1) + sd := &ctlpb.SmdDevice{ + Rank: uint32(0), + TgtIds: []int32{1, 2, 3, 4}, + // Avoid rounding + AvailBytes: 32 * humanize.GiByte, + ClusterSize: 32 * humanize.MiByte, + RoleBits: storage.BdevRoleAll, + } + nc.SmdDevices = []*ctlpb.SmdDevice{sd} + return nc + }(), + }, + }, + }, + expResp: &ctlpb.ScanNvmeResp{ + Ctrlrs: proto.NvmeControllers{ + func() *ctlpb.NvmeController { + nc := proto.MockNvmeController(1) + sd := &ctlpb.SmdDevice{ + Rank: uint32(0), + TgtIds: []int32{1, 2, 3, 4}, + AvailBytes: 32 * humanize.GiByte, + ClusterSize: 32 * humanize.MiByte, + RoleBits: storage.BdevRoleAll, + UsableBytes: 32 * humanize.GiByte, + } + nc.SmdDevices = []*ctlpb.SmdDevice{sd} + return nc + }(), + }, + State: new(ctlpb.ResponseState), + }, + expRemoteScanCalls: []*ctlpb.ScanNvmeReq{ + {Health: true, Meta: true, MetaSize: defMetaSize, RdbSize: defRdbSize}, + }, + }, + "scan remote; bdev with md-on-ssd roles in config; nvme capacity adjusted": { + req: &ctlpb.ScanNvmeReq{Health: true, Meta: true}, + engTierCfgs: []storage.TierConfigs{ + { + storage.NewTierConfig(). + WithStorageClass(storage.ClassRam.String()). + WithScmMountPoint(defScmMountPt), + storage.NewTierConfig(). + WithStorageClass(storage.ClassNvme.String()). + WithBdevDeviceList(test.MockPCIAddr(1)). + WithBdevDeviceRoles(storage.BdevRoleAll), + }, + }, + engStopped: []bool{false}, + engErr: []error{nil}, + engRes: []ctlpb.ScanNvmeResp{ + ctlpb.ScanNvmeResp{ + Ctrlrs: proto.NvmeControllers{ + func() *ctlpb.NvmeController { + nc := proto.MockNvmeController(1) + nc.SmdDevices = []*ctlpb.SmdDevice{ + mockSmd(storage.BdevRoleAll), + } + return nc + }(), + }, + }, + }, + expResp: &ctlpb.ScanNvmeResp{ + Ctrlrs: proto.NvmeControllers{ + func() *ctlpb.NvmeController { + nc := proto.MockNvmeController(1) + sd := mockSmd(storage.BdevRoleAll) + // See TestServer_CtlSvc_adjustNvmeSize + // 80 metadata, 128 wal, 64 rdb = 272 clusters + sd.UsableBytes = (1024 - 272) * (32 * humanize.MiByte) + nc.SmdDevices = []*ctlpb.SmdDevice{sd} + return nc + }(), + }, + State: new(ctlpb.ResponseState), + }, + expRemoteScanCalls: []*ctlpb.ScanNvmeReq{ + {Health: true, Meta: true, MetaSize: defMetaSize, RdbSize: defRdbSize}, + }, + }, + "scan remote; bdev with md-on-ssd roles in config; no health flag": { + req: &ctlpb.ScanNvmeReq{Meta: true}, + engTierCfgs: []storage.TierConfigs{ + { + storage.NewTierConfig(). + WithStorageClass(storage.ClassRam.String()). + WithScmMountPoint(defScmMountPt), + storage.NewTierConfig(). + WithStorageClass(storage.ClassNvme.String()). + WithBdevDeviceList(test.MockPCIAddr(1)). 
+ WithBdevDeviceRoles(storage.BdevRoleAll), + }, + }, + engStopped: []bool{false}, + engErr: []error{nil}, + engRes: []ctlpb.ScanNvmeResp{ + ctlpb.ScanNvmeResp{ + Ctrlrs: proto.NvmeControllers{ + func() *ctlpb.NvmeController { + nc := proto.MockNvmeController(1) + nc.SmdDevices = []*ctlpb.SmdDevice{ + mockSmd(storage.BdevRoleAll), + } + return nc + }(), + }, + }, + }, + expResp: &ctlpb.ScanNvmeResp{ + Ctrlrs: proto.NvmeControllers{ + func() *ctlpb.NvmeController { + nc := proto.MockNvmeController(1) + nc.HealthStats = nil + sd := mockSmd(storage.BdevRoleAll) + sd.UsableBytes = (1024 - 272) * (32 * humanize.MiByte) + nc.SmdDevices = []*ctlpb.SmdDevice{sd} + return nc + }(), + }, + State: new(ctlpb.ResponseState), + }, + expRemoteScanCalls: []*ctlpb.ScanNvmeReq{ + {Meta: true, MetaSize: defMetaSize, RdbSize: defRdbSize}, + }, + }, + "scan remote; bdev with md-on-ssd roles in config; separate data role": { + req: &ctlpb.ScanNvmeReq{Health: true, Meta: true}, + engTierCfgs: []storage.TierConfigs{ + { + storage.NewTierConfig(). + WithStorageClass(storage.ClassRam.String()). + WithScmMountPoint(defScmMountPt), + // Roles are read from scan resp, adding here for posterity. + storage.NewTierConfig(). + WithStorageClass(storage.ClassNvme.String()). + WithBdevDeviceList(test.MockPCIAddr(1)). + WithBdevDeviceRoles( + storage.BdevRoleWAL | storage.BdevRoleMeta), + storage.NewTierConfig(). + WithStorageClass(storage.ClassNvme.String()). + WithBdevDeviceList(test.MockPCIAddr(2)). + WithBdevDeviceRoles(storage.BdevRoleData), + }, + }, + engStopped: []bool{false}, + engErr: []error{nil}, + engRes: []ctlpb.ScanNvmeResp{ + ctlpb.ScanNvmeResp{ + Ctrlrs: proto.NvmeControllers{ + func() *ctlpb.NvmeController { + nc := proto.MockNvmeController(1) + nc.SmdDevices = []*ctlpb.SmdDevice{ + mockSmd(storage.BdevRoleWAL | storage.BdevRoleMeta), + } + return nc + }(), + func() *ctlpb.NvmeController { + nc := proto.MockNvmeController(2) + nc.SmdDevices = []*ctlpb.SmdDevice{ + mockSmd(storage.BdevRoleData), + } + return nc + }(), + }, + }, + }, + expResp: &ctlpb.ScanNvmeResp{ + Ctrlrs: proto.NvmeControllers{ + func() *ctlpb.NvmeController { + nc := proto.MockNvmeController(1) + sd := mockSmd(storage.BdevRoleWAL | storage.BdevRoleMeta) + sd.AvailBytes = 0 + nc.SmdDevices = []*ctlpb.SmdDevice{sd} + return nc + }(), + func() *ctlpb.NvmeController { + nc := proto.MockNvmeController(2) + sd := mockSmd(storage.BdevRoleData) + sd.UsableBytes = 32 * humanize.GiByte + nc.SmdDevices = []*ctlpb.SmdDevice{sd} + return nc + }(), + }, + State: new(ctlpb.ResponseState), + }, + expRemoteScanCalls: []*ctlpb.ScanNvmeReq{ + {Health: true, Meta: true, MetaSize: defMetaSize, RdbSize: defRdbSize}, + }, + }, + "scan remote; bdev with md-on-ssd roles in config; phase-2 scan (mem-ratio in req)": { + req: &ctlpb.ScanNvmeReq{Meta: true, MemRatio: 0.5}, + engTierCfgs: []storage.TierConfigs{ + { + storage.NewTierConfig(). + WithStorageClass(storage.ClassRam.String()). + WithScmMountPoint(defScmMountPt), + storage.NewTierConfig(). + WithStorageClass(storage.ClassNvme.String()). + WithBdevDeviceList(test.MockPCIAddr(1)). + WithBdevDeviceRoles(storage.BdevRoleAll), + }, + }, + engStopped: []bool{false}, + engErr: []error{nil}, + engRes: []ctlpb.ScanNvmeResp{ + ctlpb.ScanNvmeResp{ + Ctrlrs: proto.NvmeControllers{ + func() *ctlpb.NvmeController { + nc := proto.MockNvmeController(1) + sd := mockSmd(storage.BdevRoleAll) + // Populated from scan request based on 0.5 MemRatio. 
+ sd.MetaSize = defMetaSize * 2 + nc.SmdDevices = []*ctlpb.SmdDevice{sd} + return nc + }(), + }, + }, + }, + expResp: &ctlpb.ScanNvmeResp{ + Ctrlrs: proto.NvmeControllers{ + func() *ctlpb.NvmeController { + nc := proto.MockNvmeController(1) + nc.HealthStats = nil + sd := mockSmd(storage.BdevRoleAll) + sd.MetaSize = defMetaSize * 2 + // Before doubling meta-size, 272 clusters removed from 1024 + // 128 wal, 64 rdb, 80 meta. Add 80 meta gives 352 to remove. + sd.UsableBytes = (1024 - 352) * (32 * humanize.MiByte) + nc.SmdDevices = []*ctlpb.SmdDevice{sd} + return nc + }(), + }, + State: new(ctlpb.ResponseState), + }, + expRemoteScanCalls: []*ctlpb.ScanNvmeReq{ + // Double MetaSize passed in request because of 0.5 MemRatio. + {Meta: true, MemRatio: 0.5, MetaSize: defMetaSize * 2, RdbSize: defRdbSize}, + }, + }, + "scan remote; bdev with md-on-ssd roles in config; duplicate and sysXS tgt ids": { + req: &ctlpb.ScanNvmeReq{Health: true, Meta: true}, + engTierCfgs: []storage.TierConfigs{ + { + storage.NewTierConfig(). + WithStorageClass(storage.ClassRam.String()). + WithScmMountPoint(defScmMountPt), + storage.NewTierConfig(). + WithStorageClass(storage.ClassNvme.String()). + WithBdevDeviceList(test.MockPCIAddr(1)). + WithBdevDeviceRoles(storage.BdevRoleAll), + }, + }, + engStopped: []bool{false}, + engErr: []error{nil}, + engRes: []ctlpb.ScanNvmeResp{ + ctlpb.ScanNvmeResp{ + Ctrlrs: proto.NvmeControllers{ + func() *ctlpb.NvmeController { + nc := proto.MockNvmeController(1) + sd := mockSmd(storage.BdevRoleAll) + sd.TgtIds = []int32{ + 1024, 1024, 1, 1, 2, 2, 3, 3, 4, 4, + } + nc.SmdDevices = []*ctlpb.SmdDevice{sd} + return nc + }(), + }, + }, + }, + expResp: &ctlpb.ScanNvmeResp{ + Ctrlrs: proto.NvmeControllers{ + func() *ctlpb.NvmeController { + nc := proto.MockNvmeController(1) + sd := mockSmd(storage.BdevRoleAll) + sd.TgtIds = []int32{ + // See storage.SmdDevice.UnmarshalJSON() + // for tgtID sanitization. + 1024, 1024, 1, 1, 2, 2, 3, 3, 4, 4, + } + // See TestServer_CtlSvc_adjustNvmeSize + // 80 metadata, 128 wal, 64 rdb = 272 clusters + sd.UsableBytes = (1024 - 272) * (32 * humanize.MiByte) + nc.SmdDevices = []*ctlpb.SmdDevice{sd} + return nc + }(), + }, + State: new(ctlpb.ResponseState), + }, + expRemoteScanCalls: []*ctlpb.ScanNvmeReq{ + {Health: true, Meta: true, MetaSize: defMetaSize, RdbSize: defRdbSize}, + }, }, "scan remote; collate results from multiple engines": { req: &ctlpb.ScanNvmeReq{Health: true, Meta: true}, engTierCfgs: []storage.TierConfigs{ { + storage.NewTierConfig(). + WithStorageClass(storage.ClassDcpm.String()). + WithScmDeviceList(defScmDev). + WithScmMountPoint(defScmMountPt), storage.NewTierConfig(). WithStorageClass(storage.ClassNvme.String()). WithBdevDeviceList(test.MockPCIAddr(1), test.MockPCIAddr(2)), }, { + storage.NewTierConfig(). + WithStorageClass(storage.ClassDcpm.String()). + WithScmDeviceList("/dev/pmem1"). + WithScmMountPoint("/mnt/daos1"), storage.NewTierConfig(). WithStorageClass(storage.ClassNvme.String()). 
WithBdevDeviceList(test.MockPCIAddr(3), test.MockPCIAddr(4)), }, }, + scmNamespaces: []*ctlpb.ScmNamespace{ + { + Mount: &ctlpb.ScmNamespace_Mount{ + Path: defScmMountPt, + AvailBytes: defMountAvail, + UsableBytes: defMountUsable, + Class: storage.ClassDcpm.String(), + }, + }, + { + Mount: &ctlpb.ScmNamespace_Mount{ + Rank: 1, + Path: "/mnt/daos1", + AvailBytes: defMountAvail, + UsableBytes: defMountUsable, + Class: storage.ClassDcpm.String(), + }, + }, + }, engRes: []ctlpb.ScanNvmeResp{ { Ctrlrs: proto.NvmeControllers{ @@ -319,17 +843,29 @@ func TestServer_bdevScan(t *testing.T) { }, State: new(ctlpb.ResponseState), }, + expRemoteScanCalls: []*ctlpb.ScanNvmeReq{ + {Health: true, Meta: true, MetaSize: defMetaSize, RdbSize: defRdbSize}, + {Health: true, Meta: true, MetaSize: defMetaSize, RdbSize: defRdbSize}, + }, }, "scan remote; both engine scans fail": { req: &ctlpb.ScanNvmeReq{Health: true, Meta: true}, engTierCfgs: []storage.TierConfigs{ { + storage.NewTierConfig(). + WithStorageClass(storage.ClassDcpm.String()). + WithScmDeviceList(defScmDev). + WithScmMountPoint(defScmMountPt), storage.NewTierConfig(). WithStorageClass(storage.ClassNvme.String()). WithBdevDeviceList(test.MockPCIAddr(1), test.MockPCIAddr(2)), }, { + storage.NewTierConfig(). + WithStorageClass(storage.ClassDcpm.String()). + WithScmDeviceList("/dev/pmem1"). + WithScmMountPoint("/mnt/daos1"), storage.NewTierConfig(). WithStorageClass(storage.ClassNvme.String()). WithBdevDeviceList(test.MockPCIAddr(3), @@ -345,18 +881,45 @@ func TestServer_bdevScan(t *testing.T) { req: &ctlpb.ScanNvmeReq{Health: true, Meta: true}, engTierCfgs: []storage.TierConfigs{ { + storage.NewTierConfig(). + WithStorageClass(storage.ClassDcpm.String()). + WithScmDeviceList(defScmDev). + WithScmMountPoint(defScmMountPt), storage.NewTierConfig(). WithStorageClass(storage.ClassNvme.String()). WithBdevDeviceList(test.MockPCIAddr(1), test.MockPCIAddr(2)), }, { + storage.NewTierConfig(). + WithStorageClass(storage.ClassDcpm.String()). + WithScmDeviceList("/dev/pmem1"). + WithScmMountPoint("/mnt/daos1"), storage.NewTierConfig(). WithStorageClass(storage.ClassNvme.String()). WithBdevDeviceList(test.MockPCIAddr(3), test.MockPCIAddr(4)), }, }, + scmNamespaces: []*ctlpb.ScmNamespace{ + { + Mount: &ctlpb.ScmNamespace_Mount{ + Path: defScmMountPt, + AvailBytes: defMountAvail, + UsableBytes: defMountUsable, + Class: storage.ClassDcpm.String(), + }, + }, + { + Mount: &ctlpb.ScmNamespace_Mount{ + Rank: 1, + Path: "/mnt/daos1", + AvailBytes: defMountAvail, + UsableBytes: defMountUsable, + Class: storage.ClassDcpm.String(), + }, + }, + }, engRes: []ctlpb.ScanNvmeResp{ {}, { @@ -379,6 +942,10 @@ func TestServer_bdevScan(t *testing.T) { Status: ctlpb.ResponseStatus_CTL_ERR_NVME, }, }, + expRemoteScanCalls: []*ctlpb.ScanNvmeReq{ + {Health: true, Meta: true, MetaSize: defMetaSize, RdbSize: defRdbSize}, + {Health: true, Meta: true, MetaSize: defMetaSize, RdbSize: defRdbSize}, + }, }, "scan remote; filter results based on request basic flag": { req: &ctlpb.ScanNvmeReq{Basic: true}, @@ -422,6 +989,9 @@ func TestServer_bdevScan(t *testing.T) { }, State: new(ctlpb.ResponseState), }, + expRemoteScanCalls: []*ctlpb.ScanNvmeReq{ + {Basic: true}, + }, }, "scan local; filter results based on request basic flag": { req: &ctlpb.ScanNvmeReq{Basic: true}, @@ -517,6 +1087,10 @@ func TestServer_bdevScan(t *testing.T) { req: &ctlpb.ScanNvmeReq{Meta: true}, engTierCfgs: []storage.TierConfigs{ { + storage.NewTierConfig(). + WithStorageClass(storage.ClassDcpm.String()). 
+ WithScmDeviceList(defScmDev). + WithScmMountPoint(defScmMountPt), storage.NewTierConfig(). WithStorageClass(storage.ClassNvme.String()). WithBdevDeviceList("0000:05:05.5"), @@ -538,6 +1112,9 @@ func TestServer_bdevScan(t *testing.T) { }, State: new(ctlpb.ResponseState), }, + expRemoteScanCalls: []*ctlpb.ScanNvmeReq{ + {Meta: true, MetaSize: defMetaSize, RdbSize: defRdbSize}, + }, }, } { t.Run(name, func(t *testing.T) { @@ -555,9 +1132,27 @@ func TestServer_bdevScan(t *testing.T) { t.Fatal("len tc.engStopped != len tc.tierCfgs") } + if tc.scmNamespaces == nil { + tc.scmNamespaces = []*ctlpb.ScmNamespace{ + { + Mount: &ctlpb.ScmNamespace_Mount{ + Path: defScmMountPt, + AvailBytes: defMountAvail, + UsableBytes: defMountUsable, + Class: storage.ClassRam.String(), + }, + }, + } + } + + var remoteScanCalls []*ctlpb.ScanNvmeReq idx := 0 // Mock per-engine-scan function to focus on unit testing bdevScan(). - scanEngineBdevs = func(_ context.Context, _ Engine, _ *ctlpb.ScanNvmeReq) (*ctlpb.ScanNvmeResp, error) { + scanEngineBdevs = func(_ context.Context, _ Engine, eReq *ctlpb.ScanNvmeReq) (*ctlpb.ScanNvmeResp, error) { + // Store request call. + remoteScanCalls = append(remoteScanCalls, eReq) + + // Generate response. if len(tc.engRes) <= idx { t.Fatal("engine scan called but response not specified") } @@ -567,15 +1162,20 @@ func TestServer_bdevScan(t *testing.T) { engRes := tc.engRes[idx] engErr := tc.engErr[idx] idx++ + return &engRes, engErr } defer func() { scanEngineBdevs = bdevScanEngine }() + if tc.engTgtCount == 0 { + tc.engTgtCount = defTgtCount + } engCfgs := []*engine.Config{} for _, tcs := range tc.engTierCfgs { - engCfg := engine.MockConfig().WithStorage(tcs...) + engCfg := engine.MockConfig().WithStorage(tcs...). + WithTargetCount(tc.engTgtCount) engCfgs = append(engCfgs, engCfg) } sCfg := config.DefaultServer().WithEngines(engCfgs...). @@ -591,7 +1191,7 @@ func TestServer_bdevScan(t *testing.T) { cs := newMockControlServiceFromBackends(t, log, sCfg, bmb, smb, nil, tc.engStopped...) - resp, err := bdevScan(test.Context(t), cs, tc.req, nil) + resp, err := bdevScan(test.Context(t), cs, tc.req, tc.scmNamespaces) test.CmpErr(t, tc.expErr, err) if err != nil { return @@ -609,19 +1209,27 @@ func TestServer_bdevScan(t *testing.T) { return x.Equals(y) }) + // Verify expected provider backend scan requests have been made. bmb.RLock() if len(tc.expBackendScanCalls) != len(bmb.ScanCalls) { t.Fatalf("unexpected number of backend scan calls, want %d got %d", len(tc.expBackendScanCalls), len(bmb.ScanCalls)) } - if len(tc.expBackendScanCalls) == 0 { - return - } if diff := cmp.Diff(tc.expBackendScanCalls, bmb.ScanCalls, append(defStorageScanCmpOpts, cmpopt)...); diff != "" { t.Fatalf("unexpected backend scan calls (-want, +got):\n%s\n", diff) } bmb.RUnlock() + + // Verify expected remote drpc scan requests have been made. 
+ if len(tc.expRemoteScanCalls) != len(remoteScanCalls) { + t.Fatalf("unexpected number of remote scan calls, want %d got %d", + len(tc.expRemoteScanCalls), len(remoteScanCalls)) + } + if diff := cmp.Diff(tc.expRemoteScanCalls, remoteScanCalls, + append(defStorageScanCmpOpts, cmpopt)...); diff != "" { + t.Fatalf("unexpected remote scan calls (-want, +got):\n%s\n", diff) + } }) } } @@ -2310,15 +2918,15 @@ func TestServer_CtlSvc_adjustNvmeSize(t *testing.T) { rdbWalSize uint64 = 512 * humanize.MiByte ) - type StorageCfg struct { + type storageCfg struct { targetCount int tierCfgs storage.TierConfigs } - type DataInput struct { - storageCfgs []*StorageCfg + type dataInput struct { + storageCfgs []*storageCfg scanNvmeResp *ctlpb.ScanNvmeResp } - type ExpectedOutput struct { + type expectedOutput struct { totalBytes []uint64 availableBytes []uint64 usableBytes []uint64 @@ -2344,12 +2952,12 @@ func TestServer_CtlSvc_adjustNvmeSize(t *testing.T) { } for name, tc := range map[string]struct { - input DataInput - output ExpectedOutput + input dataInput + output expectedOutput }{ "homogeneous": { - input: DataInput{ - storageCfgs: []*StorageCfg{ + input: dataInput{ + storageCfgs: []*storageCfg{ { targetCount: 12, tierCfgs: storage.TierConfigs{ @@ -2444,7 +3052,7 @@ func TestServer_CtlSvc_adjustNvmeSize(t *testing.T) { }, }, }, - output: ExpectedOutput{ + output: expectedOutput{ totalBytes: []uint64{ 10 * hugeClusterSize, 10 * hugeClusterSize, @@ -2469,8 +3077,8 @@ func TestServer_CtlSvc_adjustNvmeSize(t *testing.T) { }, }, "heterogeneous": { - input: DataInput{ - storageCfgs: []*StorageCfg{ + input: dataInput{ + storageCfgs: []*storageCfg{ { targetCount: 11, tierCfgs: storage.TierConfigs{ @@ -2567,7 +3175,7 @@ func TestServer_CtlSvc_adjustNvmeSize(t *testing.T) { }, }, }, - output: ExpectedOutput{ + output: expectedOutput{ totalBytes: []uint64{ 10 * hugeClusterSize, 10 * hugeClusterSize, @@ -2592,8 +3200,8 @@ func TestServer_CtlSvc_adjustNvmeSize(t *testing.T) { }, }, "new": { - input: DataInput{ - storageCfgs: []*StorageCfg{ + input: dataInput{ + storageCfgs: []*storageCfg{ { targetCount: 7, tierCfgs: storage.TierConfigs{ @@ -2637,7 +3245,7 @@ func TestServer_CtlSvc_adjustNvmeSize(t *testing.T) { }, }, }, - output: ExpectedOutput{ + output: expectedOutput{ totalBytes: []uint64{ 10 * hugeClusterSize, 10 * hugeClusterSize, @@ -2654,8 +3262,8 @@ func TestServer_CtlSvc_adjustNvmeSize(t *testing.T) { }, }, "evicted": { - input: DataInput{ - storageCfgs: []*StorageCfg{ + input: dataInput{ + storageCfgs: []*storageCfg{ { targetCount: 7, tierCfgs: storage.TierConfigs{ @@ -2699,7 +3307,7 @@ func TestServer_CtlSvc_adjustNvmeSize(t *testing.T) { }, }, }, - output: ExpectedOutput{ + output: expectedOutput{ totalBytes: []uint64{ 10 * hugeClusterSize, 10 * hugeClusterSize, @@ -2716,8 +3324,8 @@ func TestServer_CtlSvc_adjustNvmeSize(t *testing.T) { }, }, "missing targets": { - input: DataInput{ - storageCfgs: []*StorageCfg{ + input: dataInput{ + storageCfgs: []*storageCfg{ { targetCount: 4, tierCfgs: storage.TierConfigs{ @@ -2761,7 +3369,7 @@ func TestServer_CtlSvc_adjustNvmeSize(t *testing.T) { }, }, }, - output: ExpectedOutput{ + output: expectedOutput{ totalBytes: []uint64{ 10 * hugeClusterSize, 10 * hugeClusterSize, @@ -2778,8 +3386,8 @@ func TestServer_CtlSvc_adjustNvmeSize(t *testing.T) { }, }, "missing cluster size": { - input: DataInput{ - storageCfgs: []*StorageCfg{ + input: dataInput{ + storageCfgs: []*storageCfg{ { targetCount: 7, tierCfgs: storage.TierConfigs{ @@ -2822,7 +3430,7 @@ func 
TestServer_CtlSvc_adjustNvmeSize(t *testing.T) { }, }, }, - output: ExpectedOutput{ + output: expectedOutput{ totalBytes: []uint64{ 10 * hugeClusterSize, 10 * hugeClusterSize, @@ -2839,8 +3447,8 @@ func TestServer_CtlSvc_adjustNvmeSize(t *testing.T) { }, }, "multi bdev tier": { - input: DataInput{ - storageCfgs: []*StorageCfg{ + input: dataInput{ + storageCfgs: []*storageCfg{ { targetCount: 5, tierCfgs: storage.TierConfigs{newTierCfg(1)}, @@ -2873,7 +3481,7 @@ func TestServer_CtlSvc_adjustNvmeSize(t *testing.T) { SmdDevices: []*ctlpb.SmdDevice{ { Uuid: "nvme0", - TgtIds: []int32{0, 1, 2, 3}, + TgtIds: []int32{0, 1, 2, 3, 4}, TotalBytes: 10 * humanize.GiByte, AvailBytes: 10 * humanize.GiByte, ClusterSize: clusterSize, @@ -2903,7 +3511,7 @@ func TestServer_CtlSvc_adjustNvmeSize(t *testing.T) { SmdDevices: []*ctlpb.SmdDevice{ { Uuid: "nvme2", - TgtIds: []int32{0, 1, 2, 3}, + TgtIds: []int32{0, 1, 2, 3, 4, 5}, TotalBytes: 10 * humanize.GiByte, AvailBytes: 10 * humanize.GiByte, ClusterSize: clusterSize, @@ -2933,7 +3541,7 @@ func TestServer_CtlSvc_adjustNvmeSize(t *testing.T) { SmdDevices: []*ctlpb.SmdDevice{ { Uuid: "nvme4", - TgtIds: []int32{0, 1, 2, 3}, + TgtIds: []int32{0, 1, 2, 3, 4}, TotalBytes: 10 * humanize.GiByte, AvailBytes: 10 * humanize.GiByte, ClusterSize: clusterSize, @@ -2948,12 +3556,12 @@ func TestServer_CtlSvc_adjustNvmeSize(t *testing.T) { SmdDevices: []*ctlpb.SmdDevice{ { Uuid: "nvme5", - TgtIds: []int32{0, 1, 2, 3}, + TgtIds: []int32{0, 1, 2, 3, 4, 5}, TotalBytes: 10 * humanize.GiByte, AvailBytes: 10 * humanize.GiByte, ClusterSize: clusterSize, Rank: 5, - RoleBits: storage.BdevRoleMeta | storage.BdevRoleMeta, + RoleBits: storage.BdevRoleMeta | storage.BdevRoleWAL, }, }, DevState: devStateNormal, @@ -2961,7 +3569,7 @@ func TestServer_CtlSvc_adjustNvmeSize(t *testing.T) { }, }, }, - output: ExpectedOutput{ + output: expectedOutput{ totalBytes: []uint64{ 320 * clusterSize, 320 * clusterSize, @@ -2979,9 +3587,22 @@ func TestServer_CtlSvc_adjustNvmeSize(t *testing.T) { 0 * humanize.GiByte, }, usableBytes: []uint64{ + // 5tgts * 64mib = 320mib of meta on SSD (10 clusters) + // 256mib RDB = 8 clusters + // 320-18 = 302 remaining clusters + // 302 / 5 = 60 clusters-per-target (rounding diff) 300 * clusterSize, + // 4tgts * 128mib = 512mib of wal on SSD (16 clusters) + // 512mib WAL RDB = 16 clusters + // 320-32 = 288 remaining clusters 288 * clusterSize, - 260 * clusterSize, + // 6tgts * 64mib = 384mib of meta on SSD (12 clusters) + // 6tgts * 128mib = 768mib of wal on SSD (24 clusters) + // 256mib RDB = 8 clusters + // 512mib WAL RDB = 16 clusters + // 320-60 = 260 remaining clusters + // 260 / 6 = 43 clusters-per-target (rounding diff) + 258 * clusterSize, 0 * humanize.GiByte, 0 * humanize.GiByte, 0 * humanize.GiByte, @@ -3008,14 +3629,14 @@ func TestServer_CtlSvc_adjustNvmeSize(t *testing.T) { for idx, ctlr := range tc.input.scanNvmeResp.GetCtrlrs() { dev := ctlr.GetSmdDevices()[0] test.AssertEqual(t, tc.output.totalBytes[idx], dev.GetTotalBytes(), - fmt.Sprintf("Invalid total bytes with ctlr %s (index=%d): wait=%d, got=%d", + fmt.Sprintf("Invalid total bytes with ctlr %s (index=%d): want=%d, got=%d", ctlr.GetPciAddr(), idx, tc.output.totalBytes[idx], dev.GetTotalBytes())) test.AssertEqual(t, tc.output.availableBytes[idx], dev.GetAvailBytes(), - fmt.Sprintf("Invalid available bytes with ctlr %s (index=%d): wait=%d, got=%d", + fmt.Sprintf("Invalid available bytes with ctlr %s (index=%d): want=%d, got=%d", ctlr.GetPciAddr(), idx, tc.output.availableBytes[idx], 
dev.GetAvailBytes())) test.AssertEqual(t, tc.output.usableBytes[idx], dev.GetUsableBytes(), fmt.Sprintf("Invalid usable bytes with ctlr %s (index=%d), "+ - "wait=%d (%d clusters) got=%d (%d clusters)", + "want=%d (%d clusters) got=%d (%d clusters)", ctlr.GetPciAddr(), idx, tc.output.usableBytes[idx], tc.output.usableBytes[idx]/clusterSize, dev.GetUsableBytes(), dev.GetUsableBytes()/clusterSize)) @@ -3030,7 +3651,7 @@ func TestServer_CtlSvc_adjustNvmeSize(t *testing.T) { } func TestServer_getRdbSize(t *testing.T) { - type ExpectedOutput struct { + type expectedOutput struct { size uint64 message string err error @@ -3038,23 +3659,23 @@ func TestServer_getRdbSize(t *testing.T) { for name, tc := range map[string]struct { rdbSize string - output ExpectedOutput + output expectedOutput }{ "simple env var": { rdbSize: "DAOS_MD_CAP=1024", - output: ExpectedOutput{ + output: expectedOutput{ size: 1024 * humanize.MiByte, }, }, "simple default": { - output: ExpectedOutput{ + output: expectedOutput{ size: defaultRdbSize, message: "using default RDB file size", }, }, "invalid mdcap": { rdbSize: "DAOS_MD_CAP=foo", - output: ExpectedOutput{ + output: expectedOutput{ err: errors.New("invalid RDB file size"), }, }, @@ -3100,23 +3721,23 @@ func TestServer_CtlSvc_adjustScmSize(t *testing.T) { mountPoints []string } - type DataInput struct { + type dataInput struct { configs []*EngineConfig response *ctlpb.ScanScmResp } - type ExpectedOutput struct { + type expectedOutput struct { availableBytes []uint64 usableBytes []uint64 message string } for name, tc := range map[string]struct { - input DataInput - output ExpectedOutput + input dataInput + output expectedOutput }{ "single mountPoint": { - input: DataInput{ + input: dataInput{ configs: []*EngineConfig{ { mountPoints: []string{"/mnt/daos0"}, @@ -3128,19 +3749,19 @@ func TestServer_CtlSvc_adjustScmSize(t *testing.T) { Mount: &ctlpb.ScmNamespace_Mount{ Path: "/mnt/daos0", AvailBytes: uint64(64) * humanize.GiByte, - Class: storage.ClassFile.String(), + Class: storage.ClassRam.String(), }, }, }, }, }, - output: ExpectedOutput{ + output: expectedOutput{ availableBytes: []uint64{uint64(64) * humanize.GiByte}, usableBytes: []uint64{uint64(64)*humanize.GiByte - defaultRdbSize - mdDaosScmBytes - mdFsScmBytes}, }, }, "three mountPoints": { - input: DataInput{ + input: dataInput{ configs: []*EngineConfig{ { mdCap: "DAOS_MD_CAP=1024", @@ -3153,27 +3774,27 @@ func TestServer_CtlSvc_adjustScmSize(t *testing.T) { Mount: &ctlpb.ScmNamespace_Mount{ Path: "/mnt/daos0", AvailBytes: uint64(64) * humanize.GiByte, - Class: storage.ClassFile.String(), + Class: storage.ClassRam.String(), }, }, { Mount: &ctlpb.ScmNamespace_Mount{ Path: "/mnt/daos1", AvailBytes: uint64(32) * humanize.GiByte, - Class: storage.ClassFile.String(), + Class: storage.ClassRam.String(), }, }, { Mount: &ctlpb.ScmNamespace_Mount{ Path: "/mnt/daos2", AvailBytes: uint64(128) * humanize.GiByte, - Class: storage.ClassFile.String(), + Class: storage.ClassRam.String(), }, }, }, }, }, - output: ExpectedOutput{ + output: expectedOutput{ availableBytes: []uint64{ uint64(64) * humanize.GiByte, uint64(32) * humanize.GiByte, @@ -3187,7 +3808,7 @@ func TestServer_CtlSvc_adjustScmSize(t *testing.T) { }, }, "Missing SCM": { - input: DataInput{ + input: dataInput{ configs: []*EngineConfig{ { mdCap: "DAOS_MD_CAP=1024", @@ -3200,27 +3821,27 @@ func TestServer_CtlSvc_adjustScmSize(t *testing.T) { Mount: &ctlpb.ScmNamespace_Mount{ Path: "/mnt/daos0", AvailBytes: uint64(64) * humanize.GiByte, - Class: storage.ClassFile.String(), 
+ Class: storage.ClassRam.String(), }, }, { Mount: &ctlpb.ScmNamespace_Mount{ Path: "/mnt/daos1", AvailBytes: uint64(32) * humanize.GiByte, - Class: storage.ClassFile.String(), + Class: storage.ClassRam.String(), }, }, { Mount: &ctlpb.ScmNamespace_Mount{ Path: "/mnt/daos2", AvailBytes: uint64(128) * humanize.GiByte, - Class: storage.ClassFile.String(), + Class: storage.ClassRam.String(), }, }, }, }, }, - output: ExpectedOutput{ + output: expectedOutput{ availableBytes: []uint64{ uint64(64) * humanize.GiByte, uint64(32) * humanize.GiByte, @@ -3235,7 +3856,7 @@ func TestServer_CtlSvc_adjustScmSize(t *testing.T) { }, }, "No more space": { - input: DataInput{ + input: dataInput{ configs: []*EngineConfig{ { mountPoints: []string{"/mnt/daos0"}, @@ -3247,20 +3868,20 @@ func TestServer_CtlSvc_adjustScmSize(t *testing.T) { Mount: &ctlpb.ScmNamespace_Mount{ Path: "/mnt/daos0", AvailBytes: uint64(64) * humanize.KiByte, - Class: storage.ClassFile.String(), + Class: storage.ClassRam.String(), }, }, }, }, }, - output: ExpectedOutput{ + output: expectedOutput{ availableBytes: []uint64{uint64(64) * humanize.KiByte}, usableBytes: []uint64{0}, message: "No more usable space in SCM device", }, }, "Multi bdev Tiers": { - input: DataInput{ + input: dataInput{ configs: []*EngineConfig{ { mdCap: "DAOS_MD_CAP=1024", @@ -3274,20 +3895,20 @@ func TestServer_CtlSvc_adjustScmSize(t *testing.T) { Mount: &ctlpb.ScmNamespace_Mount{ Path: "/mnt", AvailBytes: uint64(64) * humanize.GiByte, - Class: storage.ClassFile.String(), + Class: storage.ClassRam.String(), }, }, { Mount: &ctlpb.ScmNamespace_Mount{ Path: "/opt", AvailBytes: uint64(32) * humanize.GiByte, - Class: storage.ClassFile.String(), + Class: storage.ClassRam.String(), }, }, }, }, }, - output: ExpectedOutput{ + output: expectedOutput{ availableBytes: []uint64{ uint64(64) * humanize.GiByte, uint64(32) * humanize.GiByte, @@ -3335,12 +3956,12 @@ func TestServer_CtlSvc_adjustScmSize(t *testing.T) { for index, namespace := range tc.input.response.Namespaces { test.AssertEqual(t, tc.output.availableBytes[index], namespace.GetMount().GetAvailBytes(), - fmt.Sprintf("Invalid SCM available bytes: nsp=%s, wait=%s (%d bytes), got=%s (%d bytes)", + fmt.Sprintf("Invalid SCM available bytes: nsp=%s, want=%s (%d bytes), got=%s (%d bytes)", namespace.GetMount().GetPath(), humanize.Bytes(tc.output.availableBytes[index]), tc.output.availableBytes[index], humanize.Bytes(namespace.GetMount().GetAvailBytes()), namespace.GetMount().GetAvailBytes())) test.AssertEqual(t, tc.output.usableBytes[index], namespace.GetMount().GetUsableBytes(), - fmt.Sprintf("Invalid SCM usable bytes: nsp=%s, wait=%s (%d bytes), got=%s (%d bytes)", + fmt.Sprintf("Invalid SCM usable bytes: nsp=%s, want=%s (%d bytes), got=%s (%d bytes)", namespace.GetMount().GetPath(), humanize.Bytes(tc.output.usableBytes[index]), tc.output.usableBytes[index], humanize.Bytes(namespace.GetMount().GetUsableBytes()), namespace.GetMount().GetUsableBytes())) @@ -3355,11 +3976,11 @@ func TestServer_CtlSvc_adjustScmSize(t *testing.T) { } func TestServer_CtlSvc_getEngineCfgFromNvmeCtl(t *testing.T) { - type DataInput struct { + type dataInput struct { tierCfgs storage.TierConfigs nvmeCtlr *ctl.NvmeController } - type ExpectedOutput struct { + type expectedOutput struct { res bool msg string } @@ -3376,32 +3997,32 @@ func TestServer_CtlSvc_getEngineCfgFromNvmeCtl(t *testing.T) { } for name, tc := range map[string]struct { - input DataInput - output ExpectedOutput + input dataInput + output expectedOutput }{ "find NVME Ctlr": { - input: 
DataInput{ + input: dataInput{ tierCfgs: newTierCfgs(5), nvmeCtlr: &ctl.NvmeController{ PciAddr: test.MockPCIAddr(3), }, }, - output: ExpectedOutput{res: true}, + output: expectedOutput{res: true}, }, "not find NVME Ctlr": { - input: DataInput{ + input: dataInput{ tierCfgs: newTierCfgs(5), nvmeCtlr: &ctl.NvmeController{ PciAddr: test.MockPCIAddr(13), }, }, - output: ExpectedOutput{ + output: expectedOutput{ res: false, msg: "unknown PCI device", }, }, "find VMD device": { - input: DataInput{ + input: dataInput{ tierCfgs: storage.TierConfigs{ storage.NewTierConfig(). WithStorageClass(storage.ClassNvme.String()). @@ -3411,10 +4032,10 @@ func TestServer_CtlSvc_getEngineCfgFromNvmeCtl(t *testing.T) { PciAddr: "040603:02:00.0", }, }, - output: ExpectedOutput{res: true}, + output: expectedOutput{res: true}, }, "Invalid address": { - input: DataInput{ + input: dataInput{ tierCfgs: storage.TierConfigs{ storage.NewTierConfig(). WithStorageClass(storage.ClassNvme.String()). @@ -3424,7 +4045,7 @@ func TestServer_CtlSvc_getEngineCfgFromNvmeCtl(t *testing.T) { PciAddr: "666", }, }, - output: ExpectedOutput{ + output: expectedOutput{ res: false, msg: "Invalid PCI address", }, @@ -3442,7 +4063,7 @@ func TestServer_CtlSvc_getEngineCfgFromNvmeCtl(t *testing.T) { if tc.output.res { test.AssertEqual(t, engineCfg, ec, - fmt.Sprintf("Invalid engine config: wait=%v got=%v", engineCfg, ec)) + fmt.Sprintf("Invalid engine config: want=%v got=%v", engineCfg, ec)) return } @@ -3456,11 +4077,11 @@ func TestServer_CtlSvc_getEngineCfgFromNvmeCtl(t *testing.T) { } func TestServer_CtlSvc_getEngineCfgFromScmNsp(t *testing.T) { - type DataInput struct { + type dataInput struct { tierCfgs storage.TierConfigs scmNsp *ctl.ScmNamespace } - type ExpectedOutput struct { + type expectedOutput struct { res bool msg string } @@ -3477,11 +4098,11 @@ func TestServer_CtlSvc_getEngineCfgFromScmNsp(t *testing.T) { } for name, tc := range map[string]struct { - input DataInput - output ExpectedOutput + input dataInput + output expectedOutput }{ "find SCM Nsp": { - input: DataInput{ + input: dataInput{ tierCfgs: newTierCfgs(5), scmNsp: &ctl.ScmNamespace{ Mount: &ctl.ScmNamespace_Mount{ @@ -3489,10 +4110,10 @@ func TestServer_CtlSvc_getEngineCfgFromScmNsp(t *testing.T) { }, }, }, - output: ExpectedOutput{res: true}, + output: expectedOutput{res: true}, }, "not find SCM Nsp": { - input: DataInput{ + input: dataInput{ tierCfgs: newTierCfgs(5), scmNsp: &ctl.ScmNamespace{ Mount: &ctl.ScmNamespace_Mount{ @@ -3500,7 +4121,7 @@ func TestServer_CtlSvc_getEngineCfgFromScmNsp(t *testing.T) { }, }, }, - output: ExpectedOutput{ + output: expectedOutput{ res: false, msg: "unknown SCM mount point"}, }, @@ -3517,7 +4138,7 @@ func TestServer_CtlSvc_getEngineCfgFromScmNsp(t *testing.T) { if tc.output.res { test.AssertEqual(t, engineCfg, ec, - fmt.Sprintf("Invalid engine config: wait=%v got=%v", engineCfg, ec)) + fmt.Sprintf("Invalid engine config: want=%v got=%v", engineCfg, ec)) return } diff --git a/src/control/server/faults.go b/src/control/server/faults.go index 5a5526e36fa..ad70f202c1c 100644 --- a/src/control/server/faults.go +++ b/src/control/server/faults.go @@ -171,6 +171,14 @@ func FaultNoCompatibilityInsecure(self, other build.Version) *fault.Fault { ) } +// FaultPoolMemRatioNoRoles indicates a fault when pool create request contains MD-on-SSD +// parameters but MD-on-SSD has not been enabled on the server. 
+var FaultPoolMemRatioNoRoles = serverFault(
+	code.ServerPoolMemRatioNoRoles,
+	"pool create request contains MD-on-SSD parameters but MD-on-SSD has not been enabled",
+	"either remove MD-on-SSD-specific options from the command request or set bdev_roles in "+
+		"server config file to enable MD-on-SSD")
+
 func FaultBadFaultDomainLabels(faultPath, addr string, reqLabels, systemLabels []string) *fault.Fault {
 	return serverFault(
 		code.ServerBadFaultDomainLabels,
diff --git a/src/control/server/mgmt_pool.go b/src/control/server/mgmt_pool.go
index bd9a0064787..4ad98ccd1bf 100644
--- a/src/control/server/mgmt_pool.go
+++ b/src/control/server/mgmt_pool.go
@@ -24,6 +24,7 @@ import (
 	"github.com/daos-stack/daos/src/control/lib/daos"
 	"github.com/daos-stack/daos/src/control/lib/ranklist"
 	"github.com/daos-stack/daos/src/control/server/engine"
+	"github.com/daos-stack/daos/src/control/server/storage"
 	"github.com/daos-stack/daos/src/control/system"
 )
 
@@ -160,9 +161,8 @@ func minPoolNvme(tgtCount, rankCount uint64) uint64 {
 	return minRankNvme(tgtCount) * rankCount
 }
 
-// calculateCreateStorage determines the amount of SCM/NVMe storage to
-// allocate per engine in order to fulfill the create request, if those
-// values are not already supplied as part of the request.
+// calculateCreateStorage determines the amount of SCM/NVMe storage to allocate per engine in order
+// to fulfill the create request, if those values are not already supplied as part of the request.
 func (svc *mgmtSvc) calculateCreateStorage(req *mgmtpb.PoolCreateReq) error {
 	instances := svc.harness.Instances()
 	if len(instances) < 1 {
@@ -172,11 +172,21 @@ func (svc *mgmtSvc) calculateCreateStorage(req *mgmtpb.PoolCreateReq) error {
 		return errors.New("zero ranks in calculateCreateStorage()")
 	}
 
-	// NB: The following logic is based on the assumption that
-	// a request will always include SCM as tier 0. Currently,
-	// we only support one additional tier, NVMe, which is
-	// optional. As we add support for other tiers, this logic
-	// will need to be updated.
+	mdOnSSD := instances[0].GetStorage().BdevRoleMetaConfigured()
+	switch {
+	case !mdOnSSD && req.MemRatio > 0:
+		// Prevent MD-on-SSD parameters being used in incompatible mode.
+		return FaultPoolMemRatioNoRoles
+	case mdOnSSD && req.MemRatio == 0:
+		// Set reasonable default if not set in MD-on-SSD mode.
+		req.MemRatio = storage.DefaultMemoryFileRatio
+		svc.log.Infof("Default memory-file:meta-blob size ratio of %d%% applied",
+			int(storage.DefaultMemoryFileRatio*100))
+	}
+
+	// NB: The following logic is based on the assumption that a request will always include SCM
+	// as tier 0. Currently, we only support one additional tier, NVMe, which is optional. As we
+	// add support for other tiers, this logic will need to be updated.
 
 	nvmeMissing := !instances[0].GetStorage().HasBlockDevices()
 
@@ -251,6 +261,7 @@ func (svc *mgmtSvc) PoolCreate(ctx context.Context, req *mgmtpb.PoolCreateReq) (
 	if err != nil {
 		return nil, err
 	}
+
 	return msg.(*mgmtpb.PoolCreateResp), nil
 }
 
@@ -300,7 +311,6 @@ func (svc *mgmtSvc) poolCreate(parent context.Context, req *mgmtpb.PoolCreateReq
 	resp.SvcReps = ranklist.RanksToUint32(ps.Replicas)
 	resp.TgtRanks = ranklist.RanksToUint32(ps.Storage.CreationRanks())
 	resp.TierBytes = ps.Storage.PerRankTierStorage
-	// TODO DAOS-14223: Store Meta-Blob-Size in sysdb.
 
 	return resp, nil
 }
 
@@ -947,6 +957,13 @@ func (svc *mgmtSvc) PoolQuery(ctx context.Context, req *mgmtpb.PoolQueryReq) (*m
 	// Preserve compatibility with pre-2.6 callers.
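The MemRatio gating added to calculateCreateStorage above can be read in isolation as a small rule set: reject MD-on-SSD parameters when no bdev roles are configured, and apply a default ratio when roles are configured but the caller left it unset. The sketch below is a minimal standalone illustration of that rule set; the names applyMemRatio, errMemRatioNoRoles and defaultMemRatio are assumptions for the example, not the server's actual identifiers.

package main

import (
	"errors"
	"fmt"
)

// Hypothetical stand-ins for the server's fault and default-ratio values.
var errMemRatioNoRoles = errors.New("md-on-ssd parameters given but md-on-ssd not enabled")

const defaultMemRatio float32 = 1.0 // phase-1 behaviour: memory file equals meta blob

// applyMemRatio mirrors the switch in calculateCreateStorage: reject a mem-ratio
// when bdev roles are not configured, default it when roles are configured but
// the caller left it unset, and otherwise pass the value through unchanged.
func applyMemRatio(mdOnSSD bool, memRatio float32) (float32, error) {
	switch {
	case !mdOnSSD && memRatio > 0:
		return 0, errMemRatioNoRoles
	case mdOnSSD && memRatio == 0:
		return defaultMemRatio, nil
	}
	return memRatio, nil
}

func main() {
	for _, tc := range []struct {
		mdOnSSD  bool
		memRatio float32
	}{{false, 0.2}, {true, 0}, {true, 0.25}} {
		ratio, err := applyMemRatio(tc.mdOnSSD, tc.memRatio)
		fmt.Printf("mdOnSSD=%v in=%.2f -> ratio=%.2f err=%v\n",
			tc.mdOnSSD, tc.memRatio, ratio, err)
	}
}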
resp.Leader = resp.SvcLdr + // TODO DAOS-16209: After VOS query API is updated, zero-value mem_file_bytes will be + // returned in non-MD-on-SSD mode and this hack can be removed. + storage := svc.harness.Instances()[0].GetStorage() + if !storage.ControlMetadataPathConfigured() { + resp.MemFileBytes = 0 + } + return resp, nil } @@ -966,6 +983,15 @@ func (svc *mgmtSvc) PoolQueryTarget(ctx context.Context, req *mgmtpb.PoolQueryTa return nil, errors.Wrap(err, "unmarshal PoolQueryTarget response") } + // TODO DAOS-16209: After VOS query API is updated, zero-value mem_file_bytes will be + // returned in non-MD-on-SSD mode and this hack can be removed. + storage := svc.harness.Instances()[0].GetStorage() + if !storage.ControlMetadataPathConfigured() { + for _, tgtInfo := range resp.Infos { + tgtInfo.MemFileBytes = 0 + } + } + return resp, nil } diff --git a/src/control/server/mgmt_pool_test.go b/src/control/server/mgmt_pool_test.go index 31684752af2..24f109cf196 100644 --- a/src/control/server/mgmt_pool_test.go +++ b/src/control/server/mgmt_pool_test.go @@ -200,10 +200,11 @@ func TestServer_MgmtSvc_calculateCreateStorage(t *testing.T) { nvmeTooSmallReq := nvmeTooSmallTotal for name, tc := range map[string]struct { - disableNVMe bool - in *mgmtpb.PoolCreateReq - expOut *mgmtpb.PoolCreateReq - expErr error + disableNVMe bool + enableMdOnSsd bool + in *mgmtpb.PoolCreateReq + expOut *mgmtpb.PoolCreateReq + expErr error }{ "auto sizing": { in: &mgmtpb.PoolCreateReq{ @@ -245,6 +246,15 @@ func TestServer_MgmtSvc_calculateCreateStorage(t *testing.T) { Ranks: []uint32{0}, }, }, + "auto sizing (mem-ratio but not MD-on-SSD)": { + in: &mgmtpb.PoolCreateReq{ + TotalBytes: defaultTotal, + TierRatio: defaultRatios, + Ranks: []uint32{0, 1}, + MemRatio: 0.2, + }, + expErr: FaultPoolMemRatioNoRoles, + }, "tier bytes set for both (no NVMe in config)": { disableNVMe: true, in: &mgmtpb.PoolCreateReq{ @@ -264,6 +274,39 @@ func TestServer_MgmtSvc_calculateCreateStorage(t *testing.T) { Ranks: []uint32{0}, }, }, + "mem-ratio is set (mdonssd not configured)": { + in: &mgmtpb.PoolCreateReq{ + TierBytes: []uint64{defaultScmBytes - 1, defaultNvmeBytes - 1}, + Ranks: []uint32{0}, + MemRatio: storage.DefaultMemoryFileRatio, + }, + expErr: FaultPoolMemRatioNoRoles, + }, + "mem-ratio is unset (mdonssd configured)": { + enableMdOnSsd: true, + in: &mgmtpb.PoolCreateReq{ + TierBytes: []uint64{defaultScmBytes - 1, defaultNvmeBytes - 1}, + Ranks: []uint32{0}, + }, + expOut: &mgmtpb.PoolCreateReq{ + TierBytes: []uint64{defaultScmBytes - 1, defaultNvmeBytes - 1}, + Ranks: []uint32{0}, + MemRatio: storage.DefaultMemoryFileRatio, + }, + }, + "mem-ratio is set (mdonssd configured)": { + enableMdOnSsd: true, + in: &mgmtpb.PoolCreateReq{ + TierBytes: []uint64{defaultScmBytes - 1, defaultNvmeBytes - 1}, + Ranks: []uint32{0}, + MemRatio: 0.25, + }, + expOut: &mgmtpb.PoolCreateReq{ + TierBytes: []uint64{defaultScmBytes - 1, defaultNvmeBytes - 1}, + Ranks: []uint32{0}, + MemRatio: 0.25, + }, + }, "manual sizing": { in: &mgmtpb.PoolCreateReq{ TierBytes: []uint64{defaultScmBytes - 1, defaultNvmeBytes - 1}, @@ -288,6 +331,27 @@ func TestServer_MgmtSvc_calculateCreateStorage(t *testing.T) { }, expErr: FaultPoolNvmeTooSmall(nvmeTooSmallReq, minPoolNvme), }, + "manual sizing (MD-on-SSD syntax used)": { + enableMdOnSsd: true, + in: &mgmtpb.PoolCreateReq{ + TierBytes: []uint64{defaultScmBytes - 1, defaultNvmeBytes - 1}, + Ranks: []uint32{0, 1}, + MemRatio: 1, + }, + expOut: &mgmtpb.PoolCreateReq{ + TierBytes: []uint64{defaultScmBytes - 1, 
defaultNvmeBytes - 1}, + Ranks: []uint32{0, 1}, + MemRatio: 1, + }, + }, + "manual sizing (MD-on-SSD syntax used but not MD-on-SSD)": { + in: &mgmtpb.PoolCreateReq{ + TierBytes: []uint64{defaultScmBytes - 1, defaultNvmeBytes - 1}, + Ranks: []uint32{0, 1}, + MemRatio: 1, + }, + expErr: FaultPoolMemRatioNoRoles, + }, } { t.Run(name, func(t *testing.T) { log, buf := logging.NewTestLogger(t.Name()) @@ -301,7 +365,12 @@ func TestServer_MgmtSvc_calculateCreateStorage(t *testing.T) { WithStorageClass("nvme"). WithBdevDeviceList("foo", "bar"), ) + if tc.enableMdOnSsd { + engineCfg.Storage.Tiers[0]. + WithBdevDeviceRoles(storage.BdevRoleAll) + } } + svc := newTestMgmtSvc(t, log) sp := storage.MockProvider(log, 0, &engineCfg.Storage, nil, nil, nil, nil) svc.harness.instances[0] = newTestEngine(log, false, sp, engineCfg) @@ -417,6 +486,26 @@ func TestServer_MgmtSvc_PoolCreate(t *testing.T) { TgtRanks: []uint32{0, 1}, }, }, + "successful creation with memory file ratio": { + targetCount: 8, + mdonssdEnabled: true, + req: &mgmtpb.PoolCreateReq{ + Uuid: test.MockUUID(1), + TierBytes: []uint64{100 * humanize.GiByte, 10 * humanize.TByte}, + MemRatio: storage.DefaultMemoryFileRatio, + Properties: testPoolLabelProp(), + }, + drpcRet: &mgmtpb.PoolCreateResp{ + TierBytes: []uint64{100 * humanize.GiByte, 10 * humanize.TByte}, + MemFileBytes: 50 * humanize.GiByte, + TgtRanks: []uint32{0, 1}, + }, + expResp: &mgmtpb.PoolCreateResp{ + TierBytes: []uint64{100 * humanize.GiByte, 10 * humanize.TByte}, + MemFileBytes: 50 * humanize.GiByte, + TgtRanks: []uint32{0, 1}, + }, + }, "successful creation minimum size": { targetCount: 8, req: &mgmtpb.PoolCreateReq{ diff --git a/src/control/server/storage/bdev.go b/src/control/server/storage/bdev.go index 981a64e936a..96286052fcf 100644 --- a/src/control/server/storage/bdev.go +++ b/src/control/server/storage/bdev.go @@ -39,6 +39,12 @@ const ( sysXSTgtID = 1024 // Minimum amount of hugepage memory (in bytes) needed for each target. memHugepageMinPerTarget = 1 << 30 // 1GiB + + // DefaultMemoryFileRatio (mem_size:meta_size) describes the behavior of MD-on-SSD in + // phase-1 mode where the per-target-meta-blob size is equal to the per-target-VOS-file + // size. In phase-2 mode where the per-target-meta-blob size is greater than + // per-target-VOS-file size, the memory file ratio will be less than one. + DefaultMemoryFileRatio = 1.0 ) // JSON config file constants. @@ -312,6 +318,7 @@ func (sd *SmdDevice) UnmarshalJSON(data []byte) error { sd.Roles.OptionBits = OptionBits(from.RoleBits) } + // Handle any duplicate target IDs and set flag instead of sysXS target ID. seen := make(map[int32]bool) newTgts := make([]int32, 0, len(sd.TargetIDs)) for _, i := range sd.TargetIDs { @@ -389,6 +396,26 @@ func (nc NvmeController) Free() (tb uint64) { return } +// Roles returns bdev_roles for NVMe controller being used in MD-on-SSD mode. Assume that all SMD +// devices on a controller have the same roles. +func (nc *NvmeController) Roles() *BdevRoles { + if len(nc.SmdDevices) > 0 { + return &nc.SmdDevices[0].Roles + } + + return &BdevRoles{} +} + +// Rank returns rank on which this NVMe controller is being used. Assume that all SMD devices on a +// controller have the same rank. +func (nc *NvmeController) Rank() ranklist.Rank { + if len(nc.SmdDevices) > 0 { + return nc.SmdDevices[0].Rank + } + + return ranklist.NilRank +} + // NvmeControllers is a type alias for []*NvmeController. 
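The usable-capacity expectations in TestServer_CtlSvc_adjustNvmeSize and the MetaSize doubling in the 0.5 MemRatio scan case both follow the cluster arithmetic spelled out in the test comments. The sketch below reproduces that arithmetic using the numbers quoted in the "multi bdev tier" case (5 targets, 64MiB meta per target, a 256MiB RDB, 32MiB clusters, 320 clusters total); it is an illustration of the rounding under those assumed inputs, not the production adjustNvmeSize code.

package main

import "fmt"

const mib = 1 << 20

// clusters rounds a byte count up to whole SPDK clusters.
func clusters(bytes, clusterSize uint64) uint64 {
	return (bytes + clusterSize - 1) / clusterSize
}

func main() {
	const (
		clusterSize   uint64 = 32 * mib
		totalClusters uint64 = 320
		targets       uint64 = 5
		metaPerTgt    uint64 = 64 * mib
		rdbSize       uint64 = 256 * mib
	)

	// 5 targets * 64MiB meta = 10 clusters, 256MiB RDB = 8 clusters.
	reserved := clusters(metaPerTgt*targets, clusterSize) + clusters(rdbSize, clusterSize)

	// 320 - 18 = 302 remaining, 302 / 5 = 60 whole clusters per target.
	perTarget := (totalClusters - reserved) / targets
	usable := perTarget * targets * clusterSize

	fmt.Printf("usable: %d bytes (%d clusters)\n", usable, usable/clusterSize) // 300 clusters

	// A MemRatio below one scales the per-target meta-blob size up by 1/ratio,
	// which is why the 0.5 MemRatio scan case sends MetaSize*2 in its request.
	fmt.Printf("meta per target at MemRatio 0.5: %d bytes\n",
		uint64(float64(metaPerTgt)/0.5))
}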
type NvmeControllers []*NvmeController
 
@@ -404,6 +431,11 @@ func (ncs NvmeControllers) String() string {
 	return strings.Join(ss, ", ")
 }
 
+// Len returns the length of the NvmeController reference slice.
+func (ncs NvmeControllers) Len() int {
+	return len(ncs)
+}
+
 // Capacity returns the cumulative total bytes of all controller capacities.
 func (ncs NvmeControllers) Capacity() (tb uint64) {
 	for _, c := range ncs {
diff --git a/src/control/server/storage/config.go b/src/control/server/storage/config.go
index 180b9663b31..402bee1c400 100644
--- a/src/control/server/storage/config.go
+++ b/src/control/server/storage/config.go
@@ -319,8 +319,7 @@ func (tcs TierConfigs) HasBdevRoleMeta() bool {
 	}
 
 	for _, bc := range tcs.BdevConfigs() {
-		bits := bc.Bdev.DeviceRoles.OptionBits
-		if (bits & BdevRoleMeta) != 0 {
+		if bc.Bdev.DeviceRoles.HasMeta() {
 			return true
 		}
 	}
@@ -403,10 +402,9 @@ func (tcs TierConfigs) validateBdevRoles() error {
 			return FaultBdevConfigRolesMissing
 		}
 
-		bits := roles.OptionBits
-		hasWAL := (bits & BdevRoleWAL) != 0
-		hasMeta := (bits & BdevRoleMeta) != 0
-		hasData := (bits & BdevRoleData) != 0
+		hasWAL := roles.HasWAL()
+		hasMeta := roles.HasMeta()
+		hasData := roles.HasData()
 
 		// Disallow having both wal and data only on a tier.
 		if hasWAL && hasData && !hasMeta {
@@ -942,6 +940,33 @@ func (bdr *BdevRoles) String() string {
 	return bdr.toString(roleOptFlags)
 }
 
+// HasData returns true if BdevRoles has DATA role set.
+func (bdr *BdevRoles) HasData() bool {
+	if bdr == nil {
+		return false
+	}
+
+	return bdr.OptionBits&BdevRoleData != 0
+}
+
+// HasMeta returns true if BdevRoles has META role set.
+func (bdr *BdevRoles) HasMeta() bool {
+	if bdr == nil {
+		return false
+	}
+
+	return bdr.OptionBits&BdevRoleMeta != 0
+}
+
+// HasWAL returns true if BdevRoles has WAL role set.
+func (bdr *BdevRoles) HasWAL() bool {
+	if bdr == nil {
+		return false
+	}
+
+	return bdr.OptionBits&BdevRoleWAL != 0
+}
+
 // BdevConfig represents a Block Device (NVMe, etc.) configuration entry.
 type BdevConfig struct {
 	DeviceList *BdevDeviceList `yaml:"bdev_list,omitempty"`
@@ -1105,6 +1130,7 @@ type BdevAutoFaulty struct {
 	MaxCsumErrs uint32 `yaml:"max_csum_errs,omitempty" json:"max_csum_errs"`
 }
 
+// Config defines engine storage.
 type Config struct {
 	ControlMetadata ControlMetadata `yaml:"-"` // inherited from server
 	EngineIdx       uint            `yaml:"-"`
@@ -1118,6 +1144,7 @@ type Config struct {
 	AutoFaultyProps BdevAutoFaulty `yaml:"bdev_auto_faulty,omitempty"`
 }
 
+// SetNUMAAffinity enables the assignment of NUMA affinity to tier configs.
 func (c *Config) SetNUMAAffinity(node uint) {
 	c.NumaNodeIndex = node
 	for _, tier := range c.Tiers {
@@ -1125,14 +1152,12 @@ func (c *Config) SetNUMAAffinity(node uint) {
 	}
 }
 
+// GetBdevs retrieves bdev device list of storage tiers.
 func (c *Config) GetBdevs() *BdevDeviceList {
 	return c.Tiers.Bdevs()
 }
 
-func (c *Config) GetNVMeBdevs() *BdevDeviceList {
-	return c.Tiers.NVMeBdevs()
-}
-
+// Validate checks the validity of the storage config.
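The new BdevRoles helpers replace the open-coded OptionBits masking, and validateBdevRoles uses them for the tier rule shown above (a tier carrying both WAL and Data must also carry Meta). Below is a minimal standalone sketch of that check; the bit values and names are illustrative assumptions, the real BdevRole* constants live in the storage package.

package main

import (
	"errors"
	"fmt"
)

type optionBits uint16

// Illustrative role bits; not the storage package's actual values.
const (
	roleData optionBits = 1 << iota
	roleMeta
	roleWAL
)

func (b optionBits) hasData() bool { return b&roleData != 0 }
func (b optionBits) hasMeta() bool { return b&roleMeta != 0 }
func (b optionBits) hasWAL() bool  { return b&roleWAL != 0 }

// checkTierRoles mirrors the rule referenced above: a tier with both WAL and
// Data roles must also host Meta.
func checkTierRoles(roles optionBits) error {
	if roles.hasWAL() && roles.hasData() && !roles.hasMeta() {
		return errors.New("tier with wal and data roles must also host meta")
	}
	return nil
}

func main() {
	fmt.Println(checkTierRoles(roleWAL | roleData))            // rejected
	fmt.Println(checkTierRoles(roleWAL | roleData | roleMeta)) // accepted
}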
func (c *Config) Validate() error { if err := c.Tiers.Validate(); err != nil { return err diff --git a/src/dtx/dtx_coll.c b/src/dtx/dtx_coll.c index 7e7c8199155..ba45aaa2616 100644 --- a/src/dtx/dtx_coll.c +++ b/src/dtx/dtx_coll.c @@ -81,9 +81,11 @@ dtx_coll_prep_ult(void *arg) } if (dcpa->dcpa_result != 0) { - if (dcpa->dcpa_result != -DER_INPROGRESS && dcpa->dcpa_result != -DER_NONEXIST) - D_ERROR("Failed to load mbs for "DF_DTI", opc %u: "DF_RC"\n", - DP_DTI(&dci->dci_xid), opc, DP_RC(rc)); + if (dcpa->dcpa_result < 0 && + dcpa->dcpa_result != -DER_INPROGRESS && dcpa->dcpa_result != -DER_NONEXIST) + D_ERROR("Failed to load mbs for "DF_DTI" in "DF_UUID"/"DF_UUID", opc %u: " + DF_RC"\n", DP_DTI(&dci->dci_xid), DP_UUID(dci->dci_po_uuid), + DP_UUID(dci->dci_co_uuid), opc, DP_RC(dcpa->dcpa_result)); goto out; } diff --git a/src/dtx/dtx_common.c b/src/dtx/dtx_common.c index ecb156729ed..1ee74ae11a4 100644 --- a/src/dtx/dtx_common.c +++ b/src/dtx/dtx_common.c @@ -1271,7 +1271,6 @@ dtx_leader_end(struct dtx_leader_handle *dlh, struct ds_cont_hdl *coh, int resul int status = -1; int rc = 0; bool aborted = false; - bool unpin = false; D_ASSERT(cont != NULL); @@ -1339,7 +1338,7 @@ dtx_leader_end(struct dtx_leader_handle *dlh, struct ds_cont_hdl *coh, int resul * it persistently. Otherwise, the subsequent DTX resync may not find it as * to regard it as failed transaction and abort it. */ - if (result == 0 && !dth->dth_active && !dth->dth_prepared && !dth->dth_solo && + if (!dth->dth_active && !dth->dth_prepared && (dth->dth_dist || dth->dth_modification_cnt > 0)) { result = vos_dtx_attach(dth, true, dth->dth_ent != NULL ? true : false); if (unlikely(result < 0)) { @@ -1349,7 +1348,7 @@ dtx_leader_end(struct dtx_leader_handle *dlh, struct ds_cont_hdl *coh, int resul } } - if (dth->dth_prepared || dtx_batched_ult_max == 0) { + if ((dth->dth_prepared && !dlh->dlh_coll) || dtx_batched_ult_max == 0) { dth->dth_sync = 1; goto sync; } @@ -1363,14 +1362,12 @@ dtx_leader_end(struct dtx_leader_handle *dlh, struct ds_cont_hdl *coh, int resul if (DAOS_FAIL_CHECK(DAOS_DTX_MISS_COMMIT)) dth->dth_sync = 1; - /* For synchronous DTX, do not add it into CoS cache, otherwise, - * we may have no way to remove it from the cache. 
- */ if (dth->dth_sync) goto sync; D_ASSERT(dth->dth_mbs != NULL); +cache: if (dlh->dlh_coll) { rc = dtx_cos_add(cont, dlh->dlh_coll_entry, &dth->dth_leader_oid, dth->dth_dkey_hash, dth->dth_epoch, DCF_EXP_CMT | DCF_COLL); @@ -1378,38 +1375,47 @@ dtx_leader_end(struct dtx_leader_handle *dlh, struct ds_cont_hdl *coh, int resul size = sizeof(*dte) + sizeof(*mbs) + dth->dth_mbs->dm_data_size; D_ALLOC(dte, size); if (dte == NULL) { - dth->dth_sync = 1; - goto sync; - } - - mbs = (struct dtx_memberships *)(dte + 1); - memcpy(mbs, dth->dth_mbs, size - sizeof(*dte)); - - dte->dte_xid = dth->dth_xid; - dte->dte_ver = dth->dth_ver; - dte->dte_refs = 1; - dte->dte_mbs = mbs; + rc = -DER_NOMEM; + } else { + mbs = (struct dtx_memberships *)(dte + 1); + memcpy(mbs, dth->dth_mbs, size - sizeof(*dte)); + + dte->dte_xid = dth->dth_xid; + dte->dte_ver = dth->dth_ver; + dte->dte_refs = 1; + dte->dte_mbs = mbs; + + if (!(mbs->dm_flags & DMF_SRDG_REP)) + flags = DCF_EXP_CMT; + else if (dth->dth_modify_shared) + flags = DCF_SHARED; + else + flags = 0; - if (!(mbs->dm_flags & DMF_SRDG_REP)) - flags = DCF_EXP_CMT; - else if (dth->dth_modify_shared) - flags = DCF_SHARED; - else - flags = 0; + rc = dtx_cos_add(cont, dte, &dth->dth_leader_oid, dth->dth_dkey_hash, + dth->dth_epoch, flags); + dtx_entry_put(dte); + } + } - rc = dtx_cos_add(cont, dte, &dth->dth_leader_oid, dth->dth_dkey_hash, - dth->dth_epoch, flags); - dtx_entry_put(dte); + /* + * NOTE: If we failed to add the committable DTX into CoS cache, then we also have no way + * to commit (or abort) the DTX because of out of memory. Such DTX will be finally + * committed via next DTX resync (after recovered from OOM). + * + * Here, we only warning to notify the trouble, but not failed the transaction. + */ + if (rc != 0) { + D_WARN(DF_UUID": Fail to cache %s DTX "DF_DTI": "DF_RC"\n", + DP_UUID(cont->sc_uuid), dlh->dlh_coll ? "collective" : "regular", + DP_DTI(&dth->dth_xid), DP_RC(rc)); + D_GOTO(out, result = 0); } - if (rc == 0) { - if (!DAOS_FAIL_CHECK(DAOS_DTX_NO_COMMITTABLE)) { - vos_dtx_mark_committable(dth); - if (cont->sc_dtx_committable_count > DTX_THRESHOLD_COUNT || dlh->dlh_coll) - sched_req_wakeup(dss_get_module_info()->dmi_dtx_cmt_req); - } - } else { - dth->dth_sync = 1; + if (!DAOS_FAIL_CHECK(DAOS_DTX_NO_COMMITTABLE)) { + vos_dtx_mark_committable(dth); + if (cont->sc_dtx_committable_count > DTX_THRESHOLD_COUNT || dlh->dlh_coll) + sched_req_wakeup(dss_get_module_info()->dmi_dtx_cmt_req); } sync: @@ -1428,10 +1434,15 @@ dtx_leader_end(struct dtx_leader_handle *dlh, struct ds_cont_hdl *coh, int resul rc = dtx_commit(cont, &dte, NULL, 1, false); } - if (rc != 0) + if (rc != 0) { D_WARN(DF_UUID": Fail to sync %s commit DTX "DF_DTI": "DF_RC"\n", DP_UUID(cont->sc_uuid), dlh->dlh_coll ? "collective" : "regular", DP_DTI(&dth->dth_xid), DP_RC(rc)); + if (likely(dtx_batched_ult_max != 0)) { + dth->dth_sync = 0; + goto cache; + } + } /* * NOTE: The semantics of 'sync' commit does not guarantee that all @@ -1451,7 +1462,7 @@ dtx_leader_end(struct dtx_leader_handle *dlh, struct ds_cont_hdl *coh, int resul * to locally retry for avoiding related forwarded RPC timeout, instead, * The leader will trigger retry globally without abort 'prepared' ones. */ - if (unpin || (result < 0 && result != -DER_AGAIN && !dth->dth_solo)) { + if (result < 0 && result != -DER_AGAIN && !dth->dth_solo) { /* 1. Drop partial modification for distributed transaction. * 2. Remove the pinned DTX entry. 
*/ diff --git a/src/dtx/dtx_cos.c b/src/dtx/dtx_cos.c index 4c165f94d0c..0f6dd1c5913 100644 --- a/src/dtx/dtx_cos.c +++ b/src/dtx/dtx_cos.c @@ -1,5 +1,5 @@ /** - * (C) Copyright 2019-2023 Intel Corporation. + * (C) Copyright 2019-2024 Intel Corporation. * * SPDX-License-Identifier: BSD-2-Clause-Patent */ diff --git a/src/dtx/dtx_rpc.c b/src/dtx/dtx_rpc.c index 2ccbfec2734..6d34e871269 100644 --- a/src/dtx/dtx_rpc.c +++ b/src/dtx/dtx_rpc.c @@ -1657,8 +1657,9 @@ dtx_coll_commit(struct ds_cont_child *cont, struct dtx_coll_entry *dce, struct d } D_CDEBUG(rc != 0 || rc1 != 0 || rc2 != 0, DLOG_ERR, DB_TRACE, - "Collectively commit DTX "DF_DTI": %d/%d/%d\n", - DP_DTI(&dce->dce_xid), rc, rc1, rc2); + "Collectively commit DTX "DF_DTI" in "DF_UUID"/"DF_UUID": %d/%d/%d\n", + DP_DTI(&dce->dce_xid), DP_UUID(cont->sc_pool_uuid), DP_UUID(cont->sc_uuid), + rc, rc1, rc2); return rc != 0 ? rc : rc1 != 0 ? rc1 : rc2; } @@ -1717,8 +1718,9 @@ dtx_coll_abort(struct ds_cont_child *cont, struct dtx_coll_entry *dce, daos_epoc rc2 = 0; D_CDEBUG(rc != 0 || rc1 != 0 || rc2 != 0, DLOG_ERR, DB_TRACE, - "Collectively abort DTX "DF_DTI": %d/%d/%d\n", - DP_DTI(&dce->dce_xid), rc, rc1, rc2); + "Collectively abort DTX "DF_DTI" with epoch "DF_X64" in " + DF_UUID"/"DF_UUID": %d/%d/%d\n", DP_DTI(&dce->dce_xid), epoch, + DP_UUID(cont->sc_pool_uuid), DP_UUID(cont->sc_uuid), rc, rc1, rc2); return rc != 0 ? rc : rc1 != 0 ? rc1 : rc2; } @@ -1766,8 +1768,9 @@ dtx_coll_check(struct ds_cont_child *cont, struct dtx_coll_entry *dce, daos_epoc } D_CDEBUG((rc < 0 && rc != -DER_NONEXIST) || (rc1 < 0 && rc1 != -DER_NONEXIST), DLOG_ERR, - DB_TRACE, "Collectively check DTX "DF_DTI": %d/%d/\n", - DP_DTI(&dce->dce_xid), rc, rc1); + DB_TRACE, "Collectively check DTX "DF_DTI" in "DF_UUID"/"DF_UUID": %d/%d/\n", + DP_DTI(&dce->dce_xid), DP_UUID(cont->sc_pool_uuid), DP_UUID(cont->sc_uuid), + rc, rc1); return dce->dce_ranks != NULL ? 
rc : rc1; } diff --git a/src/dtx/dtx_srv.c b/src/dtx/dtx_srv.c index 41480b6e8b0..0885b7f6cdc 100644 --- a/src/dtx/dtx_srv.c +++ b/src/dtx/dtx_srv.c @@ -474,8 +474,9 @@ dtx_coll_handler(crt_rpc_t *rpc) out: D_CDEBUG(rc < 0, DLOG_ERR, DB_TRACE, - "Handled collective DTX PRC %u on rank %u for "DF_DTI": "DF_RC"\n", - opc, myrank, DP_DTI(&dci->dci_xid), DP_RC(rc)); + "Handled collective DTX PRC %u on rank %u for "DF_DTI" in " + DF_UUID"/"DF_UUID": "DF_RC"\n", opc, myrank, DP_DTI(&dci->dci_xid), + DP_UUID(dci->dci_po_uuid), DP_UUID(dci->dci_co_uuid), DP_RC(rc)); dco->dco_status = rc; rc = crt_reply_send(rpc); diff --git a/src/engine/sched.c b/src/engine/sched.c index 1fe400204be..49a46ca3618 100644 --- a/src/engine/sched.c +++ b/src/engine/sched.c @@ -781,7 +781,7 @@ check_space_pressure(struct dss_xstream *dx, struct sched_pool_info *spi) { struct sched_info *info = &dx->dx_sched_info; struct vos_pool_space vps = { 0 }; - uint64_t scm_left, nvme_left; + uint64_t scm_left, nvme_left, ne_left, ne_sys; struct pressure_ratio *pr; int orig_pressure, rc; @@ -814,6 +814,17 @@ check_space_pressure(struct dss_xstream *dx, struct sched_pool_info *spi) else scm_left = 0; + if (vps.vps_ne_total == 0) { + ne_left = UINT64_MAX; + } else { + D_ASSERT(vps.vps_ne_total < SCM_TOTAL(&vps)); + ne_sys = SCM_SYS(&vps) * vps.vps_ne_total / SCM_TOTAL(&vps); + if (vps.vps_ne_free > ne_sys) + ne_left = vps.vps_ne_free - ne_sys; + else + ne_left = 0; + } + if (NVME_TOTAL(&vps) == 0) /* NVMe not enabled */ nvme_left = UINT64_MAX; else if (NVME_FREE(&vps) > NVME_SYS(&vps)) @@ -824,7 +835,8 @@ check_space_pressure(struct dss_xstream *dx, struct sched_pool_info *spi) orig_pressure = spi->spi_space_pressure; for (pr = &pressure_gauge[0]; pr->pr_free != 0; pr++) { if (scm_left > (SCM_TOTAL(&vps) * pr->pr_free / 100) && - nvme_left > (NVME_TOTAL(&vps) * pr->pr_free / 100)) + nvme_left > (NVME_TOTAL(&vps) * pr->pr_free / 100) && + ne_left > (vps.vps_ne_total * pr->pr_free / 100)) break; } spi->spi_space_pressure = pr->pr_pressure; @@ -832,10 +844,11 @@ check_space_pressure(struct dss_xstream *dx, struct sched_pool_info *spi) if (spi->spi_space_pressure != SCHED_SPACE_PRESS_NONE && spi->spi_space_pressure != orig_pressure) { D_INFO("Pool:"DF_UUID" is under %d pressure, " - "SCM: tot["DF_U64"], sys["DF_U64"], free["DF_U64"] " + "SCM: tot["DF_U64"], sys["DF_U64"], free["DF_U64"], ne["DF_U64"/"DF_U64"] " "NVMe: tot["DF_U64"], sys["DF_U64"], free["DF_U64"]\n", DP_UUID(spi->spi_pool_id), spi->spi_space_pressure, SCM_TOTAL(&vps), SCM_SYS(&vps), SCM_FREE(&vps), + vps.vps_ne_free, vps.vps_ne_total, NVME_TOTAL(&vps), NVME_SYS(&vps), NVME_FREE(&vps)); spi->spi_pressure_ts = info->si_cur_ts; diff --git a/src/include/daos/mem.h b/src/include/daos/mem.h index ffd3296b4eb..6b016f8fcf9 100644 --- a/src/include/daos/mem.h +++ b/src/include/daos/mem.h @@ -1,5 +1,5 @@ /** - * (C) Copyright 2016-2023 Intel Corporation. + * (C) Copyright 2016-2024 Intel Corporation. 
* * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -30,19 +30,30 @@ int umempobj_settings_init(bool md_on_ssd); /* convert backend type to umem class id */ int umempobj_backend_type2class_id(int backend); +/* get page size for the backend */ +size_t +umempobj_pgsz(int backend); + /* umem persistent object property flags */ #define UMEMPOBJ_ENABLE_STATS 0x1 #ifdef DAOS_PMEM_BUILD + +/* The backend type is stored in meta blob header, don't change the value */ enum { DAOS_MD_PMEM = 0, DAOS_MD_BMEM = 1, DAOS_MD_ADMEM = 2, + DAOS_MD_BMEM_V2 = 3, }; /* return umem backend type */ int umempobj_get_backend_type(void); +/* returns whether bmem_v2 pools are allowed */ +bool +umempobj_allow_md_bmem_v2(); + #endif struct umem_wal_tx; @@ -108,7 +119,12 @@ struct umem_store_iod { struct umem_store; struct umem_store_ops { - int (*so_load)(struct umem_store *store, char *start); + int (*so_waitqueue_create)(void **wq); + void (*so_waitqueue_destroy)(void *wq); + void (*so_waitqueue_wait)(void *wq, bool yield_only); + void (*so_waitqueue_wakeup)(void *wq, bool wakeup_all); + int (*so_load)(struct umem_store *store, char *start_addr, daos_off_t offset, + daos_size_t len); int (*so_read)(struct umem_store *store, struct umem_store_iod *iod, d_sg_list_t *sgl); int (*so_write)(struct umem_store *store, struct umem_store_iod *iod, @@ -151,6 +167,8 @@ struct umem_store { struct umem_store_ops *stor_ops; /* backend type */ int store_type; + /* whether the store has evictable zones */ + bool store_evictable; /* standalone store */ bool store_standalone; /* backend SSD is in faulty state */ @@ -169,6 +187,312 @@ struct umem_pool { struct umem_slab_desc up_slabs[0]; }; +#ifdef DAOS_PMEM_BUILD +#define UMEM_CACHE_PAGE_SZ_SHIFT 24 /* 16MB */ +#define UMEM_CACHE_PAGE_SZ (1 << UMEM_CACHE_PAGE_SZ_SHIFT) + +#define UMEM_CACHE_CHUNK_SZ_SHIFT 12 /* 4KB */ +#define UMEM_CACHE_CHUNK_SZ (1 << UMEM_CACHE_CHUNK_SZ_SHIFT) +#define UMEM_CACHE_CHUNK_SZ_MASK (UMEM_CACHE_CHUNK_SZ - 1) + +#define UMEM_CACHE_MIN_EVICTABLE_PAGES 2 + +enum umem_page_event_types { + UMEM_CACHE_EVENT_PGLOAD = 0, + UMEM_CACHE_EVENT_PGEVICT +}; + +struct umem_page_info; +/* MD page */ +struct umem_page { + /** Pointing to memory page when it's mapped */ + struct umem_page_info *pg_info; +}; + +enum umem_page_stats { + UMEM_PG_STATS_NONEVICTABLE = 0, + UMEM_PG_STATS_PINNED, + UMEM_PG_STATS_FREE, + UMEM_PG_STATS_MAX, +}; + +enum umem_cache_stats { + /* How many page cache hit */ + UMEM_CACHE_STATS_HIT = 0, + /* How many page cache miss */ + UMEM_CACHE_STATS_MISS, + /* How many pages are evicted */ + UMEM_CACHE_STATS_EVICT, + /* How many dirty pages are flushed on evicting */ + UMEM_CACHE_STATS_FLUSH, + /* How many pages are loaded on cache miss */ + UMEM_CACHE_STATS_LOAD, + UMEM_CACHE_STATS_MAX, +}; + +/** Global cache status for each umem_store */ +struct umem_cache { + struct umem_store *ca_store; + /** Base address of the page cache */ + void *ca_base; + /** Offset of first page */ + uint32_t ca_base_off; + /** Cache Mode */ + uint32_t ca_mode; + /** WAL replay status */ + uint32_t ca_replay_done; + /** Total MD pages */ + uint32_t ca_md_pages; + /** Total memory pages in cache */ + uint32_t ca_mem_pages; + /** Maximum non-evictable memory pages */ + uint32_t ca_max_ne_pages; + /** Page size */ + uint32_t ca_page_sz; + /** Page size shift */ + uint32_t ca_page_shift; + /** Page size mask */ + uint32_t ca_page_mask; + /** Per-page Bitmap size (in uint64_t) */ + uint32_t ca_bmap_sz; + /** Free list for unmapped page info */ + d_list_t ca_pgs_free; + /** 
Non-evictable & evictable dirty pages */ + d_list_t ca_pgs_dirty; + /** All Non-evictable[0] & evictable[1] pages */ + d_list_t ca_pgs_lru[2]; + /** all the pages in the progress of flushing */ + d_list_t ca_pgs_flushing; + /** all the pages waiting for commit */ + d_list_t ca_pgs_wait_commit; + /** all the pages being pinned */ + d_list_t ca_pgs_pinned; + /** Highest committed transaction ID */ + uint64_t ca_commit_id; + /** Callback to tell if a page is evictable */ + bool (*ca_evictable_fn)(void *arg, uint32_t pg_id); + /** Callback being called on page loaded/evicted */ + int (*ca_evtcb_fn)(int event_type, void *arg, uint32_t pg_id); + /** Argument to the callback function */ + void *ca_fn_arg; + /** Page stats */ + uint32_t ca_pgs_stats[UMEM_PG_STATS_MAX]; + /** Cache stats */ + uint64_t ca_cache_stats[UMEM_CACHE_STATS_MAX]; + /** How many waiters waiting on free page reserve */ + uint32_t ca_reserve_waiters; + /** Waitqueue for free page reserve: umem_cache_reserve() */ + void *ca_reserve_wq; + /** TODO: some other global status */ + /** MD page array, array index is page ID */ + struct umem_page ca_pages[0]; +}; + +struct umem_cache_chkpt_stats { + /** Last committed checkpoint id */ + uint64_t *uccs_chkpt_id; + /** Number of pages processed */ + int uccs_nr_pages; + /** Number of dirty chunks copied */ + int uccs_nr_dchunks; + /** Number of sgl iovs used to copy dirty chunks */ + int uccs_nr_iovs; +}; + +/** Allocate global cache for umem store. + * + * \param[in] store The umem store + * \param[in] page_sz Page size + * \param[in] md_pgs Total MD pages + * \param[in] mem_pgs Total memory pages + * \param[in] max_ne_pgs Maximum Non-evictable pages + * \param[in] base_off Offset of the umem cache base + * \param[in] base Start address of the page cache + * \param[in] is_evictable_fn Callback function to check if page is evictable + * \param[in] pageload_fn Callback called on page being loaded + * \param[in] arg Argument to callback functions. + * + * \return 0 on success + */ +int +umem_cache_alloc(struct umem_store *store, uint32_t page_sz, uint32_t md_pgs, uint32_t mem_pgs, + uint32_t max_ne_pgs, uint32_t base_off, void *base, + bool (*is_evictable_fn)(void *arg, uint32_t pg_id), + int (*evtcb_fn)(int evt_flag, void *arg, uint32_t pg_id), void *arg); + +/** Free global cache for umem store. + * + * \param[in] store Store for which to free cache + * + * \return 0 on success + */ +int +umem_cache_free(struct umem_store *store); + +/** Check MD-blob offset is already loaded onto umem cache. + * + * \param[in] store The umem store + * \param[in] offset MD-blob offset to be converted + * + * \return true or false + */ +bool +umem_cache_offisloaded(struct umem_store *store, umem_off_t offset); + +/** Convert MD-blob offset to memory pointer, the corresponding page must be mapped already. + * + * \param[in] store The umem store + * \param[in] offset MD-blob offset to be converted + * + * \return Memory pointer + */ +void * +umem_cache_off2ptr(struct umem_store *store, umem_off_t offset); + +/** Convert memory pointer to MD-blob offset, the corresponding page must be mapped already. + * + * \param[in] store The umem store + * \param[in] ptr Memory pointer to be converted + * + * \return MD-blob offset + */ +umem_off_t +umem_cache_ptr2off(struct umem_store *store, const void *ptr); + +/** Update umem_cache post WAL replay. This routine is called after + * WAL replay and the evictability of all pages are determined. 
+ * + * \param[in] store The umem store + * + * \return None + */ +void +umem_cache_post_replay(struct umem_store *store); + +struct umem_cache_range { + umem_off_t cr_off; + daos_size_t cr_size; +}; + +/** Map MD pages in specified range to memory pages. The range to be mapped should be empty + * (no page loading required). If caller tries to map non-evictable pages, page eviction + * won't be triggered when there are not enough free pages; If caller tries to map evictable + * page, page eviction could be triggered, but it can only map single evictable page at a time. + * + * \param[in] store The umem store + * \param[in] ranges Ranges to be mapped + * \param[in] range_nr Number of ranges + * + * \return 0 : On success + * -DER_BUSY : Not enough free pages + * -ve : Errors + */ +int +umem_cache_map(struct umem_store *store, struct umem_cache_range *ranges, int range_nr); + +/** Load & map MD pages in specified range to memory pages. + * + * \param[in] store The umem store + * \param[in] ranges Ranges to be mapped + * \param[in] range_nr Number of ranges + * \param[in] for_sys Internal access from system ULTs (aggregation etc.) + * + * \return 0 on success, negative value on error. + */ +int +umem_cache_load(struct umem_store *store, struct umem_cache_range *ranges, int range_nr, + bool for_sys); + +struct umem_pin_handle; + +/** Load & map MD pages in specified range to memory pages, then take a reference on the mapped + * memory pages, so that the pages won't be evicted until unpin is called. It's usually for the + * cases where we need the pages stay loaded across a yield. + * + * \param[in] store The umem store + * \param[in] ranges Ranges to be pinned + * \param[in] range_nr Number of ranges + * \param[in] for_sys Internal access from system ULTs (aggregation etc.) + * \param[out] pin_handle Returned pin handle + * + * \return 0 on success + */ +int +umem_cache_pin(struct umem_store *store, struct umem_cache_range *rangs, int range_nr, bool for_sys, + struct umem_pin_handle **pin_handle); + +/** Unpin the pages pinned by prior umem_cache_pin(). + * + * \param[in] store The umem store + * \param[in] pin_handle Pin handle got from umem_cache_pin() + * \param[in] range_nr Number of ranges + */ +void +umem_cache_unpin(struct umem_store *store, struct umem_pin_handle *pin_handle); + +/** Reserve few free pages for potential non-evictable zone grow within a transaction. + * Caller needs to ensure there is no CPU yielding after this call till transaction + * start. + * + * \param[in] store The umem store + * + * \return 0 on success + */ +int +umem_cache_reserve(struct umem_store *store); + +/** Inform umem cache the last committed ID. + * + * \param[in] store The umem store + * \param[in] commit_id The last committed ID + */ +void +umem_cache_commit(struct umem_store *store, uint64_t commit_id); + +/** + * Touched the region identified by @addr and @size, it will mark pages in this region as + * dirty (also set bitmap within each page), and put it on dirty list + * + * This function is called by allocator(probably VOS as well) each time it creates memory + * snapshot (calls tx_snap) or just to mark a region to be flushed. + * + * \param[in] store The umem store + * \param[in] wr_tx The writing transaction + * \param[in] addr The start address + * \param[in] size size of dirty region + * + * \return 0 on success, -DER_CHKPT_BUSY if a checkpoint is in progress on the page. The calling + * transaction must either abort or find another location to modify. 
+ */ +int +umem_cache_touch(struct umem_store *store, uint64_t wr_tx, umem_off_t addr, daos_size_t size); + +/** Callback for checkpoint to wait for the commit of chkpt_tx. + * + * \param[in] arg Argument passed to umem_cache_checkpoint + * \param[in] chkpt_tx The WAL transaction ID we are waiting to commit to WAL + * \param[out] committed_tx The WAL tx ID of the last transaction committed to WAL + */ +typedef void +umem_cache_wait_cb_t(void *arg, uint64_t chkpt_tx, uint64_t *committed_tx); + +/** + * This function can yield internally, it is called by checkpoint service of upper level stack. + * + * \param[in] store The umem store + * \param[in] wait_cb Callback for to wait for wal commit completion + * \param[in] arg argument for wait_cb + * \param[in,out] chkpt_id Input is last committed id, output is checkpointed id + * \param[out] chkpt_stats check point stats + * + * \return 0 on success + */ +int +umem_cache_checkpoint(struct umem_store *store, umem_cache_wait_cb_t wait_cb, void *arg, + uint64_t *chkpt_id, struct umem_cache_chkpt_stats *chkpt_stats); + +#endif /*DAOS_PMEM_BUILD*/ + /* umem persistent object functions */ struct umem_pool *umempobj_create(const char *path, const char *layout_name, int prop_flags, size_t poolsize, @@ -179,6 +503,9 @@ void umempobj_close(struct umem_pool *pool); void *umempobj_get_rootptr(struct umem_pool *pool, size_t size); int umempobj_get_heapusage(struct umem_pool *pool, daos_size_t *cur_allocated); +int + umempobj_get_mbusage(struct umem_pool *pool, uint32_t mb_id, daos_size_t *cur_allocated, + daos_size_t *maxsz); void umempobj_log_fraginfo(struct umem_pool *pool); /** Number of flag bits to reserve for encoding extra information in @@ -273,6 +600,8 @@ typedef enum { UMEM_CLASS_BMEM, /** ad-hoc memory */ UMEM_CLASS_ADMEM, + /** blob backed memory v2 */ + UMEM_CLASS_BMEM_V2, /** unknown */ UMEM_CLASS_UNKNOWN, } umem_class_id_t; @@ -314,6 +643,9 @@ struct umem_instance; #define UMEM_FLAG_NO_FLUSH (((uint64_t)1) << 1) #define UMEM_XADD_NO_SNAPSHOT (((uint64_t)1) << 2) +/* Macros associated with Memory buckets */ +#define UMEM_DEFAULT_MBKT_ID 0 + /* type num used by umem ops */ enum { UMEM_TYPE_ANY, @@ -334,11 +666,12 @@ typedef struct { * * \param umm [IN] umem class instance. * \param size [IN] size to allocate. - * \param flags [IN] flags like zeroing, noflush (for PMDK) - * \param type_num [IN] struct type (for PMDK) + * \param flags [IN] flags like zeroing, noflush (for PMDK and BMEM) + * \param type_num [IN] struct type (for PMDK and BMEM) + * \param mbkt_id [IN] memory bucket id (for BMEM) */ - umem_off_t (*mo_tx_alloc)(struct umem_instance *umm, size_t size, - uint64_t flags, unsigned int type_num); + umem_off_t (*mo_tx_alloc)(struct umem_instance *umm, size_t size, uint64_t flags, + unsigned int type_num, unsigned int mbkt_id); /** * Add the specified range of umoff to current memory transaction. * @@ -361,7 +694,7 @@ typedef struct { * \param offset [IN] start offset of \a umoff tracked by the * transaction. * \param size [IN] size of \a umoff tracked by the transaction. - * \param flags [IN] PMDK flags + * \param flags [IN] PMDK and BMEM flags */ int (*mo_tx_xadd)(struct umem_instance *umm, umem_off_t umoff, uint64_t offset, @@ -394,9 +727,10 @@ typedef struct { * \param act [IN|OUT] action used for later cancel/publish. * \param size [IN] size to be reserved. 
* \param type_num [IN] struct type (for PMDK) + * \param mbkt_id [IN] memory bucket id (for BMEM) */ - umem_off_t (*mo_reserve)(struct umem_instance *umm, void *act, size_t size, - unsigned int type_num); + umem_off_t (*mo_reserve)(struct umem_instance *umm, void *act, size_t size, + unsigned int type_num, unsigned int mbkt_id); /** * Defer free til commit. For use with reserved extents that are not @@ -446,13 +780,14 @@ typedef struct { /** * allocate umoff with the specified size & flags atomically * - * \param umm [IN] umem class instance. - * \param size [IN] size to allocate. - * \param flags [IN] flags like zeroing, noflush (for PMDK) - * \param type_num [IN] struct type (for PMDK) + * \param umm [IN] umem class instance. + * \param size [IN] size to allocate. + * \param flags [IN] flags like zeroing, noflush (for PMDK) + * \param type_num [IN] struct type (for PMDK) + * \param mbkt_id [IN] memory bucket id (for BMEM) */ - umem_off_t (*mo_atomic_alloc)(struct umem_instance *umm, size_t size, - unsigned int type_num); + umem_off_t (*mo_atomic_alloc)(struct umem_instance *umm, size_t size, unsigned int type_num, + unsigned int mbkt_id); /** * flush data at specific offset to persistent store. @@ -464,6 +799,14 @@ typedef struct { void (*mo_atomic_flush)(struct umem_instance *umm, void *addr, size_t size); + /** + * returns an evictable memory bucket for tasks like new object creation etc. + * + * \param umm [IN] umem class instance. + * \param flags [IN] flags for MB selection criteria. Currently unused. + */ + uint32_t (*mo_allot_evictable_mb)(struct umem_instance *umm, int flags); + #endif /** * Add one commit or abort callback to current transaction. @@ -522,6 +865,10 @@ umem_off2ptr(const struct umem_instance *umm, umem_off_t umoff) if (UMOFF_IS_NULL(umoff)) return NULL; +#ifdef DAOS_PMEM_BUILD + if (umm->umm_pool && (umm->umm_pool->up_store.store_type == DAOS_MD_BMEM_V2)) + return umem_cache_off2ptr(&umm->umm_pool->up_store, umem_off2offset(umoff)); +#endif return (void *)(umm->umm_base + umem_off2offset(umoff)); } @@ -538,7 +885,12 @@ umem_ptr2off(const struct umem_instance *umm, void *ptr) if (ptr == NULL) return UMOFF_NULL; - return (umem_off_t)ptr - umm->umm_base; +#ifdef DAOS_PMEM_BUILD + if (umm->umm_pool && (umm->umm_pool->up_store.store_type == DAOS_MD_BMEM_V2)) { + return umem_cache_ptr2off(&umm->umm_pool->up_store, ptr); + } else +#endif + return (umem_off_t)ptr - umm->umm_base; } /** @@ -558,11 +910,11 @@ umem_has_tx(struct umem_instance *umm) return umm->umm_ops->mo_tx_add != NULL; } -#define umem_alloc_verb(umm, flags, size) \ +#define umem_alloc_verb(umm, flags, size, mbkt_id) \ ({ \ umem_off_t __umoff; \ \ - __umoff = (umm)->umm_ops->mo_tx_alloc(umm, size, flags, UMEM_TYPE_ANY); \ + __umoff = (umm)->umm_ops->mo_tx_alloc(umm, size, flags, UMEM_TYPE_ANY, mbkt_id); \ D_ASSERTF(umem_off2flags(__umoff) == 0, \ "Invalid assumption about allocnot using flag bits"); \ D_DEBUG(DB_MEM, \ @@ -573,14 +925,17 @@ umem_has_tx(struct umem_instance *umm) __umoff; \ }) -#define umem_alloc(umm, size) \ - umem_alloc_verb(umm, 0, size) +#define umem_alloc(umm, size) umem_alloc_verb(umm, 0, size, UMEM_DEFAULT_MBKT_ID) + +#define umem_alloc_from_bucket(umm, size, mbkt_id) umem_alloc_verb(umm, 0, size, mbkt_id) -#define umem_zalloc(umm, size) \ - umem_alloc_verb(umm, UMEM_FLAG_ZERO, size) +#define umem_zalloc(umm, size) umem_alloc_verb(umm, UMEM_FLAG_ZERO, size, UMEM_DEFAULT_MBKT_ID) -#define umem_alloc_noflush(umm, size) \ - umem_alloc_verb(umm, UMEM_FLAG_NO_FLUSH, size) +#define 
umem_zalloc_from_bucket(umm, size, mbkt_id) \ + umem_alloc_verb(umm, UMEM_FLAG_ZERO, size, mbkt_id) + +#define umem_alloc_noflush(umm, size) \ + umem_alloc_verb(umm, UMEM_FLAG_NO_FLUSH, size, UMEM_DEFAULT_MBKT_ID) #define umem_free(umm, umoff) \ ({ \ @@ -736,13 +1091,20 @@ int umem_rsrvd_act_realloc(struct umem_instance *umm, struct umem_rsrvd_act **ac /* Free up the array of reserved actions */ int umem_rsrvd_act_free(struct umem_rsrvd_act **act); -umem_off_t umem_reserve(struct umem_instance *umm, - struct umem_rsrvd_act *rsrvd_act, size_t size); -void umem_defer_free(struct umem_instance *umm, umem_off_t off, - struct umem_rsrvd_act *rsrvd_act); -void umem_cancel(struct umem_instance *umm, struct umem_rsrvd_act *rsrvd_act); -int umem_tx_publish(struct umem_instance *umm, - struct umem_rsrvd_act *rsrvd_act); +umem_off_t +umem_reserve_common(struct umem_instance *umm, struct umem_rsrvd_act *rsrvd_act, size_t size, + unsigned int mbkt_id); +#define umem_reserve(umm, rsrvd_act, size) \ + umem_reserve_common(umm, rsrvd_act, size, UMEM_DEFAULT_MBKT_ID) +#define umem_reserve_from_bucket(umm, rsrvd_act, size, mbkt_id) \ + umem_reserve_common(umm, rsrvd_act, size, mbkt_id) + +void +umem_defer_free(struct umem_instance *umm, umem_off_t off, struct umem_rsrvd_act *rsrvd_act); +void +umem_cancel(struct umem_instance *umm, struct umem_rsrvd_act *rsrvd_act); +int +umem_tx_publish(struct umem_instance *umm, struct umem_rsrvd_act *rsrvd_act); static inline void * umem_atomic_copy(struct umem_instance *umm, void *dest, void *src, size_t len, @@ -756,7 +1118,15 @@ static inline umem_off_t umem_atomic_alloc(struct umem_instance *umm, size_t len, unsigned int type_num) { D_ASSERT(umm->umm_ops->mo_atomic_alloc != NULL); - return umm->umm_ops->mo_atomic_alloc(umm, len, type_num); + return umm->umm_ops->mo_atomic_alloc(umm, len, type_num, UMEM_DEFAULT_MBKT_ID); +} + +static inline umem_off_t +umem_atomic_alloc_from_bucket(struct umem_instance *umm, size_t len, unsigned int type_num, + unsigned int mbkt_id) +{ + D_ASSERT(umm->umm_ops->mo_atomic_alloc != NULL); + return umm->umm_ops->mo_atomic_alloc(umm, len, type_num, mbkt_id); } static inline int @@ -786,6 +1156,48 @@ umem_tx_add_callback(struct umem_instance *umm, struct umem_tx_stage_data *txd, return umm->umm_ops->mo_tx_add_callback(umm, txd, stage, cb, data); } +/** + * Allot an evictable memory bucket for tasks like new object creation, etc. + * + * \param[in] umm umem instance pointer. + * \param[in] flags MB selection criteria. + * + * \return id > 0, memory bucket id. + * id = 0, no evictable memory bucket was chosen. + */ +static inline uint32_t +umem_allot_mb_evictable(struct umem_instance *umm, int flags) +{ + if (umm->umm_ops->mo_allot_evictable_mb) + return umm->umm_ops->mo_allot_evictable_mb(umm, flags); + else + return 0; +} + +/** + * Get memory bucket id associated with the offset. + * + * \param[in] umm umem instance pointer. + * \param[in] off offset within the umem pool + * + * \return id > 0, id of evictable memory bucket. + * id = 0, Memory bucket is non-evictable. + */ +uint32_t +umem_get_mb_from_offset(struct umem_instance *umm, umem_off_t off); + +/** + * Get the base offset of the memory bucket. + * + * \param[in] umm umem instance pointer. + * \param[in] mb_id memory bucket id. + * + * \return off > 0, base offset of evictable memory bucket. + * off = 0, base offset of non-evictable memory bucket.
+ */ +umem_off_t +umem_get_mb_base_offset(struct umem_instance *umm, uint32_t mb_id); + /*********************************************************************************/ /* Type of memory actions */ @@ -855,219 +1267,5 @@ struct umem_action { }; }; -#define UMEM_CACHE_PAGE_SZ_SHIFT 24 /* 16MB */ -#define UMEM_CACHE_PAGE_SZ (1 << UMEM_CACHE_PAGE_SZ_SHIFT) -#define UMEM_CACHE_PAGE_SZ_MASK (UMEM_CACHE_PAGE_SZ - 1) - -#define UMEM_CACHE_CHUNK_SZ_SHIFT 12 /* 4KB */ -#define UMEM_CACHE_CHUNK_SZ (1 << UMEM_CACHE_CHUNK_SZ_SHIFT) -#define UMEM_CACHE_CHUNK_SZ_MASK (UMEM_CACHE_CHUNK_SZ - 1) - -#define UMEM_CACHE_BMAP_SZ (1 << (UMEM_CACHE_PAGE_SZ_SHIFT - UMEM_CACHE_CHUNK_SZ_SHIFT - 6)) - -struct umem_page_info; -/** 16 MB page */ -struct umem_page { - /** page ID */ - unsigned int pg_id; - /** refcount */ - int pg_ref; - /** page info */ - struct umem_page_info *pg_info; -}; - -/** Global cache status for each umem_store */ -struct umem_cache { - struct umem_store *ca_store; - /** Total pages store */ - uint64_t ca_num_pages; - /** Total pages in cache */ - uint64_t ca_mapped; - /** Maximum number of cached pages */ - uint64_t ca_max_mapped; - /** Free list for mapped page info */ - d_list_t ca_pi_free; - /** all the dirty pages */ - d_list_t ca_pgs_dirty; - /** Pages waiting for copy to DMA buffer */ - d_list_t ca_pgs_copying; - /** LRU list all pages not in one of the other states for future eviction support */ - d_list_t ca_pgs_lru; - /** TODO: some other global status */ - /** All pages, sorted by umem_page::pg_id */ - struct umem_page ca_pages[0]; -}; - -struct umem_cache_chkpt_stats { - /** Last committed checkpoint id */ - uint64_t *uccs_chkpt_id; - /** Number of pages processed */ - int uccs_nr_pages; - /** Number of dirty chunks copied */ - int uccs_nr_dchunks; - /** Number of sgl iovs used to copy dirty chunks */ - int uccs_nr_iovs; -}; - -static inline uint64_t -umem_cache_size2pages(uint64_t len) -{ - D_ASSERT((len & UMEM_CACHE_PAGE_SZ_MASK) == 0); - - return len >> UMEM_CACHE_PAGE_SZ_SHIFT; -} - -static inline uint64_t -umem_cache_size_round(uint64_t len) -{ - return (len + UMEM_CACHE_PAGE_SZ_MASK) & ~UMEM_CACHE_PAGE_SZ_MASK; -} - -static inline struct umem_page * -umem_cache_off2page(struct umem_cache *cache, umem_off_t offset) -{ - uint64_t idx = offset >> UMEM_CACHE_PAGE_SZ_SHIFT; - - D_ASSERTF(idx < cache->ca_num_pages, - "offset=" DF_U64 ", num_pages=" DF_U64 ", idx=" DF_U64 "\n", offset, - cache->ca_num_pages, idx); - - return &cache->ca_pages[idx]; -} - -/** From a mapped page address, return the umem_cache it belongs to */ -static inline struct umem_cache * -umem_page2cache(struct umem_page *page) -{ - return (struct umem_cache *)container_of(&page[-page->pg_id], struct umem_cache, ca_pages); -} - -/** From a mapped page address, return the umem_store it belongs to */ -static inline struct umem_store * -umem_page2store(struct umem_page *page) -{ - return umem_page2cache(page)->ca_store; -} - -/** Allocate global cache for umem store. All 16MB pages are initially unmapped - * - * \param[in] store The umem store - * \param[in] max_mapped 0 or Maximum number of mapped 16MB pages (must be 0 for now) - * - * \return 0 on success - */ -int -umem_cache_alloc(struct umem_store *store, uint64_t max_mapped); - -/** Free global cache for umem store. 
Pages must be unmapped first - * - * \param[in] store Store for which to free cache - * - * \return 0 on success - */ -int -umem_cache_free(struct umem_store *store); - -/** Query if the page cache has enough space to map a range - * - * \param[in] store The store - * \param[in] num_pages Number of pages to bring into cache - * - * \return number of pages that need eviction to support mapping the range - */ -int -umem_cache_check(struct umem_store *store, uint64_t num_pages); - -/** Evict the pages. This invokes the unmap callback. (XXX: not yet implemented) - * - * \param[in] store The store - * \param[in] num_pages Number of pages to evict - * - * \return 0 on success, -DER_BUSY means a checkpoint is needed to evict the pages - */ -int -umem_cache_evict(struct umem_store *store, uint64_t num_pages); - -/** Adds a mapped range of pages to the page cache. - * - * \param[in] store The store - * \param[in] offset The offset in the umem cache - * \param[in] start_addr Start address of mapping - * \param[in] num_pages Number of consecutive 16MB pages to being cached - * - * \return 0 on success - */ -int -umem_cache_map_range(struct umem_store *store, umem_off_t offset, void *start_addr, - uint64_t num_pages); - -/** Take a reference on the pages in the range. Only needed for cases where we need the page to - * stay loaded across a yield, such as the VOS object cache. Pages in the range must be mapped. - * - * \param[in] store The umem store - * \param[in] addr The address of the hold - * \param[in] size The size of the hold - * - * \return 0 on success - */ -int -umem_cache_pin(struct umem_store *store, umem_off_t addr, daos_size_t size); - -/** Release a reference on pages in the range. Pages in the range must be mapped and held. - * - * \param[in] store The umem store - * \param[in] addr The address of the hold - * \param[in] size The size of the hold - * - * \return 0 on success - */ -int -umem_cache_unpin(struct umem_store *store, umem_off_t addr, daos_size_t size); - -/** - * Touched the region identified by @addr and @size, it will mark pages in this region as - * dirty (also set bitmap within each page), and put it on dirty list - * - * This function is called by allocator(probably VOS as well) each time it creates memory - * snapshot (calls tx_snap) or just to mark a region to be flushed. - * - * \param[in] store The umem store - * \param[in] wr_tx The writing transaction - * \param[in] addr The start address - * \param[in] size size of dirty region - * - * \return 0 on success, -DER_CHKPT_BUSY if a checkpoint is in progress on the page. The calling - * transaction must either abort or find another location to modify. - */ -int -umem_cache_touch(struct umem_store *store, uint64_t wr_tx, umem_off_t addr, daos_size_t size); - -/** Callback for checkpoint to wait for the commit of chkpt_tx. - * - * \param[in] arg Argument passed to umem_cache_checkpoint - * \param[in] chkpt_tx The WAL transaction ID we are waiting to commit to WAL - * \param[out] committed_tx The WAL tx ID of the last transaction committed to WAL - */ -typedef void -umem_cache_wait_cb_t(void *arg, uint64_t chkpt_tx, uint64_t *committed_tx); - -/** - * Write all dirty pages before @wal_tx to MD blob. (XXX: not yet implemented) - * - * This function can yield internally, it is called by checkpoint service of upper level stack. 
- * - * \param[in] store The umem store - * \param[in] wait_cb Callback for to wait for wal commit completion - * \param[in] arg argument for wait_cb - * \param[in,out] chkpt_id Input is last committed id, output is checkpointed id - * \param[out] chkpt_stats check point stats - * - * \return 0 on success - */ -int -umem_cache_checkpoint(struct umem_store *store, umem_cache_wait_cb_t wait_cb, void *arg, - uint64_t *chkpt_id, struct umem_cache_chkpt_stats *chkpt_stats); - #endif /** DAOS_PMEM_BUILD */ - #endif /* __DAOS_MEM_H__ */ diff --git a/src/include/daos/task.h b/src/include/daos/task.h index 5cc4672fa30..88d5ef8c4c8 100644 --- a/src/include/daos/task.h +++ b/src/include/daos/task.h @@ -1,5 +1,5 @@ /** - * (C) Copyright 2015-2023 Intel Corporation. + * (C) Copyright 2015-2024 Intel Corporation. * * SPDX-License-Identifier: BSD-2-Clause-Patent */ diff --git a/src/include/daos_errno.h b/src/include/daos_errno.h index 8f2960b5933..d7decc6654d 100644 --- a/src/include/daos_errno.h +++ b/src/include/daos_errno.h @@ -211,9 +211,11 @@ extern "C" { ACTION(DER_CHKPT_BUSY, Page is temporarily read only due to checkpointing) \ ACTION(DER_DIV_BY_ZERO, Division by zero) \ /** Target is overload, retry RPC */ \ - ACTION(DER_OVERLOAD_RETRY, "retry later because of overloaded service") \ + ACTION(DER_OVERLOAD_RETRY, retry later because of overloaded service) \ ACTION(DER_NOT_RESUME, Cannot resume former DAOS check instance) +/* clang-format on */ + /** Defines the gurt error codes */ #define D_FOREACH_ERR_RANGE(ACTION) \ ACTION(GURT, 1000) \ diff --git a/src/include/daos_srv/bio.h b/src/include/daos_srv/bio.h index c32202a1b19..1486692d947 100644 --- a/src/include/daos_srv/bio.h +++ b/src/include/daos_srv/bio.h @@ -1017,15 +1017,18 @@ enum bio_mc_flags { * * \param[in] xs_ctxt Per-xstream NVMe context * \param[in] pool_id Pool UUID + * \param[in] scm_sz VOS file size in bytes * \param[in] meta_sz Meta blob size in bytes * \param[in] wal_sz WAL blob in bytes * \param[in] data_sz Data blob in bytes * \param[in] flags bio_mc_flags + * \param[in] backend_type Backend allocator type * * \return Zero on success, negative value on error. */ -int bio_mc_create(struct bio_xs_context *xs_ctxt, uuid_t pool_id, uint64_t meta_sz, - uint64_t wal_sz, uint64_t data_sz, enum bio_mc_flags flags); +int bio_mc_create(struct bio_xs_context *xs_ctxt, uuid_t pool_id, uint64_t scm_sz, + uint64_t meta_sz, uint64_t wal_sz, uint64_t data_sz, enum bio_mc_flags flags, + uint8_t backend_type); /* * Destroy Meta/Data/WAL blobs @@ -1151,10 +1154,10 @@ int bio_wal_flush_header(struct bio_meta_context *mc); int bio_wal_checkpoint(struct bio_meta_context *mc, uint64_t tx_id, uint64_t *purge_size); /* - * Query meta capacity & meta block size & meta blob header blocks. + * Query the attributes of umem_store */ void bio_meta_get_attr(struct bio_meta_context *mc, uint64_t *capacity, uint32_t *blk_sz, - uint32_t *hdr_blks); + uint32_t *hdr_blks, uint8_t *backend_type, bool *evictable); struct bio_wal_info { uint32_t wi_tot_blks; /* Total blocks */ diff --git a/src/include/daos_srv/container.h b/src/include/daos_srv/container.h index f99f4df14e3..9fc615c2a8b 100644 --- a/src/include/daos_srv/container.h +++ b/src/include/daos_srv/container.h @@ -108,6 +108,9 @@ struct ds_cont_child { uint32_t sc_dtx_committable_count; uint32_t sc_dtx_committable_coll_count; + /* Last timestamp when EC aggregation reports -DER_INPROGRESS. 
*/ + uint64_t sc_ec_agg_busy_ts; + /* The global minimum EC aggregation epoch, which will be upper * limit for VOS aggregation, i.e. EC object VOS aggregation can * not cross this limit. For simplification purpose, all objects diff --git a/src/include/daos_srv/evtree.h b/src/include/daos_srv/evtree.h index 63224259ccc..292c8848c87 100644 --- a/src/include/daos_srv/evtree.h +++ b/src/include/daos_srv/evtree.h @@ -1,5 +1,5 @@ /** - * (C) Copyright 2017-2023 Intel Corporation. + * (C) Copyright 2017-2024 Intel Corporation. * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -70,6 +70,10 @@ struct evt_desc_cbs { struct evt_desc *desc, daos_size_t nob, void *args); void *dc_bio_free_args; + /** + * Argument for allocation. + */ + void *dc_alloc_arg; /** * Availability check, it is for data tracked by DTX undo log. * It is optional, EVTree always treats data extent is available if diff --git a/src/include/daos_srv/smd.h b/src/include/daos_srv/smd.h index 9efc2e790dc..d4de8b7b32b 100644 --- a/src/include/daos_srv/smd.h +++ b/src/include/daos_srv/smd.h @@ -56,6 +56,7 @@ struct smd_dev_info { struct smd_pool_info { d_list_t spi_link; uuid_t spi_id; + uint64_t spi_scm_sz; uint64_t spi_blob_sz[SMD_DEV_TYPE_MAX]; uint16_t spi_flags[SMD_DEV_TYPE_MAX]; uint16_t spi_tgt_cnt[SMD_DEV_TYPE_MAX]; @@ -169,12 +170,13 @@ int smd_dev_replace(uuid_t old_id, uuid_t new_id, unsigned int old_roles); * \param [IN] tgt_id Target ID * \param [IN] blob_id Blob ID * \param [IN] smd_type SMD type - * \param [IN] blob_sz Blob size + * \param [IN] blob_sz Blob size in bytes + * \param [IN] scm_sz VOS file size in bytes * * \return Zero on success, negative value on error */ int smd_pool_add_tgt(uuid_t pool_id, uint32_t tgt_id, uint64_t blob_id, - enum smd_dev_type smd_type, uint64_t blob_sz); + enum smd_dev_type smd_type, uint64_t blob_sz, uint64_t scm_sz); /* Assign a blob to a RDB pool target */ int smd_rdb_add_tgt(uuid_t pool_id, uint32_t tgt_id, uint64_t blob_id, diff --git a/src/include/daos_srv/vos.h b/src/include/daos_srv/vos.h index 3b838c2b4a6..3d94065b64a 100644 --- a/src/include/daos_srv/vos.h +++ b/src/include/daos_srv/vos.h @@ -288,8 +288,9 @@ vos_self_fini(void); * \param path [IN] Path of the memory pool * \param uuid [IN] Pool UUID * \param scm_sz [IN] Size of SCM for the pool - * \param blob_sz[IN] Size of blob for the pool + * \param data_sz[IN] Size of data blob for the pool * \param wal_sz [IN] Size of WAL blob for the pool + * \param meta_sz[IN] Size of Meta blob for the pool * \param flags [IN] Pool open flags (see vos_pool_open_flags) * \param version[IN] Pool version (0 for default version) * \param poh [OUT] Returned pool handle if not NULL @@ -297,8 +298,9 @@ vos_self_fini(void); * \return Zero on success, negative value if error */ int -vos_pool_create_ex(const char *path, uuid_t uuid, daos_size_t scm_sz, daos_size_t blob_sz, - daos_size_t wal_sz, unsigned int flags, uint32_t version, daos_handle_t *poh); +vos_pool_create_ex(const char *path, uuid_t uuid, daos_size_t scm_sz, daos_size_t data_sz, + daos_size_t wal_sz, daos_size_t meta_sz, unsigned int flags, uint32_t version, + daos_handle_t *poh); /** * Create a Versioning Object Storage Pool (VOSP), and open it if \a poh is not @@ -307,7 +309,8 @@ vos_pool_create_ex(const char *path, uuid_t uuid, daos_size_t scm_sz, daos_size_ * \param path [IN] Path of the memory pool * \param uuid [IN] Pool UUID * \param scm_sz [IN] Size of SCM for the pool - * \param blob_sz[IN] Size of blob for the pool + * \param data_sz[IN] Size of data blob for the pool 
+ * \param meta_sz[IN] Size of Meta blob for the pool * \param flags [IN] Pool open flags (see vos_pool_open_flags) * \param version[IN] Pool version (0 for default version) * \param poh [OUT] Returned pool handle if not NULL @@ -315,8 +318,8 @@ vos_pool_create_ex(const char *path, uuid_t uuid, daos_size_t scm_sz, daos_size_ * \return Zero on success, negative value if error */ int -vos_pool_create(const char *path, uuid_t uuid, daos_size_t scm_sz, daos_size_t blob_sz, - unsigned int flags, uint32_t version, daos_handle_t *poh); +vos_pool_create(const char *path, uuid_t uuid, daos_size_t scm_sz, daos_size_t data_sz, + daos_size_t meta_sz, unsigned int flags, uint32_t version, daos_handle_t *poh); /** * Kill a VOS pool before destroy @@ -516,6 +519,16 @@ int vos_aggregate(daos_handle_t coh, daos_epoch_range_t *epr, int (*yield_func)(void *arg), void *yield_arg, uint32_t flags); +/** + * Round up the SCM and meta sizes to match the backend requirement. + * \param[in/out] scm_sz SCM size that needs to be aligned up + * \param[in/out] meta_sz META size that needs to be aligned up + * + * \return 0 on success, error otherwise. + */ +int +vos_pool_roundup_size(size_t *scm_sz, size_t *meta_sz); + /** * Discards changes in all epochs with the epoch range \a epr * @@ -1538,4 +1551,30 @@ vos_aggregate_enter(daos_handle_t coh, daos_epoch_range_t *epr); void vos_aggregate_exit(daos_handle_t coh); +struct vos_pin_handle; + +/** + * Unpin the pinned objects in md-on-ssd phase2 mode + * + * \param[in] coh container open handle. + * \param[in] hdl pin handle. + */ +void +vos_unpin_objects(daos_handle_t coh, struct vos_pin_handle *hdl); + +/** + * Pin a bunch of objects in md-on-ssd phase2 mode + * + * \param[in] coh container open handle. + * \param[in] oids object IDs. + * \param[in] count number of object IDs. + * \param[out] hdl pin handle. + * + * \return 0 on success, error otherwise.
+ */ +int +vos_pin_objects(daos_handle_t coh, daos_unit_oid_t oids[], int count, struct vos_pin_handle **hdl); + #endif /* __VOS_API_H */ diff --git a/src/include/daos_srv/vos_types.h b/src/include/daos_srv/vos_types.h index b57220f9a7c..3a30fd45399 100644 --- a/src/include/daos_srv/vos_types.h +++ b/src/include/daos_srv/vos_types.h @@ -125,6 +125,9 @@ struct vos_pool_space { struct vea_attr vps_vea_attr; /** NVMe block allocator statistics */ struct vea_stat vps_vea_stat; + /** Total & free non-evictable space for md-on-ssd phase2 pool */ + uint64_t vps_ne_total; + uint64_t vps_ne_free; }; #define SCM_TOTAL(vps) ((vps)->vps_space.s_total[DAOS_MEDIA_SCM]) diff --git a/src/include/gurt/common.h b/src/include/gurt/common.h index 060cdc86c18..e7d61ed3842 100644 --- a/src/include/gurt/common.h +++ b/src/include/gurt/common.h @@ -390,6 +390,7 @@ d_realpath(const char *path, char *resolved_path) _dalloc_; }) #define D_SPIN_LOCK(x) __D_PTHREAD(pthread_spin_lock, x) +#define D_MUTEX_TRYLOCK(x) __D_PTHREAD_TRYLOCK(pthread_mutex_trylock, x) #define D_SPIN_UNLOCK(x) __D_PTHREAD(pthread_spin_unlock, x) #define D_MUTEX_UNLOCK(x) __D_PTHREAD(pthread_mutex_unlock, x) #define D_RWLOCK_TRYWRLOCK(x) __D_PTHREAD_TRYLOCK(pthread_rwlock_trywrlock, x) diff --git a/src/mgmt/pool.pb-c.c b/src/mgmt/pool.pb-c.c index 6fed6ca6973..dfcbdc24c99 100644 --- a/src/mgmt/pool.pb-c.c +++ b/src/mgmt/pool.pb-c.c @@ -1504,166 +1504,315 @@ void mgmt__pool_query_target_resp__free_unpacked assert(message->base.descriptor == &mgmt__pool_query_target_resp__descriptor); protobuf_c_message_free_unpacked ((ProtobufCMessage*)message, allocator); } -static const ProtobufCFieldDescriptor mgmt__pool_create_req__field_descriptors[13] = { - { - "uuid", 1, PROTOBUF_C_LABEL_NONE, PROTOBUF_C_TYPE_STRING, 0, /* quantifier_offset */ - offsetof(Mgmt__PoolCreateReq, uuid), NULL, &protobuf_c_empty_string, 0, /* flags */ - 0, NULL, NULL /* reserved1,reserved2, etc */ - }, - { - "sys", 2, PROTOBUF_C_LABEL_NONE, PROTOBUF_C_TYPE_STRING, 0, /* quantifier_offset */ - offsetof(Mgmt__PoolCreateReq, sys), NULL, &protobuf_c_empty_string, 0, /* flags */ - 0, NULL, NULL /* reserved1,reserved2, etc */ - }, - { - "user", 3, PROTOBUF_C_LABEL_NONE, PROTOBUF_C_TYPE_STRING, 0, /* quantifier_offset */ - offsetof(Mgmt__PoolCreateReq, user), NULL, &protobuf_c_empty_string, 0, /* flags */ - 0, NULL, NULL /* reserved1,reserved2, etc */ - }, - { - "user_group", 4, PROTOBUF_C_LABEL_NONE, PROTOBUF_C_TYPE_STRING, 0, /* quantifier_offset */ - offsetof(Mgmt__PoolCreateReq, user_group), NULL, &protobuf_c_empty_string, 0, /* flags */ - 0, NULL, NULL /* reserved1,reserved2, etc */ - }, - { - "acl", 5, PROTOBUF_C_LABEL_REPEATED, PROTOBUF_C_TYPE_STRING, - offsetof(Mgmt__PoolCreateReq, n_acl), offsetof(Mgmt__PoolCreateReq, acl), NULL, - &protobuf_c_empty_string, 0, /* flags */ - 0, NULL, NULL /* reserved1,reserved2, etc */ - }, - { - "properties", 6, PROTOBUF_C_LABEL_REPEATED, PROTOBUF_C_TYPE_MESSAGE, - offsetof(Mgmt__PoolCreateReq, n_properties), offsetof(Mgmt__PoolCreateReq, properties), - &mgmt__pool_property__descriptor, NULL, 0, /* flags */ - 0, NULL, NULL /* reserved1,reserved2, etc */ - }, - { - "fault_domains", 7, PROTOBUF_C_LABEL_REPEATED, PROTOBUF_C_TYPE_UINT32, - offsetof(Mgmt__PoolCreateReq, n_fault_domains), - offsetof(Mgmt__PoolCreateReq, fault_domains), NULL, NULL, 0, /* flags */ - 0, NULL, NULL /* reserved1,reserved2, etc */ - }, - { - "num_svc_reps", 8, PROTOBUF_C_LABEL_NONE, PROTOBUF_C_TYPE_UINT32, 0, /* quantifier_offset */ - offsetof(Mgmt__PoolCreateReq, 
num_svc_reps), NULL, NULL, 0, /* flags */ - 0, NULL, NULL /* reserved1,reserved2, etc */ - }, - { - "total_bytes", 9, PROTOBUF_C_LABEL_NONE, PROTOBUF_C_TYPE_UINT64, 0, /* quantifier_offset */ - offsetof(Mgmt__PoolCreateReq, total_bytes), NULL, NULL, 0, /* flags */ - 0, NULL, NULL /* reserved1,reserved2, etc */ - }, - { - "tier_ratio", 10, PROTOBUF_C_LABEL_REPEATED, PROTOBUF_C_TYPE_DOUBLE, - offsetof(Mgmt__PoolCreateReq, n_tier_ratio), offsetof(Mgmt__PoolCreateReq, tier_ratio), - NULL, NULL, 0, /* flags */ - 0, NULL, NULL /* reserved1,reserved2, etc */ - }, - { - "num_ranks", 11, PROTOBUF_C_LABEL_NONE, PROTOBUF_C_TYPE_UINT32, 0, /* quantifier_offset */ - offsetof(Mgmt__PoolCreateReq, num_ranks), NULL, NULL, 0, /* flags */ - 0, NULL, NULL /* reserved1,reserved2, etc */ - }, - { - "ranks", 12, PROTOBUF_C_LABEL_REPEATED, PROTOBUF_C_TYPE_UINT32, - offsetof(Mgmt__PoolCreateReq, n_ranks), offsetof(Mgmt__PoolCreateReq, ranks), NULL, NULL, - 0, /* flags */ - 0, NULL, NULL /* reserved1,reserved2, etc */ - }, - { - "tier_bytes", 13, PROTOBUF_C_LABEL_REPEATED, PROTOBUF_C_TYPE_UINT64, - offsetof(Mgmt__PoolCreateReq, n_tier_bytes), offsetof(Mgmt__PoolCreateReq, tier_bytes), - NULL, NULL, 0, /* flags */ - 0, NULL, NULL /* reserved1,reserved2, etc */ - }, +static const ProtobufCFieldDescriptor mgmt__pool_create_req__field_descriptors[14] = +{ + { + "uuid", + 1, + PROTOBUF_C_LABEL_NONE, + PROTOBUF_C_TYPE_STRING, + 0, /* quantifier_offset */ + offsetof(Mgmt__PoolCreateReq, uuid), + NULL, + &protobuf_c_empty_string, + 0, /* flags */ + 0,NULL,NULL /* reserved1,reserved2, etc */ + }, + { + "sys", + 2, + PROTOBUF_C_LABEL_NONE, + PROTOBUF_C_TYPE_STRING, + 0, /* quantifier_offset */ + offsetof(Mgmt__PoolCreateReq, sys), + NULL, + &protobuf_c_empty_string, + 0, /* flags */ + 0,NULL,NULL /* reserved1,reserved2, etc */ + }, + { + "user", + 3, + PROTOBUF_C_LABEL_NONE, + PROTOBUF_C_TYPE_STRING, + 0, /* quantifier_offset */ + offsetof(Mgmt__PoolCreateReq, user), + NULL, + &protobuf_c_empty_string, + 0, /* flags */ + 0,NULL,NULL /* reserved1,reserved2, etc */ + }, + { + "user_group", + 4, + PROTOBUF_C_LABEL_NONE, + PROTOBUF_C_TYPE_STRING, + 0, /* quantifier_offset */ + offsetof(Mgmt__PoolCreateReq, user_group), + NULL, + &protobuf_c_empty_string, + 0, /* flags */ + 0,NULL,NULL /* reserved1,reserved2, etc */ + }, + { + "acl", + 5, + PROTOBUF_C_LABEL_REPEATED, + PROTOBUF_C_TYPE_STRING, + offsetof(Mgmt__PoolCreateReq, n_acl), + offsetof(Mgmt__PoolCreateReq, acl), + NULL, + &protobuf_c_empty_string, + 0, /* flags */ + 0,NULL,NULL /* reserved1,reserved2, etc */ + }, + { + "properties", + 6, + PROTOBUF_C_LABEL_REPEATED, + PROTOBUF_C_TYPE_MESSAGE, + offsetof(Mgmt__PoolCreateReq, n_properties), + offsetof(Mgmt__PoolCreateReq, properties), + &mgmt__pool_property__descriptor, + NULL, + 0, /* flags */ + 0,NULL,NULL /* reserved1,reserved2, etc */ + }, + { + "fault_domains", + 7, + PROTOBUF_C_LABEL_REPEATED, + PROTOBUF_C_TYPE_UINT32, + offsetof(Mgmt__PoolCreateReq, n_fault_domains), + offsetof(Mgmt__PoolCreateReq, fault_domains), + NULL, + NULL, + 0, /* flags */ + 0,NULL,NULL /* reserved1,reserved2, etc */ + }, + { + "num_svc_reps", + 8, + PROTOBUF_C_LABEL_NONE, + PROTOBUF_C_TYPE_UINT32, + 0, /* quantifier_offset */ + offsetof(Mgmt__PoolCreateReq, num_svc_reps), + NULL, + NULL, + 0, /* flags */ + 0,NULL,NULL /* reserved1,reserved2, etc */ + }, + { + "total_bytes", + 9, + PROTOBUF_C_LABEL_NONE, + PROTOBUF_C_TYPE_UINT64, + 0, /* quantifier_offset */ + offsetof(Mgmt__PoolCreateReq, total_bytes), + NULL, + NULL, + 0, /* flags */ + 
0,NULL,NULL /* reserved1,reserved2, etc */ + }, + { + "tier_ratio", + 10, + PROTOBUF_C_LABEL_REPEATED, + PROTOBUF_C_TYPE_DOUBLE, + offsetof(Mgmt__PoolCreateReq, n_tier_ratio), + offsetof(Mgmt__PoolCreateReq, tier_ratio), + NULL, + NULL, + 0, /* flags */ + 0,NULL,NULL /* reserved1,reserved2, etc */ + }, + { + "num_ranks", + 11, + PROTOBUF_C_LABEL_NONE, + PROTOBUF_C_TYPE_UINT32, + 0, /* quantifier_offset */ + offsetof(Mgmt__PoolCreateReq, num_ranks), + NULL, + NULL, + 0, /* flags */ + 0,NULL,NULL /* reserved1,reserved2, etc */ + }, + { + "ranks", + 12, + PROTOBUF_C_LABEL_REPEATED, + PROTOBUF_C_TYPE_UINT32, + offsetof(Mgmt__PoolCreateReq, n_ranks), + offsetof(Mgmt__PoolCreateReq, ranks), + NULL, + NULL, + 0, /* flags */ + 0,NULL,NULL /* reserved1,reserved2, etc */ + }, + { + "tier_bytes", + 13, + PROTOBUF_C_LABEL_REPEATED, + PROTOBUF_C_TYPE_UINT64, + offsetof(Mgmt__PoolCreateReq, n_tier_bytes), + offsetof(Mgmt__PoolCreateReq, tier_bytes), + NULL, + NULL, + 0, /* flags */ + 0,NULL,NULL /* reserved1,reserved2, etc */ + }, + { + "mem_ratio", + 14, + PROTOBUF_C_LABEL_NONE, + PROTOBUF_C_TYPE_FLOAT, + 0, /* quantifier_offset */ + offsetof(Mgmt__PoolCreateReq, mem_ratio), + NULL, + NULL, + 0, /* flags */ + 0,NULL,NULL /* reserved1,reserved2, etc */ + }, }; static const unsigned mgmt__pool_create_req__field_indices_by_name[] = { - 4, /* field[4] = acl */ - 6, /* field[6] = fault_domains */ - 10, /* field[10] = num_ranks */ - 7, /* field[7] = num_svc_reps */ - 5, /* field[5] = properties */ - 11, /* field[11] = ranks */ - 1, /* field[1] = sys */ - 12, /* field[12] = tier_bytes */ - 9, /* field[9] = tier_ratio */ - 8, /* field[8] = total_bytes */ - 2, /* field[2] = user */ - 3, /* field[3] = user_group */ - 0, /* field[0] = uuid */ -}; -static const ProtobufCIntRange mgmt__pool_create_req__number_ranges[1 + 1] = {{1, 0}, {0, 13}}; -const ProtobufCMessageDescriptor mgmt__pool_create_req__descriptor = { - PROTOBUF_C__MESSAGE_DESCRIPTOR_MAGIC, - "mgmt.PoolCreateReq", - "PoolCreateReq", - "Mgmt__PoolCreateReq", - "mgmt", - sizeof(Mgmt__PoolCreateReq), - 13, - mgmt__pool_create_req__field_descriptors, - mgmt__pool_create_req__field_indices_by_name, - 1, - mgmt__pool_create_req__number_ranges, - (ProtobufCMessageInit)mgmt__pool_create_req__init, - NULL, - NULL, - NULL /* reserved[123] */ -}; -static const ProtobufCFieldDescriptor mgmt__pool_create_resp__field_descriptors[5] = { - { - "status", 1, PROTOBUF_C_LABEL_NONE, PROTOBUF_C_TYPE_INT32, 0, /* quantifier_offset */ - offsetof(Mgmt__PoolCreateResp, status), NULL, NULL, 0, /* flags */ - 0, NULL, NULL /* reserved1,reserved2, etc */ - }, - { - "svc_ldr", 2, PROTOBUF_C_LABEL_NONE, PROTOBUF_C_TYPE_UINT32, 0, /* quantifier_offset */ - offsetof(Mgmt__PoolCreateResp, svc_ldr), NULL, NULL, 0, /* flags */ - 0, NULL, NULL /* reserved1,reserved2, etc */ - }, - { - "svc_reps", 3, PROTOBUF_C_LABEL_REPEATED, PROTOBUF_C_TYPE_UINT32, - offsetof(Mgmt__PoolCreateResp, n_svc_reps), offsetof(Mgmt__PoolCreateResp, svc_reps), NULL, - NULL, 0, /* flags */ - 0, NULL, NULL /* reserved1,reserved2, etc */ - }, - { - "tgt_ranks", 4, PROTOBUF_C_LABEL_REPEATED, PROTOBUF_C_TYPE_UINT32, - offsetof(Mgmt__PoolCreateResp, n_tgt_ranks), offsetof(Mgmt__PoolCreateResp, tgt_ranks), - NULL, NULL, 0, /* flags */ - 0, NULL, NULL /* reserved1,reserved2, etc */ - }, - { - "tier_bytes", 5, PROTOBUF_C_LABEL_REPEATED, PROTOBUF_C_TYPE_UINT64, - offsetof(Mgmt__PoolCreateResp, n_tier_bytes), offsetof(Mgmt__PoolCreateResp, tier_bytes), - NULL, NULL, 0, /* flags */ - 0, NULL, NULL /* reserved1,reserved2, etc 
*/ - }, + 4, /* field[4] = acl */ + 6, /* field[6] = fault_domains */ + 13, /* field[13] = mem_ratio */ + 10, /* field[10] = num_ranks */ + 7, /* field[7] = num_svc_reps */ + 5, /* field[5] = properties */ + 11, /* field[11] = ranks */ + 1, /* field[1] = sys */ + 12, /* field[12] = tier_bytes */ + 9, /* field[9] = tier_ratio */ + 8, /* field[8] = total_bytes */ + 2, /* field[2] = user */ + 3, /* field[3] = user_group */ + 0, /* field[0] = uuid */ }; -static const unsigned mgmt__pool_create_resp__field_indices_by_name[] = { - 0, /* field[0] = status */ - 1, /* field[1] = svc_ldr */ - 2, /* field[2] = svc_reps */ - 3, /* field[3] = tgt_ranks */ - 4, /* field[4] = tier_bytes */ -}; -static const ProtobufCIntRange mgmt__pool_create_resp__number_ranges[1 + 1] = {{1, 0}, {0, 5}}; -const ProtobufCMessageDescriptor mgmt__pool_create_resp__descriptor = { - PROTOBUF_C__MESSAGE_DESCRIPTOR_MAGIC, - "mgmt.PoolCreateResp", - "PoolCreateResp", - "Mgmt__PoolCreateResp", - "mgmt", - sizeof(Mgmt__PoolCreateResp), - 5, - mgmt__pool_create_resp__field_descriptors, - mgmt__pool_create_resp__field_indices_by_name, +static const ProtobufCIntRange mgmt__pool_create_req__number_ranges[1 + 1] = +{ + { 1, 0 }, + { 0, 14 } +}; +const ProtobufCMessageDescriptor mgmt__pool_create_req__descriptor = +{ + PROTOBUF_C__MESSAGE_DESCRIPTOR_MAGIC, + "mgmt.PoolCreateReq", + "PoolCreateReq", + "Mgmt__PoolCreateReq", + "mgmt", + sizeof(Mgmt__PoolCreateReq), + 14, + mgmt__pool_create_req__field_descriptors, + mgmt__pool_create_req__field_indices_by_name, + 1, mgmt__pool_create_req__number_ranges, + (ProtobufCMessageInit) mgmt__pool_create_req__init, + NULL,NULL,NULL /* reserved[123] */ +}; +static const ProtobufCFieldDescriptor mgmt__pool_create_resp__field_descriptors[6] = +{ + { + "status", 1, - mgmt__pool_create_resp__number_ranges, - (ProtobufCMessageInit)mgmt__pool_create_resp__init, + PROTOBUF_C_LABEL_NONE, + PROTOBUF_C_TYPE_INT32, + 0, /* quantifier_offset */ + offsetof(Mgmt__PoolCreateResp, status), + NULL, + NULL, + 0, /* flags */ + 0,NULL,NULL /* reserved1,reserved2, etc */ + }, + { + "svc_ldr", + 2, + PROTOBUF_C_LABEL_NONE, + PROTOBUF_C_TYPE_UINT32, + 0, /* quantifier_offset */ + offsetof(Mgmt__PoolCreateResp, svc_ldr), + NULL, + NULL, + 0, /* flags */ + 0,NULL,NULL /* reserved1,reserved2, etc */ + }, + { + "svc_reps", + 3, + PROTOBUF_C_LABEL_REPEATED, + PROTOBUF_C_TYPE_UINT32, + offsetof(Mgmt__PoolCreateResp, n_svc_reps), + offsetof(Mgmt__PoolCreateResp, svc_reps), + NULL, + NULL, + 0, /* flags */ + 0,NULL,NULL /* reserved1,reserved2, etc */ + }, + { + "tgt_ranks", + 4, + PROTOBUF_C_LABEL_REPEATED, + PROTOBUF_C_TYPE_UINT32, + offsetof(Mgmt__PoolCreateResp, n_tgt_ranks), + offsetof(Mgmt__PoolCreateResp, tgt_ranks), + NULL, + NULL, + 0, /* flags */ + 0,NULL,NULL /* reserved1,reserved2, etc */ + }, + { + "tier_bytes", + 5, + PROTOBUF_C_LABEL_REPEATED, + PROTOBUF_C_TYPE_UINT64, + offsetof(Mgmt__PoolCreateResp, n_tier_bytes), + offsetof(Mgmt__PoolCreateResp, tier_bytes), + NULL, + NULL, + 0, /* flags */ + 0,NULL,NULL /* reserved1,reserved2, etc */ + }, + { + "mem_file_bytes", + 6, + PROTOBUF_C_LABEL_NONE, + PROTOBUF_C_TYPE_UINT64, + 0, /* quantifier_offset */ + offsetof(Mgmt__PoolCreateResp, mem_file_bytes), NULL, NULL, - NULL /* reserved[123] */ + 0, /* flags */ + 0,NULL,NULL /* reserved1,reserved2, etc */ + }, +}; +static const unsigned mgmt__pool_create_resp__field_indices_by_name[] = { + 5, /* field[5] = mem_file_bytes */ + 0, /* field[0] = status */ + 1, /* field[1] = svc_ldr */ + 2, /* field[2] = svc_reps */ + 3, /* 
field[3] = tgt_ranks */ + 4, /* field[4] = tier_bytes */ +}; +static const ProtobufCIntRange mgmt__pool_create_resp__number_ranges[1 + 1] = +{ + { 1, 0 }, + { 0, 6 } +}; +const ProtobufCMessageDescriptor mgmt__pool_create_resp__descriptor = +{ + PROTOBUF_C__MESSAGE_DESCRIPTOR_MAGIC, + "mgmt.PoolCreateResp", + "PoolCreateResp", + "Mgmt__PoolCreateResp", + "mgmt", + sizeof(Mgmt__PoolCreateResp), + 6, + mgmt__pool_create_resp__field_descriptors, + mgmt__pool_create_resp__field_indices_by_name, + 1, mgmt__pool_create_resp__number_ranges, + (ProtobufCMessageInit) mgmt__pool_create_resp__init, + NULL,NULL,NULL /* reserved[123] */ }; static const ProtobufCFieldDescriptor mgmt__pool_destroy_req__field_descriptors[5] = { @@ -1960,41 +2109,75 @@ const ProtobufCMessageDescriptor mgmt__pool_evict_resp__descriptor = (ProtobufCMessageInit) mgmt__pool_evict_resp__init, NULL,NULL,NULL /* reserved[123] */ }; -static const ProtobufCFieldDescriptor mgmt__pool_exclude_req__field_descriptors[5] = { - { - "sys", 1, PROTOBUF_C_LABEL_NONE, PROTOBUF_C_TYPE_STRING, 0, /* quantifier_offset */ - offsetof(Mgmt__PoolExcludeReq, sys), NULL, &protobuf_c_empty_string, 0, /* flags */ - 0, NULL, NULL /* reserved1,reserved2, etc */ - }, - { - "id", 2, PROTOBUF_C_LABEL_NONE, PROTOBUF_C_TYPE_STRING, 0, /* quantifier_offset */ - offsetof(Mgmt__PoolExcludeReq, id), NULL, &protobuf_c_empty_string, 0, /* flags */ - 0, NULL, NULL /* reserved1,reserved2, etc */ - }, - { - "rank", 3, PROTOBUF_C_LABEL_NONE, PROTOBUF_C_TYPE_UINT32, 0, /* quantifier_offset */ - offsetof(Mgmt__PoolExcludeReq, rank), NULL, NULL, 0, /* flags */ - 0, NULL, NULL /* reserved1,reserved2, etc */ - }, - { - "target_idx", 4, PROTOBUF_C_LABEL_REPEATED, PROTOBUF_C_TYPE_UINT32, - offsetof(Mgmt__PoolExcludeReq, n_target_idx), offsetof(Mgmt__PoolExcludeReq, target_idx), - NULL, NULL, 0, /* flags */ - 0, NULL, NULL /* reserved1,reserved2, etc */ - }, - { - "svc_ranks", 5, PROTOBUF_C_LABEL_REPEATED, PROTOBUF_C_TYPE_UINT32, - offsetof(Mgmt__PoolExcludeReq, n_svc_ranks), offsetof(Mgmt__PoolExcludeReq, svc_ranks), - NULL, NULL, 0, /* flags */ - 0, NULL, NULL /* reserved1,reserved2, etc */ - }, +static const ProtobufCFieldDescriptor mgmt__pool_exclude_req__field_descriptors[5] = +{ + { + "sys", + 1, + PROTOBUF_C_LABEL_NONE, + PROTOBUF_C_TYPE_STRING, + 0, /* quantifier_offset */ + offsetof(Mgmt__PoolExcludeReq, sys), + NULL, + &protobuf_c_empty_string, + 0, /* flags */ + 0,NULL,NULL /* reserved1,reserved2, etc */ + }, + { + "id", + 2, + PROTOBUF_C_LABEL_NONE, + PROTOBUF_C_TYPE_STRING, + 0, /* quantifier_offset */ + offsetof(Mgmt__PoolExcludeReq, id), + NULL, + &protobuf_c_empty_string, + 0, /* flags */ + 0,NULL,NULL /* reserved1,reserved2, etc */ + }, + { + "rank", + 3, + PROTOBUF_C_LABEL_NONE, + PROTOBUF_C_TYPE_UINT32, + 0, /* quantifier_offset */ + offsetof(Mgmt__PoolExcludeReq, rank), + NULL, + NULL, + 0, /* flags */ + 0,NULL,NULL /* reserved1,reserved2, etc */ + }, + { + "target_idx", + 4, + PROTOBUF_C_LABEL_REPEATED, + PROTOBUF_C_TYPE_UINT32, + offsetof(Mgmt__PoolExcludeReq, n_target_idx), + offsetof(Mgmt__PoolExcludeReq, target_idx), + NULL, + NULL, + 0, /* flags */ + 0,NULL,NULL /* reserved1,reserved2, etc */ + }, + { + "svc_ranks", + 5, + PROTOBUF_C_LABEL_REPEATED, + PROTOBUF_C_TYPE_UINT32, + offsetof(Mgmt__PoolExcludeReq, n_svc_ranks), + offsetof(Mgmt__PoolExcludeReq, svc_ranks), + NULL, + NULL, + 0, /* flags */ + 0,NULL,NULL /* reserved1,reserved2, etc */ + }, }; static const unsigned mgmt__pool_exclude_req__field_indices_by_name[] = { - 1, /* field[1] = id */ - 2, 
/* field[2] = rank */ - 4, /* field[4] = svc_ranks */ - 0, /* field[0] = sys */ - 3, /* field[3] = target_idx */ + 1, /* field[1] = id */ + 2, /* field[2] = rank */ + 4, /* field[4] = svc_ranks */ + 0, /* field[0] = sys */ + 3, /* field[3] = target_idx */ }; static const ProtobufCIntRange mgmt__pool_exclude_req__number_ranges[1 + 1] = { @@ -2054,41 +2237,75 @@ const ProtobufCMessageDescriptor mgmt__pool_exclude_resp__descriptor = (ProtobufCMessageInit) mgmt__pool_exclude_resp__init, NULL,NULL,NULL /* reserved[123] */ }; -static const ProtobufCFieldDescriptor mgmt__pool_drain_req__field_descriptors[5] = { - { - "sys", 1, PROTOBUF_C_LABEL_NONE, PROTOBUF_C_TYPE_STRING, 0, /* quantifier_offset */ - offsetof(Mgmt__PoolDrainReq, sys), NULL, &protobuf_c_empty_string, 0, /* flags */ - 0, NULL, NULL /* reserved1,reserved2, etc */ - }, - { - "id", 2, PROTOBUF_C_LABEL_NONE, PROTOBUF_C_TYPE_STRING, 0, /* quantifier_offset */ - offsetof(Mgmt__PoolDrainReq, id), NULL, &protobuf_c_empty_string, 0, /* flags */ - 0, NULL, NULL /* reserved1,reserved2, etc */ - }, - { - "rank", 3, PROTOBUF_C_LABEL_NONE, PROTOBUF_C_TYPE_UINT32, 0, /* quantifier_offset */ - offsetof(Mgmt__PoolDrainReq, rank), NULL, NULL, 0, /* flags */ - 0, NULL, NULL /* reserved1,reserved2, etc */ - }, - { - "target_idx", 4, PROTOBUF_C_LABEL_REPEATED, PROTOBUF_C_TYPE_UINT32, - offsetof(Mgmt__PoolDrainReq, n_target_idx), offsetof(Mgmt__PoolDrainReq, target_idx), NULL, - NULL, 0, /* flags */ - 0, NULL, NULL /* reserved1,reserved2, etc */ - }, - { - "svc_ranks", 5, PROTOBUF_C_LABEL_REPEATED, PROTOBUF_C_TYPE_UINT32, - offsetof(Mgmt__PoolDrainReq, n_svc_ranks), offsetof(Mgmt__PoolDrainReq, svc_ranks), NULL, - NULL, 0, /* flags */ - 0, NULL, NULL /* reserved1,reserved2, etc */ - }, +static const ProtobufCFieldDescriptor mgmt__pool_drain_req__field_descriptors[5] = +{ + { + "sys", + 1, + PROTOBUF_C_LABEL_NONE, + PROTOBUF_C_TYPE_STRING, + 0, /* quantifier_offset */ + offsetof(Mgmt__PoolDrainReq, sys), + NULL, + &protobuf_c_empty_string, + 0, /* flags */ + 0,NULL,NULL /* reserved1,reserved2, etc */ + }, + { + "id", + 2, + PROTOBUF_C_LABEL_NONE, + PROTOBUF_C_TYPE_STRING, + 0, /* quantifier_offset */ + offsetof(Mgmt__PoolDrainReq, id), + NULL, + &protobuf_c_empty_string, + 0, /* flags */ + 0,NULL,NULL /* reserved1,reserved2, etc */ + }, + { + "rank", + 3, + PROTOBUF_C_LABEL_NONE, + PROTOBUF_C_TYPE_UINT32, + 0, /* quantifier_offset */ + offsetof(Mgmt__PoolDrainReq, rank), + NULL, + NULL, + 0, /* flags */ + 0,NULL,NULL /* reserved1,reserved2, etc */ + }, + { + "target_idx", + 4, + PROTOBUF_C_LABEL_REPEATED, + PROTOBUF_C_TYPE_UINT32, + offsetof(Mgmt__PoolDrainReq, n_target_idx), + offsetof(Mgmt__PoolDrainReq, target_idx), + NULL, + NULL, + 0, /* flags */ + 0,NULL,NULL /* reserved1,reserved2, etc */ + }, + { + "svc_ranks", + 5, + PROTOBUF_C_LABEL_REPEATED, + PROTOBUF_C_TYPE_UINT32, + offsetof(Mgmt__PoolDrainReq, n_svc_ranks), + offsetof(Mgmt__PoolDrainReq, svc_ranks), + NULL, + NULL, + 0, /* flags */ + 0,NULL,NULL /* reserved1,reserved2, etc */ + }, }; static const unsigned mgmt__pool_drain_req__field_indices_by_name[] = { - 1, /* field[1] = id */ - 2, /* field[2] = rank */ - 4, /* field[4] = svc_ranks */ - 0, /* field[0] = sys */ - 3, /* field[3] = target_idx */ + 1, /* field[1] = id */ + 2, /* field[2] = rank */ + 4, /* field[4] = svc_ranks */ + 0, /* field[0] = sys */ + 3, /* field[3] = target_idx */ }; static const ProtobufCIntRange mgmt__pool_drain_req__number_ranges[1 + 1] = { @@ -2148,49 +2365,88 @@ const ProtobufCMessageDescriptor 
mgmt__pool_drain_resp__descriptor = (ProtobufCMessageInit) mgmt__pool_drain_resp__init, NULL,NULL,NULL /* reserved[123] */ }; -static const ProtobufCFieldDescriptor mgmt__pool_extend_req__field_descriptors[6] = { - { - "sys", 1, PROTOBUF_C_LABEL_NONE, PROTOBUF_C_TYPE_STRING, 0, /* quantifier_offset */ - offsetof(Mgmt__PoolExtendReq, sys), NULL, &protobuf_c_empty_string, 0, /* flags */ - 0, NULL, NULL /* reserved1,reserved2, etc */ - }, - { - "id", 2, PROTOBUF_C_LABEL_NONE, PROTOBUF_C_TYPE_STRING, 0, /* quantifier_offset */ - offsetof(Mgmt__PoolExtendReq, id), NULL, &protobuf_c_empty_string, 0, /* flags */ - 0, NULL, NULL /* reserved1,reserved2, etc */ - }, - { - "ranks", 3, PROTOBUF_C_LABEL_REPEATED, PROTOBUF_C_TYPE_UINT32, - offsetof(Mgmt__PoolExtendReq, n_ranks), offsetof(Mgmt__PoolExtendReq, ranks), NULL, NULL, - 0, /* flags */ - 0, NULL, NULL /* reserved1,reserved2, etc */ - }, - { - "svc_ranks", 4, PROTOBUF_C_LABEL_REPEATED, PROTOBUF_C_TYPE_UINT32, - offsetof(Mgmt__PoolExtendReq, n_svc_ranks), offsetof(Mgmt__PoolExtendReq, svc_ranks), NULL, - NULL, 0, /* flags */ - 0, NULL, NULL /* reserved1,reserved2, etc */ - }, - { - "tier_bytes", 5, PROTOBUF_C_LABEL_REPEATED, PROTOBUF_C_TYPE_UINT64, - offsetof(Mgmt__PoolExtendReq, n_tier_bytes), offsetof(Mgmt__PoolExtendReq, tier_bytes), - NULL, NULL, 0, /* flags */ - 0, NULL, NULL /* reserved1,reserved2, etc */ - }, - { - "fault_domains", 6, PROTOBUF_C_LABEL_REPEATED, PROTOBUF_C_TYPE_UINT32, - offsetof(Mgmt__PoolExtendReq, n_fault_domains), - offsetof(Mgmt__PoolExtendReq, fault_domains), NULL, NULL, 0, /* flags */ - 0, NULL, NULL /* reserved1,reserved2, etc */ - }, +static const ProtobufCFieldDescriptor mgmt__pool_extend_req__field_descriptors[6] = +{ + { + "sys", + 1, + PROTOBUF_C_LABEL_NONE, + PROTOBUF_C_TYPE_STRING, + 0, /* quantifier_offset */ + offsetof(Mgmt__PoolExtendReq, sys), + NULL, + &protobuf_c_empty_string, + 0, /* flags */ + 0,NULL,NULL /* reserved1,reserved2, etc */ + }, + { + "id", + 2, + PROTOBUF_C_LABEL_NONE, + PROTOBUF_C_TYPE_STRING, + 0, /* quantifier_offset */ + offsetof(Mgmt__PoolExtendReq, id), + NULL, + &protobuf_c_empty_string, + 0, /* flags */ + 0,NULL,NULL /* reserved1,reserved2, etc */ + }, + { + "ranks", + 3, + PROTOBUF_C_LABEL_REPEATED, + PROTOBUF_C_TYPE_UINT32, + offsetof(Mgmt__PoolExtendReq, n_ranks), + offsetof(Mgmt__PoolExtendReq, ranks), + NULL, + NULL, + 0, /* flags */ + 0,NULL,NULL /* reserved1,reserved2, etc */ + }, + { + "svc_ranks", + 4, + PROTOBUF_C_LABEL_REPEATED, + PROTOBUF_C_TYPE_UINT32, + offsetof(Mgmt__PoolExtendReq, n_svc_ranks), + offsetof(Mgmt__PoolExtendReq, svc_ranks), + NULL, + NULL, + 0, /* flags */ + 0,NULL,NULL /* reserved1,reserved2, etc */ + }, + { + "tier_bytes", + 5, + PROTOBUF_C_LABEL_REPEATED, + PROTOBUF_C_TYPE_UINT64, + offsetof(Mgmt__PoolExtendReq, n_tier_bytes), + offsetof(Mgmt__PoolExtendReq, tier_bytes), + NULL, + NULL, + 0, /* flags */ + 0,NULL,NULL /* reserved1,reserved2, etc */ + }, + { + "fault_domains", + 6, + PROTOBUF_C_LABEL_REPEATED, + PROTOBUF_C_TYPE_UINT32, + offsetof(Mgmt__PoolExtendReq, n_fault_domains), + offsetof(Mgmt__PoolExtendReq, fault_domains), + NULL, + NULL, + 0, /* flags */ + 0,NULL,NULL /* reserved1,reserved2, etc */ + }, }; static const unsigned mgmt__pool_extend_req__field_indices_by_name[] = { - 5, /* field[5] = fault_domains */ - 1, /* field[1] = id */ - 2, /* field[2] = ranks */ - 3, /* field[3] = svc_ranks */ - 0, /* field[0] = sys */ - 4, /* field[4] = tier_bytes */ + 5, /* field[5] = fault_domains */ + 1, /* field[1] = id */ + 2, /* field[2] = ranks 
*/ + 3, /* field[3] = svc_ranks */ + 0, /* field[0] = sys */ + 4, /* field[4] = tier_bytes */ }; static const ProtobufCIntRange mgmt__pool_extend_req__number_ranges[1 + 1] = { @@ -2212,7 +2468,7 @@ const ProtobufCMessageDescriptor mgmt__pool_extend_req__descriptor = (ProtobufCMessageInit) mgmt__pool_extend_req__init, NULL,NULL,NULL /* reserved[123] */ }; -static const ProtobufCFieldDescriptor mgmt__pool_extend_resp__field_descriptors[2] = +static const ProtobufCFieldDescriptor mgmt__pool_extend_resp__field_descriptors[3] = { { "status", @@ -2238,15 +2494,28 @@ static const ProtobufCFieldDescriptor mgmt__pool_extend_resp__field_descriptors[ 0, /* flags */ 0,NULL,NULL /* reserved1,reserved2, etc */ }, + { + "meta_blob_bytes", + 3, + PROTOBUF_C_LABEL_NONE, + PROTOBUF_C_TYPE_UINT32, + 0, /* quantifier_offset */ + offsetof(Mgmt__PoolExtendResp, meta_blob_bytes), + NULL, + NULL, + 0, /* flags */ + 0,NULL,NULL /* reserved1,reserved2, etc */ + }, }; static const unsigned mgmt__pool_extend_resp__field_indices_by_name[] = { + 2, /* field[2] = meta_blob_bytes */ 0, /* field[0] = status */ 1, /* field[1] = tier_bytes */ }; static const ProtobufCIntRange mgmt__pool_extend_resp__number_ranges[1 + 1] = { { 1, 0 }, - { 0, 2 } + { 0, 3 } }; const ProtobufCMessageDescriptor mgmt__pool_extend_resp__descriptor = { @@ -2256,55 +2525,95 @@ const ProtobufCMessageDescriptor mgmt__pool_extend_resp__descriptor = "Mgmt__PoolExtendResp", "mgmt", sizeof(Mgmt__PoolExtendResp), - 2, + 3, mgmt__pool_extend_resp__field_descriptors, mgmt__pool_extend_resp__field_indices_by_name, 1, mgmt__pool_extend_resp__number_ranges, (ProtobufCMessageInit) mgmt__pool_extend_resp__init, NULL,NULL,NULL /* reserved[123] */ }; -static const ProtobufCFieldDescriptor mgmt__pool_reintegrate_req__field_descriptors[6] = { - { - "sys", 1, PROTOBUF_C_LABEL_NONE, PROTOBUF_C_TYPE_STRING, 0, /* quantifier_offset */ - offsetof(Mgmt__PoolReintegrateReq, sys), NULL, &protobuf_c_empty_string, 0, /* flags */ - 0, NULL, NULL /* reserved1,reserved2, etc */ - }, - { - "id", 2, PROTOBUF_C_LABEL_NONE, PROTOBUF_C_TYPE_STRING, 0, /* quantifier_offset */ - offsetof(Mgmt__PoolReintegrateReq, id), NULL, &protobuf_c_empty_string, 0, /* flags */ - 0, NULL, NULL /* reserved1,reserved2, etc */ - }, - { - "rank", 3, PROTOBUF_C_LABEL_NONE, PROTOBUF_C_TYPE_UINT32, 0, /* quantifier_offset */ - offsetof(Mgmt__PoolReintegrateReq, rank), NULL, NULL, 0, /* flags */ - 0, NULL, NULL /* reserved1,reserved2, etc */ - }, - { - "target_idx", 4, PROTOBUF_C_LABEL_REPEATED, PROTOBUF_C_TYPE_UINT32, - offsetof(Mgmt__PoolReintegrateReq, n_target_idx), - offsetof(Mgmt__PoolReintegrateReq, target_idx), NULL, NULL, 0, /* flags */ - 0, NULL, NULL /* reserved1,reserved2, etc */ - }, - { - "svc_ranks", 5, PROTOBUF_C_LABEL_REPEATED, PROTOBUF_C_TYPE_UINT32, - offsetof(Mgmt__PoolReintegrateReq, n_svc_ranks), - offsetof(Mgmt__PoolReintegrateReq, svc_ranks), NULL, NULL, 0, /* flags */ - 0, NULL, NULL /* reserved1,reserved2, etc */ - }, - { - "tier_bytes", 6, PROTOBUF_C_LABEL_REPEATED, PROTOBUF_C_TYPE_UINT64, - offsetof(Mgmt__PoolReintegrateReq, n_tier_bytes), - offsetof(Mgmt__PoolReintegrateReq, tier_bytes), NULL, NULL, 0, /* flags */ - 0, NULL, NULL /* reserved1,reserved2, etc */ - }, +static const ProtobufCFieldDescriptor mgmt__pool_reintegrate_req__field_descriptors[6] = +{ + { + "sys", + 1, + PROTOBUF_C_LABEL_NONE, + PROTOBUF_C_TYPE_STRING, + 0, /* quantifier_offset */ + offsetof(Mgmt__PoolReintegrateReq, sys), + NULL, + &protobuf_c_empty_string, + 0, /* flags */ + 0,NULL,NULL /* 
reserved1,reserved2, etc */ + }, + { + "id", + 2, + PROTOBUF_C_LABEL_NONE, + PROTOBUF_C_TYPE_STRING, + 0, /* quantifier_offset */ + offsetof(Mgmt__PoolReintegrateReq, id), + NULL, + &protobuf_c_empty_string, + 0, /* flags */ + 0,NULL,NULL /* reserved1,reserved2, etc */ + }, + { + "rank", + 3, + PROTOBUF_C_LABEL_NONE, + PROTOBUF_C_TYPE_UINT32, + 0, /* quantifier_offset */ + offsetof(Mgmt__PoolReintegrateReq, rank), + NULL, + NULL, + 0, /* flags */ + 0,NULL,NULL /* reserved1,reserved2, etc */ + }, + { + "target_idx", + 4, + PROTOBUF_C_LABEL_REPEATED, + PROTOBUF_C_TYPE_UINT32, + offsetof(Mgmt__PoolReintegrateReq, n_target_idx), + offsetof(Mgmt__PoolReintegrateReq, target_idx), + NULL, + NULL, + 0, /* flags */ + 0,NULL,NULL /* reserved1,reserved2, etc */ + }, + { + "svc_ranks", + 5, + PROTOBUF_C_LABEL_REPEATED, + PROTOBUF_C_TYPE_UINT32, + offsetof(Mgmt__PoolReintegrateReq, n_svc_ranks), + offsetof(Mgmt__PoolReintegrateReq, svc_ranks), + NULL, + NULL, + 0, /* flags */ + 0,NULL,NULL /* reserved1,reserved2, etc */ + }, + { + "tier_bytes", + 6, + PROTOBUF_C_LABEL_REPEATED, + PROTOBUF_C_TYPE_UINT64, + offsetof(Mgmt__PoolReintegrateReq, n_tier_bytes), + offsetof(Mgmt__PoolReintegrateReq, tier_bytes), + NULL, + NULL, + 0, /* flags */ + 0,NULL,NULL /* reserved1,reserved2, etc */ + }, }; static const unsigned mgmt__pool_reintegrate_req__field_indices_by_name[] = { - 1, /* field[1] = id */ - 2, /* field[2] = rank */ - 4, /* field[4] = svc_ranks */ - 0, /* field[0] = sys */ - 3, /* field[3] = target_idx */ - 5, /* field[5] = tier_bytes */ + 1, /* field[1] = id */ + 2, /* field[2] = rank */ + 4, /* field[4] = svc_ranks */ + 0, /* field[0] = sys */ + 3, /* field[3] = target_idx */ + 5, /* field[5] = tier_bytes */ }; static const ProtobufCIntRange mgmt__pool_reintegrate_req__number_ranges[1 + 1] = { @@ -2996,7 +3305,7 @@ const ProtobufCMessageDescriptor mgmt__pool_rebuild_status__descriptor = (ProtobufCMessageInit) mgmt__pool_rebuild_status__init, NULL,NULL,NULL /* reserved[123] */ }; -static const ProtobufCFieldDescriptor mgmt__pool_query_resp__field_descriptors[19] = +static const ProtobufCFieldDescriptor mgmt__pool_query_resp__field_descriptors[20] = { { "status", @@ -3226,6 +3535,18 @@ static const ProtobufCFieldDescriptor mgmt__pool_query_resp__field_descriptors[1 0, /* flags */ 0,NULL,NULL /* reserved1,reserved2, etc */ }, + { + "mem_file_bytes", + 21, + PROTOBUF_C_LABEL_NONE, + PROTOBUF_C_TYPE_UINT64, + 0, /* quantifier_offset */ + offsetof(Mgmt__PoolQueryResp, mem_file_bytes), + NULL, + NULL, + 0, /* flags */ + 0,NULL,NULL /* reserved1,reserved2, etc */ + }, }; static const unsigned mgmt__pool_query_resp__field_indices_by_name[] = { 4, /* field[4] = active_targets */ @@ -3234,6 +3555,7 @@ static const unsigned mgmt__pool_query_resp__field_indices_by_name[] = { 10, /* field[10] = enabled_ranks */ 2, /* field[2] = label */ 9, /* field[9] = leader */ + 19, /* field[19] = mem_file_bytes */ 13, /* field[13] = pool_layout_ver */ 18, /* field[18] = query_mask */ 6, /* field[6] = rebuild */ @@ -3252,7 +3574,7 @@ static const ProtobufCIntRange mgmt__pool_query_resp__number_ranges[2 + 1] = { { 1, 0 }, { 10, 8 }, - { 0, 19 } + { 0, 20 } }; const ProtobufCMessageDescriptor mgmt__pool_query_resp__descriptor = { @@ -3262,7 +3584,7 @@ const ProtobufCMessageDescriptor mgmt__pool_query_resp__descriptor = "Mgmt__PoolQueryResp", "mgmt", sizeof(Mgmt__PoolQueryResp), - 19, + 20, mgmt__pool_query_resp__field_descriptors, mgmt__pool_query_resp__field_indices_by_name, 2, mgmt__pool_query_resp__number_ranges, @@ -3904,7 
+4226,7 @@ const ProtobufCEnumDescriptor mgmt__pool_query_target_info__target_state__descri mgmt__pool_query_target_info__target_state__value_ranges, NULL,NULL,NULL,NULL /* reserved[1234] */ }; -static const ProtobufCFieldDescriptor mgmt__pool_query_target_info__field_descriptors[3] = +static const ProtobufCFieldDescriptor mgmt__pool_query_target_info__field_descriptors[4] = { { "type", @@ -3942,8 +4264,21 @@ static const ProtobufCFieldDescriptor mgmt__pool_query_target_info__field_descri 0, /* flags */ 0,NULL,NULL /* reserved1,reserved2, etc */ }, + { + "mem_file_bytes", + 4, + PROTOBUF_C_LABEL_NONE, + PROTOBUF_C_TYPE_UINT64, + 0, /* quantifier_offset */ + offsetof(Mgmt__PoolQueryTargetInfo, mem_file_bytes), + NULL, + NULL, + 0, /* flags */ + 0,NULL,NULL /* reserved1,reserved2, etc */ + }, }; static const unsigned mgmt__pool_query_target_info__field_indices_by_name[] = { + 3, /* field[3] = mem_file_bytes */ 2, /* field[2] = space */ 1, /* field[1] = state */ 0, /* field[0] = type */ @@ -3951,7 +4286,7 @@ static const unsigned mgmt__pool_query_target_info__field_indices_by_name[] = { static const ProtobufCIntRange mgmt__pool_query_target_info__number_ranges[1 + 1] = { { 1, 0 }, - { 0, 3 } + { 0, 4 } }; const ProtobufCMessageDescriptor mgmt__pool_query_target_info__descriptor = { @@ -3961,7 +4296,7 @@ const ProtobufCMessageDescriptor mgmt__pool_query_target_info__descriptor = "Mgmt__PoolQueryTargetInfo", "mgmt", sizeof(Mgmt__PoolQueryTargetInfo), - 3, + 4, mgmt__pool_query_target_info__field_descriptors, mgmt__pool_query_target_info__field_indices_by_name, 1, mgmt__pool_query_target_info__number_ranges, diff --git a/src/mgmt/pool.pb-c.h b/src/mgmt/pool.pb-c.h index 10ea70360d9..5ae75572370 100644 --- a/src/mgmt/pool.pb-c.h +++ b/src/mgmt/pool.pb-c.h @@ -160,7 +160,7 @@ struct _Mgmt__PoolCreateReq /* * formatted group e.g. "builders@" */ - char *user_group; + char *user_group; /* * Access Control Entries in short string format */ @@ -180,25 +180,25 @@ struct _Mgmt__PoolCreateReq /* * Fault domain tree, minimal format */ - size_t n_fault_domains; - uint32_t *fault_domains; + size_t n_fault_domains; + uint32_t *fault_domains; /* * desired number of pool service replicas */ - uint32_t num_svc_reps; + uint32_t num_svc_reps; /* * Total pool size in bytes */ - uint64_t total_bytes; + uint64_t total_bytes; /* * Ratio of storage tiers expressed as % of totalbytes */ - size_t n_tier_ratio; - double *tier_ratio; + size_t n_tier_ratio; + double *tier_ratio; /* * Number of target ranks to use */ - uint32_t num_ranks; + uint32_t num_ranks; /* * target ranks */ @@ -207,16 +207,17 @@ struct _Mgmt__PoolCreateReq /* * Size in bytes of storage tier */ - size_t n_tier_bytes; - uint64_t *tier_bytes; + size_t n_tier_bytes; + uint64_t *tier_bytes; + /* + * Fraction of meta-blob-sz to use as mem-file-sz + */ + float mem_ratio; }; -#define MGMT__POOL_CREATE_REQ__INIT \ - { \ - PROTOBUF_C_MESSAGE_INIT(&mgmt__pool_create_req__descriptor) \ - , (char *)protobuf_c_empty_string, (char *)protobuf_c_empty_string, \ - (char *)protobuf_c_empty_string, (char *)protobuf_c_empty_string, 0, NULL, 0, \ - NULL, 0, NULL, 0, 0, 0, NULL, 0, 0, NULL, 0, NULL \ - } +#define MGMT__POOL_CREATE_REQ__INIT \ + { PROTOBUF_C_MESSAGE_INIT (&mgmt__pool_create_req__descriptor) \ + , (char *)protobuf_c_empty_string, (char *)protobuf_c_empty_string, (char *)protobuf_c_empty_string, (char *)protobuf_c_empty_string, 0,NULL, 0,NULL, 0,NULL, 0, 0, 0,NULL, 0, 0,NULL, 0,NULL, 0 } + /* * PoolCreateResp returns created pool uuid and ranks. 
@@ -243,16 +244,19 @@ struct _Mgmt__PoolCreateResp size_t n_tgt_ranks; uint32_t *tgt_ranks; /* - * storage tiers allocated to pool + * per-rank storage tier sizes allocated in pool */ size_t n_tier_bytes; - uint64_t *tier_bytes; + uint64_t *tier_bytes; + /* + * per-rank accumulated value of memory file sizes + */ + uint64_t mem_file_bytes; }; -#define MGMT__POOL_CREATE_RESP__INIT \ - { \ - PROTOBUF_C_MESSAGE_INIT(&mgmt__pool_create_resp__descriptor) \ - , 0, 0, 0, NULL, 0, NULL, 0, NULL \ - } +#define MGMT__POOL_CREATE_RESP__INIT \ + { PROTOBUF_C_MESSAGE_INIT (&mgmt__pool_create_resp__descriptor) \ + , 0, 0, 0,NULL, 0,NULL, 0,NULL, 0 } + /* * PoolDestroyReq supplies pool identifier and force flag. @@ -386,8 +390,8 @@ struct _Mgmt__PoolExcludeReq /* * target ranks */ - size_t n_target_idx; - uint32_t *target_idx; + size_t n_target_idx; + uint32_t *target_idx; /* * List of pool service ranks */ @@ -436,8 +440,8 @@ struct _Mgmt__PoolDrainReq /* * rank targets */ - size_t n_target_idx; - uint32_t *target_idx; + size_t n_target_idx; + uint32_t *target_idx; /* * List of pool service ranks */ @@ -492,13 +496,13 @@ struct _Mgmt__PoolExtendReq /* * Size in bytes of storage tiers */ - size_t n_tier_bytes; - uint64_t *tier_bytes; + size_t n_tier_bytes; + uint64_t *tier_bytes; /* * fault domain tree, minimal format */ - size_t n_fault_domains; - uint32_t *fault_domains; + size_t n_fault_domains; + uint32_t *fault_domains; }; #define MGMT__POOL_EXTEND_REQ__INIT \ { PROTOBUF_C_MESSAGE_INIT (&mgmt__pool_extend_req__descriptor) \ @@ -520,10 +524,14 @@ struct _Mgmt__PoolExtendResp */ size_t n_tier_bytes; uint64_t *tier_bytes; + /* + * Size in bytes of metadata blob on SSD + */ + uint32_t meta_blob_bytes; }; #define MGMT__POOL_EXTEND_RESP__INIT \ { PROTOBUF_C_MESSAGE_INIT (&mgmt__pool_extend_resp__descriptor) \ - , 0, 0,NULL } + , 0, 0,NULL, 0 } /* @@ -547,8 +555,8 @@ struct _Mgmt__PoolReintegrateReq /* * target ranks */ - size_t n_target_idx; - uint32_t *target_idx; + size_t n_target_idx; + uint32_t *target_idx; /* * List of pool service ranks */ @@ -557,8 +565,8 @@ struct _Mgmt__PoolReintegrateReq /* * Size in bytes of storage tiers */ - size_t n_tier_bytes; - uint64_t *tier_bytes; + size_t n_tier_bytes; + uint64_t *tier_bytes; }; #define MGMT__POOL_REINTEGRATE_REQ__INIT \ { PROTOBUF_C_MESSAGE_INIT (&mgmt__pool_reintegrate_req__descriptor) \ @@ -857,10 +865,14 @@ struct _Mgmt__PoolQueryResp * Bitmask of pool query options used */ uint64_t query_mask; + /* + * per-pool accumulated value of memory file sizes + */ + uint64_t mem_file_bytes; }; #define MGMT__POOL_QUERY_RESP__INIT \ { PROTOBUF_C_MESSAGE_INIT (&mgmt__pool_query_resp__descriptor) \ - , 0, (char *)protobuf_c_empty_string, (char *)protobuf_c_empty_string, 0, 0, 0, NULL, 0,NULL, 0, 0, (char *)protobuf_c_empty_string, (char *)protobuf_c_empty_string, 0, 0, 0, MGMT__POOL_SERVICE_STATE__Creating, 0, 0,NULL, 0 } + , 0, (char *)protobuf_c_empty_string, (char *)protobuf_c_empty_string, 0, 0, 0, NULL, 0,NULL, 0, 0, (char *)protobuf_c_empty_string, (char *)protobuf_c_empty_string, 0, 0, 0, MGMT__POOL_SERVICE_STATE__Creating, 0, 0,NULL, 0, 0 } typedef enum { @@ -1102,10 +1114,14 @@ struct _Mgmt__PoolQueryTargetInfo */ size_t n_space; Mgmt__StorageTargetUsage **space; + /* + * per-target value of memory file size + */ + uint64_t mem_file_bytes; }; #define MGMT__POOL_QUERY_TARGET_INFO__INIT \ { PROTOBUF_C_MESSAGE_INIT (&mgmt__pool_query_target_info__descriptor) \ - , MGMT__POOL_QUERY_TARGET_INFO__TARGET_TYPE__UNKNOWN, 
MGMT__POOL_QUERY_TARGET_INFO__TARGET_STATE__STATE_UNKNOWN, 0,NULL } + , MGMT__POOL_QUERY_TARGET_INFO__TARGET_TYPE__UNKNOWN, MGMT__POOL_QUERY_TARGET_INFO__TARGET_STATE__STATE_UNKNOWN, 0,NULL, 0 } /* diff --git a/src/mgmt/rpc.h b/src/mgmt/rpc.h index 47e4deacc20..e7b5501be1f 100644 --- a/src/mgmt/rpc.h +++ b/src/mgmt/rpc.h @@ -166,7 +166,8 @@ CRT_RPC_DECLARE(mgmt_pool_list, DAOS_ISEQ_MGMT_POOL_LIST, DAOS_OSEQ_MGMT_POOL_LI ((uuid_t) (tc_pool_uuid) CRT_VAR) \ ((d_string_t) (tc_tgt_dev) CRT_VAR) \ ((daos_size_t) (tc_scm_size) CRT_VAR) \ - ((daos_size_t) (tc_nvme_size) CRT_VAR) + ((daos_size_t) (tc_nvme_size) CRT_VAR) \ + ((daos_size_t) (tc_meta_size) CRT_VAR) #define DAOS_OSEQ_MGMT_TGT_CREATE /* output fields */ \ ((d_rank_t) (tc_ranks) CRT_ARRAY) \ diff --git a/src/mgmt/srv_drpc.c b/src/mgmt/srv_drpc.c index be1a67a8c54..cd5fcdcb999 100644 --- a/src/mgmt/srv_drpc.c +++ b/src/mgmt/srv_drpc.c @@ -441,6 +441,7 @@ ds_mgmt_drpc_pool_create(Drpc__Call *drpc_req, Drpc__Response *drpc_resp) daos_prop_t *base_props = NULL; uint8_t *body; size_t len; + size_t scm_size; int rc; /* Unpack the inner request from the drpc call body */ @@ -495,13 +496,18 @@ ds_mgmt_drpc_pool_create(Drpc__Call *drpc_req, Drpc__Response *drpc_resp) } /** - * Ranks to allocate targets (in) & svc for pool replicas (out). Meta-blob size set equal - * to SCM size for MD-on-SSD phase 1. + * Ranks to allocate targets (in) & svc for pool replicas (out). Mapping of tier_bytes in + * MD-on-SSD mode is (tier0*mem_ratio)->scm_size (mem-file-size), tier0->meta_size and + * tier1->nvme_size (data_size). */ - rc = ds_mgmt_create_pool(pool_uuid, req->sys, "pmem", targets, - req->tier_bytes[DAOS_MEDIA_SCM], req->tier_bytes[DAOS_MEDIA_NVME], - prop, &svc, req->n_fault_domains, req->fault_domains, - req->tier_bytes[DAOS_MEDIA_SCM]); + + scm_size = req->tier_bytes[DAOS_MEDIA_SCM]; + if (req->mem_ratio) + scm_size *= (double)req->mem_ratio; + + rc = ds_mgmt_create_pool(pool_uuid, req->sys, "pmem", targets, scm_size, + req->tier_bytes[DAOS_MEDIA_NVME], prop, &svc, req->n_fault_domains, + req->fault_domains, req->tier_bytes[DAOS_MEDIA_SCM]); if (rc != 0) { D_ERROR("failed to create pool: "DF_RC"\n", DP_RC(rc)); goto out; @@ -510,6 +516,14 @@ ds_mgmt_drpc_pool_create(Drpc__Call *drpc_req, Drpc__Response *drpc_resp) rc = pool_create_fill_resp(&resp, pool_uuid, svc); d_rank_list_free(svc); + /** + * TODO DAOS-16209: Populate per-rank VOS-file sizes. For now just calculate here based on + * the supplied input values but really should be returned from + * ds_mgmt_pool_query() through the VOS query API and set in + * pool_create_fill_resp(). Return zero for non-MD-on-SSD mode. 
+ */ + resp.mem_file_bytes = req->tier_bytes[DAOS_MEDIA_SCM] * req->mem_ratio; + out: resp.status = rc; len = mgmt__pool_create_resp__get_packed_size(&resp); @@ -696,7 +710,7 @@ ds_mgmt_drpc_pool_evict(Drpc__Call *drpc_req, Drpc__Response *drpc_resp) static int pool_change_target_state(char *id, d_rank_list_t *svc_ranks, size_t n_target_idx, uint32_t *target_idx, uint32_t rank, pool_comp_state_t state, - size_t scm_size, size_t nvme_size) + size_t scm_size, size_t nvme_size, size_t meta_blob_bytes) { uuid_t uuid; struct pool_target_addr_list target_addr_list; @@ -725,7 +739,7 @@ pool_change_target_state(char *id, d_rank_list_t *svc_ranks, size_t n_target_idx } rc = ds_mgmt_pool_target_update_state(uuid, svc_ranks, &target_addr_list, state, scm_size, - nvme_size); + nvme_size, meta_blob_bytes); if (rc != 0) { D_ERROR("Failed to set pool target up "DF_UUID": "DF_RC"\n", DP_UUID(uuid), DP_RC(rc)); @@ -765,7 +779,7 @@ ds_mgmt_drpc_pool_exclude(Drpc__Call *drpc_req, Drpc__Response *drpc_resp) rc = pool_change_target_state(req->id, svc_ranks, req->n_target_idx, req->target_idx, req->rank, PO_COMP_ST_DOWN, 0 /* scm_size */, - 0 /* nvme_size */); + 0 /* nvme_size */, 0 /* meta_blob_bytes */); d_rank_list_free(svc_ranks); @@ -814,7 +828,7 @@ ds_mgmt_drpc_pool_drain(Drpc__Call *drpc_req, Drpc__Response *drpc_resp) rc = pool_change_target_state(req->id, svc_ranks, req->n_target_idx, req->target_idx, req->rank, PO_COMP_ST_DRAIN, 0 /* scm_size */, - 0 /* nvme_size */); + 0 /* nvme_size */, 0 /* meta_blob_bytes */); d_rank_list_free(svc_ranks); @@ -883,7 +897,7 @@ ds_mgmt_drpc_pool_extend(Drpc__Call *drpc_req, Drpc__Response *drpc_resp) if (svc_ranks == NULL) D_GOTO(out_list, rc = -DER_NOMEM); - rc = ds_mgmt_pool_extend(uuid, svc_ranks, rank_list, "pmem", scm_bytes, nvme_bytes, + rc = ds_mgmt_pool_extend(uuid, svc_ranks, rank_list, "pmem", scm_bytes, nvme_bytes, 0, req->n_fault_domains, req->fault_domains); if (rc != 0) @@ -898,6 +912,7 @@ ds_mgmt_drpc_pool_extend(Drpc__Call *drpc_req, Drpc__Response *drpc_resp) */ resp.n_tier_bytes = req->n_tier_bytes; resp.tier_bytes = req->tier_bytes; + resp.meta_blob_bytes = 0; out_list: d_rank_list_free(rank_list); @@ -957,7 +972,7 @@ ds_mgmt_drpc_pool_reintegrate(Drpc__Call *drpc_req, Drpc__Response *drpc_resp) D_GOTO(out, rc = -DER_NOMEM); rc = pool_change_target_state(req->id, svc_ranks, req->n_target_idx, req->target_idx, - req->rank, PO_COMP_ST_UP, scm_bytes, nvme_bytes); + req->rank, PO_COMP_ST_UP, scm_bytes, nvme_bytes, 0); d_rank_list_free(svc_ranks); @@ -1831,6 +1846,13 @@ ds_mgmt_drpc_pool_query(Drpc__Call *drpc_req, Drpc__Response *drpc_resp) pool_rebuild_status_from_info(&rebuild, &pool_info.pi_rebuild_st); resp.rebuild = &rebuild; + /** + * TODO DAOS-16209: Populate VOS-file sizes in response. For now just return the meta-blob + * size until VOS query API is updated. When updated, zero-value should + * be returned in non-MD-on-SSD mode. 
+ */ + resp.mem_file_bytes = scm.total; + error: resp.status = rc; diff --git a/src/mgmt/srv_internal.h b/src/mgmt/srv_internal.h index 11fe77c9b1c..a9de41a39bf 100644 --- a/src/mgmt/srv_internal.h +++ b/src/mgmt/srv_internal.h @@ -82,20 +82,21 @@ int ds_mgmt_group_update_handler(struct mgmt_grp_up_in *in); /** srv_pool.c */ int ds_mgmt_create_pool(uuid_t pool_uuid, const char *group, char *tgt_dev, d_rank_list_t *targets, size_t scm_size, size_t nvme_size, daos_prop_t *prop, d_rank_list_t **svcp, - int domains_nr, uint32_t *domains, size_t meta_blob_size); + int domains_nr, uint32_t *domains, size_t meta_blob_bytes); int ds_mgmt_destroy_pool(uuid_t pool_uuid, d_rank_list_t *svc_ranks); int ds_mgmt_evict_pool(uuid_t pool_uuid, d_rank_list_t *svc_ranks, uuid_t *handles, size_t n_handles, uint32_t destroy, uint32_t force_destroy, char *machine, uint32_t *count); int ds_mgmt_pool_target_update_state(uuid_t pool_uuid, d_rank_list_t *svc_ranks, struct pool_target_addr_list *target_addrs, - pool_comp_state_t state, size_t scm_size, size_t nvme_size); + pool_comp_state_t state, size_t scm_size, size_t nvme_size, + size_t meta_blob_bytes); int ds_mgmt_pool_reintegrate(uuid_t pool_uuid, d_rank_list_t *svc_ranks, uint32_t reint_rank, struct pool_target_id_list *reint_list); int ds_mgmt_pool_extend(uuid_t pool_uuid, d_rank_list_t *svc_ranks, d_rank_list_t *rank_list, char *tgt_dev, - size_t scm_size, size_t nvme_size, + size_t scm_size, size_t nvme_size, size_t meta_blob_bytes, size_t domains_nr, uint32_t *domains); int ds_mgmt_pool_set_prop(uuid_t pool_uuid, d_rank_list_t *svc_ranks, daos_prop_t *prop); diff --git a/src/mgmt/srv_pool.c b/src/mgmt/srv_pool.c index 0497058191c..9674b9e2652 100644 --- a/src/mgmt/srv_pool.c +++ b/src/mgmt/srv_pool.c @@ -83,7 +83,7 @@ pool_create_rpc_timeout(crt_rpc_t *tc_req, size_t scm_size) static int ds_mgmt_tgt_pool_create_ranks(uuid_t pool_uuid, char *tgt_dev, d_rank_list_t *rank_list, - size_t scm_size, size_t nvme_size) + size_t scm_size, size_t nvme_size, size_t meta_size) { crt_rpc_t *tc_req; crt_opcode_t opc; @@ -117,6 +117,7 @@ ds_mgmt_tgt_pool_create_ranks(uuid_t pool_uuid, char *tgt_dev, d_rank_list_t *ra tc_in->tc_tgt_dev = tgt_dev; tc_in->tc_scm_size = scm_size; tc_in->tc_nvme_size = nvme_size; + tc_in->tc_meta_size = meta_size; rc = dss_rpc_send(tc_req); if (rc == 0 && DAOS_FAIL_CHECK(DAOS_POOL_CREATE_FAIL_CORPC)) rc = -DER_TIMEDOUT; @@ -170,14 +171,15 @@ ds_mgmt_pool_svc_create(uuid_t pool_uuid, int ntargets, const char *group, d_ran int ds_mgmt_create_pool(uuid_t pool_uuid, const char *group, char *tgt_dev, d_rank_list_t *targets, size_t scm_size, size_t nvme_size, daos_prop_t *prop, d_rank_list_t **svcp, - int domains_nr, uint32_t *domains, size_t meta_blob_size) + int domains_nr, uint32_t *domains, size_t meta_size) { d_rank_list_t *pg_ranks = NULL; d_rank_list_t *pg_targets = NULL; int rc; int rc_cleanup; - D_DEBUG(DB_MGMT, DF_UUID ": meta blob size %ld", DP_UUID(pool_uuid), meta_blob_size); + D_DEBUG(DB_MGMT, DF_UUID ": create scm/meta/nvme sizes %ld/%ld/%ld\n", DP_UUID(pool_uuid), + scm_size, meta_size, nvme_size); /* Sanity check targets versus cart's current primary group members. * If any targets not in PG, flag error before MGMT_TGT_ corpcs fail. 
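In ds_mgmt_drpc_pool_create() above, the request's tier sizes are remapped for MD-on-SSD: tier0 supplies the per-rank meta-blob size, tier0 * mem_ratio the VOS memory-file (scm) size, and tier1 the NVMe data size; a zero mem_ratio leaves scm_size equal to tier0, preserving the existing PMem behaviour. The standalone sketch below only restates that arithmetic; the helper name and struct are hypothetical and not part of the patch:

```c
#include <stddef.h>
#include <stdint.h>

/* Hypothetical container for the three sizes handed to ds_mgmt_create_pool(). */
struct create_sizes {
	size_t scm_size;  /* per-rank VOS memory-file size        */
	size_t meta_size; /* per-rank metadata blob size (on SSD) */
	size_t nvme_size; /* per-rank NVMe data size              */
};

static inline struct create_sizes
map_tier_bytes(uint64_t tier0_bytes, uint64_t tier1_bytes, float mem_ratio)
{
	struct create_sizes s;

	s.meta_size = tier0_bytes;
	/* mem_ratio == 0 (PMem mode): memory-file size equals the SCM tier size. */
	s.scm_size  = mem_ratio ? (size_t)((double)tier0_bytes * mem_ratio) : tier0_bytes;
	s.nvme_size = tier1_bytes;
	return s;
}
```

These three sizes then flow through ds_mgmt_create_pool() and ds_mgmt_tgt_pool_create_ranks() into the new tc_meta_size field added to the MGMT_TGT_CREATE RPC in rpc.h, as shown in the srv_pool.c hunks above and below.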
@@ -215,7 +217,7 @@ ds_mgmt_create_pool(uuid_t pool_uuid, const char *group, char *tgt_dev, d_rank_l } rc = ds_mgmt_tgt_pool_create_ranks(pool_uuid, tgt_dev, targets, - scm_size, nvme_size); + scm_size, nvme_size, meta_size); if (rc != 0) { D_ERROR("creating pool "DF_UUID" on ranks failed: rc "DF_RC"\n", DP_UUID(pool_uuid), DP_RC(rc)); @@ -280,8 +282,8 @@ ds_mgmt_destroy_pool(uuid_t pool_uuid, d_rank_list_t *ranks) int ds_mgmt_pool_extend(uuid_t pool_uuid, d_rank_list_t *svc_ranks, d_rank_list_t *rank_list, - char *tgt_dev, size_t scm_size, size_t nvme_size, size_t domains_nr, - uint32_t *domains) + char *tgt_dev, size_t scm_size, size_t nvme_size, size_t meta_size, + size_t domains_nr, uint32_t *domains) { d_rank_list_t *unique_add_ranks = NULL; int ntargets; @@ -294,7 +296,7 @@ ds_mgmt_pool_extend(uuid_t pool_uuid, d_rank_list_t *svc_ranks, d_rank_list_t *r D_GOTO(out, rc); rc = ds_mgmt_tgt_pool_create_ranks(pool_uuid, tgt_dev, unique_add_ranks, scm_size, - nvme_size); + nvme_size, meta_size); if (rc != 0) { D_ERROR("creating pool on ranks "DF_UUID" failed: rc "DF_RC"\n", DP_UUID(pool_uuid), DP_RC(rc)); @@ -336,7 +338,8 @@ ds_mgmt_evict_pool(uuid_t pool_uuid, d_rank_list_t *svc_ranks, uuid_t *handles, int ds_mgmt_pool_target_update_state(uuid_t pool_uuid, d_rank_list_t *svc_ranks, struct pool_target_addr_list *target_addrs, - pool_comp_state_t state, size_t scm_size, size_t nvme_size) + pool_comp_state_t state, size_t scm_size, size_t nvme_size, + size_t meta_size) { int rc; @@ -354,7 +357,7 @@ ds_mgmt_pool_target_update_state(uuid_t pool_uuid, d_rank_list_t *svc_ranks, reint_ranks.rl_ranks = &target_addrs->pta_addrs[0].pta_rank; rc = ds_mgmt_tgt_pool_create_ranks(pool_uuid, "pmem", &reint_ranks, scm_size, - nvme_size); + nvme_size, meta_size); if (rc != 0) { D_ERROR("creating pool on ranks "DF_UUID" failed: rc " DF_RC"\n", DP_UUID(pool_uuid), DP_RC(rc)); diff --git a/src/mgmt/srv_target.c b/src/mgmt/srv_target.c index 7975a2115d4..fa54a05b529 100644 --- a/src/mgmt/srv_target.c +++ b/src/mgmt/srv_target.c @@ -576,7 +576,9 @@ recreate_pooltgts() DP_UUID(pool_info->spi_id), DP_RC(rc)); goto out; } - rc = tgt_recreate(pool_info->spi_id, pool_info->spi_blob_sz[SMD_DEV_TYPE_META], + + D_ASSERT(pool_info->spi_scm_sz > 0); + rc = tgt_recreate(pool_info->spi_id, pool_info->spi_scm_sz, pool_info->spi_tgt_cnt[SMD_DEV_TYPE_META], rdb_blob_sz); if (rc) goto out; @@ -719,6 +721,7 @@ struct vos_pool_arg { uuid_t vpa_uuid; daos_size_t vpa_scm_size; daos_size_t vpa_nvme_size; + daos_size_t vpa_meta_size; }; static int @@ -735,7 +738,8 @@ tgt_vos_create_one(void *varg) return rc; rc = vos_pool_create(path, (unsigned char *)vpa->vpa_uuid, vpa->vpa_scm_size, - vpa->vpa_nvme_size, 0, 0 /* version */, NULL); + vpa->vpa_nvme_size, vpa->vpa_meta_size, 0 /* flags */, + 0 /* version */, NULL); if (rc) D_ERROR(DF_UUID": failed to init vos pool %s: %d\n", DP_UUID(vpa->vpa_uuid), path, rc); @@ -755,7 +759,8 @@ tgt_vos_preallocate(uuid_t uuid, daos_size_t scm_size, int tgt_id) if (rc) goto out; - D_DEBUG(DB_MGMT, DF_UUID": creating vos file %s\n", DP_UUID(uuid), path); + D_DEBUG(DB_MGMT, DF_UUID ": creating vos file %s (%ld bytes)\n", DP_UUID(uuid), path, + scm_size); fd = open(path, O_CREAT|O_RDWR, 0600); if (fd < 0) { @@ -1043,15 +1048,14 @@ tgt_create_preallocate(void *arg) * 16MB minimum per pmemobj file (SCM partition) */ D_ASSERT(dss_tgt_nr > 0); + D_ASSERT((tca->tca_scm_size / dss_tgt_nr) >= (1 << 24)); if (!bio_nvme_configured(SMD_DEV_TYPE_META)) { - rc = 
tgt_vos_preallocate_sequential(tca->tca_ptrec->dptr_uuid, - max(tca->tca_scm_size / dss_tgt_nr, - 1 << 24), dss_tgt_nr); + rc = tgt_vos_preallocate_sequential( + tca->tca_ptrec->dptr_uuid, tca->tca_scm_size / dss_tgt_nr, dss_tgt_nr); } else { - rc = tgt_vos_preallocate_parallel(tca->tca_ptrec->dptr_uuid, - max(tca->tca_scm_size / dss_tgt_nr, - 1 << 24), dss_tgt_nr, - &tca->tca_ptrec->cancel_create); + rc = tgt_vos_preallocate_parallel( + tca->tca_ptrec->dptr_uuid, tca->tca_scm_size / dss_tgt_nr, dss_tgt_nr, + &tca->tca_ptrec->cancel_create); } if (rc) goto out; @@ -1078,6 +1082,8 @@ ds_mgmt_hdlr_tgt_create(crt_rpc_t *tc_req) pthread_t thread; bool canceled_thread = false; int rc = 0; + size_t tgt_scm_sz; + size_t tgt_meta_sz; /** incoming request buffer */ tc_in = crt_req_get(tc_req); @@ -1114,6 +1120,17 @@ ds_mgmt_hdlr_tgt_create(crt_rpc_t *tc_req) D_DEBUG(DB_MGMT, DF_UUID": record inserted to dpt_creates_ht\n", DP_UUID(tca.tca_ptrec->dptr_uuid)); + tgt_scm_sz = tc_in->tc_scm_size / dss_tgt_nr; + tgt_meta_sz = tc_in->tc_meta_size / dss_tgt_nr; + rc = vos_pool_roundup_size(&tgt_scm_sz, &tgt_meta_sz); + if (rc) { + D_ERROR(DF_UUID": failed to roundup the vos size: "DF_RC"\n", + DP_UUID(tc_in->tc_pool_uuid), DP_RC(rc)); + goto out_rec; + } + tc_in->tc_scm_size = tgt_scm_sz * dss_tgt_nr; + tc_in->tc_meta_size = tgt_meta_sz * dss_tgt_nr; + tca.tca_scm_size = tc_in->tc_scm_size; tca.tca_nvme_size = tc_in->tc_nvme_size; tca.tca_dx = dss_current_xstream(); @@ -1178,8 +1195,9 @@ ds_mgmt_hdlr_tgt_create(crt_rpc_t *tc_req) D_ASSERT(dss_tgt_nr > 0); uuid_copy(vpa.vpa_uuid, tc_in->tc_pool_uuid); /* A zero size accommodates the existing file */ - vpa.vpa_scm_size = 0; + vpa.vpa_scm_size = 0; vpa.vpa_nvme_size = tc_in->tc_nvme_size / dss_tgt_nr; + vpa.vpa_meta_size = tc_in->tc_meta_size / dss_tgt_nr; rc = dss_thread_collective(tgt_vos_create_one, &vpa, DSS_ULT_DEEP_STACK); if (rc) { D_ERROR(DF_UUID": thread collective tgt_vos_create_one failed, "DF_RC"\n", diff --git a/src/mgmt/tests/mocks.c b/src/mgmt/tests/mocks.c index 912a36f293a..4840030bca0 100644 --- a/src/mgmt/tests/mocks.c +++ b/src/mgmt/tests/mocks.c @@ -427,7 +427,8 @@ uuid_t ds_mgmt_target_update_uuid; int ds_mgmt_pool_target_update_state(uuid_t pool_uuid, d_rank_list_t *svc_ranks, struct pool_target_addr_list *target_addrs, - pool_comp_state_t state, size_t scm_size, size_t nvme_size) + pool_comp_state_t state, size_t scm_size, size_t nvme_size, + size_t meta_blob_bytes) { uuid_copy(ds_mgmt_target_update_uuid, pool_uuid); return ds_mgmt_target_update_return; @@ -445,7 +446,7 @@ uuid_t ds_mgmt_pool_extend_uuid; int ds_mgmt_pool_extend(uuid_t pool_uuid, d_rank_list_t *svc_ranks, d_rank_list_t *rank_list, - char *tgt_dev, size_t scm_size, size_t nvme_size, + char *tgt_dev, size_t scm_size, size_t nvme_size, size_t meta_blob_bytes, size_t domains_nr, uint32_t *domains) { uuid_copy(ds_mgmt_pool_extend_uuid, pool_uuid); @@ -525,7 +526,7 @@ ds_mgmt_group_update_handler(struct mgmt_grp_up_in *in) int ds_mgmt_create_pool(uuid_t pool_uuid, const char *group, char *tgt_dev, d_rank_list_t *targets, size_t scm_size, size_t nvme_size, daos_prop_t *prop, d_rank_list_t **svcp, - int domains_nr, uint32_t *domains, size_t meta_blob_size) + int domains_nr, uint32_t *domains, size_t meta_blob_bytes) { return 0; } diff --git a/src/object/cli_coll.c b/src/object/cli_coll.c index 3285bec58b3..d517e3269d6 100644 --- a/src/object/cli_coll.c +++ b/src/object/cli_coll.c @@ -873,7 +873,7 @@ queue_coll_query_task(tse_task_t *api_task, struct obj_auxi_args *obj_auxi, 
stru 0, 0, ocdc); for (i = 0; i < ocdc->grp_nr; i++) { - obj_coll_disp_dest(ocdc, coa->coa_dcts, &tgt_ep); + obj_coll_disp_dest(ocdc, coa->coa_dcts, &tgt_ep, obj->cob_md.omd_id); tmp = coa->coa_dcts[ocdc->cur_pos].dct_shards[tgt_ep.ep_tag].dcs_idx; rc = queue_shard_query_key_task(api_task, obj_auxi, epoch, tmp, map_ver, diff --git a/src/object/cli_obj.c b/src/object/cli_obj.c index fcc4f7601f4..75d661d0665 100644 --- a/src/object/cli_obj.c +++ b/src/object/cli_obj.c @@ -1718,15 +1718,20 @@ dc_obj_retry_delay(tse_task_t *task, int err, uint16_t *retry_cnt, uint16_t *inp uint32_t timeout_sec) { uint32_t delay = 0; + uint32_t limit = 4; /* - * Randomly delay 5 - 68 us if it is not the first retry for + * Randomly delay 5 ~ 1028 us if it is not the first retry for * -DER_INPROGRESS || -DER_UPDATE_AGAIN cases. */ ++(*retry_cnt); if (err == -DER_INPROGRESS || err == -DER_UPDATE_AGAIN) { if (++(*inprogress_cnt) > 1) { - delay = (d_rand() & ((1 << 6) - 1)) + 5; + limit += *inprogress_cnt; + if (limit > 10) + limit = 10; + + delay = (d_rand() & ((1 << limit) - 1)) + 5; /* Rebuild is being established on the server side, wait a bit longer */ if (err == -DER_UPDATE_AGAIN) delay <<= 10; @@ -4856,11 +4861,14 @@ obj_comp_cb(tse_task_t *task, void *data) D_ASSERT(daos_handle_is_inval(obj_auxi->th)); D_ASSERT(obj_is_modification_opc(obj_auxi->opc)); - if (task->dt_result == -DER_TX_ID_REUSED && obj_auxi->retry_cnt != 0) - /* XXX: it is must because miss to set "RESEND" flag, that is bug. */ - D_ASSERTF(0, - "Miss 'RESEND' flag (%x) when resend the RPC for task %p: %u\n", - obj_auxi->flags, task, obj_auxi->retry_cnt); + if (task->dt_result == -DER_TX_ID_REUSED && obj_auxi->retry_cnt != 0) { + D_ERROR("TX ID maybe reused for unknown reason, " + "task %p, opc %u, flags %x, retry_cnt %u\n", + task, obj_auxi->opc, obj_auxi->flags, obj_auxi->retry_cnt); + task->dt_result = -DER_IO; + obj_auxi->io_retry = 0; + goto args_fini; + } if (obj_auxi->opc == DAOS_OBJ_RPC_UPDATE) { daos_obj_rw_t *api_args = dc_task_get_args(obj_auxi->obj_task); @@ -4886,6 +4894,7 @@ obj_comp_cb(tse_task_t *task, void *data) } } +args_fini: if (obj_auxi->opc == DAOS_OBJ_RPC_COLL_PUNCH) obj_coll_oper_args_fini(&obj_auxi->p_args.pa_coa); diff --git a/src/object/cli_shard.c b/src/object/cli_shard.c index 0c9dfc1418e..9f084140f80 100644 --- a/src/object/cli_shard.c +++ b/src/object/cli_shard.c @@ -1451,11 +1451,14 @@ obj_shard_coll_punch_cb(tse_task_t *task, void *data) shard_args->pa_auxi.obj_auxi->max_delay = timeout; } - DL_CDEBUG(task->dt_result < 0, DLOG_ERR, DB_IO, task->dt_result, - "DAOS_OBJ_RPC_COLL_PUNCH RPC %p for "DF_UOID" with DTX " - DF_DTI" for task %p, map_ver %u/%u, flags %lx/%x, %s layout", - rpc, DP_UOID(ocpi->ocpi_oid), DP_DTI(&ocpi->ocpi_xid), task, ocpi->ocpi_map_ver, - *cb_args->cpca_ver, (unsigned long)ocpi->ocpi_api_flags, ocpi->ocpi_flags, + DL_CDEBUG(task->dt_result < 0 && task->dt_result != -DER_INPROGRESS, + DLOG_ERR, DB_IO, task->dt_result, + "DAOS_OBJ_RPC_COLL_PUNCH RPC %p for "DF_UOID" in "DF_UUID"/"DF_UUID"/" + DF_UUID" with DTX "DF_DTI" for task %p, map_ver %u/%u, flags %lx/%x, %s layout", + rpc, DP_UOID(ocpi->ocpi_oid), DP_UUID(ocpi->ocpi_po_uuid), + DP_UUID(ocpi->ocpi_co_hdl), DP_UUID(ocpi->ocpi_co_uuid), DP_DTI(&ocpi->ocpi_xid), + task, ocpi->ocpi_map_ver, *cb_args->cpca_ver, + (unsigned long)ocpi->ocpi_api_flags, ocpi->ocpi_flags, cb_args->cpca_shard_args->pa_coa.coa_raw_sparse ? 
"sparse" : "continuous"); crt_req_decref(rpc); diff --git a/src/object/obj_internal.h b/src/object/obj_internal.h index 06cfdb5b195..c0df21dd009 100644 --- a/src/object/obj_internal.h +++ b/src/object/obj_internal.h @@ -1100,7 +1100,7 @@ int daos_obj_query_merge(struct obj_query_merge_args *oqma); void obj_coll_disp_init(uint32_t tgt_nr, uint32_t max_tgt_size, uint32_t inline_size, uint32_t start, uint32_t max_width, struct obj_coll_disp_cursor *ocdc); void obj_coll_disp_dest(struct obj_coll_disp_cursor *ocdc, struct daos_coll_target *tgts, - crt_endpoint_t *tgt_ep); + crt_endpoint_t *tgt_ep, daos_obj_id_t oid); void obj_coll_disp_move(struct obj_coll_disp_cursor *ocdc); int obj_utils_init(void); void obj_utils_fini(void); diff --git a/src/object/obj_utils.c b/src/object/obj_utils.c index 82d91c966ac..c01947a05a1 100644 --- a/src/object/obj_utils.c +++ b/src/object/obj_utils.c @@ -616,23 +616,22 @@ obj_coll_disp_init(uint32_t tgt_nr, uint32_t max_tgt_size, uint32_t inline_size, void obj_coll_disp_dest(struct obj_coll_disp_cursor *ocdc, struct daos_coll_target *tgts, - crt_endpoint_t *tgt_ep) + crt_endpoint_t *tgt_ep, daos_obj_id_t oid) { struct daos_coll_target *dct = &tgts[ocdc->cur_pos]; struct daos_coll_target tmp; - unsigned long rand = 0; uint32_t size; int pos; int i; if (ocdc->cur_step > 2) { - rand = d_rand(); /* - * Randomly choose an engine as the relay one for load balance. - * If the one corresponding to "pos" is former moved one, then - * use the "cur_pos" as the relay engine. + * Choose an engine (according to the given oid) as the relay one for load balance. + * If the one corresponding to "pos" is former moved one, then use the "cur_pos" as + * the relay engine. Then even if related RPC was resent without changing pool map, + * then the relay one will be the same as the original case. */ - pos = rand % (ocdc->tgt_nr - ocdc->cur_pos) + ocdc->cur_pos; + pos = oid.lo % (ocdc->tgt_nr - ocdc->cur_pos) + ocdc->cur_pos; if (pos > ocdc->cur_pos && tgts[pos].dct_rank > dct->dct_rank) { memcpy(&tmp, &tgts[pos], sizeof(tmp)); memcpy(&tgts[pos], dct, sizeof(tmp)); @@ -642,8 +641,8 @@ obj_coll_disp_dest(struct obj_coll_disp_cursor *ocdc, struct daos_coll_target *t size = dct->dct_bitmap_sz << 3; - /* Randomly choose a XS as the local leader on target engine for load balance. */ - for (i = 0, pos = (rand != 0 ? rand : d_rand()) % dct->dct_tgt_nr; i < size; i++) { + /* Choose a target as the local agent on the engine for load balance. 
*/ + for (i = 0, pos = oid.lo % dct->dct_tgt_nr; i < size; i++) { if (isset(dct->dct_bitmap, i)) { pos -= dct->dct_shards[i].dcs_nr; if (pos < 0) diff --git a/src/object/srv_ec_aggregate.c b/src/object/srv_ec_aggregate.c index 71c630fa947..3e31067f7d2 100644 --- a/src/object/srv_ec_aggregate.c +++ b/src/object/srv_ec_aggregate.c @@ -1010,7 +1010,14 @@ agg_diff_preprocess(struct ec_agg_entry *entry, unsigned char *diff, hole_off = 0; d_list_for_each_entry(extent, &entry->ae_cur_stripe.as_dextents, ae_link) { - D_ASSERT(!extent->ae_hole); + if (extent->ae_hole) { + /* valid hole processed by agg_process_holes_ult() */ + D_ASSERTF(extent->ae_epoch < entry->ae_par_extent.ape_epoch, + "hole ext epoch " DF_X64 ", parity epoch " DF_X64 "\n", + extent->ae_epoch, entry->ae_par_extent.ape_epoch); + continue; + } + if (extent->ae_epoch <= entry->ae_par_extent.ape_epoch) continue; D_ASSERT(extent->ae_recx.rx_idx >= ss); @@ -2667,8 +2674,13 @@ cont_ec_aggregate_cb(struct ds_cont_child *cont, daos_epoch_range_t *epr, struct ec_agg_param *ec_agg_param = agg_param->ap_data; vos_iter_param_t iter_param = { 0 }; struct vos_iter_anchors anchors = { 0 }; + struct dtx_handle *dth = NULL; + struct dtx_share_peer *dsp; + struct dtx_id dti = { 0 }; + struct dtx_epoch epoch = { 0 }; + daos_unit_oid_t oid = { 0 }; + int blocks = 0; int rc = 0; - int blocks = 0; /* * Avoid calling into vos_aggregate() when aborting aggregation @@ -2715,8 +2727,32 @@ cont_ec_aggregate_cb(struct ds_cont_child *cont, daos_epoch_range_t *epr, agg_reset_entry(&ec_agg_param->ap_agg_entry, NULL, NULL); retry: + epoch.oe_value = epr->epr_hi; + rc = dtx_begin(cont->sc_hdl, &dti, &epoch, 0, cont->sc_pool->spc_map_version, &oid, + NULL, 0, 0, NULL, &dth); + if (rc != 0) + goto update_hae; + rc = vos_iterate(&iter_param, VOS_ITER_OBJ, true, &anchors, agg_iterate_pre_cb, - agg_iterate_post_cb, ec_agg_param, NULL); + agg_iterate_post_cb, ec_agg_param, dth); + if (rc == -DER_INPROGRESS && !d_list_empty(&dth->dth_share_tbd_list)) { + uint64_t now = daos_gettime_coarse(); + + /* Report warning per each 10 seconds to avoid log flood. */ + if (now - cont->sc_ec_agg_busy_ts > 10) { + while ((dsp = d_list_pop_entry(&dth->dth_share_tbd_list, + struct dtx_share_peer, dsp_link)) != NULL) { + D_WARN(DF_CONT ": EC aggregate hit non-committed DTX " DF_DTI "\n", + DP_CONT(cont->sc_pool->spc_uuid, cont->sc_uuid), + DP_DTI(&dsp->dsp_xid)); + dtx_dsp_free(dsp); + } + + cont->sc_ec_agg_busy_ts = now; + } + } + + dtx_end(dth, cont, rc); /* Post_cb may not being executed in some cases */ agg_clear_extents(&ec_agg_param->ap_agg_entry); diff --git a/src/object/srv_obj.c b/src/object/srv_obj.c index 041ea903c4f..cdd6f4ffa67 100644 --- a/src/object/srv_obj.c +++ b/src/object/srv_obj.c @@ -2953,8 +2953,11 @@ ds_obj_rw_handler(crt_rpc_t *rpc) d_tm_inc_counter(opm->opm_update_resent, 1); -again1: - e = 0; +again: + if (flags & ORF_RESEND) + e = orw->orw_epoch; + else + e = 0; rc = dtx_handle_resend(ioc.ioc_vos_coh, &orw->orw_dti, &e, &version); switch (rc) { @@ -2965,8 +2968,13 @@ ds_obj_rw_handler(crt_rpc_t *rpc) orw->orw_epoch = e; /* TODO: Also recover the epoch uncertainty. 
*/ break; + case -DER_MISMATCH: + rc = vos_dtx_abort(ioc.ioc_vos_coh, &orw->orw_dti, e); + if (rc < 0 && rc != -DER_NONEXIST) + D_GOTO(out, rc); + /* Fall through */ case -DER_NONEXIST: - rc = 0; + flags = 0; break; default: D_GOTO(out, rc); @@ -2976,7 +2984,6 @@ ds_obj_rw_handler(crt_rpc_t *rpc) D_GOTO(out, rc); } -again2: /* For leader case, we need to find out the potential conflict * (or share the same non-committed object/dkey) DTX(s) in the * CoS (committable) cache, piggyback them via the dispdatched @@ -3021,7 +3028,7 @@ ds_obj_rw_handler(crt_rpc_t *rpc) exec_arg.rpc = rpc; exec_arg.ioc = &ioc; - exec_arg.flags = flags; + exec_arg.flags |= flags; exec_arg.start = orw->orw_start_shard; /* Execute the operation on all targets */ @@ -3036,28 +3043,25 @@ ds_obj_rw_handler(crt_rpc_t *rpc) case -DER_TX_RESTART: /* * If this is a standalone operation, we can restart the - * internal transaction right here. Otherwise, we have to defer - * the restart to the RPC client. + * internal transaction right here. Otherwise we have to + * defer the restart to the RPC sponsor. */ - if (opc == DAOS_OBJ_RPC_UPDATE) { - /* - * Only standalone updates use this RPC. Retry with - * newer epoch. - */ - orw->orw_epoch = d_hlc_get(); - orw->orw_flags &= ~ORF_RESEND; - flags = 0; - d_tm_inc_counter(opm->opm_update_restart, 1); - goto again2; - } + if (opc != DAOS_OBJ_RPC_UPDATE) + break; - break; + /* Only standalone updates use this RPC. Retry with newer epoch. */ + orw->orw_epoch = d_hlc_get(); + exec_arg.flags |= ORF_RESEND; + flags = ORF_RESEND; + d_tm_inc_counter(opm->opm_update_restart, 1); + goto again; case -DER_AGAIN: - orw->orw_flags |= ORF_RESEND; need_abort = true; + exec_arg.flags |= ORF_RESEND; + flags = ORF_RESEND; d_tm_inc_counter(opm->opm_update_retry, 1); ABT_thread_yield(); - goto again1; + goto again; default: break; } @@ -3875,8 +3879,11 @@ ds_obj_punch_handler(crt_rpc_t *rpc) if (opi->opi_flags & ORF_RESEND) { daos_epoch_t e; -again1: - e = 0; +again: + if (flags & ORF_RESEND) + e = opi->opi_epoch; + else + e = 0; rc = dtx_handle_resend(ioc.ioc_vos_coh, &opi->opi_dti, &e, &version); switch (rc) { @@ -3887,8 +3894,13 @@ ds_obj_punch_handler(crt_rpc_t *rpc) flags |= ORF_RESEND; /* TODO: Also recovery the epoch uncertainty. */ break; + case -DER_MISMATCH: + rc = vos_dtx_abort(ioc.ioc_vos_coh, &opi->opi_dti, e); + if (rc < 0 && rc != -DER_NONEXIST) + D_GOTO(out, rc); + /* Fall through */ case -DER_NONEXIST: - rc = 0; + flags = 0; break; default: D_GOTO(out, rc); @@ -3898,7 +3910,6 @@ ds_obj_punch_handler(crt_rpc_t *rpc) goto cleanup; } -again2: /* For leader case, we need to find out the potential conflict * (or share the same non-committed object/dkey) DTX(s) in the * CoS (committable) cache, piggyback them via the dispdatched @@ -3943,7 +3954,7 @@ ds_obj_punch_handler(crt_rpc_t *rpc) exec_arg.rpc = rpc; exec_arg.ioc = &ioc; - exec_arg.flags = flags; + exec_arg.flags |= flags; /* Execute the operation on all shards */ if (opi->opi_api_flags & DAOS_COND_PUNCH) @@ -3959,19 +3970,17 @@ ds_obj_punch_handler(crt_rpc_t *rpc) rc = dtx_leader_end(dlh, ioc.ioc_coh, rc); switch (rc) { case -DER_TX_RESTART: - /* - * Only standalone punches use this RPC. Retry with newer - * epoch. - */ + /* Only standalone punches use this RPC. Retry with newer epoch. 
*/ opi->opi_epoch = d_hlc_get(); - opi->opi_flags &= ~ORF_RESEND; - flags = 0; - goto again2; + exec_arg.flags |= ORF_RESEND; + flags = ORF_RESEND; + goto again; case -DER_AGAIN: - opi->opi_flags |= ORF_RESEND; need_abort = true; + exec_arg.flags |= ORF_RESEND; + flags = ORF_RESEND; ABT_thread_yield(); - goto again1; + goto again; default: break; } @@ -4401,6 +4410,44 @@ obj_cpd_reply(crt_rpc_t *rpc, int status, uint32_t map_version) oco->oco_sub_epochs.ca_count = 0; } +static inline void +cpd_unpin_objects(daos_handle_t coh, struct vos_pin_handle *pin_hdl) +{ + if (pin_hdl != NULL) + vos_unpin_objects(coh, pin_hdl); +} + +static int +cpd_pin_objects(daos_handle_t coh, struct daos_cpd_sub_req *dcsrs, + struct daos_cpd_req_idx *dcri, int count, struct vos_pin_handle **pin_hdl) +{ + struct daos_cpd_sub_req *dcsr; + daos_unit_oid_t *oids; + int i, rc; + + if (count == 0) + return 0; + + D_ALLOC_ARRAY(oids, count); + if (oids == NULL) + return -DER_NOMEM; + + for (i = 0; i < count; i++) { + dcsr = &dcsrs[dcri[i].dcri_req_idx]; + dcsr->dcsr_oid.id_shard = dcri[i].dcri_shard_id; + + D_ASSERT(dcsr->dcsr_opc != DCSO_READ); + oids[i] = dcsr->dcsr_oid; + } + + rc = vos_pin_objects(coh, oids, count, pin_hdl); + if (rc) + DL_ERROR(rc, "Failed to pin CPD objects."); + + D_FREE(oids); + return rc; +} + /* Locally process the operations belong to one DTX. * Common logic, shared by both leader and non-leader. */ @@ -4438,6 +4485,7 @@ ds_cpd_handle_one(crt_rpc_t *rpc, struct daos_cpd_sub_head *dcsh, struct daos_cp int i; uint64_t update_flags; uint64_t sched_seq = sched_cur_seq(); + struct vos_pin_handle *pin_hdl = NULL; if (dth->dth_flags & DTE_LEADER && DAOS_FAIL_CHECK(DAOS_DTX_RESTART)) @@ -4500,6 +4548,12 @@ ds_cpd_handle_one(crt_rpc_t *rpc, struct daos_cpd_sub_head *dcsh, struct daos_cp } } + rc = cpd_pin_objects(ioc->ioc_vos_coh, dcsrs, dcri, dcde->dcde_write_cnt, &pin_hdl); + if (rc) { + DL_ERROR(rc, "Failed to pin objects."); + goto out; + } + /* P2: vos_update_begin. */ for (i = 0; i < dcde->dcde_write_cnt; i++) { dcsr = &dcsrs[dcri[i].dcri_req_idx]; @@ -4820,6 +4874,8 @@ ds_cpd_handle_one(crt_rpc_t *rpc, struct daos_cpd_sub_head *dcsh, struct daos_cp } } + cpd_unpin_objects(ioc->ioc_vos_coh, pin_hdl); + D_FREE(iohs); D_FREE(biods); D_FREE(bulks); @@ -5663,8 +5719,11 @@ ds_obj_coll_punch_handler(crt_rpc_t *rpc) if (ocpi->ocpi_flags & ORF_RESEND) { -again1: - tmp = 0; +again: + if (!(ocpi->ocpi_flags & ORF_LEADER) || (flags & ORF_RESEND)) + tmp = ocpi->ocpi_epoch; + else + tmp = 0; rc = dtx_handle_resend(ioc.ioc_vos_coh, &ocpi->ocpi_xid, &tmp, &version); switch (rc) { case -DER_ALREADY: @@ -5674,7 +5733,13 @@ ds_obj_coll_punch_handler(crt_rpc_t *rpc) flags |= ORF_RESEND; /* TODO: Also recovery the epoch uncertainty. 
*/ break; + case -DER_MISMATCH: + rc = vos_dtx_abort(ioc.ioc_vos_coh, &ocpi->ocpi_xid, tmp); + if (rc < 0 && rc != -DER_NONEXIST) + D_GOTO(out, rc); + /* Fall through */ case -DER_NONEXIST: + flags = 0; break; default: D_GOTO(out, rc); @@ -5683,7 +5748,6 @@ ds_obj_coll_punch_handler(crt_rpc_t *rpc) dce->dce_ver = version; } -again2: epoch.oe_value = ocpi->ocpi_epoch; epoch.oe_first = epoch.oe_value; epoch.oe_flags = orf_to_dtx_epoch_flags(ocpi->ocpi_flags); @@ -5695,7 +5759,7 @@ ds_obj_coll_punch_handler(crt_rpc_t *rpc) exec_arg.rpc = rpc; exec_arg.ioc = &ioc; - exec_arg.flags = flags; + exec_arg.flags |= flags; exec_arg.coll_shards = dcts[0].dct_shards; exec_arg.coll_tgts = dcts; obj_coll_disp_init(dct_nr, ocpi->ocpi_max_tgt_sz, @@ -5728,14 +5792,15 @@ ds_obj_coll_punch_handler(crt_rpc_t *rpc) switch (rc) { case -DER_TX_RESTART: ocpi->ocpi_epoch = d_hlc_get(); - ocpi->ocpi_flags &= ~ORF_RESEND; - flags = 0; - goto again2; + exec_arg.flags |= ORF_RESEND; + flags = ORF_RESEND; + goto again; case -DER_AGAIN: - ocpi->ocpi_flags |= ORF_RESEND; need_abort = true; + exec_arg.flags |= ORF_RESEND; + flags = ORF_RESEND; ABT_thread_yield(); - goto again1; + goto again; default: break; } @@ -5755,12 +5820,14 @@ ds_obj_coll_punch_handler(crt_rpc_t *rpc) max_ver = version; DL_CDEBUG(rc != 0 && rc != -DER_INPROGRESS && rc != -DER_TX_RESTART, DLOG_ERR, DB_IO, rc, - "(%s) handled collective punch RPC %p for obj "DF_UOID" on XS %u/%u epc " - DF_X64" pmv %u/%u, with dti "DF_DTI", bulk_tgt_sz %u, bulk_tgt_nr %u, " - "tgt_nr %u, forward width %u, forward depth %u, flags %x", + "(%s) handled collective punch RPC %p for obj "DF_UOID" on XS %u/%u in "DF_UUID"/" + DF_UUID"/"DF_UUID" with epc "DF_X64", pmv %u/%u, dti "DF_DTI", bulk_tgt_sz %u, " + "bulk_tgt_nr %u, tgt_nr %u, forward width %u, forward depth %u, flags %x", (ocpi->ocpi_flags & ORF_LEADER) ? "leader" : (ocpi->ocpi_tgts.ca_count == 1 ? 
"non-leader" : "relay-engine"), rpc, - DP_UOID(ocpi->ocpi_oid), dmi->dmi_xs_id, dmi->dmi_tgt_id, ocpi->ocpi_epoch, + DP_UOID(ocpi->ocpi_oid), dmi->dmi_xs_id, dmi->dmi_tgt_id, + DP_UUID(ocpi->ocpi_po_uuid), DP_UUID(ocpi->ocpi_co_hdl), + DP_UUID(ocpi->ocpi_co_uuid), ocpi->ocpi_epoch, ocpi->ocpi_map_ver, max_ver, DP_DTI(&ocpi->ocpi_xid), ocpi->ocpi_bulk_tgt_sz, ocpi->ocpi_bulk_tgt_nr, (unsigned int)ocpi->ocpi_tgts.ca_count, ocpi->ocpi_disp_width, ocpi->ocpi_disp_depth, ocpi->ocpi_flags); diff --git a/src/object/srv_obj_remote.c b/src/object/srv_obj_remote.c index ce06723621b..f64d851e5b4 100644 --- a/src/object/srv_obj_remote.c +++ b/src/object/srv_obj_remote.c @@ -136,7 +136,7 @@ ds_obj_remote_update(struct dtx_leader_handle *dlh, void *data, int idx, *orw = *orw_parent; orw->orw_oid.id_shard = shard_tgt->st_shard_id; - orw->orw_flags |= ORF_BULK_BIND | obj_exec_arg->flags; + orw->orw_flags |= (ORF_BULK_BIND | obj_exec_arg->flags) & ~ORF_LEADER; if (shard_tgt->st_flags & DTF_DELAY_FORWARD && dlh->dlh_drop_cond) orw->orw_api_flags &= ~DAOS_COND_MASK; orw->orw_dti_cos.ca_count = dth->dth_dti_cos_count; @@ -247,7 +247,7 @@ ds_obj_remote_punch(struct dtx_leader_handle *dlh, void *data, int idx, *opi = *opi_parent; opi->opi_oid.id_shard = shard_tgt->st_shard_id; - opi->opi_flags |= obj_exec_arg->flags; + opi->opi_flags |= obj_exec_arg->flags & ~ORF_LEADER; if (shard_tgt->st_flags & DTF_DELAY_FORWARD && dlh->dlh_drop_cond) opi->opi_api_flags &= ~DAOS_COND_PUNCH; opi->opi_dti_cos.ca_count = dth->dth_dti_cos_count; @@ -495,7 +495,7 @@ ds_obj_coll_punch_remote(struct dtx_leader_handle *dlh, void *data, int idx, crt_endpoint_t tgt_ep = { 0 }; crt_rpc_t *parent_req = exec_arg->rpc; crt_rpc_t *req; - struct obj_coll_punch_in *ocpi_parent; + struct obj_coll_punch_in *ocpi_parent = crt_req_get(parent_req); struct obj_coll_punch_in *ocpi; int tag; int rc = 0; @@ -509,7 +509,7 @@ ds_obj_coll_punch_remote(struct dtx_leader_handle *dlh, void *data, int idx, if (remote_arg == NULL) D_GOTO(out, rc = -DER_NOMEM); - obj_coll_disp_dest(cursor, exec_arg->coll_tgts, &tgt_ep); + obj_coll_disp_dest(cursor, exec_arg->coll_tgts, &tgt_ep, ocpi_parent->ocpi_oid.id_pub); tag = tgt_ep.ep_tag; crt_req_addref(parent_req); @@ -524,9 +524,7 @@ ds_obj_coll_punch_remote(struct dtx_leader_handle *dlh, void *data, int idx, D_GOTO(out, rc); } - ocpi_parent = crt_req_get(parent_req); ocpi = crt_req_get(req); - ocpi->ocpi_odm = ocpi_parent->ocpi_odm; uuid_copy(ocpi->ocpi_po_uuid, ocpi_parent->ocpi_po_uuid); uuid_copy(ocpi->ocpi_co_hdl, ocpi_parent->ocpi_co_hdl); @@ -634,7 +632,7 @@ ds_obj_coll_query_remote(struct dtx_leader_handle *dlh, void *data, int idx, if (remote_arg == NULL) D_GOTO(out, rc = -DER_NOMEM); - obj_coll_disp_dest(cursor, exec_arg->coll_tgts, &tgt_ep); + obj_coll_disp_dest(cursor, exec_arg->coll_tgts, &tgt_ep, ocqi_parent->ocqi_oid.id_pub); tag = tgt_ep.ep_tag; remote_arg->dlh = dlh; diff --git a/src/pool/srv_target.c b/src/pool/srv_target.c index a3125152013..988680b3e76 100644 --- a/src/pool/srv_target.c +++ b/src/pool/srv_target.c @@ -447,8 +447,10 @@ pool_child_recreate(struct ds_pool_child *child) goto pool_info; } - rc = vos_pool_create(path, child->spc_uuid, 0, pool_info->spi_blob_sz[SMD_DEV_TYPE_DATA], - 0, 0 /* version */, NULL); + rc = vos_pool_create(path, child->spc_uuid, 0 /* scm_sz */, + pool_info->spi_blob_sz[SMD_DEV_TYPE_DATA], + pool_info->spi_blob_sz[SMD_DEV_TYPE_META], + 0 /* flags */, 0 /* version */, NULL); if (rc) DL_ERROR(rc, DF_UUID": Create VOS pool failed.", DP_UUID(child->spc_uuid)); diff 
--git a/src/proto/ctl/storage_nvme.proto b/src/proto/ctl/storage_nvme.proto index be068e5274d..81864169e33 100644 --- a/src/proto/ctl/storage_nvme.proto +++ b/src/proto/ctl/storage_nvme.proto @@ -28,7 +28,8 @@ message ScanNvmeReq { bool Basic = 3; // Strip NVMe device details to only basic uint64 MetaSize = 4; // Size of the metadata blob uint64 RdbSize = 5; // Size of the RDB blob - bool LinkStats = 6; // Populate PCIe link info in health statistics + float MemRatio = 6; // Ratio of VOS-file:meta-blob sizes + bool LinkStats = 7; // Populate PCIe link info in health statistics } message ScanNvmeResp { diff --git a/src/proto/mgmt/pool.proto b/src/proto/mgmt/pool.proto index e65374afaec..ad6920bbf6a 100644 --- a/src/proto/mgmt/pool.proto +++ b/src/proto/mgmt/pool.proto @@ -32,6 +32,7 @@ message PoolCreateReq { uint32 num_ranks = 11; // Number of target ranks to use repeated uint32 ranks = 12; // target ranks repeated uint64 tier_bytes = 13; // Size in bytes of storage tier + float mem_ratio = 14; // Fraction of meta-blob-sz to use as mem-file-sz } // PoolCreateResp returns created pool uuid and ranks. @@ -40,7 +41,8 @@ message PoolCreateResp { uint32 svc_ldr = 2; // Current service leader rank repeated uint32 svc_reps = 3; // pool service replica ranks repeated uint32 tgt_ranks = 4; // pool target ranks - repeated uint64 tier_bytes = 5; // storage tiers allocated to pool + repeated uint64 tier_bytes = 5; // per-rank storage tier sizes allocated in pool + uint64 mem_file_bytes = 6; // per-rank accumulated value of memory file sizes } // PoolDestroyReq supplies pool identifier and force flag. @@ -116,6 +118,7 @@ message PoolExtendReq { message PoolExtendResp { int32 status = 1; // DAOS error code repeated uint64 tier_bytes = 2; // storage tiers allocated to pool + uint32 meta_blob_bytes = 3; // Size in bytes of metadata blob on SSD } // PoolReintegrateReq supplies pool identifier, rank, and target_idxs. @@ -235,7 +238,8 @@ message PoolQueryResp { PoolServiceState state = 17; // pool state uint32 svc_ldr = 18; // current raft leader (2.6+) repeated uint32 svc_reps = 19; // service replica ranks - uint64 query_mask = 20; // Bitmask of pool query options used + uint64 query_mask = 20; // Bitmask of pool query options used + uint64 mem_file_bytes = 21; // per-pool accumulated value of memory file sizes } message PoolProperty { @@ -326,6 +330,7 @@ message PoolQueryTargetInfo { TargetState state = 2; // target state see enum daos_target_state_t // TODO: target performance data repeated StorageTargetUsage space = 3; // this target's usage per storage tier + uint64 mem_file_bytes = 4; // per-target value of memory file size } // PoolQueryTargetResp represents a pool target query response diff --git a/src/rdb/rdb.c b/src/rdb/rdb.c index 7ca0879ed3a..ccae3e3f647 100644 --- a/src/rdb/rdb.c +++ b/src/rdb/rdb.c @@ -58,7 +58,7 @@ rdb_create(const char *path, const uuid_t uuid, uint64_t caller_term, size_t siz * basic system memory reservation and VOS_POF_EXCL for concurrent * access protection. 
*/ - rc = vos_pool_create(path, (unsigned char *)uuid, size, 0 /* nvme_sz */, + rc = vos_pool_create(path, (unsigned char *)uuid, size, 0 /* data_sz */, 0 /* meta_sz */, VOS_POF_SMALL | VOS_POF_EXCL | VOS_POF_RDB, vos_df_version, &pool); if (rc != 0) goto out; diff --git a/src/tests/ftest/control/dmg_pool_query_test.py b/src/tests/ftest/control/dmg_pool_query_test.py index b7c83b59b55..593862a0ea0 100644 --- a/src/tests/ftest/control/dmg_pool_query_test.py +++ b/src/tests/ftest/control/dmg_pool_query_test.py @@ -97,7 +97,11 @@ def test_pool_query_basic(self): "tier_name": "NVME", "size": self.params.get("total", path="/run/exp_vals/nvme/*") } - ] + ], + "mem_file_bytes": ( + self.params.get("total", path="/run/exp_vals/scm/*") if + self.server_managers[0].manager.job.using_control_metadata else + 0) } self.assertDictEqual( diff --git a/src/tests/ftest/pool/list_verbose.py b/src/tests/ftest/pool/list_verbose.py index 370ca81ad40..a46dfb73408 100644 --- a/src/tests/ftest/pool/list_verbose.py +++ b/src/tests/ftest/pool/list_verbose.py @@ -108,7 +108,12 @@ def create_expected(self, pool, scm_free, nvme_free, scm_imbalance, "size": nvme_size, "free": nvme_free, "imbalance": nvme_imbalance - }], + }, + ], + "mem_file_bytes": ( + scm_size if + self.server_managers[0].manager.job.using_control_metadata else + 0) } @staticmethod diff --git a/src/tests/ftest/util/dfuse_utils.py b/src/tests/ftest/util/dfuse_utils.py index a26f372e76d..900da63ebf1 100644 --- a/src/tests/ftest/util/dfuse_utils.py +++ b/src/tests/ftest/util/dfuse_utils.py @@ -30,7 +30,6 @@ def __init__(self, namespace, command, path=""): self.sys_name = FormattedParameter("--sys-name {}") self.thread_count = FormattedParameter("--thread-count {}") self.eq_count = FormattedParameter("--eq-count {}") - self.singlethreaded = FormattedParameter("--singlethread", False) self.foreground = FormattedParameter("--foreground", False) self.enable_caching = FormattedParameter("--enable-caching", False) self.enable_wb_cache = FormattedParameter("--enable-wb-cache", False) diff --git a/src/tests/ftest/util/dmg_utils.py b/src/tests/ftest/util/dmg_utils.py index a48b45e59dd..271560275f5 100644 --- a/src/tests/ftest/util/dmg_utils.py +++ b/src/tests/ftest/util/dmg_utils.py @@ -601,8 +601,11 @@ def pool_create(self, scm_size, uid=None, gid=None, nvme_size=None, # 0, # 1 # ], - # "scm_bytes": 256000000, - # "nvme_bytes": 0 + # "tier_bytes": [ + # 256000000, + # 0 + # ], + # "mem_file_bytes": 0 # }, # "error": null, # "status": 0 @@ -622,6 +625,7 @@ def pool_create(self, scm_size, uid=None, gid=None, nvme_size=None, data["ranks"] = ",".join([str(r) for r in output["response"]["tgt_ranks"]]) data["scm_per_rank"] = output["response"]["tier_bytes"][0] data["nvme_per_rank"] = output["response"]["tier_bytes"][1] + data["memfile_per_rank"] = output["response"]["mem_file_bytes"] return data diff --git a/src/tests/ftest/util/telemetry_utils.py b/src/tests/ftest/util/telemetry_utils.py index 70d2352881d..fb6b37d9ef3 100644 --- a/src/tests/ftest/util/telemetry_utils.py +++ b/src/tests/ftest/util/telemetry_utils.py @@ -427,7 +427,7 @@ class TelemetryUtils(): ENGINE_NVME_CRIT_WARN_METRICS +\ ENGINE_NVME_INTEL_VENDOR_METRICS ENGINE_MEM_USAGE_METRICS = [ - "engine_mem_vos_vos_obj_360", + "engine_mem_vos_vos_obj_408", "engine_mem_vos_vos_lru_size", "engine_mem_dtx_dtx_leader_handle_360"] ENGINE_MEM_TOTAL_USAGE_METRICS = [ diff --git a/src/tests/vos_engine.c b/src/tests/vos_engine.c index 077dd4f061b..33fe7068ffc 100644 --- a/src/tests/vos_engine.c +++ 
b/src/tests/vos_engine.c @@ -32,7 +32,8 @@ engine_pool_init(struct credit_context *tsc) if (tsc_create_pool(tsc)) { /* Use pool size as blob size for this moment. */ - rc = vos_pool_create(pmem_file, tsc->tsc_pool_uuid, 0, tsc->tsc_nvme_size, 0, + rc = vos_pool_create(pmem_file, tsc->tsc_pool_uuid, 0 /* scm_sz */, + tsc->tsc_nvme_size, 0 /* meta_sz */, 0 /* flags */, 0 /* version */, &poh); if (rc) return rc; diff --git a/src/utils/ddb/ddb_vos.c b/src/utils/ddb/ddb_vos.c index b331e830fdd..ff52e052247 100644 --- a/src/utils/ddb/ddb_vos.c +++ b/src/utils/ddb/ddb_vos.c @@ -1746,7 +1746,7 @@ sync_cb(struct ddbs_sync_info *info, void *cb_args) D_WARN("delete target failed: " DF_RC "\n", DP_RC(rc)); rc = smd_pool_add_tgt(pool_id, info->dsi_hdr->bbh_vos_id, - info->dsi_hdr->bbh_blob_id, st, blob_size); + info->dsi_hdr->bbh_blob_id, st, blob_size, 0); if (!SUCCESS(rc)) { D_ERROR("add target failed: "DF_RC"\n", DP_RC(rc)); args->sync_rc = rc; diff --git a/src/utils/ddb/tests/ddb_test_driver.c b/src/utils/ddb/tests/ddb_test_driver.c index 2f9c24f931f..e88e045120f 100644 --- a/src/utils/ddb/tests/ddb_test_driver.c +++ b/src/utils/ddb/tests/ddb_test_driver.c @@ -243,7 +243,9 @@ ddb_test_pool_setup(struct dt_vos_pool_ctx *tctx) return rc; } - rc = vos_pool_create(tctx->dvt_pmem_file, tctx->dvt_pool_uuid, 0, 0, 0, 0, NULL); + rc = vos_pool_create(tctx->dvt_pmem_file, tctx->dvt_pool_uuid, 0 /* scm_sz */, + 0 /* data_sz */, 0 /* meta_sz */, 0 /* flags */, 0 /* version */, + NULL); if (rc) { close(tctx->dvt_fd); return rc; diff --git a/src/vos/evtree.c b/src/vos/evtree.c index d635453f8b2..59f8855c3c1 100644 --- a/src/vos/evtree.c +++ b/src/vos/evtree.c @@ -1443,8 +1443,9 @@ evt_node_alloc(struct evt_context *tcx, unsigned int flags, struct evt_node *nd; umem_off_t nd_off; bool leaf = (flags & EVT_NODE_LEAF); + struct vos_object *obj = tcx->tc_desc_cbs.dc_alloc_arg; - nd_off = umem_zalloc(evt_umm(tcx), evt_node_size(tcx, leaf)); + nd_off = vos_obj_alloc(evt_umm(tcx), obj, evt_node_size(tcx, leaf), true); if (UMOFF_IS_NULL(nd_off)) return -DER_NOSPACE; @@ -3249,8 +3250,9 @@ evt_common_insert(struct evt_context *tcx, struct evt_node *nd, } if (leaf) { - umem_off_t desc_off; - uint32_t csum_buf_size = 0; + umem_off_t desc_off; + uint32_t csum_buf_size = 0; + struct vos_object *obj = tcx->tc_desc_cbs.dc_alloc_arg; if (ci_is_valid(&ent->ei_csum)) csum_buf_size = ci_csums_len(ent->ei_csum); @@ -3263,7 +3265,7 @@ evt_common_insert(struct evt_context *tcx, struct evt_node *nd, D_DEBUG(DB_TRACE, "Allocating an extra %d bytes " "for checksum", csum_buf_size); } - desc_off = umem_zalloc(evt_umm(tcx), desc_size); + desc_off = vos_obj_alloc(evt_umm(tcx), obj, desc_size, true); if (UMOFF_IS_NULL(desc_off)) return -DER_NOSPACE; diff --git a/src/vos/sys_db.c b/src/vos/sys_db.c index d1f6d4bce98..afac65f2879 100644 --- a/src/vos/sys_db.c +++ b/src/vos/sys_db.c @@ -129,8 +129,8 @@ db_open_create(struct sys_db *db, bool try_create) } D_DEBUG(DB_IO, "Opening %s, try_create=%d\n", vdb->db_file, try_create); if (try_create) { - rc = vos_pool_create(vdb->db_file, vdb->db_pool, SYS_DB_SIZE, 0, VOS_POF_SYSDB, - 0 /* version */, &vdb->db_poh); + rc = vos_pool_create(vdb->db_file, vdb->db_pool, SYS_DB_SIZE, 0 /* data_sz */, + 0 /* meta_sz */, VOS_POF_SYSDB, 0 /* version */, &vdb->db_poh); if (rc) { D_CRIT("sys pool create error: "DF_RC"\n", DP_RC(rc)); goto failed; diff --git a/src/vos/tests/pool_scrubbing_tests.c b/src/vos/tests/pool_scrubbing_tests.c index c6fc20b3716..a7111045b73 100644 --- a/src/vos/tests/pool_scrubbing_tests.c 
+++ b/src/vos/tests/pool_scrubbing_tests.c @@ -225,8 +225,8 @@ sts_ctx_pool_init(struct sts_context *ctx) } /* Use pool size as blob size for this moment. */ - rc = vos_pool_create(pmem_file, ctx->tsc_pool_uuid, 0, ctx->tsc_nvme_size, 0, - 0 /* version */, &poh); + rc = vos_pool_create(pmem_file, ctx->tsc_pool_uuid, 0 /* scm_sz */, ctx->tsc_nvme_size, + 0 /* meta_sz */, 0 /* flags */, 0 /* version */, &poh); assert_success(rc); ctx->tsc_poh = poh; diff --git a/src/vos/tests/vos_cmd.c b/src/vos/tests/vos_cmd.c index fa8b9e00d0b..4cba4793c52 100644 --- a/src/vos/tests/vos_cmd.c +++ b/src/vos/tests/vos_cmd.c @@ -273,7 +273,8 @@ create_pool(struct cmd_info *cinfo) close(fd); - rc = vos_pool_create(known_pool->kp_path, known_pool->kp_uuid, 0, 0, 0, 0 /* version */, + rc = vos_pool_create(known_pool->kp_path, known_pool->kp_uuid, 0 /* scm_sz */, + 0 /* data_sz */, 0 /* meta_sz */, 0 /* flags */, 0 /* version */, NULL); if (rc != 0) { D_ERROR("Could not create vos pool at %s, rc=" DF_RC "\n", known_pool->kp_path, diff --git a/src/vos/tests/vts_common.c b/src/vos/tests/vts_common.c index 618f9fedddb..0e8902bf5b4 100644 --- a/src/vos/tests/vts_common.c +++ b/src/vos/tests/vts_common.c @@ -1,5 +1,5 @@ /** - * (C) Copyright 2016-2022 Intel Corporation. + * (C) Copyright 2016-2024 Intel Corporation. * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -89,9 +89,9 @@ vts_pool_fallocate(char **fname) } int -vts_ctx_init(struct vos_test_ctx *tcx, size_t psize) +vts_ctx_init_ex(struct vos_test_ctx *tcx, size_t psize, size_t meta_size) { - int rc; + int rc; memset(tcx, 0, sizeof(*tcx)); oid_cnt = 0; @@ -107,8 +107,8 @@ vts_ctx_init(struct vos_test_ctx *tcx, size_t psize) uuid_generate_time_safe(tcx->tc_co_uuid); /* specify @psize as both NVMe size and SCM size */ - rc = vos_pool_create(tcx->tc_po_name, tcx->tc_po_uuid, psize, psize, 0, 0 /* version */, - &tcx->tc_po_hdl); + rc = vos_pool_create(tcx->tc_po_name, tcx->tc_po_uuid, psize, psize, meta_size, + 0 /* flags */, 0 /* version */, &tcx->tc_po_hdl); if (rc) { print_error("vpool create %s failed with error : %d\n", tcx->tc_po_name, rc); @@ -139,6 +139,12 @@ vts_ctx_init(struct vos_test_ctx *tcx, size_t psize) return rc; } +int +vts_ctx_init(struct vos_test_ctx *tcx, size_t psize) +{ + return vts_ctx_init_ex(tcx, psize, 0); +} + void vts_ctx_fini(struct vos_test_ctx *tcx) { @@ -268,8 +274,8 @@ pool_init(struct credit_context *tsc) /* Use pool size as blob size for this moment. */ if (tsc_create_pool(tsc)) { - rc = vos_pool_create(pmem_file, tsc->tsc_pool_uuid, 0, tsc->tsc_nvme_size, 0, - 0 /* version */, &poh); + rc = vos_pool_create(pmem_file, tsc->tsc_pool_uuid, 0, tsc->tsc_nvme_size, + 0 /* meta_sz */, 0 /* flags */, 0 /* version */, &poh); if (rc) goto out; } else { diff --git a/src/vos/tests/vts_common.h b/src/vos/tests/vts_common.h index 2a08cbd8ff8..11529f040b7 100644 --- a/src/vos/tests/vts_common.h +++ b/src/vos/tests/vts_common.h @@ -1,5 +1,5 @@ /** - * (C) Copyright 2016-2023 Intel Corporation. + * (C) Copyright 2016-2024 Intel Corporation. 
* * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -95,6 +95,8 @@ vts_pool_fallocate(char **fname); int vts_ctx_init(struct vos_test_ctx *tcx, size_t pool_size); +int +vts_ctx_init_ex(struct vos_test_ctx *tcx, size_t pool_size, size_t meta_size); void vts_ctx_fini(struct vos_test_ctx *tcx); diff --git a/src/vos/tests/vts_container.c b/src/vos/tests/vts_container.c index 6966ae866d8..d19b11a0101 100644 --- a/src/vos/tests/vts_container.c +++ b/src/vos/tests/vts_container.c @@ -161,8 +161,8 @@ setup(void **state) uuid_generate_time_safe(test_arg->pool_uuid); vts_pool_fallocate(&test_arg->fname); - ret = vos_pool_create(test_arg->fname, test_arg->pool_uuid, 0, 0, 0, 0 /* version */, - &test_arg->poh); + ret = vos_pool_create(test_arg->fname, test_arg->pool_uuid, 0 /* scm_sz */, 0 /* data_sz */, + 0 /* meta_sz */, 0 /* flags */, 0 /* version */, &test_arg->poh); assert_rc_equal(ret, 0); *state = test_arg; return 0; diff --git a/src/vos/tests/vts_io.c b/src/vos/tests/vts_io.c index 2f084a2d99d..ff02abaf1e2 100644 --- a/src/vos/tests/vts_io.c +++ b/src/vos/tests/vts_io.c @@ -898,7 +898,7 @@ io_update_and_fetch_dkey(struct io_test_args *arg, daos_epoch_t update_epoch, static inline int hold_obj(struct vos_container *cont, daos_unit_oid_t oid, daos_epoch_range_t *epr, daos_epoch_t bound, uint64_t flags, uint32_t intent, struct vos_object **obj_p, - struct vos_ts_set *ts_set) + struct vos_ts_set *ts_set, struct umem_instance *umm) { int rc; @@ -908,7 +908,16 @@ hold_obj(struct vos_container *cont, daos_unit_oid_t oid, daos_epoch_range_t *ep if (flags & VOS_OBJ_CREATE) { assert_ptr_not_equal(*obj_p, NULL); + + if (umm != NULL) { + rc = umem_tx_begin(umm, NULL); + assert_rc_equal(rc, 0); + } + rc = vos_obj_incarnate(*obj_p, epr, bound, flags, intent, ts_set); + + if (umm != NULL) + rc = umem_tx_end(umm, rc); } return rc; @@ -926,7 +935,8 @@ hold_objects(struct vos_object **objs, daos_handle_t *coh, daos_unit_oid_t *oid, hold_flags |= VOS_OBJ_VISIBLE; for (i = start; i < end; i++) { rc = hold_obj(vos_hdl2cont(*coh), *oid, &epr, 0, hold_flags, - no_create ? DAOS_INTENT_DEFAULT : DAOS_INTENT_UPDATE, &objs[i], 0); + no_create ? 
DAOS_INTENT_DEFAULT : DAOS_INTENT_UPDATE, + &objs[i], 0, NULL); if (rc != exp_rc) return 1; } @@ -991,7 +1001,8 @@ io_obj_cache_test(void **state) assert_int_equal(rc, 0); uuid_generate_time_safe(pool_uuid); - rc = vos_pool_create(po_name, pool_uuid, VPOOL_256M, 0, 0, 0 /* version */, &l_poh); + rc = vos_pool_create(po_name, pool_uuid, VPOOL_256M, 0 /* data_sz */, 0 /* meta_sz */, + 0 /* flags */, 0 /* version */, &l_poh); assert_rc_equal(rc, 0); rc = vos_cont_create(l_poh, ctx->tc_co_uuid); @@ -1005,82 +1016,72 @@ io_obj_cache_test(void **state) ummg = vos_cont2umm(vos_hdl2cont(ctx->tc_co_hdl)); umml = vos_cont2umm(vos_hdl2cont(l_coh)); - rc = umem_tx_begin(ummg, NULL); - assert_rc_equal(rc, 0); rc = hold_obj(vos_hdl2cont(ctx->tc_co_hdl), oids[0], &epr, 0, - VOS_OBJ_CREATE | VOS_OBJ_VISIBLE, DAOS_INTENT_UPDATE, &objs[0], 0); + VOS_OBJ_CREATE | VOS_OBJ_VISIBLE, DAOS_INTENT_UPDATE, &objs[0], 0, ummg); assert_rc_equal(rc, 0); /** Hold object for discard */ rc = hold_obj(vos_hdl2cont(ctx->tc_co_hdl), oids[0], &epr, 0, VOS_OBJ_DISCARD, - DAOS_INTENT_DISCARD, &obj1, 0); + DAOS_INTENT_DISCARD, &obj1, 0, ummg); assert_rc_equal(rc, 0); /** Second discard should fail */ rc = hold_obj(vos_hdl2cont(ctx->tc_co_hdl), oids[0], &epr, 0, VOS_OBJ_DISCARD, - DAOS_INTENT_DISCARD, &obj2, 0); + DAOS_INTENT_DISCARD, &obj2, 0, ummg); assert_rc_equal(rc, -DER_BUSY); /** Should prevent simultaneous aggregation */ rc = hold_obj(vos_hdl2cont(ctx->tc_co_hdl), oids[0], &epr, 0, VOS_OBJ_AGGREGATE, - DAOS_INTENT_PURGE, &obj2, 0); + DAOS_INTENT_PURGE, &obj2, 0, ummg); assert_rc_equal(rc, -DER_BUSY); /** Should prevent simultaneous hold for create as well */ rc = hold_obj(vos_hdl2cont(ctx->tc_co_hdl), oids[0], &epr, 0, - VOS_OBJ_CREATE | VOS_OBJ_VISIBLE, DAOS_INTENT_UPDATE, &obj2, 0); + VOS_OBJ_CREATE | VOS_OBJ_VISIBLE, DAOS_INTENT_UPDATE, &obj2, + 0, ummg); assert_rc_equal(rc, -DER_UPDATE_AGAIN); /** Need to be able to hold for read though or iteration won't work */ rc = hold_obj(vos_hdl2cont(ctx->tc_co_hdl), oids[0], &epr, 0, VOS_OBJ_VISIBLE, - DAOS_INTENT_DEFAULT, &obj2, 0); + DAOS_INTENT_DEFAULT, &obj2, 0, ummg); vos_obj_release(obj2, 0, false); vos_obj_release(obj1, VOS_OBJ_DISCARD, false); /** Hold object for aggregation */ rc = hold_obj(vos_hdl2cont(ctx->tc_co_hdl), oids[0], &epr, 0, VOS_OBJ_AGGREGATE, - DAOS_INTENT_PURGE, &obj1, 0); + DAOS_INTENT_PURGE, &obj1, 0, ummg); assert_rc_equal(rc, 0); /** Discard should fail */ rc = hold_obj(vos_hdl2cont(ctx->tc_co_hdl), oids[0], &epr, 0, VOS_OBJ_DISCARD, - DAOS_INTENT_DISCARD, &obj2, 0); + DAOS_INTENT_DISCARD, &obj2, 0, ummg); assert_rc_equal(rc, -DER_BUSY); /** Second aggregation should fail */ rc = hold_obj(vos_hdl2cont(ctx->tc_co_hdl), oids[0], &epr, 0, VOS_OBJ_AGGREGATE, - DAOS_INTENT_PURGE, &obj2, 0); + DAOS_INTENT_PURGE, &obj2, 0, ummg); assert_rc_equal(rc, -DER_BUSY); /** Simultaneous create should work */ rc = hold_obj(vos_hdl2cont(ctx->tc_co_hdl), oids[0], &epr, 0, - VOS_OBJ_CREATE | VOS_OBJ_VISIBLE, DAOS_INTENT_UPDATE, &obj2, 0); + VOS_OBJ_CREATE | VOS_OBJ_VISIBLE, DAOS_INTENT_UPDATE, &obj2, 0, ummg); assert_rc_equal(rc, 0); vos_obj_release(obj2, 0, false); /** Need to be able to hold for read though or iteration won't work */ rc = hold_obj(vos_hdl2cont(ctx->tc_co_hdl), oids[0], &epr, 0, VOS_OBJ_VISIBLE, - DAOS_INTENT_DEFAULT, &obj2, 0); + DAOS_INTENT_DEFAULT, &obj2, 0, ummg); vos_obj_release(obj2, 0, false); vos_obj_release(obj1, VOS_OBJ_AGGREGATE, false); /** Now that other one is done, this should work */ rc = hold_obj(vos_hdl2cont(ctx->tc_co_hdl), oids[0], 
&epr, 0, VOS_OBJ_DISCARD, - DAOS_INTENT_DISCARD, &obj2, 0); + DAOS_INTENT_DISCARD, &obj2, 0, ummg); assert_rc_equal(rc, 0); vos_obj_release(obj2, VOS_OBJ_DISCARD, false); - rc = umem_tx_end(ummg, 0); - assert_rc_equal(rc, 0); - vos_obj_release(objs[0], 0, false); - rc = umem_tx_begin(umml, NULL); - assert_rc_equal(rc, 0); - rc = hold_obj(vos_hdl2cont(l_coh), oids[1], &epr, 0, - VOS_OBJ_CREATE | VOS_OBJ_VISIBLE, DAOS_INTENT_UPDATE, &objs[0], 0); + VOS_OBJ_CREATE | VOS_OBJ_VISIBLE, DAOS_INTENT_UPDATE, &objs[0], 0, umml); assert_rc_equal(rc, 0); vos_obj_release(objs[0], 0, false); - rc = umem_tx_end(umml, 0); - assert_rc_equal(rc, 0); - rc = hold_objects(objs, &ctx->tc_co_hdl, &oids[0], 0, 10, true, 0); assert_int_equal(rc, 0); @@ -1090,7 +1091,7 @@ io_obj_cache_test(void **state) rc = hold_objects(objs, &l_coh, &oids[1], 10, 15, true, 0); assert_int_equal(rc, 0); rc = hold_obj(vos_hdl2cont(l_coh), oids[1], &epr, 0, VOS_OBJ_VISIBLE, - DAOS_INTENT_DEFAULT, &objs[16], 0); + DAOS_INTENT_DEFAULT, &objs[16], 0, NULL); assert_rc_equal(rc, 0); vos_obj_release(objs[16], 0, false); @@ -1904,7 +1905,8 @@ pool_cont_same_uuid(void **state) uuid_generate(pool_uuid); uuid_copy(co_uuid, pool_uuid); - ret = vos_pool_create(arg->fname, pool_uuid, VPOOL_256M, 0, 0, 0 /* version */, &poh); + ret = vos_pool_create(arg->fname, pool_uuid, VPOOL_256M, 0 /* data_sz */, 0 /* meta_sz */, + 0 /* flags */, 0 /* version */, &poh); assert_rc_equal(ret, 0); ret = vos_cont_create(poh, co_uuid); diff --git a/src/vos/tests/vts_pool.c b/src/vos/tests/vts_pool.c index acfd4e46a8a..fc1e7aefef7 100644 --- a/src/vos/tests/vts_pool.c +++ b/src/vos/tests/vts_pool.c @@ -89,7 +89,8 @@ pool_ref_count_test(void **state) int num = 10; uuid_generate(uuid); - ret = vos_pool_create(arg->fname[0], uuid, VPOOL_256M, 0, 0, 0 /* version */, NULL); + ret = vos_pool_create(arg->fname[0], uuid, VPOOL_256M, 0 /* data_sz */, 0 /* meta_sz */, + 0 /* flags */, 0 /* version */, NULL); for (i = 0; i < num; i++) { ret = vos_pool_open(arg->fname[0], uuid, 0, &arg->poh[i]); assert_rc_equal(ret, 0); @@ -119,7 +120,8 @@ pool_interop(void **state) uuid_generate(uuid); daos_fail_loc_set(FLC_POOL_DF_VER | DAOS_FAIL_ONCE); - ret = vos_pool_create(arg->fname[0], uuid, VPOOL_256M, 0, 0, 0 /* version */, NULL); + ret = vos_pool_create(arg->fname[0], uuid, VPOOL_256M, 0 /* data_sz */, 0 /* meta_sz */, + 0 /* flags */, 0 /* version */, NULL); assert_rc_equal(ret, 0); ret = vos_pool_open(arg->fname[0], uuid, 0, &poh); @@ -149,15 +151,19 @@ pool_ops_run(void **state) if (arg->fcreate[j]) { ret = vts_pool_fallocate(&arg->fname[j]); assert_int_equal(ret, 0); - ret = vos_pool_create_ex(arg->fname[j], arg->uuid[j], 0, 0, - VPOOL_TEST_WAL_SZ, 0, - 0 /* version */, poh); + ret = vos_pool_create_ex(arg->fname[j], arg->uuid[j], + 0 /* scm_sz */, 0 /* data_sz */, + VPOOL_TEST_WAL_SZ, 0 /* meta_sz */, + 0 /* flags */, 0 /* version */, + poh); } else { ret = vts_alloc_gen_fname(&arg->fname[j]); assert_int_equal(ret, 0); ret = vos_pool_create_ex(arg->fname[j], arg->uuid[j], - VPOOL_256M, 0, VPOOL_TEST_WAL_SZ, - 0, 0 /* version */, poh); + VPOOL_256M, 0 /* data_sz */, + VPOOL_TEST_WAL_SZ, + 0 /* meta_sz */, 0 /* flags */, + 0 /* version */, poh); } break; case OPEN: @@ -421,7 +427,8 @@ pool_open_excl_test(void **state) uuid_generate(uuid); print_message("open EXCL shall fail upon existing create opener\n"); - ret = vos_pool_create(arg->fname[0], uuid, VPOOL_256M, 0, 0, 0 /* version */, &arg->poh[0]); + ret = vos_pool_create(arg->fname[0], uuid, VPOOL_256M, 0 /* data_sz */, 0 /* 
meta_sz */, + 0 /* flags */, 0 /* version */, &arg->poh[0]); assert_rc_equal(ret, 0); ret = vos_pool_open(arg->fname[0], uuid, VOS_POF_EXCL, &arg->poh[1]); assert_rc_equal(ret, -DER_BUSY); @@ -431,7 +438,8 @@ pool_open_excl_test(void **state) assert_rc_equal(ret, 0); print_message("open EXCL shall fail upon existing opener\n"); - ret = vos_pool_create(arg->fname[0], uuid, VPOOL_256M, 0, 0, 0 /* version */, NULL); + ret = vos_pool_create(arg->fname[0], uuid, VPOOL_256M, 0 /* data_sz */, 0 /* meta_sz */, + 0 /* flags */, 0 /* version */, NULL); assert_rc_equal(ret, 0); ret = vos_pool_open(arg->fname[0], uuid, 0, &arg->poh[0]); assert_rc_equal(ret, 0); @@ -443,7 +451,8 @@ pool_open_excl_test(void **state) assert_rc_equal(ret, 0); print_message("open EXCL shall fail upon existing EXCL opener\n"); - ret = vos_pool_create(arg->fname[0], uuid, VPOOL_256M, 0, 0, 0 /* version */, NULL); + ret = vos_pool_create(arg->fname[0], uuid, VPOOL_256M, 0 /* data_sz */, 0 /* meta_sz */, + 0 /* flags */, 0 /* version */, NULL); assert_rc_equal(ret, 0); ret = vos_pool_open(arg->fname[0], uuid, VOS_POF_EXCL, &arg->poh[0]); assert_rc_equal(ret, 0); @@ -456,8 +465,8 @@ pool_open_excl_test(void **state) print_message("open EXCL shall fail upon existing EXCL create " "opener\n"); - ret = vos_pool_create(arg->fname[0], uuid, VPOOL_256M, 0, VOS_POF_EXCL, 0 /* version */, - &arg->poh[0]); + ret = vos_pool_create(arg->fname[0], uuid, VPOOL_256M, 0 /* data_sz */, 0 /* meta_sz */, + VOS_POF_EXCL, 0 /* version */, &arg->poh[0]); assert_rc_equal(ret, 0); ret = vos_pool_open(arg->fname[0], uuid, VOS_POF_EXCL, &arg->poh[1]); assert_rc_equal(ret, -DER_BUSY); @@ -467,7 +476,8 @@ pool_open_excl_test(void **state) assert_rc_equal(ret, 0); print_message("open shall fail upon existing EXCL opener\n"); - ret = vos_pool_create(arg->fname[0], uuid, VPOOL_256M, 0, 0, 0 /* version */, NULL); + ret = vos_pool_create(arg->fname[0], uuid, VPOOL_256M, 0 /* data_sz */, 0 /* meta_sz */, + 0 /* flags */, 0 /* version */, NULL); assert_rc_equal(ret, 0); ret = vos_pool_open(arg->fname[0], uuid, VOS_POF_EXCL, &arg->poh[0]); assert_rc_equal(ret, 0); @@ -479,8 +489,8 @@ pool_open_excl_test(void **state) assert_rc_equal(ret, 0); print_message("open shall fail upon existing EXCL create opener\n"); - ret = vos_pool_create(arg->fname[0], uuid, VPOOL_256M, 0, VOS_POF_EXCL, 0 /* version */, - &arg->poh[0]); + ret = vos_pool_create(arg->fname[0], uuid, VPOOL_256M, 0 /* data_sz */, 0 /* meta_sz */, + VOS_POF_EXCL, 0 /* version */, &arg->poh[0]); assert_rc_equal(ret, 0); ret = vos_pool_open(arg->fname[0], uuid, 0, &arg->poh[1]); assert_rc_equal(ret, -DER_BUSY); @@ -503,11 +513,13 @@ pool_interop_create_old(void **state) uuid_generate(uuid); /* DF version too old. */ - ret = vos_pool_create(arg->fname[0], uuid, VPOOL_256M, 0, 0, 1 /* version */, NULL); + ret = vos_pool_create(arg->fname[0], uuid, VPOOL_256M, 0 /* data_sz */, 0 /* meta_sz */, + 0 /* flags */, 1 /* version */, NULL); assert_rc_equal(ret, -DER_INVAL); /* DF version old but supported. 
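For quick reference, every call site touched above now uses the widened pool-creation prototype with explicit data_sz, meta_sz, flags and version arguments. Below is a minimal sketch of a helper built on that prototype; the helper name is hypothetical, the argument roles are taken from the inline comments in these hunks, and VPOOL_256M is assumed to be the size macro already used by these tests.

#include <uuid/uuid.h>
#include <daos_srv/vos.h>	/* assumed header for vos_pool_create() */

static int
create_small_test_pool(const char *path, daos_handle_t *poh)
{
	uuid_t	uuid;

	uuid_generate(uuid);
	/* 256M SCM size, default data/meta sizes, no flags, current durable format */
	return vos_pool_create(path, uuid, VPOOL_256M, 0 /* data_sz */, 0 /* meta_sz */,
			       0 /* flags */, 0 /* version */, poh);
}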
*/ - ret = vos_pool_create(arg->fname[0], uuid, VPOOL_256M, 0, 0, VOS_POOL_DF_2_4, NULL); + ret = vos_pool_create(arg->fname[0], uuid, VPOOL_256M, 0 /* data_sz */, 0 /* meta_sz */, + 0 /* flags */, VOS_POOL_DF_2_4, NULL); assert_rc_equal(ret, 0); ret = vos_pool_open(arg->fname[0], uuid, 0, &poh); diff --git a/src/vos/tests/vts_wal.c b/src/vos/tests/vts_wal.c index 029be44b2bc..7506c7ffa79 100644 --- a/src/vos/tests/vts_wal.c +++ b/src/vos/tests/vts_wal.c @@ -295,7 +295,8 @@ wal_tst_pool_cont(void **state) assert_int_equal(rc, 0); /* Create pool: Create meta & WAL blobs, write meta & WAL header */ - rc = vos_pool_create(pool_name, pool_id, 0, VPOOL_1G, 0, 0 /* version */, NULL); + rc = vos_pool_create(pool_name, pool_id, 0 /* scm_sz */, VPOOL_1G, 0 /* meta_sz */, + 0 /* flags */, 0 /* version */, NULL); assert_int_equal(rc, 0); /* Create cont: write WAL */ @@ -626,6 +627,50 @@ setup_wal_io(void **state) return 0; } +static struct io_test_args test_args; + +#define MDTEST_META_BLOB_SIZE (256 * 1024 * 1024UL) +#define MDTEST_VOS_SIZE (160 * 1024 * 1024UL) +#define MDTEST_MB_SIZE (16 * 1024 * 1024UL) +#define MDTEST_MB_CNT (MDTEST_META_BLOB_SIZE / MDTEST_MB_SIZE) +#define MDTEST_MB_VOS_CNT (MDTEST_VOS_SIZE / MDTEST_MB_SIZE) +#define MDTEST_MAX_NEMB_CNT (MDTEST_MB_VOS_CNT * 8 / 10) +#define MDTEST_MAX_EMB_CNT (MDTEST_MB_CNT - MDTEST_MAX_NEMB_CNT) + +static int +setup_mb_io(void **state) +{ + int rc; + + memset(&test_args, 0, sizeof(test_args)); + rc = vts_ctx_init_ex(&test_args.ctx, MDTEST_VOS_SIZE, MDTEST_META_BLOB_SIZE); + *state = (void *)&test_args; + return rc; +} + +static int +teardown_mb_io(void **state) +{ + struct io_test_args *args = (struct io_test_args *)*state; + + vts_ctx_fini(&args->ctx); + return 0; +} + +static int +setup_mb_io_nembpct(void **state) +{ + d_setenv("DAOS_MD_ON_SSD_NEMB_PCT", "40", true); + return setup_mb_io(state); +} + +static int +teardown_mb_io_nembpct(void **state) +{ + d_unsetenv("DAOS_MD_ON_SSD_NEMB_PCT"); + return teardown_mb_io(state); +} + /* refill:true - perform the pool re-load and refill after every key update/punch */ static int wal_update_and_fetch_dkey(struct io_test_args *arg, daos_epoch_t update_epoch, @@ -1259,6 +1304,1173 @@ wal14_setup(void **state) return 0; } +static void +wal_mb_tests(void **state) +{ + struct io_test_args *arg = *state; + struct vos_container *cont; + struct umem_instance *umm; + uint32_t mb_id; + uint64_t *ptr; + umem_off_t umoff; + + cont = vos_hdl2cont(arg->ctx.tc_co_hdl); + umm = vos_cont2umm(cont); + + mb_id = umem_allot_mb_evictable(umm, 0); + assert_true(mb_id != 0); + umem_tx_begin(umm, NULL); + umoff = umem_alloc_from_bucket(umm, 1024, mb_id); + assert_false(UMOFF_IS_NULL(umoff)); + assert_true(umem_get_mb_from_offset(umm, umoff) == mb_id); + ptr = umem_off2ptr(umm, umoff); + *ptr = 0xdeadcab; + umem_tx_commit(umm); + + wal_pool_refill(arg); + + cont = vos_hdl2cont(arg->ctx.tc_co_hdl); + umm = vos_cont2umm(cont); + + ptr = umem_off2ptr(umm, umoff); + assert_true(*ptr == 0xdeadcab); + + umem_atomic_free(umm, umoff); +} + +struct bucket_alloc_info { + umem_off_t start_umoff; + uint32_t num_allocs; + uint32_t mb_id; + uint32_t alloc_size; +}; + +#define CHECKPOINT_FREQ 10000 +static void +checkpoint_fn(void *arg) +{ + struct umem_store *store; + uint64_t committed_id; + daos_handle_t phdl = *(daos_handle_t *)arg; + int rc; + + vos_pool_checkpoint_init(phdl, update_cb, wait_cb, &committed_id, &store); + rc = vos_pool_checkpoint(phdl); + assert_rc_equal(rc, 0); + vos_pool_checkpoint_fini(phdl); +} + +static void 
+alloc_bucket_to_full(struct umem_instance *umm, struct bucket_alloc_info *ainfo, + void (*chkpt_fn)(void *arg), void *arg) +{ + umem_off_t umoff, prev_umoff; + size_t alloc_size = 512; + umem_off_t *ptr; + struct umem_cache_range rg = {0}; + struct umem_pin_handle *p_hdl; + uint32_t id = ainfo->mb_id; + + if (ainfo->alloc_size) + alloc_size = ainfo->alloc_size; + else + ainfo->alloc_size = alloc_size; + + rg.cr_off = umem_get_mb_base_offset(umm, id); + rg.cr_size = 1; + assert_true(umem_cache_pin(&umm->umm_pool->up_store, &rg, 1, 0, &p_hdl) == 0); + + if (UMOFF_IS_NULL(ainfo->start_umoff)) { + umem_tx_begin(umm, NULL); + ainfo->start_umoff = umem_alloc_from_bucket(umm, alloc_size, id); + umem_tx_commit(umm); + assert_false(UMOFF_IS_NULL(ainfo->start_umoff)); + ainfo->num_allocs++; + assert_true(umem_get_mb_from_offset(umm, ainfo->start_umoff) == id); + prev_umoff = ainfo->start_umoff; + ptr = (umem_off_t *)umem_off2ptr(umm, prev_umoff); + *ptr = UMOFF_NULL; + } else + prev_umoff = ainfo->start_umoff; + + while (true) { + ptr = (umem_off_t *)umem_off2ptr(umm, prev_umoff); + umoff = *ptr; + if (UMOFF_IS_NULL(umoff)) + break; + prev_umoff = umoff; + } + + while (1) { + umem_tx_begin(umm, NULL); + umoff = umem_alloc_from_bucket(umm, alloc_size, id); + + if (UMOFF_IS_NULL(umoff) || (umem_get_mb_from_offset(umm, umoff) != id)) { + umem_tx_abort(umm, 1); + break; + } + umem_tx_add(umm, prev_umoff, sizeof(umem_off_t)); + ptr = (umem_off_t *)umem_off2ptr(umm, prev_umoff); + *ptr = umoff; + ptr = (umem_off_t *)umem_off2ptr(umm, umoff); + *ptr = UMOFF_NULL; + umem_tx_commit(umm); + prev_umoff = umoff; + if (((ainfo->num_allocs++ % CHECKPOINT_FREQ) == 0) && (chkpt_fn != NULL)) + chkpt_fn(arg); + } + if (chkpt_fn != NULL) + chkpt_fn(arg); + umem_cache_unpin(&umm->umm_pool->up_store, p_hdl); + print_message("Bulk Alloc: Bucket %d, start off %lu num_allocation %d\n", ainfo->mb_id, + ainfo->start_umoff, ainfo->num_allocs); +} + +static void +free_bucket_by_pct(struct umem_instance *umm, struct bucket_alloc_info *ainfo, int pct, + void (*chkpt_fn)(void *arg), void *arg) +{ + int num_free = (ainfo->num_allocs * pct) / 100; + umem_off_t umoff, *ptr, next_umoff; + struct umem_pin_handle *p_hdl; + struct umem_cache_range rg = {0}; + int i, rc; + + assert_true((pct >= 0) && (pct <= 100)); + + if (UMOFF_IS_NULL(ainfo->start_umoff)) + return; + print_message("Bulk Free BEFORE: Bucket %d, start off %lu num_allocation %d\n", + ainfo->mb_id, ainfo->start_umoff, ainfo->num_allocs); + + rg.cr_off = umem_get_mb_base_offset(umm, ainfo->mb_id); + rg.cr_size = 1; + rc = umem_cache_pin(&umm->umm_pool->up_store, &rg, 1, 0, &p_hdl); + assert_true(rc == 0); + + umoff = ainfo->start_umoff; + for (i = 0; i < num_free; i++) { + assert_true(umem_get_mb_from_offset(umm, umoff) == ainfo->mb_id); + ptr = (umem_off_t *)umem_off2ptr(umm, umoff); + next_umoff = *ptr; + umem_atomic_free(umm, umoff); + umoff = next_umoff; + if (((ainfo->num_allocs-- % CHECKPOINT_FREQ) == 0) && (chkpt_fn != NULL)) + chkpt_fn(arg); + if (UMOFF_IS_NULL(umoff)) + break; + } + ainfo->start_umoff = umoff; + if (chkpt_fn != NULL) + chkpt_fn(arg); + umem_cache_unpin(&umm->umm_pool->up_store, p_hdl); + print_message("Bulk Free AFTER: Bucket %d, start off %lu num_allocation %d\n", ainfo->mb_id, + ainfo->start_umoff, ainfo->num_allocs); +} + +static void +wal_mb_utilization_tests(void **state) +{ + struct io_test_args *arg = *state; + struct vos_container *cont; + struct umem_instance *umm; + struct bucket_alloc_info ainfo[MDTEST_MB_CNT + 1]; + uint32_t id; + int 
i, j; + int mb_reuse = 0; + + cont = vos_hdl2cont(arg->ctx.tc_co_hdl); + umm = vos_cont2umm(cont); + + assert_true(MDTEST_MAX_EMB_CNT >= 8); + for (i = 0; i < MDTEST_MAX_EMB_CNT - 1; i++) { + /* Create an MB and fill it with allocs */ + ainfo[i].mb_id = umem_allot_mb_evictable(umm, 0); + ainfo[i].num_allocs = 0; + ainfo[i].start_umoff = UMOFF_NULL; + ainfo[i].alloc_size = 0; + assert_true(ainfo[i].mb_id != 0); + alloc_bucket_to_full(umm, &ainfo[i], checkpoint_fn, &arg->ctx.tc_po_hdl); + } + + /* Free 5% of space for MB 2 */ + free_bucket_by_pct(umm, &ainfo[0], 5, checkpoint_fn, &arg->ctx.tc_po_hdl); /* 90+ */ + /* Free 30% of space for MB 3 */ + free_bucket_by_pct(umm, &ainfo[1], 30, checkpoint_fn, &arg->ctx.tc_po_hdl); /* 30-75 */ + /* Free 80% of space for MB 4 */ + free_bucket_by_pct(umm, &ainfo[2], 80, checkpoint_fn, &arg->ctx.tc_po_hdl); /* 0-30 */ + /* Free 15% of space for MB 5 */ + free_bucket_by_pct(umm, &ainfo[3], 20, checkpoint_fn, &arg->ctx.tc_po_hdl); /* 75-90 */ + /* Free 10% of space for MB 6 */ + free_bucket_by_pct(umm, &ainfo[4], 18, checkpoint_fn, &arg->ctx.tc_po_hdl); /* 75-90 */ + /* Free 50% of space for MB 7 */ + free_bucket_by_pct(umm, &ainfo[5], 50, checkpoint_fn, &arg->ctx.tc_po_hdl); /* 30-75 */ + /* Free 90% of space for MB 8 */ + free_bucket_by_pct(umm, &ainfo[6], 90, NULL, NULL); /* 0-30 */ + + wal_pool_refill(arg); + cont = vos_hdl2cont(arg->ctx.tc_co_hdl); + umm = vos_cont2umm(cont); + + /* Allocator should return mb with utilization 30%-75% */ + id = umem_allot_mb_evictable(umm, 0); + print_message("obtained id %d, expected is %d\n", id, ainfo[1].mb_id); + assert_true(id == ainfo[1].mb_id); + alloc_bucket_to_full(umm, &ainfo[1], checkpoint_fn, &arg->ctx.tc_po_hdl); + id = umem_allot_mb_evictable(umm, 0); + print_message("obtained id %d, expected is %d\n", id, ainfo[5].mb_id); + assert_true(id == ainfo[5].mb_id); + alloc_bucket_to_full(umm, &ainfo[5], checkpoint_fn, &arg->ctx.tc_po_hdl); + + /* Next preference should be 0%-30% */ + id = umem_allot_mb_evictable(umm, 0); + print_message("obtained id %d, expected is %d\n", id, ainfo[2].mb_id); + assert_true(id == ainfo[2].mb_id); + alloc_bucket_to_full(umm, &ainfo[2], checkpoint_fn, &arg->ctx.tc_po_hdl); + id = umem_allot_mb_evictable(umm, 0); + print_message("obtained id %d, expected is %d\n", id, ainfo[6].mb_id); + assert_true(id == ainfo[6].mb_id); + alloc_bucket_to_full(umm, &ainfo[6], checkpoint_fn, &arg->ctx.tc_po_hdl); + + /* Next is to create a new memory bucket. */ + id = umem_allot_mb_evictable(umm, 0); + for (i = 0; i < MDTEST_MAX_EMB_CNT - 1; i++) + assert_true(id != ainfo[i].mb_id); + print_message("obtained id %d\n", id); + i = MDTEST_MAX_EMB_CNT - 1; + + ainfo[i].mb_id = id; + ainfo[i].num_allocs = 0; + ainfo[i].start_umoff = UMOFF_NULL; + ainfo[i].alloc_size = 0; + assert_true(ainfo[i].mb_id != 0); + alloc_bucket_to_full(umm, &ainfo[i], checkpoint_fn, &arg->ctx.tc_po_hdl); + + /* Next preference should be 75%-90% */ + id = umem_allot_mb_evictable(umm, 0); + print_message("obtained id %d, expected is %d\n", id, ainfo[3].mb_id); + assert_true(id == ainfo[3].mb_id); + alloc_bucket_to_full(umm, &ainfo[3], checkpoint_fn, &arg->ctx.tc_po_hdl); + id = umem_allot_mb_evictable(umm, 0); + print_message("obtained id %d, expected is %d\n", id, ainfo[4].mb_id); + assert_true(id == ainfo[4].mb_id); + alloc_bucket_to_full(umm, &ainfo[4], checkpoint_fn, &arg->ctx.tc_po_hdl); + + /* If there are no more new evictable mb available it should return + * one with 90% or more utilization. 
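alloc_bucket_to_full() above bundles the complete evictable-bucket access pattern: pin the bucket's cache range, allocate from that specific bucket inside a transaction, and verify the returned offset really belongs to it. The stripped-down sketch below restates that pattern using only calls that appear in these hunks; the helper name is hypothetical and error handling is minimal.

#include <assert.h>
#include <daos/mem.h>	/* assumed header for the umem/umem_cache APIs */

static umem_off_t
alloc_one_from_evictable(struct umem_instance *umm)
{
	struct umem_cache_range	 rg = {0};
	struct umem_pin_handle	*pin;
	umem_off_t		 off = UMOFF_NULL;
	uint32_t		 id;

	id = umem_allot_mb_evictable(umm, 0);		/* pick or create a bucket */
	rg.cr_off  = umem_get_mb_base_offset(umm, id);
	rg.cr_size = 1;
	if (umem_cache_pin(&umm->umm_pool->up_store, &rg, 1, 0, &pin) != 0)
		return UMOFF_NULL;

	if (umem_tx_begin(umm, NULL) == 0) {
		off = umem_alloc_from_bucket(umm, 1024, id);
		umem_tx_commit(umm);
	}
	/* the allocation must land in the requested bucket */
	assert(UMOFF_IS_NULL(off) || umem_get_mb_from_offset(umm, off) == id);
	umem_cache_unpin(&umm->umm_pool->up_store, pin);
	return off;
}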
+ */ + id = umem_allot_mb_evictable(umm, 0); + for (j = 0; j < i; j++) { + if (id == ainfo[j].mb_id) { + print_message("reusing evictable mb %d\n", id); + mb_reuse = 1; + break; + } + } + assert_true(mb_reuse); +} + +#define ZONE_MAX_SIZE (16 * 1024 * 1024) + +static void +wal_mb_emb_evicts_emb(void **state) +{ + struct io_test_args *arg = *state; + struct vos_container *cont; + struct umem_instance *umm; + int i, j, po; + struct bucket_alloc_info ainfo[MDTEST_MB_CNT + 1]; + uint32_t id; + + cont = vos_hdl2cont(arg->ctx.tc_co_hdl); + umm = vos_cont2umm(cont); + + /* Fill non-evictable buckets. */ + ainfo[0].mb_id = 0; + ainfo[0].num_allocs = 0; + ainfo[0].start_umoff = UMOFF_NULL; + ainfo[0].alloc_size = 0; + alloc_bucket_to_full(umm, &ainfo[0], checkpoint_fn, &arg->ctx.tc_po_hdl); + + /* + * validate whether non-evictable mbs have actually consumed MDTEST_MAX_NEMB_CNT + */ + print_message("allocations in non-evictable mbs = %u\n", ainfo[0].num_allocs); + print_message("space used in non-evictable mbs = %u\n", + ainfo[0].num_allocs * ainfo[0].alloc_size); + po = (ainfo[0].num_allocs * ainfo[0].alloc_size + ZONE_MAX_SIZE - 1) / ZONE_MAX_SIZE; + assert_true(po == MDTEST_MAX_NEMB_CNT); + + /* Now free few allocation to support spill */ + free_bucket_by_pct(umm, &ainfo[0], 20, checkpoint_fn, &arg->ctx.tc_po_hdl); + + /* Create and fill MDTEST_MB_CNT evictable memory buckets. */ + for (i = 1; i < MDTEST_MB_CNT + 1; i++) { + /* Create an MB and fill it with allocs */ + id = umem_allot_mb_evictable(umm, 0); + for (j = 0; j < i; j++) { + if (id == ainfo[j].mb_id) { + print_message("evictable mb reused at iteration %d\n", id); + goto out; + } + } + ainfo[i].mb_id = id; + ainfo[i].num_allocs = 0; + ainfo[i].start_umoff = UMOFF_NULL; + ainfo[i].alloc_size = 0; + assert_true(ainfo[i].mb_id != 0); + alloc_bucket_to_full(umm, &ainfo[i], checkpoint_fn, &arg->ctx.tc_po_hdl); + } +out: + assert_true(i == MDTEST_MAX_EMB_CNT + 1); + + /* Validate and free all allocations in evictable MBs */ + for (j = 0; j < i; j++) + free_bucket_by_pct(umm, &ainfo[j], 100, checkpoint_fn, &arg->ctx.tc_po_hdl); +} + +static void +wal_mb_nemb_evicts_emb(void **state) +{ + struct io_test_args *arg = *state; + struct vos_container *cont; + struct umem_instance *umm; + int i, j, po; + struct bucket_alloc_info ainfo[MDTEST_MB_CNT + 1]; + uint32_t id; + + cont = vos_hdl2cont(arg->ctx.tc_co_hdl); + umm = vos_cont2umm(cont); + + /* Create and fill evictable memory buckets. */ + for (i = 1; i < MDTEST_MB_CNT + 1; i++) { + /* Create an MB and fill it with allocs */ + id = umem_allot_mb_evictable(umm, 0); + for (j = 1; j < i; j++) { + if (id == ainfo[j].mb_id) { + print_message("evictable mb reused at iteration %d\n", id); + goto out; + } + } + ainfo[i].mb_id = id; + ainfo[i].num_allocs = 0; + ainfo[i].start_umoff = UMOFF_NULL; + ainfo[i].alloc_size = 0; + assert_true(ainfo[i].mb_id != 0); + alloc_bucket_to_full(umm, &ainfo[i], checkpoint_fn, &arg->ctx.tc_po_hdl); + } +out: + assert_true(i == MDTEST_MAX_EMB_CNT + 1); + + /* Fill non-evictable buckets. */ + ainfo[0].mb_id = 0; + ainfo[0].num_allocs = 0; + ainfo[0].start_umoff = UMOFF_NULL; + ainfo[0].alloc_size = 0; + alloc_bucket_to_full(umm, &ainfo[0], checkpoint_fn, &arg->ctx.tc_po_hdl); + + /* + * validate whether non-evictable mbs have actually consumed MDTEST_MAX_NEMB_CNT buckets. 
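The bucket counts asserted by these tests follow directly from the MDTEST_* sizes defined earlier in this file; a quick sanity check of that arithmetic, assuming those values are unchanged:

/* 160M VOS cache / 16M per bucket = 10 resident buckets, 80% of them non-evictable;
 * 256M meta blob / 16M = 16 buckets in total, leaving 8 evictable ones. */
_Static_assert(MDTEST_MB_VOS_CNT == 10, "resident buckets");
_Static_assert(MDTEST_MAX_NEMB_CNT == 8, "non-evictable buckets");
_Static_assert(MDTEST_MB_CNT == 16, "total buckets");
_Static_assert(MDTEST_MAX_EMB_CNT == 8, "evictable buckets");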
+ */ + print_message("allocations in non-evictable mbs = %u\n", ainfo[0].num_allocs); + print_message("space used in non-evictable mbs = %u\n", + ainfo[0].num_allocs * ainfo[0].alloc_size); + po = (ainfo[0].num_allocs * ainfo[0].alloc_size + ZONE_MAX_SIZE - 1) / ZONE_MAX_SIZE; + assert_true(po == MDTEST_MAX_NEMB_CNT); + + /* Validate and free all allocations in evictable MBs */ + for (j = 0; j < i; j++) + free_bucket_by_pct(umm, &ainfo[j], 100, checkpoint_fn, &arg->ctx.tc_po_hdl); +} + +static void +wal_mb_nemb_pct(void **state) +{ + struct io_test_args *arg = *state; + struct vos_container *cont; + struct umem_instance *umm; + int i, j, rc, found = 0; + struct bucket_alloc_info ainfo[MDTEST_MB_CNT + 1]; + daos_size_t maxsz, cur_allocated1, cur_allocated; + + cont = vos_hdl2cont(arg->ctx.tc_co_hdl); + umm = vos_cont2umm(cont); + + /* + * The setup for this test would have set environment variable + * DAOS_MD_ON_SSD_NEMB_PCT to 40 before creating the pool. + */ + ainfo[0].mb_id = 0; + ainfo[0].num_allocs = 0; + ainfo[0].start_umoff = UMOFF_NULL; + ainfo[0].alloc_size = 2048; + alloc_bucket_to_full(umm, &ainfo[0], checkpoint_fn, &arg->ctx.tc_po_hdl); + rc = umempobj_get_mbusage(umm->umm_pool, 0, &cur_allocated, &maxsz); + assert_true(rc == 0); + print_message("nemb space utilization is %lu max is %lu\n", cur_allocated, maxsz); + assert_true(maxsz == MDTEST_VOS_SIZE * 40 / 100); + + /* Reopen pool after setting DAOS_MD_ON_SSD_NEMB_PCT to 80% + * It should not impact already created vos pool. + */ + d_setenv("DAOS_MD_ON_SSD_NEMB_PCT", "80", true); + wal_pool_refill(arg); + cont = vos_hdl2cont(arg->ctx.tc_co_hdl); + umm = vos_cont2umm(cont); + alloc_bucket_to_full(umm, &ainfo[0], checkpoint_fn, &arg->ctx.tc_po_hdl); + rc = umempobj_get_mbusage(umm->umm_pool, 0, &cur_allocated1, &maxsz); + assert_true(rc == 0); + print_message("nemb space utilization is %lu max is %lu\n", cur_allocated1, maxsz); + assert_true(maxsz == MDTEST_VOS_SIZE * 40 / 100); + assert_true(cur_allocated == cur_allocated1); + + /* Allocate from Evictable Buckets. 
*/ + for (i = 1; i <= MDTEST_MB_CNT; i++) { + /* Create an MB and fill it with allocs */ + ainfo[i].mb_id = umem_allot_mb_evictable(umm, 0); + for (j = 1; j < i; j++) { + if (ainfo[i].mb_id == ainfo[j].mb_id) { + found = 1; + break; + } + } + if (found) + break; + ainfo[i].num_allocs = 0; + ainfo[i].start_umoff = UMOFF_NULL; + ainfo[i].alloc_size = 2048; + assert_true(ainfo[i].mb_id != 0); + alloc_bucket_to_full(umm, &ainfo[i], checkpoint_fn, &arg->ctx.tc_po_hdl); + } + i--; + print_message("Created %d evictable buckets, expected = %ld\n", i, + (MDTEST_META_BLOB_SIZE - maxsz) / MDTEST_MB_SIZE); + assert_true(i == (MDTEST_META_BLOB_SIZE - maxsz) / MDTEST_MB_SIZE); +} + +static int +umoff_in_freelist(umem_off_t *free_list, int cnt, umem_off_t umoff, bool clear) +{ + int i; + + for (i = 0; i < cnt; i++) + if (umoff == free_list[i]) + break; + + if (i < cnt) { + if (clear) + free_list[i] = UMOFF_NULL; + return 1; + } + return 0; +} + +static void +wal_umempobj_block_reuse_internal(void **state, int restart) +{ + struct io_test_args *arg = *state; + struct vos_container *cont; + struct umem_instance *umm; + umem_off_t umoff, next_umoff, nnext_umoff; + umem_off_t *ptr_cur, *ptr_next; + umem_off_t *free_list[MDTEST_MB_CNT + 1]; + umem_off_t *free_list_bk[MDTEST_MB_CNT + 1]; + int free_num[MDTEST_MB_CNT + 1]; + struct bucket_alloc_info ainfo[MDTEST_MB_CNT + 1]; + int i, j, cnt, rc, num, total_frees; + struct umem_pin_handle *p_hdl; + struct umem_cache_range rg = {0}; + uint64_t space_used_before, space_used_after; + + cont = vos_hdl2cont(arg->ctx.tc_co_hdl); + umm = vos_cont2umm(cont); + + /* Allocate from NE Buckets. It should use 80% 360M i.e, 16 buckets */ + ainfo[0].mb_id = 0; + ainfo[0].num_allocs = 0; + ainfo[0].start_umoff = UMOFF_NULL; + ainfo[0].alloc_size = 512; + alloc_bucket_to_full(umm, &ainfo[0], checkpoint_fn, &arg->ctx.tc_po_hdl); + + /* Allocate from Evictable Buckets. 
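These reuse tests rely on the intrusive chain built by alloc_bucket_to_full(): the first word of every allocated block stores the umem_off_t of the next block. A small hypothetical walker over such a chain is sketched below, assuming the bucket holding the chain is already pinned, as the tests do.

static int
count_chain(struct umem_instance *umm, umem_off_t head)
{
	int	cnt = 0;

	while (!UMOFF_IS_NULL(head)) {
		cnt++;
		head = *(umem_off_t *)umem_off2ptr(umm, head);	/* follow the stored link */
	}
	return cnt;
}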
*/ + for (i = 1; i <= MDTEST_MAX_EMB_CNT; i++) { + /* Create an MB and fill it with allocs */ + ainfo[i].mb_id = umem_allot_mb_evictable(umm, 0); + ainfo[i].num_allocs = 0; + ainfo[i].start_umoff = UMOFF_NULL; + ainfo[i].alloc_size = 512; + assert_true(ainfo[i].mb_id != 0); + alloc_bucket_to_full(umm, &ainfo[i], checkpoint_fn, &arg->ctx.tc_po_hdl); + } + + /* Free few allocations from each NE bucket */ + umem_tx_begin(umm, NULL); + umoff = ainfo[0].start_umoff; + num = ainfo[0].num_allocs; + free_num[0] = num / 10000; + cnt = 0; + D_ALLOC_ARRAY(free_list[0], free_num[0]); + for (j = 1; j <= num; j++) { + ptr_cur = (umem_off_t *)umem_off2ptr(umm, umoff); + next_umoff = *ptr_cur; + if ((j % 10000) == 0) { + if (UMOFF_IS_NULL(next_umoff)) + break; + ptr_next = (umem_off_t *)umem_off2ptr(umm, next_umoff); + nnext_umoff = *ptr_next; + umem_tx_add_ptr(umm, ptr_cur, sizeof(umoff)); + *ptr_cur = nnext_umoff; + umem_free(umm, next_umoff); + print_message("id=0:Freeing offset %lu\n", next_umoff); + ainfo->num_allocs--; + free_list[0][cnt++] = next_umoff; + umoff = nnext_umoff; + } else + umoff = next_umoff; + if (UMOFF_IS_NULL(umoff)) + break; + } + umem_tx_commit(umm); + assert_true(cnt == free_num[0]); + print_message("id=0:Total frees %d\n", cnt); + + /* Free few allocations from each E bucket */ + for (i = 1; i <= MDTEST_MAX_EMB_CNT; i++) { + rg.cr_off = umem_get_mb_base_offset(umm, ainfo[i].mb_id); + rg.cr_size = 1; + rc = umem_cache_pin(&umm->umm_pool->up_store, &rg, 1, 0, &p_hdl); + assert_true(rc == 0); + + umem_tx_begin(umm, NULL); + umoff = ainfo[i].start_umoff; + num = ainfo[i].num_allocs; + free_num[i] = num / 10000; + cnt = 0; + D_ALLOC_ARRAY(free_list[i], free_num[i]); + for (j = 1; j <= num; j++) { + ptr_cur = (umem_off_t *)umem_off2ptr(umm, umoff); + next_umoff = *ptr_cur; + if ((j % 10000) == 0) { + if (UMOFF_IS_NULL(next_umoff)) + break; + ptr_next = (umem_off_t *)umem_off2ptr(umm, next_umoff); + nnext_umoff = *ptr_next; + umem_tx_add_ptr(umm, ptr_cur, sizeof(umoff)); + *ptr_cur = nnext_umoff; + umem_free(umm, next_umoff); + print_message("id=%d:Freeing offset %lu\n", i, next_umoff); + ainfo->num_allocs--; + free_list[i][cnt++] = next_umoff; + umoff = nnext_umoff; + } else + umoff = next_umoff; + if (UMOFF_IS_NULL(umoff)) + break; + } + umem_tx_commit(umm); + umem_cache_unpin(&umm->umm_pool->up_store, p_hdl); + assert_true(cnt == free_num[i]); + print_message("id=%d:Total frees %d\n", ainfo[i].mb_id, cnt); + } + + /* restart with or without checkpoint */ + if (restart) { + wal_pool_refill(arg); + cont = vos_hdl2cont(arg->ctx.tc_co_hdl); + umm = vos_cont2umm(cont); + } + + for (i = 0; i < MDTEST_MAX_EMB_CNT + 1; i++) { + D_ALLOC_ARRAY(free_list_bk[i], free_num[i]); + memcpy(free_list_bk[i], free_list[i], free_num[i] * sizeof(umem_off_t)); + } + + /* Allocate from NE Buckets and it should reuse the previous freed blocks */ + for (j = 0; j < free_num[0]; j++) { + umem_tx_begin(umm, NULL); + umoff = umem_alloc(umm, ainfo[0].alloc_size); + umem_tx_commit(umm); + assert_true(!UMOFF_IS_NULL(umoff)); + assert_true(umoff_in_freelist(free_list[0], free_num[0], umoff, true)); + } + + /* New allocation should fail */ + umem_tx_begin(umm, NULL); + umoff = umem_alloc(umm, ainfo[0].alloc_size); + umem_tx_abort(umm, 1); + assert_true(UMOFF_IS_NULL(umoff)); + + /* Allocate from E Buckets and it should reuse the previous freed blocks */ + for (i = 1; i <= MDTEST_MAX_EMB_CNT; i++) { + rg.cr_off = umem_get_mb_base_offset(umm, ainfo[i].mb_id); + rg.cr_size = 1; + rc = 
umem_cache_pin(&umm->umm_pool->up_store, &rg, 1, 0, &p_hdl); + assert_true(rc == 0); + + for (j = 0; j < free_num[i]; j++) { + umem_tx_begin(umm, NULL); + umoff = umem_alloc_from_bucket(umm, ainfo[i].alloc_size, ainfo[i].mb_id); + assert_true(!UMOFF_IS_NULL(umoff)); + umem_tx_commit(umm); + assert_true(umoff_in_freelist(free_list[i], free_num[i], umoff, true)); + } + umem_tx_begin(umm, NULL); + /* New allocation should fail */ + umoff = umem_alloc(umm, ainfo[i].alloc_size); + umem_tx_abort(umm, 1); + assert_true(UMOFF_IS_NULL(umoff)); + print_message("Finished reallocating for id = %d\n", ainfo[i].mb_id); + umem_cache_unpin(&umm->umm_pool->up_store, p_hdl); + } + + /* Free the allocated memory to see whether they are properly accounted */ + rc = umempobj_get_heapusage(umm->umm_pool, &space_used_before); + if (rc) { + print_message("Failed to get heap usage\n"); + assert_true(rc == 0); + } + for (j = 0; j < free_num[0]; j++) + umem_atomic_free(umm, free_list_bk[0][j]); + D_FREE(free_list[0]); + D_FREE(free_list_bk[0]); + + total_frees = free_num[0]; + + for (i = 1; i <= MDTEST_MAX_EMB_CNT; i++) { + rg.cr_off = umem_get_mb_base_offset(umm, ainfo[i].mb_id); + rg.cr_size = 1; + rc = umem_cache_pin(&umm->umm_pool->up_store, &rg, 1, 0, &p_hdl); + assert_true(rc == 0); + + for (j = 0; j < free_num[i]; j++) { + umoff = umem_atomic_free(umm, free_list_bk[i][j]); + } + umem_cache_unpin(&umm->umm_pool->up_store, p_hdl); + total_frees += free_num[i]; + D_FREE(free_list[i]); + D_FREE(free_list_bk[i]); + } + rc = umempobj_get_heapusage(umm->umm_pool, &space_used_after); + if (rc) { + print_message("Failed to get heap usage\n"); + assert_true(rc == 0); + } + print_message("Space usage: before free %lu, after free %lu, expected %lu\n", + space_used_before, space_used_after, (space_used_before - total_frees * 512)); + assert_true(space_used_after <= (space_used_before - total_frees * 512)); +} + +static void +wal_umempobj_block_reuse(void **state) +{ + wal_umempobj_block_reuse_internal(state, 0); +} + +static void +wal_umempobj_replay_block_reuse(void **state) +{ + wal_umempobj_block_reuse_internal(state, 1); +} + +static void +wal_umempobj_chkpt_block_reuse(void **state) +{ + struct io_test_args *arg = *state; + + arg->checkpoint = true; + arg->no_replay = true; + wal_umempobj_block_reuse_internal(state, 1); + arg->checkpoint = false; + arg->no_replay = false; + daos_fail_loc_set(0); +} + +static void +wal_umempobj_mbusage_test(void **state) +{ + struct io_test_args *arg = *state; + struct vos_container *cont; + struct umem_instance *umm; + struct bucket_alloc_info ainfo[2]; + struct umem_pin_handle *p_hdl; + struct umem_cache_range rg = {0}; + uint64_t allocated0, allocated1, maxsz0, maxsz1, maxsz_exp; + uint64_t allocated, maxsz; + int rc; + + cont = vos_hdl2cont(arg->ctx.tc_co_hdl); + umm = vos_cont2umm(cont); + + maxsz_exp = MDTEST_MAX_NEMB_CNT * MDTEST_MB_SIZE; + + /* Allocate from NE Buckets. 
It should use 80% 360M i.e, 16 buckets */ + ainfo[0].mb_id = 0; + ainfo[0].num_allocs = 0; + ainfo[0].start_umoff = UMOFF_NULL; + ainfo[0].alloc_size = 512; + alloc_bucket_to_full(umm, &ainfo[0], checkpoint_fn, &arg->ctx.tc_po_hdl); + + /* Create an MB and fill it with allocs */ + ainfo[1].mb_id = umem_allot_mb_evictable(umm, 0); + ainfo[1].num_allocs = 0; + ainfo[1].start_umoff = UMOFF_NULL; + ainfo[1].alloc_size = 512; + assert_true(ainfo[1].mb_id != 0); + alloc_bucket_to_full(umm, &ainfo[1], checkpoint_fn, &arg->ctx.tc_po_hdl); + free_bucket_by_pct(umm, &ainfo[1], 50, checkpoint_fn, &arg->ctx.tc_po_hdl); + + rc = umempobj_get_mbusage(umm->umm_pool, ainfo[0].mb_id, &allocated0, &maxsz0); + print_message("NE usage max_size = %lu allocated = %lu\n", maxsz0, allocated0); + assert_int_equal(rc, 0); + assert_int_equal(maxsz0, maxsz_exp); + + rc = umempobj_get_mbusage(umm->umm_pool, ainfo[1].mb_id, &allocated1, &maxsz1); + print_message("E usage max_size = %lu allocated = %lu\n", maxsz1, allocated1); + assert_int_equal(rc, 0); + assert_int_equal(maxsz1, MDTEST_MB_SIZE); + + wal_pool_refill(arg); + cont = vos_hdl2cont(arg->ctx.tc_co_hdl); + umm = vos_cont2umm(cont); + + rc = umempobj_get_mbusage(umm->umm_pool, ainfo[0].mb_id, &allocated, &maxsz); + print_message("NE usage max_size = %lu allocated = %lu\n", maxsz, allocated); + assert_int_equal(rc, 0); + assert_int_equal(maxsz, maxsz_exp); + assert_int_equal(allocated, allocated0); + + rc = umempobj_get_mbusage(umm->umm_pool, ainfo[1].mb_id, &allocated, &maxsz); + print_message("E usage max_size = %lu allocated = %lu\n", maxsz, allocated); + assert_int_equal(rc, 0); + /* allocated info is based on the hint */ + assert_true((allocated != 0) && (allocated < allocated1)); + assert_int_equal(maxsz, MDTEST_MB_SIZE); + + rg.cr_off = umem_get_mb_base_offset(umm, ainfo[1].mb_id); + rg.cr_size = 1; + rc = umem_cache_pin(&umm->umm_pool->up_store, &rg, 1, 0, &p_hdl); + assert_true(rc == 0); + rc = umempobj_get_mbusage(umm->umm_pool, ainfo[1].mb_id, &allocated, &maxsz); + umem_cache_unpin(&umm->umm_pool->up_store, p_hdl); + print_message("E usage max_size = %lu allocated = %lu\n", maxsz, allocated); + assert_int_equal(rc, 0); + /* allocated info is based on the actual stats recorded */ + assert_int_equal(allocated, allocated1); + assert_int_equal(maxsz, MDTEST_MB_SIZE); +} + +static void +dump_cache_stats(struct vos_pool *pool, char *op_str) +{ + struct umem_pool *umm_pool = vos_pool2umm(pool)->umm_pool; + struct umem_cache *cache = vos_pool2store(pool)->cache; + daos_size_t scm_used, ne_used, ne_tot; + int rc; + + rc = umempobj_get_heapusage(umm_pool, &scm_used); + assert_rc_equal(rc, 0); + + rc = umempobj_get_mbusage(umm_pool, UMEM_DEFAULT_MBKT_ID, &ne_used, &ne_tot); + assert_int_equal(rc, 0); + + print_message("==================== (dump stats %s)\n", op_str); + print_message("[Space usage] Total used:%lu, NE used:%lu, NE total:%lu\n", + scm_used, ne_used, ne_tot); + + print_message("[Page stats] NE:%u, Pinned:%u, Free:%u\n", + cache->ca_pgs_stats[UMEM_PG_STATS_NONEVICTABLE], + cache->ca_pgs_stats[UMEM_PG_STATS_PINNED], + cache->ca_pgs_stats[UMEM_PG_STATS_FREE]); + + print_message("[Swapping stats] Hit:%lu, Miss:%lu, Evict:%lu, Flush:%lu, Load:%lu\n", + cache->ca_cache_stats[UMEM_CACHE_STATS_HIT], + cache->ca_cache_stats[UMEM_CACHE_STATS_MISS], + cache->ca_cache_stats[UMEM_CACHE_STATS_EVICT], + cache->ca_cache_stats[UMEM_CACHE_STATS_FLUSH], + cache->ca_cache_stats[UMEM_CACHE_STATS_LOAD]); + print_message("====================\n"); +} + +static int 
+obj_rw(struct io_test_args *arg, daos_unit_oid_t oid, char *dkey, char *akey, + daos_iod_type_t iod_type, daos_epoch_t epoch, int io_size, char *buf, bool update) +{ + daos_recx_t recx = {.rx_idx = 0, .rx_nr = 1}; + daos_key_t dkey_iov, akey_iov; + daos_iod_t iod = { 0 }; + d_sg_list_t sgl = { 0 }; + int rc; + + arg->oid = oid; + d_iov_set(&dkey_iov, dkey, strlen(dkey)); + d_iov_set(&akey_iov, akey, strlen(akey)); + + rc = d_sgl_init(&sgl, 1); + assert_rc_equal(rc, 0); + + sgl.sg_iovs[0].iov_buf = buf; + sgl.sg_iovs[0].iov_buf_len = io_size; + sgl.sg_iovs[0].iov_len = io_size; + + iod.iod_name = akey_iov; + iod.iod_nr = 1; + iod.iod_type = iod_type; + iod.iod_size = io_size; + iod.iod_recxs = (iod_type == DAOS_IOD_SINGLE) ? NULL : &recx; + + if (update) + rc = io_test_obj_update(arg, epoch, 0, &dkey_iov, &iod, &sgl, NULL, true); + else + rc = io_test_obj_fetch(arg, epoch, 0, &dkey_iov, &iod, &sgl, true); + + d_sgl_fini(&sgl, false); + + return rc; +} + +static inline uint64_t +verify_space(struct vos_pool *pool, uint32_t bkt_id, uint64_t prev_used, int64_t delta, char *op_str) +{ + struct umem_pool *umm_pool = vos_pool2umm(pool)->umm_pool; + daos_size_t allocated, total; + int rc; + + rc = umempobj_get_mbusage(umm_pool, bkt_id, &allocated, &total); + assert_int_equal(rc, 0); + + print_message("[%s] %s %u used space: %lu/%lu\n", op_str, + bkt_id == UMEM_DEFAULT_MBKT_ID ? "Non-evictable" : "Evictable", + bkt_id, allocated, total); + + if (delta == INT64_MAX) + return allocated; + + if (delta == 0) + assert_true(allocated == prev_used); + else if (delta > 0) + assert_true(allocated > (prev_used + delta)); + else if (delta < 0) + assert_true(allocated <= (prev_used + delta)); + + return allocated; +} + +static void +reclaim_obj(struct io_test_args *arg, daos_unit_oid_t *oid, int oid_nr, daos_epoch_t *epoch) +{ + daos_epoch_range_t epr; + int i, rc; + + /* Punch object */ + for (i = 0; i < oid_nr; i++) { + rc = vos_obj_punch(arg->ctx.tc_co_hdl, *oid, (*epoch)++, 0, 0, NULL, 0, + NULL, NULL); + oid++; + assert_rc_equal(rc, 0); + } + + /* Aggregate punched object */ + epr.epr_lo = 0; + epr.epr_hi = (*epoch)++; + rc = vos_aggregate(arg->ctx.tc_co_hdl, &epr, NULL, NULL, 0); + assert_rc_equal(rc, 0); + + /* Wait GC done */ + gc_wait(); +} + +/* Update/punch object, re-open pool, verify space usage and bucket ID */ +static void +p2_basic_test(void **state) +{ + struct io_test_args *arg = *state; + struct vos_pool *pool = vos_hdl2pool(arg->ctx.tc_po_hdl); + struct vos_container *cont = vos_hdl2cont(arg->ctx.tc_co_hdl); + struct umem_cache *cache; + daos_unit_oid_t oid; + char dkey[UPDATE_DKEY_SIZE] = { 0 }; + char akey[UPDATE_AKEY_SIZE] = { 0 }; + char *buf; + daos_epoch_t epoch = 1; + daos_size_t io_size = 512; + struct vos_object *obj; + uint32_t bkt_id = 1, missed, loaded; + uint64_t used[2], ne_init; + int rc; + + dts_key_gen(dkey, UPDATE_DKEY_SIZE, UPDATE_DKEY); + dts_key_gen(akey, UPDATE_AKEY_SIZE, UPDATE_AKEY); + + D_ALLOC(buf, io_size); + assert_non_null(buf); + dts_buf_render(buf, io_size); + + /* Get initial space usage */ + used[0] = verify_space(pool, UMEM_DEFAULT_MBKT_ID, 0, INT64_MAX, "Init"); + ne_init = used[0]; + + /* Update object1 */ + oid = dts_unit_oid_gen(0, 0); + rc = obj_rw(arg, oid, dkey, akey, DAOS_IOD_SINGLE, epoch++, io_size, buf, true); + assert_rc_equal(rc, 0); + + /* Verify object1 bucket ID */ + rc = vos_obj_acquire(cont, oid, false, &obj); + assert_rc_equal(rc, 0); + + assert_int_equal(obj->obj_bkt_ids[0], bkt_id); + + vos_obj_release(obj, 0, true); + + /* Verify 
space usage */ + used[0] = verify_space(pool, UMEM_DEFAULT_MBKT_ID, used[0], 1, "Object1"); + used[1] = verify_space(pool, bkt_id, 0, INT64_MAX, "Object1"); + + /* Reclaim object1 */ + reclaim_obj(arg, &oid, 1, &epoch); + + /* Verify space usage */ + used[0] = verify_space(pool, UMEM_DEFAULT_MBKT_ID, used[0], -1, "Reclaim object1"); + used[1] = verify_space(pool, bkt_id, used[1], -used[1], "Reclaim object1"); + + /* Update object2 */ + oid = dts_unit_oid_gen(0, 0); + rc = obj_rw(arg, oid, dkey, akey, DAOS_IOD_ARRAY, epoch++, io_size, buf, true); + assert_rc_equal(rc, 0); + + /* Verify object2 bucket ID */ + rc = vos_obj_acquire(cont, oid, false, &obj); + assert_rc_equal(rc, 0); + + assert_int_equal(obj->obj_bkt_ids[0], bkt_id); + + /* Verify space usage */ + used[0] = verify_space(pool, UMEM_DEFAULT_MBKT_ID, used[0], 1, "Object2.1"); + used[1] = verify_space(pool, bkt_id, used[1], io_size, "Object2.1"); + + /* Update object2 again */ + dts_key_gen(dkey, UPDATE_DKEY_SIZE, UPDATE_DKEY); + dts_key_gen(akey, UPDATE_AKEY_SIZE, UPDATE_AKEY); + rc = obj_rw(arg, oid, dkey, akey, DAOS_IOD_SINGLE, epoch++, io_size, buf, true); + assert_rc_equal(rc, 0); + + /* Verify object2 bucket ID */ + assert_int_equal(obj->obj_bkt_ids[0], bkt_id); + + vos_obj_release(obj, 0, true); + + /* Verify space usage */ + used[0] = verify_space(pool, UMEM_DEFAULT_MBKT_ID, used[0], 0, "Object2.2"); + used[1] = verify_space(pool, bkt_id, used[1], io_size, "Object2.2"); + + /* Re-open pool */ + arg->checkpoint = true; + wal_pool_refill(arg); + pool = vos_hdl2pool(arg->ctx.tc_po_hdl); + cont = vos_hdl2cont(arg->ctx.tc_co_hdl); + cache = vos_pool2store(pool)->cache; + arg->checkpoint = false; + + missed = cache->ca_cache_stats[UMEM_CACHE_STATS_MISS]; + loaded = cache->ca_cache_stats[UMEM_CACHE_STATS_LOAD]; + + /* Verify NE space usage */ + used[0] = verify_space(pool, UMEM_DEFAULT_MBKT_ID, used[0], 0, "Re-open"); + + /* Fetch object2 */ + rc = obj_rw(arg, oid, dkey, akey, DAOS_IOD_SINGLE, DAOS_EPOCH_MAX, io_size, buf, false); + assert_rc_equal(rc, 0); + + dump_cache_stats(pool, "after re-open & fetch"); + /* Verify cache stats */ + assert_int_equal(cache->ca_cache_stats[UMEM_CACHE_STATS_MISS], missed + 1); + assert_int_equal(cache->ca_cache_stats[UMEM_CACHE_STATS_LOAD], loaded + 1); + + /* Verify E space usage */ + used[1] = verify_space(pool, bkt_id, used[1], 0, "Re-open"); + + /* Verify object2 bucket ID */ + rc = vos_obj_acquire(cont, oid, false, &obj); + assert_rc_equal(rc, 0); + + assert_int_equal(obj->obj_bkt_ids[0], bkt_id); + vos_obj_release(obj, 0, true); + + /* Reclaim object2 */ + reclaim_obj(arg, &oid, 1, &epoch); + + /* Verify space usage */ + used[0] = verify_space(pool, UMEM_DEFAULT_MBKT_ID, used[0], -1, "Reclaim object2"); + used[1] = verify_space(pool, bkt_id, used[1], -used[1], "Reclaim object2"); + assert_int_equal(used[0], ne_init); + + D_FREE(buf); +} + +static int +fill_one(struct io_test_args *arg, daos_unit_oid_t oid, char *dkey, char *akey, + daos_epoch_t *epoch, daos_size_t io_size, char *buf, uint32_t *ret_id) +{ + struct vos_object *obj; + struct vos_container *cont = vos_hdl2cont(arg->ctx.tc_co_hdl); + uint32_t bkt_id = UMEM_DEFAULT_MBKT_ID; + uint64_t used, total = 0, prev_used = 0; + daos_size_t written = 0; + int rc = 0; + + while (written < MDTEST_MB_SIZE) { + rc = obj_rw(arg, oid, dkey, akey, DAOS_IOD_ARRAY, (*epoch)++, io_size, buf, true); + if (rc != 0) + break; + written += io_size; + + if (bkt_id == UMEM_DEFAULT_MBKT_ID) { + rc = vos_obj_acquire(cont, oid, false, &obj); + 
assert_rc_equal(rc, 0); + + bkt_id = obj->obj_bkt_ids[0]; + vos_obj_release(obj, 0, false); + /* All evictable buckets are used up */ + if (bkt_id == UMEM_DEFAULT_MBKT_ID) { + rc = 1; + break; + } + } + + rc = umempobj_get_mbusage(vos_cont2umm(cont)->umm_pool, bkt_id, &used, &total); + assert_int_equal(rc, 0); + assert_int_equal(total, MDTEST_MB_SIZE); + + /* This evictable bucket is filled up */ + if (used == prev_used) + break; + + prev_used = used; + } + + print_message("Filled bucket:%u total:%lu, used:%lu/%lu, written:%lu, rc:%d\n", + bkt_id, total, used, prev_used, written, rc); + *ret_id = bkt_id; + + return rc; +} + +/* Fill all evictable buckets */ +static void +p2_fill_test(void **state) +{ + struct io_test_args *arg = *state; + struct vos_pool *pool = vos_hdl2pool(arg->ctx.tc_po_hdl); + struct umem_cache *cache = vos_pool2store(pool)->cache; + daos_unit_oid_t oids[MDTEST_MAX_EMB_CNT]; + daos_epoch_t epoch = 1; + char dkey[UPDATE_DKEY_SIZE] = { 0 }; + char akey[UPDATE_AKEY_SIZE] = { 0 }; + char *buf; + uint32_t missed, loaded, evicted; + daos_size_t io_size = 800; + uint32_t bkt_ids[MDTEST_MAX_EMB_CNT]; + uint64_t bkt_used[MDTEST_MAX_EMB_CNT]; + uint64_t ne_used, ne_init; + int i, rc, obj_cnt = 0; + + dts_key_gen(dkey, UPDATE_DKEY_SIZE, UPDATE_DKEY); + dts_key_gen(akey, UPDATE_AKEY_SIZE, UPDATE_AKEY); + + D_ALLOC(buf, io_size); + assert_non_null(buf); + dts_buf_render(buf, io_size); + + /* Get initial space usage */ + ne_init = verify_space(pool, UMEM_DEFAULT_MBKT_ID, 0, INT64_MAX, "Init"); + + /* Fill up pool */ + while (obj_cnt < MDTEST_MAX_EMB_CNT) { + oids[obj_cnt] = dts_unit_oid_gen(0, 0); + rc = fill_one(arg, oids[obj_cnt], dkey, akey, &epoch, io_size, buf, + &bkt_ids[obj_cnt]); + if (rc) + break; + bkt_used[obj_cnt] = verify_space(pool, bkt_ids[obj_cnt], 0, INT64_MAX, "Fill"); + + obj_cnt++; + print_message("%d objects are allocated.\n", obj_cnt); + + if (obj_cnt && (obj_cnt % 4 == 0)) + checkpoint_fn(&arg->ctx.tc_po_hdl); + } + assert_true(obj_cnt > 0); + + for (i = 0; i < obj_cnt; i++) + bkt_used[i] = verify_space(pool, bkt_ids[i], bkt_used[i], 0, "Filled"); + + missed = cache->ca_cache_stats[UMEM_CACHE_STATS_MISS]; + loaded = cache->ca_cache_stats[UMEM_CACHE_STATS_LOAD]; + evicted = cache->ca_cache_stats[UMEM_CACHE_STATS_EVICT]; + + /* Fetch first object to trigger cache miss and page evict */ + rc = obj_rw(arg, oids[0], dkey, akey, DAOS_IOD_ARRAY, DAOS_EPOCH_MAX, io_size, buf, false); + assert_rc_equal(rc, 0); + + dump_cache_stats(pool, "after fetch"); + assert_int_equal(cache->ca_cache_stats[UMEM_CACHE_STATS_MISS], missed + 1); + assert_int_equal(cache->ca_cache_stats[UMEM_CACHE_STATS_LOAD], loaded + 1); + assert_int_equal(cache->ca_cache_stats[UMEM_CACHE_STATS_EVICT], evicted + 1); + + /* Re-open pool */ + arg->checkpoint = true; + wal_pool_refill(arg); + pool = vos_hdl2pool(arg->ctx.tc_po_hdl); + cache = vos_pool2store(pool)->cache; + arg->checkpoint = false; + + missed = cache->ca_cache_stats[UMEM_CACHE_STATS_MISS]; + loaded = cache->ca_cache_stats[UMEM_CACHE_STATS_LOAD]; + + /* Fetch first object to trigger cache miss */ + rc = obj_rw(arg, oids[0], dkey, akey, DAOS_IOD_ARRAY, DAOS_EPOCH_MAX, io_size, buf, false); + assert_rc_equal(rc, 0); + + dump_cache_stats(pool, "after re-open & fetch"); + assert_int_equal(cache->ca_cache_stats[UMEM_CACHE_STATS_MISS], missed + 1); + assert_int_equal(cache->ca_cache_stats[UMEM_CACHE_STATS_LOAD], loaded + 1); + + ne_used = verify_space(pool, UMEM_DEFAULT_MBKT_ID, ne_init, 1, "Re-open"); + bkt_used[0] = verify_space(pool, 
bkt_ids[0], bkt_used[0], 0, "Re-open"); + + /* Reclaim all objects */ + reclaim_obj(arg, &oids[0], obj_cnt, &epoch); + dump_cache_stats(pool, "after reclaim objs"); + + /* Verify used space */ + ne_used = verify_space(pool, UMEM_DEFAULT_MBKT_ID, ne_used, -1, "Reclaim objs"); + assert_int_equal(ne_used, ne_init); + for (i = 0; i < obj_cnt; i++) + bkt_used[i] = verify_space(pool, bkt_ids[i], bkt_used[i], -bkt_used[i], + "Reclaim objs"); + + /* Close container */ + rc = vos_cont_close(arg->ctx.tc_co_hdl); + assert_rc_equal(rc, 0); + arg->ctx.tc_step = TCX_CO_CREATE; + + /* Destroy container */ + rc = vos_cont_destroy(arg->ctx.tc_po_hdl, arg->ctx.tc_co_uuid); + assert_rc_equal(rc, 0); + arg->ctx.tc_step = TCX_PO_CREATE_OPEN; + + gc_wait(); + + dump_cache_stats(pool, "after cont destroy"); + + ne_used = verify_space(pool, UMEM_DEFAULT_MBKT_ID, ne_used, -1, "Cont destroy"); + for (i = 0; i < obj_cnt; i++) + bkt_used[i] = verify_space(pool, bkt_ids[i], bkt_used[i], -bkt_used[i], + "Cont destroy"); + + D_FREE(buf); +} + static const struct CMUnitTest wal_tests[] = { {"WAL01: Basic pool/cont create/destroy test", wal_tst_pool_cont, NULL, NULL}, {"WAL02: Basic pool/cont create/destroy test with checkpointing", wal_tst_pool_cont, @@ -1285,6 +2497,25 @@ static const struct CMUnitTest wal_io_int_tests[] = { {"WAL24: Key query punch with subsequent update", wal_io_query_key_punch_update, NULL, NULL}, }; +static const struct CMUnitTest wal_MB_tests[] = { + {"WAL30: UMEM MB Basic Test", wal_mb_tests, setup_mb_io, teardown_mb_io}, + {"WAL31: UMEM MB EMB selection based on utilization Test", wal_mb_utilization_tests, + setup_mb_io, teardown_mb_io}, + {"WAL32: UMEM MB EMB eviction by other EMBs Test", wal_mb_emb_evicts_emb, setup_mb_io, + teardown_mb_io}, + {"WAL33: UMEM MB EMB eviction by NEMB expansion Test", wal_mb_nemb_evicts_emb, setup_mb_io, + teardown_mb_io}, + {"WAL34: UMEM MB garbage collection", wal_umempobj_block_reuse, setup_mb_io, teardown_mb_io}, + {"WAL35: UMEM MB checkpoint restart garbage collection", wal_umempobj_chkpt_block_reuse, + setup_mb_io, teardown_mb_io}, + {"WAL36: UMEM MB restart replay garbage collection", wal_umempobj_replay_block_reuse, + setup_mb_io, teardown_mb_io}, + {"WAL37: UMEM MB stats test ", wal_umempobj_mbusage_test, setup_mb_io, teardown_mb_io}, + {"WAL38: P2 basic", p2_basic_test, setup_mb_io, teardown_mb_io}, + {"WAL39: P2 fill evictable buckets", p2_fill_test, setup_mb_io, teardown_mb_io}, + {"WAL40: nemb pct test", wal_mb_nemb_pct, setup_mb_io_nembpct, teardown_mb_io_nembpct}, +}; + int run_wal_tests(const char *cfg) { @@ -1332,5 +2563,11 @@ run_wal_tests(const char *cfg) setup_wal_io, teardown_io); } } + + if (umempobj_get_backend_type() == DAOS_MD_BMEM_V2) { + dts_create_config(test_name, "Memory Bucket tests with WAL %s", cfg); + D_PRINT("Running %s\n", test_name); + rc += cmocka_run_group_tests_name(test_name, wal_MB_tests, NULL, NULL); + } return rc; } diff --git a/src/vos/tests/wal_ut.c b/src/vos/tests/wal_ut.c index f123a3990a0..32b4b4c9957 100644 --- a/src/vos/tests/wal_ut.c +++ b/src/vos/tests/wal_ut.c @@ -29,7 +29,7 @@ ut_mc_init(struct bio_ut_args *args, uint64_t meta_sz, uint64_t wal_sz, uint64_t int rc, ret; uuid_generate(args->bua_pool_id); - rc = bio_mc_create(args->bua_xs_ctxt, args->bua_pool_id, meta_sz, wal_sz, data_sz, 0); + rc = bio_mc_create(args->bua_xs_ctxt, args->bua_pool_id, 0, meta_sz, wal_sz, data_sz, 0, 0); if (rc) { D_ERROR("UT MC create failed. 
"DF_RC"\n", DP_RC(rc)); return rc; diff --git a/src/vos/vos_aggregate.c b/src/vos/vos_aggregate.c index 65d70dd7762..5064e74d730 100644 --- a/src/vos/vos_aggregate.c +++ b/src/vos/vos_aggregate.c @@ -984,7 +984,7 @@ reserve_segment(struct vos_object *obj, struct agg_io_context *io, if (vos_io_scm(vos_obj2pool(obj), DAOS_IOD_ARRAY, size, VOS_IOS_AGGREGATION)) { /** Store on SCM */ - off = vos_reserve_scm(obj->obj_cont, io->ic_rsrvd_scm, size); + off = vos_reserve_scm(obj->obj_cont, io->ic_rsrvd_scm, size, obj); if (UMOFF_IS_NULL(off)) { now = daos_gettime_coarse(); if (now - obj->obj_cont->vc_agg_nospc_ts > VOS_NOSPC_ERROR_INTVL) { diff --git a/src/vos/vos_common.c b/src/vos/vos_common.c index cd2f2a5a693..e19768d4c03 100644 --- a/src/vos/vos_common.c +++ b/src/vos/vos_common.c @@ -693,8 +693,7 @@ static inline int vos_metrics_count(void) { return vea_metrics_count() + - (sizeof(struct vos_agg_metrics) + sizeof(struct vos_space_metrics) + - sizeof(struct vos_chkpt_metrics)) / sizeof(struct d_tm_node_t *); + sizeof(struct vos_pool_metrics) / sizeof(struct d_tm_node_t *); } static void @@ -874,6 +873,9 @@ vos_metrics_alloc(const char *path, int tgt_id) /* Initialize metrics for WAL */ vos_wal_metrics_init(&vp_metrics->vp_wal_metrics, path, tgt_id); + /* Initialize metrcis for umem cache */ + vos_cache_metrics_init(&vp_metrics->vp_cache_metrics, path, tgt_id); + return vp_metrics; } diff --git a/src/vos/vos_container.c b/src/vos/vos_container.c index 6e6cbeeeb2a..a5a55a902b9 100644 --- a/src/vos/vos_container.c +++ b/src/vos/vos_container.c @@ -63,7 +63,7 @@ cont_df_rec_free(struct btr_instance *tins, struct btr_record *rec, void *args) cont_df = umem_off2ptr(&tins->ti_umm, rec->rec_off); vos_ts_evict(&cont_df->cd_ts_idx, VOS_TS_TYPE_CONT, vos_pool->vp_sysdb); - return gc_add_item(vos_pool, DAOS_HDL_INVAL, GC_CONT, rec->rec_off, 0); + return gc_add_item(vos_pool, DAOS_HDL_INVAL, GC_CONT, rec->rec_off, NULL); } static int @@ -92,6 +92,17 @@ cont_df_rec_alloc(struct btr_instance *tins, d_iov_t *key_iov, cont_df = umem_off2ptr(&tins->ti_umm, offset); uuid_copy(cont_df->cd_id, ukey->uuid); + cont_df->cd_ext = umem_zalloc(&tins->ti_umm, sizeof(struct vos_cont_ext_df)); + if (UMOFF_IS_NULL(cont_df->cd_ext)) { + D_ERROR("Failed to allocate cont df extension.\n"); + rc = -DER_NOSPACE; + goto failed; + } + + rc = gc_init_cont(&tins->ti_umm, cont_df); + if (rc) + goto failed; + rc = dbtree_create_inplace_ex(VOS_BTR_OBJ_TABLE, 0, VOS_OBJ_ORDER, &pool->vp_uma, &cont_df->cd_obj_root, DAOS_HDL_INVAL, pool, &hdl); @@ -101,12 +112,13 @@ cont_df_rec_alloc(struct btr_instance *tins, d_iov_t *key_iov, } dbtree_close(hdl); - gc_init_cont(&tins->ti_umm, cont_df); args->ca_cont_df = cont_df; rec->rec_off = offset; return 0; failed: /* Ignore umem_free failure. 
*/ + if (!UMOFF_IS_NULL(cont_df->cd_ext)) + umem_free(&tins->ti_umm, cont_df->cd_ext); + umem_free(&tins->ti_umm, offset); return rc; } @@ -191,6 +203,7 @@ cont_free_internal(struct vos_container *cont) if (!d_list_empty(&cont->vc_gc_link)) d_list_del(&cont->vc_gc_link); + gc_close_cont(cont); for (i = 0; i < VOS_IOS_CNT; i++) { if (cont->vc_hint_ctxt[i]) @@ -384,6 +397,9 @@ vos_cont_open(daos_handle_t poh, uuid_t co_uuid, daos_handle_t *coh) D_INIT_LIST_HEAD(&cont->vc_dtx_act_list); cont->vc_dtx_committed_count = 0; cont->vc_solo_dtx_epoch = d_hlc_get(); + rc = gc_open_cont(cont); + if (rc) + D_GOTO(exit, rc); gc_check_cont(cont); /* Cache this btr object ID in container handle */ diff --git a/src/vos/vos_dtx.c b/src/vos/vos_dtx.c index 0a325088a77..86c100f4739 100644 --- a/src/vos/vos_dtx.c +++ b/src/vos/vos_dtx.c @@ -311,16 +311,38 @@ dtx_act_ent_update(struct btr_instance *tins, struct btr_record *rec, if (unlikely(!dae_old->dae_aborted)) { /* - * XXX: There are two possible reasons for that: - * - * 1. Client resent the RPC but without set 'RESEND' flag. - * 2. Client reused the DTX ID for different modifications. - * - * Currently, the 1st case is more suspected. + * If the new entry and the old entry were for the same transaction, the RPC for + * the new one would carry the 'RESEND' flag, which would have caused the old one + * to be aborted before we get here. So it is quite possible that the new one and + * the old one belong to different transactions. */ - D_ERROR("The TX ID "DF_DTI" may be reused for epoch "DF_X64" vs "DF_X64"\n", - DP_DTI(&DAE_XID(dae_old)), DAE_EPOCH(dae_old), DAE_EPOCH(dae_new)); - return -DER_TX_ID_REUSED; + if (DAE_EPOCH(dae_old) < DAE_EPOCH(dae_new)) { + D_ERROR("The TX ID "DF_DTI" may be reused for epoch "DF_X64" vs "DF_X64"\n", + DP_DTI(&DAE_XID(dae_old)), DAE_EPOCH(dae_old), DAE_EPOCH(dae_new)); + return -DER_TX_ID_REUSED; + } + + /* + * If the old entry has a higher epoch, it is quite possible that the resent RPC + * was handled before the original RPC (corresponding to 'dae_new'). Return + * -DER_INPROGRESS to make the RPC sponsor retry with the 'RESEND' flag; the + * related RPC handler logic will then handle this case. + */ + if (DAE_EPOCH(dae_old) > DAE_EPOCH(dae_new)) { + D_ERROR("Resent RPC may be handled before original one for DTX "DF_DTI + " with epoch "DF_X64" vs "DF_X64"\n", + DP_DTI(&DAE_XID(dae_old)), DAE_EPOCH(dae_old), DAE_EPOCH(dae_new)); + return -DER_INPROGRESS; + } + + /* + * If the two entries use the same epoch, it may be caused by repeated RPCs + * from different sources, such as multiple relay engines forwarding the same + * RPC to the current target. Notify the caller of such a buggy case. + */ + D_ERROR("Receive repeated DTX "DF_DTI" with epoch "DF_X64"\n", + DP_DTI(&DAE_XID(dae_old)), DAE_EPOCH(dae_old)); + return -DER_MISC; } rec->rec_off = umem_ptr2off(&tins->ti_umm, dae_new); @@ -1171,16 +1193,20 @@ vos_dtx_check_availability(daos_handle_t coh, uint32_t entry, } if (intent == DAOS_INTENT_PURGE) { - uint32_t age = d_hlc_age2sec(DAE_XID(dae).dti_hlc); + uint64_t now = daos_gettime_coarse(); /* * The DTX entry still references related data record, * then we cannot (vos) aggregate related data record. + * Report a warning at most once per 10 seconds to avoid flooding the log.
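The three-way epoch comparison introduced in dtx_act_ent_update() above boils down to a small decision rule. The helper below merely restates that rule for clarity; the name is hypothetical, the DER_* codes are the ones used in the hunk, and this is not part of the patch.

static int
dtx_collision_rc(uint64_t old_epoch, uint64_t new_epoch)
{
	if (old_epoch < new_epoch)	/* DTX ID reused for a newer transaction */
		return -DER_TX_ID_REUSED;
	if (old_epoch > new_epoch)	/* resent RPC handled before the original one */
		return -DER_INPROGRESS;
	return -DER_MISC;		/* duplicate RPC with the same epoch */
}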
*/ - if (age >= DAOS_AGG_THRESHOLD) - D_WARN("DTX "DF_DTI" (state:%u, age:%u) still references the data, " - "cannot be (VOS) aggregated\n", - DP_DTI(&DAE_XID(dae)), vos_dtx_status(dae), age); + if (now - cont->vc_agg_busy_ts > 10) { + D_WARN("DTX "DF_DTI" (state:%u, flags:%x, age:%u) still references " + "the modification, cannot be (VOS) aggregated\n", + DP_DTI(&DAE_XID(dae)), vos_dtx_status(dae), DAE_FLAGS(dae), + (unsigned int)d_hlc_age2sec(DAE_XID(dae).dti_hlc)); + cont->vc_agg_busy_ts = now; + } return ALB_AVAILABLE_DIRTY; } @@ -1908,8 +1934,13 @@ vos_dtx_check(daos_handle_t coh, struct dtx_id *dti, daos_epoch_t *epoch, daos_epoch_t e = *epoch; *epoch = DAE_EPOCH(dae); - if (e != 0 && e != DAE_EPOCH(dae)) - return -DER_MISMATCH; + if (e != 0) { + if (e > DAE_EPOCH(dae)) + return -DER_MISMATCH; + + if (e < DAE_EPOCH(dae)) + return -DER_TX_RESTART; + } } return vos_dae_is_prepare(dae) ? DTX_ST_PREPARED : DTX_ST_INITED; @@ -2205,14 +2236,139 @@ vos_dtx_post_handle(struct vos_container *cont, } } +static inline void +dtx_unpin(struct vos_container *cont, struct umem_pin_handle *pin_hdl) +{ + struct vos_pool *pool = vos_cont2pool(cont); + + if (pin_hdl != NULL) + umem_cache_unpin(vos_pool2store(pool), pin_hdl); +} + +static inline int +bkts_add_rec(struct vos_pool *pool, struct vos_bkt_array *bkts, umem_off_t rec_off) +{ + uint32_t bkt_id; + int rc; + + if (UMOFF_IS_NULL(rec_off)) + return 0; + + bkt_id = umem_get_mb_from_offset(vos_pool2umm(pool), rec_off); + if (bkt_id == UMEM_DEFAULT_MBKT_ID) + return 0; + + rc = vos_bkt_array_add(bkts, bkt_id); + if (rc) + DL_ERROR(rc, "Failed to add %u into bucket array.", bkt_id); + + return rc; +} + +static int +bkts_add_dae(struct vos_pool *pool, struct vos_bkt_array *bkts_in, struct vos_dtx_act_ent *dae) +{ + struct vos_bkt_array local_bkts, *bkts; + umem_off_t rec_off; + int i, count, rc = 0; + + vos_bkt_array_init(&local_bkts); + bkts = bkts_in->vba_cnt == 0 ? 
bkts_in : &local_bkts; + + if (dae->dae_records != NULL) { + D_ASSERT(DAE_REC_CNT(dae) > DTX_INLINE_REC_CNT); + + for (i = DAE_REC_CNT(dae) - DTX_INLINE_REC_CNT - 1; i >= 0; i--) { + rec_off = umem_off2offset(dae->dae_records[i]); + rc = bkts_add_rec(pool, bkts, rec_off); + if (rc) + goto out; + } + count = DTX_INLINE_REC_CNT; + } else { + count = DAE_REC_CNT(dae); + } + + for (i = count - 1; i >= 0; i--) { + rec_off = umem_off2offset(DAE_REC_INLINE(dae)[i]); + rc = bkts_add_rec(pool, bkts, rec_off); + if (rc) + goto out; + } + + /* Stop adding the dae when current dae not located in the subset of @bkts_in */ + if (local_bkts.vba_cnt != 0 && !vos_bkt_array_subset(bkts_in, &local_bkts)) + rc = 1; +out: + vos_bkt_array_fini(&local_bkts); + return rc; +} + +static int +dtx_commit_pin(struct vos_container *cont, struct dtx_id dtis[], int count, int *pinned, + struct umem_pin_handle **pin_hdl) +{ + struct vos_dtx_act_ent *dae; + struct vos_bkt_array bkts; + d_iov_t kiov, riov; + int i, rc; + + *pinned = count; + *pin_hdl = NULL; + + if (!vos_pool_is_evictable(vos_cont2pool(cont))) + return 0; + + vos_bkt_array_init(&bkts); + + for (i = 0; i < count; i++) { + d_iov_set(&kiov, &dtis[i], sizeof(struct dtx_id)); + d_iov_set(&riov, NULL, 0); + + rc = dbtree_lookup(cont->vc_dtx_active_hdl, &kiov, &riov); + if (rc == -DER_NONEXIST) { + rc = 0; + continue; + } else if (rc) { + DL_ERROR(rc, "Failed to lookup DTX active table."); + goto out; + } + + dae = riov.iov_buf; + D_ASSERT(dae->dae_preparing == 0); + + if (vos_dae_is_abort(dae) || dae->dae_committed || dae->dae_committing || + dae->dae_need_release == 0) + continue; + + rc = bkts_add_dae(vos_cont2pool(cont), &bkts, dae); + if (rc) { + if (rc < 0) { + DL_ERROR(rc, "Failed to add DTX to bucket array."); + goto out; + } + *pinned = i; + break; + } + } + + rc = vos_bkt_array_pin(vos_cont2pool(cont), &bkts, pin_hdl); + if (rc) + DL_ERROR(rc, "Failed to pin buckets."); +out: + vos_bkt_array_fini(&bkts); + return rc; +} + int vos_dtx_commit(daos_handle_t coh, struct dtx_id dtis[], int count, bool rm_cos[]) { struct vos_dtx_act_ent **daes = NULL; struct vos_dtx_cmt_ent **dces = NULL; struct vos_container *cont; - int committed = 0; - int rc = 0; + struct umem_pin_handle *pin_hdl; + int tot_committed = 0, committed, pinned; + int idx = 0, rc = 0; D_ASSERT(count > 0); @@ -2227,24 +2383,73 @@ vos_dtx_commit(daos_handle_t coh, struct dtx_id dtis[], int count, bool rm_cos[] cont = vos_hdl2cont(coh); D_ASSERT(cont != NULL); +pin_objects: + rc = dtx_commit_pin(cont, &dtis[idx], count, &pinned, &pin_hdl); + if (rc) { + DL_ERROR(rc, "Pin objects failed before DTX commit."); + goto out; + } + + D_ASSERT(pinned > 0 && pinned <= count); + count -= pinned; + /* Commit multiple DTXs via single local transaction. */ rc = umem_tx_begin(vos_cont2umm(cont), NULL); if (rc == 0) { - committed = vos_dtx_commit_internal(cont, dtis, count, 0, rm_cos, daes, dces); + committed = vos_dtx_commit_internal(cont, &dtis[idx], pinned, 0, + rm_cos != NULL ? &rm_cos[idx] : NULL, + &daes[idx], &dces[idx]); if (committed >= 0) { rc = umem_tx_commit(vos_cont2umm(cont)); D_ASSERT(rc == 0); + tot_committed += committed; } else { rc = umem_tx_abort(vos_cont2umm(cont), committed); } - vos_dtx_post_handle(cont, daes, dces, count, false, rc != 0); + vos_dtx_post_handle(cont, &daes[idx], &dces[idx], pinned, false, rc != 0); } + dtx_unpin(cont, pin_hdl); + + if (count > 0) { + idx += pinned; + goto pin_objects; + } out: D_FREE(daes); D_FREE(dces); - return rc < 0 ? rc : committed; + return rc < 0 ? 
rc : tot_committed; +} + +static int +dtx_abort_pin(struct vos_container *cont, struct vos_dtx_act_ent *dae, + struct umem_pin_handle **pin_hdl) +{ + struct vos_bkt_array bkts; + int rc; + + if (!vos_pool_is_evictable(vos_cont2pool(cont))) + return 0; + + if (dae->dae_need_release == 0) + return 0; + + vos_bkt_array_init(&bkts); + rc = bkts_add_dae(vos_cont2pool(cont), &bkts, dae); + if (rc) { + D_ASSERT(rc < 0); + DL_ERROR(rc, "Failed to add DTX to buckets."); + goto out; + } + + rc = vos_bkt_array_pin(vos_cont2pool(cont), &bkts, pin_hdl); + if (rc) + DL_ERROR(rc, "Failed to pin buckets."); +out: + vos_bkt_array_fini(&bkts); + return rc; + } int @@ -2252,8 +2457,13 @@ vos_dtx_abort_internal(struct vos_container *cont, struct vos_dtx_act_ent *dae, { struct dtx_handle *dth = dae->dae_dth; struct umem_instance *umm; + struct umem_pin_handle *pin_hdl = NULL; int rc; + rc = dtx_abort_pin(cont, dae, &pin_hdl); + if (rc) + goto out; + umm = vos_cont2umm(cont); rc = umem_tx_begin(umm, NULL); if (rc != 0) @@ -2294,6 +2504,8 @@ vos_dtx_abort_internal(struct vos_container *cont, struct vos_dtx_act_ent *dae, */ out: + dtx_unpin(cont, pin_hdl); + if (rc == 0 || force) vos_dtx_post_handle(cont, &dae, NULL, 1, true, false); else if (rc != 0) @@ -3044,6 +3256,11 @@ vos_dtx_attach(struct dtx_handle *dth, bool persistent, bool exist) } } + /* + * Doesn't need to pin the object before starting tx, since the DTX commit from + * following vos_dtx_prepared() is for read-only DTX transaction, no object data + * will be accessed during DTX commit. + */ if (persistent) { rc = umem_tx_begin(umm, NULL); if (rc != 0) @@ -3298,6 +3515,13 @@ vos_dtx_local_begin(struct dtx_handle *dth, daos_handle_t poh) pool = vos_hdl2pool(poh); umm = vos_pool2umm(pool); + if (vos_pool_is_evictable(pool)) { + D_ERROR("VOS local tx doesn't support evictable pool:"DF_UUID"\n", + DP_UUID(pool->vp_id)); + rc = -DER_NOTSUPPORTED; + goto error; + } + rc = vos_tx_begin(dth, umm, pool->vp_sysdb); if (rc != 0) { D_ERROR("Failed to start transaction: rc=" DF_RC "\n", DP_RC(rc)); diff --git a/src/vos/vos_gc.c b/src/vos/vos_gc.c index 0937b883f33..5d5383ed766 100644 --- a/src/vos/vos_gc.c +++ b/src/vos/vos_gc.c @@ -1,5 +1,5 @@ /** - * (C) Copyright 2019-2023 Intel Corporation. + * (C) Copyright 2019-2024 Intel Corporation. 
* * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -11,6 +11,7 @@ #define D_LOGFAC DD_FAC(vos) #include +#include #include #include #include "vos_internal.h" @@ -74,12 +75,26 @@ struct vos_gc { */ static int gc_drain_btr(struct vos_gc *gc, struct vos_pool *pool, daos_handle_t coh, - struct btr_root *root, int *credits, bool *empty) + struct vos_gc_item *item, struct btr_root *root, int *credits, bool *empty) { - daos_handle_t toh; - int rc; + struct vos_object dummy_obj = { 0 }; + struct vos_container dummy_cont = { 0 }; + daos_handle_t toh; + void *priv; + int rc, i; + + if (gc->gc_type == GC_CONT) { + priv = pool; + } else { + dummy_cont.vc_pool = pool; + dummy_obj.obj_cont = &dummy_cont; + dummy_obj.obj_bkt_alloted = 1; + for (i = 0; i < VOS_GC_BKTS_MAX; i++) + dummy_obj.obj_bkt_ids[i] = item->it_bkt_ids[i]; + priv = &dummy_obj; + } - rc = dbtree_open_inplace_ex(root, &pool->vp_uma, coh, pool, &toh); + rc = dbtree_open_inplace_ex(root, &pool->vp_uma, coh, priv, &toh); if (rc == -DER_NONEXIST) { /* empty tree */ *empty = true; return 0; @@ -115,7 +130,7 @@ gc_drain_evt(struct vos_gc *gc, struct vos_pool *pool, daos_handle_t coh, daos_handle_t toh; int rc; - vos_evt_desc_cbs_init(&cbs, pool, coh); + vos_evt_desc_cbs_init(&cbs, pool, coh, NULL); rc = evt_open(root, &pool->vp_uma, &cbs, &toh); if (rc == -DER_NONEXIST) { *empty = true; @@ -126,7 +141,7 @@ gc_drain_evt(struct vos_gc *gc, struct vos_pool *pool, daos_handle_t coh, D_DEBUG(DB_TRACE, "drain %s evtree, creds=%d\n", gc->gc_name, *credits); rc = evt_drain(toh, credits, empty); - D_ASSERT(evt_close(toh) == 0); + evt_close(toh); if (rc) goto failed; @@ -160,7 +175,7 @@ gc_drain_key(struct vos_gc *gc, struct vos_pool *pool, daos_handle_t coh, } if (key->kr_bmap & KREC_BF_BTR) { - rc = gc_drain_btr(gc, pool, coh, &key->kr_btr, credits, empty); + rc = gc_drain_btr(gc, pool, coh, item, &key->kr_btr, credits, empty); } else if (key->kr_bmap & KREC_BF_EVT) { D_ASSERT(gc->gc_type == GC_AKEY); @@ -195,7 +210,7 @@ gc_free_dkey(struct vos_gc *gc, struct vos_pool *pool, daos_handle_t coh, struct D_ASSERT(krec->kr_bmap & KREC_BF_DKEY); if (krec->kr_bmap & KREC_BF_NO_AKEY) - gc_add_item(pool, coh, GC_AKEY, item->it_addr, item->it_args); + gc_add_item(pool, coh, GC_AKEY, item->it_addr, &item->it_bkt_ids[0]); else umem_free(&pool->vp_umm, item->it_addr); return 0; @@ -211,7 +226,7 @@ gc_drain_obj(struct vos_gc *gc, struct vos_pool *pool, daos_handle_t coh, { struct vos_obj_df *obj = umem_off2ptr(&pool->vp_umm, item->it_addr); - return gc_drain_btr(gc, pool, coh, &obj->vo_tree, credits, empty); + return gc_drain_btr(gc, pool, coh, item, &obj->vo_tree, credits, empty); } static int @@ -294,20 +309,29 @@ gc_drain_cont(struct vos_gc *gc, struct vos_pool *pool, daos_handle_t coh, return rc; /** Indicate to caller that we've taken over container bags */ - return 1; + if (!vos_pool_is_evictable(pool)) + return 1; } D_ASSERT(daos_handle_is_inval(coh)); - return gc_drain_btr(gc, pool, coh, &cont->cd_obj_root, - credits, empty); + return gc_drain_btr(gc, pool, coh, item, &cont->cd_obj_root, credits, empty); } static int gc_free_cont(struct vos_gc *gc, struct vos_pool *pool, daos_handle_t coh, struct vos_gc_item *item) { - int rc; + struct vos_cont_df *cd = umem_off2ptr(&pool->vp_umm, item->it_addr); + int rc; - rc = vos_dtx_table_destroy(&pool->vp_umm, umem_off2ptr(&pool->vp_umm, item->it_addr)); + if (!UMOFF_IS_NULL(cd->cd_ext)) { + rc = umem_free(&pool->vp_umm, cd->cd_ext); + if (rc) { + DL_ERROR(rc, "Failed to free cont_df extension"); + return rc; + } 
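/*
 * Illustrative sketch (not part of this patch): with gc_add_item() now taking a
 * bucket-ID array, callers that free object data forward the IDs recorded in the
 * durable entry (as gc_free_dkey() above does with &item->it_bkt_ids[0]), while
 * callers with no evict-able bucket association pass NULL and the new item is
 * tagged with UMEM_DEFAULT_MBKT_ID. queue_cont_for_gc() below is a hypothetical
 * helper, shown only to demonstrate the NULL case.
 */
static int
queue_cont_for_gc(struct vos_pool *pool, umem_off_t cont_off)
{
	/* Containers always live in the default (non-evictable) bucket */
	return gc_add_item(pool, DAOS_HDL_INVAL, GC_CONT, cont_off, NULL);
}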
+ } + + rc = vos_dtx_table_destroy(&pool->vp_umm, cd); if (rc == 0) rc = umem_free(&pool->vp_umm, item->it_addr); @@ -369,19 +393,102 @@ gc_type2bin(struct vos_pool *pool, struct vos_container *cont, return &cont->vc_cont_df->cd_gc_bins[type]; } +static int +gc_bkt2bins(uint32_t *bkt_id, struct vos_gc_info *gc_info, bool create, bool try_next, + struct vos_gc_bin_df **bins_ret) +{ + struct vos_gc_bin_df dummy_bins[GC_CONT]; + d_iov_t key, key_out, val, val_out; + uint64_t *new_id, key_id = *bkt_id; + int probe_op = try_next ? BTR_PROBE_FIRST : BTR_PROBE_EQ; + int i, rc; + + D_ASSERT(try_next || *bkt_id != UMEM_DEFAULT_MBKT_ID); + D_ASSERT(daos_handle_is_valid(gc_info->gi_bins_btr)); + + /* Fetch the in-tree record */ + d_iov_set(&key, &key_id, sizeof(key_id)); + d_iov_set(&key_out, NULL, 0); + d_iov_set(&val_out, NULL, 0); + + rc = dbtree_fetch(gc_info->gi_bins_btr, probe_op, DAOS_INTENT_DEFAULT, &key, + &key_out, &val_out); + if (rc && rc != -DER_NONEXIST) { + DL_ERROR(rc, "Failed to lookup GC bins for bkt_id:%u", *bkt_id); + return rc; + } + + if (rc == 0) { + *bins_ret = (struct vos_gc_bin_df *)val_out.iov_buf; + new_id = (uint64_t *)key_out.iov_buf; + D_ASSERT(new_id && (try_next || *bkt_id == *new_id)); + *bkt_id = (uint32_t)*new_id; + } else if (create) { + D_ASSERT(!try_next); + memset(&dummy_bins[0], 0, sizeof(dummy_bins)); + for (i = 0; i < GC_CONT; i++) { + dummy_bins[i].bin_bag_first = UMOFF_NULL; + dummy_bins[i].bin_bag_last = UMOFF_NULL; + dummy_bins[i].bin_bag_size = gc_bag_size; + dummy_bins[i].bin_bag_nr = 0; + } + + d_iov_set(&val, &dummy_bins[0], sizeof(dummy_bins)); + d_iov_set(&val_out, NULL, 0); + + rc = dbtree_upsert(gc_info->gi_bins_btr, BTR_PROBE_BYPASS, DAOS_INTENT_UPDATE, + &key, &val, &val_out); + if (rc != 0) { + DL_ERROR(rc, "Failed to insert GC bins for bkt_id:%u", *bkt_id); + return rc; + } + *bins_ret = (struct vos_gc_bin_df *)val_out.iov_buf; + } + + return rc; +} + +static int +gc_get_bin(struct vos_pool *pool, struct vos_container *cont, enum vos_gc_type type, + uint32_t bkt_id, struct vos_gc_bin_df **bin_df) +{ + struct vos_gc_bin_df *bins = NULL; + int rc; + + D_ASSERT(type < GC_MAX); + if (!vos_pool_is_evictable(pool) || bkt_id == UMEM_DEFAULT_MBKT_ID) { + *bin_df = gc_type2bin(pool, cont, type); + return 0; + } + + D_ASSERT(type < GC_CONT); + if (cont == NULL) + rc = gc_bkt2bins(&bkt_id, &pool->vp_gc_info, true, false, &bins); + else + rc = gc_bkt2bins(&bkt_id, &cont->vc_gc_info, true, false, &bins); + + if (rc == 0) { + D_ASSERT(bins != NULL); + *bin_df = &bins[type]; + } + + return rc; +} + /** * Free the first (oldest) garbage bag of a garbage bin unless it is also the * last (newest) bag. 
*/ static int -gc_bin_free_bag(struct umem_instance *umm, struct vos_container *cont, - struct vos_gc_bin_df *bin, umem_off_t bag_id) +gc_bin_free_bag(struct umem_instance *umm, struct vos_gc_bin_df *bin, umem_off_t bag_id, + bool free_last_bag) + { struct vos_gc_bag_df *bag = umem_off2ptr(umm, bag_id); int rc; D_ASSERT(bag_id == bin->bin_bag_first); - if (cont == NULL && bag_id == bin->bin_bag_last) { + if (!free_last_bag && bag_id == bin->bin_bag_last) { /* don't free the last bag, only reset it */ D_ASSERT(bin->bin_bag_nr == 1); rc = umem_tx_add_ptr(umm, bag, sizeof(*bag)); @@ -393,7 +500,7 @@ gc_bin_free_bag(struct umem_instance *umm, struct vos_container *cont, return rc; } - if (cont != NULL) { + if (free_last_bag) { D_ASSERT(bin->bin_bag_nr > 0); } else { D_ASSERT(bin->bin_bag_nr > 1); @@ -494,11 +601,10 @@ gc_bin_add_item(struct umem_instance *umm, struct vos_gc_bin_df *bin, return rc; } -static struct vos_gc_item * -gc_get_item(struct vos_gc *gc, struct vos_pool *pool, - struct vos_container *cont) + +static inline struct vos_gc_item * +bin_get_item(struct vos_pool *pool, struct vos_gc_bin_df *bin) { - struct vos_gc_bin_df *bin = gc_type2bin(pool, cont, gc->gc_type); struct vos_gc_bag_df *bag; bag = umem_off2ptr(&pool->vp_umm, bin->bin_bag_first); @@ -513,6 +619,14 @@ gc_get_item(struct vos_gc *gc, struct vos_pool *pool, return &bag->bag_items[bag->bag_item_first]; } +static inline struct vos_gc_item * +gc_get_item(struct vos_gc *gc, struct vos_pool *pool, struct vos_container *cont) +{ + struct vos_gc_bin_df *bin = gc_type2bin(pool, cont, gc->gc_type); + + return bin_get_item(pool, bin); +} + static int gc_drain_item(struct vos_gc *gc, struct vos_pool *pool, daos_handle_t coh, struct vos_gc_item *item, int *credits, bool *empty) @@ -554,10 +668,9 @@ gc_drain_item(struct vos_gc *gc, struct vos_pool *pool, daos_handle_t coh, } static int -gc_free_item(struct vos_gc *gc, struct vos_pool *pool, - struct vos_container *cont, struct vos_gc_item *item) +gc_free_item(struct vos_gc *gc, struct vos_pool *pool, struct vos_container *cont, + struct vos_gc_item *item, struct vos_gc_bin_df *bin) { - struct vos_gc_bin_df *bin = gc_type2bin(pool, cont, gc->gc_type); struct vos_gc_bag_df *bag; int first; struct vos_gc_item it; @@ -575,8 +688,8 @@ gc_free_item(struct vos_gc *gc, struct vos_pool *pool, if (first == bag->bag_item_last) { /* it's going to be a empty bag */ D_ASSERT(bag->bag_item_nr == 1); - rc = gc_bin_free_bag(&pool->vp_umm, cont, bin, - bin->bin_bag_first); + rc = gc_bin_free_bag(&pool->vp_umm, bin, bin->bin_bag_first, + (cont != NULL || item->it_bkt_ids[0] != UMEM_DEFAULT_MBKT_ID)); if (rc) goto failed; } else { @@ -627,12 +740,12 @@ gc_free_item(struct vos_gc *gc, struct vos_pool *pool, */ int gc_add_item(struct vos_pool *pool, daos_handle_t coh, - enum vos_gc_type type, umem_off_t item_off, uint64_t args) + enum vos_gc_type type, umem_off_t item_off, uint32_t *bkt_ids) { struct vos_container *cont = vos_hdl2cont(coh); - struct vos_gc_bin_df *bin = gc_type2bin(pool, cont, type); + struct vos_gc_bin_df *bin; struct vos_gc_item item; - int rc; + int rc, i; D_DEBUG(DB_TRACE, "Add %s addr="DF_X64"\n", gc_type2name(type), item_off); @@ -641,7 +754,16 @@ gc_add_item(struct vos_pool *pool, daos_handle_t coh, return 0; /* OK to ignore because the pool is being deleted */ item.it_addr = item_off; - item.it_args = args; + for (i = 0; i < VOS_GC_BKTS_MAX; i++) + item.it_bkt_ids[i] = bkt_ids ? 
bkt_ids[i] : UMEM_DEFAULT_MBKT_ID; + + rc = gc_get_bin(pool, cont, type, item.it_bkt_ids[0], &bin); + if (rc) { + DL_ERROR(rc, "Failed to get GC bin for type:%d, bkt_id:%u", + type, item.it_bkt_ids[0]); + return rc; + } + rc = gc_bin_add_item(&pool->vp_umm, bin, &item); if (rc) { D_ERROR("Failed to add item, pool=" DF_UUID ", rc=" DF_RC "\n", @@ -711,6 +833,7 @@ gc_reclaim_pool(struct vos_pool *pool, int *credits, bool *empty_ret) { struct vos_container *cont = gc_get_container(pool); struct vos_gc *gc = &gc_table[0]; /* start from akey */ + struct vos_gc_bin_df *bin; int creds = *credits; int rc; @@ -777,8 +900,9 @@ gc_reclaim_pool(struct vos_pool *pool, int *credits, bool *empty_ret) } if (empty && creds) { + bin = gc_type2bin(pool, cont, gc->gc_type); /* item can be released and removed from bin */ - rc = gc_free_item(gc, pool, cont, item); + rc = gc_free_item(gc, pool, cont, item, bin); if (rc) { D_ERROR("GC=%s free item error: "DF_RC"\n", gc->gc_name, DP_RC(rc)); break; @@ -812,7 +936,7 @@ gc_reclaim_pool(struct vos_pool *pool, int *credits, bool *empty_ret) "pool="DF_UUID", creds origin=%d, current=%d, rc=%s\n", DP_UUID(pool->vp_id), *credits, creds, d_errstr(rc)); - rc = umem_tx_end(&pool->vp_umm, rc); + rc = umem_tx_end(&pool->vp_umm, rc < 0 ? rc : 0); if (rc == 0) *credits = creds; @@ -833,6 +957,592 @@ gc_reclaim_pool(struct vos_pool *pool, int *credits, bool *empty_ret) return rc; } +static inline bool +bins_empty(struct vos_pool *pool, struct vos_gc_bin_df *bins) +{ + int i; + + for (i = 0; i < GC_CONT; i++) { + if (bin_get_item(pool, &bins[i]) != NULL) + return false; + } + return true; +} + +/* Add gc_bin[GC_CONT] from container bucket tree to pool bucket tree */ +static int +gc_add_bins(struct vos_pool *pool, struct vos_gc_bin_df *src_bins, uint32_t bkt_id) +{ + struct vos_gc_bin_df *dst_bins, dummy_bins[GC_CONT]; + daos_handle_t pool_btr = pool->vp_gc_info.gi_bins_btr; + d_iov_t key, val, val_out; + uint64_t key_id = bkt_id; + int i, rc, added = 0; + + D_ASSERT(daos_handle_is_valid(pool_btr)); + /* Fetch the in-tree record from pool */ + d_iov_set(&key, &key_id, sizeof(key_id)); + d_iov_set(&val_out, NULL, 0); + + rc = dbtree_fetch(pool_btr, BTR_PROBE_EQ, DAOS_INTENT_DEFAULT, &key, NULL, &val_out); + if (rc == -DER_NONEXIST) { + d_iov_set(&val, src_bins, sizeof(dummy_bins)); + rc = dbtree_upsert(pool_btr, BTR_PROBE_BYPASS, DAOS_INTENT_UPDATE, &key, &val, NULL); + if (rc) + DL_ERROR(rc, "Failed to add bins for bkt_id:%u", bkt_id); + return rc; + } else if (rc) { + DL_ERROR(rc, "Failed to fetch bins from pool bucket tree for bkt_id:%u", bkt_id); + return rc; + } + + dst_bins = (struct vos_gc_bin_df *)val_out.iov_buf; + D_ASSERT(dst_bins && !bins_empty(pool, dst_bins)); + + for (i = GC_AKEY; i < GC_CONT; i++) { + if (src_bins[i].bin_bag_first == UMOFF_NULL) + continue; + + rc = gc_bags_move(pool, &dst_bins[i], &src_bins[i]); + if (rc != 0) { + DL_ERROR(rc, "Failed to move bags for bkt_id:%u, type:%d", bkt_id, i); + return rc; + } + added++; + } + + D_ASSERT(added > 0); + return 0; +} + +static int +gc_move_bins(struct vos_pool *pool, struct vos_gc_item *item, int *credits, bool *empty_ret) +{ + struct umem_instance *umm = &pool->vp_umm; + struct umem_attr *uma = &pool->vp_uma; + struct vos_cont_df *cd = umem_off2ptr(umm, item->it_addr); + struct vos_cont_ext_df *cd_ext = umem_off2ptr(umm, cd->cd_ext); + daos_handle_t cont_btr; + d_iov_t key, key_out, val_out; + uint64_t key_id = UMEM_DEFAULT_MBKT_ID; + struct vos_gc_bin_df *bins; + uint64_t *bkt_id; + int rc, creds = *credits, 
moved = 0; + + D_ASSERT(cd_ext != NULL); + rc = dbtree_open_inplace(&cd_ext->ced_gc_bkt.gd_bins_root, uma, &cont_btr); + if (rc == -DER_NONEXIST) { + *empty_ret = true; + return 0; + } else if (rc) { + DL_ERROR(rc, "Failed to open container bucket tree."); + return rc; + } + D_ASSERT(daos_handle_is_valid(cont_btr)); + + *empty_ret = false; + while (creds > 0) { + /* Fetch the in-tree record from container */ + d_iov_set(&key, &key_id, sizeof(key_id)); + d_iov_set(&key_out, NULL, 0); + d_iov_set(&val_out, NULL, 0); + + rc = dbtree_fetch(cont_btr, BTR_PROBE_GE, DAOS_INTENT_DEFAULT, + &key, &key_out, &val_out); + if (rc == -DER_NONEXIST) { + *empty_ret = true; + rc = 0; + break; + } else if (rc) { + DL_ERROR(rc, "Failed to fetch bins from container bucket tree."); + break; + } + + bins = (struct vos_gc_bin_df *)val_out.iov_buf; + D_ASSERT(bins && !bins_empty(pool, bins)); + bkt_id = (uint64_t *)key_out.iov_buf; + D_ASSERT(bkt_id && *bkt_id != UMEM_DEFAULT_MBKT_ID); + + rc = gc_add_bins(pool, bins, (uint32_t)*bkt_id); + if (rc) + break; + + rc = dbtree_delete(cont_btr, BTR_PROBE_BYPASS, &key_out, NULL); + if (rc) { + DL_ERROR(rc, "Failed to delete bins from container bucket tree."); + break; + } + + moved++; + /* Consume 1 user credit on moving 8 gc_bin[GC_CONT] */ + if (moved % 8 == 0) + creds--; + } + + if (*empty_ret) + dbtree_destroy(cont_btr, NULL); + else + dbtree_close(cont_btr); + + if (rc == 0) + *credits = creds; + + return rc; +} + +static int +gc_flatten_cont(struct vos_pool *pool, int *credits) +{ + struct vos_gc *gc = &gc_table[GC_CONT]; + struct vos_gc_item *item; + struct vos_gc_bin_df *bin; + int creds = *credits; + int rc = 0, flattened = 0; + + while (creds > 0) { + bool empty = false; + + item = gc_get_item(gc, pool, NULL); + if (item == NULL) /* No containers to be flattened */ + break; + + /* Move all gc_bin[GC_CONT] from container to pool */ + rc = gc_move_bins(pool, item, &creds, &empty); + if (rc) { + DL_ERROR(rc, "GC move bins failed."); + break; + } + + if (!empty) { + D_ASSERT(creds == 0); + break; + } + + if (creds == 0) + break; + + empty = false; + /* Container drain doesn't consume user credits */ + rc = gc_drain_item(gc, pool, DAOS_HDL_INVAL, item, NULL, &empty); + if (rc) { + D_ASSERT(rc < 0); + DL_ERROR(rc, "GC drain %s failed.", gc->gc_name); + break; + } + + flattened++; + /* Consume 1 user credit on flattening every 8 objects */ + if (flattened % 8 == 0) + creds--; + + /* The container is flattened, free the gc_item */ + if (empty && creds) { + bin = gc_type2bin(pool, NULL, gc->gc_type); + rc = gc_free_item(gc, pool, NULL, item, bin); + if (rc) { + DL_ERROR(rc, "GC free %s item failed.", gc->gc_name); + break; + } + creds--; + } + } + + if (rc == 0) + *credits = creds; + + return rc; +} + +static int +bkt_get_bins(struct vos_pool *pool, struct vos_container *cont, uint32_t *bkt_id, bool try_next, + struct vos_gc_bin_df **bins_ret) +{ + struct vos_gc_info *gc_info; + struct vos_gc_bin_df *bins = NULL; + int rc; + + if (*bkt_id == UMEM_DEFAULT_MBKT_ID || try_next) { + if (cont != NULL) + bins = &cont->vc_cont_df->cd_gc_bins[0]; + else + bins = &pool->vp_pool_df->pd_gc_bins[0]; + + if (!bins_empty(pool, bins)) { + *bkt_id = UMEM_DEFAULT_MBKT_ID; + *bins_ret = bins; + return 0; + } else if (!try_next) { + return -DER_NONEXIST; + } + } + + gc_info = (cont != NULL) ? 
&cont->vc_gc_info : &pool->vp_gc_info; + rc = gc_bkt2bins(bkt_id, gc_info, false, try_next, &bins); + if (rc) + return rc; + + D_ASSERT(bins && !bins_empty(pool, bins)); + *bins_ret = bins; + + return 0; +} + +static inline bool +cont_bins_empty(struct vos_pool *pool, struct vos_container *cont) +{ + struct vos_gc_bin_df *bins = &cont->vc_cont_df->cd_gc_bins[0]; + + if (!bins_empty(pool, bins)) + return false; + + D_ASSERT(daos_handle_is_valid(cont->vc_gc_info.gi_bins_btr)); + if (!dbtree_is_empty(cont->vc_gc_info.gi_bins_btr)) + return false; + + return true; +} + +/* + * Return non-empty gc_bin[GC_CONT] with specified bucket ID, different bucket ID + * could be returned if there is nothing to be reclaimed on the specified bucket. + */ +static int +gc_get_bkt(struct vos_pool *pool, struct vos_container **cont_in, uint32_t *bkt_id, + struct vos_gc_bin_df **bins_ret) +{ + struct vos_container *cont, *tmp; + bool try_next = false; + int rc; + +switch_bkt: + /* Find non-empty gc_bin[GC_CONT] from containers */ + d_list_for_each_entry_safe(cont, tmp, &pool->vp_gc_cont, vc_gc_link) { + if (cont_bins_empty(pool, cont)) { + d_list_del_init(&cont->vc_gc_link); + continue; + } + + rc = bkt_get_bins(pool, cont, bkt_id, try_next, bins_ret); + if ((rc && rc != -DER_NONEXIST) || rc == 0) + goto done; + } + + /* Find satisfied gc_bin[GC_CONT] from pool */ + cont = NULL; + rc = bkt_get_bins(pool, NULL, bkt_id, try_next, bins_ret); + if ((rc && rc != -DER_NONEXIST) || rc == 0) + goto done; + + if (!try_next) { + try_next = true; + goto switch_bkt; + } +done: + if (*cont_in) { + vos_cont_decref(*cont_in); + *cont_in = NULL; + } + + if (rc == 0 && cont) { + vos_cont_addref(cont); + *cont_in = cont; + /* Keep fairness */ + d_list_del_init(&cont->vc_gc_link); + d_list_add_tail(&cont->vc_gc_link, &pool->vp_gc_cont); + } + + return rc; +} + +static int +gc_reclaim_bins(struct vos_pool *pool, struct vos_container *cont, + struct vos_gc_bin_df *bins, int *credits) +{ + struct vos_gc *gc = &gc_table[0]; /* Start from akey */ + struct vos_gc_item *item; + int rc = 0, creds = *credits; + + while (creds > 0) { + bool empty = false; + + D_ASSERT(gc->gc_type < GC_CONT); + item = bin_get_item(pool, &bins[gc->gc_type]); + if (item == NULL) { + if (gc->gc_type == GC_OBJ) /* hit the top level */ + break; + + /* Try upper level */ + gc++; + continue; + } + + rc = gc_drain_item(gc, pool, vos_cont2hdl(cont), item, &creds, &empty); + if (rc < 0) { + DL_ERROR(rc, "GC drain %s failed.", gc->gc_name); + break; + } + + if (empty && creds) { + rc = gc_free_item(gc, pool, cont, item, &bins[gc->gc_type]); + if (rc) { + DL_ERROR(rc, "GC free %s item failed.", gc->gc_name); + break; + } + creds--; + } + + /* always try to free akeys and values because they are the + * items consuming most storage space. + */ + if (gc->gc_type == GC_AKEY) + continue; + + /* should have flattened some items to the child GC, switch + * to the child GC. + */ + gc--; + } + + if (rc == 0) + *credits = creds; + + return rc; +} + +static int +gc_delete_bins(struct vos_pool *pool, struct vos_container *cont, uint32_t bkt_id) +{ + struct vos_gc_bin_df *bins; + struct vos_gc_info *gc_info; + d_iov_t key, val_out; + uint64_t key_id = bkt_id; + int rc; + + if (bkt_id == UMEM_DEFAULT_MBKT_ID) + return 0; + + gc_info = (cont != NULL) ? 
&cont->vc_gc_info : &pool->vp_gc_info; + D_ASSERT(daos_handle_is_valid(gc_info->gi_bins_btr)); + + /* Fetch the in-tree record */ + d_iov_set(&key, &key_id, sizeof(key_id)); + d_iov_set(&val_out, NULL, 0); + + rc = dbtree_fetch(gc_info->gi_bins_btr, BTR_PROBE_EQ, DAOS_INTENT_DEFAULT, &key, + NULL, &val_out); + if (rc) { + DL_ERROR(rc, "Failed to lookup GC bins for bkt_id:%u", bkt_id); + return rc; + } + + bins = (struct vos_gc_bin_df *)val_out.iov_buf; + D_ASSERT(bins && bins_empty(pool, bins)); + + rc = dbtree_delete(gc_info->gi_bins_btr, BTR_PROBE_BYPASS, &key, NULL); + if (rc) + DL_ERROR(rc, "Failed to delete GC bins for bkt_id:%u", bkt_id); + + return rc; +} + +static int +gc_reclaim_pool_p2(struct vos_pool *pool, int *credits, bool *empty_ret) +{ + struct vos_container *cont = NULL; + struct vos_gc_bin_df *bins = NULL; + struct vos_gc_info *gc_info = &pool->vp_gc_info; + uint32_t bkt = gc_info->gi_last_pinned, pinned_bkt = UMEM_DEFAULT_MBKT_ID; + struct umem_pin_handle *pin_hdl = NULL; + struct umem_cache_range rg; + bool tx_started = false; + int creds = *credits, rc = 0; + + if (pool->vp_dying) { + *empty_ret = true; + return rc; + } + + *empty_ret = false; + while(creds > 0) { + if (bkt != UMEM_DEFAULT_MBKT_ID && bkt != pinned_bkt) { + if (tx_started) { + tx_started = false; + rc = umem_tx_end(&pool->vp_umm, 0); + if (rc) { + DL_ERROR(rc, "Failed to commit GC tx."); + break; + } + } + + if (pin_hdl != NULL) { + umem_cache_unpin(vos_pool2store(pool), pin_hdl); + pin_hdl = NULL; + } + + rg.cr_off = umem_get_mb_base_offset(vos_pool2umm(pool), bkt); + rg.cr_size = vos_pool2store(pool)->cache->ca_page_sz; + + rc = vos_cache_pin(pool, &rg, 1, false, &pin_hdl); + if (rc) { + DL_ERROR(rc, "Failed to pin bucket %u.", bkt); + break; + } + pinned_bkt = bkt; + gc_info->gi_last_pinned = pinned_bkt; + } + + if (!tx_started) { + rc = umem_tx_begin(&pool->vp_umm, NULL); + if (rc) { + DL_ERROR(rc, "Failed to start tx for pool:"DF_UUID".", + DP_UUID(pool->vp_id)); + break; + } + tx_started = true; + } + + /* Flatten all containers first */ + rc = gc_flatten_cont(pool, &creds); + if (rc < 0) { + DL_ERROR(rc, "GC flatten cont failed."); + break; + } + + /* Container flattening used up all user credits */ + if (creds == 0) + break; + + /* + * Pick gc_bin[GC_CONT] by bucket ID, the bucket ID could be switched if + * there is nothing to be reclaimed for the specified ID + */ + rc = gc_get_bkt(pool, &cont, &bkt, &bins); + if (rc == -DER_NONEXIST) { + *empty_ret = true; + rc = 0; + break; + } else if (rc) { + DL_ERROR(rc, "Failed to get GC bkt bins for bkt_id:%u", bkt); + break; + } + + /* Bucket ID is switched, need to unpin current bucket then pin the new bucket */ + if (bkt != UMEM_DEFAULT_MBKT_ID && bkt != pinned_bkt) + continue; + + rc = gc_reclaim_bins(pool, cont, bins, &creds); + if (rc) { + DL_ERROR(rc, "GC reclaim bins for bkt_id:%u failed.", bkt); + break; + } + + if (bins_empty(pool, bins)) { + /* The gc_bin[GC_CONT] is empty, delete it to condense the bucket tree */ + rc = gc_delete_bins(pool, cont, bkt); + if (rc) { + DL_ERROR(rc, "GC delete bins for bkt_id:%u failed.", bkt); + break; + } + } + } + + if (tx_started) { + rc = umem_tx_end(&pool->vp_umm, rc); + if (rc) + DL_ERROR(rc, "Failed to commit GC tx."); + } + + if (pin_hdl != NULL) { + umem_cache_unpin(vos_pool2store(pool), pin_hdl); + pin_hdl = NULL; + } + + if (cont != NULL) + vos_cont_decref(cont); + + if (rc == 0) + *credits = creds; + + gc_update_stats(pool); + return rc; +} + +static inline void +gc_close_bkt(struct vos_gc_info 
*gc_info) +{ + + if (daos_handle_is_valid(gc_info->gi_bins_btr)) { + dbtree_close(gc_info->gi_bins_btr); + gc_info->gi_bins_btr = DAOS_HDL_INVAL; + } + gc_info->gi_last_pinned = UMEM_DEFAULT_MBKT_ID; +} + +static inline int +gc_open_bkt(struct umem_attr *uma, struct vos_gc_bkt_df *bkt_df, struct vos_gc_info *gc_info) +{ + int rc; + + rc = dbtree_open_inplace(&bkt_df->gd_bins_root, uma, &gc_info->gi_bins_btr); + if (rc) + DL_ERROR(rc, "Failed to open GC bin tree."); + return rc; +} + +void +gc_close_pool(struct vos_pool *pool) +{ + return gc_close_bkt(&pool->vp_gc_info); +} + +int +gc_open_pool(struct vos_pool *pool) +{ + struct vos_pool_ext_df *pd_ext = umem_off2ptr(&pool->vp_umm, pool->vp_pool_df->pd_ext); + + if (pd_ext != NULL) + return gc_open_bkt(&pool->vp_uma, &pd_ext->ped_gc_bkt, &pool->vp_gc_info); + return 0; +} + +void +gc_close_cont(struct vos_container *cont) +{ + return gc_close_bkt(&cont->vc_gc_info); +} + +int +gc_open_cont(struct vos_container *cont) +{ + struct vos_pool *pool = vos_cont2pool(cont); + struct vos_cont_ext_df *cd_ext = umem_off2ptr(&pool->vp_umm, cont->vc_cont_df->cd_ext); + + if (cd_ext != NULL) + return gc_open_bkt(&pool->vp_uma, &cd_ext->ced_gc_bkt, &cont->vc_gc_info); + return 0; +} + +static int +gc_init_bkt(struct umem_instance *umm, struct vos_gc_bkt_df *bkt_df) +{ + struct umem_attr uma; + daos_handle_t bins_btr; + int rc; + + uma.uma_id = umm->umm_id; + uma.uma_pool = umm->umm_pool; + + rc = dbtree_create_inplace(DBTREE_CLASS_IFV, BTR_FEAT_UINT_KEY, 12, &uma, + &bkt_df->gd_bins_root, &bins_btr); + if (rc) { + DL_ERROR(rc, "Failed to create GC bin tree."); + return rc; + } + dbtree_close(bins_btr); + + return 0; +} + /** * Initialize garbage bins for a pool. * @@ -842,10 +1552,9 @@ gc_reclaim_pool(struct vos_pool *pool, int *credits, bool *empty_ret) int gc_init_pool(struct umem_instance *umm, struct vos_pool_df *pd) { - int i; - umem_off_t bag_id; - int size; - int rc; + struct vos_pool_ext_df *pd_ext = umem_off2ptr(umm, pd->pd_ext); + umem_off_t bag_id; + int i, size, rc; D_DEBUG(DB_IO, "Init garbage bins for pool="DF_UUID"\n", DP_UUID(pd->pd_id)); @@ -867,6 +1576,10 @@ gc_init_pool(struct umem_instance *umm, struct vos_pool_df *pd) bin->bin_bag_last = bag_id; bin->bin_bag_nr = 1; } + + if (pd_ext != NULL) + return gc_init_bkt(umm, &pd_ext->ped_gc_bkt); + return 0; } @@ -879,7 +1592,8 @@ gc_init_pool(struct umem_instance *umm, struct vos_pool_df *pd) int gc_init_cont(struct umem_instance *umm, struct vos_cont_df *cd) { - int i; + struct vos_cont_ext_df *cd_ext = umem_off2ptr(umm, cd->cd_ext); + int i; D_DEBUG(DB_IO, "Init garbage bins for cont="DF_UUID"\n", DP_UUID(cd->cd_id)); @@ -892,6 +1606,10 @@ gc_init_cont(struct umem_instance *umm, struct vos_cont_df *cd) bin->bin_bag_size = gc_bag_size; bin->bin_bag_nr = 0; } + + if (cd_ext != NULL) + return gc_init_bkt(umm, &cd_ext->ced_gc_bkt); + return 0; } @@ -903,17 +1621,25 @@ gc_check_cont(struct vos_container *cont) { int i; struct vos_gc_bin_df *bin; + struct vos_pool *pool = cont->vc_pool; D_INIT_LIST_HEAD(&cont->vc_gc_link); for (i = 0; i < GC_CONT; i++) { - bin = gc_type2bin(cont->vc_pool, cont, i); + bin = gc_type2bin(pool, cont, i); if (bin->bin_bag_first != UMOFF_NULL) { - d_list_add_tail(&cont->vc_gc_link, - &cont->vc_pool->vp_gc_cont); + d_list_add_tail(&cont->vc_gc_link, &pool->vp_gc_cont); return; } } + + if (vos_pool_is_evictable(pool)) { + struct vos_gc_info *gc_info = &cont->vc_gc_info; + + D_ASSERT(daos_handle_is_valid(gc_info->gi_bins_btr)); + if 
(!dbtree_is_empty(gc_info->gi_bins_btr)) + d_list_add_tail(&cont->vc_gc_link, &pool->vp_gc_cont); + } } /** @@ -949,8 +1675,10 @@ gc_del_pool(struct vos_pool *pool) D_ASSERT(!d_list_empty(&pool->vp_gc_link)); pool->vp_opened--; - if (pool->vp_opened == 0) + if (pool->vp_opened == 0) { vos_pool_hash_del(pool); /* un-pin from open-hash */ + gc_close_pool(pool); + } d_list_del_init(&pool->vp_gc_link); vos_pool_decref(pool); /* -1 for the link */ @@ -1018,7 +1746,10 @@ vos_gc_run(int *credits) D_DEBUG(DB_TRACE, "GC pool="DF_UUID", creds=%d\n", DP_UUID(pool->vp_id), creds); - rc = gc_reclaim_pool(pool, &creds, &empty); + if (vos_pool_is_evictable(pool)) + rc = gc_reclaim_pool_p2(pool, &creds, &empty); + else + rc = gc_reclaim_pool(pool, &creds, &empty); if (rc) { D_ERROR("GC pool="DF_UUID" error=%s\n", DP_UUID(pool->vp_id), d_errstr(rc)); @@ -1097,7 +1828,10 @@ vos_gc_pool_tight(daos_handle_t poh, int *credits) return 0; /* nothing to reclaim for this pool */ total = *credits; - rc = gc_reclaim_pool(pool, credits, &empty); + if (vos_pool_is_evictable(pool)) + rc = gc_reclaim_pool_p2(pool, credits, &empty); + else + rc = gc_reclaim_pool(pool, credits, &empty); if (rc) { D_CRIT("gc_reclaim_pool failed " DF_RC "\n", DP_RC(rc)); return 0; /* caller can't do anything for it */ diff --git a/src/vos/vos_internal.h b/src/vos/vos_internal.h index 9441ba45265..ade36cb769a 100644 --- a/src/vos/vos_internal.h +++ b/src/vos/vos_internal.h @@ -241,6 +241,21 @@ struct vos_wal_metrics { void vos_wal_metrics_init(struct vos_wal_metrics *vw_metrics, const char *path, int tgt_id); +/* VOS pool metrics for umem cache */ +struct vos_cache_metrics { + struct d_tm_node_t *vcm_pg_ne; + struct d_tm_node_t *vcm_pg_pinned; + struct d_tm_node_t *vcm_pg_free; + struct d_tm_node_t *vcm_pg_hit; + struct d_tm_node_t *vcm_pg_miss; + struct d_tm_node_t *vcm_pg_evict; + struct d_tm_node_t *vcm_pg_flush; + struct d_tm_node_t *vcm_pg_load; + struct d_tm_node_t *vcm_obj_hit; +}; + +void vos_cache_metrics_init(struct vos_cache_metrics *vc_metrcis, const char *path, int tgt_id); + struct vos_pool_metrics { void *vp_vea_metrics; struct vos_agg_metrics vp_agg_metrics; @@ -248,9 +263,15 @@ struct vos_pool_metrics { struct vos_space_metrics vp_space_metrics; struct vos_chkpt_metrics vp_chkpt_metrics; struct vos_wal_metrics vp_wal_metrics; + struct vos_cache_metrics vp_cache_metrics; /* TODO: add more metrics for VOS */ }; +struct vos_gc_info { + daos_handle_t gi_bins_btr; + uint32_t gi_last_pinned; +}; + /** * VOS pool (DRAM) */ @@ -310,6 +331,8 @@ struct vos_pool { uint32_t vp_data_thresh; /** Space (in percentage) reserved for rebuild */ unsigned int vp_space_rb; + /* GC runtime for pool */ + struct vos_gc_info vp_gc_info; }; /** @@ -353,6 +376,8 @@ struct vos_container { daos_epoch_range_t vc_epr_aggregation; /* Current ongoing discard EPR */ daos_epoch_range_t vc_epr_discard; + /* Last timestamp when VOS aggregation reports -DER_TX_BUSY */ + uint64_t vc_agg_busy_ts; /* Last timestamp when VOS aggregation reporting ENOSPACE */ uint64_t vc_agg_nospc_ts; /* Last timestamp when IO reporting ENOSPACE */ @@ -363,7 +388,8 @@ struct vos_container { * * transaction with older epoch must have been committed. 
*/ daos_epoch_t vc_solo_dtx_epoch; - + /* GC runtime for container */ + struct vos_gc_info vc_gc_info; /* Various flags */ unsigned int vc_in_aggregation:1, vc_in_discard:1, @@ -1256,7 +1282,7 @@ vos_bio_addr_free(struct vos_pool *pool, bio_addr_t *addr, daos_size_t nob); void vos_evt_desc_cbs_init(struct evt_desc_cbs *cbs, struct vos_pool *pool, - daos_handle_t coh); + daos_handle_t coh, struct vos_object *obj); int vos_tx_begin(struct dtx_handle *dth, struct umem_instance *umm, bool is_sysdb); @@ -1307,7 +1333,7 @@ vos_dedup_invalidate(struct vos_pool *pool); umem_off_t vos_reserve_scm(struct vos_container *cont, struct umem_rsrvd_act *rsrvd_scm, - daos_size_t size); + daos_size_t size, struct vos_object *obj); int vos_publish_scm(struct umem_instance *umm, struct umem_rsrvd_act *rsrvd_scm, bool publish); int @@ -1324,6 +1350,12 @@ vos_pool2umm(struct vos_pool *pool) return &pool->vp_umm; } +static inline struct umem_store * +vos_pool2store(struct vos_pool *pool) +{ + return &pool->vp_umm.umm_pool->up_store; +} + static inline struct umem_instance * vos_cont2umm(struct vos_container *cont) { @@ -1360,11 +1392,19 @@ void gc_check_cont(struct vos_container *cont); int gc_add_item(struct vos_pool *pool, daos_handle_t coh, - enum vos_gc_type type, umem_off_t item_off, uint64_t args); + enum vos_gc_type type, umem_off_t item_off, uint32_t *bkt_ids); int vos_gc_pool_tight(daos_handle_t poh, int *credits); void gc_reserve_space(daos_size_t *rsrvd); +int +gc_open_pool(struct vos_pool *pool); +void +gc_close_pool(struct vos_pool *pool); +int +gc_open_cont(struct vos_container *cont); +void +gc_close_cont(struct vos_container *cont); /** * If the object is fully punched, bypass normal aggregation and move it to container @@ -1839,4 +1879,149 @@ vos_io_scm(struct vos_pool *pool, daos_iod_type_t type, daos_size_t size, enum v int vos_insert_oid(struct dtx_handle *dth, struct vos_container *cont, daos_unit_oid_t *oid); +static inline bool +vos_pool_is_p2(struct vos_pool *pool) +{ + struct umem_store *store = vos_pool2store(pool); + + return store->store_type == DAOS_MD_BMEM_V2; +} + +static inline bool +vos_pool_is_evictable(struct vos_pool *pool) +{ + struct umem_store *store = vos_pool2store(pool); + + if (store->store_evictable) { + D_ASSERT(store->store_type == DAOS_MD_BMEM_V2); + return true; + } + + return false; +} + +static inline umem_off_t +vos_obj_alloc(struct umem_instance *umm, struct vos_object *obj, size_t size, bool zeroing) +{ + + if (obj != NULL && vos_pool_is_evictable(vos_obj2pool(obj))) { + D_ASSERT(obj->obj_bkt_alloted == 1); + if (zeroing) + return umem_zalloc_from_bucket(umm, size, obj->obj_bkt_ids[0]); + + return umem_alloc_from_bucket(umm, size, obj->obj_bkt_ids[0]); + } + + if (zeroing) + return umem_zalloc(umm, size); + + return umem_alloc(umm, size); +} + +static inline umem_off_t +vos_obj_reserve(struct umem_instance *umm, struct vos_object *obj, + struct umem_rsrvd_act *rsrvd_scm, daos_size_t size) +{ + if (obj != NULL && vos_pool_is_evictable(vos_obj2pool(obj))) { + D_ASSERT(obj->obj_bkt_alloted == 1); + return umem_reserve_from_bucket(umm, rsrvd_scm, size, obj->obj_bkt_ids[0]); + } + + return umem_reserve(umm, rsrvd_scm, size); +} + +/* vos_obj_cache.c */ +static inline struct dtx_handle * +clear_cur_dth(struct vos_pool *pool) +{ + struct dtx_handle *dth; + + dth = vos_dth_get(pool->vp_sysdb); + vos_dth_set(NULL, pool->vp_sysdb); + + return dth; +} + +static inline void +restore_cur_dth(struct vos_pool *pool, struct dtx_handle *dth) +{ + vos_dth_set(dth, 
pool->vp_sysdb); +} + +static inline struct vos_cache_metrics * +store2cache_metrics(struct umem_store *store) +{ + struct vos_pool_metrics *vpm = (struct vos_pool_metrics *)store->stor_stats; + + return vpm != NULL ? &vpm->vp_cache_metrics : NULL; +} + +static inline void +update_page_stats(struct umem_store *store) +{ + struct vos_cache_metrics *vcm = store2cache_metrics(store); + struct umem_cache *cache = store->cache; + + if (vcm == NULL) + return; + + d_tm_set_gauge(vcm->vcm_pg_ne, cache->ca_pgs_stats[UMEM_PG_STATS_NONEVICTABLE]); + d_tm_set_gauge(vcm->vcm_pg_pinned, cache->ca_pgs_stats[UMEM_PG_STATS_PINNED]); + d_tm_set_gauge(vcm->vcm_pg_free, cache->ca_pgs_stats[UMEM_PG_STATS_FREE]); + + d_tm_set_counter(vcm->vcm_pg_hit, cache->ca_cache_stats[UMEM_CACHE_STATS_HIT]); + d_tm_set_counter(vcm->vcm_pg_miss, cache->ca_cache_stats[UMEM_CACHE_STATS_MISS]); + d_tm_set_counter(vcm->vcm_pg_evict, cache->ca_cache_stats[UMEM_CACHE_STATS_EVICT]); + d_tm_set_counter(vcm->vcm_pg_flush, cache->ca_cache_stats[UMEM_CACHE_STATS_FLUSH]); + d_tm_set_counter(vcm->vcm_pg_load, cache->ca_cache_stats[UMEM_CACHE_STATS_LOAD]); +} + +static inline int +vos_cache_pin(struct vos_pool *pool, struct umem_cache_range *ranges, int range_nr, + bool for_sys, struct umem_pin_handle **pin_handle) +{ + struct umem_store *store = vos_pool2store(pool); + struct dtx_handle *cur_dth; + int rc; + + cur_dth = clear_cur_dth(pool); + rc = umem_cache_pin(store, ranges, range_nr, for_sys, pin_handle); + restore_cur_dth(pool, cur_dth); + + update_page_stats(store); + + return rc; +} + +int vos_obj_acquire(struct vos_container *cont, daos_unit_oid_t oid, bool pin, + struct vos_object **obj_p); + +#define VOS_BKTS_INLINE_MAX 4 +struct vos_bkt_array { + uint32_t vba_tot; + uint32_t vba_cnt; + uint32_t vba_inline_bkts[VOS_BKTS_INLINE_MAX]; + uint32_t *vba_bkts; +}; + +static inline void +vos_bkt_array_fini(struct vos_bkt_array *bkts) +{ + if (bkts->vba_tot > VOS_BKTS_INLINE_MAX) + D_FREE(bkts->vba_bkts); +} + +static inline void +vos_bkt_array_init(struct vos_bkt_array *bkts) +{ + bkts->vba_tot = VOS_BKTS_INLINE_MAX; + bkts->vba_cnt = 0; + bkts->vba_bkts = &bkts->vba_inline_bkts[0]; +} + +bool vos_bkt_array_subset(struct vos_bkt_array *super, struct vos_bkt_array *sub); +int vos_bkt_array_add(struct vos_bkt_array *bkts, uint32_t bkt_id); +int vos_bkt_array_pin(struct vos_pool *pool, struct vos_bkt_array *bkts, + struct umem_pin_handle **pin_hdl); + #endif /* __VOS_INTERNAL_H__ */ diff --git a/src/vos/vos_io.c b/src/vos/vos_io.c index 7aa3c897755..efd3f9b9a49 100644 --- a/src/vos/vos_io.c +++ b/src/vos/vos_io.c @@ -37,6 +37,8 @@ struct vos_io_context { struct dcs_iod_csums *ic_iod_csums; /** reference on the object */ struct vos_object *ic_obj; + /** used only for md-on-ssd phase2 evictable pool */ + struct vos_object *ic_pinned_obj; /** BIO descriptor, has ic_iod_nr SGLs */ struct bio_desc *ic_biod; struct vos_ts_set *ic_ts_set; @@ -600,6 +602,9 @@ vos_ioc_destroy(struct vos_io_context *ioc, bool evict) if (ioc->ic_obj) vos_obj_release(ioc->ic_obj, 0, evict); + if (ioc->ic_pinned_obj) + vos_obj_release(ioc->ic_pinned_obj, 0, evict); + vos_ioc_reserve_fini(ioc); vos_ilog_fetch_finish(&ioc->ic_dkey_info); vos_ilog_fetch_finish(&ioc->ic_akey_info); @@ -2119,17 +2124,16 @@ dkey_update(struct vos_io_context *ioc, uint32_t pm_ver, daos_key_t *dkey, umem_off_t vos_reserve_scm(struct vos_container *cont, struct umem_rsrvd_act *rsrvd_scm, - daos_size_t size) + daos_size_t size, struct vos_object *obj) { - umem_off_t umoff; + umem_off_t umoff; + 
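/*
 * Illustrative sketch (not part of this patch): expected life cycle of the
 * vos_bkt_array helper declared in vos_internal.h above. vos_bkt_array_add()
 * keeps the IDs sorted and de-duplicated (IDs must not be UMEM_DEFAULT_MBKT_ID),
 * vos_bkt_array_pin() turns them into umem_cache ranges and pins them in one
 * call, and the returned pin handle is later dropped with umem_cache_unpin().
 * pin_buckets() is a hypothetical helper.
 */
static int
pin_buckets(struct vos_pool *pool, uint32_t *ids, int nr,
	    struct umem_pin_handle **pin_hdl)
{
	struct vos_bkt_array	bkts;
	int			i, rc = 0;

	vos_bkt_array_init(&bkts);	/* starts with 4 inline slots */
	for (i = 0; i < nr; i++) {
		rc = vos_bkt_array_add(&bkts, ids[i]);
		if (rc)
			break;
	}
	if (rc == 0)
		rc = vos_bkt_array_pin(pool, &bkts, pin_hdl);

	vos_bkt_array_fini(&bkts);	/* frees the array only if it grew */
	return rc;
}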
struct umem_instance *umm = vos_cont2umm(cont); D_ASSERT(size > 0); - - if (vos_cont2umm(cont)->umm_ops->mo_reserve != NULL) { - umoff = umem_reserve(vos_cont2umm(cont), rsrvd_scm, size); - } else { - umoff = umem_alloc(vos_cont2umm(cont), size); - } + if (umm->umm_ops->mo_reserve != NULL) + umoff = vos_obj_reserve(umm, obj, rsrvd_scm, size); + else + umoff = vos_obj_alloc(umm, obj, size, false); return umoff; } @@ -2175,7 +2179,7 @@ reserve_space(struct vos_io_context *ioc, uint16_t media, daos_size_t size, if (media == DAOS_MEDIA_SCM) { umem_off_t umoff; - umoff = vos_reserve_scm(ioc->ic_cont, ioc->ic_rsrvd_scm, size); + umoff = vos_reserve_scm(ioc->ic_cont, ioc->ic_rsrvd_scm, size, ioc->ic_pinned_obj); if (!UMOFF_IS_NULL(umoff)) { ioc->ic_umoffs[ioc->ic_umoffs_cnt] = umoff; ioc->ic_umoffs_cnt++; @@ -2577,7 +2581,12 @@ vos_update_end(daos_handle_t ioh, uint32_t pm_ver, daos_key_t *dkey, int err, tx_started = true; - /* Commit the CoS DTXs via the IO PMDK transaction. */ + /* + * Commit the CoS DTXs via the IO PMDK transaction. + * + * It's guaranteed that no other objects are involved in the CoS DTXs, so we don't + * need to pin extra objects here. + */ if (dtx_is_valid_handle(dth) && dth->dth_dti_cos_count > 0 && !dth->dth_cos_done) { D_ASSERT(!dth->dth_local); @@ -2745,6 +2754,20 @@ vos_update_begin(daos_handle_t coh, daos_unit_oid_t oid, daos_epoch_t epoch, goto error; } + /* Hold the object for the evictable md-on-ssd phase2 pool */ + if (vos_pool_is_evictable(vos_cont2pool(ioc->ic_cont))) { + /* + * FIXME: + * The same object will be referenced by vos_obj_acquire() and vos_obj_hold() + * (in vos_update_end()) twice, this is for avoiding the complication of adding + * object ilog to ts_set. We'll re-org vos_obj_hold() in the future to make the + * code look cleaner. + */ + rc = vos_obj_acquire(ioc->ic_cont, ioc->ic_oid, true, &ioc->ic_pinned_obj); + if (rc != 0) + goto error; + } + rc = dkey_update_begin(ioc); if (rc != 0) { D_ERROR(DF_UOID ": dkey update begin failed. " DF_RC "\n", DP_UOID(oid), DP_RC(rc)); diff --git a/src/vos/vos_layout.h b/src/vos/vos_layout.h index 902cb064e26..87d092bc882 100644 --- a/src/vos/vos_layout.h +++ b/src/vos/vos_layout.h @@ -43,6 +43,16 @@ struct vos_gc_bin_df { uint16_t bin_pad16; }; +/* + * This is smaller than the VOS_OBJ_BKTS_MAX for object durable format, because + * I don't want to increase each GC item size (the amount of GC item is massive) + * for an imagined requirement. + * + * If we really need to support more than 2 evict-able buckets per object in the + * futhure, we can enlarge the GC item then. + */ +#define VOS_GC_BKTS_MAX 2 + struct vos_gc_bag_df { /** index of the first item in FIFO */ uint16_t bag_item_first; @@ -57,19 +67,12 @@ struct vos_gc_bag_df { struct vos_gc_item { /* address of the item to be freed */ umem_off_t it_addr; - /** Reserved, argument for GC_VEA/BIO (e.g. size of extent) */ - uint64_t it_args; + /* object buckets for GC_AKEY/DKEY/OBJ of the md-on-ssd p2 pool */ + uint32_t it_bkt_ids[VOS_GC_BKTS_MAX]; } bag_items[0]; }; enum vos_gc_type { - /* XXX: we could define GC_VEA, which can free NVMe/SCM space. - * So svt_rec_free() and evt_desc_bio_free() only need to call - * gc_add_item() to register BIO address for GC. - * - * However, GC_VEA could have extra overhead of reassigning SCM - * pointers, but it also has low latency for undo changes. 
- */ GC_AKEY, GC_DKEY, GC_OBJ, @@ -77,6 +80,11 @@ enum vos_gc_type { GC_MAX, }; +struct vos_gc_bkt_df { + /* GC bins categorized by bucket number */ + struct btr_root gd_bins_root; +}; + #define POOL_DF_MAGIC 0x5ca1ab1e /** Lowest supported durable format version */ @@ -107,6 +115,16 @@ enum vos_gc_type { /** 2.8 features */ #define VOS_POOL_FEAT_2_8 (VOS_POOL_FEAT_GANG_SV) +/* VOS pool durable format extension */ +struct vos_pool_ext_df { + /* Extension for GC bucket */ + struct vos_gc_bkt_df ped_gc_bkt; + /* Paddings for other potential new feature */ + uint64_t ped_paddings[54]; + /* Reserved for future extension */ + uint64_t ped_reserve; +}; + /** * Durable format for VOS pool */ @@ -124,8 +142,8 @@ struct vos_pool_df { * a new format, containers with old format can be attached at here. */ uint64_t pd_reserv_upgrade; - /** Reserved for future usage */ - uint64_t pd_reserv; + /** Pool durable format extension */ + umem_off_t pd_ext; /** Unique PoolID for each VOS pool assigned on creation */ uuid_t pd_id; /** Total space in bytes on SCM */ @@ -249,6 +267,16 @@ enum vos_io_stream { VOS_IOS_CNT }; +/* VOS container durable format extension */ +struct vos_cont_ext_df { + /* GC bucket extension */ + struct vos_gc_bkt_df ced_gc_bkt; + /* Reserved for potential new features */ + uint64_t ced_paddings[38]; + /* Reserved for future extension */ + uint64_t ced_reserve; +}; + /* VOS Container Value */ struct vos_cont_df { uuid_t cd_id; @@ -260,8 +288,8 @@ struct vos_cont_df { struct btr_root cd_obj_root; /** reserved for placement algorithm upgrade */ uint64_t cd_reserv_upgrade; - /** reserved for future usage */ - uint64_t cd_reserv; + /** Container durable format extension */ + umem_off_t cd_ext; /** The active DTXs blob head. */ umem_off_t cd_dtx_active_head; /** The active DTXs blob tail. */ @@ -380,4 +408,18 @@ struct vos_obj_df { struct btr_root vo_tree; }; +#define VOS_OBJ_BKTS_MAX 4 +D_CASSERT(VOS_GC_BKTS_MAX <= VOS_OBJ_BKTS_MAX); + +/* + * VOS object durable format for md-on-ssd phase2. The size is fit to the 128 bytes + * slab (see slab_map[] defined in mem.c). + */ +struct vos_obj_p2_df { + struct vos_obj_df p2_obj_df; + uint32_t p2_bkt_ids[VOS_OBJ_BKTS_MAX]; + uint64_t p2_reserved; +}; +D_CASSERT(sizeof(struct vos_obj_p2_df) == D_ALIGNUP(sizeof(struct vos_obj_df), 32)); + #endif diff --git a/src/vos/vos_obj.c b/src/vos/vos_obj.c index cc72575f608..25d50ec5868 100644 --- a/src/vos/vos_obj.c +++ b/src/vos/vos_obj.c @@ -496,9 +496,12 @@ vos_obj_punch(daos_handle_t coh, daos_unit_oid_t oid, daos_epoch_t epoch, if (rc != 0) goto reset; - /* Commit the CoS DTXs via the PUNCH PMDK transaction. */ - if (dtx_is_valid_handle(dth) && dth->dth_dti_cos_count > 0 && - !dth->dth_cos_done) { + /* Commit the CoS DTXs via the PUNCH PMDK transaction. + * + * It's guaranteed that no other objects are involved in the CoS DTXs, so we don't + * need to pin extra objects here. 
+ */ + if (dtx_is_valid_handle(dth) && dth->dth_dti_cos_count > 0 && !dth->dth_cos_done) { D_ALLOC_ARRAY(daes, dth->dth_dti_cos_count); if (daes == NULL) D_GOTO(reset, rc = -DER_NOMEM); @@ -1065,7 +1068,8 @@ key_iter_fetch_root(struct vos_obj_iter *oiter, vos_iter_type_t type, * subtree */ if (krec->kr_bmap & KREC_BF_EVT) { - vos_evt_desc_cbs_init(&cbs, vos_obj2pool(obj), vos_cont2hdl(obj->obj_cont)); + vos_evt_desc_cbs_init(&cbs, vos_obj2pool(obj), vos_cont2hdl(obj->obj_cont), + obj); rc = evt_open(&krec->kr_evt, info->ii_uma, &cbs, &info->ii_tree_hdl); if (rc) { D_DEBUG(DB_TRACE, @@ -1077,7 +1081,7 @@ key_iter_fetch_root(struct vos_obj_iter *oiter, vos_iter_type_t type, info->ii_fake_akey_flag = VOS_IT_DKEY_EV; } else { rc = dbtree_open_inplace_ex(&krec->kr_btr, info->ii_uma, - vos_cont2hdl(obj->obj_cont), vos_obj2pool(obj), + vos_cont2hdl(obj->obj_cont), obj, &info->ii_tree_hdl); if (rc) { D_DEBUG(DB_TRACE, @@ -2040,7 +2044,7 @@ vos_obj_akey_iter_nested_prep(vos_iter_type_t type, struct vos_iter_info *info, } rc = dbtree_open_inplace_ex(info->ii_btr, info->ii_uma, vos_cont2hdl(obj->obj_cont), - vos_obj2pool(obj), &toh); + obj, &toh); if (rc) { D_DEBUG(DB_TRACE, "Failed to open tree for iterator:" @@ -2097,7 +2101,7 @@ vos_obj_iter_sv_nested_prep(vos_iter_type_t type, struct vos_iter_info *info, } rc = dbtree_open_inplace_ex(info->ii_btr, info->ii_uma, vos_cont2hdl(obj->obj_cont), - vos_obj2pool(obj), &toh); + obj, &toh); if (rc) { D_DEBUG(DB_TRACE, "Failed to open tree for iterator:" @@ -2157,7 +2161,7 @@ vos_obj_ev_iter_nested_prep(vos_iter_type_t type, struct vos_iter_info *info, goto prepare; } - vos_evt_desc_cbs_init(&cbs, vos_obj2pool(obj), vos_cont2hdl(obj->obj_cont)); + vos_evt_desc_cbs_init(&cbs, vos_obj2pool(obj), vos_cont2hdl(obj->obj_cont), obj); rc = evt_open(info->ii_evt, info->ii_uma, &cbs, &toh); if (rc) { D_DEBUG(DB_TRACE, diff --git a/src/vos/vos_obj.h b/src/vos/vos_obj.h index 2ccc8d71988..f572ebb03d9 100644 --- a/src/vos/vos_obj.h +++ b/src/vos/vos_obj.h @@ -47,12 +47,25 @@ struct vos_object { struct vos_obj_df *obj_df; /** backref to container */ struct vos_container *obj_cont; + /* Handle for the pinned object */ + struct umem_pin_handle *obj_pin_hdl; + /** Bucket IDs for the object */ + uint32_t obj_bkt_ids[VOS_OBJ_BKTS_MAX]; + ABT_mutex obj_mutex; + ABT_cond obj_wait_alloting; + ABT_cond obj_wait_loading; /** nobody should access this object */ bool obj_zombie; /** Object is held for discard */ uint32_t obj_discard : 1, /** If non-zero, object is held for aggregation */ - obj_aggregate : 1; + obj_aggregate : 1, + /** Evict-able bucket is already allocated */ + obj_bkt_alloted : 1, + /** Allocating evict-able bucket in in-progress */ + obj_bkt_alloting : 1, + /** Loading object is in-progress */ + obj_bkt_loading : 1; }; enum { diff --git a/src/vos/vos_obj_cache.c b/src/vos/vos_obj_cache.c index 8845eae0085..9274e219d75 100644 --- a/src/vos/vos_obj_cache.c +++ b/src/vos/vos_obj_cache.c @@ -73,13 +73,37 @@ obj_lop_alloc(void *key, unsigned int ksize, void *args, D_ALLOC_PTR(obj); if (!obj) - D_GOTO(failed, rc = -DER_NOMEM); + return -DER_NOMEM; + + rc = ABT_mutex_create(&obj->obj_mutex); + if (rc != ABT_SUCCESS) { + rc = dss_abterr2der(rc); + goto failed; + } + + rc = ABT_cond_create(&obj->obj_wait_alloting); + if (rc != ABT_SUCCESS) { + rc = dss_abterr2der(rc); + goto free_mutex; + } + + rc = ABT_cond_create(&obj->obj_wait_loading); + if (rc != ABT_SUCCESS) { + rc = dss_abterr2der(rc); + goto free_alloting; + } init_object(obj, lkey->olk_oid, cont); 
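/*
 * Illustrative sketch (not part of this patch): the mutex and the two condition
 * variables created above back a "single initializer, many waiters" pattern (see
 * obj_allot_bkt()/obj_pin_bkt() below). The patch issues a single ABT_cond_wait()
 * and relies on the broadcast being sent only after the in-progress flag is
 * cleared; the defensive, predicate-loop form of the same wait would look like
 * this hypothetical helper:
 */
static void
wait_until_bkt_alloted(struct vos_object *obj)
{
	ABT_mutex_lock(obj->obj_mutex);
	while (obj->obj_bkt_alloting && !obj->obj_bkt_alloted)
		ABT_cond_wait(obj->obj_wait_alloting, obj->obj_mutex);
	ABT_mutex_unlock(obj->obj_mutex);
}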
d_tm_inc_gauge(tls->vtl_obj_cnt, 1); *llink_p = &obj->obj_llink; - rc = 0; + return 0; + +free_alloting: + ABT_cond_free(&obj->obj_wait_alloting); +free_mutex: + ABT_mutex_free(&obj->obj_mutex); failed: + D_FREE(obj); return rc; } @@ -133,6 +157,9 @@ obj_lop_free(struct daos_llink *llink) tls = vos_tls_get(obj->obj_cont->vc_pool->vp_sysdb); d_tm_dec_gauge(tls->vtl_obj_cnt, 1); clean_object(obj); + ABT_cond_free(&obj->obj_wait_loading); + ABT_cond_free(&obj->obj_wait_alloting); + ABT_mutex_free(&obj->obj_mutex); D_FREE(obj); } @@ -245,12 +272,132 @@ obj_get(struct daos_lru_cache *occ, struct vos_container *cont, daos_unit_oid_t return rc; } +static inline void +vos_obj_unpin(struct vos_object *obj) +{ + struct vos_pool *pool = vos_obj2pool(obj); + struct umem_store *store = vos_pool2store(pool); + + if (obj->obj_pin_hdl != NULL && daos_lru_is_last_user(&obj->obj_llink)) { + umem_cache_unpin(store, obj->obj_pin_hdl); + obj->obj_pin_hdl = NULL; + } +} + +static void +obj_allot_bkt(struct vos_pool *pool, struct vos_object *obj) +{ + struct dtx_handle *cur_dth; + + D_ASSERT(umem_tx_none(vos_pool2umm(pool))); + + if (obj->obj_bkt_alloting) { + cur_dth = clear_cur_dth(pool); + + ABT_mutex_lock(obj->obj_mutex); + ABT_cond_wait(obj->obj_wait_alloting, obj->obj_mutex); + ABT_mutex_unlock(obj->obj_mutex); + + D_ASSERT(obj->obj_bkt_alloted == 1); + restore_cur_dth(pool, cur_dth); + return; + } + obj->obj_bkt_alloting = 1; + + if (!obj->obj_df) { + cur_dth = clear_cur_dth(pool); + obj->obj_bkt_ids[0] = umem_allot_mb_evictable(vos_pool2umm(pool), 0); + restore_cur_dth(pool, cur_dth); + } else { + struct vos_obj_p2_df *p2 = (struct vos_obj_p2_df *)obj->obj_df; + + obj->obj_bkt_ids[0] = p2->p2_bkt_ids[0]; + } + + obj->obj_bkt_alloted = 1; + obj->obj_bkt_alloting = 0; + + ABT_mutex_lock(obj->obj_mutex); + ABT_cond_broadcast(obj->obj_wait_alloting); + ABT_mutex_unlock(obj->obj_mutex); +} + +static int +obj_pin_bkt(struct vos_pool *pool, struct vos_object *obj) +{ + struct umem_store *store = vos_pool2store(pool); + struct dtx_handle *cur_dth; + struct umem_cache_range rg; + int rc; + + if (obj->obj_bkt_ids[0] == UMEM_DEFAULT_MBKT_ID) { + D_ASSERT(obj->obj_pin_hdl == NULL); + D_ASSERT(!obj->obj_bkt_loading); + return 0; + } + + if (obj->obj_bkt_loading) { + cur_dth = clear_cur_dth(pool); + + ABT_mutex_lock(obj->obj_mutex); + ABT_cond_wait(obj->obj_wait_loading, obj->obj_mutex); + ABT_mutex_unlock(obj->obj_mutex); + + restore_cur_dth(pool, cur_dth); + /* The loader failed on vos_cache_pin() */ + if (obj->obj_pin_hdl == NULL) { + D_ERROR("Object:"DF_UOID" isn't pinned.\n", DP_UOID(obj->obj_id)); + return -DER_BUSY; + } + } + + if (obj->obj_pin_hdl != NULL) { + struct vos_cache_metrics *vcm = store2cache_metrics(store); + + if (vcm) + d_tm_inc_counter(vcm->vcm_obj_hit, 1); + return 0; + } + + obj->obj_bkt_loading = 1; + + rg.cr_off = umem_get_mb_base_offset(vos_pool2umm(pool), obj->obj_bkt_ids[0]); + rg.cr_size = store->cache->ca_page_sz; + + rc = vos_cache_pin(pool, &rg, 1, false, &obj->obj_pin_hdl); + if (rc) + DL_ERROR(rc, "Failed to pin object:"DF_UOID".", DP_UOID(obj->obj_id)); + + obj->obj_bkt_loading = 0; + + ABT_mutex_lock(obj->obj_mutex); + ABT_cond_broadcast(obj->obj_wait_loading); + ABT_mutex_unlock(obj->obj_mutex); + + return rc; +} + +/* Support single evict-able bucket for this moment */ +static inline int +vos_obj_pin(struct vos_object *obj) +{ + struct vos_pool *pool = vos_obj2pool(obj); + + if (!vos_pool_is_evictable(pool)) + return 0; + + if (!obj->obj_bkt_alloted) + obj_allot_bkt(pool, obj); 
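/*
 * Illustrative sketch (not part of this patch): how a caller on an evictable
 * (md-on-ssd phase2) pool is expected to bracket its work with the new
 * acquire/pin path. vos_obj_acquire(..., true, ...) allots the evict-able bucket
 * if needed and pins it; vos_obj_release() drops the reference and the bucket is
 * unpinned once the last user releases the object. update_pinned_object() is a
 * hypothetical caller.
 */
static int
update_pinned_object(struct vos_container *cont, daos_unit_oid_t oid)
{
	struct vos_object	*obj;
	int			 rc;

	rc = vos_obj_acquire(cont, oid, true /* pin */, &obj);
	if (rc != 0)
		return rc;

	/* ... allocate/update against the pinned bucket here ... */

	vos_obj_release(obj, 0, false);
	return 0;
}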
+ + return obj_pin_bkt(pool, obj); +} + static inline void obj_release(struct daos_lru_cache *occ, struct vos_object *obj, bool evict) { D_ASSERT(obj != NULL); - /* TODO: Unpin the object in md-on-ssd phase II */ + vos_obj_unpin(obj); if (obj == &obj_local) { clean_object(obj); @@ -294,6 +441,8 @@ cache_object(struct daos_lru_cache *occ, struct vos_object **objp) /* This object should not be cached */ D_ASSERT(obj_new != NULL); D_ASSERT(obj_new->obj_df == NULL); + D_ASSERT(!obj_local.obj_bkt_alloting); + D_ASSERT(!obj_local.obj_bkt_loading); vos_ilog_fetch_move(&obj_new->obj_ilog_info, &obj_local.obj_ilog_info); obj_new->obj_toh = obj_local.obj_toh; @@ -301,6 +450,8 @@ cache_object(struct daos_lru_cache *occ, struct vos_object **objp) obj_new->obj_sync_epoch = obj_local.obj_sync_epoch; obj_new->obj_df = obj_local.obj_df; obj_new->obj_zombie = obj_local.obj_zombie; + obj_new->obj_bkt_alloted = obj_local.obj_bkt_alloted; + obj_new->obj_pin_hdl = obj_local.obj_pin_hdl; obj_local.obj_toh = DAOS_HDL_INVAL; obj_local.obj_ih = DAOS_HDL_INVAL; @@ -363,13 +514,11 @@ vos_obj_check_discard(struct vos_container *cont, daos_unit_oid_t oid, uint64_t if (rc) return rc; - /* TODO: Pin object in memory */ - if (check_discard(obj, flags)) /* Update request will retry with this error */ rc = (flags & VOS_OBJ_CREATE) ? -DER_UPDATE_AGAIN : -DER_BUSY; - obj_release(occ, obj, false); + obj_put(occ, obj, false); return rc; } @@ -420,6 +569,25 @@ vos_obj_incarnate(struct vos_object *obj, daos_epoch_range_t *epr, daos_epoch_t return -DER_TX_RESTART; } + if (obj->obj_bkt_ids[0] != UMEM_DEFAULT_MBKT_ID) { + struct vos_obj_p2_df *p2 = (struct vos_obj_p2_df *)obj->obj_df; + + D_ASSERT(vos_pool_is_evictable(vos_obj2pool(obj))); + D_ASSERT(obj->obj_bkt_alloted); + + if (p2->p2_bkt_ids[0] == UMEM_DEFAULT_MBKT_ID) { + p2->p2_bkt_ids[0] = obj->obj_bkt_ids[0]; + rc = umem_tx_add_ptr(vos_cont2umm(cont), &p2->p2_bkt_ids[0], + sizeof(p2->p2_bkt_ids[0])); + if (rc) { + DL_ERROR(rc, "Add bucket ID failed."); + return rc; + } + } else { + D_ASSERT(p2->p2_bkt_ids[0] == obj->obj_bkt_ids[0]); + } + } + /* It's done for DAOS_INTENT_PUNCH case */ if (intent == DAOS_INTENT_PUNCH) return 0; @@ -453,6 +621,7 @@ vos_obj_hold(struct vos_container *cont, daos_unit_oid_t oid, daos_epoch_range_t D_ASSERT(cont != NULL); D_ASSERT(cont->vc_pool); D_ASSERT(obj_p != NULL); + *obj_p = NULL; occ = vos_obj_cache_get(cont->vc_pool->vp_sysdb); @@ -507,8 +676,16 @@ vos_obj_hold(struct vos_container *cont, daos_unit_oid_t oid, daos_epoch_range_t D_ASSERT(tmprc == 0); /* Non-zero only valid for akey */ } - /* TODO: Pin the object in memory in md-on-ssd phase II. Revise the 'obj_local' implementation - * then, since this function could yield. */ + /* For md-on-ssd phase2 pool, add object to cache before yield in vos_obj_pin() */ + if (obj == &obj_local && vos_pool_is_evictable(cont->vc_pool)) { + rc = cache_object(occ, &obj); + if (rc != 0) + goto failed; + } + + rc = vos_obj_pin(obj); + if (rc) + goto failed; /* It's done for DAOS_INTENT_UPDATE or DAOS_INTENT_PUNCH or DAOS_INTENT_KILL */ if (intent == DAOS_INTENT_UPDATE || intent == DAOS_INTENT_PUNCH || @@ -608,3 +785,279 @@ vos_obj_evict_by_oid(struct vos_container *cont, daos_unit_oid_t oid) return (rc == -DER_NONEXIST || rc == -DER_SHUTDOWN)? 
0 : rc; } + +static int +bkt_cmp(void *array, int a, int b) +{ + uint32_t *bkt_arr = array; + + if (bkt_arr[a] > bkt_arr[b]) + return 1; + if (bkt_arr[a] < bkt_arr[b]) + return -1; + return 0; +} + +static int +bkt_cmp_key(void *array, int i, uint64_t key) +{ + uint32_t *bkt_arr = array; + uint32_t bkt_id = (uint32_t)key; + + if (bkt_arr[i] > bkt_id) + return 1; + if (bkt_arr[i] < bkt_id) + return -1; + return 0; +} + +static void +bkt_swap(void *array, int a, int b) +{ + uint32_t *bkt_arr = array; + uint32_t tmp; + + tmp = bkt_arr[a]; + bkt_arr[a] = bkt_arr[b]; + bkt_arr[b] = tmp; +} + +static daos_sort_ops_t bkt_sort_ops = { + .so_cmp = bkt_cmp, + .so_swap = bkt_swap, + .so_cmp_key = bkt_cmp_key, +}; + +/* if @sub is a subset of @super */ +bool +vos_bkt_array_subset(struct vos_bkt_array *super, struct vos_bkt_array *sub) +{ + int i, idx; + + D_ASSERT(sub->vba_cnt > 0); + if (sub->vba_cnt > super->vba_cnt) + return false; + + for (i = 0; i < sub->vba_cnt; i++) { + idx = daos_array_find(super, super->vba_cnt, sub->vba_bkts[i], &bkt_sort_ops); + if (idx < 0) + return false; + } + + return true; +} + +int +vos_bkt_array_add(struct vos_bkt_array *bkts, uint32_t bkt_id) +{ + int idx; + + D_ASSERT(bkt_id != UMEM_DEFAULT_MBKT_ID); + + /* The @bkt_id is already in bucket array */ + if (bkts->vba_cnt > 0) { + idx = daos_array_find(bkts->vba_bkts, bkts->vba_cnt, bkt_id, &bkt_sort_ops); + if (idx >= 0) + return 0; + } + + /* Bucket array needs be expanded */ + if (bkts->vba_cnt == bkts->vba_tot) { + uint32_t *new_bkts; + size_t new_size = bkts->vba_tot * 2; + + if (bkts->vba_tot > VOS_BKTS_INLINE_MAX) + D_REALLOC_ARRAY(new_bkts, bkts->vba_bkts, bkts->vba_tot, new_size); + else + D_ALLOC_ARRAY(new_bkts, new_size); + + if (new_bkts == NULL) + return -DER_NOMEM; + + if (bkts->vba_tot == VOS_BKTS_INLINE_MAX) + memcpy(new_bkts, bkts->vba_bkts, sizeof(uint32_t) * bkts->vba_tot); + + bkts->vba_bkts = new_bkts; + bkts->vba_tot = new_size; + } + + bkts->vba_bkts[bkts->vba_cnt] = bkt_id; + bkts->vba_cnt++; + + idx = daos_array_sort(bkts->vba_bkts, bkts->vba_cnt, true, &bkt_sort_ops); + D_ASSERT(idx == 0); + + return 0; +} + +int +vos_bkt_array_pin(struct vos_pool *pool, struct vos_bkt_array *bkts, + struct umem_pin_handle **pin_hdl) +{ + struct umem_cache_range rg_inline[VOS_BKTS_INLINE_MAX]; + struct umem_cache_range *ranges; + int i, rc; + + if (bkts->vba_cnt == 0) + return 0; + + if (bkts->vba_cnt > VOS_BKTS_INLINE_MAX) { + D_ALLOC_ARRAY(ranges, bkts->vba_cnt); + if (ranges == NULL) + return -DER_NOMEM; + } else { + ranges = &rg_inline[0]; + } + + for (i = 0; i < bkts->vba_cnt; i++) { + D_ASSERT(bkts->vba_bkts[i] != UMEM_DEFAULT_MBKT_ID); + ranges[i].cr_off = umem_get_mb_base_offset(vos_pool2umm(pool), bkts->vba_bkts[i]); + ranges[i].cr_size = vos_pool2store(pool)->cache->ca_page_sz; + } + + rc = vos_cache_pin(pool, ranges, bkts->vba_cnt, false, pin_hdl); + if (rc) + DL_ERROR(rc, "Failed to pin %u ranges.", bkts->vba_cnt); + + if (ranges != &rg_inline[0]) + D_FREE(ranges); + + return rc; +} + +int +vos_obj_acquire(struct vos_container *cont, daos_unit_oid_t oid, bool pin, + struct vos_object **obj_p) +{ + struct vos_object *obj; + struct daos_lru_cache *occ; + int rc; + + D_ASSERT(cont != NULL); + D_ASSERT(cont->vc_pool); + D_ASSERT(obj_p != NULL); + *obj_p = NULL; + + occ = vos_obj_cache_get(cont->vc_pool->vp_sysdb); + D_ASSERT(occ != NULL); + + /* Lookup object cache, create cache entry if not found */ + rc = obj_get(occ, cont, oid, true, &obj); + if (rc) { + DL_ERROR(rc, "Failed to lookup/create object 
in cache."); + return rc; + } + + if (obj->obj_zombie) { + D_ERROR("The object:"DF_UOID" is already evicted.\n", DP_UOID(oid)); + obj_put(occ, obj, true); + return -DER_AGAIN; + } + + /* Lookup OI table if the cached object is negative */ + if (obj->obj_df == NULL) { + obj->obj_sync_epoch = 0; + rc = vos_oi_find(cont, oid, &obj->obj_df, NULL); + if (rc == 0) { + obj->obj_sync_epoch = obj->obj_df->vo_sync; + } else if (rc == -DER_NONEXIST) { + rc = 0; + } else if (rc) { + DL_ERROR(rc, "Failed to lookup OI table."); + obj_put(occ, obj, false); + return rc; + } + } + + if (!obj->obj_bkt_alloted) + obj_allot_bkt(cont->vc_pool, obj); + + if (pin) { + rc = obj_pin_bkt(cont->vc_pool, obj); + if (rc) { + obj_put(occ, obj, false); + return rc; + } + } + + *obj_p = obj; + + return 0; +} + +struct vos_pin_handle { + unsigned int vph_acquired; + struct umem_pin_handle *vph_pin_hdl; + struct vos_object *vph_objs[0]; +}; + +void +vos_unpin_objects(daos_handle_t coh, struct vos_pin_handle *hdl) +{ + struct vos_container *cont = vos_hdl2cont(coh); + struct vos_pool *pool = vos_cont2pool(cont); + int i; + + if (hdl->vph_pin_hdl != NULL) + umem_cache_unpin(vos_pool2store(pool), hdl->vph_pin_hdl); + + for (i = 0; i < hdl->vph_acquired; i++) + vos_obj_release(hdl->vph_objs[i], 0, false); + + D_FREE(hdl); +} + +int +vos_pin_objects(daos_handle_t coh, daos_unit_oid_t oids[], int count, struct vos_pin_handle **hdl) +{ + struct vos_pin_handle *vos_hdl; + struct vos_object *obj; + struct vos_bkt_array bkts; + struct vos_container *cont = vos_hdl2cont(coh); + struct vos_pool *pool = vos_cont2pool(cont); + int i, rc; + + *hdl = NULL; + if (!vos_pool_is_evictable(pool)) + return 0; + + D_ASSERT(count > 0); + D_ALLOC(vos_hdl, sizeof(*vos_hdl) + sizeof(struct vos_object *) * count); + if (vos_hdl == NULL) + return -DER_NOMEM; + + vos_bkt_array_init(&bkts); + for (i = 0; i < count; i++) { + rc = vos_obj_acquire(cont, oids[i], false, &vos_hdl->vph_objs[i]); + if (rc) { + DL_ERROR(rc, "Failed to acquire object:"DF_UOID"", DP_UOID(oids[i])); + goto error; + } + vos_hdl->vph_acquired++; + + obj = vos_hdl->vph_objs[i]; + D_ASSERT(obj->obj_bkt_alloted == 1); + if (obj->obj_bkt_ids[0] != UMEM_DEFAULT_MBKT_ID) { + rc = vos_bkt_array_add(&bkts, obj->obj_bkt_ids[0]); + if (rc) { + DL_ERROR(rc, "Failed to add bucket:%u to array", + obj->obj_bkt_ids[0]); + goto error; + } + } + } + + rc = vos_bkt_array_pin(pool, &bkts, &vos_hdl->vph_pin_hdl); + if (rc) { + DL_ERROR(rc, "Failed to pin %u objects.", vos_hdl->vph_acquired); + goto error; + } + + vos_bkt_array_fini(&bkts); + *hdl = vos_hdl; + return 0; +error: + vos_bkt_array_fini(&bkts); + vos_unpin_objects(coh, vos_hdl); + return rc; +} diff --git a/src/vos/vos_obj_index.c b/src/vos/vos_obj_index.c index d5955384454..6a86b383c90 100644 --- a/src/vos/vos_obj_index.c +++ b/src/vos/vos_obj_index.c @@ -47,7 +47,8 @@ oi_hkey_size(void) static int oi_rec_msize(int alloc_overhead) { - return alloc_overhead + sizeof(struct vos_obj_df); + /* This function is only used for metadata overhead estimation. 
*/ + return alloc_overhead + D_ALIGNUP(sizeof(struct vos_obj_df), 32); } static void @@ -67,6 +68,15 @@ oi_hkey_cmp(struct btr_instance *tins, struct btr_record *rec, void *hkey) return dbtree_key_cmp_rc(memcmp(oid1, oid2, sizeof(*oid1))); } +static inline unsigned int +vos_obj_df_size(struct vos_pool *pool) +{ + if (vos_pool_is_p2(pool)) + return sizeof(struct vos_obj_p2_df); + + return sizeof(struct vos_obj_df); +} + static int oi_rec_alloc(struct btr_instance *tins, d_iov_t *key_iov, d_iov_t *val_iov, struct btr_record *rec, d_iov_t *val_out) @@ -76,10 +86,11 @@ oi_rec_alloc(struct btr_instance *tins, d_iov_t *key_iov, struct vos_obj_df *obj; daos_unit_oid_t *key; umem_off_t obj_off; + struct vos_pool *pool = (struct vos_pool *)tins->ti_priv; int rc; /* Allocate a PMEM value of type vos_obj_df */ - obj_off = umem_zalloc(&tins->ti_umm, sizeof(struct vos_obj_df)); + obj_off = umem_zalloc(&tins->ti_umm, vos_obj_df_size(pool)); if (UMOFF_IS_NULL(obj_off)) return -DER_NOSPACE; @@ -100,11 +111,11 @@ oi_rec_alloc(struct btr_instance *tins, d_iov_t *key_iov, } else { struct vos_obj_df *new_obj = val_out->iov_buf; - memcpy(obj, new_obj, sizeof(*obj)); + memcpy(obj, new_obj, vos_obj_df_size(pool)); obj->vo_id = *key; } - d_iov_set(val_iov, obj, sizeof(struct vos_obj_df)); + d_iov_set(val_iov, obj, vos_obj_df_size(pool)); rec->rec_off = obj_off; /* For new created object, commit it synchronously to reduce @@ -134,6 +145,7 @@ oi_rec_free(struct btr_instance *tins, struct btr_record *rec, void *args) daos_handle_t coh = { 0 }; int rc; struct vos_pool *pool; + uint32_t *bkt_ids = NULL; obj = umem_off2ptr(umm, rec->rec_off); @@ -162,7 +174,14 @@ oi_rec_free(struct btr_instance *tins, struct btr_record *rec, void *args) if (del_arg != NULL) coh = vos_cont2hdl((struct vos_container *)del_arg->cont); - return gc_add_item(tins->ti_priv, coh, GC_OBJ, rec->rec_off, 0); + + if (vos_pool_is_evictable(pool)) { + struct vos_obj_p2_df *p2 = (struct vos_obj_p2_df *)obj; + + bkt_ids = &p2->p2_bkt_ids[0]; + } + + return gc_add_item(tins->ti_priv, coh, GC_OBJ, rec->rec_off, bkt_ids); } static int @@ -176,7 +195,7 @@ oi_rec_fetch(struct btr_instance *tins, struct btr_record *rec, DP_UOID(obj->vo_id), rec->rec_off); D_ASSERT(val_iov != NULL); - d_iov_set(val_iov, obj, sizeof(struct vos_obj_df)); + d_iov_set(val_iov, obj, vos_obj_df_size((struct vos_pool *)tins->ti_priv)); return 0; } @@ -234,7 +253,6 @@ vos_oi_find(struct vos_container *cont, daos_unit_oid_t oid, } tmprc = vos_ilog_ts_add(ts_set, ilog, &oid, sizeof(oid)); - D_ASSERT(tmprc == 0); /* Non-zero return for akey only */ return rc; @@ -504,7 +522,7 @@ oi_iter_nested_tree_fetch(struct vos_iterator *iter, vos_iter_type_t type, return rc; } - D_ASSERT(rec_iov.iov_len == sizeof(struct vos_obj_df)); + D_ASSERT(rec_iov.iov_len == vos_obj_df_size(oiter->oit_cont->vc_pool)); obj = (struct vos_obj_df *)rec_iov.iov_buf; rc = oi_iter_ilog_check(obj, oiter, &info->ii_epr, false); @@ -610,7 +628,7 @@ oi_iter_match_probe(struct vos_iterator *iter, daos_anchor_t *anchor, uint32_t f goto failed; } - D_ASSERT(iov.iov_len == sizeof(struct vos_obj_df)); + D_ASSERT(iov.iov_len == vos_obj_df_size(oiter->oit_cont->vc_pool)); obj = (struct vos_obj_df *)iov.iov_buf; if (iter->it_filter_cb != NULL && (flags & VOS_ITER_PROBE_AGAIN) == 0) { @@ -767,7 +785,7 @@ oi_iter_fetch(struct vos_iterator *iter, vos_iter_entry_t *it_entry, return rc; } - D_ASSERT(rec_iov.iov_len == sizeof(struct vos_obj_df)); + D_ASSERT(rec_iov.iov_len == vos_obj_df_size(oiter->oit_cont->vc_pool)); return 
oi_iter_fill(rec_iov.iov_buf, oiter, false, it_entry); } @@ -818,7 +836,7 @@ oi_iter_check_punch(daos_handle_t ih) "Probe should be done before aggregation\n"); if (rc != 0) return rc; - D_ASSERT(rec_iov.iov_len == sizeof(struct vos_obj_df)); + D_ASSERT(rec_iov.iov_len == vos_obj_df_size(oiter->oit_cont->vc_pool)); obj = (struct vos_obj_df *)rec_iov.iov_buf; oid = obj->vo_id; @@ -883,7 +901,7 @@ oi_iter_aggregate(daos_handle_t ih, bool range_discard) "Probe should be done before aggregation\n"); if (rc != 0) return rc; - D_ASSERT(rec_iov.iov_len == sizeof(struct vos_obj_df)); + D_ASSERT(rec_iov.iov_len == vos_obj_df_size(oiter->oit_cont->vc_pool)); obj = (struct vos_obj_df *)rec_iov.iov_buf; oid = obj->vo_id; diff --git a/src/vos/vos_overhead.c b/src/vos/vos_overhead.c index fff55c67d2a..f0b0f0375d3 100644 --- a/src/vos/vos_overhead.c +++ b/src/vos/vos_overhead.c @@ -8,13 +8,13 @@ int vos_pool_get_msize(void) { - return sizeof(struct vos_pool_df); + return sizeof(struct vos_pool_df) + sizeof(struct vos_pool_ext_df); } int vos_container_get_msize(void) { - return sizeof(struct vos_cont_df); + return sizeof(struct vos_cont_df) + sizeof(struct vos_cont_ext_df); } int diff --git a/src/vos/vos_pool.c b/src/vos/vos_pool.c index 6c2e0120842..f0b8fa8604f 100644 --- a/src/vos/vos_pool.c +++ b/src/vos/vos_pool.c @@ -149,12 +149,12 @@ vos_meta_load_fn(void *arg) ABT_cond_signal(mlc->mlc_cond); } -static inline int -vos_meta_load(struct umem_store *store, char *start) +static int +vos_meta_load(struct umem_store *store, char *start, daos_off_t offset, daos_size_t len) { uint64_t read_size; - uint64_t remain_size = store->stor_size; - daos_off_t off = 0; + uint64_t remain_size = len; + daos_off_t off = offset; int rc = 0; struct meta_load_arg *mla; struct meta_load_control mlc; @@ -223,6 +223,74 @@ vos_meta_load(struct umem_store *store, char *start) return rc ? 
rc : mlc.mlc_rc; } +struct vos_waitqueue { + ABT_cond vw_cond; + ABT_mutex vw_mutex; +}; + +static int +vos_waitqueue_create(void **ret_wq) +{ + struct vos_waitqueue *wq; + int rc; + + D_ALLOC_PTR(wq); + if (wq == NULL) + return -DER_NOMEM; + + rc = ABT_mutex_create(&wq->vw_mutex); + if (rc != ABT_SUCCESS) { + D_FREE(wq); + return dss_abterr2der(rc); + } + rc = ABT_cond_create(&wq->vw_cond); + if (rc != ABT_SUCCESS) { + ABT_mutex_free(&wq->vw_mutex); + D_FREE(wq); + return dss_abterr2der(rc); + } + + *ret_wq = wq; + return 0; +} + +static void +vos_waitqueue_destroy(void *arg) +{ + struct vos_waitqueue *wq = arg; + + ABT_cond_free(&wq->vw_cond); + ABT_mutex_free(&wq->vw_mutex); + D_FREE(wq); +} + +static void +vos_waitqueue_wait(void *arg, bool yield_only) +{ + struct vos_waitqueue *wq = arg; + + if (yield_only) { + ABT_thread_yield(); + return; + } + ABT_mutex_lock(wq->vw_mutex); + ABT_cond_wait(wq->vw_cond, wq->vw_mutex); + ABT_mutex_unlock(wq->vw_mutex); +} + +static void +vos_waitqueue_wakeup(void *arg, bool wakeup_all) +{ + struct vos_waitqueue *wq = arg; + + ABT_mutex_lock(wq->vw_mutex); + if (wakeup_all) + ABT_cond_broadcast(wq->vw_cond); + else + ABT_cond_signal(wq->vw_cond); + ABT_mutex_unlock(wq->vw_mutex); +} + static inline int vos_meta_writev(struct umem_store *store, struct umem_store_iod *iod, d_sg_list_t *sgl) { @@ -353,13 +421,75 @@ vos_wal_metrics_init(struct vos_wal_metrics *vw_metrics, const char *path, int t D_WARN("Failed to create 'replay_entries' telemetry: "DF_RC"\n", DP_RC(rc)); } +#define VOS_CACHE_DIR "vos_cache" + +void +vos_cache_metrics_init(struct vos_cache_metrics *vc_metrics, const char *path, int tgt_id) +{ + int rc; + + rc = d_tm_add_metric(&vc_metrics->vcm_pg_ne, D_TM_GAUGE, "Non-evictable pages", + "pages", "%s/%s/page_ne/tgt_%d", path, VOS_CACHE_DIR, tgt_id); + if (rc) + DL_WARN(rc, "Failed to create non-evictable pages telemetry."); + + rc = d_tm_add_metric(&vc_metrics->vcm_pg_pinned, D_TM_GAUGE, "Pinned pages", + "pages", "%s/%s/page_pinned/tgt_%d", path, VOS_CACHE_DIR, tgt_id); + if (rc) + DL_WARN(rc, "Failed to create pinned pages telemetry."); + + rc = d_tm_add_metric(&vc_metrics->vcm_pg_free, D_TM_GAUGE, "Free pages", + "pages", "%s/%s/page_free/tgt_%d", path, VOS_CACHE_DIR, tgt_id); + if (rc) + DL_WARN(rc, "Failed to create free pages telemetry."); + + rc = d_tm_add_metric(&vc_metrics->vcm_pg_hit, D_TM_COUNTER, "Page cache hit", + "hits", "%s/%s/page_hit/tgt_%d", path, VOS_CACHE_DIR, tgt_id); + if (rc) + DL_WARN(rc, "Failed to create page hit telemetry."); + + rc = d_tm_add_metric(&vc_metrics->vcm_pg_miss, D_TM_COUNTER, "Page cache miss", + "misses", "%s/%s/page_miss/tgt_%d", path, VOS_CACHE_DIR, tgt_id); + if (rc) + DL_WARN(rc, "Failed to create page miss telemetry."); + + rc = d_tm_add_metric(&vc_metrics->vcm_pg_evict, D_TM_COUNTER, "Page cache evict", + "pages", "%s/%s/page_evict/tgt_%d", path, VOS_CACHE_DIR, tgt_id); + if (rc) + DL_WARN(rc, "Failed to create page evict telemetry."); + + rc = d_tm_add_metric(&vc_metrics->vcm_pg_flush, D_TM_COUNTER, "Page cache flush", + "pages", "%s/%s/page_flush/tgt_%d", path, VOS_CACHE_DIR, tgt_id); + if (rc) + DL_WARN(rc, "Failed to create page flush telemetry."); + + rc = d_tm_add_metric(&vc_metrics->vcm_pg_load, D_TM_COUNTER, "Page cache load", + "pages", "%s/%s/page_load/tgt_%d", path, VOS_CACHE_DIR, tgt_id); + if (rc) + DL_WARN(rc, "Failed to create page load telemetry."); + + rc = d_tm_add_metric(&vc_metrics->vcm_obj_hit, D_TM_COUNTER, "Object cache hit", + "hits", "%s/%s/obj_hit/tgt_%d", path, 
VOS_CACHE_DIR, tgt_id); + if (rc) + DL_WARN(rc, "Failed to create object hit telemetry."); + +} + +static inline struct vos_wal_metrics * +store2wal_metrics(struct umem_store *store) +{ + struct vos_pool_metrics *vpm = (struct vos_pool_metrics *)store->stor_stats; + + return vpm != NULL ? &vpm->vp_wal_metrics : NULL; +} + static inline int vos_wal_reserve(struct umem_store *store, uint64_t *tx_id) { struct bio_wal_info wal_info; struct vos_pool *pool; struct bio_wal_stats ws = { 0 }; - struct vos_wal_metrics *vwm; + struct vos_wal_metrics *vwm = store2wal_metrics(store); int rc; pool = store->vos_priv; @@ -377,7 +507,6 @@ vos_wal_reserve(struct umem_store *store, uint64_t *tx_id) reserve: D_ASSERT(store && store->stor_priv != NULL); - vwm = (struct vos_wal_metrics *)store->stor_stats; rc = bio_wal_reserve(store->stor_priv, tx_id, (vwm != NULL) ? &ws : NULL); if (rc == 0 && vwm != NULL) d_tm_set_gauge(vwm->vwm_wal_waiters, ws.ws_waiters); @@ -391,11 +520,10 @@ vos_wal_commit(struct umem_store *store, struct umem_wal_tx *wal_tx, void *data_ struct bio_wal_info wal_info; struct vos_pool *pool; struct bio_wal_stats ws = {0}; - struct vos_wal_metrics *vwm; + struct vos_wal_metrics *vwm = store2wal_metrics(store); int rc; D_ASSERT(store && store->stor_priv != NULL); - vwm = (struct vos_wal_metrics *)store->stor_stats; if (vwm != NULL) d_tm_mark_duration_start(vwm->vwm_wal_dur, D_TM_CLOCK_REALTIME); rc = bio_wal_commit(store->stor_priv, wal_tx, data_iod, (vwm != NULL) ? &ws : NULL); @@ -426,6 +554,9 @@ vos_wal_commit(struct umem_store *store, struct umem_wal_tx *wal_tx, void *data_ d_tm_set_gauge(vwm->vwm_wal_qd, ws.ws_qd); } + bio_wal_query(store->stor_priv, &wal_info); + umem_cache_commit(store, wal_info.wi_commit_id); + pool = store->vos_priv; if (unlikely(pool == NULL)) return 0; /** In case there is any race for checkpoint init. */ @@ -433,8 +564,6 @@ vos_wal_commit(struct umem_store *store, struct umem_wal_tx *wal_tx, void *data_ /** Update checkpoint state after commit in case there is an active checkpoint waiting * for this commit to finish. */ - bio_wal_query(store->stor_priv, &wal_info); - pool->vp_update_cb(pool->vp_chkpt_arg, wal_info.wi_commit_id, wal_info.wi_used_blks, wal_info.wi_tot_blks); @@ -446,18 +575,15 @@ vos_wal_replay(struct umem_store *store, int (*replay_cb)(uint64_t tx_id, struct umem_action *act, void *arg), void *arg) { - struct bio_wal_rp_stats wrs; - int rc; + struct bio_wal_rp_stats wrs; + struct vos_wal_metrics *vwm = store2wal_metrics(store); + int rc; D_ASSERT(store && store->stor_priv != NULL); - rc = bio_wal_replay(store->stor_priv, - (store->stor_stats != NULL) ? &wrs : NULL, - replay_cb, arg); + rc = bio_wal_replay(store->stor_priv, (vwm != NULL) ? 
&wrs : NULL, replay_cb, arg); /* VOS file rehydration metrics */ - if (store->stor_stats != NULL && rc >= 0) { - struct vos_wal_metrics *vwm = (struct vos_wal_metrics *)store->stor_stats; - + if (vwm != NULL && rc >= 0) { d_tm_inc_counter(vwm->vwm_replay_count, 1); d_tm_set_gauge(vwm->vwm_replay_size, wrs.wrs_sz); d_tm_set_gauge(vwm->vwm_replay_time, wrs.wrs_tm); @@ -475,6 +601,10 @@ vos_wal_id_cmp(struct umem_store *store, uint64_t id1, uint64_t id2) } struct umem_store_ops vos_store_ops = { + .so_waitqueue_create = vos_waitqueue_create, + .so_waitqueue_destroy = vos_waitqueue_destroy, + .so_waitqueue_wait = vos_waitqueue_wait, + .so_waitqueue_wakeup = vos_waitqueue_wakeup, .so_load = vos_meta_load, .so_read = vos_meta_readv, .so_write = vos_meta_writev, @@ -667,30 +797,90 @@ vos2mc_flags(unsigned int vos_flags) return mc_flags; } +static inline void +init_umem_store(struct umem_store *store, struct bio_meta_context *mc) +{ + bio_meta_get_attr(mc, &store->stor_size, &store->stor_blk_size, &store->stor_hdr_blks, + (uint8_t *)&store->store_type, &store->store_evictable); + store->stor_priv = mc; + store->stor_ops = &vos_store_ops; + + /* Legacy BMEM V1 pool without backend type stored */ + if (bio_nvme_configured(SMD_DEV_TYPE_META) && store->store_type == DAOS_MD_PMEM) + store->store_type = DAOS_MD_BMEM; +} + +static int +vos_pool_store_type(daos_size_t scm_sz, daos_size_t meta_sz) +{ + int backend; + + backend = umempobj_get_backend_type(); + D_ASSERT((meta_sz != 0) && (scm_sz != 0)); + + if (scm_sz > meta_sz) { + D_ERROR("memsize %lu is greater than metasize %lu", scm_sz, meta_sz); + return -DER_INVAL; + } + + if (scm_sz < meta_sz) { + if ((backend == DAOS_MD_BMEM) && umempobj_allow_md_bmem_v2()) + backend = DAOS_MD_BMEM_V2; + else if (backend != DAOS_MD_BMEM_V2) { + D_ERROR("scm_sz %lu is less than meta_sz %lu", scm_sz, meta_sz); + return -DER_INVAL; + } + } + + return backend; +} + +int +vos_pool_roundup_size(daos_size_t *scm_sz, daos_size_t *meta_sz) +{ + size_t alignsz; + int rc; + + D_ASSERT(*scm_sz != 0); + rc = vos_pool_store_type(*scm_sz, *meta_sz ? 
*meta_sz : *scm_sz); + if (rc < 0) + return rc; + + /* Round up the size such that it is compatible with backend */ + alignsz = umempobj_pgsz(rc); + *scm_sz = max(D_ALIGNUP(*scm_sz, alignsz), 1 << 24); + if (*meta_sz) + *meta_sz = max(D_ALIGNUP(*meta_sz, alignsz), 1 << 24); + + return 0; +} + static int vos_pmemobj_create(const char *path, uuid_t pool_id, const char *layout, - size_t scm_sz, size_t nvme_sz, size_t wal_sz, unsigned int flags, - struct umem_pool **ph) + size_t scm_sz, size_t nvme_sz, size_t wal_sz, size_t meta_sz, + unsigned int flags, struct umem_pool **ph) { struct bio_xs_context *xs_ctxt = vos_xsctxt_get(); struct umem_store store = { 0 }; struct bio_meta_context *mc; struct umem_pool *pop = NULL; enum bio_mc_flags mc_flags = vos2mc_flags(flags); - size_t meta_sz = scm_sz; int rc, ret; + size_t scm_sz_actual; *ph = NULL; /* always use PMEM mode for SMD */ - store.store_type = umempobj_get_backend_type(); if (flags & VOS_POF_SYSDB) { store.store_type = DAOS_MD_PMEM; store.store_standalone = true; + goto umem_create; } /* No NVMe is configured or current xstream doesn't have NVMe context */ - if (!bio_nvme_configured(SMD_DEV_TYPE_MAX) || xs_ctxt == NULL) + if (!bio_nvme_configured(SMD_DEV_TYPE_MAX) || xs_ctxt == NULL) { + store.store_type = DAOS_MD_PMEM; goto umem_create; + } if (!scm_sz) { struct stat lstat; @@ -698,14 +888,28 @@ vos_pmemobj_create(const char *path, uuid_t pool_id, const char *layout, rc = stat(path, &lstat); if (rc != 0) return daos_errno2der(errno); - meta_sz = lstat.st_size; + scm_sz_actual = lstat.st_size; + } else + scm_sz_actual = scm_sz; + + /* If meta_sz is set then use it, otherwise derive from VOS file size or scm_sz */ + if (!meta_sz) + meta_sz = scm_sz_actual; + + rc = vos_pool_store_type(scm_sz_actual, meta_sz); + if (rc < 0) { + D_ERROR("Failed to determine the store type for xs:%p pool:"DF_UUID". "DF_RC, + xs_ctxt, DP_UUID(pool_id), DP_RC(rc)); + return rc; + } + store.store_type = rc; D_DEBUG(DB_MGMT, "Create BIO meta context for xs:%p pool:"DF_UUID" " - "meta_sz: %zu, nvme_sz: %zu wal_sz:%zu\n", - xs_ctxt, DP_UUID(pool_id), meta_sz, nvme_sz, wal_sz); + "scm_sz: %zu meta_sz: %zu, nvme_sz: %zu wal_sz:%zu backend:%d\n", + xs_ctxt, DP_UUID(pool_id), scm_sz, meta_sz, nvme_sz, wal_sz, store.store_type); - rc = bio_mc_create(xs_ctxt, pool_id, meta_sz, wal_sz, nvme_sz, mc_flags); + rc = bio_mc_create(xs_ctxt, pool_id, scm_sz_actual, meta_sz, wal_sz, nvme_sz, mc_flags, + store.store_type); if (rc != 0) { D_ERROR("Failed to create BIO meta context for xs:%p pool:"DF_UUID". 
"DF_RC"\n", xs_ctxt, DP_UUID(pool_id), DP_RC(rc)); @@ -724,11 +928,11 @@ vos_pmemobj_create(const char *path, uuid_t pool_id, const char *layout, return rc; } - bio_meta_get_attr(mc, &store.stor_size, &store.stor_blk_size, &store.stor_hdr_blks); - store.stor_priv = mc; - store.stor_ops = &vos_store_ops; + init_umem_store(&store, mc); umem_create: + D_DEBUG(DB_MGMT, "umempobj_create sz: " DF_U64 " store_sz: " DF_U64, scm_sz, + store.stor_size); pop = umempobj_create(path, layout, UMEMPOBJ_ENABLE_STATS, scm_sz, 0600, &store); if (pop != NULL) { *ph = pop; @@ -764,15 +968,17 @@ vos_pmemobj_open(const char *path, uuid_t pool_id, const char *layout, unsigned *ph = NULL; /* always use PMEM mode for SMD */ - store.store_type = umempobj_get_backend_type(); if (flags & VOS_POF_SYSDB) { store.store_type = DAOS_MD_PMEM; store.store_standalone = true; + goto umem_open; } /* No NVMe is configured or current xstream doesn't have NVMe context */ - if (!bio_nvme_configured(SMD_DEV_TYPE_MAX) || xs_ctxt == NULL) + if (!bio_nvme_configured(SMD_DEV_TYPE_MAX) || xs_ctxt == NULL) { + store.store_type = DAOS_MD_PMEM; goto umem_open; + } D_DEBUG(DB_MGMT, "Open BIO meta context for xs:%p pool:"DF_UUID"\n", xs_ctxt, DP_UUID(pool_id)); @@ -784,14 +990,8 @@ vos_pmemobj_open(const char *path, uuid_t pool_id, const char *layout, unsigned return rc; } - bio_meta_get_attr(mc, &store.stor_size, &store.stor_blk_size, &store.stor_hdr_blks); - store.stor_priv = mc; - store.stor_ops = &vos_store_ops; - if (metrics != NULL) { - struct vos_pool_metrics *vpm = (struct vos_pool_metrics *)metrics; - - store.stor_stats = &vpm->vp_wal_metrics; - } + init_umem_store(&store, mc); + store.stor_stats = metrics; umem_open: pop = umempobj_open(path, layout, UMEMPOBJ_ENABLE_STATS, &store); @@ -1014,7 +1214,8 @@ static int pool_open(void *ph, struct vos_pool_df *pool_df, int vos_pool_create_ex(const char *path, uuid_t uuid, daos_size_t scm_sz, daos_size_t nvme_sz, - daos_size_t wal_sz, unsigned int flags, uint32_t version, daos_handle_t *poh) + daos_size_t wal_sz, daos_size_t meta_sz, unsigned int flags, uint32_t version, + daos_handle_t *poh) { struct umem_pool *ph; struct umem_attr uma = {0}; @@ -1036,9 +1237,9 @@ vos_pool_create_ex(const char *path, uuid_t uuid, daos_size_t scm_sz, daos_size_ return -DER_INVAL; D_DEBUG(DB_MGMT, - "Pool Path: %s, size: " DF_U64 ":" DF_U64 ", " + "Pool Path: %s, size: " DF_U64 ":" DF_U64 ":" DF_U64 ", " "UUID: " DF_UUID ", version: %u\n", - path, scm_sz, nvme_sz, DP_UUID(uuid), version); + path, scm_sz, nvme_sz, meta_sz, DP_UUID(uuid), version); if (flags & VOS_POF_SMALL) flags |= VOS_POF_EXCL; @@ -1054,15 +1255,16 @@ vos_pool_create_ex(const char *path, uuid_t uuid, daos_size_t scm_sz, daos_size_ } /* Path must be a file with a certain size when size argument is 0 */ - if (!scm_sz && access(path, F_OK) == -1) { + if (!scm_sz && access(path, F_OK | R_OK | W_OK) == -1) { D_ERROR("File not accessible (%d) when size is 0\n", errno); return daos_errno2der(errno); } - rc = vos_pmemobj_create(path, uuid, VOS_POOL_LAYOUT, scm_sz, nvme_sz, wal_sz, flags, &ph); + rc = vos_pmemobj_create(path, uuid, VOS_POOL_LAYOUT, scm_sz, nvme_sz, wal_sz, meta_sz, + flags, &ph); if (rc) { - D_ERROR("Failed to create pool %s, scm_sz="DF_U64", nvme_sz="DF_U64". "DF_RC"\n", - path, scm_sz, nvme_sz, DP_RC(rc)); + D_ERROR("Failed to create pool %s, scm_sz="DF_U64", nvme_sz="DF_U64", meta_sz=" + DF_U64". 
"DF_RC"\n", path, scm_sz, nvme_sz, meta_sz, DP_RC(rc)); return rc; } @@ -1096,6 +1298,18 @@ vos_pool_create_ex(const char *path, uuid_t uuid, daos_size_t scm_sz, daos_size_ goto end; memset(pool_df, 0, sizeof(*pool_df)); + + pool_df->pd_ext = umem_zalloc(&umem, sizeof(struct vos_pool_ext_df)); + if (UMOFF_IS_NULL(pool_df->pd_ext)) { + D_ERROR("Failed to allocate pool df extension.\n"); + rc = -DER_NOSPACE; + goto end; + } + + rc = gc_init_pool(&umem, pool_df); + if (rc) + goto end; + rc = dbtree_create_inplace(VOS_BTR_CONT_TABLE, 0, VOS_CONT_ORDER, &uma, &pool_df->pd_cont_root, &hdl); if (rc != 0) @@ -1104,15 +1318,14 @@ vos_pool_create_ex(const char *path, uuid_t uuid, daos_size_t scm_sz, daos_size_ dbtree_close(hdl); uuid_copy(pool_df->pd_id, uuid); - pool_df->pd_scm_sz = scm_sz; + /* Use meta-blob size as scm if present */ + pool_df->pd_scm_sz = (meta_sz) ? meta_sz : scm_sz; pool_df->pd_nvme_sz = nvme_sz; pool_df->pd_magic = POOL_DF_MAGIC; if (DAOS_FAIL_CHECK(FLC_POOL_DF_VER)) pool_df->pd_version = 0; else pool_df->pd_version = version; - - gc_init_pool(&umem, pool_df); end: /** * The transaction can in reality be aborted @@ -1172,11 +1385,11 @@ vos_pool_create_ex(const char *path, uuid_t uuid, daos_size_t scm_sz, daos_size_ } int -vos_pool_create(const char *path, uuid_t uuid, daos_size_t scm_sz, daos_size_t nvme_sz, - unsigned int flags, uint32_t version, daos_handle_t *poh) +vos_pool_create(const char *path, uuid_t uuid, daos_size_t scm_sz, daos_size_t data_sz, + daos_size_t meta_sz, unsigned int flags, uint32_t version, daos_handle_t *poh) { /* create vos pool with default WAL size */ - return vos_pool_create_ex(path, uuid, scm_sz, nvme_sz, 0, flags, version, poh); + return vos_pool_create_ex(path, uuid, scm_sz, data_sz, 0, meta_sz, flags, version, poh); } /** @@ -1399,15 +1612,8 @@ pool_open(void *ph, struct vos_pool_df *pool_df, unsigned int flags, void *metri /* Insert the opened pool to the uuid hash table */ uuid_copy(ukey.uuid, pool_df->pd_id); pool->vp_sysdb = !!(flags & VOS_POF_SYSDB); - rc = pool_link(pool, &ukey, poh); - if (rc) { - D_ERROR("Error inserting into vos DRAM hash\n"); - D_GOTO(failed, rc); - } - pool->vp_dtx_committed_count = 0; pool->vp_pool_df = pool_df; - pool->vp_opened = 1; pool->vp_excl = !!(flags & VOS_POF_EXCL); pool->vp_small = !!(flags & VOS_POF_SMALL); @@ -1425,6 +1631,16 @@ pool_open(void *ph, struct vos_pool_df *pool_df, unsigned int flags, void *metri else pool->vp_data_thresh = DAOS_PROP_PO_DATA_THRESH_DEFAULT; + rc = gc_open_pool(pool); + if (rc) + goto failed; + + rc = pool_link(pool, &ukey, poh); + if (rc) { + D_ERROR("Error inserting into vos DRAM hash\n"); + D_GOTO(failed, rc); + } + vos_space_sys_init(pool); /* Ensure GC is triggered after server restart */ gc_add_pool(pool); @@ -1616,10 +1832,12 @@ vos_pool_close(daos_handle_t poh) pool->vp_opened--; /* If the last reference is holding by GC */ - if (pool->vp_opened == 1 && gc_have_pool(pool)) + if (pool->vp_opened == 1 && gc_have_pool(pool)) { gc_del_pool(pool); - else if (pool->vp_opened == 0) + } else if (pool->vp_opened == 0) { vos_pool_hash_del(pool); + gc_close_pool(pool); + } vos_pool_decref(pool); /* -1 for myself */ return 0; diff --git a/src/vos/vos_query.c b/src/vos/vos_query.c index e924e4016b6..b4d414012e5 100644 --- a/src/vos/vos_query.c +++ b/src/vos/vos_query.c @@ -162,7 +162,7 @@ query_normal_recx(struct open_query *query, daos_recx_t *recx) uint32_t inob; - vos_evt_desc_cbs_init(&cbs, query->qt_pool, query->qt_coh); + vos_evt_desc_cbs_init(&cbs, query->qt_pool, 
query->qt_coh, query->qt_obj); rc = evt_open(query->qt_recx_root, &query->qt_pool->vp_uma, &cbs, &toh); if (rc != 0) return rc; @@ -344,7 +344,7 @@ query_ec_recx(struct open_query *query, daos_recx_t *recx) bool prefresh = true; - vos_evt_desc_cbs_init(&cbs, query->qt_pool, query->qt_coh); + vos_evt_desc_cbs_init(&cbs, query->qt_pool, query->qt_coh, query->qt_obj); rc = evt_open(query->qt_recx_root, &query->qt_pool->vp_uma, &cbs, &toh); if (rc != 0) return rc; @@ -517,7 +517,7 @@ open_and_query_key(struct open_query *query, daos_key_t *key, return -DER_NONEXIST; rc = dbtree_open_inplace_ex(to_open, &query->qt_pool->vp_uma, - query->qt_coh, query->qt_pool, toh); + query->qt_coh, query->qt_obj, toh); if (rc != 0) return rc; diff --git a/src/vos/vos_space.c b/src/vos/vos_space.c index 5763e3f8bac..35a407e2b3e 100644 --- a/src/vos/vos_space.c +++ b/src/vos/vos_space.c @@ -126,7 +126,7 @@ vos_space_query(struct vos_pool *pool, struct vos_pool_space *vps, bool slow) struct vos_pool_df *df = pool->vp_pool_df; struct vea_attr *attr = &vps->vps_vea_attr; struct vea_stat *stat = slow ? &vps->vps_vea_stat : NULL; - daos_size_t scm_used; + daos_size_t scm_used, ne_used; int rc; SCM_TOTAL(vps) = df->pd_scm_sz; @@ -143,6 +143,27 @@ vos_space_query(struct vos_pool *pool, struct vos_pool_space *vps, bool slow) return rc; } + /* Query non-evictable zones usage when the phase2 pool is evictable */ + if (vos_pool_is_evictable(pool)) { + rc = umempobj_get_mbusage(vos_pool2umm(pool)->umm_pool, UMEM_DEFAULT_MBKT_ID, + &ne_used, &vps->vps_ne_total); + if (rc) { + rc = umem_tx_errno(rc); + DL_ERROR(rc, "Query pool:"DF_UUID" NE space usage failed.", + DP_UUID(pool->vp_id)); + return rc; + } + if (ne_used > vps->vps_ne_total) { + D_ERROR("NE used:"DF_U64" > NE total:"DF_U64"\n", + ne_used, vps->vps_ne_total); + return -DER_INVAL; + } + vps->vps_ne_free = vps->vps_ne_total - ne_used; + } else { + vps->vps_ne_total = 0; + vps->vps_ne_free = 0; + } + /* * FIXME: pmemobj_ctl_get() sometimes return an insane large value, it * could be a PMDK defect. diff --git a/src/vos/vos_tree.c b/src/vos/vos_tree.c index e9dd4e94436..c7aa8b57f5e 100644 --- a/src/vos/vos_tree.c +++ b/src/vos/vos_tree.c @@ -154,8 +154,9 @@ ktr_hkey_gen(struct btr_instance *tins, d_iov_t *key_iov, void *hkey) { struct ktr_hkey *kkey = (struct ktr_hkey *)hkey; struct umem_pool *umm_pool = tins->ti_umm.umm_pool; - struct vos_pool *pool = (struct vos_pool *)tins->ti_priv; + struct vos_pool *pool; + pool = vos_obj2pool(tins->ti_priv); D_ASSERT(key_iov->iov_len < pool->vp_pool_df->pd_scm_sz); hkey_common_gen(key_iov, hkey); @@ -255,7 +256,7 @@ ktr_rec_alloc(struct btr_instance *tins, d_iov_t *key_iov, rbund = iov2rec_bundle(val_iov); - rec->rec_off = umem_zalloc(&tins->ti_umm, vos_krec_size(rbund)); + rec->rec_off = vos_obj_alloc(&tins->ti_umm, tins->ti_priv, vos_krec_size(rbund), true); if (UMOFF_IS_NULL(rec->rec_off)) return -DER_NOSPACE; @@ -286,6 +287,8 @@ ktr_rec_free(struct btr_instance *tins, struct btr_record *rec, void *args) int gc; int rc; struct vos_pool *pool; + struct vos_object *obj; + uint32_t *bkt_ids = NULL; if (UMOFF_IS_NULL(rec->rec_off)) return 0; @@ -298,14 +301,22 @@ ktr_rec_free(struct btr_instance *tins, struct btr_record *rec, void *args) if (rc != 0) return rc; - pool = (struct vos_pool *)tins->ti_priv; + D_ASSERT(tins->ti_priv); + obj = tins->ti_priv; + pool = vos_obj2pool(obj); + vos_ilog_ts_evict(&krec->kr_ilog, (krec->kr_bmap & KREC_BF_DKEY) ? 
VOS_TS_TYPE_DKEY : VOS_TS_TYPE_AKEY, pool->vp_sysdb); - D_ASSERT(tins->ti_priv); gc = (krec->kr_bmap & KREC_BF_DKEY) ? GC_DKEY : GC_AKEY; coh = vos_cont2hdl(args); - return gc_add_item(pool, coh, gc, rec->rec_off, 0); + + if (vos_pool_is_evictable(pool)) { + D_ASSERT(obj->obj_bkt_alloted == 1); + bkt_ids = &obj->obj_bkt_ids[0]; + } + + return gc_add_item(pool, coh, gc, rec->rec_off, bkt_ids); } static int @@ -351,7 +362,7 @@ ktr_rec_update(struct btr_instance *tins, struct btr_record *rec, static umem_off_t ktr_node_alloc(struct btr_instance *tins, int size) { - return umem_zalloc(&tins->ti_umm, size); + return vos_obj_alloc(&tins->ti_umm, tins->ti_priv, size, true); } static btr_ops_t key_btr_ops = { @@ -636,7 +647,7 @@ svt_free_payload(struct vos_pool *pool, bio_addr_t *addr, uint64_t rsize) } else if (addr->ba_type == DAOS_MEDIA_NVME) { rc = vos_bio_addr_free(pool, addr, rsize); if (rc) - DL_ERROR(rc, "Free SV payload on NVMe failed."); + DL_ERROR(rc, "Free SV payload on NVMe failed."); } /* Payload is allocated along with vos_iref_df when SV is stored on SCM */ @@ -670,7 +681,10 @@ svt_rec_free_internal(struct btr_instance *tins, struct btr_record *rec, return rc; if (!overwrite) { - struct vos_pool *pool = tins->ti_priv; + struct vos_pool *pool; + + D_ASSERT(tins->ti_priv != NULL); + pool = vos_obj2pool(tins->ti_priv); rc = svt_free_payload(pool, addr, irec->ir_size); if (rc) @@ -762,7 +776,7 @@ svt_check_availability(struct btr_instance *tins, struct btr_record *rec, static umem_off_t svt_node_alloc(struct btr_instance *tins, int size) { - return umem_zalloc(&tins->ti_umm, size); + return vos_obj_alloc(&tins->ti_umm, tins->ti_priv, size, true); } static btr_ops_t singv_btr_ops = { @@ -850,12 +864,13 @@ evt_dop_log_del(struct umem_instance *umm, daos_epoch_t epoch, } void -vos_evt_desc_cbs_init(struct evt_desc_cbs *cbs, struct vos_pool *pool, - daos_handle_t coh) +vos_evt_desc_cbs_init(struct evt_desc_cbs *cbs, struct vos_pool *pool, daos_handle_t coh, + struct vos_object *obj) { /* NB: coh is not required for destroy */ cbs->dc_bio_free_cb = evt_dop_bio_free; cbs->dc_bio_free_args = (void *)pool; + cbs->dc_alloc_arg = (void *)obj; cbs->dc_log_status_cb = evt_dop_log_status; cbs->dc_log_status_args = (void *)(unsigned long)coh.cookie; cbs->dc_log_add_cb = evt_dop_log_add; @@ -877,7 +892,7 @@ tree_open_create(struct vos_object *obj, enum vos_tree_class tclass, int flags, int unexpected_flag; int rc = 0; - vos_evt_desc_cbs_init(&cbs, pool, coh); + vos_evt_desc_cbs_init(&cbs, pool, coh, obj); if ((krec->kr_bmap & (KREC_BF_BTR | KREC_BF_EVT)) == 0) goto create; @@ -903,7 +918,7 @@ tree_open_create(struct vos_object *obj, enum vos_tree_class tclass, int flags, if (expected_flag == KREC_BF_EVT) { rc = evt_open(&krec->kr_evt, uma, &cbs, sub_toh); } else { - rc = dbtree_open_inplace_ex(&krec->kr_btr, uma, coh, pool, sub_toh); + rc = dbtree_open_inplace_ex(&krec->kr_btr, uma, coh, obj, sub_toh); } if (rc != 0) D_ERROR("Failed to open tree: " DF_RC "\n", DP_RC(rc)); @@ -972,7 +987,7 @@ tree_open_create(struct vos_object *obj, enum vos_tree_class tclass, int flags, rc = dbtree_create_inplace_ex(ta->ta_class, tree_feats, ta->ta_order, uma, &krec->kr_btr, - coh, pool, sub_toh); + coh, obj, sub_toh); if (rc != 0) { D_ERROR("Failed to create btree: "DF_RC"\n", DP_RC(rc)); goto out; @@ -1254,14 +1269,13 @@ obj_tree_init(struct vos_object *obj) ta->ta_order, vos_obj2uma(obj), &obj->obj_df->vo_tree, vos_cont2hdl(obj->obj_cont), - vos_obj2pool(obj), - &obj->obj_toh); + obj, &obj->obj_toh); } else { 
D_DEBUG(DB_DF, "Open btree for object\n"); rc = dbtree_open_inplace_ex(&obj->obj_df->vo_tree, vos_obj2uma(obj), vos_cont2hdl(obj->obj_cont), - vos_obj2pool(obj), &obj->obj_toh); + obj, &obj->obj_toh); } if (rc) diff --git a/utils/build.config b/utils/build.config index 5b039750a9b..55dc0b05862 100644 --- a/utils/build.config +++ b/utils/build.config @@ -8,7 +8,7 @@ pmdk=2.1.0 isal=v2.30.0 isal_crypto=v2.23.0 spdk=v22.01.2 -ofi=v1.19.1 +ofi=v1.22.0 mercury=v2.4.0rc5 protobufc=v1.3.3 ucx=v1.14.1 @@ -27,7 +27,6 @@ ucx=https://github.com/openucx/ucx.git [patch_versions] spdk=https://github.com/spdk/spdk/commit/b0aba3fcd5aceceea530a702922153bc75664978.diff,https://github.com/spdk/spdk/commit/445a4c808badbad3942696ecf16fa60e8129a747.diff -ofi=https://github.com/ofiwg/libfabric/commit/d827c6484cc5bf67dfbe395890e258860c3f0979.diff fuse=https://github.com/libfuse/libfuse/commit/c9905341ea34ff9acbc11b3c53ba8bcea35eeed8.diff mercury=https://raw.githubusercontent.com/daos-stack/mercury/f3dc286fb40ec1a3a38a2e17c45497bc2aa6290d/na_ucx.patch pmdk=https://github.com/pmem/pmdk/commit/2abe15ac0b4eed894b6768cd82a3b0a7c4336284.diff diff --git a/utils/node_local_test.py b/utils/node_local_test.py index 1394dc8182a..e49537f3098 100755 --- a/utils/node_local_test.py +++ b/utils/node_local_test.py @@ -1336,7 +1336,7 @@ def __str__(self): return f'DFuse instance at {self.dir} ({running})' - def start(self, v_hint=None, single_threaded=False, use_oopt=False): + def start(self, v_hint=None, use_oopt=False): """Start a dfuse instance""" # pylint: disable=too-many-branches dfuse_bin = join(self.conf['PREFIX'], 'bin', 'dfuse') @@ -1384,9 +1384,7 @@ def start(self, v_hint=None, single_threaded=False, use_oopt=False): if self.multi_user: cmd.append('--multi-user') - if single_threaded: - cmd.append('--singlethread') - elif not self.cores: + if not self.cores: # Use a lower default thread-count for NLT due to running tests in parallel. 
cmd.extend(['--thread-count', '4']) @@ -1979,11 +1977,9 @@ class needs_dfuse_with_opt(): wrapping_lock = threading.Lock() # pylint: disable=too-few-public-methods - def __init__(self, caching_variants=None, wbcache=True, single_threaded=False, - dfuse_inval=True, ro=False): + def __init__(self, caching_variants=None, wbcache=True, dfuse_inval=True, ro=False): self.caching_variants = caching_variants if caching_variants else [False, True] self.wbcache = wbcache - self.single_threaded = single_threaded self.dfuse_inval = dfuse_inval self.ro = ro @@ -2019,7 +2015,7 @@ def _helper(obj): caching=caching, wbcache=self.wbcache, **args) - obj.dfuse.start(v_hint=method.__name__, single_threaded=self.single_threaded) + obj.dfuse.start(v_hint=method.__name__) try: rc = method(obj) finally: @@ -2677,11 +2673,6 @@ def test_readdir_unlink(self): assert len(post_files) == len(files) - 1 assert post_files == files[:-2] + [files[-1]] - @needs_dfuse_with_opt(single_threaded=True, caching_variants=[True]) - def test_single_threaded(self): - """Test single-threaded mode""" - self.readdir_test(10) - @needs_dfuse def test_open_replaced(self): """Test that fstat works on file clobbered by rename""" @@ -5919,7 +5910,7 @@ def test_dfuse_start(server, conf, wf): cmd = [join(conf['PREFIX'], 'bin', 'dfuse'), '--mountpoint', mount_point, - '--pool', pool.id(), '--cont', container.id(), '--foreground', '--singlethread'] + '--pool', pool.id(), '--cont', container.id(), '--foreground', '--thread-count=2'] test_cmd = AllocFailTest(conf, 'dfuse', cmd) test_cmd.wf = wf diff --git a/utils/rpms/daos.rpmlintrc b/utils/rpms/daos.rpmlintrc index b1553ca5141..9912465edf4 100644 --- a/utils/rpms/daos.rpmlintrc +++ b/utils/rpms/daos.rpmlintrc @@ -44,7 +44,7 @@ addFilter("E: static-library-without-debuginfo \/usr\/lib64\/lib(dfuse|ioil)\.a" # these need to be fixed: # https://daosio.atlassian.net/browse/DAOS-11539 -addFilter("W: no-soname \/usr\/lib64\/lib(ds3|daos_(common|cmd_hdlrs|self_test|tests|serialize|common_pmem)|dfs|dfuse|duns|ioil|pil4dfs|dpar(|_mpi)).so") +addFilter("W: no-soname \/usr\/lib64\/lib(ds3|daos_(common|cmd_hdlrs|self_test|tests|serialize|common_pmem)|dfs|dfuse|duns|ioil|pil4dfs|dpar(|_mpi)|dav_v2).so") # Tests rpm needs to be able to build daos from source so pulls in build deps and is expected. addFilter("daos-client-tests\.x86_64: E: devel-dependency protobuf-c-devel") diff --git a/utils/rpms/daos.spec b/utils/rpms/daos.spec index ea49dd2d8df..12ac6bd3d5c 100644 --- a/utils/rpms/daos.spec +++ b/utils/rpms/daos.spec @@ -16,7 +16,7 @@ Name: daos Version: 2.7.100 -Release: 9%{?relval}%{?dist} +Release: 10%{?relval}%{?dist} Summary: DAOS Storage Engine License: BSD-2-Clause-Patent @@ -457,6 +457,7 @@ getent passwd daos_agent >/dev/null || useradd -s /sbin/nologin -r -g daos_agent %{_libdir}/daos_srv/libplacement.so %{_libdir}/daos_srv/libpipeline.so %{_libdir}/libdaos_common_pmem.so +%{_libdir}/libdav_v2.so %config(noreplace) %{conf_dir}/vos_size_input.yaml %{_bindir}/daos_storage_estimator.py %{python3_sitearch}/storage_estimator/*.py @@ -592,6 +593,10 @@ getent passwd daos_agent >/dev/null || useradd -s /sbin/nologin -r -g daos_agent # No files in a shim package %changelog +* Fri Nov 1 2024 Sherin T George 2.7.100-10 +- The modified DAV allocator with memory bucket support for md_on_ssd + phase-2 is delivered as dav_v2.so. + * Tue Oct 15 2024 Brian J. 
Murrell - 2.7.100-9 - Drop BRs for UCX as they were obsoleted as of e01970d diff --git a/utils/trivy/trivy.yaml b/utils/trivy/trivy.yaml index cfb13b5c40f..c6d9974456d 100644 --- a/utils/trivy/trivy.yaml +++ b/utils/trivy/trivy.yaml @@ -1,3 +1,6 @@ +# SPDX-License-Identifier: BSD-2-Clause-Patent +# Copyright (c) 2024 Intel Corporation. + cache: backend: fs dir: @@ -16,7 +19,7 @@ db: no-progress: false repository: ghcr.io/aquasecurity/trivy-db skip-update: false -debug: false +debug: true dependency-tree: true exit-code: 0 generate-default-config: false diff --git a/utils/utest.yaml b/utils/utest.yaml index d9e66e2ad1f..faf0102050d 100644 --- a/utils/utest.yaml +++ b/utils/utest.yaml @@ -130,6 +130,11 @@ sudo: True required_src: ["src/vos/tests/bio_ut.c"] tests: + - cmd: ["bin/vos_tests", "-A", "50"] + env_vars: + DAOS_MD_ON_SSD_MODE: "3" + aio: "AIO_7" + size: 13 - cmd: ["bin/vos_tests", "-A", "50"] aio: "AIO_7" size: 13
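
The vos_obj.c hunk above introduces vos_pin_objects()/vos_unpin_objects() for pinning a set of objects in an md-on-ssd phase-2 (evictable) pool. The following is a minimal, hypothetical caller sketch, not part of the patch; it assumes the declarations added in this change and elides the caller's real work and error handling.

/* Hypothetical usage sketch of the new pin/unpin API; oids[] is prepared by the caller. */
static int
do_io_with_pinned_objects(daos_handle_t coh, daos_unit_oid_t oids[], int nr)
{
	struct vos_pin_handle	*pin_hdl = NULL;
	int			 rc;

	/* For non-evictable pools this is a no-op: rc == 0 and pin_hdl stays NULL */
	rc = vos_pin_objects(coh, oids, nr, &pin_hdl);
	if (rc != 0)
		return rc;

	/* ... perform the updates/fetches against the pinned objects here ... */

	/* Releases the acquired objects and the umem cache pin, then frees the handle */
	if (pin_hdl != NULL)
		vos_unpin_objects(coh, pin_hdl);

	return 0;
}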
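
vos_bkt_array_add() keeps vba_bkts sorted with daos_array_sort(), so the duplicate check and vos_bkt_array_subset() can binary-search through daos_array_find() with bkt_sort_ops. A short illustrative sketch of that invariant follows; the bucket ids are made up and assumed to differ from UMEM_DEFAULT_MBKT_ID.

/* Illustrative only: demonstrates the sorted/unique invariant of struct vos_bkt_array. */
static void
bkt_array_example(void)
{
	struct vos_bkt_array	bkts;
	int			rc;

	vos_bkt_array_init(&bkts);

	rc = vos_bkt_array_add(&bkts, 7);	/* vba_bkts: {7} */
	D_ASSERT(rc == 0);
	rc = vos_bkt_array_add(&bkts, 3);	/* re-sorted to {3, 7} */
	D_ASSERT(rc == 0);
	rc = vos_bkt_array_add(&bkts, 7);	/* duplicate found by daos_array_find(), no-op */
	D_ASSERT(rc == 0 && bkts.vba_cnt == 2);

	vos_bkt_array_fini(&bkts);
}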
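
vos_pool_create() now takes a separate meta_sz, and vos_pool_roundup_size() aligns scm_sz/meta_sz to the backend page size before pool creation. A hedged sketch of creating a phase-2 pool whose meta blob exceeds the in-memory VOS file; the sizes and the zero flags/version arguments are placeholders chosen for illustration, not values mandated by the patch.

/* Sketch only: when meta_sz > scm_sz, vos_pool_store_type() selects the BMEM V2
 * backend (if allowed); sizes below are arbitrary placeholders. */
static int
create_phase2_pool(const char *path, uuid_t uuid, daos_handle_t *poh)
{
	daos_size_t	scm_sz  = 1ULL << 30;	/* 1 GiB memory file (placeholder) */
	daos_size_t	meta_sz = 4ULL << 30;	/* 4 GiB meta blob (placeholder) */
	daos_size_t	data_sz = 16ULL << 30;	/* 16 GiB data blob (placeholder) */
	int		rc;

	rc = vos_pool_roundup_size(&scm_sz, &meta_sz);
	if (rc != 0)
		return rc;

	return vos_pool_create(path, uuid, scm_sz, data_sz, meta_sz,
			       0 /* flags */, 0 /* df version: default, assumed */, poh);
}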