diff --git a/.github/workflows/trivy.yml b/.github/workflows/trivy.yml index 8f5524d4513..89152cb1afa 100644 --- a/.github/workflows/trivy.yml +++ b/.github/workflows/trivy.yml @@ -1,7 +1,12 @@ +# SPDX-License-Identifier: BSD-2-Clause-Patent +# Copyright (c) 2024 Intel Corporation. + name: Trivy scan on: workflow_dispatch: + schedule: + - cron: '0 0 * * *' push: branches: ["master", "release/**"] pull_request: @@ -11,15 +16,17 @@ on: permissions: {} jobs: - build: - name: Build - runs-on: ubuntu-20.04 + scan: + name: Scan with Trivy + runs-on: ubuntu-latest + permissions: + security-events: write steps: - name: Checkout code uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1 - - name: Run Trivy vulnerability scanner in repo mode - uses: aquasecurity/trivy-action@6e7b7d1fd3e4fef0c5fa8cce1229c54b2c9bd0d8 # 0.24.0 + - name: Run Trivy vulnerability scanner in filesystem mode (table format) + uses: aquasecurity/trivy-action@915b19bbe73b92a6cf82a1bc12b087c9a19a5fe2 # 0.28.0 with: scan-type: 'fs' scan-ref: '.' @@ -43,8 +50,8 @@ jobs: utils/trivy/trivy.yaml sed -i 's/format: template/format: sarif/g' utils/trivy/trivy.yaml - - name: Run Trivy vulnerability scanner in repo mode - uses: aquasecurity/trivy-action@6e7b7d1fd3e4fef0c5fa8cce1229c54b2c9bd0d8 # 0.24.0 + - name: Run Trivy vulnerability scanner in filesystem mode (sarif format) + uses: aquasecurity/trivy-action@915b19bbe73b92a6cf82a1bc12b087c9a19a5fe2 # 0.28.0 with: scan-type: 'fs' scan-ref: '.' @@ -62,8 +69,8 @@ jobs: sed -i 's/format: sarif/format: table/g' utils/trivy/trivy.yaml sed -i 's/exit-code: 0/exit-code: 1/g' utils/trivy/trivy.yaml - - name: Run Trivy vulnerability scanner in repo mode - uses: aquasecurity/trivy-action@6e7b7d1fd3e4fef0c5fa8cce1229c54b2c9bd0d8 # 0.24.0 + - name: Run Trivy vulnerability scanner in filesystem mode (human readable format) + uses: aquasecurity/trivy-action@915b19bbe73b92a6cf82a1bc12b087c9a19a5fe2 # 0.28.0 with: scan-type: 'fs' scan-ref: '.' 
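For reviewers who want to reproduce the three workflow passes locally, a rough sketch follows. It assumes a locally installed Trivy CLI and that `utils/trivy/trivy.yaml` carries the same `format:` and `exit-code:` keys the workflow rewrites with sed; the exact invocation is an illustrative assumption, not part of this change.

```bash
# Hypothetical local equivalent of the workflow's three scan passes (illustrative only).
# Pass 1: scan the repository filesystem with the in-repo config as checked in.
trivy fs --config utils/trivy/trivy.yaml .

# Pass 2: switch the config to SARIF output, as the workflow does, and rescan.
sed -i 's/format: template/format: sarif/g' utils/trivy/trivy.yaml
trivy fs --config utils/trivy/trivy.yaml .

# Pass 3: back to human-readable table output, now failing the run on findings.
sed -i 's/format: sarif/format: table/g' utils/trivy/trivy.yaml
sed -i 's/exit-code: 0/exit-code: 1/g' utils/trivy/trivy.yaml
trivy fs --config utils/trivy/trivy.yaml .
```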
diff --git a/Jenkinsfile b/Jenkinsfile index 18451cd9c56..87416ffdf98 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -876,7 +876,7 @@ pipeline { } steps { job_step_update( - unitTest(timeout_time: 60, + unitTest(timeout_time: 180, unstash_opt: true, ignore_failure: true, inst_repos: prRepos(), @@ -1167,6 +1167,7 @@ pipeline { 'Functional Hardware Medium': getFunctionalTestStage( name: 'Functional Hardware Medium', pragma_suffix: '-hw-medium', + base_branch: 'master', label: params.FUNCTIONAL_HARDWARE_MEDIUM_LABEL, next_version: next_version, stage_tags: 'hw,medium,-provider', @@ -1179,6 +1180,7 @@ pipeline { 'Functional Hardware Medium MD on SSD': getFunctionalTestStage( name: 'Functional Hardware Medium MD on SSD', pragma_suffix: '-hw-medium-md-on-ssd', + base_branch: 'master', label: params.FUNCTIONAL_HARDWARE_MEDIUM_LABEL, next_version: next_version, stage_tags: 'hw,medium,-provider', @@ -1192,6 +1194,7 @@ pipeline { 'Functional Hardware Medium VMD': getFunctionalTestStage( name: 'Functional Hardware Medium VMD', pragma_suffix: '-hw-medium-vmd', + base_branch: 'master', label: params.FUNCTIONAL_HARDWARE_MEDIUM_VMD_LABEL, next_version: next_version, stage_tags: 'hw_vmd,medium', @@ -1205,6 +1208,7 @@ pipeline { 'Functional Hardware Medium Verbs Provider': getFunctionalTestStage( name: 'Functional Hardware Medium Verbs Provider', pragma_suffix: '-hw-medium-verbs-provider', + base_branch: 'master', label: params.FUNCTIONAL_HARDWARE_MEDIUM_VERBS_PROVIDER_LABEL, next_version: next_version, stage_tags: 'hw,medium,provider', @@ -1218,6 +1222,7 @@ pipeline { 'Functional Hardware Medium Verbs Provider MD on SSD': getFunctionalTestStage( name: 'Functional Hardware Medium Verbs Provider MD on SSD', pragma_suffix: '-hw-medium-verbs-provider-md-on-ssd', + base_branch: 'master', label: params.FUNCTIONAL_HARDWARE_MEDIUM_VERBS_PROVIDER_LABEL, next_version: next_version, stage_tags: 'hw,medium,provider', @@ -1232,6 +1237,7 @@ pipeline { 'Functional Hardware Medium UCX Provider': getFunctionalTestStage( name: 'Functional Hardware Medium UCX Provider', pragma_suffix: '-hw-medium-ucx-provider', + base_branch: 'master', label: params.FUNCTIONAL_HARDWARE_MEDIUM_UCX_PROVIDER_LABEL, next_version: next_version, stage_tags: 'hw,medium,provider', @@ -1245,6 +1251,7 @@ pipeline { 'Functional Hardware Large': getFunctionalTestStage( name: 'Functional Hardware Large', pragma_suffix: '-hw-large', + base_branch: 'master', label: params.FUNCTIONAL_HARDWARE_LARGE_LABEL, next_version: next_version, stage_tags: 'hw,large', @@ -1257,6 +1264,7 @@ pipeline { 'Functional Hardware Large MD on SSD': getFunctionalTestStage( name: 'Functional Hardware Large MD on SSD', pragma_suffix: '-hw-large-md-on-ssd', + base_branch: 'master', label: params.FUNCTIONAL_HARDWARE_LARGE_LABEL, next_version: next_version, stage_tags: 'hw,large', diff --git a/README.md b/README.md index 0bd1915919e..35fd647b185 100644 --- a/README.md +++ b/README.md @@ -5,6 +5,7 @@ [![Build](https://github.com/daos-stack/daos/actions/workflows/ci2.yml/badge.svg)](https://github.com/daos-stack/daos/actions/workflows/ci2.yml) [![Codespell](https://github.com/daos-stack/daos/actions/workflows/spelling.yml/badge.svg)](https://github.com/daos-stack/daos/actions/workflows/spelling.yml) [![Doxygen](https://github.com/daos-stack/daos/actions/workflows/doxygen.yml/badge.svg)](https://github.com/daos-stack/daos/actions/workflows/doxygen.yml) +[![Trivy 
scan](https://github.com/daos-stack/daos/actions/workflows/trivy.yml/badge.svg)](https://github.com/daos-stack/daos/actions/workflows/trivy.yml) diff --git a/debian/changelog b/debian/changelog index f65c13eeb9f..a77e2e130b9 100644 --- a/debian/changelog +++ b/debian/changelog @@ -1,3 +1,10 @@ +daos (2.7.100-10) unstable; urgency=medium + + [ Sherin T George ] + * Add DAV v2 lib + + -- Sherin T George Fri, 1 Nov 2024 11:54:00 +0530 + daos (2.7.100-9) unstable; urgency=medium [ Brian J. Murrell ] * Remove Build-Depends: for UCX as they were obsoleted as of e01970d @@ -130,6 +137,7 @@ daos (2.5.100-12) unstable; urgency=medium -- Tomasz Gromadzki Fri, 17 Nov 2023 12:52:00 -0400 +daos (2.5.100-11) unstable; urgency=medium [ Jerome Soumagne ] * Bump mercury min version to 2.3.1 diff --git a/debian/daos-server.install b/debian/daos-server.install index fb1e8af9a67..99d344327f4 100644 --- a/debian/daos-server.install +++ b/debian/daos-server.install @@ -28,6 +28,7 @@ usr/lib64/daos_srv/libbio.so usr/lib64/daos_srv/libplacement.so usr/lib64/daos_srv/libpipeline.so usr/lib64/libdaos_common_pmem.so +usr/lib64/libdav_v2.so usr/share/daos/control/setup_spdk.sh usr/lib/systemd/system/daos_server.service usr/lib/sysctl.d/10-daos_server.conf diff --git a/docs/admin/pool_operations.md b/docs/admin/pool_operations.md index 36907a2e31f..8c3db202c4b 100644 --- a/docs/admin/pool_operations.md +++ b/docs/admin/pool_operations.md @@ -26,6 +26,7 @@ Its subcommands can be grouped into the following areas: * An upgrade command to upgrade a pool's format version after a DAOS software upgrade. + ### Creating a Pool A DAOS pool can be created through the `dmg pool create` command. @@ -170,6 +171,195 @@ on pool size, but also on number of targets, target size, object class, storage redundancy factor, etc. +#### Creating a pool in MD-on-SSD mode + +In MD-on-SSD mode, a pool is made up of a single component in memory (RAM-disk +associated with each engine) and three components on storage (NVMe SSD). The +components in storage are related to "roles" WAL, META and DATA and roles are +assigned to hardware devices in the +[server configuration file](https://docs.daos.io/v2.6/admin/deployment/#server-configuration-file). + +In MD-on-SSD mode pools are by default created with equal allocations for +metadata-in-memory and metadata-on-SSD but it is possible to change this. To +create a pool with a metadata-on-SSD allocation size that is double what is +allocated in memory, set `dmg pool create --mem-ratio` option to `50%`. This +implies that the ratio of metadata on memory and on storage should be 0.5 and +therefore metadata-on-SSD allocation is twice that of metadata-in-memory. + +A MD-on-SSD pool created with a `--mem-ratio` between 0 and 100 percent is +said to be operating in "phase-2" mode. + +#### MD-on-SSD phase-2 pool create examples + +These examples cover the recommended way to create a pool in MD-on-SSD phase-2 +mode using the `--size` percentage option. + +The following example is run on a single host with dual engines where bdev +roles META and DATA are not shared. Two pools are created with VOS index file +size equal to half the meta-blob size (`--mem-ratio 50%`). Both pools use +roughly half the original capacity available (first using 50% and the second +100% of the remainder). + +Rough calculations: `dmg storage scan` shows that for each rank, one 800GB SSD +is assigned for each tier (first: WAL+META, second: DATA). `df -h /mnt/daos*` +reports usable ramdisk capacity for each rank is 66GiB. 
+- Expected Data storage would then be 400GB for a 50% capacity first pool and + 100% capacity second pool per-rank. +- Expected Meta storage at 50% mem-ratio would be `66GiB*2 = 132GiB == 141GB` + giving ~70GB for 50% first and 100% second pools. +- Expected Memory file size (aggregated) is `66GiB/2 = 35GB` for 50% first and + 100% second pools. + +```bash +$ dmg pool create bob --size 50% --mem-ratio 50% + +Pool created with 14.86%,85.14% storage tier ratio +-------------------------------------------------- + UUID : 47060d94-c689-4981-8c89-011beb063f8f + Service Leader : 0 + Service Ranks : [0-1] + Storage Ranks : [0-1] + Total Size : 940 GB + Metadata Storage : 140 GB (70 GB / rank) + Data Storage : 800 GB (400 GB / rank) + Memory File Size : 70 GB (35 GB / rank) + +$ dmg pool create bob2 --size 100% --mem-ratio 50% + +Pool created with 14.47%,85.53% storage tier ratio +-------------------------------------------------- + UUID : bdbef091-f0f8-411d-8995-f91c4efc690f + Service Leader : 1 + Service Ranks : [0-1] + Storage Ranks : [0-1] + Total Size : 935 GB + Metadata Storage : 135 GB (68 GB / rank) + Data Storage : 800 GB (400 GB / rank) + Memory File Size : 68 GB (34 GB / rank) + +$ dmg pool query bob + +Pool 47060d94-c689-4981-8c89-011beb063f8f, ntarget=32, disabled=0, leader=0, version=1, state=Ready +Pool health info: +- Rebuild idle, 0 objs, 0 recs +Pool space info: +- Target count:32 +- Total memory-file size: 70 GB +- Metadata storage: + Total size: 140 GB + Free: 131 GB, min:4.1 GB, max:4.1 GB, mean:4.1 GB +- Data storage: + Total size: 800 GB + Free: 799 GB, min:25 GB, max:25 GB, mean:25 GB + +$ dmg pool query bob2 + +Pool bdbef091-f0f8-411d-8995-f91c4efc690f, ntarget=32, disabled=0, leader=1, version=1, state=Ready +Pool health info: +- Rebuild idle, 0 objs, 0 recs +Pool space info: +- Target count:32 +- Total memory-file size: 68 GB +- Metadata storage: + Total size: 135 GB + Free: 127 GB, min:4.0 GB, max:4.0 GB, mean:4.0 GB +- Data storage: + Total size: 800 GB + Free: 799 GB, min:25 GB, max:25 GB, mean:25 GB +``` + +The following examples are with a single host with dual engines where bdev +roles WAL, META and DATA are shared. + +Single pool with VOS index file size equal to the meta-blob size (`--mem-ratio +100%`). + +```bash +$ dmg pool create bob --size 100% --mem-ratio 100% + +Pool created with 5.93%,94.07% storage tier ratio +------------------------------------------------- + UUID : bad54f1d-8976-428b-a5dd-243372dfa65c + Service Leader : 1 + Service Ranks : [0-1] + Storage Ranks : [0-1] + Total Size : 2.4 TB + Metadata Storage : 140 GB (70 GB / rank) + Data Storage : 2.2 TB (1.1 TB / rank) + Memory File Size : 140 GB (70 GB / rank) + +``` + +Rough calculations: 1.2TB of usable space is returned from storage scan and +because roles are shared required META (70GB) is reserved so only 1.1TB is +provided for data. 
+ +Logging shows: +```bash +DEBUG 2024/09/24 15:44:38.554431 pool.go:1139: added smd device c7da7391-9077-4eb6-9f4a-a3d656166236 (rank 1, ctrlr 0000:d8:00.0, roles "data,meta,wal") as usable: device state="NORMAL", smd-size 623 GB (623307128832), ctrlr-total-free 623 GB (623307128832) +DEBUG 2024/09/24 15:44:38.554516 pool.go:1139: added smd device 18c7bf45-7586-49ba-93c0-cbc08caed901 (rank 1, ctrlr 0000:d9:00.0, roles "data,meta,wal") as usable: device state="NORMAL", smd-size 554 GB (554050781184), ctrlr-total-free 1.2 TB (1177357910016) +DEBUG 2024/09/24 15:44:38.554603 pool.go:1246: based on minimum available ramdisk capacity of 70 GB and mem-ratio 1.00 with 70 GB of reserved metadata capacity, the maximum per-rank sizes for a pool are META=70 GB (69792169984 B) DATA=1.1 TB (1107565740032 B) +``` + +Now the same as above but with a single pool with VOS index file size equal to +a quarter of the meta-blob size (`--mem-ratio 25%`). + +```bash +$ dmg pool create bob --size 100% --mem-ratio 25% + +Pool created with 23.71%,76.29% storage tier ratio +-------------------------------------------------- + UUID : 999ecf55-474e-4476-9f90-0b4c754d4619 + Service Leader : 0 + Service Ranks : [0-1] + Storage Ranks : [0-1] + Total Size : 2.4 TB + Metadata Storage : 558 GB (279 GB / rank) + Data Storage : 1.8 TB (898 GB / rank) + Memory File Size : 140 GB (70 GB / rank) + +``` + +Rough calculations: 1.2TB of usable space is returned from storage scan and +because roles are shared required META (279GB) is reserved so only ~900GB is +provided for data. + +Logging shows: +```bash +DEBUG 2024/09/24 16:16:00.172719 pool.go:1246: based on minimum available ramdisk capacity of 70 GB and mem-ratio 0.25 with 279 GB of reserved metadata capacity, the maximum per-rank sizes for a pool are META=279 GB (279168679936 B) DATA=898 GB (898189230080 B) +``` + +Now with 6 ranks and a single pool with VOS index file size equal to a half of +the meta-blob size (`--mem-ratio 50%`). + +```bash +$ dmg pool create bob --size 100% --mem-ratio 50% + +Pool created with 11.86%,88.14% storage tier ratio +-------------------------------------------------- + UUID : 4fa38199-23a9-4b4d-aa9a-8b9838cad1d6 + Service Leader : 1 + Service Ranks : [0-2,4-5] + Storage Ranks : [0-5] + Total Size : 7.1 TB + Metadata Storage : 838 GB (140 GB / rank) + Data Storage : 6.2 TB (1.0 TB / rank) + Memory File Size : 419 GB (70 GB / rank) + +``` + +Rough calculations: 1177 GB of usable space is returned from storage scan and +because roles are shared required META (140 GB) is reserved so only 1037 GB is +provided for data (per-rank). 
+ +Logging shows: +```bash +DEBUG 2024/09/24 16:40:41.570331 pool.go:1139: added smd device c921c7b9-5f5c-4332-a878-0ebb8191c160 (rank 1, ctrlr 0000:d8:00.0, roles "data,meta,wal") as usable: device state="NORMAL", smd-size 623 GB (623307128832), ctrlr-total-free 623 GB (623307128832) +DEBUG 2024/09/24 16:40:41.570447 pool.go:1139: added smd device a071c3cf-5de1-4911-8549-8c5e8f550554 (rank 1, ctrlr 0000:d9:00.0, roles "data,meta,wal") as usable: device state="NORMAL", smd-size 554 GB (554050781184), ctrlr-total-free 1.2 TB (1177357910016) +DEBUG 2024/09/24 16:40:41.570549 pool.go:1246: based on minimum available ramdisk capacity of 70 GB and mem-ratio 0.50 with 140 GB of reserved metadata capacity, the maximum per-rank sizes for a pool are META=140 GB (139584339968 B) DATA=1.0 TB (1037773570048 B) +``` + + ### Listing Pools To see a list of the pools in the DAOS system: diff --git a/docs/user/filesystem.md b/docs/user/filesystem.md index 048dcfd04f7..f1a3398df87 100644 --- a/docs/user/filesystem.md +++ b/docs/user/filesystem.md @@ -228,7 +228,6 @@ Additionally, there are several optional command-line options: | --container= | container label or uuid to open | | --sys-name= | DAOS system name | | --foreground | run in foreground | -| --singlethreaded | run single threaded | | --thread-count= | Number of threads to use | | --multi-user | Run in multi user mode | | --read-only | Mount in read-only mode | diff --git a/src/bio/bio_context.c b/src/bio/bio_context.c index 297694c6e6a..c450a25f0af 100644 --- a/src/bio/bio_context.c +++ b/src/bio/bio_context.c @@ -1,5 +1,5 @@ /** - * (C) Copyright 2018-2023 Intel Corporation. + * (C) Copyright 2018-2024 Intel Corporation. * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -457,7 +457,8 @@ int bio_mc_destroy(struct bio_xs_context *xs_ctxt, uuid_t pool_id, enum bio_mc_f static int bio_blob_create(uuid_t uuid, struct bio_xs_context *xs_ctxt, uint64_t blob_sz, - enum smd_dev_type st, enum bio_mc_flags flags, spdk_blob_id *blob_id) + enum smd_dev_type st, enum bio_mc_flags flags, spdk_blob_id *blob_id, + uint64_t scm_sz) { struct blob_msg_arg bma = { 0 }; struct blob_cp_arg *ba = &bma.bma_cp_arg; @@ -541,9 +542,10 @@ bio_blob_create(uuid_t uuid, struct bio_xs_context *xs_ctxt, uint64_t blob_sz, blob_sz); else rc = smd_pool_add_tgt(uuid, xs_ctxt->bxc_tgt_id, ba->bca_id, st, - blob_sz); + blob_sz, scm_sz); } else { - rc = smd_pool_add_tgt(uuid, xs_ctxt->bxc_tgt_id, ba->bca_id, st, blob_sz); + rc = smd_pool_add_tgt(uuid, xs_ctxt->bxc_tgt_id, ba->bca_id, st, blob_sz, + 0); } if (rc != 0) { @@ -611,14 +613,14 @@ __bio_ioctxt_open(struct bio_io_context **pctxt, struct bio_xs_context *xs_ctxt, /* * Calculate a reasonable WAL size based on following assumptions: * - Single target update IOPS can be up to 65k; - * - Each TX consumes 2 WAL blocks in average; + * - Each TX consumes 2 WAL blocks on average; * - Checkpointing interval is 5 seconds, and the WAL should have at least * half free space before next checkpoint; */ uint64_t default_wal_sz(uint64_t meta_sz) { - uint64_t wal_sz = (6ULL << 30); /* 6GB */ + uint64_t wal_sz = (6ULL << 30); /* 6GiB */ /* The WAL size could be larger than meta size for tiny pool */ if ((meta_sz * 2) <= wal_sz) @@ -627,8 +629,8 @@ default_wal_sz(uint64_t meta_sz) return wal_sz; } -int bio_mc_create(struct bio_xs_context *xs_ctxt, uuid_t pool_id, uint64_t meta_sz, - uint64_t wal_sz, uint64_t data_sz, enum bio_mc_flags flags) +int bio_mc_create(struct bio_xs_context *xs_ctxt, uuid_t pool_id, uint64_t scm_sz, uint64_t meta_sz, + 
uint64_t wal_sz, uint64_t data_sz, enum bio_mc_flags flags, uint8_t backend_type) { int rc = 0, rc1; spdk_blob_id data_blobid = SPDK_BLOBID_INVALID; @@ -637,12 +639,13 @@ int bio_mc_create(struct bio_xs_context *xs_ctxt, uuid_t pool_id, uint64_t meta_ struct bio_meta_context *mc = NULL; struct meta_fmt_info *fi = NULL; struct bio_xs_blobstore *bxb; + uint32_t meta_flags = 0; D_ASSERT(xs_ctxt != NULL); if (data_sz > 0 && bio_nvme_configured(SMD_DEV_TYPE_DATA)) { D_ASSERT(!(flags & BIO_MC_FL_RDB)); rc = bio_blob_create(pool_id, xs_ctxt, data_sz, SMD_DEV_TYPE_DATA, flags, - &data_blobid); + &data_blobid, 0); if (rc) return rc; } @@ -656,9 +659,28 @@ int bio_mc_create(struct bio_xs_context *xs_ctxt, uuid_t pool_id, uint64_t meta_ meta_sz, default_cluster_sz()); rc = -DER_INVAL; goto delete_data; + } else if (meta_sz < scm_sz) { + D_ERROR("Meta blob size("DF_U64") is less than scm size("DF_U64")\n", + meta_sz, scm_sz); + rc = -DER_INVAL; + goto delete_data; + } else if (scm_sz == meta_sz) { + scm_sz = 0; + } + + /* scm_sz < meta_sz case */ + if (scm_sz != 0) { + if (flags & BIO_MC_FL_RDB) { + D_ERROR("RDB doesn't allow scm_sz("DF_U64") != meta_sz("DF_U64")\n", + scm_sz, meta_sz); + rc = -DER_INVAL; + goto delete_data; + } + meta_flags |= META_HDR_FL_EVICTABLE; } - rc = bio_blob_create(pool_id, xs_ctxt, meta_sz, SMD_DEV_TYPE_META, flags, &meta_blobid); + rc = bio_blob_create(pool_id, xs_ctxt, meta_sz, SMD_DEV_TYPE_META, flags, &meta_blobid, + scm_sz); if (rc) goto delete_data; @@ -671,7 +693,7 @@ int bio_mc_create(struct bio_xs_context *xs_ctxt, uuid_t pool_id, uint64_t meta_ if (wal_sz == 0 || wal_sz < default_cluster_sz()) wal_sz = default_wal_sz(meta_sz); - rc = bio_blob_create(pool_id, xs_ctxt, wal_sz, SMD_DEV_TYPE_WAL, flags, &wal_blobid); + rc = bio_blob_create(pool_id, xs_ctxt, wal_sz, SMD_DEV_TYPE_WAL, flags, &wal_blobid, 0); if (rc) goto delete_meta; @@ -717,8 +739,9 @@ int bio_mc_create(struct bio_xs_context *xs_ctxt, uuid_t pool_id, uint64_t meta_ fi->fi_wal_size = wal_sz; fi->fi_data_size = data_sz; fi->fi_vos_id = xs_ctxt->bxc_tgt_id; + fi->fi_backend_type = backend_type; - rc = meta_format(mc, fi, true); + rc = meta_format(mc, fi, meta_flags, true); if (rc) D_ERROR("Unable to format newly created blob for xs:%p pool:"DF_UUID"\n", xs_ctxt, DP_UUID(pool_id)); diff --git a/src/bio/bio_wal.c b/src/bio/bio_wal.c index 6c99a203966..1caa538eb5e 100644 --- a/src/bio/bio_wal.c +++ b/src/bio/bio_wal.c @@ -1861,13 +1861,15 @@ bio_wal_checkpoint(struct bio_meta_context *mc, uint64_t tx_id, uint64_t *purged void bio_meta_get_attr(struct bio_meta_context *mc, uint64_t *capacity, uint32_t *blk_sz, - uint32_t *hdr_blks) + uint32_t *hdr_blks, uint8_t *backend_type, bool *evictable) { /* The mc could be NULL when md on SSD not enabled & data blob not existing */ if (mc != NULL) { *blk_sz = mc->mc_meta_hdr.mh_blk_bytes; *capacity = mc->mc_meta_hdr.mh_tot_blks * (*blk_sz); *hdr_blks = mc->mc_meta_hdr.mh_hdr_blks; + *backend_type = mc->mc_meta_hdr.mh_backend_type; + *evictable = mc->mc_meta_hdr.mh_flags & META_HDR_FL_EVICTABLE; } } @@ -2022,7 +2024,7 @@ get_wal_gen(uuid_t pool_id, uint32_t tgt_id) } int -meta_format(struct bio_meta_context *mc, struct meta_fmt_info *fi, bool force) +meta_format(struct bio_meta_context *mc, struct meta_fmt_info *fi, uint32_t flags, bool force) { struct meta_header *meta_hdr = &mc->mc_meta_hdr; struct wal_super_info *si = &mc->mc_wal_info; @@ -2068,7 +2070,8 @@ meta_format(struct bio_meta_context *mc, struct meta_fmt_info *fi, bool force) meta_hdr->mh_hdr_blks = 
META_HDR_BLKS; meta_hdr->mh_tot_blks = (fi->fi_meta_size / META_BLK_SZ) - META_HDR_BLKS; meta_hdr->mh_vos_id = fi->fi_vos_id; - meta_hdr->mh_flags = META_HDR_FL_EMPTY; + meta_hdr->mh_flags = (flags | META_HDR_FL_EMPTY); + meta_hdr->mh_backend_type = fi->fi_backend_type; rc = write_header(mc, mc->mc_meta, meta_hdr, sizeof(*meta_hdr), &meta_hdr->mh_csum); if (rc) { diff --git a/src/bio/bio_wal.h b/src/bio/bio_wal.h index 6eb187c61e6..1f15a7d94ef 100644 --- a/src/bio/bio_wal.h +++ b/src/bio/bio_wal.h @@ -11,6 +11,7 @@ enum meta_hdr_flags { META_HDR_FL_EMPTY = (1UL << 0), + META_HDR_FL_EVICTABLE = (1UL << 1), }; /* Meta blob header */ @@ -28,7 +29,10 @@ struct meta_header { uint64_t mh_tot_blks; /* Meta blob capacity, in blocks */ uint32_t mh_vos_id; /* Associated per-engine target ID */ uint32_t mh_flags; /* Meta header flags */ - uint32_t mh_padding[5]; /* Reserved */ + uint8_t mh_backend_type; /* Backend allocator type */ + uint8_t mh_padding1; /* Reserved */ + uint16_t mh_padding2; /* Reserved */ + uint32_t mh_padding[4]; /* Reserved */ uint32_t mh_csum; /* Checksum of this header */ }; @@ -124,9 +128,10 @@ struct meta_fmt_info { uint64_t fi_wal_size; /* WAL blob size in bytes */ uint64_t fi_data_size; /* Data blob size in bytes */ uint32_t fi_vos_id; /* Associated per-engine target ID */ + uint8_t fi_backend_type; /* Backend allocator type */ }; -int meta_format(struct bio_meta_context *mc, struct meta_fmt_info *fi, bool force); +int meta_format(struct bio_meta_context *mc, struct meta_fmt_info *fi, uint32_t flags, bool force); int meta_open(struct bio_meta_context *mc); void meta_close(struct bio_meta_context *mc); int wal_open(struct bio_meta_context *mc); diff --git a/src/bio/smd/smd_internal.h b/src/bio/smd/smd_internal.h index 0b641cddb61..4195581e40a 100644 --- a/src/bio/smd/smd_internal.h +++ b/src/bio/smd/smd_internal.h @@ -27,6 +27,8 @@ extern char TABLE_TGTS[SMD_DEV_TYPE_MAX][SMD_DEV_NAME_MAX]; extern char TABLE_POOLS[SMD_DEV_TYPE_MAX][SMD_DEV_NAME_MAX]; +extern char TABLE_POOLS_EX[SMD_DEV_TYPE_MAX][SMD_DEV_NAME_MAX]; + #define SMD_MAX_TGT_CNT 64 /** callback parameter for smd_db_traverse */ diff --git a/src/bio/smd/smd_pool.c b/src/bio/smd/smd_pool.c index c9d9572c556..84c25a9863f 100644 --- a/src/bio/smd/smd_pool.c +++ b/src/bio/smd/smd_pool.c @@ -1,5 +1,5 @@ /** - * (C) Copyright 2018-2023 Intel Corporation. + * (C) Copyright 2018-2024 Intel Corporation. 
* * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -35,6 +35,17 @@ struct smd_pool { uint64_t sp_blobs[SMD_MAX_TGT_CNT]; }; +char TABLE_POOLS_EX[SMD_DEV_TYPE_MAX][SMD_DEV_NAME_MAX] = { + "ex_data_pool", + "ex_meta_pool", + "ex_wal_pool", +}; + +struct smd_pool_meta { + uint64_t spm_scm_sz; + uint64_t spm_reserved[3]; +}; + static int smd_pool_find_tgt(struct smd_pool *pool, int tgt_id) { @@ -56,7 +67,6 @@ pool_add_tgt(uuid_t pool_id, uint32_t tgt_id, uint64_t blob_id, char *table_name uuid_copy(id.uuid, pool_id); - smd_db_lock(); /* Fetch pool if it's already existing */ rc = smd_db_fetch(table_name, &id, sizeof(id), &pool, sizeof(pool)); if (rc == 0) { @@ -65,23 +75,20 @@ pool_add_tgt(uuid_t pool_id, uint32_t tgt_id, uint64_t blob_id, char *table_name ""DF_U64" != "DF_U64"\n", DP_UUID(&id.uuid), pool.sp_blob_sz, blob_sz); - rc = -DER_INVAL; - goto out; + return -DER_INVAL; } if (pool.sp_tgt_cnt >= SMD_MAX_TGT_CNT) { D_ERROR("Pool "DF_UUID" is assigned to too many " "targets (%d)\n", DP_UUID(&id.uuid), pool.sp_tgt_cnt); - rc = -DER_OVERFLOW; - goto out; + return -DER_OVERFLOW; } rc = smd_pool_find_tgt(&pool, tgt_id); if (rc >= 0) { D_ERROR("Dup target %d, idx: %d\n", tgt_id, rc); - rc = -DER_EXIST; - goto out; + return -DER_EXIST; } pool.sp_tgts[pool.sp_tgt_cnt] = tgt_id; @@ -102,32 +109,69 @@ pool_add_tgt(uuid_t pool_id, uint32_t tgt_id, uint64_t blob_id, char *table_name } else { D_ERROR("Fetch pool "DF_UUID" failed. "DF_RC"\n", DP_UUID(&id.uuid), DP_RC(rc)); - goto out; + return rc; } rc = smd_db_upsert(table_name, &id, sizeof(id), &pool, sizeof(pool)); - if (rc) { + if (rc) D_ERROR("Update pool "DF_UUID" failed. "DF_RC"\n", DP_UUID(&id.uuid), DP_RC(rc)); - goto out; - } -out: - smd_db_unlock(); + return rc; } int smd_pool_add_tgt(uuid_t pool_id, uint32_t tgt_id, uint64_t blob_id, - enum smd_dev_type st, uint64_t blob_sz) + enum smd_dev_type st, uint64_t blob_sz, uint64_t scm_sz) { - return pool_add_tgt(pool_id, tgt_id, blob_id, TABLE_POOLS[st], blob_sz); + struct smd_pool_meta meta = { 0 }; + struct d_uuid id; + int rc; + + smd_db_lock(); + + rc = pool_add_tgt(pool_id, tgt_id, blob_id, TABLE_POOLS[st], blob_sz); + if (rc || scm_sz == 0) { + smd_db_unlock(); + return rc; + } + + D_ASSERTF(scm_sz < blob_sz, "scm_sz("DF_U64") >= blob_sz("DF_U64")\n", scm_sz, blob_sz); + D_ASSERT(st == SMD_DEV_TYPE_META); + + uuid_copy(id.uuid, pool_id); + /* Fetch pool_meta_ex to see if it's already existing */ + rc = smd_db_fetch(TABLE_POOLS_EX[st], &id, sizeof(id), &meta, sizeof(meta)); + if (rc == 0) { + if (meta.spm_scm_sz != scm_sz) { + D_ERROR("Pool "DF_UUID" meta size mismatch. 
"DF_U64" != "DF_U64"\n", + DP_UUID(&id.uuid), meta.spm_scm_sz, scm_sz); + rc = -DER_INVAL; + } + } else if (rc == -DER_NONEXIST) { + meta.spm_scm_sz = scm_sz; + rc = smd_db_upsert(TABLE_POOLS_EX[st], &id, sizeof(id), &meta, sizeof(meta)); + if (rc) + DL_ERROR(rc, "Update pool_meta "DF_UUID" failed.", DP_UUID(&id.uuid)); + } else { + DL_ERROR(rc, "Fetch pool_meta "DF_UUID" failed.", DP_UUID(&id.uuid)); + } + + smd_db_unlock(); + return rc; } int smd_rdb_add_tgt(uuid_t pool_id, uint32_t tgt_id, uint64_t blob_id, enum smd_dev_type st, uint64_t blob_sz) { - return pool_add_tgt(pool_id, tgt_id, blob_id, TABLE_RDBS[st], blob_sz); + int rc; + + smd_db_lock(); + rc = pool_add_tgt(pool_id, tgt_id, blob_id, TABLE_RDBS[st], blob_sz); + smd_db_unlock(); + + return rc; } static int @@ -140,20 +184,18 @@ pool_del_tgt(uuid_t pool_id, uint32_t tgt_id, char *table_name) uuid_copy(id.uuid, pool_id); - smd_db_lock(); rc = smd_db_fetch(table_name, &id, sizeof(id), &pool, sizeof(pool)); if (rc) { D_ERROR("Fetch pool "DF_UUID" failed. "DF_RC"\n", DP_UUID(id.uuid), DP_RC(rc)); - goto out; + return rc; } rc = smd_pool_find_tgt(&pool, tgt_id); if (rc < 0) { D_ERROR("Pool "DF_UUID" target %d not found.\n", DP_UUID(id.uuid), tgt_id); - rc = -DER_NONEXIST; - goto out; + return -DER_NONEXIST; } for (i = rc; i < pool.sp_tgt_cnt - 1; i++) { @@ -168,35 +210,69 @@ pool_del_tgt(uuid_t pool_id, uint32_t tgt_id, char *table_name) if (rc) { D_ERROR("Update pool "DF_UUID" failed: "DF_RC"\n", DP_UUID(&id.uuid), DP_RC(rc)); - goto out; + return rc; } } else { rc = smd_db_delete(table_name, &id, sizeof(id)); if (rc) { D_ERROR("Delete pool "DF_UUID" failed: "DF_RC"\n", DP_UUID(&id.uuid), DP_RC(rc)); - goto out; + return rc; } + rc = 1; /* Inform caller that last target is deleted */ } -out: - smd_db_unlock(); + return rc; } int smd_pool_del_tgt(uuid_t pool_id, uint32_t tgt_id, enum smd_dev_type st) { - return pool_del_tgt(pool_id, tgt_id, TABLE_POOLS[st]); + struct smd_pool_meta meta = { 0 }; + struct d_uuid id; + int rc; + + smd_db_lock(); + rc = pool_del_tgt(pool_id, tgt_id, TABLE_POOLS[st]); + if (rc <= 0) + goto out; + + rc = 0; + if (st == SMD_DEV_TYPE_META) { + uuid_copy(id.uuid, pool_id); + + rc = smd_db_fetch(TABLE_POOLS_EX[st], &id, sizeof(id), &meta, sizeof(meta)); + if (rc == -DER_NONEXIST) { + rc = 0; + goto out; + } else if (rc) { + DL_ERROR(rc, "Fetch pool_meta "DF_UUID" failed.", DP_UUID(&id.uuid)); + goto out; + } + + rc = smd_db_delete(TABLE_POOLS_EX[st], &id, sizeof(id)); + if (rc) + DL_ERROR(rc, "Delete pool_meta "DF_UUID" failed.", DP_UUID(&id.uuid)); + } +out: + smd_db_unlock(); + return rc; } int smd_rdb_del_tgt(uuid_t pool_id, uint32_t tgt_id, enum smd_dev_type st) { - return pool_del_tgt(pool_id, tgt_id, TABLE_RDBS[st]); + int rc; + + smd_db_lock(); + rc = pool_del_tgt(pool_id, tgt_id, TABLE_RDBS[st]); + smd_db_unlock(); + + return rc < 0 ? 
rc : 0; } static struct smd_pool_info * -smd_pool_alloc_info(struct d_uuid *id, struct smd_pool *pools) +smd_pool_alloc_info(struct d_uuid *id, struct smd_pool *pools, uint64_t scm_sz) { struct smd_pool_info *info; enum smd_dev_type st; @@ -206,6 +282,7 @@ smd_pool_alloc_info(struct d_uuid *id, struct smd_pool *pools) if (info == NULL) return NULL; + info->spi_scm_sz = scm_sz; for (st = SMD_DEV_TYPE_DATA; st < SMD_DEV_TYPE_MAX; st++) { D_ALLOC_ARRAY(info->spi_tgts[st], SMD_MAX_TGT_CNT); if (info->spi_tgts[st] == NULL) { @@ -237,6 +314,7 @@ smd_pool_get_info(uuid_t pool_id, struct smd_pool_info **pool_info) { struct smd_pool_info *info; struct smd_pool pools[SMD_DEV_TYPE_MAX]; + struct smd_pool_meta meta = { 0 }; enum smd_dev_type st; struct d_uuid id; int rc; @@ -255,7 +333,16 @@ smd_pool_get_info(uuid_t pool_id, struct smd_pool_info **pool_info) } } - info = smd_pool_alloc_info(&id, pools); + rc = smd_db_fetch(TABLE_POOLS_EX[SMD_DEV_TYPE_META], &id, sizeof(id), &meta, sizeof(meta)); + if (rc == -DER_NONEXIST) { + meta.spm_scm_sz = pools[SMD_DEV_TYPE_META].sp_blob_sz; + rc = 0; + } else if (rc) { + DL_ERROR(rc, "Fetch pool_meta "DF_UUID" failed.", DP_UUID(&id.uuid)); + goto out; + } + + info = smd_pool_alloc_info(&id, pools, meta.spm_scm_sz); if (info == NULL) { rc = -DER_NOMEM; goto out; @@ -338,6 +425,7 @@ smd_pool_list_cb(struct sys_db *db, char *table, d_iov_t *key, void *args) struct smd_trav_data *td = args; struct smd_pool_info *info; struct smd_pool pools[SMD_DEV_TYPE_MAX]; + struct smd_pool_meta meta = { 0 }; enum smd_dev_type st; struct d_uuid id; int rc; @@ -363,7 +451,16 @@ smd_pool_list_cb(struct sys_db *db, char *table, d_iov_t *key, void *args) return rc; } - info = smd_pool_alloc_info(&id, pools); + rc = smd_db_fetch(TABLE_POOLS_EX[SMD_DEV_TYPE_META], &id, sizeof(id), &meta, sizeof(meta)); + if (rc == -DER_NONEXIST) { + meta.spm_scm_sz = pools[SMD_DEV_TYPE_META].sp_blob_sz; + rc = 0; + } else if (rc) { + DL_ERROR(rc, "Fetch pool_meta "DF_UUID" failed.", DP_UUID(&id.uuid)); + return rc; + } + + info = smd_pool_alloc_info(&id, pools, meta.spm_scm_sz); if (!info) return -DER_NOMEM; diff --git a/src/bio/smd/tests/smd_ut.c b/src/bio/smd/tests/smd_ut.c index 129db9acf0d..bb2fcb6107a 100644 --- a/src/bio/smd/tests/smd_ut.c +++ b/src/bio/smd/tests/smd_ut.c @@ -21,7 +21,7 @@ #include #define SMD_STORAGE_PATH "/mnt/daos" -#define DB_LIST_NR (SMD_DEV_TYPE_MAX * 2 + 1) +#define DB_LIST_NR (SMD_DEV_TYPE_MAX * 2 + 2) struct ut_db { struct sys_db ud_db; @@ -46,11 +46,14 @@ db_name2list(struct sys_db *db, char *name) if (!strcmp(name, TABLE_DEV)) return &ud->ud_lists[0]; + if (!strcmp(name, TABLE_POOLS_EX[SMD_DEV_TYPE_META])) + return &ud->ud_lists[1]; + for (st = SMD_DEV_TYPE_DATA; st < SMD_DEV_TYPE_MAX; st++) { if (!strcmp(name, TABLE_TGTS[st])) - return &ud->ud_lists[st + 1]; + return &ud->ud_lists[st + 2]; if (!strcmp(name, TABLE_POOLS[st])) - return &ud->ud_lists[st + SMD_DEV_TYPE_MAX + 1]; + return &ud->ud_lists[st + SMD_DEV_TYPE_MAX + 2]; } D_ASSERT(0); return NULL; @@ -325,12 +328,13 @@ ut_device(void **state) } static void -verify_pool(struct smd_pool_info *pool_info, uuid_t id, int shift) +verify_pool(struct smd_pool_info *pool_info, uuid_t id, int shift, uint64_t scm_sz) { enum smd_dev_type st; int i, j; assert_int_equal(uuid_compare(pool_info->spi_id, id), 0); + assert_int_equal(pool_info->spi_scm_sz, scm_sz); assert_int_equal(pool_info->spi_tgt_cnt[SMD_DEV_TYPE_DATA], 4); assert_int_equal(pool_info->spi_tgt_cnt[SMD_DEV_TYPE_META], 1); 
assert_int_equal(pool_info->spi_tgt_cnt[SMD_DEV_TYPE_WAL], 1); @@ -359,35 +363,42 @@ ut_pool(void **state) for (i = 0; i < 6; i++) { st = (i < 4) ? SMD_DEV_TYPE_DATA : SMD_DEV_TYPE_DATA + i - 3; - rc = smd_pool_add_tgt(id1, i, i << 10, st, 100); + rc = smd_pool_add_tgt(id1, i, i << 10, st, 100, 0); assert_rc_equal(rc, 0); - rc = smd_pool_add_tgt(id2, i, i << 20, st, 200); + if (st == SMD_DEV_TYPE_META) + rc = smd_pool_add_tgt(id2, i, i << 20, st, 200, 50); + else + rc = smd_pool_add_tgt(id2, i, i << 20, st, 200, 0); assert_rc_equal(rc, 0); } - rc = smd_pool_add_tgt(id1, 0, 5000, SMD_DEV_TYPE_DATA, 100); + rc = smd_pool_add_tgt(id1, 0, 5000, SMD_DEV_TYPE_DATA, 100, 0); assert_rc_equal(rc, -DER_EXIST); - rc = smd_pool_add_tgt(id1, 4, 4 << 10, SMD_DEV_TYPE_DATA, 200); + rc = smd_pool_add_tgt(id1, 4, 4 << 10, SMD_DEV_TYPE_DATA, 200, 0); assert_rc_equal(rc, -DER_INVAL); - rc = smd_pool_add_tgt(id1, 4, 5000, SMD_DEV_TYPE_META, 100); + rc = smd_pool_add_tgt(id1, 4, 5000, SMD_DEV_TYPE_META, 100, 0); assert_rc_equal(rc, -DER_EXIST); - rc = smd_pool_add_tgt(id1, 0, 4 << 10, SMD_DEV_TYPE_META, 200); + rc = smd_pool_add_tgt(id1, 0, 4 << 10, SMD_DEV_TYPE_META, 200, 0); assert_rc_equal(rc, -DER_INVAL); - rc = smd_pool_add_tgt(id1, 5, 5000, SMD_DEV_TYPE_WAL, 100); + rc = smd_pool_add_tgt(id1, 5, 5000, SMD_DEV_TYPE_WAL, 100, 0); assert_rc_equal(rc, -DER_EXIST); - rc = smd_pool_add_tgt(id1, 0, 4 << 10, SMD_DEV_TYPE_WAL, 200); + rc = smd_pool_add_tgt(id1, 0, 4 << 10, SMD_DEV_TYPE_WAL, 200, 0); assert_rc_equal(rc, -DER_INVAL); rc = smd_pool_get_info(id1, &pool_info); assert_rc_equal(rc, 0); - verify_pool(pool_info, id1, 10); + verify_pool(pool_info, id1, 10, 100); + smd_pool_free_info(pool_info); + rc = smd_pool_get_info(id2, &pool_info); + assert_rc_equal(rc, 0); + verify_pool(pool_info, id2, 20, 50); smd_pool_free_info(pool_info); rc = smd_pool_get_info(id3, &pool_info); @@ -416,9 +427,9 @@ ut_pool(void **state) d_list_for_each_entry_safe(pool_info, tmp, &pool_list, spi_link) { if (uuid_compare(pool_info->spi_id, id1) == 0) - verify_pool(pool_info, id1, 10); + verify_pool(pool_info, id1, 10, 100); else if (uuid_compare(pool_info->spi_id, id2) == 0) - verify_pool(pool_info, id2, 20); + verify_pool(pool_info, id2, 20, 50); else assert_true(false); diff --git a/src/client/dfuse/dfuse.h b/src/client/dfuse/dfuse.h index e3b3c0d7d0e..9d162810db6 100644 --- a/src/client/dfuse/dfuse.h +++ b/src/client/dfuse/dfuse.h @@ -29,7 +29,6 @@ struct dfuse_info { char *di_mountpoint; int32_t di_thread_count; uint32_t di_eq_count; - bool di_threaded; bool di_foreground; bool di_caching; bool di_multi_user; diff --git a/src/client/dfuse/dfuse_core.c b/src/client/dfuse/dfuse_core.c index 4f654fa3209..6397b283e97 100644 --- a/src/client/dfuse/dfuse_core.c +++ b/src/client/dfuse/dfuse_core.c @@ -53,7 +53,7 @@ dfuse_progress_thread(void *arg) return NULL; } - rc = daos_eq_poll(eqt->de_eq, 1, DAOS_EQ_WAIT, 128, &dev[0]); + rc = daos_eq_poll(eqt->de_eq, 1, DAOS_EQ_NOWAIT, 128, &dev[0]); if (rc >= 1) { for (i = 0; i < rc; i++) { struct dfuse_event *ev; diff --git a/src/client/dfuse/dfuse_main.c b/src/client/dfuse/dfuse_main.c index d75656121a5..02db62cc4e9 100644 --- a/src/client/dfuse/dfuse_main.c +++ b/src/client/dfuse/dfuse_main.c @@ -166,6 +166,7 @@ dfuse_bg(struct dfuse_info *dfuse_info) * * Should be called from the post_start plugin callback and creates * a filesystem. + * Returns a DAOS error code. * Returns true on success, false on failure. 
*/ int @@ -204,18 +205,17 @@ dfuse_launch_fuse(struct dfuse_info *dfuse_info, struct fuse_args *args) DFUSE_TRA_ERROR(dfuse_info, "Error sending signal to fg: "DF_RC, DP_RC(rc)); /* Blocking */ - if (dfuse_info->di_threaded) - rc = dfuse_loop(dfuse_info); - else - rc = fuse_session_loop(dfuse_info->di_session); - if (rc != 0) + rc = dfuse_loop(dfuse_info); + if (rc != 0) { DHS_ERROR(dfuse_info, rc, "Fuse loop exited"); + rc = daos_errno2der(rc); + } umount: fuse_session_unmount(dfuse_info->di_session); - return daos_errno2der(rc); + return rc; } #define DF_POOL_PREFIX "pool=" @@ -279,7 +279,6 @@ show_help(char *name) " --path= Path to load UNS pool/container data\n" " --sys-name=STR DAOS system name context for servers\n" "\n" - " -S --singlethread Single threaded (deprecated)\n" " -t --thread-count=count Total number of threads to use\n" " -e --eq-count=count Number of event queues to use\n" " -f --foreground Run in foreground\n" @@ -423,7 +422,6 @@ main(int argc, char **argv) {"pool", required_argument, 0, 'p'}, {"container", required_argument, 0, 'c'}, {"sys-name", required_argument, 0, 'G'}, - {"singlethread", no_argument, 0, 'S'}, {"thread-count", required_argument, 0, 't'}, {"eq-count", required_argument, 0, 'e'}, {"foreground", no_argument, 0, 'f'}, @@ -447,13 +445,12 @@ main(int argc, char **argv) if (dfuse_info == NULL) D_GOTO(out_debug, rc = -DER_NOMEM); - dfuse_info->di_threaded = true; dfuse_info->di_caching = true; dfuse_info->di_wb_cache = true; dfuse_info->di_eq_count = 1; while (1) { - c = getopt_long(argc, argv, "Mm:St:o:fhe:v", long_options, NULL); + c = getopt_long(argc, argv, "Mm:t:o:fhe:v", long_options, NULL); if (c == -1) break; @@ -491,13 +488,6 @@ main(int argc, char **argv) case 'P': path = optarg; break; - case 'S': - /* Set it to be single threaded, but allow an extra one - * for the event queue processing - */ - dfuse_info->di_threaded = false; - dfuse_info->di_thread_count = 2; - break; case 'e': dfuse_info->di_eq_count = atoi(optarg); break; @@ -564,7 +554,7 @@ main(int argc, char **argv) * check CPU binding. If bound to a number of cores then launch that number of threads, * if not bound them limit to 16. */ - if (dfuse_info->di_threaded && !have_thread_count) { + if (!have_thread_count) { struct hwloc_topology *hwt; hwloc_const_cpuset_t hw; int total; diff --git a/src/client/dfuse/pil4dfs/hook.c b/src/client/dfuse/pil4dfs/hook.c index 4af38d885db..09a54e7351c 100644 --- a/src/client/dfuse/pil4dfs/hook.c +++ b/src/client/dfuse/pil4dfs/hook.c @@ -42,6 +42,7 @@ #include #include #include +#include #include "hook.h" #include "hook_int.h" @@ -89,10 +90,15 @@ static uint64_t lib_base_addr[MAX_NUM_LIB]; /* List of names of loaded libraries */ static char **lib_name_list; +/* libc version number in current process. e.g., 2.28 */ +static float libc_version; +static char *libc_version_str; + /* end to compile list of memory blocks in /proc/pid/maps */ static char *path_ld; static char *path_libc; +static char *path_libdl; static char *path_libpthread; /* This holds the path of libpil4dfs.so. It is needed when we want to * force child processes append libpil4dfs.so to env LD_PRELOAD. 
*/ @@ -213,7 +219,7 @@ determine_lib_path(void) { int path_offset = 0, read_size, i, rc; char *read_buff_map = NULL; - char *pos, *start, *end, lib_ver_str[32] = "", *lib_dir_str = NULL; + char *pos, *start, *end, *lib_dir_str = NULL; read_size = read_map_file(&read_buff_map); @@ -290,19 +296,17 @@ determine_lib_path(void) goto err; path_libc[end - start] = 0; - pos = strstr(path_libc, "libc-2."); - if (pos) { - /* containing version in name. example, 2.17 */ - memcpy(lib_ver_str, pos + 5, 4); - lib_ver_str[4] = 0; + if (libc_version_str == NULL) { + libc_version_str = (char *)gnu_get_libc_version(); + if (libc_version_str == NULL) { + DS_ERROR(errno, "Failed to determine libc version"); + goto err; + } + libc_version = atof(libc_version_str); } - if (lib_ver_str[0]) { - /* with version in name */ - rc = asprintf(&path_libpthread, "%s/libpthread-%s.so", lib_dir_str, lib_ver_str); - } else { - rc = asprintf(&path_libpthread, "%s/libpthread.so.0", lib_dir_str); - } + /* with version in name */ + rc = asprintf(&path_libpthread, "%s/libpthread-%s.so", lib_dir_str, libc_version_str); if (rc < 0) { DS_ERROR(ENOMEM, "Failed to allocate memory for path_libpthread"); goto err_1; @@ -312,7 +316,18 @@ determine_lib_path(void) path_libpthread = NULL; DS_ERROR(ENAMETOOLONG, "path_libpthread is too long"); goto err_1; - } + } + rc = asprintf(&path_libdl, "%s/libdl-%s.so", lib_dir_str, libc_version_str); + if (rc < 0) { + DS_ERROR(ENOMEM, "Failed to allocate memory for path_libdl"); + goto err_1; + } + if (rc >= PATH_MAX) { + free(path_libdl); + path_libdl = NULL; + DS_ERROR(ENAMETOOLONG, "path_libdl is too long"); + goto err_1; + } D_FREE(lib_dir_str); if (strstr(read_buff_map, "libioil.so")) { @@ -354,6 +369,11 @@ query_pil4dfs_path(void) return path_libpil4dfs; } +float +query_libc_version(void) +{ + return libc_version; +} /* * query_func_addr - Determine the addresses and code sizes of functions in func_name_list[]. 
@@ -760,6 +780,7 @@ free_memory_in_hook(void) D_FREE(path_ld); D_FREE(path_libc); D_FREE(module_list); + free(path_libdl); free(path_libpthread); if (lib_name_list) { @@ -1040,6 +1061,8 @@ register_a_hook(const char *module_name, const char *func_name, const void *new_ module_name_local = path_ld; else if (strncmp(module_name, "libc", 5) == 0) module_name_local = path_libc; + else if (strncmp(module_name, "libdl", 6) == 0) + module_name_local = path_libdl; else if (strncmp(module_name, "libpthread", 11) == 0) module_name_local = path_libpthread; else diff --git a/src/client/dfuse/pil4dfs/hook.h b/src/client/dfuse/pil4dfs/hook.h index 7742faaff53..b686d99ce4e 100644 --- a/src/client/dfuse/pil4dfs/hook.h +++ b/src/client/dfuse/pil4dfs/hook.h @@ -60,4 +60,10 @@ free_memory_in_hook(void); char * query_pil4dfs_path(void); +/** + * return glibc version in current process + */ +float +query_libc_version(void); + #endif diff --git a/src/client/dfuse/pil4dfs/int_dfs.c b/src/client/dfuse/pil4dfs/int_dfs.c index bfd3dcd0ff1..0c7db5d9563 100644 --- a/src/client/dfuse/pil4dfs/int_dfs.c +++ b/src/client/dfuse/pil4dfs/int_dfs.c @@ -159,6 +159,7 @@ static long int page_size; #define DAOS_INIT_RUNNING 1 static _Atomic uint64_t mpi_init_count; +static _Atomic int64_t zeInit_count; static long int daos_initing; _Atomic bool d_daos_inited; @@ -488,6 +489,9 @@ static int (*next_tcgetattr)(int fd, void *termios_p); static int (*next_mpi_init)(int *argc, char ***argv); static int (*next_pmpi_init)(int *argc, char ***argv); +static int (*next_ze_init)(int flags); +static void *(*next_dlsym)(void *handle, const char *symbol); +static void *(*new_dlsym)(void *handle, const char *symbol); /* to do!! */ /** @@ -1074,6 +1078,143 @@ PMPI_Init(int *argc, char ***argv) return rc; } +int +zeInit(int flags) +{ + int rc; + + if (next_ze_init == NULL) { + if (d_hook_enabled) + next_ze_init = next_dlsym(RTLD_NEXT, "zeInit"); + else + next_ze_init = dlsym(RTLD_NEXT, "zeInit"); + } + D_ASSERT(next_ze_init != NULL); + atomic_fetch_add_relaxed(&zeInit_count, 1); + rc = next_ze_init(flags); + atomic_fetch_add_relaxed(&zeInit_count, -1); + return rc; +} + +#if defined(__x86_64__) +/* This is used to work around compiling warning and limitations of using asm function. */ +static void * +query_new_dlsym_addr(void *addr) +{ + int i; + + /* assume little endian */ + for (i = 0; i < 64; i++) { + /* 0x56579090 is corresponding to the first four instructions at new_dlsym_asm. + * 0x90 - nop, 0x90 - nop, 0x57 - push %rdi, 0x56 - push %rsi + */ + if (*((int *)(addr + i)) == 0x56579090) { + /* two nop are added for easier positioning. offset +2 here to skip two + * nop and start from the real entry. + */ + return ((void *)(addr + i + 2)); + } + } + return NULL; +} + +_Pragma("GCC diagnostic push") +_Pragma("GCC diagnostic ignored \"-Wunused-function\"") +_Pragma("GCC diagnostic ignored \"-Wunused-variable\"") + +_Pragma("GCC push_options") +_Pragma("GCC optimize(\"-O0\")") +static char str_zeinit[] = "zeInit"; + +static int +is_hook_enabled(void) +{ + return (d_hook_enabled ? 
(1) : (0)); +} + +/* This wrapper function is introduced to avoid compiling issue with Intel-C on Leap 15.5 */ +static int +my_strcmp(const char *s1, const char *s2) +{ + return strcmp(s1, s2); +} + +static void * +get_zeinit_addr(void) +{ + return (void *)zeInit; +} + +__attribute__((aligned(16))) static void +new_dlsym_marker(void) +{ +} + +__asm__( + "new_dlsym_asm:\n" + "nop\n" + "nop\n" + "push %rdi\n" + "push %rsi\n" + + "call is_hook_enabled\n" + "test %eax,%eax\n" + "je org_dlsym\n" + + "mov %rsi, %rdi\n" + "lea str_zeinit(%rip), %rsi\n" + "call my_strcmp\n" + "test %eax,%eax\n" + "jne org_dlsym\n" + + "pop %rsi\n" + "pop %rdi\n" + "call *next_dlsym(%rip)\n" + "mov %rax, next_ze_init(%rip)\n" + + "test %eax,%eax\n" + "jne found\n" + "ret\n" + + "found:\n" + "call get_zeinit_addr\n" + "ret\n" + + "org_dlsym:\n" + "pop %rsi\n" + "pop %rdi\n" + "jmp *next_dlsym(%rip)\n" +); +_Pragma("GCC pop_options") +_Pragma("GCC diagnostic pop") + +#else +/* c code for other architecture. caller info could be wrong inside libc dlsym() when handle is set + * RTLD_NEXT. Assembly version implementation similar to above is needed to fix the issue by using + * jump instead of call instruction. + */ +static void * +new_dlsym_c(void *handle, const char *symbol) +{ + if (!d_hook_enabled) + goto org_dlsym; + printf("Inside my dlsym().\n"); + if (strcmp(symbol, "zeInit") != 0) + goto org_dlsym; + + next_ze_init = next_dlsym(handle, symbol); + if (next_ze_init) + /* dlsym() finished successfully, then intercept zeInit() */ + return zeInit; + else + return next_ze_init; + +org_dlsym: + /* Ideally we need to adjust stack and jump to next_dlsym(). */ + return next_dlsym(handle, symbol); +} +#endif + /** determine whether a path (both relative and absolute) is on DAOS or not. If yes, * returns parent object, item name, full path of parent dir, full absolute path, and * the pointer to struct dfs_mt. @@ -1180,6 +1321,15 @@ query_path(const char *szInput, int *is_target_path, struct dcache_rec **parent, goto out_normal; } + /* Check whether zeInit() is running. If yes, pass to the original + * libc functions. Avoid possible zeInit reentrancy/nested call. + */ + + if (atomic_load_relaxed(&zeInit_count) > 0) { + *is_target_path = 0; + goto out_normal; + } + /* daos_init() is expensive to call. We call it only when necessary. */ /* Check whether daos_init() is running. If yes, pass to the original @@ -2051,6 +2201,7 @@ open_common(int (*real_open)(const char *pathname, int oflags, ...), const char if (!is_target_path) goto org_func; + atomic_fetch_add_relaxed(&num_open, 1); if (oflags & O_CREAT && (oflags & O_DIRECTORY || oflags & O_PATH)) { /* Create a dir is not supported. */ errno = ENOENT; @@ -2078,7 +2229,6 @@ open_common(int (*real_open)(const char *pathname, int oflags, ...), const char } /* Need to create a fake fd and associate with fd_kernel */ - atomic_fetch_add_relaxed(&num_open, 1); dfs_get_mode(dfs_obj, &mode_query); /* regular file */ @@ -2254,7 +2404,6 @@ open_common(int (*real_open)(const char *pathname, int oflags, ...), const char return (idx_dirfd + FD_DIR_BASE); } - atomic_fetch_add_relaxed(&num_open, 1); rc = find_next_available_fd(NULL, &idx_fd); if (rc) @@ -6092,7 +6241,7 @@ ioctl(int fd, unsigned long request, ...) va_list arg; void *param; struct dfuse_user_reply *reply; - int fd_directed; + int fd_directed = fd; va_start(arg, request); param = va_arg(arg, void *); @@ -6118,12 +6267,11 @@ ioctl(int fd, unsigned long request, ...) 
return next_ioctl(fd, request, param); fd_directed = d_get_fd_redirected(fd); - if (fd_directed < FD_FILE_BASE) + if ((fd_directed < FD_FILE_BASE) || (fd_directed >= (FD_DIR_BASE + MAX_OPENED_DIR))) return next_ioctl(fd, request, param); errno = ENOTSUP; - - return -1; + return (-1); } int @@ -6936,6 +7084,18 @@ check_bypasslist(void) return; } +#define SMALL_DIFF (0.0001) +static int +libc_ver_cmp(float ver_a, float ver_b) +{ + if ((ver_a + SMALL_DIFF) < ver_b) + return (-1); + else if (ver_a > (ver_b + SMALL_DIFF)) + return (1); + else + return (0); +} + static __attribute__((constructor)) void init_myhook(void) { @@ -6944,6 +7104,7 @@ init_myhook(void) char *env_no_bypass; int rc; uint64_t eq_count_loc = 0; + float libc_version; /* D_IL_NO_BYPASS is ONLY for testing. It always keeps function interception enabled in * current process and children processes. This is needed to thoroughly test interception @@ -7118,6 +7279,18 @@ init_myhook(void) register_a_hook("libc", "dup3", (void *)new_dup3, (long int *)(&libc_dup3)); register_a_hook("libc", "readlink", (void *)new_readlink, (long int *)(&libc_readlink)); +#if defined(__x86_64__) + new_dlsym = query_new_dlsym_addr(new_dlsym_marker); +#else + new_dlsym = new_dlsym_c; +#endif + D_ASSERT(new_dlsym != NULL); + libc_version = query_libc_version(); + if (libc_ver_cmp(libc_version, 2.34) < 0) + register_a_hook("libdl", "dlsym", (void *)new_dlsym, (long int *)(&next_dlsym)); + else + register_a_hook("libc", "dlsym", (void *)new_dlsym, (long int *)(&next_dlsym)); + init_fd_dup2_list(); if (is_bash && no_dcache_in_bash) @@ -7127,6 +7300,10 @@ init_myhook(void) dcache_rec_timeout = 0; install_hook(); + + /* Check it here to minimize the work in function new_dlsym() written in assembly */ + D_ASSERT(next_dlsym != NULL); + d_hook_enabled = 1; hook_enabled_bak = d_hook_enabled; } diff --git a/src/client/dfuse/pil4dfs/pil4dfs_int.h b/src/client/dfuse/pil4dfs/pil4dfs_int.h index a9c54b55555..0693123b51f 100644 --- a/src/client/dfuse/pil4dfs/pil4dfs_int.h +++ b/src/client/dfuse/pil4dfs/pil4dfs_int.h @@ -30,7 +30,7 @@ /* FD_FILE_BASE - The base number of the file descriptor for a directory. * The fd allocate from this lib is always larger than FD_FILE_BASE. 
*/ -#define FD_DIR_BASE (0x40000000) +#define FD_DIR_BASE (FD_FILE_BASE + MAX_OPENED_FILE) /* structure allocated for a FD for a file */ struct file_obj { diff --git a/src/common/SConscript b/src/common/SConscript index 0eec057198d..9d4c522536e 100644 --- a/src/common/SConscript +++ b/src/common/SConscript @@ -30,7 +30,7 @@ def build_daos_common(denv, client): 'dav/ravl_interval.c', 'dav/recycler.c', 'dav/stats.c', 'dav/tx.c', 'dav/ulog.c', 'dav/util.c', 'dav/wal_tx.c'] ad_mem_files = ['ad_mem.c', 'ad_tx.c'] - common_libs.extend(['pmemobj', 'abt']) + common_libs.extend(['pmemobj', 'abt', 'dav_v2']) benv.AppendUnique(RPATH_FULL=['$PREFIX/lib64/daos_srv']) benv.Append(CPPDEFINES=['-DDAOS_PMEM_BUILD']) benv.Append(OBJPREFIX="v_") @@ -51,6 +51,7 @@ def scons(): """Execute build""" Import('env', 'base_env', 'prereqs') + SConscript('dav_v2/SConscript') env.AppendUnique(LIBPATH=[Dir('.')]) base_env.AppendUnique(LIBPATH=[Dir('.')]) base_env.d_add_build_rpath() diff --git a/src/common/ad_tx.c b/src/common/ad_tx.c index a68ac18eb0e..9ca51db969e 100644 --- a/src/common/ad_tx.c +++ b/src/common/ad_tx.c @@ -1147,8 +1147,8 @@ umo_tx_free(struct umem_instance *umm, umem_off_t umoff) } static umem_off_t -umo_tx_alloc(struct umem_instance *umm, size_t size, uint64_t flags, - unsigned int type_num) +umo_tx_alloc(struct umem_instance *umm, size_t size, uint64_t flags, unsigned int type_num, + unsigned int mbkt_id) { struct ad_tx *tx = tx_get(); struct ad_blob_handle bh = umm2ad_blob_hdl(umm); @@ -1242,7 +1242,8 @@ umo_tx_add_ptr(struct umem_instance *umm, void *ptr, size_t size) } static umem_off_t -umo_reserve(struct umem_instance *umm, void *act, size_t size, unsigned int type_num) +umo_reserve(struct umem_instance *umm, void *act, size_t size, unsigned int type_num, + unsigned int mbkt_id) { struct ad_blob_handle bh = umm2ad_blob_hdl(umm); struct ad_reserv_act *ract = act; @@ -1330,9 +1331,10 @@ umo_atomic_copy(struct umem_instance *umm, void *dest, const void *src, size_t l } static umem_off_t -umo_atomic_alloc(struct umem_instance *umm, size_t size, unsigned int type_num) +umo_atomic_alloc(struct umem_instance *umm, size_t size, unsigned int type_num, + unsigned int mbkt_id) { - return umo_tx_alloc(umm, size, 0, type_num); + return umo_tx_alloc(umm, size, 0, type_num, mbkt_id); } static int diff --git a/src/common/btree.c b/src/common/btree.c index 6bf1bdb2b15..579b921d768 100644 --- a/src/common/btree.c +++ b/src/common/btree.c @@ -945,8 +945,12 @@ btr_root_alloc(struct btr_context *tcx) struct btr_instance *tins = &tcx->tc_tins; struct btr_root *root; - tins->ti_root_off = umem_zalloc(btr_umm(tcx), - sizeof(struct btr_root)); + if (btr_ops(tcx)->to_node_alloc != NULL) + tins->ti_root_off = btr_ops(tcx)->to_node_alloc(&tcx->tc_tins, + sizeof(struct btr_root)); + else + tins->ti_root_off = umem_zalloc(btr_umm(tcx), sizeof(struct btr_root)); + if (UMOFF_IS_NULL(tins->ti_root_off)) return btr_umm(tcx)->umm_nospc_rc; diff --git a/src/common/dav/bucket.c b/src/common/dav/bucket.c index 8df41288a13..55e72b45ce8 100644 --- a/src/common/dav/bucket.c +++ b/src/common/dav/bucket.c @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: BSD-3-Clause */ -/* Copyright 2015-2022, Intel Corporation */ +/* Copyright 2015-2023, Intel Corporation */ /* * bucket.c -- bucket implementation diff --git a/src/common/dav/bucket.h b/src/common/dav/bucket.h index aadc6e714fc..8f5754324f5 100644 --- a/src/common/dav/bucket.h +++ b/src/common/dav/bucket.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: BSD-3-Clause */ -/* Copyright 2015-2021, 
Intel Corporation */ +/* Copyright 2015-2023, Intel Corporation */ /* * bucket.h -- internal definitions for bucket diff --git a/src/common/dav/dav.h b/src/common/dav/dav.h index 72f836c937b..1c1840a9bb3 100644 --- a/src/common/dav/dav.h +++ b/src/common/dav/dav.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: BSD-3-Clause */ -/* Copyright 2015-2022, Intel Corporation */ +/* Copyright 2015-2024, Intel Corporation */ /* * dav_flags.h -- Interfaces exported by DAOS internal Allocator for VOS (DAV) @@ -23,17 +23,23 @@ #define DAV_FLAG_TX_NO_ABORT (((uint64_t)1) << 4) #define DAV_CLASS_ID(id) (((uint64_t)(id)) << 48) -#define DAV_ARENA_ID(id) (((uint64_t)(id)) << 32) +#ifdef DAV_V2_BUILD +#define DAV_EZONE_ID(id) (((uint64_t)(id)) << 16) +#endif /* DAV_V2_BUILD */ #define DAV_XALLOC_CLASS_MASK ((((uint64_t)1 << 16) - 1) << 48) -#define DAV_XALLOC_ARENA_MASK ((((uint64_t)1 << 16) - 1) << 32) +#ifdef DAV_V2_BUILD +#define DAV_XALLOC_EZONE_MASK ((((uint64_t)1 << 32) - 1) << 16) +#else /* DAV_V2_BUILD */ +#define DAV_XALLOC_EZONE_MASK 0 +#endif /* DAV_V2_BUILD */ #define DAV_XALLOC_ZERO DAV_FLAG_ZERO #define DAV_XALLOC_NO_FLUSH DAV_FLAG_NO_FLUSH #define DAV_XALLOC_NO_ABORT DAV_FLAG_TX_NO_ABORT #define DAV_TX_XALLOC_VALID_FLAGS (DAV_XALLOC_ZERO |\ DAV_XALLOC_NO_FLUSH |\ - DAV_XALLOC_ARENA_MASK |\ + DAV_XALLOC_EZONE_MASK |\ DAV_XALLOC_CLASS_MASK |\ DAV_XALLOC_NO_ABORT) diff --git a/src/common/dav/dav_iface.c b/src/common/dav/dav_iface.c index 7d0efa14b4b..4c8448c4b19 100644 --- a/src/common/dav/dav_iface.c +++ b/src/common/dav/dav_iface.c @@ -1,5 +1,5 @@ /** - * (C) Copyright 2015-2023 Intel Corporation. + * (C) Copyright 2015-2024 Intel Corporation. * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -94,7 +94,9 @@ dav_obj_open_internal(int fd, int flags, size_t sz, const char *path, struct ume if (hdl->do_store->stor_priv == NULL) { D_ERROR("meta context not defined. 
WAL commit disabled for %s\n", path); } else { - rc = umem_cache_alloc(store, 0); + num_pages = (sz + UMEM_CACHE_PAGE_SZ - 1) >> UMEM_CACHE_PAGE_SZ_SHIFT; + rc = umem_cache_alloc(store, UMEM_CACHE_PAGE_SZ, num_pages, 0, 0, 0, base, NULL, + NULL, NULL); if (rc != 0) { D_ERROR("Could not allocate page cache: rc=" DF_RC "\n", DP_RC(rc)); err = rc; @@ -104,14 +106,6 @@ dav_obj_open_internal(int fd, int flags, size_t sz, const char *path, struct ume D_STRNDUP(hdl->do_path, path, strlen(path)); - num_pages = (sz + UMEM_CACHE_PAGE_SZ - 1) >> UMEM_CACHE_PAGE_SZ_SHIFT; - rc = umem_cache_map_range(hdl->do_store, 0, base, num_pages); - if (rc != 0) { - D_ERROR("Could not allocate page cache: rc=" DF_RC "\n", DP_RC(rc)); - err = rc; - goto out2; - } - if (flags & DAV_HEAP_INIT) { setup_dav_phdr(hdl); heap_base = (char *)hdl->do_base + hdl->do_phdr->dp_heap_offset; @@ -135,7 +129,7 @@ dav_obj_open_internal(int fd, int flags, size_t sz, const char *path, struct ume D_ASSERT(store != NULL); - rc = store->stor_ops->so_load(store, hdl->do_base); + rc = store->stor_ops->so_load(store, hdl->do_base, 0, store->stor_size); if (rc) { D_ERROR("Failed to read blob to vos file %s, rc = %d\n", path, rc); goto out2; diff --git a/src/common/dav/dav_internal.h b/src/common/dav/dav_internal.h index 0f8ddff5916..ae6150c2748 100644 --- a/src/common/dav/dav_internal.h +++ b/src/common/dav/dav_internal.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: BSD-3-Clause */ -/* Copyright 2015-2022, Intel Corporation */ +/* Copyright 2015-2023, Intel Corporation */ /* * dav_flags.h -- Interfaces exported by DAOS internal Allocator for VOS (DAV) diff --git a/src/common/dav/heap.c b/src/common/dav/heap.c index 4384fe40f8c..ee2feca85a1 100644 --- a/src/common/dav/heap.c +++ b/src/common/dav/heap.c @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: BSD-3-Clause */ -/* Copyright 2015-2022, Intel Corporation */ +/* Copyright 2015-2023, Intel Corporation */ /* * heap.c -- heap implementation diff --git a/src/common/dav/heap.h b/src/common/dav/heap.h index d3e2bba4cdf..2b3f86e2fff 100644 --- a/src/common/dav/heap.h +++ b/src/common/dav/heap.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: BSD-3-Clause */ -/* Copyright 2015-2021, Intel Corporation */ +/* Copyright 2015-2023, Intel Corporation */ /* * heap.h -- internal definitions for heap diff --git a/src/common/dav/obj.h b/src/common/dav/obj.h index 3140235d105..e85c0d317e8 100644 --- a/src/common/dav/obj.h +++ b/src/common/dav/obj.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: BSD-3-Clause */ -/* Copyright 2014-2021, Intel Corporation */ +/* Copyright 2014-2023, Intel Corporation */ /* * obj.h -- internal definitions for obj module diff --git a/src/common/dav/palloc.c b/src/common/dav/palloc.c index a7b5424576f..59b4d1833f0 100644 --- a/src/common/dav/palloc.c +++ b/src/common/dav/palloc.c @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: BSD-3-Clause */ -/* Copyright 2015-2022, Intel Corporation */ +/* Copyright 2015-2023, Intel Corporation */ /* * palloc.c -- implementation of pmalloc POSIX-like API diff --git a/src/common/dav/palloc.h b/src/common/dav/palloc.h index 9c7560f1aaa..047bee47424 100644 --- a/src/common/dav/palloc.h +++ b/src/common/dav/palloc.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: BSD-3-Clause */ -/* Copyright 2015-2020, Intel Corporation */ +/* Copyright 2015-2023, Intel Corporation */ /* * palloc.h -- internal definitions for persistent allocator diff --git a/src/common/dav/recycler.c b/src/common/dav/recycler.c index 07537a44bd4..392610985a5 100644 --- a/src/common/dav/recycler.c 
+++ b/src/common/dav/recycler.c @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: BSD-3-Clause */ -/* Copyright 2016-2022, Intel Corporation */ +/* Copyright 2016-2023, Intel Corporation */ /* * recycler.c -- implementation of run recycler diff --git a/src/common/dav/recycler.h b/src/common/dav/recycler.h index 2d68d8d70fc..e89720f8871 100644 --- a/src/common/dav/recycler.h +++ b/src/common/dav/recycler.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: BSD-3-Clause */ -/* Copyright 2016-2021, Intel Corporation */ +/* Copyright 2016-2023, Intel Corporation */ /* * recycler.h -- internal definitions of run recycler diff --git a/src/common/dav/tx.c b/src/common/dav/tx.c index 45b3daba73c..6d1efe0b8e7 100644 --- a/src/common/dav/tx.c +++ b/src/common/dav/tx.c @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: BSD-3-Clause */ -/* Copyright 2015-2022, Intel Corporation */ +/* Copyright 2015-2023, Intel Corporation */ /* * tx.c -- transactions implementation diff --git a/src/common/dav_v2/README.md b/src/common/dav_v2/README.md new file mode 100644 index 00000000000..008b3202327 --- /dev/null +++ b/src/common/dav_v2/README.md @@ -0,0 +1,6 @@ +# DAOS Allocator for VOS + +The DAV allocator for md_on_ssd phase 2 now supports evictable zones. This introduces change in the +layout of heap and is not compatible with the DAV allocator of phase 1. In order to support both +layouts the new allocator is packaged as a different library and linked to daos_common_pmem +library. diff --git a/src/common/dav_v2/SConscript b/src/common/dav_v2/SConscript new file mode 100644 index 00000000000..fe69cb34697 --- /dev/null +++ b/src/common/dav_v2/SConscript @@ -0,0 +1,31 @@ +"""Build dav_v2 libraries""" + + +SRC = ['alloc_class.c', 'bucket.c', 'container_ravl.c', 'container_seglists.c', 'critnib.c', + 'dav_clogs.c', 'dav_iface.c', 'heap.c', 'memblock.c', 'memops.c', 'meta_io.c', + 'palloc.c', 'ravl.c', 'ravl_interval.c', 'recycler.c', 'stats.c', 'tx.c', 'ulog.c', + 'util.c', 'wal_tx.c'] + + +def scons(): + """Scons function""" + + Import('env', 'base_env') + + env.AppendUnique(LIBPATH=[Dir('.')]) + base_env.AppendUnique(LIBPATH=[Dir('.')]) + base_env.d_add_build_rpath() + env.d_add_build_rpath() + + denv = env.Clone() + + denv.AppendUnique(LIBS=['pthread', 'gurt']) + denv.Append(CPPDEFINES=['-DDAOS_PMEM_BUILD', '-DDAV_V2_BUILD']) + denv.AppendUnique(CFLAGS=['-fvisibility=hidden']) + + dav_v2 = denv.d_library('dav_v2', SRC) + denv.Install('$PREFIX/lib64/', dav_v2) + + +if __name__ == "SCons.Script": + scons() diff --git a/src/common/dav_v2/alloc_class.c b/src/common/dav_v2/alloc_class.c new file mode 100644 index 00000000000..02c968c2d4f --- /dev/null +++ b/src/common/dav_v2/alloc_class.c @@ -0,0 +1,647 @@ +/* SPDX-License-Identifier: BSD-3-Clause */ +/* Copyright 2016-2024, Intel Corporation */ + +/* + * alloc_class.c -- implementation of allocation classes + */ + +#include +#include + +#include "alloc_class.h" +#include "heap_layout.h" +#include "util.h" +#include "out.h" +#include "bucket.h" +#include "critnib.h" + +#define RUN_CLASS_KEY_PACK(map_idx_s, flags_s, size_idx_s)\ +((uint64_t)(map_idx_s) << 32 |\ +(uint64_t)(flags_s) << 16 |\ +(uint64_t)(size_idx_s)) + +/* + * Value used to mark a reserved spot in the bucket array. + */ +#define ACLASS_RESERVED ((void *)0xFFFFFFFFULL) + +/* + * The last size that is handled by runs. + */ +#define MAX_RUN_SIZE (CHUNKSIZE * 10) + +/* + * Maximum number of bytes the allocation class generation algorithm can decide + * to waste in a single run chunk. 
+ */ +#define MAX_RUN_WASTED_BYTES 1024 + +/* + * Allocation categories are used for allocation classes generation. Each one + * defines the biggest handled size (in bytes) and step pct of the generation + * process. The step percentage defines maximum allowed external fragmentation + * for the category. + */ +#define MAX_ALLOC_CATEGORIES 9 + +/* + * The first size (in byes) which is actually used in the allocation + * class generation algorithm. All smaller sizes use the first predefined bucket + * with the smallest run unit size. + */ +#define FIRST_GENERATED_CLASS_SIZE 128 + +/* + * The granularity of the allocation class generation algorithm. + */ +#define ALLOC_BLOCK_SIZE_GEN 64 + +/* + * The first predefined allocation class size + */ +#define MIN_UNIT_SIZE 128 + +static const struct { + size_t size; + float step; +} categories[MAX_ALLOC_CATEGORIES] = { + /* dummy category - the first allocation class is predefined */ + {FIRST_GENERATED_CLASS_SIZE, 0.05f}, + {1024, 0.05f}, + {2048, 0.05f}, + {4096, 0.05f}, + {8192, 0.05f}, + {16384, 0.05f}, + {32768, 0.05f}, + {131072, 0.05f}, + {393216, 0.05f}, +}; + +#define RUN_UNIT_MAX_ALLOC 8U + +/* + * Every allocation has to be a multiple of at least 8 because we need to + * ensure proper alignment of every persistent structure. + */ +#define ALLOC_BLOCK_SIZE 16 + +/* + * Converts size (in bytes) to number of allocation blocks. + */ +#define SIZE_TO_CLASS_MAP_INDEX(_s, _g) (1 + (((_s) - 1) / (_g))) + +/* + * Target number of allocations per run instance. + */ +#define RUN_MIN_NALLOCS 200 + +/* + * Hard limit of chunks per single run. + */ +#define RUN_SIZE_IDX_CAP (16) + +#define ALLOC_CLASS_DEFAULT_FLAGS CHUNK_FLAG_FLEX_BITMAP + +struct alloc_class_collection { + size_t granularity; + + struct alloc_class *aclasses[MAX_ALLOCATION_CLASSES]; + + /* + * The last size (in bytes) that is handled by runs, everything bigger + * uses the default class. + */ + size_t last_run_max_size; + + /* maps allocation classes to allocation sizes, excluding the header! */ + uint8_t *class_map_by_alloc_size; + + /* maps allocation classes to run unit sizes */ + struct critnib *class_map_by_unit_size; + + int fail_on_missing_class; + int autogenerate_on_missing_class; +}; + +/* + * alloc_class_find_first_free_slot -- searches for the + * first available allocation class slot + * + * This function must be thread-safe because allocation classes can be created + * at runtime. + */ +int +alloc_class_find_first_free_slot(struct alloc_class_collection *ac, + uint8_t *slot) +{ + for (int n = 0; n < MAX_ALLOCATION_CLASSES; ++n) { + if (util_bool_compare_and_swap64(&ac->aclasses[n], + NULL, ACLASS_RESERVED)) { + *slot = (uint8_t)n; + return 0; + } + } + + return -1; +} + +/* + * alloc_class_reserve -- reserve the specified class id + */ +int +alloc_class_reserve(struct alloc_class_collection *ac, uint8_t id) +{ + return util_bool_compare_and_swap64(&ac->aclasses[id], + NULL, ACLASS_RESERVED) ? 
0 : -1; +} + +/* + * alloc_class_reservation_clear -- removes the reservation on class id + */ +static void +alloc_class_reservation_clear(struct alloc_class_collection *ac, int id) +{ + int ret = util_bool_compare_and_swap64(&ac->aclasses[id], + ACLASS_RESERVED, NULL); + ASSERT(ret); +} + +/* + * alloc_class_new -- creates a new allocation class + */ +struct alloc_class * +alloc_class_new(int id, struct alloc_class_collection *ac, + enum alloc_class_type type, enum header_type htype, + size_t unit_size, size_t alignment, + uint32_t size_idx) +{ + DAV_DBG("alloc_class_new id:%d\n", + (type == CLASS_HUGE) ? DEFAULT_ALLOC_CLASS_ID : id); + + struct alloc_class *c; + + D_ALLOC_PTR_NZ(c); + + if (c == NULL) + goto error_class_alloc; + + c->unit_size = unit_size; + c->header_type = htype; + c->type = type; + c->flags = (uint16_t) + (header_type_to_flag[c->header_type] | + (alignment ? CHUNK_FLAG_ALIGNED : 0)) | + ALLOC_CLASS_DEFAULT_FLAGS; + + switch (type) { + case CLASS_HUGE: + id = DEFAULT_ALLOC_CLASS_ID; + break; + case CLASS_RUN: + c->rdsc.alignment = alignment; + memblock_run_bitmap(&size_idx, c->flags, unit_size, + alignment, NULL, &c->rdsc.bitmap); + c->rdsc.nallocs = c->rdsc.bitmap.nbits; + c->rdsc.size_idx = size_idx; + + /* these two fields are duplicated from class */ + c->rdsc.unit_size = c->unit_size; + c->rdsc.flags = c->flags; + + uint8_t slot = (uint8_t)id; + + if (id < 0 && alloc_class_find_first_free_slot(ac, + &slot) != 0) + goto error_map_insert; + id = slot; + + size_t map_idx = SIZE_TO_CLASS_MAP_INDEX(c->unit_size, + ac->granularity); + ASSERT(map_idx <= UINT32_MAX); + uint32_t map_idx_s = (uint32_t)map_idx; + uint16_t size_idx_s = (uint16_t)size_idx; + uint16_t flags_s = (uint16_t)c->flags; + uint64_t k = RUN_CLASS_KEY_PACK(map_idx_s, + flags_s, size_idx_s); + + if (critnib_insert(ac->class_map_by_unit_size, + k, c) != 0) { + ERR("unable to register allocation class"); + goto error_map_insert; + } + + break; + default: + ASSERT(0); + } + + c->id = (uint8_t)id; + ac->aclasses[c->id] = c; + return c; + +error_map_insert: + D_FREE(c); +error_class_alloc: + if (id >= 0) + alloc_class_reservation_clear(ac, id); + + D_CRIT("alloc_class_new failed\n"); + return NULL; +} + +/* + * alloc_class_delete -- (internal) deletes an allocation class + */ +void +alloc_class_delete(struct alloc_class_collection *ac, + struct alloc_class *c) +{ + DAV_DBG("alloc_class_delete: %d\n", c->id); + + ac->aclasses[c->id] = NULL; + D_FREE(c); +} + +/* + * alloc_class_find_or_create -- (internal) searches for the + * biggest allocation class for which unit_size is evenly divisible by n. + * If no such class exists, create one. 
+ */ +static struct alloc_class * +alloc_class_find_or_create(struct alloc_class_collection *ac, size_t n) +{ + COMPILE_ERROR_ON(MAX_ALLOCATION_CLASSES > UINT8_MAX); + uint64_t required_size_bytes = n * RUN_MIN_NALLOCS; + uint32_t required_size_idx = 1; + + if (required_size_bytes > RUN_DEFAULT_SIZE) { + required_size_bytes -= RUN_DEFAULT_SIZE; + required_size_idx += + CALC_SIZE_IDX(CHUNKSIZE, required_size_bytes); + if (required_size_idx > RUN_SIZE_IDX_CAP) + required_size_idx = RUN_SIZE_IDX_CAP; + } + + for (int i = MAX_ALLOCATION_CLASSES - 1; i >= 0; --i) { + struct alloc_class *c = ac->aclasses[i]; + + if (c == NULL || c->type == CLASS_HUGE || + c->rdsc.size_idx < required_size_idx) + continue; + + if (n % c->unit_size == 0 && + n / c->unit_size <= RUN_UNIT_MAX_ALLOC) + return c; + } + + /* + * In order to minimize the wasted space at the end of the run the + * run data size must be divisible by the allocation class unit size + * with the smallest possible remainder, preferably 0. + */ + struct run_bitmap b; + size_t runsize_bytes = 0; + + do { + if (runsize_bytes != 0) /* don't increase on first iteration */ + n += ALLOC_BLOCK_SIZE_GEN; + + uint32_t size_idx = required_size_idx; + + memblock_run_bitmap(&size_idx, ALLOC_CLASS_DEFAULT_FLAGS, n, 0, + NULL, &b); + + runsize_bytes = RUN_CONTENT_SIZE_BYTES(size_idx) - b.size; + } while ((runsize_bytes % n) > MAX_RUN_WASTED_BYTES); + + /* + * Now that the desired unit size is found the existing classes need + * to be searched for possible duplicates. If a class that can handle + * the calculated size already exists, simply return that. + */ + for (int i = 1; i < MAX_ALLOCATION_CLASSES; ++i) { + struct alloc_class *c = ac->aclasses[i]; + + if (c == NULL || c->type == CLASS_HUGE) + continue; + if (n / c->unit_size <= RUN_UNIT_MAX_ALLOC && + n % c->unit_size == 0) + return c; + if (c->unit_size == n) + return c; + } + + return alloc_class_new(-1, ac, CLASS_RUN, HEADER_COMPACT, n, 0, + required_size_idx); +} + +/* + * alloc_class_find_min_frag -- searches for an existing allocation + * class that will provide the smallest internal fragmentation for the given + * size. + */ +static struct alloc_class * +alloc_class_find_min_frag(struct alloc_class_collection *ac, size_t n) +{ + struct alloc_class *best_c = NULL; + size_t lowest_waste = SIZE_MAX; + + ASSERTne(n, 0); + + /* + * Start from the largest buckets in order to minimize unit size of + * allocated memory blocks. + */ + for (int i = MAX_ALLOCATION_CLASSES - 1; i >= 0; --i) { + struct alloc_class *c = ac->aclasses[i]; + + /* can't use alloc classes /w no headers by default */ + if (c == NULL || c->header_type == HEADER_NONE) + continue; + + size_t real_size = n + header_type_to_size[c->header_type]; + + size_t units = CALC_SIZE_IDX(c->unit_size, real_size); + + /* can't exceed the maximum allowed run unit max */ + if (c->type == CLASS_RUN && units > RUN_UNIT_MAX_ALLOC) + continue; + + if (c->unit_size * units == real_size) + return c; + + size_t waste = (c->unit_size * units) - real_size; + + /* + * If we assume that the allocation class is only ever going to + * be used with exactly one size, the effective internal + * fragmentation would be increased by the leftover + * memory at the end of the run. 
+ */ + if (c->type == CLASS_RUN) { + size_t wasted_units = c->rdsc.nallocs % units; + size_t wasted_bytes = wasted_units * c->unit_size; + size_t waste_avg_per_unit = wasted_bytes / + c->rdsc.nallocs; + + waste += waste_avg_per_unit; + } + + if (best_c == NULL || lowest_waste > waste) { + best_c = c; + lowest_waste = waste; + } + } + + ASSERTne(best_c, NULL); + return best_c; +} + +/* + * alloc_class_collection_new -- creates a new collection of allocation classes + */ +struct alloc_class_collection * +alloc_class_collection_new() +{ + struct alloc_class_collection *ac; + + D_ALLOC_PTR(ac); + if (ac == NULL) + return NULL; + + ac->granularity = ALLOC_BLOCK_SIZE; + ac->last_run_max_size = MAX_RUN_SIZE; + ac->fail_on_missing_class = 0; + ac->autogenerate_on_missing_class = 1; + + size_t maps_size = (MAX_RUN_SIZE / ac->granularity) + 1; + + D_ALLOC_NZ(ac->class_map_by_alloc_size, maps_size); + if (ac->class_map_by_alloc_size == NULL) + goto error; + ac->class_map_by_unit_size = critnib_new(); + if (ac->class_map_by_unit_size == NULL) + goto error; + + memset(ac->class_map_by_alloc_size, 0xFF, maps_size); + + if (alloc_class_new(-1, ac, CLASS_HUGE, HEADER_COMPACT, + CHUNKSIZE, 0, 1) == NULL) + goto error; + + struct alloc_class *predefined_class = + alloc_class_new(-1, ac, CLASS_RUN, HEADER_COMPACT, + MIN_UNIT_SIZE, 0, 1); + if (predefined_class == NULL) + goto error; + + for (size_t i = 0; i < FIRST_GENERATED_CLASS_SIZE / ac->granularity; + ++i) { + ac->class_map_by_alloc_size[i] = predefined_class->id; + } + + /* + * Based on the defined categories, a set of allocation classes is + * created. The unit size of those classes is depended on the category + * initial size and step. + */ + size_t granularity_mask = ALLOC_BLOCK_SIZE_GEN - 1; + + for (int c = 1; c < MAX_ALLOC_CATEGORIES; ++c) { + size_t n = categories[c - 1].size + ALLOC_BLOCK_SIZE_GEN; + + do { + if (alloc_class_find_or_create(ac, n) == NULL) + goto error; + + float stepf = (float)n * categories[c].step; + size_t stepi = (size_t)stepf; + + stepi = (stepf - (float)stepi < FLT_EPSILON) ? + stepi : stepi + 1; + + n += (stepi + (granularity_mask)) & ~granularity_mask; + } while (n <= categories[c].size); + } + + /* + * Find the largest alloc class and use it's unit size as run allocation + * threshold. + */ + uint8_t largest_aclass_slot; + + for (largest_aclass_slot = MAX_ALLOCATION_CLASSES - 1; + largest_aclass_slot > 0 && + ac->aclasses[largest_aclass_slot] == NULL; + --largest_aclass_slot) { + /* intentional NOP */ + } + + struct alloc_class *c = ac->aclasses[largest_aclass_slot]; + + /* + * The actual run might contain less unit blocks than the theoretical + * unit max variable. This may be the case for very large unit sizes. + */ + size_t real_unit_max = (c->rdsc.nallocs < RUN_UNIT_MAX_ALLOC) ? + c->rdsc.nallocs : RUN_UNIT_MAX_ALLOC; + + size_t theoretical_run_max_size = c->unit_size * real_unit_max; + + ac->last_run_max_size = theoretical_run_max_size <= MAX_RUN_SIZE ? + theoretical_run_max_size : MAX_RUN_SIZE; + +#ifdef DAV_EXTRA_DEBUG + /* + * Verify that each bucket's unit size points back to the bucket by the + * bucket map. This must be true for the default allocation classes, + * otherwise duplicate buckets will be created. 
+ */ + for (size_t i = 0; i < MAX_ALLOCATION_CLASSES; ++i) { + struct alloc_class *cl = ac->aclasses[i]; + + if (cl != NULL && cl->type == CLASS_RUN) { + ASSERTeq(i, cl->id); + ASSERTeq(alloc_class_by_run(ac, cl->unit_size, + cl->flags, cl->rdsc.size_idx), cl); + } + } +#endif + + return ac; + +error: + alloc_class_collection_delete(ac); + + return NULL; +} + +/* + * alloc_class_collection_delete -- deletes the allocation class collection and + * all of the classes within it + */ +void +alloc_class_collection_delete(struct alloc_class_collection *ac) +{ + for (size_t i = 0; i < MAX_ALLOCATION_CLASSES; ++i) { + struct alloc_class *c = ac->aclasses[i]; + + if (c != NULL) + alloc_class_delete(ac, c); + } + + if (ac->class_map_by_unit_size) + critnib_delete(ac->class_map_by_unit_size); + D_FREE(ac->class_map_by_alloc_size); + D_FREE(ac); +} + +/* + * alloc_class_assign_by_size -- (internal) chooses the allocation class that + * best approximates the provided size + */ +static struct alloc_class * +alloc_class_assign_by_size(struct alloc_class_collection *ac, + size_t size) +{ + size_t class_map_index = SIZE_TO_CLASS_MAP_INDEX(size, + ac->granularity); + struct alloc_class *c = alloc_class_find_min_frag(ac, + class_map_index * ac->granularity); + + ASSERTne(c, NULL); + + /* + * We don't lock this array because locking this section here and then + * bailing out if someone else was faster would be still slower than + * just calculating the class and failing to assign the variable. + * We are using a compare and swap so that helgrind/drd don't complain. + */ + util_bool_compare_and_swap64( + &ac->class_map_by_alloc_size[class_map_index], + MAX_ALLOCATION_CLASSES, c->id); + + DAV_DBG("alloc_class_assign_by_size: %zu id:%d", + size, c->id); + + return c; +} + +/* + * alloc_class_by_alloc_size -- returns allocation class that is assigned + * to handle an allocation of the provided size + */ +struct alloc_class * +alloc_class_by_alloc_size(struct alloc_class_collection *ac, size_t size) +{ + if (size < ac->last_run_max_size) { + uint8_t class_id = ac->class_map_by_alloc_size[ + SIZE_TO_CLASS_MAP_INDEX(size, ac->granularity)]; + + if (class_id == MAX_ALLOCATION_CLASSES) { + if (ac->fail_on_missing_class) + return NULL; + else if (ac->autogenerate_on_missing_class) + return alloc_class_assign_by_size(ac, size); + else + return ac->aclasses[DEFAULT_ALLOC_CLASS_ID]; + } + + return ac->aclasses[class_id]; + } else { + return ac->aclasses[DEFAULT_ALLOC_CLASS_ID]; + } +} + +/* + * alloc_class_by_run -- returns the allocation class that has the given + * unit size + */ +struct alloc_class * +alloc_class_by_run(struct alloc_class_collection *ac, + size_t unit_size, uint16_t flags, uint32_t size_idx) +{ + size_t map_idx = SIZE_TO_CLASS_MAP_INDEX(unit_size, ac->granularity); + + ASSERT(map_idx <= UINT32_MAX); + + uint32_t map_idx_s = (uint32_t)map_idx; + + ASSERT(size_idx <= MAX_CHUNK); + + uint16_t size_idx_s = (uint16_t)size_idx; + uint16_t flags_s = (uint16_t)flags; + + return critnib_get(ac->class_map_by_unit_size, + RUN_CLASS_KEY_PACK(map_idx_s, flags_s, size_idx_s)); +} + +/* + * alloc_class_by_id -- returns the allocation class with an id + */ +struct alloc_class * +alloc_class_by_id(struct alloc_class_collection *ac, uint8_t id) +{ + return ac->aclasses[id]; +} + +/* + * alloc_class_calc_size_idx -- calculates how many units does the size require + */ +ssize_t +alloc_class_calc_size_idx(struct alloc_class *c, size_t size) +{ + uint32_t size_idx = CALC_SIZE_IDX(c->unit_size, + size + 
header_type_to_size[c->header_type]); + + if (c->type == CLASS_RUN) { + if (c->header_type == HEADER_NONE && size_idx != 1) + return -1; + else if (size_idx > RUN_UNIT_MAX) + return -1; + else if (size_idx > c->rdsc.nallocs) + return -1; + } + + return size_idx; +} diff --git a/src/common/dav_v2/alloc_class.h b/src/common/dav_v2/alloc_class.h new file mode 100644 index 00000000000..48ffd815e26 --- /dev/null +++ b/src/common/dav_v2/alloc_class.h @@ -0,0 +1,71 @@ +/* SPDX-License-Identifier: BSD-3-Clause */ +/* Copyright 2016-2023, Intel Corporation */ + +/* + * alloc_class.h -- internal definitions for allocation classes + */ + +#ifndef __DAOS_COMMON_ALLOC_CLASS_H +#define __DAOS_COMMON_ALLOC_CLASS_H 1 + +#include +#include +#include +#include "heap_layout.h" +#include "memblock.h" + +#define MAX_ALLOCATION_CLASSES (UINT8_MAX) +#define DEFAULT_ALLOC_CLASS_ID (0) +#define RUN_UNIT_MAX RUN_BITS_PER_VALUE + +struct alloc_class_collection; + +enum alloc_class_type { + CLASS_UNKNOWN, + CLASS_HUGE, + CLASS_RUN, + + MAX_ALLOC_CLASS_TYPES +}; + +struct alloc_class { + uint8_t id; + uint16_t flags; + + size_t unit_size; + + enum header_type header_type; + enum alloc_class_type type; + + /* run-specific data */ + struct run_descriptor rdsc; +}; + +struct alloc_class_collection *alloc_class_collection_new(void); +void alloc_class_collection_delete(struct alloc_class_collection *ac); + +struct alloc_class *alloc_class_by_run( + struct alloc_class_collection *ac, + size_t unit_size, uint16_t flags, uint32_t size_idx); +struct alloc_class *alloc_class_by_alloc_size( + struct alloc_class_collection *ac, size_t size); +struct alloc_class *alloc_class_by_id( + struct alloc_class_collection *ac, uint8_t id); + +int alloc_class_reserve(struct alloc_class_collection *ac, uint8_t id); +int alloc_class_find_first_free_slot(struct alloc_class_collection *ac, + uint8_t *slot); + +ssize_t +alloc_class_calc_size_idx(struct alloc_class *c, size_t size); + +struct alloc_class * +alloc_class_new(int id, struct alloc_class_collection *ac, + enum alloc_class_type type, enum header_type htype, + size_t unit_size, size_t alignment, + uint32_t size_idx); + +void alloc_class_delete(struct alloc_class_collection *ac, + struct alloc_class *c); + +#endif /* __DAOS_COMMON_ALLOC_CLASS_H */ diff --git a/src/common/dav_v2/bucket.c b/src/common/dav_v2/bucket.c new file mode 100644 index 00000000000..ab86f94ee6d --- /dev/null +++ b/src/common/dav_v2/bucket.c @@ -0,0 +1,275 @@ +/* SPDX-License-Identifier: BSD-3-Clause */ +/* Copyright 2015-2024, Intel Corporation */ + +/* + * bucket.c -- bucket implementation + * + * Buckets manage volatile state of the heap. They are the abstraction layer + * between the heap-managed chunks/runs and memory allocations. + * + * Each bucket instance can have a different underlying container that is + * responsible for selecting blocks - which means that whether the allocator + * serves memory blocks in best/first/next -fit manner is decided during bucket + * creation. 
+ */ + +#include "alloc_class.h" +#include "bucket.h" +#include "heap.h" +#include "memblock.h" +#include "out.h" +#include "sys_util.h" +#include "valgrind_internal.h" + +struct bucket { + /* this struct is both the lock guard and the locked state */ + struct bucket_locked *locked; + struct alloc_class *aclass; + struct block_container *container; + const struct block_container_ops *c_ops; + struct memory_block_reserved *active_memory_block; + struct mbrt *mb; + int is_active; +}; + +struct bucket_locked { + struct bucket bucket; + pthread_mutex_t lock; +}; + +/* + * bucket_init -- initializes the bucket's runtime state + */ +static int +bucket_init(struct bucket *b, struct block_container *c, + struct alloc_class *aclass) +{ + b->container = c; + b->c_ops = c->c_ops; + + b->is_active = 0; + b->active_memory_block = NULL; + if (aclass && aclass->type == CLASS_RUN) { + D_ALLOC_PTR(b->active_memory_block); + + if (b->active_memory_block == NULL) + return -1; + } + b->aclass = aclass; + + return 0; +} + +/* + * bucket_fini -- destroys the bucket's runtime state + */ +static void +bucket_fini(struct bucket *b) +{ + if (b->active_memory_block) + D_FREE(b->active_memory_block); + b->c_ops->destroy(b->container); +} + +/* + * bucket_locked_new -- creates a new locked bucket instance + */ +struct bucket_locked * +bucket_locked_new(struct block_container *c, struct alloc_class *aclass, struct mbrt *mb) +{ + ASSERTne(c, NULL); + + struct bucket_locked *b; + + D_ALLOC_PTR_NZ(b); + if (b == NULL) + return NULL; + + if (bucket_init(&b->bucket, c, aclass) != 0) + goto err_bucket_init; + + util_mutex_init(&b->lock); + b->bucket.locked = b; + b->bucket.mb = mb; + + return b; + +err_bucket_init: + D_FREE(b); + return NULL; +} + +/* + * bucket_locked_delete -- cleanups and deallocates locked bucket instance + */ +void +bucket_locked_delete(struct bucket_locked *b) +{ + bucket_fini(&b->bucket); + util_mutex_destroy(&b->lock); + D_FREE(b); +} + +/* + * bucket_acquire -- acquires a usable bucket struct + */ +struct bucket * +bucket_acquire(struct bucket_locked *b) +{ + util_mutex_lock(&b->lock); + return &b->bucket; +} + +/* + * bucket_release -- releases a bucket struct + */ +void +bucket_release(struct bucket *b) +{ + util_mutex_unlock(&b->locked->lock); +} + +/* + * bucket_try_insert_attached_block -- tries to return a previously allocated + * memory block back to the original bucket + */ +void +bucket_try_insert_attached_block(struct bucket *b, const struct memory_block *m) +{ + struct memory_block *active = &b->active_memory_block->m; + + if (b->is_active && + m->chunk_id == active->chunk_id && + m->zone_id == active->zone_id) { + bucket_insert_block(b, m); + } +} + +/* + * bucket_alloc_class -- returns the bucket's alloc class + */ +struct alloc_class * +bucket_alloc_class(struct bucket *b) +{ + return b->aclass; +} + +/* + * bucket_insert_block -- inserts a block into the bucket + */ +int +bucket_insert_block(struct bucket *b, const struct memory_block *m) +{ +#if VG_MEMCHECK_ENABLED || VG_HELGRIND_ENABLED || VG_DRD_ENABLED + if (On_memcheck || On_drd_or_hg) { + size_t size = m->m_ops->get_real_size(m); + void *data = m->m_ops->get_real_data(m); + + VALGRIND_DO_MAKE_MEM_NOACCESS(data, size); + VALGRIND_ANNOTATE_NEW_MEMORY(data, size); + } +#endif + return b->c_ops->insert(b->container, m); +} + +/* + * bucket_remove_block -- removes an exact block from the bucket + */ +int +bucket_remove_block(struct bucket *b, const struct memory_block *m) +{ + return b->c_ops->get_rm_exact(b->container, m); +} + +/* 
+ * bucket_alloc_block -- allocates a block from the bucket + */ +int +bucket_alloc_block(struct bucket *b, struct memory_block *m_out) +{ + return b->c_ops->get_rm_bestfit(b->container, m_out); +} + +/* + * bucket_memblock_insert_block -- (internal) bucket insert wrapper + * for callbacks + */ +static int +bucket_memblock_insert_block(const struct memory_block *m, void *b) +{ + return bucket_insert_block(b, m); +} + +/* + * bucket_attach_run - attaches a run to a bucket, making it active + */ +int +bucket_attach_run(struct bucket *b, const struct memory_block *m) +{ + pthread_mutex_t *lock = m->m_ops->get_lock(m); + + util_mutex_lock(lock); + + int ret = m->m_ops->iterate_free(m, bucket_memblock_insert_block, b); + + util_mutex_unlock(lock); + + if (ret == 0) { + b->active_memory_block->m = *m; + b->active_memory_block->bucket = b->locked; + b->is_active = 1; + util_fetch_and_add64(&b->active_memory_block->nresv, 1); + } else { + b->c_ops->rm_all(b->container); + } + return 0; +} + +/* + * bucket_detach_run - gets rid of the active block in the bucket + */ +int +bucket_detach_run(struct bucket *b, struct memory_block *m_out, int *empty) +{ + *empty = 0; + + struct memory_block_reserved **active = &b->active_memory_block; + + if (b->is_active) { + b->c_ops->rm_all(b->container); + if (util_fetch_and_sub64(&(*active)->nresv, 1) == 1) { + *m_out = (*active)->m; + *empty = 1; + + VALGRIND_ANNOTATE_HAPPENS_AFTER(&(*active)->nresv); + (*active)->m = MEMORY_BLOCK_NONE; + } else { + VALGRIND_ANNOTATE_HAPPENS_BEFORE(&(*active)->nresv); + *active = NULL; + } + b->is_active = 0; + } + + if (*active == NULL) { + D_ALLOC_PTR(*active); + if (*active == NULL) + return -1; + } + + return 0; +} + +/* + * bucket_active_block -- returns the bucket active block + */ +struct memory_block_reserved * +bucket_active_block(struct bucket *b) +{ + return b->is_active ? b->active_memory_block : NULL; +} + +struct mbrt * +bucket_get_mbrt(struct bucket *b) +{ + return b->mb; +} diff --git a/src/common/dav_v2/bucket.h b/src/common/dav_v2/bucket.h new file mode 100644 index 00000000000..af2d5be6410 --- /dev/null +++ b/src/common/dav_v2/bucket.h @@ -0,0 +1,47 @@ +/* SPDX-License-Identifier: BSD-3-Clause */ +/* Copyright 2015-2024, Intel Corporation */ + +/* + * bucket.h -- internal definitions for bucket + */ + +#ifndef __DAOS_COMMON_BUCKET_H +#define __DAOS_COMMON_BUCKET_H 1 + +#include +#include + +#include "alloc_class.h" +#include "container.h" +#include "memblock.h" + +#define CALC_SIZE_IDX(_unit_size, _size)\ + ((_size) == 0 ? 
0 : (uint32_t)((((_size)-1) / (_unit_size)) + 1)) + +struct bucket_locked; +struct bucket; + +struct bucket_locked * +bucket_locked_new(struct block_container *c, struct alloc_class *aclass, struct mbrt *mb); + +struct bucket *bucket_acquire(struct bucket_locked *b); +void bucket_release(struct bucket *b); + +struct alloc_class *bucket_alloc_class(struct bucket *b); +int bucket_insert_block(struct bucket *b, const struct memory_block *m); +void bucket_try_insert_attached_block(struct bucket *b, + const struct memory_block *m); +int bucket_remove_block(struct bucket *b, const struct memory_block *m); +int bucket_alloc_block(struct bucket *b, struct memory_block *m_out); + +int bucket_attach_run(struct bucket *b, const struct memory_block *m); +int bucket_detach_run(struct bucket *b, + struct memory_block *m_out, int *empty); + +struct memory_block_reserved *bucket_active_block(struct bucket *b); + +void bucket_locked_delete(struct bucket_locked *b); +struct mbrt * +bucket_get_mbrt(struct bucket *b); + +#endif /* __DAOS_COMMON_BUCKET_H */ diff --git a/src/common/dav_v2/container.h b/src/common/dav_v2/container.h new file mode 100644 index 00000000000..5d2c247e248 --- /dev/null +++ b/src/common/dav_v2/container.h @@ -0,0 +1,44 @@ +/* SPDX-License-Identifier: BSD-3-Clause */ +/* Copyright 2015-2023, Intel Corporation */ + +/* + * container.h -- internal definitions for block containers + */ + +#ifndef __DAOS_COMMON_CONTAINER_H +#define __DAOS_COMMON_CONTAINER_H 1 + +#include "memblock.h" + +struct block_container { + const struct block_container_ops *c_ops; + struct palloc_heap *heap; +}; + +struct block_container_ops { + /* inserts a new memory block into the container */ + int (*insert)(struct block_container *c, const struct memory_block *m); + + /* removes exact match memory block */ + int (*get_rm_exact)(struct block_container *c, + const struct memory_block *m); + + /* removes and returns the best-fit memory block for size */ + int (*get_rm_bestfit)(struct block_container *c, + struct memory_block *m); + + /* checks whether the container is empty */ + int (*is_empty)(struct block_container *c); + + /* removes all elements from the container */ + void (*rm_all)(struct block_container *c); + + /* deletes the container */ + void (*destroy)(struct block_container *c); +}; + +struct palloc_heap; +struct block_container *container_new_ravl(struct palloc_heap *heap); +struct block_container *container_new_seglists(struct palloc_heap *heap); + +#endif /* __DAOS_COMMON_CONTAINER_H */ diff --git a/src/common/dav_v2/container_ravl.c b/src/common/dav_v2/container_ravl.c new file mode 100644 index 00000000000..af542c3c744 --- /dev/null +++ b/src/common/dav_v2/container_ravl.c @@ -0,0 +1,199 @@ +/* SPDX-License-Identifier: BSD-3-Clause */ +/* Copyright 2018-2023, Intel Corporation */ + +/* + * container_ravl.c -- implementation of ravl-based block container + */ + +#include "container.h" +#include "ravl.h" +#include "out.h" +#include "sys_util.h" + +struct block_container_ravl { + struct block_container super; + struct memory_block m; + struct ravl *tree; +}; + +/* + * container_compare_memblocks -- (internal) compares two memory blocks + */ +static int +container_compare_memblocks(const void *lhs, const void *rhs) +{ + const struct memory_block *l = lhs; + const struct memory_block *r = rhs; + + int64_t diff = (int64_t)l->size_idx - (int64_t)r->size_idx; + + if (diff != 0) + return diff > 0 ? 1 : -1; + + diff = (int64_t)l->zone_id - (int64_t)r->zone_id; + if (diff != 0) + return diff > 0 ? 
1 : -1; + + diff = (int64_t)l->chunk_id - (int64_t)r->chunk_id; + if (diff != 0) + return diff > 0 ? 1 : -1; + + diff = (int64_t)l->block_off - (int64_t)r->block_off; + if (diff != 0) + return diff > 0 ? 1 : -1; + + return 0; +} + +/* + * container_ravl_insert_block -- (internal) inserts a new memory block + * into the container + */ +static int +container_ravl_insert_block(struct block_container *bc, + const struct memory_block *m) +{ + struct block_container_ravl *c = + (struct block_container_ravl *)bc; + + ASSERT(m->chunk_id < MAX_CHUNK); + ASSERT(m->zone_id < UINT32_MAX); + + c->m = *m; + + return ravl_emplace_copy(c->tree, m); +} + +/* + * container_ravl_get_rm_block_bestfit -- (internal) removes and returns the + * best-fit memory block for size + */ +static int +container_ravl_get_rm_block_bestfit(struct block_container *bc, + struct memory_block *m) +{ + struct block_container_ravl *c = + (struct block_container_ravl *)bc; + + struct ravl_node *n = ravl_find(c->tree, m, + RAVL_PREDICATE_GREATER_EQUAL); + + if (n == NULL) + return ENOMEM; + + struct memory_block *e = ravl_data(n); + *m = c->m; + m->zone_id = e->zone_id; + m->chunk_id = e->chunk_id; + m->size_idx = e->size_idx; + m->block_off = e->block_off; + /* Rest of the fields in e should not be accessed. */ + + ravl_remove(c->tree, n); + + return 0; +} + +/* + * container_ravl_get_rm_block_exact -- + * (internal) removes exact match memory block + */ +static int +container_ravl_get_rm_block_exact(struct block_container *bc, + const struct memory_block *m) +{ + struct block_container_ravl *c = + (struct block_container_ravl *)bc; + + struct ravl_node *n = ravl_find(c->tree, m, RAVL_PREDICATE_EQUAL); + + if (n == NULL) + return ENOMEM; + + ravl_remove(c->tree, n); + + return 0; +} + +/* + * container_ravl_is_empty -- (internal) checks whether the container is empty + */ +static int +container_ravl_is_empty(struct block_container *bc) +{ + struct block_container_ravl *c = + (struct block_container_ravl *)bc; + + return ravl_empty(c->tree); +} + +/* + * container_ravl_rm_all -- (internal) removes all elements from the tree + */ +static void +container_ravl_rm_all(struct block_container *bc) +{ + struct block_container_ravl *c = + (struct block_container_ravl *)bc; + + ravl_clear(c->tree); +} + +/* + * container_ravl_delete -- (internal) deletes the container + */ +static void +container_ravl_destroy(struct block_container *bc) +{ + struct block_container_ravl *c = + (struct block_container_ravl *)bc; + + ravl_delete(c->tree); + + D_FREE(bc); +} + +/* + * Tree-based block container used to provide best-fit functionality to the + * bucket. The time complexity for this particular container is O(k) where k is + * the length of the key. + * + * The get methods also guarantee that the block with lowest possible address + * that best matches the requirements is provided. 
+ */ +static const struct block_container_ops container_ravl_ops = { + .insert = container_ravl_insert_block, + .get_rm_exact = container_ravl_get_rm_block_exact, + .get_rm_bestfit = container_ravl_get_rm_block_bestfit, + .is_empty = container_ravl_is_empty, + .rm_all = container_ravl_rm_all, + .destroy = container_ravl_destroy, +}; + +/* + * container_new_ravl -- allocates and initializes a ravl container + */ +struct block_container * +container_new_ravl(struct palloc_heap *heap) +{ + struct block_container_ravl *bc; + + D_ALLOC_PTR_NZ(bc); + if (bc == NULL) + goto error_container_malloc; + + bc->super.heap = heap; + bc->super.c_ops = &container_ravl_ops; + bc->tree = + ravl_new_sized(container_compare_memblocks, offsetof(struct memory_block, m_ops)); + if (bc->tree == NULL) + goto error_ravl_new; + + return (struct block_container *)&bc->super; + +error_ravl_new: + D_FREE(bc); + +error_container_malloc: + return NULL; +} diff --git a/src/common/dav_v2/container_seglists.c b/src/common/dav_v2/container_seglists.c new file mode 100644 index 00000000000..3ec18df0b3f --- /dev/null +++ b/src/common/dav_v2/container_seglists.c @@ -0,0 +1,175 @@ +/* SPDX-License-Identifier: BSD-3-Clause */ +/* Copyright 2015-2024, Intel Corporation */ + +/* + * container_seglists.c -- implementation of segregated lists block container + * + * This container is constructed from N (up to 64) intrusive lists and a + * single 8 byte bitmap that stores the information whether a given list is + * empty or not. + */ + +#include "container.h" +#include "out.h" +#include "sys_util.h" +#include "util.h" +#include "valgrind_internal.h" +#include "vecq.h" + +#define SEGLIST_BLOCK_LISTS 64U + +struct block_container_seglists { + struct block_container super; + struct memory_block m; + + VECQ(, uint32_t) blocks[SEGLIST_BLOCK_LISTS]; + uint64_t nonempty_lists; +}; + +/* + * container_seglists_insert_block -- (internal) inserts a new memory block + * into the container + */ +static int +container_seglists_insert_block(struct block_container *bc, + const struct memory_block *m) +{ + ASSERT(m->chunk_id < MAX_CHUNK); + ASSERT(m->zone_id < UINT32_MAX); + ASSERTne(m->size_idx, 0); + + struct block_container_seglists *c = + (struct block_container_seglists *)bc; + + if (c->nonempty_lists == 0) + c->m = *m; + + ASSERT(m->size_idx <= SEGLIST_BLOCK_LISTS); + ASSERT(m->chunk_id == c->m.chunk_id); + ASSERT(m->zone_id == c->m.zone_id); + + if (VECQ_ENQUEUE(&c->blocks[m->size_idx - 1], m->block_off) != 0) + return -1; + + /* marks the list as nonempty */ + c->nonempty_lists |= 1ULL << (m->size_idx - 1); + + return 0; +} + +/* + * container_seglists_get_rm_block_bestfit -- (internal) removes and returns the + * best-fit memory block for size + */ +static int +container_seglists_get_rm_block_bestfit(struct block_container *bc, + struct memory_block *m) +{ + struct block_container_seglists *c = + (struct block_container_seglists *)bc; + + ASSERT(m->size_idx <= SEGLIST_BLOCK_LISTS); + uint32_t i = 0; + + /* applicable lists */ + uint64_t size_mask = (1ULL << (m->size_idx - 1)) - 1; + uint64_t v = c->nonempty_lists & ~size_mask; + + if (v == 0) + return ENOMEM; + + /* finds the list that serves the smallest applicable size */ + i = util_lssb_index64(v); + + uint32_t block_offset = VECQ_DEQUEUE(&c->blocks[i]); + + if (VECQ_SIZE(&c->blocks[i]) == 0) /* marks the list as empty */ + c->nonempty_lists &= ~(1ULL << (i)); + + *m = c->m; + m->block_off = block_offset; + m->size_idx = i + 1; + + return 0; +} + +/* + * container_seglists_is_empty -- 
(internal) checks whether the container is + * empty + */ +static int +container_seglists_is_empty(struct block_container *bc) +{ + struct block_container_seglists *c = + (struct block_container_seglists *)bc; + + return c->nonempty_lists == 0; +} + +/* + * container_seglists_rm_all -- (internal) removes all elements from the tree + */ +static void +container_seglists_rm_all(struct block_container *bc) +{ + struct block_container_seglists *c = + (struct block_container_seglists *)bc; + + for (unsigned i = 0; i < SEGLIST_BLOCK_LISTS; ++i) + VECQ_CLEAR(&c->blocks[i]); + + c->nonempty_lists = 0; +} + +/* + * container_seglists_delete -- (internal) deletes the container + */ +static void +container_seglists_destroy(struct block_container *bc) +{ + struct block_container_seglists *c = + (struct block_container_seglists *)bc; + + for (unsigned i = 0; i < SEGLIST_BLOCK_LISTS; ++i) + VECQ_DELETE(&c->blocks[i]); + + D_FREE(c); +} + +/* + * This container does not support retrieval of exact memory blocks, but other + * than provides best-fit in O(1) time for unit sizes that do not exceed 64. + */ +static const struct block_container_ops container_seglists_ops = { + .insert = container_seglists_insert_block, + .get_rm_exact = NULL, + .get_rm_bestfit = container_seglists_get_rm_block_bestfit, + .is_empty = container_seglists_is_empty, + .rm_all = container_seglists_rm_all, + .destroy = container_seglists_destroy, +}; + +/* + * container_new_seglists -- allocates and initializes a seglists container + */ +struct block_container * +container_new_seglists(struct palloc_heap *heap) +{ + struct block_container_seglists *bc; + + D_ALLOC_PTR_NZ(bc); + if (bc == NULL) + goto error_container_malloc; + + bc->super.heap = heap; + bc->super.c_ops = &container_seglists_ops; + + for (unsigned i = 0; i < SEGLIST_BLOCK_LISTS; ++i) + VECQ_INIT(&bc->blocks[i]); + bc->nonempty_lists = 0; + + return (struct block_container *)&bc->super; + +error_container_malloc: + return NULL; +} diff --git a/src/common/dav_v2/critnib.c b/src/common/dav_v2/critnib.c new file mode 100644 index 00000000000..304d568ca8e --- /dev/null +++ b/src/common/dav_v2/critnib.c @@ -0,0 +1,678 @@ +/* SPDX-License-Identifier: BSD-3-Clause */ +/* Copyright 2018-2023, Intel Corporation */ + +/* + * critnib.c -- implementation of critnib tree + * + * It offers identity lookup (like a hashmap) and <= lookup (like a search + * tree). Unlike some hashing algorithms (cuckoo hash, perfect hashing) the + * complexity isn't constant, but for data sizes we expect it's several + * times as fast as cuckoo, and has no "stop the world" cases that would + * cause latency (ie, better worst case behavior). + */ + +/* + * STRUCTURE DESCRIPTION + * + * Critnib is a hybrid between a radix tree and DJ Bernstein's critbit: + * it skips nodes for uninteresting radix nodes (ie, ones that would have + * exactly one child), this requires adding to every node a field that + * describes the slice (4-bit in our case) that this radix level is for. + * + * This implementation also stores each node's path (ie, bits that are + * common to every key in that subtree) -- this doesn't help with lookups + * at all (unused in == match, could be reconstructed at no cost in <= + * after first dive) but simplifies inserts and removes. If we ever want + * that piece of memory it's easy to trim it down. 
+ */ + +/* + * CONCURRENCY ISSUES + * + * Reads are completely lock-free sync-free, but only almost wait-free: + * if for some reason a read thread gets pathologically stalled, it will + * notice the data being stale and restart the work. In usual cases, + * the structure having been modified does _not_ cause a restart. + * + * Writes could be easily made lock-free as well (with only a cmpxchg + * sync), but this leads to problems with removes. A possible solution + * would be doing removes by overwriting by NULL w/o freeing -- yet this + * would lead to the structure growing without bounds. Complex per-node + * locks would increase concurrency but they slow down individual writes + * enough that in practice a simple global write lock works faster. + * + * Removes are the only operation that can break reads. The structure + * can do local RCU well -- the problem being knowing when it's safe to + * free. Any synchronization with reads would kill their speed, thus + * instead we have a remove count. The grace period is DELETED_LIFE, + * after which any read will notice staleness and restart its work. + */ +#include +#include + +#include "critnib.h" +#include "out.h" +#include "sys_util.h" +#include "valgrind_internal.h" +#include "util.h" + +/* + * A node that has been deleted is left untouched for this many delete + * cycles. Reads have guaranteed correctness if they took no longer than + * DELETED_LIFE concurrent deletes, otherwise they notice something is + * wrong and restart. The memory of deleted nodes is never freed to + * malloc nor their pointers lead anywhere wrong, thus a stale read will + * (temporarily) get a wrong answer but won't crash. + * + * There's no need to count writes as they never interfere with reads. + * + * Allowing stale reads (of arbitrarily old writes or of deletes less than + * DELETED_LIFE old) might sound counterintuitive, but it doesn't affect + * semantics in any way: the thread could have been stalled just after + * returning from our code. Thus, the guarantee is: the result of get() or + * find_le() is a value that was current at any point between the call + * start and end. + */ +#define DELETED_LIFE 16 + +#define SLICE 4 +#define NIB ((1ULL << SLICE) - 1) +#define SLNODES (1 << SLICE) + +typedef unsigned char sh_t; + +struct critnib_node { + /* + * path is the part of a tree that's already traversed (be it through + * explicit nodes or collapsed links) -- ie, any subtree below has all + * those bits set to this value. + * + * nib is a 4-bit slice that's an index into the node's children. + * + * shift is the length (in bits) of the part of the key below this node. 
+ * + * nib + * |XXXXXXXXXX|?|*****| + * path ^ + * +-----+ + * shift + */ + struct critnib_node *child[SLNODES]; + uint64_t path; + sh_t shift; +}; + +struct critnib_leaf { + uint64_t key; + void *value; +}; + +struct critnib { + struct critnib_node *root; + + /* pool of freed nodes: singly linked list, next at child[0] */ + struct critnib_node *deleted_node; + struct critnib_leaf *deleted_leaf; + + /* nodes removed but not yet eligible for reuse */ + struct critnib_node *pending_del_nodes[DELETED_LIFE]; + struct critnib_leaf *pending_del_leaves[DELETED_LIFE]; + + uint64_t remove_count; + + pthread_mutex_t mutex; /* writes/removes */ +}; + +/* + * atomic load + */ +static void +load(void *src, void *dst) +{ + util_atomic_load_explicit64((uint64_t *)src, (uint64_t *)dst, + memory_order_acquire); +} + +/* + * atomic store + */ +static void +store(void *dst, void *src) +{ + util_atomic_store_explicit64((uint64_t *)dst, (uint64_t)src, + memory_order_release); +} + +/* + * internal: is_leaf -- check tagged pointer for leafness + */ +static inline bool +is_leaf(struct critnib_node *n) +{ + return (uint64_t)n & 1; +} + +/* + * internal: to_leaf -- untag a leaf pointer + */ +static inline struct critnib_leaf * +to_leaf(struct critnib_node *n) +{ + return (void *)((uint64_t)n & ~1ULL); +} + +/* + * internal: path_mask -- return bit mask of a path above a subtree [shift] + * bits tall + */ +static inline uint64_t +path_mask(sh_t shift) +{ + return ~NIB << shift; +} + +/* + * internal: slice_index -- return index of child at the given nib + */ +static inline unsigned +slice_index(uint64_t key, sh_t shift) +{ + return (unsigned)((key >> shift) & NIB); +} + +/* + * critnib_new -- allocates a new critnib structure + */ +struct critnib * +critnib_new(void) +{ + struct critnib *c; + + D_ALLOC_PTR(c); + if (!c) + return NULL; + + util_mutex_init(&c->mutex); + + VALGRIND_HG_DRD_DISABLE_CHECKING(&c->root, sizeof(c->root)); + VALGRIND_HG_DRD_DISABLE_CHECKING(&c->remove_count, + sizeof(c->remove_count)); + + return c; +} + +/* + * internal: delete_node -- recursively free (to malloc) a subtree + */ +static void +delete_node(struct critnib_node *__restrict n) +{ + if (!is_leaf(n)) { + for (int i = 0; i < SLNODES; i++) { + if (n->child[i]) + delete_node(n->child[i]); + } + + D_FREE(n); + } else { + void *ptr; + + ptr = (void *)to_leaf(n); + D_FREE(ptr); + } +} + +/* + * critnib_delete -- destroy and free a critnib struct + */ +void +critnib_delete(struct critnib *c) +{ + if (c->root) + delete_node(c->root); + + util_mutex_destroy(&c->mutex); + + for (struct critnib_node *m = c->deleted_node; m; ) { + struct critnib_node *mm = m->child[0]; + + D_FREE(m); + m = mm; + } + + for (struct critnib_leaf *k = c->deleted_leaf; k; ) { + struct critnib_leaf *kk = k->value; + + D_FREE(k); + k = kk; + } + + for (int i = 0; i < DELETED_LIFE; i++) { + D_FREE(c->pending_del_nodes[i]); + D_FREE(c->pending_del_leaves[i]); + } + + D_FREE(c); +} + +/* + * internal: free_node -- free (to internal pool, not malloc) a node. + * + * We cannot free them to malloc as a stalled reader thread may still walk + * through such nodes; it will notice the result being bogus but only after + * completing the walk, thus we need to ensure any freed nodes still point + * to within the critnib structure. 
+ */ +static void +free_node(struct critnib *__restrict c, struct critnib_node *__restrict n) +{ + if (!n) + return; + + ASSERT(!is_leaf(n)); + n->child[0] = c->deleted_node; + c->deleted_node = n; +} + +/* + * internal: alloc_node -- allocate a node from our pool or from malloc + */ +static struct critnib_node * +alloc_node(struct critnib *__restrict c) +{ + if (!c->deleted_node) { + struct critnib_node *n; + + D_ALLOC_PTR_NZ(n); + if (n == NULL) + D_CRIT("Malloc!\n"); + + return n; + } + + struct critnib_node *n = c->deleted_node; + + c->deleted_node = n->child[0]; + VALGRIND_ANNOTATE_NEW_MEMORY(n, sizeof(*n)); + + return n; +} + +/* + * internal: free_leaf -- free (to internal pool, not malloc) a leaf. + * + * See free_node(). + */ +static void +free_leaf(struct critnib *__restrict c, struct critnib_leaf *__restrict k) +{ + if (!k) + return; + + k->value = c->deleted_leaf; + c->deleted_leaf = k; +} + +/* + * internal: alloc_leaf -- allocate a leaf from our pool or from malloc + */ +static struct critnib_leaf * +alloc_leaf(struct critnib *__restrict c) +{ + if (!c->deleted_leaf) { + struct critnib_leaf *k; + + D_ALLOC_PTR_NZ(k); + if (k == NULL) + D_CRIT("Malloc!\n"); + + return k; + } + + struct critnib_leaf *k = c->deleted_leaf; + + c->deleted_leaf = k->value; + VALGRIND_ANNOTATE_NEW_MEMORY(k, sizeof(*k)); + + return k; +} + +/* + * critnib_insert -- write a key:value pair to the critnib structure + * + * Returns: + * - 0 on success + * - EEXIST if such a key already exists + * - ENOMEM if we're out of memory + * + * Takes a global write lock but doesn't stall any readers. + */ +int +critnib_insert(struct critnib *c, uint64_t key, void *value) +{ + util_mutex_lock(&c->mutex); + + struct critnib_leaf *k = alloc_leaf(c); + + if (!k) { + util_mutex_unlock(&c->mutex); + + return ENOMEM; + } + + VALGRIND_HG_DRD_DISABLE_CHECKING(k, sizeof(struct critnib_leaf)); + + k->key = key; + k->value = value; + + struct critnib_node *kn = (void *)((uint64_t)k | 1); + + struct critnib_node *n = c->root; + + if (!n) { + c->root = kn; + + util_mutex_unlock(&c->mutex); + + return 0; + } + + struct critnib_node **parent = &c->root; + struct critnib_node *prev = c->root; + + while (n && !is_leaf(n) && (key & path_mask(n->shift)) == n->path) { + prev = n; + parent = &n->child[slice_index(key, n->shift)]; + n = *parent; + } + + if (!n) { + n = prev; + store(&n->child[slice_index(key, n->shift)], kn); + + util_mutex_unlock(&c->mutex); + + return 0; + } + + uint64_t path = is_leaf(n) ? to_leaf(n)->key : n->path; + /* Find where the path differs from our key. */ + uint64_t at = path ^ key; + + if (!at) { + ASSERT(is_leaf(n)); + free_leaf(c, to_leaf(kn)); + /* fail instead of replacing */ + + util_mutex_unlock(&c->mutex); + + return EEXIST; + } + + /* and convert that to an index. 
*/ + sh_t sh = util_mssb_index64(at) & (sh_t)~(SLICE - 1); + + struct critnib_node *m = alloc_node(c); + + if (!m) { + free_leaf(c, to_leaf(kn)); + + util_mutex_unlock(&c->mutex); + + return ENOMEM; + } + VALGRIND_HG_DRD_DISABLE_CHECKING(m, sizeof(struct critnib_node)); + + for (int i = 0; i < SLNODES; i++) + m->child[i] = NULL; + + m->child[slice_index(key, sh)] = kn; + m->child[slice_index(path, sh)] = n; + m->shift = sh; + m->path = key & path_mask(sh); + store(parent, m); + + util_mutex_unlock(&c->mutex); + + return 0; +} + +/* + * critnib_remove -- delete a key from the critnib structure, return its value + */ +void * +critnib_remove(struct critnib *c, uint64_t key) +{ + struct critnib_leaf *k; + void *value = NULL; + + util_mutex_lock(&c->mutex); + + struct critnib_node *n = c->root; + + if (!n) + goto not_found; + + uint64_t del = util_fetch_and_add64(&c->remove_count, 1) % DELETED_LIFE; + + free_node(c, c->pending_del_nodes[del]); + free_leaf(c, c->pending_del_leaves[del]); + c->pending_del_nodes[del] = NULL; + c->pending_del_leaves[del] = NULL; + + if (is_leaf(n)) { + k = to_leaf(n); + if (k->key == key) { + store(&c->root, NULL); + goto del_leaf; + } + + goto not_found; + } + /* + * n and k are a parent:child pair (after the first iteration); k is the + * leaf that holds the key we're deleting. + */ + struct critnib_node **k_parent = &c->root; + struct critnib_node **n_parent = &c->root; + struct critnib_node *kn = n; + + while (!is_leaf(kn)) { + n_parent = k_parent; + n = kn; + k_parent = &kn->child[slice_index(key, kn->shift)]; + kn = *k_parent; + + if (!kn) + goto not_found; + } + + k = to_leaf(kn); + if (k->key != key) + goto not_found; + + store(&n->child[slice_index(key, n->shift)], NULL); + + /* Remove the node if there's only one remaining child. */ + int ochild = -1; + + for (int i = 0; i < SLNODES; i++) { + if (n->child[i]) { + if (ochild != -1) + goto del_leaf; + + ochild = i; + } + } + + ASSERTne(ochild, -1); + + store(n_parent, n->child[ochild]); + c->pending_del_nodes[del] = n; + +del_leaf: + value = k->value; + c->pending_del_leaves[del] = k; + +not_found: + util_mutex_unlock(&c->mutex); + return value; +} + +/* + * critnib_get -- query for a key ("==" match), returns value or NULL + * + * Doesn't need a lock but if many deletes happened while our thread was + * somehow stalled the query is restarted (as freed nodes remain unused only + * for a grace period). + * + * Counterintuitively, it's pointless to return the most current answer, + * we need only one that was valid at any point after the call started. + */ +void * +critnib_get(struct critnib *c, uint64_t key) +{ + uint64_t wrs1, wrs2; + void *res; + + do { + struct critnib_node *n; + + load(&c->remove_count, &wrs1); + load(&c->root, &n); + + /* + * critbit algorithm: dive into the tree, looking at nothing but + * each node's critical bit^H^H^Hnibble. This means we risk + * going wrong way if our path is missing, but that's ok... + */ + while (n && !is_leaf(n)) + load(&n->child[slice_index(key, n->shift)], &n); + + /* ... as we check it at the end. */ + struct critnib_leaf *k = to_leaf(n); + + res = (n && k->key == key) ? 
k->value : NULL; + load(&c->remove_count, &wrs2); + } while (wrs1 + DELETED_LIFE <= wrs2); + + return res; +} + +/* + * internal: find_successor -- return the rightmost non-null node in a subtree + */ +static void * +find_successor(struct critnib_node *__restrict n) +{ + while (1) { + int nib; + + for (nib = NIB; nib >= 0; nib--) + if (n->child[nib]) + break; + + if (nib < 0) + return NULL; + + n = n->child[nib]; + if (is_leaf(n)) + return to_leaf(n)->value; + } +} + +/* + * internal: find_le -- recursively search <= in a subtree + */ +static void * +find_le(struct critnib_node *__restrict n, uint64_t key) +{ + if (!n) + return NULL; + + if (is_leaf(n)) { + struct critnib_leaf *k = to_leaf(n); + + return (k->key <= key) ? k->value : NULL; + } + + /* + * is our key outside the subtree we're in? + * + * If we're inside, all bits above the nib will be identical; note + * that shift points at the nib's lower rather than upper edge, so it + * needs to be masked away as well. + */ + if ((key ^ n->path) >> (n->shift) & ~NIB) { + /* + * subtree is too far to the left? + * -> its rightmost value is good + */ + if (n->path < key) + return find_successor(n); + + /* + * subtree is too far to the right? + * -> it has nothing of interest to us + */ + return NULL; + } + + unsigned nib = slice_index(key, n->shift); + + /* recursive call: follow the path */ + { + struct critnib_node *m; + + load(&n->child[nib], &m); + + void *value = find_le(m, key); + + if (value) + return value; + } + + /* + * nothing in that subtree? We strayed from the path at this point, + * thus need to search every subtree to our left in this node. No + * need to dive into any but the first non-null, though. + */ + for (; nib > 0; nib--) { + struct critnib_node *m; + + load(&n->child[nib - 1], &m); + if (m) { + n = m; + if (is_leaf(n)) + return to_leaf(n)->value; + + return find_successor(n); + } + } + + return NULL; +} + +/* + * critnib_find_le -- query for a key ("<=" match), returns value or NULL + * + * Same guarantees as critnib_get(). + */ +void * +critnib_find_le(struct critnib *c, uint64_t key) +{ + uint64_t wrs1, wrs2; + void *res; + + do { + load(&c->remove_count, &wrs1); + + struct critnib_node *n; /* avoid a subtle TOCTOU */ + + load(&c->root, &n); + res = n ? find_le(n, key) : NULL; + load(&c->remove_count, &wrs2); + } while (wrs1 + DELETED_LIFE <= wrs2); + + return res; +} diff --git a/src/common/dav_v2/critnib.h b/src/common/dav_v2/critnib.h new file mode 100644 index 00000000000..8e6d07f1c5d --- /dev/null +++ b/src/common/dav_v2/critnib.h @@ -0,0 +1,23 @@ +/* SPDX-License-Identifier: BSD-3-Clause */ +/* Copyright 2018-2023, Intel Corporation */ + +/* + * critnib.h -- internal definitions for critnib tree + */ + +#ifndef __DAOS_COMMON_CRITNIB_H +#define __DAOS_COMMON_CRITNIB_H 1 + +#include + +struct critnib; + +struct critnib *critnib_new(void); +void critnib_delete(struct critnib *c); + +int critnib_insert(struct critnib *c, uint64_t key, void *value); +void *critnib_remove(struct critnib *c, uint64_t key); +void *critnib_get(struct critnib *c, uint64_t key); +void *critnib_find_le(struct critnib *c, uint64_t key); + +#endif /* __DAOS_COMMON_CRITNIB_H */ diff --git a/src/common/dav_v2/dav_clogs.c b/src/common/dav_v2/dav_clogs.c new file mode 100644 index 00000000000..1603e14dd88 --- /dev/null +++ b/src/common/dav_v2/dav_clogs.c @@ -0,0 +1,104 @@ +/** + * (C) Copyright 2015-2023 Intel Corporation. 
+ *
+ * SPDX-License-Identifier: BSD-2-Clause-Patent
+ */
+
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include "dav_internal.h"
+#include "memops.h"
+#include "tx.h"
+
+static void
+clogs_extend_free(struct ulog *redo)
+{
+	D_FREE(redo);
+}
+
+static int
+clogs_extend_redo(struct ulog **redo, uint64_t gen_num)
+{
+	size_t size = SIZEOF_ALIGNED_ULOG(LANE_REDO_EXTERNAL_SIZE);
+
+	D_ALIGNED_ALLOC_NZ(*redo, CACHELINE_SIZE, size);
+	if (*redo == NULL)
+		return -1;
+
+	size_t capacity = ALIGN_DOWN(size - sizeof(struct ulog), CACHELINE_SIZE);
+
+	ulog_construct_new(*redo, capacity, gen_num, 0);
+	return 0;
+}
+
+static int
+clogs_extend_undo(struct ulog **undo, uint64_t gen_num)
+{
+	size_t size = TX_DEFAULT_RANGE_CACHE_SIZE;
+
+	D_ALIGNED_ALLOC_NZ(*undo, CACHELINE_SIZE, size);
+	if (*undo == NULL)
+		return -1;
+
+	size_t capacity = ALIGN_DOWN(size - sizeof(struct ulog), CACHELINE_SIZE);
+
+	ulog_construct_new(*undo, capacity, gen_num, 0);
+	return 0;
+}
+
+int
+dav_create_clogs(dav_obj_t *hdl)
+{
+
+	ulog_construct_new((struct ulog *)&hdl->clogs.external,
+			   LANE_REDO_EXTERNAL_SIZE, 0, 0);
+	ulog_construct_new((struct ulog *)&hdl->clogs.undo,
+			   LANE_UNDO_SIZE, 0, 0);
+
+	hdl->external = operation_new((struct ulog *)&hdl->clogs.external,
+				      LANE_REDO_EXTERNAL_SIZE, clogs_extend_redo, clogs_extend_free,
+				      &hdl->p_ops, LOG_TYPE_REDO);
+	if (hdl->external == NULL)
+		return -1;
+	hdl->undo = operation_new((struct ulog *)&hdl->clogs.undo,
+				  LANE_UNDO_SIZE, clogs_extend_undo, clogs_extend_free,
+				  &hdl->p_ops, LOG_TYPE_UNDO);
+	if (hdl->undo == NULL) {
+		operation_delete(hdl->external);
+		return -1;
+	}
+	return 0;
+}
+
+void
+dav_destroy_clogs(dav_obj_t *hdl)
+{
+	operation_free_logs(hdl->external);
+	operation_delete(hdl->external);
+	operation_free_logs(hdl->undo);
+	operation_delete(hdl->undo);
+}
+
+int
+dav_hold_clogs(dav_obj_t *hdl)
+{
+	if (hdl->nested_tx++ == 0) {
+		operation_init(hdl->external);
+		operation_init(hdl->undo);
+	}
+	return 0;
+}
+
+int
+dav_release_clogs(dav_obj_t *hdl)
+{
+	if (hdl->nested_tx == 0)
+		FATAL("release clogs");
+	--hdl->nested_tx;
+	return 0;
+}
diff --git a/src/common/dav_v2/dav_clogs.h b/src/common/dav_v2/dav_clogs.h
new file mode 100644
index 00000000000..b2565a949ac
--- /dev/null
+++ b/src/common/dav_v2/dav_clogs.h
@@ -0,0 +1,56 @@
+/* SPDX-License-Identifier: BSD-3-Clause */
+/* Copyright 2015-2023, Intel Corporation */
+
+/*
+ * dav_clogs.h -- combined redo/undo log (clogs) interfaces of the DAOS internal Allocator for VOS (DAV)
+ */
+
+#ifndef __DAOS_COMMON_DAV_CLOGS_H
+#define __DAOS_COMMON_DAV_CLOGS_H 1
+
+#include
+#include
+#include "ulog.h"
+
+#define LANE_TOTAL_SIZE (3072) /* 3 * 1024 (sum of 3 old lane sections) */
+/*
+ * We have 3 kilobytes to be split between the transactional redo
+ * and undo logs.
+ * Since by far the most space consuming operations are transactional
+ * snapshots, most of the space, 2304 bytes, is assigned to the undo log.
+ * After that, the remainder, 640 bytes, or 40 ulog entries, is left for the
+ * transactional redo logs.
+ * Thanks to this distribution, all small and medium transactions should be
+ * entirely performed without allocating any additional metadata.
+ *
+ * These values must be cacheline-size aligned to be used for ulogs. They are
+ * therefore parametrized, as the size of struct ulog changes between
+ * platforms.
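As a quick sanity check of the size split described above, the standalone sketch below recomputes the redo/undo budgets for the 64-byte struct ulog case mentioned in the comments; ALIGN_UP here is a local stand-in for the project macro, and the 64-byte cacheline/ulog sizes are assumptions taken from the "for 64B ulog" notes in the definitions that follow.

#include <assert.h>
#include <stdio.h>

#define ALIGN_UP(x, a)  (((x) + (a) - 1) & ~((a) - 1))  /* local stand-in */

#define TOY_CACHELINE   64u    /* assumed CACHELINE_SIZE */
#define TOY_ULOG_HDR    64u    /* assumed sizeof(struct ulog) */
#define TOY_LANE_TOTAL  3072u  /* LANE_TOTAL_SIZE */

int main(void)
{
	unsigned redo = ALIGN_UP(704u - TOY_ULOG_HDR, TOY_CACHELINE);  /* 640 */
	unsigned undo = TOY_LANE_TOTAL - redo - 2u * TOY_ULOG_HDR;     /* 2304 */

	/* matches the 640/2304 split quoted in the comment above */
	assert(redo == 640u && undo == 2304u);
	assert(redo % TOY_CACHELINE == 0 && undo % TOY_CACHELINE == 0);
	printf("redo=%u undo=%u accounted=%u\n",
	       redo, undo, redo + undo + 2u * TOY_ULOG_HDR);           /* 3072 */
	return 0;
}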
+ */ +#define LANE_UNDO_SIZE (LANE_TOTAL_SIZE \ + - LANE_REDO_EXTERNAL_SIZE \ + - 2 * sizeof(struct ulog)) /* 2304 for 64B ulog */ +#define LANE_REDO_EXTERNAL_SIZE ALIGN_UP(704 - sizeof(struct ulog), \ + CACHELINE_SIZE) /* 640 for 64B ulog */ + +struct dav_clogs { + /* + * Redo log for large operations/transactions. + * Can be extended by the use of internal ulog. + */ + struct ULOG(LANE_REDO_EXTERNAL_SIZE) external; + /* + * Undo log for snapshots done in a transaction. + * Can be extended/shrunk by the use of internal ulog. + */ + struct ULOG(LANE_UNDO_SIZE) undo; +}; + +typedef struct dav_obj dav_obj_t; + +int dav_create_clogs(dav_obj_t *hdl); +void dav_destroy_clogs(dav_obj_t *hdl); +int dav_hold_clogs(dav_obj_t *hdl); +int dav_release_clogs(dav_obj_t *hdl); + +#endif /* __DAOS_COMMON_DAV_CLOGS_H */ diff --git a/src/common/dav_v2/dav_iface.c b/src/common/dav_v2/dav_iface.c new file mode 100644 index 00000000000..ede29fafc56 --- /dev/null +++ b/src/common/dav_v2/dav_iface.c @@ -0,0 +1,480 @@ +/** + * (C) Copyright 2015-2024 Intel Corporation. + * + * SPDX-License-Identifier: BSD-2-Clause-Patent + */ + +#include +#include +#include +#include +#include +#include +#include + +#include +#include "dav_internal.h" +#include "heap.h" +#include "palloc.h" +#include "mo_wal.h" +#include "obj.h" +#include "tx.h" + +#define DAV_HEAP_INIT 0x1 +#define MEGABYTE ((uintptr_t)1 << 20) + +static bool +is_zone_evictable(void *arg, uint32_t zid) +{ + struct dav_obj *hdl = (struct dav_obj *)arg; + + return heap_mbrt_ismb_evictable(hdl->do_heap, zid); +} + +static int +dav_uc_callback(int evt_type, void *arg, uint32_t zid) +{ + struct dav_obj *hdl = (struct dav_obj *)arg; + struct zone *z = ZID_TO_ZONE(&hdl->do_heap->layout_info, zid); + + switch (evt_type) { + case UMEM_CACHE_EVENT_PGLOAD: + if (hdl->do_booted) { + VALGRIND_DO_CREATE_MEMPOOL(z, 0, 0); +#if VG_MEMCHECK_ENABLED + if (On_memcheck) + palloc_heap_vg_zone_open(hdl->do_heap, zid, 1); +#endif + D_ASSERT(z->header.flags & ZONE_EVICTABLE_MB); + heap_mbrt_setmb_usage(hdl->do_heap, zid, z->header.sp_usage); + } + break; + case UMEM_CACHE_EVENT_PGEVICT: + if (hdl->do_booted) { + VALGRIND_DO_DESTROY_MEMPOOL(z); + } + break; + default: + D_ERROR("Unknown umem cache event type in callback"); + } + return 0; +} + +static dav_obj_t * +dav_obj_open_internal(int fd, int flags, size_t scm_sz, const char *path, struct umem_store *store) +{ + dav_obj_t *hdl = NULL; + void *mmap_base; + int err = 0; + int rc; + struct heap_zone_limits hzl; + struct zone *z0; + + hzl = heap_get_zone_limits(store->stor_size, scm_sz, 100); + + if (hzl.nzones_heap == 0) { + ERR("Insufficient heap size."); + errno = EINVAL; + return NULL; + } + + if ((hzl.nzones_cache < 2) && (hzl.nzones_heap > hzl.nzones_cache)) { + ERR("Insufficient scm size."); + errno = EINVAL; + return NULL; + } + + if (hzl.nzones_cache * ZONE_MAX_SIZE != scm_sz) + D_WARN("scm size %lu is not aligned to zone size %lu, some scm will be unused", + scm_sz, ZONE_MAX_SIZE); + + if (hzl.nzones_heap < hzl.nzones_cache) + D_WARN("scm size %lu exceeds metablob size %lu, some scm will be unused", scm_sz, + store->stor_size); + + mmap_base = mmap(NULL, scm_sz, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); + if (mmap_base == MAP_FAILED) + return NULL; + + D_ALIGNED_ALLOC(hdl, CACHELINE_SIZE, sizeof(dav_obj_t)); + if (hdl == NULL) { + err = ENOMEM; + goto out0; + } + + hdl->do_fd = fd; + hdl->do_base = mmap_base; + hdl->do_size_mem = scm_sz; + hdl->do_size_mem_usable = hzl.nzones_cache * ZONE_MAX_SIZE; + hdl->do_size_meta = 
store->stor_size; + hdl->p_ops.base = hdl; + hdl->do_store = store; + hdl->p_ops.umem_store = store; + + if (hdl->do_store->stor_priv == NULL) { + D_ERROR("Missing backing store for the heap"); + err = EINVAL; + goto out1; + } + + if (flags & DAV_HEAP_INIT) { + rc = heap_init(mmap_base, scm_sz, store); + if (rc) { + err = errno; + goto out1; + } + } + + D_STRNDUP(hdl->do_path, path, strlen(path)); + D_ALLOC_PTR(hdl->do_heap); + if (hdl->do_heap == NULL) { + err = ENOMEM; + goto out2; + } + + hdl->do_stats = stats_new(hdl); + if (hdl->do_stats == NULL) + goto out2; + + rc = heap_boot(hdl->do_heap, hdl->do_base, hdl->do_store->stor_size, scm_sz, &hdl->p_ops, + hdl->do_stats); + if (rc) { + err = rc; + goto out2; + } + + heap_set_root_ptrs(hdl->do_heap, &hdl->do_root_offsetp, &hdl->do_root_sizep); + heap_set_stats_ptr(hdl->do_heap, &hdl->do_stats->persistent); + + rc = umem_cache_alloc(store, ZONE_MAX_SIZE, hzl.nzones_heap, hzl.nzones_cache, + heap_get_max_nemb(hdl->do_heap), 4096, mmap_base, is_zone_evictable, + dav_uc_callback, hdl); + if (rc != 0) { + D_ERROR("Could not allocate page cache: rc=" DF_RC "\n", DP_RC(rc)); + err = daos_der2errno(rc); + goto out3; + } + + if (!(flags & DAV_HEAP_INIT)) { + rc = heap_zone_load(hdl->do_heap, 0); + if (rc) { + err = rc; + goto out4; + } + D_ASSERT(store != NULL); + rc = hdl->do_store->stor_ops->so_wal_replay(hdl->do_store, dav_wal_replay_cb, hdl); + if (rc) { + err = daos_der2errno(rc); + goto out4; + } + } + + rc = dav_create_clogs(hdl); + if (rc) { + err = rc; + goto out4; + } + + rc = lw_tx_begin(hdl); + if (rc) { + D_ERROR("lw_tx_begin failed with err %d\n", rc); + err = ENOMEM; + goto out5; + } + rc = heap_ensure_zone0_initialized(hdl->do_heap); + if (rc) { + lw_tx_end(hdl, NULL); + D_ERROR("Failed to initialize zone0, rc = %d", daos_errno2der(rc)); + goto out5; + } + lw_tx_end(hdl, NULL); + + z0 = ZID_TO_ZONE(&hdl->do_heap->layout_info, 0); + if (z0->header.zone0_zinfo_off) { + D_ASSERT(z0->header.zone0_zinfo_size); + D_ASSERT(OFFSET_TO_ZID(z0->header.zone0_zinfo_off) == 0); + + rc = heap_update_mbrt_zinfo(hdl->do_heap, false); + if (rc) { + D_ERROR("Failed to update mbrt with zinfo errno = %d", rc); + err = rc; + goto out5; + } + + rc = heap_load_nonevictable_zones(hdl->do_heap); + if (rc) { + D_ERROR("Failed to load required zones during boot, errno= %d", rc); + err = rc; + goto out5; + } + } else { + D_ASSERT(z0->header.zone0_zinfo_size == 0); + rc = lw_tx_begin(hdl); + if (rc) { + D_ERROR("lw_tx_begin failed with err %d\n", rc); + err = ENOMEM; + goto out5; + } + rc = obj_realloc(hdl, &z0->header.zone0_zinfo_off, &z0->header.zone0_zinfo_size, + heap_zinfo_get_size(hzl.nzones_heap)); + if (rc != 0) { + lw_tx_end(hdl, NULL); + D_ERROR("Failed to setup zinfo"); + goto out5; + } + rc = heap_update_mbrt_zinfo(hdl->do_heap, true); + if (rc) { + D_ERROR("Failed to update mbrt with zinfo errno = %d", rc); + err = rc; + goto out5; + } + lw_tx_end(hdl, NULL); + } + umem_cache_post_replay(hdl->do_store); + +#if VG_MEMCHECK_ENABLED + if (On_memcheck) + palloc_heap_vg_open(hdl->do_heap, 1); +#endif + + hdl->do_booted = 1; + + return hdl; +out5: + dav_destroy_clogs(hdl); +out4: + umem_cache_free(hdl->do_store); +out3: + heap_cleanup(hdl->do_heap); +out2: + if (hdl->do_stats) + stats_delete(hdl, hdl->do_stats); + if (hdl->do_heap) + D_FREE(hdl->do_heap); + if (hdl->do_utx) { + dav_umem_wtx_cleanup(hdl->do_utx); + D_FREE(hdl->do_utx); + } + D_FREE(hdl->do_path); +out1: + D_FREE(hdl); +out0: + munmap(mmap_base, scm_sz); + errno = err; + return NULL; + +} 
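dav_obj_open_internal() above unwinds partially acquired resources through its out0..out5 labels. The sketch below shows the same staged goto-unwind idiom in isolation; the resources are plain malloc() stand-ins, not the real mmap/heap/cache objects.

#include <errno.h>
#include <stdlib.h>

struct toy_ctx { void *mapping; void *heap; void *cache; };

static int
toy_ctx_open(struct toy_ctx *c)
{
	int err;

	c->mapping = malloc(64);          /* stands in for the mmap() step     */
	if (c->mapping == NULL)
		return ENOMEM;

	c->heap = malloc(64);             /* stands in for heap_boot()         */
	if (c->heap == NULL) {
		err = ENOMEM;
		goto out_mapping;
	}

	c->cache = malloc(64);            /* stands in for umem_cache_alloc()  */
	if (c->cache == NULL) {
		err = ENOMEM;
		goto out_heap;
	}
	return 0;                         /* fully constructed                 */

out_heap:                                 /* release in reverse acquisition order */
	free(c->heap);
out_mapping:
	free(c->mapping);
	return err;
}

A failure at any step releases exactly the resources acquired before it, which is why the labels in dav_obj_open_internal() are ordered as the mirror image of the setup sequence.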
+ +DAV_FUNC_EXPORT dav_obj_t * +dav_obj_create_v2(const char *path, int flags, size_t sz, mode_t mode, struct umem_store *store) +{ + int fd; + dav_obj_t *hdl; + struct stat statbuf; + int create = 0; + + SUPPRESS_UNUSED(flags); + + if (sz == 0) { + /* Open the file and obtain the size */ + fd = open(path, O_RDWR|O_CLOEXEC); + if (fd == -1) { + DS_ERROR(errno, "obj_create_v2 open %s to fetch size", path); + return NULL; + } + + if (fstat(fd, &statbuf) != 0) + goto out; + sz = statbuf.st_size; + } else { + fd = open(path, O_CREAT|O_EXCL|O_RDWR|O_CLOEXEC, mode); + if (fd == -1) { + DS_ERROR(errno, "obj_create_v2 open %s to alloc", path); + return NULL; + } + + if (fallocate(fd, 0, 0, (off_t)sz) == -1) { + errno = ENOSPC; + goto out; + } + create = 1; + } + + hdl = dav_obj_open_internal(fd, DAV_HEAP_INIT, sz, path, store); + if (hdl == NULL) + goto out; + + DAV_DBG("pool %s created, size="DF_U64"", hdl->do_path, sz); + return hdl; + +out: + close(fd); + if (create) + unlink(path); + return NULL; +} + +DAV_FUNC_EXPORT dav_obj_t * +dav_obj_open_v2(const char *path, int flags, struct umem_store *store) +{ + size_t size; + int fd; + dav_obj_t *hdl; + struct stat statbuf; + + SUPPRESS_UNUSED(flags); + + fd = open(path, O_RDWR|O_CLOEXEC); + if (fd == -1) { + DS_ERROR(errno, "obj_create_v2 open %s", path); + return NULL; + } + + if (fstat(fd, &statbuf) != 0) { + close(fd); + return NULL; + } + size = (size_t)statbuf.st_size; + + hdl = dav_obj_open_internal(fd, 0, size, path, store); + if (hdl == NULL) { + close(fd); + return NULL; + } + DAV_DBG("pool %s is open, size="DF_U64"", hdl->do_path, size); + return hdl; +} + +DAV_FUNC_EXPORT void +dav_obj_close_v2(dav_obj_t *hdl) +{ + + if (hdl == NULL) { + ERR("NULL handle"); + return; + } + dav_destroy_clogs(hdl); + heap_cleanup(hdl->do_heap); + D_FREE(hdl->do_heap); + + stats_delete(hdl, hdl->do_stats); + + munmap(hdl->do_base, hdl->do_size_mem); + close(hdl->do_fd); + if (hdl->do_utx) { + dav_umem_wtx_cleanup(hdl->do_utx); + D_FREE(hdl->do_utx); + } + umem_cache_free(hdl->do_store); + DAV_DBG("pool %s is closed", hdl->do_path); + D_FREE(hdl->do_path); + D_FREE(hdl); +} + +DAV_FUNC_EXPORT void * +dav_get_base_ptr_v2(dav_obj_t *hdl) +{ + return hdl->do_heap->layout_info.zone0; +} + +DAV_FUNC_EXPORT int +dav_class_register_v2(dav_obj_t *pop, struct dav_alloc_class_desc *p) +{ + uint8_t id = (uint8_t)p->class_id; + struct alloc_class_collection *ac = heap_alloc_classes(pop->do_heap); + enum header_type lib_htype = MAX_HEADER_TYPES; + size_t runsize_bytes; + uint32_t size_idx; + struct alloc_class *c; + + if (p->unit_size <= 0 || p->unit_size > DAV_MAX_ALLOC_SIZE || + p->units_per_block <= 0) { + errno = EINVAL; + return -1; + } + + if (p->alignment != 0 && p->unit_size % p->alignment != 0) { + ERR("unit size must be evenly divisible by alignment"); + errno = EINVAL; + return -1; + } + + if (p->alignment > (MEGABYTE * 2)) { + ERR("alignment cannot be larger than 2 megabytes"); + errno = EINVAL; + return -1; + } + + if (p->class_id >= MAX_ALLOCATION_CLASSES) { + ERR("class id outside of the allowed range"); + errno = ERANGE; + return -1; + } + + switch (p->header_type) { + case DAV_HEADER_LEGACY: + lib_htype = HEADER_LEGACY; + break; + case DAV_HEADER_COMPACT: + lib_htype = HEADER_COMPACT; + break; + case DAV_HEADER_NONE: + lib_htype = HEADER_NONE; + break; + case MAX_DAV_HEADER_TYPES: + default: + ERR("invalid header type"); + errno = EINVAL; + return -1; + } + + if (id == 0) { + if (alloc_class_find_first_free_slot(ac, &id) != 0) { + ERR("no available free 
allocation class identifier"); + errno = EINVAL; + return -1; + } + } else { + if (alloc_class_reserve(ac, id) != 0) { + ERR("attempted to overwrite an allocation class"); + errno = EEXIST; + return -1; + } + } + + runsize_bytes = CHUNKSIZE; + while (((p->units_per_block * p->unit_size) + RUN_BASE_METADATA_SIZE) > runsize_bytes) + runsize_bytes += CHUNKSIZE; + + /* aligning the buffer might require up-to to 'alignment' bytes */ + if (p->alignment != 0) + runsize_bytes += p->alignment; + + size_idx = (uint32_t)(runsize_bytes / CHUNKSIZE); + + if (size_idx > MAX_CHUNK) + size_idx = MAX_CHUNK; + + c = alloc_class_new(id, heap_alloc_classes(pop->do_heap), CLASS_RUN, lib_htype, + p->unit_size, p->alignment, size_idx); + if (c == NULL) { + errno = EINVAL; + return -1; + } + + if (heap_create_alloc_class_buckets(pop->do_heap, c) != 0) { + alloc_class_delete(ac, c); + return -1; + } + + p->class_id = c->id; + p->units_per_block = c->rdsc.nallocs; + + return 0; +} + +DAV_FUNC_EXPORT size_t +dav_obj_pgsz_v2() +{ + return ZONE_MAX_SIZE; +} diff --git a/src/common/dav_v2/dav_internal.h b/src/common/dav_v2/dav_internal.h new file mode 100644 index 00000000000..bc13e2eabc3 --- /dev/null +++ b/src/common/dav_v2/dav_internal.h @@ -0,0 +1,74 @@ +/* SPDX-License-Identifier: BSD-3-Clause */ +/* Copyright 2015-2024, Intel Corporation */ + +/* + * dav_flags.h -- Interfaces exported by DAOS internal Allocator for VOS (DAV) + */ + +#ifndef __DAOS_COMMON_DAV_INTERNAL_H +#define __DAOS_COMMON_DAV_INTERNAL_H 1 + +#include "dav_v2.h" +#include "dav_clogs.h" +#include "heap.h" +#include "mo_wal.h" +#include "wal_tx.h" + +#define DAV_FUNC_EXPORT __attribute__ ((visibility ("default"))) + +#define DAV_MAX_ALLOC_SIZE ((size_t)0x3FFDFFFC0) + +enum dav_tx_failure_behavior { + DAV_TX_FAILURE_ABORT, + DAV_TX_FAILURE_RETURN, +}; + +enum dav_stats_enabled { + DAV_STATS_ENABLED_TRANSIENT, + DAV_STATS_ENABLED_BOTH, + DAV_STATS_ENABLED_PERSISTENT, + DAV_STATS_DISABLED, +}; + +#define DAV_PHDR_SIZE 4096 + +/* DAV object handle */ +typedef struct dav_obj { + char *do_path; + uint64_t do_size_meta; + uint64_t do_size_mem; + uint64_t do_size_mem_usable; + void *do_base; + uint64_t *do_root_offsetp; + uint64_t *do_root_sizep; + struct palloc_heap *do_heap; + struct operation_context *external; + struct operation_context *undo; + struct mo_ops p_ops; /* REVISIT */ + struct stats *do_stats; + int do_fd; + int nested_tx; + struct umem_wal_tx *do_utx; + struct umem_store *do_store; + int do_booted; + + struct dav_clogs clogs __attribute__ ((__aligned__(CACHELINE_SIZE))); +} dav_obj_t; + +static inline +struct dav_tx *utx2wtx(struct umem_wal_tx *utx) +{ + return (struct dav_tx *)&utx->utx_private; +} + +static inline +struct umem_wal_tx *wtx2utx(struct dav_tx *wtx) +{ + return (struct umem_wal_tx *)((void *)wtx + - (ptrdiff_t)offsetof(struct umem_wal_tx, utx_private)); +} + +int lw_tx_begin(dav_obj_t *pop); +int lw_tx_end(dav_obj_t *pop, void *data); + +#endif /* __DAOS_COMMON_DAV_INTERNAL_H */ diff --git a/src/common/dav_v2/dav_v2.h b/src/common/dav_v2/dav_v2.h new file mode 100644 index 00000000000..6147d33ba4e --- /dev/null +++ b/src/common/dav_v2/dav_v2.h @@ -0,0 +1,322 @@ +/* SPDX-License-Identifier: BSD-3-Clause */ +/* Copyright 2015-2024, Intel Corporation */ + +/* + * dav_flags.h -- Interfaces exported by DAOS internal Allocator for VOS (DAV) + */ + +#ifndef __DAOS_COMMON_DAV_V2_H +#define __DAOS_COMMON_DAV_V2_H 1 + +#include +#include +#include +#include +#include "../dav/dav.h" + +typedef struct dav_obj dav_obj_t; +struct 
umem_store; + +/** + * Create and initialize a DAV object and return its handle. + * + * \param[in] path Path of the vos file. + * + * \param[in] flags additional flags (Future). + * + * \param[in] sz size of the file/heap. + * + * \param[in] mode permission to use while creating the file. + * + * \param[in] store backing umem store. + * + * \return Returns the pointer to the object handle. Upon failure, + * it returns NULL with errno set appropriately. + */ +dav_obj_t * +dav_obj_create_v2(const char *path, int flags, size_t sz, mode_t mode, struct umem_store *store); + +/** + * Open and initialize a DAV object and return its handle. + * + * \param[in] path Path of the vos file. + * + * \param[in] flags additional flags (Future). + * + * \param[in] store backing umem store. + * + * \return Returns the pointer to the object handle. Upon failure, + * it returns NULL with errno set appropriately. + */ +dav_obj_t * +dav_obj_open_v2(const char *path, int flags, struct umem_store *store); + +/** + * Close the DAV object + * + * \param[in] hdl DAV handle + */ +void +dav_obj_close_v2(dav_obj_t *hdl); + +/** + * Return the pointer to the base of the heap. + * + * \param[in] hdl DAV handle + * + * \return Returns the pointer to the base of the heap pointed to + * by hdl. + */ +void * +dav_get_base_ptr_v2(dav_obj_t *hdl); + +typedef int (*dav_constr)(dav_obj_t *pop, void *ptr, void *arg); + +/* + * Allocates a new object from the pool and calls a constructor function before + * returning. It is guaranteed that allocated object is either properly + * initialized, or if it's interrupted before the constructor completes, the + * memory reserved for the object is automatically reclaimed. + */ +int +dav_alloc_v2(dav_obj_t *pop, uint64_t *offp, size_t size, uint64_t type_num, uint64_t flags, + dav_constr constructor, void *arg); + +/** + * Frees the memory at specified offset within the DAV object pointed to by hdl. + * + * \param[in] hdl DAV handle. + * + * \param[in] off offset to the memory location. off should correspond + * to the offset returned by previous call to dav_malloc(). + */ +void +dav_free_v2(dav_obj_t *pop, uint64_t off); + +/* + * DAV version of memcpy. Data copied is made persistent in blob. + */ +void * +dav_memcpy_persist_v2(dav_obj_t *pop, void *dest, const void *src, size_t len); + +/* + * If called for the first time on a newly created dav heap, the root object + * of given size is allocated. Otherwise, it returns the existing root object. + * In such case, the size must be not less than the actual root object size + * stored in the pool. If it's larger, the root object is automatically + * resized. + * + * This function is currently *not* thread-safe. + */ +uint64_t +dav_root_v2(dav_obj_t *pop, size_t size); + +/* + * Starts a new transaction in the current thread. + * If called within an open transaction, starts a nested transaction. + * + * If successful, transaction stage changes to TX_STAGE_WORK and function + * returns zero. Otherwise, stage changes to TX_STAGE_ONABORT and an error + * number is returned. + */ +int +dav_tx_begin_v2(dav_obj_t *pop, jmp_buf env, ...); + +/* + * Aborts current transaction + * + * Causes transition to TX_STAGE_ONABORT. + * + * This function must be called during TX_STAGE_WORK. + */ +void +dav_tx_abort_v2(int errnum); + +/* + * Commits current transaction + * + * This function must be called during TX_STAGE_WORK. + */ +void +dav_tx_commit_v2(void); + +/* + * Cleanups current transaction. 
Must always be called after dav_tx_begin, + * even if starting the transaction failed. + * + * If called during TX_STAGE_NONE, has no effect. + * + * Always causes transition to TX_STAGE_NONE. + * + * If transaction was successful, returns 0. Otherwise returns error code set + * by dav_tx_abort. + * + * This function must *not* be called during TX_STAGE_WORK. + */ +int +dav_tx_end_v2(void *data); + +/* + * Returns the current stage of the transaction. + */ +enum dav_tx_stage +dav_tx_stage_v2(void); + +/* + * Returns last transaction error code. + */ +int +dav_tx_errno_v2(void); + +/* + * Transactionally allocates a new object. + * + * If successful, returns offset of the object in the heap. + * Otherwise, stage changes to TX_STAGE_ONABORT and an zero is returned. + * 'Flags' is a bitmask of the following values: + * - POBJ_XALLOC_ZERO - zero the allocated object + * - POBJ_XALLOC_NO_FLUSH - skip flush on commit + * - POBJ_XALLOC_NO_ABORT - if the function does not end successfully, + * - DAV_CLASS_ID(id) - id of allocation class to use. + * - DAV_EZONE_ID(id) - id of zone to use. + * do not abort the transaction and return the error number. + * + * This function must be called during TX_STAGE_WORK. + */ +uint64_t +dav_tx_alloc_v2(size_t size, uint64_t type_num, uint64_t flags); + +/* + * Transactionally frees an existing object. + * + * If successful, returns zero. + * Otherwise, stage changes to TX_STAGE_ONABORT and an error number is returned. + * + * This function must be called during TX_STAGE_WORK. + */ +int +dav_tx_free_v2(uint64_t off); + +/* + * Takes a "snapshot" of the memory block of given size and located at given + * offset 'off' in the object 'oid' and saves it in the undo log. + * The application is then free to directly modify the object in that memory + * range. In case of failure or abort, all the changes within this range will + * be rolled-back automatically. + * + * If successful, returns zero. + * Otherwise, stage changes to TX_STAGE_ONABORT and an error number is returned. + * + * This function must be called during TX_STAGE_WORK. + */ +int +dav_tx_add_range_v2(uint64_t off, size_t size); + +/* + * Takes a "snapshot" of the given memory region and saves it in the undo log. + * The application is then free to directly modify the object in that memory + * range. In case of failure or abort, all the changes within this range will + * be rolled-back automatically. The supplied block of memory has to be within + * the given pool. + * + * If successful, returns zero. + * Otherwise, stage changes to TX_STAGE_ONABORT and an error number is returned. + * + * This function must be called during TX_STAGE_WORK. + */ +int +dav_tx_add_range_direct_v2(const void *ptr, size_t size); + +/* + * Behaves exactly the same as dav_tx_add_range when 'flags' equals 0. + * 'Flags' is a bitmask of the following values: + * - POBJ_XADD_NO_FLUSH - skips flush on commit + * - POBJ_XADD_NO_SNAPSHOT - added range will not be snapshotted + * - POBJ_XADD_ASSUME_INITIALIZED - added range is assumed to be initialized + * - POBJ_XADD_NO_ABORT - if the function does not end successfully, + * do not abort the transaction and return the error number. + */ +int +dav_tx_xadd_range_v2(uint64_t off, size_t size, uint64_t flags); + +/* + * Behaves exactly the same as dav_tx_add_range_direct when 'flags' equals + * 0. 
'Flags' is a bitmask of the following values: + * - POBJ_XADD_NO_FLUSH - skips flush on commit + * - POBJ_XADD_NO_SNAPSHOT - added range will not be snapshotted + * - POBJ_XADD_ASSUME_INITIALIZED - added range is assumed to be initialized + * - POBJ_XADD_NO_ABORT - if the function does not end successfully, + * do not abort the transaction and return the error number. + */ +int +dav_tx_xadd_range_direct_v2(const void *ptr, size_t size, uint64_t flags); + +#define DAV_ACTION_XRESERVE_VALID_FLAGS \ + (DAV_XALLOC_CLASS_MASK | DAV_XALLOC_EZONE_MASK | DAV_XALLOC_ZERO) + +struct dav_action; +uint64_t +dav_reserve_v2(dav_obj_t *pop, struct dav_action *act, size_t size, uint64_t type_num, + uint64_t flags); +void +dav_defer_free_v2(dav_obj_t *pop, uint64_t off, struct dav_action *act); +void +dav_cancel_v2(dav_obj_t *pop, struct dav_action *actv, size_t actvcnt); +int +dav_tx_publish_v2(struct dav_action *actv, size_t actvcnt); + +struct dav_alloc_class_desc; +/* + * Registers an allocation class handle with the DAV object. + */ +int +dav_class_register_v2(dav_obj_t *pop, struct dav_alloc_class_desc *p); + +struct dav_heap_stats; +/* + * Returns the heap allocation statistics associated with the + * DAV object. + */ +int +dav_get_heap_stats_v2(dav_obj_t *pop, struct dav_heap_stats *st); + +struct dav_heap_mb_stats { + uint64_t dhms_allocated; + uint64_t dhms_maxsz; +}; + +/** + * Returns the usage statistics of a memory bucket. Note that usage + * stats for evictable MBs will be approximate values if they are not + * yet loaded on to the umem cache. + * + * \param[in] pop pool handle + * \param[in] mb_id memory bucket id + * \param[out] st mb stats + * + * \return 0, success + * < 0, error and errno is set to appropriate value. + */ +int +dav_get_heap_mb_stats_v2(dav_obj_t *pop, uint32_t mb_id, struct dav_heap_mb_stats *st); + +/** + * Allot an evictable memory bucket for tasks like new object creation + * + * \param[in] pop pool handle + * \param[in] flags zone selection criteria. + * + * \return id > 0, mbid of evictable memory bucket. + * id = 0, no evictable memory bucket is available + * use non-evictable memory bucket. + */ +uint32_t +dav_allot_mb_evictable_v2(dav_obj_t *pop, int flags); + +/* + * Return the page size for dav_v2. 
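A hedged usage sketch of the reserve/publish/cancel action API declared earlier in this header (not part of the patch): 'pop' is assumed to come from dav_obj_open_v2(), the type number and flags value are illustrative, and a zero offset is assumed to signal a failed reservation, as documented for dav_tx_alloc_v2.

#include <stdint.h>

#include "dav_v2.h"

static uint64_t
try_reserve_4k(dav_obj_t *pop)
{
	struct dav_action act;    /* full definition assumed to come via dav.h */
	uint64_t          off;

	/* a zero offset is assumed to mean failure, as for dav_tx_alloc_v2() */
	off = dav_reserve_v2(pop, &act, 4096, 0 /* type_num */, 0 /* flags */);
	if (off == 0)
		return 0;

	/*
	 * The reservation is not yet persistent; a real caller would publish
	 * it from inside an open transaction with dav_tx_publish_v2(&act, 1).
	 * This sketch simply backs out.
	 */
	dav_cancel_v2(pop, &act, 1);
	return off;
}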
+ */ +size_t +dav_obj_pgsz_v2(); + +#endif /* __DAOS_COMMON_DAV_V2_H */ diff --git a/src/common/dav_v2/heap.c b/src/common/dav_v2/heap.c new file mode 100644 index 00000000000..d730fed7bc4 --- /dev/null +++ b/src/common/dav_v2/heap.c @@ -0,0 +1,2195 @@ +/* SPDX-License-Identifier: BSD-3-Clause */ +/* Copyright 2015-2024, Intel Corporation */ + +/* + * heap.c -- heap implementation + */ + +#include +#include +#include +#include +#include + +#include "bucket.h" +#include "dav_internal.h" +#include "memblock.h" +#include "queue.h" +#include "heap.h" +#include "out.h" +#include "util.h" +#include "sys_util.h" +#include "valgrind_internal.h" +#include "recycler.h" +#include "container.h" +#include "alloc_class.h" +#include "meta_io.h" + +#define HEAP_NEMB_PCT_DEFAULT 80 + +static void +heap_reclaim_zone_garbage(struct palloc_heap *heap, struct bucket *bucket, uint32_t zone_id); + +#define MAX_RUN_LOCKS MAX_CHUNK +#define MAX_RUN_LOCKS_VG MAX_CHUNK /* avoid perf issues /w drd */ + +#define ZINFO_VERSION 0x1 + +struct zinfo_element { + unsigned char z_allotted : 1; + unsigned char z_evictable : 1; + unsigned char z_usage_hint : 3; +}; + +struct zinfo_vec { + uint32_t version; + uint32_t num_elems; + struct zinfo_element z[]; +}; + +TAILQ_HEAD(mbrt_q, mbrt); + +/* + * Memory Bucket Runtime. + */ +struct mbrt { + TAILQ_ENTRY(mbrt) mb_link; + struct mbrt_q *qptr; + uint32_t mb_id; + uint32_t garbage_reclaimed; + uint64_t space_usage; + uint64_t prev_usage; + struct palloc *heap; + struct bucket_locked *default_bucket; /* bucket for free chunks */ + struct bucket_locked *buckets[MAX_ALLOCATION_CLASSES]; + struct recycler *recyclers[MAX_ALLOCATION_CLASSES]; + bool laf[MAX_ALLOCATION_CLASSES]; /* last allocation failed? */ + bool laf_updated; +}; + +enum mb_usage_hint { + MB_U0_HINT = 0, + MB_U30_HINT = 1, + MB_U75_HINT = 2, + MB_U90_HINT = 3, + MB_UMAX_HINT = 4, +}; + +#define MB_U90 (ZONE_MAX_SIZE * 9 / 10) +#define MB_U75 (ZONE_MAX_SIZE * 75 / 100) +#define MB_U30 (ZONE_MAX_SIZE * 3 / 10) +#define MB_USAGE_DELTA (ZONE_MAX_SIZE / 20) + +size_t mb_usage_byhint[MB_UMAX_HINT] = {0, MB_U30 + 1, MB_U75 + 1, MB_U90 + 1}; + +struct heap_rt { + struct alloc_class_collection *alloc_classes; + pthread_mutex_t run_locks[MAX_RUN_LOCKS]; + unsigned nlocks; + unsigned nzones; + unsigned nzones_e; + unsigned nzones_ne; + unsigned zones_exhausted; + unsigned zones_exhausted_e; + unsigned zones_exhausted_ne; + unsigned zones_ne_gc; + unsigned zones_lastne_gc; + unsigned zones_unused_first; + unsigned zinfo_vec_size; + unsigned mb_create_waiters; + unsigned mb_pressure; + unsigned nemb_pct; + void *mb_create_wq; + struct zinfo_vec *zinfo_vec; + struct mbrt *default_mb; + struct mbrt **evictable_mbs; + struct mbrt *active_evictable_mb; + struct mbrt_q mb_u90; + struct mbrt_q mb_u75; + struct mbrt_q mb_u30; + struct mbrt_q mb_u0; +}; + +#define MBRT_NON_EVICTABLE ((struct mbrt *)(-1UL)) + +static inline void +heap_zinfo_set(struct palloc_heap *heap, uint32_t zid, bool allotted, bool evictable) +{ + struct zinfo_element *ze = heap->rt->zinfo_vec->z; + + ze[zid].z_allotted = allotted; + ze[zid].z_evictable = evictable; + mo_wal_persist(&heap->p_ops, &ze[zid], sizeof(ze[zid])); +} + +static inline void +heap_zinfo_get(struct palloc_heap *heap, uint32_t zid, bool *allotted, bool *evictable) +{ + struct zinfo_element *ze = heap->rt->zinfo_vec->z; + + *allotted = ze[zid].z_allotted; + *evictable = ze[zid].z_evictable; +} + +static inline void +heap_zinfo_set_usage(struct palloc_heap *heap, uint32_t zid, enum mb_usage_hint 
val) +{ + struct zinfo_element *ze = heap->rt->zinfo_vec->z; + + D_ASSERT(ze[zid].z_allotted && ze[zid].z_evictable && val < MB_UMAX_HINT); + ze[zid].z_usage_hint = val; + mo_wal_persist(&heap->p_ops, &ze[zid], sizeof(ze[zid])); +} + +static inline void +heap_zinfo_get_usage(struct palloc_heap *heap, uint32_t zid, enum mb_usage_hint *val) +{ + struct zinfo_element *ze = heap->rt->zinfo_vec->z; + + D_ASSERT(ze[zid].z_allotted && ze[zid].z_evictable && ze[zid].z_usage_hint < MB_UMAX_HINT); + *val = ze[zid].z_usage_hint; +} + +size_t +heap_zinfo_get_size(uint32_t nzones) +{ + return (sizeof(struct zinfo_vec) + sizeof(struct zinfo_element) * nzones); +} + +static inline void +heap_zinfo_init(struct palloc_heap *heap) +{ + struct zinfo_vec *z = heap->rt->zinfo_vec; + + D_ASSERT(heap->layout_info.zone0->header.zone0_zinfo_size >= + heap_zinfo_get_size(heap->rt->nzones)); + + z->version = ZINFO_VERSION; + z->num_elems = heap->rt->nzones; + mo_wal_persist(&heap->p_ops, z, sizeof(*z)); + heap_zinfo_set(heap, 0, 1, false); +} + +static void +mbrt_set_laf(struct mbrt *mb, int c_id) +{ + if (mb->mb_id == 0) + return; + D_ASSERT(c_id < MAX_ALLOCATION_CLASSES); + + mb->laf[c_id] = true; + mb->laf_updated = true; +} + +static void +mbrt_clear_laf(struct mbrt *mb) +{ + if (mb->mb_id == 0) + return; + if (mb->laf_updated) { + memset(mb->laf, 0, MAX_ALLOCATION_CLASSES); + mb->laf_updated = false; + } +} + +static bool +mbrt_is_laf(struct mbrt *mb, int c_id) +{ + D_ASSERT(c_id < MAX_ALLOCATION_CLASSES); + return mb->laf[c_id]; +} + +void +heap_mbrt_setmb_nonevictable(struct palloc_heap *heap, uint32_t zid) +{ + D_ASSERT(zid < heap->rt->nzones); + heap->rt->evictable_mbs[zid] = MBRT_NON_EVICTABLE; +} + +void +heap_mbrt_setmb_evictable(struct palloc_heap *heap, struct mbrt *mb) +{ + D_ASSERT((mb->mb_id != 0) && (mb->mb_id < heap->rt->nzones)); + heap->rt->evictable_mbs[mb->mb_id] = mb; +} + +bool +heap_mbrt_ismb_evictable(struct palloc_heap *heap, uint32_t zid) +{ + D_ASSERT(zid < heap->rt->nzones); + return (heap->rt->evictable_mbs[zid] != MBRT_NON_EVICTABLE); +} + +bool +heap_mbrt_ismb_initialized(struct palloc_heap *heap, uint32_t zid) +{ + D_ASSERT(zid < heap->rt->nzones); + return (heap->rt->evictable_mbs[zid] != 0); +} + +/* + * mbrt_bucket_acquire -- fetches by mbrt or by id a bucket exclusive + * for the thread until mbrt_bucket_release is called + */ +struct bucket * +mbrt_bucket_acquire(struct mbrt *mb, uint8_t class_id) +{ + struct bucket_locked *b; + + D_ASSERT(mb != NULL); + + if (class_id == DEFAULT_ALLOC_CLASS_ID) + b = mb->default_bucket; + else + b = mb->buckets[class_id]; + + return bucket_acquire(b); +} + +/* + * mbrt_bucket_release -- puts the bucket back into the heap + */ +void +mbrt_bucket_release(struct bucket *b) +{ + bucket_release(b); +} + +/* + * heap_mbrt_setup_mb -- (internal) create and initializes a Memory Bucket runtime. 
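The acquire/release discipline described above is used throughout heap.c (for example in heap_mbrt_mb_reclaim_garbage() further below). A minimal sketch, assuming the internal heap/bucket headers and the DEFAULT_ALLOC_CLASS_ID macro are visible as they are in heap.c:

#include <stdint.h>

#include "bucket.h"
#include "heap.h"

static void
with_default_bucket(struct palloc_heap *heap, uint32_t zone_id)
{
	struct mbrt   *mb = heap_mbrt_get_mb(heap, zone_id);
	struct bucket *b  = mbrt_bucket_acquire(mb, DEFAULT_ALLOC_CLASS_ID);

	/* ... the bucket is exclusive to this thread until released ... */

	mbrt_bucket_release(b);
}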
+ */ +struct mbrt * +heap_mbrt_setup_mb(struct palloc_heap *heap, uint32_t zid) +{ + struct heap_rt *rt = heap->rt; + struct mbrt *mb; + struct alloc_class *c; + uint8_t i; + + D_ALLOC_PTR(mb); + if (mb == NULL) { + errno = ENOMEM; + return NULL; + } + + mb->mb_id = zid; + + for (i = 0; i < MAX_ALLOCATION_CLASSES; ++i) { + c = alloc_class_by_id(rt->alloc_classes, i); + + if (c == NULL) + continue; + + mb->buckets[c->id] = bucket_locked_new(container_new_seglists(heap), c, mb); + if (mb->buckets[c->id] == NULL) + goto error_bucket_create; + } + + mb->default_bucket = + bucket_locked_new(container_new_ravl(heap), + alloc_class_by_id(rt->alloc_classes, DEFAULT_ALLOC_CLASS_ID), mb); + + if (mb->default_bucket == NULL) + goto error_bucket_create; + + return mb; + +error_bucket_create: + for (i = 0; i < MAX_ALLOCATION_CLASSES; ++i) { + c = alloc_class_by_id(rt->alloc_classes, i); + if (c != NULL) { + if (mb->buckets[c->id] != NULL) + bucket_locked_delete(mb->buckets[c->id]); + } + } + D_FREE(mb); + errno = ENOMEM; + return NULL; +} + +static void +heap_mbrt_cleanup_mb(struct mbrt *mb) +{ + uint8_t i; + + if (mb == NULL) + return; + + for (i = 0; i < MAX_ALLOCATION_CLASSES; ++i) { + if (mb->buckets[i] == NULL) + continue; + bucket_locked_delete(mb->buckets[i]); + } + bucket_locked_delete(mb->default_bucket); + + for (i = 0; i < MAX_ALLOCATION_CLASSES; ++i) { + if (mb->recyclers[i] == NULL) + continue; + recycler_delete(mb->recyclers[i]); + } + D_DEBUG(DB_TRACE, "MB %u utilization = %lu\n", mb->mb_id, mb->space_usage); + D_FREE(mb); +} + +int +heap_mbrt_update_alloc_class_buckets(struct palloc_heap *heap, struct mbrt *mb, + struct alloc_class *c) +{ + uint8_t c_id = c->id; + + if ((heap->rt->default_mb == mb) || (mb->buckets[c_id] != NULL)) + return 0; + + /* Allocation class created post creation/loading of the memory bucket runtime */ + if (heap->rt->default_mb->buckets[c_id]) { + mb->buckets[c_id] = bucket_locked_new(container_new_seglists(heap), c, mb); + if (!mb->buckets[c_id]) + return ENOMEM; + } + return 0; +} + +static inline int +heap_mbrt_init(struct palloc_heap *heap) +{ + struct heap_rt *rt = heap->rt; + int ret = 0; + struct umem_store *store = heap->layout_info.store; + + rt->default_mb = NULL; + rt->active_evictable_mb = NULL; + rt->mb_create_waiters = 0; + rt->mb_create_wq = NULL; + rt->mb_pressure = 0; + ret = store->stor_ops->so_waitqueue_create(&rt->mb_create_wq); + if (ret) { + ret = daos_der2errno(ret); + goto error; + } + + D_ALLOC_ARRAY(rt->evictable_mbs, rt->nzones); + if (rt->evictable_mbs == NULL) { + ret = ENOMEM; + goto error; + } + + TAILQ_INIT(&rt->mb_u90); + TAILQ_INIT(&rt->mb_u75); + TAILQ_INIT(&rt->mb_u30); + TAILQ_INIT(&rt->mb_u0); + + rt->default_mb = heap_mbrt_setup_mb(heap, 0); + if (rt->default_mb == NULL) { + ret = ENOMEM; + goto error_default_mb_setup; + } + heap_mbrt_setmb_nonevictable(heap, 0); + return 0; + +error_default_mb_setup: + D_FREE(rt->evictable_mbs); +error: + return ret; +} + +static inline void +heap_mbrt_fini(struct palloc_heap *heap) +{ + struct heap_rt *rt = heap->rt; + int i; + struct umem_store *store = heap->layout_info.store; + + for (i = 0; i < rt->zones_exhausted; i++) { + if (heap_mbrt_ismb_evictable(heap, i)) + heap_mbrt_cleanup_mb(rt->evictable_mbs[i]); + } + heap_mbrt_cleanup_mb(rt->default_mb); + + D_FREE(rt->evictable_mbs); + rt->default_mb = NULL; + rt->active_evictable_mb = NULL; + rt->evictable_mbs = NULL; + D_ASSERT(rt->mb_create_waiters == 0); + if (rt->mb_create_wq != NULL) + 
store->stor_ops->so_waitqueue_destroy(rt->mb_create_wq); + rt->mb_create_wq = NULL; +} + +/* + * heap_mbrt_get_mb - returns the reference to the mb runtime given + * zone_id or mb_id. + */ +struct mbrt * +heap_mbrt_get_mb(struct palloc_heap *heap, uint32_t zone_id) +{ + if (!heap_mbrt_ismb_evictable(heap, zone_id)) + return heap->rt->default_mb; + + D_ASSERT(heap->rt->evictable_mbs[zone_id] != NULL); + return heap->rt->evictable_mbs[zone_id]; +} + +void +heap_mbrt_log_alloc_failure(struct palloc_heap *heap, uint32_t zone_id) +{ + struct mbrt *mb = heap->rt->active_evictable_mb; + + if (mb && (mb->mb_id == zone_id)) { + TAILQ_INSERT_TAIL(&heap->rt->mb_u90, mb, mb_link); + mb->qptr = &heap->rt->mb_u90; + mb->prev_usage = mb->space_usage; + heap->rt->active_evictable_mb = NULL; + heap_zinfo_set_usage(heap, zone_id, MB_U90_HINT); + } +} + +void +heap_mbrt_setmb_usage(struct palloc_heap *heap, uint32_t zone_id, uint64_t usage) +{ + struct mbrt *mb = heap->rt->evictable_mbs[zone_id]; + + D_ASSERT(zone_id < heap->rt->nzones); + if (zone_id == 0) { + heap->rt->default_mb->space_usage = usage; + return; + } + if (mb == (struct mbrt *)(-1UL)) + return; + + mb->space_usage = usage; + + if ((heap->rt->active_evictable_mb == mb) || (mb->qptr)) + return; + + if (mb->space_usage > MB_U90) { + TAILQ_INSERT_TAIL(&heap->rt->mb_u90, mb, mb_link); + mb->qptr = &heap->rt->mb_u90; + } else if (mb->space_usage > MB_U75) { + TAILQ_INSERT_TAIL(&heap->rt->mb_u75, mb, mb_link); + mb->qptr = &heap->rt->mb_u75; + } else if (mb->space_usage > MB_U30) { + TAILQ_INSERT_TAIL(&heap->rt->mb_u30, mb, mb_link); + mb->qptr = &heap->rt->mb_u30; + heap->rt->mb_pressure = 0; + } else { + TAILQ_INSERT_TAIL(&heap->rt->mb_u0, mb, mb_link); + mb->qptr = &heap->rt->mb_u0; + heap->rt->mb_pressure = 0; + } + mb->prev_usage = mb->space_usage; +} + +int +heap_mbrt_getmb_usage(struct palloc_heap *heap, uint32_t zone_id, uint64_t *allotted, + uint64_t *maxsz) +{ + struct mbrt *mb; + + if (zone_id == 0) { + *maxsz = heap->rt->nzones_ne * ZONE_MAX_SIZE; + *allotted = heap->rt->default_mb->space_usage; + } else { + if (zone_id >= heap->rt->nzones) { + errno = EINVAL; + return -1; + } + mb = heap->rt->evictable_mbs[zone_id]; + if (!mb || (mb == (struct mbrt *)(-1UL))) { + errno = EINVAL; + return -1; + } + *maxsz = ZONE_MAX_SIZE; + *allotted = mb->space_usage; + } + return 0; +} + +void +heap_mbrt_incrmb_usage(struct palloc_heap *heap, uint32_t zone_id, int size) +{ + struct mbrt *mb = heap->rt->evictable_mbs[zone_id]; + + if (mb == (struct mbrt *)(-1UL)) { + heap->rt->default_mb->space_usage += size; + return; + } + + mb->space_usage += size; + if ((heap->rt->active_evictable_mb == mb) || + (labs((int64_t)(mb->space_usage - mb->prev_usage)) < MB_USAGE_DELTA)) + return; + + if (mb->space_usage > MB_U90) { + if (mb->qptr != &heap->rt->mb_u90) { + TAILQ_REMOVE(mb->qptr, mb, mb_link); + TAILQ_INSERT_TAIL(&heap->rt->mb_u90, mb, mb_link); + mb->qptr = &heap->rt->mb_u90; + heap_zinfo_set_usage(heap, zone_id, MB_U90_HINT); + } + } else if (mb->space_usage > MB_U75) { + if (mb->qptr != &heap->rt->mb_u75) { + TAILQ_REMOVE(mb->qptr, mb, mb_link); + TAILQ_INSERT_TAIL(&heap->rt->mb_u75, mb, mb_link); + mb->qptr = &heap->rt->mb_u75; + heap_zinfo_set_usage(heap, zone_id, MB_U75_HINT); + } + } else if (mb->space_usage > MB_U30) { + if (mb->qptr != &heap->rt->mb_u30) { + TAILQ_REMOVE(mb->qptr, mb, mb_link); + TAILQ_INSERT_TAIL(&heap->rt->mb_u30, mb, mb_link); + mb->qptr = &heap->rt->mb_u30; + heap_zinfo_set_usage(heap, zone_id, MB_U30_HINT); + 
heap->rt->mb_pressure = 0; + } + } else if (mb->qptr != &heap->rt->mb_u0) { + TAILQ_REMOVE(mb->qptr, mb, mb_link); + TAILQ_INSERT_TAIL(&heap->rt->mb_u0, mb, mb_link); + mb->qptr = &heap->rt->mb_u0; + heap_zinfo_set_usage(heap, zone_id, MB_U0_HINT); + heap->rt->mb_pressure = 0; + } + mb->prev_usage = mb->space_usage; +} + +int +heap_mbrt_mb_reclaim_garbage(struct palloc_heap *heap, uint32_t zid) +{ + struct mbrt *mb; + struct bucket *b; + + mb = heap_mbrt_get_mb(heap, zid); + + if ((mb->mb_id != 0) && (mb->garbage_reclaimed)) + return 0; + + b = mbrt_bucket_acquire(mb, DEFAULT_ALLOC_CLASS_ID); + heap_reclaim_zone_garbage(heap, b, zid); + mbrt_bucket_release(b); + + if (mb->mb_id != 0) + mb->garbage_reclaimed = 1; + + return 0; +} + +void +heap_set_root_ptrs(struct palloc_heap *heap, uint64_t **offp, uint64_t **sizep) +{ + *offp = &heap->layout_info.zone0->header.reserved[0]; + *sizep = &heap->layout_info.zone0->header.reserved[1]; +} + +void +heap_set_stats_ptr(struct palloc_heap *heap, struct stats_persistent **sp) +{ + D_CASSERT(sizeof(struct stats_persistent) == sizeof(uint64_t)); + *sp = (struct stats_persistent *)&heap->layout_info.zone0->header.sp_usage_glob; + VALGRIND_ADD_TO_GLOBAL_TX_IGNORE(*sp, sizeof(*sp)); +} + +/* + * heap_get_recycler - (internal) retrieves the recycler instance from the mbrt with + * the corresponding class id. Initializes the recycler if needed. + */ +static struct recycler * +heap_get_recycler(struct palloc_heap *heap, struct mbrt *mb, size_t id, size_t nallocs) +{ + struct recycler *r; + + D_ASSERT(mb != NULL); + util_atomic_load_explicit64(&mb->recyclers[id], &r, memory_order_acquire); + if (r != NULL) + return r; + + r = recycler_new(heap, nallocs, mb); + if (r && !util_bool_compare_and_swap64(&mb->recyclers[id], NULL, r)) { + /* + * If a different thread succeeded in assigning the recycler + * first, the recycler this thread created needs to be deleted. + */ + recycler_delete(r); + + return heap_get_recycler(heap, mb, id, nallocs); + } + + return r; +} + +/* + * heap_alloc_classes -- returns the allocation classes collection + */ +struct alloc_class_collection * +heap_alloc_classes(struct palloc_heap *heap) +{ + return heap->rt ? heap->rt->alloc_classes : NULL; +} + +/* + * heap_get_best_class -- returns the alloc class that best fits the + * requested size + */ +struct alloc_class * +heap_get_best_class(struct palloc_heap *heap, size_t size) +{ + return alloc_class_by_alloc_size(heap->rt->alloc_classes, size); +} + +/* + * heap_get_run_lock -- returns the lock associated with memory block + */ +pthread_mutex_t * +heap_get_run_lock(struct palloc_heap *heap, uint32_t chunk_id) +{ + return &heap->rt->run_locks[chunk_id % heap->rt->nlocks]; +} + +/* + * heap_max_zone -- (internal) calculates how many zones can the heap fit + */ +static unsigned +heap_max_zone(size_t size) +{ + unsigned max_zone = 0; + + size -= sizeof(struct heap_header); + + while (size >= ZONE_MIN_SIZE) { + max_zone++; + size -= size <= ZONE_MAX_SIZE ? 
size : ZONE_MAX_SIZE; + } + + return max_zone; +} + +/* + * zone_calc_size_idx -- (internal) calculates zone size index + */ +static uint32_t +zone_calc_size_idx(uint32_t zone_id, unsigned max_zone, size_t heap_size) +{ + ASSERT(max_zone > 0); + if (zone_id < max_zone - 1) + return MAX_CHUNK; + + ASSERT(heap_size >= zone_id * ZONE_MAX_SIZE); + size_t zone_raw_size = heap_size - zone_id * ZONE_MAX_SIZE; + + ASSERT(zone_raw_size >= (sizeof(struct zone_header) + + sizeof(struct chunk_header) * MAX_CHUNK) + + sizeof(struct heap_header)); + zone_raw_size -= sizeof(struct zone_header) + + sizeof(struct chunk_header) * MAX_CHUNK + + sizeof(struct heap_header); + + size_t zone_size_idx = zone_raw_size / CHUNKSIZE; + + ASSERT(zone_size_idx <= MAX_CHUNK); + + return (uint32_t)zone_size_idx; +} + +/* + * heap_zone_init -- (internal) writes zone's first chunk and header + */ +static void +heap_zone_init(struct palloc_heap *heap, uint32_t zone_id, uint32_t first_chunk_id, + bool is_evictable) +{ + struct zone *z = ZID_TO_ZONE(&heap->layout_info, zone_id); + uint32_t size_idx = zone_calc_size_idx(zone_id, heap->rt->nzones, heap->size); + + ASSERT(size_idx > first_chunk_id); + + struct zone_header nhdr = { + .size_idx = size_idx, + .magic = ZONE_HEADER_MAGIC, + }; + + z->header = nhdr; /* write the entire header at once */ + if (is_evictable) + z->header.flags |= ZONE_EVICTABLE_MB; + mo_wal_persist(&heap->p_ops, &z->header, sizeof(z->header)); + + memblock_huge_init(heap, first_chunk_id, zone_id, size_idx - first_chunk_id); +} + +/* + * heap_get_adjacent_free_block -- locates adjacent free memory block in heap + */ +static int +heap_get_adjacent_free_block(struct palloc_heap *heap, + const struct memory_block *in, struct memory_block *out, int prev) +{ + struct zone *z = ZID_TO_ZONE(&heap->layout_info, in->zone_id); + struct chunk_header *hdr = &z->chunk_headers[in->chunk_id]; + + out->zone_id = in->zone_id; + + if (prev) { + if (in->chunk_id == 0) + return ENOENT; + + struct chunk_header *prev_hdr = + &z->chunk_headers[in->chunk_id - 1]; + out->chunk_id = in->chunk_id - prev_hdr->size_idx; + + if (z->chunk_headers[out->chunk_id].type != CHUNK_TYPE_FREE) + return ENOENT; + + out->size_idx = z->chunk_headers[out->chunk_id].size_idx; + } else { /* next */ + if (in->chunk_id + hdr->size_idx == z->header.size_idx) + return ENOENT; + + out->chunk_id = in->chunk_id + hdr->size_idx; + + if (z->chunk_headers[out->chunk_id].type != CHUNK_TYPE_FREE) + return ENOENT; + + out->size_idx = z->chunk_headers[out->chunk_id].size_idx; + } + memblock_rebuild_state(heap, out); + + return 0; +} + +/* + * heap_coalesce -- (internal) merges adjacent memory blocks + */ +static struct memory_block +heap_coalesce(struct palloc_heap *heap, + const struct memory_block *blocks[], int n) +{ + struct memory_block ret = MEMORY_BLOCK_NONE; + + const struct memory_block *b = NULL; + + ret.size_idx = 0; + for (int i = 0; i < n; ++i) { + if (blocks[i] == NULL) + continue; + b = b ? 
b : blocks[i]; + ret.size_idx += blocks[i]->size_idx; + } + + ASSERTne(b, NULL); + + ret.chunk_id = b->chunk_id; + ret.zone_id = b->zone_id; + ret.block_off = b->block_off; + memblock_rebuild_state(heap, &ret); + + return ret; +} + +/* + * heap_coalesce_huge -- finds neighbors of a huge block, removes them from the + * volatile state and returns the resulting block + */ +static struct memory_block +heap_coalesce_huge(struct palloc_heap *heap, struct bucket *b, + const struct memory_block *m) +{ + const struct memory_block *blocks[3] = {NULL, m, NULL}; + + struct memory_block prev = MEMORY_BLOCK_NONE; + + if (heap_get_adjacent_free_block(heap, m, &prev, 1) == 0 && + bucket_remove_block(b, &prev) == 0) { + blocks[0] = &prev; + } + + struct memory_block next = MEMORY_BLOCK_NONE; + + if (heap_get_adjacent_free_block(heap, m, &next, 0) == 0 && + bucket_remove_block(b, &next) == 0) { + blocks[2] = &next; + } + + return heap_coalesce(heap, blocks, 3); +} + +/* + * heap_free_chunk_reuse -- reuses existing free chunk + */ +int +heap_free_chunk_reuse(struct palloc_heap *heap, + struct bucket *bucket, + struct memory_block *m) +{ + /* + * Perform coalescing just in case there + * are any neighboring free chunks. + */ + struct memory_block nm = heap_coalesce_huge(heap, bucket, m); + + if (nm.size_idx != m->size_idx) + m->m_ops->prep_hdr(&nm, MEMBLOCK_FREE, NULL); + + *m = nm; + + return bucket_insert_block(bucket, m); +} + +/* + * heap_run_into_free_chunk -- (internal) creates a new free chunk in place of + * a run. + */ +static void +heap_run_into_free_chunk(struct palloc_heap *heap, + struct bucket *bucket, + struct memory_block *m) +{ + struct chunk_header *hdr = heap_get_chunk_hdr(heap, m); + + m->block_off = 0; + m->size_idx = hdr->size_idx; + + STATS_SUB(heap->stats, transient, heap_run_active, + m->size_idx * CHUNKSIZE); + + /* + * The only thing this could race with is heap_memblock_on_free() + * because that function is called after processing the operation, + * which means that a different thread might immediately call this + * function if the free() made the run empty. + * We could forgo this lock if it weren't for helgrind which needs it + * to establish happens-before relation for the chunk metadata. + */ + pthread_mutex_t *lock = m->m_ops->get_lock(m); + + util_mutex_lock(lock); + + *m = memblock_huge_init(heap, m->chunk_id, m->zone_id, m->size_idx); + + heap_free_chunk_reuse(heap, bucket, m); + + util_mutex_unlock(lock); +} + +/* + * heap_reclaim_run -- checks the run for available memory if unclaimed. + * + * Returns 1 if reclaimed chunk, 0 otherwise. 
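To illustrate the coalescing rule implemented by heap_coalesce() above, here is a toy, self-contained version: the merged block keeps the leftmost (first non-NULL) chunk id and the summed size indices. The struct is a pared-down stand-in, not the real struct memory_block.

#include <assert.h>
#include <stdint.h>

struct toy_block { uint32_t chunk_id; uint32_t size_idx; };

static struct toy_block
toy_coalesce(const struct toy_block *prev, const struct toy_block *mid,
	     const struct toy_block *next)
{
	struct toy_block out = { .chunk_id = UINT32_MAX, .size_idx = 0 };
	const struct toy_block *parts[] = { prev, mid, next };

	for (int i = 0; i < 3; i++) {
		if (parts[i] == NULL)
			continue;
		if (out.chunk_id == UINT32_MAX)
			out.chunk_id = parts[i]->chunk_id;  /* leftmost wins */
		out.size_idx += parts[i]->size_idx;
	}
	return out;
}

static void
toy_coalesce_example(void)
{
	/* free chunks 4-5 and 7-9 around a freed chunk 6 */
	struct toy_block prev = { 4, 2 }, mid = { 6, 1 }, next = { 7, 3 };
	struct toy_block all = toy_coalesce(&prev, &mid, &next);

	assert(all.chunk_id == 4 && all.size_idx == 6);  /* one block: chunks 4-9 */
}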
+ */ +static int +heap_reclaim_run(struct palloc_heap *heap, struct memory_block *m, int startup) +{ + struct chunk_run *run = heap_get_chunk_run(heap, m); + struct chunk_header *hdr = heap_get_chunk_hdr(heap, m); + struct mbrt *mb = heap_mbrt_get_mb(heap, m->zone_id); + + struct alloc_class *c = alloc_class_by_run( + heap->rt->alloc_classes, + run->hdr.block_size, hdr->flags, m->size_idx); + + struct recycler_element e = recycler_element_new(heap, m); + + if (c == NULL) { + uint32_t size_idx = m->size_idx; + struct run_bitmap b; + + m->m_ops->get_bitmap(m, &b); + + ASSERTeq(size_idx, m->size_idx); + + return e.free_space == b.nbits; + } + + if (e.free_space == c->rdsc.nallocs) + return 1; + + if (startup) { + STATS_INC(heap->stats, transient, heap_run_active, + m->size_idx * CHUNKSIZE); + STATS_INC(heap->stats, transient, heap_run_allocated, + (c->rdsc.nallocs - e.free_space) * run->hdr.block_size); + } + struct recycler *recycler = heap_get_recycler(heap, mb, c->id, c->rdsc.nallocs); + + if (recycler == NULL || recycler_put(recycler, e) < 0) + ERR("lost runtime tracking info of %u run due to OOM", c->id); + + return 0; +} + +/* + * heap_reclaim_zone_garbage -- (internal) creates volatile state of unused runs + */ +static void +heap_reclaim_zone_garbage(struct palloc_heap *heap, struct bucket *bucket, + uint32_t zone_id) +{ + struct zone *z = ZID_TO_ZONE(&heap->layout_info, zone_id); + + for (uint32_t i = 0; i < z->header.size_idx; ) { + struct chunk_header *hdr = &z->chunk_headers[i]; + + ASSERT(hdr->size_idx != 0); + + struct memory_block m = MEMORY_BLOCK_NONE; + + m.zone_id = zone_id; + m.chunk_id = i; + m.size_idx = hdr->size_idx; + + memblock_rebuild_state(heap, &m); + m.m_ops->reinit_chunk(&m); + + switch (hdr->type) { + case CHUNK_TYPE_RUN: + if (heap_reclaim_run(heap, &m, 1) != 0) + heap_run_into_free_chunk(heap, bucket, &m); + break; + case CHUNK_TYPE_FREE: + heap_free_chunk_reuse(heap, bucket, &m); + break; + case CHUNK_TYPE_USED: + break; + default: + ASSERT(0); + } + + i = m.chunk_id + m.size_idx; /* hdr might have changed */ + } +} + +static int +heap_getnext_ne_zone(struct palloc_heap *heap, uint32_t *zone_id) +{ + bool allotted, evictable; + int i; + struct heap_rt *h = heap->rt; + + if (h->zones_ne_gc == h->zones_exhausted_ne) + return -1; + + i = h->zones_ne_gc ? h->zones_lastne_gc + 1 : 0; + + for (; i < h->zones_exhausted; i++) { + heap_zinfo_get(heap, i, &allotted, &evictable); + if (!allotted) + break; + if (!evictable) { + *zone_id = i; + return 0; + } + } + return -1; +} + +/* + * heap_populate_bucket -- (internal) creates volatile state of memory blocks + */ +static int +heap_populate_bucket(struct palloc_heap *heap, struct bucket *bucket) +{ + struct heap_rt *h = heap->rt; + struct mbrt *mb = bucket_get_mbrt(bucket); + struct umem_cache_range rg = {0}; + int rc; + uint32_t zone_id; + + if (mb->mb_id != 0) { + if (!mb->garbage_reclaimed) { + heap_reclaim_zone_garbage(heap, bucket, mb->mb_id); + mb->garbage_reclaimed = 1; + return 0; + } + return ENOMEM; + } + + rc = heap_getnext_ne_zone(heap, &zone_id); + if (!rc) + goto reclaim_garbage; + + /* at this point we are sure that there's no more memory in the heap */ + if (h->zones_exhausted_ne == h->nzones_ne) + return ENOMEM; + + zone_id = h->zones_exhausted++; + /* Create a umem cache map for the new zone */ + rg.cr_off = GET_ZONE_OFFSET(zone_id); + rg.cr_size = + ((heap->size - rg.cr_off) > ZONE_MAX_SIZE) ? 
ZONE_MAX_SIZE : heap->size - rg.cr_off; + heap_mbrt_setmb_nonevictable(heap, zone_id); + rc = umem_cache_map(heap->layout_info.store, &rg, 1); + if (rc != 0) { + rc = daos_der2errno(rc); + ERR("Failed to map zone %d to umem cache rc=%d\n", zone_id, rc); + h->zones_exhausted--; + return rc; + } + h->zones_exhausted_ne++; + + struct zone *z = ZID_TO_ZONE(&heap->layout_info, zone_id); + + VALGRIND_DO_MAKE_MEM_UNDEFINED(ZID_TO_ZONE(&heap->layout_info, zone_id), rg.cr_size); + if (rg.cr_size != ZONE_MAX_SIZE) + VALGRIND_DO_MAKE_MEM_NOACCESS(ZID_TO_ZONE(&heap->layout_info, zone_id) + rg.cr_size, + (ZONE_MAX_SIZE - rg.cr_size)); + + /* + * umem_cache_map() does not return a zeroed page. + * Explicitly memset the page. + */ + memset(z, 0, rg.cr_size); + + /* ignore zone and chunk headers */ + VALGRIND_ADD_TO_GLOBAL_TX_IGNORE(z, sizeof(z->header) + + sizeof(z->chunk_headers)); + + heap_zone_init(heap, zone_id, 0, false); + if (zone_id) + heap_zinfo_set(heap, zone_id, true, false); + +reclaim_garbage: + heap_reclaim_zone_garbage(heap, bucket, zone_id); + h->zones_lastne_gc = zone_id; + h->zones_ne_gc++; + + /* + * It doesn't matter that this function might not have found any + * free blocks because there is still potential that subsequent calls + * will find something in later zones. + */ + return 0; +} + +/* + * heap_recycle_unused -- recalculate scores in the recycler and turn any + * empty runs into free chunks + * + * If force is not set, this function might effectively be a noop if not enough + * of space was freed. + */ +static int +heap_recycle_unused(struct palloc_heap *heap, struct recycler *recycler, + struct bucket *defb, int force) +{ + struct mbrt *mb; + struct memory_block *nm; + struct empty_runs r = recycler_recalc(recycler, force); + struct bucket *nb; + + if (VEC_SIZE(&r) == 0) + return ENOMEM; + + mb = recycler_get_mbrt(recycler); + D_ASSERT(mb != NULL); + + nb = defb == NULL ? mbrt_bucket_acquire(mb, DEFAULT_ALLOC_CLASS_ID) : NULL; + + ASSERT(defb != NULL || nb != NULL); + + VEC_FOREACH_BY_PTR(nm, &r) { + heap_run_into_free_chunk(heap, defb ? defb : nb, nm); + } + + if (nb != NULL) + mbrt_bucket_release(nb); + + VEC_DELETE(&r); + + return 0; +} + +/* + * heap_reclaim_garbage -- (internal) creates volatile state of unused runs + */ +static int +heap_reclaim_garbage(struct palloc_heap *heap, struct bucket *bucket) +{ + int ret = ENOMEM; + struct recycler *r; + struct mbrt *mb = bucket_get_mbrt(bucket); + + for (size_t i = 0; i < MAX_ALLOCATION_CLASSES; ++i) { + r = mb->recyclers[i]; + if (r == NULL) + continue; + + if (heap_recycle_unused(heap, r, bucket, 1) == 0) + ret = 0; + } + + return ret; +} + +/* + * heap_ensure_huge_bucket_filled -- + * (internal) refills the default bucket if needed + */ +static int +heap_ensure_huge_bucket_filled(struct palloc_heap *heap, + struct bucket *bucket) +{ + if (heap_reclaim_garbage(heap, bucket) == 0) + return 0; + + if (heap_populate_bucket(heap, bucket) == 0) + return 0; + + return ENOMEM; +} + +/* + * heap_discard_run -- puts the memory block back into the global heap. 
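heap_populate_bucket() above maps every zone at ZONE_MAX_SIZE except possibly the last one, which only gets what remains of the heap. A small numeric sketch of that sizing rule, with a stand-in zone size and a simplified offset formula in place of GET_ZONE_OFFSET():

#include <assert.h>
#include <stdint.h>

#define TOY_ZONE_MAX  (16ULL << 20)   /* stand-in for ZONE_MAX_SIZE */

static uint64_t
toy_zone_map_size(uint64_t heap_size, uint32_t zone_id)
{
	uint64_t off  = (uint64_t)zone_id * TOY_ZONE_MAX; /* simplified offset */
	uint64_t left = heap_size - off;

	return left > TOY_ZONE_MAX ? TOY_ZONE_MAX : left;
}

static void
toy_zone_map_example(void)
{
	uint64_t heap = 40ULL << 20;                 /* 40 MiB heap, 16 MiB zones */

	assert(toy_zone_map_size(heap, 0) == TOY_ZONE_MAX);
	assert(toy_zone_map_size(heap, 2) == (8ULL << 20)); /* trailing partial zone */
}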
+ */ +void +heap_discard_run(struct palloc_heap *heap, struct memory_block *m) +{ + struct mbrt *mb = heap_mbrt_get_mb(heap, m->zone_id); + + D_ASSERT(mb != NULL); + if (heap_reclaim_run(heap, m, 0)) { + struct bucket *b = mbrt_bucket_acquire(mb, DEFAULT_ALLOC_CLASS_ID); + + heap_run_into_free_chunk(heap, b, m); + + mbrt_bucket_release(b); + } +} + +/* + * heap_detach_and_try_discard_run -- detaches the active from a bucket and + * tries to discard the run if it is completely empty (has no allocations) + */ +static int +heap_detach_and_try_discard_run(struct palloc_heap *heap, struct bucket *b) +{ + int empty = 0; + struct memory_block m; + + if (bucket_detach_run(b, &m, &empty) != 0) + return -1; + + if (empty) + heap_discard_run(heap, &m); + + return 0; +} + +/* + * heap_reuse_from_recycler -- (internal) try reusing runs that are currently + * in the recycler + */ +static int +heap_reuse_from_recycler(struct palloc_heap *heap, + struct bucket *b, uint32_t units, int force) +{ + struct mbrt *mb = bucket_get_mbrt(b); + struct memory_block m = MEMORY_BLOCK_NONE; + + m.size_idx = units; + + struct alloc_class *aclass = bucket_alloc_class(b); + + struct recycler *recycler = heap_get_recycler(heap, mb, aclass->id, aclass->rdsc.nallocs); + + if (recycler == NULL) { + ERR("lost runtime tracking info of %u run due to OOM", + aclass->id); + return 0; + } + + if (!force && recycler_get(recycler, &m) == 0) + return bucket_attach_run(b, &m); + + heap_recycle_unused(heap, recycler, NULL, force); + + if (recycler_get(recycler, &m) == 0) + return bucket_attach_run(b, &m); + + return ENOMEM; +} + +/* + * heap_run_create -- (internal) initializes a new run on an existing free chunk + */ +static int +heap_run_create(struct palloc_heap *heap, struct bucket *b, + struct memory_block *m) +{ + struct alloc_class *aclass = bucket_alloc_class(b); + *m = memblock_run_init(heap, m->chunk_id, m->zone_id, &aclass->rdsc); + + bucket_attach_run(b, m); + + STATS_INC(heap->stats, transient, heap_run_active, + m->size_idx * CHUNKSIZE); + + return 0; +} + +/* + * heap_ensure_run_bucket_filled -- (internal) refills the bucket if needed + */ +static int +heap_ensure_run_bucket_filled(struct palloc_heap *heap, struct bucket *b, + uint32_t units) +{ + int ret = 0; + struct alloc_class *aclass = bucket_alloc_class(b); + struct mbrt *mb = bucket_get_mbrt(b); + struct memory_block m; + struct bucket *defb; + + D_ASSERT(mb != NULL); + ASSERTeq(aclass->type, CLASS_RUN); + + if (mbrt_is_laf(mb, aclass->id)) + return ENOMEM; + + if (heap_detach_and_try_discard_run(heap, b) != 0) + return ENOMEM; + + if (heap_reuse_from_recycler(heap, b, units, 0) == 0) + goto out; + + m = MEMORY_BLOCK_NONE; + + m.size_idx = aclass->rdsc.size_idx; + + defb = mbrt_bucket_acquire(mb, DEFAULT_ALLOC_CLASS_ID); + + /* cannot reuse an existing run, create a new one */ + if (heap_get_bestfit_block(heap, defb, &m) == 0) { + ASSERTeq(m.block_off, 0); + if (heap_run_create(heap, b, &m) != 0) { + mbrt_bucket_release(defb); + return ENOMEM; + } + mbrt_bucket_release(defb); + goto out; + } + mbrt_bucket_release(defb); + + if (heap_reuse_from_recycler(heap, b, units, 1) == 0) + goto out; + + mbrt_set_laf(mb, aclass->id); + ret = ENOMEM; +out: + return ret; +} + +/* + * heap_memblock_on_free -- bookkeeping actions executed at every free of a + * block + */ +void +heap_memblock_on_free(struct palloc_heap *heap, const struct memory_block *m) +{ + struct mbrt *mb = heap_mbrt_get_mb(heap, m->zone_id); + + if (m->type != MEMORY_BLOCK_RUN) + return; + + struct 
chunk_header *hdr = heap_get_chunk_hdr(heap, m); + struct chunk_run *run = heap_get_chunk_run(heap, m); + + ASSERTeq(hdr->type, CHUNK_TYPE_RUN); + + struct alloc_class *c = alloc_class_by_run( + heap->rt->alloc_classes, + run->hdr.block_size, hdr->flags, hdr->size_idx); + + if (c == NULL) + return; + + struct recycler *recycler = heap_get_recycler(heap, mb, c->id, c->rdsc.nallocs); + + if (recycler == NULL) { + ERR("lost runtime tracking info of %u run due to OOM", + c->id); + } else { + recycler_inc_unaccounted(recycler, m); + mbrt_clear_laf(mb); + } +} + +/* + * heap_split_block -- (internal) splits unused part of the memory block + */ +static void +heap_split_block(struct palloc_heap *heap, struct bucket *b, + struct memory_block *m, uint32_t units) +{ + struct alloc_class *aclass = bucket_alloc_class(b); + + ASSERT(units <= MAX_CHUNK); + ASSERT(units > 0); + + if (aclass->type == CLASS_RUN) { + ASSERT((uint64_t)m->block_off + (uint64_t)units <= UINT32_MAX); + struct memory_block r = {m->chunk_id, m->zone_id, + m->size_idx - units, (uint32_t)(m->block_off + units), + NULL, NULL, 0, 0, NULL}; + memblock_rebuild_state(heap, &r); + if (bucket_insert_block(b, &r) != 0) + D_CRIT("failed to allocate memory block runtime tracking info\n"); + } else { + uint32_t new_chunk_id = m->chunk_id + units; + uint32_t new_size_idx = m->size_idx - units; + + struct memory_block n = memblock_huge_init(heap, + new_chunk_id, m->zone_id, new_size_idx); + + *m = memblock_huge_init(heap, m->chunk_id, m->zone_id, units); + + if (bucket_insert_block(b, &n) != 0) + D_CRIT("failed to allocate memory block runtime tracking info\n"); + } + + m->size_idx = units; +} + +/* + * heap_get_bestfit_block -- + * extracts a memory block of equal size index + */ +int +heap_get_bestfit_block(struct palloc_heap *heap, struct bucket *b, + struct memory_block *m) +{ + struct alloc_class *aclass = bucket_alloc_class(b); + uint32_t units = m->size_idx; + + while (bucket_alloc_block(b, m) != 0) { + if (aclass->type == CLASS_HUGE) { + if (heap_ensure_huge_bucket_filled(heap, b) != 0) + return ENOMEM; + } else { + if (heap_ensure_run_bucket_filled(heap, b, units) != 0) + return ENOMEM; + } + } + + ASSERT(m->size_idx >= units); + + if (units != m->size_idx) + heap_split_block(heap, b, m, units); + + m->m_ops->ensure_header_type(m, aclass->header_type); + m->header_type = aclass->header_type; + + return 0; +} + +/* + * heap_create_alloc_class_buckets -- allocates all cache bucket + * instances of the specified type + */ +int +heap_create_alloc_class_buckets(struct palloc_heap *heap, struct alloc_class *c) +{ + struct mbrt *default_mb = heap->rt->default_mb; + + if (default_mb->buckets[c->id] == NULL) { + default_mb->buckets[c->id] = + bucket_locked_new(container_new_seglists(heap), c, default_mb); + if (default_mb->buckets[c->id] == NULL) + return -1; + } + + return 0; +} + +/* + * heap_write_header -- (internal) creates a clean header + */ +static int +heap_write_header(struct umem_store *store, size_t heap_size, size_t umem_cache_size, + uint32_t nemb_pct) +{ + struct heap_header *newhdr; + int rc; + + D_ALLOC_PTR(newhdr); + if (!newhdr) + return -1; + + strncpy(newhdr->signature, HEAP_SIGNATURE, HEAP_SIGNATURE_LEN); + newhdr->major = HEAP_MAJOR; + newhdr->minor = HEAP_MINOR; + newhdr->heap_size = heap_size; + newhdr->cache_size = umem_cache_size; + newhdr->heap_hdr_size = sizeof(struct heap_header); + newhdr->chunksize = CHUNKSIZE; + newhdr->chunks_per_zone = MAX_CHUNK; + newhdr->nemb_pct = (uint8_t)nemb_pct; + newhdr->checksum = 0; + 
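+	/*
+	 * The checksum is computed over the whole 4k header (the checksum
+	 * field itself is zeroed above) and stored in place before the
+	 * header is persisted through meta_update().
+	 */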
+ util_checksum(newhdr, sizeof(*newhdr), &newhdr->checksum, 1, 0); + rc = meta_update(store, newhdr, 0, sizeof(*newhdr)); + D_FREE(newhdr); + + return rc; +} + +/* + * heap_cleanup -- cleanups the volatile heap state + */ +void +heap_cleanup(struct palloc_heap *heap) +{ + struct heap_rt *rt = heap->rt; + unsigned i; + + alloc_class_collection_delete(rt->alloc_classes); + + for (i = 0; i < rt->nlocks; ++i) + util_mutex_destroy(&rt->run_locks[i]); + +#if VG_MEMCHECK_ENABLED + VALGRIND_DO_DESTROY_MEMPOOL(heap->layout_info.zone0); + if (On_memcheck) { + for (i = 0; i < heap->rt->zones_exhausted; i++) { + if (!heap_mbrt_ismb_initialized(heap, i) || + !heap_mbrt_ismb_evictable(heap, i)) + continue; + if (umem_cache_offisloaded(heap->layout_info.store, GET_ZONE_OFFSET(i))) + VALGRIND_DO_DESTROY_MEMPOOL(ZID_TO_ZONE(&heap->layout_info, i)); + } + } +#endif + heap_mbrt_fini(heap); + + D_FREE(rt); + heap->rt = NULL; +} + +/* + * heap_verify_header -- (internal) verifies if the heap header is consistent + */ +static int +heap_verify_header(struct heap_header *hdr, size_t heap_size, size_t cache_size) +{ + if (util_checksum(hdr, sizeof(*hdr), &hdr->checksum, 0, 0) != 1) { + D_CRIT("heap: invalid header's checksum\n"); + return -1; + } + + if ((hdr->major != HEAP_MAJOR) || (hdr->minor > HEAP_MINOR)) { + D_ERROR("Version mismatch of heap layout\n"); + return -1; + } + + if (hdr->heap_size != heap_size) { + D_ERROR("Metadata store size mismatch, created with %lu , opened with %lu\n", + hdr->heap_size, heap_size); + return -1; + } + + if (hdr->cache_size != cache_size) { + D_ERROR("umem cache size mismatch, created with %lu , opened with %lu\n", + hdr->cache_size, cache_size); + return -1; + } + + if (hdr->nemb_pct > 100) { + D_ERROR("nemb pct value (%d) in heap header is incorrect\n", hdr->nemb_pct); + return -1; + } + + if ((hdr->heap_hdr_size != sizeof(struct heap_header)) || (hdr->chunksize != CHUNKSIZE) || + (hdr->chunks_per_zone != MAX_CHUNK)) { + D_ERROR("incompatible heap layout: hdr_sz=%lu, chunk_sz=%lu, max_chunks=%lu\n", + hdr->heap_hdr_size, hdr->chunksize, hdr->chunks_per_zone); + return -1; + } + + return 0; +} + +int +heap_zone_load(struct palloc_heap *heap, uint32_t zid) +{ + struct umem_cache_range rg = {0}; + struct umem_store *store = heap->layout_info.store; + int rc; + + D_ASSERT(heap->rt->nzones > zid); + + rg.cr_off = GET_ZONE_OFFSET(zid); + rg.cr_size = ((store->stor_size - rg.cr_off) > ZONE_MAX_SIZE) + ? 
ZONE_MAX_SIZE + : (store->stor_size - rg.cr_off); + rc = umem_cache_load(store, &rg, 1, 0); + if (rc) { + D_ERROR("Failed to load pages to umem cache"); + return daos_der2errno(rc); + } + return 0; +} + +int +heap_ensure_zone0_initialized(struct palloc_heap *heap) +{ + struct mbrt *mb; + struct bucket *b; + int rc = 0; + + heap_mbrt_setmb_nonevictable(heap, 0); + if (heap->layout_info.zone0->header.magic != ZONE_HEADER_MAGIC) { + /* If not magic the content should be zero, indicating new file */ + D_ASSERT(heap->layout_info.zone0->header.magic == 0); + mb = heap_mbrt_get_mb(heap, 0); + b = mbrt_bucket_acquire(mb, DEFAULT_ALLOC_CLASS_ID); + rc = heap_populate_bucket(heap, b); + mbrt_bucket_release(b); + } +#if VG_MEMCHECK_ENABLED + else { + if (On_memcheck) + palloc_heap_vg_zone_open(heap, 0, 1); + } +#endif + heap_mbrt_setmb_usage(heap, 0, heap->layout_info.zone0->header.sp_usage); + return rc; +} + +D_CASSERT(sizeof(struct zone) == 4096); +D_CASSERT(sizeof(struct heap_header) == 4096); + +#define MAX_HEADER_FETCH 4 + +/* + * heap_boot -- opens the heap region of the dav_obj pool + * + * If successful function returns zero. Otherwise an error number is returned. + */ +int +heap_boot(struct palloc_heap *heap, void *mmap_base, uint64_t heap_size, uint64_t cache_size, + struct mo_ops *p_ops, struct stats *stats) +{ + struct heap_rt *h; + struct heap_header *newhdr; + int err; + struct heap_zone_limits hzl; + uint32_t nemb_pct = HEAP_NEMB_PCT_DEFAULT; + + D_ALLOC_PTR(newhdr); + if (!newhdr) + return ENOMEM; + + err = meta_fetch(p_ops->umem_store, newhdr, 0, sizeof(*newhdr)); + if (err) { + ERR("failed to read the heap header"); + D_FREE(newhdr); + return err; + } + err = heap_verify_header(newhdr, heap_size, cache_size); + if (err) { + ERR("incompatible heap detected"); + D_FREE(newhdr); + return EINVAL; + } + if (newhdr->nemb_pct) + nemb_pct = newhdr->nemb_pct; + D_FREE(newhdr); + + D_ALLOC_PTR_NZ(h); + if (h == NULL) { + err = ENOMEM; + goto error_heap_malloc; + } + + h->alloc_classes = alloc_class_collection_new(); + if (h->alloc_classes == NULL) { + err = ENOMEM; + goto error_alloc_classes_new; + } + + hzl = heap_get_zone_limits(heap_size, cache_size, nemb_pct); + + h->nzones = hzl.nzones_heap; + h->nzones_ne = hzl.nzones_ne_max; + h->nzones_e = hzl.nzones_e_max; + h->zones_exhausted = 0; + h->zones_exhausted_e = 0; + h->zones_exhausted_ne = 0; + h->zones_ne_gc = 0; + h->zones_lastne_gc = 0; + h->zones_unused_first = 0; + + h->nlocks = On_valgrind ? 
MAX_RUN_LOCKS_VG : MAX_RUN_LOCKS; + for (unsigned i = 0; i < h->nlocks; ++i) + util_mutex_init(&h->run_locks[i]); + heap->rt = h; + + heap->p_ops = *p_ops; + heap->layout_info.store = p_ops->umem_store; + heap->layout_info.zone0 = mmap_base; + heap->size = heap_size; + heap->base = mmap_base; + heap->stats = stats; + heap->alloc_pattern = PALLOC_CTL_DEBUG_NO_PATTERN; + VALGRIND_DO_CREATE_MEMPOOL(heap->layout_info.zone0, 0, 0); + + err = heap_mbrt_init(heap); + if (err) + goto error_mbrt_init; + + return 0; + +error_mbrt_init: + alloc_class_collection_delete(h->alloc_classes); +error_alloc_classes_new: + D_FREE(h); + heap->rt = NULL; +error_heap_malloc: + return err; +} + +static unsigned int +heap_get_nemb_pct() +{ + unsigned int nemb_pct; + + nemb_pct = HEAP_NEMB_PCT_DEFAULT; + d_getenv_uint("DAOS_MD_ON_SSD_NEMB_PCT", &nemb_pct); + if ((nemb_pct > 100) || (nemb_pct == 0)) { + D_ERROR("Invalid value %d for tunable DAOS_MD_ON_SSD_NEMB_PCT", nemb_pct); + nemb_pct = HEAP_NEMB_PCT_DEFAULT; + } + D_INFO("DAOS_MD_ON_SSD_NEMB_PCT set to %d", nemb_pct); + + return nemb_pct; +} + +int +heap_get_max_nemb(struct palloc_heap *heap) +{ + return heap->rt->nzones_ne; +} + +/* + * heap_init -- initializes the heap + * + * If successful function returns zero. Otherwise an error number is returned. + */ +int +heap_init(void *heap_start, uint64_t umem_cache_size, struct umem_store *store) +{ + int nzones; + uint32_t nemb_pct = heap_get_nemb_pct(); + uint64_t heap_size = store->stor_size; + + if (heap_size < HEAP_MIN_SIZE) + return EINVAL; + + D_ASSERT(store->stor_priv != NULL); + + nzones = heap_max_zone(heap_size); + meta_clear_pages(store, sizeof(struct heap_header), 4096, ZONE_MAX_SIZE, nzones); + + if (heap_write_header(store, heap_size, umem_cache_size, nemb_pct)) + return ENOMEM; + + return 0; +} + +static inline int +heap_create_evictable_mb(struct palloc_heap *heap, uint32_t *mb_id) +{ + uint32_t zone_id; + struct mbrt *mb; + struct umem_cache_range rg = {0}; + int rc; + struct zone *z; + struct umem_pin_handle *pin_handle = NULL; + struct umem_store *store = heap->layout_info.store; + + D_ASSERT(heap->rt->active_evictable_mb == NULL); + + if (heap->rt->zones_exhausted_e >= heap->rt->nzones_e) + return -1; + + heap->rt->mb_create_waiters++; + if (heap->rt->mb_create_waiters > 1) { + D_ASSERT(store->stor_ops->so_waitqueue_wait != NULL); + store->stor_ops->so_waitqueue_wait(heap->rt->mb_create_wq, false); + D_ASSERT((int)heap->rt->mb_create_waiters >= 0); + rc = 1; + errno = EBUSY; + goto out; + } + + for (zone_id = heap->rt->zones_unused_first; zone_id < heap->rt->nzones; zone_id++) { + if (!heap_mbrt_ismb_initialized(heap, zone_id)) + break; + } + + D_ASSERT(zone_id < heap->rt->nzones); + mb = heap_mbrt_setup_mb(heap, zone_id); + if (mb == NULL) { + ERR("Failed to setup mbrt for zone %u\n", zone_id); + rc = -1; + goto out; + } + + heap->rt->zones_unused_first = zone_id + 1; + if (heap->rt->zones_exhausted < heap->rt->zones_unused_first) + heap->rt->zones_exhausted = heap->rt->zones_unused_first; + heap->rt->zones_exhausted_e++; + heap_mbrt_setmb_evictable(heap, mb); + + /* Create a umem cache map for the new zone */ + rg.cr_off = GET_ZONE_OFFSET(zone_id); + rg.cr_size = + ((heap->size - rg.cr_off) > ZONE_MAX_SIZE) ? 
ZONE_MAX_SIZE : heap->size - rg.cr_off; + + rc = umem_cache_map(heap->layout_info.store, &rg, 1); + if (rc != 0) { + ERR("Failed to map zone %u to umem cache\n", zone_id); + errno = daos_der2errno(rc); + goto error; + } + + D_DEBUG(DB_TRACE, "Creating evictable zone %d\n", zone_id); + + z = ZID_TO_ZONE(&heap->layout_info, zone_id); + VALGRIND_DO_CREATE_MEMPOOL(z, 0, 0); + VALGRIND_DO_MAKE_MEM_UNDEFINED(z, rg.cr_size); + if (rg.cr_size != ZONE_MAX_SIZE) + VALGRIND_DO_MAKE_MEM_NOACCESS(z + rg.cr_size, (ZONE_MAX_SIZE - rg.cr_size)); + + memset(z, 0, rg.cr_size); + + rc = umem_cache_pin(heap->layout_info.store, &rg, 1, false, &pin_handle); + if (rc) { + errno = daos_der2errno(rc); + goto error; + } + + /* ignore zone and chunk headers */ + VALGRIND_ADD_TO_GLOBAL_TX_IGNORE(z, sizeof(z->header) + sizeof(z->chunk_headers)); + + rc = lw_tx_begin(heap->p_ops.base); + if (rc) + goto error; + + heap_zone_init(heap, zone_id, 0, true); + rc = heap_mbrt_mb_reclaim_garbage(heap, zone_id); + if (rc) { + ERR("Failed to initialize evictable zone %u", zone_id); + lw_tx_end(heap->p_ops.base, NULL); + goto error; + } + heap_zinfo_set(heap, zone_id, true, true); + lw_tx_end(heap->p_ops.base, NULL); + umem_cache_unpin(heap->layout_info.store, pin_handle); + + *mb_id = zone_id; + rc = 0; + goto out; + +error: + if (pin_handle) + umem_cache_unpin(heap->layout_info.store, pin_handle); + heap_mbrt_cleanup_mb(mb); + heap->rt->evictable_mbs[zone_id] = NULL; + heap->rt->zones_exhausted_e--; + if (heap->rt->zones_unused_first > zone_id) + heap->rt->zones_unused_first = zone_id; + rc = -1; + +out: + heap->rt->mb_create_waiters--; + D_ASSERT((int)heap->rt->mb_create_waiters >= 0); + if (heap->rt->mb_create_waiters) { + D_ASSERT(store->stor_ops->so_waitqueue_wakeup != NULL); + store->stor_ops->so_waitqueue_wakeup(heap->rt->mb_create_wq, false); + } + return rc; +} + +int +heap_get_evictable_mb(struct palloc_heap *heap, uint32_t *mb_id) +{ + struct mbrt *mb; + int ret; + +retry: + if (heap->rt->active_evictable_mb != NULL) { + if ((heap->rt->mb_pressure) || + (heap->rt->active_evictable_mb->space_usage <= MB_U75)) { + *mb_id = heap->rt->active_evictable_mb->mb_id; + return 0; + } + mb = heap->rt->active_evictable_mb; + heap->rt->active_evictable_mb = NULL; + heap_mbrt_setmb_usage(heap, mb->mb_id, mb->space_usage); + } + heap->rt->mb_pressure = 0; + + if ((mb = TAILQ_FIRST(&heap->rt->mb_u30)) != NULL) + TAILQ_REMOVE(&heap->rt->mb_u30, mb, mb_link); + else if ((mb = TAILQ_FIRST(&heap->rt->mb_u0)) != NULL) + TAILQ_REMOVE(&heap->rt->mb_u0, mb, mb_link); + else if ((ret = heap_create_evictable_mb(heap, mb_id)) >= 0) { + if (ret) + goto retry; + mb = heap_mbrt_get_mb(heap, *mb_id); + D_ASSERT(mb != NULL); + if (heap->rt->active_evictable_mb) { + TAILQ_INSERT_HEAD(&heap->rt->mb_u0, mb, mb_link); + mb->qptr = &heap->rt->mb_u0; + *mb_id = heap->rt->active_evictable_mb->mb_id; + return 0; + } + } else if ((mb = TAILQ_FIRST(&heap->rt->mb_u75)) != NULL) { + TAILQ_REMOVE(&heap->rt->mb_u75, mb, mb_link); + heap->rt->mb_pressure = 1; + } else if ((mb = TAILQ_FIRST(&heap->rt->mb_u90)) != NULL) { + TAILQ_REMOVE(&heap->rt->mb_u90, mb, mb_link); + heap->rt->mb_pressure = 1; + } else { + D_ERROR("Failed to get an evictable MB"); + *mb_id = 0; + return 0; + } + heap->rt->active_evictable_mb = mb; + mb->qptr = NULL; + *mb_id = mb->mb_id; + return 0; +} + +uint32_t +heap_off2mbid(struct palloc_heap *heap, uint64_t offset) +{ + struct memory_block m = memblock_from_offset_opt(heap, offset, 0); + + if (heap_mbrt_ismb_evictable(heap, m.zone_id)) + 
return m.zone_id; + else + return 0; +} + +int +heap_update_mbrt_zinfo(struct palloc_heap *heap, bool init) +{ + bool allotted, evictable; + struct zone *z0 = heap->layout_info.zone0; + int nemb_cnt = 1, emb_cnt = 0, i; + struct mbrt *mb; + struct zone *z; + enum mb_usage_hint usage_hint; + int last_allocated = 0; + + heap->rt->zinfo_vec = HEAP_OFF_TO_PTR(heap, z0->header.zone0_zinfo_off); + heap->rt->zinfo_vec_size = z0->header.zone0_zinfo_size; + + if (init) + heap_zinfo_init(heap); + else { + D_ASSERT(heap->rt->zinfo_vec->num_elems == heap->rt->nzones); + heap_zinfo_get(heap, 0, &allotted, &evictable); + D_ASSERT((evictable == false) && (allotted == true)); + } + + for (i = 1; i < heap->rt->nzones; i++) { + heap_zinfo_get(heap, i, &allotted, &evictable); + if (!allotted) { + if (!heap->rt->zones_unused_first) + heap->rt->zones_unused_first = i; + continue; + } + if (!evictable) { + heap_mbrt_setmb_nonevictable(heap, i); + nemb_cnt++; + } else { + mb = heap_mbrt_setup_mb(heap, i); + if (mb == NULL) + return ENOMEM; + heap_mbrt_setmb_evictable(heap, mb); + if (umem_cache_offisloaded(heap->layout_info.store, GET_ZONE_OFFSET(i))) { + z = ZID_TO_ZONE(&heap->layout_info, i); + D_ASSERT(z->header.flags & ZONE_EVICTABLE_MB); + heap_mbrt_setmb_usage(heap, i, z->header.sp_usage); + } else { + heap_zinfo_get_usage(heap, i, &usage_hint); + heap_mbrt_setmb_usage(heap, i, mb_usage_byhint[(int)usage_hint]); + } + emb_cnt++; + } + last_allocated = i; + } + heap->rt->zones_exhausted = last_allocated + 1; + heap->rt->zones_exhausted_ne = nemb_cnt; + heap->rt->zones_exhausted_e = emb_cnt; + + D_ASSERT(heap->rt->nzones_e >= heap->rt->zones_exhausted_e); + D_ASSERT(heap->rt->nzones_ne >= heap->rt->zones_exhausted_ne); + return 0; +} + +/* + * heap_load_nonevictable_zones() -> Populate the heap with non-evictable MBs. 
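+ *
+ * Walks the zinfo entries of every exhausted zone (zone 0 is handled
+ * separately by heap_ensure_zone0_initialized()) and loads each allotted
+ * non-evictable zone into the umem cache via heap_zone_load().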
+ */ +int +heap_load_nonevictable_zones(struct palloc_heap *heap) +{ + int i, rc; + bool allotted, evictable; + + for (i = 1; i < heap->rt->zones_exhausted; i++) { + heap_zinfo_get(heap, i, &allotted, &evictable); + D_ASSERT(allotted); + if (!evictable) { + rc = heap_zone_load(heap, i); + if (rc) + return rc; + } + } + return 0; +} + +#if 0 +/* + * heap_verify_zone_header -- + * (internal) verifies if the zone header is consistent + */ +static int +heap_verify_zone_header(struct zone_header *hdr) +{ + if (hdr->magic != ZONE_HEADER_MAGIC) /* not initialized */ + return 0; + + if (hdr->size_idx == 0) { + D_CRIT("heap: invalid zone size\n"); + return -1; + } + + return 0; +} + +/* + * heap_verify_chunk_header -- + * (internal) verifies if the chunk header is consistent + */ +static int +heap_verify_chunk_header(struct chunk_header *hdr) +{ + if (hdr->type == CHUNK_TYPE_UNKNOWN) { + D_CRIT("heap: invalid chunk type\n"); + return -1; + } + + if (hdr->type >= MAX_CHUNK_TYPE) { + D_CRIT("heap: unknown chunk type\n"); + return -1; + } + + if (hdr->flags & ~CHUNK_FLAGS_ALL_VALID) { + D_CRIT("heap: invalid chunk flags\n"); + return -1; + } + + return 0; +} + +/* + * heap_verify_zone -- (internal) verifies if the zone is consistent + */ +static int +heap_verify_zone(struct zone *zone) +{ + if (zone->header.magic == 0) + return 0; /* not initialized, and that is OK */ + + if (zone->header.magic != ZONE_HEADER_MAGIC) { + D_CRIT("heap: invalid zone magic\n"); + return -1; + } + + if (heap_verify_zone_header(&zone->header)) + return -1; + + uint32_t i; + + for (i = 0; i < zone->header.size_idx; ) { + if (heap_verify_chunk_header(&zone->chunk_headers[i])) + return -1; + + i += zone->chunk_headers[i].size_idx; + } + + if (i != zone->header.size_idx) { + D_CRIT("heap: chunk sizes mismatch\n"); + return -1; + } + + return 0; +} + +/* + * heap_check -- verifies if the heap is consistent and can be opened properly + * + * If successful function returns zero. Otherwise an error number is returned. 
+ */ +int +heap_check(void *heap_start, uint64_t heap_size) +{ + if (heap_size < HEAP_MIN_SIZE) { + D_CRIT("heap: invalid heap size\n"); + return -1; + } + + struct heap_layout *layout = heap_start; + + if (heap_verify_header(&layout->header, heap_size)) + return -1; + + for (unsigned i = 0; i < heap_max_zone(heap_size); ++i) { + if (heap_verify_zone(ZID_TO_ZONE(layout, i))) + return -1; + } + + return 0; +} +#endif + +/* + * heap_zone_foreach_object -- (internal) iterates through objects in a zone + */ +static int +heap_zone_foreach_object(struct palloc_heap *heap, object_callback cb, + void *arg, struct memory_block *m) +{ + struct zone *zone = ZID_TO_ZONE(&heap->layout_info, m->zone_id); + + if (zone->header.magic == 0) + return 0; + + for (; m->chunk_id < zone->header.size_idx; ) { + struct chunk_header *hdr = heap_get_chunk_hdr(heap, m); + + memblock_rebuild_state(heap, m); + m->size_idx = hdr->size_idx; + + if (m->m_ops->iterate_used(m, cb, arg) != 0) + return 1; + + m->chunk_id += m->size_idx; + m->block_off = 0; + } + + return 0; +} + +/* + * heap_foreach_object -- (internal) iterates through objects in the heap + */ +void +heap_foreach_object(struct palloc_heap *heap, object_callback cb, void *arg, + struct memory_block m) +{ + for (; m.zone_id < heap->rt->nzones; ++m.zone_id) { + if (heap_zone_foreach_object(heap, cb, arg, &m) != 0) + break; + + m.chunk_id = 0; + } +} + +struct heap_zone_limits +heap_get_zone_limits(uint64_t heap_size, uint64_t cache_size, uint32_t nemb_pct) +{ + struct heap_zone_limits zd = {0}; + + D_ASSERT(nemb_pct <= 100); + + if (heap_size < sizeof(struct heap_header)) + zd.nzones_heap = 0; + else + zd.nzones_heap = heap_max_zone(heap_size); + + zd.nzones_cache = cache_size / ZONE_MAX_SIZE; + if (zd.nzones_cache <= UMEM_CACHE_MIN_EVICTABLE_PAGES) + return zd; + + if (zd.nzones_heap > zd.nzones_cache) { + if (zd.nzones_heap < (zd.nzones_cache + UMEM_CACHE_MIN_EVICTABLE_PAGES)) + zd.nzones_ne_max = zd.nzones_cache - UMEM_CACHE_MIN_EVICTABLE_PAGES; + else + zd.nzones_ne_max = ((unsigned long)zd.nzones_cache * nemb_pct) / 100; + if (zd.nzones_cache < (zd.nzones_ne_max + UMEM_CACHE_MIN_EVICTABLE_PAGES)) + zd.nzones_ne_max = zd.nzones_cache - UMEM_CACHE_MIN_EVICTABLE_PAGES; + } else + zd.nzones_ne_max = zd.nzones_heap; + + zd.nzones_e_max = zd.nzones_heap - zd.nzones_ne_max; + + return zd; +} + +#if VG_MEMCHECK_ENABLED +void +heap_vg_zone_open(struct palloc_heap *heap, uint32_t zone_id, object_callback cb, void *args, + int objects) +{ + struct memory_block m = MEMORY_BLOCK_NONE; + uint32_t chunks; + struct chunk_header *hdr; + struct zone *z = ZID_TO_ZONE(&heap->layout_info, zone_id); + uint32_t c; + + m.zone_id = zone_id; + m.chunk_id = 0; + + VALGRIND_DO_MAKE_MEM_UNDEFINED(z, ZONE_MAX_SIZE); + + VALGRIND_DO_MAKE_MEM_DEFINED(&z->header, sizeof(z->header)); + + D_ASSERT(z->header.magic == ZONE_HEADER_MAGIC); + + chunks = z->header.size_idx; + + for (c = 0; c < chunks;) { + hdr = &z->chunk_headers[c]; + + /* define the header before rebuilding state */ + VALGRIND_DO_MAKE_MEM_DEFINED(hdr, sizeof(*hdr)); + + m.chunk_id = c; + m.size_idx = hdr->size_idx; + + memblock_rebuild_state(heap, &m); + + m.m_ops->vg_init(&m, objects, cb, args); + m.block_off = 0; + + ASSERT(hdr->size_idx > 0); + + c += hdr->size_idx; + } + + /* mark all unused chunk headers after last as not accessible */ + VALGRIND_DO_MAKE_MEM_NOACCESS(&z->chunk_headers[chunks], + (MAX_CHUNK - chunks) * sizeof(struct chunk_header)); +} + +/* + * heap_vg_open -- notifies Valgrind about heap layout + */ +void 
+heap_vg_open(struct palloc_heap *heap, object_callback cb, void *arg, int objects) +{ + unsigned zones = heap_max_zone(heap->size); + + ASSERTne(cb, NULL); + + for (unsigned i = 1; i < zones; ++i) { + if (!umem_cache_offisloaded(heap->layout_info.store, GET_ZONE_OFFSET(i))) + continue; + + if (heap_mbrt_ismb_evictable(heap, i)) + VALGRIND_DO_CREATE_MEMPOOL(ZID_TO_ZONE(&heap->layout_info, i), 0, 0); + + heap_vg_zone_open(heap, i, cb, arg, objects); + } +} +#endif diff --git a/src/common/dav_v2/heap.h b/src/common/dav_v2/heap.h new file mode 100644 index 00000000000..8ceeff9a5cd --- /dev/null +++ b/src/common/dav_v2/heap.h @@ -0,0 +1,154 @@ +/* SPDX-License-Identifier: BSD-3-Clause */ +/* Copyright 2015-2024, Intel Corporation */ + +/* + * heap.h -- internal definitions for heap + */ + +#ifndef __DAOS_COMMON_HEAP_H +#define __DAOS_COMMON_HEAP_H 1 + +#include +#include + +#include "memblock.h" +#include "bucket.h" +#include "memops.h" +#include "palloc.h" +#include "dav_internal.h" +#include + +#define HEAP_OFF_TO_PTR(heap, off) umem_cache_off2ptr(heap->layout_info.store, off) +#define HEAP_PTR_TO_OFF(heap, ptr) umem_cache_ptr2off(heap->layout_info.store, ptr) + +#define BIT_IS_CLR(a, i) (!((a) & (1ULL << (i)))) +#define HEAP_ARENA_PER_THREAD (0) + +struct mbrt; + +int +heap_boot(struct palloc_heap *heap, void *mmap_base, uint64_t heap_size, uint64_t cache_size, + struct mo_ops *p_ops, struct stats *stats); +int +heap_init(void *heap_start, uint64_t umem_cache_size, struct umem_store *store); +void +heap_cleanup(struct palloc_heap *heap); +int +heap_check(void *heap_start, uint64_t heap_size); +int +heap_get_max_nemb(struct palloc_heap *heap); +int +heap_create_alloc_class_buckets(struct palloc_heap *heap, struct alloc_class *c); +int +heap_mbrt_update_alloc_class_buckets(struct palloc_heap *heap, struct mbrt *mb, + struct alloc_class *c); +int +heap_extend(struct palloc_heap *heap, struct bucket *defb, size_t size); +void +heap_mbrt_setmb_evictable(struct palloc_heap *heap, struct mbrt *mb); +bool +heap_mbrt_ismb_initialized(struct palloc_heap *heap, uint32_t zone_id); +bool +heap_mbrt_ismb_evictable(struct palloc_heap *heap, uint32_t zone_id); +void +heap_mbrt_setmb_nonevictable(struct palloc_heap *heap, uint32_t zone_id); +void +heap_mbrt_setmb_usage(struct palloc_heap *heap, uint32_t zone_id, uint64_t usage); +int +heap_mbrt_getmb_usage(struct palloc_heap *heap, uint32_t zone_id, uint64_t *allotted, + uint64_t *maxsz); +void +heap_mbrt_incrmb_usage(struct palloc_heap *heap, uint32_t zone_id, int size); +struct mbrt * +heap_mbrt_setup_mb(struct palloc_heap *heap, uint32_t zone_id); +int +heap_mbrt_mb_reclaim_garbage(struct palloc_heap *heap, uint32_t zid); +int +heap_ensure_zone0_initialized(struct palloc_heap *heap); +int +heap_zone_load(struct palloc_heap *heap, uint32_t zid); +int +heap_load_nonevictable_zones(struct palloc_heap *heap); +int +heap_update_mbrt_zinfo(struct palloc_heap *heap, bool init); +size_t +heap_zinfo_get_size(uint32_t nzones); + +struct alloc_class * +heap_get_best_class(struct palloc_heap *heap, size_t size); +struct bucket * +mbrt_bucket_acquire(struct mbrt *mb, uint8_t class_id); +void +mbrt_bucket_release(struct bucket *b); +void +heap_set_root_ptrs(struct palloc_heap *heap, uint64_t **offp, uint64_t **sizep); +void +heap_set_stats_ptr(struct palloc_heap *heap, struct stats_persistent **sp); + +int +heap_get_bestfit_block(struct palloc_heap *heap, struct bucket *b, struct memory_block *m); +pthread_mutex_t * +heap_get_run_lock(struct palloc_heap *heap, 
uint32_t chunk_id); + +void +heap_discard_run(struct palloc_heap *heap, struct memory_block *m); + +void +heap_memblock_on_free(struct palloc_heap *heap, const struct memory_block *m); + +int +heap_free_chunk_reuse(struct palloc_heap *heap, struct bucket *bucket, struct memory_block *m); + +void +heap_foreach_object(struct palloc_heap *heap, object_callback cb, void *arg, + struct memory_block start); + +struct alloc_class_collection * +heap_alloc_classes(struct palloc_heap *heap); + +void +heap_vg_open(struct palloc_heap *heap, object_callback cb, void *arg, int objects); +void +heap_vg_zone_open(struct palloc_heap *heap, uint32_t zone_id, object_callback cb, void *arg, + int objects); + +static inline struct chunk_header * +heap_get_chunk_hdr(struct palloc_heap *heap, const struct memory_block *m) +{ + return GET_CHUNK_HDR(&heap->layout_info, m->zone_id, m->chunk_id); +} + +static inline struct chunk * +heap_get_chunk(struct palloc_heap *heap, const struct memory_block *m) +{ + return GET_CHUNK(&heap->layout_info, m->zone_id, m->chunk_id); +} + +static inline struct chunk_run * +heap_get_chunk_run(struct palloc_heap *heap, const struct memory_block *m) +{ + return GET_CHUNK_RUN(&heap->layout_info, m->zone_id, m->chunk_id); +} + +struct mbrt * +heap_mbrt_get_mb(struct palloc_heap *heap, uint32_t zone_id); + +void +heap_mbrt_log_alloc_failure(struct palloc_heap *heap, uint32_t zone_id); + +int +heap_get_evictable_mb(struct palloc_heap *heap, uint32_t *zone_id); + +struct heap_zone_limits { + unsigned nzones_heap; + unsigned nzones_cache; + unsigned nzones_ne_max; + unsigned nzones_e_max; +}; + +uint32_t +heap_off2mbid(struct palloc_heap *heap, uint64_t offset); + +struct heap_zone_limits +heap_get_zone_limits(uint64_t heap_size, uint64_t cache_size, uint32_t nemb_pct); +#endif /* __DAOS_COMMON_HEAP_H */ diff --git a/src/common/dav_v2/heap_layout.h b/src/common/dav_v2/heap_layout.h new file mode 100644 index 00000000000..fa65846921e --- /dev/null +++ b/src/common/dav_v2/heap_layout.h @@ -0,0 +1,229 @@ +/* SPDX-License-Identifier: BSD-3-Clause */ +/* Copyright 2015-2024, Intel Corporation */ + +/* + * heap_layout.h -- internal definitions for heap layout + */ + +#ifndef __DAOS_COMMON_HEAP_LAYOUT_H +#define __DAOS_COMMON_HEAP_LAYOUT_H 1 + +#include +#include +#include + +#define HEAP_MAJOR 1 +#define HEAP_MINOR 0 + +#define MAX_CHUNK 63 +#define CHUNK_BASE_ALIGNMENT 1024 +#define CHUNKSIZE ((size_t)1024 * 260) /* 260 kilobytes */ +#define MAX_MEMORY_BLOCK_SIZE (MAX_CHUNK * CHUNKSIZE) +#define HEAP_SIGNATURE_LEN 16 +#define HEAP_SIGNATURE "MEMORY_HEAP_HDR\0" +#define ZONE_HEADER_MAGIC 0xC3F0A2D2 +#define ZONE_MIN_SIZE (sizeof(struct zone) + sizeof(struct chunk)) +#define ZONE_MAX_SIZE (sizeof(struct zone) + sizeof(struct chunk) * MAX_CHUNK) +#define HEAP_MIN_SIZE (sizeof(struct heap_header) + ZONE_MIN_SIZE) + +/* Base bitmap values, relevant for both normal and flexible bitmaps */ +#define RUN_BITS_PER_VALUE 64U +#define RUN_BASE_METADATA_VALUES\ + ((unsigned)(sizeof(struct chunk_run_header) / sizeof(uint64_t))) +#define RUN_BASE_METADATA_SIZE (sizeof(struct chunk_run_header)) + +#define RUN_CONTENT_SIZE (CHUNKSIZE - RUN_BASE_METADATA_SIZE) + +/* + * Calculates the size in bytes of a single run instance, including bitmap + */ +#define RUN_CONTENT_SIZE_BYTES(size_idx)\ +(RUN_CONTENT_SIZE + (((size_idx) - 1) * CHUNKSIZE)) + +/* Default bitmap values, specific for old, non-flexible, bitmaps */ +#define RUN_DEFAULT_METADATA_VALUES 40 /* in 8 byte words, 320 bytes total */ +#define 
RUN_DEFAULT_BITMAP_VALUES \ + (RUN_DEFAULT_METADATA_VALUES - RUN_BASE_METADATA_VALUES) +#define RUN_DEFAULT_BITMAP_SIZE (sizeof(uint64_t) * RUN_DEFAULT_BITMAP_VALUES) +#define RUN_DEFAULT_BITMAP_NBITS\ + (RUN_BITS_PER_VALUE * RUN_DEFAULT_BITMAP_VALUES) +#define RUN_DEFAULT_SIZE \ + (CHUNKSIZE - RUN_BASE_METADATA_SIZE - RUN_DEFAULT_BITMAP_SIZE) + +/* + * Calculates the size in bytes of a single run instance, without bitmap, + * but only for the default fixed-bitmap algorithm + */ +#define RUN_DEFAULT_SIZE_BYTES(size_idx)\ +(RUN_DEFAULT_SIZE + (((size_idx) - 1) * CHUNKSIZE)) + +enum chunk_flags { + CHUNK_FLAG_COMPACT_HEADER = 0x0001, + CHUNK_FLAG_HEADER_NONE = 0x0002, + CHUNK_FLAG_ALIGNED = 0x0004, + CHUNK_FLAG_FLEX_BITMAP = 0x0008, +}; + +#define CHUNK_FLAGS_ALL_VALID (\ + CHUNK_FLAG_COMPACT_HEADER |\ + CHUNK_FLAG_HEADER_NONE |\ + CHUNK_FLAG_ALIGNED |\ + CHUNK_FLAG_FLEX_BITMAP\ +) + +enum chunk_type { + CHUNK_TYPE_UNKNOWN, + CHUNK_TYPE_FOOTER, /* not actual chunk type */ + CHUNK_TYPE_FREE, + CHUNK_TYPE_USED, + CHUNK_TYPE_RUN, + CHUNK_TYPE_RUN_DATA, + + MAX_CHUNK_TYPE +}; + +/* zone header flags */ +#define ZONE_EVICTABLE_MB 0x0001 + +struct chunk { + uint8_t data[CHUNKSIZE]; +}; + +struct chunk_run_header { + uint64_t block_size; + uint64_t alignment; /* valid only /w CHUNK_FLAG_ALIGNED */ +}; + +struct chunk_run { + struct chunk_run_header hdr; + uint8_t content[RUN_CONTENT_SIZE]; /* bitmap + data */ +}; + +struct chunk_header { + uint16_t type; + uint16_t flags; + uint32_t size_idx; +}; + +struct zone_header { + uint32_t magic; + uint32_t size_idx; + uint32_t flags; + uint32_t spare1; + uint64_t zone0_zinfo_size; + uint64_t zone0_zinfo_off; + uint64_t reserved[2]; + uint64_t sp_usage; + uint64_t sp_usage_glob; + uint8_t spare[3528]; +}; + +struct zone { + struct zone_header header; + struct chunk_header chunk_headers[MAX_CHUNK]; + struct chunk chunks[]; +}; + +struct heap_header { + char signature[HEAP_SIGNATURE_LEN]; + uint64_t major; + uint64_t minor; + uint64_t heap_size; + uint64_t cache_size; + uint64_t heap_hdr_size; + uint64_t chunksize; + uint64_t chunks_per_zone; + uint8_t nemb_pct; + uint8_t reserved[4015]; + uint64_t checksum; +}; + +struct heap_layout_info { + struct heap_header header; + struct zone *zone0; /* Address of the zone0 in umem_cache */ + struct umem_store *store; +}; + +#define ALLOC_HDR_SIZE_SHIFT (48ULL) +#define ALLOC_HDR_FLAGS_MASK (((1ULL) << ALLOC_HDR_SIZE_SHIFT) - 1) + +struct allocation_header_legacy { + uint8_t unused[8]; + uint64_t size; + uint8_t unused2[32]; + uint64_t root_size; + uint64_t type_num; +}; + +#define ALLOC_HDR_COMPACT_SIZE sizeof(struct allocation_header_compact) + +struct allocation_header_compact { + uint64_t size; + uint64_t extra; +}; + +enum header_type { + HEADER_LEGACY, + HEADER_COMPACT, + HEADER_NONE, + + MAX_HEADER_TYPES +}; + +static const size_t header_type_to_size[MAX_HEADER_TYPES] = { + sizeof(struct allocation_header_legacy), + sizeof(struct allocation_header_compact), + 0 +}; + +static const enum chunk_flags header_type_to_flag[MAX_HEADER_TYPES] = { + (enum chunk_flags)0, + CHUNK_FLAG_COMPACT_HEADER, + CHUNK_FLAG_HEADER_NONE +}; + +static inline struct zone * +ZID_TO_ZONE(struct heap_layout_info *layout_info, size_t zone_id) +{ + uint64_t zoff = sizeof(struct heap_header) + ZONE_MAX_SIZE * zone_id; + + return umem_cache_off2ptr(layout_info->store, zoff); +} + +static inline struct chunk_header * +GET_CHUNK_HDR(struct heap_layout_info *layout_info, size_t zone_id, unsigned chunk_id) +{ + return &ZID_TO_ZONE(layout_info, 
zone_id)->chunk_headers[chunk_id]; +} + +static inline struct chunk * +GET_CHUNK(struct heap_layout_info *layout_info, size_t zone_id, unsigned chunk_id) +{ + return &ZID_TO_ZONE(layout_info, zone_id)->chunks[chunk_id]; +} + +static inline struct chunk_run * +GET_CHUNK_RUN(struct heap_layout_info *layout_info, size_t zone_id, unsigned chunk_id) +{ + return (struct chunk_run *)GET_CHUNK(layout_info, zone_id, chunk_id); +} + +static inline uint64_t +GET_ZONE_OFFSET(uint32_t zid) +{ + return sizeof(struct heap_header) + ZONE_MAX_SIZE * zid; +} + +static inline bool +IS_ZONE_HDR_OFFSET(uint64_t off) +{ + return (((off - sizeof(struct heap_header)) % ZONE_MAX_SIZE) == 0); +} + +static inline uint32_t +OFFSET_TO_ZID(uint64_t off) +{ + return (off - sizeof(struct heap_header)) / ZONE_MAX_SIZE; +} + +#endif /* __DAOS_COMMON_HEAP_LAYOUT_H */ diff --git a/src/common/dav_v2/memblock.c b/src/common/dav_v2/memblock.c new file mode 100644 index 00000000000..d66682d5f5a --- /dev/null +++ b/src/common/dav_v2/memblock.c @@ -0,0 +1,1615 @@ +/* SPDX-License-Identifier: BSD-3-Clause */ +/* Copyright 2016-2024, Intel Corporation */ + +/* + * memblock.c -- implementation of memory block + * + * Memory block is a representation of persistent object that resides in the + * heap. A valid memory block must be either a huge (free or used) chunk or a + * block inside a run. + * + * Huge blocks are 1:1 correlated with the chunk headers in the zone whereas + * run blocks are represented by bits in corresponding chunk bitmap. + * + * This file contains implementations of abstract operations on memory blocks. + * Instead of storing the mbops structure inside each memory block the correct + * method implementation is chosen at runtime. + */ + +#include + +#include "obj.h" +#include "heap.h" +#include "memblock.h" +#include "out.h" +#include "valgrind_internal.h" +#include "alloc_class.h" + +/* calculates the size of the entire run, including any additional chunks */ +#define SIZEOF_RUN(runp, size_idx)\ + (sizeof(*(runp)) + (((size_idx) - 1) * CHUNKSIZE)) + +/* + * memblock_header_type -- determines the memory block's header type + */ +static enum header_type +memblock_header_type(const struct memory_block *m) +{ + struct chunk_header *hdr = heap_get_chunk_hdr(m->heap, m); + + if (hdr->flags & CHUNK_FLAG_COMPACT_HEADER) + return HEADER_COMPACT; + + if (hdr->flags & CHUNK_FLAG_HEADER_NONE) + return HEADER_NONE; + + return HEADER_LEGACY; +} + +/* + * memblock_header_legacy_get_size -- + * (internal) returns the size stored in a legacy header + */ +static size_t +memblock_header_legacy_get_size(const struct memory_block *m) +{ + struct allocation_header_legacy *hdr = m->m_ops->get_real_data(m); + + return hdr->size; +} + +/* + * memblock_header_compact_get_size -- + * (internal) returns the size stored in a compact header + */ +static size_t +memblock_header_compact_get_size(const struct memory_block *m) +{ + struct allocation_header_compact *hdr = m->m_ops->get_real_data(m); + + return hdr->size & ALLOC_HDR_FLAGS_MASK; +} + +/* + * memblock_header_none_get_size -- + * (internal) determines the sizes of an object without a header + */ +static size_t +memblock_header_none_get_size(const struct memory_block *m) +{ + return m->m_ops->block_size(m); +} + +/* + * memblock_header_legacy_get_extra -- + * (internal) returns the extra field stored in a legacy header + */ +static uint64_t +memblock_header_legacy_get_extra(const struct memory_block *m) +{ + struct allocation_header_legacy *hdr = m->m_ops->get_real_data(m); + + return 
hdr->type_num; +} + +/* + * memblock_header_compact_get_extra -- + * (internal) returns the extra field stored in a compact header + */ +static uint64_t +memblock_header_compact_get_extra(const struct memory_block *m) +{ + struct allocation_header_compact *hdr = m->m_ops->get_real_data(m); + + return hdr->extra; +} + +/* + * memblock_header_none_get_extra -- + * (internal) objects without a header don't have an extra field + */ +static uint64_t +memblock_header_none_get_extra(const struct memory_block *m) +{ + /* suppress unused-parameter errors */ + SUPPRESS_UNUSED(m); + + return 0; +} + +/* + * memblock_header_legacy_get_flags -- + * (internal) returns the flags stored in a legacy header + */ +static uint16_t +memblock_header_legacy_get_flags(const struct memory_block *m) +{ + struct allocation_header_legacy *hdr = m->m_ops->get_real_data(m); + + return (uint16_t)(hdr->root_size >> ALLOC_HDR_SIZE_SHIFT); +} + +/* + * memblock_header_compact_get_flags -- + * (internal) returns the flags stored in a compact header + */ +static uint16_t +memblock_header_compact_get_flags(const struct memory_block *m) +{ + struct allocation_header_compact *hdr = m->m_ops->get_real_data(m); + + return (uint16_t)(hdr->size >> ALLOC_HDR_SIZE_SHIFT); +} + +/* + * memblock_header_none_get_flags -- + * (internal) objects without a header do not support flags + */ +static uint16_t +memblock_header_none_get_flags(const struct memory_block *m) +{ + /* suppress unused-parameter errors */ + SUPPRESS_UNUSED(m); + + return 0; +} + +/* + * memblock_header_legacy_write -- + * (internal) writes a legacy header of an object + */ +static void +memblock_header_legacy_write(const struct memory_block *m, + size_t size, uint64_t extra, uint16_t flags) +{ + struct allocation_header_legacy hdr; + + hdr.size = size; + hdr.type_num = extra; + hdr.root_size = ((uint64_t)flags << ALLOC_HDR_SIZE_SHIFT); + + struct allocation_header_legacy *hdrp = m->m_ops->get_real_data(m); + + VALGRIND_DO_MAKE_MEM_UNDEFINED(hdrp, sizeof(*hdrp)); + + VALGRIND_ADD_TO_TX(hdrp, sizeof(*hdrp)); + memcpy(hdrp, &hdr, sizeof(hdr)); /* legacy header is 64 bytes in size */ + VALGRIND_REMOVE_FROM_TX(hdrp, sizeof(*hdrp)); + + /* unused fields of the legacy headers are used as a red zone */ + VALGRIND_DO_MAKE_MEM_NOACCESS(hdrp->unused, sizeof(hdrp->unused)); +} + +/* + * memblock_header_compact_write -- + * (internal) writes a compact header of an object + */ +static void +memblock_header_compact_write(const struct memory_block *m, + size_t size, uint64_t extra, uint16_t flags) +{ + COMPILE_ERROR_ON(ALLOC_HDR_COMPACT_SIZE > CACHELINE_SIZE); + + struct { + struct allocation_header_compact hdr; + uint8_t padding[CACHELINE_SIZE - ALLOC_HDR_COMPACT_SIZE]; + } padded; + + /* + * REVISIT: + * Below memset is added to prevent valgrind propagating the + * cleared V-Bits of the padding field all the way till DMA buffer + * as part of logging by WAL. + * This code needs to be revisited when valgrind macros are + * enabled within DAV. + */ + padded.hdr.size = size | ((uint64_t)flags << ALLOC_HDR_SIZE_SHIFT); + padded.hdr.extra = extra; + + struct allocation_header_compact *hdrp = m->m_ops->get_real_data(m); + + VALGRIND_DO_MAKE_MEM_UNDEFINED(hdrp, sizeof(*hdrp)); + + /* + * If possible write the entire header with a single memcpy, this allows + * the copy implementation to avoid a cache miss on a partial cache line + * write. 
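+	 *
+	 * When the header is cacheline aligned and the allocation is large
+	 * enough to hold the padded struct, a full cacheline is copied; the
+	 * bytes past the real header are then re-marked as undefined for
+	 * valgrind.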
+ */ + size_t hdr_size = ALLOC_HDR_COMPACT_SIZE; + + if ((uintptr_t)hdrp % CACHELINE_SIZE == 0 && size >= sizeof(padded)) + hdr_size = sizeof(padded); + + VALGRIND_ADD_TO_TX(hdrp, hdr_size); + + memcpy(hdrp, &padded, hdr_size); + VALGRIND_DO_MAKE_MEM_UNDEFINED((char *)hdrp + ALLOC_HDR_COMPACT_SIZE, + hdr_size - ALLOC_HDR_COMPACT_SIZE); + + VALGRIND_REMOVE_FROM_TX(hdrp, hdr_size); +} + +/* + * memblock_header_none_write -- + * (internal) nothing to write + */ +static void +memblock_header_none_write(const struct memory_block *m, + size_t size, uint64_t extra, uint16_t flags) +{ + /* suppress unused-parameter errors */ + SUPPRESS_UNUSED(m, size, extra, flags); + + /* NOP */ +} + +/* + * memblock_header_legacy_invalidate -- + * (internal) invalidates a legacy header + */ +static void +memblock_header_legacy_invalidate(const struct memory_block *m) +{ + struct allocation_header_legacy *hdr = m->m_ops->get_real_data(m); + + VALGRIND_SET_CLEAN(hdr, sizeof(*hdr)); +} + +/* + * memblock_header_compact_invalidate -- + * (internal) invalidates a compact header + */ +static void +memblock_header_compact_invalidate(const struct memory_block *m) +{ + struct allocation_header_compact *hdr = m->m_ops->get_real_data(m); + + VALGRIND_SET_CLEAN(hdr, sizeof(*hdr)); +} + +/* + * memblock_no_header_invalidate -- + * (internal) nothing to invalidate + */ +static void +memblock_header_none_invalidate(const struct memory_block *m) +{ + /* suppress unused-parameter errors */ + SUPPRESS_UNUSED(m); + + /* NOP */ +} + +/* + * memblock_header_legacy_reinit -- + * (internal) reinitializes a legacy header after a heap restart + */ +static void +memblock_header_legacy_reinit(const struct memory_block *m) +{ + struct allocation_header_legacy *hdr = m->m_ops->get_real_data(m); + + VALGRIND_DO_MAKE_MEM_DEFINED(hdr, sizeof(*hdr)); + + /* unused fields of the legacy headers are used as a red zone */ + VALGRIND_DO_MAKE_MEM_NOACCESS(hdr->unused, sizeof(hdr->unused)); +} + +/* + * memblock_header_compact_reinit -- + * (internal) reinitializes a compact header after a heap restart + */ +static void +memblock_header_compact_reinit(const struct memory_block *m) +{ + struct allocation_header_compact *hdr = m->m_ops->get_real_data(m); + + VALGRIND_DO_MAKE_MEM_DEFINED(hdr, sizeof(*hdr)); +} + +/* + * memblock_header_none_reinit -- + * (internal) nothing to reinitialize + */ +static void +memblock_header_none_reinit(const struct memory_block *m) +{ + /* suppress unused-parameter errors */ + SUPPRESS_UNUSED(m); + + /* NOP */ +} + +static const struct { + /* determines the sizes of an object */ + size_t (*get_size)(const struct memory_block *m); + + /* returns the extra field (if available, 0 if not) */ + uint64_t (*get_extra)(const struct memory_block *m); + + /* returns the flags stored in a header (if available, 0 if not) */ + uint16_t (*get_flags)(const struct memory_block *m); + + /* + * Stores size, extra info and flags in header of an object + * (if available, does nothing otherwise). + */ + void (*write)(const struct memory_block *m, + size_t size, uint64_t extra, uint16_t flags); + void (*invalidate)(const struct memory_block *m); + + /* + * Reinitializes a header after a heap restart (if available, does + * nothing otherwise) (VG). 
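+	 *
+	 * The table below is indexed by enum header_type; callers select the
+	 * implementation at runtime via memblock_header_ops[m->header_type]
+	 * instead of storing the ops in each memory block.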
+ */ + void (*reinit)(const struct memory_block *m); +} memblock_header_ops[MAX_HEADER_TYPES] = { + [HEADER_LEGACY] = { + memblock_header_legacy_get_size, + memblock_header_legacy_get_extra, + memblock_header_legacy_get_flags, + memblock_header_legacy_write, + memblock_header_legacy_invalidate, + memblock_header_legacy_reinit, + }, + [HEADER_COMPACT] = { + memblock_header_compact_get_size, + memblock_header_compact_get_extra, + memblock_header_compact_get_flags, + memblock_header_compact_write, + memblock_header_compact_invalidate, + memblock_header_compact_reinit, + }, + [HEADER_NONE] = { + memblock_header_none_get_size, + memblock_header_none_get_extra, + memblock_header_none_get_flags, + memblock_header_none_write, + memblock_header_none_invalidate, + memblock_header_none_reinit, + } +}; + +/* + * memblock_run_default_nallocs -- returns the number of memory blocks + * available in the in a run with given parameters using the default + * fixed-bitmap algorithm + */ +static unsigned +memblock_run_default_nallocs(uint32_t *size_idx, uint16_t flags, + uint64_t unit_size, uint64_t alignment) +{ + /* suppress unused-parameter errors */ + SUPPRESS_UNUSED(flags); + + unsigned nallocs = (unsigned) + (RUN_DEFAULT_SIZE_BYTES(*size_idx) / unit_size); + + while (nallocs > RUN_DEFAULT_BITMAP_NBITS) { + /* trying to create a run with number of units exceeding the bitmap size */ + DAV_DBG("run:%lu number of units %u exceeds bitmap size (%u)", + unit_size, nallocs, RUN_DEFAULT_BITMAP_NBITS); + if (*size_idx > 1) { + *size_idx -= 1; + /* recalculate the number of allocations */ + nallocs = (uint32_t) + (RUN_DEFAULT_SIZE_BYTES(*size_idx) / unit_size); + /* run was constructed with fewer chunks (minus one) */ + D_INFO("run:%lu constructed with fewer chunks:%u\n", + unit_size, *size_idx); + } else { + /* + * run was constructed with fewer units than optimal, + * this might lead to inefficient memory utilization! + */ + D_INFO("run:%lu constructed with fewer units:%u than optimal:%u\n", + unit_size, RUN_DEFAULT_BITMAP_NBITS, nallocs); + + nallocs = RUN_DEFAULT_BITMAP_NBITS; + } + } + + return nallocs - (alignment ? 1 : 0); +} + +/* + * memblock_run_bitmap -- calculate bitmap parameters for given arguments + */ +void +memblock_run_bitmap(uint32_t *size_idx, uint16_t flags, + uint64_t unit_size, uint64_t alignment, void *content, + struct run_bitmap *b) +{ + ASSERTne(*size_idx, 0); + + /* + * Flexible bitmaps have a variably sized values array. The size varies + * depending on: + * alignment - initial run alignment might require up-to a unit + * size idx - the larger the run, the more units it carries + * unit_size - the smaller the unit size, the more units per run + * + * The size of the bitmap also has to be calculated in such a way that + * the beginning of allocations data is cacheline aligned. This is + * required to perform many optimizations throughout the codebase. + * This alignment requirement means that some of the bitmap values might + * remain unused and will serve only as a padding for data. + */ + if (flags & CHUNK_FLAG_FLEX_BITMAP) { + /* + * First calculate the number of values without accounting for + * the bitmap size. + */ + size_t content_size = RUN_CONTENT_SIZE_BYTES(*size_idx); + + b->nbits = (unsigned)(content_size / unit_size); + b->nvalues = util_div_ceil(b->nbits, RUN_BITS_PER_VALUE); + + /* + * Then, align the number of values up, so that the cacheline + * alignment is preserved. 
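+		 *
+		 * For example, assuming a 64-byte cacheline (8 values per
+		 * line): a run that needs 5 bitmap values carries 5 + 2
+		 * metadata values, which rounds up to 8, so 6 values are
+		 * reserved for the bitmap and the allocation data starts on
+		 * a cacheline boundary.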
+ */ + b->nvalues = ALIGN_UP(b->nvalues + RUN_BASE_METADATA_VALUES, + (unsigned)(CACHELINE_SIZE / sizeof(*b->values))) + - RUN_BASE_METADATA_VALUES; + + /* + * This is the total number of bytes needed for the bitmap AND + * padding. + */ + b->size = b->nvalues * sizeof(*b->values); + + /* + * Calculate the number of allocations again, but this time + * accounting for the bitmap/padding. + */ + b->nbits = (unsigned)((content_size - b->size) / unit_size) + - (alignment ? 1U : 0U); + + /* + * The last step is to calculate how much of the padding + * is left at the end of the bitmap. + */ + unsigned unused_bits = (b->nvalues * RUN_BITS_PER_VALUE) + - b->nbits; + unsigned unused_values = unused_bits / RUN_BITS_PER_VALUE; + + b->nvalues -= unused_values; + b->values = (uint64_t *)content; + + return; + } + + b->size = RUN_DEFAULT_BITMAP_SIZE; + b->nbits = memblock_run_default_nallocs(size_idx, flags, + unit_size, alignment); + + unsigned unused_bits = RUN_DEFAULT_BITMAP_NBITS - b->nbits; + unsigned unused_values = unused_bits / RUN_BITS_PER_VALUE; + + b->nvalues = RUN_DEFAULT_BITMAP_VALUES - unused_values; + b->values = (uint64_t *)content; +} + +/* + * run_get_bitmap -- initializes run bitmap information + */ +static void +run_get_bitmap(const struct memory_block *m, struct run_bitmap *b) +{ + struct chunk_run *run = heap_get_chunk_run(m->heap, m); + + if (m->cached_bitmap != NULL) { + *b = *m->cached_bitmap; + b->values = (uint64_t *)run->content; + } else { + struct chunk_header *hdr = heap_get_chunk_hdr(m->heap, m); + uint32_t size_idx = hdr->size_idx; + + memblock_run_bitmap(&size_idx, hdr->flags, run->hdr.block_size, + run->hdr.alignment, run->content, b); + ASSERTeq(size_idx, hdr->size_idx); + } +} + +/* + * huge_block_size -- returns the compile-time constant which defines the + * huge memory block size. + */ +static size_t +huge_block_size(const struct memory_block *m) +{ + /* suppress unused-parameter errors */ + SUPPRESS_UNUSED(m); + + return CHUNKSIZE; +} + +/* + * run_block_size -- looks for the right chunk and returns the block size + * information that is attached to the run block metadata. + */ +static size_t +run_block_size(const struct memory_block *m) +{ + struct chunk_run *run = heap_get_chunk_run(m->heap, m); + + return run->hdr.block_size; +} + +/* + * huge_get_real_data -- returns pointer to the beginning data of a huge block + */ +static void * +huge_get_real_data(const struct memory_block *m) +{ + return heap_get_chunk(m->heap, m)->data; +} + +/* + * run_get_data_start -- (internal) returns the pointer to the beginning of + * allocations in a run + */ +static char * +run_get_data_start(const struct memory_block *m) +{ + struct chunk_header *hdr = heap_get_chunk_hdr(m->heap, m); + struct chunk_run *run = heap_get_chunk_run(m->heap, m); + struct run_bitmap b; + + run_get_bitmap(m, &b); + + if (hdr->flags & CHUNK_FLAG_ALIGNED) { + /* + * Alignment is property of user data in allocations. And + * since objects have headers, we need to take them into + * account when calculating the address. 
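+		 *
+		 * The base (content + bitmap size + header size) is rounded up
+		 * to the run's alignment and the header size is subtracted
+		 * again, so the user pointer (base + header) lands exactly on
+		 * the requested alignment boundary.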
+ */ + uintptr_t hsize = header_type_to_size[m->header_type]; + uintptr_t base = (uintptr_t)run->content + + b.size + hsize; + return (char *)(ALIGN_UP(base, run->hdr.alignment) - hsize); + } else { + return (char *)&run->content + b.size; + } +} + +/* + * run_get_data_offset -- (internal) returns the number of bytes between + * run base metadata and data + */ +static size_t +run_get_data_offset(const struct memory_block *m) +{ + struct chunk_run *run = heap_get_chunk_run(m->heap, m); + + return (size_t)run_get_data_start(m) - (size_t)&run->content; +} + +/* + * run_get_real_data -- returns pointer to the beginning data of a run block + */ +static void * +run_get_real_data(const struct memory_block *m) +{ + struct chunk_run *run = heap_get_chunk_run(m->heap, m); + + ASSERT(run->hdr.block_size != 0); + + return run_get_data_start(m) + (run->hdr.block_size * m->block_off); +} + +/* + * block_get_user_data -- returns pointer to the data of a block + */ +static void * +block_get_user_data(const struct memory_block *m) +{ + return (char *)m->m_ops->get_real_data(m) + + header_type_to_size[m->header_type]; +} + +/* + * chunk_get_chunk_hdr_value -- (internal) get value of a header for redo log + */ +static uint64_t +chunk_get_chunk_hdr_value(uint16_t type, uint16_t flags, uint32_t size_idx) +{ + uint64_t val; + struct chunk_header hdr; + + COMPILE_ERROR_ON(sizeof(struct chunk_header) != sizeof(uint64_t)); + + hdr.type = type; + hdr.flags = flags; + hdr.size_idx = size_idx; + memcpy(&val, &hdr, sizeof(val)); + + return val; +} + +/* + * huge_prep_operation_hdr -- prepares the new value of a chunk header that will + * be set after the operation concludes. + */ +static void +huge_prep_operation_hdr(const struct memory_block *m, enum memblock_state op, + struct operation_context *ctx) +{ + struct chunk_header *hdr = heap_get_chunk_hdr(m->heap, m); + + /* + * Depending on the operation that needs to be performed a new chunk + * header needs to be prepared with the new chunk state. + */ + uint64_t val = chunk_get_chunk_hdr_value( + op == MEMBLOCK_ALLOCATED ? CHUNK_TYPE_USED : CHUNK_TYPE_FREE, + hdr->flags, + m->size_idx); + + if (ctx == NULL) { + util_atomic_store_explicit64((uint64_t *)hdr, val, + memory_order_relaxed); + mo_wal_persist(&m->heap->p_ops, hdr, sizeof(*hdr)); + } else { + operation_add_entry(ctx, hdr, val, ULOG_OPERATION_SET); + } + + VALGRIND_DO_MAKE_MEM_NOACCESS(hdr + 1, + (hdr->size_idx - 1) * sizeof(struct chunk_header)); + + /* + * In the case of chunks larger than one unit the footer must be + * created immediately AFTER the persistent state is safely updated. + */ + if (m->size_idx == 1) + return; + + struct chunk_header *footer = hdr + m->size_idx - 1; + + VALGRIND_DO_MAKE_MEM_UNDEFINED(footer, sizeof(*footer)); + + val = chunk_get_chunk_hdr_value(CHUNK_TYPE_FOOTER, 0, m->size_idx); + + /* + * It's only safe to write the footer AFTER the persistent part of + * the operation have been successfully processed because the footer + * pointer might point to a currently valid persistent state + * of a different chunk. + * The footer entry change is updated as transient because it will + * be recreated at heap boot regardless - it's just needed for runtime + * operations. 
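+	 *
+	 * With no operation context the footer is stored directly with a
+	 * relaxed atomic store; inside a transaction it is queued as a
+	 * LOG_TRANSIENT typed entry rather than a persistent one.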
+ */ + if (ctx == NULL) { + util_atomic_store_explicit64((uint64_t *)footer, val, + memory_order_relaxed); + VALGRIND_SET_CLEAN(footer, sizeof(*footer)); + } else { + operation_add_typed_entry(ctx, + footer, val, ULOG_OPERATION_SET, LOG_TRANSIENT); + } +} + +/* + * run_prep_operation_hdr -- prepares the new value for a select few bytes of + * a run bitmap that will be set after the operation concludes. + * + * It's VERY important to keep in mind that the particular value of the + * bitmap this method is modifying must not be changed after this function + * is called and before the operation is processed. + */ +static void +run_prep_operation_hdr(const struct memory_block *m, enum memblock_state op, + struct operation_context *ctx) +{ + ASSERT(m->size_idx <= RUN_BITS_PER_VALUE); + ASSERT(m->size_idx > 0); + + /* + * Free blocks are represented by clear bits and used blocks by set + * bits - which is the reverse of the commonly used scheme. + * + * Here a bit mask is prepared that flips the bits that represent the + * memory block provided by the caller - because both the size index and + * the block offset are tied 1:1 to the bitmap this operation is + * relatively simple. + */ + uint64_t bmask; + +#ifdef WAL_SUPPORTS_AND_OR_OPS + if (m->size_idx == RUN_BITS_PER_VALUE) { + ASSERTeq(m->block_off % RUN_BITS_PER_VALUE, 0); + bmask = UINT64_MAX; + } else { + bmask = ((1ULL << m->size_idx) - 1ULL) << + (m->block_off % RUN_BITS_PER_VALUE); + } +#else + uint16_t num = m->size_idx; + uint32_t pos = m->block_off % RUN_BITS_PER_VALUE; + + ASSERT_rt(num > 0 && num <= RUN_BITS_PER_VALUE); + bmask = ULOG_ENTRY_TO_VAL(pos, num); +#endif + + /* + * The run bitmap is composed of several 8 byte values, so a proper + * element of the bitmap array must be selected. + */ + unsigned bpos = m->block_off / RUN_BITS_PER_VALUE; + struct run_bitmap b; + + run_get_bitmap(m, &b); + + /* the bit mask is applied immediately by the add entry operations */ + if (op == MEMBLOCK_ALLOCATED) { +#ifdef WAL_SUPPORTS_AND_OR_OPS + operation_add_entry(ctx, &b.values[bpos], + bmask, ULOG_OPERATION_OR); +#else + operation_add_entry(ctx, &b.values[bpos], + bmask, ULOG_OPERATION_SET_BITS); +#endif + } else if (op == MEMBLOCK_FREE) { +#ifdef WAL_SUPPORTS_AND_OR_OPS + operation_add_entry(ctx, &b.values[bpos], + ~bmask, ULOG_OPERATION_AND); +#else + operation_add_entry(ctx, &b.values[bpos], + bmask, ULOG_OPERATION_CLR_BITS); +#endif + } else { + ASSERT(0); + } +} + +/* + * huge_get_lock -- because huge memory blocks are always allocated from a + * single bucket there's no reason to lock them - the bucket itself is + * protected. + */ +static pthread_mutex_t * +huge_get_lock(const struct memory_block *m) +{ + /* suppress unused-parameter errors */ + SUPPRESS_UNUSED(m); + + return NULL; +} + +/* + * run_get_lock -- gets the runtime mutex from the heap. 
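+ *
+ * The mutex comes from the heap's run_locks pool (sized at boot to
+ * MAX_RUN_LOCKS, or MAX_RUN_LOCKS_VG under valgrind) and is selected by
+ * chunk id, so unrelated runs may end up sharing a lock.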
+ */ +static pthread_mutex_t * +run_get_lock(const struct memory_block *m) +{ + return heap_get_run_lock(m->heap, m->chunk_id); +} + +/* + * huge_get_state -- returns whether a huge block is allocated or not + */ +static enum memblock_state +huge_get_state(const struct memory_block *m) +{ + struct chunk_header *hdr = heap_get_chunk_hdr(m->heap, m); + + if (hdr->type == CHUNK_TYPE_USED) + return MEMBLOCK_ALLOCATED; + + if (hdr->type == CHUNK_TYPE_FREE) + return MEMBLOCK_FREE; + + return MEMBLOCK_STATE_UNKNOWN; +} + +/* + * huge_get_state -- returns whether a block from a run is allocated or not + */ +static enum memblock_state +run_get_state(const struct memory_block *m) +{ + struct run_bitmap b; + + run_get_bitmap(m, &b); + + unsigned v = m->block_off / RUN_BITS_PER_VALUE; + uint64_t bitmap = b.values[v]; + unsigned bit = m->block_off % RUN_BITS_PER_VALUE; + + unsigned bit_last = bit + m->size_idx; + + ASSERT(bit_last <= RUN_BITS_PER_VALUE); + + for (unsigned i = bit; i < bit_last; ++i) { + if (!BIT_IS_CLR(bitmap, i)) + return MEMBLOCK_ALLOCATED; + } + + return MEMBLOCK_FREE; +} + +/* + * huge_ensure_header_type -- checks the header type of a chunk and modifies + * it if necessary. This is fail-safe atomic. + */ +static void +huge_ensure_header_type(const struct memory_block *m, + enum header_type t) +{ + struct chunk_header *hdr = heap_get_chunk_hdr(m->heap, m); + + ASSERTeq(hdr->type, CHUNK_TYPE_FREE); + + if ((hdr->flags & header_type_to_flag[t]) == 0) { + VALGRIND_ADD_TO_TX(hdr, sizeof(*hdr)); + uint16_t f = ((uint16_t)header_type_to_flag[t]); + uint64_t nhdr = chunk_get_chunk_hdr_value(hdr->type, + hdr->flags | f, hdr->size_idx); + util_atomic_store_explicit64((uint64_t *)hdr, + nhdr, memory_order_relaxed); + mo_wal_persist(&m->heap->p_ops, hdr, sizeof(*hdr)); + VALGRIND_REMOVE_FROM_TX(hdr, sizeof(*hdr)); + } +} + +/* + * run_ensure_header_type -- runs must be created with appropriate header type. + */ +static void +run_ensure_header_type(const struct memory_block *m, + enum header_type t) +{ + /* suppress unused-parameter errors */ + SUPPRESS_UNUSED(m, t); + +#ifdef DAV_EXTRA_DEBUG + struct chunk_header *hdr = heap_get_chunk_hdr(m->heap, m); + + ASSERTeq(hdr->type, CHUNK_TYPE_RUN); + ASSERT((hdr->flags & header_type_to_flag[t]) == header_type_to_flag[t]); +#endif +} + +/* + * block_get_real_size -- returns the size of a memory block that includes all + * of the overhead (headers) + */ +static size_t +block_get_real_size(const struct memory_block *m) +{ + /* + * There are two valid ways to get a size. If the memory block + * initialized properly and the size index is set, the chunk unit size + * can be simply multiplied by that index, otherwise we need to look at + * the allocation header. + */ + if (m->size_idx != 0) + return m->m_ops->block_size(m) * m->size_idx; + else + return memblock_header_ops[m->header_type].get_size(m); +} + +/* + * block_get_user_size -- returns the size of a memory block without overheads, + * this is the size of a data block that can be used. 
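+ *
+ * In other words, the real size minus the allocation header size for the
+ * block's header type (zero for HEADER_NONE).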
+ */ +static size_t +block_get_user_size(const struct memory_block *m) +{ + return block_get_real_size(m) - header_type_to_size[m->header_type]; +} + +/* + * block_write_header -- writes a header of an allocation + */ +static void +block_write_header(const struct memory_block *m, + uint64_t extra_field, uint16_t flags) +{ + memblock_header_ops[m->header_type].write(m, + block_get_real_size(m), extra_field, flags); +} + +/* + * block_invalidate -- invalidates allocation data and header + */ +static void +block_invalidate(const struct memory_block *m) +{ + void *data = m->m_ops->get_user_data(m); + size_t size = m->m_ops->get_user_size(m); + + VALGRIND_SET_CLEAN(data, size); + + memblock_header_ops[m->header_type].invalidate(m); +} + +/* + * block_reinit_header -- reinitializes a block after a heap restart + */ +static void +block_reinit_header(const struct memory_block *m) +{ + memblock_header_ops[m->header_type].reinit(m); +} + +/* + * block_get_extra -- returns the extra field of an allocation + */ +static uint64_t +block_get_extra(const struct memory_block *m) +{ + return memblock_header_ops[m->header_type].get_extra(m); +} + +/* + * block_get_flags -- returns the flags of an allocation + */ +static uint16_t +block_get_flags(const struct memory_block *m) +{ + return memblock_header_ops[m->header_type].get_flags(m); +} + +/* + * heap_run_process_bitmap_value -- (internal) looks for unset bits in the + * value, creates a valid memory block out of them and inserts that + * block into the given bucket. + */ +static int +run_process_bitmap_value(const struct memory_block *m, + uint64_t value, uint32_t base_offset, object_callback cb, void *arg) +{ + int ret = 0; + uint64_t shift = 0; /* already processed bits */ + struct memory_block s = *m; + + do { + /* + * Shift the value so that the next memory block starts on the + * least significant position: + * ..............0 (free block) + * or ..............1 (used block) + */ + uint64_t shifted = value >> shift; + + /* all clear or set bits indicate the end of traversal */ + if (shifted == 0) { + /* + * Insert the remaining blocks as free. Remember that + * unsigned values are always zero-filled, so we must + * take the current shift into account. + */ + s.block_off = (uint32_t)(base_offset + shift); + s.size_idx = (uint32_t)(RUN_BITS_PER_VALUE - shift); + + ret = cb(&s, arg); + if (ret != 0) + return ret; + + break; + } else if (shifted == UINT64_MAX) { + break; + } + + /* + * Offset and size of the next free block, either of these + * can be zero depending on where the free block is located + * in the value. 
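+ * + * A worked example (illustrative only): suppose the low bits of 'shifted', starting from the least significant one, are 1 1 0 0 0 1... Used blocks are set bits, so off = lssb(~shifted) = 2 and size = lssb(shifted) = 0; the loop merely skips the two used blocks (shift += 2). On the next pass the low bits are 0 0 0 1..., so off = 0 and size = 3, and a free block of three units starting at the first clear bit is handed to the callback.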
+ */ + unsigned off = (unsigned)util_lssb_index64(~shifted); + unsigned size = (unsigned)util_lssb_index64(shifted); + + shift += off + size; + + if (size != 0) { /* zero size means skip to the next value */ + s.block_off = (uint32_t)(base_offset + (shift - size)); + s.size_idx = (uint32_t)(size); + + memblock_rebuild_state(m->heap, &s); + ret = cb(&s, arg); + if (ret != 0) + return ret; + } + } while (shift != RUN_BITS_PER_VALUE); + + return 0; +} + +/* + * run_iterate_free -- iterates over free blocks in a run + */ +static int +run_iterate_free(const struct memory_block *m, object_callback cb, void *arg) +{ + int ret = 0; + uint32_t block_off = 0; + struct run_bitmap b; + + run_get_bitmap(m, &b); + + struct memory_block nm = *m; + + for (unsigned i = 0; i < b.nvalues; ++i) { + uint64_t v = b.values[i]; + + ASSERT((uint64_t)RUN_BITS_PER_VALUE * (uint64_t)i + <= UINT32_MAX); + block_off = RUN_BITS_PER_VALUE * i; + ret = run_process_bitmap_value(&nm, v, block_off, cb, arg); + if (ret != 0) + return ret; + } + + return 0; +} + +/* + * run_iterate_used -- iterates over used blocks in a run + */ +static int +run_iterate_used(const struct memory_block *m, object_callback cb, void *arg) +{ + uint32_t i = m->block_off / RUN_BITS_PER_VALUE; + uint32_t block_start = m->block_off % RUN_BITS_PER_VALUE; + uint32_t block_off; + + struct chunk_run *run = heap_get_chunk_run(m->heap, m); + + struct memory_block iter = *m; + struct run_bitmap b; + + run_get_bitmap(m, &b); + + for (; i < b.nvalues; ++i) { + uint64_t v = b.values[i]; + + block_off = (uint32_t)(RUN_BITS_PER_VALUE * i); + + for (uint32_t j = block_start; j < RUN_BITS_PER_VALUE; ) { + if (block_off + j >= (uint32_t)b.nbits) + break; + + if (!BIT_IS_CLR(v, j)) { + iter.block_off = (uint32_t)(block_off + j); + + /* + * The size index of this memory block cannot be + * retrieved at this time because the header + * might not be initialized in valgrind yet. + */ + iter.size_idx = 0; + + if (cb(&iter, arg) != 0) + return 1; + + iter.size_idx = CALC_SIZE_IDX( + run->hdr.block_size, + iter.m_ops->get_real_size(&iter)); + j = (uint32_t)(j + iter.size_idx); + } else { + ++j; + } + } + block_start = 0; + } + + return 0; +} + +/* + * huge_iterate_free -- calls cb on memory block if it's free + */ +static int +huge_iterate_free(const struct memory_block *m, object_callback cb, void *arg) +{ + struct chunk_header *hdr = heap_get_chunk_hdr(m->heap, m); + + return hdr->type == CHUNK_TYPE_FREE ? cb(m, arg) : 0; +} + +/* + * huge_iterate_used -- calls cb on memory block if it's used + */ +static int +huge_iterate_used(const struct memory_block *m, object_callback cb, void *arg) +{ + struct chunk_header *hdr = heap_get_chunk_hdr(m->heap, m); + + return hdr->type == CHUNK_TYPE_USED ? cb(m, arg) : 0; +} + +/* + * huge_vg_init -- initializes chunk metadata in memcheck state + */ +static void +huge_vg_init(const struct memory_block *m, int objects, + object_callback cb, void *arg) +{ + struct zone *z = ZID_TO_ZONE(&m->heap->layout_info, m->zone_id); + struct chunk_header *hdr = heap_get_chunk_hdr(m->heap, m); + struct chunk *chunk = heap_get_chunk(m->heap, m); + + VALGRIND_DO_MAKE_MEM_DEFINED(hdr, sizeof(*hdr)); + + /* + * Mark unused chunk headers as not accessible.
+ */ + VALGRIND_DO_MAKE_MEM_NOACCESS( + &z->chunk_headers[m->chunk_id + 1], + (m->size_idx - 1) * + sizeof(struct chunk_header)); + + size_t size = block_get_real_size(m); + + VALGRIND_DO_MAKE_MEM_NOACCESS(chunk, size); + + if (objects && huge_get_state(m) == MEMBLOCK_ALLOCATED) { + if (cb(m, arg) != 0) + FATAL("failed to initialize valgrind state"); + } +} + +/* + * run_vg_init -- initializes run metadata in memcheck state + */ +static void +run_vg_init(const struct memory_block *m, int objects, + object_callback cb, void *arg) +{ + struct zone *z = ZID_TO_ZONE(&m->heap->layout_info, m->zone_id); + struct chunk_header *hdr = heap_get_chunk_hdr(m->heap, m); + struct chunk_run *run = heap_get_chunk_run(m->heap, m); + + VALGRIND_DO_MAKE_MEM_DEFINED(hdr, sizeof(*hdr)); + + /* set the run metadata as defined */ + VALGRIND_DO_MAKE_MEM_DEFINED(run, RUN_BASE_METADATA_SIZE); + + struct run_bitmap b; + + run_get_bitmap(m, &b); + + /* + * Mark run data headers as defined. + */ + for (unsigned j = 1; j < m->size_idx; ++j) { + struct chunk_header *data_hdr = + &z->chunk_headers[m->chunk_id + j]; + VALGRIND_DO_MAKE_MEM_DEFINED(data_hdr, + sizeof(struct chunk_header)); + ASSERTeq(data_hdr->type, CHUNK_TYPE_RUN_DATA); + } + + VALGRIND_DO_MAKE_MEM_NOACCESS(run, SIZEOF_RUN(run, m->size_idx)); + + /* set the run bitmap as defined */ + VALGRIND_DO_MAKE_MEM_DEFINED(run, b.size + RUN_BASE_METADATA_SIZE); + + if (objects) { + if (run_iterate_used(m, cb, arg) != 0) + FATAL("failed to initialize valgrind state"); + } +} + +/* + * run_reinit_chunk -- run reinitialization on first zone traversal + */ +static void +run_reinit_chunk(const struct memory_block *m) +{ + /* suppress unused-parameter errors */ + SUPPRESS_UNUSED(m); + + /* noop */ +} + +/* + * huge_write_footer -- (internal) writes a chunk footer + */ +static void +huge_write_footer(struct chunk_header *hdr, uint32_t size_idx) +{ + if (size_idx == 1) /* that would overwrite the header */ + return; + + VALGRIND_DO_MAKE_MEM_UNDEFINED(hdr + size_idx - 1, sizeof(*hdr)); + + struct chunk_header f = *hdr; + + f.type = CHUNK_TYPE_FOOTER; + f.size_idx = size_idx; + *(hdr + size_idx - 1) = f; + /* no need to persist, footers are recreated in heap_populate_buckets */ + VALGRIND_SET_CLEAN(hdr + size_idx - 1, sizeof(f)); +} + +/* + * huge_reinit_chunk -- chunk reinitialization on first zone traversal + */ +static void +huge_reinit_chunk(const struct memory_block *m) +{ + struct chunk_header *hdr = heap_get_chunk_hdr(m->heap, m); + + if (hdr->type == CHUNK_TYPE_USED) + huge_write_footer(hdr, hdr->size_idx); +} + +/* + * run_calc_free -- calculates the number of free units in a run + */ +static void +run_calc_free(const struct memory_block *m, + uint32_t *free_space, uint32_t *max_free_block) +{ + struct run_bitmap b; + + run_get_bitmap(m, &b); + for (unsigned i = 0; i < b.nvalues; ++i) { + uint64_t value = ~b.values[i]; + + if (value == 0) + continue; + + uint32_t free_in_value = util_popcount64(value); + + *free_space = *free_space + free_in_value; + + /* + * If this value has less free blocks than already found max, + * there's no point in calculating. + */ + if (free_in_value < *max_free_block) + continue; + + /* if the entire value is empty, no point in calculating */ + if (free_in_value == RUN_BITS_PER_VALUE) { + *max_free_block = RUN_BITS_PER_VALUE; + continue; + } + + /* if already at max, no point in calculating */ + if (*max_free_block == RUN_BITS_PER_VALUE) + continue; + + /* + * Calculate the biggest free block in the bitmap. 
+ * This algorithm is not the most clever imaginable, but it's + * easy to implement and fast enough. + */ + uint16_t n = 0; + + while (value != 0) { + value &= (value << 1ULL); + n++; + } + + if (n > *max_free_block) + *max_free_block = n; + } +} + +/* + * huge_fill_pct -- huge blocks by definition use the entirety of a chunk + */ +static unsigned +huge_fill_pct(const struct memory_block *m) +{ + /* suppress unused-parameter errors */ + SUPPRESS_UNUSED(m); + + return 100; +} + +/* + * run_fill_pct -- calculates the percentage of allocated units inside of a run + */ +static unsigned +run_fill_pct(const struct memory_block *m) +{ + struct run_bitmap b; + unsigned clearbits = 0; + + run_get_bitmap(m, &b); + for (unsigned i = 0; i < b.nvalues; ++i) { + uint64_t value = ~b.values[i]; + + if (value == 0) + continue; + + clearbits += util_popcount64(value); + } + ASSERT(b.nbits >= clearbits); + unsigned setbits = b.nbits - clearbits; + + return (100 * setbits) / b.nbits; +} + +static const struct memory_block_ops mb_ops[MAX_MEMORY_BLOCK] = { + [MEMORY_BLOCK_HUGE] = { + .block_size = huge_block_size, + .prep_hdr = huge_prep_operation_hdr, + .get_lock = huge_get_lock, + .get_state = huge_get_state, + .get_user_data = block_get_user_data, + .get_real_data = huge_get_real_data, + .get_user_size = block_get_user_size, + .get_real_size = block_get_real_size, + .write_header = block_write_header, + .invalidate = block_invalidate, + .ensure_header_type = huge_ensure_header_type, + .reinit_header = block_reinit_header, + .vg_init = huge_vg_init, + .get_extra = block_get_extra, + .get_flags = block_get_flags, + .iterate_free = huge_iterate_free, + .iterate_used = huge_iterate_used, + .reinit_chunk = huge_reinit_chunk, + .calc_free = NULL, + .get_bitmap = NULL, + .fill_pct = huge_fill_pct, + }, + [MEMORY_BLOCK_RUN] = { + .block_size = run_block_size, + .prep_hdr = run_prep_operation_hdr, + .get_lock = run_get_lock, + .get_state = run_get_state, + .get_user_data = block_get_user_data, + .get_real_data = run_get_real_data, + .get_user_size = block_get_user_size, + .get_real_size = block_get_real_size, + .write_header = block_write_header, + .invalidate = block_invalidate, + .ensure_header_type = run_ensure_header_type, + .reinit_header = block_reinit_header, + .vg_init = run_vg_init, + .get_extra = block_get_extra, + .get_flags = block_get_flags, + .iterate_free = run_iterate_free, + .iterate_used = run_iterate_used, + .reinit_chunk = run_reinit_chunk, + .calc_free = run_calc_free, + .get_bitmap = run_get_bitmap, + .fill_pct = run_fill_pct, + } +}; + +/* + * memblock_huge_init -- initializes a new huge memory block + */ +struct memory_block +memblock_huge_init(struct palloc_heap *heap, + uint32_t chunk_id, uint32_t zone_id, uint32_t size_idx) +{ + struct memory_block m = MEMORY_BLOCK_NONE; + + m.chunk_id = chunk_id; + m.zone_id = zone_id; + m.size_idx = size_idx; + m.heap = heap; + + struct chunk_header *hdr = heap_get_chunk_hdr(heap, &m); + + VALGRIND_DO_MAKE_MEM_UNDEFINED(hdr, sizeof(*hdr)); + VALGRIND_ANNOTATE_NEW_MEMORY(hdr, sizeof(*hdr)); + + uint64_t nhdr = chunk_get_chunk_hdr_value(CHUNK_TYPE_FREE, + 0, size_idx); + util_atomic_store_explicit64((uint64_t *)hdr, + nhdr, memory_order_relaxed); + + mo_wal_persist(&heap->p_ops, hdr, sizeof(*hdr)); + + huge_write_footer(hdr, size_idx); + + memblock_rebuild_state(heap, &m); + + return m; +} + +/* + * memblock_run_init -- initializes a new run memory block + */ +struct memory_block +memblock_run_init(struct palloc_heap *heap, + uint32_t chunk_id, uint32_t 
zone_id, struct run_descriptor *rdsc) +{ + uint32_t size_idx = rdsc->size_idx; + + ASSERTne(size_idx, 0); + + struct memory_block m = MEMORY_BLOCK_NONE; + + m.chunk_id = chunk_id; + m.zone_id = zone_id; + m.size_idx = size_idx; + m.heap = heap; + + struct zone *z = ZID_TO_ZONE(&heap->layout_info, zone_id); + struct chunk_run *run = heap_get_chunk_run(heap, &m); + size_t runsize = SIZEOF_RUN(run, size_idx); + + VALGRIND_DO_MAKE_MEM_UNDEFINED(run, runsize); + + /* add/remove chunk_run and chunk_header to valgrind transaction */ + VALGRIND_ADD_TO_TX(run, runsize); + run->hdr.block_size = rdsc->unit_size; + run->hdr.alignment = rdsc->alignment; + + struct run_bitmap b = rdsc->bitmap; + + b.values = (uint64_t *)run->content; + + size_t bitmap_size = b.size; + + /* set all the bits */ + memset(b.values, 0xFF, bitmap_size); + + /* clear only the bits available for allocations from this bucket */ + memset(b.values, 0, sizeof(*b.values) * (b.nvalues - 1)); + + unsigned trailing_bits = b.nbits % RUN_BITS_PER_VALUE; + uint64_t last_value = UINT64_MAX << trailing_bits; + + b.values[b.nvalues - 1] = last_value; + + VALGRIND_REMOVE_FROM_TX(run, runsize); + + mo_wal_flush(&heap->p_ops, run, + sizeof(struct chunk_run_header) + + bitmap_size, 0); + + struct chunk_header run_data_hdr; + + run_data_hdr.type = CHUNK_TYPE_RUN_DATA; + run_data_hdr.flags = 0; + + VALGRIND_ADD_TO_TX(&z->chunk_headers[chunk_id], + sizeof(struct chunk_header) * size_idx); + + struct chunk_header *data_hdr; + + for (unsigned i = 1; i < size_idx; ++i) { + data_hdr = &z->chunk_headers[chunk_id + i]; + VALGRIND_DO_MAKE_MEM_UNDEFINED(data_hdr, sizeof(*data_hdr)); + VALGRIND_ANNOTATE_NEW_MEMORY(data_hdr, sizeof(*data_hdr)); + run_data_hdr.size_idx = i; + *data_hdr = run_data_hdr; + } + mo_wal_persist(&heap->p_ops, + &z->chunk_headers[chunk_id + 1], + sizeof(struct chunk_header) * (size_idx - 1)); + + struct chunk_header *hdr = &z->chunk_headers[chunk_id]; + + ASSERT(hdr->type == CHUNK_TYPE_FREE); + + VALGRIND_ANNOTATE_NEW_MEMORY(hdr, sizeof(*hdr)); + + uint64_t run_hdr = chunk_get_chunk_hdr_value(CHUNK_TYPE_RUN, + rdsc->flags, hdr->size_idx); + util_atomic_store_explicit64((uint64_t *)hdr, + run_hdr, memory_order_relaxed); + mo_wal_persist(&heap->p_ops, hdr, sizeof(*hdr)); + + VALGRIND_REMOVE_FROM_TX(&z->chunk_headers[chunk_id], + sizeof(struct chunk_header) * size_idx); + + memblock_rebuild_state(heap, &m); + m.cached_bitmap = &rdsc->bitmap; + + return m; +} + +/* + * memblock_detect_type -- looks for the corresponding chunk header and + * depending on the chunks type returns the right memory block type + */ +static enum memory_block_type +memblock_detect_type(struct palloc_heap *heap, const struct memory_block *m) +{ + enum memory_block_type ret = MEMORY_BLOCK_HUGE; + + switch (heap_get_chunk_hdr(heap, m)->type) { + case CHUNK_TYPE_RUN: + case CHUNK_TYPE_RUN_DATA: + ret = MEMORY_BLOCK_RUN; + break; + case CHUNK_TYPE_FREE: + case CHUNK_TYPE_USED: + case CHUNK_TYPE_FOOTER: + ret = MEMORY_BLOCK_HUGE; + break; + default: + /* unreachable */ + FATAL("possible zone chunks metadata corruption"); + } + return ret; +} + +/* + * memblock_from_offset -- resolves a memory block data from an offset that + * originates from the heap + */ +struct memory_block +memblock_from_offset_opt(struct palloc_heap *heap, uint64_t off, int size) +{ + struct memory_block m = MEMORY_BLOCK_NONE; + + m.heap = heap; + + off -= HEAP_PTR_TO_OFF(heap, heap->layout_info.zone0); + m.zone_id = (uint32_t)(off / ZONE_MAX_SIZE); + + off -= (ZONE_MAX_SIZE * m.zone_id) + 
sizeof(struct zone); + m.chunk_id = (uint32_t)(off / CHUNKSIZE); + + struct chunk_header *hdr = heap_get_chunk_hdr(heap, &m); + + if (hdr->type == CHUNK_TYPE_RUN_DATA) + m.chunk_id -= hdr->size_idx; + + off -= CHUNKSIZE * m.chunk_id; + + m.header_type = memblock_header_type(&m); + + off -= header_type_to_size[m.header_type]; + + m.type = off != 0 ? MEMORY_BLOCK_RUN : MEMORY_BLOCK_HUGE; + ASSERTeq(memblock_detect_type(heap, &m), m.type); + + m.m_ops = &mb_ops[m.type]; + + uint64_t unit_size = m.m_ops->block_size(&m); + + if (off != 0) { /* run */ + off -= run_get_data_offset(&m); + off -= RUN_BASE_METADATA_SIZE; + m.block_off = (uint16_t)(off / unit_size); + off -= m.block_off * unit_size; + } + + struct alloc_class_collection *acc = heap_alloc_classes(heap); + + if (acc != NULL) { + struct alloc_class *ac = alloc_class_by_run(acc, + unit_size, hdr->flags, hdr->size_idx); + if (ac != NULL) + m.cached_bitmap = &ac->rdsc.bitmap; + } + + m.size_idx = !size ? 0 : CALC_SIZE_IDX(unit_size, + memblock_header_ops[m.header_type].get_size(&m)); + + ASSERTeq(off, 0); + + return m; +} + +/* + * memblock_from_offset -- returns memory block with size + */ +struct memory_block +memblock_from_offset(struct palloc_heap *heap, uint64_t off) +{ + return memblock_from_offset_opt(heap, off, 1); +} + +/* + * memblock_rebuild_state -- fills in the runtime-state related fields of a + * memory block structure + * + * This function must be called on all memory blocks that were created by hand + * (as opposed to retrieved from memblock_from_offset function). + */ +void +memblock_rebuild_state(struct palloc_heap *heap, struct memory_block *m) +{ + m->heap = heap; + m->header_type = memblock_header_type(m); + m->type = memblock_detect_type(heap, m); + m->m_ops = &mb_ops[m->type]; + m->cached_bitmap = NULL; +} diff --git a/src/common/dav_v2/memblock.h b/src/common/dav_v2/memblock.h new file mode 100644 index 00000000000..0dd133647c3 --- /dev/null +++ b/src/common/dav_v2/memblock.h @@ -0,0 +1,297 @@ +/* SPDX-License-Identifier: BSD-3-Clause */ +/* Copyright 2016-2023, Intel Corporation */ + +/* + * memblock.h -- internal definitions for memory block + */ + +#ifndef __DAOS_COMMON_MEMBLOCK_H +#define __DAOS_COMMON_MEMBLOCK_H 1 + +#include +#include + +#include "heap_layout.h" +#include "memops.h" +#include "palloc.h" + +#define MEMORY_BLOCK_NONE \ +(struct memory_block)\ +{0, 0, 0, 0, NULL, NULL, MAX_HEADER_TYPES, MAX_MEMORY_BLOCK, NULL} + +#define MEMORY_BLOCK_IS_NONE(_m)\ +((_m).heap == NULL) + +#define MEMORY_BLOCK_EQUALS(lhs, rhs)\ +((lhs).zone_id == (rhs).zone_id && (lhs).chunk_id == (rhs).chunk_id &&\ +(lhs).block_off == (rhs).block_off && (lhs).heap == (rhs).heap) + +enum memory_block_type { + /* + * Huge memory blocks are directly backed by memory chunks. A single + * huge block can consist of several chunks. + * The persistent representation of huge memory blocks can be thought + * of as a doubly linked list with variable length elements. + * That list is stored in the chunk headers array where one element + * directly corresponds to one chunk. + * + * U - used, F - free, R - footer, . - empty + * |U| represents a used chunk with a size index of 1, with type + * information (CHUNK_TYPE_USED) stored in the corresponding header + * array element - chunk_headers[chunk_id]. + * + * |F...R| represents a free chunk with size index of 5. The empty + * chunk headers have undefined values and shouldn't be used. 
All + * chunks with size larger than 1 must have a footer in the last + * corresponding header array element - chunk_headers[chunk_id + size_idx - 1]. + * + * The above representation of chunks will be used to describe the + * way fail-safety is achieved during heap operations. + * + * Allocation of a huge memory block with size index 5: + * Initial heap state: |U| <> |F..R| <> |U| <> |F......R| + * + * The only block that matches that size is at the very end of the chunks + * list: |F......R| + * + * As the request was for a memory block of size 5, and this one's size is + * 7, there's a need to first split the chunk in two. + * 1) The last chunk header of the new allocation is marked as footer + * and the block after that one is marked as free: |F...RF.R| + * This is allowed and has no impact on the heap because this + * modification is to a chunk header that is otherwise unused; in + * other words the linked list didn't change. + * + * 2) The size index of the first header is changed from previous value + * of 7 to 5: |F...R||F.R| + * This is a single fail-safe atomic operation and this is the + * first change that is noticeable by the heap operations. + * A single linked list element is split into two new ones. + * + * 3) The allocation process either uses redo log or changes directly + * the chunk header type from free to used: |U...R| <> |F.R| + * + * In a similar fashion the reverse operation, free, is performed: + * Initial heap state: |U| <> |F..R| <> |F| <> |U...R| <> |F.R| + * + * This is the heap after the previous example with the single chunk + * in between changed from used to free. + * + * 1) Determine the neighbors of the memory block which is being + * freed. + * + * 2) Update the footer (if needed) information of the last chunk which + * is the memory block being freed or its neighbor to the right. + * |F| <> |U...R| <> |F.R << this one| + * + * 3) Update the size index and type of the left-most chunk header. + * And so this: |F << this one| <> |U...R| <> |F.R| + * becomes this: |F.......R| + * The entire chunk header can be updated in a single fail-safe + * atomic operation because its size is only 64 bits. + */ + MEMORY_BLOCK_HUGE, + /* + * Run memory blocks are chunks with CHUNK_TYPE_RUN and size index of 1. + * The entire chunk is subdivided into smaller blocks and has an + * additional metadata attached in the form of a bitmap - each bit + * corresponds to a single block. + * In this case there's no need to perform any coalescing or splitting + * on the persistent metadata. + * The bitmap is stored on a variable number of 64 bit values and + * because of the requirement of allocation fail-safe atomicity the + * maximum size index of a memory block from a run is 64 - since that's + * the limit of atomic write guarantee. + * + * The allocation/deallocation process is a single 8 byte write that + * sets/clears the corresponding bits. Depending on the user choice + * it can either be made atomically or using redo-log when grouped with + * other operations. + * It's also important to note that in the case of a realloc it might so + * happen that a single 8 byte bitmap value has its bits both set and + * cleared - that's why the run memory block metadata changes operate + * on AND'ing or OR'ing a bitmask instead of directly setting the value.
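+ * + * As a concrete (purely illustrative) example of the bitmask approach used by run_prep_operation_hdr: for a block with block_off 67 and size_idx 3, the affected value is values[67 / 64] = values[1] and the mask is ((1ULL << 3) - 1) << (67 % 64), i.e. 0x38. An allocation OR's that mask into the value (or logs a SET_BITS entry covering bits 3..5), while a free AND's its complement (or logs a CLR_BITS entry), leaving the other 61 bits of the value untouched.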
+ */ + MEMORY_BLOCK_RUN, + + MAX_MEMORY_BLOCK +}; + +enum memblock_state { + MEMBLOCK_STATE_UNKNOWN, + MEMBLOCK_ALLOCATED, + MEMBLOCK_FREE, + + MAX_MEMBLOCK_STATE, +}; + +/* runtime bitmap information for a run */ +struct run_bitmap { + unsigned nvalues; /* number of 8 byte values - size of values array */ + unsigned nbits; /* number of valid bits */ + + size_t size; /* total size of the bitmap in bytes */ + + uint64_t *values; /* pointer to the bitmap's values array */ +}; + +/* runtime information necessary to create a run */ +struct run_descriptor { + uint16_t flags; /* chunk flags for the run */ + size_t unit_size; /* the size of a single unit in a run */ + uint32_t size_idx; /* size index of a single run instance */ + size_t alignment; /* required alignment of objects */ + unsigned nallocs; /* number of allocs per run */ + struct run_bitmap bitmap; +}; + +struct memory_block_ops { + /* returns memory block size */ + size_t (*block_size)(const struct memory_block *m); + + /* prepares header modification operation */ + void (*prep_hdr)(const struct memory_block *m, + enum memblock_state dest_state, struct operation_context *ctx); + + /* returns lock associated with memory block */ + pthread_mutex_t *(*get_lock)(const struct memory_block *m); + + /* returns whether a block is allocated or not */ + enum memblock_state (*get_state)(const struct memory_block *m); + + /* returns pointer to the data of a block */ + void *(*get_user_data)(const struct memory_block *m); + + /* + * Returns the size of a memory block without overhead. + * This is the size of a data block that can be used. + */ + size_t (*get_user_size)(const struct memory_block *m); + + /* returns pointer to the beginning of data of a run block */ + void *(*get_real_data)(const struct memory_block *m); + + /* returns the size of a memory block, including headers */ + size_t (*get_real_size)(const struct memory_block *m); + + /* writes a header of an allocation */ + void (*write_header)(const struct memory_block *m, + uint64_t extra_field, uint16_t flags); + void (*invalidate)(const struct memory_block *m); + + /* + * Checks the header type of a chunk matches the expected type and + * modifies it if necessary. This is fail-safe atomic. + */ + void (*ensure_header_type)(const struct memory_block *m, + enum header_type t); + + /* + * Reinitializes a block after a heap restart. + * This is called for EVERY allocation, but *only* under Valgrind. + */ + void (*reinit_header)(const struct memory_block *m); + + /* returns the extra field of an allocation */ + uint64_t (*get_extra)(const struct memory_block *m); + + /* returns the flags of an allocation */ + uint16_t (*get_flags)(const struct memory_block *m); + + /* initializes memblock in valgrind */ + void (*vg_init)(const struct memory_block *m, int objects, + object_callback cb, void *arg); + + /* iterates over every free block */ + int (*iterate_free)(const struct memory_block *m, + object_callback cb, void *arg); + + /* iterates over every used block */ + int (*iterate_used)(const struct memory_block *m, + object_callback cb, void *arg); + + /* calculates number of free units, valid only for runs */ + void (*calc_free)(const struct memory_block *m, + uint32_t *free_space, uint32_t *max_free_block); + + /* this is called exactly once for every existing chunk */ + void (*reinit_chunk)(const struct memory_block *m); + + /* + * Initializes bitmap data for a run. 
+ * Do *not* use this function unless absolutely necessary, it breaks + * the abstraction layer by exposing implementation details. + */ + void (*get_bitmap)(const struct memory_block *m, struct run_bitmap *b); + + /* calculates the ratio between occupied and unoccupied space */ + unsigned (*fill_pct)(const struct memory_block *m); +}; + +struct memory_block { + uint32_t chunk_id; /* index of the memory block in its zone */ + uint32_t zone_id; /* index of this block zone in the heap */ + + /* + * Size index of the memory block represented in either multiple of + * CHUNKSIZE in the case of a huge chunk or in multiple of a run + * block size. + */ + uint32_t size_idx; + + /* + * Used only for run chunks, must be zeroed for huge. + * Number of preceding blocks in the chunk. In other words, the + * position of this memory block in run bitmap. + */ + uint32_t block_off; + + /* + * The variables below are associated with the memory block and are + * stored here for convenience. Those fields are filled by either the + * memblock_from_offset or memblock_rebuild_state, and they should not + * be modified manually. + */ + const struct memory_block_ops *m_ops; + struct palloc_heap *heap; + enum header_type header_type; + enum memory_block_type type; + struct run_bitmap *cached_bitmap; +}; + +/* + * This is a representation of a run memory block that is active in a bucket or + * is on a pending list in the recycler. + * This structure should never be passed around by value because the address of + * the nresv variable can be in reservations made through palloc_reserve(). Only + * if the number of reservations equals 0 the structure can be moved/freed. + */ +struct memory_block_reserved { + struct memory_block m; + + struct bucket_locked *bucket; + /* + * Number of reservations made from this run, the pointer to this value + * is stored in a user facing pobj_action structure. Decremented once + * the reservation is published or canceled. + */ + int nresv; +}; + +struct memory_block memblock_from_offset(struct palloc_heap *heap, + uint64_t off); +struct memory_block memblock_from_offset_opt(struct palloc_heap *heap, + uint64_t off, int size); +void memblock_rebuild_state(struct palloc_heap *heap, struct memory_block *m); + +struct memory_block memblock_huge_init(struct palloc_heap *heap, + uint32_t chunk_id, uint32_t zone_id, uint32_t size_idx); + +struct memory_block memblock_run_init(struct palloc_heap *heap, + uint32_t chunk_id, uint32_t zone_id, struct run_descriptor *rdsc); + +void memblock_run_bitmap(uint32_t *size_idx, uint16_t flags, + uint64_t unit_size, uint64_t alignment, void *content, + struct run_bitmap *b); + +#endif /* __DAOS_COMMON_MEMBLOCK_H */ diff --git a/src/common/dav_v2/memops.c b/src/common/dav_v2/memops.c new file mode 100644 index 00000000000..c550ce34e39 --- /dev/null +++ b/src/common/dav_v2/memops.c @@ -0,0 +1,678 @@ +/* SPDX-License-Identifier: BSD-3-Clause */ +/* Copyright 2016-2024, Intel Corporation */ + +/* + * memops.c -- aggregated memory operations helper implementation + * + * The operation collects all of the required memory modifications that + * need to happen in an atomic way (all of them or none), and abstracts + * away the storage type (transient/persistent) and the underlying + * implementation of how it's actually performed - in some cases using + * the redo log is unnecessary and the allocation process can be sped up + * a bit by completely omitting that whole machinery. + * + * The modifications are not visible until the context is processed. 
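+ * + * A minimal usage sketch pieced together from the declarations in memops.h (the ulog, callback and mo_ops arguments are placeholders, not values prescribed by this file): + * + *   ctx = operation_new(ulog, base_nbytes, extend_fn, free_fn, &p_ops, LOG_TYPE_REDO); + *   operation_start(ctx); + *   operation_add_entry(ctx, &obj->field, new_value, ULOG_OPERATION_SET); + *   operation_process(ctx);   (the logged SET becomes visible here) + *   operation_finish(ctx, 0); + *   operation_delete(ctx);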
+ */ + +#include "memops.h" +#include "obj.h" +#include "out.h" +#include "ravl.h" +#include "valgrind_internal.h" +#include "vecq.h" +#include "sys_util.h" +#include "dav_internal.h" +#include "tx.h" + +static inline int +OBJ_OFF_IS_VALID_FROM_CTX(void *ctx, uint64_t offset) +{ + dav_obj_t *dav_hdl = (dav_obj_t *)ctx; + + return OBJ_OFF_IS_VALID(dav_hdl, offset); +} + +#define ULOG_BASE_SIZE 1024 +#define OP_MERGE_SEARCH 64 + +enum operation_state { + OPERATION_IDLE, + OPERATION_IN_PROGRESS, + OPERATION_CLEANUP, +}; + +struct operation_log { + size_t capacity; /* capacity of the ulog log */ + size_t offset; /* data offset inside of the log */ + struct ulog *ulog; /* DRAM allocated log of modifications */ +}; + +/* + * operation_context -- context of an ongoing palloc operation + */ +struct operation_context { + enum log_type type; + + ulog_extend_fn extend; /* function to allocate next ulog */ + ulog_free_fn ulog_free; /* function to free next ulogs */ + + const struct mo_ops *p_ops; + struct mo_ops t_ops; /* used for transient data processing */ + struct mo_ops s_ops; /* used for shadow copy data processing */ + + size_t ulog_curr_offset; /* offset in the log for buffer stores */ + size_t ulog_curr_capacity; /* capacity of the current log */ + size_t ulog_curr_gen_num; /* transaction counter in the current log */ + struct ulog *ulog_curr; /* current persistent log */ + size_t total_logged; /* total amount of buffer stores in the logs */ + + struct ulog *ulog; /* pointer to the ulog used by context for undo ops */ + size_t ulog_base_nbytes; /* available bytes in initial ulog log */ + size_t ulog_capacity; /* sum of capacity, incl all next ulog logs */ + int ulog_auto_reserve; /* allow or do not to auto ulog reservation */ + + struct ulog_next next; /* vector of 'next' fields of persistent ulog */ + + enum operation_state state; /* operation sanity check */ + + struct operation_log pshadow_ops; /* used by context for redo ops */ + struct operation_log transient_ops; /* log of transient changes */ + + /* collection used to look for potential merge candidates */ + VECQ(, struct ulog_entry_val *) merge_entries; +}; + +/* + * operation_log_transient_init -- (internal) initialize operation log + * containing transient memory resident changes + */ +static int +operation_log_transient_init(struct operation_log *log) +{ + struct ulog *src; + + log->capacity = ULOG_BASE_SIZE; + log->offset = 0; + + D_ALLOC(src, (sizeof(struct ulog) + ULOG_BASE_SIZE)); + if (src == NULL) { + D_CRIT("Zalloc!\n"); + return -1; + } + + /* initialize underlying redo log structure */ + src->capacity = ULOG_BASE_SIZE; + + log->ulog = src; + + return 0; +} + +/* + * operation_log_persistent_init -- (internal) initialize operation log + * containing persistent memory resident changes + */ +static int +operation_log_persistent_init(struct operation_log *log, + size_t ulog_base_nbytes) +{ + struct ulog *src; + + log->capacity = ULOG_BASE_SIZE; + log->offset = 0; + + D_ALLOC(src, (sizeof(struct ulog) + ULOG_BASE_SIZE)); + if (src == NULL) { + D_CRIT("Zalloc!\n"); + return -1; + } + + /* initialize underlying redo log structure */ + src->capacity = ULOG_BASE_SIZE; + memset(src->unused, 0, sizeof(src->unused)); + + log->ulog = src; + + return 0; +} + +/* + * operation_transient_clean -- cleans pmemcheck address state + */ +static int +operation_transient_clean(void *base, const void *addr, size_t len, + unsigned flags) +{ + /* suppress unused-parameter errors */ + SUPPRESS_UNUSED(base, flags); + + VALGRIND_SET_CLEAN(addr, len); 
+ + return 0; +} + +/* + * operation_transient_drain -- noop + */ +static void +operation_transient_drain(void *base) +{ + /* suppress unused-parameter errors */ + SUPPRESS_UNUSED(base); +} + +/* + * operation_transient_memcpy -- transient memcpy wrapper + */ +static void * +operation_transient_memcpy(void *base, void *dest, const void *src, size_t len, + unsigned flags) +{ + /* suppress unused-parameter errors */ + SUPPRESS_UNUSED(base, flags); + + return memcpy(dest, src, len); +} + +/* + * operation_new -- creates new operation context + */ +struct operation_context * +operation_new(struct ulog *ulog, size_t ulog_base_nbytes, + ulog_extend_fn extend, ulog_free_fn ulog_free, + const struct mo_ops *p_ops, enum log_type type) +{ + + SUPPRESS_UNUSED(p_ops); + + struct operation_context *ctx; + + D_ALLOC_PTR(ctx); + if (ctx == NULL) { + D_CRIT("Zalloc!\n"); + goto error_ctx_alloc; + } + + ctx->ulog = ulog; + ctx->ulog_base_nbytes = ulog_base_nbytes; + ctx->ulog_capacity = ulog_capacity(ulog, + ulog_base_nbytes); + ctx->extend = extend; + ctx->ulog_free = ulog_free; + ctx->state = OPERATION_IDLE; + VEC_INIT(&ctx->next); + ulog_rebuild_next_vec(ulog, &ctx->next); + ctx->p_ops = p_ops; + ctx->type = type; + + ctx->ulog_curr_offset = 0; + ctx->ulog_curr_capacity = 0; + ctx->ulog_curr = NULL; + + ctx->t_ops.base = NULL; + ctx->t_ops.flush = operation_transient_clean; + ctx->t_ops.memcpy = operation_transient_memcpy; + ctx->t_ops.drain = operation_transient_drain; + + ctx->s_ops.base = p_ops->base; + ctx->s_ops.flush = operation_transient_clean; + ctx->s_ops.memcpy = operation_transient_memcpy; + ctx->s_ops.drain = operation_transient_drain; + ctx->s_ops.umem_store = p_ops->umem_store; + + VECQ_INIT(&ctx->merge_entries); + + if (operation_log_transient_init(&ctx->transient_ops) != 0) + goto error_ulog_alloc; + + if (operation_log_persistent_init(&ctx->pshadow_ops, + ulog_base_nbytes) != 0) + goto error_ulog_alloc; + + return ctx; + +error_ulog_alloc: + operation_delete(ctx); +error_ctx_alloc: + return NULL; +} + +/* + * operation_delete -- deletes operation context + */ +void +operation_delete(struct operation_context *ctx) +{ + VECQ_DELETE(&ctx->merge_entries); + VEC_DELETE(&ctx->next); + D_FREE(ctx->pshadow_ops.ulog); + D_FREE(ctx->transient_ops.ulog); + D_FREE(ctx); +} + +/* + * operation_free_logs -- free all logs except first + */ +void +operation_free_logs(struct operation_context *ctx) +{ + int freed = ulog_free_next(ctx->ulog, ctx->ulog_free); + + if (freed) { + ctx->ulog_capacity = ulog_capacity(ctx->ulog, + ctx->ulog_base_nbytes); + VEC_CLEAR(&ctx->next); + ulog_rebuild_next_vec(ctx->ulog, &ctx->next); + } + + ASSERTeq(VEC_SIZE(&ctx->next), 0); +} + +/* + * operation_merge -- (internal) performs operation on a field + */ +static inline int +operation_merge(struct ulog_entry_base *entry, uint64_t value, + ulog_operation_type type) +{ + struct ulog_entry_val *e = (struct ulog_entry_val *)entry; + uint16_t num, num1, num2; + uint32_t pos, pos1, pos2; + + switch (type) { +#ifdef WAL_SUPPORTS_AND_OR_OPS + case ULOG_OPERATION_AND: + e->value &= value; + break; + case ULOG_OPERATION_OR: + e->value |= value; + break; +#else + case ULOG_OPERATION_SET_BITS: + case ULOG_OPERATION_CLR_BITS: + num1 = ULOG_ENTRY_VAL_TO_BITS(e->value); + pos1 = ULOG_ENTRY_VAL_TO_POS(e->value); + num2 = ULOG_ENTRY_VAL_TO_BITS(value); + pos2 = ULOG_ENTRY_VAL_TO_POS(value); + + if ((pos2 > pos1 + num1) || (pos1 > pos2 + num2)) + return 0; /* there is a gap, no merge */ + + pos = MIN(pos1, pos2); + num = MAX(pos1 + num1, 
pos2 + num2) - pos; + + e->value = ULOG_ENTRY_TO_VAL(pos, num); + break; +#endif + case ULOG_OPERATION_SET: + e->value = value; + break; + default: + ASSERT(0); /* unreachable */ + } + return 1; +} + +/* + * operation_try_merge_entry -- tries to merge the incoming log entry with + * existing entries + * + * Because this requires a reverse foreach, it cannot be implemented using + * the on-media ulog log structure since there's no way to find what's + * the previous entry in the log. Instead, the last N entries are stored + * in a collection and traversed backwards. + */ +static int +operation_try_merge_entry(struct operation_context *ctx, + void *ptr, uint64_t value, ulog_operation_type type) +{ + int ret = 0; + uint64_t offset = OBJ_PTR_TO_OFF(ctx->p_ops->base, ptr); + + struct ulog_entry_val *e; + + VECQ_FOREACH_REVERSE(e, &ctx->merge_entries) { + if (ulog_entry_offset(&e->base) == offset) { + if (ulog_entry_type(&e->base) == type) { + if (operation_merge(&e->base, value, type)) + return 1; + } + break; + } + } + + return ret; +} + +/* + * operation_merge_entry_add -- adds a new entry to the merge collection, + * keeps capacity at OP_MERGE_SEARCH. Removes old entries in FIFO fashion. + */ +static void +operation_merge_entry_add(struct operation_context *ctx, + struct ulog_entry_val *entry) +{ + if (VECQ_SIZE(&ctx->merge_entries) == OP_MERGE_SEARCH) + (void) VECQ_DEQUEUE(&ctx->merge_entries); + + if (VECQ_ENQUEUE(&ctx->merge_entries, entry) != 0) { + /* this is fine, only runtime perf will get slower */ + D_CRIT("out of memory - unable to track entries\n"); + } +} + +/* + * operation_add_typed_entry -- adds a new entry to the current operation. If a + * persistent entry for the same address and operation type already exists, + * the new value is merged into that entry instead of appending a new one. + */ +int +operation_add_typed_entry(struct operation_context *ctx, + void *ptr, uint64_t value, + ulog_operation_type type, enum operation_log_type log_type) +{ + struct operation_log *oplog = log_type == LOG_PERSISTENT ? + &ctx->pshadow_ops : &ctx->transient_ops; + + /* + * Always make sure to have one extra spare cacheline so that the + * ulog log entry creation has enough room for zeroing. + */ + if (oplog->offset + CACHELINE_SIZE == oplog->capacity) { + size_t ncapacity = oplog->capacity + ULOG_BASE_SIZE; + struct ulog *ulog; + + D_REALLOC_NZ(ulog, oplog->ulog, SIZEOF_ULOG(ncapacity)); + if (ulog == NULL) + return -1; + oplog->capacity += ULOG_BASE_SIZE; + oplog->ulog = ulog; + oplog->ulog->capacity = oplog->capacity; + + /* + * Realloc invalidated the ulog entries that are inside of this + * vector, need to clear it to avoid use after free. + */ + VECQ_CLEAR(&ctx->merge_entries); + } + + if (log_type == LOG_PERSISTENT && + operation_try_merge_entry(ctx, ptr, value, type) != 0) + return 0; + + struct ulog_entry_val *entry = ulog_entry_val_create( + oplog->ulog, oplog->offset, ptr, value, type, + log_type == LOG_TRANSIENT ?
&ctx->t_ops : &ctx->s_ops); + + if (log_type == LOG_PERSISTENT) + operation_merge_entry_add(ctx, entry); + + oplog->offset += ulog_entry_size(&entry->base); + + return 0; +} + + +/* + * operation_add_value -- adds new entry to the current operation with + * entry type autodetected based on the memory location + */ +int +operation_add_entry(struct operation_context *ctx, void *ptr, uint64_t value, + ulog_operation_type type) +{ + const struct mo_ops *p_ops = ctx->p_ops; + dav_obj_t *pop = (dav_obj_t *)p_ops->base; + + int from_pool = OBJ_PTR_IS_VALID(pop, ptr); + + return operation_add_typed_entry(ctx, ptr, value, type, + from_pool ? LOG_PERSISTENT : LOG_TRANSIENT); +} + +/* + * operation_add_buffer -- adds a buffer operation to the log + */ +int +operation_add_buffer(struct operation_context *ctx, + void *dest, void *src, size_t size, ulog_operation_type type) +{ + size_t real_size = size + sizeof(struct ulog_entry_buf); + + /* if there's no space left in the log, reserve some more */ + if (ctx->ulog_curr_capacity == 0) { + ctx->ulog_curr_gen_num = ctx->ulog->gen_num; + if (operation_reserve(ctx, ctx->total_logged + real_size) != 0) + return -1; + + ctx->ulog_curr = ctx->ulog_curr == NULL ? ctx->ulog : + ulog_next(ctx->ulog_curr); + ASSERTne(ctx->ulog_curr, NULL); + ctx->ulog_curr_offset = 0; + ctx->ulog_curr_capacity = ctx->ulog_curr->capacity; + } + + size_t curr_size = MIN(real_size, ctx->ulog_curr_capacity); + size_t data_size = curr_size - sizeof(struct ulog_entry_buf); + size_t entry_size = ALIGN_UP(curr_size, CACHELINE_SIZE); + + /* + * To make sure that the log is consistent and contiguous, we need + * make sure that the header of the entry that would be located + * immediately after this one is zeroed. + */ + struct ulog_entry_base *next_entry = NULL; + + if (entry_size == ctx->ulog_curr_capacity) { + struct ulog *u = ulog_next(ctx->ulog_curr); + + if (u != NULL) + next_entry = (struct ulog_entry_base *)u->data; + } else { + size_t next_entry_offset = ctx->ulog_curr_offset + entry_size; + + next_entry = (struct ulog_entry_base *)(ctx->ulog_curr->data + + next_entry_offset); + } + if (next_entry != NULL) + ulog_clobber_entry(next_entry); + + /* create a persistent log entry */ + struct ulog_entry_buf *e = ulog_entry_buf_create(ctx->ulog_curr, + ctx->ulog_curr_offset, + ctx->ulog_curr_gen_num, + dest, src, data_size, + type, ctx->p_ops); + ASSERT(entry_size == ulog_entry_size(&e->base)); + ASSERT(entry_size <= ctx->ulog_curr_capacity); + + ctx->total_logged += entry_size; + ctx->ulog_curr_offset += entry_size; + ctx->ulog_curr_capacity -= entry_size; + + /* + * Recursively add the data to the log until the entire buffer is + * processed. + */ + return size - data_size == 0 ? 
0 : operation_add_buffer(ctx, + (char *)dest + data_size, + (char *)src + data_size, + size - data_size, type); +} + +/* + * operation_set_auto_reserve -- set auto reserve value for context + */ +void +operation_set_auto_reserve(struct operation_context *ctx, int auto_reserve) +{ + ctx->ulog_auto_reserve = auto_reserve; +} + +/* + * operation_process_persistent_redo -- (internal) process using ulog + */ +static void +operation_process_persistent_redo(struct operation_context *ctx) +{ + ASSERTeq(ctx->pshadow_ops.capacity % CACHELINE_SIZE, 0); + + /* Copy the redo log to wal redo */ + ulog_foreach_entry(ctx->pshadow_ops.ulog, tx_create_wal_entry, + NULL, ctx->p_ops); + + ulog_process(ctx->pshadow_ops.ulog, OBJ_OFF_IS_VALID_FROM_CTX, + ctx->p_ops); + + ulog_clobber(ctx->ulog, &ctx->next); +} + +/* + * operation_reserve -- (internal) reserves new capacity in persistent ulog log + */ +int +operation_reserve(struct operation_context *ctx, size_t new_capacity) +{ + if ((ctx->type == LOG_TYPE_UNDO) && (new_capacity > ctx->ulog_capacity)) { + if (ctx->extend == NULL) { + ERR("no extend function present"); + return -1; + } + + if (ulog_reserve(ctx->ulog, + ctx->ulog_base_nbytes, + ctx->ulog_curr_gen_num, + ctx->ulog_auto_reserve, + &new_capacity, ctx->extend, + &ctx->next) != 0) + return -1; + ctx->ulog_capacity = new_capacity; + } + + return 0; +} + +/* + * operation_init -- initializes runtime state of an operation + */ +void +operation_init(struct operation_context *ctx) +{ + struct operation_log *plog = &ctx->pshadow_ops; + struct operation_log *tlog = &ctx->transient_ops; + + VALGRIND_ANNOTATE_NEW_MEMORY(ctx, sizeof(*ctx)); + VALGRIND_ANNOTATE_NEW_MEMORY(tlog->ulog, sizeof(struct ulog) + + tlog->capacity); + VALGRIND_ANNOTATE_NEW_MEMORY(plog->ulog, sizeof(struct ulog) + + plog->capacity); + tlog->offset = 0; + plog->offset = 0; + VECQ_REINIT(&ctx->merge_entries); + + ctx->ulog_curr_offset = 0; + ctx->ulog_curr_capacity = 0; + ctx->ulog_curr_gen_num = 0; + ctx->ulog_curr = NULL; + ctx->total_logged = 0; + ctx->ulog_auto_reserve = 1; +} + +/* + * operation_start -- initializes and starts a new operation + */ +void +operation_start(struct operation_context *ctx) +{ + operation_init(ctx); + ASSERTeq(ctx->state, OPERATION_IDLE); + ctx->state = OPERATION_IN_PROGRESS; +} + +/* + * operation_cancel -- cancels a running operation + */ +void +operation_cancel(struct operation_context *ctx) +{ + ASSERTeq(ctx->state, OPERATION_IN_PROGRESS); + ctx->state = OPERATION_IDLE; +} + +/* + * operation_process -- processes registered operations + * + * The order of processing is important: persistent, transient. + * This is because the transient entries that reside on persistent memory might + * require write to a location that is currently occupied by a valid persistent + * state but becomes a transient state after operation is processed. + */ +void +operation_process(struct operation_context *ctx) +{ + /* + * If there's exactly one persistent entry there's no need to involve + * the redo log. We can simply assign the value, the operation will be + * atomic. 
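+ * + * For example, a lone 8-byte bitmap update (for instance a SET_BITS or CLR_BITS entry logged by run_prep_operation_hdr) can take this path: the single entry is turned into a WAL record and applied directly, and the full redo-log processing below is skipped.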
+ */ + int redo_process = ctx->type == LOG_TYPE_REDO && + ctx->pshadow_ops.offset != 0; + if (redo_process && + ctx->pshadow_ops.offset == sizeof(struct ulog_entry_val)) { + struct ulog_entry_base *e = (struct ulog_entry_base *) + ctx->pshadow_ops.ulog->data; + ulog_operation_type t = ulog_entry_type(e); + + if ((t == ULOG_OPERATION_SET) || ULOG_ENTRY_IS_BIT_OP(t)) { + tx_create_wal_entry(e, NULL, ctx->p_ops); + ulog_entry_apply(e, 1, ctx->p_ops); + redo_process = 0; + } + } + + if (redo_process) { + operation_process_persistent_redo(ctx); + ctx->state = OPERATION_CLEANUP; + } + D_ASSERT(ctx->type != LOG_TYPE_UNDO); + + /* process transient entries with transient memory ops */ + if (ctx->transient_ops.offset != 0) + ulog_process(ctx->transient_ops.ulog, NULL, &ctx->t_ops); +} + +/* + * operation_finish -- finalizes the operation + */ +void +operation_finish(struct operation_context *ctx, unsigned flags) +{ + ASSERTne(ctx->state, OPERATION_IDLE); + + if (ctx->type == LOG_TYPE_UNDO && ctx->total_logged != 0) + ctx->state = OPERATION_CLEANUP; + + if (ctx->state != OPERATION_CLEANUP) + goto out; + + if (ctx->type == LOG_TYPE_UNDO) { + int ret = ulog_clobber_data(ctx->ulog, + &ctx->next, ctx->ulog_free, flags); + + if (ret == 0) + goto out; + } else if (ctx->type == LOG_TYPE_REDO) { + int ret = ulog_free_next(ctx->ulog, ctx->ulog_free); + + if (ret == 0) + goto out; + } + + /* clobbering shrunk the ulog */ + ctx->ulog_capacity = ulog_capacity(ctx->ulog, + ctx->ulog_base_nbytes); + VEC_CLEAR(&ctx->next); + ulog_rebuild_next_vec(ctx->ulog, &ctx->next); + +out: + ctx->state = OPERATION_IDLE; +} diff --git a/src/common/dav_v2/memops.h b/src/common/dav_v2/memops.h new file mode 100644 index 00000000000..23e5d531cde --- /dev/null +++ b/src/common/dav_v2/memops.h @@ -0,0 +1,66 @@ +/* SPDX-License-Identifier: BSD-3-Clause */ +/* Copyright 2016-2023, Intel Corporation */ + +/* + * memops.h -- aggregated memory operations helper definitions + */ + +#ifndef __DAOS_COMMON_MEMOPS_H +#define __DAOS_COMMON_MEMOPS_H 1 + +#include +#include + +#include "vec.h" +#include "mo_wal.h" +#include "ulog.h" + +enum operation_log_type { + LOG_PERSISTENT, /* log of persistent modifications */ + LOG_TRANSIENT, /* log of transient memory modifications */ + + MAX_OPERATION_LOG_TYPE +}; + +enum log_type { + LOG_TYPE_UNDO, + LOG_TYPE_REDO, + + MAX_LOG_TYPE, +}; + +struct user_buffer_def { + void *addr; + size_t size; +}; + +struct operation_context; + +struct operation_context * +operation_new(struct ulog *redo, size_t ulog_base_nbytes, + ulog_extend_fn extend, ulog_free_fn ulog_free, + const struct mo_ops *p_ops, enum log_type type); + +void operation_init(struct operation_context *ctx); +void operation_start(struct operation_context *ctx); + +void operation_delete(struct operation_context *ctx); +void operation_free_logs(struct operation_context *ctx); + +int operation_add_buffer(struct operation_context *ctx, + void *dest, void *src, size_t size, ulog_operation_type type); + +int operation_add_entry(struct operation_context *ctx, + void *ptr, uint64_t value, ulog_operation_type type); +int operation_add_typed_entry(struct operation_context *ctx, + void *ptr, uint64_t value, + ulog_operation_type type, enum operation_log_type log_type); +void operation_set_auto_reserve(struct operation_context *ctx, + int auto_reserve); + +int operation_reserve(struct operation_context *ctx, size_t new_capacity); +void operation_process(struct operation_context *ctx); +void operation_finish(struct operation_context *ctx, unsigned flags); 
+void operation_cancel(struct operation_context *ctx); + +#endif /* __DAOS_COMMON_MEMOPS_H */ diff --git a/src/common/dav_v2/meta_io.c b/src/common/dav_v2/meta_io.c new file mode 100644 index 00000000000..2e4b044aaa7 --- /dev/null +++ b/src/common/dav_v2/meta_io.c @@ -0,0 +1,164 @@ +/* SPDX-License-Identifier: BSD-3-Clause */ +/* Copyright 2017-2024, Intel Corporation */ + +/* + * meta_io.c -- IO to/from meta blob bypassing WAL. + */ + +#include +#include + +/** Maximum number of sets of pages in-flight at a time */ +#define MAX_INFLIGHT_SETS 4 + +int +meta_clear_pages(struct umem_store *store, daos_off_t start_off, daos_size_t size, + daos_size_t hop_dist, int cnt) +{ + struct umem_store_iod iod; + struct umem_store_region iod_region[MAX_INFLIGHT_SETS]; + d_sg_list_t sgl; + d_iov_t sg_iov[MAX_INFLIGHT_SETS]; + char *src; + int rc; + int i; + + D_ASSERT((size % 4096) == 0); + D_ASSERT(hop_dist != 0); + + D_ALLOC(src, size); + if (src == NULL) + return ENOMEM; + + sgl.sg_iovs = sg_iov; + for (i = 0; i < MAX_INFLIGHT_SETS; i++) + d_iov_set(&sg_iov[i], src, size); + do { + iod.io_nr = (cnt > MAX_INFLIGHT_SETS) ? MAX_INFLIGHT_SETS : cnt; + sgl.sg_nr = iod.io_nr; + sgl.sg_nr_out = iod.io_nr; + + for (i = 0; i < iod.io_nr; i++) { + iod_region[i].sr_addr = start_off; + iod_region[i].sr_size = size; + start_off += hop_dist; + } + iod.io_regions = iod_region; + + rc = store->stor_ops->so_write(store, &iod, &sgl); + D_ASSERT(rc == 0); + + cnt -= iod.io_nr; + } while (cnt > 0); + + D_FREE(src); + return 0; +} + +/* + * meta_update -- Write size bytes from addr src to meta blob at offset off. + */ +int +meta_update(struct umem_store *store, void *src, daos_off_t off, daos_size_t size) +{ + struct umem_store_iod iod; + d_sg_list_t sgl; + d_iov_t sg_iov; + int rc; + + iod.io_nr = 1; + iod.io_region.sr_addr = off; + iod.io_region.sr_size = size; + iod.io_regions = &iod.io_region; + sgl.sg_nr = 1; + sgl.sg_nr_out = 1; + sgl.sg_iovs = &sg_iov; + d_iov_set(&sg_iov, src, size); + + D_ASSERT(store != NULL); + if (store->stor_ops->so_write == NULL) + return 0; + + rc = store->stor_ops->so_write(store, &iod, &sgl); + if (rc != 0) { + D_ERROR("Failed to write to meta at offset %lu, size %lu, rc = %d\n", off, size, + rc); + return EFAULT; + } + return 0; +} + +/* + * meta_fetch -- Fetch size bytes from offset off in the meta blob to addr dest. + */ +int +meta_fetch(struct umem_store *store, void *dest, daos_off_t off, daos_size_t size) +{ + struct umem_store_iod iod; + d_sg_list_t sgl; + d_iov_t sg_iov; + int rc; + + iod.io_nr = 1; + iod.io_region.sr_addr = off; + iod.io_region.sr_size = size; + iod.io_regions = &iod.io_region; + sgl.sg_nr = 1; + sgl.sg_nr_out = 1; + sgl.sg_iovs = &sg_iov; + d_iov_set(&sg_iov, dest, size); + + D_ASSERT(store != NULL); + if (store->stor_ops->so_write == NULL) + return 0; + + rc = store->stor_ops->so_read(store, &iod, &sgl); + if (rc != 0) { + D_ERROR("Failed to read from meta at offset %lu, size %lu, rc = %d\n", off, size, + rc); + return EFAULT; + } + return 0; +} + +/* + * meta_fetch_batch -- Fetch nelems of elem_size bytes starting from metablob offset start_off and + * hop distance of hop_dist to the buffer dest. 
+ */ +int +meta_fetch_batch(struct umem_store *store, void *dest, daos_off_t start_off, daos_size_t elem_size, + daos_size_t hop_dist, int nelems) +{ + struct umem_store_iod iod; + struct umem_store_region iod_region[MAX_INFLIGHT_SETS]; + d_sg_list_t sgl; + d_iov_t sg_iov[MAX_INFLIGHT_SETS]; + int rc; + int i; + + D_ASSERT((elem_size % 4096) == 0); + D_ASSERT(hop_dist != 0); + + sgl.sg_iovs = sg_iov; + while (nelems > 0) { + iod.io_nr = (nelems > MAX_INFLIGHT_SETS) ? MAX_INFLIGHT_SETS : nelems; + sgl.sg_nr = iod.io_nr; + sgl.sg_nr_out = iod.io_nr; + + for (i = 0; i < iod.io_nr; i++) { + d_iov_set(&sg_iov[i], dest, elem_size); + iod_region[i].sr_addr = start_off; + iod_region[i].sr_size = elem_size; + start_off += hop_dist; + dest += elem_size; + } + iod.io_regions = iod_region; + + rc = store->stor_ops->so_read(store, &iod, &sgl); + if (rc) + return -1; + + nelems -= iod.io_nr; + } + return 0; +} diff --git a/src/common/dav_v2/meta_io.h b/src/common/dav_v2/meta_io.h new file mode 100644 index 00000000000..3193df364fb --- /dev/null +++ b/src/common/dav_v2/meta_io.h @@ -0,0 +1,41 @@ +/* SPDX-License-Identifier: BSD-3-Clause */ +/* Copyright 2024, Intel Corporation */ + +/* + * meta_io.h -- definitions of statistics + */ + +#ifndef __DAOS_COMMON_META_IO_H +#define __DAOS_COMMON_META_IO_H 1 + +#include + +struct umem_store; +/* + * meta_clear_pages - fill zeros at various offsets in the meta blob. + */ +int +meta_clear_pages(struct umem_store *store, daos_off_t start_off, daos_size_t size, + daos_size_t hop_dist, int cnt); + +/* + * meta_update -- Write size bytes from addr src to meta blob at offset off. + */ +int +meta_update(struct umem_store *store, void *src, daos_off_t off, daos_size_t size); + +/* + * meta_fetch -- Fetch size bytes from offset off in the meta blob to addr dest. + */ +int +meta_fetch(struct umem_store *store, void *dest, daos_off_t off, daos_size_t size); + +/* + * meta_fetch_batch -- Fetch nelems of elem_size bytes starting from metablob offset + * start_off and hop distance of hop_dist to the buffer dest. 
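+ * + * Illustrative call (the names and sizes are hypothetical): fetching one 4 KiB header from each of 16 regions laid out zone_size bytes apart would be meta_fetch_batch(store, buf, first_hdr_off, 4096, zone_size, 16); note that elem_size must be a multiple of 4096, as asserted by the implementation.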
+ */ +int +meta_fetch_batch(struct umem_store *store, void *dest, daos_off_t start_off, daos_size_t elem_size, + daos_size_t hop_dist, int nelems); + +#endif /* __DAOS_COMMON_META_IO_H */ diff --git a/src/common/dav_v2/mo_wal.h b/src/common/dav_v2/mo_wal.h new file mode 100644 index 00000000000..5ff7b8a71ac --- /dev/null +++ b/src/common/dav_v2/mo_wal.h @@ -0,0 +1,91 @@ +/* SPDX-License-Identifier: BSD-3-Clause */ +/* Copyright 2016-2024, Intel Corporation */ + +#ifndef __DAOS_COMMON_MO_WAL_H +#define __DAOS_COMMON_MO_WAL_H 1 + +#include +#include +#include +#include + +#include "out.h" +#include "wal_tx.h" + +typedef int (*persist_fn)(void *base, const void *, size_t, unsigned); +typedef int (*flush_fn)(void *base, const void *, size_t, unsigned); +typedef void (*drain_fn)(void *base); + +typedef void *(*memcpy_fn)(void *base, void *dest, const void *src, size_t len, + unsigned flags); +typedef void *(*memmove_fn)(void *base, void *dest, const void *src, size_t len, + unsigned flags); +typedef void *(*memset_fn)(void *base, void *dest, int c, size_t len, + unsigned flags); + +typedef int (*remote_read_fn)(void *ctx, uintptr_t base, void *dest, void *addr, + size_t length); + +struct umem_store; + +struct mo_ops { + /* for 'master' replica: with or without data replication */ + persist_fn persist; /* persist function */ + flush_fn flush; /* flush function */ + drain_fn drain; /* drain function */ + memcpy_fn memcpy; /* persistent memcpy function */ + memmove_fn memmove; /* persistent memmove function */ + memset_fn memset; /* persistent memset function */ + void *base; + struct umem_store *umem_store; +}; + +static force_inline void +mo_wal_persist(const struct mo_ops *p_ops, void *d, size_t s) +{ + dav_wal_tx_snap(p_ops->base, d, s, d, 0); +} + +static force_inline void +mo_wal_flush(const struct mo_ops *p_ops, void *d, size_t s, int flags) +{ + dav_wal_tx_snap(p_ops->base, d, s, d, flags); +} + +static force_inline void +mo_wal_drain(const struct mo_ops *p_ops) +{ + SUPPRESS_UNUSED(p_ops); +} + +static force_inline void * +mo_wal_memcpy(const struct mo_ops *p_ops, void *dest, + const void *src, size_t len, unsigned flags) +{ + SUPPRESS_UNUSED(p_ops); + memcpy(dest, src, len); + mo_wal_flush(p_ops, dest, len, 0); + return dest; +} + +static force_inline void * +mo_wal_memmove(const struct mo_ops *p_ops, void *dest, + const void *src, size_t len, unsigned flags) +{ + SUPPRESS_UNUSED(p_ops); + memmove(dest, src, len); + mo_wal_flush(p_ops, dest, len, 0); + return dest; +} + +static force_inline void * +mo_wal_memset(const struct mo_ops *p_ops, void *dest, int c, + size_t len, unsigned flags) +{ + SUPPRESS_UNUSED(p_ops); + memset(dest, c, len); + dav_wal_tx_set(p_ops->base, dest, c, len); + return dest; +} + +#endif /* __DAOS_COMMON_MO_WAL_H */ diff --git a/src/common/dav_v2/obj.h b/src/common/dav_v2/obj.h new file mode 100644 index 00000000000..3182077bbfd --- /dev/null +++ b/src/common/dav_v2/obj.h @@ -0,0 +1,42 @@ +/* SPDX-License-Identifier: BSD-3-Clause */ +/* Copyright 2014-2024, Intel Corporation */ + +/* + * obj.h -- internal definitions for obj module + */ + +#ifndef __DAOS_COMMON_OBJ_H +#define __DAOS_COMMON_OBJ_H 1 + +#include +#include + +#include "dav_internal.h" +#include "stats.h" +#include "daos/mem.h" + +#define OBJ_OFF_TO_PTR(pop, off) umem_cache_off2ptr(((dav_obj_t *)pop)->do_store, off) +#define OBJ_PTR_TO_OFF(pop, ptr) umem_cache_ptr2off(((dav_obj_t *)pop)->do_store, ptr) +#define OBJ_OFF_FROM_HEAP(pop, off) \ + (((off) >= (ALIGN_UP(sizeof(struct heap_header), 4096))) && 
\ + ((off) < ((dav_obj_t *)(pop))->do_size_meta)) + +#define OBJ_OFF_IS_VALID(pop, off) OBJ_OFF_FROM_HEAP(pop, off) + +#define OBJ_PTR_FROM_POOL(pop, ptr) \ + ((uintptr_t)(ptr) >= (uintptr_t)(((dav_obj_t *)pop)->do_base) && \ + (uintptr_t)(ptr) < \ + (uintptr_t)(((dav_obj_t *)pop)->do_base) + (((dav_obj_t *)pop)->do_size_mem_usable)) + +#define OBJ_PTR_IS_VALID(pop, ptr) OBJ_PTR_FROM_POOL(pop, ptr) + +#define OBJ_OFFRANGE_FROM_HEAP(pop, start, end) \ + (((start) >= (ALIGN_UP(sizeof(struct heap_header), 4096))) && \ + ((end) <= (((dav_obj_t *)pop)->do_size_meta))) + +typedef uint64_t type_num_t; + +#define CLASS_ID_FROM_FLAG(flag) ((uint16_t)((flag) >> 48)) +#define EZONE_ID_FROM_FLAG(flag) ((uint32_t)((flag) >> 16)) + +#endif /* __DAOS_COMMON_OBJ_H */ diff --git a/src/common/dav_v2/out.h b/src/common/dav_v2/out.h new file mode 100644 index 00000000000..9c5cc8516b9 --- /dev/null +++ b/src/common/dav_v2/out.h @@ -0,0 +1,104 @@ +/* SPDX-License-Identifier: BSD-3-Clause */ +/* Copyright 2014-2023, Intel Corporation */ + +/* + * out.h -- definitions for "out" module + */ + +#ifndef __DAOS_COMMON_OUT_H +#define __DAOS_COMMON_OUT_H 1 + +#include +#include "util.h" + +#define DAV_LOG_FAC DB_TRACE + +/* enable extra debug messages and extra checks */ +/*#define DAV_EXTRA_DEBUG*/ + +#ifndef EVALUATE_DBG_EXPRESSIONS +#if defined(DAV_EXTRA_DEBUG) || defined(__clang_analyzer__) || defined(__COVERITY__) ||\ + defined(__KLOCWORK__) +#define EVALUATE_DBG_EXPRESSIONS 1 +#else +#define EVALUATE_DBG_EXPRESSIONS 0 +#endif +#endif + +#define TEST_ALWAYS_TRUE_EXPR(cnd) do { \ + if (__builtin_constant_p(cnd)) \ + COMPILE_ERROR_ON(cnd); \ +} while (0) +#define TEST_ALWAYS_EQ_EXPR(lhs, rhs) do { \ + if (__builtin_constant_p(lhs) && __builtin_constant_p(rhs)) \ + COMPILE_ERROR_ON((lhs) == (rhs)); \ +} while (0) +#define TEST_ALWAYS_NE_EXPR(lhs, rhs) do { \ + if (__builtin_constant_p(lhs) && __builtin_constant_p(rhs)) \ + COMPILE_ERROR_ON((lhs) != (rhs)); \ +} while (0) + +/* produce debug/trace output */ +#if defined(DAV_EXTRA_DEBUG) +#define DAV_DBG(fmt, ...) D_DEBUG(DAV_LOG_FAC, fmt "\n", ##__VA_ARGS__) +#else +#define DAV_DBG(fmt, ...) SUPPRESS_UNUSED(__VA_ARGS__) +#endif + +/* produce output and exit */ +#define FATAL(fmt, ...) \ + D_ASSERTF(0, fmt "\n", ## __VA_ARGS__) + +/* assert a condition is true at runtime */ +#define ASSERT_rt(cnd) do { \ + if (!EVALUATE_DBG_EXPRESSIONS || (cnd)) \ + break; \ + D_ASSERT(cnd); \ +} while (0) + +/* assert two integer values are equal at runtime */ +#define ASSERTeq_rt(lhs, rhs) do { \ + if (!EVALUATE_DBG_EXPRESSIONS || ((lhs) == (rhs)))\ + break; \ + D_ASSERTF(((lhs) == (rhs)), \ + "assertion failure: %s (0x%llx) == %s (0x%llx)", #lhs,\ + (unsigned long long)(lhs), #rhs, (unsigned long long)(rhs)); \ +} while (0) + +/* assert two integer values are not equal at runtime */ +#define ASSERTne_rt(lhs, rhs) do { \ + if (!EVALUATE_DBG_EXPRESSIONS || ((lhs) != (rhs)))\ + break; \ + D_ASSERTF(((lhs) != (rhs)), \ + "assertion failure: %s (0x%llx) != %s (0x%llx)", #lhs,\ + (unsigned long long)(lhs), #rhs, (unsigned long long)(rhs)); \ +} while (0) + +/* + * Detect useless asserts on always true expression. Please use + * COMPILE_ERROR_ON(!cnd) or ASSERT_rt(cnd) in such cases. + */ +/* assert a condition is true */ +#define ASSERT(cnd) do {\ + TEST_ALWAYS_TRUE_EXPR(cnd);\ + ASSERT_rt(cnd);\ + } while (0) + +/* assert two integer values are equal */ +#define ASSERTeq(lhs, rhs) do {\ + /* See comment in ASSERT. 
*/\ + TEST_ALWAYS_EQ_EXPR(lhs, rhs);\ + ASSERTeq_rt(lhs, rhs);\ + } while (0) + +/* assert two integer values are not equal */ +#define ASSERTne(lhs, rhs) do {\ + /* See comment in ASSERT. */\ + TEST_ALWAYS_NE_EXPR(lhs, rhs);\ + ASSERTne_rt(lhs, rhs);\ + } while (0) + +#define ERR(fmt, ...)\ + D_ERROR(fmt "\n", ## __VA_ARGS__) + +#endif /* __DAOS_COMMON_OUT_H */ diff --git a/src/common/dav_v2/palloc.c b/src/common/dav_v2/palloc.c new file mode 100644 index 00000000000..3b929583e9a --- /dev/null +++ b/src/common/dav_v2/palloc.c @@ -0,0 +1,982 @@ +/* SPDX-License-Identifier: BSD-3-Clause */ +/* Copyright 2015-2024, Intel Corporation */ + +/* + * palloc.c -- implementation of pmalloc POSIX-like API + * + * This is the front-end part of the persistent memory allocator. It uses both + * transient and persistent representation of the heap to provide memory blocks + * in a reasonable time and with an acceptable common-case fragmentation. + * + * Lock ordering in the entirety of the allocator is simple, but might be hard + * to follow at times because locks are, by necessity, externalized. + * There are two sets of locks that need to be taken into account: + * - runtime state locks, represented by buckets. + * - persistent state locks, represented by memory block mutexes. + * + * To properly use them, follow these rules: + * - When nesting, always lock runtime state first. + * Doing the reverse might cause deadlocks in other parts of the code. + * + * - When introducing functions that would require runtime state locks, + * always try to move the lock acquiring to the upper most layer. This + * usually means that the functions will simply take "struct bucket" as + * their argument. By doing so most of the locking can happen in + * the frontend part of the allocator and it's easier to follow the first + * rule because all functions in the backend can safely use the persistent + * state locks - the runtime lock, if it is needed, will be already taken + * by the upper layer. + * + * General lock ordering: + * 1. arenas.lock + * 2. buckets (sorted by ID) + * 3. memory blocks (sorted by lock address) + */ + +#include "bucket.h" +#include "valgrind_internal.h" +#include "heap_layout.h" +#include "heap.h" +#include "alloc_class.h" +#include "out.h" +#include "sys_util.h" +#include "palloc.h" +#include "ravl.h" +#include "vec.h" + +struct dav_action_internal { + /* type of operation (alloc/free vs set) */ + enum dav_action_type type; + + uint32_t padding; + + /* + * Action-specific lock that needs to be taken for the duration of + * an action. 
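+	 * For heap actions this is the lock returned by the memory block's
+	 * m_ops->get_lock(); palloc_exec_actions() sorts actions by lock
+	 * address and holds the lock across exec and on_process.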
+ */ + pthread_mutex_t *lock; + + /* action-specific data */ + union { + /* valid only when type == DAV_ACTION_TYPE_HEAP */ + struct { + uint64_t offset; + uint64_t usable_size; + enum memblock_state new_state; + struct memory_block m; + struct memory_block_reserved *mresv; + }; + + /* valid only when type == DAV_ACTION_TYPE_MEM */ + struct { + uint64_t *ptr; + uint64_t value; + }; + + /* padding, not used */ + uint64_t data2[14]; + }; +}; +D_CASSERT(offsetof(struct dav_action_internal, data2) == offsetof(struct dav_action, data2), + "struct dav_action misaligned!"); + +/* + * palloc_set_value -- creates a new set memory action + */ +void +palloc_set_value(struct palloc_heap *heap, struct dav_action *act, + uint64_t *ptr, uint64_t value) +{ + /* suppress unused-parameter errors */ + SUPPRESS_UNUSED(heap); + + act->type = DAV_ACTION_TYPE_MEM; + + struct dav_action_internal *actp = (struct dav_action_internal *)act; + + actp->ptr = ptr; + actp->value = value; + actp->lock = NULL; +} + +static void * +zone_get_base_address(struct palloc_heap *heap, void *ptr) +{ + uint64_t off = HEAP_PTR_TO_OFF(heap, ptr); + uint32_t zid = heap_off2mbid(heap, off); + + if (zid) + return ZID_TO_ZONE(&heap->layout_info, zid); + + return heap->layout_info.zone0; +} + +/* + * alloc_prep_block -- (internal) prepares a memory block for allocation + * + * Once the block is fully reserved and it's guaranteed that no one else will + * be able to write to this memory region it is safe to write the allocation + * header and call the object construction function. + * + * Because the memory block at this stage is only reserved in transient state + * there's no need to worry about fail-safety of this method because in case + * of a crash the memory will be back in the free blocks collection. + */ +static int +alloc_prep_block(struct palloc_heap *heap, const struct memory_block *m, + palloc_constr constructor, void *arg, + uint64_t extra_field, uint16_t object_flags, + struct dav_action_internal *out) +{ + void *uptr = m->m_ops->get_user_data(m); + size_t usize = m->m_ops->get_user_size(m); + + VALGRIND_DO_MEMPOOL_ALLOC(zone_get_base_address(heap, uptr), uptr, usize); + VALGRIND_DO_MAKE_MEM_UNDEFINED(uptr, usize); + VALGRIND_ANNOTATE_NEW_MEMORY(uptr, usize); + + m->m_ops->write_header(m, extra_field, object_flags); + + /* + * Set allocated memory with pattern, if debug.heap.alloc_pattern CTL + * parameter had been set. + */ + if (unlikely(heap->alloc_pattern > PALLOC_CTL_DEBUG_NO_PATTERN)) { + mo_wal_memset(&heap->p_ops, uptr, heap->alloc_pattern, + usize, 0); + VALGRIND_DO_MAKE_MEM_UNDEFINED(uptr, usize); + } + + int ret; + + if (constructor != NULL) { + ret = constructor(heap->p_ops.base, uptr, usize, arg); + if (ret != 0) { + /* + * If canceled, revert the block back to the free + * state in vg machinery. + */ + VALGRIND_DO_MEMPOOL_FREE(zone_get_base_address(heap, uptr), uptr); + return ret; + } + } + + /* + * To avoid determining the user data pointer twice this method is also + * responsible for calculating the offset of the object in the pool that + * will be used to set the offset destination pointer provided by the + * caller. + */ + out->offset = HEAP_PTR_TO_OFF(heap, uptr); + out->usable_size = usize; + + return 0; +} + +/* + * palloc_reservation_create -- creates a volatile reservation of a + * memory block. + * + * The first step in the allocation of a new block is reserving it in + * the transient heap - which is represented by the bucket abstraction. 
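+ * Here the bucket comes from the memory bucket runtime (mbrt) selected by
+ * mb_id - see heap_mbrt_get_mb() and mbrt_bucket_acquire() in the body below.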
+ * + * To provide optimal scaling for multi-threaded applications and reduce + * fragmentation the appropriate bucket is chosen depending on the + * current thread context and to which allocation class the requested + * size falls into. + * + * Once the bucket is selected, just enough memory is reserved for the + * requested size. The underlying block allocation algorithm + * (best-fit, next-fit, ...) varies depending on the bucket container. + */ +static int +palloc_reservation_create(struct palloc_heap *heap, size_t size, palloc_constr constructor, + void *arg, uint64_t extra_field, uint16_t object_flags, uint16_t class_id, + uint32_t mb_id, struct dav_action_internal *out) +{ + int err = 0; + struct memory_block *new_block = &out->m; + struct mbrt *mb; + + out->type = DAV_ACTION_TYPE_HEAP; + + ASSERT(class_id < UINT8_MAX); + struct alloc_class *c = class_id == 0 ? + heap_get_best_class(heap, size) : + alloc_class_by_id(heap_alloc_classes(heap), + (uint8_t)class_id); + + if (c == NULL) { + ERR("no allocation class for size %lu bytes", size); + errno = EINVAL; + return -1; + } + +retry: + mb = heap_mbrt_get_mb(heap, mb_id); + if (mb == NULL) { + errno = EINVAL; + return -1; + } + + /* + * The caller provided size in bytes, but buckets operate in + * 'size indexes' which are multiples of the block size in the + * bucket. + * + * For example, to allocate 500 bytes from a bucket that + * provides 256 byte blocks two memory 'units' are required. + */ + ssize_t size_idx = alloc_class_calc_size_idx(c, size); + + if (size_idx < 0) { + ERR("allocation class not suitable for size %lu bytes", + size); + errno = EINVAL; + return -1; + } + ASSERT(size_idx <= UINT32_MAX); + *new_block = MEMORY_BLOCK_NONE; + new_block->size_idx = (uint32_t)size_idx; + + err = heap_mbrt_update_alloc_class_buckets(heap, mb, c); + if (err != 0) { + errno = err; + return -1; + } + + struct bucket *b = mbrt_bucket_acquire(mb, c->id); + + err = heap_get_bestfit_block(heap, b, new_block); + if (err != 0) + goto out; + + if (alloc_prep_block(heap, new_block, constructor, arg, + extra_field, object_flags, out) != 0) { + /* + * Constructor returned non-zero value which means + * the memory block reservation has to be rolled back. + */ + if (new_block->type == MEMORY_BLOCK_HUGE) + bucket_insert_block(b, new_block); + err = ECANCELED; + goto out; + } + + /* + * Each as of yet unfulfilled reservation needs to be tracked in the + * runtime state. + * The memory block cannot be put back into the global state unless + * there are no active reservations. + */ + out->mresv = bucket_active_block(b); + if (out->mresv != NULL) + util_fetch_and_add64(&out->mresv->nresv, 1); + + out->lock = new_block->m_ops->get_lock(new_block); + out->new_state = MEMBLOCK_ALLOCATED; + +out: + mbrt_bucket_release(b); + + if (err == 0) + return 0; + + /* + * If there is no memory in evictable zone then do the allocation + * from non-evictable zone. 
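+	 * mb_id 0 denotes the non-evictable zone, hence the retry below with
+	 * mb_id reset to 0 after logging the allocation failure.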
+ */ + if ((mb_id != 0) && (err == ENOMEM)) { + heap_mbrt_log_alloc_failure(heap, mb_id); + mb_id = 0; + goto retry; + } + + errno = err; + return -1; +} + +/* + * palloc_heap_action_exec -- executes a single heap action (alloc, free) + */ +static void +palloc_heap_action_exec(struct palloc_heap *heap, + const struct dav_action_internal *act, + struct operation_context *ctx) +{ + struct zone *zone; +#ifdef DAV_EXTRA_DEBUG + if (act->m.m_ops->get_state(&act->m) == act->new_state) { + D_CRIT("invalid operation or heap corruption\n"); + ASSERT(0); + } +#endif + + /* + * The actual required metadata modifications are chunk-type + * dependent, but it always is a modification of a single 8 byte + * value - either modification of few bits in a bitmap or + * changing a chunk type from free to used or vice versa. + */ + act->m.m_ops->prep_hdr(&act->m, act->new_state, ctx); + + /* + * Update the memory bucket utilization info. + */ + if (heap_mbrt_ismb_evictable(heap, act->m.zone_id)) + zone = ZID_TO_ZONE(&heap->layout_info, act->m.zone_id); + else + zone = heap->layout_info.zone0; + + if (act->new_state == MEMBLOCK_FREE) + zone->header.sp_usage -= act->m.m_ops->get_real_size(&act->m); + else + zone->header.sp_usage += act->m.m_ops->get_real_size(&act->m); + operation_add_entry(ctx, &zone->header.sp_usage, zone->header.sp_usage, ULOG_OPERATION_SET); +} + +/* + * palloc_restore_free_chunk_state -- updates the runtime state of a free chunk. + * + * This function also takes care of coalescing of huge chunks. + */ +static void +palloc_restore_free_chunk_state(struct palloc_heap *heap, + struct memory_block *m) +{ + struct mbrt *mb = heap_mbrt_get_mb(heap, m->zone_id); + + if (m->type == MEMORY_BLOCK_HUGE) { + struct bucket *b = mbrt_bucket_acquire(mb, DEFAULT_ALLOC_CLASS_ID); + + if (heap_free_chunk_reuse(heap, b, m) != 0) { + if (errno == EEXIST) + FATAL("duplicate runtime chunk state, possible double free"); + else + D_CRIT("unable to track runtime chunk state\n"); + } + mbrt_bucket_release(b); + } +} + +/* + * palloc_mem_action_noop -- empty handler for unused memory action funcs + */ +static void +palloc_mem_action_noop(struct palloc_heap *heap, + struct dav_action_internal *act) +{ + /* suppress unused-parameter errors */ + SUPPRESS_UNUSED(heap, act); +} + +/* + * palloc_reservation_clear -- clears the reservation state of the block, + * discards the associated memory block if possible + */ +static void +palloc_reservation_clear(struct palloc_heap *heap, + struct dav_action_internal *act, int publish) +{ + if (act->mresv == NULL) + return; + + struct memory_block_reserved *mresv = act->mresv; + struct bucket_locked *locked = mresv->bucket; + + if (!publish) { + /* + * If a memory block used for the action is the currently active + * memory block of the bucket it can be returned back to the + * bucket. This way it will be available for future allocation + * requests, improving performance. + */ + struct bucket *b = bucket_acquire(locked); + + bucket_try_insert_attached_block(b, &act->m); + bucket_release(b); + } + + if (util_fetch_and_sub64(&mresv->nresv, 1) == 1) { + VALGRIND_ANNOTATE_HAPPENS_AFTER(&mresv->nresv); + /* + * If the memory block used for the action is not currently used + * in any bucket nor action it can be discarded (given back to + * the heap). 
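+		 * The fetch-and-sub above yields the previous count, so seeing
+		 * 1 here means this thread dropped the last outstanding
+		 * reservation and owns the cleanup below.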
+ */ + heap_discard_run(heap, &mresv->m); + D_FREE(mresv); + } else { + VALGRIND_ANNOTATE_HAPPENS_BEFORE(&mresv->nresv); + } +} + +/* + * palloc_heap_action_on_cancel -- restores the state of the heap + */ +static void +palloc_heap_action_on_cancel(struct palloc_heap *heap, + struct dav_action_internal *act) +{ + void *uptr; + + if (act->new_state == MEMBLOCK_FREE) + return; + + uptr = act->m.m_ops->get_user_data(&act->m); + VALGRIND_DO_MEMPOOL_FREE(zone_get_base_address(heap, uptr), uptr); + + act->m.m_ops->invalidate(&act->m); + palloc_restore_free_chunk_state(heap, &act->m); + + palloc_reservation_clear(heap, act, 0 /* publish */); +} + +/* + * palloc_heap_action_on_process -- performs finalization steps under a lock + * on the persistent state + */ +static void +palloc_heap_action_on_process(struct palloc_heap *heap, + struct dav_action_internal *act) +{ + if (act->new_state == MEMBLOCK_ALLOCATED) { + STATS_INC(heap->stats, persistent, heap_curr_allocated, + act->m.m_ops->get_real_size(&act->m)); + if (act->m.type == MEMORY_BLOCK_RUN) { + STATS_INC(heap->stats, transient, heap_run_allocated, + act->m.m_ops->get_real_size(&act->m)); + } + heap_mbrt_incrmb_usage(heap, act->m.zone_id, act->m.m_ops->get_real_size(&act->m)); + } else if (act->new_state == MEMBLOCK_FREE) { + if (On_memcheck) { + void *ptr = act->m.m_ops->get_user_data(&act->m); + + VALGRIND_DO_MEMPOOL_FREE(zone_get_base_address(heap, ptr), ptr); + } + + STATS_SUB(heap->stats, persistent, heap_curr_allocated, + act->m.m_ops->get_real_size(&act->m)); + if (act->m.type == MEMORY_BLOCK_RUN) { + STATS_SUB(heap->stats, transient, heap_run_allocated, + act->m.m_ops->get_real_size(&act->m)); + } + heap_memblock_on_free(heap, &act->m); + heap_mbrt_incrmb_usage(heap, act->m.zone_id, + -(act->m.m_ops->get_real_size(&act->m))); + } +} + +/* + * palloc_heap_action_on_unlock -- performs finalization steps that need to be + * performed without a lock on persistent state + */ +static void +palloc_heap_action_on_unlock(struct palloc_heap *heap, + struct dav_action_internal *act) +{ + if (act->new_state == MEMBLOCK_ALLOCATED) + palloc_reservation_clear(heap, act, 1 /* publish */); + else if (act->new_state == MEMBLOCK_FREE) + palloc_restore_free_chunk_state(heap, &act->m); +} + +/* + * palloc_mem_action_exec -- executes a single memory action (set, and, or) + */ +static void +palloc_mem_action_exec(struct palloc_heap *heap, + const struct dav_action_internal *act, + struct operation_context *ctx) +{ + /* suppress unused-parameter errors */ + SUPPRESS_UNUSED(heap); + + operation_add_entry(ctx, act->ptr, act->value, ULOG_OPERATION_SET); +} + +static const struct { + /* + * Translate action into some number of operation_entry'ies. + */ + void (*exec)(struct palloc_heap *heap, + const struct dav_action_internal *act, + struct operation_context *ctx); + + /* + * Cancel any runtime state changes. Can be called only when action has + * not been translated to persistent operation yet. + */ + void (*on_cancel)(struct palloc_heap *heap, + struct dav_action_internal *act); + + /* + * Final steps after persistent state has been modified. Performed + * under action-specific lock. + */ + void (*on_process)(struct palloc_heap *heap, + struct dav_action_internal *act); + + /* + * Final steps after persistent state has been modified. Performed + * after action-specific lock has been dropped. 
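+	 * (DAV_ACTION_TYPE_MEM needs no such finalization, so its on_cancel,
+	 * on_process and on_unlock slots below point at palloc_mem_action_noop.)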
+ */ + void (*on_unlock)(struct palloc_heap *heap, + struct dav_action_internal *act); +} action_funcs[DAV_MAX_ACTION_TYPE] = { + [DAV_ACTION_TYPE_HEAP] = { + .exec = palloc_heap_action_exec, + .on_cancel = palloc_heap_action_on_cancel, + .on_process = palloc_heap_action_on_process, + .on_unlock = palloc_heap_action_on_unlock, + }, + [DAV_ACTION_TYPE_MEM] = { + .exec = palloc_mem_action_exec, + .on_cancel = palloc_mem_action_noop, + .on_process = palloc_mem_action_noop, + .on_unlock = palloc_mem_action_noop, + } +}; + +/* + * palloc_action_compare -- compares two actions based on lock address + */ +static int +palloc_action_compare(const void *lhs, const void *rhs) +{ + const struct dav_action_internal *mlhs = lhs; + const struct dav_action_internal *mrhs = rhs; + uintptr_t vlhs = (uintptr_t)(mlhs->lock); + uintptr_t vrhs = (uintptr_t)(mrhs->lock); + + if (vlhs < vrhs) + return -1; + if (vlhs > vrhs) + return 1; + + return 0; +} + +/* + * palloc_exec_actions -- perform the provided free/alloc operations + */ +static void +palloc_exec_actions(struct palloc_heap *heap, + struct operation_context *ctx, + struct dav_action_internal *actv, + size_t actvcnt) +{ + /* + * The operations array is sorted so that proper lock ordering is + * ensured. + */ + if (actv) + qsort(actv, actvcnt, sizeof(struct dav_action_internal), + palloc_action_compare); + else + ASSERTeq(actvcnt, 0); + + struct dav_action_internal *act; + + for (size_t i = 0; i < actvcnt; ++i) { + act = &actv[i]; + + /* + * This lock must be held for the duration between the creation + * of the allocation metadata updates in the operation context + * and the operation processing. This is because a different + * thread might operate on the same 8-byte value of the run + * bitmap and override allocation performed by this thread. + */ + if (i == 0 || act->lock != actv[i - 1].lock) { + if (act->lock) + util_mutex_lock(act->lock); + } + + /* translate action to some number of operation_entry'ies */ + action_funcs[act->type].exec(heap, act, ctx); + } + + /* wait for all allocated object headers to be persistent */ + mo_wal_drain(&heap->p_ops); + + /* perform all persistent memory operations */ + operation_process(ctx); + + for (size_t i = 0; i < actvcnt; ++i) { + act = &actv[i]; + + action_funcs[act->type].on_process(heap, act); + + if (i == actvcnt - 1 || act->lock != actv[i + 1].lock) { + if (act->lock) + util_mutex_unlock(act->lock); + } + } + + for (size_t i = 0; i < actvcnt; ++i) { + act = &actv[i]; + + action_funcs[act->type].on_unlock(heap, act); + } + + operation_finish(ctx, 0); +} + +/* + * palloc_reserve -- creates a single reservation + */ +int +palloc_reserve(struct palloc_heap *heap, size_t size, palloc_constr constructor, void *arg, + uint64_t extra_field, uint16_t object_flags, uint16_t class_id, uint32_t mb_id, + struct dav_action *act) +{ + COMPILE_ERROR_ON(sizeof(struct dav_action) != + sizeof(struct dav_action_internal)); + + return palloc_reservation_create(heap, size, constructor, arg, extra_field, object_flags, + class_id, mb_id, (struct dav_action_internal *)act); +} + +/* + * palloc_action_isalloc - action is a heap reservation + * created by palloc_reserve(). 
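+ *
+ * Typical reservation flow (illustrative sketch only; error handling is
+ * elided and 'heap', 'ctx', 'size' and 'commit' are assumed to be provided
+ * by the caller):
+ *
+ *	struct dav_action act;
+ *
+ *	if (palloc_reserve(heap, size, NULL, NULL, 0, 0, 0, 0, &act) == 0) {
+ *		if (commit)
+ *			palloc_publish(heap, &act, 1, ctx);
+ *		else
+ *			palloc_cancel(heap, &act, 1);
+ *	}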
+ */ +int +palloc_action_isalloc(struct dav_action *act) +{ + struct dav_action_internal *actp = (struct dav_action_internal *)act; + + return ((actp->type == DAV_ACTION_TYPE_HEAP) && + (actp->new_state == MEMBLOCK_ALLOCATED)); +} + +uint64_t +palloc_get_realoffset(struct palloc_heap *heap, uint64_t off) +{ + struct memory_block m = memblock_from_offset(heap, off); + + return HEAP_PTR_TO_OFF(m.heap, m.m_ops->get_real_data(&m)); +} + +/* + * palloc_get_prange -- get the start offset and size of allocated memory that + * needs to be persisted. + * + * persist_udata - if true, persist the user data. + */ +void +palloc_get_prange(struct dav_action *act, uint64_t *const offp, uint64_t *const sizep, + int persist_udata) +{ + struct dav_action_internal *act_in = (struct dav_action_internal *)act; + + D_ASSERT(act_in->type == DAV_ACTION_TYPE_HEAP); + /* we need to persist the header if present */ + *offp = HEAP_PTR_TO_OFF(act_in->m.heap, act_in->m.m_ops->get_real_data(&act_in->m)); + *sizep = header_type_to_size[act_in->m.header_type]; + + D_ASSERT(act_in->offset == *offp + header_type_to_size[act_in->m.header_type]); + /* persist the user data */ + if (persist_udata) + *sizep += act_in->usable_size; +} + +/* + * palloc_defer_free -- creates an internal deferred free action + */ +static void +palloc_defer_free_create(struct palloc_heap *heap, uint64_t off, + struct dav_action_internal *out) +{ + COMPILE_ERROR_ON(sizeof(struct dav_action) != + sizeof(struct dav_action_internal)); + + out->type = DAV_ACTION_TYPE_HEAP; + out->offset = off; + out->m = memblock_from_offset(heap, off); + + /* + * For the duration of free we may need to protect surrounding + * metadata from being modified. + */ + out->lock = out->m.m_ops->get_lock(&out->m); + out->mresv = NULL; + out->new_state = MEMBLOCK_FREE; +} + +/* + * palloc_defer_free -- creates a deferred free action + */ +void +palloc_defer_free(struct palloc_heap *heap, uint64_t off, struct dav_action *act) +{ + COMPILE_ERROR_ON(sizeof(struct dav_action) != + sizeof(struct dav_action_internal)); + + palloc_defer_free_create(heap, off, (struct dav_action_internal *)act); +} + +/* + * palloc_cancel -- cancels all reservations in the array + */ +void +palloc_cancel(struct palloc_heap *heap, struct dav_action *actv, size_t actvcnt) +{ + struct dav_action_internal *act; + + for (size_t i = 0; i < actvcnt; ++i) { + act = (struct dav_action_internal *)&actv[i]; + action_funcs[act->type].on_cancel(heap, act); + } +} + +/* + * palloc_publish -- publishes all reservations in the array + */ +void +palloc_publish(struct palloc_heap *heap, struct dav_action *actv, size_t actvcnt, + struct operation_context *ctx) +{ + palloc_exec_actions(heap, ctx, + (struct dav_action_internal *)actv, actvcnt); +} + +/* + * palloc_operation -- persistent memory operation. Takes a NULL pointer + * or an existing memory block and modifies it to occupy, at least, 'size' + * number of bytes. + * + * The malloc, free and realloc routines are implemented in the context of this + * common operation which encompasses all of the functionality usually done + * separately in those methods. + * + * The first thing that needs to be done is determining which memory blocks + * will be affected by the operation - this varies depending on the whether the + * operation will need to modify or free an existing block and/or allocate + * a new one. 
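+ *
+ * In terms of the classic calls, the checks on 'off' and 'size' in the body
+ * below select the path taken:
+ * - off == 0, size != 0: plain allocation
+ * - off != 0, size == 0: plain free
+ * - off != 0, size != 0: reallocation (reserve new block, copy, free old)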
+ * + * Simplified allocation process flow is as follows: + * - reserve a new block in the transient heap + * - prepare the new block + * - create redo log of required modifications + * - chunk metadata + * - offset of the new object + * - commit and process the redo log + * + * And similarly, the deallocation process: + * - create redo log of required modifications + * - reverse the chunk metadata back to the 'free' state + * - set the destination of the object offset to zero + * - commit and process the redo log + * There's an important distinction in the deallocation process - it does not + * return the memory block to the transient container. That is done once no more + * memory is available. + * + * Reallocation is a combination of the above, with one additional step + * of copying the old content. + */ +int +palloc_operation(struct palloc_heap *heap, uint64_t off, uint64_t *dest_off, size_t size, + palloc_constr constructor, void *arg, uint64_t extra_field, uint16_t object_flags, + uint16_t class_id, uint32_t mb_id, struct operation_context *ctx) +{ + size_t user_size = 0; + + size_t nops = 0; + uint64_t aoff; + uint64_t asize; + struct dav_action_internal ops[2]; + struct dav_action_internal *alloc = NULL; + struct dav_action_internal *dealloc = NULL; + + /* + * The offset of an existing block can be nonzero which means this + * operation is either free or a realloc - either way the offset of the + * object needs to be translated into memory block, which is a structure + * that all of the heap methods expect. + */ + if (off != 0) { + dealloc = &ops[nops++]; + palloc_defer_free_create(heap, off, dealloc); + user_size = dealloc->m.m_ops->get_user_size(&dealloc->m); + if (user_size == size) { + operation_cancel(ctx); + return 0; + } + } + + /* alloc or realloc */ + if (size != 0) { + alloc = &ops[nops++]; + if (palloc_reservation_create(heap, size, constructor, arg, extra_field, + object_flags, class_id, mb_id, alloc) != 0) { + operation_cancel(ctx); + return -1; + } + + palloc_get_prange((struct dav_action *)alloc, &aoff, &asize, 0); + if (asize) /* != CHUNK_FLAG_HEADER_NONE */ + dav_wal_tx_snap(heap->p_ops.base, HEAP_OFF_TO_PTR(heap, aoff), + asize, HEAP_OFF_TO_PTR(heap, aoff), 0); + } + + /* realloc */ + if (alloc != NULL && dealloc != NULL) { + /* copy data to newly allocated memory */ + size_t old_size = user_size; + size_t to_cpy = old_size > size ? size : old_size; + + VALGRIND_ADD_TO_TX( + HEAP_OFF_TO_PTR(heap, alloc->offset), + to_cpy); + mo_wal_memcpy(&heap->p_ops, + HEAP_OFF_TO_PTR(heap, alloc->offset), + HEAP_OFF_TO_PTR(heap, off), + to_cpy, + 0); + VALGRIND_REMOVE_FROM_TX( + HEAP_OFF_TO_PTR(heap, alloc->offset), + to_cpy); + } + + /* + * If the caller provided a destination value to update, it needs to be + * modified atomically alongside the heap metadata, and so the operation + * context must be used. + */ + if (dest_off) { + operation_add_entry(ctx, dest_off, + alloc ? alloc->offset : 0, ULOG_OPERATION_SET); + } + + /* and now actually perform the requested operation! 
*/ + palloc_exec_actions(heap, ctx, ops, nops); + + return 0; +} + +/* + * palloc_usable_size -- returns the number of bytes in the memory block + */ +size_t +palloc_usable_size(struct palloc_heap *heap, uint64_t off) +{ + struct memory_block m = memblock_from_offset(heap, off); + + return m.m_ops->get_user_size(&m); +} + +/* + * palloc_extra -- returns allocation extra field + */ +uint64_t +palloc_extra(struct palloc_heap *heap, uint64_t off) +{ + struct memory_block m = memblock_from_offset(heap, off); + + return m.m_ops->get_extra(&m); +} + +/* + * palloc_flags -- returns allocation flags + */ +uint16_t +palloc_flags(struct palloc_heap *heap, uint64_t off) +{ + struct memory_block m = memblock_from_offset(heap, off); + + return m.m_ops->get_flags(&m); +} + +/* + * pmalloc_search_cb -- (internal) foreach callback. + */ +static int +pmalloc_search_cb(const struct memory_block *m, void *arg) +{ + struct memory_block *out = arg; + + if (MEMORY_BLOCK_EQUALS(*m, *out)) + return 0; /* skip the same object */ + + *out = *m; + + return 1; +} + +/* + * palloc_first -- returns the first object from the heap. + */ +uint64_t +palloc_first(struct palloc_heap *heap) +{ + struct memory_block search = MEMORY_BLOCK_NONE; + + heap_foreach_object(heap, pmalloc_search_cb, + &search, MEMORY_BLOCK_NONE); + + if (MEMORY_BLOCK_IS_NONE(search)) + return 0; + + void *uptr = search.m_ops->get_user_data(&search); + + return HEAP_PTR_TO_OFF(heap, uptr); +} + +/* + * palloc_next -- returns the next object relative to 'off'. + */ +uint64_t +palloc_next(struct palloc_heap *heap, uint64_t off) +{ + struct memory_block m = memblock_from_offset(heap, off); + struct memory_block search = m; + + heap_foreach_object(heap, pmalloc_search_cb, &search, m); + + if (MEMORY_BLOCK_IS_NONE(search) || + MEMORY_BLOCK_EQUALS(search, m)) + return 0; + + void *uptr = search.m_ops->get_user_data(&search); + + return HEAP_PTR_TO_OFF(heap, uptr); +} + +#if VG_MEMCHECK_ENABLED +/* + * palloc_vg_register_alloc -- (internal) registers allocation header + * in Valgrind + */ +static int +palloc_vg_register_alloc(const struct memory_block *m, void *arg) +{ + struct palloc_heap *heap = arg; + + m->m_ops->reinit_header(m); + + void *uptr = m->m_ops->get_user_data(m); + size_t usize = m->m_ops->get_user_size(m); + + VALGRIND_DO_MEMPOOL_ALLOC(zone_get_base_address(heap, uptr), uptr, usize); + VALGRIND_DO_MAKE_MEM_DEFINED(uptr, usize); + + return 0; +} + +/* + * palloc_heap_vg_open -- notifies Valgrind about heap layout + */ +void +palloc_heap_vg_open(struct palloc_heap *heap, int objects) +{ + heap_vg_open(heap, palloc_vg_register_alloc, heap, objects); +} + +void +palloc_heap_vg_zone_open(struct palloc_heap *heap, uint32_t zid, int objects) +{ + heap_vg_zone_open(heap, zid, palloc_vg_register_alloc, heap, objects); +} +#endif diff --git a/src/common/dav_v2/palloc.h b/src/common/dav_v2/palloc.h new file mode 100644 index 00000000000..027fb94667b --- /dev/null +++ b/src/common/dav_v2/palloc.h @@ -0,0 +1,90 @@ +/* SPDX-License-Identifier: BSD-3-Clause */ +/* Copyright 2015-2024, Intel Corporation */ + +/* + * palloc.h -- internal definitions for persistent allocator + */ + +#ifndef __DAOS_COMMON_PALLOC_H +#define __DAOS_COMMON_PALLOC_H 1 + +#include +#include + +#include "memops.h" +#include "ulog.h" +#include "valgrind_internal.h" +#include "stats.h" +#include "dav_v2.h" + +#define PALLOC_CTL_DEBUG_NO_PATTERN (-1) + +struct palloc_heap { + struct mo_ops p_ops; + struct heap_layout_info layout_info; + struct heap_rt *rt; + uint64_t size; + struct stats 
*stats; + void *base; + int alloc_pattern; +}; + +struct memory_block; +struct mbrt; + +typedef int (*palloc_constr)(void *base, void *ptr, size_t usable_size, void *arg); + +int +palloc_operation(struct palloc_heap *heap, uint64_t off, uint64_t *dest_off, size_t size, + palloc_constr constructor, void *arg, uint64_t extra_field, uint16_t object_flags, + uint16_t class_id, uint32_t zset_id, struct operation_context *ctx); + +int +palloc_reserve(struct palloc_heap *heap, size_t size, palloc_constr constructor, void *arg, + uint64_t extra_field, uint16_t object_flags, uint16_t class_id, uint32_t zset_id, + struct dav_action *act); + +int +palloc_action_isalloc(struct dav_action *act); +void +palloc_get_prange(struct dav_action *act, uint64_t *const off, uint64_t *const size, + int persist_udata); +uint64_t +palloc_get_realoffset(struct palloc_heap *heap, uint64_t off); + +void +palloc_defer_free(struct palloc_heap *heap, uint64_t off, struct dav_action *act); + +void +palloc_cancel(struct palloc_heap *heap, struct dav_action *actv, size_t actvcnt); + +void +palloc_publish(struct palloc_heap *heap, struct dav_action *actv, size_t actvcnt, + struct operation_context *ctx); + +void +palloc_set_value(struct palloc_heap *heap, struct dav_action *act, uint64_t *ptr, uint64_t value); + +uint64_t +palloc_first(struct palloc_heap *heap); +uint64_t +palloc_next(struct palloc_heap *heap, uint64_t off); + +size_t +palloc_usable_size(struct palloc_heap *heap, uint64_t off); +uint64_t +palloc_extra(struct palloc_heap *heap, uint64_t off); +uint16_t +palloc_flags(struct palloc_heap *heap, uint64_t off); + +/* foreach callback, terminates iteration if return value is non-zero */ +typedef int (*object_callback)(const struct memory_block *m, void *arg); + +#if VG_MEMCHECK_ENABLED +void +palloc_heap_vg_open(struct palloc_heap *heap, int objects); +void +palloc_heap_vg_zone_open(struct palloc_heap *heap, uint32_t zid, int objects); +#endif + +#endif /* __DAOS_COMMON_PALLOC_H */ diff --git a/src/common/dav_v2/queue.h b/src/common/dav_v2/queue.h new file mode 100644 index 00000000000..654c60cec9b --- /dev/null +++ b/src/common/dav_v2/queue.h @@ -0,0 +1,112 @@ +/* + * Source: glibc 2.24 (git://sourceware.org/glibc.git /misc/sys/queue.h) + * + * Copyright (c) 1991, 1993 + * The Regents of the University of California. All rights reserved. + * Copyright (c) 2016, Microsoft Corporation. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)queue.h 8.5 (Berkeley) 8/20/94 + */ + +#ifndef __DAOS_COMMON_QUEUE_H_ +#define __DAOS_COMMON_QUEUE_H_ + +/* + * This file defines five types of data structures: singly-linked lists, + * lists, simple queues, tail queues, and circular queues. + * + * A singly-linked list is headed by a single forward pointer. The + * elements are singly linked for minimum space and pointer manipulation + * overhead at the expense of O(n) removal for arbitrary elements. New + * elements can be added to the list after an existing element or at the + * head of the list. Elements being removed from the head of the list + * should use the explicit macro for this purpose for optimum + * efficiency. A singly-linked list may only be traversed in the forward + * direction. Singly-linked lists are ideal for applications with large + * datasets and few or no removals or for implementing a LIFO queue. + * + * For details on the use of these macros, see the queue(3) manual page. + */ + +/* + * Singly-linked List definitions. + */ +#define DAV_SLIST_HEAD(name, type) \ +struct name { \ + struct type *slh_first; /* first element */ \ +} + +#define DAV_SLIST_HEAD_INITIALIZER(head) \ + { NULL } + +#define DAV_SLIST_ENTRY(type) \ +struct { \ + struct type *sle_next; /* next element */ \ +} + +/* + * Singly-linked List functions. + */ +#define DAV_SLIST_INIT(head) ((head)->slh_first = NULL) + +#define DAV_SLIST_INSERT_AFTER(slistelm, elm, field) do { \ + (elm)->field.sle_next = (slistelm)->field.sle_next; \ + (slistelm)->field.sle_next = (elm); \ +} while (/*CONSTCOND*/0) + +#define DAV_SLIST_INSERT_HEAD(head, elm, field) do { \ + (elm)->field.sle_next = (head)->slh_first; \ + (head)->slh_first = (elm); \ +} while (/*CONSTCOND*/0) + +#define DAV_SLIST_REMOVE_HEAD(head, field) \ + ((head)->slh_first = (head)->slh_first->field.sle_next) + +#define DAV_SLIST_REMOVE(head, elm, type, field) do { \ + if ((head)->slh_first == (elm)) { \ + DAV_SLIST_REMOVE_HEAD((head), field); \ + } \ + else { \ + struct type *curelm = (head)->slh_first; \ + while (curelm->field.sle_next != (elm)) \ + curelm = curelm->field.sle_next; \ + curelm->field.sle_next = \ + curelm->field.sle_next->field.sle_next; \ + } \ +} while (/*CONSTCOND*/0) + +#define DAV_SLIST_FOREACH(var, head, field) \ + for ((var) = (head)->slh_first; (var); (var) = (var)->field.sle_next) + +/* + * Singly-linked List access methods. 
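+ *
+ * Minimal usage sketch (illustrative only; 'struct foo', its 'link' entry
+ * member, the 'head' variable and process() are placeholders, not part of
+ * this header):
+ *
+ *	struct foo { int value; DAV_SLIST_ENTRY(foo) link; } a;
+ *	DAV_SLIST_HEAD(foo_list, foo) head = DAV_SLIST_HEAD_INITIALIZER(head);
+ *	struct foo *it;
+ *
+ *	DAV_SLIST_INSERT_HEAD(&head, &a, link);
+ *	DAV_SLIST_FOREACH(it, &head, link)
+ *		process(it);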
+ */ +#define DAV_SLIST_EMPTY(head) ((head)->slh_first == NULL) +#define DAV_SLIST_FIRST(head) ((head)->slh_first) +#define DAV_SLIST_NEXT(elm, field) ((elm)->field.sle_next) + +#endif /* __DAOS_COMMON_QUEUE_H_ */ diff --git a/src/common/dav_v2/ravl.c b/src/common/dav_v2/ravl.c new file mode 100644 index 00000000000..9a9639b367a --- /dev/null +++ b/src/common/dav_v2/ravl.c @@ -0,0 +1,613 @@ +/* SPDX-License-Identifier: BSD-3-Clause */ +/* Copyright 2018-2023, Intel Corporation */ + +/* + * ravl.c -- implementation of a RAVL tree + * https://sidsen.azurewebsites.net//papers/ravl-trees-journal.pdf + */ + +#include +#include +#include + +#include "out.h" +#include "ravl.h" +#include "util.h" + +#define RAVL_DEFAULT_DATA_SIZE (sizeof(void *)) + +enum ravl_slot_type { + RAVL_LEFT, + RAVL_RIGHT, + + MAX_SLOTS, + + RAVL_ROOT +}; + +struct ravl_node { + struct ravl_node *parent; + struct ravl_node *slots[MAX_SLOTS]; + int32_t rank; /* cannot be greater than height of the subtree */ + int32_t pointer_based; + char data[]; +}; + +struct ravl { + struct ravl_node *root; + ravl_compare *compare; + size_t data_size; +}; + +/* + * ravl_new -- creates a new ravl tree instance + */ +struct ravl * +ravl_new_sized(ravl_compare *compare, size_t data_size) +{ + struct ravl *r; + + D_ALLOC_PTR_NZ(r); + if (r == NULL) { + D_CRIT("Malloc!\n"); + return r; + } + + r->compare = compare; + r->root = NULL; + r->data_size = data_size; + + return r; +} + +/* + * ravl_new -- creates a new tree that stores data pointers + */ +struct ravl * +ravl_new(ravl_compare *compare) +{ + return ravl_new_sized(compare, RAVL_DEFAULT_DATA_SIZE); +} + +/* + * ravl_clear_node -- (internal) recursively clears the given subtree, + * calls callback in an in-order fashion. Optionally frees the given node. 
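+ * Since the traversal is in-order, ravl_foreach() visits entries in
+ * ascending order with respect to the tree's compare function.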
+ */ +static void +ravl_foreach_node(struct ravl_node *n, ravl_cb cb, void *arg, int free_node) +{ + if (n == NULL) + return; + + ravl_foreach_node(n->slots[RAVL_LEFT], cb, arg, free_node); + if (cb) + cb((void *)n->data, arg); + ravl_foreach_node(n->slots[RAVL_RIGHT], cb, arg, free_node); + + if (free_node) + D_FREE(n); +} + +/* + * ravl_clear -- clears the entire tree, starting from the root + */ +void +ravl_clear(struct ravl *ravl) +{ + ravl_foreach_node(ravl->root, NULL, NULL, 1); + ravl->root = NULL; +} + +/* + * ravl_delete_cb -- clears and deletes the given ravl instance, calls callback + */ +void +ravl_delete_cb(struct ravl *ravl, ravl_cb cb, void *arg) +{ + ravl_foreach_node(ravl->root, cb, arg, 1); + D_FREE(ravl); +} + +/* + * ravl_delete -- clears and deletes the given ravl instance + */ +void +ravl_delete(struct ravl *ravl) +{ + ravl_delete_cb(ravl, NULL, NULL); +} + +/* + * ravl_foreach -- traverses the entire tree, calling callback for every node + */ +void +ravl_foreach(struct ravl *ravl, ravl_cb cb, void *arg) +{ + ravl_foreach_node(ravl->root, cb, arg, 0); +} + +/* + * ravl_empty -- checks whether the given tree is empty + */ +int +ravl_empty(struct ravl *ravl) +{ + return ravl->root == NULL; +} + +/* + * ravl_node_insert_constructor -- node data constructor for ravl_insert + */ +static void +ravl_node_insert_constructor(void *data, size_t data_size, const void *arg) +{ + /* suppress unused-parameter errors */ + SUPPRESS_UNUSED(data_size); + + /* copy only the 'arg' pointer */ + memcpy(data, &arg, sizeof(arg)); +} + +/* + * ravl_node_copy_constructor -- node data constructor for ravl_emplace_copy + */ +static void +ravl_node_copy_constructor(void *data, size_t data_size, const void *arg) +{ + memcpy(data, arg, data_size); +} + +/* + * ravl_new_node -- (internal) allocates and initializes a new node + */ +static struct ravl_node * +ravl_new_node(struct ravl *ravl, ravl_constr constr, const void *arg) +{ + struct ravl_node *n; + + D_ALLOC_NZ(n, (sizeof(*n) + ravl->data_size)); + if (n == NULL) { + D_CRIT("Malloc!\n"); + return n; + } + + n->parent = NULL; + n->slots[RAVL_LEFT] = NULL; + n->slots[RAVL_RIGHT] = NULL; + n->rank = 0; + n->pointer_based = constr == ravl_node_insert_constructor; + constr(n->data, ravl->data_size, arg); + + return n; +} + +/* + * ravl_slot_opposite -- (internal) returns the opposite slot type, cannot be + * called for root type + */ +static enum ravl_slot_type +ravl_slot_opposite(enum ravl_slot_type t) +{ + ASSERTne(t, RAVL_ROOT); + + return t == RAVL_LEFT ? RAVL_RIGHT : RAVL_LEFT; +} + +/* + * ravl_node_slot_type -- (internal) returns the type of the given node: + * left child, right child or root + */ +static enum ravl_slot_type +ravl_node_slot_type(struct ravl_node *n) +{ + if (n->parent == NULL) + return RAVL_ROOT; + + return n->parent->slots[RAVL_LEFT] == n ? RAVL_LEFT : RAVL_RIGHT; +} + +/* + * ravl_node_sibling -- (internal) returns the sibling of the given node, + * NULL if the node is root (has no parent) + */ +static struct ravl_node * +ravl_node_sibling(struct ravl_node *n) +{ + enum ravl_slot_type t = ravl_node_slot_type(n); + + if (t == RAVL_ROOT) + return NULL; + + return n->parent->slots[t == RAVL_LEFT ? RAVL_RIGHT : RAVL_LEFT]; +} + +/* + * ravl_node_ref -- (internal) returns the pointer to the memory location in + * which the given node resides + */ +static struct ravl_node ** +ravl_node_ref(struct ravl *ravl, struct ravl_node *n) +{ + enum ravl_slot_type t = ravl_node_slot_type(n); + + return t == RAVL_ROOT ? 
&ravl->root : &n->parent->slots[t]; +} + +/* + * ravl_rotate -- (internal) performs a rotation around a given node + * + * The node n swaps place with its parent. If n is right child, parent becomes + * the left child of n, otherwise parent becomes right child of n. + */ +static void +ravl_rotate(struct ravl *ravl, struct ravl_node *n) +{ + ASSERTne(n->parent, NULL); + struct ravl_node *p = n->parent; + struct ravl_node **pref = ravl_node_ref(ravl, p); + + enum ravl_slot_type t = ravl_node_slot_type(n); + enum ravl_slot_type t_opposite = ravl_slot_opposite(t); + + n->parent = p->parent; + p->parent = n; + *pref = n; + + p->slots[t] = n->slots[t_opposite]; + if (p->slots[t] != NULL) + p->slots[t]->parent = p; + n->slots[t_opposite] = p; +} + +/* + * ravl_node_rank -- (internal) returns the rank of the node + * + * For the purpose of balancing, NULL nodes have rank -1. + */ +static int +ravl_node_rank(struct ravl_node *n) +{ + return n == NULL ? -1 : n->rank; +} + +/* + * ravl_node_rank_difference_parent -- (internal) returns the rank different + * between parent node p and its child n + * + * Every rank difference must be positive. + * + * Either of these can be NULL. + */ +static int +ravl_node_rank_difference_parent(struct ravl_node *p, struct ravl_node *n) +{ + return ravl_node_rank(p) - ravl_node_rank(n); +} + +/* + * ravl_node_rank_differenced - (internal) returns the rank difference between + * parent and its child + * + * Can be used to check if a given node is an i-child. + */ +static int +ravl_node_rank_difference(struct ravl_node *n) +{ + return ravl_node_rank_difference_parent(n->parent, n); +} + +/* + * ravl_node_is_i_j -- (internal) checks if a given node is strictly i,j-node + */ +static int +ravl_node_is_i_j(struct ravl_node *n, int i, int j) +{ + return (ravl_node_rank_difference_parent(n, n->slots[RAVL_LEFT]) == i && + ravl_node_rank_difference_parent(n, n->slots[RAVL_RIGHT]) == j); +} + +/* + * ravl_node_is -- (internal) checks if a given node is i,j-node or j,i-node + */ +static int +ravl_node_is(struct ravl_node *n, int i, int j) +{ + return ravl_node_is_i_j(n, i, j) || ravl_node_is_i_j(n, j, i); +} + +/* + * ravl_node_promote -- promotes a given node by increasing its rank + */ +static void +ravl_node_promote(struct ravl_node *n) +{ + n->rank += 1; +} + +/* + * ravl_node_promote -- demotes a given node by increasing its rank + */ +static void +ravl_node_demote(struct ravl_node *n) +{ + ASSERT(n->rank > 0); + n->rank -= 1; +} + +/* + * ravl_balance -- balances the tree after insert + * + * This function must restore the invariant that every rank + * difference is positive. + */ +static void +ravl_balance(struct ravl *ravl, struct ravl_node *n) +{ + /* walk up the tree, promoting nodes */ + while (n->parent && ravl_node_is(n->parent, 0, 1)) { + ravl_node_promote(n->parent); + n = n->parent; + } + + /* + * Either the rank rule holds or n is a 0-child whose sibling is an + * i-child with i > 1. 
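+	 * In the latter case the code below restores the rank rule with either
+	 * a single rotation around n (plus a demotion of the old parent) or a
+	 * double rotation around n's child z, depending on z's rank difference.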
+ */ + struct ravl_node *s = ravl_node_sibling(n); + + if (!(ravl_node_rank_difference(n) == 0 && + ravl_node_rank_difference_parent(n->parent, s) > 1)) + return; + + struct ravl_node *y = n->parent; + /* if n is a left child, let z be n's right child and vice versa */ + enum ravl_slot_type t = ravl_slot_opposite(ravl_node_slot_type(n)); + struct ravl_node *z = n->slots[t]; + + if (z == NULL || ravl_node_rank_difference(z) == 2) { + ravl_rotate(ravl, n); + ravl_node_demote(y); + } else if (ravl_node_rank_difference(z) == 1) { + ravl_rotate(ravl, z); + ravl_rotate(ravl, z); + ravl_node_promote(z); + ravl_node_demote(n); + ravl_node_demote(y); + } +} + +/* + * ravl_insert -- insert data into the tree + */ +int +ravl_insert(struct ravl *ravl, const void *data) +{ + return ravl_emplace(ravl, ravl_node_insert_constructor, data); +} + +/* + * ravl_insert -- copy construct data inside of a new tree node + */ +int +ravl_emplace_copy(struct ravl *ravl, const void *data) +{ + return ravl_emplace(ravl, ravl_node_copy_constructor, data); +} + +/* + * ravl_emplace -- construct data inside of a new tree node + */ +int +ravl_emplace(struct ravl *ravl, ravl_constr constr, const void *arg) +{ + struct ravl_node *n = ravl_new_node(ravl, constr, arg); + + if (n == NULL) + return -1; + + /* walk down the tree and insert the new node into a missing slot */ + struct ravl_node **dstp = &ravl->root; + struct ravl_node *dst = NULL; + + while (*dstp != NULL) { + dst = (*dstp); + int cmp_result = ravl->compare(ravl_data(n), ravl_data(dst)); + + if (cmp_result == 0) + goto error_duplicate; + + dstp = &dst->slots[cmp_result > 0]; + } + n->parent = dst; + *dstp = n; + + ravl_balance(ravl, n); + + return 0; + +error_duplicate: + errno = EEXIST; + D_FREE(n); + return -1; +} + +/* + * ravl_node_type_most -- (internal) returns left-most or right-most node in + * the subtree + */ +static struct ravl_node * +ravl_node_type_most(struct ravl_node *n, enum ravl_slot_type t) +{ + while (n->slots[t] != NULL) + n = n->slots[t]; + + return n; +} + +/* + * ravl_node_cessor -- (internal) returns the successor or predecessor of the + * node + */ +static struct ravl_node * +ravl_node_cessor(struct ravl_node *n, enum ravl_slot_type t) +{ + /* + * If t child is present, we are looking for t-opposite-most node + * in t child subtree + */ + if (n->slots[t]) + return ravl_node_type_most(n->slots[t], ravl_slot_opposite(t)); + + /* otherwise get the first parent on the t path */ + while (n->parent != NULL && n == n->parent->slots[t]) + n = n->parent; + + return n->parent; +} + +/* + * ravl_node_successor -- (internal) returns node's successor + * + * It's the first node larger than n. + */ +static struct ravl_node * +ravl_node_successor(struct ravl_node *n) +{ + return ravl_node_cessor(n, RAVL_RIGHT); +} + +/* + * ravl_node_successor -- (internal) returns node's successor + * + * It's the first node smaller than n. + */ +static struct ravl_node * +ravl_node_predecessor(struct ravl_node *n) +{ + return ravl_node_cessor(n, RAVL_LEFT); +} + +/* + * ravl_predicate_holds -- (internal) verifies the given predicate for + * the current node in the search path + * + * If the predicate holds for the given node or a node that can be directly + * derived from it, returns 1. Otherwise returns 0. 
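+ *
+ * For example, with RAVL_PREDICATE_GREATER an exact match resolves to the
+ * node's successor, while RAVL_PREDICATE_GREATER_EQUAL returns the matching
+ * node itself; a strictly greater node is only remembered as the best
+ * candidate so far and the search continues toward the key.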
+ */ +static int +ravl_predicate_holds(int result, struct ravl_node **ret, + struct ravl_node *n, enum ravl_predicate flags) +{ + if (flags & RAVL_PREDICATE_EQUAL) { + if (result == 0) { + *ret = n; + return 1; + } + } + if (flags & RAVL_PREDICATE_GREATER) { + if (result < 0) { /* data < n->data */ + *ret = n; + return 0; + } else if (result == 0) { + *ret = ravl_node_successor(n); + return 1; + } + } + if (flags & RAVL_PREDICATE_LESS) { + if (result > 0) { /* data > n->data */ + *ret = n; + return 0; + } else if (result == 0) { + *ret = ravl_node_predecessor(n); + return 1; + } + } + + return 0; +} + +/* + * ravl_find -- searches for the node in the tree + */ +struct ravl_node * +ravl_find(struct ravl *ravl, const void *data, enum ravl_predicate flags) +{ + struct ravl_node *r = NULL; + struct ravl_node *n = ravl->root; + + while (n) { + int result = ravl->compare(data, ravl_data(n)); + + if (ravl_predicate_holds(result, &r, n, flags)) + return r; + + n = n->slots[result > 0]; + } + + return r; +} + +/* + * ravl_remove -- removes the given node from the tree + */ +void +ravl_remove(struct ravl *ravl, struct ravl_node *n) +{ + if (n->slots[RAVL_LEFT] != NULL && n->slots[RAVL_RIGHT] != NULL) { + /* if both children are present, remove the successor instead */ + struct ravl_node *s = ravl_node_successor(n); + + memcpy(n->data, s->data, ravl->data_size); + ravl_remove(ravl, s); + } else { + /* swap n with the child that may exist */ + struct ravl_node *r = n->slots[RAVL_LEFT] ? + n->slots[RAVL_LEFT] : n->slots[RAVL_RIGHT]; + + if (r != NULL) + r->parent = n->parent; + + *ravl_node_ref(ravl, n) = r; + D_FREE(n); + } +} + +/* + * ravl_data -- returns the data contained within the node + */ +void * +ravl_data(struct ravl_node *node) +{ + if (node->pointer_based) { + void *data; + + memcpy(&data, node->data, sizeof(void *)); + return data; + } else { + return (void *)node->data; + } +} + +/* + * ravl_first -- returns first (left-most) node in the tree + */ +struct ravl_node * +ravl_first(struct ravl *ravl) +{ + if (ravl->root) + return ravl_node_type_most(ravl->root, RAVL_LEFT); + + return NULL; +} + +/* + * ravl_last -- returns last (right-most) node in the tree + */ +struct ravl_node * +ravl_last(struct ravl *ravl) +{ + if (ravl->root) + return ravl_node_type_most(ravl->root, RAVL_RIGHT); + + return NULL; +} diff --git a/src/common/dav_v2/ravl.h b/src/common/dav_v2/ravl.h new file mode 100644 index 00000000000..d1d69ec91b6 --- /dev/null +++ b/src/common/dav_v2/ravl.h @@ -0,0 +1,48 @@ +/* SPDX-License-Identifier: BSD-3-Clause */ +/* Copyright 2018-2023, Intel Corporation */ + +/* + * ravl.h -- internal definitions for ravl tree + */ + +#ifndef __DAOS_COMMON_RAVL_H +#define __DAOS_COMMON_RAVL_H 1 + +#include + +struct ravl; +struct ravl_node; + +enum ravl_predicate { + RAVL_PREDICATE_EQUAL = 1 << 0, + RAVL_PREDICATE_GREATER = 1 << 1, + RAVL_PREDICATE_LESS = 1 << 2, + RAVL_PREDICATE_LESS_EQUAL = + RAVL_PREDICATE_EQUAL | RAVL_PREDICATE_LESS, + RAVL_PREDICATE_GREATER_EQUAL = + RAVL_PREDICATE_EQUAL | RAVL_PREDICATE_GREATER, +}; + +typedef int ravl_compare(const void *lhs, const void *rhs); +typedef void ravl_cb(void *data, void *arg); +typedef void ravl_constr(void *data, size_t data_size, const void *arg); + +struct ravl *ravl_new(ravl_compare *compare); +struct ravl *ravl_new_sized(ravl_compare *compare, size_t data_size); +void ravl_delete(struct ravl *ravl); +void ravl_delete_cb(struct ravl *ravl, ravl_cb cb, void *arg); +void ravl_foreach(struct ravl *ravl, ravl_cb cb, void *arg); +int 
ravl_empty(struct ravl *ravl); +void ravl_clear(struct ravl *ravl); +int ravl_insert(struct ravl *ravl, const void *data); +int ravl_emplace(struct ravl *ravl, ravl_constr constr, const void *arg); +int ravl_emplace_copy(struct ravl *ravl, const void *data); + +struct ravl_node *ravl_find(struct ravl *ravl, const void *data, + enum ravl_predicate predicate_flags); +struct ravl_node *ravl_first(struct ravl *ravl); +struct ravl_node *ravl_last(struct ravl *ravl); +void *ravl_data(struct ravl_node *node); +void ravl_remove(struct ravl *ravl, struct ravl_node *node); + +#endif /* __DAOS_COMMON_RAVL_H */ diff --git a/src/common/dav_v2/ravl_interval.c b/src/common/dav_v2/ravl_interval.c new file mode 100644 index 00000000000..e493b031bba --- /dev/null +++ b/src/common/dav_v2/ravl_interval.c @@ -0,0 +1,344 @@ +/* SPDX-License-Identifier: BSD-3-Clause */ +/* Copyright 2020-2023, Intel Corporation */ + +/* + * ravl_interval.c -- ravl_interval implementation + */ + +#include + +#include "ravl_interval.h" +#include "sys_util.h" +#include "ravl.h" + +/* + * ravl_interval - structure representing two points + * on the number line + */ +struct ravl_interval { + struct ravl *tree; + ravl_interval_min *get_min; + ravl_interval_max *get_max; +}; + +/* + * ravl_interval_node - structure holding min, max functions and address + */ +struct ravl_interval_node { + void *addr; + ravl_interval_min *get_min; + ravl_interval_max *get_max; + bool overlap; +}; + +/* + * ravl_interval_compare -- compare intervals by its boundaries + */ +static int +ravl_interval_compare(const void *lhs, const void *rhs) +{ + const struct ravl_interval_node *left = lhs; + const struct ravl_interval_node *right = rhs; + + /* + * when searching, comparing should return the + * earliest overlapped record + */ + if (left->overlap) { + if (left->get_min(left->addr) >= right->get_max(right->addr)) + return 1; + if (left->get_min(left->addr) == right->get_min(right->addr)) + return 0; + return -1; + } + + /* when inserting, comparing shouldn't allow overlapping intervals */ + if (left->get_min(left->addr) >= right->get_max(right->addr)) + return 1; + if (left->get_max(left->addr) <= right->get_min(right->addr)) + return -1; + return 0; +} + +/* + * ravl_interval_delete - finalize the ravl interval module + */ +void +ravl_interval_delete(struct ravl_interval *ri) +{ + ravl_delete(ri->tree); + ri->tree = NULL; + D_FREE(ri); +} + +/* + * ravl_interval_delete_cb - finalize the ravl interval module with entries + * and execute provided callback function for each entry. 
+ */ +void +ravl_interval_delete_cb(struct ravl_interval *ri, ravl_cb cb, void *arg) +{ + ravl_delete_cb(ri->tree, cb, arg); + ri->tree = NULL; + D_FREE(ri); +} + +/* + * ravl_interval_new -- initialize the ravl interval module + */ +struct ravl_interval * +ravl_interval_new(ravl_interval_min *get_min, ravl_interval_max *get_max) +{ + struct ravl_interval *interval; + + D_ALLOC_PTR_NZ(interval); + if (!interval) + return NULL; + + interval->tree = ravl_new_sized(ravl_interval_compare, + sizeof(struct ravl_interval_node)); + if (!(interval->tree)) + goto free_alloc; + + interval->get_min = get_min; + interval->get_max = get_max; + + return interval; + +free_alloc: + D_FREE(interval); + return NULL; +} + +/* + * ravl_interval_insert -- insert interval entry into the tree + */ +int +ravl_interval_insert(struct ravl_interval *ri, void *addr) +{ + struct ravl_interval_node rin; + + rin.addr = addr; + rin.get_min = ri->get_min; + rin.get_max = ri->get_max; + rin.overlap = false; + + int ret = ravl_emplace_copy(ri->tree, &rin); + + if (ret && errno) + return -errno; + + return ret; +} + +/* + * ravl_interval_remove -- remove interval entry from the tree + */ +int +ravl_interval_remove(struct ravl_interval *ri, struct ravl_interval_node *rin) +{ + struct ravl_node *node = ravl_find(ri->tree, rin, + RAVL_PREDICATE_EQUAL); + if (!node) + return -ENOENT; + + ravl_remove(ri->tree, node); + + return 0; +} + +/* + * ravl_interval_find_prior -- find overlapping interval starting prior to + * the current one + */ +static struct ravl_interval_node * +ravl_interval_find_prior(struct ravl *tree, struct ravl_interval_node *rin) +{ + struct ravl_node *node; + struct ravl_interval_node *cur; + + node = ravl_find(tree, rin, RAVL_PREDICATE_LESS); + if (!node) + return NULL; + + cur = ravl_data(node); + /* + * If the end of the found interval is below the searched boundary, then + * those intervals are not overlapping. 
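+	 * (Endpoints that merely touch do not count as overlap - the
+	 * comparisons treat intervals as half-open [min, max) ranges.)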
+ */ + if (cur->get_max(cur->addr) <= rin->get_min(rin->addr)) + return NULL; + + return cur; +} + +/* + * ravl_interval_find_eq -- find overlapping interval starting neither prior or + * lather than the current one + */ +static struct ravl_interval_node * +ravl_interval_find_eq(struct ravl *tree, struct ravl_interval_node *rin) +{ + struct ravl_node *node; + + node = ravl_find(tree, rin, RAVL_PREDICATE_EQUAL); + if (!node) + return NULL; + + return ravl_data(node); +} + +/* + * ravl_interval_find_later -- find overlapping interval starting later than + * the current one + */ +static struct ravl_interval_node * +ravl_interval_find_later(struct ravl *tree, struct ravl_interval_node *rin) +{ + struct ravl_node *node; + struct ravl_interval_node *cur; + + node = ravl_find(tree, rin, RAVL_PREDICATE_GREATER); + if (!node) + return NULL; + + cur = ravl_data(node); + + /* + * If the beginning of the found interval is above the end of + * the searched range, then those interval are not overlapping + */ + if (cur->get_min(cur->addr) >= rin->get_max(rin->addr)) + return NULL; + + return cur; +} + +/* + * ravl_interval_find_equal -- find the interval with exact (min, max) range + */ +struct ravl_interval_node * +ravl_interval_find_equal(struct ravl_interval *ri, void *addr) +{ + struct ravl_interval_node range; + + range.addr = addr; + range.get_min = ri->get_min; + range.get_max = ri->get_max; + range.overlap = true; + + struct ravl_node *node; + + node = ravl_find(ri->tree, &range, RAVL_PREDICATE_EQUAL); + if (!node) + return NULL; + + return ravl_data(node); +} + +/* + * ravl_interval_find -- find the earliest interval within (min, max) range + */ +struct ravl_interval_node * +ravl_interval_find(struct ravl_interval *ri, void *addr) +{ + struct ravl_interval_node range; + + range.addr = addr; + range.get_min = ri->get_min; + range.get_max = ri->get_max; + range.overlap = true; + + struct ravl_interval_node *cur; + + cur = ravl_interval_find_prior(ri->tree, &range); + if (!cur) + cur = ravl_interval_find_eq(ri->tree, &range); + if (!cur) + cur = ravl_interval_find_later(ri->tree, &range); + + return cur; +} + +/* + * ravl_interval_data -- returns the data contained within an interval node + */ +void * +ravl_interval_data(struct ravl_interval_node *rin) +{ + return (void *)rin->addr; +} + +/* + * ravl_interval_find_first -- returns first interval in the tree + */ +struct ravl_interval_node * +ravl_interval_find_first(struct ravl_interval *ri) +{ + struct ravl_node *first; + + first = ravl_first(ri->tree); + if (first) + return ravl_data(first); + + return NULL; +} + +/* + * ravl_interval_find_last -- returns last interval in the tree + */ +struct ravl_interval_node * +ravl_interval_find_last(struct ravl_interval *ri) +{ + struct ravl_node *last; + + last = ravl_last(ri->tree); + if (last) + return ravl_data(last); + + return NULL; +} + +/* + * ravl_interval_find_next -- returns interval succeeding the one provided + */ +struct ravl_interval_node * +ravl_interval_find_next(struct ravl_interval *ri, void *addr) +{ + struct ravl_interval_node range; + + range.addr = addr; + range.get_min = ri->get_min; + range.get_max = ri->get_max; + range.overlap = true; + + struct ravl_node *next = NULL; + + next = ravl_find(ri->tree, &range, RAVL_PREDICATE_GREATER); + if (next) + return ravl_data(next); + + return NULL; +} + +/* + * ravl_interval_find_prev -- returns interval preceding the one provided + */ +struct ravl_interval_node * +ravl_interval_find_prev(struct ravl_interval *ri, void *addr) +{ + struct 
ravl_interval_node range; + + range.addr = addr; + range.get_min = ri->get_min; + range.get_max = ri->get_max; + range.overlap = true; + + struct ravl_node *prev = NULL; + + prev = ravl_find(ri->tree, &range, RAVL_PREDICATE_LESS); + if (prev) + return ravl_data(prev); + + return NULL; +} diff --git a/src/common/dav_v2/ravl_interval.h b/src/common/dav_v2/ravl_interval.h new file mode 100644 index 00000000000..6b106fc4bfe --- /dev/null +++ b/src/common/dav_v2/ravl_interval.h @@ -0,0 +1,37 @@ +/* SPDX-License-Identifier: BSD-3-Clause */ +/* Copyright 2020-2023, Intel Corporation */ + +/* + * ravl_interval.h -- internal definitions for ravl_interval + */ + +#ifndef RAVL_INTERVAL_H +#define RAVL_INTERVAL_H + +#include "ravl.h" + +struct ravl_interval; +struct ravl_interval_node; + +typedef size_t ravl_interval_min(void *addr); +typedef size_t ravl_interval_max(void *addr); + +struct ravl_interval *ravl_interval_new(ravl_interval_min *min, + ravl_interval_min *max); +void ravl_interval_delete(struct ravl_interval *ri); +void ravl_interval_delete_cb(struct ravl_interval *ri, ravl_cb cb, void *arg); +int ravl_interval_insert(struct ravl_interval *ri, void *addr); +int ravl_interval_remove(struct ravl_interval *ri, + struct ravl_interval_node *rin); +struct ravl_interval_node *ravl_interval_find_equal(struct ravl_interval *ri, + void *addr); +struct ravl_interval_node *ravl_interval_find(struct ravl_interval *ri, + void *addr); +struct ravl_interval_node *ravl_interval_find_first(struct ravl_interval *ri); +struct ravl_interval_node *ravl_interval_find_last(struct ravl_interval *ri); +struct ravl_interval_node *ravl_interval_find_next(struct ravl_interval *ri, + void *addr); +struct ravl_interval_node *ravl_interval_find_prev(struct ravl_interval *ri, + void *addr); +void *ravl_interval_data(struct ravl_interval_node *rin); +#endif diff --git a/src/common/dav_v2/recycler.c b/src/common/dav_v2/recycler.c new file mode 100644 index 00000000000..5680735b341 --- /dev/null +++ b/src/common/dav_v2/recycler.c @@ -0,0 +1,323 @@ +/* SPDX-License-Identifier: BSD-3-Clause */ +/* Copyright 2016-2024, Intel Corporation */ + +/* + * recycler.c -- implementation of run recycler + */ + +#include "heap.h" +#include "recycler.h" +#include "vec.h" +#include "out.h" +#include "util.h" +#include "sys_util.h" +#include "ravl.h" +#include "valgrind_internal.h" + +#define THRESHOLD_MUL 4 + +/* + * recycler_element_cmp -- compares two recycler elements + */ +static int +recycler_element_cmp(const void *lhs, const void *rhs) +{ + const struct recycler_element *l = lhs; + const struct recycler_element *r = rhs; + + int64_t diff = (int64_t)l->max_free_block - (int64_t)r->max_free_block; + + if (diff != 0) + return diff > 0 ? 1 : -1; + + diff = (int64_t)l->free_space - (int64_t)r->free_space; + if (diff != 0) + return diff > 0 ? 1 : -1; + + diff = (int64_t)l->zone_id - (int64_t)r->zone_id; + if (diff != 0) + return diff > 0 ? 1 : -1; + + diff = (int64_t)l->chunk_id - (int64_t)r->chunk_id; + if (diff != 0) + return diff > 0 ? 1 : -1; + + return 0; +} + +struct recycler { + struct ravl *runs; + struct palloc_heap *heap; + struct mbrt *mb; + + /* + * How many unaccounted units there *might* be inside of the memory + * blocks stored in the recycler. + * The value is not meant to be accurate, but rather a rough measure on + * how often should the memory block scores be recalculated. + * + * Per-chunk unaccounted units are shared for all zones, which might + * lead to some unnecessary recalculations. 
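+ *
+ * As a rough guide, recycler_recalc() only rescans once the total
+ * crosses THRESHOLD_MUL * nallocs unaccounted units, unless it is
+ * forced.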
+ */ + size_t unaccounted_units[MAX_CHUNK]; + size_t unaccounted_total; + size_t nallocs; + + VEC(, struct recycler_element) recalc; + + pthread_mutex_t lock; +}; + +/* + * recycler_new -- creates new recycler instance + */ +struct recycler * +recycler_new(struct palloc_heap *heap, size_t nallocs, struct mbrt *mb) +{ + struct recycler *r; + + D_ALLOC_PTR_NZ(r); + if (r == NULL) + goto error_alloc_recycler; + + r->runs = ravl_new_sized(recycler_element_cmp, + sizeof(struct recycler_element)); + if (r->runs == NULL) + goto error_alloc_tree; + + r->heap = heap; + r->nallocs = nallocs; + r->mb = mb; + r->unaccounted_total = 0; + memset(&r->unaccounted_units, 0, sizeof(r->unaccounted_units)); + + VEC_INIT(&r->recalc); + + util_mutex_init(&r->lock); + + return r; + +error_alloc_tree: + D_FREE(r); +error_alloc_recycler: + return NULL; +} + +/* + * recycler_delete -- deletes recycler instance + */ +void +recycler_delete(struct recycler *r) +{ + VEC_DELETE(&r->recalc); + + util_mutex_destroy(&r->lock); + ravl_delete(r->runs); + D_FREE(r); +} + +/* + * recycler_element_new -- calculates how many free bytes does a run have and + * what's the largest request that the run can handle, returns that as + * recycler element struct + */ +struct recycler_element +recycler_element_new(struct palloc_heap *heap, const struct memory_block *m) +{ + /* suppress unused-parameter errors */ + SUPPRESS_UNUSED(heap); + + /* + * Counting of the clear bits can race with a concurrent deallocation + * that operates on the same run. This race is benign and has absolutely + * no effect on the correctness of this algorithm. Ideally, we would + * avoid grabbing the lock, but helgrind gets very confused if we + * try to disable reporting for this function. + */ + pthread_mutex_t *lock = m->m_ops->get_lock(m); + + util_mutex_lock(lock); + + struct recycler_element e = { + .free_space = 0, + .max_free_block = 0, + .chunk_id = m->chunk_id, + .zone_id = m->zone_id, + }; + m->m_ops->calc_free(m, &e.free_space, &e.max_free_block); + + util_mutex_unlock(lock); + + return e; +} + +/* + * recycler_put -- inserts new run into the recycler + */ +int +recycler_put(struct recycler *r, struct recycler_element element) +{ + int ret = 0; + + util_mutex_lock(&r->lock); + + ret = ravl_emplace_copy(r->runs, &element); + + util_mutex_unlock(&r->lock); + + return ret; +} + +/* + * recycler_get -- retrieves a chunk from the recycler + */ +int +recycler_get(struct recycler *r, struct memory_block *m) +{ + int ret = 0; + + util_mutex_lock(&r->lock); + + struct recycler_element e = { .max_free_block = m->size_idx, 0, 0, 0}; + struct ravl_node *n = ravl_find(r->runs, &e, + RAVL_PREDICATE_GREATER_EQUAL); + if (n == NULL) { + ret = ENOMEM; + goto out; + } + + struct recycler_element *ne = ravl_data(n); + + m->chunk_id = ne->chunk_id; + m->zone_id = ne->zone_id; + + ravl_remove(r->runs, n); + + struct chunk_header *hdr = heap_get_chunk_hdr(r->heap, m); + + m->size_idx = hdr->size_idx; + + memblock_rebuild_state(r->heap, m); + +out: + util_mutex_unlock(&r->lock); + + return ret; +} + +/* + * recycler_recalc -- recalculates the scores of runs in the recycler to match + * the updated persistent state + */ +struct empty_runs +recycler_recalc(struct recycler *r, int force) +{ + struct empty_runs runs; + + VEC_INIT(&runs); + + uint64_t units = r->unaccounted_total; + + uint64_t recalc_threshold = THRESHOLD_MUL * r->nallocs; + + if (!force && units < recalc_threshold) + return runs; + + if (util_mutex_trylock(&r->lock) != 0) + return runs; + + /* If the search is 
forced, recalculate everything */ + uint64_t search_limit = force ? UINT64_MAX : units; + + uint64_t found_units = 0; + struct memory_block nm = MEMORY_BLOCK_NONE; + struct ravl_node *n; + struct recycler_element next = {0, 0, 0, 0}; + enum ravl_predicate p = RAVL_PREDICATE_GREATER_EQUAL; + + do { + n = ravl_find(r->runs, &next, p); + if (n == NULL) + break; + + p = RAVL_PREDICATE_GREATER; + + struct recycler_element *ne = ravl_data(n); + + next = *ne; + + uint64_t chunk_units = r->unaccounted_units[ne->chunk_id]; + + if (!force && chunk_units == 0) + continue; + + uint32_t existing_free_space = ne->free_space; + + nm.chunk_id = ne->chunk_id; + nm.zone_id = ne->zone_id; + memblock_rebuild_state(r->heap, &nm); + + struct recycler_element e = recycler_element_new(r->heap, &nm); + + ASSERT(e.free_space >= existing_free_space); + uint64_t free_space_diff = e.free_space - existing_free_space; + + found_units += free_space_diff; + + if (free_space_diff == 0) + continue; + + /* + * Decrease the per chunk_id counter by the number of nallocs + * found, increased by the blocks potentially freed in the + * active memory block. Cap the sub value to prevent overflow. + */ + util_fetch_and_sub64(&r->unaccounted_units[nm.chunk_id], + MIN(chunk_units, free_space_diff + r->nallocs)); + + ravl_remove(r->runs, n); + + if (e.free_space == r->nallocs) { + memblock_rebuild_state(r->heap, &nm); + if (VEC_PUSH_BACK(&runs, nm) != 0) + ASSERT(0); /* XXX: fix after refactoring */ + } else { + VEC_PUSH_BACK(&r->recalc, e); + } + } while (found_units < search_limit); + + struct recycler_element *e; + + VEC_FOREACH_BY_PTR(e, &r->recalc) { + ravl_emplace_copy(r->runs, e); + } + + VEC_CLEAR(&r->recalc); + + util_mutex_unlock(&r->lock); + + util_fetch_and_sub64(&r->unaccounted_total, units); + + return runs; +} + +/* + * recycler_inc_unaccounted -- increases the number of unaccounted units in the + * recycler + */ +void +recycler_inc_unaccounted(struct recycler *r, const struct memory_block *m) +{ + util_fetch_and_add64(&r->unaccounted_total, m->size_idx); + util_fetch_and_add64(&r->unaccounted_units[m->chunk_id], + m->size_idx); +} + +/* + * Return the Memory Bucket runtime associated with the recycler. + */ +struct mbrt * +recycler_get_mbrt(struct recycler *r) +{ + return r->mb; +} diff --git a/src/common/dav_v2/recycler.h b/src/common/dav_v2/recycler.h new file mode 100644 index 00000000000..769ce4a4c4a --- /dev/null +++ b/src/common/dav_v2/recycler.h @@ -0,0 +1,46 @@ +/* SPDX-License-Identifier: BSD-3-Clause */ +/* Copyright 2016-2024, Intel Corporation */ + +/* + * recycler.h -- internal definitions of run recycler + * + * This is a container that stores runs that are currently not used by any of + * the buckets. 
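+ *
+ * Rough usage sketch (illustrative only; use_run() and needed_blocks
+ * are stand-ins for caller code, and heap/memory-block setup is
+ * omitted):
+ *
+ *	struct recycler *r = recycler_new(heap, nallocs, mb);
+ *
+ *	recycler_put(r, recycler_element_new(heap, &m));
+ *	...
+ *	m.size_idx = needed_blocks;
+ *	if (recycler_get(r, &m) == 0)
+ *		use_run(&m);
+ *	recycler_delete(r);
+ *
+ * recycler_get() returns ENOMEM when no stored run is large enough.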
+ */ + +#ifndef __DAOS_COMMON_RECYCLER_H +#define __DAOS_COMMON_RECYCLER_H 1 + +#include "memblock.h" +#include "vec.h" + +struct recycler; +VEC(empty_runs, struct memory_block); + +struct recycler_element { + uint32_t max_free_block; + uint32_t free_space; + + uint32_t chunk_id; + uint32_t zone_id; +}; + +struct recycler * +recycler_new(struct palloc_heap *layout, size_t nallocs, struct mbrt *mb); +void recycler_delete(struct recycler *r); +struct recycler_element recycler_element_new(struct palloc_heap *heap, + const struct memory_block *m); + +int recycler_put(struct recycler *r, struct recycler_element element); + +int recycler_get(struct recycler *r, struct memory_block *m); + +struct empty_runs recycler_recalc(struct recycler *r, int force); + +void recycler_inc_unaccounted(struct recycler *r, + const struct memory_block *m); + +struct mbrt * +recycler_get_mbrt(struct recycler *r); + +#endif /* __DAOS_COMMON_RECYCLER_H */ diff --git a/src/common/dav_v2/stats.c b/src/common/dav_v2/stats.c new file mode 100644 index 00000000000..173b8bb1bab --- /dev/null +++ b/src/common/dav_v2/stats.c @@ -0,0 +1,83 @@ +/* SPDX-License-Identifier: BSD-3-Clause */ +/* Copyright 2017-2024, Intel Corporation */ + +/* + * stats.c -- implementation of statistics + */ + +#include + +#include "dav_internal.h" +#include "obj.h" +#include "stats.h" +#include "heap.h" + +/* + * stats_new -- allocates and initializes statistics instance + */ +struct stats * +stats_new(dav_obj_t *pop) +{ + struct stats *s; + + D_ALLOC_PTR_NZ(s); + if (s == NULL) { + D_CRIT("Malloc\n"); + return NULL; + } + + D_ALLOC_PTR(s->transient); + if (s->transient == NULL) + goto error_transient_alloc; + + return s; + +error_transient_alloc: + D_FREE(s); + return NULL; +} + +/* + * stats_delete -- deletes statistics instance + */ +void +stats_delete(dav_obj_t *pop, struct stats *s) +{ + D_FREE(s->transient); + D_FREE(s); +} + +/* + * stats_persist -- save the persistent statistics to wal + */ +void +stats_persist(dav_obj_t *pop, struct stats *s) +{ + if (s->transient->heap_prev_pval != + s->persistent->heap_curr_allocated) { + mo_wal_persist(&pop->p_ops, s->persistent, + sizeof(struct stats_persistent)); + s->transient->heap_prev_pval = + s->persistent->heap_curr_allocated; + } +} + +DAV_FUNC_EXPORT int +dav_get_heap_stats_v2(dav_obj_t *pop, struct dav_heap_stats *st) +{ + if ((pop == NULL) || (st == NULL)) { + errno = EINVAL; + return -1; + } + + st->curr_allocated = pop->do_stats->persistent->heap_curr_allocated; + st->run_allocated = pop->do_stats->transient->heap_run_allocated; + st->run_active = pop->do_stats->transient->heap_run_active; + return 0; +} + +DAV_FUNC_EXPORT int +dav_get_heap_mb_stats_v2(dav_obj_t *pop, uint32_t mb_id, struct dav_heap_mb_stats *st) +{ + return heap_mbrt_getmb_usage(pop->do_heap, mb_id, &st->dhms_allocated, &st->dhms_maxsz); +} diff --git a/src/common/dav_v2/stats.h b/src/common/dav_v2/stats.h new file mode 100644 index 00000000000..a295563ec5f --- /dev/null +++ b/src/common/dav_v2/stats.h @@ -0,0 +1,61 @@ +/* SPDX-License-Identifier: BSD-3-Clause */ +/* Copyright 2017-2023, Intel Corporation */ + +/* + * stats.h -- definitions of statistics + */ + +#ifndef __DAOS_COMMON_STATS_H +#define __DAOS_COMMON_STATS_H 1 + +struct stats_transient { + uint64_t heap_run_allocated; + uint64_t heap_run_active; + uint64_t heap_prev_pval; /* previous persisted value of curr allocated */ +}; + +struct stats_persistent { + uint64_t heap_curr_allocated; +}; + +struct stats { + struct stats_transient *transient; + struct 
stats_persistent *persistent; +}; + +#define STATS_INC(stats, type, name, value) \ + STATS_INC_##type(stats, name, value) + +#define STATS_INC_transient(stats, name, value)\ + util_fetch_and_add64((&(stats)->transient->name), (value)) + +#define STATS_INC_persistent(stats, name, value)\ + util_fetch_and_add64((&(stats)->persistent->name), (value)) + +#define STATS_SUB(stats, type, name, value)\ + STATS_SUB_##type(stats, name, value) + +#define STATS_SUB_transient(stats, name, value)\ + util_fetch_and_sub64((&(stats)->transient->name), (value)) + +#define STATS_SUB_persistent(stats, name, value)\ + util_fetch_and_sub64((&(stats)->persistent->name), (value)) + +#define STATS_SET(stats, type, name, value)\ + STATS_SET_##type(stats, name, value) + +#define STATS_SET_transient(stats, name, value)\ + util_atomic_store_explicit64((&(stats)->transient->name),\ + (value), memory_order_release)\ + +#define STATS_SET_persistent(stats, name, value)\ + util_atomic_store_explicit64((&(stats)->persistent->name),\ + (value), memory_order_release)\ + +struct dav_obj; + +struct stats *stats_new(struct dav_obj *pop); +void stats_delete(struct dav_obj *pop, struct stats *stats); +void stats_persist(struct dav_obj *pop, struct stats *s); + +#endif /* __DAOS_COMMON_STATS_H */ diff --git a/src/common/dav_v2/sys_util.h b/src/common/dav_v2/sys_util.h new file mode 100644 index 00000000000..3730f60c0ce --- /dev/null +++ b/src/common/dav_v2/sys_util.h @@ -0,0 +1,83 @@ +/* SPDX-License-Identifier: BSD-3-Clause */ +/* Copyright 2016-2023, Intel Corporation */ + +/* + * sys_util.h -- internal utility wrappers around system functions + */ + +#ifndef __DAOS_COMMON_SYS_UTIL_H +#define __DAOS_COMMON_SYS_UTIL_H 1 + +#include + +#include +#include "out.h" + +/* + * util_mutex_init -- os_mutex_init variant that never fails from + * caller perspective. If os_mutex_init failed, this function aborts + * the program. + */ +static inline void +util_mutex_init(pthread_mutex_t *m) +{ + int tmp = D_MUTEX_INIT(m, NULL); + + D_ASSERTF(tmp == 0, "!os_mutex_init"); +} + +/* + * util_mutex_destroy -- os_mutex_destroy variant that never fails from + * caller perspective. If os_mutex_destroy failed, this function aborts + * the program. + */ +static inline void +util_mutex_destroy(pthread_mutex_t *m) +{ + int tmp = D_MUTEX_DESTROY(m); + + D_ASSERTF(tmp == 0, "!os_mutex_destroy"); +} + +/* + * util_mutex_lock -- os_mutex_lock variant that never fails from + * caller perspective. If os_mutex_lock failed, this function aborts + * the program. + */ +static inline void +util_mutex_lock(pthread_mutex_t *m) +{ + int tmp = D_MUTEX_LOCK(m); + + D_ASSERTF(tmp == 0, "!os_mutex_destroy"); +} + +/* + * util_mutex_trylock -- os_mutex_trylock variant that never fails from + * caller perspective (other than EBUSY). If util_mutex_trylock failed, this + * function aborts the program. + * Returns 0 if locked successfully, otherwise returns EBUSY. + */ +static inline int +util_mutex_trylock(pthread_mutex_t *m) +{ + int tmp = D_MUTEX_TRYLOCK(m); + + D_ASSERTF((!tmp || (tmp == -DER_BUSY)), "!os_mutex_trylock"); + return tmp?EBUSY:0; +} + +/* + * util_mutex_unlock -- os_mutex_unlock variant that never fails from + * caller perspective. If os_mutex_unlock failed, this function aborts + * the program. 
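+ *
+ * With these wrappers the callers skip error handling entirely, e.g.:
+ *
+ *	util_mutex_lock(&r->lock);
+ *	... critical section ...
+ *	util_mutex_unlock(&r->lock);
+ *
+ * util_mutex_trylock() is the one exception: it returns EBUSY when the
+ * lock is already held (see its use in recycler_recalc()).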
+ */ +static inline void +util_mutex_unlock(pthread_mutex_t *m) +{ + int tmp = D_MUTEX_UNLOCK(m); + + D_ASSERTF(tmp == 0, "!os_mutex_unlock"); +} + +#endif /* __DAOS_COMMON_SYS_UTIL_H */ diff --git a/src/common/dav_v2/tx.c b/src/common/dav_v2/tx.c new file mode 100644 index 00000000000..d50c3f52299 --- /dev/null +++ b/src/common/dav_v2/tx.c @@ -0,0 +1,1895 @@ +/* SPDX-License-Identifier: BSD-3-Clause */ +/* Copyright 2015-2024, Intel Corporation */ + +/* + * tx.c -- transactions implementation + */ + +#include +#include +#include + +#include "queue.h" +#include "ravl.h" +#include "obj.h" +#include "out.h" +#include "tx.h" +#include "valgrind_internal.h" +#include "memops.h" +#include "dav_internal.h" + +struct tx_data { + DAV_SLIST_ENTRY(tx_data) tx_entry; + jmp_buf env; + enum dav_tx_failure_behavior failure_behavior; +}; + +struct tx { + dav_obj_t *pop; + enum dav_tx_stage stage; + int last_errnum; + + DAV_SLIST_HEAD(txd, tx_data) tx_entries; + + struct ravl *ranges; + + VEC(, struct dav_action) actions; + + dav_tx_callback stage_callback; + void *stage_callback_arg; + + int first_snapshot; +}; + +/* + * get_tx -- returns current transaction + * + * This function should be used only in high-level functions. + */ +static struct tx * +get_tx() +{ + static __thread struct tx tx; + + return &tx; +} + +struct tx_alloc_args { + uint64_t flags; + const void *copy_ptr; + size_t copy_size; +}; + +#define ALLOC_ARGS(flags)\ +(struct tx_alloc_args){flags, NULL, 0} + +struct tx_range_def { + uint64_t offset; + uint64_t size; + uint64_t flags; +}; + +/* + * tx_range_def_cmp -- compares two snapshot ranges + */ +static int +tx_range_def_cmp(const void *lhs, const void *rhs) +{ + const struct tx_range_def *l = lhs; + const struct tx_range_def *r = rhs; + + if (l->offset > r->offset) + return 1; + else if (l->offset < r->offset) + return -1; + + return 0; +} + +static void +obj_tx_abort(int errnum, int user); + +/* + * obj_tx_fail_err -- (internal) dav_tx_abort variant that returns + * error code + */ +static inline int +obj_tx_fail_err(int errnum, uint64_t flags) +{ + if ((flags & DAV_FLAG_TX_NO_ABORT) == 0) + obj_tx_abort(errnum, 0); + errno = errnum; + return errnum; +} + +/* + * obj_tx_fail_null -- (internal) dav_tx_abort variant that returns + * null PMEMoid + */ +static inline uint64_t +obj_tx_fail_null(int errnum, uint64_t flags) +{ + if ((flags & DAV_FLAG_TX_NO_ABORT) == 0) + obj_tx_abort(errnum, 0); + errno = errnum; + return 0; +} + +/* ASSERT_IN_TX -- checks whether there's open transaction */ +#define ASSERT_IN_TX(tx) do {\ + if ((tx)->stage == DAV_TX_STAGE_NONE)\ + FATAL("%s called outside of transaction", __func__);\ +} while (0) + +/* ASSERT_TX_STAGE_WORK -- checks whether current transaction stage is WORK */ +#define ASSERT_TX_STAGE_WORK(tx) do {\ + if ((tx)->stage != DAV_TX_STAGE_WORK)\ + FATAL("%s called in invalid stage %d", __func__, (tx)->stage);\ +} while (0) + +/* + * tx_action_reserve -- (internal) reserve space for the given number of actions + */ +static int +tx_action_reserve(struct tx *tx, size_t n) +{ + size_t entries_size = (VEC_SIZE(&tx->actions) + n) * + sizeof(struct ulog_entry_val); + + if (operation_reserve(tx->pop->external, entries_size) != 0) + return -1; + + return 0; +} + +/* + * tx_action_add -- (internal) reserve space and add a new tx action + */ +static struct dav_action * +tx_action_add(struct tx *tx) +{ + if (tx_action_reserve(tx, 1) != 0) + return NULL; + + VEC_INC_BACK(&tx->actions); + + return &VEC_BACK(&tx->actions); +} + +/* + * tx_action_remove -- 
(internal) remove last tx action + */ +static void +tx_action_remove(struct tx *tx) +{ + VEC_POP_BACK(&tx->actions); +} + +/* + * constructor_tx_alloc -- (internal) constructor for normal alloc + */ +static int +constructor_tx_alloc(void *ctx, void *ptr, size_t usable_size, void *arg) +{ + ASSERTne(ptr, NULL); + ASSERTne(arg, NULL); + + struct tx_alloc_args *args = arg; + + /* do not report changes to the new object */ + VALGRIND_ADD_TO_TX(ptr, usable_size); + + if (args->flags & DAV_FLAG_ZERO) + memset(ptr, 0, usable_size); + + if (args->copy_ptr && args->copy_size != 0) { + FATAL("dav xalloc does not support copy_ptr\n"); + memcpy(ptr, args->copy_ptr, args->copy_size); + } + + return 0; +} + +/* + * tx_restore_range -- (internal) restore a single range from undo log + */ +static void +tx_restore_range(dav_obj_t *pop, struct ulog_entry_buf *range) +{ + void *begin, *end; + size_t size = range->size; + uint64_t range_offset = ulog_entry_offset(&range->base); + + begin = OBJ_OFF_TO_PTR(pop, range_offset); + end = (char *)begin + size; + ASSERT((char *)end >= (char *)begin); + + memcpy(begin, range->data, size); +} + +/* + * tx_undo_entry_apply -- applies modifications of a single ulog entry + */ +static int +tx_undo_entry_apply(struct ulog_entry_base *e, void *arg, + const struct mo_ops *p_ops) +{ + /* suppress unused-parameter errors */ + SUPPRESS_UNUSED(arg); + + struct ulog_entry_buf *eb; + + switch (ulog_entry_type(e)) { + case ULOG_OPERATION_BUF_CPY: + eb = (struct ulog_entry_buf *)e; + + tx_restore_range(p_ops->base, eb); + break; +#ifdef WAL_SUPPORTS_AND_OR_OPS + case ULOG_OPERATION_AND: + case ULOG_OPERATION_OR: +#else + case ULOG_OPERATION_CLR_BITS: + case ULOG_OPERATION_SET_BITS: +#endif + case ULOG_OPERATION_SET: + case ULOG_OPERATION_BUF_SET: + default: + ASSERT(0); + } + + return 0; +} + +/* + * tx_abort_set -- (internal) abort all set operations + */ +static void +tx_abort_set(dav_obj_t *pop) +{ + ulog_foreach_entry((struct ulog *)&pop->clogs.undo, + tx_undo_entry_apply, NULL, &pop->p_ops); + operation_finish(pop->undo, ULOG_INC_FIRST_GEN_NUM); +} + +/* + * tx_flush_range -- (internal) flush one range + */ +static void +tx_flush_range(void *data, void *ctx) +{ + dav_obj_t *pop = ctx; + struct tx_range_def *range = data; + + if (!(range->flags & DAV_FLAG_NO_FLUSH)) { + mo_wal_flush(&pop->p_ops, OBJ_OFF_TO_PTR(pop, range->offset), + range->size, range->flags & DAV_XADD_WAL_CPTR); + } + VALGRIND_REMOVE_FROM_TX(OBJ_OFF_TO_PTR(pop, range->offset), + range->size); +} + +/* + * tx_clean_range -- (internal) clean one range + */ +static void +tx_clean_range(void *data, void *ctx) +{ + dav_obj_t *pop = ctx; + struct tx_range_def *range = data; + + VALGRIND_REMOVE_FROM_TX(OBJ_OFF_TO_PTR(pop, range->offset), + range->size); + VALGRIND_SET_CLEAN(OBJ_OFF_TO_PTR(pop, range->offset), range->size); +} + +/* + * tx_pre_commit -- (internal) do pre-commit operations + */ +static void +tx_pre_commit(struct tx *tx) +{ + /* Flush all regions and destroy the whole tree. 
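+ * tx_flush_range() skips the WAL flush for ranges added with
+ * DAV_FLAG_NO_FLUSH and only drops them from Valgrind's tracking.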
*/ + ravl_delete_cb(tx->ranges, tx_flush_range, tx->pop); + tx->ranges = NULL; +} + +/* + * tx_abort -- (internal) abort all allocated objects + */ +static void +tx_abort(dav_obj_t *pop) +{ + struct tx *tx = get_tx(); + + tx_abort_set(pop); + + ravl_delete_cb(tx->ranges, tx_clean_range, pop); + palloc_cancel(pop->do_heap, + VEC_ARR(&tx->actions), VEC_SIZE(&tx->actions)); + tx->ranges = NULL; +} + +/* + * tx_ranges_insert_def -- (internal) allocates and inserts a new range + * definition into the ranges tree + */ +static int +tx_ranges_insert_def(dav_obj_t *pop, struct tx *tx, + const struct tx_range_def *rdef) +{ + /* suppress unused-parameter errors */ + SUPPRESS_UNUSED(pop); + + DAV_DBG("(%lu,%lu) size=%zu", + rdef->offset / 4096, rdef->offset % 4096, rdef->size); + + int ret = ravl_emplace_copy(tx->ranges, rdef); + + if (ret && errno == EEXIST) + FATAL("invalid state of ranges tree"); + return ret; +} + +/* + * tx_alloc_common -- (internal) common function for alloc and zalloc + */ +static uint64_t +tx_alloc_common(struct tx *tx, size_t size, type_num_t type_num, + palloc_constr constructor, struct tx_alloc_args args) +{ + const struct tx_range_def *r; + uint64_t off; + + if (size > DAV_MAX_ALLOC_SIZE) { + ERR("requested size too large"); + return obj_tx_fail_null(ENOMEM, args.flags); + } + + dav_obj_t *pop = tx->pop; + + struct dav_action *action = tx_action_add(tx); + + if (action == NULL) + return obj_tx_fail_null(ENOMEM, args.flags); + + if (palloc_reserve(pop->do_heap, size, constructor, &args, type_num, 0, + CLASS_ID_FROM_FLAG(args.flags), EZONE_ID_FROM_FLAG(args.flags), + action) != 0) + goto err_oom; + + palloc_get_prange(action, &off, &size, 1); + r = &(struct tx_range_def){off, size, args.flags}; + if (tx_ranges_insert_def(pop, tx, r) != 0) + goto err_oom; + + return action->heap.offset; + +err_oom: + tx_action_remove(tx); + D_CRIT("out of memory\n"); + return obj_tx_fail_null(ENOMEM, args.flags); +} + +/* + * tx_create_wal_entry -- convert to WAL a single ulog UNDO entry + */ +int +tx_create_wal_entry(struct ulog_entry_base *e, void *arg, + const struct mo_ops *p_ops) +{ + /* suppress unused-parameter errors */ + SUPPRESS_UNUSED(arg); + + int rc = 0; + uint64_t offset = ulog_entry_offset(e); + daos_size_t dst_size = sizeof(uint64_t); + struct ulog_entry_val *ev; + struct ulog_entry_buf *eb; + uint64_t v; + uint64_t *dst; + + D_ASSERT(p_ops->base != NULL); + dst = umem_cache_off2ptr(p_ops->umem_store, offset); + + switch (ulog_entry_type(e)) { +#ifdef WAL_SUPPORTS_AND_OR_OPS + case ULOG_OPERATION_AND: + ev = (struct ulog_entry_val *)e; + v = ev->value; + + rc = dav_wal_tx_and(p_ops->base, dst, v); + break; + case ULOG_OPERATION_OR: + ev = (struct ulog_entry_val *)e; + v = ev->value; + + rc = dav_wal_tx_or(p_ops->base, dst, v); + break; +#else + case ULOG_OPERATION_CLR_BITS: + ev = (struct ulog_entry_val *)e; + v = ev->value; + + rc = dav_wal_tx_clr_bits(p_ops->base, dst, ULOG_ENTRY_VAL_TO_POS(v), + ULOG_ENTRY_VAL_TO_BITS(v)); + break; + case ULOG_OPERATION_SET_BITS: + ev = (struct ulog_entry_val *)e; + v = ev->value; + + rc = dav_wal_tx_set_bits(p_ops->base, dst, ULOG_ENTRY_VAL_TO_POS(v), + ULOG_ENTRY_VAL_TO_BITS(v)); + break; +#endif + case ULOG_OPERATION_SET: + ev = (struct ulog_entry_val *)e; + + rc = dav_wal_tx_snap(p_ops->base, dst, dst_size, (void *)&ev->value, 0); + break; + case ULOG_OPERATION_BUF_SET: + eb = (struct ulog_entry_buf *)e; + + dst_size = eb->size; + rc = dav_wal_tx_set(p_ops->base, dst, 0, dst_size); + break; + case ULOG_OPERATION_BUF_CPY: + eb = (struct 
ulog_entry_buf *)e; + + dst_size = eb->size; + /* The only undo entry from dav that needs to be + * transformed into redo + */ + rc = dav_wal_tx_snap(p_ops->base, dst, dst_size, dst, 0); + break; + default: + ASSERT(0); + } + + return rc; +} + +int +lw_tx_begin(dav_obj_t *pop) +{ + struct umem_wal_tx *utx = NULL; + int rc; + uint64_t wal_id; + + rc = umem_cache_reserve(pop->do_store); + if (rc) { + D_ERROR("umem_cache_reserve failed, " DF_RC "\n", DP_RC(rc)); + return rc; + } + rc = dav_wal_tx_reserve(pop, &wal_id); + if (rc) { + D_ERROR("so_wal_reserv failed, "DF_RC"\n", DP_RC(rc)); + return rc; + } + if (pop->do_utx == NULL) { + utx = dav_umem_wtx_new(pop); + if (utx == NULL) { + D_ERROR("dav_umem_wtx_new failed\n"); + return ENOMEM; + } + } + pop->do_utx->utx_id = wal_id; + return rc; +} + +int +lw_tx_end(dav_obj_t *pop, void *data) +{ + struct umem_wal_tx *utx; + int rc; + + /* Persist the frequently updated persistent globals */ + stats_persist(pop, pop->do_stats); + + utx = pop->do_utx; + D_ASSERT(utx != NULL); + pop->do_utx = NULL; + + rc = dav_wal_tx_commit(pop, utx, data); + D_FREE(utx); + return rc; +} + +/* + * dav_tx_begin -- initializes new transaction + */ +DAV_FUNC_EXPORT int +dav_tx_begin_v2(dav_obj_t *pop, jmp_buf env, ...) +{ + int err = 0; + struct tx *tx = get_tx(); + uint64_t wal_id; + + enum dav_tx_failure_behavior failure_behavior = DAV_TX_FAILURE_ABORT; + + if (tx->stage == DAV_TX_STAGE_WORK) { + if (tx->pop != pop) { + ERR("nested transaction for different pool"); + return obj_tx_fail_err(EINVAL, 0); + } + + /* inherits this value from the parent transaction */ + struct tx_data *txd = DAV_SLIST_FIRST(&tx->tx_entries); + + failure_behavior = txd->failure_behavior; + + VALGRIND_START_TX; + } else if (tx->stage == DAV_TX_STAGE_NONE) { + struct umem_wal_tx *utx = NULL; + + DAV_DBG(""); + err = umem_cache_reserve(pop->do_store); + if (err) { + D_ERROR("umem_cache_reserve failed, " DF_RC "\n", DP_RC(err)); + err = daos_der2errno(err); + goto err_abort; + } + + err = dav_wal_tx_reserve(pop, &wal_id); + if (err) { + D_ERROR("so_wal_reserv failed, "DF_RC"\n", DP_RC(err)); + goto err_abort; + } + + if (pop->do_utx == NULL) { + utx = dav_umem_wtx_new(pop); + if (utx == NULL) { + err = ENOMEM; + goto err_abort; + } + } + pop->do_utx->utx_id = wal_id; + + tx = get_tx(); + + VALGRIND_START_TX; + + dav_hold_clogs(pop); + operation_start(pop->undo); + + VEC_INIT(&tx->actions); + DAV_SLIST_INIT(&tx->tx_entries); + + tx->ranges = ravl_new_sized(tx_range_def_cmp, + sizeof(struct tx_range_def)); + tx->first_snapshot = 1; + tx->pop = pop; + } else { + FATAL("Invalid stage %d to begin new transaction", tx->stage); + } + + struct tx_data *txd; + + D_ALLOC_PTR_NZ(txd); + if (txd == NULL) { + err = errno; + D_CRIT("Malloc!\n"); + goto err_abort; + } + + tx->last_errnum = 0; + ASSERT(env == NULL); + if (env != NULL) + memcpy(txd->env, env, sizeof(jmp_buf)); + else + memset(txd->env, 0, sizeof(jmp_buf)); + + txd->failure_behavior = failure_behavior; + + DAV_SLIST_INSERT_HEAD(&tx->tx_entries, txd, tx_entry); + + tx->stage = DAV_TX_STAGE_WORK; + + /* handle locks */ + va_list argp; + + va_start(argp, env); + + enum dav_tx_param param_type; + + while ((param_type = va_arg(argp, enum dav_tx_param)) != + DAV_TX_PARAM_NONE) { + if (param_type == DAV_TX_PARAM_CB) { + dav_tx_callback cb = + va_arg(argp, dav_tx_callback); + void *arg = va_arg(argp, void *); + + if (tx->stage_callback && + (tx->stage_callback != cb || + tx->stage_callback_arg != arg)) { + FATAL( + "transaction callback is already set, 
old %p new %p old_arg %p new_arg %p", + tx->stage_callback, cb, + tx->stage_callback_arg, arg); + } + + tx->stage_callback = cb; + tx->stage_callback_arg = arg; + } else { + ASSERT(param_type == DAV_TX_PARAM_CB); + } + } + va_end(argp); + + ASSERT(err == 0); + return 0; + +err_abort: + if (tx->stage == DAV_TX_STAGE_WORK) + obj_tx_abort(err, 0); + else + tx->stage = DAV_TX_STAGE_ONABORT; + return err; +} + +/* + * tx_abort_on_failure_flag -- (internal) return 0 or DAV_FLAG_TX_NO_ABORT + * based on transaction setting + */ +static uint64_t +tx_abort_on_failure_flag(struct tx *tx) +{ + struct tx_data *txd = DAV_SLIST_FIRST(&tx->tx_entries); + + if (txd->failure_behavior == DAV_TX_FAILURE_RETURN) + return DAV_FLAG_TX_NO_ABORT; + return 0; +} + +/* + * obj_tx_callback -- (internal) executes callback associated with current stage + */ +static void +obj_tx_callback(struct tx *tx) +{ + if (!tx->stage_callback) + return; + + struct tx_data *txd = DAV_SLIST_FIRST(&tx->tx_entries); + + /* is this the outermost transaction? */ + if (DAV_SLIST_NEXT(txd, tx_entry) == NULL) + tx->stage_callback(tx->pop, tx->stage, tx->stage_callback_arg); +} + +/* + * dav_tx_stage -- returns current transaction stage + */ +DAV_FUNC_EXPORT enum dav_tx_stage +dav_tx_stage_v2(void) +{ + return get_tx()->stage; +} + +/* + * obj_tx_abort -- aborts current transaction + */ +static void +obj_tx_abort(int errnum, int user) +{ + struct tx *tx = get_tx(); + + ASSERT_IN_TX(tx); + ASSERT_TX_STAGE_WORK(tx); + ASSERT(tx->pop != NULL); + + if (errnum == 0) + errnum = ECANCELED; + + tx->stage = DAV_TX_STAGE_ONABORT; + struct tx_data *txd = DAV_SLIST_FIRST(&tx->tx_entries); + + if (DAV_SLIST_NEXT(txd, tx_entry) == NULL) { + /* this is the outermost transaction */ + + /* process the undo log */ + tx_abort(tx->pop); + + dav_release_clogs(tx->pop); + } + + tx->last_errnum = errnum; + errno = errnum; + if (user) { + DAV_DBG("!explicit transaction abort"); + } + + /* ONABORT */ + obj_tx_callback(tx); + + if (!util_is_zeroed(txd->env, sizeof(jmp_buf))) + longjmp(txd->env, errnum); +} + +/* + * dav_tx_abort -- aborts current transaction + * + * Note: this function should not be called from inside of dav. 
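+ *
+ * Caller-side sketch (illustrative; hoff, size, update_ok and rc are
+ * hypothetical caller variables, env is expected to be NULL as the
+ * code asserts, and the vararg list must end with DAV_TX_PARAM_NONE):
+ *
+ *	if (dav_tx_begin_v2(pop, NULL, DAV_TX_PARAM_NONE) == 0) {
+ *		if (dav_tx_add_range_v2(hoff, size) == 0 && update_ok)
+ *			dav_tx_commit_v2();
+ *		else if (dav_tx_stage_v2() == DAV_TX_STAGE_WORK)
+ *			dav_tx_abort_v2(ECANCELED);
+ *		rc = dav_tx_end_v2(NULL);
+ *	}
+ *
+ * A failed dav_tx_add_range_v2() already aborts the transaction under
+ * the default DAV_TX_FAILURE_ABORT behavior, hence the stage check
+ * before the explicit abort.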
+ */ +DAV_FUNC_EXPORT void +dav_tx_abort_v2(int errnum) +{ + DAV_API_START(); + DAV_DBG(""); + obj_tx_abort(errnum, 1); + DAV_API_END(); +} + +/* + * dav_tx_errno -- returns last transaction error code + */ +DAV_FUNC_EXPORT int +dav_tx_errno_v2(void) +{ + DAV_DBG("err:%d", get_tx()->last_errnum); + + return get_tx()->last_errnum; +} + +static void +tx_post_commit(struct tx *tx) +{ + operation_finish(tx->pop->undo, 0); +} + +/* + * dav_tx_commit -- commits current transaction + */ +DAV_FUNC_EXPORT void +dav_tx_commit_v2(void) +{ + DAV_API_START(); + struct tx *tx = get_tx(); + + ASSERT_IN_TX(tx); + ASSERT_TX_STAGE_WORK(tx); + ASSERT(tx->pop); + DAV_DBG(""); + + /* WORK */ + obj_tx_callback(tx); + dav_obj_t *pop = tx->pop; + + struct tx_data *txd = DAV_SLIST_FIRST(&tx->tx_entries); + + if (DAV_SLIST_NEXT(txd, tx_entry) == NULL) { + /* this is the outermost transaction */ + + /* pre-commit phase */ + tx_pre_commit(tx); + + mo_wal_drain(&pop->p_ops); + + operation_start(pop->external); + + palloc_publish(pop->do_heap, VEC_ARR(&tx->actions), + VEC_SIZE(&tx->actions), pop->external); + + tx_post_commit(tx); + + dav_release_clogs(pop); + } + + tx->stage = DAV_TX_STAGE_ONCOMMIT; + + /* ONCOMMIT */ + obj_tx_callback(tx); + DAV_API_END(); +} + +/* + * dav_tx_end -- ends current transaction + */ +DAV_FUNC_EXPORT int +dav_tx_end_v2(void *data) +{ + struct tx *tx = get_tx(); + + if (tx->stage == DAV_TX_STAGE_WORK) + FATAL("dav_tx_end called without dav_tx_commit"); + + if (tx->pop == NULL) + FATAL("dav_tx_end called without dav_tx_begin"); + + if (tx->stage_callback && + (tx->stage == DAV_TX_STAGE_ONCOMMIT || + tx->stage == DAV_TX_STAGE_ONABORT)) { + tx->stage = DAV_TX_STAGE_FINALLY; + obj_tx_callback(tx); + } + + struct tx_data *txd = DAV_SLIST_FIRST(&tx->tx_entries); + + DAV_SLIST_REMOVE_HEAD(&tx->tx_entries, tx_entry); + + D_FREE(txd); + + VALGRIND_END_TX; + int ret = tx->last_errnum; + + if (DAV_SLIST_EMPTY(&tx->tx_entries)) { + dav_obj_t *pop = tx->pop; + dav_tx_callback cb = tx->stage_callback; + void *arg = tx->stage_callback_arg; + int rc; + + DAV_DBG(""); + ASSERT(pop); + tx->pop = NULL; + tx->stage = DAV_TX_STAGE_NONE; + tx->stage_callback = NULL; + tx->stage_callback_arg = NULL; + + VEC_DELETE(&tx->actions); + /* tx should not be accessed after this */ + + /* commit to WAL */ + rc = lw_tx_end(pop, data); + /* TODO: Handle WAL commit errors */ + D_ASSERT(rc == 0); + + if (cb) + cb(pop, DAV_TX_STAGE_NONE, arg); + } else { + /* resume the next transaction */ + tx->stage = DAV_TX_STAGE_WORK; + + /* abort called within inner transaction, waterfall the error */ + if (tx->last_errnum) + obj_tx_abort(tx->last_errnum, 0); + } + + return ret; +} + +/* + * vg_verify_initialized -- when executed under Valgrind verifies that + * the buffer has been initialized; explicit check at snapshotting time, + * because Valgrind may find it much later when it's impossible to tell + * for which snapshot it triggered + */ +static void +vg_verify_initialized(dav_obj_t *pop, const struct tx_range_def *def) +{ + /* suppress unused-parameter errors */ + SUPPRESS_UNUSED(pop, def); +#if VG_MEMCHECK_ENABLED + if (!On_memcheck) + return; + + VALGRIND_DO_DISABLE_ERROR_REPORTING; + char *start = OBJ_OFF_TO_PTR(pop, def->offset); + char *uninit = (char *)VALGRIND_CHECK_MEM_IS_DEFINED(start, def->size); + + if (uninit) { + VALGRIND_PRINTF( + "Snapshotting uninitialized data in range <%p,%p> ()\n", + start, start + def->size, def->offset, def->size); + + if (uninit != start) + VALGRIND_PRINTF("Uninitialized data starts at: %p\n", 
+ uninit); + + VALGRIND_DO_ENABLE_ERROR_REPORTING; + VALGRIND_CHECK_MEM_IS_DEFINED(start, def->size); + } else { + VALGRIND_DO_ENABLE_ERROR_REPORTING; + } +#endif +} + +/* + * dav_tx_add_snapshot -- (internal) creates a variably sized snapshot + */ +static int +dav_tx_add_snapshot(struct tx *tx, struct tx_range_def *snapshot) +{ + /* + * Depending on the size of the block, either allocate an + * entire new object or use cache. + */ + void *ptr = OBJ_OFF_TO_PTR(tx->pop, snapshot->offset); + + VALGRIND_ADD_TO_TX(ptr, snapshot->size); + + /* do nothing */ + if (snapshot->flags & DAV_XADD_NO_SNAPSHOT) + return 0; + + if (!(snapshot->flags & DAV_XADD_ASSUME_INITIALIZED)) + vg_verify_initialized(tx->pop, snapshot); + + /* + * If we are creating the first snapshot, setup a redo log action to + * increment counter in the undo log, so that the log becomes + * invalid once the redo log is processed. + */ + if (tx->first_snapshot) { + struct dav_action *action = tx_action_add(tx); + + if (action == NULL) + return -1; + + uint64_t *n = &tx->pop->clogs.undo.gen_num; + + palloc_set_value(tx->pop->do_heap, action, + n, *n + 1); + + tx->first_snapshot = 0; + } + + return operation_add_buffer(tx->pop->undo, ptr, ptr, snapshot->size, + ULOG_OPERATION_BUF_CPY); +} + +/* + * dav_tx_merge_flags -- (internal) common code for merging flags between + * two ranges to ensure resultant behavior is correct + */ +static void +dav_tx_merge_flags(struct tx_range_def *dest, struct tx_range_def *merged) +{ + /* + * DAV_XADD_NO_FLUSH should only be set in merged range if set in + * both ranges + */ + if ((dest->flags & DAV_XADD_NO_FLUSH) && + !(merged->flags & DAV_XADD_NO_FLUSH)) { + dest->flags = dest->flags & (~DAV_XADD_NO_FLUSH); + } + + /* + * Extend DAV_XADD_WAL_CPTR when merged. + * REVISIT: Ideally merge should happen only if address ranges + * overlap. Current code merges adjacent ranges even if only one + * of them has this flag set. Fix this before closing DAOS-11049. + */ + if (merged->flags & DAV_XADD_WAL_CPTR) + dest->flags = dest->flags | DAV_XADD_WAL_CPTR; +} + +/* + * dav_tx_add_common -- (internal) common code for adding persistent memory + * into the transaction + */ +static int +dav_tx_add_common(struct tx *tx, struct tx_range_def *args) +{ + if (args->size > DAV_MAX_ALLOC_SIZE) { + ERR("snapshot size too large"); + return obj_tx_fail_err(EINVAL, args->flags); + } + + if (!OBJ_OFFRANGE_FROM_HEAP(tx->pop, args->offset, (args->offset + args->size))) { + ERR("object outside of heap"); + return obj_tx_fail_err(EINVAL, args->flags); + } + + int ret = 0; + + /* + * Search existing ranges backwards starting from the end of the + * snapshot. + */ + struct tx_range_def r = *args; + + DAV_DBG("(%lu,%lu) size=%zu", r.offset / 4096, r.offset % 4096, r.size); + struct tx_range_def search = {0, 0, 0}; + /* + * If the range is directly adjacent to an existing one, + * they can be merged, so search for less or equal elements. + */ + enum ravl_predicate p = RAVL_PREDICATE_LESS_EQUAL; + struct ravl_node *nprev = NULL; + + while (r.size != 0) { + search.offset = r.offset + r.size; + struct ravl_node *n = ravl_find(tx->ranges, &search, p); + /* + * We have to skip searching for LESS_EQUAL because + * the snapshot we would find is the one that was just + * created. + */ + p = RAVL_PREDICATE_LESS; + + struct tx_range_def *f = n ? ravl_data(n) : NULL; + + size_t fend = f == NULL ? 
0 : f->offset + f->size; + size_t rend = r.offset + r.size; + + if (fend == 0 || fend < r.offset) { + /* + * If found no range or the found range is not + * overlapping or adjacent on the left side, we can just + * create the entire r.offset + r.size snapshot. + * + * Snapshot: + * --+- + * Existing ranges: + * ---- (no ranges) + * or +--- (no overlap) + * or ---+ (adjacent on on right side) + */ + if (nprev != NULL) { + /* + * But, if we have an existing adjacent snapshot + * on the right side, we can just extend it to + * include the desired range. + */ + struct tx_range_def *fprev = ravl_data(nprev); + + ASSERTeq(rend, fprev->offset); + fprev->offset -= r.size; + fprev->size += r.size; + } else { + /* + * If we don't have anything adjacent, create + * a new range in the tree. + */ + ret = tx_ranges_insert_def(tx->pop, + tx, &r); + if (ret != 0) + break; + } + ret = dav_tx_add_snapshot(tx, &r); + break; + } else if (fend <= rend) { + /* + * If found range has its end inside of the desired + * snapshot range, we can extend the found range by the + * size leftover on the left side. + * + * Snapshot: + * --+++-- + * Existing ranges: + * +++---- (overlap on left) + * or ---+--- (found snapshot is inside) + * or ---+-++ (inside, and adjacent on the right) + * or +++++-- (desired snapshot is inside) + * + */ + struct tx_range_def snapshot = *args; + + snapshot.offset = fend; + /* the side not yet covered by an existing snapshot */ + snapshot.size = rend - fend; + + /* the number of bytes intersecting in both ranges */ + size_t intersection = fend - MAX(f->offset, r.offset); + + r.size -= intersection + snapshot.size; + f->size += snapshot.size; + dav_tx_merge_flags(f, args); + + if (snapshot.size != 0) { + ret = dav_tx_add_snapshot(tx, &snapshot); + if (ret != 0) + break; + } + + /* + * If there's a snapshot adjacent on right side, merge + * the two ranges together. + */ + if (nprev != NULL) { + struct tx_range_def *fprev = ravl_data(nprev); + + ASSERTeq(rend, fprev->offset); + f->size += fprev->size; + dav_tx_merge_flags(f, fprev); + ravl_remove(tx->ranges, nprev); + } + } else if (fend >= r.offset) { + /* + * If found range has its end extending beyond the + * desired snapshot. + * + * Snapshot: + * --+++-- + * Existing ranges: + * -----++ (adjacent on the right) + * or ----++- (overlapping on the right) + * or ----+++ (overlapping and adjacent on the right) + * or --+++++ (desired snapshot is inside) + * + * Notice that we cannot create a snapshot based solely + * on this information without risking overwriting an + * existing one. We have to continue iterating, but we + * keep the information about adjacent snapshots in the + * nprev variable. 
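+ *
+ * Worked example (illustrative numbers): a request of [100, 160)
+ * against an existing range [140, 180) trims the 20-byte overlap from
+ * the right, continues the loop with r = [100, 140), and remembers
+ * this node in nprev for a possible later merge.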
+ */ + size_t overlap = rend - MAX(f->offset, r.offset); + + r.size -= overlap; + dav_tx_merge_flags(f, args); + } else { + ASSERT(0); + } + + nprev = n; + } + + if (ret != 0) { + DAV_DBG("out of memory\n"); + return obj_tx_fail_err(ENOMEM, args->flags); + } + + return 0; +} + +/* + * dav_tx_add_range_direct -- adds persistent memory range into the + * transaction + */ +DAV_FUNC_EXPORT int +dav_tx_add_range_direct_v2(const void *ptr, size_t size) +{ + DAV_API_START(); + struct tx *tx = get_tx(); + + ASSERT_IN_TX(tx); + ASSERT_TX_STAGE_WORK(tx); + ASSERT(tx->pop != NULL); + + int ret; + + uint64_t flags = tx_abort_on_failure_flag(tx); + + if (!OBJ_PTR_FROM_POOL(tx->pop, ptr)) { + ERR("object outside of pool"); + ret = obj_tx_fail_err(EINVAL, flags); + DAV_API_END(); + return ret; + } + + struct tx_range_def args = { + .offset = OBJ_PTR_TO_OFF(tx->pop, ptr), + .size = size, + .flags = flags, + }; + + ret = dav_tx_add_common(tx, &args); + + DAV_API_END(); + return ret; +} + +/* + * dav_tx_xadd_range_direct -- adds persistent memory range into the + * transaction + */ +DAV_FUNC_EXPORT int +dav_tx_xadd_range_direct_v2(const void *ptr, size_t size, uint64_t flags) +{ + + DAV_API_START(); + struct tx *tx = get_tx(); + + ASSERT_IN_TX(tx); + ASSERT_TX_STAGE_WORK(tx); + + int ret; + uint64_t off; + + flags |= tx_abort_on_failure_flag(tx); + + if (flags & ~DAV_XADD_VALID_FLAGS) { + ERR("unknown flags 0x%" PRIx64, flags + & ~DAV_XADD_VALID_FLAGS); + ret = obj_tx_fail_err(EINVAL, flags); + DAV_API_END(); + return ret; + } + + if (!OBJ_PTR_FROM_POOL(tx->pop, ptr)) { + ERR("object outside of pool"); + ret = obj_tx_fail_err(EINVAL, flags); + DAV_API_END(); + return ret; + } + + off = OBJ_PTR_TO_OFF(tx->pop, ptr); + struct tx_range_def args = { + .offset = off, + .size = size, + .flags = flags, + }; + + ret = dav_tx_add_common(tx, &args); + + DAV_API_END(); + return ret; +} + +/* + * dav_tx_add_range -- adds persistent memory range into the transaction + */ +DAV_FUNC_EXPORT int +dav_tx_add_range_v2(uint64_t hoff, size_t size) +{ + DAV_API_START(); + struct tx *tx = get_tx(); + + ASSERT_IN_TX(tx); + ASSERT_TX_STAGE_WORK(tx); + + int ret; + + uint64_t flags = tx_abort_on_failure_flag(tx); + + ASSERT(OBJ_OFF_IS_VALID(tx->pop, hoff)); + + struct tx_range_def args = { + .offset = hoff, + .size = size, + .flags = flags, + }; + + ret = dav_tx_add_common(tx, &args); + + DAV_API_END(); + return ret; +} + +/* + * dav_tx_xadd_range -- adds persistent memory range into the transaction + */ +DAV_FUNC_EXPORT int +dav_tx_xadd_range_v2(uint64_t hoff, size_t size, uint64_t flags) +{ + DAV_API_START(); + struct tx *tx = get_tx(); + + ASSERT_IN_TX(tx); + ASSERT_TX_STAGE_WORK(tx); + + int ret; + + flags |= tx_abort_on_failure_flag(tx); + + if (flags & ~DAV_XADD_VALID_FLAGS) { + ERR("unknown flags 0x%" PRIx64, flags + & ~DAV_XADD_VALID_FLAGS); + ret = obj_tx_fail_err(EINVAL, flags); + DAV_API_END(); + return ret; + } + + ASSERT(OBJ_OFF_IS_VALID(tx->pop, hoff)); + + struct tx_range_def args = { + .offset = hoff, + .size = size, + .flags = flags, + }; + + ret = dav_tx_add_common(tx, &args); + + DAV_API_END(); + return ret; +} + +/* + * dav_tx_alloc -- allocates a new object + */ +DAV_FUNC_EXPORT uint64_t +dav_tx_alloc_v2(size_t size, uint64_t type_num, uint64_t flags) +{ + uint64_t off; + struct tx *tx = get_tx(); + + ASSERT_IN_TX(tx); + ASSERT_TX_STAGE_WORK(tx); + + flags |= tx_abort_on_failure_flag(tx); + + DAV_API_START(); + + if (size == 0) { + ERR("allocation with size 0"); + off = obj_tx_fail_null(EINVAL, flags); + 
DAV_API_END(); + return off; + } + + if (flags & ~DAV_TX_XALLOC_VALID_FLAGS) { + ERR("unknown flags 0x%" PRIx64, flags + & ~(DAV_TX_XALLOC_VALID_FLAGS)); + off = obj_tx_fail_null(EINVAL, flags); + DAV_API_END(); + return off; + } + + off = tx_alloc_common(tx, size, (type_num_t)type_num, + constructor_tx_alloc, ALLOC_ARGS(flags)); + + DAV_API_END(); + return off; +} + +/* + * dav_tx_xfree -- frees an existing object, with no_abort option + */ +static int +dav_tx_xfree(uint64_t off, uint64_t flags) +{ + struct tx *tx = get_tx(); + + ASSERT_IN_TX(tx); + ASSERT_TX_STAGE_WORK(tx); + + flags |= tx_abort_on_failure_flag(tx); + + if (flags & ~DAV_XFREE_VALID_FLAGS) { + ERR("unknown flags 0x%" PRIx64, + flags & ~DAV_XFREE_VALID_FLAGS); + return obj_tx_fail_err(EINVAL, flags); + } + + if (off == 0) + return 0; + + dav_obj_t *pop = tx->pop; + + ASSERT(pop != NULL); + ASSERT(OBJ_OFF_IS_VALID(pop, off)); + + DAV_API_START(); + + struct dav_action *action; + uint64_t roff = palloc_get_realoffset(pop->do_heap, off); + + struct tx_range_def range = {roff, 0, 0}; + struct ravl_node *n = ravl_find(tx->ranges, &range, + RAVL_PREDICATE_LESS_EQUAL); + + /* + * If attempting to free an object allocated within the same + * transaction, simply cancel the alloc and remove it from the actions. + */ + if (n != NULL) { + struct tx_range_def *r = ravl_data(n); + + if ((r->offset + r->size) < roff) + goto out; + + VEC_FOREACH_BY_PTR(action, &tx->actions) { + if (action->type == DAV_ACTION_TYPE_HEAP && + action->heap.offset == off) { + void *ptr = OBJ_OFF_TO_PTR(pop, roff); + uint64_t toff, usize; + + palloc_get_prange(action, &toff, &usize, 1); + D_ASSERT(usize <= r->size); + if ((r->offset == roff) && (r->size == usize)) { + /* Exact match. */ + ravl_remove(tx->ranges, n); + } else if (r->offset == roff) { + /* Retain the right portion. */ + r->offset += usize; + r->size -= usize; + } else { + /* Retain the left portion. */ + uint64_t osize = r->size; + + r->size = roff - r->offset; + + /* Still data after range remove. 
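+ * e.g. a tracked range [0, 100) with a freed allocation at
+ * [40, 60) keeps [0, 40) here and re-inserts [60, 100) below.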
*/ + osize -= (r->size + usize); + if (osize) { + struct tx_range_def *r1 = + &(struct tx_range_def) + {roff + usize, osize, r->flags}; + + tx_ranges_insert_def(pop, tx, r1); + } + } + + VALGRIND_SET_CLEAN(ptr, usize); + VALGRIND_REMOVE_FROM_TX(ptr, usize); + palloc_cancel(pop->do_heap, action, 1); + VEC_ERASE_BY_PTR(&tx->actions, action); + DAV_API_END(); + return 0; + } + } + } + +out: + action = tx_action_add(tx); + if (action == NULL) { + int ret = obj_tx_fail_err(errno, flags); + + DAV_API_END(); + return ret; + } + + palloc_defer_free(pop->do_heap, off, action); + + DAV_API_END(); + return 0; +} + +/* + * dav_tx_free -- frees an existing object + */ +DAV_FUNC_EXPORT int +dav_tx_free_v2(uint64_t off) +{ + return dav_tx_xfree(off, 0); +} + +/* arguments for constructor_alloc */ +struct constr_args { + int zero_init; + dav_constr constructor; + void *arg; +}; + +/* arguments for constructor_alloc_root */ +struct carg_root { + size_t size; + dav_constr constructor; + void *arg; +}; + +/* arguments for constructor_realloc and constructor_zrealloc */ +struct carg_realloc { + void *ptr; + size_t old_size; + size_t new_size; + int zero_init; + type_num_t user_type; + dav_constr constructor; + void *arg; +}; + +/* + * constructor_zrealloc_root -- (internal) constructor for dav_root + */ +static int +constructor_zrealloc_root(void *ctx, void *ptr, size_t usable_size, void *arg) +{ + dav_obj_t *pop = ctx; + + DAV_DBG("pop %p ptr %p arg %p", pop, ptr, arg); + + ASSERTne(ptr, NULL); + ASSERTne(arg, NULL); + + VALGRIND_ADD_TO_TX(ptr, usable_size); + + struct carg_realloc *carg = arg; + + if (usable_size > carg->old_size) { + size_t grow_len = usable_size - carg->old_size; + void *new_data_ptr = (void *)((uintptr_t)ptr + carg->old_size); + + mo_wal_memset(&pop->p_ops, new_data_ptr, 0, grow_len, 0); + } + int ret = 0; + + if (carg->constructor) + ret = carg->constructor(pop, ptr, carg->arg); + + VALGRIND_REMOVE_FROM_TX(ptr, usable_size); + + return ret; +} + +/* + * obj_realloc_root -- (internal) reallocate root object + */ +static int +obj_alloc_root(dav_obj_t *pop, size_t size) +{ + struct operation_context *ctx; + struct carg_realloc carg; + int ret; + + DAV_DBG("pop %p size %zu", pop, size); + + carg.ptr = (*pop->do_root_offsetp == 0) ? 
0 : OBJ_OFF_TO_PTR(pop, *pop->do_root_offsetp); + carg.old_size = *pop->do_root_sizep; + carg.new_size = size; + carg.user_type = 0; + carg.constructor = NULL; + carg.zero_init = 1; + carg.arg = NULL; + + ret = lw_tx_begin(pop); + if (ret) + return ret; + + ctx = pop->external; + operation_start(ctx); + + operation_add_entry(ctx, pop->do_root_sizep, size, ULOG_OPERATION_SET); + + ret = palloc_operation(pop->do_heap, *pop->do_root_offsetp, pop->do_root_offsetp, size, + constructor_zrealloc_root, &carg, 0, 0, 0, 0, + ctx); /* REVISIT: object_flags and type num ignored*/ + + lw_tx_end(pop, NULL); + return ret; +} + +/* + * dav_root_construct -- returns root object + */ +DAV_FUNC_EXPORT uint64_t +dav_root_v2(dav_obj_t *pop, size_t size) +{ + DAV_DBG("pop %p size %zu", pop, size); + + DAV_API_START(); + if (size > DAV_MAX_ALLOC_SIZE) { + ERR("requested size too large"); + errno = ENOMEM; + DAV_API_END(); + return 0; + } + + if (size == 0 && *pop->do_root_offsetp == 0) { + ERR("requested size cannot equals zero"); + errno = EINVAL; + DAV_API_END(); + return 0; + } + + if (size > *pop->do_root_sizep && obj_alloc_root(pop, size)) { + ERR("dav_root failed"); + errno = ENOMEM; + DAV_API_END(); + return 0; + } + + DAV_API_END(); + return *pop->do_root_offsetp; +} + +/* + * constructor_alloc -- (internal) constructor for obj_alloc_construct + */ +static int +constructor_alloc(void *ctx, void *ptr, size_t usable_size, void *arg) +{ + dav_obj_t *pop = ctx; + + struct mo_ops *p_ops = &pop->p_ops; + + DAV_DBG("pop %p ptr %p arg %p", pop, ptr, arg); + + ASSERTne(ptr, NULL); + ASSERTne(arg, NULL); + + struct constr_args *carg = arg; + + if (carg->zero_init) + mo_wal_memset(p_ops, ptr, 0, usable_size, 0); + + int ret = 0; + + if (carg->constructor) + ret = carg->constructor(pop, ptr, carg->arg); + + return ret; +} + +/* + * obj_alloc_construct -- (internal) allocates a new object with constructor + */ +static int +obj_alloc_construct(dav_obj_t *pop, uint64_t *offp, size_t size, + type_num_t type_num, uint64_t flags, + dav_constr constructor, void *arg) +{ + struct operation_context *ctx; + struct constr_args carg; + int ret; + + if (size > DAV_MAX_ALLOC_SIZE) { + ERR("requested size too large"); + errno = ENOMEM; + return -1; + } + + carg.zero_init = flags & DAV_FLAG_ZERO; + carg.constructor = constructor; + carg.arg = arg; + + ret = lw_tx_begin(pop); + if (ret) + return ret; + ctx = pop->external; + operation_start(ctx); + + ret = palloc_operation(pop->do_heap, 0, offp, size, constructor_alloc, &carg, type_num, 0, + CLASS_ID_FROM_FLAG(flags), EZONE_ID_FROM_FLAG(flags), ctx); + + lw_tx_end(pop, NULL); + return ret; +} + +/* + * dav_alloc -- allocates a new object + */ +DAV_FUNC_EXPORT int +dav_alloc_v2(dav_obj_t *pop, uint64_t *offp, size_t size, uint64_t type_num, uint64_t flags, + dav_constr constructor, void *arg) +{ + DAV_DBG(3, "pop %p offp %p size %zu type_num %llx flags %llx constructor %p arg %p", pop, + offp, size, (unsigned long long)type_num, (unsigned long long)flags, constructor, + arg); + + if (size == 0) { + ERR("allocation with size 0"); + errno = EINVAL; + return -1; + } + + if (flags & ~DAV_TX_XALLOC_VALID_FLAGS) { + ERR("unknown flags 0x%" PRIx64, flags & ~DAV_TX_XALLOC_VALID_FLAGS); + errno = EINVAL; + return -1; + } + + DAV_API_START(); + int ret = obj_alloc_construct(pop, offp, size, type_num, flags, constructor, arg); + if (ret) { + errno = ret; + ret = -1; + } + + DAV_API_END(); + return ret; +} + +/* + * dav_free -- frees an existing object + */ +DAV_FUNC_EXPORT void 
+dav_free_v2(dav_obj_t *pop, uint64_t off) +{ + struct operation_context *ctx; + int rc; + + DAV_DBG("oid.off 0x%016" PRIx64, off); + + if (off == 0) + return; + + DAV_API_START(); + + ASSERTne(pop, NULL); + ASSERT(OBJ_OFF_IS_VALID(pop, off)); + rc = lw_tx_begin(pop); + D_ASSERT(rc == 0); + ctx = pop->external; + operation_start(ctx); + + palloc_operation(pop->do_heap, off, NULL, 0, NULL, NULL, + 0, 0, 0, 0, ctx); + + lw_tx_end(pop, NULL); + DAV_API_END(); +} + +/* + * dav_memcpy_persist -- dav version of memcpy + */ +DAV_FUNC_EXPORT void * +dav_memcpy_persist_v2(dav_obj_t *pop, void *dest, const void *src, + size_t len) +{ + int rc; + + DAV_DBG("pop %p dest %p src %p len %zu", pop, dest, src, len); + D_ASSERT((dav_tx_stage_v2() == DAV_TX_STAGE_NONE)); + + DAV_API_START(); + rc = lw_tx_begin(pop); + D_ASSERT(rc == 0); + + void *ptr = mo_wal_memcpy(&pop->p_ops, dest, src, len, 0); + + lw_tx_end(pop, NULL); + DAV_API_END(); + return ptr; +} + +/* + * dav_reserve -- reserves a single object + */ +DAV_FUNC_EXPORT uint64_t +dav_reserve_v2(dav_obj_t *pop, struct dav_action *act, size_t size, uint64_t type_num, + uint64_t flags) +{ + struct constr_args carg; + int tx_inprogress = 0; + int rc; + + DAV_DBG(3, "pop %p act %p size %zu type_num %llx flags %llx", pop, act, size, + (unsigned long long)type_num, (unsigned long long)flags); + + if (flags & ~DAV_ACTION_XRESERVE_VALID_FLAGS) { + ERR("unknown flags 0x%" PRIx64, flags & ~DAV_ACTION_XRESERVE_VALID_FLAGS); + errno = EINVAL; + return 0; + } + + if (get_tx()->stage != DAV_TX_STAGE_NONE) + tx_inprogress = 1; + + DAV_API_START(); + if (!tx_inprogress) { + rc = lw_tx_begin(pop); + if (rc) + return 0; + } + + carg.zero_init = flags & DAV_FLAG_ZERO; + carg.constructor = NULL; + carg.arg = NULL; + + if (palloc_reserve(pop->do_heap, size, constructor_alloc, &carg, type_num, 0, + CLASS_ID_FROM_FLAG(flags), EZONE_ID_FROM_FLAG(flags), act) != 0) { + DAV_API_END(); + return 0; + } + + if (!tx_inprogress) + lw_tx_end(pop, NULL); + DAV_API_END(); + return act->heap.offset; +} + +/* + * dav_defer_free -- creates a deferred free action + */ +DAV_FUNC_EXPORT void +dav_defer_free_v2(dav_obj_t *pop, uint64_t off, struct dav_action *act) +{ + ASSERT(off != 0); + ASSERT(OBJ_OFF_IS_VALID(pop, off)); + palloc_defer_free(pop->do_heap, off, act); +} + +#if 0 +/* + * dav_publish -- publishes a collection of actions + */ +int +dav_publish(dav_obj_t *pop, struct dav_action *actv, size_t actvcnt) +{ + DAV_API_START(); + struct operation_context *ctx = pmalloc_operation_hold(pop); + + size_t entries_size = actvcnt * sizeof(struct ulog_entry_val); + + if (operation_reserve(ctx, entries_size) != 0) { + DAV_API_END(); + return -1; + } + + palloc_publish(&pop->do_heap, actv, actvcnt, ctx); + + pmalloc_operation_release(pop); + + DAV_API_END(); + return 0; +} +#endif + +/* + * dav_cancel -- cancels collection of actions + */ +DAV_FUNC_EXPORT void +dav_cancel_v2(dav_obj_t *pop, struct dav_action *actv, size_t actvcnt) +{ + DAV_DBG("actvcnt=%zu", actvcnt); + DAV_API_START(); + palloc_cancel(pop->do_heap, actv, actvcnt); + DAV_API_END(); +} + +/* + * dav_tx_publish -- publishes actions inside of a transaction, + * with no_abort option + */ +DAV_FUNC_EXPORT int +dav_tx_publish_v2(struct dav_action *actv, size_t actvcnt) +{ + struct tx *tx = get_tx(); + uint64_t flags = 0; + uint64_t off, size; + int ret; + + ASSERT_IN_TX(tx); + ASSERT_TX_STAGE_WORK(tx); + + flags |= tx_abort_on_failure_flag(tx); + + DAV_API_START(); + + if (tx_action_reserve(tx, actvcnt) != 0) { + ret = 
obj_tx_fail_err(ENOMEM, flags); + + DAV_API_END(); + return ret; + } + + for (size_t i = 0; i < actvcnt; ++i) { + VEC_PUSH_BACK(&tx->actions, actv[i]); + if (palloc_action_isalloc(&actv[i])) { + palloc_get_prange(&actv[i], &off, &size, 1); + struct tx_range_def r = {off, size, + DAV_XADD_NO_SNAPSHOT | DAV_XADD_WAL_CPTR}; + + ret = dav_tx_add_common(tx, &r); + D_ASSERT(ret == 0); + } + } + + DAV_API_END(); + return 0; +} + +/* + * dav_allot_zone_evictable -- Returns an evictable memory bucket id that can be used + * for allocations. If there are no evictable zone with sufficient free space then + * zero is returned which maps to non-evictable memory bucket. + */ +DAV_FUNC_EXPORT uint32_t +dav_allot_mb_evictable_v2(dav_obj_t *pop, int flags) +{ + uint32_t mb_id; + int err; + + D_ASSERT(flags == 0); + D_ASSERT((dav_tx_stage_v2() == DAV_TX_STAGE_NONE)); + + err = heap_get_evictable_mb(pop->do_heap, &mb_id); + if (err) { + D_ERROR("failed to get evictable mb, error = %d", err); + return 0; + } + + return mb_id; +} + +/* + * obj_realloc -- (internal) reallocate zinfo object + */ +int +obj_realloc(dav_obj_t *pop, uint64_t *offp, size_t *sizep, size_t size) +{ + struct operation_context *ctx; + struct carg_realloc carg; + int ret; + + DAV_DBG("pop %p size %zu", pop, size); + + carg.ptr = (*offp == 0) ? 0 : OBJ_OFF_TO_PTR(pop, *offp); + carg.old_size = *sizep; + carg.new_size = size; + carg.user_type = 0; + carg.constructor = NULL; + carg.zero_init = 1; + carg.arg = NULL; + + ctx = pop->external; + operation_start(ctx); + + operation_add_entry(ctx, sizep, size, ULOG_OPERATION_SET); + + ret = palloc_operation(pop->do_heap, *offp, offp, size, constructor_zrealloc_root, &carg, 0, + 0, 0, 0, ctx); + + return ret; +} diff --git a/src/common/dav_v2/tx.h b/src/common/dav_v2/tx.h new file mode 100644 index 00000000000..f3906f65465 --- /dev/null +++ b/src/common/dav_v2/tx.h @@ -0,0 +1,25 @@ +/* SPDX-License-Identifier: BSD-3-Clause */ +/* Copyright 2016-2023, Intel Corporation */ + +/* + * tx.h -- internal definitions for transactions + */ + +#ifndef __DAOS_COMMON_INTERNAL_TX_H +#define __DAOS_COMMON_INTERNAL_TX_H 1 + +#include + +#define TX_DEFAULT_RANGE_CACHE_SIZE (1 << 15) + +struct ulog_entry_base; +struct mo_ops; +/* + * tx_create_wal_entry -- convert to WAL a single ulog UNDO entry + */ +int tx_create_wal_entry(struct ulog_entry_base *e, void *arg, const struct mo_ops *p_ops); + +int +obj_realloc(dav_obj_t *pop, uint64_t *offp, size_t *sizep, size_t size); + +#endif diff --git a/src/common/dav_v2/ulog.c b/src/common/dav_v2/ulog.c new file mode 100644 index 00000000000..282ab6ae9fd --- /dev/null +++ b/src/common/dav_v2/ulog.c @@ -0,0 +1,691 @@ +/* SPDX-License-Identifier: BSD-3-Clause */ +/* Copyright 2015-2024, Intel Corporation */ + +/* + * ulog.c -- unified log implementation + */ + +#include +#include + +#include "dav_internal.h" +#include "mo_wal.h" +#include "ulog.h" +#include "obj.h" +#include "out.h" +#include "valgrind_internal.h" + +/* + * Operation flag at the three most significant bits + */ +#define ULOG_OPERATION(op) ((uint64_t)(op)) +#define ULOG_OPERATION_MASK ((uint64_t)(0b111ULL << 61ULL)) +#define ULOG_OPERATION_FROM_OFFSET(off) \ + ((ulog_operation_type) ((off) & ULOG_OPERATION_MASK)) +#define ULOG_OFFSET_MASK (~(ULOG_OPERATION_MASK)) + +#define CACHELINE_ALIGN(size) ALIGN_UP(size, CACHELINE_SIZE) +#define IS_CACHELINE_ALIGNED(ptr)\ + (((uintptr_t)(ptr) & (CACHELINE_SIZE - 1)) == 0) + +/* + * ulog_next -- retrieves the pointer to the next ulog + */ +struct ulog * +ulog_next(struct 
ulog *ulog) +{ + return ulog->next; +} + +/* + * ulog_operation -- returns the type of entry operation + */ +ulog_operation_type +ulog_entry_type(const struct ulog_entry_base *entry) +{ + return ULOG_OPERATION_FROM_OFFSET(entry->offset); +} + +/* + * ulog_offset -- returns offset + */ +uint64_t +ulog_entry_offset(const struct ulog_entry_base *entry) +{ + return entry->offset & ULOG_OFFSET_MASK; +} + +/* + * ulog_entry_size -- returns the size of a ulog entry + */ +size_t +ulog_entry_size(const struct ulog_entry_base *entry) +{ + struct ulog_entry_buf *eb; + + switch (ulog_entry_type(entry)) { +#ifdef WAL_SUPPORTS_AND_OR_OPS + case ULOG_OPERATION_AND: + case ULOG_OPERATION_OR: +#else + case ULOG_OPERATION_CLR_BITS: + case ULOG_OPERATION_SET_BITS: +#endif + case ULOG_OPERATION_SET: + return sizeof(struct ulog_entry_val); + case ULOG_OPERATION_BUF_SET: + case ULOG_OPERATION_BUF_CPY: + eb = (struct ulog_entry_buf *)entry; + return CACHELINE_ALIGN( + sizeof(struct ulog_entry_buf) + eb->size); + default: + ASSERT(0); + } + + return 0; +} + +/* + * ulog_entry_valid -- (internal) checks if a ulog entry is valid + * Returns 1 if the range is valid, otherwise 0 is returned. + */ +static int +ulog_entry_valid(struct ulog *ulog, const struct ulog_entry_base *entry) +{ + if (entry->offset == 0) + return 0; + + size_t size; + struct ulog_entry_buf *b; + + switch (ulog_entry_type(entry)) { + case ULOG_OPERATION_BUF_CPY: + case ULOG_OPERATION_BUF_SET: + size = ulog_entry_size(entry); + b = (struct ulog_entry_buf *)entry; + + uint64_t csum = util_checksum_compute(b, size, + &b->checksum, 0); + csum = util_checksum_seq(&ulog->gen_num, + sizeof(ulog->gen_num), csum); + + if (b->checksum != csum) + return 0; + break; + default: + break; + } + + return 1; +} + +/* + * ulog_construct -- initializes the ulog structure + */ +void +ulog_construct_new(struct ulog *ulog, size_t capacity, uint64_t gen_num, uint64_t flags) +{ + ASSERTne(ulog, NULL); + + ulog->capacity = capacity; + ulog->checksum = 0; + ulog->next = 0; + ulog->gen_num = gen_num; + ulog->flags = flags; + memset(ulog->unused, 0, sizeof(ulog->unused)); + + /* we only need to zero out the header of ulog's first entry */ + size_t zeroed_data = CACHELINE_ALIGN(sizeof(struct ulog_entry_base)); + /* + * We want to avoid replicating zeroes for every ulog of every + * lane, to do that, we need to use plain old memset. 
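A ulog entry stores its operation code in the three most significant bits of the same 64-bit word that carries the target offset, which is what ulog_entry_type() and ulog_entry_offset() unpack above. A minimal standalone sketch of that packing, with the two masks restated locally so it builds on its own:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* local restatement of ULOG_OPERATION_MASK / ULOG_OFFSET_MASK */
#define OP_MASK      ((uint64_t)(0b111ULL << 61ULL))
#define OFF_MASK     (~OP_MASK)
#define OP_BUF_CPY   (0b110ULL << 61ULL)

int main(void)
{
        uint64_t offset = 0x12345678ULL;        /* where the data lives */
        uint64_t word   = offset | OP_BUF_CPY;  /* packed entry->offset */

        assert((word & OFF_MASK) == offset);    /* ulog_entry_offset() */
        assert((word & OP_MASK) == OP_BUF_CPY); /* ulog_entry_type()   */
        printf("off=0x%llx op=0x%llx\n",
               (unsigned long long)(word & OFF_MASK),
               (unsigned long long)(word & OP_MASK));
        return 0;
}

The scheme assumes offsets fit in the low 61 bits of the word.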
+ */ + memset(ulog->data, 0, zeroed_data); +} + +/* + * ulog_foreach_entry -- iterates over every existing entry in the ulog + */ +int +ulog_foreach_entry(struct ulog *ulog, ulog_entry_cb cb, void *arg, const struct mo_ops *ops) +{ + struct ulog_entry_base *e; + int ret = 0; + + for (struct ulog *r = ulog; r != NULL; r = ulog_next(r)) { + for (size_t offset = 0; offset < r->capacity; ) { + e = (struct ulog_entry_base *)(r->data + offset); + if (!ulog_entry_valid(ulog, e)) + return ret; + + ret = cb(e, arg, ops); + if (ret != 0) + return ret; + + offset += ulog_entry_size(e); + } + } + + return ret; +} + +/* + * ulog_capacity -- (internal) returns the total capacity of the ulog + */ +size_t +ulog_capacity(struct ulog *ulog, size_t ulog_base_bytes) +{ + size_t capacity = ulog_base_bytes; + + ulog = ulog_next(ulog); + /* skip the first one, we count it in 'ulog_base_bytes' */ + while (ulog != NULL) { + capacity += ulog->capacity; + ulog = ulog_next(ulog); + } + + return capacity; +} + +/* + * ulog_rebuild_next_vec -- rebuilds the vector of next entries + */ +void +ulog_rebuild_next_vec(struct ulog *ulog, struct ulog_next *next) +{ + do { + if (ulog->next != 0) + VEC_PUSH_BACK(next, ulog->next); + } while ((ulog = ulog_next(ulog)) != NULL); +} + +/* + * ulog_reserve -- reserves new capacity in the ulog + */ +int +ulog_reserve(struct ulog *ulog, + size_t ulog_base_nbytes, size_t gen_num, + int auto_reserve, size_t *new_capacity, + ulog_extend_fn extend, struct ulog_next *next) +{ + if (!auto_reserve) { + D_CRIT("cannot auto reserve next ulog\n"); + return -1; + } + + size_t capacity = ulog_base_nbytes; + + VEC_FOREACH(ulog, next) { + ASSERTne(ulog, NULL); + capacity += ulog->capacity; + } + + while (capacity < *new_capacity) { + if (extend(&ulog->next, gen_num) != 0) + return -1; + VEC_PUSH_BACK(next, ulog->next); + ulog = ulog_next(ulog); + ASSERTne(ulog, NULL); + + capacity += ulog->capacity; + } + *new_capacity = capacity; + + return 0; +} + +/* + * ulog_checksum -- (internal) calculates ulog checksum + */ +static int +ulog_checksum(struct ulog *ulog, size_t ulog_base_bytes, int insert) +{ + return util_checksum(ulog, SIZEOF_ULOG(ulog_base_bytes), + &ulog->checksum, insert, 0); +} + +/* + * ulog_entry_val_create -- creates a new log value entry in the ulog + * + * This function requires at least a cacheline of space to be available in the + * ulog. + */ +struct ulog_entry_val * +ulog_entry_val_create(struct ulog *ulog, size_t offset, uint64_t *dest, + uint64_t value, ulog_operation_type type, const struct mo_ops *p_ops) +{ + struct ulog_entry_val *e = + (struct ulog_entry_val *)(ulog->data + offset); + + struct { + struct ulog_entry_val v; + struct ulog_entry_base zeroes; + } data; + COMPILE_ERROR_ON(sizeof(data) != sizeof(data.v) + sizeof(data.zeroes)); + + /* + * Write a little bit more to the buffer so that the next entry that + * resides in the log is erased. This will prevent leftovers from + * a previous, clobbered, log from being incorrectly applied. + */ + data.zeroes.offset = 0; + data.v.base.offset = + p_ops->base ? 
umem_cache_ptr2off(p_ops->umem_store, dest) : (uint64_t)dest; + data.v.base.offset |= ULOG_OPERATION(type); + data.v.value = value; + + memcpy(e, &data, sizeof(data)); + + return e; +} + +/* + * ulog_clobber_entry -- zeroes out a single log entry header + */ +void +ulog_clobber_entry(const struct ulog_entry_base *e) +{ + static const size_t aligned_entry_size = + CACHELINE_ALIGN(sizeof(struct ulog_entry_base)); + + memset((char *)e, 0, aligned_entry_size); +} + +/* + * ulog_entry_buf_create -- atomically creates a buffer entry in the log + */ +struct ulog_entry_buf * +ulog_entry_buf_create(struct ulog *ulog, size_t offset, uint64_t gen_num, + uint64_t *dest, const void *src, uint64_t size, + ulog_operation_type type, const struct mo_ops *p_ops) +{ + struct ulog_entry_buf *e = + (struct ulog_entry_buf *)(ulog->data + offset); + + /* + * Depending on the size of the source buffer, we might need to perform + * up to three separate copies: + * 1. The first cacheline, 24b of metadata and 40b of data + * If there's still data to be logged: + * 2. The entire remainder of data data aligned down to cacheline, + * for example, if there's 150b left, this step will copy only + * 128b. + * Now, we are left with between 0 to 63 bytes. If nonzero: + * 3. Create a stack allocated cacheline-sized buffer, fill in the + * remainder of the data, and copy the entire cacheline. + * + * This is done so that we avoid a cache-miss on misaligned writes. + */ + + struct ulog_entry_buf *b = alloca(CACHELINE_SIZE); + + ASSERT(p_ops->base != NULL); + b->base.offset = umem_cache_ptr2off(p_ops->umem_store, dest); + b->base.offset |= ULOG_OPERATION(type); + b->size = size; + b->checksum = 0; + + size_t bdatasize = CACHELINE_SIZE - sizeof(struct ulog_entry_buf); + size_t ncopy = MIN(size, bdatasize); + + memcpy(b->data, src, ncopy); + memset(b->data + ncopy, 0, bdatasize - ncopy); + + size_t remaining_size = ncopy > size ? 0 : size - ncopy; + + char *srcof = (char *)src + ncopy; + size_t rcopy = ALIGN_DOWN(remaining_size, CACHELINE_SIZE); + size_t lcopy = remaining_size - rcopy; + + uint8_t last_cacheline[CACHELINE_SIZE]; + + if (lcopy != 0) { + memcpy(last_cacheline, srcof + rcopy, lcopy); + memset(last_cacheline + lcopy, 0, CACHELINE_SIZE - lcopy); + } + + if (rcopy != 0) { + void *rdest = e->data + ncopy; + + ASSERT(IS_CACHELINE_ALIGNED(rdest)); + memcpy(rdest, srcof, rcopy); + } + + if (lcopy != 0) { + void *ldest = e->data + ncopy + rcopy; + + ASSERT(IS_CACHELINE_ALIGNED(ldest)); + + memcpy(ldest, last_cacheline, CACHELINE_SIZE); + } + + b->checksum = util_checksum_seq(b, CACHELINE_SIZE, 0); + if (rcopy != 0) + b->checksum = util_checksum_seq(srcof, rcopy, b->checksum); + if (lcopy != 0) + b->checksum = util_checksum_seq(last_cacheline, + CACHELINE_SIZE, b->checksum); + + b->checksum = util_checksum_seq(&gen_num, sizeof(gen_num), + b->checksum); + + ASSERT(IS_CACHELINE_ALIGNED(e)); + + memcpy(e, b, CACHELINE_SIZE); + + /* + * Allow having uninitialized data in the buffer - this requires marking + * data as defined so that comparing checksums is not reported as an + * error by memcheck. 
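ulog_entry_buf_create() above splits the payload copy into up to three cacheline-friendly pieces, as its comment describes. A standalone sketch of that split, assuming a 64-byte cacheline and a 24-byte entry header, with plain memcpy standing in for the WAL-aware copy and checksum steps:

#include <assert.h>
#include <stdint.h>
#include <string.h>

#define CL  64u  /* assumed cacheline size */
#define HDR 24u  /* assumed header bytes in the first cacheline */

/* Copy 'size' bytes from src into dst with the same three-step split:
 * the tail of the first cacheline, then the cacheline-aligned middle,
 * then one padded stack-built line for the remainder. */
static void
split_copy(uint8_t *dst, const uint8_t *src, size_t size)
{
        size_t first = CL - HDR;                  /* room next to the header */
        size_t ncopy = size < first ? size : first;

        memcpy(dst, src, ncopy);

        size_t remaining = size - ncopy;
        size_t bulk = remaining & ~((size_t)CL - 1);   /* ALIGN_DOWN */
        size_t tail = remaining - bulk;

        if (bulk)
                memcpy(dst + ncopy, src + ncopy, bulk);

        if (tail) {
                uint8_t last[CL];

                memcpy(last, src + ncopy + bulk, tail);
                memset(last + tail, 0, CL - tail);     /* pad the line */
                memcpy(dst + ncopy + bulk, last, CL);  /* one full line */
        }
}

int main(void)
{
        uint8_t src[150], dst[256] = {0};

        for (size_t i = 0; i < sizeof(src); i++)
                src[i] = (uint8_t)i;

        split_copy(dst, src, sizeof(src));
        assert(memcmp(dst, src, sizeof(src)) == 0);
        return 0;
}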
+ */ + VALGRIND_DO_MAKE_MEM_DEFINED(e->data, ncopy + rcopy + lcopy); + VALGRIND_DO_MAKE_MEM_DEFINED(&e->checksum, sizeof(e->checksum)); + + ASSERT(ulog_entry_valid(ulog, &e->base)); + + return e; +} + +/* + * ulog_entry_apply -- applies modifications of a single ulog entry + */ +void +ulog_entry_apply(const struct ulog_entry_base *e, int persist, + const struct mo_ops *p_ops) +{ + ulog_operation_type t = ulog_entry_type(e); + uint64_t offset = ulog_entry_offset(e); + size_t dst_size = sizeof(uint64_t); + struct ulog_entry_val *ev; + struct ulog_entry_buf *eb; + uint16_t nbits; + uint32_t pos; + uint64_t bmask; + uint64_t *dst; + + dst = p_ops->base ? umem_cache_off2ptr(p_ops->umem_store, offset) : (uint64_t *)offset; + + SUPPRESS_UNUSED(persist); + + switch (t) { +#ifdef WAL_SUPPORTS_AND_OR_OPS + case ULOG_OPERATION_AND: + ev = (struct ulog_entry_val *)e; + + VALGRIND_ADD_TO_TX(dst, dst_size); + *dst &= ev->value; + break; + case ULOG_OPERATION_OR: + ev = (struct ulog_entry_val *)e; + + VALGRIND_ADD_TO_TX(dst, dst_size); + *dst |= ev->value; + break; +#else + case ULOG_OPERATION_CLR_BITS: + ev = (struct ulog_entry_val *)e; + pos = ULOG_ENTRY_VAL_TO_POS(ev->value); + nbits = ULOG_ENTRY_VAL_TO_BITS(ev->value); + if (nbits == RUN_BITS_PER_VALUE) + bmask = UINT64_MAX; + else + bmask = ((1ULL << nbits) - 1ULL) << pos; + + VALGRIND_ADD_TO_TX(dst, dst_size); + *dst &= ~bmask; + break; + case ULOG_OPERATION_SET_BITS: + ev = (struct ulog_entry_val *)e; + pos = ULOG_ENTRY_VAL_TO_POS(ev->value); + nbits = ULOG_ENTRY_VAL_TO_BITS(ev->value); + if (nbits == RUN_BITS_PER_VALUE) + bmask = UINT64_MAX; + else + bmask = ((1ULL << nbits) - 1ULL) << pos; + + VALGRIND_ADD_TO_TX(dst, dst_size); + *dst |= bmask; + break; +#endif + case ULOG_OPERATION_SET: + ev = (struct ulog_entry_val *)e; + + VALGRIND_ADD_TO_TX(dst, dst_size); + *dst = ev->value; + break; + case ULOG_OPERATION_BUF_CPY: + eb = (struct ulog_entry_buf *)e; + + dst_size = eb->size; + VALGRIND_ADD_TO_TX(dst, dst_size); + mo_wal_memcpy(p_ops, dst, eb->data, eb->size, 0); + break; + case ULOG_OPERATION_BUF_SET: + default: + ASSERT(0); + } + VALGRIND_REMOVE_FROM_TX(dst, dst_size); +} + +/* + * ulog_process_entry -- (internal) processes a single ulog entry + */ +static int +ulog_process_entry(struct ulog_entry_base *e, void *arg, + const struct mo_ops *p_ops) +{ + /* suppress unused-parameter errors */ + SUPPRESS_UNUSED(arg); + + ulog_entry_apply(e, 0, p_ops); + + return 0; +} +/* + * ulog_inc_gen_num -- (internal) increments gen num in the ulog + */ +static void +ulog_inc_gen_num(struct ulog *ulog) +{ + ulog->gen_num++; +} + +/* + * ulog_free_next -- free all ulogs starting from the indicated one. + * Function returns 1 if any ulog have been freed or unpinned, 0 otherwise. 
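For the SET_BITS/CLR_BITS cases above, the entry value packs the starting bit position and the bit count into one 64-bit word, and ulog_entry_apply() turns that into a mask. A small self-contained restatement of the encoding and of the mask construction (the full-width RUN_BITS_PER_VALUE special case is left out here):

#include <assert.h>
#include <stdint.h>

/* bit count in the low 16 bits, start position above them */
#define OPS_POS 16
#define TO_VAL(pos, nbits) (((uint64_t)(nbits)) | ((uint64_t)(pos) << OPS_POS))
#define VAL_TO_BITS(v)     ((v) & ((1ULL << OPS_POS) - 1))
#define VAL_TO_POS(v)      ((v) >> OPS_POS)

int main(void)
{
        uint64_t word  = 0;
        uint64_t value = TO_VAL(4, 3);      /* touch 3 bits starting at bit 4 */
        uint32_t pos   = (uint32_t)VAL_TO_POS(value);
        uint16_t nbits = (uint16_t)VAL_TO_BITS(value);

        /* same mask construction as ulog_entry_apply(); nbits < 64 here,
         * so the shift is well defined */
        uint64_t bmask = ((1ULL << nbits) - 1ULL) << pos;

        word |= bmask;                      /* ULOG_OPERATION_SET_BITS */
        assert(word == 0x70);               /* bits 4, 5 and 6 */

        word &= ~bmask;                     /* ULOG_OPERATION_CLR_BITS */
        assert(word == 0);
        return 0;
}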
+ */ +int +ulog_free_next(struct ulog *u, ulog_free_fn ulog_free) +{ + int ret = 0; + + if (u == NULL) + return ret; + + VEC(, struct ulog **) ulogs_internal_except_first; + VEC_INIT(&ulogs_internal_except_first); + + while (u->next != 0) { + if (VEC_PUSH_BACK(&ulogs_internal_except_first, + &u->next) != 0) { + /* this is fine, it will just use more memory */ + DAV_DBG("unable to free transaction logs memory"); + goto out; + } + u = u->next; + } + + /* free non-user defined logs */ + struct ulog **ulog_ptr; + + VEC_FOREACH_REVERSE(ulog_ptr, &ulogs_internal_except_first) { + ulog_free(*ulog_ptr); + *ulog_ptr = NULL; + ret = 1; + } + +out: + VEC_DELETE(&ulogs_internal_except_first); + return ret; +} + +/* + * ulog_clobber -- zeroes the metadata of the ulog + */ +void +ulog_clobber(struct ulog *dest, struct ulog_next *next) +{ + struct ulog empty; + + memset(&empty, 0, sizeof(empty)); + + if (next != NULL) + empty.next = VEC_SIZE(next) == 0 ? 0 : VEC_FRONT(next); + else + empty.next = dest->next; + + memcpy(dest, &empty, sizeof(empty)); +} + +/* + * ulog_clobber_data -- zeroes out 'nbytes' of data in the logs + */ +int +ulog_clobber_data(struct ulog *ulog_first, + struct ulog_next *next, ulog_free_fn ulog_free, + unsigned flags) +{ + ASSERTne(ulog_first, NULL); + + /* In case of abort we need to increment counter in the first ulog. */ + if (flags & ULOG_INC_FIRST_GEN_NUM) + ulog_inc_gen_num(ulog_first); + + /* + * In the case of abort or commit, we are not going to free all ulogs, + * but rather increment the generation number to be consistent in the + * first two ulogs. + */ + struct ulog *ulog_second = VEC_SIZE(next) == 0 ? 0 : *VEC_GET(next, 0); + + if (ulog_second && !(flags & ULOG_FREE_AFTER_FIRST)) + /* + * We want to keep gen_nums consistent between ulogs. + * If the transaction will commit successfully we'll reuse the + * second buffer (third and next ones will be freed anyway). + * If the application will crash we'll free 2nd ulog on + * recovery, which means we'll never read gen_num of the + * second ulog in case of an ungraceful shutdown. + */ + ulog_inc_gen_num(ulog_second); + + struct ulog *u; + + /* + * To make sure that transaction logs do not occupy too + * much of space, all of them, expect for the first one, + * are freed at the end of the operation. The reasoning for + * this is that pmalloc() is a relatively cheap operation for + * transactions where many hundreds of kilobytes are being + * snapshot, and so, allocating and freeing the buffer for + * each transaction is an acceptable overhead for the average + * case. 
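ulog_free_next() walks the chain once to collect the follow-on logs and then releases them in reverse, so the first, preallocated log always survives. A much-simplified sketch of that idea, using a plain singly linked list, a fixed array in place of VEC(), and free() in place of the ulog_free callback:

#include <assert.h>
#include <stdlib.h>

struct log {
        struct log *next;   /* only the link matters for this sketch */
};

/* free every log after the first one; returns 1 if anything was freed */
static int
free_after_first(struct log *first)
{
        struct log *chain[16];
        size_t n = 0;
        int freed = 0;

        for (struct log *u = first; u && u->next && n < 16; u = u->next)
                chain[n++] = u->next;

        while (n > 0) {                 /* release in reverse order */
                free(chain[--n]);
                freed = 1;
        }
        first->next = NULL;
        return freed;
}

int main(void)
{
        struct log *a = calloc(1, sizeof(*a));  /* preallocated first log */
        struct log *b = calloc(1, sizeof(*b));
        struct log *c = calloc(1, sizeof(*c));

        a->next = b;
        b->next = c;

        assert(free_after_first(a) == 1);       /* b and c released */
        assert(a->next == NULL);
        free(a);
        return 0;
}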
+ */ + if (flags & ULOG_FREE_AFTER_FIRST) + u = ulog_first; + else + u = ulog_second; + + if (u == NULL) + return 0; + + return ulog_free_next(u, ulog_free); +} + +/* + * ulog_process -- process ulog entries + */ +void +ulog_process(struct ulog *ulog, ulog_check_offset_fn check, + const struct mo_ops *p_ops) +{ + /* suppress unused-parameter errors */ + SUPPRESS_UNUSED(check); + +#ifdef DAV_EXTRA_DEBUG + if (check) + ulog_check(ulog, check, p_ops); +#endif + + ulog_foreach_entry(ulog, ulog_process_entry, NULL, p_ops); + mo_wal_drain(p_ops); +} + +/* + * ulog_base_nbytes -- (internal) counts the actual of number of bytes + * occupied by the ulog + */ +size_t +ulog_base_nbytes(struct ulog *ulog) +{ + size_t offset = 0; + struct ulog_entry_base *e; + + for (offset = 0; offset < ulog->capacity; ) { + e = (struct ulog_entry_base *)(ulog->data + offset); + if (!ulog_entry_valid(ulog, e)) + break; + + offset += ulog_entry_size(e); + } + + return offset; +} + +/* + * ulog_recovery_needed -- checks if the logs needs recovery + */ +int +ulog_recovery_needed(struct ulog *ulog, int verify_checksum) +{ + size_t nbytes = MIN(ulog_base_nbytes(ulog), ulog->capacity); + + if (nbytes == 0) + return 0; + + if (verify_checksum && !ulog_checksum(ulog, nbytes, 0)) + return 0; + + return 1; +} + +/* + * ulog_check_entry -- + * (internal) checks consistency of a single ulog entry + */ +static int +ulog_check_entry(struct ulog_entry_base *e, void *arg, const struct mo_ops *p_ops) +{ + uint64_t offset = ulog_entry_offset(e); + ulog_check_offset_fn check = arg; + + if (!check(p_ops->base, offset)) { + DAV_DBG("ulog %p invalid offset %" PRIu64, + e, e->offset); + return -1; + } + + return offset == 0 ? -1 : 0; +} + +/* + * ulog_check -- (internal) check consistency of ulog entries + */ +int +ulog_check(struct ulog *ulog, ulog_check_offset_fn check, const struct mo_ops *p_ops) +{ + DAV_DBG("ulog %p", ulog); + + return ulog_foreach_entry(ulog, + ulog_check_entry, check, p_ops); +} diff --git a/src/common/dav_v2/ulog.h b/src/common/dav_v2/ulog.h new file mode 100644 index 00000000000..6be0cd9b3ed --- /dev/null +++ b/src/common/dav_v2/ulog.h @@ -0,0 +1,167 @@ +/* SPDX-License-Identifier: BSD-3-Clause */ +/* Copyright 2015-2023, Intel Corporation */ + +/* + * ulog.h -- unified log public interface + */ + +#ifndef __DAOS_COMMON_ULOG_H +#define __DAOS_COMMON_ULOG_H 1 + +#include +#include + +#include "util.h" +#include "vec.h" +#include "mo_wal.h" + +struct ulog_entry_base { + uint64_t offset; /* offset with operation type flag */ +}; + +/* + * ulog_entry_val -- log entry + */ +struct ulog_entry_val { + struct ulog_entry_base base; + uint64_t value; /* value to be applied */ +}; + +/* + * ulog_entry_buf - ulog buffer entry + */ +struct ulog_entry_buf { + struct ulog_entry_base base; /* offset with operation type flag */ + uint64_t checksum; /* checksum of the entire log entry */ + uint64_t size; /* size of the buffer to be modified */ + uint8_t data[]; /* content to fill in */ +}; + +#define ULOG_UNUSED ((CACHELINE_SIZE - 40) / 8) +/* + * This structure *must* be located at a cacheline boundary. To achieve this, + * the next field is always allocated with extra padding, and then the offset + * is additionally aligned. 
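The ULOG_UNUSED padding above is sized so that the fixed part of the ulog ends exactly on a cacheline before the data[] payload starts. A standalone compile-time check of that layout, with the header restated locally and a plain pointer standing in for the next link (the 64-byte total holds with either 4- or 8-byte pointers because of alignment padding):

#include <stdint.h>

#define CL         64
#define HDR_FIELDS 40                       /* 5 x 8-byte fields */
#define UNUSED_QW  ((CL - HDR_FIELDS) / 8)  /* == ULOG_UNUSED */

struct ulog_hdr {
        uint64_t checksum;
        void    *next;
        uint64_t capacity;
        uint64_t gen_num;
        uint64_t flags;
        uint64_t unused[UNUSED_QW];
};

_Static_assert(sizeof(struct ulog_hdr) == CL,
               "ulog header must occupy exactly one cacheline");

int main(void) { return 0; }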
+ */ +#define ULOG(capacity_bytes) {\ + /* 64 bytes of metadata */\ + uint64_t checksum; /* checksum of ulog header and its entries */\ + struct ulog *next; /* offset of ulog extension */\ + uint64_t capacity; /* capacity of this ulog in bytes */\ + uint64_t gen_num; /* generation counter */\ + uint64_t flags; /* ulog flags */\ + uint64_t unused[ULOG_UNUSED]; /* must be 0 */\ + uint8_t data[capacity_bytes]; /* N bytes of data */\ +} + +#define SIZEOF_ULOG(base_capacity)\ +(sizeof(struct ulog) + base_capacity) + +/* + * Ulog buffer allocated by the user must be marked by this flag. + * It is important to not free it at the end: + * what user has allocated - user should free himself. + */ +#define ULOG_USER_OWNED (1U << 0) + +/* use this for allocations of aligned ulog extensions */ +#define SIZEOF_ALIGNED_ULOG(base_capacity)\ +ALIGN_UP(SIZEOF_ULOG(base_capacity + (2 * CACHELINE_SIZE)), CACHELINE_SIZE) + +struct ulog ULOG(0); + +VEC(ulog_next, struct ulog *); + +typedef uint64_t ulog_operation_type; + +#define ULOG_OPERATION_SET (0b000ULL << 61ULL) +#ifdef WAL_SUPPORTS_AND_OR_OPS +#define ULOG_OPERATION_AND (0b001ULL << 61ULL) +#define ULOG_OPERATION_OR (0b010ULL << 61ULL) +#else +#define ULOG_OPERATION_CLR_BITS (0b001ULL << 61ULL) +#define ULOG_OPERATION_SET_BITS (0b010ULL << 61ULL) +#endif +#define ULOG_OPERATION_BUF_SET (0b101ULL << 61ULL) +#define ULOG_OPERATION_BUF_CPY (0b110ULL << 61ULL) + +#ifndef WAL_SUPPORTS_AND_OR_OPS +#endif + +#ifdef WAL_SUPPORTS_AND_OR_OPS +#define ULOG_ENTRY_IS_BIT_OP(opc) ((opc == ULOG_OPERATION_AND) || \ + (opc == ULOG_OPERATION_OR)) +#else +#define ULOG_ENTRY_IS_BIT_OP(opc) ((opc == ULOG_OPERATION_CLR_BITS) || \ + (opc == ULOG_OPERATION_SET_BITS)) +#define ULOG_ENTRY_OPS_POS 16 /* bits' pos at value:16 */ +#define ULOG_ENTRY_OPS_BITS_MASK ((1ULL << ULOG_ENTRY_OPS_POS) - 1) +#define ULOG_ENTRY_VAL_TO_BITS(val) ((val) & ULOG_ENTRY_OPS_BITS_MASK) +#define ULOG_ENTRY_VAL_TO_POS(val) ((val) >> ULOG_ENTRY_OPS_POS) +#define ULOG_ENTRY_OPS_POS_MASK (RUN_BITS_PER_VALUE - 1ULL) +#define ULOG_ENTRY_TO_VAL(pos, nbits) (((uint64_t)(nbits) & ULOG_ENTRY_OPS_BITS_MASK) | \ + ((pos) & ULOG_ENTRY_OPS_POS_MASK) << ULOG_ENTRY_OPS_POS) +#endif + +/* immediately frees all associated ulog structures */ +#define ULOG_FREE_AFTER_FIRST (1U << 0) +/* increments gen_num of the first, preallocated, ulog */ +#define ULOG_INC_FIRST_GEN_NUM (1U << 1) + +typedef int (*ulog_check_offset_fn)(void *ctx, uint64_t offset); +typedef int (*ulog_extend_fn)(struct ulog **, uint64_t); +typedef int (*ulog_entry_cb)(struct ulog_entry_base *e, void *arg, + const struct mo_ops *p_ops); +typedef void (*ulog_free_fn)(struct ulog *ptr); + +struct ulog *ulog_next(struct ulog *ulog); + +void ulog_construct(uint64_t offset, size_t capacity, uint64_t gen_num, + int flush, uint64_t flags, const struct mo_ops *p_ops); +void ulog_construct_new(struct ulog *ulog, size_t capacity, uint64_t gen_num, + uint64_t flags); + +size_t ulog_capacity(struct ulog *ulog, size_t ulog_base_bytes); +void ulog_rebuild_next_vec(struct ulog *ulog, struct ulog_next *next); + +int ulog_foreach_entry(struct ulog *ulog, + ulog_entry_cb cb, void *arg, const struct mo_ops *ops); + +int ulog_reserve(struct ulog *ulog, + size_t ulog_base_nbytes, size_t gen_num, + int auto_reserve, size_t *new_capacity_bytes, + ulog_extend_fn extend, struct ulog_next *next); + +int ulog_free_next(struct ulog *u, ulog_free_fn ulog_free); +void ulog_clobber(struct ulog *dest, struct ulog_next *next); +int ulog_clobber_data(struct ulog *dest, + struct ulog_next 
*next, ulog_free_fn ulog_free, unsigned flags); +void ulog_clobber_entry(const struct ulog_entry_base *e); + +void ulog_process(struct ulog *ulog, ulog_check_offset_fn check, + const struct mo_ops *p_ops); + +size_t ulog_base_nbytes(struct ulog *ulog); +int ulog_recovery_needed(struct ulog *ulog, int verify_checksum); + +uint64_t ulog_entry_offset(const struct ulog_entry_base *entry); +ulog_operation_type ulog_entry_type(const struct ulog_entry_base *entry); + +struct ulog_entry_val * +ulog_entry_val_create(struct ulog *ulog, size_t offset, uint64_t *dest, uint64_t value, + ulog_operation_type type, const struct mo_ops *p_ops); + +struct ulog_entry_buf * +ulog_entry_buf_create(struct ulog *ulog, size_t offset, uint64_t gen_num, + uint64_t *dest, const void *src, uint64_t size, + ulog_operation_type type, const struct mo_ops *p_ops); + +void ulog_entry_apply(const struct ulog_entry_base *e, int persist, + const struct mo_ops *p_ops); + +size_t ulog_entry_size(const struct ulog_entry_base *entry); + +int ulog_check(struct ulog *ulog, ulog_check_offset_fn check, + const struct mo_ops *p_ops); + +#endif /* __DAOS_COMMON_ULOG_H */ diff --git a/src/common/dav_v2/util.c b/src/common/dav_v2/util.c new file mode 100644 index 00000000000..f3f6850997a --- /dev/null +++ b/src/common/dav_v2/util.c @@ -0,0 +1,223 @@ +/* SPDX-License-Identifier: BSD-3-Clause */ +/* Copyright 2014-2023, Intel Corporation */ + +/* + * util.c -- very basic utilities + */ + +#include +#include +#include + +#include "util.h" +#include "valgrind_internal.h" + + +#if ANY_VG_TOOL_ENABLED +/* Initialized to true if the process is running inside Valgrind. */ +unsigned _On_valgrind; +#endif + +#if VG_HELGRIND_ENABLED +/* Initialized to true if the process is running inside Valgrind helgrind. */ +unsigned _On_helgrind; +#endif + +#if VG_DRD_ENABLED +/* Initialized to true if the process is running inside Valgrind drd. */ +unsigned _On_drd; +#endif + +#if VG_HELGRIND_ENABLED || VG_DRD_ENABLED +/* Initialized to true if the process is running inside Valgrind drd or hg. */ +unsigned _On_drd_or_hg; +#endif + +#if VG_MEMCHECK_ENABLED +/* Initialized to true if the process is running inside Valgrind memcheck. */ +unsigned _On_memcheck; +#endif + +#if VG_TXINFO_ENABLED +/* true if DAV API and TX-related messages has to be enabled in Valgrind log. */ +int _Vg_txinfo_emit; +#endif /* VG_TXINFO_ENABLED */ + +/* + * util_is_zeroed -- check if given memory range is all zero + */ +int +util_is_zeroed(const void *addr, size_t len) +{ + const char *a = addr; + + if (len == 0) + return 1; + + if (a[0] == 0 && memcmp(a, a + 1, len - 1) == 0) + return 1; + + return 0; +} + +/* + * util_checksum_compute -- compute Fletcher64-like checksum + * + * csump points to where the checksum lives, so that location + * is treated as zeros while calculating the checksum. The + * checksummed data is assumed to be in little endian order. 
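A minimal standalone version of the sequential form of this Fletcher64-like checksum folds the buffer 32 bits at a time into a running (lo32, hi32) pair; it also demonstrates the chaining property that ulog_entry_buf_create() relies on when it feeds the header, the bulk data and the padded tail through util_checksum_seq() in separate calls:

#include <assert.h>
#include <endian.h>
#include <stdint.h>
#include <string.h>

/* length must be a multiple of 4, as in util_checksum_seq() */
static uint64_t
fletcher64_seq(const void *addr, size_t len, uint64_t csum)
{
        const uint32_t *p = addr;
        const uint32_t *end = (const uint32_t *)((const char *)addr + len);
        uint32_t lo = (uint32_t)csum;
        uint32_t hi = (uint32_t)(csum >> 32);

        assert(len % 4 == 0);
        while (p < end) {
                lo += le32toh(*p++);
                hi += lo;
        }
        return (uint64_t)hi << 32 | lo;
}

int main(void)
{
        uint32_t buf[4];

        memset(buf, 0xab, sizeof(buf));

        /* one pass and two chained passes must produce the same value */
        uint64_t whole = fletcher64_seq(buf, sizeof(buf), 0);
        uint64_t split = fletcher64_seq((const char *)buf + 8, 8,
                                        fletcher64_seq(buf, 8, 0));

        assert(whole == split);
        return 0;
}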
+ */ +uint64_t +util_checksum_compute(void *addr, size_t len, uint64_t *csump, size_t skip_off) +{ + if (len % 4 != 0) + abort(); + + uint32_t *p32 = addr; + uint32_t *p32end = (uint32_t *)((char *)addr + len); + uint32_t *skip; + uint32_t lo32 = 0; + uint32_t hi32 = 0; + + if (skip_off) + skip = (uint32_t *)((char *)addr + skip_off); + else + skip = (uint32_t *)((char *)addr + len); + + while (p32 < p32end) + if (p32 == (uint32_t *)csump || p32 >= skip) { + /* lo32 += 0; treat first 32-bits as zero */ + p32++; + hi32 += lo32; + /* lo32 += 0; treat second 32-bits as zero */ + p32++; + hi32 += lo32; + } else { + lo32 += le32toh(*p32); + ++p32; + hi32 += lo32; + } + + return (uint64_t)hi32 << 32 | lo32; +} + +/* + * util_checksum -- compute Fletcher64-like checksum + * + * csump points to where the checksum lives, so that location + * is treated as zeros while calculating the checksum. + * If insert is true, the calculated checksum is inserted into + * the range at *csump. Otherwise the calculated checksum is + * checked against *csump and the result returned (true means + * the range checksummed correctly). + */ +int +util_checksum(void *addr, size_t len, uint64_t *csump, + int insert, size_t skip_off) +{ + uint64_t csum = util_checksum_compute(addr, len, csump, skip_off); + + if (insert) { + *csump = htole64(csum); + return 1; + } + + return *csump == htole64(csum); +} + +/* + * util_checksum_seq -- compute sequential Fletcher64-like checksum + * + * Merges checksum from the old buffer with checksum for current buffer. + */ +uint64_t +util_checksum_seq(const void *addr, size_t len, uint64_t csum) +{ + if (len % 4 != 0) + abort(); + const uint32_t *p32 = addr; + const uint32_t *p32end = (const uint32_t *)((const char *)addr + len); + uint32_t lo32 = (uint32_t)csum; + uint32_t hi32 = (uint32_t)(csum >> 32); + + while (p32 < p32end) { + lo32 += le32toh(*p32); + ++p32; + hi32 += lo32; + } + return (uint64_t)hi32 << 32 | lo32; +} + +/* + * util_init -- initialize the utils + * + * This is called from the library initialization code. + */ +#if ANY_VG_TOOL_ENABLED +__attribute__((constructor)) +static void +_util_init(void) +{ + util_init(); +} +#endif + +void +util_init(void) +{ +#if ANY_VG_TOOL_ENABLED + _On_valgrind = RUNNING_ON_VALGRIND; +#endif + +#if VG_MEMCHECK_ENABLED + if (_On_valgrind) { + unsigned tmp; + unsigned result; + unsigned res = VALGRIND_GET_VBITS(&tmp, &result, sizeof(tmp)); + + _On_memcheck = res ? 1 : 0; + } else { + _On_memcheck = 0; + } +#endif + +#if VG_DRD_ENABLED + if (_On_valgrind) + _On_drd = DRD_GET_DRD_THREADID ? 1 : 0; + else + _On_drd = 0; +#endif + +#if VG_HELGRIND_ENABLED + if (_On_valgrind) { + unsigned tmp; + unsigned result; + /* + * As of now (pmem-3.15) VALGRIND_HG_GET_ABITS is broken on + * the upstream version of Helgrind headers. It generates + * a sign-conversion error and actually returns UINT32_MAX-1 + * when not running under Helgrind. + */ + long res = VALGRIND_HG_GET_ABITS(&tmp, &result, sizeof(tmp)); + + _On_helgrind = res != -2 ? 
1 : 0; + } else { + _On_helgrind = 0; + } +#endif + +#if VG_DRD_ENABLED || VG_HELGRIND_ENABLED + _On_drd_or_hg = (unsigned)(On_helgrind + On_drd); +#endif + +#if VG_TXINFO_ENABLED + if (_On_valgrind) { + char *txinfo_env = secure_getenv("D_DAV_VG_TXINFO"); + + if (txinfo_env) + _Vg_txinfo_emit = atoi(txinfo_env); + } else { + _Vg_txinfo_emit = 0; + } +#endif +} diff --git a/src/common/dav_v2/util.h b/src/common/dav_v2/util.h new file mode 100644 index 00000000000..537898edd64 --- /dev/null +++ b/src/common/dav_v2/util.h @@ -0,0 +1,202 @@ +/* SPDX-License-Identifier: BSD-3-Clause */ +/* Copyright 2014-2023, Intel Corporation */ +/* + * Copyright (c) 2016-2020, Microsoft Corporation. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + * * Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +/* + * util.h -- internal definitions for util module + */ + +#ifndef __DAOS_COMMON_UTIL_H +#define __DAOS_COMMON_UTIL_H 1 + +#include +#include +#include +#include +#include +#include +#include + +#if defined(__x86_64) || defined(_M_X64) || defined(__aarch64__) || \ + defined(__riscv) +#define PAGESIZE 4096 +#elif defined(__PPC64__) +#define PAGESIZE 65536 +#else +#error unable to recognize ISA at compile time +#endif + +#if defined(__x86_64) || defined(_M_X64) || defined(__aarch64__) || \ + defined(__riscv) +#define CACHELINE_SIZE 64ULL +#elif defined(__PPC64__) +#define CACHELINE_SIZE 128ULL +#else +#error unable to recognize architecture at compile time +#endif + +#define ALIGN_UP(size, align) (((size) + (align) - 1) & ~((align) - 1)) +#define ALIGN_DOWN(size, align) ((size) & ~((align) - 1)) + +void util_init(void); +int util_is_zeroed(const void *addr, size_t len); +uint64_t util_checksum_compute(void *addr, size_t len, uint64_t *csump, + size_t skip_off); +int util_checksum(void *addr, size_t len, uint64_t *csump, + int insert, size_t skip_off); +uint64_t util_checksum_seq(const void *addr, size_t len, uint64_t csum); + +#define force_inline __attribute__((always_inline)) inline + +typedef uint64_t ua_uint64_t __attribute__((aligned(1))); +typedef uint32_t ua_uint32_t __attribute__((aligned(1))); +typedef uint16_t ua_uint16_t __attribute__((aligned(1))); + +/* + * util_div_ceil -- divides a by b and rounds up the result + */ +static force_inline unsigned +util_div_ceil(unsigned a, unsigned b) +{ + return (unsigned)(((unsigned long)a + b - 1) / b); +} + +/* + * util_bool_compare_and_swap -- perform an atomic compare and swap + * util_fetch_and_* -- perform an operation atomically, return old value + * util_popcount -- count number of set bits + * util_lssb_index -- return index of least significant set bit, + * undefined on zero + * util_mssb_index -- return index of most significant set bit + * undefined on zero + * + * XXX assertions needed on (value != 0) in both versions of bitscans + * + */ + +/* + * ISO C11 -- 7.17.7.2 The atomic_load generic functions + * Integer width specific versions as supplement for: + * + * + * #include + * C atomic_load(volatile A *object); + * C atomic_load_explicit(volatile A *object, memory_order order); + * + * The atomic_load interface doesn't return the loaded value, but instead + * copies it to a specified address. + * + * void util_atomic_load64(volatile A *object, A *destination); + * void util_atomic_load_explicit32(volatile A *object, A *destination, + * memory_order order); + * void util_atomic_load_explicit64(volatile A *object, A *destination, + * memory_order order); + * Also, instead of generic functions, two versions are available: + * for 32 bit fundamental integers, and for 64 bit ones. 
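The ALIGN_UP/ALIGN_DOWN macros and util_div_ceil() defined above are used throughout this code for cacheline and page rounding; they assume the alignment is a power of two. A few self-contained checks of how they behave:

#include <assert.h>
#include <stddef.h>

#define ALIGN_UP(size, align)   (((size) + (align) - 1) & ~((align) - 1))
#define ALIGN_DOWN(size, align) ((size) & ~((align) - 1))

static unsigned
div_ceil(unsigned a, unsigned b)
{
        return (unsigned)(((unsigned long)a + b - 1) / b);
}

int main(void)
{
        assert(ALIGN_UP(100, 64) == 128);
        assert(ALIGN_UP(128, 64) == 128);   /* already aligned: unchanged */
        assert(ALIGN_DOWN(100, 64) == 64);
        assert(div_ceil(100, 64) == 2);
        return 0;
}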
+ */ + +#define util_atomic_load_explicit32 __atomic_load +#define util_atomic_load_explicit64 __atomic_load + +/* ISO C11 -- 7.17.7.1 The atomic_store generic functions */ +/* + * ISO C11 -- 7.17.7.1 The atomic_store generic functions + * Integer width specific versions as supplement for: + * + * #include + * void atomic_store(volatile A *object, C desired); + * void atomic_store_explicit(volatile A *object, C desired, + * memory_order order); + */ +#define util_atomic_store_explicit32 __atomic_store_n +#define util_atomic_store_explicit64 __atomic_store_n + +/* + * https://gcc.gnu.org/onlinedocs/gcc/_005f_005fsync-Builtins.html + * https://gcc.gnu.org/onlinedocs/gcc/Other-Builtins.html + * https://clang.llvm.org/docs/LanguageExtensions.html#builtin-functions + */ +#define util_bool_compare_and_swap64 __sync_bool_compare_and_swap +#define util_fetch_and_add64 __sync_fetch_and_add +#define util_fetch_and_sub64 __sync_fetch_and_sub +#define util_popcount64(value) ((unsigned char)__builtin_popcountll(value)) + +#define util_lssb_index64(value) ((unsigned char)__builtin_ctzll(value)) +#define util_mssb_index64(value) ((unsigned char)(63 - __builtin_clzll(value))) + +/* ISO C11 -- 7.17.7 Operations on atomic types */ +#define util_atomic_load64(object, dest)\ + util_atomic_load_explicit64(object, dest, memory_order_seq_cst) + +#define COMPILE_ERROR_ON(cond) ((void)sizeof(char[(cond) ? -1 : 1])) + +/* macro for counting the number of varargs (up to 9) */ +#define COUNT(...)\ + COUNT_11TH(_, ##__VA_ARGS__, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0) +#define COUNT_11TH(_11, _10, _9, _8, _7, _6, _5, _4, _3, _2, X, ...) X + +/* concatenation macro */ +#define GLUE(A, B) GLUE_I(A, B) +#define GLUE_I(A, B) A##B + +/* macro for suppressing errors from unused variables (zero to 9) */ +#define SUPPRESS_UNUSED(...)\ + GLUE(SUPPRESS_ARG_, COUNT(__VA_ARGS__))(__VA_ARGS__) +#define SUPPRESS_ARG_0(X) +#define SUPPRESS_ARG_1(X) ((void)(X)) +#define SUPPRESS_ARG_2(X, ...) do {\ + SUPPRESS_ARG_1(X); SUPPRESS_ARG_1(__VA_ARGS__);\ +} while (0) +#define SUPPRESS_ARG_3(X, ...) do {\ + SUPPRESS_ARG_1(X); SUPPRESS_ARG_2(__VA_ARGS__);\ +} while (0) +#define SUPPRESS_ARG_4(X, ...) do {\ + SUPPRESS_ARG_1(X); SUPPRESS_ARG_3(__VA_ARGS__);\ +} while (0) +#define SUPPRESS_ARG_5(X, ...) do {\ + SUPPRESS_ARG_1(X); SUPPRESS_ARG_4(__VA_ARGS__);\ +} while (0) +#define SUPPRESS_ARG_6(X, ...) do {\ + SUPPRESS_ARG_1(X); SUPPRESS_ARG_5(__VA_ARGS__);\ +} while (0) +#define SUPPRESS_ARG_7(X, ...) do {\ + SUPPRESS_ARG_1(X); SUPPRESS_ARG_6(__VA_ARGS__);\ +} while (0) +#define SUPPRESS_ARG_8(X, ...) do {\ + SUPPRESS_ARG_1(X); SUPPRESS_ARG_7(__VA_ARGS__);\ +} while (0) +#define SUPPRESS_ARG_9(X, ...) 
do {\ + SUPPRESS_ARG_1(X); SUPPRESS_ARG_8(__VA_ARGS__);\ +} while (0) + +#endif /* __DAOS_COMMON_UTIL_H */ diff --git a/src/common/dav_v2/valgrind_internal.h b/src/common/dav_v2/valgrind_internal.h new file mode 100644 index 00000000000..86fe9d47a19 --- /dev/null +++ b/src/common/dav_v2/valgrind_internal.h @@ -0,0 +1,293 @@ +/* SPDX-License-Identifier: BSD-3-Clause */ +/* Copyright 2015-2023, Intel Corporation */ + +/* + * valgrind_internal.h -- internal definitions for valgrind macros + */ + +#ifndef __DAOS_COMMON_VALGRIND_INTERNAL_H +#define __DAOS_COMMON_VALGRIND_INTERNAL_H 1 + +#ifdef D_HAS_VALGRIND +#if !defined(_WIN32) && !defined(__FreeBSD__) && !defined(__riscv) +#define VG_TXINFO_ENABLED 1 +#define VG_HELGRIND_ENABLED 1 +#define VG_MEMCHECK_ENABLED 1 +#define VG_DRD_ENABLED 1 +#endif +#endif + +#if VG_TXINFO_ENABLED || VG_HELGRIND_ENABLED || VG_MEMCHECK_ENABLED || \ + VG_DRD_ENABLED +#define ANY_VG_TOOL_ENABLED 1 +#else +#define ANY_VG_TOOL_ENABLED 0 +#endif + +#if ANY_VG_TOOL_ENABLED +extern unsigned _On_valgrind; +#define On_valgrind __builtin_expect(_On_valgrind, 0) +#include "valgrind/valgrind.h" +#else +#define On_valgrind (0) +#endif + +#if VG_HELGRIND_ENABLED +extern unsigned _On_helgrind; +#define On_helgrind __builtin_expect(_On_helgrind, 0) +#include "valgrind/helgrind.h" +#else +#define On_helgrind (0) +#endif + +#if VG_DRD_ENABLED +extern unsigned _On_drd; +#define On_drd __builtin_expect(_On_drd, 0) +#include "valgrind/drd.h" +#else +#define On_drd (0) +#endif + +#if VG_HELGRIND_ENABLED || VG_DRD_ENABLED + +extern unsigned _On_drd_or_hg; +#define On_drd_or_hg __builtin_expect(_On_drd_or_hg, 0) + +#define VALGRIND_ANNOTATE_HAPPENS_BEFORE(obj) do {\ + if (On_drd_or_hg) \ + ANNOTATE_HAPPENS_BEFORE((obj));\ +} while (0) + +#define VALGRIND_ANNOTATE_HAPPENS_AFTER(obj) do {\ + if (On_drd_or_hg) \ + ANNOTATE_HAPPENS_AFTER((obj));\ +} while (0) + +#define VALGRIND_ANNOTATE_NEW_MEMORY(addr, size) do {\ + if (On_drd_or_hg) \ + ANNOTATE_NEW_MEMORY((addr), (size));\ +} while (0) + +#define VALGRIND_ANNOTATE_IGNORE_READS_BEGIN() do {\ + if (On_drd_or_hg) \ + ANNOTATE_IGNORE_READS_BEGIN();\ +} while (0) + +#define VALGRIND_ANNOTATE_IGNORE_READS_END() do {\ + if (On_drd_or_hg) \ + ANNOTATE_IGNORE_READS_END();\ +} while (0) + +#define VALGRIND_ANNOTATE_IGNORE_WRITES_BEGIN() do {\ + if (On_drd_or_hg) \ + ANNOTATE_IGNORE_WRITES_BEGIN();\ +} while (0) + +#define VALGRIND_ANNOTATE_IGNORE_WRITES_END() do {\ + if (On_drd_or_hg) \ + ANNOTATE_IGNORE_WRITES_END();\ +} while (0) + +/* Supported by both helgrind and drd. 
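Every VALGRIND_ANNOTATE_* wrapper in this header follows the same pattern: a compile-time gate selects either a runtime-checked call into the tool or a no-op that still consumes its arguments, so call sites never need their own #ifdefs. A small sketch of that pattern with a hypothetical TOOL_ENABLED gate and a printf standing in for the real client request:

#include <stdio.h>

#ifdef TOOL_ENABLED
extern unsigned on_tool;                     /* runtime detection flag */
#define ANNOTATE_NEW(addr, size) do {\
        if (on_tool)\
                printf("annotate %p %zu\n", (void *)(addr), (size_t)(size));\
} while (0)
#else
/* compiled-out variant still evaluates nothing but keeps the call legal */
#define ANNOTATE_NEW(addr, size) do {\
        (void)(addr);\
        (void)(size);\
} while (0)
#endif

int main(void)
{
        char buf[32];

        /* identical call whether or not the tool headers are available */
        ANNOTATE_NEW(buf, sizeof(buf));
        return 0;
}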
*/ +#define VALGRIND_HG_DRD_DISABLE_CHECKING(addr, size) do {\ + if (On_drd_or_hg) \ + VALGRIND_HG_DISABLE_CHECKING((addr), (size));\ +} while (0) + +#else + +#define On_drd_or_hg (0) + +#define VALGRIND_ANNOTATE_HAPPENS_BEFORE(obj) { (void)(obj); } + +#define VALGRIND_ANNOTATE_HAPPENS_AFTER(obj) { (void)(obj); } + +#define VALGRIND_ANNOTATE_NEW_MEMORY(addr, size) do {\ + (void) (addr);\ + (void) (size);\ +} while (0) + +#define VALGRIND_ANNOTATE_IGNORE_READS_BEGIN() do {} while (0) + +#define VALGRIND_ANNOTATE_IGNORE_READS_END() do {} while (0) + +#define VALGRIND_ANNOTATE_IGNORE_WRITES_BEGIN() do {} while (0) + +#define VALGRIND_ANNOTATE_IGNORE_WRITES_END() do {} while (0) + +#define VALGRIND_HG_DRD_DISABLE_CHECKING(addr, size) do {\ + (void) (addr);\ + (void) (size);\ +} while (0) + +#endif + +#if VG_TXINFO_ENABLED + +extern int _Vg_txinfo_emit; +#define VG_txinfo_emit __builtin_expect(_Vg_txinfo_emit, 0) + +void util_emit_log(const char *func, int order); + +#define VALGRIND_SET_CLEAN(addr, len) do {\ + (void)(addr);\ + (void)(len);\ +} while (0) + +#define VALGRIND_START_TX do {} while (0) + +#define VALGRIND_END_TX do {} while (0) + +#define VALGRIND_ADD_TO_TX(addr, len) do {\ + (void) (addr);\ + (void) (len);\ +} while (0) + +#define VALGRIND_REMOVE_FROM_TX(addr, len) do {\ + (void) (addr);\ + (void) (len);\ +} while (0) + +#define VALGRIND_ADD_TO_GLOBAL_TX_IGNORE(addr, len) do {\ + (void) (addr);\ + (void) (len);\ +} while (0) + +/* + * Logs library and function name with proper suffix + * to VG log file. + */ +#define DAV_API_START() do {\ + if (VG_txinfo_emit)\ + VALGRIND_PRINTF("%s BEGIN\n", __func__);\ +} while (0) +#define DAV_API_END() do {\ + if (VG_txinfo_emit)\ + VALGRIND_PRINTF("%s END\n", __func__);\ +} while (0) + +#else /* VG_TXINFO_ENABLED */ + +#define VG_txinfo_emit (0) + +#define VALGRIND_SET_CLEAN(addr, len) do {\ + (void) (addr);\ + (void) (len);\ +} while (0) + +#define VALGRIND_START_TX do {} while (0) + +#define VALGRIND_END_TX do {} while (0) + +#define VALGRIND_ADD_TO_TX(addr, len) do {\ + (void) (addr);\ + (void) (len);\ +} while (0) + +#define VALGRIND_REMOVE_FROM_TX(addr, len) do {\ + (void) (addr);\ + (void) (len);\ +} while (0) + +#define VALGRIND_ADD_TO_GLOBAL_TX_IGNORE(addr, len) do {\ + (void) (addr);\ + (void) (len);\ +} while (0) + +#define DAV_API_START() do {} while (0) + +#define DAV_API_END() do {} while (0) + +#endif /* VG_TXINFO_ENABLED */ + +#if VG_MEMCHECK_ENABLED + +extern unsigned _On_memcheck; +#define On_memcheck __builtin_expect(_On_memcheck, 0) + +#include "valgrind/memcheck.h" + +#define VALGRIND_DO_DISABLE_ERROR_REPORTING do {\ + if (On_valgrind)\ + VALGRIND_DISABLE_ERROR_REPORTING;\ +} while (0) + +#define VALGRIND_DO_ENABLE_ERROR_REPORTING do {\ + if (On_valgrind)\ + VALGRIND_ENABLE_ERROR_REPORTING;\ +} while (0) + +#define VALGRIND_DO_CREATE_MEMPOOL(heap, rzB, is_zeroed) do {\ + if (On_memcheck)\ + VALGRIND_CREATE_MEMPOOL(heap, rzB, is_zeroed);\ +} while (0) + +#define VALGRIND_DO_DESTROY_MEMPOOL(heap) do {\ + if (On_memcheck)\ + VALGRIND_DESTROY_MEMPOOL(heap);\ +} while (0) + +#define VALGRIND_DO_MEMPOOL_ALLOC(heap, addr, size) do {\ + if (On_memcheck)\ + VALGRIND_MEMPOOL_ALLOC(heap, addr, size);\ +} while (0) + +#define VALGRIND_DO_MEMPOOL_FREE(heap, addr) do {\ + if (On_memcheck)\ + VALGRIND_MEMPOOL_FREE(heap, addr);\ +} while (0) + +#define VALGRIND_DO_MAKE_MEM_DEFINED(addr, len) do {\ + if (On_memcheck)\ + VALGRIND_MAKE_MEM_DEFINED(addr, len);\ +} while (0) + +#define VALGRIND_DO_MAKE_MEM_UNDEFINED(addr, len) do {\ + if 
(On_memcheck)\ + VALGRIND_MAKE_MEM_UNDEFINED(addr, len);\ +} while (0) + +#define VALGRIND_DO_MAKE_MEM_NOACCESS(addr, len) do {\ + if (On_memcheck)\ + VALGRIND_MAKE_MEM_NOACCESS(addr, len);\ +} while (0) + +#define VALGRIND_DO_CHECK_MEM_IS_ADDRESSABLE(addr, len) do {\ + if (On_memcheck)\ + VALGRIND_CHECK_MEM_IS_ADDRESSABLE(addr, len);\ +} while (0) + +#else /* VG_MEMCHECK_ENABLED */ + +#define On_memcheck (0) + +#define VALGRIND_DO_DISABLE_ERROR_REPORTING do {} while (0) + +#define VALGRIND_DO_ENABLE_ERROR_REPORTING do {} while (0) + +#define VALGRIND_DO_CREATE_MEMPOOL(heap, rzB, is_zeroed)\ + do { (void) (heap); (void) (rzB); (void) (is_zeroed); } while (0) + +#define VALGRIND_DO_DESTROY_MEMPOOL(heap) { (void) (heap); } + +#define VALGRIND_DO_MEMPOOL_ALLOC(heap, addr, size)\ + do { (void) (heap); (void) (addr); (void) (size); } while (0) + +#define VALGRIND_DO_MEMPOOL_FREE(heap, addr)\ + do { (void) (heap); (void) (addr); } while (0) + +#define VALGRIND_DO_MAKE_MEM_DEFINED(addr, len)\ + do { (void) (addr); (void) (len); } while (0) + +#define VALGRIND_DO_MAKE_MEM_UNDEFINED(addr, len)\ + do { (void) (addr); (void) (len); } while (0) + +#define VALGRIND_DO_MAKE_MEM_NOACCESS(addr, len)\ + do { (void) (addr); (void) (len); } while (0) + +#define VALGRIND_DO_CHECK_MEM_IS_ADDRESSABLE(addr, len)\ + do { (void) (addr); (void) (len); } while (0) + +#endif /* VG_MEMCHECK_ENABLED */ + +#endif /* __DAOS_COMMON_VALGRIND_INTERNAL_H */ diff --git a/src/common/dav_v2/vec.h b/src/common/dav_v2/vec.h new file mode 100644 index 00000000000..5d527cb9746 --- /dev/null +++ b/src/common/dav_v2/vec.h @@ -0,0 +1,145 @@ +/* SPDX-License-Identifier: BSD-3-Clause */ +/* Copyright 2017-2023, Intel Corporation */ + +/* + * vec.h -- vector interface + */ + +#ifndef __DAOS_COMMON_VEC_H +#define __DAOS_COMMON_VEC_H 1 + +#include +#include "valgrind_internal.h" +#include "util.h" +#include "out.h" + +#define VEC_INIT_SIZE (64) + +#define VEC(name, type)\ +struct name {\ + type *buffer;\ + size_t size;\ + size_t capacity;\ +} + +#define VEC_INITIALIZER {NULL, 0, 0} + +#define VEC_INIT(vec) do {\ + (vec)->buffer = NULL;\ + (vec)->size = 0;\ + (vec)->capacity = 0;\ +} while (0) + +#define VEC_MOVE(vecl, vecr) do {\ + D_FREE((vecl)->buffer);\ + (vecl)->buffer = (vecr)->buffer;\ + (vecl)->size = (vecr)->size;\ + (vecl)->capacity = (vecr)->capacity;\ + (vecr)->buffer = NULL;\ + (vecr)->size = 0;\ + (vecr)->capacity = 0;\ +} while (0) + +#define VEC_REINIT(vec) do {\ + VALGRIND_ANNOTATE_NEW_MEMORY((vec), sizeof(*vec));\ + VALGRIND_ANNOTATE_NEW_MEMORY((vec)->buffer,\ + (sizeof(*(vec)->buffer) * ((vec)->capacity)));\ + (vec)->size = 0;\ +} while (0) + +static inline int +vec_reserve(void *vec, size_t ncapacity, size_t s) +{ + void *tbuf; + size_t ncap = ncapacity == 0 ? 
VEC_INIT_SIZE : ncapacity; + + VEC(vvec, void) *vecp = (struct vvec *)vec; + + D_REALLOC_NZ(tbuf, vecp->buffer, s * ncap); + if (tbuf == NULL) { + D_CRIT("Realloc!\n"); + return -1; + } + vecp->buffer = tbuf; + vecp->capacity = ncap; + return 0; +} + +#define VEC_RESERVE(vec, ncapacity)\ +(((vec)->size == 0 || (ncapacity) > (vec)->size) ?\ + vec_reserve((void *)vec, ncapacity, sizeof(*(vec)->buffer)) :\ + 0) + +#define VEC_POP_BACK(vec) ((vec)->size -= 1) + +#define VEC_FRONT(vec) ((vec)->buffer[0]) + +#define VEC_BACK(vec) ((vec)->buffer[(vec)->size - 1]) + +#define VEC_ERASE_BY_POS(vec, pos) do {\ + if ((pos) != ((vec)->size - 1))\ + (vec)->buffer[(pos)] = VEC_BACK(vec);\ + VEC_POP_BACK(vec);\ +} while (0) + +#define VEC_ERASE_BY_PTR(vec, element) do {\ + if ((element) != &VEC_BACK(vec))\ + *(element) = VEC_BACK(vec);\ + VEC_POP_BACK(vec);\ +} while (0) + +#define VEC_INSERT(vec, element)\ +((vec)->buffer[(vec)->size - 1] = (element), 0) + +#define VEC_INC_SIZE(vec)\ +(((vec)->size++), 0) + +#define VEC_INC_BACK(vec)\ +((vec)->capacity == (vec)->size ? \ + (VEC_RESERVE((vec), ((vec)->capacity * 2)) == 0 ? \ + VEC_INC_SIZE(vec) : -1) : \ + VEC_INC_SIZE(vec)) + +#define VEC_PUSH_BACK(vec, element)\ +(VEC_INC_BACK(vec) == 0 ? VEC_INSERT(vec, element) : -1) + +#define VEC_FOREACH(el, vec)\ +for (size_t _vec_i = 0;\ + _vec_i < (vec)->size && (((el) = (vec)->buffer[_vec_i]), 1);\ + ++_vec_i) + +#define VEC_FOREACH_REVERSE(el, vec)\ +for (size_t _vec_i = ((vec)->size);\ + _vec_i != 0 && (((el) = (vec)->buffer[_vec_i - 1]), 1);\ + --_vec_i) + +#define VEC_FOREACH_BY_POS(elpos, vec)\ +for ((elpos) = 0; (elpos) < (vec)->size; ++(elpos)) + +#define VEC_FOREACH_BY_PTR(el, vec)\ +for (size_t _vec_i = 0;\ + _vec_i < (vec)->size && (((el) = &(vec)->buffer[_vec_i]), 1);\ + ++_vec_i) + +#define VEC_SIZE(vec)\ +((vec)->size) + +#define VEC_CAPACITY(vec)\ +((vec)->capacity) + +#define VEC_ARR(vec)\ +((vec)->buffer) + +#define VEC_GET(vec, id)\ +(&(vec)->buffer[id]) + +#define VEC_CLEAR(vec) ((vec)->size = 0) + +#define VEC_DELETE(vec) do {\ + D_FREE((vec)->buffer);\ + (vec)->buffer = NULL;\ + (vec)->size = 0;\ + (vec)->capacity = 0;\ +} while (0) + +#endif /* __DAOS_COMMON_VEC_H */ diff --git a/src/common/dav_v2/vecq.h b/src/common/dav_v2/vecq.h new file mode 100644 index 00000000000..a9618862b39 --- /dev/null +++ b/src/common/dav_v2/vecq.h @@ -0,0 +1,121 @@ +/* SPDX-License-Identifier: BSD-3-Clause */ +/* Copyright 2018-2023, Intel Corporation */ + +/* + * vecq.h -- vector queue (FIFO) interface + */ + +#ifndef __DAOS_COMMON_VECQ_H +#define __DAOS_COMMON_VECQ_H 1 + +#include + +#include "util.h" +#include "out.h" + +#define VECQ_INIT_SIZE (64) + +#define VECQ(name, type)\ +struct name {\ + type *buffer;\ + size_t capacity;\ + size_t front;\ + size_t back;\ +} + +#define VECQ_INIT(vec) do {\ + (vec)->buffer = NULL;\ + (vec)->capacity = 0;\ + (vec)->front = 0;\ + (vec)->back = 0;\ +} while (0) + +#define VECQ_REINIT(vec) do {\ + VALGRIND_ANNOTATE_NEW_MEMORY((vec), sizeof(*vec));\ + VALGRIND_ANNOTATE_NEW_MEMORY((vec)->buffer,\ + (sizeof(*(vec)->buffer) * ((vec)->capacity)));\ + (vec)->front = 0;\ + (vec)->back = 0;\ +} while (0) + +#define VECQ_FRONT_POS(vec)\ +((vec)->front & ((vec)->capacity - 1)) + +#define VECQ_BACK_POS(vec)\ +((vec)->back & ((vec)->capacity - 1)) + +#define VECQ_FRONT(vec)\ +((vec)->buffer[VECQ_FRONT_POS(vec)]) + +#define VECQ_BACK(vec) ((vec)->buffer[VECQ_BACK_POS(vec)]) + +#define VECQ_DEQUEUE(vec)\ +((vec)->buffer[(((vec)->front++) & ((vec)->capacity - 1))]) + +#define 
VECQ_SIZE(vec)\ +((vec)->back - (vec)->front) + +static inline int +realloc_set(void **buf, size_t s) +{ + void *tbuf; + + D_REALLOC_NZ(tbuf, *buf, s); + if (tbuf == NULL) { + D_CRIT("Realloc!\n"); + return -1; + } + *buf = tbuf; + return 0; +} + +#define VECQ_NCAPACITY(vec)\ +((vec)->capacity == 0 ? VECQ_INIT_SIZE : (vec)->capacity * 2) +#define VECQ_GROW(vec)\ +(realloc_set((void **)&(vec)->buffer,\ + VECQ_NCAPACITY(vec) * sizeof(*(vec)->buffer)) ? -1 :\ + (memcpy((vec)->buffer + (vec)->capacity, (vec)->buffer,\ + VECQ_FRONT_POS(vec) * sizeof(*(vec)->buffer)),\ + (vec)->front = VECQ_FRONT_POS(vec),\ + (vec)->back = (vec)->front + (vec)->capacity,\ + (vec)->capacity = VECQ_NCAPACITY(vec),\ + 0\ +)) + +#define VECQ_INSERT(vec, element)\ +(VECQ_BACK(vec) = element, (vec)->back += 1, 0) + +#define VECQ_ENQUEUE(vec, element)\ +((vec)->capacity == VECQ_SIZE(vec) ?\ + (VECQ_GROW(vec) == 0 ? VECQ_INSERT(vec, element) : -1) :\ +VECQ_INSERT(vec, element)) + +#define VECQ_CAPACITY(vec)\ +((vec)->capacity) + +#define VECQ_FOREACH(el, vec)\ +for (size_t _vec_i = 0;\ + _vec_i < VECQ_SIZE(vec) &&\ + (((el) = (vec)->buffer[_vec_i & ((vec)->capacity - 1)]), 1);\ + ++_vec_i) + +#define VECQ_FOREACH_REVERSE(el, vec)\ +for (size_t _vec_i = VECQ_SIZE(vec);\ + _vec_i > 0 &&\ + (((el) = (vec)->buffer[(_vec_i - 1) & ((vec)->capacity - 1)]), 1);\ + --_vec_i) + +#define VECQ_CLEAR(vec) do {\ + (vec)->front = 0;\ + (vec)->back = 0;\ +} while (0) + +#define VECQ_DELETE(vec) do {\ + D_FREE((vec)->buffer);\ + (vec)->buffer = NULL;\ + (vec)->capacity = 0;\ + (vec)->front = 0;\ + (vec)->back = 0;\ +} while (0) + +#endif /* __DAOS_COMMON_VECQ_H */ diff --git a/src/common/dav_v2/wal_tx.c b/src/common/dav_v2/wal_tx.c new file mode 100644 index 00000000000..9cd5d55d4ac --- /dev/null +++ b/src/common/dav_v2/wal_tx.c @@ -0,0 +1,546 @@ +/** + * (C) Copyright 2022-2024 Intel Corporation. 
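VECQ above keeps two monotonically increasing counters and derives slot positions by masking them with capacity - 1, which is why the capacity must stay a power of two and why VECQ_SIZE() is simply back - front. A standalone illustration of that indexing scheme:

#include <assert.h>
#include <stddef.h>

#define CAP 8u                          /* must stay a power of two */

int main(void)
{
        int buf[CAP];
        size_t front = 0, back = 0;

        /* enqueue 5, dequeue 3, enqueue 5 more: counters only grow */
        for (int i = 0; i < 5; i++)
                buf[back++ & (CAP - 1)] = i;
        for (int i = 0; i < 3; i++)
                assert(buf[front++ & (CAP - 1)] == i);
        for (int i = 5; i < 10; i++)
                buf[back++ & (CAP - 1)] = i;

        assert(back - front == 7);              /* VECQ_SIZE() */
        assert(buf[front & (CAP - 1)] == 3);    /* VECQ_FRONT() */
        return 0;
}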
+ * + * SPDX-License-Identifier: BSD-2-Clause-Patent + */ + +#include +#include "dav_internal.h" +#include "wal_tx.h" +#include "util.h" +#include "heap.h" + +struct umem_wal_tx_ops dav_wal_tx_ops; + +static inline uint64_t +mdblob_addr2offset(struct dav_obj *hdl, void *addr) +{ + return umem_cache_ptr2off(hdl->do_store, addr); +} + +#define AD_TX_ACT_ADD(tx, wa) \ + do { \ + d_list_add_tail(&(wa)->wa_link, &(tx)->wt_redo); \ + (tx)->wt_redo_cnt++; \ + if ((wa)->wa_act.ac_opc == UMEM_ACT_COPY || \ + (wa)->wa_act.ac_opc == UMEM_ACT_COPY_PTR) { \ + (tx)->wt_redo_payload_len += (wa)->wa_act.ac_copy.size; \ + } else if ((wa)->wa_act.ac_opc == UMEM_ACT_MOVE) { \ + /* ac_move src addr is playload after wal_trans_entry */\ + (tx)->wt_redo_payload_len += sizeof(uint64_t); \ + } \ + } while (0) + +/** allocate wal_action, if success the wa_link and wa_act.ac_opc will be init-ed */ +#define D_ALLOC_ACT(wa, opc, size) \ + do { \ + if (opc == UMEM_ACT_COPY) \ + D_ALLOC(wa, offsetof(struct wal_action, \ + wa_act.ac_copy.payload[size])); \ + else \ + D_ALLOC_PTR(wa); \ + if (likely(wa != NULL)) { \ + D_INIT_LIST_HEAD(&wa->wa_link); \ + wa->wa_act.ac_opc = opc; \ + } \ + } while (0) + +static inline void +act_copy_payload(struct umem_action *act, void *addr, daos_size_t size) +{ + char *dst = (char *)&act->ac_copy.payload[0]; + + if (size > 0) + memcpy(dst, addr, size); +} + +static void +dav_wal_tx_init(struct umem_wal_tx *utx, struct dav_obj *dav_hdl) +{ + struct dav_tx *tx = utx2wtx(utx); + + D_INIT_LIST_HEAD(&tx->wt_redo); + tx->wt_redo_cnt = 0; + tx->wt_redo_payload_len = 0; + tx->wt_redo_act_pos = NULL; + tx->wt_dav_hdl = dav_hdl; +} + +struct umem_wal_tx * +dav_umem_wtx_new(struct dav_obj *dav_hdl) +{ + struct umem_wal_tx *umem_wtx; + + D_ASSERT(dav_hdl->do_utx == NULL); + D_ALLOC_PTR(umem_wtx); + if (umem_wtx == NULL) + return NULL; + + umem_wtx->utx_ops = &dav_wal_tx_ops; + umem_wtx->utx_id = ULLONG_MAX; + dav_wal_tx_init(umem_wtx, dav_hdl); + dav_hdl->do_utx = umem_wtx; + return umem_wtx; +} + +void +dav_umem_wtx_cleanup(struct umem_wal_tx *utx) +{ + struct dav_tx *tx = utx2wtx(utx); + d_list_t *list = &tx->wt_redo; + struct wal_action *wa, *next; + + d_list_for_each_entry_safe(wa, next, list, wa_link) { + d_list_del(&wa->wa_link); + D_FREE(wa); + } +} + +static int +dav_wal_tx_submit(struct dav_obj *dav_hdl, struct umem_wal_tx *utx, void *data) +{ + struct wal_action *wa, *next; + struct umem_action *ua; + struct umem_store *store = dav_hdl->do_store; + struct dav_tx *tx = utx2wtx(utx); + d_list_t *redo_list = &tx->wt_redo; + + char *pathname = basename(dav_hdl->do_path); + uint64_t id = utx->utx_id; + int rc; + + if (wal_tx_act_nr(utx) == 0) + return 0; + + d_list_for_each_entry_safe(wa, next, redo_list, wa_link) { + ua = &wa->wa_act; + switch (ua->ac_opc) { + case UMEM_ACT_COPY: + D_DEBUG(DB_TRACE, + "%s: ACT_COPY txid=%lu, (p,o)=%lu,%lu size=%lu\n", + pathname, id, + ua->ac_copy.addr / PAGESIZE, ua->ac_copy.addr % PAGESIZE, + ua->ac_copy.size); + break; + case UMEM_ACT_COPY_PTR: + D_DEBUG(DB_TRACE, + "%s: ACT_COPY_PTR txid=%lu, (p,o)=%lu,%lu size=%lu ptr=0x%lx\n", + pathname, id, + ua->ac_copy_ptr.addr / PAGESIZE, ua->ac_copy_ptr.addr % PAGESIZE, + ua->ac_copy_ptr.size, ua->ac_copy_ptr.ptr); + break; + case UMEM_ACT_ASSIGN: + D_DEBUG(DB_TRACE, + "%s: ACT_ASSIGN txid=%lu, (p,o)=%lu,%lu size=%u\n", + pathname, id, + ua->ac_assign.addr / PAGESIZE, ua->ac_assign.addr % PAGESIZE, + ua->ac_assign.size); + break; + case UMEM_ACT_SET: + D_DEBUG(DB_TRACE, + "%s: ACT_SET txid=%lu, (p,o)=%lu,%lu 
size=%u val=%u\n", + pathname, id, + ua->ac_set.addr / PAGESIZE, ua->ac_set.addr % PAGESIZE, + ua->ac_set.size, ua->ac_set.val); + break; + case UMEM_ACT_SET_BITS: + D_DEBUG(DB_TRACE, + "%s: ACT_SET_BITS txid=%lu, (p,o)=%lu,%lu bit_pos=%u num_bits=%u\n", + pathname, id, + ua->ac_op_bits.addr / PAGESIZE, ua->ac_op_bits.addr % PAGESIZE, + ua->ac_op_bits.pos, ua->ac_op_bits.num); + break; + case UMEM_ACT_CLR_BITS: + D_DEBUG(DB_TRACE, + "%s: ACT_CLR_BITS txid=%lu, (p,o)=%lu,%lu bit_pos=%u num_bits=%u\n", + pathname, id, + ua->ac_op_bits.addr / PAGESIZE, ua->ac_op_bits.addr % PAGESIZE, + ua->ac_op_bits.pos, ua->ac_op_bits.num); + break; + default: + D_ERROR("%s: unknown opc %d\n", dav_hdl->do_path, ua->ac_opc); + ASSERT(0); + } + } + DAV_DBG("tx_id:%lu submitting to WAL: %u bytes in %u actions", + id, tx->wt_redo_payload_len, tx->wt_redo_cnt); + rc = store->stor_ops->so_wal_submit(store, utx, data); + return rc; +} + +/** complete the wl transaction */ +int +dav_wal_tx_commit(struct dav_obj *hdl, struct umem_wal_tx *utx, void *data) +{ + int rc; + + /* write actions in redo list to WAL */ + rc = dav_wal_tx_submit(hdl, utx, data); + + /* FAIL the engine if commit fails */ + D_ASSERT(rc == 0); + dav_umem_wtx_cleanup(utx); + return 0; +} + +int +dav_wal_tx_reserve(struct dav_obj *hdl, uint64_t *id) +{ + int rc; + + rc = hdl->do_store->stor_ops->so_wal_reserv(hdl->do_store, id); + /* REVISIT: + * Remove this assert once callers of dav_free() and dav_memcpy_persist() + * are modified to handle failures. + */ + D_ASSERT(rc == 0); + return rc; +} + +/** + * snapshot data from src to either wal redo log. + */ +int +dav_wal_tx_snap(void *hdl, void *addr, daos_size_t size, void *src, uint32_t flags) +{ + struct dav_obj *dav_hdl = (struct dav_obj *)hdl; + struct dav_tx *tx = utx2wtx(dav_hdl->do_utx); + struct wal_action *wa_redo; + int rc; + + D_ASSERT(hdl != NULL); + + if (addr == NULL || size == 0 || size > UMEM_ACT_PAYLOAD_MAX_LEN) + return -DER_INVAL; + + rc = umem_cache_touch(dav_hdl->do_store, dav_hdl->do_utx->utx_id, + mdblob_addr2offset(tx->wt_dav_hdl, addr), size); + if (rc != 0) + return rc; + + if (flags & DAV_XADD_WAL_CPTR) { + D_ALLOC_ACT(wa_redo, UMEM_ACT_COPY_PTR, size); + if (wa_redo == NULL) + return -DER_NOMEM; + wa_redo->wa_act.ac_copy_ptr.ptr = (uintptr_t)src; + wa_redo->wa_act.ac_copy_ptr.addr = mdblob_addr2offset(tx->wt_dav_hdl, addr); + wa_redo->wa_act.ac_copy_ptr.size = size; + } else { + D_ALLOC_ACT(wa_redo, UMEM_ACT_COPY, size); + if (wa_redo == NULL) + return -DER_NOMEM; + act_copy_payload(&wa_redo->wa_act, src, size); + wa_redo->wa_act.ac_copy.addr = mdblob_addr2offset(tx->wt_dav_hdl, addr); + wa_redo->wa_act.ac_copy.size = size; + } + AD_TX_ACT_ADD(tx, wa_redo); + return 0; +} + +/** assign uint64_t value to @addr */ +int +dav_wal_tx_assign(void *hdl, void *addr, uint64_t val) +{ + struct dav_obj *dav_hdl = (struct dav_obj *)hdl; + struct dav_tx *tx = utx2wtx(dav_hdl->do_utx); + struct wal_action *wa_redo; + int rc; + + D_ASSERT(hdl != NULL); + if (addr == NULL) + return -DER_INVAL; + + rc = umem_cache_touch(dav_hdl->do_store, dav_hdl->do_utx->utx_id, + mdblob_addr2offset(tx->wt_dav_hdl, addr), sizeof(uint64_t)); + if (rc != 0) + return rc; + + D_ALLOC_ACT(wa_redo, UMEM_ACT_ASSIGN, sizeof(uint64_t)); + if (wa_redo == NULL) + return -DER_NOMEM; + wa_redo->wa_act.ac_assign.addr = mdblob_addr2offset(tx->wt_dav_hdl, addr); + wa_redo->wa_act.ac_assign.size = 8; + wa_redo->wa_act.ac_assign.val = val; + AD_TX_ACT_ADD(tx, wa_redo); + + return 0; +} + +/** Set bits starting from pos */ 
+int +dav_wal_tx_set_bits(void *hdl, void *addr, uint32_t pos, uint16_t num_bits) +{ + struct dav_obj *dav_hdl = (struct dav_obj *)hdl; + struct dav_tx *tx = utx2wtx(dav_hdl->do_utx); + struct wal_action *wa_redo; + int rc; + + D_ASSERT(hdl != NULL); + if (addr == NULL) + return -DER_INVAL; + + rc = umem_cache_touch(dav_hdl->do_store, dav_hdl->do_utx->utx_id, + mdblob_addr2offset(tx->wt_dav_hdl, addr), sizeof(uint64_t)); + if (rc != 0) + return rc; + + D_ALLOC_ACT(wa_redo, UMEM_ACT_SET_BITS, sizeof(uint64_t)); + if (wa_redo == NULL) + return -DER_NOMEM; + wa_redo->wa_act.ac_op_bits.addr = mdblob_addr2offset(tx->wt_dav_hdl, addr); + wa_redo->wa_act.ac_op_bits.num = num_bits; + wa_redo->wa_act.ac_op_bits.pos = pos; + AD_TX_ACT_ADD(tx, wa_redo); + + return 0; +} + +/** Clr bits starting from pos */ +int +dav_wal_tx_clr_bits(void *hdl, void *addr, uint32_t pos, uint16_t num_bits) +{ + struct dav_obj *dav_hdl = (struct dav_obj *)hdl; + struct dav_tx *tx = utx2wtx(dav_hdl->do_utx); + struct wal_action *wa_redo; + int rc; + + D_ASSERT(hdl != NULL); + if (addr == NULL) + return -DER_INVAL; + + rc = umem_cache_touch(dav_hdl->do_store, dav_hdl->do_utx->utx_id, + mdblob_addr2offset(tx->wt_dav_hdl, addr), sizeof(uint64_t)); + if (rc != 0) + return rc; + + D_ALLOC_ACT(wa_redo, UMEM_ACT_CLR_BITS, sizeof(uint64_t)); + if (wa_redo == NULL) + return -DER_NOMEM; + wa_redo->wa_act.ac_op_bits.addr = mdblob_addr2offset(tx->wt_dav_hdl, addr); + wa_redo->wa_act.ac_op_bits.num = num_bits; + wa_redo->wa_act.ac_op_bits.pos = pos; + AD_TX_ACT_ADD(tx, wa_redo); + + return 0; +} + +/** + * memset a storage region, save the operation for redo + */ +int +dav_wal_tx_set(void *hdl, void *addr, char c, daos_size_t size) +{ + struct dav_obj *dav_hdl = (struct dav_obj *)hdl; + struct dav_tx *tx = utx2wtx(dav_hdl->do_utx); + struct wal_action *wa_redo; + int rc; + + D_ASSERT(hdl != NULL); + + if (addr == NULL || size == 0 || size > UMEM_ACT_PAYLOAD_MAX_LEN) + return -DER_INVAL; + + rc = umem_cache_touch(dav_hdl->do_store, dav_hdl->do_utx->utx_id, + mdblob_addr2offset(tx->wt_dav_hdl, addr), size); + if (rc != 0) + return rc; + + D_ALLOC_ACT(wa_redo, UMEM_ACT_SET, size); + if (wa_redo == NULL) + return -DER_NOMEM; + + wa_redo->wa_act.ac_set.addr = mdblob_addr2offset(tx->wt_dav_hdl, addr); + wa_redo->wa_act.ac_set.size = size; + wa_redo->wa_act.ac_set.val = c; + AD_TX_ACT_ADD(tx, wa_redo); + return 0; +} + +/** + * query action number in redo list. + */ +uint32_t +wal_tx_act_nr(struct umem_wal_tx *utx) +{ + struct dav_tx *tx = utx2wtx(utx); + + return tx->wt_redo_cnt; +} + +/** + * query payload length in redo list. + */ +uint32_t +wal_tx_payload_len(struct umem_wal_tx *utx) +{ + struct dav_tx *tx = utx2wtx(utx); + + return tx->wt_redo_payload_len; +} + +/** + * get first action pointer, NULL for list empty. + */ +struct umem_action * +wal_tx_act_first(struct umem_wal_tx *utx) +{ + struct dav_tx *tx = utx2wtx(utx); + + if (d_list_empty(&tx->wt_redo)) { + tx->wt_redo_act_pos = NULL; + return NULL; + } + + tx->wt_redo_act_pos = dav_action_get_next(tx->wt_redo); + return &tx->wt_redo_act_pos->wa_act; +} + +/** + * get next action pointer, NULL for done or list empty. 
+ */ +struct umem_action * +wal_tx_act_next(struct umem_wal_tx *utx) +{ + struct dav_tx *tx = utx2wtx(utx); + + if (tx->wt_redo_act_pos == NULL) { + if (d_list_empty(&tx->wt_redo)) + return NULL; + tx->wt_redo_act_pos = dav_action_get_next(tx->wt_redo); + return &tx->wt_redo_act_pos->wa_act; + } + + D_ASSERT(!d_list_empty(&tx->wt_redo)); + tx->wt_redo_act_pos = dav_action_get_next(tx->wt_redo_act_pos->wa_link); + if (&tx->wt_redo_act_pos->wa_link == &tx->wt_redo) { + tx->wt_redo_act_pos = NULL; + return NULL; + } + return &tx->wt_redo_act_pos->wa_act; +} + +struct umem_wal_tx_ops dav_wal_tx_ops = { + .wtx_act_nr = wal_tx_act_nr, + .wtx_payload_sz = wal_tx_payload_len, + .wtx_act_first = wal_tx_act_first, + .wtx_act_next = wal_tx_act_next, +}; + +static inline void * +dav_wal_replay_heap_off2ptr(dav_obj_t *dav_hdl, uint64_t off) +{ + uint32_t z_id = OFFSET_TO_ZID(off); + struct umem_cache_range rg = {0}; + int rc; + struct umem_store *store = dav_hdl->do_store; + + rg.cr_off = GET_ZONE_OFFSET(z_id); + rg.cr_size = ((store->stor_size - rg.cr_off) > ZONE_MAX_SIZE) + ? ZONE_MAX_SIZE + : (store->stor_size - rg.cr_off); + rc = umem_cache_load(store, &rg, 1, 0); + if (rc) { + D_ERROR("Failed to load pages to umem cache"); + errno = daos_der2errno(rc); + return NULL; + } + return umem_cache_off2ptr(store, off); +} + +int +dav_wal_replay_cb(uint64_t tx_id, struct umem_action *act, void *arg) +{ + void *src, *dst; + ptrdiff_t off; + uint64_t *p, mask; + daos_size_t size; + int pos, num, val; + int rc = 0; + dav_obj_t *dav_hdl = arg; + struct umem_store *store = dav_hdl->do_store; + + umem_cache_commit(store, tx_id); + switch (act->ac_opc) { + case UMEM_ACT_COPY: + D_DEBUG(DB_TRACE, + "ACT_COPY txid=%lu, (p,o)=%lu,%lu size=%lu\n", + tx_id, + act->ac_copy.addr / PAGESIZE, act->ac_copy.addr % PAGESIZE, + act->ac_copy.size); + off = act->ac_copy.addr; + src = (void *)&act->ac_copy.payload; + size = act->ac_copy.size; + dst = dav_wal_replay_heap_off2ptr(dav_hdl, off); + if (dst == NULL) { + rc = daos_errno2der(errno); + goto out; + } + memcpy(dst, src, size); + break; + case UMEM_ACT_ASSIGN: + D_DEBUG(DB_TRACE, + "ACT_ASSIGN txid=%lu, (p,o)=%lu,%lu size=%u\n", + tx_id, + act->ac_assign.addr / PAGESIZE, act->ac_assign.addr % PAGESIZE, + act->ac_assign.size); + off = act->ac_assign.addr; + dst = dav_wal_replay_heap_off2ptr(dav_hdl, off); + if (dst == NULL) { + rc = daos_errno2der(errno); + goto out; + } + size = act->ac_assign.size; + ASSERT_rt(size == 1 || size == 2 || size == 4); + src = &act->ac_assign.val; + memcpy(dst, src, size); + break; + case UMEM_ACT_SET: + D_DEBUG(DB_TRACE, + "ACT_SET txid=%lu, (p,o)=%lu,%lu size=%u val=%u\n", + tx_id, + act->ac_set.addr / PAGESIZE, act->ac_set.addr % PAGESIZE, + act->ac_set.size, act->ac_set.val); + off = act->ac_set.addr; + dst = dav_wal_replay_heap_off2ptr(dav_hdl, off); + if (dst == NULL) { + rc = daos_errno2der(errno); + goto out; + } + size = act->ac_set.size; + val = act->ac_set.val; + memset(dst, val, size); + break; + case UMEM_ACT_SET_BITS: + case UMEM_ACT_CLR_BITS: + D_DEBUG(DB_TRACE, + "ACT_CLR_BITS txid=%lu, (p,o)=%lu,%lu bit_pos=%u num_bits=%u\n", + tx_id, + act->ac_op_bits.addr / PAGESIZE, act->ac_op_bits.addr % PAGESIZE, + act->ac_op_bits.pos, act->ac_op_bits.num); + off = act->ac_op_bits.addr; + size = sizeof(uint64_t); + p = dav_wal_replay_heap_off2ptr(dav_hdl, off); + if (p == NULL) { + rc = daos_errno2der(errno); + goto out; + } + num = act->ac_op_bits.num; + pos = act->ac_op_bits.pos; + ASSERT_rt((pos >= 0) && (pos + num) <= 64); + mask = 
((1ULL << num) - 1) << pos; + if (act->ac_opc == UMEM_ACT_SET_BITS) + *p |= mask; + else + *p &= ~mask; + break; + default: + D_ASSERT(0); + break; + } + + if (rc == 0) + rc = umem_cache_touch(store, tx_id, off, size); + +out: + return rc; +} diff --git a/src/common/dav_v2/wal_tx.h b/src/common/dav_v2/wal_tx.h new file mode 100644 index 00000000000..1a7e06c2fed --- /dev/null +++ b/src/common/dav_v2/wal_tx.h @@ -0,0 +1,44 @@ +/** + * (C) Copyright 2021-2024 Intel Corporation. + * + * SPDX-License-Identifier: BSD-2-Clause-Patent + */ + +#ifndef __DAOS_COMMON_DAV_WAL_TX_ +#define __DAOS_COMMON_DAV_WAL_TX_ + +#include +#include +#include + +struct dav_obj; + +struct wal_action { + d_list_t wa_link; + struct umem_action wa_act; +}; + +struct dav_tx { + struct dav_obj *wt_dav_hdl; + d_list_t wt_redo; + uint32_t wt_redo_cnt; + uint32_t wt_redo_payload_len; + struct wal_action *wt_redo_act_pos; +}; +D_CASSERT(sizeof(struct dav_tx) <= UTX_PRIV_SIZE, + "Size of struct dav_tx is too big!"); + +#define dav_action_get_next(it) d_list_entry(it.next, struct wal_action, wa_link) + +struct umem_wal_tx *dav_umem_wtx_new(struct dav_obj *dav_hdl); +void dav_umem_wtx_cleanup(struct umem_wal_tx *utx); +int dav_wal_tx_reserve(struct dav_obj *hdl, uint64_t *id); +int dav_wal_tx_commit(struct dav_obj *hdl, struct umem_wal_tx *utx, void *data); +int dav_wal_tx_snap(void *hdl, void *addr, daos_size_t size, void *src, uint32_t flags); +int dav_wal_tx_assign(void *hdl, void *addr, uint64_t val); +int dav_wal_tx_clr_bits(void *hdl, void *addr, uint32_t pos, uint16_t num_bits); +int dav_wal_tx_set_bits(void *hdl, void *addr, uint32_t pos, uint16_t num_bits); +int dav_wal_tx_set(void *hdl, void *addr, char c, daos_size_t size); +int dav_wal_replay_cb(uint64_t tx_id, struct umem_action *act, void *arg); + +#endif /*__DAOS_COMMON_DAV_WAL_TX_*/ diff --git a/src/common/mem.c b/src/common/mem.c index 0ee9bcb07b2..beccab45266 100644 --- a/src/common/mem.c +++ b/src/common/mem.c @@ -17,7 +17,9 @@ #ifdef DAOS_PMEM_BUILD #include #include +#define DAV_V2_BUILD #include "dav/dav.h" +#include "dav_v2/dav_v2.h" #endif #define UMEM_TX_DATA_MAGIC (0xc01df00d) @@ -34,7 +36,8 @@ struct umem_tx_stage_item { #ifdef DAOS_PMEM_BUILD -static int daos_md_backend = DAOS_MD_PMEM; +static int daos_md_backend = DAOS_MD_PMEM; +static bool daos_disable_bmem_v2 = false; #define UMM_SLABS_CNT 16 /** Initializes global settings for the pmem objects. 
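
The UMEM_ACT_SET_BITS/UMEM_ACT_CLR_BITS branch of dav_wal_replay_cb() in wal_tx.c above reduces to a single 64-bit read-modify-write. A minimal standalone sketch of that mask arithmetic, assuming 0 < num < 64 and pos + num <= 64 (the replay path asserts the latter; the former is an added assumption to keep the shift well defined):

#include <stdint.h>

/*
 * Apply a SET_BITS/CLR_BITS style action to a single 64-bit word: build a
 * mask of `num` contiguous one-bits starting at `pos`, then set or clear
 * them, mirroring the mask arithmetic in dav_wal_replay_cb().
 */
static inline void
bits_apply(uint64_t *word, unsigned int pos, unsigned int num, int set)
{
	uint64_t mask = ((1ULL << num) - 1) << pos;	/* requires num < 64 */

	if (set)
		*word |= mask;	/* UMEM_ACT_SET_BITS */
	else
		*word &= ~mask;	/* UMEM_ACT_CLR_BITS */
}
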
@@ -49,6 +52,7 @@ umempobj_settings_init(bool md_on_ssd) int rc; enum pobj_arenas_assignment_type atype; unsigned int md_mode = DAOS_MD_BMEM; + unsigned int md_disable_bmem_v2 = 0; if (!md_on_ssd) { daos_md_backend = DAOS_MD_PMEM; @@ -70,22 +74,39 @@ umempobj_settings_init(bool md_on_ssd) case DAOS_MD_ADMEM: D_INFO("UMEM will use AD-hoc Memory as the metadata backend interface\n"); break; + case DAOS_MD_BMEM_V2: + D_INFO("UMEM will use Blob Backed Memory v2 as the metadata backend interface\n"); + break; default: D_ERROR("DAOS_MD_ON_SSD_MODE=%d envar invalid, use %d for BMEM or %d for ADMEM\n", md_mode, DAOS_MD_BMEM, DAOS_MD_ADMEM); return -DER_INVAL; }; + d_getenv_uint("DAOS_MD_DISABLE_BMEM_V2", &md_disable_bmem_v2); + if (md_disable_bmem_v2 && (md_mode != DAOS_MD_BMEM)) + D_INFO("Ignoring DAOS_MD_DISABLE_BMEM_V2 tunable"); + else + daos_disable_bmem_v2 = md_disable_bmem_v2; + daos_md_backend = md_mode; return 0; } -int umempobj_get_backend_type(void) +int +umempobj_get_backend_type(void) { return daos_md_backend; } -int umempobj_backend_type2class_id(int backend) +bool +umempobj_allow_md_bmem_v2() +{ + return !daos_disable_bmem_v2; +} + +int +umempobj_backend_type2class_id(int backend) { switch (backend) { case DAOS_MD_PMEM: @@ -94,6 +115,8 @@ int umempobj_backend_type2class_id(int backend) return UMEM_CLASS_BMEM; case DAOS_MD_ADMEM: return UMEM_CLASS_ADMEM; + case DAOS_MD_BMEM_V2: + return UMEM_CLASS_BMEM_V2; default: D_ASSERTF(0, "bad daos_md_backend %d\n", backend); @@ -101,6 +124,15 @@ int umempobj_backend_type2class_id(int backend) } } +size_t +umempobj_pgsz(int backend) +{ + if (backend == DAOS_MD_BMEM_V2) + return dav_obj_pgsz_v2(); + else + return (1UL << 12); +} + /** Define common slabs. We can refine this for 2.4 pools but that is for next patch */ static const int slab_map[] = { 0, /* 32 bytes */ @@ -161,6 +193,16 @@ set_slab_desc(struct umem_pool *ph_p, struct umem_slab_desc *slab) /* update with the new slab id */ slab->class_id = davslab.class_id; break; + case DAOS_MD_BMEM_V2: + davslab.unit_size = slab->unit_size; + davslab.alignment = 0; + davslab.units_per_block = 1000; + davslab.header_type = DAV_HEADER_NONE; + davslab.class_id = slab->class_id; + rc = dav_class_register_v2((dav_obj_t *)ph_p->up_priv, &davslab); + /* update with the new slab id */ + slab->class_id = davslab.class_id; + break; case DAOS_MD_ADMEM: /* NOOP for ADMEM now */ slab->class_id = class_id++; @@ -325,6 +367,15 @@ umempobj_create(const char *path, const char *layout_name, int flags, } umm_pool->up_priv = dav_hdl; break; + case DAOS_MD_BMEM_V2: + dav_hdl = dav_obj_create_v2(path, 0, poolsize, mode, &umm_pool->up_store); + if (!dav_hdl) { + D_ERROR("Failed to create pool %s, size="DF_U64": errno = %d\n", + path, poolsize, errno); + goto error; + } + umm_pool->up_priv = dav_hdl; + break; case DAOS_MD_ADMEM: rc = ad_blob_create(path, 0, store, &bh); if (rc) { @@ -408,6 +459,16 @@ umempobj_open(const char *path, const char *layout_name, int flags, struct umem_ goto error; } + umm_pool->up_priv = dav_hdl; + break; + case DAOS_MD_BMEM_V2: + dav_hdl = dav_obj_open_v2(path, 0, &umm_pool->up_store); + if (!dav_hdl) { + D_ERROR("Error in opening the pool %s: errno =%d\n", + path, errno); + goto error; + } + umm_pool->up_priv = dav_hdl; break; case DAOS_MD_ADMEM: @@ -452,6 +513,9 @@ umempobj_close(struct umem_pool *ph_p) case DAOS_MD_BMEM: dav_obj_close((dav_obj_t *)ph_p->up_priv); break; + case DAOS_MD_BMEM_V2: + dav_obj_close_v2((dav_obj_t *)ph_p->up_priv); + break; case DAOS_MD_ADMEM: bh.bh_blob = (struct 
ad_blob *)ph_p->up_priv; ad_blob_close(bh); @@ -491,6 +555,9 @@ umempobj_get_rootptr(struct umem_pool *ph_p, size_t size) case DAOS_MD_BMEM: off = dav_root((dav_obj_t *)ph_p->up_priv, size); return (char *)dav_get_base_ptr((dav_obj_t *)ph_p->up_priv) + off; + case DAOS_MD_BMEM_V2: + off = dav_root_v2((dav_obj_t *)ph_p->up_priv, size); + return (char *)umem_cache_off2ptr(&ph_p->up_store, off); case DAOS_MD_ADMEM: bh.bh_blob = (struct ad_blob *)ph_p->up_priv; return ad_root(bh, size); @@ -528,6 +595,11 @@ umempobj_get_heapusage(struct umem_pool *ph_p, daos_size_t *curr_allocated) if (rc == 0) *curr_allocated = st.curr_allocated; break; + case DAOS_MD_BMEM_V2: + rc = dav_get_heap_stats_v2((dav_obj_t *)ph_p->up_priv, &st); + if (rc == 0) + *curr_allocated = st.curr_allocated; + break; case DAOS_MD_ADMEM: *curr_allocated = 40960; /* TODO */ break; @@ -539,6 +611,46 @@ umempobj_get_heapusage(struct umem_pool *ph_p, daos_size_t *curr_allocated) return rc; } +/** Obtain the usage statistics for the memory bucket. Note that the usage + * statistics for an evictable memory bucket can be approximate value if + * memory bucket is not yet loaded on to the umem cache. + * + * \param pool[IN] Pointer to the persistent object. + * \param mb_id[IN] memory bucket id. + * \param curr_allocated[IN|OUT] Total bytes currently allocated + * \param maxsz[IN|OUT] Max size the memory bucket can grow. + * + * \return zero on success and non-zero on failure. + */ +int +umempobj_get_mbusage(struct umem_pool *ph_p, uint32_t mb_id, daos_size_t *curr_allocated, + daos_size_t *maxsz) +{ + struct dav_heap_mb_stats st; + int rc = 0; + + switch (ph_p->up_store.store_type) { + case DAOS_MD_PMEM: + case DAOS_MD_BMEM: + case DAOS_MD_ADMEM: + rc = -DER_INVAL; + break; + case DAOS_MD_BMEM_V2: + rc = dav_get_heap_mb_stats_v2((dav_obj_t *)ph_p->up_priv, mb_id, &st); + if (rc == 0) { + *curr_allocated = st.dhms_allocated; + *maxsz = st.dhms_maxsz; + } else + rc = daos_errno2der(errno); + break; + default: + D_ASSERTF(0, "bad daos_md_backend %d\n", ph_p->up_store.store_type); + break; + } + + return rc; +} + /** Log fragmentation related info for the pool. * * \param pool[IN] Pointer to the persistent object. 
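
umempobj_get_mbusage() added above only returns real statistics for the DAOS_MD_BMEM_V2 backend, and its doc comment notes the numbers can be approximate for an evictable bucket that is not resident in the umem cache. A hedged sketch of a hypothetical caller; the pool handle, bucket id, and header paths are assumptions rather than anything defined by this patch:

#include <daos/debug.h>	/* assumed header for D_INFO/DF_U64 */
#include <daos/mem.h>	/* assumed header for the umempobj API */

/*
 * Hypothetical caller: report usage for one memory bucket. `pool` and `mb_id`
 * are assumed to be obtained elsewhere; backends other than DAOS_MD_BMEM_V2
 * return -DER_INVAL from umempobj_get_mbusage().
 */
static int
report_mb_usage(struct umem_pool *pool, uint32_t mb_id)
{
	daos_size_t used = 0;
	daos_size_t maxsz = 0;
	int         rc;

	rc = umempobj_get_mbusage(pool, mb_id, &used, &maxsz);
	if (rc != 0)
		return rc;

	D_INFO("MB %u: "DF_U64" of "DF_U64" bytes allocated\n", mb_id, used, maxsz);
	return 0;
}
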
@@ -567,6 +679,12 @@ umempobj_log_fraginfo(struct umem_pool *ph_p) DF_U64", run_active: "DF_U64"\n", st.run_allocated, st.run_active); break; + case DAOS_MD_BMEM_V2: + dav_get_heap_stats_v2((dav_obj_t *)ph_p->up_priv, &st); + D_ERROR("Fragmentation info, run_allocated: " + DF_U64", run_active: "DF_U64"\n", + st.run_allocated, st.run_active); + break; case DAOS_MD_ADMEM: /* TODO */ D_ERROR("Fragmentation info, not implemented in ADMEM yet.\n"); @@ -658,7 +776,8 @@ pmem_tx_free(struct umem_instance *umm, umem_off_t umoff) } static umem_off_t -pmem_tx_alloc(struct umem_instance *umm, size_t size, uint64_t flags, unsigned int type_num) +pmem_tx_alloc(struct umem_instance *umm, size_t size, uint64_t flags, unsigned int type_num, + unsigned int unused) { uint64_t pflags = 0; @@ -866,7 +985,8 @@ pmem_tx_stage(void) } static umem_off_t -pmem_reserve(struct umem_instance *umm, void *act, size_t size, unsigned int type_num) +pmem_reserve(struct umem_instance *umm, void *act, size_t size, unsigned int type_num, + unsigned int unused) { PMEMobjpool *pop = (PMEMobjpool *)umm->umm_pool->up_priv; @@ -900,8 +1020,8 @@ pmem_atomic_copy(struct umem_instance *umm, void *dest, const void *src, } static umem_off_t -pmem_atomic_alloc(struct umem_instance *umm, size_t size, - unsigned int type_num) +pmem_atomic_alloc(struct umem_instance *umm, size_t size, unsigned int type_num, + unsigned int unused) { PMEMoid oid; PMEMobjpool *pop = (PMEMobjpool *)umm->umm_pool->up_priv; @@ -1049,7 +1169,8 @@ bmem_tx_free(struct umem_instance *umm, umem_off_t umoff) } static umem_off_t -bmem_tx_alloc(struct umem_instance *umm, size_t size, uint64_t flags, unsigned int type_num) +bmem_tx_alloc(struct umem_instance *umm, size_t size, uint64_t flags, unsigned int type_num, + unsigned int mbkt_id) { uint64_t pflags = 0; @@ -1162,7 +1283,8 @@ bmem_defer_free(struct umem_instance *umm, umem_off_t off, void *act) } static umem_off_t -bmem_reserve(struct umem_instance *umm, void *act, size_t size, unsigned int type_num) +bmem_reserve(struct umem_instance *umm, void *act, size_t size, unsigned int type_num, + unsigned int mbkt_id) { dav_obj_t *pop = (dav_obj_t *)umm->umm_pool->up_priv; @@ -1201,8 +1323,8 @@ bmem_atomic_copy(struct umem_instance *umm, void *dest, const void *src, } static umem_off_t -bmem_atomic_alloc(struct umem_instance *umm, size_t size, - unsigned int type_num) +bmem_atomic_alloc(struct umem_instance *umm, size_t size, unsigned int type_num, + unsigned int mbkt_id) { uint64_t off; dav_obj_t *pop = (dav_obj_t *)umm->umm_pool->up_priv; @@ -1255,6 +1377,255 @@ static umem_ops_t bmem_ops = { .mo_tx_add_callback = umem_tx_add_cb, }; +/** BMEM v2 operations (depends on dav) */ + +static int +bmem_tx_free_v2(struct umem_instance *umm, umem_off_t umoff) +{ + /* + * This free call could be on error cleanup code path where + * the transaction is already aborted due to previous failed + * pmemobj_tx call. Let's just skip it in this case. + * + * The reason we don't fix caller to avoid calling tx_free() + * in an aborted transaction is that the caller code could be + * shared by both transactional and non-transactional (where + * UMEM_CLASS_VMEM is used, see btree code) interfaces, and + * the explicit umem_free() on error cleanup is necessary for + * non-transactional case. + */ + if (dav_tx_stage_v2() == DAV_TX_STAGE_ONABORT) + return 0; + + if (!UMOFF_IS_NULL(umoff)) { + int rc; + + rc = dav_tx_free_v2(umem_off2offset(umoff)); + return rc ? 
umem_tx_errno(rc) : 0; + } + + return 0; +} + +static umem_off_t +bmem_tx_alloc_v2(struct umem_instance *umm, size_t size, uint64_t flags, unsigned int type_num, + unsigned int mbkt_id) +{ + uint64_t pflags = 0; + + get_slab(umm, &pflags, &size); + + if (flags & UMEM_FLAG_ZERO) + pflags |= DAV_FLAG_ZERO; + if (flags & UMEM_FLAG_NO_FLUSH) + pflags |= DAV_FLAG_NO_FLUSH; + if (mbkt_id != 0) + pflags |= DAV_EZONE_ID(mbkt_id); + return dav_tx_alloc_v2(size, type_num, pflags); +} + +static int +bmem_tx_add_v2(struct umem_instance *umm, umem_off_t umoff, + uint64_t offset, size_t size) +{ + int rc; + + rc = dav_tx_add_range_v2(umem_off2offset(umoff), size); + return rc ? umem_tx_errno(rc) : 0; +} + +static int +bmem_tx_xadd_v2(struct umem_instance *umm, umem_off_t umoff, uint64_t offset, + size_t size, uint64_t flags) +{ + int rc; + uint64_t pflags = 0; + + if (flags & UMEM_XADD_NO_SNAPSHOT) + pflags |= DAV_XADD_NO_SNAPSHOT; + + rc = dav_tx_xadd_range_v2(umem_off2offset(umoff), size, pflags); + return rc ? umem_tx_errno(rc) : 0; +} + + +static int +bmem_tx_add_ptr_v2(struct umem_instance *umm, void *ptr, size_t size) +{ + int rc; + + rc = dav_tx_add_range_direct_v2(ptr, size); + return rc ? umem_tx_errno(rc) : 0; +} + +static int +bmem_tx_abort_v2(struct umem_instance *umm, int err) +{ + /* + * obj_tx_abort() may have already been called in the error + * handling code of pmemobj APIs. + */ + if (dav_tx_stage_v2() != DAV_TX_STAGE_ONABORT) + dav_tx_abort_v2(err); + + err = dav_tx_end_v2(NULL); + return err ? umem_tx_errno(err) : 0; +} + +static int +bmem_tx_begin_v2(struct umem_instance *umm, struct umem_tx_stage_data *txd) +{ + int rc; + dav_obj_t *pop = (dav_obj_t *)umm->umm_pool->up_priv; + + if (txd != NULL) { + D_ASSERT(txd->txd_magic == UMEM_TX_DATA_MAGIC); + rc = dav_tx_begin_v2(pop, NULL, DAV_TX_PARAM_CB, pmem_stage_callback, + txd, DAV_TX_PARAM_NONE); + } else { + rc = dav_tx_begin_v2(pop, NULL, DAV_TX_PARAM_NONE); + } + + if (rc != 0) { + /* + * dav_tx_end() needs be called to re-initialize the + * tx state when dav_tx_begin() failed. + */ + rc = dav_tx_end_v2(NULL); + return rc ? umem_tx_errno(rc) : 0; + } + return 0; +} + +static int +bmem_tx_commit_v2(struct umem_instance *umm, void *data) +{ + int rc; + + dav_tx_commit_v2(); + rc = dav_tx_end_v2(data); + + return rc ? umem_tx_errno(rc) : 0; +} + +static int +bmem_tx_stage_v2(void) +{ + return dav_tx_stage_v2(); +} + +static void +bmem_defer_free_v2(struct umem_instance *umm, umem_off_t off, void *act) +{ + dav_obj_t *pop = (dav_obj_t *)umm->umm_pool->up_priv; + + dav_defer_free_v2(pop, umem_off2offset(off), + (struct dav_action *)act); +} + +static umem_off_t +bmem_reserve_v2(struct umem_instance *umm, void *act, size_t size, unsigned int type_num, + unsigned int mbkt_id) +{ + dav_obj_t *pop = (dav_obj_t *)umm->umm_pool->up_priv; + uint64_t flags = DAV_EZONE_ID(mbkt_id); + + return dav_reserve_v2(pop, (struct dav_action *)act, size, type_num, flags); +} + +static void +bmem_cancel_v2(struct umem_instance *umm, void *actv, int actv_cnt) +{ + dav_obj_t *pop = (dav_obj_t *)umm->umm_pool->up_priv; + + dav_cancel_v2(pop, (struct dav_action *)actv, actv_cnt); +} + +static int +bmem_tx_publish_v2(struct umem_instance *umm, void *actv, int actv_cnt) +{ + int rc; + + rc = dav_tx_publish_v2((struct dav_action *)actv, actv_cnt); + return rc ? 
umem_tx_errno(rc) : 0; +} + +static void * +bmem_atomic_copy_v2(struct umem_instance *umm, void *dest, const void *src, + size_t len, enum acopy_hint hint) +{ + dav_obj_t *pop = (dav_obj_t *)umm->umm_pool->up_priv; + + if (hint == UMEM_RESERVED_MEM) { + memcpy(dest, src, len); + return dest; + } else { /* UMEM_COMMIT_IMMEDIATE */ + return dav_memcpy_persist_v2(pop, dest, src, len); + } +} + +static umem_off_t +bmem_atomic_alloc_v2(struct umem_instance *umm, size_t size, unsigned int type_num, + unsigned int mbkt_id) +{ + uint64_t off; + dav_obj_t *pop = (dav_obj_t *)umm->umm_pool->up_priv; + int rc; + uint64_t flags = DAV_EZONE_ID(mbkt_id); + + rc = dav_alloc_v2(pop, &off, size, type_num, flags, NULL, NULL); + if (rc) + return UMOFF_NULL; + return off; +} + +static int +bmem_atomic_free_v2(struct umem_instance *umm, umem_off_t umoff) +{ + if (!UMOFF_IS_NULL(umoff)) { + uint64_t off = umem_off2offset(umoff); + + dav_free_v2((dav_obj_t *)umm->umm_pool->up_priv, off); + } + return 0; +} + +static void +bmem_atomic_flush_v2(struct umem_instance *umm, void *addr, size_t len) +{ + /* NOP */ +} + +static uint32_t +bmem_allot_mb_evictable_v2(struct umem_instance *umm, int flags) +{ + dav_obj_t *pop = (dav_obj_t *)umm->umm_pool->up_priv; + + return dav_allot_mb_evictable_v2(pop, flags); +} + +static umem_ops_t bmem_v2_ops = { + .mo_tx_free = bmem_tx_free_v2, + .mo_tx_alloc = bmem_tx_alloc_v2, + .mo_tx_add = bmem_tx_add_v2, + .mo_tx_xadd = bmem_tx_xadd_v2, + .mo_tx_add_ptr = bmem_tx_add_ptr_v2, + .mo_tx_abort = bmem_tx_abort_v2, + .mo_tx_begin = bmem_tx_begin_v2, + .mo_tx_commit = bmem_tx_commit_v2, + .mo_tx_stage = bmem_tx_stage_v2, + .mo_reserve = bmem_reserve_v2, + .mo_defer_free = bmem_defer_free_v2, + .mo_cancel = bmem_cancel_v2, + .mo_tx_publish = bmem_tx_publish_v2, + .mo_atomic_copy = bmem_atomic_copy_v2, + .mo_atomic_alloc = bmem_atomic_alloc_v2, + .mo_atomic_free = bmem_atomic_free_v2, + .mo_atomic_flush = bmem_atomic_flush_v2, + .mo_allot_evictable_mb = bmem_allot_mb_evictable_v2, + .mo_tx_add_callback = umem_tx_add_cb, +}; + int umem_tx_errno(int err) { @@ -1283,7 +1654,8 @@ vmem_free(struct umem_instance *umm, umem_off_t umoff) } umem_off_t -vmem_alloc(struct umem_instance *umm, size_t size, uint64_t flags, unsigned int type_num) +vmem_alloc(struct umem_instance *umm, size_t size, uint64_t flags, unsigned int type_num, + unsigned int unused) { return (uint64_t)((flags & UMEM_FLAG_ZERO) ? 
calloc(1, size) : malloc(size)); @@ -1343,6 +1715,11 @@ static struct umem_class umem_class_defined[] = { .umc_ops = &bmem_ops, .umc_name = "bmem", }, + { + .umc_id = UMEM_CLASS_BMEM_V2, + .umc_ops = &bmem_v2_ops, + .umc_name = "bmem_v2", + }, { .umc_id = UMEM_CLASS_ADMEM, .umc_ops = &ad_mem_ops, @@ -1392,6 +1769,11 @@ set_offsets(struct umem_instance *umm) umm->umm_base = (uint64_t)dav_get_base_ptr(dav_pop); break; + case UMEM_CLASS_BMEM_V2: + dav_pop = (dav_obj_t *)umm->umm_pool->up_priv; + + umm->umm_base = (uint64_t)dav_get_base_ptr_v2(dav_pop); + break; case UMEM_CLASS_ADMEM: bh.bh_blob = (struct ad_blob *)umm->umm_pool->up_priv; umm->umm_base = (uint64_t)ad_base(bh); @@ -1537,6 +1919,7 @@ umem_rsrvd_item_size(struct umem_instance *umm) case UMEM_CLASS_ADMEM: return sizeof(struct ad_reserv_act); case UMEM_CLASS_BMEM: + case UMEM_CLASS_BMEM_V2: return sizeof(struct dav_action); default: D_ERROR("bad umm_id %d\n", umm->umm_id); @@ -1601,8 +1984,8 @@ umem_rsrvd_act_free(struct umem_rsrvd_act **rsrvd_act) } umem_off_t -umem_reserve(struct umem_instance *umm, struct umem_rsrvd_act *rsrvd_act, - size_t size) +umem_reserve_common(struct umem_instance *umm, struct umem_rsrvd_act *rsrvd_act, size_t size, + unsigned int mbkt_id) { if (umm->umm_ops->mo_reserve) { void *act; @@ -1613,8 +1996,7 @@ umem_reserve(struct umem_instance *umm, struct umem_rsrvd_act *rsrvd_act, D_ASSERT(rsrvd_act->rs_actv_cnt > rsrvd_act->rs_actv_at); act = rsrvd_act->rs_actv + act_size * rsrvd_act->rs_actv_at; - off = umm->umm_ops->mo_reserve(umm, act, size, - UMEM_TYPE_ANY); + off = umm->umm_ops->mo_reserve(umm, act, size, UMEM_TYPE_ANY, mbkt_id); if (!UMOFF_IS_NULL(off)) rsrvd_act->rs_actv_at++; D_ASSERTF(umem_off2flags(off) == 0, @@ -1680,12 +2062,18 @@ umem_tx_publish(struct umem_instance *umm, struct umem_rsrvd_act *rsrvd_act) return rc; } +/* Memory page */ struct umem_page_info { - /** Back pointer to page */ - struct umem_page *pi_page; + /** Mapped MD page ID */ + uint32_t pi_pg_id; + /** Reference count */ + uint32_t pi_ref; /** Page flags */ - uint64_t pi_waiting : 1, /** Page is copied, but waiting for commit */ - pi_copying : 1; /** Page is being copied. Blocks writes. */ + uint64_t pi_io : 1, /** Page is being flushed/loaded to/from MD-blob */ + pi_copying : 1, /** Page is being copied. Blocks writes. */ + pi_mapped : 1, /** Page is mapped to a MD page */ + pi_sys : 1, /** Page is brought to cache by system internal access */ + pi_loaded : 1; /** Page is loaded */ /** Highest transaction ID checkpointed. This is set before the page is copied. The * checkpoint will not be executed until the last committed ID is greater than or * equal to this value. 
If that's not the case immediately, the waiting flag is set @@ -1694,243 +2082,593 @@ struct umem_page_info { uint64_t pi_last_checkpoint; /** Highest transaction ID of writes to the page */ uint64_t pi_last_inflight; - /** link chain on global dirty list, LRU list, or free info list */ - d_list_t pi_link; + /** link to global LRU lists, or global free page list, or global pinned list */ + d_list_t pi_lru_link; + /** link to global dirty page list, or wait commit list, or temporary list for flushing */ + d_list_t pi_dirty_link; + /** link to global flushing page list */ + d_list_t pi_flush_link; + /** Waitqueue for page loading/flushing */ + void *pi_io_wq; + /** Waitqueue for page committing */ + void *pi_commit_wq; /** page memory address */ uint8_t *pi_addr; /** Information about in-flight checkpoint */ void *pi_chkpt_data; - /** bitmap for each dirty 16K unit */ - uint64_t pi_bmap[UMEM_CACHE_BMAP_SZ]; + /** bitmap for each dirty 4K unit */ + uint64_t *pi_bmap; }; -int -umem_cache_alloc(struct umem_store *store, uint64_t max_mapped) +/* Convert page ID to MD-blob offset */ +static inline umem_off_t +cache_id2off(struct umem_cache *cache, uint32_t pg_id) { - struct umem_cache *cache; - struct umem_page_info *pinfo; - uint64_t num_pages; - int rc = 0; - int idx; + return ((umem_off_t)pg_id << cache->ca_page_shift) + cache->ca_base_off; +} - D_ASSERT(store != NULL); +/* Convert MD-blob offset to page ID */ +static inline uint32_t +cache_off2id(struct umem_cache *cache, umem_off_t offset) +{ + D_ASSERT(offset >= cache->ca_base_off); + return (offset - cache->ca_base_off) >> cache->ca_page_shift; +} - num_pages = (store->stor_size + UMEM_CACHE_PAGE_SZ - 1) >> UMEM_CACHE_PAGE_SZ_SHIFT; +/* Convert MD-blob offset to MD page */ +static inline struct umem_page * +cache_off2page(struct umem_cache *cache, umem_off_t offset) +{ + uint32_t idx = cache_off2id(cache, offset); - if (max_mapped != 0) { - D_ERROR("Setting max_mapped is unsupported at present\n"); - return -DER_NOTSUPPORTED; - } + D_ASSERTF(idx < cache->ca_md_pages, "offset=" DF_U64 ", md_pages=%u, idx=%u\n", + offset, cache->ca_md_pages, idx); - max_mapped = num_pages; + return &cache->ca_pages[idx]; +} - D_ALLOC(cache, sizeof(*cache) + sizeof(cache->ca_pages[0]) * num_pages + - sizeof(cache->ca_pages[0].pg_info[0]) * max_mapped); - if (cache == NULL) - D_GOTO(error, rc = -DER_NOMEM); +/* Convert memory pointer to memory page */ +static inline struct umem_page_info * +cache_ptr2pinfo(struct umem_cache *cache, const void *ptr) +{ + struct umem_page_info *pinfo; + uint32_t idx; - D_DEBUG(DB_IO, - "Allocated page cache for stor->stor_size=" DF_U64 ", " DF_U64 " pages at %p\n", - store->stor_size, num_pages, cache); + D_ASSERT(ptr >= cache->ca_base); + idx = (ptr - cache->ca_base) >> cache->ca_page_shift; - cache->ca_store = store; - cache->ca_num_pages = num_pages; - cache->ca_max_mapped = num_pages; + D_ASSERTF(idx < cache->ca_mem_pages, "ptr=%p, md_pages=%u, idx=%u\n", + ptr, cache->ca_mem_pages, idx); + pinfo = (struct umem_page_info *)&cache->ca_pages[cache->ca_md_pages]; - D_INIT_LIST_HEAD(&cache->ca_pgs_dirty); - D_INIT_LIST_HEAD(&cache->ca_pgs_copying); - D_INIT_LIST_HEAD(&cache->ca_pgs_lru); - D_INIT_LIST_HEAD(&cache->ca_pi_free); + return &pinfo[idx]; +} - for (idx = 0; idx < num_pages; idx++) - cache->ca_pages[idx].pg_id = idx; +/* Convert MD-blob offset to page offset */ +static inline uint32_t +cache_off2pg_off(struct umem_cache *cache, umem_off_t offset) +{ + D_ASSERT(offset >= cache->ca_base_off); + return (offset - 
cache->ca_base_off) & cache->ca_page_mask; +} - pinfo = (struct umem_page_info *)&cache->ca_pages[idx]; +bool +umem_cache_offisloaded(struct umem_store *store, umem_off_t offset) +{ + struct umem_cache *cache = store->cache; + struct umem_page *page = cache_off2page(cache, offset); - for (idx = 0; idx < max_mapped; idx++) { - d_list_add_tail(&pinfo->pi_link, &cache->ca_pi_free); - pinfo++; - } + return ((page->pg_info != NULL) && page->pg_info->pi_loaded); +} - store->cache = cache; +/* Convert MD-blob offset to memory pointer */ +void * +umem_cache_off2ptr(struct umem_store *store, umem_off_t offset) +{ + struct umem_cache *cache = store->cache; + struct umem_page *page = cache_off2page(cache, offset); - return 0; + /* The page must be mapped */ + D_ASSERT(page->pg_info != NULL); + return (void *)(page->pg_info->pi_addr + cache_off2pg_off(cache, offset)); +} -error: - D_FREE(cache); - return rc; +/* Convert memory pointer to MD-blob offset */ +umem_off_t +umem_cache_ptr2off(struct umem_store *store, const void *ptr) +{ + struct umem_cache *cache = store->cache; + struct umem_page_info *pinfo = cache_ptr2pinfo(cache, ptr); + umem_off_t offset; + + /* The page must be mapped */ + D_ASSERT(pinfo->pi_mapped); + offset = cache_id2off(cache, pinfo->pi_pg_id); + offset += (ptr - cache->ca_base) & cache->ca_page_mask; + + return offset; } -int -umem_cache_free(struct umem_store *store) +static int +page_waitqueue_create(struct umem_cache *cache, struct umem_page_info *pinfo) { - /** XXX: check reference counts? */ - D_FREE(store->cache); + struct umem_store *store = cache->ca_store; + int rc; + + D_ASSERT(store->stor_ops->so_waitqueue_create != NULL); + if (pinfo->pi_io_wq == NULL) { + rc = store->stor_ops->so_waitqueue_create(&pinfo->pi_io_wq); + if (rc) + return rc; + } + if (pinfo->pi_commit_wq == NULL) { + rc = store->stor_ops->so_waitqueue_create(&pinfo->pi_commit_wq); + if (rc) + return rc; + } + return 0; } -int -umem_cache_check(struct umem_store *store, uint64_t num_pages) +static void +page_waitqueue_destroy(struct umem_cache *cache, struct umem_page_info *pinfo) { - struct umem_cache *cache = store->cache; + struct umem_store *store = cache->ca_store; - D_ASSERT(num_pages + cache->ca_mapped <= cache->ca_num_pages); + if (pinfo->pi_io_wq != NULL) { + store->stor_ops->so_waitqueue_destroy(pinfo->pi_io_wq); + pinfo->pi_io_wq = NULL; + } + if (pinfo->pi_commit_wq != NULL) { + store->stor_ops->so_waitqueue_destroy(pinfo->pi_commit_wq); + pinfo->pi_commit_wq = NULL; + } +} - if (num_pages > cache->ca_max_mapped - cache->ca_mapped) - return num_pages - (cache->ca_max_mapped - cache->ca_mapped); +static inline void +verify_inactive_page(struct umem_page_info *pinfo) +{ + D_ASSERT(d_list_empty(&pinfo->pi_flush_link)); + D_ASSERT(pinfo->pi_ref == 0); + D_ASSERT(pinfo->pi_io == 0); + D_ASSERT(pinfo->pi_copying == 0); +} - return 0; +static inline void +verify_clean_page(struct umem_page_info *pinfo, int mapped) +{ + D_ASSERT(d_list_empty(&pinfo->pi_lru_link)); + D_ASSERT(d_list_empty(&pinfo->pi_dirty_link)); + D_ASSERT(pinfo->pi_mapped == mapped); + verify_inactive_page(pinfo); } int -umem_cache_evict(struct umem_store *store, uint64_t num_pages) +umem_cache_free(struct umem_store *store) { - /** XXX: Not yet implemented */ + struct umem_cache *cache = store->cache; + struct umem_page_info *pinfo; + int i; + + if (cache == NULL) + return 0; + + D_ASSERT(d_list_empty(&cache->ca_pgs_flushing)); + D_ASSERT(d_list_empty(&cache->ca_pgs_wait_commit)); + D_ASSERT(d_list_empty(&cache->ca_pgs_pinned)); + 
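
The umem_cache_off2ptr()/umem_cache_ptr2off() helpers above are plain shift-and-mask arithmetic over the cache geometry. A minimal standalone sketch using a simplified stand-in for the cache structure (field names mirror the patch, everything else is omitted), assuming the 1:1 page mapping of phase-1 mode:

#include <stdint.h>

/* Simplified stand-in for the cache geometry fields used by the helpers;
 * the real struct umem_cache also tracks per-page mappings, stats and lists.
 */
struct toy_cache {
	char        *ca_base;		/* start of the in-memory page area */
	uint64_t     ca_base_off;	/* MD-blob offset corresponding to page 0 */
	unsigned int ca_page_shift;	/* log2(page size) */
	uint64_t     ca_page_mask;	/* page size - 1 */
};

/* MD-blob offset -> page index (mirrors cache_off2id()) */
static inline uint32_t
toy_off2id(const struct toy_cache *c, uint64_t off)
{
	return (uint32_t)((off - c->ca_base_off) >> c->ca_page_shift);
}

/* MD-blob offset -> pointer, assuming the page holding `off` is mapped at the
 * same index in memory (true in phase-1 mode where MD pages are mapped 1:1).
 */
static inline void *
toy_off2ptr(const struct toy_cache *c, uint64_t off)
{
	return c->ca_base + ((uint64_t)toy_off2id(c, off) << c->ca_page_shift) +
	       ((off - c->ca_base_off) & c->ca_page_mask);
}
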
D_ASSERT(cache->ca_pgs_stats[UMEM_PG_STATS_PINNED] == 0); + D_ASSERT(cache->ca_reserve_waiters == 0); + + pinfo = (struct umem_page_info *)&cache->ca_pages[cache->ca_md_pages]; + for (i = 0; i < cache->ca_mem_pages; i++) { + verify_inactive_page(pinfo); + + page_waitqueue_destroy(store->cache, pinfo); + pinfo++; + } + + if (cache->ca_reserve_wq != NULL) { + store->stor_ops->so_waitqueue_destroy(cache->ca_reserve_wq); + cache->ca_reserve_wq = NULL; + + } + + D_FREE(store->cache); return 0; } -int -umem_cache_map_range(struct umem_store *store, umem_off_t offset, void *start_addr, - uint64_t num_pages) +/* 1: phase I mode; 2: phase II mode; */ +static inline unsigned int +cache_mode(struct umem_cache *cache) { - struct umem_cache *cache = store->cache; - struct umem_page *page; - struct umem_page_info *pinfo; - struct umem_page *end_page; - uint64_t current_addr = (uint64_t)start_addr; + return cache->ca_mode; +} - if (store->cache == NULL) - return 0; /* TODO: When SMD is supported outside VOS, this will be an error */ +static inline struct umem_page_info * +cache_pop_free_page(struct umem_cache *cache) +{ + struct umem_page_info *pinfo; - page = umem_cache_off2page(cache, offset); - end_page = page + num_pages; + pinfo = d_list_pop_entry(&cache->ca_pgs_free, struct umem_page_info, pi_lru_link); + if (pinfo != NULL) { + D_ASSERT(cache->ca_pgs_stats[UMEM_PG_STATS_FREE] > 0); + cache->ca_pgs_stats[UMEM_PG_STATS_FREE] -= 1; + } + return pinfo; +} + +#define UMEM_CHUNK_IDX_SHIFT 6 +#define UMEM_CHUNK_IDX_BITS (1 << UMEM_CHUNK_IDX_SHIFT) +#define UMEM_CHUNK_IDX_MASK (UMEM_CHUNK_IDX_BITS - 1) - D_ASSERTF(page->pg_id + num_pages <= cache->ca_num_pages, - "pg_id=%d, num_pages=" DF_U64 ", cache pages=" DF_U64 "\n", page->pg_id, - num_pages, cache->ca_num_pages); +#define UMEM_CACHE_PAGE_SHIFT_MAX 27 /* 128MB */ +#define UMEM_CACHE_BMAP_SZ_MAX (1 << (UMEM_CACHE_PAGE_SHIFT_MAX - \ + UMEM_CACHE_CHUNK_SZ_SHIFT - UMEM_CHUNK_IDX_SHIFT)) +#define UMEM_CACHE_RSRVD_PAGES 4 - while (page != end_page) { - D_ASSERT(page->pg_info == NULL); +int +umem_cache_alloc(struct umem_store *store, uint32_t page_sz, uint32_t md_pgs, uint32_t mem_pgs, + uint32_t max_ne_pgs, uint32_t base_off, void *base, + bool (*is_evictable_fn)(void *arg, uint32_t pg_id), + int (*evtcb_fn)(int evt_type, void *arg, uint32_t pg_id), void *fn_arg) +{ + struct umem_cache *cache; + struct umem_page_info *pinfo; + struct umem_page *page; + unsigned int page_shift, bmap_sz; + uint64_t *bmap; + void *cur_addr = base; + int idx, cmode = 1, rc = 0; + + D_ASSERT(store != NULL); + D_ASSERT(base != NULL); + + page_shift = __builtin_ctz(page_sz); + if (page_sz != (1 << page_shift)) { + D_ERROR("Page size (%u) isn't aligned.\n", page_sz); + return -DER_INVAL; + } else if (page_shift > UMEM_CACHE_PAGE_SHIFT_MAX) { + D_ERROR("Page size (%u) > Max page size (%u).\n", + page_sz, 1 << UMEM_CACHE_PAGE_SHIFT_MAX); + return -DER_INVAL; + } else if (page_shift <= (UMEM_CACHE_CHUNK_SZ_SHIFT + UMEM_CHUNK_IDX_SHIFT)) { + D_ERROR("Page size (%u) <= Min page size (%u)\n", + page_sz, 1 << (UMEM_CACHE_CHUNK_SZ_SHIFT + UMEM_CHUNK_IDX_SHIFT)); + return -DER_INVAL; + } + + D_ASSERT(md_pgs > 0 && md_pgs >= mem_pgs); + if (mem_pgs == 0) { /* Phase 1 mode */ + mem_pgs = md_pgs; + max_ne_pgs = md_pgs; + } else + cmode = 2; + + bmap_sz = (1 << (page_shift - UMEM_CACHE_CHUNK_SZ_SHIFT - UMEM_CHUNK_IDX_SHIFT)); + + D_ALLOC(cache, sizeof(*cache) + sizeof(cache->ca_pages[0]) * md_pgs + + sizeof(cache->ca_pages[0].pg_info[0]) * mem_pgs + + bmap_sz * sizeof(uint64_t) * mem_pgs); + if 
(cache == NULL) + return -DER_NOMEM; + + D_DEBUG(DB_IO, "Allocated page cache, md-pages(%u), mem-pages(%u), max-ne-pages(%u) %p\n", + md_pgs, mem_pgs, max_ne_pgs, cache); + + cache->ca_store = store; + cache->ca_base = base; + cache->ca_base_off = base_off; + cache->ca_md_pages = md_pgs; + cache->ca_mem_pages = mem_pgs; + cache->ca_max_ne_pages = max_ne_pgs; + cache->ca_page_sz = page_sz; + cache->ca_page_shift = page_shift; + cache->ca_page_mask = page_sz - 1; + cache->ca_bmap_sz = bmap_sz; + cache->ca_evictable_fn = is_evictable_fn; + cache->ca_evtcb_fn = evtcb_fn; + cache->ca_fn_arg = fn_arg; + cache->ca_mode = cmode; + + D_INIT_LIST_HEAD(&cache->ca_pgs_free); + D_INIT_LIST_HEAD(&cache->ca_pgs_dirty); + D_INIT_LIST_HEAD(&cache->ca_pgs_lru[0]); + D_INIT_LIST_HEAD(&cache->ca_pgs_lru[1]); + D_INIT_LIST_HEAD(&cache->ca_pgs_flushing); + D_INIT_LIST_HEAD(&cache->ca_pgs_wait_commit); + D_INIT_LIST_HEAD(&cache->ca_pgs_pinned); + + pinfo = (struct umem_page_info *)&cache->ca_pages[md_pgs]; + bmap = (uint64_t *)&pinfo[mem_pgs]; + + /* Initialize memory page array */ + for (idx = 0; idx < mem_pgs; idx++) { + pinfo->pi_bmap = bmap; + pinfo->pi_addr = (void *)cur_addr; + D_INIT_LIST_HEAD(&pinfo->pi_dirty_link); + D_INIT_LIST_HEAD(&pinfo->pi_flush_link); + d_list_add_tail(&pinfo->pi_lru_link, &cache->ca_pgs_free); + cache->ca_pgs_stats[UMEM_PG_STATS_FREE] += 1; + + pinfo++; + bmap += bmap_sz; + cur_addr += page_sz; + } + store->cache = cache; + + /* Phase 2 mode */ + if (cache_mode(cache) != 1) { + D_ASSERT(store->stor_ops->so_waitqueue_create != NULL); + rc = store->stor_ops->so_waitqueue_create(&cache->ca_reserve_wq); + if (rc) + goto error; + + pinfo = (struct umem_page_info *)&cache->ca_pages[cache->ca_md_pages]; + for (idx = 0; idx < cache->ca_mem_pages; idx++) { + rc = page_waitqueue_create(cache, pinfo); + if (rc) + goto error; + pinfo++; + } + return 0; + } - pinfo = d_list_pop_entry(&cache->ca_pi_free, struct umem_page_info, pi_link); + /* Map all MD pages to memory pages for phase 1 mode */ + for (idx = 0; idx < md_pgs; idx++) { + pinfo = cache_pop_free_page(cache); D_ASSERT(pinfo != NULL); + D_ASSERT(pinfo->pi_addr == (base + (uint64_t)idx * page_sz)); + pinfo->pi_pg_id = idx; + pinfo->pi_mapped = 1; + pinfo->pi_loaded = 1; + + page = &cache->ca_pages[idx]; + D_ASSERT(page->pg_info == NULL); page->pg_info = pinfo; - pinfo->pi_page = page; - pinfo->pi_addr = (void *)current_addr; - current_addr += UMEM_CACHE_PAGE_SZ; - d_list_add_tail(&pinfo->pi_link, &cache->ca_pgs_lru); - page++; + /* Add to non-evictable LRU */ + cache->ca_pgs_stats[UMEM_PG_STATS_NONEVICTABLE] += 1; + d_list_add_tail(&pinfo->pi_lru_link, &cache->ca_pgs_lru[0]); } - cache->ca_mapped += num_pages; - return 0; +error: + umem_cache_free(store); + return rc; } -int -umem_cache_pin(struct umem_store *store, umem_off_t addr, daos_size_t size) +static inline bool +is_id_evictable(struct umem_cache *cache, uint32_t pg_id) +{ + return cache->ca_evictable_fn && cache->ca_evictable_fn(cache->ca_fn_arg, pg_id); +} + +static inline void +cache_push_free_page(struct umem_cache *cache, struct umem_page_info *pinfo) { - struct umem_cache *cache = store->cache; - struct umem_page *page = umem_cache_off2page(cache, addr); - struct umem_page *end_page = umem_cache_off2page(cache, addr + size - 1) + 1; + d_list_add_tail(&pinfo->pi_lru_link, &cache->ca_pgs_free); + cache->ca_pgs_stats[UMEM_PG_STATS_FREE] += 1; +} - while (page != end_page) { - page->pg_ref++; - page++; +static inline void +cache_unmap_page(struct umem_cache *cache, struct 
umem_page_info *pinfo) +{ + verify_clean_page(pinfo, 1); + D_ASSERT(pinfo->pi_pg_id < cache->ca_md_pages); + D_ASSERT(cache->ca_pages[pinfo->pi_pg_id].pg_info == pinfo); + + pinfo->pi_mapped = 0; + pinfo->pi_loaded = 0; + pinfo->pi_last_inflight = 0; + pinfo->pi_last_checkpoint = 0; + cache->ca_pages[pinfo->pi_pg_id].pg_info = NULL; + + cache_push_free_page(cache, pinfo); + + if (!is_id_evictable(cache, pinfo->pi_pg_id)) { + D_ASSERT(cache->ca_pgs_stats[UMEM_PG_STATS_NONEVICTABLE] > 0); + cache->ca_pgs_stats[UMEM_PG_STATS_NONEVICTABLE] -= 1; } +} + +static inline void +cache_map_page(struct umem_cache *cache, struct umem_page_info *pinfo, unsigned int pg_id) +{ + verify_clean_page(pinfo, 0); + D_ASSERT(pinfo->pi_loaded == 0); + + pinfo->pi_mapped = 1; + pinfo->pi_pg_id = pg_id; + cache->ca_pages[pg_id].pg_info = pinfo; + if (!is_id_evictable(cache, pg_id)) + cache->ca_pgs_stats[UMEM_PG_STATS_NONEVICTABLE] += 1; - return 0; } -int -umem_cache_unpin(struct umem_store *store, umem_off_t addr, daos_size_t size) +static inline void +cache_add2lru(struct umem_cache *cache, struct umem_page_info *pinfo) +{ + D_ASSERT(d_list_empty(&pinfo->pi_lru_link)); + D_ASSERT(pinfo->pi_ref == 0); + + if (is_id_evictable(cache, pinfo->pi_pg_id)) + d_list_add_tail(&pinfo->pi_lru_link, &cache->ca_pgs_lru[1]); + else + d_list_add_tail(&pinfo->pi_lru_link, &cache->ca_pgs_lru[0]); +} + +static inline void +cache_unpin_page(struct umem_cache *cache, struct umem_page_info *pinfo) { - struct umem_cache *cache = store->cache; - struct umem_page *page = umem_cache_off2page(cache, addr); - struct umem_page *end_page = umem_cache_off2page(cache, addr + size - 1) + 1; + D_ASSERT(pinfo->pi_ref > 0); + pinfo->pi_ref--; - while (page != end_page) { - D_ASSERT(page->pg_ref >= 1); - page->pg_ref--; - page++; + if (pinfo->pi_ref == 0) { + d_list_del_init(&pinfo->pi_lru_link); + cache_add2lru(cache, pinfo); + if (is_id_evictable(cache, pinfo->pi_pg_id)) { + D_ASSERT(cache->ca_pgs_stats[UMEM_PG_STATS_PINNED] > 0); + cache->ca_pgs_stats[UMEM_PG_STATS_PINNED] -= 1; + } } +} - return 0; +static inline void +cache_pin_page(struct umem_cache *cache, struct umem_page_info *pinfo) +{ + pinfo->pi_ref++; + if (pinfo->pi_ref == 1) { + d_list_del_init(&pinfo->pi_lru_link); + d_list_add_tail(&pinfo->pi_lru_link, &cache->ca_pgs_pinned); + if (is_id_evictable(cache, pinfo->pi_pg_id)) + cache->ca_pgs_stats[UMEM_PG_STATS_PINNED] += 1; + } } -#define UMEM_CHUNK_IDX_SHIFT 6 -#define UMEM_CHUNK_IDX_BITS (1 << UMEM_CHUNK_IDX_SHIFT) -#define UMEM_CHUNK_IDX_MASK (UMEM_CHUNK_IDX_BITS - 1) +static inline void +page_wait_io(struct umem_cache *cache, struct umem_page_info *pinfo) +{ + struct umem_store *store = cache->ca_store; + + D_ASSERT(pinfo->pi_io == 1); + if (store->stor_ops->so_waitqueue_create == NULL) + return; + + D_ASSERT(store->stor_ops->so_waitqueue_wait != NULL); + D_ASSERT(pinfo->pi_io_wq != NULL); + store->stor_ops->so_waitqueue_wait(pinfo->pi_io_wq, false); +} + +static inline void +page_wait_committed(struct umem_cache *cache, struct umem_page_info *pinfo, bool yield_only) +{ + struct umem_store *store = cache->ca_store; + + /* The page is must in flushing */ + D_ASSERT(pinfo->pi_io == 1); + if (store->stor_ops->so_waitqueue_create == NULL) + return; + + D_ASSERT(store->stor_ops->so_waitqueue_wait != NULL); + D_ASSERT(pinfo->pi_commit_wq != NULL); + store->stor_ops->so_waitqueue_wait(pinfo->pi_commit_wq, yield_only); +} + +static inline void +page_wakeup_io(struct umem_cache *cache, struct umem_page_info *pinfo) +{ + struct umem_store 
*store = cache->ca_store; + + D_ASSERT(pinfo->pi_io == 0); + if (store->stor_ops->so_waitqueue_create == NULL) + return; + + if (cache_mode(cache) == 1) + return; + + D_ASSERT(store->stor_ops->so_waitqueue_wakeup != NULL); + D_ASSERT(pinfo->pi_io_wq != NULL); + store->stor_ops->so_waitqueue_wakeup(pinfo->pi_io_wq, true); +} + +static inline void +page_wakeup_commit(struct umem_cache *cache, struct umem_page_info *pinfo) +{ + struct umem_store *store = cache->ca_store; + + /* The page is must in flushing */ + D_ASSERT(pinfo->pi_io == 1); + if (store->stor_ops->so_waitqueue_create == NULL) + return; + + D_ASSERT(store->stor_ops->so_waitqueue_wakeup != NULL); + D_ASSERT(pinfo->pi_commit_wq != NULL); + store->stor_ops->so_waitqueue_wakeup(pinfo->pi_commit_wq, true); +} + +static inline bool +is_page_dirty(struct umem_page_info *pinfo) +{ + return (pinfo->pi_last_inflight != pinfo->pi_last_checkpoint); +} static inline void touch_page(struct umem_store *store, struct umem_page_info *pinfo, uint64_t wr_tx, umem_off_t first_byte, umem_off_t last_byte) { struct umem_cache *cache = store->cache; - uint64_t start_bit = (first_byte & UMEM_CACHE_PAGE_SZ_MASK) >> UMEM_CACHE_CHUNK_SZ_SHIFT; - uint64_t end_bit = (last_byte & UMEM_CACHE_PAGE_SZ_MASK) >> UMEM_CACHE_CHUNK_SZ_SHIFT; + uint64_t start_bit = (first_byte & cache->ca_page_mask) >> UMEM_CACHE_CHUNK_SZ_SHIFT; + uint64_t end_bit = (last_byte & cache->ca_page_mask) >> UMEM_CACHE_CHUNK_SZ_SHIFT; uint64_t bit_nr; uint64_t bit; uint64_t idx; + D_ASSERT(wr_tx != -1ULL); + D_ASSERTF(store->stor_ops->so_wal_id_cmp(store, wr_tx, pinfo->pi_last_inflight) >= 0, + "cur_tx:"DF_U64" < last_inflight:"DF_U64"\n", wr_tx, pinfo->pi_last_inflight); + D_ASSERTF(pinfo->pi_last_checkpoint == 0 || + store->stor_ops->so_wal_id_cmp(store, wr_tx, pinfo->pi_last_checkpoint) > 0, + "cur_tx:"DF_U64" <= last_checkpoint:"DF_U64"\n", + wr_tx, pinfo->pi_last_checkpoint); + for (bit_nr = start_bit; bit_nr <= end_bit; bit_nr++) { idx = bit_nr >> UMEM_CHUNK_IDX_SHIFT; /** uint64_t index */ bit = bit_nr & UMEM_CHUNK_IDX_MASK; pinfo->pi_bmap[idx] |= 1ULL << bit; } - if (!pinfo->pi_waiting && pinfo->pi_last_checkpoint == pinfo->pi_last_inflight) { - /** Keep the page in the waiting list if it's waiting for a transaction to - * be committed to the WAL before it can be flushed. 
- */ - d_list_del(&pinfo->pi_link); - d_list_add_tail(&pinfo->pi_link, &cache->ca_pgs_dirty); - } + D_ASSERT(pinfo->pi_loaded == 1); + pinfo->pi_last_inflight = wr_tx; - if (store->stor_ops->so_wal_id_cmp(store, wr_tx, pinfo->pi_last_inflight) <= 0 || - wr_tx == -1ULL) + /* Don't change the pi_dirty_link while the page is being flushed */ + if (!d_list_empty(&pinfo->pi_flush_link)) return; - pinfo->pi_last_inflight = wr_tx; + D_ASSERT(pinfo->pi_io == 0); + if (d_list_empty(&pinfo->pi_dirty_link)) + d_list_add_tail(&pinfo->pi_dirty_link, &cache->ca_pgs_dirty); } +/* Convert MD-blob offset to memory page */ static inline struct umem_page_info * -off2pinfo(struct umem_cache *cache, umem_off_t addr) +cache_off2pinfo(struct umem_cache *cache, umem_off_t addr) { - struct umem_page *page = umem_cache_off2page(cache, addr); + struct umem_page *page = cache_off2page(cache, addr); + D_ASSERT(page->pg_info != NULL); return page->pg_info; } int umem_cache_touch(struct umem_store *store, uint64_t wr_tx, umem_off_t addr, daos_size_t size) { - struct umem_cache *cache = store->cache; - struct umem_page_info *pinfo; - umem_off_t end_addr = addr + size - 1; - struct umem_page_info *end_pinfo; - umem_off_t start_addr; + struct umem_cache *cache = store->cache; + struct umem_page_info *pinfo; + umem_off_t start_addr, end_addr = addr + size - 1; + struct umem_page_info *end_pinfo; if (cache == NULL) return 0; /* TODO: When SMD is supported outside VOS, this will be an error */ - D_ASSERTF(size <= UMEM_CACHE_PAGE_SZ, "size=" DF_U64 "\n", size); - pinfo = off2pinfo(cache, addr); - end_pinfo = off2pinfo(cache, end_addr); + D_ASSERTF(size <= cache->ca_page_sz, "size=" DF_U64 "\n", size); + pinfo = cache_off2pinfo(cache, addr); + end_pinfo = cache_off2pinfo(cache, end_addr); if (pinfo->pi_copying) return -DER_CHKPT_BUSY; + /* Convert the MD-blob offset to umem cache offset (exclude the allocator header) */ + D_ASSERT(addr >= cache->ca_base_off); + addr -= cache->ca_base_off; + end_addr -= cache->ca_base_off; + if (pinfo != end_pinfo) { - /** Eventually, we can just assert equal here. But until we have a guarantee that - * no allocation will span a page boundary, we have to handle this case. We should - * never have to span multiple pages though. 
- */ + D_ASSERT(cache_mode(cache) == 1); + if (end_pinfo->pi_copying) return -DER_CHKPT_BUSY; - start_addr = end_addr & ~UMEM_CACHE_PAGE_SZ_MASK; + start_addr = end_addr & ~cache->ca_page_mask; touch_page(store, end_pinfo, wr_tx, start_addr, end_addr); end_addr = start_addr - 1; } @@ -1947,7 +2685,7 @@ umem_cache_touch(struct umem_store *store, uint64_t wr_tx, umem_off_t addr, daos /** Maximum number of pages that can be in one set */ #define MAX_PAGES_PER_SET 10 /** Maximum number of ranges that can be in one page */ -#define MAX_IOD_PER_PAGE ((UMEM_CACHE_BMAP_SZ << 6) / 2) +#define MAX_IOD_PER_PAGE ((UMEM_CACHE_BMAP_SZ_MAX << UMEM_CHUNK_IDX_SHIFT) / 2) /** Maximum number of IODs a set can handle */ #define MAX_IOD_PER_SET (2 * MAX_IOD_PER_PAGE) @@ -1978,13 +2716,14 @@ static void page2chkpt(struct umem_store *store, struct umem_page_info *pinfo, struct umem_checkpoint_data *chkpt_data) { - uint64_t *bits = &pinfo->pi_bmap[0]; + struct umem_cache *cache = store->cache; + uint64_t *bits = pinfo->pi_bmap; struct umem_store_iod *store_iod = &chkpt_data->cd_store_iod; d_sg_list_t *sgl = &chkpt_data->cd_sg_list; uint64_t bmap; int i; uint64_t first_bit_shift; - uint64_t offset = (uint64_t)pinfo->pi_page->pg_id << UMEM_CACHE_PAGE_SZ_SHIFT; + uint64_t offset = cache_id2off(cache, pinfo->pi_pg_id); uint64_t map_offset; uint8_t *page_addr = pinfo->pi_addr; int nr = sgl->sg_nr_out; @@ -1998,7 +2737,7 @@ page2chkpt(struct umem_store *store, struct umem_page_info *pinfo, 0) chkpt_data->cd_max_tx = pinfo->pi_last_inflight; - for (i = 0; i < UMEM_CACHE_BMAP_SZ; i++) { + for (i = 0; i < cache->ca_bmap_sz; i++) { if (bits[i] == 0) goto next_bmap; @@ -2062,66 +2801,73 @@ chkpt_insert_sorted(struct umem_store *store, struct umem_checkpoint_data *chkpt d_list_add_tail(&chkpt_data->cd_link, list); } -int -umem_cache_checkpoint(struct umem_store *store, umem_cache_wait_cb_t wait_cb, void *arg, - uint64_t *out_id, struct umem_cache_chkpt_stats *stats) +static void +page_flush_completion(struct umem_cache *cache, struct umem_page_info *pinfo) { - struct umem_cache *cache = store->cache; - struct umem_page_info *pinfo = NULL; - struct umem_checkpoint_data *chkpt_data_all; - struct umem_checkpoint_data *chkpt_data; - uint64_t committed_tx = 0; - uint64_t chkpt_id = *out_id; - d_list_t free_list; - d_list_t waiting_list; - int i; - int rc = 0; - int inflight = 0; - int pages_scanned = 0; - int dchunks_copied = 0; - int iovs_used = 0; - int nr_copying_pgs = 0; + D_ASSERT(d_list_empty(&pinfo->pi_dirty_link)); + D_ASSERT(pinfo->pi_io == 1); + pinfo->pi_io = 0; + D_ASSERT(!d_list_empty(&pinfo->pi_flush_link)); + d_list_del_init(&pinfo->pi_flush_link); - if (cache == NULL) - return 0; /* TODO: When SMD is supported outside VOS, this will be an error */ + if (is_page_dirty(pinfo)) + d_list_add_tail(&pinfo->pi_dirty_link, &cache->ca_pgs_dirty); - if (d_list_empty(&cache->ca_pgs_dirty)) - return 0; + page_wakeup_io(cache, pinfo); +} + +static int +cache_flush_pages(struct umem_cache *cache, d_list_t *dirty_list, + struct umem_checkpoint_data *chkpt_data_all, int chkpt_nr, + umem_cache_wait_cb_t wait_commit_cb, void *arg, uint64_t *chkpt_id, + struct umem_cache_chkpt_stats *stats) +{ + struct umem_store *store = cache->ca_store; + struct umem_checkpoint_data *chkpt_data; + struct umem_page_info *pinfo; + d_list_t free_list; + d_list_t waiting_list; + uint64_t committed_tx = 0; + unsigned int max_iod_per_page; + unsigned int tot_pgs = 0, flushed_pgs = 0; + int inflight = 0; + int i, rc = 0; D_ASSERT(store != NULL); + 
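
touch_page() above records dirtiness per chunk in a per-page bitmap; the chunk index selects a uint64_t word and a bit within it. A minimal sketch of that index arithmetic, assuming a 4 KiB chunk (shift of 12) per the "dirty 4K unit" comment and 64 chunk bits per bitmap word:

#include <stdint.h>

#define CHUNK_SZ_SHIFT  12				/* 4 KiB dirty-tracking granularity (assumed) */
#define CHUNK_IDX_SHIFT 6				/* 64 chunk bits per uint64_t word */
#define CHUNK_IDX_MASK  ((1U << CHUNK_IDX_SHIFT) - 1)

/*
 * Mark every chunk overlapping [first_byte, last_byte] (page-relative byte
 * offsets) as dirty in the page bitmap, mirroring the loop in touch_page().
 */
static void
mark_dirty(uint64_t *bmap, uint64_t first_byte, uint64_t last_byte)
{
	uint64_t start_bit = first_byte >> CHUNK_SZ_SHIFT;
	uint64_t end_bit   = last_byte >> CHUNK_SZ_SHIFT;
	uint64_t bit_nr;

	for (bit_nr = start_bit; bit_nr <= end_bit; bit_nr++)
		bmap[bit_nr >> CHUNK_IDX_SHIFT] |= 1ULL << (bit_nr & CHUNK_IDX_MASK);
}
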
D_ASSERT(!d_list_empty(dirty_list)); + max_iod_per_page = ((cache->ca_bmap_sz << UMEM_CHUNK_IDX_SHIFT) / 2); D_INIT_LIST_HEAD(&free_list); D_INIT_LIST_HEAD(&waiting_list); - D_ALLOC_ARRAY(chkpt_data_all, MAX_INFLIGHT_SETS); - if (chkpt_data_all == NULL) - return -DER_NOMEM; /** Setup the in-flight IODs */ - for (i = 0; i < MAX_INFLIGHT_SETS; i++) { + for (i = 0; i < chkpt_nr; i++) { chkpt_data = &chkpt_data_all[i]; d_list_add_tail(&chkpt_data->cd_link, &free_list); chkpt_data->cd_store_iod.io_regions = &chkpt_data->cd_regions[0]; chkpt_data->cd_sg_list.sg_iovs = &chkpt_data->cd_iovs[0]; } - d_list_splice_init(&cache->ca_pgs_dirty, &cache->ca_pgs_copying); - /** First mark all pages in the new list so they won't be moved by an I/O thread. This * will enable us to continue the algorithm in relative isolation from I/O threads. */ - d_list_for_each_entry(pinfo, &cache->ca_pgs_copying, pi_link) { + d_list_for_each_entry(pinfo, dirty_list, pi_dirty_link) { /** Mark all pages in copying list first. Marking them as waiting will prevent * them from being moved to another list by an I/O operation. */ - pinfo->pi_waiting = 1; - if (store->stor_ops->so_wal_id_cmp(store, pinfo->pi_last_inflight, chkpt_id) > 0) - chkpt_id = pinfo->pi_last_inflight; - nr_copying_pgs++; + D_ASSERT(pinfo->pi_io == 0); + pinfo->pi_io = 1; + D_ASSERT(d_list_empty(&pinfo->pi_flush_link)); + d_list_add_tail(&pinfo->pi_flush_link, &cache->ca_pgs_flushing); + tot_pgs++; + + if (store->stor_ops->so_wal_id_cmp(store, pinfo->pi_last_inflight, *chkpt_id) > 0) + *chkpt_id = pinfo->pi_last_inflight; } do { /** first try to add up to MAX_INFLIGHT_SETS to the waiting queue */ - while (inflight < MAX_INFLIGHT_SETS && !d_list_empty(&cache->ca_pgs_copying)) { + while (inflight < MAX_INFLIGHT_SETS && !d_list_empty(dirty_list)) { chkpt_data = d_list_pop_entry(&free_list, struct umem_checkpoint_data, cd_link); @@ -2134,9 +2880,9 @@ umem_cache_checkpoint(struct umem_store *store, umem_cache_wait_cb_t wait_cb, vo chkpt_data->cd_nr_dchunks = 0; while (chkpt_data->cd_nr_pages < MAX_PAGES_PER_SET && - chkpt_data->cd_store_iod.io_nr <= MAX_IOD_PER_PAGE && - (pinfo = d_list_pop_entry(&cache->ca_pgs_copying, - struct umem_page_info, pi_link)) != NULL) { + chkpt_data->cd_store_iod.io_nr <= max_iod_per_page && + (pinfo = d_list_pop_entry(dirty_list, struct umem_page_info, + pi_dirty_link)) != NULL) { D_ASSERT(chkpt_data != NULL); page2chkpt(store, pinfo, chkpt_data); } @@ -2148,7 +2894,7 @@ umem_cache_checkpoint(struct umem_store *store, umem_cache_wait_cb_t wait_cb, vo for (i = 0; i < chkpt_data->cd_nr_pages; i++) { pinfo = chkpt_data->cd_pages[i]; pinfo->pi_copying = 0; - d_list_add(&pinfo->pi_link, &cache->ca_pgs_copying); + d_list_add(&pinfo->pi_dirty_link, dirty_list); } d_list_add(&chkpt_data->cd_link, &free_list); rc = 0; @@ -2189,7 +2935,7 @@ umem_cache_checkpoint(struct umem_store *store, umem_cache_wait_cb_t wait_cb, vo for (i = 0; i < chkpt_data->cd_nr_pages; i++) { pinfo = chkpt_data->cd_pages[i]; pinfo->pi_copying = 0; - memset(&pinfo->pi_bmap[0], 0, sizeof(pinfo->pi_bmap)); + memset(pinfo->pi_bmap, 0, sizeof(uint64_t) * cache->ca_bmap_sz); } chkpt_insert_sorted(store, chkpt_data, &waiting_list); @@ -2200,7 +2946,7 @@ umem_cache_checkpoint(struct umem_store *store, umem_cache_wait_cb_t wait_cb, vo chkpt_data = d_list_pop_entry(&waiting_list, struct umem_checkpoint_data, cd_link); /* Wait for in-flight transactions committed, or yield to make progress */ - wait_cb(arg, chkpt_data ? 
chkpt_data->cd_max_tx : 0, &committed_tx); + wait_commit_cb(arg, chkpt_data ? chkpt_data->cd_max_tx : 0, &committed_tx); /* The so_flush_prep() could fail when the DMA buffer is under pressure */ if (chkpt_data == NULL) @@ -2222,36 +2968,739 @@ umem_cache_checkpoint(struct umem_store *store, umem_cache_wait_cb_t wait_cb, vo rc = store->stor_ops->so_flush_post(chkpt_data->cd_fh, rc); for (i = 0; i < chkpt_data->cd_nr_pages; i++) { pinfo = chkpt_data->cd_pages[i]; - if (pinfo->pi_last_inflight != pinfo->pi_last_checkpoint) - d_list_add_tail(&pinfo->pi_link, &cache->ca_pgs_dirty); - else - d_list_add_tail(&pinfo->pi_link, &cache->ca_pgs_lru); - pinfo->pi_waiting = 0; + page_flush_completion(cache, pinfo); } inflight--; - pages_scanned += chkpt_data->cd_nr_pages; - dchunks_copied += chkpt_data->cd_nr_dchunks; - iovs_used += chkpt_data->cd_sg_list.sg_nr_out; + + flushed_pgs += chkpt_data->cd_nr_pages; + if (stats) { + stats->uccs_nr_pages += chkpt_data->cd_nr_pages; + stats->uccs_nr_dchunks += chkpt_data->cd_nr_dchunks; + stats->uccs_nr_iovs += chkpt_data->cd_sg_list.sg_nr_out; + } d_list_add(&chkpt_data->cd_link, &free_list); if (rc != 0 || (DAOS_FAIL_CHECK(DAOS_MEM_FAIL_CHECKPOINT) && - pages_scanned >= nr_copying_pgs / 2)) { - d_list_move(&cache->ca_pgs_copying, &cache->ca_pgs_dirty); + flushed_pgs >= tot_pgs / 2)) { rc = -DER_AGAIN; break; } - } while (inflight != 0 || !d_list_empty(&cache->ca_pgs_copying)); + } while (inflight != 0 || !d_list_empty(dirty_list)); + + return rc; +} + +int +umem_cache_checkpoint(struct umem_store *store, umem_cache_wait_cb_t wait_cb, void *arg, + uint64_t *out_id, struct umem_cache_chkpt_stats *stats) +{ + struct umem_cache *cache; + struct umem_page_info *pinfo; + struct umem_checkpoint_data *chkpt_data_all; + d_list_t dirty_list; + uint64_t chkpt_id = *out_id; + int rc = 0; + + D_ASSERT(store != NULL); + cache = store->cache; + + if (cache == NULL) + return 0; /* TODO: When SMD is supported outside VOS, this will be an error */ + + if (d_list_empty(&cache->ca_pgs_dirty)) + goto wait; + + D_ALLOC_ARRAY(chkpt_data_all, MAX_INFLIGHT_SETS); + if (chkpt_data_all == NULL) + return -DER_NOMEM; + + D_INIT_LIST_HEAD(&dirty_list); + d_list_splice_init(&cache->ca_pgs_dirty, &dirty_list); + + rc = cache_flush_pages(cache, &dirty_list, chkpt_data_all, MAX_INFLIGHT_SETS, wait_cb, arg, + &chkpt_id, stats); D_FREE(chkpt_data_all); + if (!d_list_empty(&dirty_list)) { + D_ASSERT(rc != 0); + d_list_move(&dirty_list, &cache->ca_pgs_dirty); + } +wait: + /* Wait for the evicting pages (if any) with lower checkpoint id */ + d_list_for_each_entry(pinfo, &cache->ca_pgs_flushing, pi_flush_link) { + D_ASSERT(pinfo->pi_io == 1); + if (store->stor_ops->so_wal_id_cmp(store, chkpt_id, pinfo->pi_last_checkpoint) < 0) + continue; + page_wait_io(cache, pinfo); + goto wait; + } *out_id = chkpt_id; - if (stats) { - stats->uccs_nr_pages = pages_scanned; - stats->uccs_nr_dchunks = dchunks_copied; - stats->uccs_nr_iovs = iovs_used; + + return rc; +} + +static inline void +inc_cache_stats(struct umem_cache *cache, unsigned int op) +{ + cache->ca_cache_stats[op] += 1; +} + +static int +cache_load_page(struct umem_cache *cache, struct umem_page_info *pinfo) +{ + struct umem_store *store = cache->ca_store; + uint64_t offset; + daos_size_t len; + int rc; + + D_ASSERT(pinfo->pi_mapped == 1); + + if (pinfo->pi_io == 1) { + page_wait_io(cache, pinfo); + return pinfo->pi_loaded ? 
0 : -DER_IO; + } + + offset = cache_id2off(cache, pinfo->pi_pg_id); + D_ASSERT(offset < store->stor_size); + len = min(cache->ca_page_sz, store->stor_size - offset); + pinfo->pi_io = 1; + + if (DAOS_ON_VALGRIND) + VALGRIND_DISABLE_ADDR_ERROR_REPORTING_IN_RANGE((char *)pinfo->pi_addr, len); + rc = store->stor_ops->so_load(store, (char *)pinfo->pi_addr, offset, len); + if (DAOS_ON_VALGRIND) + VALGRIND_ENABLE_ADDR_ERROR_REPORTING_IN_RANGE((char *)pinfo->pi_addr, len); + pinfo->pi_io = 0; + if (rc) { + DL_ERROR(rc, "Read MD blob failed."); + page_wakeup_io(cache, pinfo); + return rc; + } else if (cache->ca_evtcb_fn) { + rc = cache->ca_evtcb_fn(UMEM_CACHE_EVENT_PGLOAD, cache->ca_fn_arg, pinfo->pi_pg_id); + if (rc) { + DL_ERROR(rc, "Pageload callback failed."); + page_wakeup_io(cache, pinfo); + return rc; + } + } + + pinfo->pi_loaded = 1; + /* Add to LRU when it's unpinned */ + if (pinfo->pi_ref == 0) + cache_add2lru(cache, pinfo); + + page_wakeup_io(cache, pinfo); + inc_cache_stats(cache, UMEM_CACHE_STATS_LOAD); + + return rc; +} + +void +umem_cache_commit(struct umem_store *store, uint64_t commit_id) +{ + struct umem_cache *cache = store->cache; + struct umem_page_info *pinfo, *tmp; + + D_ASSERT(store->stor_ops->so_wal_id_cmp(store, cache->ca_commit_id, commit_id) <= 0); + cache->ca_commit_id = commit_id; + + d_list_for_each_entry_safe(pinfo, tmp, &cache->ca_pgs_wait_commit, pi_dirty_link) { + if (store->stor_ops->so_wal_id_cmp(store, pinfo->pi_last_checkpoint, + commit_id) <= 0) { + d_list_del_init(&pinfo->pi_dirty_link); + page_wakeup_commit(cache, pinfo); + } + } +} + +struct wait_page_commit_arg { + struct umem_cache *wca_cache; + struct umem_page_info *wca_pinfo; +}; + +static void +wait_page_commit_cb(void *arg, uint64_t wait_tx, uint64_t *committed_tx) +{ + struct wait_page_commit_arg *wca = arg; + struct umem_cache *cache = wca->wca_cache; + struct umem_store *store = cache->ca_store; + struct umem_page_info *pinfo = wca->wca_pinfo; + + /* Special case, needs to yield to allow progress */ + if (wait_tx == 0) { + page_wait_committed(cache, pinfo, true); + *committed_tx = cache->ca_commit_id; + return; + } + + D_ASSERT(wait_tx == pinfo->pi_last_checkpoint); + /* Page is committed */ + if (store->stor_ops->so_wal_id_cmp(store, cache->ca_commit_id, wait_tx) >= 0) { + *committed_tx = cache->ca_commit_id; + return; + } + + D_ASSERT(d_list_empty(&pinfo->pi_dirty_link)); + d_list_add_tail(&pinfo->pi_dirty_link, &cache->ca_pgs_wait_commit); + page_wait_committed(cache, pinfo, false); + *committed_tx = cache->ca_commit_id; +} + +static int +cache_flush_page(struct umem_cache *cache, struct umem_page_info *pinfo) +{ + struct wait_page_commit_arg arg; + struct umem_checkpoint_data *chkpt_data_all; + d_list_t dirty_list; + uint64_t chkpt_id = 0; + int rc; + + D_ALLOC_ARRAY(chkpt_data_all, 1); + if (chkpt_data_all == NULL) + return -DER_NOMEM; + + D_INIT_LIST_HEAD(&dirty_list); + d_list_del_init(&pinfo->pi_dirty_link); + d_list_add_tail(&pinfo->pi_dirty_link, &dirty_list); + + /* + * Bump the last checkpoint ID beforehand, since cache_flush_pages() could yield before + * bumping the last checkpoint ID. 
+ */ + D_ASSERT(is_page_dirty(pinfo)); + pinfo->pi_last_checkpoint = pinfo->pi_last_inflight; + + arg.wca_cache = cache; + arg.wca_pinfo = pinfo; + + rc = cache_flush_pages(cache, &dirty_list, chkpt_data_all, 1, wait_page_commit_cb, &arg, + &chkpt_id, NULL); + D_FREE(chkpt_data_all); + D_ASSERT(d_list_empty(&dirty_list)); + inc_cache_stats(cache, UMEM_CACHE_STATS_FLUSH); + + return rc; +} + +static int +cache_evict_page(struct umem_cache *cache, bool for_sys) +{ + struct umem_page_info *pinfo; + d_list_t *pg_list = &cache->ca_pgs_lru[1]; + int rc; + + if (cache->ca_pgs_stats[UMEM_PG_STATS_NONEVICTABLE] == cache->ca_mem_pages) { + D_ERROR("No evictable page.\n"); + return -DER_INVAL; + } else if (d_list_empty(pg_list)) { + D_ERROR("All evictable pages are pinned.\n"); + return -DER_BUSY; + } + + /* Try the most recent used page if it was used for sys */ + if (for_sys) { + pinfo = d_list_entry(pg_list->prev, struct umem_page_info, pi_lru_link); + if (pinfo->pi_sys == 1) + goto evict; + } + + /* Try evictable pages in LRU order */ + pinfo = d_list_entry(pg_list->next, struct umem_page_info, pi_lru_link); +evict: + D_ASSERT(pinfo->pi_ref == 0); + + /* + * To minimize page eviction, let's evict page one by one for this moment, we + * may consider to allow N concurrent pages eviction in the future. + */ + if (pinfo->pi_io == 1) { + D_ASSERT(!d_list_empty(&pinfo->pi_flush_link)); + page_wait_io(cache, pinfo); + return -DER_AGAIN; + } + + if (is_page_dirty(pinfo)) { + rc = cache_flush_page(cache, pinfo); + if (rc) { + DL_ERROR(rc, "Flush page failed."); + return rc; + } + + /* The page is referenced by others while flushing */ + if ((pinfo->pi_ref > 0) || is_page_dirty(pinfo) || pinfo->pi_io == 1) + return -DER_AGAIN; + } + + if (cache->ca_evtcb_fn) { + rc = cache->ca_evtcb_fn(UMEM_CACHE_EVENT_PGEVICT, cache->ca_fn_arg, + pinfo->pi_pg_id); + if (rc) + DL_ERROR(rc, "Page evict callback failed."); + } + d_list_del_init(&pinfo->pi_lru_link); + cache_unmap_page(cache, pinfo); + inc_cache_stats(cache, UMEM_CACHE_STATS_EVICT); + + return 0; +} + +static inline bool +need_reserve(struct umem_cache *cache, uint32_t extra_pgs) +{ + uint32_t page_nr = 0; + + if (cache->ca_replay_done) { + /* Few free pages are always reserved for potential non-evictable zone grow */ + D_ASSERT(cache->ca_pgs_stats[UMEM_PG_STATS_NONEVICTABLE] <= cache->ca_max_ne_pages); + page_nr = cache->ca_max_ne_pages - cache->ca_pgs_stats[UMEM_PG_STATS_NONEVICTABLE]; + if (page_nr > UMEM_CACHE_RSRVD_PAGES) + page_nr = UMEM_CACHE_RSRVD_PAGES; + } + page_nr += extra_pgs; + + if (page_nr == 0) + return false; + + return cache->ca_pgs_stats[UMEM_PG_STATS_FREE] < page_nr ? 
true : false; +} + +static inline bool +need_evict(struct umem_cache *cache) +{ + if (d_list_empty(&cache->ca_pgs_free)) + return true; + + return need_reserve(cache, 1); +} + +static int +cache_get_free_page(struct umem_cache *cache, struct umem_page_info **ret_pinfo, int pinned_nr, + bool for_sys) +{ + struct umem_page_info *pinfo; + int rc, retry_cnt = 0; + + while (need_evict(cache)) { + rc = cache_evict_page(cache, for_sys); + if (rc && rc != -DER_AGAIN && rc != -DER_BUSY) { + DL_ERROR(rc, "Evict page failed."); + return rc; + } + + /* All pinned pages are from current caller */ + if (rc == -DER_BUSY && pinned_nr == cache->ca_pgs_stats[UMEM_PG_STATS_PINNED]) { + D_ERROR("Not enough evictable pages.\n"); + return -DER_INVAL; + } + + D_CDEBUG(retry_cnt == 10, DLOG_ERR, DB_TRACE, + "Retry get free page, %d times\n", retry_cnt); + retry_cnt++; + } + + pinfo = cache_pop_free_page(cache); + D_ASSERT(pinfo != NULL); + *ret_pinfo = pinfo; + + return 0; +} + +/* + * Only allow map empty pages. It could yield when mapping an evictable page, + * so when caller tries to map non-evictable page, the page_nr must be 1. + */ +static int +cache_map_pages(struct umem_cache *cache, uint32_t *pages, int page_nr) +{ + struct umem_page_info *pinfo, *free_pinfo = NULL; + uint32_t pg_id; + int i, rc = 0; + + for (i = 0; i < page_nr; i++) { + pg_id = pages[i]; + + if (is_id_evictable(cache, pg_id) && page_nr != 1) { + D_ERROR("Can only map single evictable page.\n"); + return -DER_INVAL; + } +retry: + pinfo = cache->ca_pages[pg_id].pg_info; + /* The page is already mapped */ + if (pinfo != NULL) { + D_ASSERT(pinfo->pi_pg_id == pg_id); + D_ASSERT(pinfo->pi_mapped == 1); + D_ASSERT(pinfo->pi_loaded == 1); + if (free_pinfo != NULL) { + cache_push_free_page(cache, free_pinfo); + free_pinfo = NULL; + } + continue; + } + + if (is_id_evictable(cache, pg_id)) { + if (free_pinfo == NULL) { + rc = cache_get_free_page(cache, &free_pinfo, 0, false); + if (rc) { + DL_ERROR(rc, "Failed to get free page."); + break; + } + goto retry; + } else { + pinfo = free_pinfo; + free_pinfo = NULL; + } + } else { + pinfo = cache_pop_free_page(cache); + if (pinfo == NULL) { + D_ERROR("No free pages.\n"); + rc = -DER_BUSY; + break; + } + } + + cache_map_page(cache, pinfo, pg_id); + cache_add2lru(cache, pinfo); + /* Map an empty page, doesn't need to load page */ + pinfo->pi_loaded = 1; + } + + return rc; +} + +static int +cache_pin_pages(struct umem_cache *cache, uint32_t *pages, int page_nr, bool for_sys) +{ + struct umem_page_info *pinfo, *free_pinfo = NULL; + uint32_t pg_id; + int i, processed = 0, pinned = 0, rc = 0; + + for (i = 0; i < page_nr; i++) { + pg_id = pages[i]; +retry: + pinfo = cache->ca_pages[pg_id].pg_info; + /* The page is already mapped */ + if (pinfo != NULL) { + D_ASSERT(pinfo->pi_pg_id == pg_id); + D_ASSERT(pinfo->pi_mapped == 1); + inc_cache_stats(cache, UMEM_CACHE_STATS_HIT); + if (free_pinfo != NULL) { + cache_push_free_page(cache, free_pinfo); + free_pinfo = NULL; + } + goto next; + } + + if (free_pinfo == NULL) { + rc = cache_get_free_page(cache, &free_pinfo, pinned, for_sys); + if (rc) + goto error; + /* Above cache_get_free_page() could yield, need re-check mapped status */ + goto retry; + } else { + pinfo = free_pinfo; + free_pinfo = NULL; + } + + inc_cache_stats(cache, UMEM_CACHE_STATS_MISS); + cache_map_page(cache, pinfo, pg_id); +next: + cache_pin_page(cache, pinfo); + processed++; + if (is_id_evictable(cache, pinfo->pi_pg_id)) + pinned++; + } + + for (i = 0; i < page_nr; i++) { + pg_id = pages[i]; + 
pinfo = cache->ca_pages[pg_id].pg_info; + + D_ASSERT(pinfo != NULL); + if (pinfo->pi_loaded == 0) { + rc = cache_load_page(cache, pinfo); + if (rc) + goto error; + } + pinfo->pi_sys = for_sys; + } + + return 0; +error: + for (i = 0; i < processed; i++) { + pg_id = pages[i]; + pinfo = cache->ca_pages[pg_id].pg_info; + + D_ASSERT(pinfo != NULL); + cache_unpin_page(cache, pinfo); + + } + return rc; +} + +#define DF_RANGE \ + DF_U64", "DF_U64 +#define DP_RANGE(range) \ + (range)->cr_off, (range)->cr_size + +static int +cache_rgs2pgs(struct umem_cache *cache, struct umem_cache_range *ranges, int range_nr, + uint32_t *in_pages, int *page_nr, uint32_t **out_pages) +{ + struct umem_cache_range range; + uint32_t page_id, *pages = in_pages, *old_pages = NULL, len = 0; + int rc = 0, i, page_idx = 0, tot_pages = *page_nr; + + for (i = 0; i < range_nr; i++) { + range = ranges[i]; + /* Assume the ranges are sorted & no overlapping */ + if (i > 0) { + if (range.cr_off < ranges[i - 1].cr_off + ranges[i - 1].cr_size) { + D_ERROR("Invalid ranges ["DF_RANGE"], ["DF_RANGE"]\n", + DP_RANGE(&ranges[i - 1]), DP_RANGE(&range)); + rc = -DER_INVAL; + goto error; + } + } + + D_ASSERT(range.cr_size > 0); + while (range.cr_size > 0) { + page_id = cache_off2id(cache, range.cr_off); + + if (len != 0 && page_id != pages[page_idx]) { + page_idx++; + if (page_idx == tot_pages) { + D_REALLOC_ARRAY(pages, old_pages, tot_pages, tot_pages * 2); + if (pages == NULL) { + D_ERROR("Alloc array(%d) failed.\n", tot_pages * 2); + rc = -DER_NOMEM; + goto error; + } + old_pages = pages; + tot_pages = tot_pages * 2; + } + } + + pages[page_idx] = page_id; + len = cache->ca_page_sz - cache_off2pg_off(cache, range.cr_off); + range.cr_off += len; + if (range.cr_size >= len) + range.cr_size -= len; + else + range.cr_size = 0; + } + } + + D_ASSERT(page_idx < tot_pages); + *out_pages = pages; + *page_nr = page_idx + 1; + + return 0; +error: + if (old_pages) + D_FREE(old_pages); + return rc; +} + +#define UMEM_PAGES_ON_STACK 16 + +void +umem_cache_post_replay(struct umem_store *store) +{ + struct umem_cache *cache = store->cache; + int cnt = 0; + int idx; + struct umem_page_info *pinfo; + + pinfo = (struct umem_page_info *)&cache->ca_pages[cache->ca_md_pages]; + for (idx = 0; idx < cache->ca_mem_pages; idx++) { + if (pinfo[idx].pi_loaded == 0) + continue; + + if (!is_id_evictable(cache, pinfo[idx].pi_pg_id)) { + d_list_del_init(&pinfo[idx].pi_lru_link); + d_list_add_tail(&pinfo[idx].pi_lru_link, &cache->ca_pgs_lru[0]); + cnt++; + } + } + cache->ca_pgs_stats[UMEM_PG_STATS_NONEVICTABLE] = cnt; + cache->ca_replay_done = 1; +} + +int +umem_cache_map(struct umem_store *store, struct umem_cache_range *ranges, int range_nr) +{ + struct umem_cache *cache = store->cache; + uint32_t in_pages[UMEM_PAGES_ON_STACK], *out_pages; + int rc, page_nr = UMEM_PAGES_ON_STACK; + + rc = cache_rgs2pgs(cache, ranges, range_nr, &in_pages[0], &page_nr, &out_pages); + if (rc) + return rc; + + rc = cache_map_pages(cache, out_pages, page_nr); + if (rc) + DL_ERROR(rc, "Map page failed."); + + if (out_pages != &in_pages[0]) + D_FREE(out_pages); + + return rc; +} + +int +umem_cache_load(struct umem_store *store, struct umem_cache_range *ranges, int range_nr, + bool for_sys) +{ + struct umem_cache *cache = store->cache; + struct umem_page_info *pinfo; + uint32_t in_pages[UMEM_PAGES_ON_STACK], *out_pages; + int i, rc, page_nr = UMEM_PAGES_ON_STACK; + + rc = cache_rgs2pgs(cache, ranges, range_nr, &in_pages[0], &page_nr, &out_pages); + if (rc) + return rc; + + rc = 
cache_pin_pages(cache, out_pages, page_nr, for_sys); + if (rc) { + DL_ERROR(rc, "Load page failed."); + } else { + for (i = 0; i < page_nr; i++) { + uint32_t pg_id = out_pages[i]; + + pinfo = cache->ca_pages[pg_id].pg_info; + D_ASSERT(pinfo != NULL); + cache_unpin_page(cache, pinfo); + } + } + + if (out_pages != &in_pages[0]) + D_FREE(out_pages); + + return rc; +} + +struct umem_pin_handle { + uint32_t ph_page_nr; + uint32_t ph_pages[0]; +}; + +int +umem_cache_pin(struct umem_store *store, struct umem_cache_range *ranges, int range_nr, + bool for_sys, struct umem_pin_handle **pin_handle) +{ + struct umem_cache *cache = store->cache; + struct umem_pin_handle *handle; + uint32_t in_pages[UMEM_PAGES_ON_STACK], *out_pages; + int rc, page_nr = UMEM_PAGES_ON_STACK; + + rc = cache_rgs2pgs(cache, ranges, range_nr, &in_pages[0], &page_nr, &out_pages); + if (rc) + return rc; + + rc = cache_pin_pages(cache, out_pages, page_nr, for_sys); + if (rc) { + DL_ERROR(rc, "Load page failed."); + goto out; + } + + D_ALLOC(handle, sizeof(struct umem_pin_handle) + sizeof(uint32_t) * page_nr); + if (handle == NULL) { + rc = -DER_NOMEM; + goto out; + } + handle->ph_page_nr = page_nr; + memcpy(&handle->ph_pages[0], out_pages, sizeof(uint32_t) * page_nr); + *pin_handle = handle; +out: + if (out_pages != &in_pages[0]) + D_FREE(out_pages); + + return rc; +} + +void +umem_cache_unpin(struct umem_store *store, struct umem_pin_handle *pin_handle) +{ + struct umem_cache *cache = store->cache; + struct umem_page_info *pinfo; + int i; + + D_ASSERT(pin_handle != NULL); + D_ASSERT(pin_handle->ph_page_nr > 0); + + for (i = 0; i < pin_handle->ph_page_nr; i++) { + uint32_t pg_id = pin_handle->ph_pages[i]; + + pinfo = cache->ca_pages[pg_id].pg_info; + D_ASSERT(pinfo != NULL); + cache_unpin_page(cache, pinfo); + } + + D_FREE(pin_handle); +} + +int +umem_cache_reserve(struct umem_store *store) +{ + struct umem_cache *cache = store->cache; + int rc = 0, retry_cnt = 0; + + if (cache_mode(cache) == 1) + return rc; + + /* MUST ensure the FIFO order */ + if (!need_reserve(cache, 0) && !cache->ca_reserve_waiters) + return rc; + + D_ASSERT(cache->ca_reserve_wq != NULL); + cache->ca_reserve_waiters++; + if (cache->ca_reserve_waiters > 1) { + D_ASSERT(store->stor_ops->so_waitqueue_wait != NULL); + store->stor_ops->so_waitqueue_wait(cache->ca_reserve_wq, false); + } + + while (need_reserve(cache, 0)) { + rc = cache_evict_page(cache, false); + if (rc && rc != -DER_AGAIN && rc != -DER_BUSY) { + DL_ERROR(rc, "Evict page failed."); + break; + } + rc = 0; + + D_CDEBUG(retry_cnt == 10, DLOG_ERR, DB_TRACE, + "Retry reserve free page, %d times\n", retry_cnt); + retry_cnt++; + } + + D_ASSERT(cache->ca_reserve_waiters > 0); + cache->ca_reserve_waiters--; + if (cache->ca_reserve_waiters > 0) { + D_ASSERT(store->stor_ops->so_waitqueue_wakeup != NULL); + store->stor_ops->so_waitqueue_wakeup(cache->ca_reserve_wq, false); } return rc; } + +uint32_t +umem_get_mb_from_offset(struct umem_instance *umm, umem_off_t off) +{ + uint32_t page_id; + struct umem_cache *cache = umm->umm_pool->up_store.cache; + + page_id = cache_off2id(cache, off); + if (is_id_evictable(cache, page_id)) + return page_id; + return 0; +} + +umem_off_t +umem_get_mb_base_offset(struct umem_instance *umm, uint32_t id) +{ + struct umem_cache *cache = umm->umm_pool->up_store.cache; + + return cache_id2off(cache, id); +} + #endif diff --git a/src/common/tests/umem_test.c b/src/common/tests/umem_test.c index 8c192f7e892..6080843f51c 100644 --- a/src/common/tests/umem_test.c +++ 
b/src/common/tests/umem_test.c @@ -1,5 +1,5 @@ /** - * (C) Copyright 2019-2023 Intel Corporation. + * (C) Copyright 2019-2024 Intel Corporation. * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -58,16 +58,17 @@ reset_arg(struct test_arg *arg) static void touch_mem(struct test_arg *arg, uint64_t tx_id, uint64_t offset, uint64_t size) { + struct umem_cache *cache = arg->ta_store.cache; struct chunk *prep = &arg->ta_chunks[arg->ta_chunk_nr++]; struct chunk *flush = &arg->ta_chunks[arg->ta_chunk_nr++]; d_list_t *prep_list = &arg->ta_prep_list; d_list_t *flush_list = &arg->ta_flush_list; int rc; - rc = umem_cache_touch(&arg->ta_store, tx_id, offset, size); + rc = umem_cache_touch(&arg->ta_store, tx_id, offset + cache->ca_base_off, size); assert_int_equal(rc, 0); - prep->ch_off = offset; + prep->ch_off = offset + cache->ca_base_off; prep->ch_size = size; d_list_add_tail(&prep->ch_link, prep_list); @@ -140,7 +141,7 @@ check_io_region(struct test_arg *arg, struct umem_store_region *region) static void check_iov(struct test_arg *arg, d_iov_t *iov) { - find_expected(arg, "io_region", &arg->ta_flush_list, (uint64_t)iov->iov_buf, + find_expected(arg, "io_iov", &arg->ta_flush_list, (uint64_t)iov->iov_buf, (uint64_t)iov->iov_buf + iov->iov_len); } @@ -239,10 +240,18 @@ static int global_setup(void **state) { struct test_arg *arg; + int rc; + + rc = daos_debug_init(DAOS_LOG_DEFAULT); + if (rc) { + print_message("Failed to init debug\n"); + return 1; + } D_ALLOC_PTR(arg); if (arg == NULL) { print_message("Failed to allocate test struct\n"); + daos_debug_fini(); return 1; } @@ -259,6 +268,7 @@ global_teardown(void **state) umem_cache_free(&arg->ta_store); D_FREE(arg); + daos_debug_fini(); return 0; } @@ -419,16 +429,14 @@ test_page_cache(void **state) arg->ta_store.stor_ops = &stor_ops; arg->ta_store.store_type = DAOS_MD_BMEM; - rc = umem_cache_alloc(&arg->ta_store, 0); + rc = umem_cache_alloc(&arg->ta_store, UMEM_CACHE_PAGE_SZ, 3, 0, 0, 0, + (void *)(UMEM_CACHE_PAGE_SZ), NULL, NULL, NULL); assert_rc_equal(rc, 0); cache = arg->ta_store.cache; assert_non_null(cache); - assert_int_equal(cache->ca_num_pages, 3); - assert_int_equal(cache->ca_max_mapped, 3); - - rc = umem_cache_map_range(&arg->ta_store, 0, (void *)(UMEM_CACHE_PAGE_SZ), 3); - assert_rc_equal(rc, 0); + assert_int_equal(cache->ca_md_pages, 3); + assert_int_equal(cache->ca_mem_pages, 3); reset_arg(arg); /** touch multiple chunks */ @@ -486,16 +494,14 @@ test_many_pages(void **state) /** In case prior test failed */ umem_cache_free(&arg->ta_store); - rc = umem_cache_alloc(&arg->ta_store, 0); + rc = umem_cache_alloc(&arg->ta_store, UMEM_CACHE_PAGE_SZ, LARGE_NUM_PAGES, 0, 0, 0, + (void *)(UMEM_CACHE_PAGE_SZ), NULL, NULL, NULL); assert_rc_equal(rc, 0); cache = arg->ta_store.cache; assert_non_null(cache); - assert_int_equal(cache->ca_num_pages, LARGE_NUM_PAGES); - assert_int_equal(cache->ca_max_mapped, LARGE_NUM_PAGES); - - rc = umem_cache_map_range(&arg->ta_store, 0, (void *)(UMEM_CACHE_PAGE_SZ), LARGE_NUM_PAGES); - assert_rc_equal(rc, 0); + assert_int_equal(cache->ca_md_pages, LARGE_NUM_PAGES); + assert_int_equal(cache->ca_mem_pages, LARGE_NUM_PAGES); /** Touch all pages, more than can fit in a single set */ reset_arg(arg); @@ -532,16 +538,14 @@ test_many_writes(void **state) /** In case prior test failed */ umem_cache_free(&arg->ta_store); - rc = umem_cache_alloc(&arg->ta_store, 0); + rc = umem_cache_alloc(&arg->ta_store, UMEM_CACHE_PAGE_SZ, LARGE_NUM_PAGES, 0, 0, 0, + (void *)(UMEM_CACHE_PAGE_SZ), NULL, NULL, NULL); assert_rc_equal(rc, 0); 
cache = arg->ta_store.cache; assert_non_null(cache); - assert_int_equal(cache->ca_num_pages, LARGE_NUM_PAGES); - assert_int_equal(cache->ca_max_mapped, LARGE_NUM_PAGES); - - rc = umem_cache_map_range(&arg->ta_store, 0, (void *)(UMEM_CACHE_PAGE_SZ), LARGE_NUM_PAGES); - assert_rc_equal(rc, 0); + assert_int_equal(cache->ca_md_pages, LARGE_NUM_PAGES); + assert_int_equal(cache->ca_mem_pages, LARGE_NUM_PAGES); /** Touch all pages, more than can fit in a single set */ reset_arg(arg); @@ -559,6 +563,192 @@ test_many_writes(void **state) umem_cache_free(&arg->ta_store); } +static int +waitqueue_create(void **wq) +{ + *wq = (void *)(UINT64_MAX); + return 0; +} + +static void +waitqueue_destroy(void *wq) +{ +} + +static void +waitqueue_wait(void *wq, bool yield_only) +{ +} + +static void +waitqueue_wakeup(void *wq, bool wakeup_all) +{ +} + +static int +store_load(struct umem_store *store, char *start_addr, daos_off_t offset, daos_size_t len) +{ + return 0; +} + +static struct umem_store_ops p2_ops = { + .so_waitqueue_create = waitqueue_create, + .so_waitqueue_destroy = waitqueue_destroy, + .so_waitqueue_wait = waitqueue_wait, + .so_waitqueue_wakeup = waitqueue_wakeup, + .so_load = store_load, + .so_flush_prep = flush_prep, + .so_flush_copy = flush_copy, + .so_flush_post = flush_post, + .so_wal_id_cmp = wal_id_cmp, +}; + +#define PAGE_NUM_MD 20 +#define PAGE_NUM_MEM 10 +#define PAGE_NUM_MAX_NE 5 + +static bool +is_evictable_fn(void *arg, uint32_t page_id) +{ + return page_id >= PAGE_NUM_MAX_NE; +} + +static int +pagevnt_fn(int event_type, void *arg, uint32_t page_id) +{ + return 0; +} + +static void +test_p2_basic(void **state) +{ + struct test_arg *arg = *state; + struct umem_cache *cache; + struct umem_cache_range rg = { 0 }; + struct umem_pin_handle *pin_hdl; + int rc; + + arg->ta_store.stor_size = UMEM_CACHE_PAGE_SZ * PAGE_NUM_MD; + arg->ta_store.stor_ops = &p2_ops; + arg->ta_store.store_type = DAOS_MD_BMEM; + + rc = umem_cache_alloc(&arg->ta_store, UMEM_CACHE_PAGE_SZ, PAGE_NUM_MD, PAGE_NUM_MEM, + PAGE_NUM_MAX_NE, 4096, (void *)(UMEM_CACHE_PAGE_SZ), is_evictable_fn, + pagevnt_fn, NULL); + assert_rc_equal(rc, 0); + + cache = arg->ta_store.cache; + assert_non_null(cache); + + reset_arg(arg); + + /* Load single page */ + rg.cr_off = cache->ca_base_off; + rg.cr_size = UMEM_CACHE_PAGE_SZ; + rc = umem_cache_load(&arg->ta_store, &rg, 1, false); + assert_rc_equal(rc, 0); + assert_int_equal(cache->ca_pgs_stats[UMEM_PG_STATS_NONEVICTABLE], 1); + assert_ptr_equal(umem_cache_off2ptr(&arg->ta_store, cache->ca_base_off), cache->ca_base); + + /* Map single non-evictable page */ + rg.cr_off = cache->ca_base_off + 1 * UMEM_CACHE_PAGE_SZ; + rg.cr_size = UMEM_CACHE_PAGE_SZ; + rc = umem_cache_map(&arg->ta_store, &rg, 1); + assert_rc_equal(rc, 0); + assert_int_equal(cache->ca_pgs_stats[UMEM_PG_STATS_NONEVICTABLE], 2); + assert_ptr_equal(umem_cache_off2ptr(&arg->ta_store, + cache->ca_base_off + UMEM_CACHE_PAGE_SZ), + cache->ca_base + UMEM_CACHE_PAGE_SZ); + + /* Load multiple pages */ + rg.cr_off = cache->ca_base_off + (PAGE_NUM_MAX_NE - 1) * UMEM_CACHE_PAGE_SZ; + rg.cr_size = 3 * UMEM_CACHE_PAGE_SZ; + rc = umem_cache_load(&arg->ta_store, &rg, 1, false); + assert_rc_equal(rc, 0); + assert_int_equal(cache->ca_pgs_stats[UMEM_PG_STATS_NONEVICTABLE], 3); + + /* Pin multiple pages */ + rg.cr_off = cache->ca_base_off + (PAGE_NUM_MAX_NE - 1) * UMEM_CACHE_PAGE_SZ; + rg.cr_size = 2 * UMEM_CACHE_PAGE_SZ; + rc = umem_cache_pin(&arg->ta_store, &rg, 1, false, &pin_hdl); + assert_rc_equal(rc, 0); + assert_non_null(pin_hdl); + 
assert_int_equal(cache->ca_pgs_stats[UMEM_PG_STATS_PINNED], 1); + + /* Unpin the pinned pages */ + umem_cache_unpin(&arg->ta_store, pin_hdl); + assert_int_equal(cache->ca_pgs_stats[UMEM_PG_STATS_PINNED], 0); + + /* Reserve free pages */ + rc = umem_cache_reserve(&arg->ta_store); + assert_rc_equal(rc, 0); + + umem_cache_free(&arg->ta_store); +} + +static void +test_p2_evict(void **state) +{ + struct test_arg *arg = *state; + struct umem_cache *cache; + struct umem_cache_range rg = { 0 }; + struct umem_pin_handle *pin_hdl; + uint64_t id; + int i, rc; + + arg->ta_store.stor_size = UMEM_CACHE_PAGE_SZ * PAGE_NUM_MD; + arg->ta_store.stor_ops = &p2_ops; + arg->ta_store.store_type = DAOS_MD_BMEM; + + rc = umem_cache_alloc(&arg->ta_store, UMEM_CACHE_PAGE_SZ, PAGE_NUM_MD, PAGE_NUM_MEM, + PAGE_NUM_MAX_NE, 4096, (void *)(UMEM_CACHE_PAGE_SZ), is_evictable_fn, + pagevnt_fn, NULL); + assert_rc_equal(rc, 0); + + cache = arg->ta_store.cache; + assert_non_null(cache); + + reset_arg(arg); + + /* Load all non-evictable pages */ + rg.cr_off = cache->ca_base_off; + rg.cr_size = PAGE_NUM_MAX_NE * UMEM_CACHE_PAGE_SZ; + rc = umem_cache_load(&arg->ta_store, &rg, 1, false); + assert_rc_equal(rc, 0); + assert_int_equal(cache->ca_pgs_stats[UMEM_PG_STATS_NONEVICTABLE], PAGE_NUM_MAX_NE); + + /* Load more pages to fill the cache */ + rg.cr_off = cache->ca_base_off + PAGE_NUM_MAX_NE * UMEM_CACHE_PAGE_SZ; + rg.cr_size = (PAGE_NUM_MEM - PAGE_NUM_MAX_NE) * UMEM_CACHE_PAGE_SZ; + rc = umem_cache_load(&arg->ta_store, &rg, 1, false); + assert_rc_equal(rc, 0); + assert_int_equal(cache->ca_pgs_stats[UMEM_PG_STATS_NONEVICTABLE], PAGE_NUM_MAX_NE); + + /* Dirty all pages */ + for (i = 0; i < PAGE_NUM_MEM; i++) { + touch_mem(arg, i + 1, i * UMEM_CACHE_PAGE_SZ, UMEM_CACHE_CHUNK_SZ); + umem_cache_commit(&arg->ta_store, i + 1); + } + id = PAGE_NUM_MEM; + + /* Pin an unmapped page to trigger eviction */ + rg.cr_off = cache->ca_base_off + PAGE_NUM_MEM * UMEM_CACHE_PAGE_SZ; + rg.cr_size = 100; + rc = umem_cache_pin(&arg->ta_store, &rg, 1, false, &pin_hdl); + assert_rc_equal(rc, 0); + assert_int_equal(cache->ca_pgs_stats[UMEM_PG_STATS_PINNED], 1); + + umem_cache_unpin(&arg->ta_store, pin_hdl); + assert_int_equal(cache->ca_pgs_stats[UMEM_PG_STATS_PINNED], 0); + + rc = umem_cache_checkpoint(&arg->ta_store, wait_cb, NULL, &id, NULL); + assert_rc_equal(rc, 0); + assert_int_equal(id, PAGE_NUM_MEM); + check_lists_empty(arg); + + umem_cache_free(&arg->ta_store); +} + int main(int argc, char **argv) { @@ -570,6 +760,8 @@ main(int argc, char **argv) {"UMEM005: Test page cache", test_page_cache, NULL, NULL}, {"UMEM006: Test page cache many pages", test_many_pages, NULL, NULL}, {"UMEM007: Test page cache many writes", test_many_writes, NULL, NULL}, + {"UMEM008: Test phase2 APIs", test_p2_basic, NULL, NULL}, + {"UMEM009: Test phase2 eviction", test_p2_evict, NULL, NULL}, {NULL, NULL, NULL, NULL}}; d_register_alt_assert(mock_assert); diff --git a/src/common/tests/umem_test_bmem.c b/src/common/tests/umem_test_bmem.c index 07f4a112b36..cd745c48dc8 100644 --- a/src/common/tests/umem_test_bmem.c +++ b/src/common/tests/umem_test_bmem.c @@ -1,6 +1,6 @@ /** - * (C) Copyright 2019-2023 Intel Corporation. - * (C) Copyright 2023 Hewlett Packard Enterprise Development LP. + * (C) Copyright 2019-2024 Intel Corporation. + * (C) Copyright 2023-2024 Hewlett Packard Enterprise Development LP. 
* * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -28,7 +28,9 @@ #include #include "utest_common.h" -#define POOL_SIZE ((1024 * 1024 * 1024ULL)) +#define POOL_SIZE ((256 * 1024 * 1024ULL)) +#define NEMB_RATIO (0.8) +#define MB_SIZE (16 * 1024 * 1024) struct test_arg { struct utest_context *ta_utx; @@ -58,7 +60,7 @@ validate_persist_activity(uint64_t persist_reserv_incr, uint64_t persist_submit_ static int _persist_reserv(struct umem_store *store, uint64_t *id) { - persist_reserv_cnt++; + *id = persist_reserv_cnt++; return 0; } @@ -76,6 +78,116 @@ struct umem_store_ops _store_ops = { struct umem_store ustore = { .stor_size = POOL_SIZE, .stor_ops = &_store_ops, .store_type = DAOS_MD_BMEM }; +static int +waitqueue_create(void **wq) +{ + *wq = (void *)(UINT64_MAX); + return 0; +} + +static void +waitqueue_destroy(void *wq) +{ +} + +static void +waitqueue_wait(void *wq, bool yield_only) +{ +} + +static void +waitqueue_wakeup(void *wq, bool wakeup_all) +{ +} + +static int +store_load(struct umem_store *store, char *start_addr, daos_off_t offset, daos_size_t len) +{ + memset(start_addr, 0, len); + D_ASSERTF(0, "Test is not suppose to do a store_load"); +} + +char store_buf[4096]; + +static int +store_read(struct umem_store *store, struct umem_store_iod *iod, d_sg_list_t *sgl) +{ + /* Fake Heap header read write */ + D_ASSERT(sgl->sg_iovs->iov_len <= 4096); + memcpy(sgl->sg_iovs->iov_buf, store_buf, sgl->sg_iovs->iov_len); + return 0; +} + +static int +store_write(struct umem_store *store, struct umem_store_iod *iod, d_sg_list_t *sgl) +{ + /* Fake Heap header read write */ + D_ASSERT(sgl->sg_iovs->iov_len <= 4096); + memcpy(store_buf, sgl->sg_iovs->iov_buf, sgl->sg_iovs->iov_len); + return 0; +} + +static int +store_flush_prep(struct umem_store *store, struct umem_store_iod *iod, daos_handle_t *fh) +{ + D_ASSERTF(0, "Test is not suppose to do a store_flush_prep"); + return 0; +} + +static int +store_flush_copy(daos_handle_t fh, d_sg_list_t *sgl) +{ + D_ASSERTF(0, "Test is not suppose to do a store_flush_copy"); + return 0; +} + +static int +store_flush_post(daos_handle_t fh, int err) +{ + D_ASSERTF(0, "Test is not suppose to do a store_flush_post"); + return 0; +} + +static int +wal_id_cmp(struct umem_store *store, uint64_t id1, uint64_t id2) +{ + if (id1 > id2) + return 1; + if (id1 < id2) + return -1; + return 0; +} + +static int +wal_replay(struct umem_store *store, + int (*replay_cb)(uint64_t tx_id, struct umem_action *act, void *arg), void *arg) +{ + D_ASSERTF(0, "Test is not suppose to do a store_flush_post"); + return 0; +} + +struct umem_store_ops _store_ops_v2 = { + .so_waitqueue_create = waitqueue_create, + .so_waitqueue_destroy = waitqueue_destroy, + .so_waitqueue_wait = waitqueue_wait, + .so_waitqueue_wakeup = waitqueue_wakeup, + .so_load = store_load, + .so_read = store_read, + .so_write = store_write, + .so_flush_prep = store_flush_prep, + .so_flush_copy = store_flush_copy, + .so_flush_post = store_flush_post, + .so_wal_reserv = _persist_reserv, + .so_wal_submit = _persist_submit, + .so_wal_replay = wal_replay, + .so_wal_id_cmp = wal_id_cmp, +}; + +struct umem_store ustore_v2 = {.stor_size = POOL_SIZE * 2, + .stor_ops = &_store_ops_v2, + .store_type = DAOS_MD_BMEM_V2, + .stor_priv = (void *)(UINT64_MAX)}; + int teardown_pmem(void **state) { @@ -94,8 +206,8 @@ teardown_pmem(void **state) return rc; } -int -setup_pmem(void **state) +static int +setup_pmem_internal(void **state, struct umem_store *store) { struct test_arg *arg = *state; static int tnum; @@ -107,8 +219,8 @@ 
setup_pmem(void **state) return 1; } - rc = utest_pmem_create(arg->ta_pool_name, POOL_SIZE, - sizeof(*arg->ta_root), &ustore, &arg->ta_utx); + rc = utest_pmem_create(arg->ta_pool_name, POOL_SIZE, sizeof(*arg->ta_root), store, + &arg->ta_utx); if (rc != 0) { perror("Could not create pmem context"); rc = 1; @@ -123,6 +235,18 @@ setup_pmem(void **state) return rc; } +static int +setup_pmem(void **state) +{ + return setup_pmem_internal(state, &ustore); +} + +static int +setup_pmem_v2(void **state) +{ + return setup_pmem_internal(state, &ustore_v2); +} + static int global_setup(void **state) { @@ -132,6 +256,7 @@ global_setup(void **state) print_message("Failed to set the md_on_ssd tunable\n"); return 1; } + ustore.store_type = umempobj_get_backend_type(); D_ALLOC_PTR(arg); if (arg == NULL) { @@ -204,6 +329,57 @@ test_atomic_alloc(void **state) assert_true(cur_mem_used == initial_mem_used); } +static void +test_atomic_alloc_from_bucket(void **state) +{ + struct test_arg *arg = *state; + struct umem_instance *umm = utest_utx2umm(arg->ta_utx); + uint64_t off, size, off_arr[16]; + int i, rc; + uint64_t initial_mem_used, cur_mem_used; + uint64_t total_size = 0; + + utest_get_scm_used_space(arg->ta_utx, &initial_mem_used); + snap_persist_activity(); + off = umem_atomic_alloc_from_bucket(umm, 1024, UMEM_TYPE_ANY, UMEM_DEFAULT_MBKT_ID); + assert_false(UMOFF_IS_NULL(off)); + validate_persist_activity(1, 1); + + rc = umem_atomic_free(umm, off); + assert_int_equal(rc, 0); + validate_persist_activity(2, 2); + utest_get_scm_used_space(arg->ta_utx, &cur_mem_used); + assert_true(cur_mem_used == initial_mem_used); + + /* Negative test: Incorrect size test */ + snap_persist_activity(); + off = umem_atomic_alloc_from_bucket(umm, 0, UMEM_TYPE_ANY, UMEM_DEFAULT_MBKT_ID); + assert_true(UMOFF_IS_NULL(off)); + validate_persist_activity(0, 0); + + /* Validate allocation of various sizes */ + snap_persist_activity(); + for (i = 1; i < 16; i++) { + size = (1ul<ta_utx, &cur_mem_used); + assert_true(cur_mem_used >= initial_mem_used+total_size); + + snap_persist_activity(); + for (i = 15; i > 0; i--) { + rc = umem_atomic_free(umm, off_arr[i]); + assert_int_equal(rc, 0); + } + validate_persist_activity(15, 15); + utest_get_scm_used_space(arg->ta_utx, &cur_mem_used); + assert_true(cur_mem_used == initial_mem_used); +} + static void test_atomic_copy(void **state) { @@ -769,29 +945,41 @@ test_alloc(void **state) int rc; rc = utest_tx_begin(arg->ta_utx); - if (rc != 0) - goto done; + assert_int_equal(rc, 0); umoff = umem_zalloc(umm, 4); - if (UMOFF_IS_NULL(umoff)) { - print_message("umoff unexpectedly NULL\n"); - rc = 1; - goto end; - } + assert_false(UMOFF_IS_NULL(umoff)); value1 = umem_off2ptr(umm, umoff); + assert_true(*value1 == 0); - if (*value1 != 0) { - print_message("Bad value for allocated umoff\n"); - rc = 1; - goto end; - } + rc = umem_free(umm, umoff); + assert_int_equal(rc, 0); + utest_tx_end(arg->ta_utx, rc); +} + +static void +test_alloc_from_bucket(void **state) +{ + struct test_arg *arg = *state; + struct umem_instance *umm = utest_utx2umm(arg->ta_utx); + int *value1; + umem_off_t umoff = 0; + int rc; + + rc = utest_tx_begin(arg->ta_utx); + assert_int_equal(rc, 0); + + umoff = umem_zalloc_from_bucket(umm, 4, UMEM_DEFAULT_MBKT_ID); + assert_false(UMOFF_IS_NULL(umoff)); + + value1 = umem_off2ptr(umm, umoff); + + assert_true(*value1 == 0); rc = umem_free(umm, umoff); -end: - rc = utest_tx_end(arg->ta_utx, rc); -done: assert_int_equal(rc, 0); + utest_tx_end(arg->ta_utx, rc); } static void @@ -923,106 +1111,234 @@ 
test_tx_alloc(void **state) } static void -test_tx_add(void **state) +test_tx_alloc_from_bucket(void **state) { struct test_arg *arg = *state; struct umem_instance *umm = utest_utx2umm(arg->ta_utx); int rc; - umem_off_t umoff; - char *start_ptr, *tmp_ptr; - char local_buf[2048]; - - /* Setup */ - umoff = umem_atomic_alloc(umm, 2048, UMEM_TYPE_ANY); - assert_false(UMOFF_IS_NULL(umoff)); - start_ptr = umem_off2ptr(umm, umoff); - memset(local_buf, 0, 2048); - tmp_ptr = umem_atomic_copy(umm, start_ptr, local_buf, 2048, UMEM_COMMIT_IMMEDIATE); - assert_true(tmp_ptr == start_ptr); - - /* Negative tests */ - expect_assert_failure(umem_tx_add(umm, umoff, 128)); + daos_size_t allotted_size = 0; + uint64_t initial_mem_used, cur_mem_used; + int *value1, *value2; + umem_off_t umoff1 = 0, umoff2 = 0; - /* Normal operation */ + /* Test umem_zalloc */ snap_persist_activity(); + utest_get_scm_used_space(arg->ta_utx, &initial_mem_used); rc = umem_tx_begin(umm, NULL); assert_int_equal(rc, 0); - rc = umem_tx_add(umm, umoff, 128); - assert_int_equal(rc, 0); - start_ptr = umem_off2ptr(umm, umoff); - memset(start_ptr, 'a', 128); - memset(local_buf, 'a', 128); - rc = umem_tx_end(umm, 0); + + umoff1 = umem_zalloc_from_bucket(umm, 4, UMEM_DEFAULT_MBKT_ID); + assert_false(UMOFF_IS_NULL(umoff1)); + allotted_size += 4; + + value1 = umem_off2ptr(umm, umoff1); + + assert_true(*value1 == 0); + + rc = umem_tx_commit(umm); assert_int_equal(rc, 0); validate_persist_activity(1, 1); - assert_false(strncmp(local_buf, start_ptr, 128)); + utest_get_scm_used_space(arg->ta_utx, &cur_mem_used); + assert_true(cur_mem_used >= (initial_mem_used + allotted_size)); - /* Abort a transaction after tx add */ + /* Test umem_alloc */ snap_persist_activity(); rc = umem_tx_begin(umm, NULL); assert_int_equal(rc, 0); - rc = umem_tx_add(umm, umoff+128, 128); - assert_int_equal(rc, 0); - tmp_ptr = umem_off2ptr(umm, umoff+128); - memset(tmp_ptr, 'b', 128); - rc = umem_tx_abort(umm, 1); - assert_true(rc != 0); - validate_persist_activity(1, 0); - assert_false(strncmp(local_buf, start_ptr, 256)); - /* Invalid offset */ - snap_persist_activity(); - rc = umem_tx_begin(umm, NULL); - assert_int_equal(rc, 0); - rc = umem_tx_add(umm, POOL_SIZE+4096, 128); - assert_true(rc != 0); - assert_true(umem_tx_stage(umm) == UMEM_STAGE_ONABORT); - rc = umem_tx_end(umm, rc); - assert_true(rc != 0); - validate_persist_activity(1, 0); -} + umoff2 = umem_alloc_from_bucket(umm, 4, UMEM_DEFAULT_MBKT_ID); + allotted_size += 4; + assert_false(UMOFF_IS_NULL(umoff2)); -static void -test_tx_add_ptr(void **state) -{ - struct test_arg *arg = *state; - struct umem_instance *umm = utest_utx2umm(arg->ta_utx); - int rc; - umem_off_t umoff; - char *start_ptr, *tmp_ptr; - char local_buf[2048]; + value2 = umem_off2ptr(umm, umoff2); + *value2 = 100; - /* Setup */ - umoff = umem_atomic_alloc(umm, 2048, UMEM_TYPE_ANY); - assert_false(UMOFF_IS_NULL(umoff)); - start_ptr = umem_off2ptr(umm, umoff); - memset(local_buf, 0, 2048); - tmp_ptr = umem_atomic_copy(umm, start_ptr, local_buf, 2048, UMEM_COMMIT_IMMEDIATE); - assert_true(tmp_ptr == start_ptr); + rc = umem_tx_commit(umm); + assert_int_equal(rc, 0); - /* Negative tests */ - expect_assert_failure(umem_tx_add_ptr(umm, start_ptr, 128)); + validate_persist_activity(1, 1); + utest_get_scm_used_space(arg->ta_utx, &cur_mem_used); + assert_true(cur_mem_used >= (initial_mem_used + allotted_size)); - /* Normal operation */ + /* Test umem_free */ snap_persist_activity(); rc = umem_tx_begin(umm, NULL); assert_int_equal(rc, 0); - start_ptr = 
umem_off2ptr(umm, umoff); - rc = umem_tx_add_ptr(umm, start_ptr, 128); - assert_int_equal(rc, 0); - memset(start_ptr, 'a', 128); - memset(local_buf, 'a', 128); - rc = umem_tx_end(umm, 0); + + rc = umem_free(umm, umoff2); assert_int_equal(rc, 0); - validate_persist_activity(1, 1); - assert_false(strncmp(local_buf, start_ptr, 128)); + allotted_size -= 4; - /* Abort a transaction after tx add */ - snap_persist_activity(); - rc = umem_tx_begin(umm, NULL); + rc = umem_free(umm, umoff1); assert_int_equal(rc, 0); - tmp_ptr = umem_off2ptr(umm, umoff+128); - rc = umem_tx_add_ptr(umm, tmp_ptr, 128); + allotted_size -= 4; + + rc = umem_tx_commit(umm); + assert_int_equal(rc, 0); + validate_persist_activity(1, 1); + utest_get_scm_used_space(arg->ta_utx, &cur_mem_used); + assert_true(allotted_size == 0); + assert_true(cur_mem_used == initial_mem_used); + + /* Negative Tests */ + /* Outside of TX */ + expect_assert_failure(umem_alloc_from_bucket(umm, 100, UMEM_DEFAULT_MBKT_ID)); + expect_assert_failure(umem_zalloc_from_bucket(umm, 100, UMEM_DEFAULT_MBKT_ID)); + + /* alloc of size zero */ + snap_persist_activity(); + utest_get_scm_used_space(arg->ta_utx, &initial_mem_used); + rc = umem_tx_begin(umm, NULL); + assert_int_equal(rc, 0); + umoff1 = umem_alloc_from_bucket(umm, 0, UMEM_DEFAULT_MBKT_ID); + assert_true(UMOFF_IS_NULL(umoff1)); + assert_true(umem_tx_stage(umm) == UMEM_STAGE_ONABORT); + rc = umem_tx_end(umm, 1); + assert_false(rc == 0); + validate_persist_activity(1, 0); + utest_get_scm_used_space(arg->ta_utx, &cur_mem_used); + assert_true(initial_mem_used == cur_mem_used); + + snap_persist_activity(); + utest_get_scm_used_space(arg->ta_utx, &initial_mem_used); + rc = umem_tx_begin(umm, NULL); + assert_int_equal(rc, 0); + umoff1 = umem_zalloc_from_bucket(umm, 0, UMEM_DEFAULT_MBKT_ID); + assert_true(UMOFF_IS_NULL(umoff1)); + assert_true(umem_tx_stage(umm) == UMEM_STAGE_ONABORT); + rc = umem_tx_end(umm, 1); + assert_false(rc == 0); + validate_persist_activity(1, 0); + utest_get_scm_used_space(arg->ta_utx, &cur_mem_used); + assert_true(initial_mem_used == cur_mem_used); + + /* free outside of tx */ + rc = umem_tx_begin(umm, NULL); + assert_int_equal(rc, 0); + umoff1 = umem_zalloc_from_bucket(umm, 4, UMEM_DEFAULT_MBKT_ID); + assert_false(UMOFF_IS_NULL(umoff1)); + rc = umem_tx_end(umm, 0); + assert_int_equal(rc, 0); + expect_assert_failure(umem_free(umm, umoff1)); + + /* abort after alloc and used memory should not increase */ + snap_persist_activity(); + utest_get_scm_used_space(arg->ta_utx, &initial_mem_used); + rc = umem_tx_begin(umm, NULL); + assert_int_equal(rc, 0); + umoff1 = umem_alloc_from_bucket(umm, 16, UMEM_DEFAULT_MBKT_ID); + assert_false(UMOFF_IS_NULL(umoff1)); + umoff1 = umem_zalloc_from_bucket(umm, 32, UMEM_DEFAULT_MBKT_ID); + assert_false(UMOFF_IS_NULL(umoff2)); + rc = umem_tx_abort(umm, 1); + assert_false(rc == 0); + validate_persist_activity(1, 0); + utest_get_scm_used_space(arg->ta_utx, &cur_mem_used); + assert_true(initial_mem_used == cur_mem_used); + +} + +static void +test_tx_add(void **state) +{ + struct test_arg *arg = *state; + struct umem_instance *umm = utest_utx2umm(arg->ta_utx); + int rc; + umem_off_t umoff; + char *start_ptr, *tmp_ptr; + char local_buf[2048]; + + /* Setup */ + umoff = umem_atomic_alloc(umm, 2048, UMEM_TYPE_ANY); + assert_false(UMOFF_IS_NULL(umoff)); + start_ptr = umem_off2ptr(umm, umoff); + memset(local_buf, 0, 2048); + tmp_ptr = umem_atomic_copy(umm, start_ptr, local_buf, 2048, UMEM_COMMIT_IMMEDIATE); + assert_true(tmp_ptr == start_ptr); + + /* Negative 
tests */ + expect_assert_failure(umem_tx_add(umm, umoff, 128)); + + /* Normal operation */ + snap_persist_activity(); + rc = umem_tx_begin(umm, NULL); + assert_int_equal(rc, 0); + rc = umem_tx_add(umm, umoff, 128); + assert_int_equal(rc, 0); + start_ptr = umem_off2ptr(umm, umoff); + memset(start_ptr, 'a', 128); + memset(local_buf, 'a', 128); + rc = umem_tx_end(umm, 0); + assert_int_equal(rc, 0); + validate_persist_activity(1, 1); + assert_false(strncmp(local_buf, start_ptr, 128)); + + /* Abort a transaction after tx add */ + snap_persist_activity(); + rc = umem_tx_begin(umm, NULL); + assert_int_equal(rc, 0); + rc = umem_tx_add(umm, umoff+128, 128); + assert_int_equal(rc, 0); + tmp_ptr = umem_off2ptr(umm, umoff+128); + memset(tmp_ptr, 'b', 128); + rc = umem_tx_abort(umm, 1); + assert_true(rc != 0); + validate_persist_activity(1, 0); + assert_false(strncmp(local_buf, start_ptr, 256)); + + /* Invalid offset */ + snap_persist_activity(); + rc = umem_tx_begin(umm, NULL); + assert_int_equal(rc, 0); + rc = umem_tx_add(umm, umm->umm_pool->up_store.stor_size + 4096, 128); + assert_true(rc != 0); + assert_true(umem_tx_stage(umm) == UMEM_STAGE_ONABORT); + rc = umem_tx_end(umm, rc); + assert_true(rc != 0); + validate_persist_activity(1, 0); +} + +static void +test_tx_add_ptr(void **state) +{ + struct test_arg *arg = *state; + struct umem_instance *umm = utest_utx2umm(arg->ta_utx); + int rc; + umem_off_t umoff; + char *start_ptr, *tmp_ptr; + char local_buf[2048]; + + /* Setup */ + umoff = umem_atomic_alloc(umm, 2048, UMEM_TYPE_ANY); + assert_false(UMOFF_IS_NULL(umoff)); + start_ptr = umem_off2ptr(umm, umoff); + memset(local_buf, 0, 2048); + tmp_ptr = umem_atomic_copy(umm, start_ptr, local_buf, 2048, UMEM_COMMIT_IMMEDIATE); + assert_true(tmp_ptr == start_ptr); + + /* Negative tests */ + expect_assert_failure(umem_tx_add_ptr(umm, start_ptr, 128)); + + /* Normal operation */ + snap_persist_activity(); + rc = umem_tx_begin(umm, NULL); + assert_int_equal(rc, 0); + start_ptr = umem_off2ptr(umm, umoff); + rc = umem_tx_add_ptr(umm, start_ptr, 128); + assert_int_equal(rc, 0); + memset(start_ptr, 'a', 128); + memset(local_buf, 'a', 128); + rc = umem_tx_end(umm, 0); + assert_int_equal(rc, 0); + validate_persist_activity(1, 1); + assert_false(strncmp(local_buf, start_ptr, 128)); + + /* Abort a transaction after tx add */ + snap_persist_activity(); + rc = umem_tx_begin(umm, NULL); + assert_int_equal(rc, 0); + tmp_ptr = umem_off2ptr(umm, umoff+128); + rc = umem_tx_add_ptr(umm, tmp_ptr, 128); assert_int_equal(rc, 0); memset(tmp_ptr, 'b', 128); rc = umem_tx_abort(umm, 1); @@ -1094,17 +1410,6 @@ test_tx_xadd_ptr(void **state) assert_true(rc != 0); validate_persist_activity(1, 0); assert_false(strncmp(local_buf, start_ptr, 512)); - - /* Invalid pointer */ - snap_persist_activity(); - rc = umem_tx_begin(umm, NULL); - assert_int_equal(rc, 0); - rc = umem_tx_xadd_ptr(umm, local_buf, 128, UMEM_XADD_NO_SNAPSHOT); - assert_true(rc != 0); - assert_true(umem_tx_stage(umm) == UMEM_STAGE_ONABORT); - rc = umem_tx_end(umm, rc); - assert_true(rc != 0); - validate_persist_activity(1, 0); } static void @@ -1238,6 +1543,137 @@ test_tx_reserve_publish_cancel(void **state) umem_rsrvd_act_free(&rsrvd_act); } +static void +test_tx_bucket_reserve_publish_cancel(void **state) +{ + struct test_arg *arg = *state; + struct umem_instance *umm = utest_utx2umm(arg->ta_utx); + int rc; + struct umem_rsrvd_act *rsrvd_act; + umem_off_t umoff; + char *rsrv_ptr1, *rsrv_ptr2, *rsrv_ptr3, *rsrv_ptr4; + char *data = "Test Program test_tx_xadd_ptr"; + char 
local_buf[980]; + uint64_t initial_mem_used, cur_mem_used; + uint64_t allotted_mem = 0; + char addon_buf[128]; + + /* Reserve/Publish */ + rc = umem_rsrvd_act_alloc(umm, &rsrvd_act, 2); + assert_int_equal(rc, 0); + umoff = umem_reserve_from_bucket(umm, rsrvd_act, 980, UMEM_DEFAULT_MBKT_ID); + assert_false(UMOFF_IS_NULL(umoff)); + rsrv_ptr1 = umem_off2ptr(umm, umoff); + memset(rsrv_ptr1, 0, 980); + memset(local_buf, 0, 980); + memcpy(rsrv_ptr1+128, data, strlen(data)); + memcpy(local_buf+128, data, strlen(data)); + + umoff = umem_reserve_from_bucket(umm, rsrvd_act, 128, UMEM_DEFAULT_MBKT_ID); + assert_false(UMOFF_IS_NULL(umoff)); + rsrv_ptr2 = umem_off2ptr(umm, umoff); + memset(rsrv_ptr2, 0, 128); + memset(addon_buf, 0, 128); + memcpy(rsrv_ptr2, data, strlen(data)); + memcpy(addon_buf, data, strlen(data)); + + + utest_get_scm_used_space(arg->ta_utx, &initial_mem_used); + snap_persist_activity(); + rc = umem_tx_begin(umm, NULL); + assert_int_equal(rc, 0); + rc = umem_tx_add_ptr(umm, rsrv_ptr1, 128); + assert_int_equal(rc, 0); + strcpy(rsrv_ptr1, "header"); + strcpy(local_buf, "header"); + rc = umem_tx_publish(umm, rsrvd_act); + assert_int_equal(rc, 0); + allotted_mem = 980 + 128; + rc = umem_tx_commit(umm); + assert_int_equal(rc, 0); + validate_persist_activity(1, 1); + utest_get_scm_used_space(arg->ta_utx, &cur_mem_used); + assert_true(cur_mem_used >= initial_mem_used + allotted_mem); + assert_int_equal(memcmp(rsrv_ptr1, local_buf, 980), 0); + assert_int_equal(memcmp(rsrv_ptr2, addon_buf, 128), 0); + umem_rsrvd_act_free(&rsrvd_act); + + + /* Reserve/Cancel */ + rc = umem_rsrvd_act_alloc(umm, &rsrvd_act, 2); + assert_int_equal(rc, 0); + umoff = umem_reserve_from_bucket(umm, rsrvd_act, 980, UMEM_DEFAULT_MBKT_ID); + assert_false(UMOFF_IS_NULL(umoff)); + rsrv_ptr1 = umem_off2ptr(umm, umoff); + memset(rsrv_ptr1, 1, 980); + memset(local_buf, 1, 980); + + umoff = umem_reserve_from_bucket(umm, rsrvd_act, 128, UMEM_DEFAULT_MBKT_ID); + assert_false(UMOFF_IS_NULL(umoff)); + rsrv_ptr2 = umem_off2ptr(umm, umoff); + memset(rsrv_ptr2, 1, 128); + + + utest_get_scm_used_space(arg->ta_utx, &initial_mem_used); + snap_persist_activity(); + rc = umem_tx_begin(umm, NULL); + assert_int_equal(rc, 0); + rc = umem_tx_add_ptr(umm, rsrv_ptr1, 128); + assert_int_equal(rc, 0); + strcpy(rsrv_ptr1, "header"); + rc = umem_tx_add_ptr(umm, rsrv_ptr2, 128); + assert_int_equal(rc, 0); + strcpy(rsrv_ptr2, "leader"); + rc = umem_tx_abort(umm, 1); + assert_false(rc == 0); + assert_int_equal(memcmp(rsrv_ptr1, local_buf, 980), 0); + assert_int_equal(memcmp(rsrv_ptr2, local_buf, 128), 0); + umem_cancel(umm, rsrvd_act); + validate_persist_activity(1, 0); + utest_get_scm_used_space(arg->ta_utx, &cur_mem_used); + assert_true(cur_mem_used >= initial_mem_used); + umoff = umem_atomic_alloc_from_bucket(umm, 980, UMEM_TYPE_ANY, UMEM_DEFAULT_MBKT_ID); + assert_false(UMOFF_IS_NULL(umoff)); + rsrv_ptr3 = umem_off2ptr(umm, umoff); + assert_ptr_equal(rsrv_ptr1, rsrv_ptr3); + umoff = umem_atomic_alloc_from_bucket(umm, 128, UMEM_TYPE_ANY, UMEM_DEFAULT_MBKT_ID); + assert_false(UMOFF_IS_NULL(umoff)); + rsrv_ptr4 = umem_off2ptr(umm, umoff); + assert_ptr_equal(rsrv_ptr2, rsrv_ptr4); + umem_rsrvd_act_free(&rsrvd_act); + + /* reserve - atomic_copy - cancel */ + rc = umem_rsrvd_act_alloc(umm, &rsrvd_act, 2); + assert_int_equal(rc, 0); + umoff = umem_reserve_from_bucket(umm, rsrvd_act, 980, UMEM_DEFAULT_MBKT_ID); + assert_false(UMOFF_IS_NULL(umoff)); + rsrv_ptr1 = umem_off2ptr(umm, umoff); + memset(local_buf, 1, 980); + memcpy(local_buf+128, data, 
strlen(data)); + snap_persist_activity(); + umem_atomic_copy(umm, rsrv_ptr1, local_buf, 980, UMEM_COMMIT_IMMEDIATE); + validate_persist_activity(1, 1); + + utest_get_scm_used_space(arg->ta_utx, &initial_mem_used); + snap_persist_activity(); + rc = umem_tx_begin(umm, NULL); + assert_int_equal(rc, 0); + rc = umem_tx_add_ptr(umm, rsrv_ptr1, 128); + assert_int_equal(rc, 0); + strcpy(rsrv_ptr1, "header"); + strcpy(local_buf, "header"); + rc = umem_tx_publish(umm, rsrvd_act); + assert_int_equal(rc, 0); + allotted_mem = 980; + rc = umem_tx_commit(umm); + assert_int_equal(rc, 0); + validate_persist_activity(1, 1); + utest_get_scm_used_space(arg->ta_utx, &cur_mem_used); + assert_true(cur_mem_used >= initial_mem_used + allotted_mem); + assert_int_equal(memcmp(rsrv_ptr1, local_buf, 980), 0); + umem_rsrvd_act_free(&rsrvd_act); +} + static void test_tx_dfree_publish_cancel(void **state) { @@ -1295,123 +1731,980 @@ test_tx_dfree_publish_cancel(void **state) umem_rsrvd_act_free(&rsrvd_act); } -#if 0 -/** This test is removed because the umempobj_set_slab_desc APIs are removed. Testing the - * underlying dav or pmem APIs should probably be handled elsewhere. - */ static void -test_tx_alloc_withslabs(void **state) +test_tx_bucket_dfree_publish_cancel(void **state) { struct test_arg *arg = *state; struct umem_instance *umm = utest_utx2umm(arg->ta_utx); - struct umem_slab_desc slab[5]; - int rc, i; - umem_off_t ummoff_exact1[5], ummoff_less[5], ummoff_exact2[5], ummoff_greater; - size_t size_exact, size_less, size_greater; - size_t initial_mem_used, cur_mem_used, total_allotted; - - /* Negative tests for allocation class */ - slab[0].unit_size = ULONG_MAX; - slab[0].class_id = 0; - rc = umempobj_set_slab_desc(umm->umm_pool, &slab[0]); - assert_int_not_equal(rc, 0); - slab[0].unit_size = 344; - slab[0].class_id = UINT8_MAX; - rc = umempobj_set_slab_desc(umm->umm_pool, &slab[0]); - assert_int_not_equal(rc, 0); - - /* Valid slab creation */ - for (i = 0; i < 5; i++) { - slab[i].unit_size = (1<<(i*2)) + 200 + i*16; - slab[i].class_id = 0; - rc = umempobj_set_slab_desc(umm->umm_pool, &slab[i]); - assert_int_equal(rc, 0); - assert_int_not_equal(slab[i].class_id, 0); + int rc; + struct umem_rsrvd_act *rsrvd_act; + umem_off_t umoff1, umoff2; + uint64_t freed_mem = 0; + uint64_t initial_mem_used, cur_mem_used; - umm->umm_slabs[i] = slab[i]; - } + /* Defer Free/Publish */ + umoff1 = umem_atomic_alloc_from_bucket(umm, 2048, UMEM_TYPE_ANY, UMEM_DEFAULT_MBKT_ID); + assert_false(UMOFF_IS_NULL(umoff1)); + umoff2 = umem_atomic_alloc_from_bucket(umm, 1024, UMEM_TYPE_ANY, UMEM_DEFAULT_MBKT_ID); + assert_false(UMOFF_IS_NULL(umoff2)); + + rc = umem_rsrvd_act_alloc(umm, &rsrvd_act, 2); + assert_int_equal(rc, 0); + + umem_defer_free(umm, umoff1, rsrvd_act); + umem_defer_free(umm, umoff2, rsrvd_act); utest_get_scm_used_space(arg->ta_utx, &initial_mem_used); + snap_persist_activity(); rc = umem_tx_begin(umm, NULL); assert_int_equal(rc, 0); - total_allotted = 0; - for (i = 0; i < 5; i++) { - size_exact = (1<<(i*2)) + 200 + i*16; - ummoff_exact1[i] = umem_alloc_verb(umm, i, UMEM_FLAG_ZERO, size_exact); - assert_false(UMOFF_IS_NULL(ummoff_exact1[i])); - size_less = 200; - ummoff_less[i] = umem_alloc_verb(umm, i, UMEM_FLAG_ZERO, size_less); - assert_false(UMOFF_IS_NULL(ummoff_less[i])); - assert_true(ummoff_exact1[i] + size_exact == ummoff_less[i]); - ummoff_exact2[i] = umem_alloc_verb(umm, i, UMEM_FLAG_ZERO, size_exact); - assert_false(UMOFF_IS_NULL(ummoff_exact2[i])); - assert_true(ummoff_less[i] + size_exact == ummoff_exact2[i]); - 
total_allotted += size_exact*3; - } + rc = umem_tx_publish(umm, rsrvd_act); + assert_int_equal(rc, 0); + freed_mem = 2048 + 1024; rc = umem_tx_commit(umm); assert_int_equal(rc, 0); + validate_persist_activity(1, 1); utest_get_scm_used_space(arg->ta_utx, &cur_mem_used); - assert_true(initial_mem_used + total_allotted == cur_mem_used); + assert_true(initial_mem_used >= cur_mem_used + freed_mem); + umem_rsrvd_act_free(&rsrvd_act); - for (i = 0; i < 5; i++) { - size_greater = (1<<(i*2)) + 200 + i*16 + 100; - rc = umem_tx_begin(umm, NULL); - assert_int_equal(rc, 0); - ummoff_greater = umem_alloc_verb(umm, i, UMEM_FLAG_ZERO, size_greater); - assert_true(UMOFF_IS_NULL(ummoff_greater)); - rc = umem_tx_end(umm, 1); - assert_int_equal(rc, umem_tx_errno(ENOMEM)); + + /* Defer Free/Cancel */ + umoff1 = umem_atomic_alloc_from_bucket(umm, 2048, UMEM_TYPE_ANY, UMEM_DEFAULT_MBKT_ID); + assert_false(UMOFF_IS_NULL(umoff1)); + umoff2 = umem_atomic_alloc_from_bucket(umm, 1024, UMEM_TYPE_ANY, UMEM_DEFAULT_MBKT_ID); + assert_false(UMOFF_IS_NULL(umoff2)); + + rc = umem_rsrvd_act_alloc(umm, &rsrvd_act, 2); + assert_int_equal(rc, 0); + + umem_defer_free(umm, umoff1, rsrvd_act); + umem_defer_free(umm, umoff2, rsrvd_act); + + utest_get_scm_used_space(arg->ta_utx, &initial_mem_used); + umem_cancel(umm, rsrvd_act); + utest_get_scm_used_space(arg->ta_utx, &cur_mem_used); + assert_true(cur_mem_used >= initial_mem_used); + umem_rsrvd_act_free(&rsrvd_act); +} + +static void +test_atomic_alloc_mb(void **state) +{ + struct test_arg *arg = *state; + struct umem_instance *umm = utest_utx2umm(arg->ta_utx); + umem_off_t umoff, umoff1, umoff2, umoff3, umoff4; + uint32_t mb_id; + int found = 0, i; + + mb_id = umem_allot_mb_evictable(umm, 0); + assert_int_not_equal(mb_id, 0); /* zero maps to non-evictable memory bucket */ + + /* Allocate objects from the memory bucket */ + umoff1 = umem_atomic_alloc_from_bucket(umm, 2048, UMEM_TYPE_ANY, mb_id); + assert_false(UMOFF_IS_NULL(umoff1)); + assert_true(umem_get_mb_from_offset(umm, umoff1) == mb_id); + umoff2 = umem_atomic_alloc_from_bucket(umm, 1024, UMEM_TYPE_ANY, mb_id); + assert_false(UMOFF_IS_NULL(umoff2)); + assert_true(umem_get_mb_from_offset(umm, umoff2) == mb_id); + + /* Allocate from non-evictable memory bucket */ + umoff3 = umem_atomic_alloc_from_bucket(umm, 2048, UMEM_TYPE_ANY, UMEM_DEFAULT_MBKT_ID); + assert_false(UMOFF_IS_NULL(umoff3)); + assert_true(umem_get_mb_from_offset(umm, umoff3) == UMEM_DEFAULT_MBKT_ID); + umoff4 = umem_atomic_alloc_from_bucket(umm, 1024, UMEM_TYPE_ANY, UMEM_DEFAULT_MBKT_ID); + assert_false(UMOFF_IS_NULL(umoff4)); + assert_true(umem_get_mb_from_offset(umm, umoff4) == UMEM_DEFAULT_MBKT_ID); + + /* Free allocated objects */ + umem_atomic_free(umm, umoff1); + umem_atomic_free(umm, umoff2); + umem_atomic_free(umm, umoff3); + umem_atomic_free(umm, umoff4); + + /* + * Validate whether those freed objects are in the free list of respective + * Memory buckets. We do many allocations and free to ensure that the objects + * in recycler bin are moved back for reallocation. 
+ */ + + found = 0; + for (i = 0; i < 16 * 1024; i++) { + umoff = umem_atomic_alloc_from_bucket(umm, 2048, UMEM_TYPE_ANY, mb_id); + assert_false(UMOFF_IS_NULL(umoff)); + assert_true(umem_get_mb_from_offset(umm, umoff) == mb_id); + umem_atomic_free(umm, umoff); + if (umoff == umoff1) { + found = 1; + break; + } + } + assert_int_equal(found, 1); + + found = 0; + for (i = 0; i < 16 * 1024; i++) { + umoff = + umem_atomic_alloc_from_bucket(umm, 2048, UMEM_TYPE_ANY, UMEM_DEFAULT_MBKT_ID); + assert_false(UMOFF_IS_NULL(umoff)); + assert_true(umem_get_mb_from_offset(umm, umoff) == UMEM_DEFAULT_MBKT_ID); + umem_atomic_free(umm, umoff); + if (umoff == umoff3) { + found = 1; + break; + } + } + assert_int_equal(found, 1); + + found = 0; + for (i = 0; i < 16 * 1024; i++) { + umoff = umem_atomic_alloc_from_bucket(umm, 1024, UMEM_TYPE_ANY, mb_id); + assert_false(UMOFF_IS_NULL(umoff)); + assert_true(umem_get_mb_from_offset(umm, umoff) == mb_id); + umem_atomic_free(umm, umoff); + if (umoff == umoff2) { + found = 1; + break; + } + } + assert_int_equal(found, 1); + + found = 0; + for (i = 0; i < 16 * 1024; i++) { + umoff = + umem_atomic_alloc_from_bucket(umm, 1024, UMEM_TYPE_ANY, UMEM_DEFAULT_MBKT_ID); + assert_false(UMOFF_IS_NULL(umoff)); + assert_true(umem_get_mb_from_offset(umm, umoff) == UMEM_DEFAULT_MBKT_ID); + umem_atomic_free(umm, umoff); + if (umoff == umoff4) { + found = 1; + break; + } } + assert_int_equal(found, 1); +} + +static void +test_atomic_alloc_overflow_mb(void **state) +{ + struct test_arg *arg = *state; + struct umem_instance *umm = utest_utx2umm(arg->ta_utx); + umem_off_t umoff, umoff_prev; + umem_off_t umoff1 = UMOFF_NULL, umoff2 = UMOFF_NULL, umoff3 = UMOFF_NULL; + uint32_t mb_id, ret_id; + int hit = 0; + uint64_t allocated_size = 0; + + mb_id = umem_allot_mb_evictable(umm, 0); + assert_int_not_equal(mb_id, 0); /* zero maps to non-evictable memory bucket */ + + do { + hit = 0; + /* Allocate objects from the memory bucket */ + umoff_prev = umoff1; + umoff1 = umem_atomic_alloc_from_bucket(umm, 2048, UMEM_TYPE_ANY, mb_id); + assert_false(UMOFF_IS_NULL(umoff1)); + ret_id = umem_get_mb_from_offset(umm, umoff1); + if (ret_id == mb_id) + allocated_size += 2048; + else if (ret_id == 0) { + umem_atomic_free(umm, umoff1); + umoff1 = umoff_prev; + hit++; + } else + assert_true(ret_id == mb_id); + umoff_prev = umoff2; + umoff2 = umem_atomic_alloc_from_bucket(umm, 1024, UMEM_TYPE_ANY, mb_id); + assert_false(UMOFF_IS_NULL(umoff2)); + ret_id = umem_get_mb_from_offset(umm, umoff2); + if (ret_id == mb_id) + allocated_size += 1024; + else if (ret_id == 0) { + umem_atomic_free(umm, umoff2); + umoff2 = umoff_prev; + hit++; + } else + assert_true(ret_id == mb_id); + umoff_prev = umoff3; + umoff3 = umem_atomic_alloc_from_bucket(umm, 128, UMEM_TYPE_ANY, mb_id); + assert_false(UMOFF_IS_NULL(umoff3)); + ret_id = umem_get_mb_from_offset(umm, umoff3); + if (ret_id == mb_id) + allocated_size += 128; + else if (ret_id == 0) { + umem_atomic_free(umm, umoff3); + umoff3 = umoff_prev; + hit++; + } else + assert_true(ret_id == mb_id); + } while (hit != 3); + print_message("Total allocated size from mb %lu\n", allocated_size); + + umem_atomic_free(umm, umoff1); + umem_atomic_free(umm, umoff2); + umem_atomic_free(umm, umoff3); + + /* + * The only free memory in the MB is that of the offsets freed above. + * Subsequent allocation from the same MB should return the same offsets. 
+ */ + umoff = umem_atomic_alloc_from_bucket(umm, 2048, UMEM_TYPE_ANY, mb_id); + assert_false(UMOFF_IS_NULL(umoff)); + assert_true(umem_get_mb_from_offset(umm, umoff) == mb_id); + assert_true(umoff == umoff1); + umoff = umem_atomic_alloc_from_bucket(umm, 1024, UMEM_TYPE_ANY, mb_id); + assert_false(UMOFF_IS_NULL(umoff)); + assert_true(umem_get_mb_from_offset(umm, umoff) == mb_id); + assert_true(umoff == umoff2); + umoff = umem_atomic_alloc_from_bucket(umm, 128, UMEM_TYPE_ANY, mb_id); + assert_false(UMOFF_IS_NULL(umoff)); + assert_true(umem_get_mb_from_offset(umm, umoff) == mb_id); + assert_true(umoff == umoff3); +} + +static void +test_reserve_from_mb(void **state) +{ + struct test_arg *arg = *state; + struct umem_instance *umm = utest_utx2umm(arg->ta_utx); + umem_off_t umoff, umoff1; + uint32_t mb_id; + struct umem_rsrvd_act *rsrvd_act; + size_t rsrv_size = 1032; + int found = 0, i, rc; + + mb_id = umem_allot_mb_evictable(umm, 0); + assert_int_not_equal(mb_id, 0); /* zero maps to non-evictable memory bucket */ + + /* Reserve an object and then cancel the allocation */ + rc = umem_rsrvd_act_alloc(umm, &rsrvd_act, 1); + assert_int_equal(rc, 0); + umoff = umem_reserve_from_bucket(umm, rsrvd_act, rsrv_size, mb_id); + assert_false(UMOFF_IS_NULL(umoff)); + /* Validate that the object is from the memory bucket of interest. */ + assert_true(umem_get_mb_from_offset(umm, umoff) == mb_id); + umem_cancel(umm, rsrvd_act); + umem_rsrvd_act_free(&rsrvd_act); + /* Validate that the object is really freed */ + rc = umem_rsrvd_act_alloc(umm, &rsrvd_act, 1); + assert_int_equal(rc, 0); + umoff1 = umem_reserve_from_bucket(umm, rsrvd_act, rsrv_size, mb_id); + assert_false(UMOFF_IS_NULL(umoff)); + assert_true(umoff1 == umoff); + umem_cancel(umm, rsrvd_act); + umem_rsrvd_act_free(&rsrvd_act); + + /* Reserve an object and publish it within a transaction. */ + rc = umem_rsrvd_act_alloc(umm, &rsrvd_act, 1); + assert_int_equal(rc, 0); + umoff = umem_reserve_from_bucket(umm, rsrvd_act, rsrv_size, mb_id); + assert_false(UMOFF_IS_NULL(umoff)); + /* Validate that the object is from the memory bucket of interest. */ + assert_true(umem_get_mb_from_offset(umm, umoff) == mb_id); + rc = umem_tx_begin(umm, NULL); + assert_int_equal(rc, 0); + rc = umem_tx_publish(umm, rsrvd_act); + assert_int_equal(rc, 0); + rc = umem_tx_commit(umm); + assert_int_equal(rc, 0); + umem_rsrvd_act_free(&rsrvd_act); + /* + * Make sure that the above allocated object is never returned by + * subsequent allocation. + */ + for (i = 0; i < 32 * 1024; i++) { + umoff1 = umem_atomic_alloc_from_bucket(umm, rsrv_size, UMEM_TYPE_ANY, mb_id); + assert_false(UMOFF_IS_NULL(umoff1)); + assert_true(umem_get_mb_from_offset(umm, umoff1) == mb_id); + umem_atomic_free(umm, umoff1); + assert_false(umoff == umoff1); + } + + /* Defer free an object and cancel it subsequently */ + rc = umem_rsrvd_act_alloc(umm, &rsrvd_act, 1); + assert_int_equal(rc, 0); + umem_defer_free(umm, umoff, rsrvd_act); + assert_int_equal(rc, 0); + umem_cancel(umm, rsrvd_act); + umem_rsrvd_act_free(&rsrvd_act); + /* Validate that the object is not really freed */ + for (i = 0; i < 32 * 1024; i++) { + umoff1 = umem_atomic_alloc_from_bucket(umm, rsrv_size, UMEM_TYPE_ANY, mb_id); + assert_false(UMOFF_IS_NULL(umoff1)); + assert_true(umem_get_mb_from_offset(umm, umoff1) == mb_id); + umem_atomic_free(umm, umoff1); + assert_false(umoff == umoff1); + } + + /* Defer free an object and publish it within a transaction. 
*/ + rc = umem_rsrvd_act_alloc(umm, &rsrvd_act, 1); + assert_int_equal(rc, 0); + umem_defer_free(umm, umoff, rsrvd_act); + assert_int_equal(rc, 0); + rc = umem_tx_begin(umm, NULL); + assert_int_equal(rc, 0); + rc = umem_tx_publish(umm, rsrvd_act); + assert_int_equal(rc, 0); + rc = umem_tx_commit(umm); + assert_int_equal(rc, 0); + umem_rsrvd_act_free(&rsrvd_act); + /* Validate that the object is returned in subsequent allocation */ + found = 0; + for (i = 0; i < 32 * 1024; i++) { + umoff1 = umem_atomic_alloc_from_bucket(umm, rsrv_size, UMEM_TYPE_ANY, mb_id); + assert_false(UMOFF_IS_NULL(umoff1)); + assert_true(umem_get_mb_from_offset(umm, umoff1) == mb_id); + umem_atomic_free(umm, umoff1); + if (umoff == umoff1) { + found = 1; + break; + } + } + assert_int_equal(found, 1); +} + +static void +test_tx_alloc_from_mb(void **state) +{ + struct test_arg *arg = *state; + struct umem_instance *umm = utest_utx2umm(arg->ta_utx); + umem_off_t umoff = UINT64_MAX, umoff1 = UINT64_MAX; + uint32_t mb_id; + size_t alloc_size = 1024; + int found = 0, i, rc; + + mb_id = umem_allot_mb_evictable(umm, 0); + assert_int_not_equal(mb_id, 0); /* zero maps to non-evictable memory bucket */ + + /* Do a tx alloc and fail the transaction. */ + rc = umem_tx_begin(umm, NULL); + assert_int_equal(rc, 0); + umoff = umem_alloc_from_bucket(umm, alloc_size, mb_id); + assert_false(UMOFF_IS_NULL(umoff)); + assert_true(umem_get_mb_from_offset(umm, umoff) == mb_id); + rc = umem_tx_end(umm, 1); + assert_true(rc == umem_tx_errno(1)); + found = 0; + for (i = 0; i < 32 * 1024; i++) { + umoff1 = umem_atomic_alloc_from_bucket(umm, alloc_size, UMEM_TYPE_ANY, mb_id); + assert_false(UMOFF_IS_NULL(umoff1)); + assert_true(umem_get_mb_from_offset(umm, umoff1) == mb_id); + umem_atomic_free(umm, umoff1); + if (umoff == umoff1) { + found = 1; + break; + } + } + assert_int_equal(found, 1); + + /* Do a tx alloc and pass the transaction. */ + rc = umem_tx_begin(umm, NULL); + assert_int_equal(rc, 0); + umoff = umem_alloc_from_bucket(umm, alloc_size, mb_id); + assert_false(UMOFF_IS_NULL(umoff)); + assert_true(umem_get_mb_from_offset(umm, umoff) == mb_id); + rc = umem_tx_end(umm, 0); + assert_int_equal(rc, 0); + for (i = 0; i < 32 * 1024; i++) { + umoff1 = umem_atomic_alloc_from_bucket(umm, alloc_size, UMEM_TYPE_ANY, mb_id); + assert_false(UMOFF_IS_NULL(umoff1)); + assert_true(umem_get_mb_from_offset(umm, umoff1) == mb_id); + umem_atomic_free(umm, umoff1); + assert_false(umoff == umoff1); + } + + /* Do a tx free and fail the transaction. */ + rc = umem_tx_begin(umm, NULL); + assert_int_equal(rc, 0); + umem_free(umm, umoff); + rc = umem_tx_end(umm, 1); + assert_true(rc == umem_tx_errno(1)); + for (i = 0; i < 32 * 1024; i++) { + umoff1 = umem_atomic_alloc_from_bucket(umm, alloc_size, UMEM_TYPE_ANY, mb_id); + assert_false(UMOFF_IS_NULL(umoff1)); + assert_true(umem_get_mb_from_offset(umm, umoff1) == mb_id); + umem_atomic_free(umm, umoff1); + assert_false(umoff == umoff1); + } + + /* Do a tx free and pass the transaction. 
*/ + rc = umem_tx_begin(umm, NULL); + assert_int_equal(rc, 0); + umem_free(umm, umoff); + rc = umem_tx_end(umm, 0); + assert_int_equal(rc, 0); + found = 0; + for (i = 0; i < 32 * 1024; i++) { + umoff1 = umem_atomic_alloc_from_bucket(umm, alloc_size, UMEM_TYPE_ANY, mb_id); + assert_false(UMOFF_IS_NULL(umoff1)); + assert_true(umem_get_mb_from_offset(umm, umoff1) == mb_id); + umem_atomic_free(umm, umoff1); + if (umoff == umoff1) { + found = 1; + break; + } + } + assert_int_equal(found, 1); +} + +struct bucket_alloc_info { + umem_off_t start_umoff; + uint32_t num_allocs; + uint32_t mb_id; +}; + +void +alloc_bucket_to_full(struct umem_instance *umm, struct bucket_alloc_info *ainfo) +{ + umem_off_t umoff, prev_umoff; + size_t alloc_size = 128; + umem_off_t *ptr; + struct umem_cache_range rg = {0}; + struct umem_pin_handle *p_hdl; + uint32_t id = ainfo->mb_id; + + if (UMOFF_IS_NULL(ainfo->start_umoff)) { + ainfo->start_umoff = + umem_atomic_alloc_from_bucket(umm, alloc_size, UMEM_TYPE_ANY, id); + assert_false(UMOFF_IS_NULL(ainfo->start_umoff)); + ainfo->num_allocs++; + assert_true(umem_get_mb_from_offset(umm, ainfo->start_umoff) == id); + } + prev_umoff = ainfo->start_umoff; + rg.cr_off = umem_get_mb_base_offset(umm, id); + rg.cr_size = 1; + assert_true(umem_cache_pin(&umm->umm_pool->up_store, &rg, 1, 0, &p_hdl) == 0); + + while (1) { + umoff = umem_atomic_alloc_from_bucket(umm, alloc_size, UMEM_TYPE_ANY, id); + assert_false(UMOFF_IS_NULL(umoff)); + if (umem_get_mb_from_offset(umm, umoff) != id) { + umem_atomic_free(umm, umoff); + break; + } + ptr = (umem_off_t *)umem_off2ptr(umm, prev_umoff); + *ptr = umoff; + ptr = (umem_off_t *)umem_off2ptr(umm, umoff); + *ptr = UMOFF_NULL; + prev_umoff = umoff; + ainfo->num_allocs++; + } + umem_cache_unpin(&umm->umm_pool->up_store, p_hdl); + print_message("Bulk Alloc: Bucket %d, start off %lu num_allocation %d\n", ainfo->mb_id, + ainfo->start_umoff, ainfo->num_allocs); +} + +void +free_bucket_by_pct(struct umem_instance *umm, struct bucket_alloc_info *ainfo, int pct) +{ + int num_free = (ainfo->num_allocs * pct) / 100; + umem_off_t umoff, *ptr, next_umoff; + struct umem_pin_handle *p_hdl; + struct umem_cache_range rg = {0}; + int i, rc; + + assert_true((pct >= 0) && (pct <= 100)); + + if (UMOFF_IS_NULL(ainfo->start_umoff)) + return; + print_message("Bulk Free BEFORE: Bucket %d, start off %lu num_allocation %d\n", + ainfo->mb_id, ainfo->start_umoff, ainfo->num_allocs); + + rg.cr_off = umem_get_mb_base_offset(umm, ainfo->mb_id); + rg.cr_size = 1; + rc = umem_cache_pin(&umm->umm_pool->up_store, &rg, 1, 0, &p_hdl); + assert_true(rc == 0); + + umoff = ainfo->start_umoff; + for (i = 0; i < num_free; i++) { + ptr = (umem_off_t *)umem_off2ptr(umm, umoff); + next_umoff = *ptr; + umem_atomic_free(umm, umoff); + umoff = next_umoff; + ainfo->num_allocs--; + if (UMOFF_IS_NULL(umoff)) + break; + } + ainfo->start_umoff = umoff; + umem_cache_unpin(&umm->umm_pool->up_store, p_hdl); + print_message("Bulk Free AFTER: Bucket %d, start off %lu num_allocation %d\n", ainfo->mb_id, + ainfo->start_umoff, ainfo->num_allocs); +} + +static void +test_tx_alloc_from_multimb(void **state) +{ + struct test_arg *arg = *state; + struct umem_instance *umm = utest_utx2umm(arg->ta_utx); + struct bucket_alloc_info ainfo[10]; + uint32_t id; + int i; + + for (i = 0; i < 10; i++) { + /* Create an MB and fill it with allocs */ + ainfo[i].mb_id = umem_allot_mb_evictable(umm, 0); + ainfo[i].num_allocs = 0; + ainfo[i].start_umoff = UMOFF_NULL; + assert_true(ainfo[i].mb_id != 0); + 
alloc_bucket_to_full(umm, &ainfo[i]); + } + + /* Free 5% of space for MB 2 */ + free_bucket_by_pct(umm, &ainfo[2], 5); /* 90+ */ + /* Free 30% of space for MB 3 */ + free_bucket_by_pct(umm, &ainfo[3], 30); /* 30-75 */ + /* Free 80% of space for MB 4 */ + free_bucket_by_pct(umm, &ainfo[4], 80); /* 0-30 */ + /* Free 20% of space for MB 5 */ + free_bucket_by_pct(umm, &ainfo[5], 20); /* 75-90 */ + /* Free 18% of space for MB 6 */ + free_bucket_by_pct(umm, &ainfo[6], 18); /* 75-90 */ + /* Free 50% of space for MB 7 */ + free_bucket_by_pct(umm, &ainfo[7], 50); /* 30-75 */ + /* Free 90% of space for MB 8 */ + free_bucket_by_pct(umm, &ainfo[8], 90); /* 0-30 */ + + /* The allocator should return an MB with utilization in the 30%-75% band */ + id = umem_allot_mb_evictable(umm, 0); + print_message("obtained id %d, expected is %d\n", id, ainfo[3].mb_id); + assert_true(id == ainfo[3].mb_id); + alloc_bucket_to_full(umm, &ainfo[3]); + id = umem_allot_mb_evictable(umm, 0); + print_message("obtained id %d, expected is %d\n", id, ainfo[7].mb_id); + assert_true(id == ainfo[7].mb_id); + alloc_bucket_to_full(umm, &ainfo[7]); + + /* Next preference should be the 0%-30% band */ + id = umem_allot_mb_evictable(umm, 0); + print_message("obtained id %d, expected is %d\n", id, ainfo[4].mb_id); + assert_true(id == ainfo[4].mb_id); + alloc_bucket_to_full(umm, &ainfo[4]); + id = umem_allot_mb_evictable(umm, 0); + print_message("obtained id %d, expected is %d\n", id, ainfo[8].mb_id); + assert_true(id == ainfo[8].mb_id); + alloc_bucket_to_full(umm, &ainfo[8]); + + /* Next, the allocator should create a new memory bucket. */ + id = umem_allot_mb_evictable(umm, 0); + for (i = 0; i < 10; i++) + assert_true(id != ainfo[i].mb_id); + print_message("obtained id %d\n", id); + + /* The 75-90% and 90%+ utilization bands cannot be tested without eviction support. + * TBD: add that coverage once this test environment supports eviction.
+ */ +} + +static void +test_umempobj_create_smallsize(void **state) +{ + int num = 0; + char *name; + uint32_t id; + struct umem_store ustore_tmp = {.stor_size = POOL_SIZE, + .stor_ops = &_store_ops_v2, + .store_type = DAOS_MD_BMEM_V2, + .stor_priv = (void *)(UINT64_MAX)}; + struct umem_attr uma; + struct umem_instance umm; + + uma.uma_id = umempobj_backend_type2class_id(ustore_tmp.store_type); + + /* umempobj_create with zero scm size */ + D_ASPRINTF(name, "/mnt/daos/umem-test-tmp-%d", num++); + assert_true(name != NULL); + uma.uma_pool = + umempobj_create(name, "invalid_pool", UMEMPOBJ_ENABLE_STATS, 0, 0666, &ustore_tmp); + assert_ptr_equal(uma.uma_pool, NULL); + unlink(name); + D_FREE(name); + + /* umempobj_create with zero metablob size */ + D_ASPRINTF(name, "/mnt/daos/umem-test-tmp-%d", num++); + assert_true(name != NULL); + ustore_tmp.stor_size = 0; + uma.uma_pool = umempobj_create(name, "invalid_pool", UMEMPOBJ_ENABLE_STATS, POOL_SIZE, 0666, + &ustore_tmp); + assert_ptr_equal(uma.uma_pool, NULL); + ustore_tmp.stor_size = POOL_SIZE; + unlink(name); + D_FREE(name); + + /* umempobj_create with scm size less than 32MB */ + D_ASPRINTF(name, "/mnt/daos/umem-test-tmp-%d", num++); + assert_true(name != NULL); + uma.uma_pool = umempobj_create(name, "invalid_pool", UMEMPOBJ_ENABLE_STATS, + 24 * 1024 * 1024, 0666, &ustore_tmp); + assert_ptr_equal(uma.uma_pool, NULL); + unlink(name); + D_FREE(name); + + /* umempobj_create with scm size set to 112MB */ + D_ASPRINTF(name, "/mnt/daos/umem-test-tmp-%d", num++); + assert_true(name != NULL); + uma.uma_pool = umempobj_create(name, "invalid_pool", UMEMPOBJ_ENABLE_STATS, + 112 * 1024 * 1024, 0666, &ustore_tmp); + assert_ptr_not_equal(uma.uma_pool, NULL); + umempobj_close(uma.uma_pool); + unlink(name); + D_FREE(name); + + /* umempobj_create with scm and metablob size set to 112MB */ + D_ASPRINTF(name, "/mnt/daos/umem-test-tmp-%d", num++); + assert_true(name != NULL); + ustore_tmp.stor_size = 112 * 1024 * 1024; + uma.uma_pool = umempobj_create(name, "invalid_pool", UMEMPOBJ_ENABLE_STATS, + 112 * 1024 * 1024, 0666, &ustore_tmp); + umem_class_init(&uma, &umm); + id = umem_allot_mb_evictable(&umm, 0); + print_message("with scm == metablob, evictable id returned is %d\n", id); + assert_true(id == 0); + ustore_tmp.stor_size = POOL_SIZE; + assert_ptr_not_equal(uma.uma_pool, NULL); + umempobj_close(uma.uma_pool); + unlink(name); + D_FREE(name); + + /* umempobj_create with scm size greater than metablob size*/ + D_ASPRINTF(name, "/mnt/daos/umem-test-tmp-%d", num++); + assert_true(name != NULL); + ustore_tmp.stor_size = 224 * 1024 * 1024; + uma.uma_pool = umempobj_create(name, "invalid_pool", UMEMPOBJ_ENABLE_STATS, + 112 * 1024 * 1024, 0666, &ustore_tmp); + umem_class_init(&uma, &umm); + id = umem_allot_mb_evictable(&umm, 0); + print_message("with metablob > scm, evictable id returned is %d\n", id); + assert_true(id != 0); + ustore_tmp.stor_size = POOL_SIZE; + assert_ptr_not_equal(uma.uma_pool, NULL); + umempobj_close(uma.uma_pool); + unlink(name); + D_FREE(name); +} + +static void +test_umempobj_nemb_usage(void **state) +{ + int num = 0; + char *name; + struct umem_store ustore_tmp = {.stor_size = 256 * 1024 * 1024, + .stor_ops = &_store_ops_v2, + .store_type = DAOS_MD_BMEM_V2, + .stor_priv = (void *)(UINT64_MAX)}; + struct umem_attr uma; + struct umem_instance umm; + umem_off_t umoff, *ptr = NULL, prev_umoff = UMOFF_NULL; + size_t alloc_size = (10 * 1024 * 1024); + + uma.uma_id = umempobj_backend_type2class_id(ustore_tmp.store_type); + /* Create a heap and 
cache of size 256MB and 249MB (16 & 15 zones) respectively */ + D_ASPRINTF(name, "/mnt/daos/umem-test-tmp-%d", 0); + assert_true(name != NULL); + uma.uma_pool = umempobj_create(name, "invalid_pool", UMEMPOBJ_ENABLE_STATS, + 240 * 1024 * 1024, 0666, &ustore_tmp); + assert_ptr_not_equal(uma.uma_pool, NULL); + + umem_class_init(&uma, &umm); + + /* Do allocation and verify that only 13 zones allotted to non evictable MBs */ + for (num = 0;; num++) { + /* do an allocation that takes more than half the zone size */ + umoff = umem_atomic_alloc(&umm, alloc_size, UMEM_TYPE_ANY); + if (UMOFF_IS_NULL(umoff)) + break; + ptr = (umem_off_t *)umem_off2ptr(&umm, umoff); + *ptr = prev_umoff; + prev_umoff = umoff; + } + /* 80% nemb when heap size greater than cache size */ + assert_int_equal(num, 13); + print_message("Number of allocations is %d\n", num); + + for (--num;; num--) { + umoff = *ptr; + umem_atomic_free(&umm, prev_umoff); + if (UMOFF_IS_NULL(umoff)) + break; + ptr = (umem_off_t *)umem_off2ptr(&umm, umoff); + prev_umoff = umoff; + } + assert_int_equal(num, 0); + umempobj_close(uma.uma_pool); + unlink(name); + D_FREE(name); + + prev_umoff = UMOFF_NULL; + /* Create a heap and cache of size 256MB (16 zones) each */ + D_ASPRINTF(name, "/mnt/daos/umem-test-tmp-%d", 1); + assert_true(name != NULL); + uma.uma_pool = umempobj_create(name, "invalid_pool", UMEMPOBJ_ENABLE_STATS, + 256 * 1024 * 1024, 0666, &ustore_tmp); + assert_ptr_not_equal(uma.uma_pool, NULL); + + umem_class_init(&uma, &umm); + + /* Do allocation and verify that all 16 zones are allotted to non evictable MBs */ + for (num = 0;; num++) { + /* do an allocation that takes more than half the zone size */ + umoff = umem_atomic_alloc(&umm, alloc_size, UMEM_TYPE_ANY); + if (UMOFF_IS_NULL(umoff)) + break; + ptr = (umem_off_t *)umem_off2ptr(&umm, umoff); + *ptr = prev_umoff; + prev_umoff = umoff; + } + assert_int_equal(num, 16); + print_message("Number of allocations is %d\n", num); + + for (--num;; num--) { + umoff = *ptr; + umem_atomic_free(&umm, prev_umoff); + if (UMOFF_IS_NULL(umoff)) + break; + ptr = (umem_off_t *)umem_off2ptr(&umm, umoff); + prev_umoff = umoff; + } + assert_int_equal(num, 0); + umempobj_close(uma.uma_pool); + unlink(name); + D_FREE(name); +} + +static void +test_umempobj_heap_mb_stats(void **state) +{ + int num = 0, count, rc; + char *name; + uint64_t scm_size = 128 * 1024 * 1024; + uint64_t meta_size = 256 * 1024 * 1024; + struct umem_store ustore_tmp = {.stor_size = meta_size, + .stor_ops = &_store_ops_v2, + .store_type = DAOS_MD_BMEM_V2, + .stor_priv = (void *)(UINT64_MAX)}; + struct umem_attr uma; + struct umem_instance umm; + umem_off_t umoff, *ptr = NULL, prev_umoff = UMOFF_NULL; + size_t alloc_size = 128; + uint64_t allocated, allocated0, allocated1, maxsz, maxsz_exp; + uint32_t mb_id; + + uma.uma_id = umempobj_backend_type2class_id(ustore_tmp.store_type); + /* Create a heap and cache of size 256MB and 128MB (16 & 8 zones) respectively */ + D_ASPRINTF(name, "/mnt/daos/umem-test-tmp-%d", 0); + assert_true(name != NULL); + uma.uma_pool = umempobj_create(name, "invalid_pool", UMEMPOBJ_ENABLE_STATS, scm_size, 0666, + &ustore_tmp); + assert_ptr_not_equal(uma.uma_pool, NULL); + maxsz_exp = (uint64_t)(scm_size / MB_SIZE * NEMB_RATIO) * MB_SIZE; + + umem_class_init(&uma, &umm); + + rc = umempobj_get_mbusage(umm.umm_pool, 0, &allocated0, &maxsz); + print_message("NE usage max_size = %lu exp_max_size = %lu allocated = %lu\n", maxsz, + maxsz_exp, allocated0); + assert_int_equal(rc, 0); + assert_int_equal(maxsz, maxsz_exp); + + 
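The expected non-evictable capacity checked just above comes from maxsz_exp = (uint64)(scm_size / MB_SIZE * NEMB_RATIO) * MB_SIZE. A minimal standalone sketch of that arithmetic follows; the 16 MiB bucket size and 0.8 ratio used in main() are assumptions inferred from the zone counts and the "80% nemb" comment in the previous test, not values taken from the headers.

package main

import "fmt"

// nembMaxSize mirrors the C expression above: integer division down to whole
// buckets, scale by the non-evictable ratio, truncate, then convert back to
// bytes.
func nembMaxSize(scmSize, mbSize uint64, nembRatio float64) uint64 {
	return uint64(float64(scmSize/mbSize)*nembRatio) * mbSize
}

func main() {
	const mib = uint64(1) << 20
	// Assumed constants: 16 MiB memory buckets, 80% of the cache kept for
	// non-evictable buckets when the heap is larger than the cache.
	fmt.Println(nembMaxSize(128*mib, 16*mib, 0.8) / mib) // prints 96
}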
/* allocate and consume all of the space */ + for (num = 0;; num++) { + umoff = umem_atomic_alloc(&umm, alloc_size, UMEM_TYPE_ANY); + if (UMOFF_IS_NULL(umoff)) + break; + ptr = (umem_off_t *)umem_off2ptr(&umm, umoff); + *ptr = prev_umoff; + prev_umoff = umoff; + } + rc = umempobj_get_mbusage(umm.umm_pool, 0, &allocated1, &maxsz); + print_message("NE usage max_size = %lu allocated = %lu\n", maxsz, allocated1); + assert_int_equal(rc, 0); + assert_true(allocated1 * 100 / maxsz >= 99); + assert_int_equal(maxsz, maxsz_exp); + + for (count = num; count > num / 2; count--) { + umoff = *ptr; + umem_atomic_free(&umm, prev_umoff); + if (UMOFF_IS_NULL(umoff)) + break; + ptr = (umem_off_t *)umem_off2ptr(&umm, umoff); + prev_umoff = umoff; + } + rc = umempobj_get_mbusage(umm.umm_pool, 0, &allocated, &maxsz); + print_message("NE usage max_size = %lu allocated = %lu\n", maxsz, allocated); + assert_int_equal(rc, 0); + assert_true(allocated < allocated1 / 2); + assert_int_equal(maxsz, maxsz_exp); + for (;;) { + umoff = *ptr; + umem_atomic_free(&umm, prev_umoff); + if (UMOFF_IS_NULL(umoff)) + break; + ptr = (umem_off_t *)umem_off2ptr(&umm, umoff); + prev_umoff = umoff; + } + rc = umempobj_get_mbusage(umm.umm_pool, 0, &allocated, &maxsz); + print_message("NE usage max_size = %lu allocated = %lu\n", maxsz, allocated); + assert_int_equal(rc, 0); + assert_int_equal(allocated, allocated0); + assert_int_equal(maxsz, maxsz_exp); + + /* Now Test an evictable MB */ + mb_id = umem_allot_mb_evictable(&umm, 0); + assert_true(mb_id > 0); + maxsz_exp = MB_SIZE; + + rc = umempobj_get_mbusage(umm.umm_pool, mb_id, &allocated0, &maxsz); + print_message("E usage max_size = %lu exp_max_size = %lu allocated = %lu\n", maxsz, + maxsz_exp, allocated0); + assert_int_equal(rc, 0); + assert_int_equal(maxsz, maxsz_exp); + + prev_umoff = UMOFF_NULL; + ptr = NULL; + /* allocate and consume all of the space */ + for (num = 0;; num++) { + umoff = umem_atomic_alloc_from_bucket(&umm, alloc_size, UMEM_TYPE_ANY, mb_id); + if (umem_get_mb_from_offset(&umm, umoff) != mb_id) { + umem_atomic_free(&umm, umoff); + break; + } + ptr = (umem_off_t *)umem_off2ptr(&umm, umoff); + *ptr = prev_umoff; + prev_umoff = umoff; + } + rc = umempobj_get_mbusage(umm.umm_pool, mb_id, &allocated1, &maxsz); + print_message("E usage max_size = %lu allocated = %lu\n", maxsz, allocated1); + assert_int_equal(rc, 0); + assert_true(allocated1 * 100 / maxsz >= 99); + assert_int_equal(maxsz, maxsz_exp); + + for (count = num; count > num / 2; count--) { + umoff = *ptr; + umem_atomic_free(&umm, prev_umoff); + if (UMOFF_IS_NULL(umoff)) + break; + ptr = (umem_off_t *)umem_off2ptr(&umm, umoff); + prev_umoff = umoff; + } + rc = umempobj_get_mbusage(umm.umm_pool, mb_id, &allocated, &maxsz); + print_message("E usage max_size = %lu allocated = %lu\n", maxsz, allocated); + assert_int_equal(rc, 0); + assert_true(allocated < allocated1 / 2); + assert_int_equal(maxsz, maxsz_exp); + for (;;) { + umoff = *ptr; + umem_atomic_free(&umm, prev_umoff); + if (UMOFF_IS_NULL(umoff)) + break; + ptr = (umem_off_t *)umem_off2ptr(&umm, umoff); + prev_umoff = umoff; + } + rc = umempobj_get_mbusage(umm.umm_pool, mb_id, &allocated, &maxsz); + print_message("E usage max_size = %lu allocated = %lu\n", maxsz, allocated); + assert_int_equal(rc, 0); + assert_int_equal(allocated, allocated0); + assert_int_equal(maxsz, maxsz_exp); + + /* Testing invalid mb_ids */ + rc = umempobj_get_mbusage(umm.umm_pool, mb_id - 1, &allocated, &maxsz); + assert_int_equal(rc, -DER_INVAL); + rc = 
umempobj_get_mbusage(umm.umm_pool, mb_id + 1, &allocated, &maxsz); + assert_int_equal(rc, -DER_INVAL); + rc = umempobj_get_mbusage(umm.umm_pool, 50, &allocated, &maxsz); + assert_int_equal(rc, -DER_INVAL); + + umempobj_close(uma.uma_pool); + unlink(name); + D_FREE(name); } -#endif int main(int argc, char **argv) { - int rc; - - static const struct CMUnitTest umem_tests[] = { - { "BMEM001: Test atomic alloc/free", test_atomic_alloc, - setup_pmem, teardown_pmem}, - { "BMEM002: Test null flags pmem", test_invalid_flags, - setup_pmem, teardown_pmem}, - { "BMEM003: Test alloc pmem", test_alloc, - setup_pmem, teardown_pmem}, - { "BMEM004: Test atomic copy", test_atomic_copy, - setup_pmem, teardown_pmem}, - { "BMEM005: Test simple commit tx", test_simple_commit_tx, - setup_pmem, teardown_pmem}, - { "BMEM006: Test simple abort tx", test_simple_abort_tx, - setup_pmem, teardown_pmem}, - { "BMEM007: Test nested commit tx", test_nested_commit_tx, - setup_pmem, teardown_pmem}, - { "BMEM008: Test nested outer abort tx", test_nested_outer_abort_tx, - setup_pmem, teardown_pmem}, - { "BMEM009: Test nested inner abort tx", test_nested_inner_abort_tx, - setup_pmem, teardown_pmem}, - { "BMEM010: Test tx alloc/free", test_tx_alloc, - setup_pmem, teardown_pmem}, - { "BMEM011: Test tx add range", test_tx_add, - setup_pmem, teardown_pmem}, - { "BMEM012: Test tx add ptr", test_tx_add_ptr, - setup_pmem, teardown_pmem}, - { "BMEM013: Test tx xadd ptr", test_tx_xadd_ptr, - setup_pmem, teardown_pmem}, - { "BMEM014: Test tx reserve publish/cancel", test_tx_reserve_publish_cancel, - setup_pmem, teardown_pmem}, - { "BMEM015: Test tx defer free publish/cancel", test_tx_dfree_publish_cancel, - setup_pmem, teardown_pmem}, - { NULL, NULL, NULL, NULL } - }; + int rc = 0; + + static const struct CMUnitTest v1_tests[] = { + {"BMEM001: Test atomic alloc/free", test_atomic_alloc, setup_pmem, teardown_pmem}, + {"BMEM001a: Test atomic alloc/free", test_atomic_alloc_from_bucket, setup_pmem, + teardown_pmem}, + {"BMEM002: Test null flags pmem", test_invalid_flags, setup_pmem, teardown_pmem}, + {"BMEM003: Test alloc pmem", test_alloc, setup_pmem, teardown_pmem}, + {"BMEM003a: Test alloc pmem", test_alloc_from_bucket, setup_pmem, teardown_pmem}, + {"BMEM004a: Test atomic copy", test_atomic_copy, setup_pmem, teardown_pmem}, + {"BMEM005: Test simple commit tx", test_simple_commit_tx, setup_pmem, teardown_pmem}, + {"BMEM006: Test simple abort tx", test_simple_abort_tx, setup_pmem, teardown_pmem}, + {"BMEM007: Test nested commit tx", test_nested_commit_tx, setup_pmem, teardown_pmem}, + {"BMEM008: Test nested outer abort tx", test_nested_outer_abort_tx, setup_pmem, + teardown_pmem}, + {"BMEM009: Test nested inner abort tx", test_nested_inner_abort_tx, setup_pmem, + teardown_pmem}, + {"BMEM010: Test tx alloc/free", test_tx_alloc, setup_pmem, teardown_pmem}, + {"BMEM010a: Test tx alloc/free", test_tx_alloc_from_bucket, setup_pmem, teardown_pmem}, + {"BMEM011: Test tx add range", test_tx_add, setup_pmem, teardown_pmem}, + {"BMEM012: Test tx add ptr", test_tx_add_ptr, setup_pmem, teardown_pmem}, + {"BMEM013: Test tx xadd ptr", test_tx_xadd_ptr, setup_pmem, teardown_pmem}, + {"BMEM014: Test tx reserve publish/cancel", test_tx_reserve_publish_cancel, setup_pmem, + teardown_pmem}, + {"BMEM014a: Test tx reserve publish/cancel", test_tx_bucket_reserve_publish_cancel, + setup_pmem, teardown_pmem}, + {"BMEM015: Test tx defer free publish/cancel", test_tx_dfree_publish_cancel, setup_pmem, + teardown_pmem}, + {"BMEM015a: Test tx defer free 
publish/cancel", test_tx_bucket_dfree_publish_cancel, + setup_pmem, teardown_pmem}, + {NULL, NULL, NULL, NULL}}; + + static const struct CMUnitTest v2_tests[] = { + {"BMEM001: Test atomic alloc/free", test_atomic_alloc, setup_pmem_v2, teardown_pmem}, + {"BMEM001a: Test atomic alloc/free", test_atomic_alloc_from_bucket, setup_pmem_v2, + teardown_pmem}, + {"BMEM002: Test null flags pmem", test_invalid_flags, setup_pmem_v2, teardown_pmem}, + {"BMEM003: Test alloc pmem", test_alloc, setup_pmem_v2, teardown_pmem}, + {"BMEM003a: Test alloc pmem", test_alloc_from_bucket, setup_pmem_v2, teardown_pmem}, + {"BMEM004a: Test atomic copy", test_atomic_copy, setup_pmem_v2, teardown_pmem}, + {"BMEM005: Test simple commit tx", test_simple_commit_tx, setup_pmem_v2, teardown_pmem}, + {"BMEM006: Test simple abort tx", test_simple_abort_tx, setup_pmem_v2, teardown_pmem}, + {"BMEM007: Test nested commit tx", test_nested_commit_tx, setup_pmem_v2, teardown_pmem}, + {"BMEM008: Test nested outer abort tx", test_nested_outer_abort_tx, setup_pmem_v2, + teardown_pmem}, + {"BMEM009: Test nested inner abort tx", test_nested_inner_abort_tx, setup_pmem_v2, + teardown_pmem}, + {"BMEM010: Test tx alloc/free", test_tx_alloc, setup_pmem_v2, teardown_pmem}, + {"BMEM010a: Test tx alloc/free", test_tx_alloc_from_bucket, setup_pmem_v2, + teardown_pmem}, + {"BMEM011: Test tx add range", test_tx_add, setup_pmem_v2, teardown_pmem}, + {"BMEM012: Test tx add ptr", test_tx_add_ptr, setup_pmem_v2, teardown_pmem}, + {"BMEM013: Test tx xadd ptr", test_tx_xadd_ptr, setup_pmem_v2, teardown_pmem}, + {"BMEM014: Test tx reserve publish/cancel", test_tx_reserve_publish_cancel, + setup_pmem_v2, teardown_pmem}, + {"BMEM014a: Test tx reserve publish/cancel", test_tx_bucket_reserve_publish_cancel, + setup_pmem_v2, teardown_pmem}, + {"BMEM015: Test tx defer free publish/cancel", test_tx_dfree_publish_cancel, + setup_pmem_v2, teardown_pmem}, + {"BMEM015a: Test tx defer free publish/cancel", test_tx_bucket_dfree_publish_cancel, + setup_pmem_v2, teardown_pmem}, + {"BMEM016: Test atomic allocs within a memory bucket", test_atomic_alloc_mb, + setup_pmem_v2, teardown_pmem}, + {"BMEM017: Test atomic allocs overflow a memory bucket", test_atomic_alloc_overflow_mb, + setup_pmem_v2, teardown_pmem}, + {"BMEM018: Test reserve/defer_free from a memory bucket", test_reserve_from_mb, + setup_pmem_v2, teardown_pmem}, + {"BMEM019: Test tx alloc/free from a memory bucket", test_tx_alloc_from_mb, + setup_pmem_v2, teardown_pmem}, + {"BMEM020: Test tx alloc/free from multiple memory buckets", test_tx_alloc_from_multimb, + setup_pmem_v2, teardown_pmem}, + {"BMEM021: Test umempobj create small size", test_umempobj_create_smallsize, NULL, + NULL}, + {"BMEM022: Test umempobj non_evictable MB usage", test_umempobj_nemb_usage, NULL, NULL}, + {"BMEM023: Test umempobj get MB stats", test_umempobj_heap_mb_stats, NULL, NULL}, + {NULL, NULL, NULL, NULL}}; rc = daos_debug_init(DAOS_LOG_DEFAULT); if (rc != 0) return rc; - d_register_alt_assert(mock_assert); - rc = cmocka_run_group_tests_name("umem tests", umem_tests, global_setup, global_teardown); + rc = cmocka_run_group_tests_name("bmem v1 tests", v1_tests, global_setup, global_teardown); + + rc += cmocka_run_group_tests_name("bmem v2 tests", v2_tests, global_setup, global_teardown); daos_debug_fini(); return rc; diff --git a/src/control/SConscript b/src/control/SConscript index 06410fee53a..17b654f162d 100644 --- a/src/control/SConscript +++ b/src/control/SConscript @@ -150,6 +150,7 @@ def scons(): dbenv = denv.Clone() dblibs = 
dbenv.subst("-L$BUILD_DIR/src/gurt " "-L$BUILD_DIR/src/cart " + "-L$BUILD_DIR/src/common/dav_v2 " "-L$BUILD_DIR/src/common " "-L$BUILD_DIR/src/client/dfs " "-L$BUILD_DIR/src/utils " @@ -184,6 +185,7 @@ def scons(): cgolibdirs = aenv.subst("-L$BUILD_DIR/src/control/lib/spdk " "-L$BUILD_DIR/src/gurt " "-L$BUILD_DIR/src/cart " + "-L$BUILD_DIR/src/common/dav_v2 " "-L$BUILD_DIR/src/common " "-L$BUILD_DIR/src/utils/ddb " "-L$SPDK_PREFIX/lib " @@ -210,7 +212,7 @@ def scons(): ddb_env.d_add_rpaths(None, True, True) # Add vos and dependent libs for ddb - ddb_env.AppendENVPath("CGO_LDFLAGS", " -lvos -ldaos_common_pmem -lpmem " + ddb_env.AppendENVPath("CGO_LDFLAGS", " -lvos -ldav_v2 -ldaos_common_pmem -lpmem " "-labt -lgurt -luuid -lbio -lcart", sep=" ") install_go_bin(ddb_env, "ddb", ['ddb']) diff --git a/src/control/cmd/daos/pretty/pool.go b/src/control/cmd/daos/pretty/pool.go index a9f685b536f..f1a0b4525a7 100644 --- a/src/control/cmd/daos/pretty/pool.go +++ b/src/control/cmd/daos/pretty/pool.go @@ -9,6 +9,7 @@ package pretty import ( "fmt" "io" + "strings" "github.com/dustin/go-humanize" "github.com/pkg/errors" @@ -19,14 +20,36 @@ import ( const msgNoPools = "No pools in system" -func getTierNameText(tierIdx int) string { - switch tierIdx { - case int(daos.StorageMediaTypeScm): - return fmt.Sprintf("- Storage tier %d (SCM):", tierIdx) - case int(daos.StorageMediaTypeNvme): - return fmt.Sprintf("- Storage tier %d (NVMe):", tierIdx) - default: - return fmt.Sprintf("- Storage tier %d (unknown):", tierIdx) +func printPoolTiers(memFileBytes uint64, suss []*daos.StorageUsageStats, w *txtfmt.ErrWriter, fullStats bool) { + mdOnSSD := memFileBytes != 0 + for tierIdx, tierStats := range suss { + if mdOnSSD { + if tierIdx == 0 { + if fullStats { + fmt.Fprintf(w, "- Total memory-file size: %s\n", + humanize.Bytes(memFileBytes)) + } + fmt.Fprintf(w, "- Metadata storage:\n") + } else { + fmt.Fprintf(w, "- Data storage:\n") + } + } else { + if tierIdx >= int(daos.StorageMediaTypeMax) { + // Print unknown type tiers. + tierStats.MediaType = daos.StorageMediaTypeMax + } + fmt.Fprintf(w, "- Storage tier %d (%s):\n", tierIdx, + strings.ToUpper(tierStats.MediaType.String())) + } + + fmt.Fprintf(w, " Total size: %s\n", humanize.Bytes(tierStats.Total)) + if fullStats { + fmt.Fprintf(w, " Free: %s, min:%s, max:%s, mean:%s\n", + humanize.Bytes(tierStats.Free), humanize.Bytes(tierStats.Min), + humanize.Bytes(tierStats.Max), humanize.Bytes(tierStats.Mean)) + } else { + fmt.Fprintf(w, " Free: %s\n", humanize.Bytes(tierStats.Free)) + } } } @@ -66,14 +89,8 @@ func PrintPoolInfo(pi *daos.PoolInfo, out io.Writer) error { if pi.QueryMask.HasOption(daos.PoolQueryOptionSpace) && pi.TierStats != nil { fmt.Fprintln(w, "Pool space info:") - fmt.Fprintf(w, "- Target(VOS) count:%d\n", pi.ActiveTargets) - for tierIdx, tierStats := range pi.TierStats { - fmt.Fprintln(w, getTierNameText(tierIdx)) - fmt.Fprintf(w, " Total size: %s\n", humanize.Bytes(tierStats.Total)) - fmt.Fprintf(w, " Free: %s, min:%s, max:%s, mean:%s\n", - humanize.Bytes(tierStats.Free), humanize.Bytes(tierStats.Min), - humanize.Bytes(tierStats.Max), humanize.Bytes(tierStats.Mean)) - } + fmt.Fprintf(w, "- Target count:%d\n", pi.ActiveTargets) + printPoolTiers(pi.MemFileBytes, pi.TierStats, w, true) } return w.Err } @@ -89,11 +106,7 @@ func PrintPoolQueryTargetInfo(pqti *daos.PoolQueryTargetInfo, out io.Writer) err // Maintain output compatibility with the `daos pool query-targets` output. 
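printPoolTiers above keys its headings off MemFileBytes: a non-zero memory-file size switches the output from the PMem-style "Storage tier N (SCM/NVME)" lines to the MD-on-SSD "Metadata storage"/"Data storage" lines. A minimal standalone sketch of that heading choice is below; the tier struct is a reduced stand-in for daos.StorageUsageStats, and the media-type strings are assumed to match its String() output.

package main

import (
	"fmt"
	"strings"
)

// tierStat is a reduced stand-in for daos.StorageUsageStats.
type tierStat struct {
	mediaType string // "scm" or "nvme"
}

// tierHeading mirrors the label selection in printPoolTiers: MD-on-SSD pools
// (memFileBytes != 0) report metadata/data storage, PMem pools report
// numbered tiers with the media type upper-cased.
func tierHeading(memFileBytes uint64, tierIdx int, ts tierStat) string {
	if memFileBytes != 0 {
		if tierIdx == 0 {
			return "- Metadata storage:"
		}
		return "- Data storage:"
	}
	return fmt.Sprintf("- Storage tier %d (%s):", tierIdx, strings.ToUpper(ts.mediaType))
}

func main() {
	tiers := []tierStat{{mediaType: "scm"}, {mediaType: "nvme"}}
	for i, ts := range tiers {
		fmt.Println(tierHeading(0, i, ts)) // PMem-style headings
	}
	for i, ts := range tiers {
		fmt.Println(tierHeading(1, i, ts)) // MD-on-SSD headings
	}
}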
fmt.Fprintf(w, "Target: type %s, state %s\n", pqti.Type, pqti.State) if pqti.Space != nil { - for tierIdx, tierUsage := range pqti.Space { - fmt.Fprintln(w, getTierNameText(tierIdx)) - fmt.Fprintf(w, " Total size: %s\n", humanize.Bytes(tierUsage.Total)) - fmt.Fprintf(w, " Free: %s\n", humanize.Bytes(tierUsage.Free)) - } + printPoolTiers(pqti.MemFileBytes, pqti.Space, w, false) } return w.Err diff --git a/src/control/cmd/daos/pretty/pool_test.go b/src/control/cmd/daos/pretty/pool_test.go index 3a1724e1dda..938b73d0c86 100644 --- a/src/control/cmd/daos/pretty/pool_test.go +++ b/src/control/cmd/daos/pretty/pool_test.go @@ -55,12 +55,14 @@ Pool health info: }, TierStats: []*daos.StorageUsageStats{ { - Total: 2, - Free: 1, + Total: 2, + Free: 1, + MediaType: daos.StorageMediaTypeScm, }, { - Total: 2, - Free: 1, + Total: 2, + Free: 1, + MediaType: daos.StorageMediaTypeNvme, }, }, }, @@ -70,11 +72,11 @@ Pool layout out of date (1 < 2) -- see `+backtickStr+` for details. Pool health info: - Rebuild busy, 42 objs, 21 recs Pool space info: -- Target(VOS) count:1 +- Target count:1 - Storage tier 0 (SCM): Total size: 2 B Free: 1 B, min:0 B, max:0 B, mean:0 B -- Storage tier 1 (NVMe): +- Storage tier 1 (NVME): Total size: 2 B Free: 1 B, min:0 B, max:0 B, mean:0 B `, poolUUID.String()), @@ -99,12 +101,14 @@ Pool space info: }, TierStats: []*daos.StorageUsageStats{ { - Total: 2, - Free: 1, + Total: 2, + Free: 1, + MediaType: daos.StorageMediaTypeScm, }, { - Total: 2, - Free: 1, + Total: 2, + Free: 1, + MediaType: daos.StorageMediaTypeNvme, }, }, }, @@ -115,11 +119,11 @@ Pool health info: - Enabled ranks: 0-2 - Rebuild busy, 42 objs, 21 recs Pool space info: -- Target(VOS) count:1 +- Target count:1 - Storage tier 0 (SCM): Total size: 2 B Free: 1 B, min:0 B, max:0 B, mean:0 B -- Storage tier 1 (NVMe): +- Storage tier 1 (NVME): Total size: 2 B Free: 1 B, min:0 B, max:0 B, mean:0 B `, poolUUID.String()), @@ -144,12 +148,14 @@ Pool space info: }, TierStats: []*daos.StorageUsageStats{ { - Total: 2, - Free: 1, + Total: 2, + Free: 1, + MediaType: daos.StorageMediaTypeScm, }, { - Total: 2, - Free: 1, + Total: 2, + Free: 1, + MediaType: daos.StorageMediaTypeNvme, }, }, }, @@ -160,11 +166,11 @@ Pool health info: - Disabled ranks: 0-1,3 - Rebuild busy, 42 objs, 21 recs Pool space info: -- Target(VOS) count:1 +- Target count:1 - Storage tier 0 (SCM): Total size: 2 B Free: 1 B, min:0 B, max:0 B, mean:0 B -- Storage tier 1 (NVMe): +- Storage tier 1 (NVME): Total size: 2 B Free: 1 B, min:0 B, max:0 B, mean:0 B `, poolUUID.String()), @@ -189,12 +195,14 @@ Pool space info: }, TierStats: []*daos.StorageUsageStats{ { - Total: 2, - Free: 1, + Total: 2, + Free: 1, + MediaType: daos.StorageMediaTypeScm, }, { - Total: 2, - Free: 1, + Total: 2, + Free: 1, + MediaType: daos.StorageMediaTypeNvme, }, }, }, @@ -205,11 +213,11 @@ Pool health info: - Disabled ranks: 0-1,3 - Rebuild unknown, 42 objs, 21 recs Pool space info: -- Target(VOS) count:1 +- Target count:1 - Storage tier 0 (SCM): Total size: 2 B Free: 1 B, min:0 B, max:0 B, mean:0 B -- Storage tier 1 (NVMe): +- Storage tier 1 (NVME): Total size: 2 B Free: 1 B, min:0 B, max:0 B, mean:0 B `, poolUUID.String()), @@ -234,12 +242,14 @@ Pool space info: }, TierStats: []*daos.StorageUsageStats{ { - Total: 2, - Free: 1, + Total: 2, + Free: 1, + MediaType: daos.StorageMediaTypeScm, }, { - Total: 2, - Free: 1, + Total: 2, + Free: 1, + MediaType: daos.StorageMediaTypeNvme, }, }, }, @@ -249,13 +259,60 @@ Pool layout out of date (1 < 2) -- see `+backtickStr+` for details. 
Pool health info: - Rebuild failed, status=2 Pool space info: -- Target(VOS) count:1 +- Target count:1 - Storage tier 0 (SCM): Total size: 2 B Free: 1 B, min:0 B, max:0 B, mean:0 B -- Storage tier 1 (NVMe): +- Storage tier 1 (NVME): Total size: 2 B Free: 1 B, min:0 B, max:0 B, mean:0 B +`, poolUUID.String()), + }, + "normal response: MD-on-SSD": { + pi: &daos.PoolInfo{ + QueryMask: daos.DefaultPoolQueryMask, + State: daos.PoolServiceStateDegraded, + UUID: poolUUID, + TotalTargets: 2, + DisabledTargets: 1, + ActiveTargets: 1, + ServiceLeader: 42, + Version: 100, + PoolLayoutVer: 1, + UpgradeLayoutVer: 2, + Rebuild: &daos.PoolRebuildStatus{ + State: daos.PoolRebuildStateBusy, + Objects: 42, + Records: 21, + }, + TierStats: []*daos.StorageUsageStats{ + { + Total: 2, + Free: 1, + MediaType: daos.StorageMediaTypeScm, + }, + { + Total: 4, + Free: 2, + MediaType: daos.StorageMediaTypeNvme, + }, + }, + MemFileBytes: 1, + }, + expPrintStr: fmt.Sprintf(` +Pool %s, ntarget=2, disabled=1, leader=42, version=100, state=Degraded +Pool layout out of date (1 < 2) -- see `+backtickStr+` for details. +Pool health info: +- Rebuild busy, 42 objs, 21 recs +Pool space info: +- Target count:1 +- Total memory-file size: 1 B +- Metadata storage: + Total size: 2 B + Free: 1 B, min:0 B, max:0 B, mean:0 B +- Data storage: + Total size: 4 B + Free: 2 B, min:0 B, max:0 B, mean:0 B `, poolUUID.String()), }, } { @@ -287,12 +344,14 @@ func TestPretty_PrintPoolQueryTarget(t *testing.T) { State: daos.PoolTargetStateDownOut, Space: []*daos.StorageUsageStats{ { - Total: 6000000000, - Free: 5000000000, + Total: 6000000000, + Free: 5000000000, + MediaType: daos.StorageMediaTypeScm, }, { - Total: 100000000000, - Free: 90000000000, + Total: 100000000000, + Free: 90000000000, + MediaType: daos.StorageMediaTypeNvme, }, }, }, @@ -301,7 +360,7 @@ Target: type unknown, state down_out - Storage tier 0 (SCM): Total size: 6.0 GB Free: 5.0 GB -- Storage tier 1 (NVMe): +- Storage tier 1 (NVME): Total size: 100 GB Free: 90 GB `, @@ -312,12 +371,14 @@ Target: type unknown, state down_out State: daos.PoolTargetStateDown, Space: []*daos.StorageUsageStats{ { - Total: 6000000000, - Free: 5000000000, + Total: 6000000000, + Free: 5000000000, + MediaType: daos.StorageMediaTypeScm, }, { - Total: 100000000000, - Free: 90000000000, + Total: 100000000000, + Free: 90000000000, + MediaType: daos.StorageMediaTypeNvme, }, }, }, @@ -326,7 +387,7 @@ Target: type unknown, state down - Storage tier 0 (SCM): Total size: 6.0 GB Free: 5.0 GB -- Storage tier 1 (NVMe): +- Storage tier 1 (NVME): Total size: 100 GB Free: 90 GB `, @@ -337,12 +398,14 @@ Target: type unknown, state down State: daos.PoolTargetStateUp, Space: []*daos.StorageUsageStats{ { - Total: 6000000000, - Free: 5000000000, + Total: 6000000000, + Free: 5000000000, + MediaType: daos.StorageMediaTypeScm, }, { - Total: 100000000000, - Free: 90000000000, + Total: 100000000000, + Free: 90000000000, + MediaType: daos.StorageMediaTypeNvme, }, }, }, @@ -351,7 +414,7 @@ Target: type unknown, state up - Storage tier 0 (SCM): Total size: 6.0 GB Free: 5.0 GB -- Storage tier 1 (NVMe): +- Storage tier 1 (NVME): Total size: 100 GB Free: 90 GB `, @@ -362,12 +425,14 @@ Target: type unknown, state up State: daos.PoolTargetStateUpIn, Space: []*daos.StorageUsageStats{ { - Total: 6000000000, - Free: 5000000000, + Total: 6000000000, + Free: 5000000000, + MediaType: daos.StorageMediaTypeScm, }, { - Total: 100000000000, - Free: 90000000000, + Total: 100000000000, + Free: 90000000000, + MediaType: daos.StorageMediaTypeNvme, 
}, }, }, @@ -376,7 +441,7 @@ Target: type unknown, state up_in - Storage tier 0 (SCM): Total size: 6.0 GB Free: 5.0 GB -- Storage tier 1 (NVMe): +- Storage tier 1 (NVME): Total size: 100 GB Free: 90 GB `, @@ -387,12 +452,14 @@ Target: type unknown, state up_in State: daos.PoolTargetStateNew, Space: []*daos.StorageUsageStats{ { - Total: 6000000000, - Free: 5000000000, + Total: 6000000000, + Free: 5000000000, + MediaType: daos.StorageMediaTypeScm, }, { - Total: 100000000000, - Free: 90000000000, + Total: 100000000000, + Free: 90000000000, + MediaType: daos.StorageMediaTypeNvme, }, }, }, @@ -401,7 +468,7 @@ Target: type unknown, state new - Storage tier 0 (SCM): Total size: 6.0 GB Free: 5.0 GB -- Storage tier 1 (NVMe): +- Storage tier 1 (NVME): Total size: 100 GB Free: 90 GB `, @@ -412,12 +479,14 @@ Target: type unknown, state new State: daos.PoolTargetStateDrain, Space: []*daos.StorageUsageStats{ { - Total: 6000000000, - Free: 5000000000, + Total: 6000000000, + Free: 5000000000, + MediaType: daos.StorageMediaTypeScm, }, { - Total: 100000000000, - Free: 90000000000, + Total: 100000000000, + Free: 90000000000, + MediaType: daos.StorageMediaTypeNvme, }, }, }, @@ -426,7 +495,35 @@ Target: type unknown, state drain - Storage tier 0 (SCM): Total size: 6.0 GB Free: 5.0 GB -- Storage tier 1 (NVMe): +- Storage tier 1 (NVME): + Total size: 100 GB + Free: 90 GB +`, + }, + "valid: single target (unknown, down_out): MD-on-SSD": { + pqti: &daos.PoolQueryTargetInfo{ + Type: 0, + State: daos.PoolTargetStateDownOut, + Space: []*daos.StorageUsageStats{ + { + Total: 6000000000, + Free: 5000000000, + MediaType: daos.StorageMediaTypeScm, + }, + { + Total: 100000000000, + Free: 90000000000, + MediaType: daos.StorageMediaTypeNvme, + }, + }, + MemFileBytes: 3000000000, + }, + expPrintStr: ` +Target: type unknown, state down_out +- Metadata storage: + Total size: 6.0 GB + Free: 5.0 GB +- Data storage: Total size: 100 GB Free: 90 GB `, diff --git a/src/control/cmd/dmg/pool.go b/src/control/cmd/dmg/pool.go index 1127ab2b6f6..dbb9f50e495 100644 --- a/src/control/cmd/dmg/pool.go +++ b/src/control/cmd/dmg/pool.go @@ -51,7 +51,9 @@ type PoolCmd struct { var ( // Default to 6% SCM:94% NVMe - defaultTierRatios = []float64{0.06, 0.94} + defaultTierRatios = []float64{0.06, 0.94} + errPoolCreateIncompatOpts = errors.New("unsupported option combination, use (--scm-size and " + + "--nvme-size) or (--meta-size and --data-size) or (--size)") ) type tierRatioFlag struct { @@ -176,6 +178,9 @@ type PoolCreateCmd struct { NumSvcReps uint32 `short:"v" long:"nsvc" description:"Number of pool service replicas"` ScmSize ui.ByteSizeFlag `short:"s" long:"scm-size" description:"Per-engine SCM allocation for DAOS pool (manual)"` NVMeSize ui.ByteSizeFlag `short:"n" long:"nvme-size" description:"Per-engine NVMe allocation for DAOS pool (manual)"` + MetaSize ui.ByteSizeFlag `long:"meta-size" description:"Per-engine Metadata-on-SSD allocation for DAOS pool (manual). Only valid in MD-on-SSD mode"` + DataSize ui.ByteSizeFlag `long:"data-size" description:"Per-engine Data-on-SSD allocation for DAOS pool (manual). Only valid in MD-on-SSD mode"` + MemRatio tierRatioFlag `long:"mem-ratio" description:"Percentage of the pool metadata storage size (on SSD) that should be used as the memory file size (on ram-disk). 
Default value is 100% and only valid in MD-on-SSD mode"` RankList ui.RankSetFlag `short:"r" long:"ranks" description:"Storage engine unique identifiers (ranks) for DAOS pool"` Args struct { @@ -183,18 +188,6 @@ type PoolCreateCmd struct { } `positional-args:"yes"` } -func (cmd *PoolCreateCmd) checkSizeArgs() error { - if cmd.Size.IsSet() { - if cmd.ScmSize.IsSet() || cmd.NVMeSize.IsSet() { - return errIncompatFlags("size", "scm-size", "nvme-size") - } - } else if !cmd.ScmSize.IsSet() { - return errors.New("either --size or --scm-size must be set") - } - - return nil -} - func ratio2Percentage(log logging.Logger, scm, nvme float64) (p float64) { p = 100.00 min := storage.MinScmToNVMeRatio * p @@ -212,6 +205,23 @@ func ratio2Percentage(log logging.Logger, scm, nvme float64) (p float64) { return } +// MemRatio can be supplied as two fractions that make up 1 or a single fraction less than 1. +// Supply only the first fraction in request and if not set then use the default. +func (cmd *PoolCreateCmd) setMemRatio(req *control.PoolCreateReq, defVal float32) error { + if cmd.MemRatio.IsSet() { + f, err := ratiosToSingleFraction(cmd.MemRatio.Ratios()) + if err != nil { + return errors.Wrap(err, "md-on-ssd mode pool create unexpected mem-ratio") + } + + req.MemRatio = f + return nil + } + + req.MemRatio = defVal + return nil +} + func (cmd *PoolCreateCmd) storageAutoPercentage(ctx context.Context, req *control.PoolCreateReq) error { if cmd.NumRanks > 0 { return errIncompatFlags("size", "nranks") @@ -224,6 +234,11 @@ func (cmd *PoolCreateCmd) storageAutoPercentage(ctx context.Context, req *contro availFrac := float64(cmd.Size.availRatio) / 100.0 req.TierRatio = []float64{availFrac, availFrac} + // Pass --mem-ratio or zero if unset. + if err := cmd.setMemRatio(req, 0.0); err != nil { + return err + } + return nil } @@ -236,6 +251,11 @@ func (cmd *PoolCreateCmd) storageAutoTotal(req *control.PoolCreateReq) error { req.TierRatio = cmd.TierRatio.Ratios() req.TotalBytes = cmd.Size.Bytes + // Pass --mem-ratio or zero if unset. + if err := cmd.setMemRatio(req, 0.0); err != nil { + return err + } + scmPercentage := ratio2Percentage(cmd.Logger, req.TierRatio[0], req.TierRatio[1]) msg := fmt.Sprintf("Creating DAOS pool with automatic storage allocation: "+ "%s total, %0.2f%% ratio", humanize.Bytes(req.TotalBytes), scmPercentage) @@ -247,12 +267,40 @@ func (cmd *PoolCreateCmd) storageAutoTotal(req *control.PoolCreateReq) error { return nil } +func (cmd *PoolCreateCmd) storageManualMdOnSsd(req *control.PoolCreateReq) error { + metaBytes := cmd.MetaSize.Bytes + dataBytes := cmd.DataSize.Bytes + req.TierBytes = []uint64{metaBytes, dataBytes} + + // Explicitly set mem-ratio non-zero, this will prevent MD-on-SSD syntax being used if the + // mode is not enabled by providing indication of which syntax type was used. 
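setMemRatio above carries only a single fraction in the request. Going by the pool_test.go expectations later in this patch (--mem-ratio 25.5 and 25.5,74.5 both yield MemRatio 0.255, 100 yields 1, three values are rejected), the conversion can be sketched roughly as follows; this is a standalone approximation written against those test cases, not the real ratiosToSingleFraction helper, and the error text is illustrative only.

package main

import (
	"errors"
	"fmt"
	"math"
)

// memRatioFraction models the expected behaviour: the flag's percentages are
// already parsed into fractions, a single fraction is used as-is, two
// fractions must sum to 1 and only the first is kept, anything else fails.
func memRatioFraction(ratios []float64) (float32, error) {
	switch len(ratios) {
	case 1:
		return float32(ratios[0]), nil
	case 2:
		if math.Abs(ratios[0]+ratios[1]-1) > 1e-6 {
			return 0, errors.New("mem-ratio values must sum to 100%")
		}
		return float32(ratios[0]), nil
	default:
		return 0, errors.New("unexpected mem-ratio: expected one or two values")
	}
}

func main() {
	for _, in := range [][]float64{{0.255}, {0.255, 0.745}, {1}, {0.255, 0.255, 0.49}} {
		f, err := memRatioFraction(in)
		fmt.Println(in, f, err)
	}
}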
+ if err := cmd.setMemRatio(req, storage.DefaultMemoryFileRatio); err != nil { + return err + } + + msg := fmt.Sprintf("Creating DAOS pool in MD-on-SSD mode with manual per-engine storage "+ + "allocation: %s metadata, %s data (%0.2f%% storage ratio) and %0.2f%% "+ + "memory-file:meta-blob size ratio", humanize.Bytes(metaBytes), + humanize.Bytes(dataBytes), 100.00*(float64(metaBytes)/float64(dataBytes)), + 100.00*req.MemRatio) + cmd.Info(msg) + + return nil +} + func (cmd *PoolCreateCmd) storageManual(req *control.PoolCreateReq) error { - if cmd.NumRanks > 0 { + switch { + case cmd.NumRanks > 0: return errIncompatFlags("nranks", "scm-size") - } - if cmd.TierRatio.IsSet() { + case cmd.TierRatio.IsSet(): return errIncompatFlags("tier-ratio", "scm-size") + case cmd.MetaSize.IsSet() || cmd.DataSize.IsSet(): + cmd.Tracef("md-on-ssd options detected for pool create: %+v", cmd) + return cmd.storageManualMdOnSsd(req) + case cmd.MemRatio.IsSet(): + return errIncompatFlags("mem-ratio", "scm-size", "nvme-size") + case cmd.NVMeSize.IsSet() && !cmd.ScmSize.IsSet(): + return errors.New("--nvme-size cannot be set without --scm-size") } scmBytes := cmd.ScmSize.Bytes @@ -270,10 +318,6 @@ func (cmd *PoolCreateCmd) storageManual(req *control.PoolCreateReq) error { // Execute is run when PoolCreateCmd subcommand is activated func (cmd *PoolCreateCmd) Execute(args []string) error { - if err := cmd.checkSizeArgs(); err != nil { - return err - } - if cmd.Args.PoolLabel != "" { for _, prop := range cmd.Properties.ToSet { if prop.Name == "label" { @@ -302,6 +346,20 @@ func (cmd *PoolCreateCmd) Execute(args []string) error { } } + // Refuse unsupported input value combinations. + + pmemParams := cmd.ScmSize.IsSet() || cmd.NVMeSize.IsSet() + mdParams := cmd.MetaSize.IsSet() || cmd.DataSize.IsSet() + + switch { + case (pmemParams || mdParams) && cmd.Size.IsSet(): + return errPoolCreateIncompatOpts + case pmemParams && mdParams: + return errPoolCreateIncompatOpts + case !pmemParams && !mdParams && !cmd.Size.IsSet(): + return errPoolCreateIncompatOpts + } + // Validate supported input values and set request fields. 
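The "Refuse unsupported input value combinations" switch above boils down to: exactly one of the three sizing styles (--scm-size/--nvme-size, --meta-size/--data-size, or --size) may be used. A compact standalone restatement of that rule, with plain booleans standing in for the IsSet() calls:

package main

import "fmt"

// sizingStyleValid restates the errPoolCreateIncompatOpts check: the PMem
// flags, the MD-on-SSD flags and --size are mutually exclusive, and at least
// one of the three styles must be supplied.
func sizingStyleValid(pmemParams, mdParams, sizeSet bool) bool {
	styles := 0
	for _, set := range []bool{pmemParams, mdParams, sizeSet} {
		if set {
			styles++
		}
	}
	return styles == 1
}

func main() {
	fmt.Println(sizingStyleValid(true, false, false))  // --scm-size only: accepted
	fmt.Println(sizingStyleValid(false, true, true))   // --meta-size mixed with --size: rejected
	fmt.Println(sizingStyleValid(false, false, false)) // no sizing flags at all: rejected
}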
switch { diff --git a/src/control/cmd/dmg/pool_test.go b/src/control/cmd/dmg/pool_test.go index b1270b0f19f..5d30ec2dfb1 100644 --- a/src/control/cmd/dmg/pool_test.go +++ b/src/control/cmd/dmg/pool_test.go @@ -15,6 +15,7 @@ import ( "strings" "testing" + "github.com/dustin/go-humanize" "github.com/google/go-cmp/cmp" "github.com/google/go-cmp/cmp/cmpopts" "github.com/pkg/errors" @@ -232,7 +233,7 @@ func TestPoolCommands(t *testing.T) { "Create pool with missing size", "pool create label", "", - errors.New("must be set"), + errPoolCreateIncompatOpts, }, { "Create pool with missing label", @@ -244,13 +245,13 @@ func TestPoolCommands(t *testing.T) { "Create pool with incompatible arguments (auto nvme-size)", fmt.Sprintf("pool create label --size %s --nvme-size %s", testSizeStr, testSizeStr), "", - errors.New("may not be mixed"), + errPoolCreateIncompatOpts, }, { "Create pool with incompatible arguments (auto scm-size)", fmt.Sprintf("pool create label --size %s --scm-size %s", testSizeStr, testSizeStr), "", - errors.New("may not be mixed"), + errPoolCreateIncompatOpts, }, { "Create pool with incompatible arguments (% size nranks)", @@ -282,6 +283,24 @@ func TestPoolCommands(t *testing.T) { "", errors.New("may not be mixed"), }, + { + "Create pool with incompatible arguments (auto with meta-size)", + fmt.Sprintf("pool create label --size %s --meta-size 32G", testSizeStr), + "", + errPoolCreateIncompatOpts, + }, + { + "Create pool with incompatible arguments (scm-size with meta-size)", + fmt.Sprintf("pool create label --scm-size %s --meta-size 32G", testSizeStr), + "", + errPoolCreateIncompatOpts, + }, + { + "Create pool with incompatible arguments (scm-size with data-size)", + fmt.Sprintf("pool create label --scm-size %s --data-size 32G", testSizeStr), + "", + errPoolCreateIncompatOpts, + }, { "Create pool with too-large tier-ratio (auto)", fmt.Sprintf("pool create label --size %s --tier-ratio 200", testSizeStr), @@ -361,7 +380,7 @@ func TestPoolCommands(t *testing.T) { "Create pool with incompatible arguments (-n without -s)", fmt.Sprintf("pool create label --nvme-size %s", testSizeStr), "", - errors.New("must be set"), + errors.New("cannot be set without --scm-size"), }, { "Create pool with minimal arguments", @@ -380,6 +399,104 @@ func TestPoolCommands(t *testing.T) { }, " "), nil, }, + { + "Create pool with manual memory file ratio; legacy syntax", + fmt.Sprintf("pool create label --scm-size %s --mem-ratio 0.25", + testSizeStr), + "", + errors.New("may not be mixed"), + }, + { + "Create pool with default memory file ratio; MD-on-SSD syntax", + fmt.Sprintf("pool create label --meta-size %s --data-size 1024G", + testSizeStr), + strings.Join([]string{ + printRequest(t, &control.PoolCreateReq{ + User: eUsr.Username + "@", + UserGroup: eGrp.Name + "@", + Ranks: []ranklist.Rank{}, + TierBytes: []uint64{ + uint64(testSize), + 1024 * humanize.GByte, + }, + MemRatio: 1, + Properties: []*daos.PoolProperty{ + propWithVal("label", "label"), + }, + }), + }, " "), + nil, + }, + { + "Create pool with manual memory file ratio; MD-on-SSD syntax; single value", + fmt.Sprintf("pool create label --meta-size %s --data-size 1024G --mem-ratio 25.5", + testSizeStr), + strings.Join([]string{ + printRequest(t, &control.PoolCreateReq{ + User: eUsr.Username + "@", + UserGroup: eGrp.Name + "@", + Ranks: []ranklist.Rank{}, + TierBytes: []uint64{ + uint64(testSize), + 1024 * humanize.GByte, + }, + MemRatio: 0.255, + Properties: []*daos.PoolProperty{ + propWithVal("label", "label"), + }, + }), + }, " "), + nil, + }, + { + 
"Create pool with manual memory file ratio; MD-on-SSD syntax; both tiers", + fmt.Sprintf("pool create label --meta-size %s --data-size 1024G --mem-ratio 25.5,74.5", + testSizeStr), + strings.Join([]string{ + printRequest(t, &control.PoolCreateReq{ + User: eUsr.Username + "@", + UserGroup: eGrp.Name + "@", + Ranks: []ranklist.Rank{}, + TierBytes: []uint64{ + uint64(testSize), + 1024 * humanize.GByte, + }, + MemRatio: 0.255, + Properties: []*daos.PoolProperty{ + propWithVal("label", "label"), + }, + }), + }, " "), + nil, + }, + { + "Create pool with manual memory file ratio; MD-on-SSD syntax; three tiers", + fmt.Sprintf("pool create label --meta-size %s --data-size 1024G --mem-ratio 25.5,25.5,49", + testSizeStr), + "", + errors.New("unexpected mem-ratio"), + }, + { + "Create pool with manual memory file ratio; MD-on-SSD syntax; 100% tier", + fmt.Sprintf("pool create label --meta-size %s --data-size 1024G --mem-ratio 100", + testSizeStr), + strings.Join([]string{ + printRequest(t, &control.PoolCreateReq{ + User: eUsr.Username + "@", + UserGroup: eGrp.Name + "@", + Ranks: []ranklist.Rank{}, + TierBytes: []uint64{ + uint64(testSize), + 1024 * humanize.GByte, + }, + MemRatio: 1, + Properties: []*daos.PoolProperty{ + propWithVal("label", "label"), + }, + }), + }, " "), + nil, + }, { "Create pool with manual ranks", fmt.Sprintf("pool create label --size %s --ranks 1,2", testSizeStr), diff --git a/src/control/cmd/dmg/pretty/pool.go b/src/control/cmd/dmg/pretty/pool.go index 518502172cf..d28cc2f8061 100644 --- a/src/control/cmd/dmg/pretty/pool.go +++ b/src/control/cmd/dmg/pretty/pool.go @@ -21,17 +21,6 @@ import ( const msgNoPools = "No pools in system" -func getTierNameText(tierIdx int) string { - switch tierIdx { - case int(daos.StorageMediaTypeScm): - return fmt.Sprintf("- Storage tier %d (SCM):", tierIdx) - case int(daos.StorageMediaTypeNvme): - return fmt.Sprintf("- Storage tier %d (NVMe):", tierIdx) - default: - return fmt.Sprintf("- Storage tier %d (unknown):", tierIdx) - } -} - // PrintPoolQueryResponse generates a human-readable representation of the supplied // PoolQueryResp struct and writes it to the supplied io.Writer. func PrintPoolQueryResponse(pqr *control.PoolQueryResp, out io.Writer, opts ...PrintConfigOption) error { @@ -60,6 +49,42 @@ func PrintTierRatio(ratio float64) string { return fmt.Sprintf("%.2f%%", ratio*100) } +func printTierBytesRow(fmtName string, tierBytes uint64, numRanks int) txtfmt.TableRow { + return txtfmt.TableRow{ + fmtName: fmt.Sprintf("%s (%s / rank)", + humanize.Bytes(tierBytes*uint64(numRanks)), + humanize.Bytes(tierBytes)), + } +} + +func getPoolCreateRespRows(mdOnSSD bool, tierBytes []uint64, tierRatios []float64, numRanks int) (title string, rows []txtfmt.TableRow) { + title = "Pool created with " + tierName := "SCM" + if mdOnSSD { + tierName = "Metadata" + } + + for tierIdx, tierRatio := range tierRatios { + if tierIdx > 0 { + title += "," + tierName = "NVMe" + if mdOnSSD { + tierName = "Data" + } + } + + title += PrintTierRatio(tierRatio) + fmtName := fmt.Sprintf("Storage tier %d (%s)", tierIdx, tierName) + if mdOnSSD { + fmtName = tierName + " Storage" + } + rows = append(rows, printTierBytesRow(fmtName, tierBytes[tierIdx], numRanks)) + } + title += " storage tier ratio" + + return title, rows +} + // PrintPoolCreateResponse generates a human-readable representation of the pool create // response and prints it to the supplied io.Writer. 
func PrintPoolCreateResponse(pcr *control.PoolCreateResp, out io.Writer, opts ...PrintConfigOption) error { @@ -87,27 +112,28 @@ func PrintPoolCreateResponse(pcr *control.PoolCreateResp, out io.Writer, opts .. return errors.New("create response had 0 target ranks") } - numRanks := uint64(len(pcr.TgtRanks)) + numRanks := len(pcr.TgtRanks) fmtArgs := make([]txtfmt.TableRow, 0, 6) fmtArgs = append(fmtArgs, txtfmt.TableRow{"UUID": pcr.UUID}) fmtArgs = append(fmtArgs, txtfmt.TableRow{"Service Leader": fmt.Sprintf("%d", pcr.Leader)}) fmtArgs = append(fmtArgs, txtfmt.TableRow{"Service Ranks": pretty.PrintRanks(pcr.SvcReps)}) fmtArgs = append(fmtArgs, txtfmt.TableRow{"Storage Ranks": pretty.PrintRanks(pcr.TgtRanks)}) - fmtArgs = append(fmtArgs, txtfmt.TableRow{"Total Size": humanize.Bytes(totalSize * numRanks)}) + fmtArgs = append(fmtArgs, txtfmt.TableRow{ + "Total Size": humanize.Bytes(totalSize * uint64(numRanks)), + }) - title := "Pool created with " - tierName := "SCM" - for tierIdx, tierRatio := range tierRatios { - if tierIdx > 0 { - title += "," - tierName = "NVMe" - } + mdOnSsdEnabled := pcr.MemFileBytes > 0 - title += PrintTierRatio(tierRatio) - fmtName := fmt.Sprintf("Storage tier %d (%s)", tierIdx, tierName) - fmtArgs = append(fmtArgs, txtfmt.TableRow{fmtName: fmt.Sprintf("%s (%s / rank)", humanize.Bytes(pcr.TierBytes[tierIdx]*numRanks), humanize.Bytes(pcr.TierBytes[tierIdx]))}) + title, tierRows := getPoolCreateRespRows(mdOnSsdEnabled, pcr.TierBytes, tierRatios, + numRanks) + + // Print memory-file to meta-blob ratio for MD-on-SSD. + if mdOnSsdEnabled { + tierRows = append(tierRows, printTierBytesRow("Memory File Size", + pcr.MemFileBytes, numRanks)) } - title += " storage tier ratio" + + fmtArgs = append(fmtArgs, tierRows...) _, err := fmt.Fprintln(out, txtfmt.FormatEntity(title, fmtArgs)) return err diff --git a/src/control/cmd/dmg/pretty/pool_test.go b/src/control/cmd/dmg/pretty/pool_test.go index 720d0bf7e41..bbc880f5b82 100644 --- a/src/control/cmd/dmg/pretty/pool_test.go +++ b/src/control/cmd/dmg/pretty/pool_test.go @@ -22,7 +22,18 @@ import ( "github.com/daos-stack/daos/src/control/lib/ranklist" ) -func TestPretty_PrintPoolQueryTargetResp(t *testing.T) { +func TestPretty_PrintPoolQueryTargetResponse(t *testing.T) { + tier0 := &daos.StorageUsageStats{ + Total: 6000000000, + Free: 5000000000, + MediaType: daos.StorageMediaTypeScm, + } + tier1 := &daos.StorageUsageStats{ + Total: 100000000000, + Free: 90000000000, + MediaType: daos.StorageMediaTypeNvme, + } + for name, tc := range map[string]struct { pqtr *control.PoolQueryTargetResp expPrintStr string @@ -44,58 +55,22 @@ func TestPretty_PrintPoolQueryTargetResp(t *testing.T) { { Type: 0, State: daos.PoolTargetStateDown, - Space: []*daos.StorageUsageStats{ - { - Total: 6000000000, - Free: 5000000000, - }, - { - Total: 100000000000, - Free: 90000000000, - }, - }, + Space: []*daos.StorageUsageStats{tier0, tier1}, }, { Type: 0, State: daos.PoolTargetStateUpIn, - Space: []*daos.StorageUsageStats{ - { - Total: 6000000000, - Free: 5000000000, - }, - { - Total: 100000000000, - Free: 90000000000, - }, - }, + Space: []*daos.StorageUsageStats{tier0, tier1}, }, { Type: 0, State: daos.PoolTargetStateDownOut, - Space: []*daos.StorageUsageStats{ - { - Total: 6000000000, - Free: 5000000000, - }, - { - Total: 100000000000, - Free: 90000000000, - }, - }, + Space: []*daos.StorageUsageStats{tier0, tier1}, }, { Type: 0, State: daos.PoolTargetStateUpIn, - Space: []*daos.StorageUsageStats{ - { - Total: 6000000000, - Free: 5000000000, - }, - { - Total: 
100000000000, - Free: 90000000000, - }, - }, + Space: []*daos.StorageUsageStats{tier0, tier1}, }, }, }, @@ -104,28 +79,28 @@ Target: type unknown, state down - Storage tier 0 (SCM): Total size: 6.0 GB Free: 5.0 GB -- Storage tier 1 (NVMe): +- Storage tier 1 (NVME): Total size: 100 GB Free: 90 GB Target: type unknown, state up_in - Storage tier 0 (SCM): Total size: 6.0 GB Free: 5.0 GB -- Storage tier 1 (NVMe): +- Storage tier 1 (NVME): Total size: 100 GB Free: 90 GB Target: type unknown, state down_out - Storage tier 0 (SCM): Total size: 6.0 GB Free: 5.0 GB -- Storage tier 1 (NVMe): +- Storage tier 1 (NVME): Total size: 100 GB Free: 90 GB Target: type unknown, state up_in - Storage tier 0 (SCM): Total size: 6.0 GB Free: 5.0 GB -- Storage tier 1 (NVMe): +- Storage tier 1 (NVME): Total size: 100 GB Free: 90 GB `, @@ -137,58 +112,22 @@ Target: type unknown, state up_in { Type: 0, State: 42, - Space: []*daos.StorageUsageStats{ - { - Total: 6000000000, - Free: 5000000000, - }, - { - Total: 100000000000, - Free: 90000000000, - }, - }, + Space: []*daos.StorageUsageStats{tier0, tier1}, }, { Type: 0, State: daos.PoolTargetStateUpIn, - Space: []*daos.StorageUsageStats{ - { - Total: 6000000000, - Free: 5000000000, - }, - { - Total: 100000000000, - Free: 90000000000, - }, - }, + Space: []*daos.StorageUsageStats{tier0, tier1}, }, { Type: 0, State: daos.PoolTargetStateDownOut, - Space: []*daos.StorageUsageStats{ - { - Total: 6000000000, - Free: 5000000000, - }, - { - Total: 100000000000, - Free: 90000000000, - }, - }, + Space: []*daos.StorageUsageStats{tier0, tier1}, }, { Type: 0, State: daos.PoolTargetStateUpIn, - Space: []*daos.StorageUsageStats{ - { - Total: 6000000000, - Free: 5000000000, - }, - { - Total: 100000000000, - Free: 90000000000, - }, - }, + Space: []*daos.StorageUsageStats{tier0, tier1}, }, }, }, @@ -197,28 +136,28 @@ Target: type unknown, state invalid - Storage tier 0 (SCM): Total size: 6.0 GB Free: 5.0 GB -- Storage tier 1 (NVMe): +- Storage tier 1 (NVME): Total size: 100 GB Free: 90 GB Target: type unknown, state up_in - Storage tier 0 (SCM): Total size: 6.0 GB Free: 5.0 GB -- Storage tier 1 (NVMe): +- Storage tier 1 (NVME): Total size: 100 GB Free: 90 GB Target: type unknown, state down_out - Storage tier 0 (SCM): Total size: 6.0 GB Free: 5.0 GB -- Storage tier 1 (NVMe): +- Storage tier 1 (NVME): Total size: 100 GB Free: 90 GB Target: type unknown, state up_in - Storage tier 0 (SCM): Total size: 6.0 GB Free: 5.0 GB -- Storage tier 1 (NVMe): +- Storage tier 1 (NVME): Total size: 100 GB Free: 90 GB `, @@ -230,58 +169,22 @@ Target: type unknown, state up_in { Type: 42, State: daos.PoolTargetStateDown, - Space: []*daos.StorageUsageStats{ - { - Total: 6000000000, - Free: 5000000000, - }, - { - Total: 100000000000, - Free: 90000000000, - }, - }, + Space: []*daos.StorageUsageStats{tier0, tier1}, }, { Type: 0, State: daos.PoolTargetStateUpIn, - Space: []*daos.StorageUsageStats{ - { - Total: 6000000000, - Free: 5000000000, - }, - { - Total: 100000000000, - Free: 90000000000, - }, - }, + Space: []*daos.StorageUsageStats{tier0, tier1}, }, { Type: 0, State: daos.PoolTargetStateDownOut, - Space: []*daos.StorageUsageStats{ - { - Total: 6000000000, - Free: 5000000000, - }, - { - Total: 100000000000, - Free: 90000000000, - }, - }, + Space: []*daos.StorageUsageStats{tier0, tier1}, }, { Type: 0, State: daos.PoolTargetStateUpIn, - Space: []*daos.StorageUsageStats{ - { - Total: 6000000000, - Free: 5000000000, - }, - { - Total: 100000000000, - Free: 90000000000, - }, - }, + Space: 
[]*daos.StorageUsageStats{tier0, tier1}, }, }, }, @@ -290,28 +193,28 @@ Target: type invalid, state down - Storage tier 0 (SCM): Total size: 6.0 GB Free: 5.0 GB -- Storage tier 1 (NVMe): +- Storage tier 1 (NVME): Total size: 100 GB Free: 90 GB Target: type unknown, state up_in - Storage tier 0 (SCM): Total size: 6.0 GB Free: 5.0 GB -- Storage tier 1 (NVMe): +- Storage tier 1 (NVME): Total size: 100 GB Free: 90 GB Target: type unknown, state down_out - Storage tier 0 (SCM): Total size: 6.0 GB Free: 5.0 GB -- Storage tier 1 (NVMe): +- Storage tier 1 (NVME): Total size: 100 GB Free: 90 GB Target: type unknown, state up_in - Storage tier 0 (SCM): Total size: 6.0 GB Free: 5.0 GB -- Storage tier 1 (NVMe): +- Storage tier 1 (NVME): Total size: 100 GB Free: 90 GB `, @@ -324,14 +227,7 @@ Target: type unknown, state up_in Type: 0, State: daos.PoolTargetStateDown, Space: []*daos.StorageUsageStats{ - { - Total: 6000000000, - Free: 5000000000, - }, - { - Total: 100000000000, - Free: 90000000000, - }, + tier0, tier1, { Total: 800000000000, Free: 200000000000, @@ -342,14 +238,7 @@ Target: type unknown, state up_in Type: 0, State: daos.PoolTargetStateUpIn, Space: []*daos.StorageUsageStats{ - { - Total: 6000000000, - Free: 5000000000, - }, - { - Total: 100000000000, - Free: 90000000000, - }, + tier0, tier1, { Total: 800000000000, Free: 200000000000, @@ -360,17 +249,11 @@ Target: type unknown, state up_in Type: 0, State: daos.PoolTargetStateDownOut, Space: []*daos.StorageUsageStats{ + tier0, tier1, { - Total: 6000000000, - Free: 5000000000, - }, - { - Total: 100000000000, - Free: 90000000000, - }, - { - Total: 800000000000, - Free: 200000000000, + Total: 800000000000, + Free: 200000000000, + MediaType: daos.StorageMediaType(3), }, }, }, @@ -378,14 +261,7 @@ Target: type unknown, state up_in Type: 0, State: daos.PoolTargetStateUpIn, Space: []*daos.StorageUsageStats{ - { - Total: 6000000000, - Free: 5000000000, - }, - { - Total: 100000000000, - Free: 90000000000, - }, + tier0, tier1, { Total: 800000000000, Free: 200000000000, @@ -399,40 +275,40 @@ Target: type unknown, state down - Storage tier 0 (SCM): Total size: 6.0 GB Free: 5.0 GB -- Storage tier 1 (NVMe): +- Storage tier 1 (NVME): Total size: 100 GB Free: 90 GB -- Storage tier 2 (unknown): +- Storage tier 2 (UNKNOWN): Total size: 800 GB Free: 200 GB Target: type unknown, state up_in - Storage tier 0 (SCM): Total size: 6.0 GB Free: 5.0 GB -- Storage tier 1 (NVMe): +- Storage tier 1 (NVME): Total size: 100 GB Free: 90 GB -- Storage tier 2 (unknown): +- Storage tier 2 (UNKNOWN): Total size: 800 GB Free: 200 GB Target: type unknown, state down_out - Storage tier 0 (SCM): Total size: 6.0 GB Free: 5.0 GB -- Storage tier 1 (NVMe): +- Storage tier 1 (NVME): Total size: 100 GB Free: 90 GB -- Storage tier 2 (unknown): +- Storage tier 2 (UNKNOWN): Total size: 800 GB Free: 200 GB Target: type unknown, state up_in - Storage tier 0 (SCM): Total size: 6.0 GB Free: 5.0 GB -- Storage tier 1 (NVMe): +- Storage tier 1 (NVME): Total size: 100 GB Free: 90 GB -- Storage tier 2 (unknown): +- Storage tier 2 (UNKNOWN): Total size: 800 GB Free: 200 GB `, @@ -489,6 +365,31 @@ Pool created with 5.66%%,94.34%% storage tier ratio Storage tier 0 (SCM) : 2.4 GB (600 MB / rank) Storage tier 1 (NVMe): 40 GB (10 GB / rank) +`, test.MockPoolUUID()), + }, + "basic; md-on-ssd": { + pcr: &control.PoolCreateResp{ + UUID: test.MockUUID(), + SvcReps: mockRanks(0, 1, 2), + TgtRanks: mockRanks(0, 1, 2, 3), + TierBytes: []uint64{ + 600 * humanize.MByte, + 10 * humanize.GByte, + }, + MemFileBytes: 
300 * humanize.MByte, // Non-zero indicates MD-on-SSD. + }, + expPrintStr: fmt.Sprintf(` +Pool created with 5.66%%,94.34%% storage tier ratio +------------------------------------------------- + UUID : %s + Service Leader : 0 + Service Ranks : [0-2] + Storage Ranks : [0-3] + Total Size : 42 GB + Metadata Storage : 2.4 GB (600 MB / rank) + Data Storage : 40 GB (10 GB / rank) + Memory File Size : 1.2 GB (300 MB / rank) + `, test.MockPoolUUID()), }, "no nvme": { @@ -681,6 +582,33 @@ one 6.0 TB Ready 83%% 16%% 0/16 verbose: true, expPrintStr: msgNoPools + "\n", }, + "verbose, two pools": { + resp: &control.ListPoolsResp{ + Pools: []*daos.PoolInfo{ + { + UUID: test.MockPoolUUID(1), + TierStats: exampleTierStats, + TotalTargets: 16, + ActiveTargets: 16, + DisabledTargets: 0, + State: daos.PoolServiceStateReady, + PoolLayoutVer: 1, + UpgradeLayoutVer: 2, + Rebuild: &daos.PoolRebuildStatus{ + State: daos.PoolRebuildStateIdle, + }, + QueryMask: daos.DefaultPoolQueryMask, + }, + }, + }, + verbose: true, + expPrintStr: ` +Label UUID State SvcReps SCM Size SCM Used SCM Imbalance NVME Size NVME Used NVME Imbalance Disabled UpgradeNeeded? Rebuild State +----- ---- ----- ------- -------- -------- ------------- --------- --------- -------------- -------- -------------- ------------- +- 00000001-0001-0001-0001-000000000001 Ready N/A 100 GB 80 GB 16% 6.0 TB 5.0 TB 8% 0/16 1->2 idle + +`, + }, } { t.Run(name, func(t *testing.T) { var bld strings.Builder diff --git a/src/control/cmd/dmg/pretty/storage_nvme.go b/src/control/cmd/dmg/pretty/storage_nvme.go index a65c3c050b9..8920094f9d1 100644 --- a/src/control/cmd/dmg/pretty/storage_nvme.go +++ b/src/control/cmd/dmg/pretty/storage_nvme.go @@ -213,6 +213,22 @@ func printNvmeFormatResults(inCtrlrs storage.NvmeControllers, out io.Writer, opt return nil } +func rolesRankFromSmd(ctrlr *storage.NvmeController) (string, string) { + rolesStr := "NA" + roles := ctrlr.Roles() + if !roles.IsEmpty() { + rolesStr = roles.String() + } + + rankStr := "None" + rank := ctrlr.Rank() + if rank != ranklist.NilRank { + rankStr = rank.String() + } + + return rolesStr, rankStr +} + // PrintNvmeControllers displays controller details in a verbose table. func PrintNvmeControllers(controllers storage.NvmeControllers, out io.Writer, opts ...PrintConfigOption) error { w := txtfmt.NewErrWriter(out) @@ -245,18 +261,7 @@ func PrintNvmeControllers(controllers storage.NvmeControllers, out io.Writer, op row[fwTitle] = ctrlr.FwRev row[socketTitle] = fmt.Sprint(ctrlr.SocketID) row[capacityTitle] = humanize.Bytes(ctrlr.Capacity()) - roles := "NA" - rank := "None" - // Assumes that all SMD devices on a controller have the same roles and rank. 
- if len(ctrlr.SmdDevices) > 0 { - sd := ctrlr.SmdDevices[0] - roles = sd.Roles.String() - if sd.Rank != ranklist.NilRank { - rank = sd.Rank.String() - } - } - row[rolesTitle] = roles - row[rankTitle] = rank + row[rolesTitle], row[rankTitle] = rolesRankFromSmd(ctrlr) table = append(table, row) } @@ -276,7 +281,7 @@ func PrintNvmeHealthMap(hsm control.HostStorageMap, out io.Writer, opts ...Print lineBreak := strings.Repeat("-", len(hosts)) fmt.Fprintf(out, "%s\n%s\n%s\n", lineBreak, hosts, lineBreak) - if len(hss.HostStorage.NvmeDevices) == 0 { + if hss.HostStorage.NvmeDevices.Len() == 0 { fmt.Fprintln(out, " No NVMe devices detected") continue } diff --git a/src/control/cmd/dmg/utils.go b/src/control/cmd/dmg/utils.go index b8b97e43ff8..c29c74a628f 100644 --- a/src/control/cmd/dmg/utils.go +++ b/src/control/cmd/dmg/utils.go @@ -55,3 +55,17 @@ func errIncompatFlags(key string, incompat ...string) error { return errors.Errorf("%s with --%s", base, strings.Join(incompat, " or --")) } + +// Convert a pair of ratios to a single fraction. +func ratiosToSingleFraction(ratios []float64) (float32, error) { + nrRatios := len(ratios) + + // Most validation is already performed by the tierRatioFlag type; this just rejects + // incomplete or over-allocated tier combinations and restricts input to 1 or 2 tiers. + if nrRatios != 2 && ratios[0] < 1 { + return 0, errors.Errorf("want 2 ratio values got %d", nrRatios) + } + + // Precision loss deemed acceptable with conversion from float64 to float32. + return float32(ratios[0]), nil +} diff --git a/src/control/common/proto/ctl/storage_nvme.pb.go b/src/control/common/proto/ctl/storage_nvme.pb.go index cb2dc5099d4..ee0f6a92717 100644 --- a/src/control/common/proto/ctl/storage_nvme.pb.go +++ b/src/control/common/proto/ctl/storage_nvme.pb.go @@ -1,5 +1,5 @@ // -// (C) Copyright 2019-2023 Intel Corporation. +// (C) Copyright 2019-2024 Intel Corporation.
// // SPDX-License-Identifier: BSD-2-Clause-Patent // @@ -95,12 +95,13 @@ type ScanNvmeReq struct { sizeCache protoimpl.SizeCache unknownFields protoimpl.UnknownFields - Health bool `protobuf:"varint,1,opt,name=Health,proto3" json:"Health,omitempty"` // Retrieve NVMe device health statistics - Meta bool `protobuf:"varint,2,opt,name=Meta,proto3" json:"Meta,omitempty"` // Retrieve metadata relating to NVMe device - Basic bool `protobuf:"varint,3,opt,name=Basic,proto3" json:"Basic,omitempty"` // Strip NVMe device details to only basic - MetaSize uint64 `protobuf:"varint,4,opt,name=MetaSize,proto3" json:"MetaSize,omitempty"` // Size of the metadata blob - RdbSize uint64 `protobuf:"varint,5,opt,name=RdbSize,proto3" json:"RdbSize,omitempty"` // Size of the RDB blob - LinkStats bool `protobuf:"varint,6,opt,name=LinkStats,proto3" json:"LinkStats,omitempty"` // Populate PCIe link info in health statistics + Health bool `protobuf:"varint,1,opt,name=Health,proto3" json:"Health,omitempty"` // Retrieve NVMe device health statistics + Meta bool `protobuf:"varint,2,opt,name=Meta,proto3" json:"Meta,omitempty"` // Retrieve metadata relating to NVMe device + Basic bool `protobuf:"varint,3,opt,name=Basic,proto3" json:"Basic,omitempty"` // Strip NVMe device details to only basic + MetaSize uint64 `protobuf:"varint,4,opt,name=MetaSize,proto3" json:"MetaSize,omitempty"` // Size of the metadata blob + RdbSize uint64 `protobuf:"varint,5,opt,name=RdbSize,proto3" json:"RdbSize,omitempty"` // Size of the RDB blob + MemRatio float32 `protobuf:"fixed32,6,opt,name=MemRatio,proto3" json:"MemRatio,omitempty"` // Ratio of VOS-file:meta-blob sizes + LinkStats bool `protobuf:"varint,7,opt,name=LinkStats,proto3" json:"LinkStats,omitempty"` // Populate PCIe link info in health statistics } func (x *ScanNvmeReq) Reset() { @@ -170,6 +171,13 @@ func (x *ScanNvmeReq) GetRdbSize() uint64 { return 0 } +func (x *ScanNvmeReq) GetMemRatio() float32 { + if x != nil { + return x.MemRatio + } + return 0 +} + func (x *ScanNvmeReq) GetLinkStats() bool { if x != nil { return x.LinkStats @@ -284,7 +292,7 @@ var file_ctl_storage_nvme_proto_rawDesc = []byte{ 0x32, 0x12, 0x2e, 0x63, 0x74, 0x6c, 0x2e, 0x52, 0x65, 0x73, 0x70, 0x6f, 0x6e, 0x73, 0x65, 0x53, 0x74, 0x61, 0x74, 0x65, 0x52, 0x05, 0x73, 0x74, 0x61, 0x74, 0x65, 0x12, 0x1b, 0x0a, 0x09, 0x72, 0x6f, 0x6c, 0x65, 0x5f, 0x62, 0x69, 0x74, 0x73, 0x18, 0x03, 0x20, 0x01, 0x28, 0x0d, 0x52, 0x08, - 0x72, 0x6f, 0x6c, 0x65, 0x42, 0x69, 0x74, 0x73, 0x22, 0xa3, 0x01, 0x0a, 0x0b, 0x53, 0x63, 0x61, + 0x72, 0x6f, 0x6c, 0x65, 0x42, 0x69, 0x74, 0x73, 0x22, 0xbf, 0x01, 0x0a, 0x0b, 0x53, 0x63, 0x61, 0x6e, 0x4e, 0x76, 0x6d, 0x65, 0x52, 0x65, 0x71, 0x12, 0x16, 0x0a, 0x06, 0x48, 0x65, 0x61, 0x6c, 0x74, 0x68, 0x18, 0x01, 0x20, 0x01, 0x28, 0x08, 0x52, 0x06, 0x48, 0x65, 0x61, 0x6c, 0x74, 0x68, 0x12, 0x12, 0x0a, 0x04, 0x4d, 0x65, 0x74, 0x61, 0x18, 0x02, 0x20, 0x01, 0x28, 0x08, 0x52, 0x04, @@ -293,20 +301,22 @@ var file_ctl_storage_nvme_proto_rawDesc = []byte{ 0x74, 0x61, 0x53, 0x69, 0x7a, 0x65, 0x18, 0x04, 0x20, 0x01, 0x28, 0x04, 0x52, 0x08, 0x4d, 0x65, 0x74, 0x61, 0x53, 0x69, 0x7a, 0x65, 0x12, 0x18, 0x0a, 0x07, 0x52, 0x64, 0x62, 0x53, 0x69, 0x7a, 0x65, 0x18, 0x05, 0x20, 0x01, 0x28, 0x04, 0x52, 0x07, 0x52, 0x64, 0x62, 0x53, 0x69, 0x7a, 0x65, - 0x12, 0x1c, 0x0a, 0x09, 0x4c, 0x69, 0x6e, 0x6b, 0x53, 0x74, 0x61, 0x74, 0x73, 0x18, 0x06, 0x20, - 0x01, 0x28, 0x08, 0x52, 0x09, 0x4c, 0x69, 0x6e, 0x6b, 0x53, 0x74, 0x61, 0x74, 0x73, 0x22, 0x65, - 0x0a, 0x0c, 0x53, 0x63, 0x61, 0x6e, 0x4e, 0x76, 0x6d, 0x65, 0x52, 0x65, 0x73, 
0x70, 0x12, 0x2b, - 0x0a, 0x06, 0x63, 0x74, 0x72, 0x6c, 0x72, 0x73, 0x18, 0x01, 0x20, 0x03, 0x28, 0x0b, 0x32, 0x13, - 0x2e, 0x63, 0x74, 0x6c, 0x2e, 0x4e, 0x76, 0x6d, 0x65, 0x43, 0x6f, 0x6e, 0x74, 0x72, 0x6f, 0x6c, - 0x6c, 0x65, 0x72, 0x52, 0x06, 0x63, 0x74, 0x72, 0x6c, 0x72, 0x73, 0x12, 0x28, 0x0a, 0x05, 0x73, - 0x74, 0x61, 0x74, 0x65, 0x18, 0x02, 0x20, 0x01, 0x28, 0x0b, 0x32, 0x12, 0x2e, 0x63, 0x74, 0x6c, - 0x2e, 0x52, 0x65, 0x73, 0x70, 0x6f, 0x6e, 0x73, 0x65, 0x53, 0x74, 0x61, 0x74, 0x65, 0x52, 0x05, - 0x73, 0x74, 0x61, 0x74, 0x65, 0x22, 0x0f, 0x0a, 0x0d, 0x46, 0x6f, 0x72, 0x6d, 0x61, 0x74, 0x4e, - 0x76, 0x6d, 0x65, 0x52, 0x65, 0x71, 0x42, 0x39, 0x5a, 0x37, 0x67, 0x69, 0x74, 0x68, 0x75, 0x62, - 0x2e, 0x63, 0x6f, 0x6d, 0x2f, 0x64, 0x61, 0x6f, 0x73, 0x2d, 0x73, 0x74, 0x61, 0x63, 0x6b, 0x2f, - 0x64, 0x61, 0x6f, 0x73, 0x2f, 0x73, 0x72, 0x63, 0x2f, 0x63, 0x6f, 0x6e, 0x74, 0x72, 0x6f, 0x6c, - 0x2f, 0x63, 0x6f, 0x6d, 0x6d, 0x6f, 0x6e, 0x2f, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x2f, 0x63, 0x74, - 0x6c, 0x62, 0x06, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x33, + 0x12, 0x1a, 0x0a, 0x08, 0x4d, 0x65, 0x6d, 0x52, 0x61, 0x74, 0x69, 0x6f, 0x18, 0x06, 0x20, 0x01, + 0x28, 0x02, 0x52, 0x08, 0x4d, 0x65, 0x6d, 0x52, 0x61, 0x74, 0x69, 0x6f, 0x12, 0x1c, 0x0a, 0x09, + 0x4c, 0x69, 0x6e, 0x6b, 0x53, 0x74, 0x61, 0x74, 0x73, 0x18, 0x07, 0x20, 0x01, 0x28, 0x08, 0x52, + 0x09, 0x4c, 0x69, 0x6e, 0x6b, 0x53, 0x74, 0x61, 0x74, 0x73, 0x22, 0x65, 0x0a, 0x0c, 0x53, 0x63, + 0x61, 0x6e, 0x4e, 0x76, 0x6d, 0x65, 0x52, 0x65, 0x73, 0x70, 0x12, 0x2b, 0x0a, 0x06, 0x63, 0x74, + 0x72, 0x6c, 0x72, 0x73, 0x18, 0x01, 0x20, 0x03, 0x28, 0x0b, 0x32, 0x13, 0x2e, 0x63, 0x74, 0x6c, + 0x2e, 0x4e, 0x76, 0x6d, 0x65, 0x43, 0x6f, 0x6e, 0x74, 0x72, 0x6f, 0x6c, 0x6c, 0x65, 0x72, 0x52, + 0x06, 0x63, 0x74, 0x72, 0x6c, 0x72, 0x73, 0x12, 0x28, 0x0a, 0x05, 0x73, 0x74, 0x61, 0x74, 0x65, + 0x18, 0x02, 0x20, 0x01, 0x28, 0x0b, 0x32, 0x12, 0x2e, 0x63, 0x74, 0x6c, 0x2e, 0x52, 0x65, 0x73, + 0x70, 0x6f, 0x6e, 0x73, 0x65, 0x53, 0x74, 0x61, 0x74, 0x65, 0x52, 0x05, 0x73, 0x74, 0x61, 0x74, + 0x65, 0x22, 0x0f, 0x0a, 0x0d, 0x46, 0x6f, 0x72, 0x6d, 0x61, 0x74, 0x4e, 0x76, 0x6d, 0x65, 0x52, + 0x65, 0x71, 0x42, 0x39, 0x5a, 0x37, 0x67, 0x69, 0x74, 0x68, 0x75, 0x62, 0x2e, 0x63, 0x6f, 0x6d, + 0x2f, 0x64, 0x61, 0x6f, 0x73, 0x2d, 0x73, 0x74, 0x61, 0x63, 0x6b, 0x2f, 0x64, 0x61, 0x6f, 0x73, + 0x2f, 0x73, 0x72, 0x63, 0x2f, 0x63, 0x6f, 0x6e, 0x74, 0x72, 0x6f, 0x6c, 0x2f, 0x63, 0x6f, 0x6d, + 0x6d, 0x6f, 0x6e, 0x2f, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x2f, 0x63, 0x74, 0x6c, 0x62, 0x06, 0x70, + 0x72, 0x6f, 0x74, 0x6f, 0x33, } var ( diff --git a/src/control/common/proto/logging.go b/src/control/common/proto/logging.go index 033b235669d..5de759ce865 100644 --- a/src/control/common/proto/logging.go +++ b/src/control/common/proto/logging.go @@ -10,6 +10,7 @@ import ( "fmt" "strings" + "github.com/dustin/go-humanize" "google.golang.org/protobuf/proto" grpcpb "github.com/Jille/raft-grpc-transport/proto" @@ -96,6 +97,7 @@ func Debug(msg proto.Message) string { fmt.Fprintf(&bld, "(%.02f%%) ", m.TierRatio[i]) } } + fmt.Fprintf(&bld, "mem-ratio: %.02f ", m.MemRatio) case *mgmtpb.PoolCreateResp: fmt.Fprintf(&bld, "%T svc_ldr:%d ", m, m.SvcLdr) ranks := &ranklist.RankSet{} @@ -112,6 +114,7 @@ func Debug(msg proto.Message) string { for i, b := range m.TierBytes { fmt.Fprintf(&bld, "%d:%d ", i, b) } + fmt.Fprintf(&bld, "meta-file-size:%s", humanize.Bytes(m.MemFileBytes)) case *mgmtpb.PoolEvictReq: fmt.Fprintf(&bld, "%T pool:%s", m, m.Id) if len(m.Handles) > 0 { diff --git a/src/control/common/proto/mgmt/pool.pb.go 
b/src/control/common/proto/mgmt/pool.pb.go index d514bfa6c43..4c1103520d1 100644 --- a/src/control/common/proto/mgmt/pool.pb.go +++ b/src/control/common/proto/mgmt/pool.pb.go @@ -315,6 +315,7 @@ type PoolCreateReq struct { NumRanks uint32 `protobuf:"varint,11,opt,name=num_ranks,json=numRanks,proto3" json:"num_ranks,omitempty"` // Number of target ranks to use Ranks []uint32 `protobuf:"varint,12,rep,packed,name=ranks,proto3" json:"ranks,omitempty"` // target ranks TierBytes []uint64 `protobuf:"varint,13,rep,packed,name=tier_bytes,json=tierBytes,proto3" json:"tier_bytes,omitempty"` // Size in bytes of storage tier + MemRatio float32 `protobuf:"fixed32,14,opt,name=mem_ratio,json=memRatio,proto3" json:"mem_ratio,omitempty"` // Fraction of meta-blob-sz to use as mem-file-sz } func (x *PoolCreateReq) Reset() { @@ -440,17 +441,25 @@ func (x *PoolCreateReq) GetTierBytes() []uint64 { return nil } +func (x *PoolCreateReq) GetMemRatio() float32 { + if x != nil { + return x.MemRatio + } + return 0 +} + // PoolCreateResp returns created pool uuid and ranks. type PoolCreateResp struct { state protoimpl.MessageState sizeCache protoimpl.SizeCache unknownFields protoimpl.UnknownFields - Status int32 `protobuf:"varint,1,opt,name=status,proto3" json:"status,omitempty"` // DAOS error code - SvcLdr uint32 `protobuf:"varint,2,opt,name=svc_ldr,json=svcLdr,proto3" json:"svc_ldr,omitempty"` // Current service leader rank - SvcReps []uint32 `protobuf:"varint,3,rep,packed,name=svc_reps,json=svcReps,proto3" json:"svc_reps,omitempty"` // pool service replica ranks - TgtRanks []uint32 `protobuf:"varint,4,rep,packed,name=tgt_ranks,json=tgtRanks,proto3" json:"tgt_ranks,omitempty"` // pool target ranks - TierBytes []uint64 `protobuf:"varint,5,rep,packed,name=tier_bytes,json=tierBytes,proto3" json:"tier_bytes,omitempty"` // storage tiers allocated to pool + Status int32 `protobuf:"varint,1,opt,name=status,proto3" json:"status,omitempty"` // DAOS error code + SvcLdr uint32 `protobuf:"varint,2,opt,name=svc_ldr,json=svcLdr,proto3" json:"svc_ldr,omitempty"` // Current service leader rank + SvcReps []uint32 `protobuf:"varint,3,rep,packed,name=svc_reps,json=svcReps,proto3" json:"svc_reps,omitempty"` // pool service replica ranks + TgtRanks []uint32 `protobuf:"varint,4,rep,packed,name=tgt_ranks,json=tgtRanks,proto3" json:"tgt_ranks,omitempty"` // pool target ranks + TierBytes []uint64 `protobuf:"varint,5,rep,packed,name=tier_bytes,json=tierBytes,proto3" json:"tier_bytes,omitempty"` // per-rank storage tier sizes allocated in pool + MemFileBytes uint64 `protobuf:"varint,6,opt,name=mem_file_bytes,json=memFileBytes,proto3" json:"mem_file_bytes,omitempty"` // per-rank accumulated value of memory file sizes } func (x *PoolCreateResp) Reset() { @@ -520,6 +529,13 @@ func (x *PoolCreateResp) GetTierBytes() []uint64 { return nil } +func (x *PoolCreateResp) GetMemFileBytes() uint64 { + if x != nil { + return x.MemFileBytes + } + return 0 +} + // PoolDestroyReq supplies pool identifier and force flag. 
type PoolDestroyReq struct { state protoimpl.MessageState @@ -1150,8 +1166,9 @@ type PoolExtendResp struct { sizeCache protoimpl.SizeCache unknownFields protoimpl.UnknownFields - Status int32 `protobuf:"varint,1,opt,name=status,proto3" json:"status,omitempty"` // DAOS error code - TierBytes []uint64 `protobuf:"varint,2,rep,packed,name=tier_bytes,json=tierBytes,proto3" json:"tier_bytes,omitempty"` // storage tiers allocated to pool + Status int32 `protobuf:"varint,1,opt,name=status,proto3" json:"status,omitempty"` // DAOS error code + TierBytes []uint64 `protobuf:"varint,2,rep,packed,name=tier_bytes,json=tierBytes,proto3" json:"tier_bytes,omitempty"` // storage tiers allocated to pool + MetaBlobBytes uint32 `protobuf:"varint,3,opt,name=meta_blob_bytes,json=metaBlobBytes,proto3" json:"meta_blob_bytes,omitempty"` // Size in bytes of metadata blob on SSD } func (x *PoolExtendResp) Reset() { @@ -1200,6 +1217,13 @@ func (x *PoolExtendResp) GetTierBytes() []uint64 { return nil } +func (x *PoolExtendResp) GetMetaBlobBytes() uint32 { + if x != nil { + return x.MetaBlobBytes + } + return 0 +} + // PoolReintegrateReq supplies pool identifier, rank, and target_idxs. type PoolReintegrateReq struct { state protoimpl.MessageState @@ -1826,6 +1850,7 @@ type PoolQueryResp struct { SvcLdr uint32 `protobuf:"varint,18,opt,name=svc_ldr,json=svcLdr,proto3" json:"svc_ldr,omitempty"` // current raft leader (2.6+) SvcReps []uint32 `protobuf:"varint,19,rep,packed,name=svc_reps,json=svcReps,proto3" json:"svc_reps,omitempty"` // service replica ranks QueryMask uint64 `protobuf:"varint,20,opt,name=query_mask,json=queryMask,proto3" json:"query_mask,omitempty"` // Bitmask of pool query options used + MemFileBytes uint64 `protobuf:"varint,21,opt,name=mem_file_bytes,json=memFileBytes,proto3" json:"mem_file_bytes,omitempty"` // per-pool accumulated value of memory file sizes } func (x *PoolQueryResp) Reset() { @@ -1993,6 +2018,13 @@ func (x *PoolQueryResp) GetQueryMask() uint64 { return 0 } +func (x *PoolQueryResp) GetMemFileBytes() uint64 { + if x != nil { + return x.MemFileBytes + } + return 0 +} + type PoolProperty struct { state protoimpl.MessageState sizeCache protoimpl.SizeCache @@ -2597,7 +2629,8 @@ type PoolQueryTargetInfo struct { Type PoolQueryTargetInfo_TargetType `protobuf:"varint,1,opt,name=type,proto3,enum=mgmt.PoolQueryTargetInfo_TargetType" json:"type,omitempty"` // Target type jsee enum daos_target_type_t State PoolQueryTargetInfo_TargetState `protobuf:"varint,2,opt,name=state,proto3,enum=mgmt.PoolQueryTargetInfo_TargetState" json:"state,omitempty"` // target state see enum daos_target_state_t // TODO: target performance data - Space []*StorageTargetUsage `protobuf:"bytes,3,rep,name=space,proto3" json:"space,omitempty"` // this target's usage per storage tier + Space []*StorageTargetUsage `protobuf:"bytes,3,rep,name=space,proto3" json:"space,omitempty"` // this target's usage per storage tier + MemFileBytes uint64 `protobuf:"varint,4,opt,name=mem_file_bytes,json=memFileBytes,proto3" json:"mem_file_bytes,omitempty"` // per-target value of memory file size } func (x *PoolQueryTargetInfo) Reset() { @@ -2653,6 +2686,13 @@ func (x *PoolQueryTargetInfo) GetSpace() []*StorageTargetUsage { return nil } +func (x *PoolQueryTargetInfo) GetMemFileBytes() uint64 { + if x != nil { + return x.MemFileBytes + } + return 0 +} + // PoolQueryTargetResp represents a pool target query response type PoolQueryTargetResp struct { state protoimpl.MessageState @@ -2839,7 +2879,7 @@ var File_mgmt_pool_proto 
protoreflect.FileDescriptor var file_mgmt_pool_proto_rawDesc = []byte{ 0x0a, 0x0f, 0x6d, 0x67, 0x6d, 0x74, 0x2f, 0x70, 0x6f, 0x6f, 0x6c, 0x2e, 0x70, 0x72, 0x6f, 0x74, - 0x6f, 0x12, 0x04, 0x6d, 0x67, 0x6d, 0x74, 0x22, 0x87, 0x03, 0x0a, 0x0d, 0x50, 0x6f, 0x6f, 0x6c, + 0x6f, 0x12, 0x04, 0x6d, 0x67, 0x6d, 0x74, 0x22, 0xa4, 0x03, 0x0a, 0x0d, 0x50, 0x6f, 0x6f, 0x6c, 0x43, 0x72, 0x65, 0x61, 0x74, 0x65, 0x52, 0x65, 0x71, 0x12, 0x12, 0x0a, 0x04, 0x75, 0x75, 0x69, 0x64, 0x18, 0x01, 0x20, 0x01, 0x28, 0x09, 0x52, 0x04, 0x75, 0x75, 0x69, 0x64, 0x12, 0x10, 0x0a, 0x03, 0x73, 0x79, 0x73, 0x18, 0x02, 0x20, 0x01, 0x28, 0x09, 0x52, 0x03, 0x73, 0x79, 0x73, 0x12, @@ -2864,217 +2904,215 @@ var file_mgmt_pool_proto_rawDesc = []byte{ 0x72, 0x61, 0x6e, 0x6b, 0x73, 0x18, 0x0c, 0x20, 0x03, 0x28, 0x0d, 0x52, 0x05, 0x72, 0x61, 0x6e, 0x6b, 0x73, 0x12, 0x1d, 0x0a, 0x0a, 0x74, 0x69, 0x65, 0x72, 0x5f, 0x62, 0x79, 0x74, 0x65, 0x73, 0x18, 0x0d, 0x20, 0x03, 0x28, 0x04, 0x52, 0x09, 0x74, 0x69, 0x65, 0x72, 0x42, 0x79, 0x74, 0x65, - 0x73, 0x22, 0x98, 0x01, 0x0a, 0x0e, 0x50, 0x6f, 0x6f, 0x6c, 0x43, 0x72, 0x65, 0x61, 0x74, 0x65, - 0x52, 0x65, 0x73, 0x70, 0x12, 0x16, 0x0a, 0x06, 0x73, 0x74, 0x61, 0x74, 0x75, 0x73, 0x18, 0x01, - 0x20, 0x01, 0x28, 0x05, 0x52, 0x06, 0x73, 0x74, 0x61, 0x74, 0x75, 0x73, 0x12, 0x17, 0x0a, 0x07, - 0x73, 0x76, 0x63, 0x5f, 0x6c, 0x64, 0x72, 0x18, 0x02, 0x20, 0x01, 0x28, 0x0d, 0x52, 0x06, 0x73, - 0x76, 0x63, 0x4c, 0x64, 0x72, 0x12, 0x19, 0x0a, 0x08, 0x73, 0x76, 0x63, 0x5f, 0x72, 0x65, 0x70, - 0x73, 0x18, 0x03, 0x20, 0x03, 0x28, 0x0d, 0x52, 0x07, 0x73, 0x76, 0x63, 0x52, 0x65, 0x70, 0x73, - 0x12, 0x1b, 0x0a, 0x09, 0x74, 0x67, 0x74, 0x5f, 0x72, 0x61, 0x6e, 0x6b, 0x73, 0x18, 0x04, 0x20, - 0x03, 0x28, 0x0d, 0x52, 0x08, 0x74, 0x67, 0x74, 0x52, 0x61, 0x6e, 0x6b, 0x73, 0x12, 0x1d, 0x0a, - 0x0a, 0x74, 0x69, 0x65, 0x72, 0x5f, 0x62, 0x79, 0x74, 0x65, 0x73, 0x18, 0x05, 0x20, 0x03, 0x28, - 0x04, 0x52, 0x09, 0x74, 0x69, 0x65, 0x72, 0x42, 0x79, 0x74, 0x65, 0x73, 0x22, 0x83, 0x01, 0x0a, - 0x0e, 0x50, 0x6f, 0x6f, 0x6c, 0x44, 0x65, 0x73, 0x74, 0x72, 0x6f, 0x79, 0x52, 0x65, 0x71, 0x12, - 0x10, 0x0a, 0x03, 0x73, 0x79, 0x73, 0x18, 0x01, 0x20, 0x01, 0x28, 0x09, 0x52, 0x03, 0x73, 0x79, - 0x73, 0x12, 0x0e, 0x0a, 0x02, 0x69, 0x64, 0x18, 0x02, 0x20, 0x01, 0x28, 0x09, 0x52, 0x02, 0x69, - 0x64, 0x12, 0x14, 0x0a, 0x05, 0x66, 0x6f, 0x72, 0x63, 0x65, 0x18, 0x03, 0x20, 0x01, 0x28, 0x08, - 0x52, 0x05, 0x66, 0x6f, 0x72, 0x63, 0x65, 0x12, 0x1b, 0x0a, 0x09, 0x73, 0x76, 0x63, 0x5f, 0x72, - 0x61, 0x6e, 0x6b, 0x73, 0x18, 0x04, 0x20, 0x03, 0x28, 0x0d, 0x52, 0x08, 0x73, 0x76, 0x63, 0x52, - 0x61, 0x6e, 0x6b, 0x73, 0x12, 0x1c, 0x0a, 0x09, 0x72, 0x65, 0x63, 0x75, 0x72, 0x73, 0x69, 0x76, - 0x65, 0x18, 0x05, 0x20, 0x01, 0x28, 0x08, 0x52, 0x09, 0x72, 0x65, 0x63, 0x75, 0x72, 0x73, 0x69, - 0x76, 0x65, 0x22, 0x29, 0x0a, 0x0f, 0x50, 0x6f, 0x6f, 0x6c, 0x44, 0x65, 0x73, 0x74, 0x72, 0x6f, - 0x79, 0x52, 0x65, 0x73, 0x70, 0x12, 0x16, 0x0a, 0x06, 0x73, 0x74, 0x61, 0x74, 0x75, 0x73, 0x18, - 0x01, 0x20, 0x01, 0x28, 0x05, 0x52, 0x06, 0x73, 0x74, 0x61, 0x74, 0x75, 0x73, 0x22, 0xc0, 0x01, - 0x0a, 0x0c, 0x50, 0x6f, 0x6f, 0x6c, 0x45, 0x76, 0x69, 0x63, 0x74, 0x52, 0x65, 0x71, 0x12, 0x10, - 0x0a, 0x03, 0x73, 0x79, 0x73, 0x18, 0x01, 0x20, 0x01, 0x28, 0x09, 0x52, 0x03, 0x73, 0x79, 0x73, - 0x12, 0x0e, 0x0a, 0x02, 0x69, 0x64, 0x18, 0x02, 0x20, 0x01, 0x28, 0x09, 0x52, 0x02, 0x69, 0x64, - 0x12, 0x1b, 0x0a, 0x09, 0x73, 0x76, 0x63, 0x5f, 0x72, 0x61, 0x6e, 0x6b, 0x73, 0x18, 0x03, 0x20, - 0x03, 0x28, 0x0d, 0x52, 0x08, 0x73, 0x76, 0x63, 0x52, 0x61, 0x6e, 0x6b, 0x73, 0x12, 0x18, 
0x0a, - 0x07, 0x68, 0x61, 0x6e, 0x64, 0x6c, 0x65, 0x73, 0x18, 0x04, 0x20, 0x03, 0x28, 0x09, 0x52, 0x07, - 0x68, 0x61, 0x6e, 0x64, 0x6c, 0x65, 0x73, 0x12, 0x18, 0x0a, 0x07, 0x64, 0x65, 0x73, 0x74, 0x72, - 0x6f, 0x79, 0x18, 0x05, 0x20, 0x01, 0x28, 0x08, 0x52, 0x07, 0x64, 0x65, 0x73, 0x74, 0x72, 0x6f, - 0x79, 0x12, 0x23, 0x0a, 0x0d, 0x66, 0x6f, 0x72, 0x63, 0x65, 0x5f, 0x64, 0x65, 0x73, 0x74, 0x72, - 0x6f, 0x79, 0x18, 0x06, 0x20, 0x01, 0x28, 0x08, 0x52, 0x0c, 0x66, 0x6f, 0x72, 0x63, 0x65, 0x44, - 0x65, 0x73, 0x74, 0x72, 0x6f, 0x79, 0x12, 0x18, 0x0a, 0x07, 0x6d, 0x61, 0x63, 0x68, 0x69, 0x6e, - 0x65, 0x18, 0x07, 0x20, 0x01, 0x28, 0x09, 0x52, 0x07, 0x6d, 0x61, 0x63, 0x68, 0x69, 0x6e, 0x65, - 0x22, 0x3d, 0x0a, 0x0d, 0x50, 0x6f, 0x6f, 0x6c, 0x45, 0x76, 0x69, 0x63, 0x74, 0x52, 0x65, 0x73, + 0x73, 0x12, 0x1b, 0x0a, 0x09, 0x6d, 0x65, 0x6d, 0x5f, 0x72, 0x61, 0x74, 0x69, 0x6f, 0x18, 0x0e, + 0x20, 0x01, 0x28, 0x02, 0x52, 0x08, 0x6d, 0x65, 0x6d, 0x52, 0x61, 0x74, 0x69, 0x6f, 0x22, 0xbe, + 0x01, 0x0a, 0x0e, 0x50, 0x6f, 0x6f, 0x6c, 0x43, 0x72, 0x65, 0x61, 0x74, 0x65, 0x52, 0x65, 0x73, 0x70, 0x12, 0x16, 0x0a, 0x06, 0x73, 0x74, 0x61, 0x74, 0x75, 0x73, 0x18, 0x01, 0x20, 0x01, 0x28, - 0x05, 0x52, 0x06, 0x73, 0x74, 0x61, 0x74, 0x75, 0x73, 0x12, 0x14, 0x0a, 0x05, 0x63, 0x6f, 0x75, - 0x6e, 0x74, 0x18, 0x02, 0x20, 0x01, 0x28, 0x05, 0x52, 0x05, 0x63, 0x6f, 0x75, 0x6e, 0x74, 0x22, - 0x82, 0x01, 0x0a, 0x0e, 0x50, 0x6f, 0x6f, 0x6c, 0x45, 0x78, 0x63, 0x6c, 0x75, 0x64, 0x65, 0x52, + 0x05, 0x52, 0x06, 0x73, 0x74, 0x61, 0x74, 0x75, 0x73, 0x12, 0x17, 0x0a, 0x07, 0x73, 0x76, 0x63, + 0x5f, 0x6c, 0x64, 0x72, 0x18, 0x02, 0x20, 0x01, 0x28, 0x0d, 0x52, 0x06, 0x73, 0x76, 0x63, 0x4c, + 0x64, 0x72, 0x12, 0x19, 0x0a, 0x08, 0x73, 0x76, 0x63, 0x5f, 0x72, 0x65, 0x70, 0x73, 0x18, 0x03, + 0x20, 0x03, 0x28, 0x0d, 0x52, 0x07, 0x73, 0x76, 0x63, 0x52, 0x65, 0x70, 0x73, 0x12, 0x1b, 0x0a, + 0x09, 0x74, 0x67, 0x74, 0x5f, 0x72, 0x61, 0x6e, 0x6b, 0x73, 0x18, 0x04, 0x20, 0x03, 0x28, 0x0d, + 0x52, 0x08, 0x74, 0x67, 0x74, 0x52, 0x61, 0x6e, 0x6b, 0x73, 0x12, 0x1d, 0x0a, 0x0a, 0x74, 0x69, + 0x65, 0x72, 0x5f, 0x62, 0x79, 0x74, 0x65, 0x73, 0x18, 0x05, 0x20, 0x03, 0x28, 0x04, 0x52, 0x09, + 0x74, 0x69, 0x65, 0x72, 0x42, 0x79, 0x74, 0x65, 0x73, 0x12, 0x24, 0x0a, 0x0e, 0x6d, 0x65, 0x6d, + 0x5f, 0x66, 0x69, 0x6c, 0x65, 0x5f, 0x62, 0x79, 0x74, 0x65, 0x73, 0x18, 0x06, 0x20, 0x01, 0x28, + 0x04, 0x52, 0x0c, 0x6d, 0x65, 0x6d, 0x46, 0x69, 0x6c, 0x65, 0x42, 0x79, 0x74, 0x65, 0x73, 0x22, + 0x83, 0x01, 0x0a, 0x0e, 0x50, 0x6f, 0x6f, 0x6c, 0x44, 0x65, 0x73, 0x74, 0x72, 0x6f, 0x79, 0x52, 0x65, 0x71, 0x12, 0x10, 0x0a, 0x03, 0x73, 0x79, 0x73, 0x18, 0x01, 0x20, 0x01, 0x28, 0x09, 0x52, 0x03, 0x73, 0x79, 0x73, 0x12, 0x0e, 0x0a, 0x02, 0x69, 0x64, 0x18, 0x02, 0x20, 0x01, 0x28, 0x09, - 0x52, 0x02, 0x69, 0x64, 0x12, 0x12, 0x0a, 0x04, 0x72, 0x61, 0x6e, 0x6b, 0x18, 0x03, 0x20, 0x01, - 0x28, 0x0d, 0x52, 0x04, 0x72, 0x61, 0x6e, 0x6b, 0x12, 0x1d, 0x0a, 0x0a, 0x74, 0x61, 0x72, 0x67, - 0x65, 0x74, 0x5f, 0x69, 0x64, 0x78, 0x18, 0x04, 0x20, 0x03, 0x28, 0x0d, 0x52, 0x09, 0x74, 0x61, - 0x72, 0x67, 0x65, 0x74, 0x49, 0x64, 0x78, 0x12, 0x1b, 0x0a, 0x09, 0x73, 0x76, 0x63, 0x5f, 0x72, - 0x61, 0x6e, 0x6b, 0x73, 0x18, 0x05, 0x20, 0x03, 0x28, 0x0d, 0x52, 0x08, 0x73, 0x76, 0x63, 0x52, - 0x61, 0x6e, 0x6b, 0x73, 0x22, 0x29, 0x0a, 0x0f, 0x50, 0x6f, 0x6f, 0x6c, 0x45, 0x78, 0x63, 0x6c, - 0x75, 0x64, 0x65, 0x52, 0x65, 0x73, 0x70, 0x12, 0x16, 0x0a, 0x06, 0x73, 0x74, 0x61, 0x74, 0x75, - 0x73, 0x18, 0x01, 0x20, 0x01, 0x28, 0x05, 0x52, 0x06, 0x73, 0x74, 0x61, 0x74, 0x75, 0x73, 0x22, - 0x80, 0x01, 0x0a, 0x0c, 
0x50, 0x6f, 0x6f, 0x6c, 0x44, 0x72, 0x61, 0x69, 0x6e, 0x52, 0x65, 0x71, - 0x12, 0x10, 0x0a, 0x03, 0x73, 0x79, 0x73, 0x18, 0x01, 0x20, 0x01, 0x28, 0x09, 0x52, 0x03, 0x73, - 0x79, 0x73, 0x12, 0x0e, 0x0a, 0x02, 0x69, 0x64, 0x18, 0x02, 0x20, 0x01, 0x28, 0x09, 0x52, 0x02, - 0x69, 0x64, 0x12, 0x12, 0x0a, 0x04, 0x72, 0x61, 0x6e, 0x6b, 0x18, 0x03, 0x20, 0x01, 0x28, 0x0d, - 0x52, 0x04, 0x72, 0x61, 0x6e, 0x6b, 0x12, 0x1d, 0x0a, 0x0a, 0x74, 0x61, 0x72, 0x67, 0x65, 0x74, - 0x5f, 0x69, 0x64, 0x78, 0x18, 0x04, 0x20, 0x03, 0x28, 0x0d, 0x52, 0x09, 0x74, 0x61, 0x72, 0x67, - 0x65, 0x74, 0x49, 0x64, 0x78, 0x12, 0x1b, 0x0a, 0x09, 0x73, 0x76, 0x63, 0x5f, 0x72, 0x61, 0x6e, - 0x6b, 0x73, 0x18, 0x05, 0x20, 0x03, 0x28, 0x0d, 0x52, 0x08, 0x73, 0x76, 0x63, 0x52, 0x61, 0x6e, - 0x6b, 0x73, 0x22, 0x27, 0x0a, 0x0d, 0x50, 0x6f, 0x6f, 0x6c, 0x44, 0x72, 0x61, 0x69, 0x6e, 0x52, - 0x65, 0x73, 0x70, 0x12, 0x16, 0x0a, 0x06, 0x73, 0x74, 0x61, 0x74, 0x75, 0x73, 0x18, 0x01, 0x20, - 0x01, 0x28, 0x05, 0x52, 0x06, 0x73, 0x74, 0x61, 0x74, 0x75, 0x73, 0x22, 0xa8, 0x01, 0x0a, 0x0d, - 0x50, 0x6f, 0x6f, 0x6c, 0x45, 0x78, 0x74, 0x65, 0x6e, 0x64, 0x52, 0x65, 0x71, 0x12, 0x10, 0x0a, - 0x03, 0x73, 0x79, 0x73, 0x18, 0x01, 0x20, 0x01, 0x28, 0x09, 0x52, 0x03, 0x73, 0x79, 0x73, 0x12, - 0x0e, 0x0a, 0x02, 0x69, 0x64, 0x18, 0x02, 0x20, 0x01, 0x28, 0x09, 0x52, 0x02, 0x69, 0x64, 0x12, - 0x14, 0x0a, 0x05, 0x72, 0x61, 0x6e, 0x6b, 0x73, 0x18, 0x03, 0x20, 0x03, 0x28, 0x0d, 0x52, 0x05, - 0x72, 0x61, 0x6e, 0x6b, 0x73, 0x12, 0x1b, 0x0a, 0x09, 0x73, 0x76, 0x63, 0x5f, 0x72, 0x61, 0x6e, - 0x6b, 0x73, 0x18, 0x04, 0x20, 0x03, 0x28, 0x0d, 0x52, 0x08, 0x73, 0x76, 0x63, 0x52, 0x61, 0x6e, - 0x6b, 0x73, 0x12, 0x1d, 0x0a, 0x0a, 0x74, 0x69, 0x65, 0x72, 0x5f, 0x62, 0x79, 0x74, 0x65, 0x73, - 0x18, 0x05, 0x20, 0x03, 0x28, 0x04, 0x52, 0x09, 0x74, 0x69, 0x65, 0x72, 0x42, 0x79, 0x74, 0x65, - 0x73, 0x12, 0x23, 0x0a, 0x0d, 0x66, 0x61, 0x75, 0x6c, 0x74, 0x5f, 0x64, 0x6f, 0x6d, 0x61, 0x69, - 0x6e, 0x73, 0x18, 0x06, 0x20, 0x03, 0x28, 0x0d, 0x52, 0x0c, 0x66, 0x61, 0x75, 0x6c, 0x74, 0x44, - 0x6f, 0x6d, 0x61, 0x69, 0x6e, 0x73, 0x22, 0x47, 0x0a, 0x0e, 0x50, 0x6f, 0x6f, 0x6c, 0x45, 0x78, - 0x74, 0x65, 0x6e, 0x64, 0x52, 0x65, 0x73, 0x70, 0x12, 0x16, 0x0a, 0x06, 0x73, 0x74, 0x61, 0x74, + 0x52, 0x02, 0x69, 0x64, 0x12, 0x14, 0x0a, 0x05, 0x66, 0x6f, 0x72, 0x63, 0x65, 0x18, 0x03, 0x20, + 0x01, 0x28, 0x08, 0x52, 0x05, 0x66, 0x6f, 0x72, 0x63, 0x65, 0x12, 0x1b, 0x0a, 0x09, 0x73, 0x76, + 0x63, 0x5f, 0x72, 0x61, 0x6e, 0x6b, 0x73, 0x18, 0x04, 0x20, 0x03, 0x28, 0x0d, 0x52, 0x08, 0x73, + 0x76, 0x63, 0x52, 0x61, 0x6e, 0x6b, 0x73, 0x12, 0x1c, 0x0a, 0x09, 0x72, 0x65, 0x63, 0x75, 0x72, + 0x73, 0x69, 0x76, 0x65, 0x18, 0x05, 0x20, 0x01, 0x28, 0x08, 0x52, 0x09, 0x72, 0x65, 0x63, 0x75, + 0x72, 0x73, 0x69, 0x76, 0x65, 0x22, 0x29, 0x0a, 0x0f, 0x50, 0x6f, 0x6f, 0x6c, 0x44, 0x65, 0x73, + 0x74, 0x72, 0x6f, 0x79, 0x52, 0x65, 0x73, 0x70, 0x12, 0x16, 0x0a, 0x06, 0x73, 0x74, 0x61, 0x74, 0x75, 0x73, 0x18, 0x01, 0x20, 0x01, 0x28, 0x05, 0x52, 0x06, 0x73, 0x74, 0x61, 0x74, 0x75, 0x73, - 0x12, 0x1d, 0x0a, 0x0a, 0x74, 0x69, 0x65, 0x72, 0x5f, 0x62, 0x79, 0x74, 0x65, 0x73, 0x18, 0x02, - 0x20, 0x03, 0x28, 0x04, 0x52, 0x09, 0x74, 0x69, 0x65, 0x72, 0x42, 0x79, 0x74, 0x65, 0x73, 0x22, - 0xa5, 0x01, 0x0a, 0x12, 0x50, 0x6f, 0x6f, 0x6c, 0x52, 0x65, 0x69, 0x6e, 0x74, 0x65, 0x67, 0x72, - 0x61, 0x74, 0x65, 0x52, 0x65, 0x71, 0x12, 0x10, 0x0a, 0x03, 0x73, 0x79, 0x73, 0x18, 0x01, 0x20, - 0x01, 0x28, 0x09, 0x52, 0x03, 0x73, 0x79, 0x73, 0x12, 0x0e, 0x0a, 0x02, 0x69, 0x64, 0x18, 0x02, - 0x20, 0x01, 0x28, 0x09, 0x52, 0x02, 0x69, 0x64, 
0x12, 0x12, 0x0a, 0x04, 0x72, 0x61, 0x6e, 0x6b, - 0x18, 0x03, 0x20, 0x01, 0x28, 0x0d, 0x52, 0x04, 0x72, 0x61, 0x6e, 0x6b, 0x12, 0x1d, 0x0a, 0x0a, - 0x74, 0x61, 0x72, 0x67, 0x65, 0x74, 0x5f, 0x69, 0x64, 0x78, 0x18, 0x04, 0x20, 0x03, 0x28, 0x0d, - 0x52, 0x09, 0x74, 0x61, 0x72, 0x67, 0x65, 0x74, 0x49, 0x64, 0x78, 0x12, 0x1b, 0x0a, 0x09, 0x73, - 0x76, 0x63, 0x5f, 0x72, 0x61, 0x6e, 0x6b, 0x73, 0x18, 0x05, 0x20, 0x03, 0x28, 0x0d, 0x52, 0x08, - 0x73, 0x76, 0x63, 0x52, 0x61, 0x6e, 0x6b, 0x73, 0x12, 0x1d, 0x0a, 0x0a, 0x74, 0x69, 0x65, 0x72, - 0x5f, 0x62, 0x79, 0x74, 0x65, 0x73, 0x18, 0x06, 0x20, 0x03, 0x28, 0x04, 0x52, 0x09, 0x74, 0x69, - 0x65, 0x72, 0x42, 0x79, 0x74, 0x65, 0x73, 0x22, 0x2d, 0x0a, 0x13, 0x50, 0x6f, 0x6f, 0x6c, 0x52, - 0x65, 0x69, 0x6e, 0x74, 0x65, 0x67, 0x72, 0x61, 0x74, 0x65, 0x52, 0x65, 0x73, 0x70, 0x12, 0x16, - 0x0a, 0x06, 0x73, 0x74, 0x61, 0x74, 0x75, 0x73, 0x18, 0x01, 0x20, 0x01, 0x28, 0x05, 0x52, 0x06, - 0x73, 0x74, 0x61, 0x74, 0x75, 0x73, 0x22, 0x20, 0x0a, 0x0c, 0x4c, 0x69, 0x73, 0x74, 0x50, 0x6f, - 0x6f, 0x6c, 0x73, 0x52, 0x65, 0x71, 0x12, 0x10, 0x0a, 0x03, 0x73, 0x79, 0x73, 0x18, 0x01, 0x20, - 0x01, 0x28, 0x09, 0x52, 0x03, 0x73, 0x79, 0x73, 0x22, 0x83, 0x02, 0x0a, 0x0d, 0x4c, 0x69, 0x73, - 0x74, 0x50, 0x6f, 0x6f, 0x6c, 0x73, 0x52, 0x65, 0x73, 0x70, 0x12, 0x16, 0x0a, 0x06, 0x73, 0x74, + 0x22, 0xc0, 0x01, 0x0a, 0x0c, 0x50, 0x6f, 0x6f, 0x6c, 0x45, 0x76, 0x69, 0x63, 0x74, 0x52, 0x65, + 0x71, 0x12, 0x10, 0x0a, 0x03, 0x73, 0x79, 0x73, 0x18, 0x01, 0x20, 0x01, 0x28, 0x09, 0x52, 0x03, + 0x73, 0x79, 0x73, 0x12, 0x0e, 0x0a, 0x02, 0x69, 0x64, 0x18, 0x02, 0x20, 0x01, 0x28, 0x09, 0x52, + 0x02, 0x69, 0x64, 0x12, 0x1b, 0x0a, 0x09, 0x73, 0x76, 0x63, 0x5f, 0x72, 0x61, 0x6e, 0x6b, 0x73, + 0x18, 0x03, 0x20, 0x03, 0x28, 0x0d, 0x52, 0x08, 0x73, 0x76, 0x63, 0x52, 0x61, 0x6e, 0x6b, 0x73, + 0x12, 0x18, 0x0a, 0x07, 0x68, 0x61, 0x6e, 0x64, 0x6c, 0x65, 0x73, 0x18, 0x04, 0x20, 0x03, 0x28, + 0x09, 0x52, 0x07, 0x68, 0x61, 0x6e, 0x64, 0x6c, 0x65, 0x73, 0x12, 0x18, 0x0a, 0x07, 0x64, 0x65, + 0x73, 0x74, 0x72, 0x6f, 0x79, 0x18, 0x05, 0x20, 0x01, 0x28, 0x08, 0x52, 0x07, 0x64, 0x65, 0x73, + 0x74, 0x72, 0x6f, 0x79, 0x12, 0x23, 0x0a, 0x0d, 0x66, 0x6f, 0x72, 0x63, 0x65, 0x5f, 0x64, 0x65, + 0x73, 0x74, 0x72, 0x6f, 0x79, 0x18, 0x06, 0x20, 0x01, 0x28, 0x08, 0x52, 0x0c, 0x66, 0x6f, 0x72, + 0x63, 0x65, 0x44, 0x65, 0x73, 0x74, 0x72, 0x6f, 0x79, 0x12, 0x18, 0x0a, 0x07, 0x6d, 0x61, 0x63, + 0x68, 0x69, 0x6e, 0x65, 0x18, 0x07, 0x20, 0x01, 0x28, 0x09, 0x52, 0x07, 0x6d, 0x61, 0x63, 0x68, + 0x69, 0x6e, 0x65, 0x22, 0x3d, 0x0a, 0x0d, 0x50, 0x6f, 0x6f, 0x6c, 0x45, 0x76, 0x69, 0x63, 0x74, + 0x52, 0x65, 0x73, 0x70, 0x12, 0x16, 0x0a, 0x06, 0x73, 0x74, 0x61, 0x74, 0x75, 0x73, 0x18, 0x01, + 0x20, 0x01, 0x28, 0x05, 0x52, 0x06, 0x73, 0x74, 0x61, 0x74, 0x75, 0x73, 0x12, 0x14, 0x0a, 0x05, + 0x63, 0x6f, 0x75, 0x6e, 0x74, 0x18, 0x02, 0x20, 0x01, 0x28, 0x05, 0x52, 0x05, 0x63, 0x6f, 0x75, + 0x6e, 0x74, 0x22, 0x82, 0x01, 0x0a, 0x0e, 0x50, 0x6f, 0x6f, 0x6c, 0x45, 0x78, 0x63, 0x6c, 0x75, + 0x64, 0x65, 0x52, 0x65, 0x71, 0x12, 0x10, 0x0a, 0x03, 0x73, 0x79, 0x73, 0x18, 0x01, 0x20, 0x01, + 0x28, 0x09, 0x52, 0x03, 0x73, 0x79, 0x73, 0x12, 0x0e, 0x0a, 0x02, 0x69, 0x64, 0x18, 0x02, 0x20, + 0x01, 0x28, 0x09, 0x52, 0x02, 0x69, 0x64, 0x12, 0x12, 0x0a, 0x04, 0x72, 0x61, 0x6e, 0x6b, 0x18, + 0x03, 0x20, 0x01, 0x28, 0x0d, 0x52, 0x04, 0x72, 0x61, 0x6e, 0x6b, 0x12, 0x1d, 0x0a, 0x0a, 0x74, + 0x61, 0x72, 0x67, 0x65, 0x74, 0x5f, 0x69, 0x64, 0x78, 0x18, 0x04, 0x20, 0x03, 0x28, 0x0d, 0x52, + 0x09, 0x74, 0x61, 0x72, 0x67, 0x65, 0x74, 0x49, 0x64, 0x78, 0x12, 0x1b, 
0x0a, 0x09, 0x73, 0x76, + 0x63, 0x5f, 0x72, 0x61, 0x6e, 0x6b, 0x73, 0x18, 0x05, 0x20, 0x03, 0x28, 0x0d, 0x52, 0x08, 0x73, + 0x76, 0x63, 0x52, 0x61, 0x6e, 0x6b, 0x73, 0x22, 0x29, 0x0a, 0x0f, 0x50, 0x6f, 0x6f, 0x6c, 0x45, + 0x78, 0x63, 0x6c, 0x75, 0x64, 0x65, 0x52, 0x65, 0x73, 0x70, 0x12, 0x16, 0x0a, 0x06, 0x73, 0x74, 0x61, 0x74, 0x75, 0x73, 0x18, 0x01, 0x20, 0x01, 0x28, 0x05, 0x52, 0x06, 0x73, 0x74, 0x61, 0x74, - 0x75, 0x73, 0x12, 0x2e, 0x0a, 0x05, 0x70, 0x6f, 0x6f, 0x6c, 0x73, 0x18, 0x02, 0x20, 0x03, 0x28, - 0x0b, 0x32, 0x18, 0x2e, 0x6d, 0x67, 0x6d, 0x74, 0x2e, 0x4c, 0x69, 0x73, 0x74, 0x50, 0x6f, 0x6f, - 0x6c, 0x73, 0x52, 0x65, 0x73, 0x70, 0x2e, 0x50, 0x6f, 0x6f, 0x6c, 0x52, 0x05, 0x70, 0x6f, 0x6f, - 0x6c, 0x73, 0x12, 0x21, 0x0a, 0x0c, 0x64, 0x61, 0x74, 0x61, 0x5f, 0x76, 0x65, 0x72, 0x73, 0x69, - 0x6f, 0x6e, 0x18, 0x03, 0x20, 0x01, 0x28, 0x04, 0x52, 0x0b, 0x64, 0x61, 0x74, 0x61, 0x56, 0x65, - 0x72, 0x73, 0x69, 0x6f, 0x6e, 0x1a, 0x86, 0x01, 0x0a, 0x04, 0x50, 0x6f, 0x6f, 0x6c, 0x12, 0x12, - 0x0a, 0x04, 0x75, 0x75, 0x69, 0x64, 0x18, 0x01, 0x20, 0x01, 0x28, 0x09, 0x52, 0x04, 0x75, 0x75, - 0x69, 0x64, 0x12, 0x14, 0x0a, 0x05, 0x6c, 0x61, 0x62, 0x65, 0x6c, 0x18, 0x02, 0x20, 0x01, 0x28, - 0x09, 0x52, 0x05, 0x6c, 0x61, 0x62, 0x65, 0x6c, 0x12, 0x19, 0x0a, 0x08, 0x73, 0x76, 0x63, 0x5f, - 0x72, 0x65, 0x70, 0x73, 0x18, 0x03, 0x20, 0x03, 0x28, 0x0d, 0x52, 0x07, 0x73, 0x76, 0x63, 0x52, - 0x65, 0x70, 0x73, 0x12, 0x14, 0x0a, 0x05, 0x73, 0x74, 0x61, 0x74, 0x65, 0x18, 0x04, 0x20, 0x01, - 0x28, 0x09, 0x52, 0x05, 0x73, 0x74, 0x61, 0x74, 0x65, 0x12, 0x23, 0x0a, 0x0d, 0x72, 0x65, 0x62, - 0x75, 0x69, 0x6c, 0x64, 0x5f, 0x73, 0x74, 0x61, 0x74, 0x65, 0x18, 0x05, 0x20, 0x01, 0x28, 0x09, - 0x52, 0x0c, 0x72, 0x65, 0x62, 0x75, 0x69, 0x6c, 0x64, 0x53, 0x74, 0x61, 0x74, 0x65, 0x22, 0x4c, - 0x0a, 0x0b, 0x4c, 0x69, 0x73, 0x74, 0x43, 0x6f, 0x6e, 0x74, 0x52, 0x65, 0x71, 0x12, 0x10, 0x0a, - 0x03, 0x73, 0x79, 0x73, 0x18, 0x01, 0x20, 0x01, 0x28, 0x09, 0x52, 0x03, 0x73, 0x79, 0x73, 0x12, - 0x0e, 0x0a, 0x02, 0x69, 0x64, 0x18, 0x02, 0x20, 0x01, 0x28, 0x09, 0x52, 0x02, 0x69, 0x64, 0x12, - 0x1b, 0x0a, 0x09, 0x73, 0x76, 0x63, 0x5f, 0x72, 0x61, 0x6e, 0x6b, 0x73, 0x18, 0x03, 0x20, 0x03, - 0x28, 0x0d, 0x52, 0x08, 0x73, 0x76, 0x63, 0x52, 0x61, 0x6e, 0x6b, 0x73, 0x22, 0x7b, 0x0a, 0x0c, - 0x4c, 0x69, 0x73, 0x74, 0x43, 0x6f, 0x6e, 0x74, 0x52, 0x65, 0x73, 0x70, 0x12, 0x16, 0x0a, 0x06, - 0x73, 0x74, 0x61, 0x74, 0x75, 0x73, 0x18, 0x01, 0x20, 0x01, 0x28, 0x05, 0x52, 0x06, 0x73, 0x74, - 0x61, 0x74, 0x75, 0x73, 0x12, 0x37, 0x0a, 0x0a, 0x63, 0x6f, 0x6e, 0x74, 0x61, 0x69, 0x6e, 0x65, - 0x72, 0x73, 0x18, 0x02, 0x20, 0x03, 0x28, 0x0b, 0x32, 0x17, 0x2e, 0x6d, 0x67, 0x6d, 0x74, 0x2e, - 0x4c, 0x69, 0x73, 0x74, 0x43, 0x6f, 0x6e, 0x74, 0x52, 0x65, 0x73, 0x70, 0x2e, 0x43, 0x6f, 0x6e, - 0x74, 0x52, 0x0a, 0x63, 0x6f, 0x6e, 0x74, 0x61, 0x69, 0x6e, 0x65, 0x72, 0x73, 0x1a, 0x1a, 0x0a, - 0x04, 0x43, 0x6f, 0x6e, 0x74, 0x12, 0x12, 0x0a, 0x04, 0x75, 0x75, 0x69, 0x64, 0x18, 0x01, 0x20, - 0x01, 0x28, 0x09, 0x52, 0x04, 0x75, 0x75, 0x69, 0x64, 0x22, 0x6c, 0x0a, 0x0c, 0x50, 0x6f, 0x6f, - 0x6c, 0x51, 0x75, 0x65, 0x72, 0x79, 0x52, 0x65, 0x71, 0x12, 0x10, 0x0a, 0x03, 0x73, 0x79, 0x73, - 0x18, 0x01, 0x20, 0x01, 0x28, 0x09, 0x52, 0x03, 0x73, 0x79, 0x73, 0x12, 0x0e, 0x0a, 0x02, 0x69, - 0x64, 0x18, 0x02, 0x20, 0x01, 0x28, 0x09, 0x52, 0x02, 0x69, 0x64, 0x12, 0x1b, 0x0a, 0x09, 0x73, - 0x76, 0x63, 0x5f, 0x72, 0x61, 0x6e, 0x6b, 0x73, 0x18, 0x03, 0x20, 0x03, 0x28, 0x0d, 0x52, 0x08, - 0x73, 0x76, 0x63, 0x52, 0x61, 0x6e, 0x6b, 0x73, 0x12, 0x1d, 0x0a, 0x0a, 0x71, 0x75, 0x65, 0x72, - 
0x79, 0x5f, 0x6d, 0x61, 0x73, 0x6b, 0x18, 0x04, 0x20, 0x01, 0x28, 0x04, 0x52, 0x09, 0x71, 0x75, - 0x65, 0x72, 0x79, 0x4d, 0x61, 0x73, 0x6b, 0x22, 0xac, 0x01, 0x0a, 0x11, 0x53, 0x74, 0x6f, 0x72, - 0x61, 0x67, 0x65, 0x55, 0x73, 0x61, 0x67, 0x65, 0x53, 0x74, 0x61, 0x74, 0x73, 0x12, 0x14, 0x0a, - 0x05, 0x74, 0x6f, 0x74, 0x61, 0x6c, 0x18, 0x01, 0x20, 0x01, 0x28, 0x04, 0x52, 0x05, 0x74, 0x6f, - 0x74, 0x61, 0x6c, 0x12, 0x12, 0x0a, 0x04, 0x66, 0x72, 0x65, 0x65, 0x18, 0x02, 0x20, 0x01, 0x28, - 0x04, 0x52, 0x04, 0x66, 0x72, 0x65, 0x65, 0x12, 0x10, 0x0a, 0x03, 0x6d, 0x69, 0x6e, 0x18, 0x03, - 0x20, 0x01, 0x28, 0x04, 0x52, 0x03, 0x6d, 0x69, 0x6e, 0x12, 0x10, 0x0a, 0x03, 0x6d, 0x61, 0x78, - 0x18, 0x04, 0x20, 0x01, 0x28, 0x04, 0x52, 0x03, 0x6d, 0x61, 0x78, 0x12, 0x12, 0x0a, 0x04, 0x6d, - 0x65, 0x61, 0x6e, 0x18, 0x05, 0x20, 0x01, 0x28, 0x04, 0x52, 0x04, 0x6d, 0x65, 0x61, 0x6e, 0x12, - 0x35, 0x0a, 0x0a, 0x6d, 0x65, 0x64, 0x69, 0x61, 0x5f, 0x74, 0x79, 0x70, 0x65, 0x18, 0x06, 0x20, - 0x01, 0x28, 0x0e, 0x32, 0x16, 0x2e, 0x6d, 0x67, 0x6d, 0x74, 0x2e, 0x53, 0x74, 0x6f, 0x72, 0x61, - 0x67, 0x65, 0x4d, 0x65, 0x64, 0x69, 0x61, 0x54, 0x79, 0x70, 0x65, 0x52, 0x09, 0x6d, 0x65, 0x64, - 0x69, 0x61, 0x54, 0x79, 0x70, 0x65, 0x22, 0xbb, 0x01, 0x0a, 0x11, 0x50, 0x6f, 0x6f, 0x6c, 0x52, - 0x65, 0x62, 0x75, 0x69, 0x6c, 0x64, 0x53, 0x74, 0x61, 0x74, 0x75, 0x73, 0x12, 0x16, 0x0a, 0x06, - 0x73, 0x74, 0x61, 0x74, 0x75, 0x73, 0x18, 0x01, 0x20, 0x01, 0x28, 0x05, 0x52, 0x06, 0x73, 0x74, - 0x61, 0x74, 0x75, 0x73, 0x12, 0x33, 0x0a, 0x05, 0x73, 0x74, 0x61, 0x74, 0x65, 0x18, 0x02, 0x20, - 0x01, 0x28, 0x0e, 0x32, 0x1d, 0x2e, 0x6d, 0x67, 0x6d, 0x74, 0x2e, 0x50, 0x6f, 0x6f, 0x6c, 0x52, - 0x65, 0x62, 0x75, 0x69, 0x6c, 0x64, 0x53, 0x74, 0x61, 0x74, 0x75, 0x73, 0x2e, 0x53, 0x74, 0x61, - 0x74, 0x65, 0x52, 0x05, 0x73, 0x74, 0x61, 0x74, 0x65, 0x12, 0x18, 0x0a, 0x07, 0x6f, 0x62, 0x6a, - 0x65, 0x63, 0x74, 0x73, 0x18, 0x03, 0x20, 0x01, 0x28, 0x04, 0x52, 0x07, 0x6f, 0x62, 0x6a, 0x65, - 0x63, 0x74, 0x73, 0x12, 0x18, 0x0a, 0x07, 0x72, 0x65, 0x63, 0x6f, 0x72, 0x64, 0x73, 0x18, 0x04, - 0x20, 0x01, 0x28, 0x04, 0x52, 0x07, 0x72, 0x65, 0x63, 0x6f, 0x72, 0x64, 0x73, 0x22, 0x25, 0x0a, - 0x05, 0x53, 0x74, 0x61, 0x74, 0x65, 0x12, 0x08, 0x0a, 0x04, 0x49, 0x44, 0x4c, 0x45, 0x10, 0x00, - 0x12, 0x08, 0x0a, 0x04, 0x44, 0x4f, 0x4e, 0x45, 0x10, 0x01, 0x12, 0x08, 0x0a, 0x04, 0x42, 0x55, - 0x53, 0x59, 0x10, 0x02, 0x22, 0xc0, 0x05, 0x0a, 0x0d, 0x50, 0x6f, 0x6f, 0x6c, 0x51, 0x75, 0x65, - 0x72, 0x79, 0x52, 0x65, 0x73, 0x70, 0x12, 0x16, 0x0a, 0x06, 0x73, 0x74, 0x61, 0x74, 0x75, 0x73, - 0x18, 0x01, 0x20, 0x01, 0x28, 0x05, 0x52, 0x06, 0x73, 0x74, 0x61, 0x74, 0x75, 0x73, 0x12, 0x12, - 0x0a, 0x04, 0x75, 0x75, 0x69, 0x64, 0x18, 0x02, 0x20, 0x01, 0x28, 0x09, 0x52, 0x04, 0x75, 0x75, - 0x69, 0x64, 0x12, 0x14, 0x0a, 0x05, 0x6c, 0x61, 0x62, 0x65, 0x6c, 0x18, 0x03, 0x20, 0x01, 0x28, - 0x09, 0x52, 0x05, 0x6c, 0x61, 0x62, 0x65, 0x6c, 0x12, 0x23, 0x0a, 0x0d, 0x74, 0x6f, 0x74, 0x61, - 0x6c, 0x5f, 0x74, 0x61, 0x72, 0x67, 0x65, 0x74, 0x73, 0x18, 0x04, 0x20, 0x01, 0x28, 0x0d, 0x52, - 0x0c, 0x74, 0x6f, 0x74, 0x61, 0x6c, 0x54, 0x61, 0x72, 0x67, 0x65, 0x74, 0x73, 0x12, 0x25, 0x0a, - 0x0e, 0x61, 0x63, 0x74, 0x69, 0x76, 0x65, 0x5f, 0x74, 0x61, 0x72, 0x67, 0x65, 0x74, 0x73, 0x18, - 0x05, 0x20, 0x01, 0x28, 0x0d, 0x52, 0x0d, 0x61, 0x63, 0x74, 0x69, 0x76, 0x65, 0x54, 0x61, 0x72, - 0x67, 0x65, 0x74, 0x73, 0x12, 0x29, 0x0a, 0x10, 0x64, 0x69, 0x73, 0x61, 0x62, 0x6c, 0x65, 0x64, - 0x5f, 0x74, 0x61, 0x72, 0x67, 0x65, 0x74, 0x73, 0x18, 0x06, 0x20, 0x01, 0x28, 0x0d, 0x52, 0x0f, - 0x64, 0x69, 0x73, 0x61, 
0x62, 0x6c, 0x65, 0x64, 0x54, 0x61, 0x72, 0x67, 0x65, 0x74, 0x73, 0x12, - 0x31, 0x0a, 0x07, 0x72, 0x65, 0x62, 0x75, 0x69, 0x6c, 0x64, 0x18, 0x07, 0x20, 0x01, 0x28, 0x0b, - 0x32, 0x17, 0x2e, 0x6d, 0x67, 0x6d, 0x74, 0x2e, 0x50, 0x6f, 0x6f, 0x6c, 0x52, 0x65, 0x62, 0x75, - 0x69, 0x6c, 0x64, 0x53, 0x74, 0x61, 0x74, 0x75, 0x73, 0x52, 0x07, 0x72, 0x65, 0x62, 0x75, 0x69, - 0x6c, 0x64, 0x12, 0x36, 0x0a, 0x0a, 0x74, 0x69, 0x65, 0x72, 0x5f, 0x73, 0x74, 0x61, 0x74, 0x73, - 0x18, 0x08, 0x20, 0x03, 0x28, 0x0b, 0x32, 0x17, 0x2e, 0x6d, 0x67, 0x6d, 0x74, 0x2e, 0x53, 0x74, - 0x6f, 0x72, 0x61, 0x67, 0x65, 0x55, 0x73, 0x61, 0x67, 0x65, 0x53, 0x74, 0x61, 0x74, 0x73, 0x52, - 0x09, 0x74, 0x69, 0x65, 0x72, 0x53, 0x74, 0x61, 0x74, 0x73, 0x12, 0x18, 0x0a, 0x07, 0x76, 0x65, - 0x72, 0x73, 0x69, 0x6f, 0x6e, 0x18, 0x0a, 0x20, 0x01, 0x28, 0x0d, 0x52, 0x07, 0x76, 0x65, 0x72, - 0x73, 0x69, 0x6f, 0x6e, 0x12, 0x16, 0x0a, 0x06, 0x6c, 0x65, 0x61, 0x64, 0x65, 0x72, 0x18, 0x0b, - 0x20, 0x01, 0x28, 0x0d, 0x52, 0x06, 0x6c, 0x65, 0x61, 0x64, 0x65, 0x72, 0x12, 0x23, 0x0a, 0x0d, - 0x65, 0x6e, 0x61, 0x62, 0x6c, 0x65, 0x64, 0x5f, 0x72, 0x61, 0x6e, 0x6b, 0x73, 0x18, 0x0c, 0x20, - 0x01, 0x28, 0x09, 0x52, 0x0c, 0x65, 0x6e, 0x61, 0x62, 0x6c, 0x65, 0x64, 0x52, 0x61, 0x6e, 0x6b, - 0x73, 0x12, 0x25, 0x0a, 0x0e, 0x64, 0x69, 0x73, 0x61, 0x62, 0x6c, 0x65, 0x64, 0x5f, 0x72, 0x61, - 0x6e, 0x6b, 0x73, 0x18, 0x0d, 0x20, 0x01, 0x28, 0x09, 0x52, 0x0d, 0x64, 0x69, 0x73, 0x61, 0x62, - 0x6c, 0x65, 0x64, 0x52, 0x61, 0x6e, 0x6b, 0x73, 0x12, 0x23, 0x0a, 0x0d, 0x74, 0x6f, 0x74, 0x61, - 0x6c, 0x5f, 0x65, 0x6e, 0x67, 0x69, 0x6e, 0x65, 0x73, 0x18, 0x0e, 0x20, 0x01, 0x28, 0x0d, 0x52, - 0x0c, 0x74, 0x6f, 0x74, 0x61, 0x6c, 0x45, 0x6e, 0x67, 0x69, 0x6e, 0x65, 0x73, 0x12, 0x26, 0x0a, - 0x0f, 0x70, 0x6f, 0x6f, 0x6c, 0x5f, 0x6c, 0x61, 0x79, 0x6f, 0x75, 0x74, 0x5f, 0x76, 0x65, 0x72, - 0x18, 0x0f, 0x20, 0x01, 0x28, 0x0d, 0x52, 0x0d, 0x70, 0x6f, 0x6f, 0x6c, 0x4c, 0x61, 0x79, 0x6f, - 0x75, 0x74, 0x56, 0x65, 0x72, 0x12, 0x2c, 0x0a, 0x12, 0x75, 0x70, 0x67, 0x72, 0x61, 0x64, 0x65, - 0x5f, 0x6c, 0x61, 0x79, 0x6f, 0x75, 0x74, 0x5f, 0x76, 0x65, 0x72, 0x18, 0x10, 0x20, 0x01, 0x28, - 0x0d, 0x52, 0x10, 0x75, 0x70, 0x67, 0x72, 0x61, 0x64, 0x65, 0x4c, 0x61, 0x79, 0x6f, 0x75, 0x74, - 0x56, 0x65, 0x72, 0x12, 0x2c, 0x0a, 0x05, 0x73, 0x74, 0x61, 0x74, 0x65, 0x18, 0x11, 0x20, 0x01, - 0x28, 0x0e, 0x32, 0x16, 0x2e, 0x6d, 0x67, 0x6d, 0x74, 0x2e, 0x50, 0x6f, 0x6f, 0x6c, 0x53, 0x65, - 0x72, 0x76, 0x69, 0x63, 0x65, 0x53, 0x74, 0x61, 0x74, 0x65, 0x52, 0x05, 0x73, 0x74, 0x61, 0x74, - 0x65, 0x12, 0x17, 0x0a, 0x07, 0x73, 0x76, 0x63, 0x5f, 0x6c, 0x64, 0x72, 0x18, 0x12, 0x20, 0x01, - 0x28, 0x0d, 0x52, 0x06, 0x73, 0x76, 0x63, 0x4c, 0x64, 0x72, 0x12, 0x19, 0x0a, 0x08, 0x73, 0x76, - 0x63, 0x5f, 0x72, 0x65, 0x70, 0x73, 0x18, 0x13, 0x20, 0x03, 0x28, 0x0d, 0x52, 0x07, 0x73, 0x76, - 0x63, 0x52, 0x65, 0x70, 0x73, 0x12, 0x1d, 0x0a, 0x0a, 0x71, 0x75, 0x65, 0x72, 0x79, 0x5f, 0x6d, - 0x61, 0x73, 0x6b, 0x18, 0x14, 0x20, 0x01, 0x28, 0x04, 0x52, 0x09, 0x71, 0x75, 0x65, 0x72, 0x79, - 0x4d, 0x61, 0x73, 0x6b, 0x4a, 0x04, 0x08, 0x09, 0x10, 0x0a, 0x52, 0x0b, 0x74, 0x6f, 0x74, 0x61, - 0x6c, 0x5f, 0x6e, 0x6f, 0x64, 0x65, 0x73, 0x22, 0x63, 0x0a, 0x0c, 0x50, 0x6f, 0x6f, 0x6c, 0x50, - 0x72, 0x6f, 0x70, 0x65, 0x72, 0x74, 0x79, 0x12, 0x16, 0x0a, 0x06, 0x6e, 0x75, 0x6d, 0x62, 0x65, - 0x72, 0x18, 0x01, 0x20, 0x01, 0x28, 0x0d, 0x52, 0x06, 0x6e, 0x75, 0x6d, 0x62, 0x65, 0x72, 0x12, - 0x18, 0x0a, 0x06, 0x73, 0x74, 0x72, 0x76, 0x61, 0x6c, 0x18, 0x02, 0x20, 0x01, 0x28, 0x09, 0x48, - 0x00, 0x52, 0x06, 0x73, 0x74, 0x72, 0x76, 0x61, 
0x6c, 0x12, 0x18, 0x0a, 0x06, 0x6e, 0x75, 0x6d, - 0x76, 0x61, 0x6c, 0x18, 0x03, 0x20, 0x01, 0x28, 0x04, 0x48, 0x00, 0x52, 0x06, 0x6e, 0x75, 0x6d, - 0x76, 0x61, 0x6c, 0x42, 0x07, 0x0a, 0x05, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x22, 0x83, 0x01, 0x0a, - 0x0e, 0x50, 0x6f, 0x6f, 0x6c, 0x53, 0x65, 0x74, 0x50, 0x72, 0x6f, 0x70, 0x52, 0x65, 0x71, 0x12, - 0x10, 0x0a, 0x03, 0x73, 0x79, 0x73, 0x18, 0x01, 0x20, 0x01, 0x28, 0x09, 0x52, 0x03, 0x73, 0x79, - 0x73, 0x12, 0x0e, 0x0a, 0x02, 0x69, 0x64, 0x18, 0x02, 0x20, 0x01, 0x28, 0x09, 0x52, 0x02, 0x69, - 0x64, 0x12, 0x32, 0x0a, 0x0a, 0x70, 0x72, 0x6f, 0x70, 0x65, 0x72, 0x74, 0x69, 0x65, 0x73, 0x18, - 0x03, 0x20, 0x03, 0x28, 0x0b, 0x32, 0x12, 0x2e, 0x6d, 0x67, 0x6d, 0x74, 0x2e, 0x50, 0x6f, 0x6f, - 0x6c, 0x50, 0x72, 0x6f, 0x70, 0x65, 0x72, 0x74, 0x79, 0x52, 0x0a, 0x70, 0x72, 0x6f, 0x70, 0x65, - 0x72, 0x74, 0x69, 0x65, 0x73, 0x12, 0x1b, 0x0a, 0x09, 0x73, 0x76, 0x63, 0x5f, 0x72, 0x61, 0x6e, - 0x6b, 0x73, 0x18, 0x04, 0x20, 0x03, 0x28, 0x0d, 0x52, 0x08, 0x73, 0x76, 0x63, 0x52, 0x61, 0x6e, - 0x6b, 0x73, 0x22, 0x29, 0x0a, 0x0f, 0x50, 0x6f, 0x6f, 0x6c, 0x53, 0x65, 0x74, 0x50, 0x72, 0x6f, - 0x70, 0x52, 0x65, 0x73, 0x70, 0x12, 0x16, 0x0a, 0x06, 0x73, 0x74, 0x61, 0x74, 0x75, 0x73, 0x18, - 0x01, 0x20, 0x01, 0x28, 0x05, 0x52, 0x06, 0x73, 0x74, 0x61, 0x74, 0x75, 0x73, 0x22, 0x83, 0x01, - 0x0a, 0x0e, 0x50, 0x6f, 0x6f, 0x6c, 0x47, 0x65, 0x74, 0x50, 0x72, 0x6f, 0x70, 0x52, 0x65, 0x71, + 0x75, 0x73, 0x22, 0x80, 0x01, 0x0a, 0x0c, 0x50, 0x6f, 0x6f, 0x6c, 0x44, 0x72, 0x61, 0x69, 0x6e, + 0x52, 0x65, 0x71, 0x12, 0x10, 0x0a, 0x03, 0x73, 0x79, 0x73, 0x18, 0x01, 0x20, 0x01, 0x28, 0x09, + 0x52, 0x03, 0x73, 0x79, 0x73, 0x12, 0x0e, 0x0a, 0x02, 0x69, 0x64, 0x18, 0x02, 0x20, 0x01, 0x28, + 0x09, 0x52, 0x02, 0x69, 0x64, 0x12, 0x12, 0x0a, 0x04, 0x72, 0x61, 0x6e, 0x6b, 0x18, 0x03, 0x20, + 0x01, 0x28, 0x0d, 0x52, 0x04, 0x72, 0x61, 0x6e, 0x6b, 0x12, 0x1d, 0x0a, 0x0a, 0x74, 0x61, 0x72, + 0x67, 0x65, 0x74, 0x5f, 0x69, 0x64, 0x78, 0x18, 0x04, 0x20, 0x03, 0x28, 0x0d, 0x52, 0x09, 0x74, + 0x61, 0x72, 0x67, 0x65, 0x74, 0x49, 0x64, 0x78, 0x12, 0x1b, 0x0a, 0x09, 0x73, 0x76, 0x63, 0x5f, + 0x72, 0x61, 0x6e, 0x6b, 0x73, 0x18, 0x05, 0x20, 0x03, 0x28, 0x0d, 0x52, 0x08, 0x73, 0x76, 0x63, + 0x52, 0x61, 0x6e, 0x6b, 0x73, 0x22, 0x27, 0x0a, 0x0d, 0x50, 0x6f, 0x6f, 0x6c, 0x44, 0x72, 0x61, + 0x69, 0x6e, 0x52, 0x65, 0x73, 0x70, 0x12, 0x16, 0x0a, 0x06, 0x73, 0x74, 0x61, 0x74, 0x75, 0x73, + 0x18, 0x01, 0x20, 0x01, 0x28, 0x05, 0x52, 0x06, 0x73, 0x74, 0x61, 0x74, 0x75, 0x73, 0x22, 0xa8, + 0x01, 0x0a, 0x0d, 0x50, 0x6f, 0x6f, 0x6c, 0x45, 0x78, 0x74, 0x65, 0x6e, 0x64, 0x52, 0x65, 0x71, + 0x12, 0x10, 0x0a, 0x03, 0x73, 0x79, 0x73, 0x18, 0x01, 0x20, 0x01, 0x28, 0x09, 0x52, 0x03, 0x73, + 0x79, 0x73, 0x12, 0x0e, 0x0a, 0x02, 0x69, 0x64, 0x18, 0x02, 0x20, 0x01, 0x28, 0x09, 0x52, 0x02, + 0x69, 0x64, 0x12, 0x14, 0x0a, 0x05, 0x72, 0x61, 0x6e, 0x6b, 0x73, 0x18, 0x03, 0x20, 0x03, 0x28, + 0x0d, 0x52, 0x05, 0x72, 0x61, 0x6e, 0x6b, 0x73, 0x12, 0x1b, 0x0a, 0x09, 0x73, 0x76, 0x63, 0x5f, + 0x72, 0x61, 0x6e, 0x6b, 0x73, 0x18, 0x04, 0x20, 0x03, 0x28, 0x0d, 0x52, 0x08, 0x73, 0x76, 0x63, + 0x52, 0x61, 0x6e, 0x6b, 0x73, 0x12, 0x1d, 0x0a, 0x0a, 0x74, 0x69, 0x65, 0x72, 0x5f, 0x62, 0x79, + 0x74, 0x65, 0x73, 0x18, 0x05, 0x20, 0x03, 0x28, 0x04, 0x52, 0x09, 0x74, 0x69, 0x65, 0x72, 0x42, + 0x79, 0x74, 0x65, 0x73, 0x12, 0x23, 0x0a, 0x0d, 0x66, 0x61, 0x75, 0x6c, 0x74, 0x5f, 0x64, 0x6f, + 0x6d, 0x61, 0x69, 0x6e, 0x73, 0x18, 0x06, 0x20, 0x03, 0x28, 0x0d, 0x52, 0x0c, 0x66, 0x61, 0x75, + 0x6c, 0x74, 0x44, 0x6f, 0x6d, 0x61, 0x69, 0x6e, 0x73, 0x22, 0x6f, 0x0a, 
0x0e, 0x50, 0x6f, 0x6f, + 0x6c, 0x45, 0x78, 0x74, 0x65, 0x6e, 0x64, 0x52, 0x65, 0x73, 0x70, 0x12, 0x16, 0x0a, 0x06, 0x73, + 0x74, 0x61, 0x74, 0x75, 0x73, 0x18, 0x01, 0x20, 0x01, 0x28, 0x05, 0x52, 0x06, 0x73, 0x74, 0x61, + 0x74, 0x75, 0x73, 0x12, 0x1d, 0x0a, 0x0a, 0x74, 0x69, 0x65, 0x72, 0x5f, 0x62, 0x79, 0x74, 0x65, + 0x73, 0x18, 0x02, 0x20, 0x03, 0x28, 0x04, 0x52, 0x09, 0x74, 0x69, 0x65, 0x72, 0x42, 0x79, 0x74, + 0x65, 0x73, 0x12, 0x26, 0x0a, 0x0f, 0x6d, 0x65, 0x74, 0x61, 0x5f, 0x62, 0x6c, 0x6f, 0x62, 0x5f, + 0x62, 0x79, 0x74, 0x65, 0x73, 0x18, 0x03, 0x20, 0x01, 0x28, 0x0d, 0x52, 0x0d, 0x6d, 0x65, 0x74, + 0x61, 0x42, 0x6c, 0x6f, 0x62, 0x42, 0x79, 0x74, 0x65, 0x73, 0x22, 0xa5, 0x01, 0x0a, 0x12, 0x50, + 0x6f, 0x6f, 0x6c, 0x52, 0x65, 0x69, 0x6e, 0x74, 0x65, 0x67, 0x72, 0x61, 0x74, 0x65, 0x52, 0x65, + 0x71, 0x12, 0x10, 0x0a, 0x03, 0x73, 0x79, 0x73, 0x18, 0x01, 0x20, 0x01, 0x28, 0x09, 0x52, 0x03, + 0x73, 0x79, 0x73, 0x12, 0x0e, 0x0a, 0x02, 0x69, 0x64, 0x18, 0x02, 0x20, 0x01, 0x28, 0x09, 0x52, + 0x02, 0x69, 0x64, 0x12, 0x12, 0x0a, 0x04, 0x72, 0x61, 0x6e, 0x6b, 0x18, 0x03, 0x20, 0x01, 0x28, + 0x0d, 0x52, 0x04, 0x72, 0x61, 0x6e, 0x6b, 0x12, 0x1d, 0x0a, 0x0a, 0x74, 0x61, 0x72, 0x67, 0x65, + 0x74, 0x5f, 0x69, 0x64, 0x78, 0x18, 0x04, 0x20, 0x03, 0x28, 0x0d, 0x52, 0x09, 0x74, 0x61, 0x72, + 0x67, 0x65, 0x74, 0x49, 0x64, 0x78, 0x12, 0x1b, 0x0a, 0x09, 0x73, 0x76, 0x63, 0x5f, 0x72, 0x61, + 0x6e, 0x6b, 0x73, 0x18, 0x05, 0x20, 0x03, 0x28, 0x0d, 0x52, 0x08, 0x73, 0x76, 0x63, 0x52, 0x61, + 0x6e, 0x6b, 0x73, 0x12, 0x1d, 0x0a, 0x0a, 0x74, 0x69, 0x65, 0x72, 0x5f, 0x62, 0x79, 0x74, 0x65, + 0x73, 0x18, 0x06, 0x20, 0x03, 0x28, 0x04, 0x52, 0x09, 0x74, 0x69, 0x65, 0x72, 0x42, 0x79, 0x74, + 0x65, 0x73, 0x22, 0x2d, 0x0a, 0x13, 0x50, 0x6f, 0x6f, 0x6c, 0x52, 0x65, 0x69, 0x6e, 0x74, 0x65, + 0x67, 0x72, 0x61, 0x74, 0x65, 0x52, 0x65, 0x73, 0x70, 0x12, 0x16, 0x0a, 0x06, 0x73, 0x74, 0x61, + 0x74, 0x75, 0x73, 0x18, 0x01, 0x20, 0x01, 0x28, 0x05, 0x52, 0x06, 0x73, 0x74, 0x61, 0x74, 0x75, + 0x73, 0x22, 0x20, 0x0a, 0x0c, 0x4c, 0x69, 0x73, 0x74, 0x50, 0x6f, 0x6f, 0x6c, 0x73, 0x52, 0x65, + 0x71, 0x12, 0x10, 0x0a, 0x03, 0x73, 0x79, 0x73, 0x18, 0x01, 0x20, 0x01, 0x28, 0x09, 0x52, 0x03, + 0x73, 0x79, 0x73, 0x22, 0x83, 0x02, 0x0a, 0x0d, 0x4c, 0x69, 0x73, 0x74, 0x50, 0x6f, 0x6f, 0x6c, + 0x73, 0x52, 0x65, 0x73, 0x70, 0x12, 0x16, 0x0a, 0x06, 0x73, 0x74, 0x61, 0x74, 0x75, 0x73, 0x18, + 0x01, 0x20, 0x01, 0x28, 0x05, 0x52, 0x06, 0x73, 0x74, 0x61, 0x74, 0x75, 0x73, 0x12, 0x2e, 0x0a, + 0x05, 0x70, 0x6f, 0x6f, 0x6c, 0x73, 0x18, 0x02, 0x20, 0x03, 0x28, 0x0b, 0x32, 0x18, 0x2e, 0x6d, + 0x67, 0x6d, 0x74, 0x2e, 0x4c, 0x69, 0x73, 0x74, 0x50, 0x6f, 0x6f, 0x6c, 0x73, 0x52, 0x65, 0x73, + 0x70, 0x2e, 0x50, 0x6f, 0x6f, 0x6c, 0x52, 0x05, 0x70, 0x6f, 0x6f, 0x6c, 0x73, 0x12, 0x21, 0x0a, + 0x0c, 0x64, 0x61, 0x74, 0x61, 0x5f, 0x76, 0x65, 0x72, 0x73, 0x69, 0x6f, 0x6e, 0x18, 0x03, 0x20, + 0x01, 0x28, 0x04, 0x52, 0x0b, 0x64, 0x61, 0x74, 0x61, 0x56, 0x65, 0x72, 0x73, 0x69, 0x6f, 0x6e, + 0x1a, 0x86, 0x01, 0x0a, 0x04, 0x50, 0x6f, 0x6f, 0x6c, 0x12, 0x12, 0x0a, 0x04, 0x75, 0x75, 0x69, + 0x64, 0x18, 0x01, 0x20, 0x01, 0x28, 0x09, 0x52, 0x04, 0x75, 0x75, 0x69, 0x64, 0x12, 0x14, 0x0a, + 0x05, 0x6c, 0x61, 0x62, 0x65, 0x6c, 0x18, 0x02, 0x20, 0x01, 0x28, 0x09, 0x52, 0x05, 0x6c, 0x61, + 0x62, 0x65, 0x6c, 0x12, 0x19, 0x0a, 0x08, 0x73, 0x76, 0x63, 0x5f, 0x72, 0x65, 0x70, 0x73, 0x18, + 0x03, 0x20, 0x03, 0x28, 0x0d, 0x52, 0x07, 0x73, 0x76, 0x63, 0x52, 0x65, 0x70, 0x73, 0x12, 0x14, + 0x0a, 0x05, 0x73, 0x74, 0x61, 0x74, 0x65, 0x18, 0x04, 0x20, 0x01, 0x28, 0x09, 0x52, 0x05, 0x73, + 
0x74, 0x61, 0x74, 0x65, 0x12, 0x23, 0x0a, 0x0d, 0x72, 0x65, 0x62, 0x75, 0x69, 0x6c, 0x64, 0x5f, + 0x73, 0x74, 0x61, 0x74, 0x65, 0x18, 0x05, 0x20, 0x01, 0x28, 0x09, 0x52, 0x0c, 0x72, 0x65, 0x62, + 0x75, 0x69, 0x6c, 0x64, 0x53, 0x74, 0x61, 0x74, 0x65, 0x22, 0x4c, 0x0a, 0x0b, 0x4c, 0x69, 0x73, + 0x74, 0x43, 0x6f, 0x6e, 0x74, 0x52, 0x65, 0x71, 0x12, 0x10, 0x0a, 0x03, 0x73, 0x79, 0x73, 0x18, + 0x01, 0x20, 0x01, 0x28, 0x09, 0x52, 0x03, 0x73, 0x79, 0x73, 0x12, 0x0e, 0x0a, 0x02, 0x69, 0x64, + 0x18, 0x02, 0x20, 0x01, 0x28, 0x09, 0x52, 0x02, 0x69, 0x64, 0x12, 0x1b, 0x0a, 0x09, 0x73, 0x76, + 0x63, 0x5f, 0x72, 0x61, 0x6e, 0x6b, 0x73, 0x18, 0x03, 0x20, 0x03, 0x28, 0x0d, 0x52, 0x08, 0x73, + 0x76, 0x63, 0x52, 0x61, 0x6e, 0x6b, 0x73, 0x22, 0x7b, 0x0a, 0x0c, 0x4c, 0x69, 0x73, 0x74, 0x43, + 0x6f, 0x6e, 0x74, 0x52, 0x65, 0x73, 0x70, 0x12, 0x16, 0x0a, 0x06, 0x73, 0x74, 0x61, 0x74, 0x75, + 0x73, 0x18, 0x01, 0x20, 0x01, 0x28, 0x05, 0x52, 0x06, 0x73, 0x74, 0x61, 0x74, 0x75, 0x73, 0x12, + 0x37, 0x0a, 0x0a, 0x63, 0x6f, 0x6e, 0x74, 0x61, 0x69, 0x6e, 0x65, 0x72, 0x73, 0x18, 0x02, 0x20, + 0x03, 0x28, 0x0b, 0x32, 0x17, 0x2e, 0x6d, 0x67, 0x6d, 0x74, 0x2e, 0x4c, 0x69, 0x73, 0x74, 0x43, + 0x6f, 0x6e, 0x74, 0x52, 0x65, 0x73, 0x70, 0x2e, 0x43, 0x6f, 0x6e, 0x74, 0x52, 0x0a, 0x63, 0x6f, + 0x6e, 0x74, 0x61, 0x69, 0x6e, 0x65, 0x72, 0x73, 0x1a, 0x1a, 0x0a, 0x04, 0x43, 0x6f, 0x6e, 0x74, + 0x12, 0x12, 0x0a, 0x04, 0x75, 0x75, 0x69, 0x64, 0x18, 0x01, 0x20, 0x01, 0x28, 0x09, 0x52, 0x04, + 0x75, 0x75, 0x69, 0x64, 0x22, 0x6c, 0x0a, 0x0c, 0x50, 0x6f, 0x6f, 0x6c, 0x51, 0x75, 0x65, 0x72, + 0x79, 0x52, 0x65, 0x71, 0x12, 0x10, 0x0a, 0x03, 0x73, 0x79, 0x73, 0x18, 0x01, 0x20, 0x01, 0x28, + 0x09, 0x52, 0x03, 0x73, 0x79, 0x73, 0x12, 0x0e, 0x0a, 0x02, 0x69, 0x64, 0x18, 0x02, 0x20, 0x01, + 0x28, 0x09, 0x52, 0x02, 0x69, 0x64, 0x12, 0x1b, 0x0a, 0x09, 0x73, 0x76, 0x63, 0x5f, 0x72, 0x61, + 0x6e, 0x6b, 0x73, 0x18, 0x03, 0x20, 0x03, 0x28, 0x0d, 0x52, 0x08, 0x73, 0x76, 0x63, 0x52, 0x61, + 0x6e, 0x6b, 0x73, 0x12, 0x1d, 0x0a, 0x0a, 0x71, 0x75, 0x65, 0x72, 0x79, 0x5f, 0x6d, 0x61, 0x73, + 0x6b, 0x18, 0x04, 0x20, 0x01, 0x28, 0x04, 0x52, 0x09, 0x71, 0x75, 0x65, 0x72, 0x79, 0x4d, 0x61, + 0x73, 0x6b, 0x22, 0xac, 0x01, 0x0a, 0x11, 0x53, 0x74, 0x6f, 0x72, 0x61, 0x67, 0x65, 0x55, 0x73, + 0x61, 0x67, 0x65, 0x53, 0x74, 0x61, 0x74, 0x73, 0x12, 0x14, 0x0a, 0x05, 0x74, 0x6f, 0x74, 0x61, + 0x6c, 0x18, 0x01, 0x20, 0x01, 0x28, 0x04, 0x52, 0x05, 0x74, 0x6f, 0x74, 0x61, 0x6c, 0x12, 0x12, + 0x0a, 0x04, 0x66, 0x72, 0x65, 0x65, 0x18, 0x02, 0x20, 0x01, 0x28, 0x04, 0x52, 0x04, 0x66, 0x72, + 0x65, 0x65, 0x12, 0x10, 0x0a, 0x03, 0x6d, 0x69, 0x6e, 0x18, 0x03, 0x20, 0x01, 0x28, 0x04, 0x52, + 0x03, 0x6d, 0x69, 0x6e, 0x12, 0x10, 0x0a, 0x03, 0x6d, 0x61, 0x78, 0x18, 0x04, 0x20, 0x01, 0x28, + 0x04, 0x52, 0x03, 0x6d, 0x61, 0x78, 0x12, 0x12, 0x0a, 0x04, 0x6d, 0x65, 0x61, 0x6e, 0x18, 0x05, + 0x20, 0x01, 0x28, 0x04, 0x52, 0x04, 0x6d, 0x65, 0x61, 0x6e, 0x12, 0x35, 0x0a, 0x0a, 0x6d, 0x65, + 0x64, 0x69, 0x61, 0x5f, 0x74, 0x79, 0x70, 0x65, 0x18, 0x06, 0x20, 0x01, 0x28, 0x0e, 0x32, 0x16, + 0x2e, 0x6d, 0x67, 0x6d, 0x74, 0x2e, 0x53, 0x74, 0x6f, 0x72, 0x61, 0x67, 0x65, 0x4d, 0x65, 0x64, + 0x69, 0x61, 0x54, 0x79, 0x70, 0x65, 0x52, 0x09, 0x6d, 0x65, 0x64, 0x69, 0x61, 0x54, 0x79, 0x70, + 0x65, 0x22, 0xbb, 0x01, 0x0a, 0x11, 0x50, 0x6f, 0x6f, 0x6c, 0x52, 0x65, 0x62, 0x75, 0x69, 0x6c, + 0x64, 0x53, 0x74, 0x61, 0x74, 0x75, 0x73, 0x12, 0x16, 0x0a, 0x06, 0x73, 0x74, 0x61, 0x74, 0x75, + 0x73, 0x18, 0x01, 0x20, 0x01, 0x28, 0x05, 0x52, 0x06, 0x73, 0x74, 0x61, 0x74, 0x75, 0x73, 0x12, + 0x33, 0x0a, 0x05, 0x73, 
0x74, 0x61, 0x74, 0x65, 0x18, 0x02, 0x20, 0x01, 0x28, 0x0e, 0x32, 0x1d, + 0x2e, 0x6d, 0x67, 0x6d, 0x74, 0x2e, 0x50, 0x6f, 0x6f, 0x6c, 0x52, 0x65, 0x62, 0x75, 0x69, 0x6c, + 0x64, 0x53, 0x74, 0x61, 0x74, 0x75, 0x73, 0x2e, 0x53, 0x74, 0x61, 0x74, 0x65, 0x52, 0x05, 0x73, + 0x74, 0x61, 0x74, 0x65, 0x12, 0x18, 0x0a, 0x07, 0x6f, 0x62, 0x6a, 0x65, 0x63, 0x74, 0x73, 0x18, + 0x03, 0x20, 0x01, 0x28, 0x04, 0x52, 0x07, 0x6f, 0x62, 0x6a, 0x65, 0x63, 0x74, 0x73, 0x12, 0x18, + 0x0a, 0x07, 0x72, 0x65, 0x63, 0x6f, 0x72, 0x64, 0x73, 0x18, 0x04, 0x20, 0x01, 0x28, 0x04, 0x52, + 0x07, 0x72, 0x65, 0x63, 0x6f, 0x72, 0x64, 0x73, 0x22, 0x25, 0x0a, 0x05, 0x53, 0x74, 0x61, 0x74, + 0x65, 0x12, 0x08, 0x0a, 0x04, 0x49, 0x44, 0x4c, 0x45, 0x10, 0x00, 0x12, 0x08, 0x0a, 0x04, 0x44, + 0x4f, 0x4e, 0x45, 0x10, 0x01, 0x12, 0x08, 0x0a, 0x04, 0x42, 0x55, 0x53, 0x59, 0x10, 0x02, 0x22, + 0xe6, 0x05, 0x0a, 0x0d, 0x50, 0x6f, 0x6f, 0x6c, 0x51, 0x75, 0x65, 0x72, 0x79, 0x52, 0x65, 0x73, + 0x70, 0x12, 0x16, 0x0a, 0x06, 0x73, 0x74, 0x61, 0x74, 0x75, 0x73, 0x18, 0x01, 0x20, 0x01, 0x28, + 0x05, 0x52, 0x06, 0x73, 0x74, 0x61, 0x74, 0x75, 0x73, 0x12, 0x12, 0x0a, 0x04, 0x75, 0x75, 0x69, + 0x64, 0x18, 0x02, 0x20, 0x01, 0x28, 0x09, 0x52, 0x04, 0x75, 0x75, 0x69, 0x64, 0x12, 0x14, 0x0a, + 0x05, 0x6c, 0x61, 0x62, 0x65, 0x6c, 0x18, 0x03, 0x20, 0x01, 0x28, 0x09, 0x52, 0x05, 0x6c, 0x61, + 0x62, 0x65, 0x6c, 0x12, 0x23, 0x0a, 0x0d, 0x74, 0x6f, 0x74, 0x61, 0x6c, 0x5f, 0x74, 0x61, 0x72, + 0x67, 0x65, 0x74, 0x73, 0x18, 0x04, 0x20, 0x01, 0x28, 0x0d, 0x52, 0x0c, 0x74, 0x6f, 0x74, 0x61, + 0x6c, 0x54, 0x61, 0x72, 0x67, 0x65, 0x74, 0x73, 0x12, 0x25, 0x0a, 0x0e, 0x61, 0x63, 0x74, 0x69, + 0x76, 0x65, 0x5f, 0x74, 0x61, 0x72, 0x67, 0x65, 0x74, 0x73, 0x18, 0x05, 0x20, 0x01, 0x28, 0x0d, + 0x52, 0x0d, 0x61, 0x63, 0x74, 0x69, 0x76, 0x65, 0x54, 0x61, 0x72, 0x67, 0x65, 0x74, 0x73, 0x12, + 0x29, 0x0a, 0x10, 0x64, 0x69, 0x73, 0x61, 0x62, 0x6c, 0x65, 0x64, 0x5f, 0x74, 0x61, 0x72, 0x67, + 0x65, 0x74, 0x73, 0x18, 0x06, 0x20, 0x01, 0x28, 0x0d, 0x52, 0x0f, 0x64, 0x69, 0x73, 0x61, 0x62, + 0x6c, 0x65, 0x64, 0x54, 0x61, 0x72, 0x67, 0x65, 0x74, 0x73, 0x12, 0x31, 0x0a, 0x07, 0x72, 0x65, + 0x62, 0x75, 0x69, 0x6c, 0x64, 0x18, 0x07, 0x20, 0x01, 0x28, 0x0b, 0x32, 0x17, 0x2e, 0x6d, 0x67, + 0x6d, 0x74, 0x2e, 0x50, 0x6f, 0x6f, 0x6c, 0x52, 0x65, 0x62, 0x75, 0x69, 0x6c, 0x64, 0x53, 0x74, + 0x61, 0x74, 0x75, 0x73, 0x52, 0x07, 0x72, 0x65, 0x62, 0x75, 0x69, 0x6c, 0x64, 0x12, 0x36, 0x0a, + 0x0a, 0x74, 0x69, 0x65, 0x72, 0x5f, 0x73, 0x74, 0x61, 0x74, 0x73, 0x18, 0x08, 0x20, 0x03, 0x28, + 0x0b, 0x32, 0x17, 0x2e, 0x6d, 0x67, 0x6d, 0x74, 0x2e, 0x53, 0x74, 0x6f, 0x72, 0x61, 0x67, 0x65, + 0x55, 0x73, 0x61, 0x67, 0x65, 0x53, 0x74, 0x61, 0x74, 0x73, 0x52, 0x09, 0x74, 0x69, 0x65, 0x72, + 0x53, 0x74, 0x61, 0x74, 0x73, 0x12, 0x18, 0x0a, 0x07, 0x76, 0x65, 0x72, 0x73, 0x69, 0x6f, 0x6e, + 0x18, 0x0a, 0x20, 0x01, 0x28, 0x0d, 0x52, 0x07, 0x76, 0x65, 0x72, 0x73, 0x69, 0x6f, 0x6e, 0x12, + 0x16, 0x0a, 0x06, 0x6c, 0x65, 0x61, 0x64, 0x65, 0x72, 0x18, 0x0b, 0x20, 0x01, 0x28, 0x0d, 0x52, + 0x06, 0x6c, 0x65, 0x61, 0x64, 0x65, 0x72, 0x12, 0x23, 0x0a, 0x0d, 0x65, 0x6e, 0x61, 0x62, 0x6c, + 0x65, 0x64, 0x5f, 0x72, 0x61, 0x6e, 0x6b, 0x73, 0x18, 0x0c, 0x20, 0x01, 0x28, 0x09, 0x52, 0x0c, + 0x65, 0x6e, 0x61, 0x62, 0x6c, 0x65, 0x64, 0x52, 0x61, 0x6e, 0x6b, 0x73, 0x12, 0x25, 0x0a, 0x0e, + 0x64, 0x69, 0x73, 0x61, 0x62, 0x6c, 0x65, 0x64, 0x5f, 0x72, 0x61, 0x6e, 0x6b, 0x73, 0x18, 0x0d, + 0x20, 0x01, 0x28, 0x09, 0x52, 0x0d, 0x64, 0x69, 0x73, 0x61, 0x62, 0x6c, 0x65, 0x64, 0x52, 0x61, + 0x6e, 0x6b, 0x73, 0x12, 0x23, 0x0a, 0x0d, 0x74, 
0x6f, 0x74, 0x61, 0x6c, 0x5f, 0x65, 0x6e, 0x67, + 0x69, 0x6e, 0x65, 0x73, 0x18, 0x0e, 0x20, 0x01, 0x28, 0x0d, 0x52, 0x0c, 0x74, 0x6f, 0x74, 0x61, + 0x6c, 0x45, 0x6e, 0x67, 0x69, 0x6e, 0x65, 0x73, 0x12, 0x26, 0x0a, 0x0f, 0x70, 0x6f, 0x6f, 0x6c, + 0x5f, 0x6c, 0x61, 0x79, 0x6f, 0x75, 0x74, 0x5f, 0x76, 0x65, 0x72, 0x18, 0x0f, 0x20, 0x01, 0x28, + 0x0d, 0x52, 0x0d, 0x70, 0x6f, 0x6f, 0x6c, 0x4c, 0x61, 0x79, 0x6f, 0x75, 0x74, 0x56, 0x65, 0x72, + 0x12, 0x2c, 0x0a, 0x12, 0x75, 0x70, 0x67, 0x72, 0x61, 0x64, 0x65, 0x5f, 0x6c, 0x61, 0x79, 0x6f, + 0x75, 0x74, 0x5f, 0x76, 0x65, 0x72, 0x18, 0x10, 0x20, 0x01, 0x28, 0x0d, 0x52, 0x10, 0x75, 0x70, + 0x67, 0x72, 0x61, 0x64, 0x65, 0x4c, 0x61, 0x79, 0x6f, 0x75, 0x74, 0x56, 0x65, 0x72, 0x12, 0x2c, + 0x0a, 0x05, 0x73, 0x74, 0x61, 0x74, 0x65, 0x18, 0x11, 0x20, 0x01, 0x28, 0x0e, 0x32, 0x16, 0x2e, + 0x6d, 0x67, 0x6d, 0x74, 0x2e, 0x50, 0x6f, 0x6f, 0x6c, 0x53, 0x65, 0x72, 0x76, 0x69, 0x63, 0x65, + 0x53, 0x74, 0x61, 0x74, 0x65, 0x52, 0x05, 0x73, 0x74, 0x61, 0x74, 0x65, 0x12, 0x17, 0x0a, 0x07, + 0x73, 0x76, 0x63, 0x5f, 0x6c, 0x64, 0x72, 0x18, 0x12, 0x20, 0x01, 0x28, 0x0d, 0x52, 0x06, 0x73, + 0x76, 0x63, 0x4c, 0x64, 0x72, 0x12, 0x19, 0x0a, 0x08, 0x73, 0x76, 0x63, 0x5f, 0x72, 0x65, 0x70, + 0x73, 0x18, 0x13, 0x20, 0x03, 0x28, 0x0d, 0x52, 0x07, 0x73, 0x76, 0x63, 0x52, 0x65, 0x70, 0x73, + 0x12, 0x1d, 0x0a, 0x0a, 0x71, 0x75, 0x65, 0x72, 0x79, 0x5f, 0x6d, 0x61, 0x73, 0x6b, 0x18, 0x14, + 0x20, 0x01, 0x28, 0x04, 0x52, 0x09, 0x71, 0x75, 0x65, 0x72, 0x79, 0x4d, 0x61, 0x73, 0x6b, 0x12, + 0x24, 0x0a, 0x0e, 0x6d, 0x65, 0x6d, 0x5f, 0x66, 0x69, 0x6c, 0x65, 0x5f, 0x62, 0x79, 0x74, 0x65, + 0x73, 0x18, 0x15, 0x20, 0x01, 0x28, 0x04, 0x52, 0x0c, 0x6d, 0x65, 0x6d, 0x46, 0x69, 0x6c, 0x65, + 0x42, 0x79, 0x74, 0x65, 0x73, 0x4a, 0x04, 0x08, 0x09, 0x10, 0x0a, 0x52, 0x0b, 0x74, 0x6f, 0x74, + 0x61, 0x6c, 0x5f, 0x6e, 0x6f, 0x64, 0x65, 0x73, 0x22, 0x63, 0x0a, 0x0c, 0x50, 0x6f, 0x6f, 0x6c, + 0x50, 0x72, 0x6f, 0x70, 0x65, 0x72, 0x74, 0x79, 0x12, 0x16, 0x0a, 0x06, 0x6e, 0x75, 0x6d, 0x62, + 0x65, 0x72, 0x18, 0x01, 0x20, 0x01, 0x28, 0x0d, 0x52, 0x06, 0x6e, 0x75, 0x6d, 0x62, 0x65, 0x72, + 0x12, 0x18, 0x0a, 0x06, 0x73, 0x74, 0x72, 0x76, 0x61, 0x6c, 0x18, 0x02, 0x20, 0x01, 0x28, 0x09, + 0x48, 0x00, 0x52, 0x06, 0x73, 0x74, 0x72, 0x76, 0x61, 0x6c, 0x12, 0x18, 0x0a, 0x06, 0x6e, 0x75, + 0x6d, 0x76, 0x61, 0x6c, 0x18, 0x03, 0x20, 0x01, 0x28, 0x04, 0x48, 0x00, 0x52, 0x06, 0x6e, 0x75, + 0x6d, 0x76, 0x61, 0x6c, 0x42, 0x07, 0x0a, 0x05, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x22, 0x83, 0x01, + 0x0a, 0x0e, 0x50, 0x6f, 0x6f, 0x6c, 0x53, 0x65, 0x74, 0x50, 0x72, 0x6f, 0x70, 0x52, 0x65, 0x71, 0x12, 0x10, 0x0a, 0x03, 0x73, 0x79, 0x73, 0x18, 0x01, 0x20, 0x01, 0x28, 0x09, 0x52, 0x03, 0x73, 0x79, 0x73, 0x12, 0x0e, 0x0a, 0x02, 0x69, 0x64, 0x18, 0x02, 0x20, 0x01, 0x28, 0x09, 0x52, 0x02, 0x69, 0x64, 0x12, 0x32, 0x0a, 0x0a, 0x70, 0x72, 0x6f, 0x70, 0x65, 0x72, 0x74, 0x69, 0x65, 0x73, @@ -3082,76 +3120,89 @@ var file_mgmt_pool_proto_rawDesc = []byte{ 0x6f, 0x6c, 0x50, 0x72, 0x6f, 0x70, 0x65, 0x72, 0x74, 0x79, 0x52, 0x0a, 0x70, 0x72, 0x6f, 0x70, 0x65, 0x72, 0x74, 0x69, 0x65, 0x73, 0x12, 0x1b, 0x0a, 0x09, 0x73, 0x76, 0x63, 0x5f, 0x72, 0x61, 0x6e, 0x6b, 0x73, 0x18, 0x04, 0x20, 0x03, 0x28, 0x0d, 0x52, 0x08, 0x73, 0x76, 0x63, 0x52, 0x61, - 0x6e, 0x6b, 0x73, 0x22, 0x5d, 0x0a, 0x0f, 0x50, 0x6f, 0x6f, 0x6c, 0x47, 0x65, 0x74, 0x50, 0x72, + 0x6e, 0x6b, 0x73, 0x22, 0x29, 0x0a, 0x0f, 0x50, 0x6f, 0x6f, 0x6c, 0x53, 0x65, 0x74, 0x50, 0x72, 0x6f, 0x70, 0x52, 0x65, 0x73, 0x70, 0x12, 0x16, 0x0a, 0x06, 0x73, 0x74, 0x61, 0x74, 0x75, 0x73, - 0x18, 0x01, 0x20, 
0x01, 0x28, 0x05, 0x52, 0x06, 0x73, 0x74, 0x61, 0x74, 0x75, 0x73, 0x12, 0x32, - 0x0a, 0x0a, 0x70, 0x72, 0x6f, 0x70, 0x65, 0x72, 0x74, 0x69, 0x65, 0x73, 0x18, 0x02, 0x20, 0x03, - 0x28, 0x0b, 0x32, 0x12, 0x2e, 0x6d, 0x67, 0x6d, 0x74, 0x2e, 0x50, 0x6f, 0x6f, 0x6c, 0x50, 0x72, - 0x6f, 0x70, 0x65, 0x72, 0x74, 0x79, 0x52, 0x0a, 0x70, 0x72, 0x6f, 0x70, 0x65, 0x72, 0x74, 0x69, - 0x65, 0x73, 0x22, 0x4f, 0x0a, 0x0e, 0x50, 0x6f, 0x6f, 0x6c, 0x55, 0x70, 0x67, 0x72, 0x61, 0x64, - 0x65, 0x52, 0x65, 0x71, 0x12, 0x10, 0x0a, 0x03, 0x73, 0x79, 0x73, 0x18, 0x01, 0x20, 0x01, 0x28, - 0x09, 0x52, 0x03, 0x73, 0x79, 0x73, 0x12, 0x0e, 0x0a, 0x02, 0x69, 0x64, 0x18, 0x02, 0x20, 0x01, - 0x28, 0x09, 0x52, 0x02, 0x69, 0x64, 0x12, 0x1b, 0x0a, 0x09, 0x73, 0x76, 0x63, 0x5f, 0x72, 0x61, - 0x6e, 0x6b, 0x73, 0x18, 0x03, 0x20, 0x03, 0x28, 0x0d, 0x52, 0x08, 0x73, 0x76, 0x63, 0x52, 0x61, - 0x6e, 0x6b, 0x73, 0x22, 0x29, 0x0a, 0x0f, 0x50, 0x6f, 0x6f, 0x6c, 0x55, 0x70, 0x67, 0x72, 0x61, - 0x64, 0x65, 0x52, 0x65, 0x73, 0x70, 0x12, 0x16, 0x0a, 0x06, 0x73, 0x74, 0x61, 0x74, 0x75, 0x73, - 0x18, 0x01, 0x20, 0x01, 0x28, 0x05, 0x52, 0x06, 0x73, 0x74, 0x61, 0x74, 0x75, 0x73, 0x22, 0x81, - 0x01, 0x0a, 0x12, 0x50, 0x6f, 0x6f, 0x6c, 0x51, 0x75, 0x65, 0x72, 0x79, 0x54, 0x61, 0x72, 0x67, - 0x65, 0x74, 0x52, 0x65, 0x71, 0x12, 0x10, 0x0a, 0x03, 0x73, 0x79, 0x73, 0x18, 0x01, 0x20, 0x01, + 0x18, 0x01, 0x20, 0x01, 0x28, 0x05, 0x52, 0x06, 0x73, 0x74, 0x61, 0x74, 0x75, 0x73, 0x22, 0x83, + 0x01, 0x0a, 0x0e, 0x50, 0x6f, 0x6f, 0x6c, 0x47, 0x65, 0x74, 0x50, 0x72, 0x6f, 0x70, 0x52, 0x65, + 0x71, 0x12, 0x10, 0x0a, 0x03, 0x73, 0x79, 0x73, 0x18, 0x01, 0x20, 0x01, 0x28, 0x09, 0x52, 0x03, + 0x73, 0x79, 0x73, 0x12, 0x0e, 0x0a, 0x02, 0x69, 0x64, 0x18, 0x02, 0x20, 0x01, 0x28, 0x09, 0x52, + 0x02, 0x69, 0x64, 0x12, 0x32, 0x0a, 0x0a, 0x70, 0x72, 0x6f, 0x70, 0x65, 0x72, 0x74, 0x69, 0x65, + 0x73, 0x18, 0x03, 0x20, 0x03, 0x28, 0x0b, 0x32, 0x12, 0x2e, 0x6d, 0x67, 0x6d, 0x74, 0x2e, 0x50, + 0x6f, 0x6f, 0x6c, 0x50, 0x72, 0x6f, 0x70, 0x65, 0x72, 0x74, 0x79, 0x52, 0x0a, 0x70, 0x72, 0x6f, + 0x70, 0x65, 0x72, 0x74, 0x69, 0x65, 0x73, 0x12, 0x1b, 0x0a, 0x09, 0x73, 0x76, 0x63, 0x5f, 0x72, + 0x61, 0x6e, 0x6b, 0x73, 0x18, 0x04, 0x20, 0x03, 0x28, 0x0d, 0x52, 0x08, 0x73, 0x76, 0x63, 0x52, + 0x61, 0x6e, 0x6b, 0x73, 0x22, 0x5d, 0x0a, 0x0f, 0x50, 0x6f, 0x6f, 0x6c, 0x47, 0x65, 0x74, 0x50, + 0x72, 0x6f, 0x70, 0x52, 0x65, 0x73, 0x70, 0x12, 0x16, 0x0a, 0x06, 0x73, 0x74, 0x61, 0x74, 0x75, + 0x73, 0x18, 0x01, 0x20, 0x01, 0x28, 0x05, 0x52, 0x06, 0x73, 0x74, 0x61, 0x74, 0x75, 0x73, 0x12, + 0x32, 0x0a, 0x0a, 0x70, 0x72, 0x6f, 0x70, 0x65, 0x72, 0x74, 0x69, 0x65, 0x73, 0x18, 0x02, 0x20, + 0x03, 0x28, 0x0b, 0x32, 0x12, 0x2e, 0x6d, 0x67, 0x6d, 0x74, 0x2e, 0x50, 0x6f, 0x6f, 0x6c, 0x50, + 0x72, 0x6f, 0x70, 0x65, 0x72, 0x74, 0x79, 0x52, 0x0a, 0x70, 0x72, 0x6f, 0x70, 0x65, 0x72, 0x74, + 0x69, 0x65, 0x73, 0x22, 0x4f, 0x0a, 0x0e, 0x50, 0x6f, 0x6f, 0x6c, 0x55, 0x70, 0x67, 0x72, 0x61, + 0x64, 0x65, 0x52, 0x65, 0x71, 0x12, 0x10, 0x0a, 0x03, 0x73, 0x79, 0x73, 0x18, 0x01, 0x20, 0x01, 0x28, 0x09, 0x52, 0x03, 0x73, 0x79, 0x73, 0x12, 0x0e, 0x0a, 0x02, 0x69, 0x64, 0x18, 0x02, 0x20, - 0x01, 0x28, 0x09, 0x52, 0x02, 0x69, 0x64, 0x12, 0x12, 0x0a, 0x04, 0x72, 0x61, 0x6e, 0x6b, 0x18, - 0x03, 0x20, 0x01, 0x28, 0x0d, 0x52, 0x04, 0x72, 0x61, 0x6e, 0x6b, 0x12, 0x18, 0x0a, 0x07, 0x74, - 0x61, 0x72, 0x67, 0x65, 0x74, 0x73, 0x18, 0x04, 0x20, 0x03, 0x28, 0x0d, 0x52, 0x07, 0x74, 0x61, - 0x72, 0x67, 0x65, 0x74, 0x73, 0x12, 0x1b, 0x0a, 0x09, 0x73, 0x76, 0x63, 0x5f, 0x72, 0x61, 0x6e, - 0x6b, 0x73, 0x18, 0x05, 0x20, 0x03, 0x28, 
0x0d, 0x52, 0x08, 0x73, 0x76, 0x63, 0x52, 0x61, 0x6e, - 0x6b, 0x73, 0x22, 0x75, 0x0a, 0x12, 0x53, 0x74, 0x6f, 0x72, 0x61, 0x67, 0x65, 0x54, 0x61, 0x72, - 0x67, 0x65, 0x74, 0x55, 0x73, 0x61, 0x67, 0x65, 0x12, 0x14, 0x0a, 0x05, 0x74, 0x6f, 0x74, 0x61, - 0x6c, 0x18, 0x01, 0x20, 0x01, 0x28, 0x04, 0x52, 0x05, 0x74, 0x6f, 0x74, 0x61, 0x6c, 0x12, 0x12, - 0x0a, 0x04, 0x66, 0x72, 0x65, 0x65, 0x18, 0x02, 0x20, 0x01, 0x28, 0x04, 0x52, 0x04, 0x66, 0x72, - 0x65, 0x65, 0x12, 0x35, 0x0a, 0x0a, 0x6d, 0x65, 0x64, 0x69, 0x61, 0x5f, 0x74, 0x79, 0x70, 0x65, - 0x18, 0x03, 0x20, 0x01, 0x28, 0x0e, 0x32, 0x16, 0x2e, 0x6d, 0x67, 0x6d, 0x74, 0x2e, 0x53, 0x74, - 0x6f, 0x72, 0x61, 0x67, 0x65, 0x4d, 0x65, 0x64, 0x69, 0x61, 0x54, 0x79, 0x70, 0x65, 0x52, 0x09, - 0x6d, 0x65, 0x64, 0x69, 0x61, 0x54, 0x79, 0x70, 0x65, 0x22, 0xda, 0x02, 0x0a, 0x13, 0x50, 0x6f, - 0x6f, 0x6c, 0x51, 0x75, 0x65, 0x72, 0x79, 0x54, 0x61, 0x72, 0x67, 0x65, 0x74, 0x49, 0x6e, 0x66, - 0x6f, 0x12, 0x38, 0x0a, 0x04, 0x74, 0x79, 0x70, 0x65, 0x18, 0x01, 0x20, 0x01, 0x28, 0x0e, 0x32, - 0x24, 0x2e, 0x6d, 0x67, 0x6d, 0x74, 0x2e, 0x50, 0x6f, 0x6f, 0x6c, 0x51, 0x75, 0x65, 0x72, 0x79, - 0x54, 0x61, 0x72, 0x67, 0x65, 0x74, 0x49, 0x6e, 0x66, 0x6f, 0x2e, 0x54, 0x61, 0x72, 0x67, 0x65, - 0x74, 0x54, 0x79, 0x70, 0x65, 0x52, 0x04, 0x74, 0x79, 0x70, 0x65, 0x12, 0x3b, 0x0a, 0x05, 0x73, - 0x74, 0x61, 0x74, 0x65, 0x18, 0x02, 0x20, 0x01, 0x28, 0x0e, 0x32, 0x25, 0x2e, 0x6d, 0x67, 0x6d, - 0x74, 0x2e, 0x50, 0x6f, 0x6f, 0x6c, 0x51, 0x75, 0x65, 0x72, 0x79, 0x54, 0x61, 0x72, 0x67, 0x65, - 0x74, 0x49, 0x6e, 0x66, 0x6f, 0x2e, 0x54, 0x61, 0x72, 0x67, 0x65, 0x74, 0x53, 0x74, 0x61, 0x74, - 0x65, 0x52, 0x05, 0x73, 0x74, 0x61, 0x74, 0x65, 0x12, 0x2e, 0x0a, 0x05, 0x73, 0x70, 0x61, 0x63, - 0x65, 0x18, 0x03, 0x20, 0x03, 0x28, 0x0b, 0x32, 0x18, 0x2e, 0x6d, 0x67, 0x6d, 0x74, 0x2e, 0x53, - 0x74, 0x6f, 0x72, 0x61, 0x67, 0x65, 0x54, 0x61, 0x72, 0x67, 0x65, 0x74, 0x55, 0x73, 0x61, 0x67, - 0x65, 0x52, 0x05, 0x73, 0x70, 0x61, 0x63, 0x65, 0x22, 0x3b, 0x0a, 0x0a, 0x54, 0x61, 0x72, 0x67, - 0x65, 0x74, 0x54, 0x79, 0x70, 0x65, 0x12, 0x0b, 0x0a, 0x07, 0x55, 0x4e, 0x4b, 0x4e, 0x4f, 0x57, - 0x4e, 0x10, 0x00, 0x12, 0x07, 0x0a, 0x03, 0x48, 0x44, 0x44, 0x10, 0x01, 0x12, 0x07, 0x0a, 0x03, - 0x53, 0x53, 0x44, 0x10, 0x02, 0x12, 0x06, 0x0a, 0x02, 0x50, 0x4d, 0x10, 0x03, 0x12, 0x06, 0x0a, - 0x02, 0x56, 0x4d, 0x10, 0x04, 0x22, 0x5f, 0x0a, 0x0b, 0x54, 0x61, 0x72, 0x67, 0x65, 0x74, 0x53, - 0x74, 0x61, 0x74, 0x65, 0x12, 0x11, 0x0a, 0x0d, 0x53, 0x54, 0x41, 0x54, 0x45, 0x5f, 0x55, 0x4e, - 0x4b, 0x4e, 0x4f, 0x57, 0x4e, 0x10, 0x00, 0x12, 0x0c, 0x0a, 0x08, 0x44, 0x4f, 0x57, 0x4e, 0x5f, - 0x4f, 0x55, 0x54, 0x10, 0x01, 0x12, 0x08, 0x0a, 0x04, 0x44, 0x4f, 0x57, 0x4e, 0x10, 0x02, 0x12, - 0x06, 0x0a, 0x02, 0x55, 0x50, 0x10, 0x03, 0x12, 0x09, 0x0a, 0x05, 0x55, 0x50, 0x5f, 0x49, 0x4e, - 0x10, 0x04, 0x12, 0x07, 0x0a, 0x03, 0x4e, 0x45, 0x57, 0x10, 0x05, 0x12, 0x09, 0x0a, 0x05, 0x44, - 0x52, 0x41, 0x49, 0x4e, 0x10, 0x06, 0x22, 0x5e, 0x0a, 0x13, 0x50, 0x6f, 0x6f, 0x6c, 0x51, 0x75, - 0x65, 0x72, 0x79, 0x54, 0x61, 0x72, 0x67, 0x65, 0x74, 0x52, 0x65, 0x73, 0x70, 0x12, 0x16, 0x0a, - 0x06, 0x73, 0x74, 0x61, 0x74, 0x75, 0x73, 0x18, 0x01, 0x20, 0x01, 0x28, 0x05, 0x52, 0x06, 0x73, - 0x74, 0x61, 0x74, 0x75, 0x73, 0x12, 0x2f, 0x0a, 0x05, 0x69, 0x6e, 0x66, 0x6f, 0x73, 0x18, 0x02, - 0x20, 0x03, 0x28, 0x0b, 0x32, 0x19, 0x2e, 0x6d, 0x67, 0x6d, 0x74, 0x2e, 0x50, 0x6f, 0x6f, 0x6c, - 0x51, 0x75, 0x65, 0x72, 0x79, 0x54, 0x61, 0x72, 0x67, 0x65, 0x74, 0x49, 0x6e, 0x66, 0x6f, 0x52, - 0x05, 0x69, 0x6e, 0x66, 0x6f, 0x73, 0x2a, 0x25, 0x0a, 0x10, 0x53, 
0x74, 0x6f, 0x72, 0x61, 0x67, - 0x65, 0x4d, 0x65, 0x64, 0x69, 0x61, 0x54, 0x79, 0x70, 0x65, 0x12, 0x07, 0x0a, 0x03, 0x53, 0x43, - 0x4d, 0x10, 0x00, 0x12, 0x08, 0x0a, 0x04, 0x4e, 0x56, 0x4d, 0x45, 0x10, 0x01, 0x2a, 0x56, 0x0a, - 0x10, 0x50, 0x6f, 0x6f, 0x6c, 0x53, 0x65, 0x72, 0x76, 0x69, 0x63, 0x65, 0x53, 0x74, 0x61, 0x74, - 0x65, 0x12, 0x0c, 0x0a, 0x08, 0x43, 0x72, 0x65, 0x61, 0x74, 0x69, 0x6e, 0x67, 0x10, 0x00, 0x12, - 0x09, 0x0a, 0x05, 0x52, 0x65, 0x61, 0x64, 0x79, 0x10, 0x01, 0x12, 0x0e, 0x0a, 0x0a, 0x44, 0x65, - 0x73, 0x74, 0x72, 0x6f, 0x79, 0x69, 0x6e, 0x67, 0x10, 0x02, 0x12, 0x0c, 0x0a, 0x08, 0x44, 0x65, - 0x67, 0x72, 0x61, 0x64, 0x65, 0x64, 0x10, 0x03, 0x12, 0x0b, 0x0a, 0x07, 0x55, 0x6e, 0x6b, 0x6e, - 0x6f, 0x77, 0x6e, 0x10, 0x04, 0x42, 0x3a, 0x5a, 0x38, 0x67, 0x69, 0x74, 0x68, 0x75, 0x62, 0x2e, - 0x63, 0x6f, 0x6d, 0x2f, 0x64, 0x61, 0x6f, 0x73, 0x2d, 0x73, 0x74, 0x61, 0x63, 0x6b, 0x2f, 0x64, - 0x61, 0x6f, 0x73, 0x2f, 0x73, 0x72, 0x63, 0x2f, 0x63, 0x6f, 0x6e, 0x74, 0x72, 0x6f, 0x6c, 0x2f, - 0x63, 0x6f, 0x6d, 0x6d, 0x6f, 0x6e, 0x2f, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x2f, 0x6d, 0x67, 0x6d, - 0x74, 0x62, 0x06, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x33, + 0x01, 0x28, 0x09, 0x52, 0x02, 0x69, 0x64, 0x12, 0x1b, 0x0a, 0x09, 0x73, 0x76, 0x63, 0x5f, 0x72, + 0x61, 0x6e, 0x6b, 0x73, 0x18, 0x03, 0x20, 0x03, 0x28, 0x0d, 0x52, 0x08, 0x73, 0x76, 0x63, 0x52, + 0x61, 0x6e, 0x6b, 0x73, 0x22, 0x29, 0x0a, 0x0f, 0x50, 0x6f, 0x6f, 0x6c, 0x55, 0x70, 0x67, 0x72, + 0x61, 0x64, 0x65, 0x52, 0x65, 0x73, 0x70, 0x12, 0x16, 0x0a, 0x06, 0x73, 0x74, 0x61, 0x74, 0x75, + 0x73, 0x18, 0x01, 0x20, 0x01, 0x28, 0x05, 0x52, 0x06, 0x73, 0x74, 0x61, 0x74, 0x75, 0x73, 0x22, + 0x81, 0x01, 0x0a, 0x12, 0x50, 0x6f, 0x6f, 0x6c, 0x51, 0x75, 0x65, 0x72, 0x79, 0x54, 0x61, 0x72, + 0x67, 0x65, 0x74, 0x52, 0x65, 0x71, 0x12, 0x10, 0x0a, 0x03, 0x73, 0x79, 0x73, 0x18, 0x01, 0x20, + 0x01, 0x28, 0x09, 0x52, 0x03, 0x73, 0x79, 0x73, 0x12, 0x0e, 0x0a, 0x02, 0x69, 0x64, 0x18, 0x02, + 0x20, 0x01, 0x28, 0x09, 0x52, 0x02, 0x69, 0x64, 0x12, 0x12, 0x0a, 0x04, 0x72, 0x61, 0x6e, 0x6b, + 0x18, 0x03, 0x20, 0x01, 0x28, 0x0d, 0x52, 0x04, 0x72, 0x61, 0x6e, 0x6b, 0x12, 0x18, 0x0a, 0x07, + 0x74, 0x61, 0x72, 0x67, 0x65, 0x74, 0x73, 0x18, 0x04, 0x20, 0x03, 0x28, 0x0d, 0x52, 0x07, 0x74, + 0x61, 0x72, 0x67, 0x65, 0x74, 0x73, 0x12, 0x1b, 0x0a, 0x09, 0x73, 0x76, 0x63, 0x5f, 0x72, 0x61, + 0x6e, 0x6b, 0x73, 0x18, 0x05, 0x20, 0x03, 0x28, 0x0d, 0x52, 0x08, 0x73, 0x76, 0x63, 0x52, 0x61, + 0x6e, 0x6b, 0x73, 0x22, 0x75, 0x0a, 0x12, 0x53, 0x74, 0x6f, 0x72, 0x61, 0x67, 0x65, 0x54, 0x61, + 0x72, 0x67, 0x65, 0x74, 0x55, 0x73, 0x61, 0x67, 0x65, 0x12, 0x14, 0x0a, 0x05, 0x74, 0x6f, 0x74, + 0x61, 0x6c, 0x18, 0x01, 0x20, 0x01, 0x28, 0x04, 0x52, 0x05, 0x74, 0x6f, 0x74, 0x61, 0x6c, 0x12, + 0x12, 0x0a, 0x04, 0x66, 0x72, 0x65, 0x65, 0x18, 0x02, 0x20, 0x01, 0x28, 0x04, 0x52, 0x04, 0x66, + 0x72, 0x65, 0x65, 0x12, 0x35, 0x0a, 0x0a, 0x6d, 0x65, 0x64, 0x69, 0x61, 0x5f, 0x74, 0x79, 0x70, + 0x65, 0x18, 0x03, 0x20, 0x01, 0x28, 0x0e, 0x32, 0x16, 0x2e, 0x6d, 0x67, 0x6d, 0x74, 0x2e, 0x53, + 0x74, 0x6f, 0x72, 0x61, 0x67, 0x65, 0x4d, 0x65, 0x64, 0x69, 0x61, 0x54, 0x79, 0x70, 0x65, 0x52, + 0x09, 0x6d, 0x65, 0x64, 0x69, 0x61, 0x54, 0x79, 0x70, 0x65, 0x22, 0x80, 0x03, 0x0a, 0x13, 0x50, + 0x6f, 0x6f, 0x6c, 0x51, 0x75, 0x65, 0x72, 0x79, 0x54, 0x61, 0x72, 0x67, 0x65, 0x74, 0x49, 0x6e, + 0x66, 0x6f, 0x12, 0x38, 0x0a, 0x04, 0x74, 0x79, 0x70, 0x65, 0x18, 0x01, 0x20, 0x01, 0x28, 0x0e, + 0x32, 0x24, 0x2e, 0x6d, 0x67, 0x6d, 0x74, 0x2e, 0x50, 0x6f, 0x6f, 0x6c, 0x51, 0x75, 0x65, 0x72, + 0x79, 0x54, 0x61, 0x72, 0x67, 0x65, 
0x74, 0x49, 0x6e, 0x66, 0x6f, 0x2e, 0x54, 0x61, 0x72, 0x67, + 0x65, 0x74, 0x54, 0x79, 0x70, 0x65, 0x52, 0x04, 0x74, 0x79, 0x70, 0x65, 0x12, 0x3b, 0x0a, 0x05, + 0x73, 0x74, 0x61, 0x74, 0x65, 0x18, 0x02, 0x20, 0x01, 0x28, 0x0e, 0x32, 0x25, 0x2e, 0x6d, 0x67, + 0x6d, 0x74, 0x2e, 0x50, 0x6f, 0x6f, 0x6c, 0x51, 0x75, 0x65, 0x72, 0x79, 0x54, 0x61, 0x72, 0x67, + 0x65, 0x74, 0x49, 0x6e, 0x66, 0x6f, 0x2e, 0x54, 0x61, 0x72, 0x67, 0x65, 0x74, 0x53, 0x74, 0x61, + 0x74, 0x65, 0x52, 0x05, 0x73, 0x74, 0x61, 0x74, 0x65, 0x12, 0x2e, 0x0a, 0x05, 0x73, 0x70, 0x61, + 0x63, 0x65, 0x18, 0x03, 0x20, 0x03, 0x28, 0x0b, 0x32, 0x18, 0x2e, 0x6d, 0x67, 0x6d, 0x74, 0x2e, + 0x53, 0x74, 0x6f, 0x72, 0x61, 0x67, 0x65, 0x54, 0x61, 0x72, 0x67, 0x65, 0x74, 0x55, 0x73, 0x61, + 0x67, 0x65, 0x52, 0x05, 0x73, 0x70, 0x61, 0x63, 0x65, 0x12, 0x24, 0x0a, 0x0e, 0x6d, 0x65, 0x6d, + 0x5f, 0x66, 0x69, 0x6c, 0x65, 0x5f, 0x62, 0x79, 0x74, 0x65, 0x73, 0x18, 0x04, 0x20, 0x01, 0x28, + 0x04, 0x52, 0x0c, 0x6d, 0x65, 0x6d, 0x46, 0x69, 0x6c, 0x65, 0x42, 0x79, 0x74, 0x65, 0x73, 0x22, + 0x3b, 0x0a, 0x0a, 0x54, 0x61, 0x72, 0x67, 0x65, 0x74, 0x54, 0x79, 0x70, 0x65, 0x12, 0x0b, 0x0a, + 0x07, 0x55, 0x4e, 0x4b, 0x4e, 0x4f, 0x57, 0x4e, 0x10, 0x00, 0x12, 0x07, 0x0a, 0x03, 0x48, 0x44, + 0x44, 0x10, 0x01, 0x12, 0x07, 0x0a, 0x03, 0x53, 0x53, 0x44, 0x10, 0x02, 0x12, 0x06, 0x0a, 0x02, + 0x50, 0x4d, 0x10, 0x03, 0x12, 0x06, 0x0a, 0x02, 0x56, 0x4d, 0x10, 0x04, 0x22, 0x5f, 0x0a, 0x0b, + 0x54, 0x61, 0x72, 0x67, 0x65, 0x74, 0x53, 0x74, 0x61, 0x74, 0x65, 0x12, 0x11, 0x0a, 0x0d, 0x53, + 0x54, 0x41, 0x54, 0x45, 0x5f, 0x55, 0x4e, 0x4b, 0x4e, 0x4f, 0x57, 0x4e, 0x10, 0x00, 0x12, 0x0c, + 0x0a, 0x08, 0x44, 0x4f, 0x57, 0x4e, 0x5f, 0x4f, 0x55, 0x54, 0x10, 0x01, 0x12, 0x08, 0x0a, 0x04, + 0x44, 0x4f, 0x57, 0x4e, 0x10, 0x02, 0x12, 0x06, 0x0a, 0x02, 0x55, 0x50, 0x10, 0x03, 0x12, 0x09, + 0x0a, 0x05, 0x55, 0x50, 0x5f, 0x49, 0x4e, 0x10, 0x04, 0x12, 0x07, 0x0a, 0x03, 0x4e, 0x45, 0x57, + 0x10, 0x05, 0x12, 0x09, 0x0a, 0x05, 0x44, 0x52, 0x41, 0x49, 0x4e, 0x10, 0x06, 0x22, 0x5e, 0x0a, + 0x13, 0x50, 0x6f, 0x6f, 0x6c, 0x51, 0x75, 0x65, 0x72, 0x79, 0x54, 0x61, 0x72, 0x67, 0x65, 0x74, + 0x52, 0x65, 0x73, 0x70, 0x12, 0x16, 0x0a, 0x06, 0x73, 0x74, 0x61, 0x74, 0x75, 0x73, 0x18, 0x01, + 0x20, 0x01, 0x28, 0x05, 0x52, 0x06, 0x73, 0x74, 0x61, 0x74, 0x75, 0x73, 0x12, 0x2f, 0x0a, 0x05, + 0x69, 0x6e, 0x66, 0x6f, 0x73, 0x18, 0x02, 0x20, 0x03, 0x28, 0x0b, 0x32, 0x19, 0x2e, 0x6d, 0x67, + 0x6d, 0x74, 0x2e, 0x50, 0x6f, 0x6f, 0x6c, 0x51, 0x75, 0x65, 0x72, 0x79, 0x54, 0x61, 0x72, 0x67, + 0x65, 0x74, 0x49, 0x6e, 0x66, 0x6f, 0x52, 0x05, 0x69, 0x6e, 0x66, 0x6f, 0x73, 0x2a, 0x25, 0x0a, + 0x10, 0x53, 0x74, 0x6f, 0x72, 0x61, 0x67, 0x65, 0x4d, 0x65, 0x64, 0x69, 0x61, 0x54, 0x79, 0x70, + 0x65, 0x12, 0x07, 0x0a, 0x03, 0x53, 0x43, 0x4d, 0x10, 0x00, 0x12, 0x08, 0x0a, 0x04, 0x4e, 0x56, + 0x4d, 0x45, 0x10, 0x01, 0x2a, 0x56, 0x0a, 0x10, 0x50, 0x6f, 0x6f, 0x6c, 0x53, 0x65, 0x72, 0x76, + 0x69, 0x63, 0x65, 0x53, 0x74, 0x61, 0x74, 0x65, 0x12, 0x0c, 0x0a, 0x08, 0x43, 0x72, 0x65, 0x61, + 0x74, 0x69, 0x6e, 0x67, 0x10, 0x00, 0x12, 0x09, 0x0a, 0x05, 0x52, 0x65, 0x61, 0x64, 0x79, 0x10, + 0x01, 0x12, 0x0e, 0x0a, 0x0a, 0x44, 0x65, 0x73, 0x74, 0x72, 0x6f, 0x79, 0x69, 0x6e, 0x67, 0x10, + 0x02, 0x12, 0x0c, 0x0a, 0x08, 0x44, 0x65, 0x67, 0x72, 0x61, 0x64, 0x65, 0x64, 0x10, 0x03, 0x12, + 0x0b, 0x0a, 0x07, 0x55, 0x6e, 0x6b, 0x6e, 0x6f, 0x77, 0x6e, 0x10, 0x04, 0x42, 0x3a, 0x5a, 0x38, + 0x67, 0x69, 0x74, 0x68, 0x75, 0x62, 0x2e, 0x63, 0x6f, 0x6d, 0x2f, 0x64, 0x61, 0x6f, 0x73, 0x2d, + 0x73, 0x74, 0x61, 0x63, 0x6b, 0x2f, 0x64, 0x61, 0x6f, 0x73, 
0x2f, 0x73, 0x72, 0x63, 0x2f, 0x63, + 0x6f, 0x6e, 0x74, 0x72, 0x6f, 0x6c, 0x2f, 0x63, 0x6f, 0x6d, 0x6d, 0x6f, 0x6e, 0x2f, 0x70, 0x72, + 0x6f, 0x74, 0x6f, 0x2f, 0x6d, 0x67, 0x6d, 0x74, 0x62, 0x06, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x33, } var ( diff --git a/src/control/fault/code/codes.go b/src/control/fault/code/codes.go index e9d78fd6588..4d045f7cfce 100644 --- a/src/control/fault/code/codes.go +++ b/src/control/fault/code/codes.go @@ -154,6 +154,7 @@ const ( ServerNoCompatibilityInsecure ServerPoolHasContainers ServerHugepagesDisabled + ServerPoolMemRatioNoRoles ServerBadFaultDomainLabels ) diff --git a/src/control/lib/control/pool.go b/src/control/lib/control/pool.go index 4f29cb696e9..65b042ad406 100644 --- a/src/control/lib/control/pool.go +++ b/src/control/lib/control/pool.go @@ -41,6 +41,12 @@ const ( DefaultPoolTimeout = 5 * time.Minute ) +// Pool create error conditions. +var ( + errPoolCreateFirstTierZeroBytes = errors.New("can't create pool with 0 byte first tier") + errPoolCreateFirstTierRatioZero = errors.New("can't create pool with 0.0 first tier ratio") +) + // checkUUID is a helper function for validating that the supplied // UUID string parses as a valid UUID. func checkUUID(uuidStr string) error { @@ -217,19 +223,21 @@ type ( NumRanks uint32 `json:"num_ranks"` // Auto-sizing param Ranks []ranklist.Rank `json:"ranks"` // Manual-sizing param TierBytes []uint64 `json:"tier_bytes"` // Per-rank values + MemRatio float32 `json:"mem_ratio"` // mem_file_size:meta_blob_size } // PoolCreateResp contains the response from a pool create request. PoolCreateResp struct { - UUID string `json:"uuid"` - Leader uint32 `json:"svc_ldr"` - SvcReps []uint32 `json:"svc_reps"` - TgtRanks []uint32 `json:"tgt_ranks"` - TierBytes []uint64 `json:"tier_bytes"` // Per-rank storage tier sizes + UUID string `json:"uuid"` + Leader uint32 `json:"svc_ldr"` + SvcReps []uint32 `json:"svc_reps"` + TgtRanks []uint32 `json:"tgt_ranks"` + TierBytes []uint64 `json:"tier_bytes"` // Per-rank storage tier sizes. + MemFileBytes uint64 `json:"mem_file_bytes"` // Per-rank. MD-on-SSD mode only. } ) -type maxPoolSizeGetter func() (uint64, uint64, error) +type maxPoolSizeGetter func(*PoolCreateReq) (uint64, uint64, error) func poolCreateReqChkSizes(log debugLogger, getMaxPoolSz maxPoolSizeGetter, req *PoolCreateReq) error { hasTotBytes := req.TotalBytes > 0 @@ -241,14 +249,14 @@ func poolCreateReqChkSizes(log debugLogger, getMaxPoolSz maxPoolSizeGetter, req switch { case hasTierBytes && hasNoTierRatio && !hasTotBytes: if req.TierBytes[0] == 0 { - return errors.New("can't create pool with 0 SCM") + return errPoolCreateFirstTierZeroBytes } // Storage sizes have been written to TierBytes in request (manual-size). log.Debugf("manual-size pool create mode: %+v", req) case hasNoTierBytes && hasTierRatio && hasTotBytes: if req.TierRatio[0] == 0 { - return errors.New("can't create pool with 0.0 SCM ratio") + return errPoolCreateFirstTierRatioZero } // Storage tier ratios and total pool size given, distribution of space across // ranks to be calculated on the server side (auto-total-size). 
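// Illustrative sketch only (not the DAOS implementation): the three pool-create
// sizing modes implied by the poolCreateReqChkSizes changes above. Names such as
// sizeReq and pickSizingMode are hypothetical; the real request type is
// PoolCreateReq and the real checks also normalize tier ratios and query
// available capacity through the maxPoolSizeGetter callback.
package main

import (
	"errors"
	"fmt"
)

type sizeReq struct {
	TierBytes  []uint64  // Per-rank, per-tier byte counts (manual sizing).
	TierRatio  []float64 // Fractions of total/available space (auto sizing).
	TotalBytes uint64    // Whole-pool size (auto-total sizing).
}

// pickSizingMode mirrors the switch in poolCreateReqChkSizes: exactly one of
// the three parameter combinations is accepted, anything else is rejected.
func pickSizingMode(r sizeReq) (string, error) {
	hasBytes := len(r.TierBytes) > 0
	hasRatio := len(r.TierRatio) > 0
	hasTotal := r.TotalBytes > 0

	switch {
	case hasBytes && !hasRatio && !hasTotal:
		return "manual-size", nil // Sizes given explicitly per tier.
	case !hasBytes && hasRatio && hasTotal:
		return "auto-total-size", nil // Server splits TotalBytes by ratio.
	case !hasBytes && hasRatio && !hasTotal:
		return "auto-percentage-size", nil // Fraction of available capacity.
	default:
		return "", errors.New("unsupported combination of size parameters")
	}
}

func main() {
	mode, err := pickSizingMode(sizeReq{TierRatio: []float64{0.06, 0.94}})
	fmt.Println(mode, err) // auto-percentage-size <nil>
}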
@@ -256,7 +264,7 @@ func poolCreateReqChkSizes(log debugLogger, getMaxPoolSz maxPoolSizeGetter, req case hasNoTierBytes && hasTierRatio && !hasTotBytes: if req.TierRatio[0] == 0 { - return errors.New("can't create pool with 0.0 SCM ratio") + return errPoolCreateFirstTierRatioZero } availRatio := req.TierRatio[0] if req.TierRatio[1] != availRatio { @@ -265,7 +273,7 @@ func poolCreateReqChkSizes(log debugLogger, getMaxPoolSz maxPoolSizeGetter, req req.TierRatio = nil // Storage tier ratios specified without a total size, use specified fraction of // available space (auto-percentage-size). - scmBytes, nvmeBytes, err := getMaxPoolSz() + scmBytes, nvmeBytes, err := getMaxPoolSz(req) if err != nil { return err } @@ -294,8 +302,8 @@ func poolCreateGenPBReq(ctx context.Context, rpcClient UnaryInvoker, in *PoolCre return } - getMaxPoolSz := func() (uint64, uint64, error) { - return getMaxPoolSize(ctx, rpcClient, ranklist.RankList(in.Ranks)) + getMaxPoolSz := func(createReq *PoolCreateReq) (uint64, uint64, error) { + return getMaxPoolSize(ctx, rpcClient, createReq) } if err = poolCreateReqChkSizes(rpcClient, getMaxPoolSz, in); err != nil { @@ -594,14 +602,14 @@ func convertPoolTargetInfo(pbInfo *mgmtpb.PoolQueryTargetInfo) (*daos.PoolQueryT pqti.State = daos.PoolQueryTargetState(pbInfo.State) pqti.Space = []*daos.StorageUsageStats{ { - Total: uint64(pbInfo.Space[daos.StorageMediaTypeScm].Total), - Free: uint64(pbInfo.Space[daos.StorageMediaTypeScm].Free), - MediaType: daos.StorageMediaTypeScm, + Total: uint64(pbInfo.Space[0].Total), + Free: uint64(pbInfo.Space[0].Free), + MediaType: daos.StorageMediaType(pbInfo.Space[0].MediaType), }, { - Total: uint64(pbInfo.Space[daos.StorageMediaTypeNvme].Total), - Free: uint64(pbInfo.Space[daos.StorageMediaTypeNvme].Free), - MediaType: daos.StorageMediaTypeNvme, + Total: uint64(pbInfo.Space[1].Total), + Free: uint64(pbInfo.Space[1].Free), + MediaType: daos.StorageMediaType(pbInfo.Space[1].MediaType), }, } @@ -1040,6 +1048,7 @@ func newFilterRankFunc(ranks ranklist.RankList) filterRankFn { func processSCMSpaceStats(log debugLogger, filterRank filterRankFn, scmNamespaces storage.ScmNamespaces, rankNVMeFreeSpace rankFreeSpaceMap) (uint64, error) { scmBytes := uint64(math.MaxUint64) + // Realistically there should only be one-per-rank but handle the case for multiple anyway. 
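// Hedged sketch (not the production loop that follows): judging by the test
// expectations in pool_test.go further down, each rank is expected to expose a
// single mounted SCM namespace, and duplicates or unmounted namespaces are
// treated as errors. The types below are hypothetical stand-ins for
// storage.ScmNamespace and the per-rank accounting done by processSCMSpaceStats.
package main

import "fmt"

type scmNamespace struct {
	Rank        uint32
	Mounted     bool
	UsableBytes uint64
}

// usableSCMPerRank records each rank's usable SCM size, rejecting unmounted
// namespaces and ranks that report more than one namespace.
func usableSCMPerRank(namespaces []scmNamespace) (map[uint32]uint64, error) {
	out := make(map[uint32]uint64)
	for _, ns := range namespaces {
		if !ns.Mounted {
			return nil, fmt.Errorf("SCM namespace on rank %d is not mounted", ns.Rank)
		}
		if _, dup := out[ns.Rank]; dup {
			return nil, fmt.Errorf("multiple SCM devices found for rank %d", ns.Rank)
		}
		out[ns.Rank] = ns.UsableBytes
	}
	return out, nil
}

func main() {
	sizes, err := usableSCMPerRank([]scmNamespace{
		{Rank: 0, Mounted: true, UsableBytes: 100 << 30},
		{Rank: 1, Mounted: true, UsableBytes: 50 << 30},
	})
	fmt.Println(sizes, err)
}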
for _, scmNamespace := range scmNamespaces { if scmNamespace.Mount == nil { return 0, errors.Errorf("SCM device %s (bdev %s, name %s) is not mounted", @@ -1075,12 +1084,17 @@ func processSCMSpaceStats(log debugLogger, filterRank filterRankFn, scmNamespace func processNVMeSpaceStats(log debugLogger, filterRank filterRankFn, nvmeControllers storage.NvmeControllers, rankNVMeFreeSpace rankFreeSpaceMap) error { for _, controller := range nvmeControllers { for _, smdDevice := range controller.SmdDevices { - msgDev := fmt.Sprintf("SMD device %s (rank %d, ctrlr %s)", smdDevice.UUID, + msgDev := fmt.Sprintf("SMD device %s (rank %d, ctrlr %s", smdDevice.UUID, smdDevice.Rank, controller.PciAddr) - if !smdDevice.Roles.IsEmpty() && (smdDevice.Roles.OptionBits&storage.BdevRoleData) == 0 { - log.Debugf("Skipping %s, not used for storing data", msgDev) - continue + if smdDevice.Roles.IsEmpty() { + msgDev += ")" + } else { + msgDev += fmt.Sprintf(", roles %q)", smdDevice.Roles.String()) + if !smdDevice.Roles.HasData() { + log.Debugf("skipping %s, not used for storing data", msgDev) + continue + } } if controller.NvmeState == storage.NvmeStateNew { @@ -1114,31 +1128,59 @@ func processNVMeSpaceStats(log debugLogger, filterRank filterRankFn, nvmeControl } // Return the maximal SCM and NVMe size of a pool which could be created with all the storage nodes. -func getMaxPoolSize(ctx context.Context, rpcClient UnaryInvoker, ranks ranklist.RankList) (uint64, uint64, error) { +func getMaxPoolSize(ctx context.Context, rpcClient UnaryInvoker, createReq *PoolCreateReq) (uint64, uint64, error) { + isMdOnSsdEnabled := func(log debugLogger, hsm HostStorageMap) bool { + for _, hss := range hsm { + hs := hss.HostStorage + if hs == nil { + continue + } + nvme := hs.NvmeDevices + if nvme.Len() > 0 && !nvme[0].Roles().IsEmpty() { + log.Debugf("fetch max pool size in md-on-size mode") + return true + } + } + + return false + } + + if createReq.MemRatio < 0 { + return 0, 0, errors.New("invalid mem-ratio, should be greater than zero") + } + if createReq.MemRatio > 1 { + return 0, 0, errors.New("invalid mem-ratio, should not be greater than one") + } + // Verify that the DAOS system is ready before attempting to query storage. if _, err := SystemQuery(ctx, rpcClient, &SystemQueryReq{}); err != nil { return 0, 0, err } - resp, err := StorageScan(ctx, rpcClient, &StorageScanReq{Usage: true}) + scanReq := &StorageScanReq{ + Usage: true, + MemRatio: createReq.MemRatio, + } + + scanResp, err := StorageScan(ctx, rpcClient, scanReq) if err != nil { return 0, 0, err } - if len(resp.HostStorage) == 0 { + if len(scanResp.HostStorage) == 0 { return 0, 0, errors.New("Empty host storage response from StorageScan") } // Generate function to verify a rank is in the provided rank slice. 
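// Hedged sketch of the rank-filter idea referenced in the comment above: a
// closure built once from the requested rank list, returning true for every
// rank when no list is given. This mirrors newFilterRankFunc in spirit only;
// the real helper operates on ranklist.RankList values.
package main

import "fmt"

type filterRankFn func(rank uint32) bool

func newRankFilter(ranks []uint32) filterRankFn {
	if len(ranks) == 0 {
		return func(uint32) bool { return true } // No list given: accept all ranks.
	}
	allowed := make(map[uint32]struct{}, len(ranks))
	for _, r := range ranks {
		allowed[r] = struct{}{}
	}
	return func(rank uint32) bool {
		_, ok := allowed[rank]
		return ok
	}
}

func main() {
	filter := newRankFilter([]uint32{0, 1, 2, 4})
	fmt.Println(filter(2), filter(3)) // true false
}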
- filterRank := newFilterRankFunc(ranks) + filterRank := newFilterRankFunc(ranklist.RankList(createReq.Ranks)) rankNVMeFreeSpace := make(rankFreeSpaceMap) scmBytes := uint64(math.MaxUint64) - for _, key := range resp.HostStorage.Keys() { - hostStorage := resp.HostStorage[key].HostStorage + for _, key := range scanResp.HostStorage.Keys() { + hostStorage := scanResp.HostStorage[key].HostStorage if hostStorage.ScmNamespaces.Usable() == 0 { return 0, 0, errors.Errorf("Host without SCM storage: hostname=%s", - resp.HostStorage[key].HostSet.String()) + scanResp.HostStorage[key].HostSet.String()) } sb, err := processSCMSpaceStats(rpcClient, filterRank, hostStorage.ScmNamespaces, rankNVMeFreeSpace) @@ -1156,7 +1198,8 @@ func getMaxPoolSize(ctx context.Context, rpcClient UnaryInvoker, ranks ranklist. } if scmBytes == math.MaxUint64 { - return 0, 0, errors.Errorf("No SCM storage space available with rank list %s", ranks) + return 0, 0, errors.Errorf("No SCM storage space available with rank list %q", + createReq.Ranks) } nvmeBytes := uint64(math.MaxUint64) @@ -1166,8 +1209,27 @@ func getMaxPoolSize(ctx context.Context, rpcClient UnaryInvoker, ranks ranklist. } } - rpcClient.Debugf("Maximal size of a pool: scmBytes=%s (%d B) nvmeBytes=%s (%d B)", - humanize.Bytes(scmBytes), scmBytes, humanize.Bytes(nvmeBytes), nvmeBytes) + if !isMdOnSsdEnabled(rpcClient, scanResp.HostStorage) { + rpcClient.Debugf("Maximal size of a pool: scmBytes=%s (%d B) nvmeBytes=%s (%d B)", + humanize.Bytes(scmBytes), scmBytes, humanize.Bytes(nvmeBytes), nvmeBytes) + + return scmBytes, nvmeBytes, nil + } + + // In MD-on-SSD mode calculate metaBytes based on the minimum ramdisk (called scm here) + // availability across ranks. NVMe sizes returned in StorageScan response at the beginning + // of this function have been adjusted based on SSD bdev roles and MemRatio passed in the + // scan request. The rationale behind deriving pool sizes from ramdisk availability is that + // this is more likely to be the limiting factor than SSD usage. 
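// Worked sketch of the MD-on-SSD META derivation performed just below: the
// minimum per-rank ramdisk capacity is scaled up by the requested
// mem-file:meta-blob ratio, with 0 treated as 1.0 (phase-1 behaviour) and
// values outside (0, 1] rejected. maxMetaBytes is a hypothetical helper, not
// part of the control API.
package main

import (
	"errors"
	"fmt"
)

func maxMetaBytes(ramdiskBytes uint64, memRatio float64) (uint64, error) {
	switch {
	case memRatio < 0:
		return 0, errors.New("invalid mem-ratio, should be greater than zero")
	case memRatio > 1:
		return 0, errors.New("invalid mem-ratio, should not be greater than one")
	case memRatio == 0:
		memRatio = 1 // Default: mem-file size equals meta-blob size.
	}
	return uint64(float64(ramdiskBytes) / memRatio), nil
}

func main() {
	// 100 GB of ramdisk with a 0.5 mem-ratio allows a 200 GB META component,
	// matching the "phase-2 mode" expectation in the tests further down.
	meta, err := maxMetaBytes(100_000_000_000, 0.5)
	fmt.Println(meta, err) // 200000000000 <nil>
}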
+ if createReq.MemRatio == 0 { + createReq.MemRatio = 1 + } + metaBytes := uint64(float64(scmBytes) / float64(createReq.MemRatio)) + + rpcClient.Debugf("With minimum available ramdisk capacity of %s and mem-ratio %.2f,"+ + " the maximum per-rank sizes for a pool are META=%s (%d B) and DATA=%s (%d B)", + humanize.Bytes(scmBytes), createReq.MemRatio, humanize.Bytes(metaBytes), + metaBytes, humanize.Bytes(nvmeBytes), nvmeBytes) - return scmBytes, nvmeBytes, nil + return metaBytes, nvmeBytes, nil } diff --git a/src/control/lib/control/pool_test.go b/src/control/lib/control/pool_test.go index 7e342d95be8..9e0d557c490 100644 --- a/src/control/lib/control/pool_test.go +++ b/src/control/lib/control/pool_test.go @@ -368,6 +368,7 @@ func TestControl_PoolCreateReq_Convert(t *testing.T) { NumRanks: 3, Ranks: []ranklist.Rank{1, 2, 3}, TierBytes: []uint64{humanize.GiByte, 10 * humanize.GiByte}, + MemRatio: 0.55, Properties: []*daos.PoolProperty{ { Name: "label", @@ -389,6 +390,7 @@ func TestControl_PoolCreateReq_Convert(t *testing.T) { NumRanks: 3, Ranks: []uint32{1, 2, 3}, TierBytes: []uint64{humanize.GiByte, 10 * humanize.GiByte}, + MemRatio: 0.55, Properties: []*mgmtpb.PoolProperty{ {Number: 1, Value: &mgmtpb.PoolProperty_Strval{"foo"}}, }, @@ -481,7 +483,7 @@ func TestControl_poolCreateReqChkSizes(t *testing.T) { defer test.ShowBufferOnFailure(t, buf) nrGetMaxCalls := 0 - getMaxPoolSz := func() (uint64, uint64, error) { + getMaxPoolSz := func(createReq *PoolCreateReq) (uint64, uint64, error) { nrGetMaxCalls++ return tc.getMaxScm, tc.getMaxNvme, tc.getMaxErr } @@ -840,7 +842,7 @@ func TestControl_PoolQueryResp_MarshalJSON(t *testing.T) { UpgradeLayoutVer: 8, }, }, - exp: `{"query_mask":"disabled_engines,rebuild,space","state":"Ready","uuid":"` + poolUUID.String() + `","total_targets":1,"active_targets":2,"total_engines":3,"disabled_targets":4,"version":5,"svc_ldr":6,"svc_reps":[0,1,2],"rebuild":null,"tier_stats":null,"pool_layout_ver":7,"upgrade_layout_ver":8,"status":42}`, + exp: `{"query_mask":"disabled_engines,rebuild,space","state":"Ready","uuid":"` + poolUUID.String() + `","total_targets":1,"active_targets":2,"total_engines":3,"disabled_targets":4,"version":5,"svc_ldr":6,"svc_reps":[0,1,2],"rebuild":null,"tier_stats":null,"pool_layout_ver":7,"upgrade_layout_ver":8,"mem_file_bytes":0,"status":42}`, }, "valid rankset": { pqr: &PoolQueryResp{ @@ -860,9 +862,10 @@ func TestControl_PoolQueryResp_MarshalJSON(t *testing.T) { DisabledRanks: &ranklist.RankSet{}, PoolLayoutVer: 7, UpgradeLayoutVer: 8, + MemFileBytes: 1000, }, }, - exp: `{"query_mask":"disabled_engines,rebuild,space","state":"Ready","uuid":"` + poolUUID.String() + `","total_targets":1,"active_targets":2,"total_engines":3,"disabled_targets":4,"version":5,"svc_ldr":6,"svc_reps":[0,1,2],"rebuild":null,"tier_stats":null,"enabled_ranks":[0,1,2,3,5],"disabled_ranks":[],"pool_layout_ver":7,"upgrade_layout_ver":8,"status":42}`, + exp: `{"query_mask":"disabled_engines,rebuild,space","state":"Ready","uuid":"` + poolUUID.String() + `","total_targets":1,"active_targets":2,"total_engines":3,"disabled_targets":4,"version":5,"svc_ldr":6,"svc_reps":[0,1,2],"rebuild":null,"tier_stats":null,"enabled_ranks":[0,1,2,3,5],"disabled_ranks":[],"pool_layout_ver":7,"upgrade_layout_ver":8,"mem_file_bytes":1000,"status":42}`, }, } { t.Run(name, func(t *testing.T) { @@ -904,7 +907,7 @@ func TestControl_PoolQueryResp_UnmarshalJSON(t *testing.T) { }, }, "valid rankset": { - data: `{"enabled_ranks":"[0,1-3,5]","disabled_ranks":"[]","status":0,"uuid":"` + 
poolUUID.String() + `","total_targets":1,"active_targets":2,"total_engines":3,"disabled_targets":4,"version":5,"svc_ldr":6,"svc_reps":null,"rebuild":null,"tier_stats":null,"pool_layout_ver":7,"upgrade_layout_ver":8}`, + data: `{"enabled_ranks":"[0,1-3,5]","disabled_ranks":"[]","status":0,"uuid":"` + poolUUID.String() + `","total_targets":1,"active_targets":2,"total_engines":3,"disabled_targets":4,"version":5,"svc_ldr":6,"svc_reps":null,"rebuild":null,"tier_stats":null,"pool_layout_ver":7,"upgrade_layout_ver":8,"mem_file_bytes":1000}`, expResp: PoolQueryResp{ Status: 0, PoolInfo: daos.PoolInfo{ @@ -919,6 +922,7 @@ func TestControl_PoolQueryResp_UnmarshalJSON(t *testing.T) { DisabledRanks: &ranklist.RankSet{}, PoolLayoutVer: 7, UpgradeLayoutVer: 8, + MemFileBytes: 1000, }, }, }, @@ -2064,205 +2068,196 @@ func TestControl_ListPools(t *testing.T) { } } +// Helper to generate typical SCM configs with rank and optional size params. +func newScmCfg(rank int, size ...uint64) MockScmConfig { + sz := uint64(100) * humanize.GByte + if len(size) > 0 { + sz = size[0] + } + return MockScmConfig{ + MockStorageConfig: MockStorageConfig{ + TotalBytes: sz, + AvailBytes: sz, + UsableBytes: sz, + }, + Rank: ranklist.Rank(rank), + } +} + +// Helper to generate typical NVMe configs with rank, roles and optional size params. +func newNvmeCfg(rank int, roles storage.OptionBits, size ...uint64) MockNvmeConfig { + sz := uint64(humanize.TByte) + if len(size) > 0 { + sz = size[0] + } + return MockNvmeConfig{ + MockStorageConfig: MockStorageConfig{ + TotalBytes: sz, + AvailBytes: sz, + UsableBytes: sz, + NvmeRole: &storage.BdevRoles{OptionBits: roles}, + }, + Rank: ranklist.Rank(rank), + } +} + func TestControl_getMaxPoolSize(t *testing.T) { devStateFaulty := storage.NvmeStateFaulty devStateNew := storage.NvmeStateNew - type ExpectedOutput struct { - ScmBytes uint64 - NvmeBytes uint64 - Error error - QueryError error - Debug string - } for name, tc := range map[string]struct { - HostsConfigArray []MockHostStorageConfig - TgtRanks []ranklist.Rank - ExpectedOutput ExpectedOutput + hostsConfigArray []MockHostStorageConfig + tgtRanks []ranklist.Rank + memRatio float32 + queryError error + expScmBytes uint64 + expNvmeBytes uint64 + expError error + expDebug string }{ "single server": { - HostsConfigArray: []MockHostStorageConfig{ + hostsConfigArray: []MockHostStorageConfig{ { - HostName: "foo", - ScmConfig: []MockScmConfig{ - { - MockStorageConfig: MockStorageConfig{ - TotalBytes: 100 * humanize.GByte, - AvailBytes: 100 * humanize.GByte, - UsableBytes: 100 * humanize.GByte, - }, - Rank: 0, - }, - }, + HostName: "foo", + ScmConfig: []MockScmConfig{newScmCfg(0)}, + NvmeConfig: []MockNvmeConfig{newNvmeCfg(0, 0)}, + }, + }, + expScmBytes: 100 * humanize.GByte, + expNvmeBytes: humanize.TByte, + }, + "single MD-on-SSD server; no mem-ratio specified; defaults to 1.0": { + hostsConfigArray: []MockHostStorageConfig{ + { + HostName: "foo", + ScmConfig: []MockScmConfig{newScmCfg(0)}, NvmeConfig: []MockNvmeConfig{ - { - MockStorageConfig: MockStorageConfig{ - TotalBytes: 1 * humanize.TByte, - AvailBytes: 1 * humanize.TByte, - UsableBytes: 1 * humanize.TByte, - }, - Rank: 0, - }, + newNvmeCfg(0, storage.BdevRoleData), + newNvmeCfg(0, + storage.BdevRoleWAL|storage.BdevRoleMeta, + 2*humanize.TByte), }, }, }, - ExpectedOutput: ExpectedOutput{ - ScmBytes: 100 * humanize.GByte, - NvmeBytes: 1 * humanize.TByte, - }, + expScmBytes: 100 * humanize.GByte, + expNvmeBytes: humanize.TByte, }, - "single MD-on-SSD server": { - HostsConfigArray: 
[]MockHostStorageConfig{ + "single MD-on-SSD server; invalid mem-ratio; high": { + hostsConfigArray: []MockHostStorageConfig{ { - HostName: "foo", - ScmConfig: []MockScmConfig{ - { - MockStorageConfig: MockStorageConfig{ - TotalBytes: 100 * humanize.GByte, - AvailBytes: 100 * humanize.GByte, - UsableBytes: 100 * humanize.GByte, - }, - Rank: 0, - }, - }, + HostName: "foo", + ScmConfig: []MockScmConfig{newScmCfg(0)}, NvmeConfig: []MockNvmeConfig{ - { - MockStorageConfig: MockStorageConfig{ - TotalBytes: 1 * humanize.TByte, - AvailBytes: 1 * humanize.TByte, - UsableBytes: 1 * humanize.TByte, - NvmeRole: &storage.BdevRoles{ - storage.OptionBits(storage.BdevRoleData), - }, - }, - Rank: 0, - }, - { - MockStorageConfig: MockStorageConfig{ - TotalBytes: 2 * humanize.TByte, - AvailBytes: 2 * humanize.TByte, - UsableBytes: 2 * humanize.TByte, - NvmeRole: &storage.BdevRoles{ - storage.OptionBits(storage.BdevRoleWAL | storage.BdevRoleMeta), - }, - }, - Rank: 0, - }, + newNvmeCfg(0, storage.BdevRoleData), + newNvmeCfg(0, + storage.BdevRoleWAL|storage.BdevRoleMeta, + 2*humanize.TByte), }, }, }, - ExpectedOutput: ExpectedOutput{ - ScmBytes: 100 * humanize.GByte, - NvmeBytes: 1 * humanize.TByte, + memRatio: 1.1, + expError: errors.New("invalid mem-ratio"), + }, + "single MD-on-SSD server; invalid mem-ratio; low": { + hostsConfigArray: []MockHostStorageConfig{ + { + HostName: "foo", + ScmConfig: []MockScmConfig{newScmCfg(0)}, + NvmeConfig: []MockNvmeConfig{ + newNvmeCfg(0, storage.BdevRoleData), + newNvmeCfg(0, + storage.BdevRoleWAL|storage.BdevRoleMeta, + 2*humanize.TByte), + }, + }, }, + memRatio: -1.1, + expError: errors.New("invalid mem-ratio"), }, - "single Ephemeral server": { - HostsConfigArray: []MockHostStorageConfig{ + "single MD-on-SSD server; phase-1 mode (mem-file-sz == meta-blob-sz)": { + hostsConfigArray: []MockHostStorageConfig{ { - HostName: "foo", - ScmConfig: []MockScmConfig{ - { - MockStorageConfig: MockStorageConfig{ - TotalBytes: 100 * humanize.GByte, - AvailBytes: 100 * humanize.GByte, - UsableBytes: 100 * humanize.GByte, - }, - Rank: 0, - }, + HostName: "foo", + ScmConfig: []MockScmConfig{newScmCfg(0)}, + NvmeConfig: []MockNvmeConfig{ + newNvmeCfg(0, storage.BdevRoleData), + newNvmeCfg(0, + storage.BdevRoleWAL|storage.BdevRoleMeta, + 2*humanize.TByte), }, + }, + }, + memRatio: 1, + expScmBytes: 100 * humanize.GByte, + expNvmeBytes: humanize.TByte, + }, + "single MD-on-SSD server; phase-2 mode (mem-file-sz < meta-blob-sz)": { + hostsConfigArray: []MockHostStorageConfig{ + { + HostName: "foo", + ScmConfig: []MockScmConfig{newScmCfg(0)}, NvmeConfig: []MockNvmeConfig{ - { - MockStorageConfig: MockStorageConfig{ - TotalBytes: 1 * humanize.TByte, - AvailBytes: 1 * humanize.TByte, - UsableBytes: 1 * humanize.TByte, - NvmeRole: &storage.BdevRoles{storage.OptionBits(0)}, - }, - Rank: 0, - }, + newNvmeCfg(0, storage.BdevRoleData), + newNvmeCfg(0, + storage.BdevRoleWAL|storage.BdevRoleMeta, + 2*humanize.TByte), }, }, }, - ExpectedOutput: ExpectedOutput{ - ScmBytes: 100 * humanize.GByte, - NvmeBytes: 1 * humanize.TByte, + memRatio: 0.5, + expScmBytes: 200 * humanize.GByte, // Double meta-blob-sz due to mem-ratio. 
+ expNvmeBytes: humanize.TByte, + }, + "single ephemeral server": { + hostsConfigArray: []MockHostStorageConfig{ + { + HostName: "foo", + ScmConfig: []MockScmConfig{newScmCfg(0)}, + NvmeConfig: []MockNvmeConfig{newNvmeCfg(0, 0)}, + }, }, + expScmBytes: 100 * humanize.GByte, + expNvmeBytes: humanize.TByte, }, "double server": { - HostsConfigArray: []MockHostStorageConfig{ + hostsConfigArray: []MockHostStorageConfig{ { - HostName: "foo", - ScmConfig: []MockScmConfig{ - { - MockStorageConfig: MockStorageConfig{ - TotalBytes: 100 * humanize.GByte, - AvailBytes: 100 * humanize.GByte, - UsableBytes: 100 * humanize.GByte, - }, - Rank: 0, - }, - }, - NvmeConfig: []MockNvmeConfig{ - { - MockStorageConfig: MockStorageConfig{ - TotalBytes: 1 * humanize.TByte, - AvailBytes: 1 * humanize.TByte, - UsableBytes: 1 * humanize.TByte, - }, - Rank: 0, - }, - }, + HostName: "foo", + ScmConfig: []MockScmConfig{newScmCfg(0)}, + NvmeConfig: []MockNvmeConfig{newNvmeCfg(0, 0)}, }, { HostName: "bar[1,3]", ScmConfig: []MockScmConfig{ + newScmCfg(1, humanize.TByte), + newScmCfg(2), { MockStorageConfig: MockStorageConfig{ - TotalBytes: 1 * humanize.TByte, - AvailBytes: 1 * humanize.TByte, - UsableBytes: 1 * humanize.TByte, - }, - Rank: 1, - }, - { - MockStorageConfig: MockStorageConfig{ - TotalBytes: 100 * humanize.GByte, - AvailBytes: 100 * humanize.GByte, - UsableBytes: 100 * humanize.GByte, - }, - Rank: 2, - }, - { - MockStorageConfig: MockStorageConfig{ - TotalBytes: 1 * humanize.TByte, + TotalBytes: humanize.TByte, AvailBytes: 50 * humanize.GByte, UsableBytes: 50 * humanize.GByte, + NvmeRole: &storage.BdevRoles{}, }, Rank: 3, }, }, NvmeConfig: []MockNvmeConfig{ + newNvmeCfg(1, 0), { MockStorageConfig: MockStorageConfig{ - TotalBytes: 1 * humanize.TByte, - AvailBytes: 1 * humanize.TByte, - UsableBytes: 1 * humanize.TByte, - }, - Rank: 1, - }, - { - MockStorageConfig: MockStorageConfig{ - TotalBytes: 1 * humanize.TByte, + TotalBytes: humanize.TByte, AvailBytes: 400 * humanize.GByte, UsableBytes: 400 * humanize.GByte, + NvmeRole: &storage.BdevRoles{}, }, Rank: 2, }, { MockStorageConfig: MockStorageConfig{ - TotalBytes: 1 * humanize.TByte, + TotalBytes: humanize.TByte, AvailBytes: 300 * humanize.GByte, UsableBytes: 300 * humanize.GByte, + NvmeRole: &storage.BdevRoles{}, }, Rank: 2, }, @@ -2271,124 +2266,67 @@ func TestControl_getMaxPoolSize(t *testing.T) { TotalBytes: 3 * humanize.TByte, AvailBytes: 2 * humanize.TByte, UsableBytes: 2 * humanize.TByte, + NvmeRole: &storage.BdevRoles{}, }, Rank: 3, }, }, }, }, - ExpectedOutput: ExpectedOutput{ - ScmBytes: 50 * humanize.GByte, - NvmeBytes: 700 * humanize.GByte, - }, + expScmBytes: 50 * humanize.GByte, + expNvmeBytes: 700 * humanize.GByte, }, "double server; rank filter": { - HostsConfigArray: []MockHostStorageConfig{ + hostsConfigArray: []MockHostStorageConfig{ { - HostName: "foo", - ScmConfig: []MockScmConfig{ - { - MockStorageConfig: MockStorageConfig{ - TotalBytes: 100 * humanize.GByte, - AvailBytes: 100 * humanize.GByte, - UsableBytes: 100 * humanize.GByte, - }, - Rank: 0, - }, - }, - NvmeConfig: []MockNvmeConfig{ - { - MockStorageConfig: MockStorageConfig{ - TotalBytes: 1 * humanize.TByte, - AvailBytes: 1 * humanize.TByte, - UsableBytes: 1 * humanize.TByte, - }, - Rank: 0, - }, - }, + HostName: "foo", + ScmConfig: []MockScmConfig{newScmCfg(0)}, + NvmeConfig: []MockNvmeConfig{newNvmeCfg(0, 0)}, }, { HostName: "bar[1,3]", ScmConfig: []MockScmConfig{ + newScmCfg(1, humanize.TByte), + newScmCfg(2, humanize.TByte), + newScmCfg(3, humanize.GByte), { MockStorageConfig: 
MockStorageConfig{ - TotalBytes: 1 * humanize.TByte, - AvailBytes: 1 * humanize.TByte, - UsableBytes: 1 * humanize.TByte, - }, - Rank: 1, - }, - { - MockStorageConfig: MockStorageConfig{ - TotalBytes: 1 * humanize.TByte, - AvailBytes: 1 * humanize.TByte, - UsableBytes: 1 * humanize.TByte, - }, - Rank: 2, - }, - { - MockStorageConfig: MockStorageConfig{ - TotalBytes: 1 * humanize.GByte, - AvailBytes: 1 * humanize.GByte, - UsableBytes: 1 * humanize.GByte, - }, - Rank: 3, - }, - { - MockStorageConfig: MockStorageConfig{ - TotalBytes: 1 * humanize.TByte, + TotalBytes: humanize.TByte, AvailBytes: 50 * humanize.GByte, UsableBytes: 50 * humanize.GByte, + NvmeRole: &storage.BdevRoles{}, }, Rank: 4, }, - { - MockStorageConfig: MockStorageConfig{ - TotalBytes: 1 * humanize.GByte, - AvailBytes: 1 * humanize.GByte, - UsableBytes: 1 * humanize.GByte, - }, - Rank: 5, - }, + newScmCfg(5, humanize.GByte), }, NvmeConfig: []MockNvmeConfig{ + newNvmeCfg(1, 0), { MockStorageConfig: MockStorageConfig{ - TotalBytes: 1 * humanize.TByte, - AvailBytes: 1 * humanize.TByte, - UsableBytes: 1 * humanize.TByte, - }, - Rank: 1, - }, - { - MockStorageConfig: MockStorageConfig{ - TotalBytes: 1 * humanize.TByte, + TotalBytes: humanize.TByte, AvailBytes: 400 * humanize.GByte, UsableBytes: 400 * humanize.GByte, + NvmeRole: &storage.BdevRoles{}, }, Rank: 2, }, { MockStorageConfig: MockStorageConfig{ - TotalBytes: 1 * humanize.TByte, + TotalBytes: humanize.TByte, AvailBytes: 300 * humanize.GByte, UsableBytes: 300 * humanize.GByte, + NvmeRole: &storage.BdevRoles{}, }, Rank: 2, }, - { - MockStorageConfig: MockStorageConfig{ - TotalBytes: 1 * humanize.GByte, - AvailBytes: 1 * humanize.GByte, - UsableBytes: 1 * humanize.GByte, - }, - Rank: 3, - }, + newNvmeCfg(3, 0, humanize.GByte), { MockStorageConfig: MockStorageConfig{ TotalBytes: 3 * humanize.TByte, AvailBytes: 2 * humanize.TByte, UsableBytes: 2 * humanize.TByte, + NvmeRole: &storage.BdevRoles{}, }, Rank: 4, }, @@ -2397,139 +2335,78 @@ func TestControl_getMaxPoolSize(t *testing.T) { TotalBytes: 3 * humanize.TByte, AvailBytes: 2 * humanize.TByte, UsableBytes: 2 * humanize.TByte, + NvmeRole: &storage.BdevRoles{}, }, Rank: 5, }, }, }, }, - TgtRanks: []ranklist.Rank{0, 1, 2, 4}, - ExpectedOutput: ExpectedOutput{ - ScmBytes: 50 * humanize.GByte, - NvmeBytes: 700 * humanize.GByte, - }, + tgtRanks: []ranklist.Rank{0, 1, 2, 4}, + expScmBytes: 50 * humanize.GByte, + expNvmeBytes: 700 * humanize.GByte, }, "No NVMe; single server": { - HostsConfigArray: []MockHostStorageConfig{ + hostsConfigArray: []MockHostStorageConfig{ { - HostName: "foo", - ScmConfig: []MockScmConfig{ - { - MockStorageConfig: MockStorageConfig{ - TotalBytes: 100 * humanize.GByte, - AvailBytes: 100 * humanize.GByte, - UsableBytes: 100 * humanize.GByte, - }, - Rank: 0, - }, - }, + HostName: "foo", + ScmConfig: []MockScmConfig{newScmCfg(0)}, NvmeConfig: []MockNvmeConfig{}, }, }, - ExpectedOutput: ExpectedOutput{ - ScmBytes: 100 * humanize.GByte, - NvmeBytes: uint64(0), - }, + expScmBytes: 100 * humanize.GByte, + expNvmeBytes: uint64(0), }, "No NVMe; double server": { - HostsConfigArray: []MockHostStorageConfig{ + hostsConfigArray: []MockHostStorageConfig{ { - HostName: "foo", - ScmConfig: []MockScmConfig{ - { - MockStorageConfig: MockStorageConfig{ - TotalBytes: 100 * humanize.GByte, - AvailBytes: 100 * humanize.GByte, - UsableBytes: 100 * humanize.GByte, - }, - Rank: 0, - }, - }, - NvmeConfig: []MockNvmeConfig{ - { - MockStorageConfig: MockStorageConfig{ - TotalBytes: 1 * humanize.TByte, - AvailBytes: 1 * 
humanize.TByte, - UsableBytes: 1 * humanize.TByte, - }, - Rank: 0, - }, - }, + HostName: "foo", + ScmConfig: []MockScmConfig{newScmCfg(0)}, + NvmeConfig: []MockNvmeConfig{newNvmeCfg(0, 0)}, }, { HostName: "bar", ScmConfig: []MockScmConfig{ + newScmCfg(1, humanize.TByte), + newScmCfg(2, humanize.TByte), + newScmCfg(3, humanize.GByte), { MockStorageConfig: MockStorageConfig{ - TotalBytes: 1 * humanize.TByte, - AvailBytes: 1 * humanize.TByte, - UsableBytes: 1 * humanize.TByte, - }, - Rank: 1, - }, - { - MockStorageConfig: MockStorageConfig{ - TotalBytes: 1 * humanize.TByte, - AvailBytes: 1 * humanize.TByte, - UsableBytes: 1 * humanize.TByte, - }, - Rank: 2, - }, - { - MockStorageConfig: MockStorageConfig{ - TotalBytes: 1 * humanize.GByte, - AvailBytes: 1 * humanize.GByte, - UsableBytes: 1 * humanize.GByte, - }, - Rank: 3, - }, - { - MockStorageConfig: MockStorageConfig{ - TotalBytes: 1 * humanize.TByte, + TotalBytes: humanize.TByte, AvailBytes: 50 * humanize.GByte, UsableBytes: 50 * humanize.GByte, + NvmeRole: &storage.BdevRoles{}, }, Rank: 4, }, - { - MockStorageConfig: MockStorageConfig{ - TotalBytes: 1 * humanize.GByte, - AvailBytes: 1 * humanize.GByte, - UsableBytes: 1 * humanize.GByte, - }, - Rank: 5, - }, + newScmCfg(5, humanize.GByte), }, NvmeConfig: []MockNvmeConfig{ { MockStorageConfig: MockStorageConfig{ - TotalBytes: 1 * humanize.TByte, + TotalBytes: humanize.TByte, AvailBytes: 400 * humanize.GByte, UsableBytes: 400 * humanize.GByte, + NvmeRole: &storage.BdevRoles{}, }, Rank: 2, }, { MockStorageConfig: MockStorageConfig{ - TotalBytes: 1 * humanize.TByte, + TotalBytes: humanize.TByte, AvailBytes: 300 * humanize.GByte, UsableBytes: 300 * humanize.GByte, + NvmeRole: &storage.BdevRoles{}, }, Rank: 2, }, - { - MockStorageConfig: MockStorageConfig{ - TotalBytes: 1 * humanize.GByte, - AvailBytes: 1 * humanize.GByte, - UsableBytes: 1 * humanize.GByte, - }, - Rank: 3, - }, + newNvmeCfg(3, 0, humanize.GByte), { MockStorageConfig: MockStorageConfig{ TotalBytes: 3 * humanize.TByte, AvailBytes: 2 * humanize.TByte, UsableBytes: 2 * humanize.TByte, + NvmeRole: &storage.BdevRoles{}, }, Rank: 4, }, @@ -2538,226 +2415,132 @@ func TestControl_getMaxPoolSize(t *testing.T) { TotalBytes: 3 * humanize.TByte, AvailBytes: 2 * humanize.TByte, UsableBytes: 2 * humanize.TByte, + NvmeRole: &storage.BdevRoles{}, }, Rank: 5, }, }, }, }, - TgtRanks: []ranklist.Rank{0, 1, 2, 4}, - ExpectedOutput: ExpectedOutput{ - ScmBytes: 50 * humanize.GByte, - NvmeBytes: uint64(0), - }, + tgtRanks: []ranklist.Rank{0, 1, 2, 4}, + expScmBytes: 50 * humanize.GByte, + expNvmeBytes: uint64(0), }, "SCM:NVMe ratio": { - HostsConfigArray: []MockHostStorageConfig{ + hostsConfigArray: []MockHostStorageConfig{ { - HostName: "foo", - ScmConfig: []MockScmConfig{ - { - MockStorageConfig: MockStorageConfig{ - TotalBytes: 100 * humanize.GByte, - AvailBytes: 100 * humanize.GByte, - UsableBytes: 100 * humanize.GByte, - }, - Rank: 0, - }, - }, + HostName: "foo", + ScmConfig: []MockScmConfig{newScmCfg(0)}, NvmeConfig: []MockNvmeConfig{ - { - MockStorageConfig: MockStorageConfig{ - TotalBytes: 100 * humanize.TByte, - AvailBytes: 100 * humanize.TByte, - UsableBytes: 100 * humanize.TByte, - }, - Rank: 0, - }, + newNvmeCfg(0, 0, 100*humanize.TByte), }, }, }, - ExpectedOutput: ExpectedOutput{ - ScmBytes: 100 * humanize.GByte, - NvmeBytes: 100 * humanize.TByte, - }, + expScmBytes: 100 * humanize.GByte, + expNvmeBytes: 100 * humanize.TByte, }, - "Invalid response message": { - HostsConfigArray: []MockHostStorageConfig{{}}, - ExpectedOutput: 
ExpectedOutput{ - Error: errors.New("unable to unpack message"), - }, + "invalid response message": { + hostsConfigArray: []MockHostStorageConfig{{}}, + expError: errors.New("unable to unpack message"), }, "empty response": { - HostsConfigArray: []MockHostStorageConfig{}, - ExpectedOutput: ExpectedOutput{ - Error: errors.New("host storage response"), - }, - }, - "query fails": { - HostsConfigArray: []MockHostStorageConfig{}, - ExpectedOutput: ExpectedOutput{ - QueryError: errors.New("query whoops"), - Error: errors.New("query whoops"), - }, + hostsConfigArray: []MockHostStorageConfig{}, + expError: errors.New("host storage response"), }, - "No SCM storage": { - HostsConfigArray: []MockHostStorageConfig{ - { - HostName: "foo", - ScmConfig: []MockScmConfig{}, - NvmeConfig: []MockNvmeConfig{}, - }, - }, - ExpectedOutput: ExpectedOutput{ - Error: errors.New("Host without SCM storage"), - }, - }, - "Engine with two SCM storage": { - HostsConfigArray: []MockHostStorageConfig{ - { - HostName: "foo", - ScmConfig: []MockScmConfig{ - { - MockStorageConfig: MockStorageConfig{ - TotalBytes: 1 * humanize.TByte, - AvailBytes: 1 * humanize.TByte, - UsableBytes: 1 * humanize.TByte, - }, - Rank: 0, - }, - { - MockStorageConfig: MockStorageConfig{ - TotalBytes: 1 * humanize.TByte, - AvailBytes: 1 * humanize.TByte, - UsableBytes: 1 * humanize.TByte, - }, - Rank: 0, - }, - }, - NvmeConfig: []MockNvmeConfig{}, - }, - }, - ExpectedOutput: ExpectedOutput{ - Error: errors.New("Multiple SCM devices found for rank"), - }, - }, - "Unusable NVMe device": { - HostsConfigArray: []MockHostStorageConfig{ - { - HostName: "foo", - ScmConfig: []MockScmConfig{ - { - MockStorageConfig: MockStorageConfig{ - TotalBytes: 100 * humanize.GByte, - AvailBytes: 100 * humanize.GByte, - UsableBytes: 100 * humanize.GByte, - }, - Rank: 0, - }, - }, - NvmeConfig: []MockNvmeConfig{ - { - MockStorageConfig: MockStorageConfig{ - TotalBytes: 1 * humanize.TByte, - AvailBytes: 1 * humanize.TByte, - UsableBytes: 1 * humanize.TByte, - NvmeState: &devStateFaulty, - }, - Rank: 0, - }, - }, - }, - }, - ExpectedOutput: ExpectedOutput{ - Error: errors.New("not usable"), - }, - }, - "New NVMe device": { - HostsConfigArray: []MockHostStorageConfig{ - { - HostName: "foo", - ScmConfig: []MockScmConfig{ - { - MockStorageConfig: MockStorageConfig{ - TotalBytes: 100 * humanize.GByte, - AvailBytes: 100 * humanize.GByte, - UsableBytes: 100 * humanize.GByte, - }, - Rank: 0, - }, - }, - NvmeConfig: []MockNvmeConfig{ - { - MockStorageConfig: MockStorageConfig{ - TotalBytes: 1 * humanize.TByte, - AvailBytes: 1 * humanize.TByte, - UsableBytes: 1 * humanize.TByte, - NvmeState: &devStateNew, - }, - Rank: 0, - }, - }, + "query fails": { + hostsConfigArray: []MockHostStorageConfig{}, + queryError: errors.New("query whoops"), + expError: errors.New("query whoops"), + }, + "no SCM storage": { + hostsConfigArray: []MockHostStorageConfig{ + { + HostName: "foo", + ScmConfig: []MockScmConfig{}, + NvmeConfig: []MockNvmeConfig{}, }, }, - ExpectedOutput: ExpectedOutput{ - ScmBytes: 100 * humanize.GByte, - NvmeBytes: uint64(0), - }, + expError: errors.New("Host without SCM storage"), }, - "Unmounted SCM device": { - HostsConfigArray: []MockHostStorageConfig{ + "engine with two SCM storage": { + hostsConfigArray: []MockHostStorageConfig{ { HostName: "foo", ScmConfig: []MockScmConfig{ + newScmCfg(0, humanize.TByte), + newScmCfg(0, humanize.TByte), + }, + NvmeConfig: []MockNvmeConfig{}, + }, + }, + expError: errors.New("Multiple SCM devices found for rank"), + }, + "unusable NVMe 
device": { + hostsConfigArray: []MockHostStorageConfig{ + { + HostName: "foo", + ScmConfig: []MockScmConfig{newScmCfg(0)}, + NvmeConfig: []MockNvmeConfig{ { MockStorageConfig: MockStorageConfig{ - TotalBytes: 100 * humanize.GByte, - AvailBytes: 100 * humanize.GByte, - UsableBytes: 100 * humanize.GByte, + TotalBytes: humanize.TByte, + AvailBytes: humanize.TByte, + UsableBytes: humanize.TByte, + NvmeState: &devStateFaulty, + NvmeRole: &storage.BdevRoles{}, }, Rank: 0, }, }, + }, + }, + expError: errors.New("not usable"), + }, + "new NVMe device": { + hostsConfigArray: []MockHostStorageConfig{ + { + HostName: "foo", + ScmConfig: []MockScmConfig{newScmCfg(0)}, NvmeConfig: []MockNvmeConfig{ { MockStorageConfig: MockStorageConfig{ - TotalBytes: 1 * humanize.TByte, - AvailBytes: 1 * humanize.TByte, - UsableBytes: 1 * humanize.TByte, + TotalBytes: humanize.TByte, + AvailBytes: humanize.TByte, + UsableBytes: humanize.TByte, + NvmeState: &devStateNew, + NvmeRole: &storage.BdevRoles{}, }, Rank: 0, }, }, }, + }, + expScmBytes: 100 * humanize.GByte, + expNvmeBytes: uint64(0), + }, + "unmounted SCM device": { + hostsConfigArray: []MockHostStorageConfig{ + { + HostName: "foo", + ScmConfig: []MockScmConfig{newScmCfg(0)}, + NvmeConfig: []MockNvmeConfig{newNvmeCfg(0, 0)}, + }, { HostName: "bar[1,3]", ScmConfig: []MockScmConfig{ - { - MockStorageConfig: MockStorageConfig{ - TotalBytes: 1 * humanize.TByte, - AvailBytes: 1 * humanize.TByte, - UsableBytes: 1 * humanize.TByte, - }, - Rank: 1, - }, + newScmCfg(1, humanize.TByte), { MockStorageConfig: MockStorageConfig{ TotalBytes: uint64(0), AvailBytes: uint64(0), UsableBytes: uint64(0), + NvmeRole: &storage.BdevRoles{}, }, }, + newScmCfg(2), { MockStorageConfig: MockStorageConfig{ - TotalBytes: 100 * humanize.GByte, - AvailBytes: 100 * humanize.GByte, - UsableBytes: 100 * humanize.GByte, - }, - Rank: 2, - }, - { - MockStorageConfig: MockStorageConfig{ - TotalBytes: 1 * humanize.TByte, + TotalBytes: humanize.TByte, AvailBytes: 50 * humanize.GByte, UsableBytes: 50 * humanize.GByte, }, @@ -2765,17 +2548,10 @@ func TestControl_getMaxPoolSize(t *testing.T) { }, }, NvmeConfig: []MockNvmeConfig{ + newNvmeCfg(1, 0), { MockStorageConfig: MockStorageConfig{ - TotalBytes: 1 * humanize.TByte, - AvailBytes: 1 * humanize.TByte, - UsableBytes: 1 * humanize.TByte, - }, - Rank: 1, - }, - { - MockStorageConfig: MockStorageConfig{ - TotalBytes: 1 * humanize.TByte, + TotalBytes: humanize.TByte, AvailBytes: 400 * humanize.GByte, UsableBytes: 400 * humanize.GByte, }, @@ -2783,7 +2559,7 @@ func TestControl_getMaxPoolSize(t *testing.T) { }, { MockStorageConfig: MockStorageConfig{ - TotalBytes: 1 * humanize.TByte, + TotalBytes: humanize.TByte, AvailBytes: 300 * humanize.GByte, UsableBytes: 300 * humanize.GByte, }, @@ -2800,70 +2576,28 @@ func TestControl_getMaxPoolSize(t *testing.T) { }, }, }, - ExpectedOutput: ExpectedOutput{ - Error: errors.New("is not mounted"), - }, + expError: errors.New("is not mounted"), }, "SMD without SCM": { - HostsConfigArray: []MockHostStorageConfig{ + hostsConfigArray: []MockHostStorageConfig{ { - HostName: "foo", - ScmConfig: []MockScmConfig{ - { - MockStorageConfig: MockStorageConfig{ - TotalBytes: 100 * humanize.GByte, - AvailBytes: 100 * humanize.GByte, - UsableBytes: 100 * humanize.GByte, - }, - Rank: 0, - }, - }, - NvmeConfig: []MockNvmeConfig{ - { - MockStorageConfig: MockStorageConfig{ - TotalBytes: 1 * humanize.TByte, - AvailBytes: 1 * humanize.TByte, - UsableBytes: 1 * humanize.TByte, - }, - Rank: 1, - }, - }, + HostName: "foo", + ScmConfig: 
[]MockScmConfig{newScmCfg(0)}, + NvmeConfig: []MockNvmeConfig{newNvmeCfg(1, 0)}, }, }, - ExpectedOutput: ExpectedOutput{ - Error: errors.New("without SCM device and at least one SMD device"), - }, + expError: errors.New("without SCM device and at least one SMD device"), }, "no SCM": { - HostsConfigArray: []MockHostStorageConfig{ + hostsConfigArray: []MockHostStorageConfig{ { - HostName: "foo", - ScmConfig: []MockScmConfig{ - { - MockStorageConfig: MockStorageConfig{ - TotalBytes: 100 * humanize.GByte, - AvailBytes: 100 * humanize.GByte, - UsableBytes: 100 * humanize.GByte, - }, - Rank: 0, - }, - }, - NvmeConfig: []MockNvmeConfig{ - { - MockStorageConfig: MockStorageConfig{ - TotalBytes: 1 * humanize.TByte, - AvailBytes: 1 * humanize.TByte, - UsableBytes: 1 * humanize.TByte, - }, - Rank: 0, - }, - }, + HostName: "foo", + ScmConfig: []MockScmConfig{newScmCfg(0)}, + NvmeConfig: []MockNvmeConfig{newNvmeCfg(0, 0)}, }, }, - TgtRanks: []ranklist.Rank{1}, - ExpectedOutput: ExpectedOutput{ - Error: errors.New("No SCM storage space available"), - }, + tgtRanks: []ranklist.Rank{1}, + expError: errors.New("No SCM storage space available"), }, } { t.Run(name, func(t *testing.T) { @@ -2877,7 +2611,7 @@ func TestControl_getMaxPoolSize(t *testing.T) { { Addr: "foo", Message: &mgmtpb.SystemQueryResp{}, - Error: tc.ExpectedOutput.QueryError, + Error: tc.queryError, }, }, }, @@ -2886,7 +2620,8 @@ func TestControl_getMaxPoolSize(t *testing.T) { }, }, } - for _, hostStorageConfig := range tc.HostsConfigArray { + + for _, hostStorageConfig := range tc.hostsConfigArray { var hostResponse *HostResponse if hostStorageConfig.HostName == "" { hostResponse = new(HostResponse) @@ -2904,32 +2639,26 @@ func TestControl_getMaxPoolSize(t *testing.T) { } mockInvoker := NewMockInvoker(log, mockInvokerConfig) - scmBytes, nvmeBytes, err := getMaxPoolSize(test.Context(t), mockInvoker, tc.TgtRanks) + createReq := &PoolCreateReq{Ranks: tc.tgtRanks, MemRatio: tc.memRatio} + scmBytes, nvmeBytes, gotErr := getMaxPoolSize(test.Context(t), mockInvoker, + createReq) - if tc.ExpectedOutput.Error != nil { - test.AssertTrue(t, err != nil, "Expected error") - test.CmpErr(t, tc.ExpectedOutput.Error, err) + test.CmpErr(t, tc.expError, gotErr) + if gotErr != nil { return } - test.AssertTrue(t, err == nil, - fmt.Sprintf("Expected no error: err=%q", err)) - test.AssertEqual(t, - tc.ExpectedOutput.ScmBytes, - scmBytes, - fmt.Sprintf("Invalid SCM pool size: expected=%d got=%d", - tc.ExpectedOutput.ScmBytes, - scmBytes)) + test.AssertEqual(t, tc.expScmBytes, scmBytes, + fmt.Sprintf("Invalid SCM pool size, want %s got %s", + humanize.Bytes(tc.expScmBytes), humanize.Bytes(scmBytes))) - test.AssertEqual(t, - tc.ExpectedOutput.NvmeBytes, - nvmeBytes, - fmt.Sprintf("Invalid NVMe pool size: expected=%d got=%d", - tc.ExpectedOutput.NvmeBytes, - nvmeBytes)) - if tc.ExpectedOutput.Debug != "" { - test.AssertTrue(t, strings.Contains(buf.String(), tc.ExpectedOutput.Debug), - "Missing log message: "+tc.ExpectedOutput.Debug) + test.AssertEqual(t, tc.expNvmeBytes, nvmeBytes, + fmt.Sprintf("Invalid NVMe pool size, want %s got %s", + humanize.Bytes(tc.expNvmeBytes), humanize.Bytes(nvmeBytes))) + + if tc.expDebug != "" { + test.AssertTrue(t, strings.Contains(buf.String(), tc.expDebug), + "Missing log message: "+tc.expDebug) } }) } @@ -2946,135 +2675,59 @@ func (invoker *MockRequestsRecorderInvoker) InvokeUnaryRPC(context context.Conte } func TestControl_PoolCreateAllCmd(t *testing.T) { - type ExpectedOutput struct { - PoolConfig MockPoolRespConfig - WarningMsg 
string - Error error - } - for name, tc := range map[string]struct { - StorageRatio float64 - HostsConfigArray []MockHostStorageConfig - TgtRanks string - ExpectedOutput ExpectedOutput + hostsConfigArray []MockHostStorageConfig + storageRatio float64 + tgtRanks string + expPoolConfig MockPoolRespConfig + expError error + expWarning string }{ "single server": { - StorageRatio: 1, - HostsConfigArray: []MockHostStorageConfig{ + storageRatio: 1, + hostsConfigArray: []MockHostStorageConfig{ { - HostName: "foo", - ScmConfig: []MockScmConfig{ - { - MockStorageConfig: MockStorageConfig{ - TotalBytes: 100 * humanize.GByte, - AvailBytes: 100 * humanize.GByte, - UsableBytes: 100 * humanize.GByte, - }, - Rank: 0, - }, - }, - NvmeConfig: []MockNvmeConfig{ - { - MockStorageConfig: MockStorageConfig{ - TotalBytes: 1 * humanize.TByte, - AvailBytes: 1 * humanize.TByte, - UsableBytes: 1 * humanize.TByte, - }, - Rank: 0, - }, - }, + HostName: "foo", + ScmConfig: []MockScmConfig{newScmCfg(0)}, + NvmeConfig: []MockNvmeConfig{newNvmeCfg(0, 0)}, }, }, - ExpectedOutput: ExpectedOutput{ - PoolConfig: MockPoolRespConfig{ - HostName: "foo", - Ranks: "0", - ScmBytes: 100 * humanize.GByte, - NvmeBytes: 1 * humanize.TByte, - }, + expPoolConfig: MockPoolRespConfig{ + HostName: "foo", + Ranks: "0", + ScmBytes: 100 * humanize.GByte, + NvmeBytes: 1 * humanize.TByte, }, }, "single server 30%": { - StorageRatio: 0.3, - HostsConfigArray: []MockHostStorageConfig{ + storageRatio: 0.3, + hostsConfigArray: []MockHostStorageConfig{ { - HostName: "foo", - ScmConfig: []MockScmConfig{ - { - MockStorageConfig: MockStorageConfig{ - TotalBytes: 100 * humanize.GByte, - AvailBytes: 100 * humanize.GByte, - UsableBytes: 100 * humanize.GByte, - }, - Rank: 0, - }, - }, - NvmeConfig: []MockNvmeConfig{ - { - MockStorageConfig: MockStorageConfig{ - TotalBytes: 1 * humanize.TByte, - AvailBytes: 1 * humanize.TByte, - UsableBytes: 1 * humanize.TByte, - }, - Rank: 0, - }, - }, + HostName: "foo", + ScmConfig: []MockScmConfig{newScmCfg(0)}, + NvmeConfig: []MockNvmeConfig{newNvmeCfg(0, 0)}, }, }, - ExpectedOutput: ExpectedOutput{ - PoolConfig: MockPoolRespConfig{ - HostName: "foo", - Ranks: "0", - ScmBytes: 30 * humanize.GByte, - NvmeBytes: 300 * humanize.GByte, - }, + expPoolConfig: MockPoolRespConfig{ + HostName: "foo", + Ranks: "0", + ScmBytes: 30 * humanize.GByte, + NvmeBytes: 300 * humanize.GByte, }, }, "double server": { - StorageRatio: 1, - HostsConfigArray: []MockHostStorageConfig{ + storageRatio: 1, + hostsConfigArray: []MockHostStorageConfig{ { - HostName: "foo", - ScmConfig: []MockScmConfig{ - { - MockStorageConfig: MockStorageConfig{ - TotalBytes: 100 * humanize.GByte, - AvailBytes: 100 * humanize.GByte, - UsableBytes: 100 * humanize.GByte, - }, - Rank: 0, - }, - }, - NvmeConfig: []MockNvmeConfig{ - { - MockStorageConfig: MockStorageConfig{ - TotalBytes: 1 * humanize.TByte, - AvailBytes: 1 * humanize.TByte, - UsableBytes: 1 * humanize.TByte, - }, - Rank: 0, - }, - }, + HostName: "foo", + ScmConfig: []MockScmConfig{newScmCfg(0)}, + NvmeConfig: []MockNvmeConfig{newNvmeCfg(0, 0)}, }, { HostName: "bar", ScmConfig: []MockScmConfig{ - { - MockStorageConfig: MockStorageConfig{ - TotalBytes: 1 * humanize.TByte, - AvailBytes: 1 * humanize.TByte, - UsableBytes: 1 * humanize.TByte, - }, - Rank: 1, - }, - { - MockStorageConfig: MockStorageConfig{ - TotalBytes: 100 * humanize.GByte, - AvailBytes: 100 * humanize.GByte, - UsableBytes: 100 * humanize.GByte, - }, - Rank: 2, - }, + newScmCfg(1, humanize.TByte), + newScmCfg(2), { MockStorageConfig: 
MockStorageConfig{ TotalBytes: 1 * humanize.TByte, @@ -3085,19 +2738,13 @@ func TestControl_PoolCreateAllCmd(t *testing.T) { }, }, NvmeConfig: []MockNvmeConfig{ - { - MockStorageConfig: MockStorageConfig{ - TotalBytes: 1 * humanize.TByte, - AvailBytes: 1 * humanize.TByte, - UsableBytes: 1 * humanize.TByte, - }, - Rank: 1, - }, + newNvmeCfg(1, 0), { MockStorageConfig: MockStorageConfig{ TotalBytes: 1 * humanize.TByte, AvailBytes: 400 * humanize.GByte, UsableBytes: 400 * humanize.GByte, + NvmeRole: &storage.BdevRoles{}, }, Rank: 2, }, @@ -3106,6 +2753,7 @@ func TestControl_PoolCreateAllCmd(t *testing.T) { TotalBytes: 1 * humanize.TByte, AvailBytes: 300 * humanize.GByte, UsableBytes: 300 * humanize.GByte, + NvmeRole: &storage.BdevRoles{}, }, Rank: 2, }, @@ -3114,66 +2762,33 @@ func TestControl_PoolCreateAllCmd(t *testing.T) { TotalBytes: 3 * humanize.TByte, AvailBytes: 2 * humanize.TByte, UsableBytes: 2 * humanize.TByte, + NvmeRole: &storage.BdevRoles{}, }, Rank: 3, }, }, }, }, - ExpectedOutput: ExpectedOutput{ - PoolConfig: MockPoolRespConfig{ - HostName: "foo", - Ranks: "0,1,2,3", - ScmBytes: 50 * humanize.GByte, - NvmeBytes: 700 * humanize.GByte, - }, + expPoolConfig: MockPoolRespConfig{ + HostName: "foo", + Ranks: "0,1,2,3", + ScmBytes: 50 * humanize.GByte, + NvmeBytes: 700 * humanize.GByte, }, }, - "double server;rank filter": { - StorageRatio: 1, - HostsConfigArray: []MockHostStorageConfig{ + "double server; rank filter": { + storageRatio: 1, + hostsConfigArray: []MockHostStorageConfig{ { - HostName: "foo", - ScmConfig: []MockScmConfig{ - { - MockStorageConfig: MockStorageConfig{ - TotalBytes: 100 * humanize.GByte, - AvailBytes: 100 * humanize.GByte, - UsableBytes: 100 * humanize.GByte, - }, - Rank: 0, - }, - }, - NvmeConfig: []MockNvmeConfig{ - { - MockStorageConfig: MockStorageConfig{ - TotalBytes: 1 * humanize.TByte, - AvailBytes: 1 * humanize.TByte, - UsableBytes: 1 * humanize.TByte, - }, - Rank: 0, - }, - }, + HostName: "foo", + ScmConfig: []MockScmConfig{newScmCfg(0)}, + NvmeConfig: []MockNvmeConfig{newNvmeCfg(0, 0)}, }, { HostName: "bar", ScmConfig: []MockScmConfig{ - { - MockStorageConfig: MockStorageConfig{ - TotalBytes: 1 * humanize.TByte, - AvailBytes: 1 * humanize.TByte, - UsableBytes: 1 * humanize.TByte, - }, - Rank: 1, - }, - { - MockStorageConfig: MockStorageConfig{ - TotalBytes: 100 * humanize.GByte, - AvailBytes: 100 * humanize.GByte, - UsableBytes: 100 * humanize.GByte, - }, - Rank: 2, - }, + newScmCfg(1, humanize.TByte), + newScmCfg(2), { MockStorageConfig: MockStorageConfig{ TotalBytes: 1 * humanize.TByte, @@ -3192,19 +2807,13 @@ func TestControl_PoolCreateAllCmd(t *testing.T) { }, }, NvmeConfig: []MockNvmeConfig{ - { - MockStorageConfig: MockStorageConfig{ - TotalBytes: 1 * humanize.TByte, - AvailBytes: 1 * humanize.TByte, - UsableBytes: 1 * humanize.TByte, - }, - Rank: 1, - }, + newNvmeCfg(1, 0), { MockStorageConfig: MockStorageConfig{ TotalBytes: 1 * humanize.TByte, AvailBytes: 400 * humanize.GByte, UsableBytes: 400 * humanize.GByte, + NvmeRole: &storage.BdevRoles{}, }, Rank: 2, }, @@ -3213,6 +2822,7 @@ func TestControl_PoolCreateAllCmd(t *testing.T) { TotalBytes: 1 * humanize.TByte, AvailBytes: 300 * humanize.GByte, UsableBytes: 300 * humanize.GByte, + NvmeRole: &storage.BdevRoles{}, }, Rank: 2, }, @@ -3221,6 +2831,7 @@ func TestControl_PoolCreateAllCmd(t *testing.T) { TotalBytes: 3 * humanize.TByte, AvailBytes: 2 * humanize.TByte, UsableBytes: 2 * humanize.TByte, + NvmeRole: &storage.BdevRoles{}, }, Rank: 3, }, @@ -3229,90 +2840,60 @@ func 
TestControl_PoolCreateAllCmd(t *testing.T) { TotalBytes: 3 * humanize.TByte, AvailBytes: 1 * humanize.GByte, UsableBytes: 1 * humanize.GByte, + NvmeRole: &storage.BdevRoles{}, }, Rank: 4, }, }, }, }, - TgtRanks: "0,1,2,3", - ExpectedOutput: ExpectedOutput{ - PoolConfig: MockPoolRespConfig{ - HostName: "foo", - Ranks: "0,1,2,3", - ScmBytes: 50 * humanize.GByte, - NvmeBytes: 700 * humanize.GByte, - }, + tgtRanks: "0,1,2,3", + expPoolConfig: MockPoolRespConfig{ + HostName: "foo", + Ranks: "0,1,2,3", + ScmBytes: 50 * humanize.GByte, + NvmeBytes: 700 * humanize.GByte, }, }, "No NVME": { - StorageRatio: 1, - HostsConfigArray: []MockHostStorageConfig{ + storageRatio: 1, + hostsConfigArray: []MockHostStorageConfig{ { - HostName: "foo", - ScmConfig: []MockScmConfig{ - { - MockStorageConfig: MockStorageConfig{ - TotalBytes: 100 * humanize.GByte, - AvailBytes: 100 * humanize.GByte, - UsableBytes: 100 * humanize.GByte, - }, - Rank: 0, - }, - }, + HostName: "foo", + ScmConfig: []MockScmConfig{newScmCfg(0)}, NvmeConfig: []MockNvmeConfig{}, }, }, - ExpectedOutput: ExpectedOutput{ - PoolConfig: MockPoolRespConfig{ - HostName: "foo", - Ranks: "0", - ScmBytes: 100 * humanize.GByte, - NvmeBytes: uint64(0), - }, - WarningMsg: "Creating DAOS pool without NVME storage", + expPoolConfig: MockPoolRespConfig{ + HostName: "foo", + Ranks: "0", + ScmBytes: 100 * humanize.GByte, + NvmeBytes: uint64(0), }, + expWarning: "Creating DAOS pool without NVME storage", }, "SCM:NVME ratio": { - StorageRatio: 1, - HostsConfigArray: []MockHostStorageConfig{ + storageRatio: 1, + hostsConfigArray: []MockHostStorageConfig{ { - HostName: "foo", - ScmConfig: []MockScmConfig{ - { - MockStorageConfig: MockStorageConfig{ - TotalBytes: 100 * humanize.GByte, - AvailBytes: 100 * humanize.GByte, - UsableBytes: 100 * humanize.GByte, - }, - Rank: 0, - }, - }, + HostName: "foo", + ScmConfig: []MockScmConfig{newScmCfg(0)}, NvmeConfig: []MockNvmeConfig{ - { - MockStorageConfig: MockStorageConfig{ - TotalBytes: 100 * humanize.TByte, - AvailBytes: 100 * humanize.TByte, - UsableBytes: 100 * humanize.TByte, - }, - Rank: 0, - }, + newNvmeCfg(0, 0, 100*humanize.TByte), }, }, }, - ExpectedOutput: ExpectedOutput{ - PoolConfig: MockPoolRespConfig{ - HostName: "foo", - Ranks: "0", - ScmBytes: 100 * humanize.GByte, - NvmeBytes: 100 * humanize.TByte, - }, - WarningMsg: "SCM:NVMe ratio is less than", + expPoolConfig: MockPoolRespConfig{ + HostName: "foo", + Ranks: "0", + ScmBytes: 100 * humanize.GByte, + NvmeBytes: 100 * humanize.TByte, }, + expWarning: "SCM:NVMe ratio is less than", }, "single server error 1%": { - StorageRatio: 0.01, - HostsConfigArray: []MockHostStorageConfig{ + storageRatio: 0.01, + hostsConfigArray: []MockHostStorageConfig{ { HostName: "foo", ScmConfig: []MockScmConfig{ @@ -3325,21 +2906,10 @@ func TestControl_PoolCreateAllCmd(t *testing.T) { Rank: 0, }, }, - NvmeConfig: []MockNvmeConfig{ - { - MockStorageConfig: MockStorageConfig{ - TotalBytes: 1 * humanize.TByte, - AvailBytes: 1 * humanize.TByte, - UsableBytes: 1 * humanize.TByte, - }, - Rank: 0, - }, - }, + NvmeConfig: []MockNvmeConfig{newNvmeCfg(0, 0)}, }, }, - ExpectedOutput: ExpectedOutput{ - Error: errors.New("Not enough SCM storage available with ratio 1%"), - }, + expError: errors.New("Not enough SCM storage available with ratio 1%"), }, } { t.Run(name, func(t *testing.T) { @@ -3360,7 +2930,7 @@ func TestControl_PoolCreateAllCmd(t *testing.T) { } unaryResponse := new(UnaryResponse) - for _, hostStorageConfig := range tc.HostsConfigArray { + for _, hostStorageConfig := range 
tc.hostsConfigArray { storageScanResp := MockStorageScanResp(t, hostStorageConfig.ScmConfig, hostStorageConfig.NvmeConfig) @@ -3372,10 +2942,10 @@ func TestControl_PoolCreateAllCmd(t *testing.T) { } mockInvokerConfig.UnaryResponseSet = append(mockInvokerConfig.UnaryResponseSet, unaryResponse) - if tc.ExpectedOutput.PoolConfig.Ranks != "" { - poolCreateResp := MockPoolCreateResp(t, &tc.ExpectedOutput.PoolConfig) + if tc.expPoolConfig.Ranks != "" { + poolCreateResp := MockPoolCreateResp(t, &tc.expPoolConfig) hostResponse := &HostResponse{ - Addr: tc.ExpectedOutput.PoolConfig.HostName, + Addr: tc.expPoolConfig.HostName, Message: poolCreateResp, } unaryResponse = new(UnaryResponse) @@ -3390,15 +2960,15 @@ func TestControl_PoolCreateAllCmd(t *testing.T) { req := &PoolCreateReq{} - if tc.StorageRatio != 0 { - req.TierRatio = []float64{tc.StorageRatio, tc.StorageRatio} + if tc.storageRatio != 0 { + req.TierRatio = []float64{tc.storageRatio, tc.storageRatio} } - if tc.TgtRanks != "" { - req.Ranks = ranklist.RanksFromUint32(mockRanks(tc.TgtRanks)) + if tc.tgtRanks != "" { + req.Ranks = ranklist.RanksFromUint32(mockRanks(tc.tgtRanks)) } _, gotErr := PoolCreate(context.Background(), mockInvoker, req) - test.CmpErr(t, tc.ExpectedOutput.Error, gotErr) + test.CmpErr(t, tc.expError, gotErr) if gotErr != nil { return } @@ -3419,20 +2989,20 @@ func TestControl_PoolCreateAllCmd(t *testing.T) { poolCreateRequest := mockInvoker.Requests[2].(*PoolCreateReq) test.AssertEqual(t, poolCreateRequest.TierBytes[0], - tc.ExpectedOutput.PoolConfig.ScmBytes, + tc.expPoolConfig.ScmBytes, "Invalid size of allocated SCM") test.AssertEqual(t, poolCreateRequest.TierBytes[1], - tc.ExpectedOutput.PoolConfig.NvmeBytes, + tc.expPoolConfig.NvmeBytes, "Invalid size of allocated NVME") test.AssertEqual(t, poolCreateRequest.TotalBytes, uint64(0), "Invalid size of TotalBytes attribute: disabled with manual allocation") - if tc.TgtRanks != "" { + if tc.tgtRanks != "" { test.AssertEqual(t, ranklist.RankList(poolCreateRequest.Ranks).String(), - tc.ExpectedOutput.PoolConfig.Ranks, + tc.expPoolConfig.Ranks, "Invalid list of Ranks") } else { test.AssertEqual(t, diff --git a/src/control/lib/control/storage.go b/src/control/lib/control/storage.go index 9d5fe470de6..fb649a06ce1 100644 --- a/src/control/lib/control/storage.go +++ b/src/control/lib/control/storage.go @@ -160,6 +160,7 @@ type ( Usage bool NvmeHealth bool NvmeBasic bool + MemRatio float32 } // StorageScanResp contains the response from a storage scan request. @@ -256,8 +257,9 @@ func StorageScan(ctx context.Context, rpcClient UnaryInvoker, req *StorageScanRe Nvme: &ctlpb.ScanNvmeReq{ Basic: req.NvmeBasic, // Health and meta details required to populate usage statistics. - Health: req.NvmeHealth || req.Usage, - Meta: req.Usage, + Health: req.NvmeHealth || req.Usage, + Meta: req.Usage, + MemRatio: req.MemRatio, // Only request link stats if health explicitly requested. 
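The refactored test cases above lean on two small constructors, newScmCfg and newNvmeCfg, that are defined elsewhere in the test file and not shown in this diff. A minimal sketch of what they might look like, with signatures, defaults (100 GByte SCM, 1 TByte NVMe) and field types inferred only from the call sites here; the real helpers may differ:

// Hypothetical sketches of the test helpers used above; not taken from the source.
func newScmCfg(rank ranklist.Rank, size ...uint64) MockScmConfig {
	sz := uint64(100 * humanize.GByte) // default seen at call sites without a size
	if len(size) > 0 {
		sz = size[0]
	}
	return MockScmConfig{
		MockStorageConfig: MockStorageConfig{
			TotalBytes:  sz,
			AvailBytes:  sz,
			UsableBytes: sz,
			NvmeRole:    &storage.BdevRoles{},
		},
		Rank: rank,
	}
}

func newNvmeCfg(rank ranklist.Rank, roles uint32, size ...uint64) MockNvmeConfig {
	// roles is always zero in the calls above; the real helper presumably maps it
	// onto bdev role bits, so this sketch simply leaves NvmeRole empty.
	sz := uint64(humanize.TByte) // default seen at call sites without a size
	if len(size) > 0 {
		sz = size[0]
	}
	return MockNvmeConfig{
		MockStorageConfig: MockStorageConfig{
			TotalBytes:  sz,
			AvailBytes:  sz,
			UsableBytes: sz,
			NvmeRole:    &storage.BdevRoles{},
		},
		Rank: rank,
	}
}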
LinkStats: req.NvmeHealth, }, diff --git a/src/control/lib/daos/pool.go b/src/control/lib/daos/pool.go index e47e6e2b23d..0792c46c4ea 100644 --- a/src/control/lib/daos/pool.go +++ b/src/control/lib/daos/pool.go @@ -79,6 +79,7 @@ type ( DisabledRanks *ranklist.RankSet `json:"disabled_ranks,omitempty"` PoolLayoutVer uint32 `json:"pool_layout_ver"` UpgradeLayoutVer uint32 `json:"upgrade_layout_ver"` + MemFileBytes uint64 `json:"mem_file_bytes"` } PoolQueryTargetType int32 @@ -86,9 +87,10 @@ type ( // PoolQueryTargetInfo contains information about a single target PoolQueryTargetInfo struct { - Type PoolQueryTargetType `json:"target_type"` - State PoolQueryTargetState `json:"target_state"` - Space []*StorageUsageStats `json:"space"` + Type PoolQueryTargetType `json:"target_type"` + State PoolQueryTargetState `json:"target_state"` + Space []*StorageUsageStats `json:"space"` + MemFileBytes uint64 `json:"mem_file_bytes"` } // StorageTargetUsage represents DAOS target storage usage @@ -351,6 +353,8 @@ const ( StorageMediaTypeScm = StorageMediaType(mgmtpb.StorageMediaType_SCM) // StorageMediaTypeNvme indicates that the media is NVMe SSD StorageMediaTypeNvme = StorageMediaType(mgmtpb.StorageMediaType_NVME) + // StorageMediaTypeMax indicates the end of the StorageMediaType array + StorageMediaTypeMax = StorageMediaType(StorageMediaTypeNvme + 1) ) func (smt StorageMediaType) String() string { diff --git a/src/control/server/ctl_storage_rpc.go b/src/control/server/ctl_storage_rpc.go index 71339918876..90a46495ae0 100644 --- a/src/control/server/ctl_storage_rpc.go +++ b/src/control/server/ctl_storage_rpc.go @@ -70,9 +70,8 @@ func newResponseState(inErr error, badStatus ctlpb.ResponseStatus, infoMsg strin // Package-local function variables for mocking in unit tests. var ( - scanBdevs = bdevScan // StorageScan() unit tests - scanEngineBdevs = bdevScanEngine // bdevScan() unit tests - computeMetaRdbSz = metaRdbComputeSz // TODO unit tests + scanBdevs = bdevScan // StorageScan() unit tests + scanEngineBdevs = bdevScanEngine // bdevScan() unit tests ) type scanBdevsFn func(storage.BdevScanRequest) (*storage.BdevScanResponse, error) @@ -161,7 +160,7 @@ func bdevScanEngines(ctx context.Context, cs *ControlService, req *ctlpb.ScanNvm eReq := new(ctlpb.ScanNvmeReq) *eReq = *req if req.Meta { - ms, rs, err := computeMetaRdbSz(cs, engine, nsps) + ms, rs, err := metaRdbComputeSz(cs, engine, nsps, req.MemRatio) if err != nil { return nil, errors.Wrap(err, "computing meta and rdb size") } @@ -169,7 +168,7 @@ func bdevScanEngines(ctx context.Context, cs *ControlService, req *ctlpb.ScanNvm } // If partial number of engines return results, indicate errors for non-ready - // engines whilst returning successful scanmresults. + // engines whilst returning successful scan results. respEng, err := scanEngineBdevs(ctx, engine, eReq) if err != nil { err = errors.Wrapf(err, "instance %d", engine.Index()) @@ -287,7 +286,7 @@ func bdevScan(ctx context.Context, cs *ControlService, req *ctlpb.ScanNvmeReq, n // Retry once if harness scan returns unexpected number of controllers in case engines // claimed devices between when started state was checked and scan was executed. 
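MemRatio now travels end to end: the client sets it on StorageScanReq, StorageScan copies it into ctlpb.ScanNvmeReq (previous hunk), and bdevScanEngines above forwards it to metaRdbComputeSz so the per-engine MetaSize and RdbSize reflect the ratio. A hedged client-side sketch; the function name is invented for illustration, ctx handling and the 0.5 value are assumptions, and only the request fields come from this diff:

// Request usage statistics with an MD-on-SSD phase-2 memory ratio (sketch only).
func scanWithMemRatio(ctx context.Context, rpcClient control.UnaryInvoker) (*control.StorageScanResp, error) {
	return control.StorageScan(ctx, rpcClient, &control.StorageScanReq{
		Usage:    true, // Meta/Health flags on the proto request are derived from Usage
		MemRatio: 0.5,  // projected meta blob = VOS-index size / 0.5
	})
}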
if !hasStarted { - cs.log.Debugf("retrying harness bdev scan as unexpected nr returned, want %d got %d", + cs.log.Debugf("retrying harness bdev scan as unexpected nr ctrlrs returned, want %d got %d", nrCfgBdevs, nrScannedBdevs) resp, err = bdevScanAssigned(ctx, cs, req, nsps, &hasStarted, bdevCfgs) @@ -304,7 +303,7 @@ func bdevScan(ctx context.Context, cs *ControlService, req *ctlpb.ScanNvmeReq, n } } - cs.log.Noticef("harness bdev scan returned unexpected nr, want %d got %d", nrCfgBdevs, + cs.log.Noticef("harness bdev scan returned unexpected nr ctrlrs, want %d got %d", nrCfgBdevs, nrScannedBdevs) return bdevScanTrimResults(req, resp), nil @@ -418,39 +417,65 @@ func (cs *ControlService) getRdbSize(engineCfg *engine.Config) (uint64, error) { // Compute the maximal size of the metadata to allow the engine to fill the WallMeta field // response. The maximal metadata (i.e. VOS index file) size should be equal to the SCM available -// size divided by the number of targets of the engine. -func metaRdbComputeSz(cs *ControlService, ei Engine, nsps []*ctlpb.ScmNamespace) (md_size, rdb_size uint64, errOut error) { +// size divided by the number of targets of the engine. Sizes returned are per-target values. +func metaRdbComputeSz(cs *ControlService, ei Engine, nsps []*ctlpb.ScmNamespace, memRatio float32) (uint64, uint64, error) { + msg := fmt.Sprintf("computing meta/rdb sizes with %d scm namespaces", len(nsps)) + + var metaBytes, rdbBytes uint64 for _, nsp := range nsps { + msg += fmt.Sprintf(", scm-ns: %+v", nsp) + mp := nsp.GetMount() if mp == nil { + cs.log.Tracef("%s: skip (no mount)", msg) + continue + } + msg += fmt.Sprintf(", mount: %+v", mp) + + r, err := ei.GetRank() + if err != nil { + cs.log.Tracef("%s: skip (get rank err: %s)", msg, err.Error()) continue } - if r, err := ei.GetRank(); err != nil || uint32(r) != mp.GetRank() { + if uint32(r) != mp.Rank { + cs.log.Tracef("%s: skip (wrong rank, want %d got %d)", msg, r, mp.Rank) continue } + msg += fmt.Sprintf(", rank %d", r) - // NOTE DAOS-14223: This metadata size calculation won't necessarily match - // the meta blob size on SSD if --meta-size is specified in - // pool create command. - md_size = mp.GetUsableBytes() / uint64(ei.GetTargetCount()) + if ei.GetTargetCount() == 0 { + return 0, 0, errors.Errorf("%s: engine with zero tgts is invalid", msg) + } + metaBytes = mp.GetUsableBytes() / uint64(ei.GetTargetCount()) + + // Divide VOS index file size by memRatio fraction, if nonzero, to project the + // effective meta-blob size. In MD-on-SSD phase-2, meta-blob > VOS-file size. + if memRatio > 0 { + msg += fmt.Sprintf(", using %.2f mem-ratio", memRatio) + metaBytes = uint64(float64(metaBytes) / float64(memRatio)) + } engineCfg, err := cs.getEngineCfgFromScmNsp(nsp) if err != nil { - errOut = errors.Wrap(err, "Engine with invalid configuration") - return + return 0, 0, errors.Wrapf(err, "%s: engine with invalid configuration", msg) } - rdb_size, errOut = cs.getRdbSize(engineCfg) - if errOut != nil { - return + rdbBytes, err = cs.getRdbSize(engineCfg) + if err != nil { + return 0, 0, errors.Wrapf(err, "%s: get rdb size with engine cfg %+v", msg, + engineCfg) } - break + + break // Just use first namespace. 
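The per-target VOS-index size computed just above is the SCM mount's usable bytes divided by the engine target count, then divided by memRatio (when non-zero) to project the on-SSD meta-blob size. With the defaults used by TestServer_bdevScan later in this diff (10 GiB usable, 16 targets) that is 640 MiB per target, or 1280 MiB once a 0.5 mem-ratio is applied. A minimal restatement of the same arithmetic, for illustration only:

// Mirrors the metaBytes calculation above; not a copy of the real function.
func projectedMetaBytes(usableBytes uint64, tgtCount int, memRatio float32) uint64 {
	meta := usableBytes / uint64(tgtCount)
	if memRatio > 0 {
		meta = uint64(float64(meta) / float64(memRatio))
	}
	return meta
}

// projectedMetaBytes(10*humanize.GiByte, 16, 0)   == 640 * humanize.MiByte
// projectedMetaBytes(10*humanize.GiByte, 16, 0.5) == 1280 * humanize.MiByte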
} - if md_size == 0 { + if metaBytes == 0 { cs.log.Noticef("instance %d: no SCM space available for metadata", ei.Index) + rdbBytes = 0 } + cs.log.Tracef("%s: computed meta sz %s and rdb sz %s", msg, humanize.IBytes(metaBytes), + humanize.IBytes(rdbBytes)) - return + return metaBytes, rdbBytes, nil } type deviceToAdjust struct { @@ -464,8 +489,20 @@ type deviceSizeStat struct { devs []*deviceToAdjust } +// Dedupe and remove sysXS target ID from slice before counting IDs. See +// storage.SmdDevice.UnmarshalJSON() for tgtID sanitization. +func getSmdTgtCount(log logging.Logger, sd *ctlpb.SmdDevice) int { + var sdOut storage.SmdDevice + if err := convert.Types(sd, &sdOut); err != nil { + log.Errorf("could not retrieve target count for smd %s", sd.GetUuid()) + return 0 + } + + return len(sdOut.TargetIDs) +} + // Add a device to the input map of device to which the usable size have to be adjusted -func (cs *ControlService) addDeviceToAdjust(devsStat map[uint32]*deviceSizeStat, devToAdjust *deviceToAdjust, dataClusterCount uint64) { +func (cs *ControlService) addDeviceToAdjust(devsStat map[uint32]*deviceSizeStat, devToAdjust *deviceToAdjust, dataClusterCount uint64, devTgtCount int) { dev := devToAdjust.ctlr.GetSmdDevices()[devToAdjust.idx] if devsStat[devToAdjust.rank] == nil { devsStat[devToAdjust.rank] = &deviceSizeStat{ @@ -473,44 +510,45 @@ func (cs *ControlService) addDeviceToAdjust(devsStat map[uint32]*deviceSizeStat, } } devsStat[devToAdjust.rank].devs = append(devsStat[devToAdjust.rank].devs, devToAdjust) - targetCount := uint64(len(dev.GetTgtIds())) - clusterPerTarget := dataClusterCount / targetCount + clusterPerTarget := dataClusterCount / uint64(devTgtCount) cs.log.Tracef("SMD device %s (rank %d, ctlr %s) added to the list of device to adjust", dev.GetUuid(), devToAdjust.rank, devToAdjust.ctlr.GetPciAddr()) if clusterPerTarget < devsStat[devToAdjust.rank].clusterPerTarget { - cs.log.Tracef("Updating number of clusters per target of rank %d: old=%d new=%d", - devToAdjust.rank, devsStat[devToAdjust.rank].clusterPerTarget, clusterPerTarget) + cs.log.Tracef("Updating number of clusters per target (%d/%d) of rank %d: old=%d "+ + "new=%d", dataClusterCount, devTgtCount, devToAdjust.rank, + devsStat[devToAdjust.rank].clusterPerTarget, clusterPerTarget) devsStat[devToAdjust.rank].clusterPerTarget = clusterPerTarget } } // For a given size in bytes, returns the total number of SPDK clusters needed for a given number of targets -func getClusterCount(sizeBytes uint64, targetNb uint64, clusterSize uint64) uint64 { +func getClusterCount(sizeBytes uint64, tgtCount int, clusterSize uint64) uint64 { clusterCount := sizeBytes / clusterSize if sizeBytes%clusterSize != 0 { clusterCount += 1 } - return clusterCount * targetNb + + return clusterCount * uint64(tgtCount) } func (cs *ControlService) getMetaClusterCount(engineCfg *engine.Config, devToAdjust deviceToAdjust) (subtrClusterCount uint64) { dev := devToAdjust.ctlr.GetSmdDevices()[devToAdjust.idx] clusterSize := uint64(dev.GetClusterSize()) - engineTargetNb := uint64(engineCfg.TargetCount) + // Calculate MD cluster overhead based on the number of targets allocated to the device + // as per-target blobs will be striped across all of a given role's SSDs. + devTgtCount := getSmdTgtCount(cs.log, dev) if dev.GetRoleBits()&storage.BdevRoleMeta != 0 { - // TODO DAOS-14223: GetMetaSize() should reflect custom values set through pool - // create --meta-size option. 
- clusterCount := getClusterCount(dev.GetMetaSize(), engineTargetNb, clusterSize) - cs.log.Tracef("Removing %d Metadata clusters (cluster size: %d) from the usable size of the SMD device %s (rank %d, ctlr %s): ", - clusterCount, clusterSize, dev.GetUuid(), devToAdjust.rank, devToAdjust.ctlr.GetPciAddr()) + clusterCount := getClusterCount(dev.GetMetaSize(), devTgtCount, clusterSize) + cs.log.Tracef("Removing %d Metadata clusters (cluster size: %d, dev tgts: %d) from the usable size of the SMD device %s (rank %d, ctlr %s): ", + clusterCount, clusterSize, devTgtCount, dev.GetUuid(), devToAdjust.rank, devToAdjust.ctlr.GetPciAddr()) subtrClusterCount += clusterCount } if dev.GetRoleBits()&storage.BdevRoleWAL != 0 { - clusterCount := getClusterCount(dev.GetMetaWalSize(), engineTargetNb, clusterSize) - cs.log.Tracef("Removing %d Metadata WAL clusters (cluster size: %d) from the usable size of the SMD device %s (rank %d, ctlr %s): ", - clusterCount, clusterSize, dev.GetUuid(), devToAdjust.rank, devToAdjust.ctlr.GetPciAddr()) + clusterCount := getClusterCount(dev.GetMetaWalSize(), devTgtCount, clusterSize) + cs.log.Tracef("Removing %d Metadata WAL clusters (cluster size: %d, dev tgts: %d) from the usable size of the SMD device %s (rank %d, ctlr %s): ", + clusterCount, clusterSize, devTgtCount, dev.GetUuid(), devToAdjust.rank, devToAdjust.ctlr.GetPciAddr()) subtrClusterCount += clusterCount } @@ -520,7 +558,7 @@ func (cs *ControlService) getMetaClusterCount(engineCfg *engine.Config, devToAdj if dev.GetRoleBits()&storage.BdevRoleMeta != 0 { clusterCount := getClusterCount(dev.GetRdbSize(), 1, clusterSize) - cs.log.Tracef("Removing %d RDB clusters (cluster size: %d) the usable size of the SMD device %s (rank %d, ctlr %s)", + cs.log.Tracef("Removing %d RDB clusters (cluster size: %d) from the usable size of the SMD device %s (rank %d, ctlr %s)", clusterCount, clusterSize, dev.GetUuid(), devToAdjust.rank, devToAdjust.ctlr.GetPciAddr()) subtrClusterCount += clusterCount } @@ -535,7 +573,8 @@ func (cs *ControlService) getMetaClusterCount(engineCfg *engine.Config, devToAdj return } -// Adjust the NVME available size to its real usable size. +// Estimate the NVME size available to store pool data after metadata overheads have been +// accounted for. 
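As a concrete illustration of the overhead this adjustment subtracts, take the SMD device mocked in TestServer_bdevScan later in this diff: 32 MiB clusters, four targets, a 640 MiB per-target meta size, and 1 GiB WAL and RDB sizes (the RDB WAL subtraction is assumed to behave the same way in the part of this function not shown here). Rounding each size up to whole clusters, as getClusterCount does, gives 20*4 = 80 meta, 32*4 = 128 WAL and 32 + 32 = 64 RDB clusters, the 272 clusters quoted in the test expectations. A sketch of that arithmetic with assumed example names:

// clusterCount mirrors getClusterCount above: round the size up to whole
// clusters, then multiply by the number of targets striping over the device.
func clusterCount(sizeBytes uint64, tgtCount int, clusterSize uint64) uint64 {
	n := sizeBytes / clusterSize
	if sizeBytes%clusterSize != 0 {
		n++
	}
	return n * uint64(tgtCount)
}

const exampleClusterSize = 32 * humanize.MiByte

var exampleOverhead = clusterCount(640*humanize.MiByte, 4, exampleClusterSize) + // 80 meta
	clusterCount(humanize.GiByte, 4, exampleClusterSize) + // 128 WAL
	clusterCount(humanize.GiByte, 1, exampleClusterSize) + // 32 RDB
	clusterCount(humanize.GiByte, 1, exampleClusterSize) // 32 RDB WAL (assumed)

// exampleOverhead == 272, so a 32 GiB device (1024 clusters) is left with
// (1024-272)*32 MiB of usable data space, matching the test expectation.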
func (cs *ControlService) adjustNvmeSize(resp *ctlpb.ScanNvmeResp) { devsStat := make(map[uint32]*deviceSizeStat, 0) for _, ctlr := range resp.GetCtrlrs() { @@ -547,6 +586,7 @@ func (cs *ControlService) adjustNvmeSize(resp *ctlpb.ScanNvmeResp) { for idx, dev := range ctlr.GetSmdDevices() { rank := dev.GetRank() + devTgtCount := getSmdTgtCount(cs.log, dev) if dev.GetRoleBits() != 0 && (dev.GetRoleBits()&storage.BdevRoleData) == 0 { cs.log.Debugf("SMD device %s (rank %d, ctlr %s) not used to store data (Role bits 0x%X)", @@ -565,7 +605,7 @@ func (cs *ControlService) adjustNvmeSize(resp *ctlpb.ScanNvmeResp) { continue } - if dev.GetClusterSize() == 0 || len(dev.GetTgtIds()) == 0 { + if dev.GetClusterSize() == 0 || devTgtCount == 0 { cs.log.Noticef("SMD device %s (rank %d, ctlr %s) not usable: missing storage info", dev.GetUuid(), rank, ctlr.GetPciAddr()) dev.AvailBytes = 0 @@ -579,7 +619,7 @@ func (cs *ControlService) adjustNvmeSize(resp *ctlpb.ScanNvmeResp) { clusterSize := uint64(dev.GetClusterSize()) availBytes := (dev.GetAvailBytes() / clusterSize) * clusterSize if dev.GetAvailBytes() != availBytes { - cs.log.Tracef("Adjusting available size of SMD device %s (rank %d, ctlr %s): from %s (%d Bytes) to %s (%d bytes)", + cs.log.Tracef("Rounding available size of SMD device %s based on cluster size (rank %d, ctlr %s): from %s (%d Bytes) to %s (%d bytes)", dev.GetUuid(), rank, ctlr.GetPciAddr(), humanize.Bytes(dev.GetAvailBytes()), dev.GetAvailBytes(), humanize.Bytes(availBytes), availBytes) @@ -595,7 +635,8 @@ func (cs *ControlService) adjustNvmeSize(resp *ctlpb.ScanNvmeResp) { if dev.GetRoleBits() == 0 { cs.log.Tracef("No meta-data stored on SMD device %s (rank %d, ctlr %s)", dev.GetUuid(), rank, ctlr.GetPciAddr()) - cs.addDeviceToAdjust(devsStat, &devToAdjust, dataClusterCount) + cs.addDeviceToAdjust(devsStat, &devToAdjust, dataClusterCount, + devTgtCount) continue } @@ -606,17 +647,19 @@ func (cs *ControlService) adjustNvmeSize(resp *ctlpb.ScanNvmeResp) { dev.UsableBytes = 0 continue } + cs.log.Tracef("Removing %d metadata clusters from %d total", + subtrClusterCount, dataClusterCount) dataClusterCount -= subtrClusterCount - cs.addDeviceToAdjust(devsStat, &devToAdjust, dataClusterCount) + cs.addDeviceToAdjust(devsStat, &devToAdjust, dataClusterCount, devTgtCount) } } for rank, item := range devsStat { for _, dev := range item.devs { smdDev := dev.ctlr.GetSmdDevices()[dev.idx] - targetCount := uint64(len(smdDev.GetTgtIds())) - smdDev.UsableBytes = targetCount * item.clusterPerTarget * smdDev.GetClusterSize() - cs.log.Debugf("Defining usable size of the SMD device %s (rank %d, ctlr %s) to %s (%d bytes)", + clusters := uint64(getSmdTgtCount(cs.log, smdDev)) * item.clusterPerTarget + smdDev.UsableBytes = clusters * smdDev.GetClusterSize() + cs.log.Debugf("Defining usable size of the SMD device %s (rank %d, ctlr %s) as %s (%d bytes)", smdDev.GetUuid(), rank, dev.ctlr.GetPciAddr(), humanize.Bytes(smdDev.GetUsableBytes()), smdDev.GetUsableBytes()) } @@ -680,13 +723,10 @@ func (cs *ControlService) adjustScmSize(resp *ctlpb.ScanScmResp) { } cmdPath := engineCfg.Storage.ControlMetadata.Path - if hasPrefix, err := common.HasPrefixPath(mountPath, cmdPath); hasPrefix || err != nil { - if err != nil { - cs.log.Noticef("Invalid SCM mount path or Control Metadata path: %q", err.Error()) - } - if hasPrefix { - removeControlPlaneMetadata(mnt) - } + if hasPrefix, err := common.HasPrefixPath(mountPath, cmdPath); err != nil { + cs.log.Noticef("Invalid SCM mount path or Control Metadata path: %q", err.Error()) + 
} else if hasPrefix { + removeControlPlaneMetadata(mnt) } } diff --git a/src/control/server/ctl_storage_rpc_test.go b/src/control/server/ctl_storage_rpc_test.go index bf2d7ee43b5..c1b795b8551 100644 --- a/src/control/server/ctl_storage_rpc_test.go +++ b/src/control/server/ctl_storage_rpc_test.go @@ -58,18 +58,44 @@ var ( ) func TestServer_bdevScan(t *testing.T) { + defTgtCount := 16 + defScmMountPt := "/mnt/daos0" + defScmDev := "/dev/pmem0" + defMountAvail := uint64(12) * humanize.GiByte + defMountUsable := uint64(10) * humanize.GiByte + defMetaSize := defMountUsable / uint64(defTgtCount) + defRdbSize := uint64(humanize.GiByte) + + mockSmd := func(roles uint32) *ctlpb.SmdDevice { + return &ctlpb.SmdDevice{ + Rank: uint32(0), + TgtIds: []int32{1, 2, 3, 4}, + // Avoid rounding + AvailBytes: 32 * humanize.GiByte, + ClusterSize: 32 * humanize.MiByte, + RoleBits: roles, + MetaSize: defMetaSize, + MetaWalSize: humanize.GiByte, + RdbSize: uint64(defRdbSize), + RdbWalSize: humanize.GiByte, + } + } + for name, tc := range map[string]struct { req *ctlpb.ScanNvmeReq disableHPs bool provRes *storage.BdevScanResponse provErr error + engTgtCount int engTierCfgs []storage.TierConfigs // one per-engine engStopped []bool // one per-engine (all false if unset) + scmNamespaces []*ctlpb.ScmNamespace // one per-engine engRes []ctlpb.ScanNvmeResp // one per-engine engErr []error // one per-engine expResp *ctlpb.ScanNvmeResp expErr error expBackendScanCalls []storage.BdevScanRequest + expRemoteScanCalls []*ctlpb.ScanNvmeReq }{ "nil request": { expErr: errNilReq, @@ -257,10 +283,77 @@ func TestServer_bdevScan(t *testing.T) { }, }, }, + "scan remote; bdevs in config; missing mount in config": { + req: &ctlpb.ScanNvmeReq{Health: true, Meta: true}, + engTierCfgs: []storage.TierConfigs{ + { + storage.NewTierConfig(). + WithStorageClass(storage.ClassNvme.String()). + WithBdevDeviceList(test.MockPCIAddr(1), + test.MockPCIAddr(2)), + }, + }, + engStopped: []bool{false}, + engErr: []error{nil}, + expErr: errors.New("unknown SCM mount point"), + }, + "scan remote; bdevs in config; adjustment skipped as no meta flag in req": { + req: &ctlpb.ScanNvmeReq{Health: true}, + engTierCfgs: []storage.TierConfigs{ + { + storage.NewTierConfig(). + WithStorageClass(storage.ClassNvme.String()). + WithBdevDeviceList(test.MockPCIAddr(1), + test.MockPCIAddr(2)), + }, + }, + engStopped: []bool{false}, + engErr: []error{nil}, + expResp: &ctlpb.ScanNvmeResp{ + Ctrlrs: proto.NvmeControllers{ + proto.MockNvmeController(2), + }, + State: new(ctlpb.ResponseState), + }, + expRemoteScanCalls: []*ctlpb.ScanNvmeReq{ + {Health: true}, + }, + }, + "scan remote; bdevs in config; zero namespaces": { + req: &ctlpb.ScanNvmeReq{Health: true, Meta: true}, + scmNamespaces: []*ctlpb.ScmNamespace{}, + engTierCfgs: []storage.TierConfigs{ + { + storage.NewTierConfig(). + WithStorageClass(storage.ClassDcpm.String()). + WithScmDeviceList(defScmDev). + WithScmMountPoint(defScmMountPt), + storage.NewTierConfig(). + WithStorageClass(storage.ClassNvme.String()). + WithBdevDeviceList(test.MockPCIAddr(1), + test.MockPCIAddr(2)), + }, + }, + engStopped: []bool{false}, + engErr: []error{nil}, + expResp: &ctlpb.ScanNvmeResp{ + Ctrlrs: proto.NvmeControllers{ + proto.MockNvmeController(2), + }, + State: new(ctlpb.ResponseState), + }, + expRemoteScanCalls: []*ctlpb.ScanNvmeReq{ + {Health: true, Meta: true}, + }, + }, "scan remote; bdevs in config": { req: &ctlpb.ScanNvmeReq{Health: true, Meta: true}, engTierCfgs: []storage.TierConfigs{ { + storage.NewTierConfig(). 
+ WithStorageClass(storage.ClassDcpm.String()). + WithScmDeviceList(defScmDev). + WithScmMountPoint(defScmMountPt), storage.NewTierConfig(). WithStorageClass(storage.ClassNvme.String()). WithBdevDeviceList(test.MockPCIAddr(1), @@ -275,23 +368,454 @@ func TestServer_bdevScan(t *testing.T) { }, State: new(ctlpb.ResponseState), }, + expRemoteScanCalls: []*ctlpb.ScanNvmeReq{ + {Health: true, Meta: true, MetaSize: defMetaSize, RdbSize: defRdbSize}, + }, + }, + "scan remote; bdev with md-on-ssd roles in config; no request flags; adjustments skipped": { + req: &ctlpb.ScanNvmeReq{}, + engTierCfgs: []storage.TierConfigs{ + { + storage.NewTierConfig(). + WithStorageClass(storage.ClassRam.String()). + WithScmMountPoint(defScmMountPt), + storage.NewTierConfig(). + WithStorageClass(storage.ClassNvme.String()). + WithBdevDeviceList(test.MockPCIAddr(1)). + WithBdevDeviceRoles(storage.BdevRoleAll), + }, + }, + engStopped: []bool{false}, + engErr: []error{nil}, + engRes: []ctlpb.ScanNvmeResp{ + ctlpb.ScanNvmeResp{ + Ctrlrs: proto.NvmeControllers{ + func() *ctlpb.NvmeController { + nc := proto.MockNvmeController(1) + nc.SmdDevices = []*ctlpb.SmdDevice{ + mockSmd(storage.BdevRoleAll), + } + return nc + }(), + }, + }, + }, + expResp: &ctlpb.ScanNvmeResp{ + Ctrlrs: proto.NvmeControllers{ + func() *ctlpb.NvmeController { + nc := proto.MockNvmeController(1) + nc.HealthStats = nil + nc.SmdDevices = []*ctlpb.SmdDevice{ + mockSmd(storage.BdevRoleAll), + } + return nc + }(), + }, + State: new(ctlpb.ResponseState), + }, + expRemoteScanCalls: []*ctlpb.ScanNvmeReq{{}}, + }, + "scan remote; bdev with md-on-ssd roles in config; no meta flag": { + req: &ctlpb.ScanNvmeReq{Health: true}, + engTierCfgs: []storage.TierConfigs{ + { + storage.NewTierConfig(). + WithStorageClass(storage.ClassRam.String()). + WithScmMountPoint(defScmMountPt), + storage.NewTierConfig(). + WithStorageClass(storage.ClassNvme.String()). + WithBdevDeviceList(test.MockPCIAddr(1)). + WithBdevDeviceRoles(storage.BdevRoleAll), + }, + }, + engStopped: []bool{false}, + engErr: []error{nil}, + engRes: []ctlpb.ScanNvmeResp{ + ctlpb.ScanNvmeResp{ + Ctrlrs: proto.NvmeControllers{ + func() *ctlpb.NvmeController { + nc := proto.MockNvmeController(1) + nc.SmdDevices = []*ctlpb.SmdDevice{ + mockSmd(storage.BdevRoleAll), + } + return nc + }(), + }, + }, + }, + expResp: &ctlpb.ScanNvmeResp{ + Ctrlrs: proto.NvmeControllers{ + func() *ctlpb.NvmeController { + nc := proto.MockNvmeController(1) + nc.SmdDevices = []*ctlpb.SmdDevice{ + mockSmd(storage.BdevRoleAll), + } + return nc + }(), + }, + State: new(ctlpb.ResponseState), + }, + expRemoteScanCalls: []*ctlpb.ScanNvmeReq{ + {Health: true}, + }, + }, + "scan remote; bdev with md-on-ssd roles in config; no md info in smd devs": { + req: &ctlpb.ScanNvmeReq{Health: true, Meta: true}, + engTierCfgs: []storage.TierConfigs{ + { + storage.NewTierConfig(). + WithStorageClass(storage.ClassRam.String()). + WithScmMountPoint(defScmMountPt), + storage.NewTierConfig(). + WithStorageClass(storage.ClassNvme.String()). + WithBdevDeviceList(test.MockPCIAddr(1)). 
+ WithBdevDeviceRoles(storage.BdevRoleAll), + }, + }, + engStopped: []bool{false}, + engErr: []error{nil}, + engRes: []ctlpb.ScanNvmeResp{ + ctlpb.ScanNvmeResp{ + Ctrlrs: proto.NvmeControllers{ + func() *ctlpb.NvmeController { + nc := proto.MockNvmeController(1) + sd := &ctlpb.SmdDevice{ + Rank: uint32(0), + TgtIds: []int32{1, 2, 3, 4}, + // Avoid rounding + AvailBytes: 32 * humanize.GiByte, + ClusterSize: 32 * humanize.MiByte, + RoleBits: storage.BdevRoleAll, + } + nc.SmdDevices = []*ctlpb.SmdDevice{sd} + return nc + }(), + }, + }, + }, + expResp: &ctlpb.ScanNvmeResp{ + Ctrlrs: proto.NvmeControllers{ + func() *ctlpb.NvmeController { + nc := proto.MockNvmeController(1) + sd := &ctlpb.SmdDevice{ + Rank: uint32(0), + TgtIds: []int32{1, 2, 3, 4}, + AvailBytes: 32 * humanize.GiByte, + ClusterSize: 32 * humanize.MiByte, + RoleBits: storage.BdevRoleAll, + UsableBytes: 32 * humanize.GiByte, + } + nc.SmdDevices = []*ctlpb.SmdDevice{sd} + return nc + }(), + }, + State: new(ctlpb.ResponseState), + }, + expRemoteScanCalls: []*ctlpb.ScanNvmeReq{ + {Health: true, Meta: true, MetaSize: defMetaSize, RdbSize: defRdbSize}, + }, + }, + "scan remote; bdev with md-on-ssd roles in config; nvme capacity adjusted": { + req: &ctlpb.ScanNvmeReq{Health: true, Meta: true}, + engTierCfgs: []storage.TierConfigs{ + { + storage.NewTierConfig(). + WithStorageClass(storage.ClassRam.String()). + WithScmMountPoint(defScmMountPt), + storage.NewTierConfig(). + WithStorageClass(storage.ClassNvme.String()). + WithBdevDeviceList(test.MockPCIAddr(1)). + WithBdevDeviceRoles(storage.BdevRoleAll), + }, + }, + engStopped: []bool{false}, + engErr: []error{nil}, + engRes: []ctlpb.ScanNvmeResp{ + ctlpb.ScanNvmeResp{ + Ctrlrs: proto.NvmeControllers{ + func() *ctlpb.NvmeController { + nc := proto.MockNvmeController(1) + nc.SmdDevices = []*ctlpb.SmdDevice{ + mockSmd(storage.BdevRoleAll), + } + return nc + }(), + }, + }, + }, + expResp: &ctlpb.ScanNvmeResp{ + Ctrlrs: proto.NvmeControllers{ + func() *ctlpb.NvmeController { + nc := proto.MockNvmeController(1) + sd := mockSmd(storage.BdevRoleAll) + // See TestServer_CtlSvc_adjustNvmeSize + // 80 metadata, 128 wal, 64 rdb = 272 clusters + sd.UsableBytes = (1024 - 272) * (32 * humanize.MiByte) + nc.SmdDevices = []*ctlpb.SmdDevice{sd} + return nc + }(), + }, + State: new(ctlpb.ResponseState), + }, + expRemoteScanCalls: []*ctlpb.ScanNvmeReq{ + {Health: true, Meta: true, MetaSize: defMetaSize, RdbSize: defRdbSize}, + }, + }, + "scan remote; bdev with md-on-ssd roles in config; no health flag": { + req: &ctlpb.ScanNvmeReq{Meta: true}, + engTierCfgs: []storage.TierConfigs{ + { + storage.NewTierConfig(). + WithStorageClass(storage.ClassRam.String()). + WithScmMountPoint(defScmMountPt), + storage.NewTierConfig(). + WithStorageClass(storage.ClassNvme.String()). + WithBdevDeviceList(test.MockPCIAddr(1)). 
+ WithBdevDeviceRoles(storage.BdevRoleAll), + }, + }, + engStopped: []bool{false}, + engErr: []error{nil}, + engRes: []ctlpb.ScanNvmeResp{ + ctlpb.ScanNvmeResp{ + Ctrlrs: proto.NvmeControllers{ + func() *ctlpb.NvmeController { + nc := proto.MockNvmeController(1) + nc.SmdDevices = []*ctlpb.SmdDevice{ + mockSmd(storage.BdevRoleAll), + } + return nc + }(), + }, + }, + }, + expResp: &ctlpb.ScanNvmeResp{ + Ctrlrs: proto.NvmeControllers{ + func() *ctlpb.NvmeController { + nc := proto.MockNvmeController(1) + nc.HealthStats = nil + sd := mockSmd(storage.BdevRoleAll) + sd.UsableBytes = (1024 - 272) * (32 * humanize.MiByte) + nc.SmdDevices = []*ctlpb.SmdDevice{sd} + return nc + }(), + }, + State: new(ctlpb.ResponseState), + }, + expRemoteScanCalls: []*ctlpb.ScanNvmeReq{ + {Meta: true, MetaSize: defMetaSize, RdbSize: defRdbSize}, + }, + }, + "scan remote; bdev with md-on-ssd roles in config; separate data role": { + req: &ctlpb.ScanNvmeReq{Health: true, Meta: true}, + engTierCfgs: []storage.TierConfigs{ + { + storage.NewTierConfig(). + WithStorageClass(storage.ClassRam.String()). + WithScmMountPoint(defScmMountPt), + // Roles are read from scan resp, adding here for posterity. + storage.NewTierConfig(). + WithStorageClass(storage.ClassNvme.String()). + WithBdevDeviceList(test.MockPCIAddr(1)). + WithBdevDeviceRoles( + storage.BdevRoleWAL | storage.BdevRoleMeta), + storage.NewTierConfig(). + WithStorageClass(storage.ClassNvme.String()). + WithBdevDeviceList(test.MockPCIAddr(2)). + WithBdevDeviceRoles(storage.BdevRoleData), + }, + }, + engStopped: []bool{false}, + engErr: []error{nil}, + engRes: []ctlpb.ScanNvmeResp{ + ctlpb.ScanNvmeResp{ + Ctrlrs: proto.NvmeControllers{ + func() *ctlpb.NvmeController { + nc := proto.MockNvmeController(1) + nc.SmdDevices = []*ctlpb.SmdDevice{ + mockSmd(storage.BdevRoleWAL | storage.BdevRoleMeta), + } + return nc + }(), + func() *ctlpb.NvmeController { + nc := proto.MockNvmeController(2) + nc.SmdDevices = []*ctlpb.SmdDevice{ + mockSmd(storage.BdevRoleData), + } + return nc + }(), + }, + }, + }, + expResp: &ctlpb.ScanNvmeResp{ + Ctrlrs: proto.NvmeControllers{ + func() *ctlpb.NvmeController { + nc := proto.MockNvmeController(1) + sd := mockSmd(storage.BdevRoleWAL | storage.BdevRoleMeta) + sd.AvailBytes = 0 + nc.SmdDevices = []*ctlpb.SmdDevice{sd} + return nc + }(), + func() *ctlpb.NvmeController { + nc := proto.MockNvmeController(2) + sd := mockSmd(storage.BdevRoleData) + sd.UsableBytes = 32 * humanize.GiByte + nc.SmdDevices = []*ctlpb.SmdDevice{sd} + return nc + }(), + }, + State: new(ctlpb.ResponseState), + }, + expRemoteScanCalls: []*ctlpb.ScanNvmeReq{ + {Health: true, Meta: true, MetaSize: defMetaSize, RdbSize: defRdbSize}, + }, + }, + "scan remote; bdev with md-on-ssd roles in config; phase-2 scan (mem-ratio in req)": { + req: &ctlpb.ScanNvmeReq{Meta: true, MemRatio: 0.5}, + engTierCfgs: []storage.TierConfigs{ + { + storage.NewTierConfig(). + WithStorageClass(storage.ClassRam.String()). + WithScmMountPoint(defScmMountPt), + storage.NewTierConfig(). + WithStorageClass(storage.ClassNvme.String()). + WithBdevDeviceList(test.MockPCIAddr(1)). + WithBdevDeviceRoles(storage.BdevRoleAll), + }, + }, + engStopped: []bool{false}, + engErr: []error{nil}, + engRes: []ctlpb.ScanNvmeResp{ + ctlpb.ScanNvmeResp{ + Ctrlrs: proto.NvmeControllers{ + func() *ctlpb.NvmeController { + nc := proto.MockNvmeController(1) + sd := mockSmd(storage.BdevRoleAll) + // Populated from scan request based on 0.5 MemRatio. 
+ sd.MetaSize = defMetaSize * 2 + nc.SmdDevices = []*ctlpb.SmdDevice{sd} + return nc + }(), + }, + }, + }, + expResp: &ctlpb.ScanNvmeResp{ + Ctrlrs: proto.NvmeControllers{ + func() *ctlpb.NvmeController { + nc := proto.MockNvmeController(1) + nc.HealthStats = nil + sd := mockSmd(storage.BdevRoleAll) + sd.MetaSize = defMetaSize * 2 + // Before doubling meta-size, 272 clusters removed from 1024 + // 128 wal, 64 rdb, 80 meta. Add 80 meta gives 352 to remove. + sd.UsableBytes = (1024 - 352) * (32 * humanize.MiByte) + nc.SmdDevices = []*ctlpb.SmdDevice{sd} + return nc + }(), + }, + State: new(ctlpb.ResponseState), + }, + expRemoteScanCalls: []*ctlpb.ScanNvmeReq{ + // Double MetaSize passed in request because of 0.5 MemRatio. + {Meta: true, MemRatio: 0.5, MetaSize: defMetaSize * 2, RdbSize: defRdbSize}, + }, + }, + "scan remote; bdev with md-on-ssd roles in config; duplicate and sysXS tgt ids": { + req: &ctlpb.ScanNvmeReq{Health: true, Meta: true}, + engTierCfgs: []storage.TierConfigs{ + { + storage.NewTierConfig(). + WithStorageClass(storage.ClassRam.String()). + WithScmMountPoint(defScmMountPt), + storage.NewTierConfig(). + WithStorageClass(storage.ClassNvme.String()). + WithBdevDeviceList(test.MockPCIAddr(1)). + WithBdevDeviceRoles(storage.BdevRoleAll), + }, + }, + engStopped: []bool{false}, + engErr: []error{nil}, + engRes: []ctlpb.ScanNvmeResp{ + ctlpb.ScanNvmeResp{ + Ctrlrs: proto.NvmeControllers{ + func() *ctlpb.NvmeController { + nc := proto.MockNvmeController(1) + sd := mockSmd(storage.BdevRoleAll) + sd.TgtIds = []int32{ + 1024, 1024, 1, 1, 2, 2, 3, 3, 4, 4, + } + nc.SmdDevices = []*ctlpb.SmdDevice{sd} + return nc + }(), + }, + }, + }, + expResp: &ctlpb.ScanNvmeResp{ + Ctrlrs: proto.NvmeControllers{ + func() *ctlpb.NvmeController { + nc := proto.MockNvmeController(1) + sd := mockSmd(storage.BdevRoleAll) + sd.TgtIds = []int32{ + // See storage.SmdDevice.UnmarshalJSON() + // for tgtID sanitization. + 1024, 1024, 1, 1, 2, 2, 3, 3, 4, 4, + } + // See TestServer_CtlSvc_adjustNvmeSize + // 80 metadata, 128 wal, 64 rdb = 272 clusters + sd.UsableBytes = (1024 - 272) * (32 * humanize.MiByte) + nc.SmdDevices = []*ctlpb.SmdDevice{sd} + return nc + }(), + }, + State: new(ctlpb.ResponseState), + }, + expRemoteScanCalls: []*ctlpb.ScanNvmeReq{ + {Health: true, Meta: true, MetaSize: defMetaSize, RdbSize: defRdbSize}, + }, }, "scan remote; collate results from multiple engines": { req: &ctlpb.ScanNvmeReq{Health: true, Meta: true}, engTierCfgs: []storage.TierConfigs{ { + storage.NewTierConfig(). + WithStorageClass(storage.ClassDcpm.String()). + WithScmDeviceList(defScmDev). + WithScmMountPoint(defScmMountPt), storage.NewTierConfig(). WithStorageClass(storage.ClassNvme.String()). WithBdevDeviceList(test.MockPCIAddr(1), test.MockPCIAddr(2)), }, { + storage.NewTierConfig(). + WithStorageClass(storage.ClassDcpm.String()). + WithScmDeviceList("/dev/pmem1"). + WithScmMountPoint("/mnt/daos1"), storage.NewTierConfig(). WithStorageClass(storage.ClassNvme.String()). 
WithBdevDeviceList(test.MockPCIAddr(3), test.MockPCIAddr(4)), }, }, + scmNamespaces: []*ctlpb.ScmNamespace{ + { + Mount: &ctlpb.ScmNamespace_Mount{ + Path: defScmMountPt, + AvailBytes: defMountAvail, + UsableBytes: defMountUsable, + Class: storage.ClassDcpm.String(), + }, + }, + { + Mount: &ctlpb.ScmNamespace_Mount{ + Rank: 1, + Path: "/mnt/daos1", + AvailBytes: defMountAvail, + UsableBytes: defMountUsable, + Class: storage.ClassDcpm.String(), + }, + }, + }, engRes: []ctlpb.ScanNvmeResp{ { Ctrlrs: proto.NvmeControllers{ @@ -319,17 +843,29 @@ func TestServer_bdevScan(t *testing.T) { }, State: new(ctlpb.ResponseState), }, + expRemoteScanCalls: []*ctlpb.ScanNvmeReq{ + {Health: true, Meta: true, MetaSize: defMetaSize, RdbSize: defRdbSize}, + {Health: true, Meta: true, MetaSize: defMetaSize, RdbSize: defRdbSize}, + }, }, "scan remote; both engine scans fail": { req: &ctlpb.ScanNvmeReq{Health: true, Meta: true}, engTierCfgs: []storage.TierConfigs{ { + storage.NewTierConfig(). + WithStorageClass(storage.ClassDcpm.String()). + WithScmDeviceList(defScmDev). + WithScmMountPoint(defScmMountPt), storage.NewTierConfig(). WithStorageClass(storage.ClassNvme.String()). WithBdevDeviceList(test.MockPCIAddr(1), test.MockPCIAddr(2)), }, { + storage.NewTierConfig(). + WithStorageClass(storage.ClassDcpm.String()). + WithScmDeviceList("/dev/pmem1"). + WithScmMountPoint("/mnt/daos1"), storage.NewTierConfig(). WithStorageClass(storage.ClassNvme.String()). WithBdevDeviceList(test.MockPCIAddr(3), @@ -345,18 +881,45 @@ func TestServer_bdevScan(t *testing.T) { req: &ctlpb.ScanNvmeReq{Health: true, Meta: true}, engTierCfgs: []storage.TierConfigs{ { + storage.NewTierConfig(). + WithStorageClass(storage.ClassDcpm.String()). + WithScmDeviceList(defScmDev). + WithScmMountPoint(defScmMountPt), storage.NewTierConfig(). WithStorageClass(storage.ClassNvme.String()). WithBdevDeviceList(test.MockPCIAddr(1), test.MockPCIAddr(2)), }, { + storage.NewTierConfig(). + WithStorageClass(storage.ClassDcpm.String()). + WithScmDeviceList("/dev/pmem1"). + WithScmMountPoint("/mnt/daos1"), storage.NewTierConfig(). WithStorageClass(storage.ClassNvme.String()). WithBdevDeviceList(test.MockPCIAddr(3), test.MockPCIAddr(4)), }, }, + scmNamespaces: []*ctlpb.ScmNamespace{ + { + Mount: &ctlpb.ScmNamespace_Mount{ + Path: defScmMountPt, + AvailBytes: defMountAvail, + UsableBytes: defMountUsable, + Class: storage.ClassDcpm.String(), + }, + }, + { + Mount: &ctlpb.ScmNamespace_Mount{ + Rank: 1, + Path: "/mnt/daos1", + AvailBytes: defMountAvail, + UsableBytes: defMountUsable, + Class: storage.ClassDcpm.String(), + }, + }, + }, engRes: []ctlpb.ScanNvmeResp{ {}, { @@ -379,6 +942,10 @@ func TestServer_bdevScan(t *testing.T) { Status: ctlpb.ResponseStatus_CTL_ERR_NVME, }, }, + expRemoteScanCalls: []*ctlpb.ScanNvmeReq{ + {Health: true, Meta: true, MetaSize: defMetaSize, RdbSize: defRdbSize}, + {Health: true, Meta: true, MetaSize: defMetaSize, RdbSize: defRdbSize}, + }, }, "scan remote; filter results based on request basic flag": { req: &ctlpb.ScanNvmeReq{Basic: true}, @@ -422,6 +989,9 @@ func TestServer_bdevScan(t *testing.T) { }, State: new(ctlpb.ResponseState), }, + expRemoteScanCalls: []*ctlpb.ScanNvmeReq{ + {Basic: true}, + }, }, "scan local; filter results based on request basic flag": { req: &ctlpb.ScanNvmeReq{Basic: true}, @@ -517,6 +1087,10 @@ func TestServer_bdevScan(t *testing.T) { req: &ctlpb.ScanNvmeReq{Meta: true}, engTierCfgs: []storage.TierConfigs{ { + storage.NewTierConfig(). + WithStorageClass(storage.ClassDcpm.String()). 
+ WithScmDeviceList(defScmDev). + WithScmMountPoint(defScmMountPt), storage.NewTierConfig(). WithStorageClass(storage.ClassNvme.String()). WithBdevDeviceList("0000:05:05.5"), @@ -538,6 +1112,9 @@ func TestServer_bdevScan(t *testing.T) { }, State: new(ctlpb.ResponseState), }, + expRemoteScanCalls: []*ctlpb.ScanNvmeReq{ + {Meta: true, MetaSize: defMetaSize, RdbSize: defRdbSize}, + }, }, } { t.Run(name, func(t *testing.T) { @@ -555,9 +1132,27 @@ func TestServer_bdevScan(t *testing.T) { t.Fatal("len tc.engStopped != len tc.tierCfgs") } + if tc.scmNamespaces == nil { + tc.scmNamespaces = []*ctlpb.ScmNamespace{ + { + Mount: &ctlpb.ScmNamespace_Mount{ + Path: defScmMountPt, + AvailBytes: defMountAvail, + UsableBytes: defMountUsable, + Class: storage.ClassRam.String(), + }, + }, + } + } + + var remoteScanCalls []*ctlpb.ScanNvmeReq idx := 0 // Mock per-engine-scan function to focus on unit testing bdevScan(). - scanEngineBdevs = func(_ context.Context, _ Engine, _ *ctlpb.ScanNvmeReq) (*ctlpb.ScanNvmeResp, error) { + scanEngineBdevs = func(_ context.Context, _ Engine, eReq *ctlpb.ScanNvmeReq) (*ctlpb.ScanNvmeResp, error) { + // Store request call. + remoteScanCalls = append(remoteScanCalls, eReq) + + // Generate response. if len(tc.engRes) <= idx { t.Fatal("engine scan called but response not specified") } @@ -567,15 +1162,20 @@ func TestServer_bdevScan(t *testing.T) { engRes := tc.engRes[idx] engErr := tc.engErr[idx] idx++ + return &engRes, engErr } defer func() { scanEngineBdevs = bdevScanEngine }() + if tc.engTgtCount == 0 { + tc.engTgtCount = defTgtCount + } engCfgs := []*engine.Config{} for _, tcs := range tc.engTierCfgs { - engCfg := engine.MockConfig().WithStorage(tcs...) + engCfg := engine.MockConfig().WithStorage(tcs...). + WithTargetCount(tc.engTgtCount) engCfgs = append(engCfgs, engCfg) } sCfg := config.DefaultServer().WithEngines(engCfgs...). @@ -591,7 +1191,7 @@ func TestServer_bdevScan(t *testing.T) { cs := newMockControlServiceFromBackends(t, log, sCfg, bmb, smb, nil, tc.engStopped...) - resp, err := bdevScan(test.Context(t), cs, tc.req, nil) + resp, err := bdevScan(test.Context(t), cs, tc.req, tc.scmNamespaces) test.CmpErr(t, tc.expErr, err) if err != nil { return @@ -609,19 +1209,27 @@ func TestServer_bdevScan(t *testing.T) { return x.Equals(y) }) + // Verify expected provider backend scan requests have been made. bmb.RLock() if len(tc.expBackendScanCalls) != len(bmb.ScanCalls) { t.Fatalf("unexpected number of backend scan calls, want %d got %d", len(tc.expBackendScanCalls), len(bmb.ScanCalls)) } - if len(tc.expBackendScanCalls) == 0 { - return - } if diff := cmp.Diff(tc.expBackendScanCalls, bmb.ScanCalls, append(defStorageScanCmpOpts, cmpopt)...); diff != "" { t.Fatalf("unexpected backend scan calls (-want, +got):\n%s\n", diff) } bmb.RUnlock() + + // Verify expected remote drpc scan requests have been made. 
+ if len(tc.expRemoteScanCalls) != len(remoteScanCalls) { + t.Fatalf("unexpected number of remote scan calls, want %d got %d", + len(tc.expRemoteScanCalls), len(remoteScanCalls)) + } + if diff := cmp.Diff(tc.expRemoteScanCalls, remoteScanCalls, + append(defStorageScanCmpOpts, cmpopt)...); diff != "" { + t.Fatalf("unexpected remote scan calls (-want, +got):\n%s\n", diff) + } }) } } @@ -2310,15 +2918,15 @@ func TestServer_CtlSvc_adjustNvmeSize(t *testing.T) { rdbWalSize uint64 = 512 * humanize.MiByte ) - type StorageCfg struct { + type storageCfg struct { targetCount int tierCfgs storage.TierConfigs } - type DataInput struct { - storageCfgs []*StorageCfg + type dataInput struct { + storageCfgs []*storageCfg scanNvmeResp *ctlpb.ScanNvmeResp } - type ExpectedOutput struct { + type expectedOutput struct { totalBytes []uint64 availableBytes []uint64 usableBytes []uint64 @@ -2344,12 +2952,12 @@ func TestServer_CtlSvc_adjustNvmeSize(t *testing.T) { } for name, tc := range map[string]struct { - input DataInput - output ExpectedOutput + input dataInput + output expectedOutput }{ "homogeneous": { - input: DataInput{ - storageCfgs: []*StorageCfg{ + input: dataInput{ + storageCfgs: []*storageCfg{ { targetCount: 12, tierCfgs: storage.TierConfigs{ @@ -2444,7 +3052,7 @@ func TestServer_CtlSvc_adjustNvmeSize(t *testing.T) { }, }, }, - output: ExpectedOutput{ + output: expectedOutput{ totalBytes: []uint64{ 10 * hugeClusterSize, 10 * hugeClusterSize, @@ -2469,8 +3077,8 @@ func TestServer_CtlSvc_adjustNvmeSize(t *testing.T) { }, }, "heterogeneous": { - input: DataInput{ - storageCfgs: []*StorageCfg{ + input: dataInput{ + storageCfgs: []*storageCfg{ { targetCount: 11, tierCfgs: storage.TierConfigs{ @@ -2567,7 +3175,7 @@ func TestServer_CtlSvc_adjustNvmeSize(t *testing.T) { }, }, }, - output: ExpectedOutput{ + output: expectedOutput{ totalBytes: []uint64{ 10 * hugeClusterSize, 10 * hugeClusterSize, @@ -2592,8 +3200,8 @@ func TestServer_CtlSvc_adjustNvmeSize(t *testing.T) { }, }, "new": { - input: DataInput{ - storageCfgs: []*StorageCfg{ + input: dataInput{ + storageCfgs: []*storageCfg{ { targetCount: 7, tierCfgs: storage.TierConfigs{ @@ -2637,7 +3245,7 @@ func TestServer_CtlSvc_adjustNvmeSize(t *testing.T) { }, }, }, - output: ExpectedOutput{ + output: expectedOutput{ totalBytes: []uint64{ 10 * hugeClusterSize, 10 * hugeClusterSize, @@ -2654,8 +3262,8 @@ func TestServer_CtlSvc_adjustNvmeSize(t *testing.T) { }, }, "evicted": { - input: DataInput{ - storageCfgs: []*StorageCfg{ + input: dataInput{ + storageCfgs: []*storageCfg{ { targetCount: 7, tierCfgs: storage.TierConfigs{ @@ -2699,7 +3307,7 @@ func TestServer_CtlSvc_adjustNvmeSize(t *testing.T) { }, }, }, - output: ExpectedOutput{ + output: expectedOutput{ totalBytes: []uint64{ 10 * hugeClusterSize, 10 * hugeClusterSize, @@ -2716,8 +3324,8 @@ func TestServer_CtlSvc_adjustNvmeSize(t *testing.T) { }, }, "missing targets": { - input: DataInput{ - storageCfgs: []*StorageCfg{ + input: dataInput{ + storageCfgs: []*storageCfg{ { targetCount: 4, tierCfgs: storage.TierConfigs{ @@ -2761,7 +3369,7 @@ func TestServer_CtlSvc_adjustNvmeSize(t *testing.T) { }, }, }, - output: ExpectedOutput{ + output: expectedOutput{ totalBytes: []uint64{ 10 * hugeClusterSize, 10 * hugeClusterSize, @@ -2778,8 +3386,8 @@ func TestServer_CtlSvc_adjustNvmeSize(t *testing.T) { }, }, "missing cluster size": { - input: DataInput{ - storageCfgs: []*StorageCfg{ + input: dataInput{ + storageCfgs: []*storageCfg{ { targetCount: 7, tierCfgs: storage.TierConfigs{ @@ -2822,7 +3430,7 @@ func 
TestServer_CtlSvc_adjustNvmeSize(t *testing.T) { }, }, }, - output: ExpectedOutput{ + output: expectedOutput{ totalBytes: []uint64{ 10 * hugeClusterSize, 10 * hugeClusterSize, @@ -2839,8 +3447,8 @@ func TestServer_CtlSvc_adjustNvmeSize(t *testing.T) { }, }, "multi bdev tier": { - input: DataInput{ - storageCfgs: []*StorageCfg{ + input: dataInput{ + storageCfgs: []*storageCfg{ { targetCount: 5, tierCfgs: storage.TierConfigs{newTierCfg(1)}, @@ -2873,7 +3481,7 @@ func TestServer_CtlSvc_adjustNvmeSize(t *testing.T) { SmdDevices: []*ctlpb.SmdDevice{ { Uuid: "nvme0", - TgtIds: []int32{0, 1, 2, 3}, + TgtIds: []int32{0, 1, 2, 3, 4}, TotalBytes: 10 * humanize.GiByte, AvailBytes: 10 * humanize.GiByte, ClusterSize: clusterSize, @@ -2903,7 +3511,7 @@ func TestServer_CtlSvc_adjustNvmeSize(t *testing.T) { SmdDevices: []*ctlpb.SmdDevice{ { Uuid: "nvme2", - TgtIds: []int32{0, 1, 2, 3}, + TgtIds: []int32{0, 1, 2, 3, 4, 5}, TotalBytes: 10 * humanize.GiByte, AvailBytes: 10 * humanize.GiByte, ClusterSize: clusterSize, @@ -2933,7 +3541,7 @@ func TestServer_CtlSvc_adjustNvmeSize(t *testing.T) { SmdDevices: []*ctlpb.SmdDevice{ { Uuid: "nvme4", - TgtIds: []int32{0, 1, 2, 3}, + TgtIds: []int32{0, 1, 2, 3, 4}, TotalBytes: 10 * humanize.GiByte, AvailBytes: 10 * humanize.GiByte, ClusterSize: clusterSize, @@ -2948,12 +3556,12 @@ func TestServer_CtlSvc_adjustNvmeSize(t *testing.T) { SmdDevices: []*ctlpb.SmdDevice{ { Uuid: "nvme5", - TgtIds: []int32{0, 1, 2, 3}, + TgtIds: []int32{0, 1, 2, 3, 4, 5}, TotalBytes: 10 * humanize.GiByte, AvailBytes: 10 * humanize.GiByte, ClusterSize: clusterSize, Rank: 5, - RoleBits: storage.BdevRoleMeta | storage.BdevRoleMeta, + RoleBits: storage.BdevRoleMeta | storage.BdevRoleWAL, }, }, DevState: devStateNormal, @@ -2961,7 +3569,7 @@ func TestServer_CtlSvc_adjustNvmeSize(t *testing.T) { }, }, }, - output: ExpectedOutput{ + output: expectedOutput{ totalBytes: []uint64{ 320 * clusterSize, 320 * clusterSize, @@ -2979,9 +3587,22 @@ func TestServer_CtlSvc_adjustNvmeSize(t *testing.T) { 0 * humanize.GiByte, }, usableBytes: []uint64{ + // 5tgts * 64mib = 320mib of meta on SSD (10 clusters) + // 256mib RDB = 8 clusters + // 320-18 = 302 remaining clusters + // 302 / 5 = 60 clusters-per-target (rounding diff) 300 * clusterSize, + // 4tgts * 128mib = 512mib of wal on SSD (16 clusters) + // 512mib WAL RDB = 16 clusters + // 320-32 = 288 remaining clusters 288 * clusterSize, - 260 * clusterSize, + // 6tgts * 64mib = 384mib of meta on SSD (12 clusters) + // 6tgts * 128mib = 768mib of wal on SSD (24 clusters) + // 256mib RDB = 8 clusters + // 512mib WAL RDB = 16 clusters + // 320-60 = 260 remaining clusters + // 260 / 6 = 43 clusters-per-target (rounding diff) + 258 * clusterSize, 0 * humanize.GiByte, 0 * humanize.GiByte, 0 * humanize.GiByte, @@ -3008,14 +3629,14 @@ func TestServer_CtlSvc_adjustNvmeSize(t *testing.T) { for idx, ctlr := range tc.input.scanNvmeResp.GetCtrlrs() { dev := ctlr.GetSmdDevices()[0] test.AssertEqual(t, tc.output.totalBytes[idx], dev.GetTotalBytes(), - fmt.Sprintf("Invalid total bytes with ctlr %s (index=%d): wait=%d, got=%d", + fmt.Sprintf("Invalid total bytes with ctlr %s (index=%d): want=%d, got=%d", ctlr.GetPciAddr(), idx, tc.output.totalBytes[idx], dev.GetTotalBytes())) test.AssertEqual(t, tc.output.availableBytes[idx], dev.GetAvailBytes(), - fmt.Sprintf("Invalid available bytes with ctlr %s (index=%d): wait=%d, got=%d", + fmt.Sprintf("Invalid available bytes with ctlr %s (index=%d): want=%d, got=%d", ctlr.GetPciAddr(), idx, tc.output.availableBytes[idx], 
dev.GetAvailBytes())) test.AssertEqual(t, tc.output.usableBytes[idx], dev.GetUsableBytes(), fmt.Sprintf("Invalid usable bytes with ctlr %s (index=%d), "+ - "wait=%d (%d clusters) got=%d (%d clusters)", + "want=%d (%d clusters) got=%d (%d clusters)", ctlr.GetPciAddr(), idx, tc.output.usableBytes[idx], tc.output.usableBytes[idx]/clusterSize, dev.GetUsableBytes(), dev.GetUsableBytes()/clusterSize)) @@ -3030,7 +3651,7 @@ func TestServer_CtlSvc_adjustNvmeSize(t *testing.T) { } func TestServer_getRdbSize(t *testing.T) { - type ExpectedOutput struct { + type expectedOutput struct { size uint64 message string err error @@ -3038,23 +3659,23 @@ func TestServer_getRdbSize(t *testing.T) { for name, tc := range map[string]struct { rdbSize string - output ExpectedOutput + output expectedOutput }{ "simple env var": { rdbSize: "DAOS_MD_CAP=1024", - output: ExpectedOutput{ + output: expectedOutput{ size: 1024 * humanize.MiByte, }, }, "simple default": { - output: ExpectedOutput{ + output: expectedOutput{ size: defaultRdbSize, message: "using default RDB file size", }, }, "invalid mdcap": { rdbSize: "DAOS_MD_CAP=foo", - output: ExpectedOutput{ + output: expectedOutput{ err: errors.New("invalid RDB file size"), }, }, @@ -3100,23 +3721,23 @@ func TestServer_CtlSvc_adjustScmSize(t *testing.T) { mountPoints []string } - type DataInput struct { + type dataInput struct { configs []*EngineConfig response *ctlpb.ScanScmResp } - type ExpectedOutput struct { + type expectedOutput struct { availableBytes []uint64 usableBytes []uint64 message string } for name, tc := range map[string]struct { - input DataInput - output ExpectedOutput + input dataInput + output expectedOutput }{ "single mountPoint": { - input: DataInput{ + input: dataInput{ configs: []*EngineConfig{ { mountPoints: []string{"/mnt/daos0"}, @@ -3128,19 +3749,19 @@ func TestServer_CtlSvc_adjustScmSize(t *testing.T) { Mount: &ctlpb.ScmNamespace_Mount{ Path: "/mnt/daos0", AvailBytes: uint64(64) * humanize.GiByte, - Class: storage.ClassFile.String(), + Class: storage.ClassRam.String(), }, }, }, }, }, - output: ExpectedOutput{ + output: expectedOutput{ availableBytes: []uint64{uint64(64) * humanize.GiByte}, usableBytes: []uint64{uint64(64)*humanize.GiByte - defaultRdbSize - mdDaosScmBytes - mdFsScmBytes}, }, }, "three mountPoints": { - input: DataInput{ + input: dataInput{ configs: []*EngineConfig{ { mdCap: "DAOS_MD_CAP=1024", @@ -3153,27 +3774,27 @@ func TestServer_CtlSvc_adjustScmSize(t *testing.T) { Mount: &ctlpb.ScmNamespace_Mount{ Path: "/mnt/daos0", AvailBytes: uint64(64) * humanize.GiByte, - Class: storage.ClassFile.String(), + Class: storage.ClassRam.String(), }, }, { Mount: &ctlpb.ScmNamespace_Mount{ Path: "/mnt/daos1", AvailBytes: uint64(32) * humanize.GiByte, - Class: storage.ClassFile.String(), + Class: storage.ClassRam.String(), }, }, { Mount: &ctlpb.ScmNamespace_Mount{ Path: "/mnt/daos2", AvailBytes: uint64(128) * humanize.GiByte, - Class: storage.ClassFile.String(), + Class: storage.ClassRam.String(), }, }, }, }, }, - output: ExpectedOutput{ + output: expectedOutput{ availableBytes: []uint64{ uint64(64) * humanize.GiByte, uint64(32) * humanize.GiByte, @@ -3187,7 +3808,7 @@ func TestServer_CtlSvc_adjustScmSize(t *testing.T) { }, }, "Missing SCM": { - input: DataInput{ + input: dataInput{ configs: []*EngineConfig{ { mdCap: "DAOS_MD_CAP=1024", @@ -3200,27 +3821,27 @@ func TestServer_CtlSvc_adjustScmSize(t *testing.T) { Mount: &ctlpb.ScmNamespace_Mount{ Path: "/mnt/daos0", AvailBytes: uint64(64) * humanize.GiByte, - Class: storage.ClassFile.String(), 
+ Class: storage.ClassRam.String(), }, }, { Mount: &ctlpb.ScmNamespace_Mount{ Path: "/mnt/daos1", AvailBytes: uint64(32) * humanize.GiByte, - Class: storage.ClassFile.String(), + Class: storage.ClassRam.String(), }, }, { Mount: &ctlpb.ScmNamespace_Mount{ Path: "/mnt/daos2", AvailBytes: uint64(128) * humanize.GiByte, - Class: storage.ClassFile.String(), + Class: storage.ClassRam.String(), }, }, }, }, }, - output: ExpectedOutput{ + output: expectedOutput{ availableBytes: []uint64{ uint64(64) * humanize.GiByte, uint64(32) * humanize.GiByte, @@ -3235,7 +3856,7 @@ func TestServer_CtlSvc_adjustScmSize(t *testing.T) { }, }, "No more space": { - input: DataInput{ + input: dataInput{ configs: []*EngineConfig{ { mountPoints: []string{"/mnt/daos0"}, @@ -3247,20 +3868,20 @@ func TestServer_CtlSvc_adjustScmSize(t *testing.T) { Mount: &ctlpb.ScmNamespace_Mount{ Path: "/mnt/daos0", AvailBytes: uint64(64) * humanize.KiByte, - Class: storage.ClassFile.String(), + Class: storage.ClassRam.String(), }, }, }, }, }, - output: ExpectedOutput{ + output: expectedOutput{ availableBytes: []uint64{uint64(64) * humanize.KiByte}, usableBytes: []uint64{0}, message: "No more usable space in SCM device", }, }, "Multi bdev Tiers": { - input: DataInput{ + input: dataInput{ configs: []*EngineConfig{ { mdCap: "DAOS_MD_CAP=1024", @@ -3274,20 +3895,20 @@ func TestServer_CtlSvc_adjustScmSize(t *testing.T) { Mount: &ctlpb.ScmNamespace_Mount{ Path: "/mnt", AvailBytes: uint64(64) * humanize.GiByte, - Class: storage.ClassFile.String(), + Class: storage.ClassRam.String(), }, }, { Mount: &ctlpb.ScmNamespace_Mount{ Path: "/opt", AvailBytes: uint64(32) * humanize.GiByte, - Class: storage.ClassFile.String(), + Class: storage.ClassRam.String(), }, }, }, }, }, - output: ExpectedOutput{ + output: expectedOutput{ availableBytes: []uint64{ uint64(64) * humanize.GiByte, uint64(32) * humanize.GiByte, @@ -3335,12 +3956,12 @@ func TestServer_CtlSvc_adjustScmSize(t *testing.T) { for index, namespace := range tc.input.response.Namespaces { test.AssertEqual(t, tc.output.availableBytes[index], namespace.GetMount().GetAvailBytes(), - fmt.Sprintf("Invalid SCM available bytes: nsp=%s, wait=%s (%d bytes), got=%s (%d bytes)", + fmt.Sprintf("Invalid SCM available bytes: nsp=%s, want=%s (%d bytes), got=%s (%d bytes)", namespace.GetMount().GetPath(), humanize.Bytes(tc.output.availableBytes[index]), tc.output.availableBytes[index], humanize.Bytes(namespace.GetMount().GetAvailBytes()), namespace.GetMount().GetAvailBytes())) test.AssertEqual(t, tc.output.usableBytes[index], namespace.GetMount().GetUsableBytes(), - fmt.Sprintf("Invalid SCM usable bytes: nsp=%s, wait=%s (%d bytes), got=%s (%d bytes)", + fmt.Sprintf("Invalid SCM usable bytes: nsp=%s, want=%s (%d bytes), got=%s (%d bytes)", namespace.GetMount().GetPath(), humanize.Bytes(tc.output.usableBytes[index]), tc.output.usableBytes[index], humanize.Bytes(namespace.GetMount().GetUsableBytes()), namespace.GetMount().GetUsableBytes())) @@ -3355,11 +3976,11 @@ func TestServer_CtlSvc_adjustScmSize(t *testing.T) { } func TestServer_CtlSvc_getEngineCfgFromNvmeCtl(t *testing.T) { - type DataInput struct { + type dataInput struct { tierCfgs storage.TierConfigs nvmeCtlr *ctl.NvmeController } - type ExpectedOutput struct { + type expectedOutput struct { res bool msg string } @@ -3376,32 +3997,32 @@ func TestServer_CtlSvc_getEngineCfgFromNvmeCtl(t *testing.T) { } for name, tc := range map[string]struct { - input DataInput - output ExpectedOutput + input dataInput + output expectedOutput }{ "find NVME Ctlr": { - input: 
DataInput{ + input: dataInput{ tierCfgs: newTierCfgs(5), nvmeCtlr: &ctl.NvmeController{ PciAddr: test.MockPCIAddr(3), }, }, - output: ExpectedOutput{res: true}, + output: expectedOutput{res: true}, }, "not find NVME Ctlr": { - input: DataInput{ + input: dataInput{ tierCfgs: newTierCfgs(5), nvmeCtlr: &ctl.NvmeController{ PciAddr: test.MockPCIAddr(13), }, }, - output: ExpectedOutput{ + output: expectedOutput{ res: false, msg: "unknown PCI device", }, }, "find VMD device": { - input: DataInput{ + input: dataInput{ tierCfgs: storage.TierConfigs{ storage.NewTierConfig(). WithStorageClass(storage.ClassNvme.String()). @@ -3411,10 +4032,10 @@ func TestServer_CtlSvc_getEngineCfgFromNvmeCtl(t *testing.T) { PciAddr: "040603:02:00.0", }, }, - output: ExpectedOutput{res: true}, + output: expectedOutput{res: true}, }, "Invalid address": { - input: DataInput{ + input: dataInput{ tierCfgs: storage.TierConfigs{ storage.NewTierConfig(). WithStorageClass(storage.ClassNvme.String()). @@ -3424,7 +4045,7 @@ func TestServer_CtlSvc_getEngineCfgFromNvmeCtl(t *testing.T) { PciAddr: "666", }, }, - output: ExpectedOutput{ + output: expectedOutput{ res: false, msg: "Invalid PCI address", }, @@ -3442,7 +4063,7 @@ func TestServer_CtlSvc_getEngineCfgFromNvmeCtl(t *testing.T) { if tc.output.res { test.AssertEqual(t, engineCfg, ec, - fmt.Sprintf("Invalid engine config: wait=%v got=%v", engineCfg, ec)) + fmt.Sprintf("Invalid engine config: want=%v got=%v", engineCfg, ec)) return } @@ -3456,11 +4077,11 @@ func TestServer_CtlSvc_getEngineCfgFromNvmeCtl(t *testing.T) { } func TestServer_CtlSvc_getEngineCfgFromScmNsp(t *testing.T) { - type DataInput struct { + type dataInput struct { tierCfgs storage.TierConfigs scmNsp *ctl.ScmNamespace } - type ExpectedOutput struct { + type expectedOutput struct { res bool msg string } @@ -3477,11 +4098,11 @@ func TestServer_CtlSvc_getEngineCfgFromScmNsp(t *testing.T) { } for name, tc := range map[string]struct { - input DataInput - output ExpectedOutput + input dataInput + output expectedOutput }{ "find SCM Nsp": { - input: DataInput{ + input: dataInput{ tierCfgs: newTierCfgs(5), scmNsp: &ctl.ScmNamespace{ Mount: &ctl.ScmNamespace_Mount{ @@ -3489,10 +4110,10 @@ func TestServer_CtlSvc_getEngineCfgFromScmNsp(t *testing.T) { }, }, }, - output: ExpectedOutput{res: true}, + output: expectedOutput{res: true}, }, "not find SCM Nsp": { - input: DataInput{ + input: dataInput{ tierCfgs: newTierCfgs(5), scmNsp: &ctl.ScmNamespace{ Mount: &ctl.ScmNamespace_Mount{ @@ -3500,7 +4121,7 @@ func TestServer_CtlSvc_getEngineCfgFromScmNsp(t *testing.T) { }, }, }, - output: ExpectedOutput{ + output: expectedOutput{ res: false, msg: "unknown SCM mount point"}, }, @@ -3517,7 +4138,7 @@ func TestServer_CtlSvc_getEngineCfgFromScmNsp(t *testing.T) { if tc.output.res { test.AssertEqual(t, engineCfg, ec, - fmt.Sprintf("Invalid engine config: wait=%v got=%v", engineCfg, ec)) + fmt.Sprintf("Invalid engine config: want=%v got=%v", engineCfg, ec)) return } diff --git a/src/control/server/faults.go b/src/control/server/faults.go index 5a5526e36fa..ad70f202c1c 100644 --- a/src/control/server/faults.go +++ b/src/control/server/faults.go @@ -171,6 +171,14 @@ func FaultNoCompatibilityInsecure(self, other build.Version) *fault.Fault { ) } +// FaultPoolMemRatioNoRoles indicates a fault when pool create request contains MD-on-SSD +// parameters but MD-on-SSD has not been enabled on the server. 
+var FaultPoolMemRatioNoRoles = serverFault(
+	code.ServerPoolMemRatioNoRoles,
+	"pool create request contains MD-on-SSD parameters but MD-on-SSD has not been enabled",
+	"either remove MD-on-SSD-specific options from the command request or set bdev_roles in "+
+		"server config file to enable MD-on-SSD")
+
 func FaultBadFaultDomainLabels(faultPath, addr string, reqLabels, systemLabels []string) *fault.Fault {
 	return serverFault(
 		code.ServerBadFaultDomainLabels,
diff --git a/src/control/server/mgmt_pool.go b/src/control/server/mgmt_pool.go
index bd9a0064787..4ad98ccd1bf 100644
--- a/src/control/server/mgmt_pool.go
+++ b/src/control/server/mgmt_pool.go
@@ -24,6 +24,7 @@ import (
 	"github.com/daos-stack/daos/src/control/lib/daos"
 	"github.com/daos-stack/daos/src/control/lib/ranklist"
 	"github.com/daos-stack/daos/src/control/server/engine"
+	"github.com/daos-stack/daos/src/control/server/storage"
 	"github.com/daos-stack/daos/src/control/system"
 )
 
@@ -160,9 +161,8 @@ func minPoolNvme(tgtCount, rankCount uint64) uint64 {
 	return minRankNvme(tgtCount) * rankCount
 }
 
-// calculateCreateStorage determines the amount of SCM/NVMe storage to
-// allocate per engine in order to fulfill the create request, if those
-// values are not already supplied as part of the request.
+// calculateCreateStorage determines the amount of SCM/NVMe storage to allocate per engine in order
+// to fulfill the create request, if those values are not already supplied as part of the request.
 func (svc *mgmtSvc) calculateCreateStorage(req *mgmtpb.PoolCreateReq) error {
 	instances := svc.harness.Instances()
 	if len(instances) < 1 {
@@ -172,11 +172,21 @@ func (svc *mgmtSvc) calculateCreateStorage(req *mgmtpb.PoolCreateReq) error {
 		return errors.New("zero ranks in calculateCreateStorage()")
 	}
 
-	// NB: The following logic is based on the assumption that
-	// a request will always include SCM as tier 0. Currently,
-	// we only support one additional tier, NVMe, which is
-	// optional. As we add support for other tiers, this logic
-	// will need to be updated.
+	mdOnSSD := instances[0].GetStorage().BdevRoleMetaConfigured()
+	switch {
+	case !mdOnSSD && req.MemRatio > 0:
+		// Prevent MD-on-SSD parameters being used in incompatible mode.
+		return FaultPoolMemRatioNoRoles
+	case mdOnSSD && req.MemRatio == 0:
+		// Set reasonable default if not set in MD-on-SSD mode.
+		req.MemRatio = storage.DefaultMemoryFileRatio
+		svc.log.Infof("Default memory-file:meta-blob size ratio of %d%% applied",
+			int(storage.DefaultMemoryFileRatio*100))
+	}
+
+	// NB: The following logic is based on the assumption that a request will always include SCM
+	// as tier 0. Currently, we only support one additional tier, NVMe, which is optional. As we
+	// add support for other tiers, this logic will need to be updated.
 
 	nvmeMissing := !instances[0].GetStorage().HasBlockDevices()
 
@@ -251,6 +261,7 @@ func (svc *mgmtSvc) PoolCreate(ctx context.Context, req *mgmtpb.PoolCreateReq) (
 	if err != nil {
 		return nil, err
 	}
+
 	return msg.(*mgmtpb.PoolCreateResp), nil
 }
 
@@ -300,7 +311,6 @@ func (svc *mgmtSvc) poolCreate(parent context.Context, req *mgmtpb.PoolCreateReq
 	resp.SvcReps = ranklist.RanksToUint32(ps.Replicas)
 	resp.TgtRanks = ranklist.RanksToUint32(ps.Storage.CreationRanks())
 	resp.TierBytes = ps.Storage.PerRankTierStorage
-	// TODO DAOS-14223: Store Meta-Blob-Size in sysdb.
 
 	return resp, nil
 }
 
@@ -947,6 +957,13 @@ func (svc *mgmtSvc) PoolQuery(ctx context.Context, req *mgmtpb.PoolQueryReq) (*m
 	// Preserve compatibility with pre-2.6 callers.
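The MemRatio gating added to calculateCreateStorage above can be read in isolation as a small rule set: reject MD-on-SSD parameters when no bdev roles are configured, and apply a default ratio when roles are configured but the caller left it unset. The sketch below is a minimal standalone illustration of that rule set; the names applyMemRatio, errMemRatioNoRoles and defaultMemRatio are assumptions for the example, not the server's actual identifiers.

package main

import (
	"errors"
	"fmt"
)

// Hypothetical stand-ins for the server's fault and default-ratio values.
var errMemRatioNoRoles = errors.New("md-on-ssd parameters given but md-on-ssd not enabled")

const defaultMemRatio float32 = 1.0 // phase-1 behaviour: memory file equals meta blob

// applyMemRatio mirrors the switch in calculateCreateStorage: reject a mem-ratio
// when bdev roles are not configured, default it when roles are configured but
// the caller left it unset, and otherwise pass the value through unchanged.
func applyMemRatio(mdOnSSD bool, memRatio float32) (float32, error) {
	switch {
	case !mdOnSSD && memRatio > 0:
		return 0, errMemRatioNoRoles
	case mdOnSSD && memRatio == 0:
		return defaultMemRatio, nil
	}
	return memRatio, nil
}

func main() {
	for _, tc := range []struct {
		mdOnSSD  bool
		memRatio float32
	}{{false, 0.2}, {true, 0}, {true, 0.25}} {
		ratio, err := applyMemRatio(tc.mdOnSSD, tc.memRatio)
		fmt.Printf("mdOnSSD=%v in=%.2f -> ratio=%.2f err=%v\n",
			tc.mdOnSSD, tc.memRatio, ratio, err)
	}
}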
resp.Leader = resp.SvcLdr + // TODO DAOS-16209: After VOS query API is updated, zero-value mem_file_bytes will be + // returned in non-MD-on-SSD mode and this hack can be removed. + storage := svc.harness.Instances()[0].GetStorage() + if !storage.ControlMetadataPathConfigured() { + resp.MemFileBytes = 0 + } + return resp, nil } @@ -966,6 +983,15 @@ func (svc *mgmtSvc) PoolQueryTarget(ctx context.Context, req *mgmtpb.PoolQueryTa return nil, errors.Wrap(err, "unmarshal PoolQueryTarget response") } + // TODO DAOS-16209: After VOS query API is updated, zero-value mem_file_bytes will be + // returned in non-MD-on-SSD mode and this hack can be removed. + storage := svc.harness.Instances()[0].GetStorage() + if !storage.ControlMetadataPathConfigured() { + for _, tgtInfo := range resp.Infos { + tgtInfo.MemFileBytes = 0 + } + } + return resp, nil } diff --git a/src/control/server/mgmt_pool_test.go b/src/control/server/mgmt_pool_test.go index 31684752af2..24f109cf196 100644 --- a/src/control/server/mgmt_pool_test.go +++ b/src/control/server/mgmt_pool_test.go @@ -200,10 +200,11 @@ func TestServer_MgmtSvc_calculateCreateStorage(t *testing.T) { nvmeTooSmallReq := nvmeTooSmallTotal for name, tc := range map[string]struct { - disableNVMe bool - in *mgmtpb.PoolCreateReq - expOut *mgmtpb.PoolCreateReq - expErr error + disableNVMe bool + enableMdOnSsd bool + in *mgmtpb.PoolCreateReq + expOut *mgmtpb.PoolCreateReq + expErr error }{ "auto sizing": { in: &mgmtpb.PoolCreateReq{ @@ -245,6 +246,15 @@ func TestServer_MgmtSvc_calculateCreateStorage(t *testing.T) { Ranks: []uint32{0}, }, }, + "auto sizing (mem-ratio but not MD-on-SSD)": { + in: &mgmtpb.PoolCreateReq{ + TotalBytes: defaultTotal, + TierRatio: defaultRatios, + Ranks: []uint32{0, 1}, + MemRatio: 0.2, + }, + expErr: FaultPoolMemRatioNoRoles, + }, "tier bytes set for both (no NVMe in config)": { disableNVMe: true, in: &mgmtpb.PoolCreateReq{ @@ -264,6 +274,39 @@ func TestServer_MgmtSvc_calculateCreateStorage(t *testing.T) { Ranks: []uint32{0}, }, }, + "mem-ratio is set (mdonssd not configured)": { + in: &mgmtpb.PoolCreateReq{ + TierBytes: []uint64{defaultScmBytes - 1, defaultNvmeBytes - 1}, + Ranks: []uint32{0}, + MemRatio: storage.DefaultMemoryFileRatio, + }, + expErr: FaultPoolMemRatioNoRoles, + }, + "mem-ratio is unset (mdonssd configured)": { + enableMdOnSsd: true, + in: &mgmtpb.PoolCreateReq{ + TierBytes: []uint64{defaultScmBytes - 1, defaultNvmeBytes - 1}, + Ranks: []uint32{0}, + }, + expOut: &mgmtpb.PoolCreateReq{ + TierBytes: []uint64{defaultScmBytes - 1, defaultNvmeBytes - 1}, + Ranks: []uint32{0}, + MemRatio: storage.DefaultMemoryFileRatio, + }, + }, + "mem-ratio is set (mdonssd configured)": { + enableMdOnSsd: true, + in: &mgmtpb.PoolCreateReq{ + TierBytes: []uint64{defaultScmBytes - 1, defaultNvmeBytes - 1}, + Ranks: []uint32{0}, + MemRatio: 0.25, + }, + expOut: &mgmtpb.PoolCreateReq{ + TierBytes: []uint64{defaultScmBytes - 1, defaultNvmeBytes - 1}, + Ranks: []uint32{0}, + MemRatio: 0.25, + }, + }, "manual sizing": { in: &mgmtpb.PoolCreateReq{ TierBytes: []uint64{defaultScmBytes - 1, defaultNvmeBytes - 1}, @@ -288,6 +331,27 @@ func TestServer_MgmtSvc_calculateCreateStorage(t *testing.T) { }, expErr: FaultPoolNvmeTooSmall(nvmeTooSmallReq, minPoolNvme), }, + "manual sizing (MD-on-SSD syntax used)": { + enableMdOnSsd: true, + in: &mgmtpb.PoolCreateReq{ + TierBytes: []uint64{defaultScmBytes - 1, defaultNvmeBytes - 1}, + Ranks: []uint32{0, 1}, + MemRatio: 1, + }, + expOut: &mgmtpb.PoolCreateReq{ + TierBytes: []uint64{defaultScmBytes - 1, 
defaultNvmeBytes - 1}, + Ranks: []uint32{0, 1}, + MemRatio: 1, + }, + }, + "manual sizing (MD-on-SSD syntax used but not MD-on-SSD)": { + in: &mgmtpb.PoolCreateReq{ + TierBytes: []uint64{defaultScmBytes - 1, defaultNvmeBytes - 1}, + Ranks: []uint32{0, 1}, + MemRatio: 1, + }, + expErr: FaultPoolMemRatioNoRoles, + }, } { t.Run(name, func(t *testing.T) { log, buf := logging.NewTestLogger(t.Name()) @@ -301,7 +365,12 @@ func TestServer_MgmtSvc_calculateCreateStorage(t *testing.T) { WithStorageClass("nvme"). WithBdevDeviceList("foo", "bar"), ) + if tc.enableMdOnSsd { + engineCfg.Storage.Tiers[0]. + WithBdevDeviceRoles(storage.BdevRoleAll) + } } + svc := newTestMgmtSvc(t, log) sp := storage.MockProvider(log, 0, &engineCfg.Storage, nil, nil, nil, nil) svc.harness.instances[0] = newTestEngine(log, false, sp, engineCfg) @@ -417,6 +486,26 @@ func TestServer_MgmtSvc_PoolCreate(t *testing.T) { TgtRanks: []uint32{0, 1}, }, }, + "successful creation with memory file ratio": { + targetCount: 8, + mdonssdEnabled: true, + req: &mgmtpb.PoolCreateReq{ + Uuid: test.MockUUID(1), + TierBytes: []uint64{100 * humanize.GiByte, 10 * humanize.TByte}, + MemRatio: storage.DefaultMemoryFileRatio, + Properties: testPoolLabelProp(), + }, + drpcRet: &mgmtpb.PoolCreateResp{ + TierBytes: []uint64{100 * humanize.GiByte, 10 * humanize.TByte}, + MemFileBytes: 50 * humanize.GiByte, + TgtRanks: []uint32{0, 1}, + }, + expResp: &mgmtpb.PoolCreateResp{ + TierBytes: []uint64{100 * humanize.GiByte, 10 * humanize.TByte}, + MemFileBytes: 50 * humanize.GiByte, + TgtRanks: []uint32{0, 1}, + }, + }, "successful creation minimum size": { targetCount: 8, req: &mgmtpb.PoolCreateReq{ diff --git a/src/control/server/storage/bdev.go b/src/control/server/storage/bdev.go index 981a64e936a..96286052fcf 100644 --- a/src/control/server/storage/bdev.go +++ b/src/control/server/storage/bdev.go @@ -39,6 +39,12 @@ const ( sysXSTgtID = 1024 // Minimum amount of hugepage memory (in bytes) needed for each target. memHugepageMinPerTarget = 1 << 30 // 1GiB + + // DefaultMemoryFileRatio (mem_size:meta_size) describes the behavior of MD-on-SSD in + // phase-1 mode where the per-target-meta-blob size is equal to the per-target-VOS-file + // size. In phase-2 mode where the per-target-meta-blob size is greater than + // per-target-VOS-file size, the memory file ratio will be less than one. + DefaultMemoryFileRatio = 1.0 ) // JSON config file constants. @@ -312,6 +318,7 @@ func (sd *SmdDevice) UnmarshalJSON(data []byte) error { sd.Roles.OptionBits = OptionBits(from.RoleBits) } + // Handle any duplicate target IDs and set flag instead of sysXS target ID. seen := make(map[int32]bool) newTgts := make([]int32, 0, len(sd.TargetIDs)) for _, i := range sd.TargetIDs { @@ -389,6 +396,26 @@ func (nc NvmeController) Free() (tb uint64) { return } +// Roles returns bdev_roles for NVMe controller being used in MD-on-SSD mode. Assume that all SMD +// devices on a controller have the same roles. +func (nc *NvmeController) Roles() *BdevRoles { + if len(nc.SmdDevices) > 0 { + return &nc.SmdDevices[0].Roles + } + + return &BdevRoles{} +} + +// Rank returns rank on which this NVMe controller is being used. Assume that all SMD devices on a +// controller have the same rank. +func (nc *NvmeController) Rank() ranklist.Rank { + if len(nc.SmdDevices) > 0 { + return nc.SmdDevices[0].Rank + } + + return ranklist.NilRank +} + // NvmeControllers is a type alias for []*NvmeController. 
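The usable-capacity expectations in TestServer_CtlSvc_adjustNvmeSize and the MetaSize doubling in the 0.5 MemRatio scan case both follow the cluster arithmetic spelled out in the test comments. The sketch below reproduces that arithmetic using the numbers quoted in the "multi bdev tier" case (5 targets, 64MiB meta per target, a 256MiB RDB, 32MiB clusters, 320 clusters total); it is an illustration of the rounding under those assumed inputs, not the production adjustNvmeSize code.

package main

import "fmt"

const mib = 1 << 20

// clusters rounds a byte count up to whole SPDK clusters.
func clusters(bytes, clusterSize uint64) uint64 {
	return (bytes + clusterSize - 1) / clusterSize
}

func main() {
	const (
		clusterSize   uint64 = 32 * mib
		totalClusters uint64 = 320
		targets       uint64 = 5
		metaPerTgt    uint64 = 64 * mib
		rdbSize       uint64 = 256 * mib
	)

	// 5 targets * 64MiB meta = 10 clusters, 256MiB RDB = 8 clusters.
	reserved := clusters(metaPerTgt*targets, clusterSize) + clusters(rdbSize, clusterSize)

	// 320 - 18 = 302 remaining, 302 / 5 = 60 whole clusters per target.
	perTarget := (totalClusters - reserved) / targets
	usable := perTarget * targets * clusterSize

	fmt.Printf("usable: %d bytes (%d clusters)\n", usable, usable/clusterSize) // 300 clusters

	// A MemRatio below one scales the per-target meta-blob size up by 1/ratio,
	// which is why the 0.5 MemRatio scan case sends MetaSize*2 in its request.
	fmt.Printf("meta per target at MemRatio 0.5: %d bytes\n",
		uint64(float64(metaPerTgt)/0.5))
}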
type NvmeControllers []*NvmeController
 
@@ -404,6 +431,11 @@ func (ncs NvmeControllers) String() string {
 	return strings.Join(ss, ", ")
 }
 
+// Len returns the length of the NvmeController reference slice.
+func (ncs NvmeControllers) Len() int {
+	return len(ncs)
+}
+
 // Capacity returns the cumulative total bytes of all controller capacities.
 func (ncs NvmeControllers) Capacity() (tb uint64) {
 	for _, c := range ncs {
diff --git a/src/control/server/storage/config.go b/src/control/server/storage/config.go
index 180b9663b31..402bee1c400 100644
--- a/src/control/server/storage/config.go
+++ b/src/control/server/storage/config.go
@@ -319,8 +319,7 @@ func (tcs TierConfigs) HasBdevRoleMeta() bool {
 	}
 
 	for _, bc := range tcs.BdevConfigs() {
-		bits := bc.Bdev.DeviceRoles.OptionBits
-		if (bits & BdevRoleMeta) != 0 {
+		if bc.Bdev.DeviceRoles.HasMeta() {
 			return true
 		}
 	}
@@ -403,10 +402,9 @@ func (tcs TierConfigs) validateBdevRoles() error {
 			return FaultBdevConfigRolesMissing
 		}
 
-		bits := roles.OptionBits
-		hasWAL := (bits & BdevRoleWAL) != 0
-		hasMeta := (bits & BdevRoleMeta) != 0
-		hasData := (bits & BdevRoleData) != 0
+		hasWAL := roles.HasWAL()
+		hasMeta := roles.HasMeta()
+		hasData := roles.HasData()
 
 		// Disallow having both wal and data only on a tier.
 		if hasWAL && hasData && !hasMeta {
@@ -942,6 +940,33 @@ func (bdr *BdevRoles) String() string {
 	return bdr.toString(roleOptFlags)
 }
 
+// HasData returns true if BdevRoles has DATA role set.
+func (bdr *BdevRoles) HasData() bool {
+	if bdr == nil {
+		return false
+	}
+
+	return bdr.OptionBits&BdevRoleData != 0
+}
+
+// HasMeta returns true if BdevRoles has META role set.
+func (bdr *BdevRoles) HasMeta() bool {
+	if bdr == nil {
+		return false
+	}
+
+	return bdr.OptionBits&BdevRoleMeta != 0
+}
+
+// HasWAL returns true if BdevRoles has WAL role set.
+func (bdr *BdevRoles) HasWAL() bool {
+	if bdr == nil {
+		return false
+	}
+
+	return bdr.OptionBits&BdevRoleWAL != 0
+}
+
 // BdevConfig represents a Block Device (NVMe, etc.) configuration entry.
 type BdevConfig struct {
 	DeviceList *BdevDeviceList `yaml:"bdev_list,omitempty"`
@@ -1105,6 +1130,7 @@ type BdevAutoFaulty struct {
 	MaxCsumErrs uint32 `yaml:"max_csum_errs,omitempty" json:"max_csum_errs"`
 }
 
+// Config defines engine storage.
 type Config struct {
 	ControlMetadata ControlMetadata `yaml:"-"` // inherited from server
 	EngineIdx       uint            `yaml:"-"`
@@ -1118,6 +1144,7 @@ type Config struct {
 	AutoFaultyProps BdevAutoFaulty `yaml:"bdev_auto_faulty,omitempty"`
 }
 
+// SetNUMAAffinity enables the assignment of NUMA affinity to tier configs.
 func (c *Config) SetNUMAAffinity(node uint) {
 	c.NumaNodeIndex = node
 	for _, tier := range c.Tiers {
@@ -1125,14 +1152,12 @@ func (c *Config) SetNUMAAffinity(node uint) {
 	}
 }
 
+// GetBdevs retrieves bdev device list of storage tiers.
 func (c *Config) GetBdevs() *BdevDeviceList {
 	return c.Tiers.Bdevs()
 }
 
-func (c *Config) GetNVMeBdevs() *BdevDeviceList {
-	return c.Tiers.NVMeBdevs()
-}
-
+// Validate checks the validity of the storage config.
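The new BdevRoles helpers replace the open-coded OptionBits masking, and validateBdevRoles uses them for the tier rule shown above (a tier carrying both WAL and Data must also carry Meta). Below is a minimal standalone sketch of that check; the bit values and names are illustrative assumptions, the real BdevRole* constants live in the storage package.

package main

import (
	"errors"
	"fmt"
)

type optionBits uint16

// Illustrative role bits; not the storage package's actual values.
const (
	roleData optionBits = 1 << iota
	roleMeta
	roleWAL
)

func (b optionBits) hasData() bool { return b&roleData != 0 }
func (b optionBits) hasMeta() bool { return b&roleMeta != 0 }
func (b optionBits) hasWAL() bool  { return b&roleWAL != 0 }

// checkTierRoles mirrors the rule referenced above: a tier with both WAL and
// Data roles must also host Meta.
func checkTierRoles(roles optionBits) error {
	if roles.hasWAL() && roles.hasData() && !roles.hasMeta() {
		return errors.New("tier with wal and data roles must also host meta")
	}
	return nil
}

func main() {
	fmt.Println(checkTierRoles(roleWAL | roleData))            // rejected
	fmt.Println(checkTierRoles(roleWAL | roleData | roleMeta)) // accepted
}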
func (c *Config) Validate() error { if err := c.Tiers.Validate(); err != nil { return err diff --git a/src/dtx/dtx_coll.c b/src/dtx/dtx_coll.c index 7e7c8199155..ba45aaa2616 100644 --- a/src/dtx/dtx_coll.c +++ b/src/dtx/dtx_coll.c @@ -81,9 +81,11 @@ dtx_coll_prep_ult(void *arg) } if (dcpa->dcpa_result != 0) { - if (dcpa->dcpa_result != -DER_INPROGRESS && dcpa->dcpa_result != -DER_NONEXIST) - D_ERROR("Failed to load mbs for "DF_DTI", opc %u: "DF_RC"\n", - DP_DTI(&dci->dci_xid), opc, DP_RC(rc)); + if (dcpa->dcpa_result < 0 && + dcpa->dcpa_result != -DER_INPROGRESS && dcpa->dcpa_result != -DER_NONEXIST) + D_ERROR("Failed to load mbs for "DF_DTI" in "DF_UUID"/"DF_UUID", opc %u: " + DF_RC"\n", DP_DTI(&dci->dci_xid), DP_UUID(dci->dci_po_uuid), + DP_UUID(dci->dci_co_uuid), opc, DP_RC(dcpa->dcpa_result)); goto out; } diff --git a/src/dtx/dtx_common.c b/src/dtx/dtx_common.c index ecb156729ed..1ee74ae11a4 100644 --- a/src/dtx/dtx_common.c +++ b/src/dtx/dtx_common.c @@ -1271,7 +1271,6 @@ dtx_leader_end(struct dtx_leader_handle *dlh, struct ds_cont_hdl *coh, int resul int status = -1; int rc = 0; bool aborted = false; - bool unpin = false; D_ASSERT(cont != NULL); @@ -1339,7 +1338,7 @@ dtx_leader_end(struct dtx_leader_handle *dlh, struct ds_cont_hdl *coh, int resul * it persistently. Otherwise, the subsequent DTX resync may not find it as * to regard it as failed transaction and abort it. */ - if (result == 0 && !dth->dth_active && !dth->dth_prepared && !dth->dth_solo && + if (!dth->dth_active && !dth->dth_prepared && (dth->dth_dist || dth->dth_modification_cnt > 0)) { result = vos_dtx_attach(dth, true, dth->dth_ent != NULL ? true : false); if (unlikely(result < 0)) { @@ -1349,7 +1348,7 @@ dtx_leader_end(struct dtx_leader_handle *dlh, struct ds_cont_hdl *coh, int resul } } - if (dth->dth_prepared || dtx_batched_ult_max == 0) { + if ((dth->dth_prepared && !dlh->dlh_coll) || dtx_batched_ult_max == 0) { dth->dth_sync = 1; goto sync; } @@ -1363,14 +1362,12 @@ dtx_leader_end(struct dtx_leader_handle *dlh, struct ds_cont_hdl *coh, int resul if (DAOS_FAIL_CHECK(DAOS_DTX_MISS_COMMIT)) dth->dth_sync = 1; - /* For synchronous DTX, do not add it into CoS cache, otherwise, - * we may have no way to remove it from the cache. 
- */ if (dth->dth_sync) goto sync; D_ASSERT(dth->dth_mbs != NULL); +cache: if (dlh->dlh_coll) { rc = dtx_cos_add(cont, dlh->dlh_coll_entry, &dth->dth_leader_oid, dth->dth_dkey_hash, dth->dth_epoch, DCF_EXP_CMT | DCF_COLL); @@ -1378,38 +1375,47 @@ dtx_leader_end(struct dtx_leader_handle *dlh, struct ds_cont_hdl *coh, int resul size = sizeof(*dte) + sizeof(*mbs) + dth->dth_mbs->dm_data_size; D_ALLOC(dte, size); if (dte == NULL) { - dth->dth_sync = 1; - goto sync; - } - - mbs = (struct dtx_memberships *)(dte + 1); - memcpy(mbs, dth->dth_mbs, size - sizeof(*dte)); - - dte->dte_xid = dth->dth_xid; - dte->dte_ver = dth->dth_ver; - dte->dte_refs = 1; - dte->dte_mbs = mbs; + rc = -DER_NOMEM; + } else { + mbs = (struct dtx_memberships *)(dte + 1); + memcpy(mbs, dth->dth_mbs, size - sizeof(*dte)); + + dte->dte_xid = dth->dth_xid; + dte->dte_ver = dth->dth_ver; + dte->dte_refs = 1; + dte->dte_mbs = mbs; + + if (!(mbs->dm_flags & DMF_SRDG_REP)) + flags = DCF_EXP_CMT; + else if (dth->dth_modify_shared) + flags = DCF_SHARED; + else + flags = 0; - if (!(mbs->dm_flags & DMF_SRDG_REP)) - flags = DCF_EXP_CMT; - else if (dth->dth_modify_shared) - flags = DCF_SHARED; - else - flags = 0; + rc = dtx_cos_add(cont, dte, &dth->dth_leader_oid, dth->dth_dkey_hash, + dth->dth_epoch, flags); + dtx_entry_put(dte); + } + } - rc = dtx_cos_add(cont, dte, &dth->dth_leader_oid, dth->dth_dkey_hash, - dth->dth_epoch, flags); - dtx_entry_put(dte); + /* + * NOTE: If we failed to add the committable DTX into CoS cache, then we also have no way + * to commit (or abort) the DTX because of out of memory. Such DTX will be finally + * committed via next DTX resync (after recovered from OOM). + * + * Here, we only warning to notify the trouble, but not failed the transaction. + */ + if (rc != 0) { + D_WARN(DF_UUID": Fail to cache %s DTX "DF_DTI": "DF_RC"\n", + DP_UUID(cont->sc_uuid), dlh->dlh_coll ? "collective" : "regular", + DP_DTI(&dth->dth_xid), DP_RC(rc)); + D_GOTO(out, result = 0); } - if (rc == 0) { - if (!DAOS_FAIL_CHECK(DAOS_DTX_NO_COMMITTABLE)) { - vos_dtx_mark_committable(dth); - if (cont->sc_dtx_committable_count > DTX_THRESHOLD_COUNT || dlh->dlh_coll) - sched_req_wakeup(dss_get_module_info()->dmi_dtx_cmt_req); - } - } else { - dth->dth_sync = 1; + if (!DAOS_FAIL_CHECK(DAOS_DTX_NO_COMMITTABLE)) { + vos_dtx_mark_committable(dth); + if (cont->sc_dtx_committable_count > DTX_THRESHOLD_COUNT || dlh->dlh_coll) + sched_req_wakeup(dss_get_module_info()->dmi_dtx_cmt_req); } sync: @@ -1428,10 +1434,15 @@ dtx_leader_end(struct dtx_leader_handle *dlh, struct ds_cont_hdl *coh, int resul rc = dtx_commit(cont, &dte, NULL, 1, false); } - if (rc != 0) + if (rc != 0) { D_WARN(DF_UUID": Fail to sync %s commit DTX "DF_DTI": "DF_RC"\n", DP_UUID(cont->sc_uuid), dlh->dlh_coll ? "collective" : "regular", DP_DTI(&dth->dth_xid), DP_RC(rc)); + if (likely(dtx_batched_ult_max != 0)) { + dth->dth_sync = 0; + goto cache; + } + } /* * NOTE: The semantics of 'sync' commit does not guarantee that all @@ -1451,7 +1462,7 @@ dtx_leader_end(struct dtx_leader_handle *dlh, struct ds_cont_hdl *coh, int resul * to locally retry for avoiding related forwarded RPC timeout, instead, * The leader will trigger retry globally without abort 'prepared' ones. */ - if (unpin || (result < 0 && result != -DER_AGAIN && !dth->dth_solo)) { + if (result < 0 && result != -DER_AGAIN && !dth->dth_solo) { /* 1. Drop partial modification for distributed transaction. * 2. Remove the pinned DTX entry. 
*/ diff --git a/src/dtx/dtx_cos.c b/src/dtx/dtx_cos.c index 4c165f94d0c..0f6dd1c5913 100644 --- a/src/dtx/dtx_cos.c +++ b/src/dtx/dtx_cos.c @@ -1,5 +1,5 @@ /** - * (C) Copyright 2019-2023 Intel Corporation. + * (C) Copyright 2019-2024 Intel Corporation. * * SPDX-License-Identifier: BSD-2-Clause-Patent */ diff --git a/src/dtx/dtx_rpc.c b/src/dtx/dtx_rpc.c index 2ccbfec2734..6d34e871269 100644 --- a/src/dtx/dtx_rpc.c +++ b/src/dtx/dtx_rpc.c @@ -1657,8 +1657,9 @@ dtx_coll_commit(struct ds_cont_child *cont, struct dtx_coll_entry *dce, struct d } D_CDEBUG(rc != 0 || rc1 != 0 || rc2 != 0, DLOG_ERR, DB_TRACE, - "Collectively commit DTX "DF_DTI": %d/%d/%d\n", - DP_DTI(&dce->dce_xid), rc, rc1, rc2); + "Collectively commit DTX "DF_DTI" in "DF_UUID"/"DF_UUID": %d/%d/%d\n", + DP_DTI(&dce->dce_xid), DP_UUID(cont->sc_pool_uuid), DP_UUID(cont->sc_uuid), + rc, rc1, rc2); return rc != 0 ? rc : rc1 != 0 ? rc1 : rc2; } @@ -1717,8 +1718,9 @@ dtx_coll_abort(struct ds_cont_child *cont, struct dtx_coll_entry *dce, daos_epoc rc2 = 0; D_CDEBUG(rc != 0 || rc1 != 0 || rc2 != 0, DLOG_ERR, DB_TRACE, - "Collectively abort DTX "DF_DTI": %d/%d/%d\n", - DP_DTI(&dce->dce_xid), rc, rc1, rc2); + "Collectively abort DTX "DF_DTI" with epoch "DF_X64" in " + DF_UUID"/"DF_UUID": %d/%d/%d\n", DP_DTI(&dce->dce_xid), epoch, + DP_UUID(cont->sc_pool_uuid), DP_UUID(cont->sc_uuid), rc, rc1, rc2); return rc != 0 ? rc : rc1 != 0 ? rc1 : rc2; } @@ -1766,8 +1768,9 @@ dtx_coll_check(struct ds_cont_child *cont, struct dtx_coll_entry *dce, daos_epoc } D_CDEBUG((rc < 0 && rc != -DER_NONEXIST) || (rc1 < 0 && rc1 != -DER_NONEXIST), DLOG_ERR, - DB_TRACE, "Collectively check DTX "DF_DTI": %d/%d/\n", - DP_DTI(&dce->dce_xid), rc, rc1); + DB_TRACE, "Collectively check DTX "DF_DTI" in "DF_UUID"/"DF_UUID": %d/%d/\n", + DP_DTI(&dce->dce_xid), DP_UUID(cont->sc_pool_uuid), DP_UUID(cont->sc_uuid), + rc, rc1); return dce->dce_ranks != NULL ? 
rc : rc1; } diff --git a/src/dtx/dtx_srv.c b/src/dtx/dtx_srv.c index 41480b6e8b0..0885b7f6cdc 100644 --- a/src/dtx/dtx_srv.c +++ b/src/dtx/dtx_srv.c @@ -474,8 +474,9 @@ dtx_coll_handler(crt_rpc_t *rpc) out: D_CDEBUG(rc < 0, DLOG_ERR, DB_TRACE, - "Handled collective DTX PRC %u on rank %u for "DF_DTI": "DF_RC"\n", - opc, myrank, DP_DTI(&dci->dci_xid), DP_RC(rc)); + "Handled collective DTX PRC %u on rank %u for "DF_DTI" in " + DF_UUID"/"DF_UUID": "DF_RC"\n", opc, myrank, DP_DTI(&dci->dci_xid), + DP_UUID(dci->dci_po_uuid), DP_UUID(dci->dci_co_uuid), DP_RC(rc)); dco->dco_status = rc; rc = crt_reply_send(rpc); diff --git a/src/engine/sched.c b/src/engine/sched.c index 1fe400204be..49a46ca3618 100644 --- a/src/engine/sched.c +++ b/src/engine/sched.c @@ -781,7 +781,7 @@ check_space_pressure(struct dss_xstream *dx, struct sched_pool_info *spi) { struct sched_info *info = &dx->dx_sched_info; struct vos_pool_space vps = { 0 }; - uint64_t scm_left, nvme_left; + uint64_t scm_left, nvme_left, ne_left, ne_sys; struct pressure_ratio *pr; int orig_pressure, rc; @@ -814,6 +814,17 @@ check_space_pressure(struct dss_xstream *dx, struct sched_pool_info *spi) else scm_left = 0; + if (vps.vps_ne_total == 0) { + ne_left = UINT64_MAX; + } else { + D_ASSERT(vps.vps_ne_total < SCM_TOTAL(&vps)); + ne_sys = SCM_SYS(&vps) * vps.vps_ne_total / SCM_TOTAL(&vps); + if (vps.vps_ne_free > ne_sys) + ne_left = vps.vps_ne_free - ne_sys; + else + ne_left = 0; + } + if (NVME_TOTAL(&vps) == 0) /* NVMe not enabled */ nvme_left = UINT64_MAX; else if (NVME_FREE(&vps) > NVME_SYS(&vps)) @@ -824,7 +835,8 @@ check_space_pressure(struct dss_xstream *dx, struct sched_pool_info *spi) orig_pressure = spi->spi_space_pressure; for (pr = &pressure_gauge[0]; pr->pr_free != 0; pr++) { if (scm_left > (SCM_TOTAL(&vps) * pr->pr_free / 100) && - nvme_left > (NVME_TOTAL(&vps) * pr->pr_free / 100)) + nvme_left > (NVME_TOTAL(&vps) * pr->pr_free / 100) && + ne_left > (vps.vps_ne_total * pr->pr_free / 100)) break; } spi->spi_space_pressure = pr->pr_pressure; @@ -832,10 +844,11 @@ check_space_pressure(struct dss_xstream *dx, struct sched_pool_info *spi) if (spi->spi_space_pressure != SCHED_SPACE_PRESS_NONE && spi->spi_space_pressure != orig_pressure) { D_INFO("Pool:"DF_UUID" is under %d pressure, " - "SCM: tot["DF_U64"], sys["DF_U64"], free["DF_U64"] " + "SCM: tot["DF_U64"], sys["DF_U64"], free["DF_U64"], ne["DF_U64"/"DF_U64"] " "NVMe: tot["DF_U64"], sys["DF_U64"], free["DF_U64"]\n", DP_UUID(spi->spi_pool_id), spi->spi_space_pressure, SCM_TOTAL(&vps), SCM_SYS(&vps), SCM_FREE(&vps), + vps.vps_ne_free, vps.vps_ne_total, NVME_TOTAL(&vps), NVME_SYS(&vps), NVME_FREE(&vps)); spi->spi_pressure_ts = info->si_cur_ts; diff --git a/src/include/daos/mem.h b/src/include/daos/mem.h index ffd3296b4eb..6b016f8fcf9 100644 --- a/src/include/daos/mem.h +++ b/src/include/daos/mem.h @@ -1,5 +1,5 @@ /** - * (C) Copyright 2016-2023 Intel Corporation. + * (C) Copyright 2016-2024 Intel Corporation. 
* * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -30,19 +30,30 @@ int umempobj_settings_init(bool md_on_ssd); /* convert backend type to umem class id */ int umempobj_backend_type2class_id(int backend); +/* get page size for the backend */ +size_t +umempobj_pgsz(int backend); + /* umem persistent object property flags */ #define UMEMPOBJ_ENABLE_STATS 0x1 #ifdef DAOS_PMEM_BUILD + +/* The backend type is stored in meta blob header, don't change the value */ enum { DAOS_MD_PMEM = 0, DAOS_MD_BMEM = 1, DAOS_MD_ADMEM = 2, + DAOS_MD_BMEM_V2 = 3, }; /* return umem backend type */ int umempobj_get_backend_type(void); +/* returns whether bmem_v2 pools are allowed */ +bool +umempobj_allow_md_bmem_v2(); + #endif struct umem_wal_tx; @@ -108,7 +119,12 @@ struct umem_store_iod { struct umem_store; struct umem_store_ops { - int (*so_load)(struct umem_store *store, char *start); + int (*so_waitqueue_create)(void **wq); + void (*so_waitqueue_destroy)(void *wq); + void (*so_waitqueue_wait)(void *wq, bool yield_only); + void (*so_waitqueue_wakeup)(void *wq, bool wakeup_all); + int (*so_load)(struct umem_store *store, char *start_addr, daos_off_t offset, + daos_size_t len); int (*so_read)(struct umem_store *store, struct umem_store_iod *iod, d_sg_list_t *sgl); int (*so_write)(struct umem_store *store, struct umem_store_iod *iod, @@ -151,6 +167,8 @@ struct umem_store { struct umem_store_ops *stor_ops; /* backend type */ int store_type; + /* whether the store has evictable zones */ + bool store_evictable; /* standalone store */ bool store_standalone; /* backend SSD is in faulty state */ @@ -169,6 +187,312 @@ struct umem_pool { struct umem_slab_desc up_slabs[0]; }; +#ifdef DAOS_PMEM_BUILD +#define UMEM_CACHE_PAGE_SZ_SHIFT 24 /* 16MB */ +#define UMEM_CACHE_PAGE_SZ (1 << UMEM_CACHE_PAGE_SZ_SHIFT) + +#define UMEM_CACHE_CHUNK_SZ_SHIFT 12 /* 4KB */ +#define UMEM_CACHE_CHUNK_SZ (1 << UMEM_CACHE_CHUNK_SZ_SHIFT) +#define UMEM_CACHE_CHUNK_SZ_MASK (UMEM_CACHE_CHUNK_SZ - 1) + +#define UMEM_CACHE_MIN_EVICTABLE_PAGES 2 + +enum umem_page_event_types { + UMEM_CACHE_EVENT_PGLOAD = 0, + UMEM_CACHE_EVENT_PGEVICT +}; + +struct umem_page_info; +/* MD page */ +struct umem_page { + /** Pointing to memory page when it's mapped */ + struct umem_page_info *pg_info; +}; + +enum umem_page_stats { + UMEM_PG_STATS_NONEVICTABLE = 0, + UMEM_PG_STATS_PINNED, + UMEM_PG_STATS_FREE, + UMEM_PG_STATS_MAX, +}; + +enum umem_cache_stats { + /* How many page cache hit */ + UMEM_CACHE_STATS_HIT = 0, + /* How many page cache miss */ + UMEM_CACHE_STATS_MISS, + /* How many pages are evicted */ + UMEM_CACHE_STATS_EVICT, + /* How many dirty pages are flushed on evicting */ + UMEM_CACHE_STATS_FLUSH, + /* How many pages are loaded on cache miss */ + UMEM_CACHE_STATS_LOAD, + UMEM_CACHE_STATS_MAX, +}; + +/** Global cache status for each umem_store */ +struct umem_cache { + struct umem_store *ca_store; + /** Base address of the page cache */ + void *ca_base; + /** Offset of first page */ + uint32_t ca_base_off; + /** Cache Mode */ + uint32_t ca_mode; + /** WAL replay status */ + uint32_t ca_replay_done; + /** Total MD pages */ + uint32_t ca_md_pages; + /** Total memory pages in cache */ + uint32_t ca_mem_pages; + /** Maximum non-evictable memory pages */ + uint32_t ca_max_ne_pages; + /** Page size */ + uint32_t ca_page_sz; + /** Page size shift */ + uint32_t ca_page_shift; + /** Page size mask */ + uint32_t ca_page_mask; + /** Per-page Bitmap size (in uint64_t) */ + uint32_t ca_bmap_sz; + /** Free list for unmapped page info */ + d_list_t ca_pgs_free; + /** 
Non-evictable & evictable dirty pages */ + d_list_t ca_pgs_dirty; + /** All Non-evictable[0] & evictable[1] pages */ + d_list_t ca_pgs_lru[2]; + /** all the pages in the progress of flushing */ + d_list_t ca_pgs_flushing; + /** all the pages waiting for commit */ + d_list_t ca_pgs_wait_commit; + /** all the pages being pinned */ + d_list_t ca_pgs_pinned; + /** Highest committed transaction ID */ + uint64_t ca_commit_id; + /** Callback to tell if a page is evictable */ + bool (*ca_evictable_fn)(void *arg, uint32_t pg_id); + /** Callback being called on page loaded/evicted */ + int (*ca_evtcb_fn)(int event_type, void *arg, uint32_t pg_id); + /** Argument to the callback function */ + void *ca_fn_arg; + /** Page stats */ + uint32_t ca_pgs_stats[UMEM_PG_STATS_MAX]; + /** Cache stats */ + uint64_t ca_cache_stats[UMEM_CACHE_STATS_MAX]; + /** How many waiters waiting on free page reserve */ + uint32_t ca_reserve_waiters; + /** Waitqueue for free page reserve: umem_cache_reserve() */ + void *ca_reserve_wq; + /** TODO: some other global status */ + /** MD page array, array index is page ID */ + struct umem_page ca_pages[0]; +}; + +struct umem_cache_chkpt_stats { + /** Last committed checkpoint id */ + uint64_t *uccs_chkpt_id; + /** Number of pages processed */ + int uccs_nr_pages; + /** Number of dirty chunks copied */ + int uccs_nr_dchunks; + /** Number of sgl iovs used to copy dirty chunks */ + int uccs_nr_iovs; +}; + +/** Allocate global cache for umem store. + * + * \param[in] store The umem store + * \param[in] page_sz Page size + * \param[in] md_pgs Total MD pages + * \param[in] mem_pgs Total memory pages + * \param[in] max_ne_pgs Maximum Non-evictable pages + * \param[in] base_off Offset of the umem cache base + * \param[in] base Start address of the page cache + * \param[in] is_evictable_fn Callback function to check if page is evictable + * \param[in] pageload_fn Callback called on page being loaded + * \param[in] arg Argument to callback functions. + * + * \return 0 on success + */ +int +umem_cache_alloc(struct umem_store *store, uint32_t page_sz, uint32_t md_pgs, uint32_t mem_pgs, + uint32_t max_ne_pgs, uint32_t base_off, void *base, + bool (*is_evictable_fn)(void *arg, uint32_t pg_id), + int (*evtcb_fn)(int evt_flag, void *arg, uint32_t pg_id), void *arg); + +/** Free global cache for umem store. + * + * \param[in] store Store for which to free cache + * + * \return 0 on success + */ +int +umem_cache_free(struct umem_store *store); + +/** Check MD-blob offset is already loaded onto umem cache. + * + * \param[in] store The umem store + * \param[in] offset MD-blob offset to be converted + * + * \return true or false + */ +bool +umem_cache_offisloaded(struct umem_store *store, umem_off_t offset); + +/** Convert MD-blob offset to memory pointer, the corresponding page must be mapped already. + * + * \param[in] store The umem store + * \param[in] offset MD-blob offset to be converted + * + * \return Memory pointer + */ +void * +umem_cache_off2ptr(struct umem_store *store, umem_off_t offset); + +/** Convert memory pointer to MD-blob offset, the corresponding page must be mapped already. + * + * \param[in] store The umem store + * \param[in] ptr Memory pointer to be converted + * + * \return MD-blob offset + */ +umem_off_t +umem_cache_ptr2off(struct umem_store *store, const void *ptr); + +/** Update umem_cache post WAL replay. This routine is called after + * WAL replay and the evictability of all pages are determined. 
+ * + * \param[in] store The umem store + * + * \return None + */ +void +umem_cache_post_replay(struct umem_store *store); + +struct umem_cache_range { + umem_off_t cr_off; + daos_size_t cr_size; +}; + +/** Map MD pages in specified range to memory pages. The range to be mapped should be empty + * (no page loading required). If caller tries to map non-evictable pages, page eviction + * won't be triggered when there are not enough free pages; If caller tries to map evictable + * page, page eviction could be triggered, but it can only map single evictable page at a time. + * + * \param[in] store The umem store + * \param[in] ranges Ranges to be mapped + * \param[in] range_nr Number of ranges + * + * \return 0 : On success + * -DER_BUSY : Not enough free pages + * -ve : Errors + */ +int +umem_cache_map(struct umem_store *store, struct umem_cache_range *ranges, int range_nr); + +/** Load & map MD pages in specified range to memory pages. + * + * \param[in] store The umem store + * \param[in] ranges Ranges to be mapped + * \param[in] range_nr Number of ranges + * \param[in] for_sys Internal access from system ULTs (aggregation etc.) + * + * \return 0 on success, negative value on error. + */ +int +umem_cache_load(struct umem_store *store, struct umem_cache_range *ranges, int range_nr, + bool for_sys); + +struct umem_pin_handle; + +/** Load & map MD pages in specified range to memory pages, then take a reference on the mapped + * memory pages, so that the pages won't be evicted until unpin is called. It's usually for the + * cases where we need the pages stay loaded across a yield. + * + * \param[in] store The umem store + * \param[in] ranges Ranges to be pinned + * \param[in] range_nr Number of ranges + * \param[in] for_sys Internal access from system ULTs (aggregation etc.) + * \param[out] pin_handle Returned pin handle + * + * \return 0 on success + */ +int +umem_cache_pin(struct umem_store *store, struct umem_cache_range *rangs, int range_nr, bool for_sys, + struct umem_pin_handle **pin_handle); + +/** Unpin the pages pinned by prior umem_cache_pin(). + * + * \param[in] store The umem store + * \param[in] pin_handle Pin handle got from umem_cache_pin() + * \param[in] range_nr Number of ranges + */ +void +umem_cache_unpin(struct umem_store *store, struct umem_pin_handle *pin_handle); + +/** Reserve few free pages for potential non-evictable zone grow within a transaction. + * Caller needs to ensure there is no CPU yielding after this call till transaction + * start. + * + * \param[in] store The umem store + * + * \return 0 on success + */ +int +umem_cache_reserve(struct umem_store *store); + +/** Inform umem cache the last committed ID. + * + * \param[in] store The umem store + * \param[in] commit_id The last committed ID + */ +void +umem_cache_commit(struct umem_store *store, uint64_t commit_id); + +/** + * Touched the region identified by @addr and @size, it will mark pages in this region as + * dirty (also set bitmap within each page), and put it on dirty list + * + * This function is called by allocator(probably VOS as well) each time it creates memory + * snapshot (calls tx_snap) or just to mark a region to be flushed. + * + * \param[in] store The umem store + * \param[in] wr_tx The writing transaction + * \param[in] addr The start address + * \param[in] size size of dirty region + * + * \return 0 on success, -DER_CHKPT_BUSY if a checkpoint is in progress on the page. The calling + * transaction must either abort or find another location to modify. 
+ */ +int +umem_cache_touch(struct umem_store *store, uint64_t wr_tx, umem_off_t addr, daos_size_t size); + +/** Callback for checkpoint to wait for the commit of chkpt_tx. + * + * \param[in] arg Argument passed to umem_cache_checkpoint + * \param[in] chkpt_tx The WAL transaction ID we are waiting to commit to WAL + * \param[out] committed_tx The WAL tx ID of the last transaction committed to WAL + */ +typedef void +umem_cache_wait_cb_t(void *arg, uint64_t chkpt_tx, uint64_t *committed_tx); + +/** + * This function can yield internally, it is called by checkpoint service of upper level stack. + * + * \param[in] store The umem store + * \param[in] wait_cb Callback for to wait for wal commit completion + * \param[in] arg argument for wait_cb + * \param[in,out] chkpt_id Input is last committed id, output is checkpointed id + * \param[out] chkpt_stats check point stats + * + * \return 0 on success + */ +int +umem_cache_checkpoint(struct umem_store *store, umem_cache_wait_cb_t wait_cb, void *arg, + uint64_t *chkpt_id, struct umem_cache_chkpt_stats *chkpt_stats); + +#endif /*DAOS_PMEM_BUILD*/ + /* umem persistent object functions */ struct umem_pool *umempobj_create(const char *path, const char *layout_name, int prop_flags, size_t poolsize, @@ -179,6 +503,9 @@ void umempobj_close(struct umem_pool *pool); void *umempobj_get_rootptr(struct umem_pool *pool, size_t size); int umempobj_get_heapusage(struct umem_pool *pool, daos_size_t *cur_allocated); +int + umempobj_get_mbusage(struct umem_pool *pool, uint32_t mb_id, daos_size_t *cur_allocated, + daos_size_t *maxsz); void umempobj_log_fraginfo(struct umem_pool *pool); /** Number of flag bits to reserve for encoding extra information in @@ -273,6 +600,8 @@ typedef enum { UMEM_CLASS_BMEM, /** ad-hoc memory */ UMEM_CLASS_ADMEM, + /** blob backed memory v2 */ + UMEM_CLASS_BMEM_V2, /** unknown */ UMEM_CLASS_UNKNOWN, } umem_class_id_t; @@ -314,6 +643,9 @@ struct umem_instance; #define UMEM_FLAG_NO_FLUSH (((uint64_t)1) << 1) #define UMEM_XADD_NO_SNAPSHOT (((uint64_t)1) << 2) +/* Macros associated with Memory buckets */ +#define UMEM_DEFAULT_MBKT_ID 0 + /* type num used by umem ops */ enum { UMEM_TYPE_ANY, @@ -334,11 +666,12 @@ typedef struct { * * \param umm [IN] umem class instance. * \param size [IN] size to allocate. - * \param flags [IN] flags like zeroing, noflush (for PMDK) - * \param type_num [IN] struct type (for PMDK) + * \param flags [IN] flags like zeroing, noflush (for PMDK and BMEM) + * \param type_num [IN] struct type (for PMDK and BMEM) + * \param mbkt_id [IN] memory bucket id (for BMEM) */ - umem_off_t (*mo_tx_alloc)(struct umem_instance *umm, size_t size, - uint64_t flags, unsigned int type_num); + umem_off_t (*mo_tx_alloc)(struct umem_instance *umm, size_t size, uint64_t flags, + unsigned int type_num, unsigned int mbkt_id); /** * Add the specified range of umoff to current memory transaction. * @@ -361,7 +694,7 @@ typedef struct { * \param offset [IN] start offset of \a umoff tracked by the * transaction. * \param size [IN] size of \a umoff tracked by the transaction. - * \param flags [IN] PMDK flags + * \param flags [IN] PMDK and BMEM flags */ int (*mo_tx_xadd)(struct umem_instance *umm, umem_off_t umoff, uint64_t offset, @@ -394,9 +727,10 @@ typedef struct { * \param act [IN|OUT] action used for later cancel/publish. * \param size [IN] size to be reserved. 
* \param type_num [IN] struct type (for PMDK) + * \param mbkt_id [IN] memory bucket id (for BMEM) */ - umem_off_t (*mo_reserve)(struct umem_instance *umm, void *act, size_t size, - unsigned int type_num); + umem_off_t (*mo_reserve)(struct umem_instance *umm, void *act, size_t size, + unsigned int type_num, unsigned int mbkt_id); /** * Defer free til commit. For use with reserved extents that are not @@ -446,13 +780,14 @@ typedef struct { /** * allocate umoff with the specified size & flags atomically * - * \param umm [IN] umem class instance. - * \param size [IN] size to allocate. - * \param flags [IN] flags like zeroing, noflush (for PMDK) - * \param type_num [IN] struct type (for PMDK) + * \param umm [IN] umem class instance. + * \param size [IN] size to allocate. + * \param flags [IN] flags like zeroing, noflush (for PMDK) + * \param type_num [IN] struct type (for PMDK) + * \param mbkt_id [IN] memory bucket id (for BMEM) */ - umem_off_t (*mo_atomic_alloc)(struct umem_instance *umm, size_t size, - unsigned int type_num); + umem_off_t (*mo_atomic_alloc)(struct umem_instance *umm, size_t size, unsigned int type_num, + unsigned int mbkt_id); /** * flush data at specific offset to persistent store. @@ -464,6 +799,14 @@ typedef struct { void (*mo_atomic_flush)(struct umem_instance *umm, void *addr, size_t size); + /** + * returns an evictable memory bucket for tasks like new object creation etc. + * + * \param umm [IN] umem class instance. + * \param flags [IN] flags for MB selection criteria. Currently unused. + */ + uint32_t (*mo_allot_evictable_mb)(struct umem_instance *umm, int flags); + #endif /** * Add one commit or abort callback to current transaction. @@ -522,6 +865,10 @@ umem_off2ptr(const struct umem_instance *umm, umem_off_t umoff) if (UMOFF_IS_NULL(umoff)) return NULL; +#ifdef DAOS_PMEM_BUILD + if (umm->umm_pool && (umm->umm_pool->up_store.store_type == DAOS_MD_BMEM_V2)) + return umem_cache_off2ptr(&umm->umm_pool->up_store, umem_off2offset(umoff)); +#endif return (void *)(umm->umm_base + umem_off2offset(umoff)); } @@ -538,7 +885,12 @@ umem_ptr2off(const struct umem_instance *umm, void *ptr) if (ptr == NULL) return UMOFF_NULL; - return (umem_off_t)ptr - umm->umm_base; +#ifdef DAOS_PMEM_BUILD + if (umm->umm_pool && (umm->umm_pool->up_store.store_type == DAOS_MD_BMEM_V2)) { + return umem_cache_ptr2off(&umm->umm_pool->up_store, ptr); + } else +#endif + return (umem_off_t)ptr - umm->umm_base; } /** @@ -558,11 +910,11 @@ umem_has_tx(struct umem_instance *umm) return umm->umm_ops->mo_tx_add != NULL; } -#define umem_alloc_verb(umm, flags, size) \ +#define umem_alloc_verb(umm, flags, size, mbkt_id) \ ({ \ umem_off_t __umoff; \ \ - __umoff = (umm)->umm_ops->mo_tx_alloc(umm, size, flags, UMEM_TYPE_ANY); \ + __umoff = (umm)->umm_ops->mo_tx_alloc(umm, size, flags, UMEM_TYPE_ANY, mbkt_id); \ D_ASSERTF(umem_off2flags(__umoff) == 0, \ "Invalid assumption about allocnot using flag bits"); \ D_DEBUG(DB_MEM, \ @@ -573,14 +925,17 @@ umem_has_tx(struct umem_instance *umm) __umoff; \ }) -#define umem_alloc(umm, size) \ - umem_alloc_verb(umm, 0, size) +#define umem_alloc(umm, size) umem_alloc_verb(umm, 0, size, UMEM_DEFAULT_MBKT_ID) + +#define umem_alloc_from_bucket(umm, size, mbkt_id) umem_alloc_verb(umm, 0, size, mbkt_id) -#define umem_zalloc(umm, size) \ - umem_alloc_verb(umm, UMEM_FLAG_ZERO, size) +#define umem_zalloc(umm, size) umem_alloc_verb(umm, UMEM_FLAG_ZERO, size, UMEM_DEFAULT_MBKT_ID) -#define umem_alloc_noflush(umm, size) \ - umem_alloc_verb(umm, UMEM_FLAG_NO_FLUSH, size) +#define 
umem_zalloc_from_bucket(umm, size, mbkt_id) \ + umem_alloc_verb(umm, UMEM_FLAG_ZERO, size, mbkt_id) + +#define umem_alloc_noflush(umm, size) \ + umem_alloc_verb(umm, UMEM_FLAG_NO_FLUSH, size, UMEM_DEFAULT_MBKT_ID) #define umem_free(umm, umoff) \ ({ \ @@ -736,13 +1091,20 @@ int umem_rsrvd_act_realloc(struct umem_instance *umm, struct umem_rsrvd_act **ac /* Free up the array of reserved actions */ int umem_rsrvd_act_free(struct umem_rsrvd_act **act); -umem_off_t umem_reserve(struct umem_instance *umm, - struct umem_rsrvd_act *rsrvd_act, size_t size); -void umem_defer_free(struct umem_instance *umm, umem_off_t off, - struct umem_rsrvd_act *rsrvd_act); -void umem_cancel(struct umem_instance *umm, struct umem_rsrvd_act *rsrvd_act); -int umem_tx_publish(struct umem_instance *umm, - struct umem_rsrvd_act *rsrvd_act); +umem_off_t +umem_reserve_common(struct umem_instance *umm, struct umem_rsrvd_act *rsrvd_act, size_t size, + unsigned int mbkt_id); +#define umem_reserve(umm, rsrvd_act, size) \ + umem_reserve_common(umm, rsrvd_act, size, UMEM_DEFAULT_MBKT_ID) +#define umem_reserve_from_bucket(umm, rsrvd_act, size, mbkt_id) \ + umem_reserve_common(umm, rsrvd_act, size, mbkt_id) + +void +umem_defer_free(struct umem_instance *umm, umem_off_t off, struct umem_rsrvd_act *rsrvd_act); +void +umem_cancel(struct umem_instance *umm, struct umem_rsrvd_act *rsrvd_act); +int +umem_tx_publish(struct umem_instance *umm, struct umem_rsrvd_act *rsrvd_act); static inline void * umem_atomic_copy(struct umem_instance *umm, void *dest, void *src, size_t len, @@ -756,7 +1118,15 @@ static inline umem_off_t umem_atomic_alloc(struct umem_instance *umm, size_t len, unsigned int type_num) { D_ASSERT(umm->umm_ops->mo_atomic_alloc != NULL); - return umm->umm_ops->mo_atomic_alloc(umm, len, type_num); + return umm->umm_ops->mo_atomic_alloc(umm, len, type_num, UMEM_DEFAULT_MBKT_ID); +} + +static inline umem_off_t +umem_atomic_alloc_from_bucket(struct umem_instance *umm, size_t len, unsigned int type_num, + unsigned int mbkt_id) +{ + D_ASSERT(umm->umm_ops->mo_atomic_alloc != NULL); + return umm->umm_ops->mo_atomic_alloc(umm, len, type_num, mbkt_id); } static inline int @@ -786,6 +1156,48 @@ umem_tx_add_callback(struct umem_instance *umm, struct umem_tx_stage_data *txd, return umm->umm_ops->mo_tx_add_callback(umm, txd, stage, cb, data); } +/** + * Allot an evictable memory bucket for tasks like new object creation, etc. + * + * \param[in] umm umem instance pointer. + * \param[in] flags MB selection criteria. + * + * \return id > 0, memory bucket id. + * id = 0, no evictable memory bucket was chosen. + */ +static inline uint32_t +umem_allot_mb_evictable(struct umem_instance *umm, int flags) +{ + if (umm->umm_ops->mo_allot_evictable_mb) + return umm->umm_ops->mo_allot_evictable_mb(umm, flags); + else + return 0; +} + +/** + * Get memory bucket id associated with the offset. + * + * \param[in] umm umem instance pointer. + * \param[in] off offset within the umem pool + * + * \return id > 0, id of evictable memory bucket. + * id = 0, Memory bucket is non-evictable. + */ +uint32_t +umem_get_mb_from_offset(struct umem_instance *umm, umem_off_t off); + +/** + * Get the base offset of the memory bucket. + * + * \param[in] umm umem instance pointer. + * \param[in] mb_id memory bucket id. + * + * \return off > 0, base offset of evictable memory bucket. + * off = 0, base offset of non-evictable memory bucket.
+ */ +umem_off_t +umem_get_mb_base_offset(struct umem_instance *umm, uint32_t mb_id); + /*********************************************************************************/ /* Type of memory actions */ @@ -855,219 +1267,5 @@ struct umem_action { }; }; -#define UMEM_CACHE_PAGE_SZ_SHIFT 24 /* 16MB */ -#define UMEM_CACHE_PAGE_SZ (1 << UMEM_CACHE_PAGE_SZ_SHIFT) -#define UMEM_CACHE_PAGE_SZ_MASK (UMEM_CACHE_PAGE_SZ - 1) - -#define UMEM_CACHE_CHUNK_SZ_SHIFT 12 /* 4KB */ -#define UMEM_CACHE_CHUNK_SZ (1 << UMEM_CACHE_CHUNK_SZ_SHIFT) -#define UMEM_CACHE_CHUNK_SZ_MASK (UMEM_CACHE_CHUNK_SZ - 1) - -#define UMEM_CACHE_BMAP_SZ (1 << (UMEM_CACHE_PAGE_SZ_SHIFT - UMEM_CACHE_CHUNK_SZ_SHIFT - 6)) - -struct umem_page_info; -/** 16 MB page */ -struct umem_page { - /** page ID */ - unsigned int pg_id; - /** refcount */ - int pg_ref; - /** page info */ - struct umem_page_info *pg_info; -}; - -/** Global cache status for each umem_store */ -struct umem_cache { - struct umem_store *ca_store; - /** Total pages store */ - uint64_t ca_num_pages; - /** Total pages in cache */ - uint64_t ca_mapped; - /** Maximum number of cached pages */ - uint64_t ca_max_mapped; - /** Free list for mapped page info */ - d_list_t ca_pi_free; - /** all the dirty pages */ - d_list_t ca_pgs_dirty; - /** Pages waiting for copy to DMA buffer */ - d_list_t ca_pgs_copying; - /** LRU list all pages not in one of the other states for future eviction support */ - d_list_t ca_pgs_lru; - /** TODO: some other global status */ - /** All pages, sorted by umem_page::pg_id */ - struct umem_page ca_pages[0]; -}; - -struct umem_cache_chkpt_stats { - /** Last committed checkpoint id */ - uint64_t *uccs_chkpt_id; - /** Number of pages processed */ - int uccs_nr_pages; - /** Number of dirty chunks copied */ - int uccs_nr_dchunks; - /** Number of sgl iovs used to copy dirty chunks */ - int uccs_nr_iovs; -}; - -static inline uint64_t -umem_cache_size2pages(uint64_t len) -{ - D_ASSERT((len & UMEM_CACHE_PAGE_SZ_MASK) == 0); - - return len >> UMEM_CACHE_PAGE_SZ_SHIFT; -} - -static inline uint64_t -umem_cache_size_round(uint64_t len) -{ - return (len + UMEM_CACHE_PAGE_SZ_MASK) & ~UMEM_CACHE_PAGE_SZ_MASK; -} - -static inline struct umem_page * -umem_cache_off2page(struct umem_cache *cache, umem_off_t offset) -{ - uint64_t idx = offset >> UMEM_CACHE_PAGE_SZ_SHIFT; - - D_ASSERTF(idx < cache->ca_num_pages, - "offset=" DF_U64 ", num_pages=" DF_U64 ", idx=" DF_U64 "\n", offset, - cache->ca_num_pages, idx); - - return &cache->ca_pages[idx]; -} - -/** From a mapped page address, return the umem_cache it belongs to */ -static inline struct umem_cache * -umem_page2cache(struct umem_page *page) -{ - return (struct umem_cache *)container_of(&page[-page->pg_id], struct umem_cache, ca_pages); -} - -/** From a mapped page address, return the umem_store it belongs to */ -static inline struct umem_store * -umem_page2store(struct umem_page *page) -{ - return umem_page2cache(page)->ca_store; -} - -/** Allocate global cache for umem store. All 16MB pages are initially unmapped - * - * \param[in] store The umem store - * \param[in] max_mapped 0 or Maximum number of mapped 16MB pages (must be 0 for now) - * - * \return 0 on success - */ -int -umem_cache_alloc(struct umem_store *store, uint64_t max_mapped); - -/** Free global cache for umem store. 
Pages must be unmapped first - * - * \param[in] store Store for which to free cache - * - * \return 0 on success - */ -int -umem_cache_free(struct umem_store *store); - -/** Query if the page cache has enough space to map a range - * - * \param[in] store The store - * \param[in] num_pages Number of pages to bring into cache - * - * \return number of pages that need eviction to support mapping the range - */ -int -umem_cache_check(struct umem_store *store, uint64_t num_pages); - -/** Evict the pages. This invokes the unmap callback. (XXX: not yet implemented) - * - * \param[in] store The store - * \param[in] num_pages Number of pages to evict - * - * \return 0 on success, -DER_BUSY means a checkpoint is needed to evict the pages - */ -int -umem_cache_evict(struct umem_store *store, uint64_t num_pages); - -/** Adds a mapped range of pages to the page cache. - * - * \param[in] store The store - * \param[in] offset The offset in the umem cache - * \param[in] start_addr Start address of mapping - * \param[in] num_pages Number of consecutive 16MB pages to being cached - * - * \return 0 on success - */ -int -umem_cache_map_range(struct umem_store *store, umem_off_t offset, void *start_addr, - uint64_t num_pages); - -/** Take a reference on the pages in the range. Only needed for cases where we need the page to - * stay loaded across a yield, such as the VOS object cache. Pages in the range must be mapped. - * - * \param[in] store The umem store - * \param[in] addr The address of the hold - * \param[in] size The size of the hold - * - * \return 0 on success - */ -int -umem_cache_pin(struct umem_store *store, umem_off_t addr, daos_size_t size); - -/** Release a reference on pages in the range. Pages in the range must be mapped and held. - * - * \param[in] store The umem store - * \param[in] addr The address of the hold - * \param[in] size The size of the hold - * - * \return 0 on success - */ -int -umem_cache_unpin(struct umem_store *store, umem_off_t addr, daos_size_t size); - -/** - * Touched the region identified by @addr and @size, it will mark pages in this region as - * dirty (also set bitmap within each page), and put it on dirty list - * - * This function is called by allocator(probably VOS as well) each time it creates memory - * snapshot (calls tx_snap) or just to mark a region to be flushed. - * - * \param[in] store The umem store - * \param[in] wr_tx The writing transaction - * \param[in] addr The start address - * \param[in] size size of dirty region - * - * \return 0 on success, -DER_CHKPT_BUSY if a checkpoint is in progress on the page. The calling - * transaction must either abort or find another location to modify. - */ -int -umem_cache_touch(struct umem_store *store, uint64_t wr_tx, umem_off_t addr, daos_size_t size); - -/** Callback for checkpoint to wait for the commit of chkpt_tx. - * - * \param[in] arg Argument passed to umem_cache_checkpoint - * \param[in] chkpt_tx The WAL transaction ID we are waiting to commit to WAL - * \param[out] committed_tx The WAL tx ID of the last transaction committed to WAL - */ -typedef void -umem_cache_wait_cb_t(void *arg, uint64_t chkpt_tx, uint64_t *committed_tx); - -/** - * Write all dirty pages before @wal_tx to MD blob. (XXX: not yet implemented) - * - * This function can yield internally, it is called by checkpoint service of upper level stack. 
- * - * \param[in] store The umem store - * \param[in] wait_cb Callback for to wait for wal commit completion - * \param[in] arg argument for wait_cb - * \param[in,out] chkpt_id Input is last committed id, output is checkpointed id - * \param[out] chkpt_stats check point stats - * - * \return 0 on success - */ -int -umem_cache_checkpoint(struct umem_store *store, umem_cache_wait_cb_t wait_cb, void *arg, - uint64_t *chkpt_id, struct umem_cache_chkpt_stats *chkpt_stats); - #endif /** DAOS_PMEM_BUILD */ - #endif /* __DAOS_MEM_H__ */ diff --git a/src/include/daos/task.h b/src/include/daos/task.h index 5cc4672fa30..88d5ef8c4c8 100644 --- a/src/include/daos/task.h +++ b/src/include/daos/task.h @@ -1,5 +1,5 @@ /** - * (C) Copyright 2015-2023 Intel Corporation. + * (C) Copyright 2015-2024 Intel Corporation. * * SPDX-License-Identifier: BSD-2-Clause-Patent */ diff --git a/src/include/daos_errno.h b/src/include/daos_errno.h index 8f2960b5933..d7decc6654d 100644 --- a/src/include/daos_errno.h +++ b/src/include/daos_errno.h @@ -211,9 +211,11 @@ extern "C" { ACTION(DER_CHKPT_BUSY, Page is temporarily read only due to checkpointing) \ ACTION(DER_DIV_BY_ZERO, Division by zero) \ /** Target is overload, retry RPC */ \ - ACTION(DER_OVERLOAD_RETRY, "retry later because of overloaded service") \ + ACTION(DER_OVERLOAD_RETRY, retry later because of overloaded service) \ ACTION(DER_NOT_RESUME, Cannot resume former DAOS check instance) +/* clang-format on */ + /** Defines the gurt error codes */ #define D_FOREACH_ERR_RANGE(ACTION) \ ACTION(GURT, 1000) \ diff --git a/src/include/daos_srv/bio.h b/src/include/daos_srv/bio.h index c32202a1b19..1486692d947 100644 --- a/src/include/daos_srv/bio.h +++ b/src/include/daos_srv/bio.h @@ -1017,15 +1017,18 @@ enum bio_mc_flags { * * \param[in] xs_ctxt Per-xstream NVMe context * \param[in] pool_id Pool UUID + * \param[in] scm_sz VOS file size in bytes * \param[in] meta_sz Meta blob size in bytes * \param[in] wal_sz WAL blob in bytes * \param[in] data_sz Data blob in bytes * \param[in] flags bio_mc_flags + * \param[in] backend_type Backend allocator type * * \return Zero on success, negative value on error. */ -int bio_mc_create(struct bio_xs_context *xs_ctxt, uuid_t pool_id, uint64_t meta_sz, - uint64_t wal_sz, uint64_t data_sz, enum bio_mc_flags flags); +int bio_mc_create(struct bio_xs_context *xs_ctxt, uuid_t pool_id, uint64_t scm_sz, + uint64_t meta_sz, uint64_t wal_sz, uint64_t data_sz, enum bio_mc_flags flags, + uint8_t backend_type); /* * Destroy Meta/Data/WAL blobs @@ -1151,10 +1154,10 @@ int bio_wal_flush_header(struct bio_meta_context *mc); int bio_wal_checkpoint(struct bio_meta_context *mc, uint64_t tx_id, uint64_t *purge_size); /* - * Query meta capacity & meta block size & meta blob header blocks. + * Query the attributes of umem_store */ void bio_meta_get_attr(struct bio_meta_context *mc, uint64_t *capacity, uint32_t *blk_sz, - uint32_t *hdr_blks); + uint32_t *hdr_blks, uint8_t *backend_type, bool *evictable); struct bio_wal_info { uint32_t wi_tot_blks; /* Total blocks */ diff --git a/src/include/daos_srv/container.h b/src/include/daos_srv/container.h index f99f4df14e3..9fc615c2a8b 100644 --- a/src/include/daos_srv/container.h +++ b/src/include/daos_srv/container.h @@ -108,6 +108,9 @@ struct ds_cont_child { uint32_t sc_dtx_committable_count; uint32_t sc_dtx_committable_coll_count; + /* Last timestamp when EC aggregation reports -DER_INPROGRESS. 
*/ + uint64_t sc_ec_agg_busy_ts; + /* The global minimum EC aggregation epoch, which will be upper * limit for VOS aggregation, i.e. EC object VOS aggregation can * not cross this limit. For simplification purpose, all objects diff --git a/src/include/daos_srv/evtree.h b/src/include/daos_srv/evtree.h index 63224259ccc..292c8848c87 100644 --- a/src/include/daos_srv/evtree.h +++ b/src/include/daos_srv/evtree.h @@ -1,5 +1,5 @@ /** - * (C) Copyright 2017-2023 Intel Corporation. + * (C) Copyright 2017-2024 Intel Corporation. * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -70,6 +70,10 @@ struct evt_desc_cbs { struct evt_desc *desc, daos_size_t nob, void *args); void *dc_bio_free_args; + /** + * Argument for allocation. + */ + void *dc_alloc_arg; /** * Availability check, it is for data tracked by DTX undo log. * It is optional, EVTree always treats data extent is available if diff --git a/src/include/daos_srv/smd.h b/src/include/daos_srv/smd.h index 9efc2e790dc..d4de8b7b32b 100644 --- a/src/include/daos_srv/smd.h +++ b/src/include/daos_srv/smd.h @@ -56,6 +56,7 @@ struct smd_dev_info { struct smd_pool_info { d_list_t spi_link; uuid_t spi_id; + uint64_t spi_scm_sz; uint64_t spi_blob_sz[SMD_DEV_TYPE_MAX]; uint16_t spi_flags[SMD_DEV_TYPE_MAX]; uint16_t spi_tgt_cnt[SMD_DEV_TYPE_MAX]; @@ -169,12 +170,13 @@ int smd_dev_replace(uuid_t old_id, uuid_t new_id, unsigned int old_roles); * \param [IN] tgt_id Target ID * \param [IN] blob_id Blob ID * \param [IN] smd_type SMD type - * \param [IN] blob_sz Blob size + * \param [IN] blob_sz Blob size in bytes + * \param [IN] scm_sz VOS file size in bytes * * \return Zero on success, negative value on error */ int smd_pool_add_tgt(uuid_t pool_id, uint32_t tgt_id, uint64_t blob_id, - enum smd_dev_type smd_type, uint64_t blob_sz); + enum smd_dev_type smd_type, uint64_t blob_sz, uint64_t scm_sz); /* Assign a blob to a RDB pool target */ int smd_rdb_add_tgt(uuid_t pool_id, uint32_t tgt_id, uint64_t blob_id, diff --git a/src/include/daos_srv/vos.h b/src/include/daos_srv/vos.h index 3b838c2b4a6..3d94065b64a 100644 --- a/src/include/daos_srv/vos.h +++ b/src/include/daos_srv/vos.h @@ -288,8 +288,9 @@ vos_self_fini(void); * \param path [IN] Path of the memory pool * \param uuid [IN] Pool UUID * \param scm_sz [IN] Size of SCM for the pool - * \param blob_sz[IN] Size of blob for the pool + * \param data_sz[IN] Size of data blob for the pool * \param wal_sz [IN] Size of WAL blob for the pool + * \param meta_sz[IN] Size of Meta blob for the pool * \param flags [IN] Pool open flags (see vos_pool_open_flags) * \param version[IN] Pool version (0 for default version) * \param poh [OUT] Returned pool handle if not NULL @@ -297,8 +298,9 @@ vos_self_fini(void); * \return Zero on success, negative value if error */ int -vos_pool_create_ex(const char *path, uuid_t uuid, daos_size_t scm_sz, daos_size_t blob_sz, - daos_size_t wal_sz, unsigned int flags, uint32_t version, daos_handle_t *poh); +vos_pool_create_ex(const char *path, uuid_t uuid, daos_size_t scm_sz, daos_size_t data_sz, + daos_size_t wal_sz, daos_size_t meta_sz, unsigned int flags, uint32_t version, + daos_handle_t *poh); /** * Create a Versioning Object Storage Pool (VOSP), and open it if \a poh is not @@ -307,7 +309,8 @@ vos_pool_create_ex(const char *path, uuid_t uuid, daos_size_t scm_sz, daos_size_ * \param path [IN] Path of the memory pool * \param uuid [IN] Pool UUID * \param scm_sz [IN] Size of SCM for the pool - * \param blob_sz[IN] Size of blob for the pool + * \param data_sz[IN] Size of data blob for the pool 
+ * \param meta_sz[IN] Size of Meta blob for the pool * \param flags [IN] Pool open flags (see vos_pool_open_flags) * \param version[IN] Pool version (0 for default version) * \param poh [OUT] Returned pool handle if not NULL @@ -315,8 +318,8 @@ vos_pool_create_ex(const char *path, uuid_t uuid, daos_size_t scm_sz, daos_size_ * \return Zero on success, negative value if error */ int -vos_pool_create(const char *path, uuid_t uuid, daos_size_t scm_sz, daos_size_t blob_sz, - unsigned int flags, uint32_t version, daos_handle_t *poh); +vos_pool_create(const char *path, uuid_t uuid, daos_size_t scm_sz, daos_size_t data_sz, + daos_size_t meta_sz, unsigned int flags, uint32_t version, daos_handle_t *poh); /** * Kill a VOS pool before destroy @@ -516,6 +519,16 @@ int vos_aggregate(daos_handle_t coh, daos_epoch_range_t *epr, int (*yield_func)(void *arg), void *yield_arg, uint32_t flags); +/** + * Round up the SCM and meta sizes to match the backend requirement. + * \param[in/out] scm_sz SCM size that needs to be aligned up + * \param[in/out] meta_sz META size that needs to be aligned up + * + * \return 0 on success, error otherwise. + */ +int +vos_pool_roundup_size(size_t *scm_sz, size_t *meta_sz); + /** * Discards changes in all epochs with the epoch range \a epr * @@ -1538,4 +1551,30 @@ vos_aggregate_enter(daos_handle_t coh, daos_epoch_range_t *epr); void vos_aggregate_exit(daos_handle_t coh); +struct vos_pin_handle; + +/** + * Unpin the pinned objects in md-on-ssd phase2 mode + * + * \param[in] coh container open handle. + * \param[in] hdl pin handle. + */ +void +vos_unpin_objects(daos_handle_t coh, struct vos_pin_handle *hdl); + +/** + * Pin a bunch of objects in md-on-ssd phase2 mode + * + * \param[in] coh container open handle. + * \param[in] oids object IDs. + * \param[in] count number of object IDs. + * \param[out] hdl pin handle. + * + * \return 0 on success, error otherwise.
+ */ +int +vos_pin_objects(daos_handle_t coh, daos_unit_oid_t oids[], int count, struct vos_pin_handle **hdl); + #endif /* __VOS_API_H */ diff --git a/src/include/daos_srv/vos_types.h b/src/include/daos_srv/vos_types.h index b57220f9a7c..3a30fd45399 100644 --- a/src/include/daos_srv/vos_types.h +++ b/src/include/daos_srv/vos_types.h @@ -125,6 +125,9 @@ struct vos_pool_space { struct vea_attr vps_vea_attr; /** NVMe block allocator statistics */ struct vea_stat vps_vea_stat; + /** Total & free non-evictable space for md-on-ssd phase2 pool */ + uint64_t vps_ne_total; + uint64_t vps_ne_free; }; #define SCM_TOTAL(vps) ((vps)->vps_space.s_total[DAOS_MEDIA_SCM]) diff --git a/src/include/gurt/common.h b/src/include/gurt/common.h index 060cdc86c18..e7d61ed3842 100644 --- a/src/include/gurt/common.h +++ b/src/include/gurt/common.h @@ -390,6 +390,7 @@ d_realpath(const char *path, char *resolved_path) _dalloc_; }) #define D_SPIN_LOCK(x) __D_PTHREAD(pthread_spin_lock, x) +#define D_MUTEX_TRYLOCK(x) __D_PTHREAD_TRYLOCK(pthread_mutex_trylock, x) #define D_SPIN_UNLOCK(x) __D_PTHREAD(pthread_spin_unlock, x) #define D_MUTEX_UNLOCK(x) __D_PTHREAD(pthread_mutex_unlock, x) #define D_RWLOCK_TRYWRLOCK(x) __D_PTHREAD_TRYLOCK(pthread_rwlock_trywrlock, x) diff --git a/src/mgmt/pool.pb-c.c b/src/mgmt/pool.pb-c.c index 6fed6ca6973..dfcbdc24c99 100644 --- a/src/mgmt/pool.pb-c.c +++ b/src/mgmt/pool.pb-c.c @@ -1504,166 +1504,315 @@ void mgmt__pool_query_target_resp__free_unpacked assert(message->base.descriptor == &mgmt__pool_query_target_resp__descriptor); protobuf_c_message_free_unpacked ((ProtobufCMessage*)message, allocator); } -static const ProtobufCFieldDescriptor mgmt__pool_create_req__field_descriptors[13] = { - { - "uuid", 1, PROTOBUF_C_LABEL_NONE, PROTOBUF_C_TYPE_STRING, 0, /* quantifier_offset */ - offsetof(Mgmt__PoolCreateReq, uuid), NULL, &protobuf_c_empty_string, 0, /* flags */ - 0, NULL, NULL /* reserved1,reserved2, etc */ - }, - { - "sys", 2, PROTOBUF_C_LABEL_NONE, PROTOBUF_C_TYPE_STRING, 0, /* quantifier_offset */ - offsetof(Mgmt__PoolCreateReq, sys), NULL, &protobuf_c_empty_string, 0, /* flags */ - 0, NULL, NULL /* reserved1,reserved2, etc */ - }, - { - "user", 3, PROTOBUF_C_LABEL_NONE, PROTOBUF_C_TYPE_STRING, 0, /* quantifier_offset */ - offsetof(Mgmt__PoolCreateReq, user), NULL, &protobuf_c_empty_string, 0, /* flags */ - 0, NULL, NULL /* reserved1,reserved2, etc */ - }, - { - "user_group", 4, PROTOBUF_C_LABEL_NONE, PROTOBUF_C_TYPE_STRING, 0, /* quantifier_offset */ - offsetof(Mgmt__PoolCreateReq, user_group), NULL, &protobuf_c_empty_string, 0, /* flags */ - 0, NULL, NULL /* reserved1,reserved2, etc */ - }, - { - "acl", 5, PROTOBUF_C_LABEL_REPEATED, PROTOBUF_C_TYPE_STRING, - offsetof(Mgmt__PoolCreateReq, n_acl), offsetof(Mgmt__PoolCreateReq, acl), NULL, - &protobuf_c_empty_string, 0, /* flags */ - 0, NULL, NULL /* reserved1,reserved2, etc */ - }, - { - "properties", 6, PROTOBUF_C_LABEL_REPEATED, PROTOBUF_C_TYPE_MESSAGE, - offsetof(Mgmt__PoolCreateReq, n_properties), offsetof(Mgmt__PoolCreateReq, properties), - &mgmt__pool_property__descriptor, NULL, 0, /* flags */ - 0, NULL, NULL /* reserved1,reserved2, etc */ - }, - { - "fault_domains", 7, PROTOBUF_C_LABEL_REPEATED, PROTOBUF_C_TYPE_UINT32, - offsetof(Mgmt__PoolCreateReq, n_fault_domains), - offsetof(Mgmt__PoolCreateReq, fault_domains), NULL, NULL, 0, /* flags */ - 0, NULL, NULL /* reserved1,reserved2, etc */ - }, - { - "num_svc_reps", 8, PROTOBUF_C_LABEL_NONE, PROTOBUF_C_TYPE_UINT32, 0, /* quantifier_offset */ - offsetof(Mgmt__PoolCreateReq, 
num_svc_reps), NULL, NULL, 0, /* flags */ - 0, NULL, NULL /* reserved1,reserved2, etc */ - }, - { - "total_bytes", 9, PROTOBUF_C_LABEL_NONE, PROTOBUF_C_TYPE_UINT64, 0, /* quantifier_offset */ - offsetof(Mgmt__PoolCreateReq, total_bytes), NULL, NULL, 0, /* flags */ - 0, NULL, NULL /* reserved1,reserved2, etc */ - }, - { - "tier_ratio", 10, PROTOBUF_C_LABEL_REPEATED, PROTOBUF_C_TYPE_DOUBLE, - offsetof(Mgmt__PoolCreateReq, n_tier_ratio), offsetof(Mgmt__PoolCreateReq, tier_ratio), - NULL, NULL, 0, /* flags */ - 0, NULL, NULL /* reserved1,reserved2, etc */ - }, - { - "num_ranks", 11, PROTOBUF_C_LABEL_NONE, PROTOBUF_C_TYPE_UINT32, 0, /* quantifier_offset */ - offsetof(Mgmt__PoolCreateReq, num_ranks), NULL, NULL, 0, /* flags */ - 0, NULL, NULL /* reserved1,reserved2, etc */ - }, - { - "ranks", 12, PROTOBUF_C_LABEL_REPEATED, PROTOBUF_C_TYPE_UINT32, - offsetof(Mgmt__PoolCreateReq, n_ranks), offsetof(Mgmt__PoolCreateReq, ranks), NULL, NULL, - 0, /* flags */ - 0, NULL, NULL /* reserved1,reserved2, etc */ - }, - { - "tier_bytes", 13, PROTOBUF_C_LABEL_REPEATED, PROTOBUF_C_TYPE_UINT64, - offsetof(Mgmt__PoolCreateReq, n_tier_bytes), offsetof(Mgmt__PoolCreateReq, tier_bytes), - NULL, NULL, 0, /* flags */ - 0, NULL, NULL /* reserved1,reserved2, etc */ - }, +static const ProtobufCFieldDescriptor mgmt__pool_create_req__field_descriptors[14] = +{ + { + "uuid", + 1, + PROTOBUF_C_LABEL_NONE, + PROTOBUF_C_TYPE_STRING, + 0, /* quantifier_offset */ + offsetof(Mgmt__PoolCreateReq, uuid), + NULL, + &protobuf_c_empty_string, + 0, /* flags */ + 0,NULL,NULL /* reserved1,reserved2, etc */ + }, + { + "sys", + 2, + PROTOBUF_C_LABEL_NONE, + PROTOBUF_C_TYPE_STRING, + 0, /* quantifier_offset */ + offsetof(Mgmt__PoolCreateReq, sys), + NULL, + &protobuf_c_empty_string, + 0, /* flags */ + 0,NULL,NULL /* reserved1,reserved2, etc */ + }, + { + "user", + 3, + PROTOBUF_C_LABEL_NONE, + PROTOBUF_C_TYPE_STRING, + 0, /* quantifier_offset */ + offsetof(Mgmt__PoolCreateReq, user), + NULL, + &protobuf_c_empty_string, + 0, /* flags */ + 0,NULL,NULL /* reserved1,reserved2, etc */ + }, + { + "user_group", + 4, + PROTOBUF_C_LABEL_NONE, + PROTOBUF_C_TYPE_STRING, + 0, /* quantifier_offset */ + offsetof(Mgmt__PoolCreateReq, user_group), + NULL, + &protobuf_c_empty_string, + 0, /* flags */ + 0,NULL,NULL /* reserved1,reserved2, etc */ + }, + { + "acl", + 5, + PROTOBUF_C_LABEL_REPEATED, + PROTOBUF_C_TYPE_STRING, + offsetof(Mgmt__PoolCreateReq, n_acl), + offsetof(Mgmt__PoolCreateReq, acl), + NULL, + &protobuf_c_empty_string, + 0, /* flags */ + 0,NULL,NULL /* reserved1,reserved2, etc */ + }, + { + "properties", + 6, + PROTOBUF_C_LABEL_REPEATED, + PROTOBUF_C_TYPE_MESSAGE, + offsetof(Mgmt__PoolCreateReq, n_properties), + offsetof(Mgmt__PoolCreateReq, properties), + &mgmt__pool_property__descriptor, + NULL, + 0, /* flags */ + 0,NULL,NULL /* reserved1,reserved2, etc */ + }, + { + "fault_domains", + 7, + PROTOBUF_C_LABEL_REPEATED, + PROTOBUF_C_TYPE_UINT32, + offsetof(Mgmt__PoolCreateReq, n_fault_domains), + offsetof(Mgmt__PoolCreateReq, fault_domains), + NULL, + NULL, + 0, /* flags */ + 0,NULL,NULL /* reserved1,reserved2, etc */ + }, + { + "num_svc_reps", + 8, + PROTOBUF_C_LABEL_NONE, + PROTOBUF_C_TYPE_UINT32, + 0, /* quantifier_offset */ + offsetof(Mgmt__PoolCreateReq, num_svc_reps), + NULL, + NULL, + 0, /* flags */ + 0,NULL,NULL /* reserved1,reserved2, etc */ + }, + { + "total_bytes", + 9, + PROTOBUF_C_LABEL_NONE, + PROTOBUF_C_TYPE_UINT64, + 0, /* quantifier_offset */ + offsetof(Mgmt__PoolCreateReq, total_bytes), + NULL, + NULL, + 0, /* flags */ + 
0,NULL,NULL /* reserved1,reserved2, etc */ + }, + { + "tier_ratio", + 10, + PROTOBUF_C_LABEL_REPEATED, + PROTOBUF_C_TYPE_DOUBLE, + offsetof(Mgmt__PoolCreateReq, n_tier_ratio), + offsetof(Mgmt__PoolCreateReq, tier_ratio), + NULL, + NULL, + 0, /* flags */ + 0,NULL,NULL /* reserved1,reserved2, etc */ + }, + { + "num_ranks", + 11, + PROTOBUF_C_LABEL_NONE, + PROTOBUF_C_TYPE_UINT32, + 0, /* quantifier_offset */ + offsetof(Mgmt__PoolCreateReq, num_ranks), + NULL, + NULL, + 0, /* flags */ + 0,NULL,NULL /* reserved1,reserved2, etc */ + }, + { + "ranks", + 12, + PROTOBUF_C_LABEL_REPEATED, + PROTOBUF_C_TYPE_UINT32, + offsetof(Mgmt__PoolCreateReq, n_ranks), + offsetof(Mgmt__PoolCreateReq, ranks), + NULL, + NULL, + 0, /* flags */ + 0,NULL,NULL /* reserved1,reserved2, etc */ + }, + { + "tier_bytes", + 13, + PROTOBUF_C_LABEL_REPEATED, + PROTOBUF_C_TYPE_UINT64, + offsetof(Mgmt__PoolCreateReq, n_tier_bytes), + offsetof(Mgmt__PoolCreateReq, tier_bytes), + NULL, + NULL, + 0, /* flags */ + 0,NULL,NULL /* reserved1,reserved2, etc */ + }, + { + "mem_ratio", + 14, + PROTOBUF_C_LABEL_NONE, + PROTOBUF_C_TYPE_FLOAT, + 0, /* quantifier_offset */ + offsetof(Mgmt__PoolCreateReq, mem_ratio), + NULL, + NULL, + 0, /* flags */ + 0,NULL,NULL /* reserved1,reserved2, etc */ + }, }; static const unsigned mgmt__pool_create_req__field_indices_by_name[] = { - 4, /* field[4] = acl */ - 6, /* field[6] = fault_domains */ - 10, /* field[10] = num_ranks */ - 7, /* field[7] = num_svc_reps */ - 5, /* field[5] = properties */ - 11, /* field[11] = ranks */ - 1, /* field[1] = sys */ - 12, /* field[12] = tier_bytes */ - 9, /* field[9] = tier_ratio */ - 8, /* field[8] = total_bytes */ - 2, /* field[2] = user */ - 3, /* field[3] = user_group */ - 0, /* field[0] = uuid */ -}; -static const ProtobufCIntRange mgmt__pool_create_req__number_ranges[1 + 1] = {{1, 0}, {0, 13}}; -const ProtobufCMessageDescriptor mgmt__pool_create_req__descriptor = { - PROTOBUF_C__MESSAGE_DESCRIPTOR_MAGIC, - "mgmt.PoolCreateReq", - "PoolCreateReq", - "Mgmt__PoolCreateReq", - "mgmt", - sizeof(Mgmt__PoolCreateReq), - 13, - mgmt__pool_create_req__field_descriptors, - mgmt__pool_create_req__field_indices_by_name, - 1, - mgmt__pool_create_req__number_ranges, - (ProtobufCMessageInit)mgmt__pool_create_req__init, - NULL, - NULL, - NULL /* reserved[123] */ -}; -static const ProtobufCFieldDescriptor mgmt__pool_create_resp__field_descriptors[5] = { - { - "status", 1, PROTOBUF_C_LABEL_NONE, PROTOBUF_C_TYPE_INT32, 0, /* quantifier_offset */ - offsetof(Mgmt__PoolCreateResp, status), NULL, NULL, 0, /* flags */ - 0, NULL, NULL /* reserved1,reserved2, etc */ - }, - { - "svc_ldr", 2, PROTOBUF_C_LABEL_NONE, PROTOBUF_C_TYPE_UINT32, 0, /* quantifier_offset */ - offsetof(Mgmt__PoolCreateResp, svc_ldr), NULL, NULL, 0, /* flags */ - 0, NULL, NULL /* reserved1,reserved2, etc */ - }, - { - "svc_reps", 3, PROTOBUF_C_LABEL_REPEATED, PROTOBUF_C_TYPE_UINT32, - offsetof(Mgmt__PoolCreateResp, n_svc_reps), offsetof(Mgmt__PoolCreateResp, svc_reps), NULL, - NULL, 0, /* flags */ - 0, NULL, NULL /* reserved1,reserved2, etc */ - }, - { - "tgt_ranks", 4, PROTOBUF_C_LABEL_REPEATED, PROTOBUF_C_TYPE_UINT32, - offsetof(Mgmt__PoolCreateResp, n_tgt_ranks), offsetof(Mgmt__PoolCreateResp, tgt_ranks), - NULL, NULL, 0, /* flags */ - 0, NULL, NULL /* reserved1,reserved2, etc */ - }, - { - "tier_bytes", 5, PROTOBUF_C_LABEL_REPEATED, PROTOBUF_C_TYPE_UINT64, - offsetof(Mgmt__PoolCreateResp, n_tier_bytes), offsetof(Mgmt__PoolCreateResp, tier_bytes), - NULL, NULL, 0, /* flags */ - 0, NULL, NULL /* reserved1,reserved2, etc 
*/ - }, + 4, /* field[4] = acl */ + 6, /* field[6] = fault_domains */ + 13, /* field[13] = mem_ratio */ + 10, /* field[10] = num_ranks */ + 7, /* field[7] = num_svc_reps */ + 5, /* field[5] = properties */ + 11, /* field[11] = ranks */ + 1, /* field[1] = sys */ + 12, /* field[12] = tier_bytes */ + 9, /* field[9] = tier_ratio */ + 8, /* field[8] = total_bytes */ + 2, /* field[2] = user */ + 3, /* field[3] = user_group */ + 0, /* field[0] = uuid */ }; -static const unsigned mgmt__pool_create_resp__field_indices_by_name[] = { - 0, /* field[0] = status */ - 1, /* field[1] = svc_ldr */ - 2, /* field[2] = svc_reps */ - 3, /* field[3] = tgt_ranks */ - 4, /* field[4] = tier_bytes */ -}; -static const ProtobufCIntRange mgmt__pool_create_resp__number_ranges[1 + 1] = {{1, 0}, {0, 5}}; -const ProtobufCMessageDescriptor mgmt__pool_create_resp__descriptor = { - PROTOBUF_C__MESSAGE_DESCRIPTOR_MAGIC, - "mgmt.PoolCreateResp", - "PoolCreateResp", - "Mgmt__PoolCreateResp", - "mgmt", - sizeof(Mgmt__PoolCreateResp), - 5, - mgmt__pool_create_resp__field_descriptors, - mgmt__pool_create_resp__field_indices_by_name, +static const ProtobufCIntRange mgmt__pool_create_req__number_ranges[1 + 1] = +{ + { 1, 0 }, + { 0, 14 } +}; +const ProtobufCMessageDescriptor mgmt__pool_create_req__descriptor = +{ + PROTOBUF_C__MESSAGE_DESCRIPTOR_MAGIC, + "mgmt.PoolCreateReq", + "PoolCreateReq", + "Mgmt__PoolCreateReq", + "mgmt", + sizeof(Mgmt__PoolCreateReq), + 14, + mgmt__pool_create_req__field_descriptors, + mgmt__pool_create_req__field_indices_by_name, + 1, mgmt__pool_create_req__number_ranges, + (ProtobufCMessageInit) mgmt__pool_create_req__init, + NULL,NULL,NULL /* reserved[123] */ +}; +static const ProtobufCFieldDescriptor mgmt__pool_create_resp__field_descriptors[6] = +{ + { + "status", 1, - mgmt__pool_create_resp__number_ranges, - (ProtobufCMessageInit)mgmt__pool_create_resp__init, + PROTOBUF_C_LABEL_NONE, + PROTOBUF_C_TYPE_INT32, + 0, /* quantifier_offset */ + offsetof(Mgmt__PoolCreateResp, status), + NULL, + NULL, + 0, /* flags */ + 0,NULL,NULL /* reserved1,reserved2, etc */ + }, + { + "svc_ldr", + 2, + PROTOBUF_C_LABEL_NONE, + PROTOBUF_C_TYPE_UINT32, + 0, /* quantifier_offset */ + offsetof(Mgmt__PoolCreateResp, svc_ldr), + NULL, + NULL, + 0, /* flags */ + 0,NULL,NULL /* reserved1,reserved2, etc */ + }, + { + "svc_reps", + 3, + PROTOBUF_C_LABEL_REPEATED, + PROTOBUF_C_TYPE_UINT32, + offsetof(Mgmt__PoolCreateResp, n_svc_reps), + offsetof(Mgmt__PoolCreateResp, svc_reps), + NULL, + NULL, + 0, /* flags */ + 0,NULL,NULL /* reserved1,reserved2, etc */ + }, + { + "tgt_ranks", + 4, + PROTOBUF_C_LABEL_REPEATED, + PROTOBUF_C_TYPE_UINT32, + offsetof(Mgmt__PoolCreateResp, n_tgt_ranks), + offsetof(Mgmt__PoolCreateResp, tgt_ranks), + NULL, + NULL, + 0, /* flags */ + 0,NULL,NULL /* reserved1,reserved2, etc */ + }, + { + "tier_bytes", + 5, + PROTOBUF_C_LABEL_REPEATED, + PROTOBUF_C_TYPE_UINT64, + offsetof(Mgmt__PoolCreateResp, n_tier_bytes), + offsetof(Mgmt__PoolCreateResp, tier_bytes), + NULL, + NULL, + 0, /* flags */ + 0,NULL,NULL /* reserved1,reserved2, etc */ + }, + { + "mem_file_bytes", + 6, + PROTOBUF_C_LABEL_NONE, + PROTOBUF_C_TYPE_UINT64, + 0, /* quantifier_offset */ + offsetof(Mgmt__PoolCreateResp, mem_file_bytes), NULL, NULL, - NULL /* reserved[123] */ + 0, /* flags */ + 0,NULL,NULL /* reserved1,reserved2, etc */ + }, +}; +static const unsigned mgmt__pool_create_resp__field_indices_by_name[] = { + 5, /* field[5] = mem_file_bytes */ + 0, /* field[0] = status */ + 1, /* field[1] = svc_ldr */ + 2, /* field[2] = svc_reps */ + 3, /* 
field[3] = tgt_ranks */ + 4, /* field[4] = tier_bytes */ +}; +static const ProtobufCIntRange mgmt__pool_create_resp__number_ranges[1 + 1] = +{ + { 1, 0 }, + { 0, 6 } +}; +const ProtobufCMessageDescriptor mgmt__pool_create_resp__descriptor = +{ + PROTOBUF_C__MESSAGE_DESCRIPTOR_MAGIC, + "mgmt.PoolCreateResp", + "PoolCreateResp", + "Mgmt__PoolCreateResp", + "mgmt", + sizeof(Mgmt__PoolCreateResp), + 6, + mgmt__pool_create_resp__field_descriptors, + mgmt__pool_create_resp__field_indices_by_name, + 1, mgmt__pool_create_resp__number_ranges, + (ProtobufCMessageInit) mgmt__pool_create_resp__init, + NULL,NULL,NULL /* reserved[123] */ }; static const ProtobufCFieldDescriptor mgmt__pool_destroy_req__field_descriptors[5] = { @@ -1960,41 +2109,75 @@ const ProtobufCMessageDescriptor mgmt__pool_evict_resp__descriptor = (ProtobufCMessageInit) mgmt__pool_evict_resp__init, NULL,NULL,NULL /* reserved[123] */ }; -static const ProtobufCFieldDescriptor mgmt__pool_exclude_req__field_descriptors[5] = { - { - "sys", 1, PROTOBUF_C_LABEL_NONE, PROTOBUF_C_TYPE_STRING, 0, /* quantifier_offset */ - offsetof(Mgmt__PoolExcludeReq, sys), NULL, &protobuf_c_empty_string, 0, /* flags */ - 0, NULL, NULL /* reserved1,reserved2, etc */ - }, - { - "id", 2, PROTOBUF_C_LABEL_NONE, PROTOBUF_C_TYPE_STRING, 0, /* quantifier_offset */ - offsetof(Mgmt__PoolExcludeReq, id), NULL, &protobuf_c_empty_string, 0, /* flags */ - 0, NULL, NULL /* reserved1,reserved2, etc */ - }, - { - "rank", 3, PROTOBUF_C_LABEL_NONE, PROTOBUF_C_TYPE_UINT32, 0, /* quantifier_offset */ - offsetof(Mgmt__PoolExcludeReq, rank), NULL, NULL, 0, /* flags */ - 0, NULL, NULL /* reserved1,reserved2, etc */ - }, - { - "target_idx", 4, PROTOBUF_C_LABEL_REPEATED, PROTOBUF_C_TYPE_UINT32, - offsetof(Mgmt__PoolExcludeReq, n_target_idx), offsetof(Mgmt__PoolExcludeReq, target_idx), - NULL, NULL, 0, /* flags */ - 0, NULL, NULL /* reserved1,reserved2, etc */ - }, - { - "svc_ranks", 5, PROTOBUF_C_LABEL_REPEATED, PROTOBUF_C_TYPE_UINT32, - offsetof(Mgmt__PoolExcludeReq, n_svc_ranks), offsetof(Mgmt__PoolExcludeReq, svc_ranks), - NULL, NULL, 0, /* flags */ - 0, NULL, NULL /* reserved1,reserved2, etc */ - }, +static const ProtobufCFieldDescriptor mgmt__pool_exclude_req__field_descriptors[5] = +{ + { + "sys", + 1, + PROTOBUF_C_LABEL_NONE, + PROTOBUF_C_TYPE_STRING, + 0, /* quantifier_offset */ + offsetof(Mgmt__PoolExcludeReq, sys), + NULL, + &protobuf_c_empty_string, + 0, /* flags */ + 0,NULL,NULL /* reserved1,reserved2, etc */ + }, + { + "id", + 2, + PROTOBUF_C_LABEL_NONE, + PROTOBUF_C_TYPE_STRING, + 0, /* quantifier_offset */ + offsetof(Mgmt__PoolExcludeReq, id), + NULL, + &protobuf_c_empty_string, + 0, /* flags */ + 0,NULL,NULL /* reserved1,reserved2, etc */ + }, + { + "rank", + 3, + PROTOBUF_C_LABEL_NONE, + PROTOBUF_C_TYPE_UINT32, + 0, /* quantifier_offset */ + offsetof(Mgmt__PoolExcludeReq, rank), + NULL, + NULL, + 0, /* flags */ + 0,NULL,NULL /* reserved1,reserved2, etc */ + }, + { + "target_idx", + 4, + PROTOBUF_C_LABEL_REPEATED, + PROTOBUF_C_TYPE_UINT32, + offsetof(Mgmt__PoolExcludeReq, n_target_idx), + offsetof(Mgmt__PoolExcludeReq, target_idx), + NULL, + NULL, + 0, /* flags */ + 0,NULL,NULL /* reserved1,reserved2, etc */ + }, + { + "svc_ranks", + 5, + PROTOBUF_C_LABEL_REPEATED, + PROTOBUF_C_TYPE_UINT32, + offsetof(Mgmt__PoolExcludeReq, n_svc_ranks), + offsetof(Mgmt__PoolExcludeReq, svc_ranks), + NULL, + NULL, + 0, /* flags */ + 0,NULL,NULL /* reserved1,reserved2, etc */ + }, }; static const unsigned mgmt__pool_exclude_req__field_indices_by_name[] = { - 1, /* field[1] = id */ - 2, 
/* field[2] = rank */ - 4, /* field[4] = svc_ranks */ - 0, /* field[0] = sys */ - 3, /* field[3] = target_idx */ + 1, /* field[1] = id */ + 2, /* field[2] = rank */ + 4, /* field[4] = svc_ranks */ + 0, /* field[0] = sys */ + 3, /* field[3] = target_idx */ }; static const ProtobufCIntRange mgmt__pool_exclude_req__number_ranges[1 + 1] = { @@ -2054,41 +2237,75 @@ const ProtobufCMessageDescriptor mgmt__pool_exclude_resp__descriptor = (ProtobufCMessageInit) mgmt__pool_exclude_resp__init, NULL,NULL,NULL /* reserved[123] */ }; -static const ProtobufCFieldDescriptor mgmt__pool_drain_req__field_descriptors[5] = { - { - "sys", 1, PROTOBUF_C_LABEL_NONE, PROTOBUF_C_TYPE_STRING, 0, /* quantifier_offset */ - offsetof(Mgmt__PoolDrainReq, sys), NULL, &protobuf_c_empty_string, 0, /* flags */ - 0, NULL, NULL /* reserved1,reserved2, etc */ - }, - { - "id", 2, PROTOBUF_C_LABEL_NONE, PROTOBUF_C_TYPE_STRING, 0, /* quantifier_offset */ - offsetof(Mgmt__PoolDrainReq, id), NULL, &protobuf_c_empty_string, 0, /* flags */ - 0, NULL, NULL /* reserved1,reserved2, etc */ - }, - { - "rank", 3, PROTOBUF_C_LABEL_NONE, PROTOBUF_C_TYPE_UINT32, 0, /* quantifier_offset */ - offsetof(Mgmt__PoolDrainReq, rank), NULL, NULL, 0, /* flags */ - 0, NULL, NULL /* reserved1,reserved2, etc */ - }, - { - "target_idx", 4, PROTOBUF_C_LABEL_REPEATED, PROTOBUF_C_TYPE_UINT32, - offsetof(Mgmt__PoolDrainReq, n_target_idx), offsetof(Mgmt__PoolDrainReq, target_idx), NULL, - NULL, 0, /* flags */ - 0, NULL, NULL /* reserved1,reserved2, etc */ - }, - { - "svc_ranks", 5, PROTOBUF_C_LABEL_REPEATED, PROTOBUF_C_TYPE_UINT32, - offsetof(Mgmt__PoolDrainReq, n_svc_ranks), offsetof(Mgmt__PoolDrainReq, svc_ranks), NULL, - NULL, 0, /* flags */ - 0, NULL, NULL /* reserved1,reserved2, etc */ - }, +static const ProtobufCFieldDescriptor mgmt__pool_drain_req__field_descriptors[5] = +{ + { + "sys", + 1, + PROTOBUF_C_LABEL_NONE, + PROTOBUF_C_TYPE_STRING, + 0, /* quantifier_offset */ + offsetof(Mgmt__PoolDrainReq, sys), + NULL, + &protobuf_c_empty_string, + 0, /* flags */ + 0,NULL,NULL /* reserved1,reserved2, etc */ + }, + { + "id", + 2, + PROTOBUF_C_LABEL_NONE, + PROTOBUF_C_TYPE_STRING, + 0, /* quantifier_offset */ + offsetof(Mgmt__PoolDrainReq, id), + NULL, + &protobuf_c_empty_string, + 0, /* flags */ + 0,NULL,NULL /* reserved1,reserved2, etc */ + }, + { + "rank", + 3, + PROTOBUF_C_LABEL_NONE, + PROTOBUF_C_TYPE_UINT32, + 0, /* quantifier_offset */ + offsetof(Mgmt__PoolDrainReq, rank), + NULL, + NULL, + 0, /* flags */ + 0,NULL,NULL /* reserved1,reserved2, etc */ + }, + { + "target_idx", + 4, + PROTOBUF_C_LABEL_REPEATED, + PROTOBUF_C_TYPE_UINT32, + offsetof(Mgmt__PoolDrainReq, n_target_idx), + offsetof(Mgmt__PoolDrainReq, target_idx), + NULL, + NULL, + 0, /* flags */ + 0,NULL,NULL /* reserved1,reserved2, etc */ + }, + { + "svc_ranks", + 5, + PROTOBUF_C_LABEL_REPEATED, + PROTOBUF_C_TYPE_UINT32, + offsetof(Mgmt__PoolDrainReq, n_svc_ranks), + offsetof(Mgmt__PoolDrainReq, svc_ranks), + NULL, + NULL, + 0, /* flags */ + 0,NULL,NULL /* reserved1,reserved2, etc */ + }, }; static const unsigned mgmt__pool_drain_req__field_indices_by_name[] = { - 1, /* field[1] = id */ - 2, /* field[2] = rank */ - 4, /* field[4] = svc_ranks */ - 0, /* field[0] = sys */ - 3, /* field[3] = target_idx */ + 1, /* field[1] = id */ + 2, /* field[2] = rank */ + 4, /* field[4] = svc_ranks */ + 0, /* field[0] = sys */ + 3, /* field[3] = target_idx */ }; static const ProtobufCIntRange mgmt__pool_drain_req__number_ranges[1 + 1] = { @@ -2148,49 +2365,88 @@ const ProtobufCMessageDescriptor 
mgmt__pool_drain_resp__descriptor = (ProtobufCMessageInit) mgmt__pool_drain_resp__init, NULL,NULL,NULL /* reserved[123] */ }; -static const ProtobufCFieldDescriptor mgmt__pool_extend_req__field_descriptors[6] = { - { - "sys", 1, PROTOBUF_C_LABEL_NONE, PROTOBUF_C_TYPE_STRING, 0, /* quantifier_offset */ - offsetof(Mgmt__PoolExtendReq, sys), NULL, &protobuf_c_empty_string, 0, /* flags */ - 0, NULL, NULL /* reserved1,reserved2, etc */ - }, - { - "id", 2, PROTOBUF_C_LABEL_NONE, PROTOBUF_C_TYPE_STRING, 0, /* quantifier_offset */ - offsetof(Mgmt__PoolExtendReq, id), NULL, &protobuf_c_empty_string, 0, /* flags */ - 0, NULL, NULL /* reserved1,reserved2, etc */ - }, - { - "ranks", 3, PROTOBUF_C_LABEL_REPEATED, PROTOBUF_C_TYPE_UINT32, - offsetof(Mgmt__PoolExtendReq, n_ranks), offsetof(Mgmt__PoolExtendReq, ranks), NULL, NULL, - 0, /* flags */ - 0, NULL, NULL /* reserved1,reserved2, etc */ - }, - { - "svc_ranks", 4, PROTOBUF_C_LABEL_REPEATED, PROTOBUF_C_TYPE_UINT32, - offsetof(Mgmt__PoolExtendReq, n_svc_ranks), offsetof(Mgmt__PoolExtendReq, svc_ranks), NULL, - NULL, 0, /* flags */ - 0, NULL, NULL /* reserved1,reserved2, etc */ - }, - { - "tier_bytes", 5, PROTOBUF_C_LABEL_REPEATED, PROTOBUF_C_TYPE_UINT64, - offsetof(Mgmt__PoolExtendReq, n_tier_bytes), offsetof(Mgmt__PoolExtendReq, tier_bytes), - NULL, NULL, 0, /* flags */ - 0, NULL, NULL /* reserved1,reserved2, etc */ - }, - { - "fault_domains", 6, PROTOBUF_C_LABEL_REPEATED, PROTOBUF_C_TYPE_UINT32, - offsetof(Mgmt__PoolExtendReq, n_fault_domains), - offsetof(Mgmt__PoolExtendReq, fault_domains), NULL, NULL, 0, /* flags */ - 0, NULL, NULL /* reserved1,reserved2, etc */ - }, +static const ProtobufCFieldDescriptor mgmt__pool_extend_req__field_descriptors[6] = +{ + { + "sys", + 1, + PROTOBUF_C_LABEL_NONE, + PROTOBUF_C_TYPE_STRING, + 0, /* quantifier_offset */ + offsetof(Mgmt__PoolExtendReq, sys), + NULL, + &protobuf_c_empty_string, + 0, /* flags */ + 0,NULL,NULL /* reserved1,reserved2, etc */ + }, + { + "id", + 2, + PROTOBUF_C_LABEL_NONE, + PROTOBUF_C_TYPE_STRING, + 0, /* quantifier_offset */ + offsetof(Mgmt__PoolExtendReq, id), + NULL, + &protobuf_c_empty_string, + 0, /* flags */ + 0,NULL,NULL /* reserved1,reserved2, etc */ + }, + { + "ranks", + 3, + PROTOBUF_C_LABEL_REPEATED, + PROTOBUF_C_TYPE_UINT32, + offsetof(Mgmt__PoolExtendReq, n_ranks), + offsetof(Mgmt__PoolExtendReq, ranks), + NULL, + NULL, + 0, /* flags */ + 0,NULL,NULL /* reserved1,reserved2, etc */ + }, + { + "svc_ranks", + 4, + PROTOBUF_C_LABEL_REPEATED, + PROTOBUF_C_TYPE_UINT32, + offsetof(Mgmt__PoolExtendReq, n_svc_ranks), + offsetof(Mgmt__PoolExtendReq, svc_ranks), + NULL, + NULL, + 0, /* flags */ + 0,NULL,NULL /* reserved1,reserved2, etc */ + }, + { + "tier_bytes", + 5, + PROTOBUF_C_LABEL_REPEATED, + PROTOBUF_C_TYPE_UINT64, + offsetof(Mgmt__PoolExtendReq, n_tier_bytes), + offsetof(Mgmt__PoolExtendReq, tier_bytes), + NULL, + NULL, + 0, /* flags */ + 0,NULL,NULL /* reserved1,reserved2, etc */ + }, + { + "fault_domains", + 6, + PROTOBUF_C_LABEL_REPEATED, + PROTOBUF_C_TYPE_UINT32, + offsetof(Mgmt__PoolExtendReq, n_fault_domains), + offsetof(Mgmt__PoolExtendReq, fault_domains), + NULL, + NULL, + 0, /* flags */ + 0,NULL,NULL /* reserved1,reserved2, etc */ + }, }; static const unsigned mgmt__pool_extend_req__field_indices_by_name[] = { - 5, /* field[5] = fault_domains */ - 1, /* field[1] = id */ - 2, /* field[2] = ranks */ - 3, /* field[3] = svc_ranks */ - 0, /* field[0] = sys */ - 4, /* field[4] = tier_bytes */ + 5, /* field[5] = fault_domains */ + 1, /* field[1] = id */ + 2, /* field[2] = ranks 
*/ + 3, /* field[3] = svc_ranks */ + 0, /* field[0] = sys */ + 4, /* field[4] = tier_bytes */ }; static const ProtobufCIntRange mgmt__pool_extend_req__number_ranges[1 + 1] = { @@ -2212,7 +2468,7 @@ const ProtobufCMessageDescriptor mgmt__pool_extend_req__descriptor = (ProtobufCMessageInit) mgmt__pool_extend_req__init, NULL,NULL,NULL /* reserved[123] */ }; -static const ProtobufCFieldDescriptor mgmt__pool_extend_resp__field_descriptors[2] = +static const ProtobufCFieldDescriptor mgmt__pool_extend_resp__field_descriptors[3] = { { "status", @@ -2238,15 +2494,28 @@ static const ProtobufCFieldDescriptor mgmt__pool_extend_resp__field_descriptors[ 0, /* flags */ 0,NULL,NULL /* reserved1,reserved2, etc */ }, + { + "meta_blob_bytes", + 3, + PROTOBUF_C_LABEL_NONE, + PROTOBUF_C_TYPE_UINT32, + 0, /* quantifier_offset */ + offsetof(Mgmt__PoolExtendResp, meta_blob_bytes), + NULL, + NULL, + 0, /* flags */ + 0,NULL,NULL /* reserved1,reserved2, etc */ + }, }; static const unsigned mgmt__pool_extend_resp__field_indices_by_name[] = { + 2, /* field[2] = meta_blob_bytes */ 0, /* field[0] = status */ 1, /* field[1] = tier_bytes */ }; static const ProtobufCIntRange mgmt__pool_extend_resp__number_ranges[1 + 1] = { { 1, 0 }, - { 0, 2 } + { 0, 3 } }; const ProtobufCMessageDescriptor mgmt__pool_extend_resp__descriptor = { @@ -2256,55 +2525,95 @@ const ProtobufCMessageDescriptor mgmt__pool_extend_resp__descriptor = "Mgmt__PoolExtendResp", "mgmt", sizeof(Mgmt__PoolExtendResp), - 2, + 3, mgmt__pool_extend_resp__field_descriptors, mgmt__pool_extend_resp__field_indices_by_name, 1, mgmt__pool_extend_resp__number_ranges, (ProtobufCMessageInit) mgmt__pool_extend_resp__init, NULL,NULL,NULL /* reserved[123] */ }; -static const ProtobufCFieldDescriptor mgmt__pool_reintegrate_req__field_descriptors[6] = { - { - "sys", 1, PROTOBUF_C_LABEL_NONE, PROTOBUF_C_TYPE_STRING, 0, /* quantifier_offset */ - offsetof(Mgmt__PoolReintegrateReq, sys), NULL, &protobuf_c_empty_string, 0, /* flags */ - 0, NULL, NULL /* reserved1,reserved2, etc */ - }, - { - "id", 2, PROTOBUF_C_LABEL_NONE, PROTOBUF_C_TYPE_STRING, 0, /* quantifier_offset */ - offsetof(Mgmt__PoolReintegrateReq, id), NULL, &protobuf_c_empty_string, 0, /* flags */ - 0, NULL, NULL /* reserved1,reserved2, etc */ - }, - { - "rank", 3, PROTOBUF_C_LABEL_NONE, PROTOBUF_C_TYPE_UINT32, 0, /* quantifier_offset */ - offsetof(Mgmt__PoolReintegrateReq, rank), NULL, NULL, 0, /* flags */ - 0, NULL, NULL /* reserved1,reserved2, etc */ - }, - { - "target_idx", 4, PROTOBUF_C_LABEL_REPEATED, PROTOBUF_C_TYPE_UINT32, - offsetof(Mgmt__PoolReintegrateReq, n_target_idx), - offsetof(Mgmt__PoolReintegrateReq, target_idx), NULL, NULL, 0, /* flags */ - 0, NULL, NULL /* reserved1,reserved2, etc */ - }, - { - "svc_ranks", 5, PROTOBUF_C_LABEL_REPEATED, PROTOBUF_C_TYPE_UINT32, - offsetof(Mgmt__PoolReintegrateReq, n_svc_ranks), - offsetof(Mgmt__PoolReintegrateReq, svc_ranks), NULL, NULL, 0, /* flags */ - 0, NULL, NULL /* reserved1,reserved2, etc */ - }, - { - "tier_bytes", 6, PROTOBUF_C_LABEL_REPEATED, PROTOBUF_C_TYPE_UINT64, - offsetof(Mgmt__PoolReintegrateReq, n_tier_bytes), - offsetof(Mgmt__PoolReintegrateReq, tier_bytes), NULL, NULL, 0, /* flags */ - 0, NULL, NULL /* reserved1,reserved2, etc */ - }, +static const ProtobufCFieldDescriptor mgmt__pool_reintegrate_req__field_descriptors[6] = +{ + { + "sys", + 1, + PROTOBUF_C_LABEL_NONE, + PROTOBUF_C_TYPE_STRING, + 0, /* quantifier_offset */ + offsetof(Mgmt__PoolReintegrateReq, sys), + NULL, + &protobuf_c_empty_string, + 0, /* flags */ + 0,NULL,NULL /* 
reserved1,reserved2, etc */ + }, + { + "id", + 2, + PROTOBUF_C_LABEL_NONE, + PROTOBUF_C_TYPE_STRING, + 0, /* quantifier_offset */ + offsetof(Mgmt__PoolReintegrateReq, id), + NULL, + &protobuf_c_empty_string, + 0, /* flags */ + 0,NULL,NULL /* reserved1,reserved2, etc */ + }, + { + "rank", + 3, + PROTOBUF_C_LABEL_NONE, + PROTOBUF_C_TYPE_UINT32, + 0, /* quantifier_offset */ + offsetof(Mgmt__PoolReintegrateReq, rank), + NULL, + NULL, + 0, /* flags */ + 0,NULL,NULL /* reserved1,reserved2, etc */ + }, + { + "target_idx", + 4, + PROTOBUF_C_LABEL_REPEATED, + PROTOBUF_C_TYPE_UINT32, + offsetof(Mgmt__PoolReintegrateReq, n_target_idx), + offsetof(Mgmt__PoolReintegrateReq, target_idx), + NULL, + NULL, + 0, /* flags */ + 0,NULL,NULL /* reserved1,reserved2, etc */ + }, + { + "svc_ranks", + 5, + PROTOBUF_C_LABEL_REPEATED, + PROTOBUF_C_TYPE_UINT32, + offsetof(Mgmt__PoolReintegrateReq, n_svc_ranks), + offsetof(Mgmt__PoolReintegrateReq, svc_ranks), + NULL, + NULL, + 0, /* flags */ + 0,NULL,NULL /* reserved1,reserved2, etc */ + }, + { + "tier_bytes", + 6, + PROTOBUF_C_LABEL_REPEATED, + PROTOBUF_C_TYPE_UINT64, + offsetof(Mgmt__PoolReintegrateReq, n_tier_bytes), + offsetof(Mgmt__PoolReintegrateReq, tier_bytes), + NULL, + NULL, + 0, /* flags */ + 0,NULL,NULL /* reserved1,reserved2, etc */ + }, }; static const unsigned mgmt__pool_reintegrate_req__field_indices_by_name[] = { - 1, /* field[1] = id */ - 2, /* field[2] = rank */ - 4, /* field[4] = svc_ranks */ - 0, /* field[0] = sys */ - 3, /* field[3] = target_idx */ - 5, /* field[5] = tier_bytes */ + 1, /* field[1] = id */ + 2, /* field[2] = rank */ + 4, /* field[4] = svc_ranks */ + 0, /* field[0] = sys */ + 3, /* field[3] = target_idx */ + 5, /* field[5] = tier_bytes */ }; static const ProtobufCIntRange mgmt__pool_reintegrate_req__number_ranges[1 + 1] = { @@ -2996,7 +3305,7 @@ const ProtobufCMessageDescriptor mgmt__pool_rebuild_status__descriptor = (ProtobufCMessageInit) mgmt__pool_rebuild_status__init, NULL,NULL,NULL /* reserved[123] */ }; -static const ProtobufCFieldDescriptor mgmt__pool_query_resp__field_descriptors[19] = +static const ProtobufCFieldDescriptor mgmt__pool_query_resp__field_descriptors[20] = { { "status", @@ -3226,6 +3535,18 @@ static const ProtobufCFieldDescriptor mgmt__pool_query_resp__field_descriptors[1 0, /* flags */ 0,NULL,NULL /* reserved1,reserved2, etc */ }, + { + "mem_file_bytes", + 21, + PROTOBUF_C_LABEL_NONE, + PROTOBUF_C_TYPE_UINT64, + 0, /* quantifier_offset */ + offsetof(Mgmt__PoolQueryResp, mem_file_bytes), + NULL, + NULL, + 0, /* flags */ + 0,NULL,NULL /* reserved1,reserved2, etc */ + }, }; static const unsigned mgmt__pool_query_resp__field_indices_by_name[] = { 4, /* field[4] = active_targets */ @@ -3234,6 +3555,7 @@ static const unsigned mgmt__pool_query_resp__field_indices_by_name[] = { 10, /* field[10] = enabled_ranks */ 2, /* field[2] = label */ 9, /* field[9] = leader */ + 19, /* field[19] = mem_file_bytes */ 13, /* field[13] = pool_layout_ver */ 18, /* field[18] = query_mask */ 6, /* field[6] = rebuild */ @@ -3252,7 +3574,7 @@ static const ProtobufCIntRange mgmt__pool_query_resp__number_ranges[2 + 1] = { { 1, 0 }, { 10, 8 }, - { 0, 19 } + { 0, 20 } }; const ProtobufCMessageDescriptor mgmt__pool_query_resp__descriptor = { @@ -3262,7 +3584,7 @@ const ProtobufCMessageDescriptor mgmt__pool_query_resp__descriptor = "Mgmt__PoolQueryResp", "mgmt", sizeof(Mgmt__PoolQueryResp), - 19, + 20, mgmt__pool_query_resp__field_descriptors, mgmt__pool_query_resp__field_indices_by_name, 2, mgmt__pool_query_resp__number_ranges, @@ -3904,7 
+4226,7 @@ const ProtobufCEnumDescriptor mgmt__pool_query_target_info__target_state__descri mgmt__pool_query_target_info__target_state__value_ranges, NULL,NULL,NULL,NULL /* reserved[1234] */ }; -static const ProtobufCFieldDescriptor mgmt__pool_query_target_info__field_descriptors[3] = +static const ProtobufCFieldDescriptor mgmt__pool_query_target_info__field_descriptors[4] = { { "type", @@ -3942,8 +4264,21 @@ static const ProtobufCFieldDescriptor mgmt__pool_query_target_info__field_descri 0, /* flags */ 0,NULL,NULL /* reserved1,reserved2, etc */ }, + { + "mem_file_bytes", + 4, + PROTOBUF_C_LABEL_NONE, + PROTOBUF_C_TYPE_UINT64, + 0, /* quantifier_offset */ + offsetof(Mgmt__PoolQueryTargetInfo, mem_file_bytes), + NULL, + NULL, + 0, /* flags */ + 0,NULL,NULL /* reserved1,reserved2, etc */ + }, }; static const unsigned mgmt__pool_query_target_info__field_indices_by_name[] = { + 3, /* field[3] = mem_file_bytes */ 2, /* field[2] = space */ 1, /* field[1] = state */ 0, /* field[0] = type */ @@ -3951,7 +4286,7 @@ static const unsigned mgmt__pool_query_target_info__field_indices_by_name[] = { static const ProtobufCIntRange mgmt__pool_query_target_info__number_ranges[1 + 1] = { { 1, 0 }, - { 0, 3 } + { 0, 4 } }; const ProtobufCMessageDescriptor mgmt__pool_query_target_info__descriptor = { @@ -3961,7 +4296,7 @@ const ProtobufCMessageDescriptor mgmt__pool_query_target_info__descriptor = "Mgmt__PoolQueryTargetInfo", "mgmt", sizeof(Mgmt__PoolQueryTargetInfo), - 3, + 4, mgmt__pool_query_target_info__field_descriptors, mgmt__pool_query_target_info__field_indices_by_name, 1, mgmt__pool_query_target_info__number_ranges, diff --git a/src/mgmt/pool.pb-c.h b/src/mgmt/pool.pb-c.h index 10ea70360d9..5ae75572370 100644 --- a/src/mgmt/pool.pb-c.h +++ b/src/mgmt/pool.pb-c.h @@ -160,7 +160,7 @@ struct _Mgmt__PoolCreateReq /* * formatted group e.g. "builders@" */ - char *user_group; + char *user_group; /* * Access Control Entries in short string format */ @@ -180,25 +180,25 @@ struct _Mgmt__PoolCreateReq /* * Fault domain tree, minimal format */ - size_t n_fault_domains; - uint32_t *fault_domains; + size_t n_fault_domains; + uint32_t *fault_domains; /* * desired number of pool service replicas */ - uint32_t num_svc_reps; + uint32_t num_svc_reps; /* * Total pool size in bytes */ - uint64_t total_bytes; + uint64_t total_bytes; /* * Ratio of storage tiers expressed as % of totalbytes */ - size_t n_tier_ratio; - double *tier_ratio; + size_t n_tier_ratio; + double *tier_ratio; /* * Number of target ranks to use */ - uint32_t num_ranks; + uint32_t num_ranks; /* * target ranks */ @@ -207,16 +207,17 @@ struct _Mgmt__PoolCreateReq /* * Size in bytes of storage tier */ - size_t n_tier_bytes; - uint64_t *tier_bytes; + size_t n_tier_bytes; + uint64_t *tier_bytes; + /* + * Fraction of meta-blob-sz to use as mem-file-sz + */ + float mem_ratio; }; -#define MGMT__POOL_CREATE_REQ__INIT \ - { \ - PROTOBUF_C_MESSAGE_INIT(&mgmt__pool_create_req__descriptor) \ - , (char *)protobuf_c_empty_string, (char *)protobuf_c_empty_string, \ - (char *)protobuf_c_empty_string, (char *)protobuf_c_empty_string, 0, NULL, 0, \ - NULL, 0, NULL, 0, 0, 0, NULL, 0, 0, NULL, 0, NULL \ - } +#define MGMT__POOL_CREATE_REQ__INIT \ + { PROTOBUF_C_MESSAGE_INIT (&mgmt__pool_create_req__descriptor) \ + , (char *)protobuf_c_empty_string, (char *)protobuf_c_empty_string, (char *)protobuf_c_empty_string, (char *)protobuf_c_empty_string, 0,NULL, 0,NULL, 0,NULL, 0, 0, 0,NULL, 0, 0,NULL, 0,NULL, 0 } + /* * PoolCreateResp returns created pool uuid and ranks. 
@@ -243,16 +244,19 @@ struct _Mgmt__PoolCreateResp size_t n_tgt_ranks; uint32_t *tgt_ranks; /* - * storage tiers allocated to pool + * per-rank storage tier sizes allocated in pool */ size_t n_tier_bytes; - uint64_t *tier_bytes; + uint64_t *tier_bytes; + /* + * per-rank accumulated value of memory file sizes + */ + uint64_t mem_file_bytes; }; -#define MGMT__POOL_CREATE_RESP__INIT \ - { \ - PROTOBUF_C_MESSAGE_INIT(&mgmt__pool_create_resp__descriptor) \ - , 0, 0, 0, NULL, 0, NULL, 0, NULL \ - } +#define MGMT__POOL_CREATE_RESP__INIT \ + { PROTOBUF_C_MESSAGE_INIT (&mgmt__pool_create_resp__descriptor) \ + , 0, 0, 0,NULL, 0,NULL, 0,NULL, 0 } + /* * PoolDestroyReq supplies pool identifier and force flag. @@ -386,8 +390,8 @@ struct _Mgmt__PoolExcludeReq /* * target ranks */ - size_t n_target_idx; - uint32_t *target_idx; + size_t n_target_idx; + uint32_t *target_idx; /* * List of pool service ranks */ @@ -436,8 +440,8 @@ struct _Mgmt__PoolDrainReq /* * rank targets */ - size_t n_target_idx; - uint32_t *target_idx; + size_t n_target_idx; + uint32_t *target_idx; /* * List of pool service ranks */ @@ -492,13 +496,13 @@ struct _Mgmt__PoolExtendReq /* * Size in bytes of storage tiers */ - size_t n_tier_bytes; - uint64_t *tier_bytes; + size_t n_tier_bytes; + uint64_t *tier_bytes; /* * fault domain tree, minimal format */ - size_t n_fault_domains; - uint32_t *fault_domains; + size_t n_fault_domains; + uint32_t *fault_domains; }; #define MGMT__POOL_EXTEND_REQ__INIT \ { PROTOBUF_C_MESSAGE_INIT (&mgmt__pool_extend_req__descriptor) \ @@ -520,10 +524,14 @@ struct _Mgmt__PoolExtendResp */ size_t n_tier_bytes; uint64_t *tier_bytes; + /* + * Size in bytes of metadata blob on SSD + */ + uint32_t meta_blob_bytes; }; #define MGMT__POOL_EXTEND_RESP__INIT \ { PROTOBUF_C_MESSAGE_INIT (&mgmt__pool_extend_resp__descriptor) \ - , 0, 0,NULL } + , 0, 0,NULL, 0 } /* @@ -547,8 +555,8 @@ struct _Mgmt__PoolReintegrateReq /* * target ranks */ - size_t n_target_idx; - uint32_t *target_idx; + size_t n_target_idx; + uint32_t *target_idx; /* * List of pool service ranks */ @@ -557,8 +565,8 @@ struct _Mgmt__PoolReintegrateReq /* * Size in bytes of storage tiers */ - size_t n_tier_bytes; - uint64_t *tier_bytes; + size_t n_tier_bytes; + uint64_t *tier_bytes; }; #define MGMT__POOL_REINTEGRATE_REQ__INIT \ { PROTOBUF_C_MESSAGE_INIT (&mgmt__pool_reintegrate_req__descriptor) \ @@ -857,10 +865,14 @@ struct _Mgmt__PoolQueryResp * Bitmask of pool query options used */ uint64_t query_mask; + /* + * per-pool accumulated value of memory file sizes + */ + uint64_t mem_file_bytes; }; #define MGMT__POOL_QUERY_RESP__INIT \ { PROTOBUF_C_MESSAGE_INIT (&mgmt__pool_query_resp__descriptor) \ - , 0, (char *)protobuf_c_empty_string, (char *)protobuf_c_empty_string, 0, 0, 0, NULL, 0,NULL, 0, 0, (char *)protobuf_c_empty_string, (char *)protobuf_c_empty_string, 0, 0, 0, MGMT__POOL_SERVICE_STATE__Creating, 0, 0,NULL, 0 } + , 0, (char *)protobuf_c_empty_string, (char *)protobuf_c_empty_string, 0, 0, 0, NULL, 0,NULL, 0, 0, (char *)protobuf_c_empty_string, (char *)protobuf_c_empty_string, 0, 0, 0, MGMT__POOL_SERVICE_STATE__Creating, 0, 0,NULL, 0, 0 } typedef enum { @@ -1102,10 +1114,14 @@ struct _Mgmt__PoolQueryTargetInfo */ size_t n_space; Mgmt__StorageTargetUsage **space; + /* + * per-target value of memory file size + */ + uint64_t mem_file_bytes; }; #define MGMT__POOL_QUERY_TARGET_INFO__INIT \ { PROTOBUF_C_MESSAGE_INIT (&mgmt__pool_query_target_info__descriptor) \ - , MGMT__POOL_QUERY_TARGET_INFO__TARGET_TYPE__UNKNOWN, 
MGMT__POOL_QUERY_TARGET_INFO__TARGET_STATE__STATE_UNKNOWN, 0,NULL } + , MGMT__POOL_QUERY_TARGET_INFO__TARGET_TYPE__UNKNOWN, MGMT__POOL_QUERY_TARGET_INFO__TARGET_STATE__STATE_UNKNOWN, 0,NULL, 0 } /* diff --git a/src/mgmt/rpc.h b/src/mgmt/rpc.h index 47e4deacc20..e7b5501be1f 100644 --- a/src/mgmt/rpc.h +++ b/src/mgmt/rpc.h @@ -166,7 +166,8 @@ CRT_RPC_DECLARE(mgmt_pool_list, DAOS_ISEQ_MGMT_POOL_LIST, DAOS_OSEQ_MGMT_POOL_LI ((uuid_t) (tc_pool_uuid) CRT_VAR) \ ((d_string_t) (tc_tgt_dev) CRT_VAR) \ ((daos_size_t) (tc_scm_size) CRT_VAR) \ - ((daos_size_t) (tc_nvme_size) CRT_VAR) + ((daos_size_t) (tc_nvme_size) CRT_VAR) \ + ((daos_size_t) (tc_meta_size) CRT_VAR) #define DAOS_OSEQ_MGMT_TGT_CREATE /* output fields */ \ ((d_rank_t) (tc_ranks) CRT_ARRAY) \ diff --git a/src/mgmt/srv_drpc.c b/src/mgmt/srv_drpc.c index be1a67a8c54..cd5fcdcb999 100644 --- a/src/mgmt/srv_drpc.c +++ b/src/mgmt/srv_drpc.c @@ -441,6 +441,7 @@ ds_mgmt_drpc_pool_create(Drpc__Call *drpc_req, Drpc__Response *drpc_resp) daos_prop_t *base_props = NULL; uint8_t *body; size_t len; + size_t scm_size; int rc; /* Unpack the inner request from the drpc call body */ @@ -495,13 +496,18 @@ ds_mgmt_drpc_pool_create(Drpc__Call *drpc_req, Drpc__Response *drpc_resp) } /** - * Ranks to allocate targets (in) & svc for pool replicas (out). Meta-blob size set equal - * to SCM size for MD-on-SSD phase 1. + * Ranks to allocate targets (in) & svc for pool replicas (out). Mapping of tier_bytes in + * MD-on-SSD mode is (tier0*mem_ratio)->scm_size (mem-file-size), tier0->meta_size and + * tier1->nvme_size (data_size). */ - rc = ds_mgmt_create_pool(pool_uuid, req->sys, "pmem", targets, - req->tier_bytes[DAOS_MEDIA_SCM], req->tier_bytes[DAOS_MEDIA_NVME], - prop, &svc, req->n_fault_domains, req->fault_domains, - req->tier_bytes[DAOS_MEDIA_SCM]); + + scm_size = req->tier_bytes[DAOS_MEDIA_SCM]; + if (req->mem_ratio) + scm_size *= (double)req->mem_ratio; + + rc = ds_mgmt_create_pool(pool_uuid, req->sys, "pmem", targets, scm_size, + req->tier_bytes[DAOS_MEDIA_NVME], prop, &svc, req->n_fault_domains, + req->fault_domains, req->tier_bytes[DAOS_MEDIA_SCM]); if (rc != 0) { D_ERROR("failed to create pool: "DF_RC"\n", DP_RC(rc)); goto out; @@ -510,6 +516,14 @@ ds_mgmt_drpc_pool_create(Drpc__Call *drpc_req, Drpc__Response *drpc_resp) rc = pool_create_fill_resp(&resp, pool_uuid, svc); d_rank_list_free(svc); + /** + * TODO DAOS-16209: Populate per-rank VOS-file sizes. For now just calculate here based on + * the supplied input values but really should be returned from + * ds_mgmt_pool_query() through the VOS query API and set in + * pool_create_fill_resp(). Return zero for non-MD-on-SSD mode. 
+ */ + resp.mem_file_bytes = req->tier_bytes[DAOS_MEDIA_SCM] * req->mem_ratio; + out: resp.status = rc; len = mgmt__pool_create_resp__get_packed_size(&resp); @@ -696,7 +710,7 @@ ds_mgmt_drpc_pool_evict(Drpc__Call *drpc_req, Drpc__Response *drpc_resp) static int pool_change_target_state(char *id, d_rank_list_t *svc_ranks, size_t n_target_idx, uint32_t *target_idx, uint32_t rank, pool_comp_state_t state, - size_t scm_size, size_t nvme_size) + size_t scm_size, size_t nvme_size, size_t meta_blob_bytes) { uuid_t uuid; struct pool_target_addr_list target_addr_list; @@ -725,7 +739,7 @@ pool_change_target_state(char *id, d_rank_list_t *svc_ranks, size_t n_target_idx } rc = ds_mgmt_pool_target_update_state(uuid, svc_ranks, &target_addr_list, state, scm_size, - nvme_size); + nvme_size, meta_blob_bytes); if (rc != 0) { D_ERROR("Failed to set pool target up "DF_UUID": "DF_RC"\n", DP_UUID(uuid), DP_RC(rc)); @@ -765,7 +779,7 @@ ds_mgmt_drpc_pool_exclude(Drpc__Call *drpc_req, Drpc__Response *drpc_resp) rc = pool_change_target_state(req->id, svc_ranks, req->n_target_idx, req->target_idx, req->rank, PO_COMP_ST_DOWN, 0 /* scm_size */, - 0 /* nvme_size */); + 0 /* nvme_size */, 0 /* meta_blob_bytes */); d_rank_list_free(svc_ranks); @@ -814,7 +828,7 @@ ds_mgmt_drpc_pool_drain(Drpc__Call *drpc_req, Drpc__Response *drpc_resp) rc = pool_change_target_state(req->id, svc_ranks, req->n_target_idx, req->target_idx, req->rank, PO_COMP_ST_DRAIN, 0 /* scm_size */, - 0 /* nvme_size */); + 0 /* nvme_size */, 0 /* meta_blob_bytes */); d_rank_list_free(svc_ranks); @@ -883,7 +897,7 @@ ds_mgmt_drpc_pool_extend(Drpc__Call *drpc_req, Drpc__Response *drpc_resp) if (svc_ranks == NULL) D_GOTO(out_list, rc = -DER_NOMEM); - rc = ds_mgmt_pool_extend(uuid, svc_ranks, rank_list, "pmem", scm_bytes, nvme_bytes, + rc = ds_mgmt_pool_extend(uuid, svc_ranks, rank_list, "pmem", scm_bytes, nvme_bytes, 0, req->n_fault_domains, req->fault_domains); if (rc != 0) @@ -898,6 +912,7 @@ ds_mgmt_drpc_pool_extend(Drpc__Call *drpc_req, Drpc__Response *drpc_resp) */ resp.n_tier_bytes = req->n_tier_bytes; resp.tier_bytes = req->tier_bytes; + resp.meta_blob_bytes = 0; out_list: d_rank_list_free(rank_list); @@ -957,7 +972,7 @@ ds_mgmt_drpc_pool_reintegrate(Drpc__Call *drpc_req, Drpc__Response *drpc_resp) D_GOTO(out, rc = -DER_NOMEM); rc = pool_change_target_state(req->id, svc_ranks, req->n_target_idx, req->target_idx, - req->rank, PO_COMP_ST_UP, scm_bytes, nvme_bytes); + req->rank, PO_COMP_ST_UP, scm_bytes, nvme_bytes, 0); d_rank_list_free(svc_ranks); @@ -1831,6 +1846,13 @@ ds_mgmt_drpc_pool_query(Drpc__Call *drpc_req, Drpc__Response *drpc_resp) pool_rebuild_status_from_info(&rebuild, &pool_info.pi_rebuild_st); resp.rebuild = &rebuild; + /** + * TODO DAOS-16209: Populate VOS-file sizes in response. For now just return the meta-blob + * size until VOS query API is updated. When updated, zero-value should + * be returned in non-MD-on-SSD mode. 
+ */ + resp.mem_file_bytes = scm.total; + error: resp.status = rc; diff --git a/src/mgmt/srv_internal.h b/src/mgmt/srv_internal.h index 11fe77c9b1c..a9de41a39bf 100644 --- a/src/mgmt/srv_internal.h +++ b/src/mgmt/srv_internal.h @@ -82,20 +82,21 @@ int ds_mgmt_group_update_handler(struct mgmt_grp_up_in *in); /** srv_pool.c */ int ds_mgmt_create_pool(uuid_t pool_uuid, const char *group, char *tgt_dev, d_rank_list_t *targets, size_t scm_size, size_t nvme_size, daos_prop_t *prop, d_rank_list_t **svcp, - int domains_nr, uint32_t *domains, size_t meta_blob_size); + int domains_nr, uint32_t *domains, size_t meta_blob_bytes); int ds_mgmt_destroy_pool(uuid_t pool_uuid, d_rank_list_t *svc_ranks); int ds_mgmt_evict_pool(uuid_t pool_uuid, d_rank_list_t *svc_ranks, uuid_t *handles, size_t n_handles, uint32_t destroy, uint32_t force_destroy, char *machine, uint32_t *count); int ds_mgmt_pool_target_update_state(uuid_t pool_uuid, d_rank_list_t *svc_ranks, struct pool_target_addr_list *target_addrs, - pool_comp_state_t state, size_t scm_size, size_t nvme_size); + pool_comp_state_t state, size_t scm_size, size_t nvme_size, + size_t meta_blob_bytes); int ds_mgmt_pool_reintegrate(uuid_t pool_uuid, d_rank_list_t *svc_ranks, uint32_t reint_rank, struct pool_target_id_list *reint_list); int ds_mgmt_pool_extend(uuid_t pool_uuid, d_rank_list_t *svc_ranks, d_rank_list_t *rank_list, char *tgt_dev, - size_t scm_size, size_t nvme_size, + size_t scm_size, size_t nvme_size, size_t meta_blob_bytes, size_t domains_nr, uint32_t *domains); int ds_mgmt_pool_set_prop(uuid_t pool_uuid, d_rank_list_t *svc_ranks, daos_prop_t *prop); diff --git a/src/mgmt/srv_pool.c b/src/mgmt/srv_pool.c index 0497058191c..9674b9e2652 100644 --- a/src/mgmt/srv_pool.c +++ b/src/mgmt/srv_pool.c @@ -83,7 +83,7 @@ pool_create_rpc_timeout(crt_rpc_t *tc_req, size_t scm_size) static int ds_mgmt_tgt_pool_create_ranks(uuid_t pool_uuid, char *tgt_dev, d_rank_list_t *rank_list, - size_t scm_size, size_t nvme_size) + size_t scm_size, size_t nvme_size, size_t meta_size) { crt_rpc_t *tc_req; crt_opcode_t opc; @@ -117,6 +117,7 @@ ds_mgmt_tgt_pool_create_ranks(uuid_t pool_uuid, char *tgt_dev, d_rank_list_t *ra tc_in->tc_tgt_dev = tgt_dev; tc_in->tc_scm_size = scm_size; tc_in->tc_nvme_size = nvme_size; + tc_in->tc_meta_size = meta_size; rc = dss_rpc_send(tc_req); if (rc == 0 && DAOS_FAIL_CHECK(DAOS_POOL_CREATE_FAIL_CORPC)) rc = -DER_TIMEDOUT; @@ -170,14 +171,15 @@ ds_mgmt_pool_svc_create(uuid_t pool_uuid, int ntargets, const char *group, d_ran int ds_mgmt_create_pool(uuid_t pool_uuid, const char *group, char *tgt_dev, d_rank_list_t *targets, size_t scm_size, size_t nvme_size, daos_prop_t *prop, d_rank_list_t **svcp, - int domains_nr, uint32_t *domains, size_t meta_blob_size) + int domains_nr, uint32_t *domains, size_t meta_size) { d_rank_list_t *pg_ranks = NULL; d_rank_list_t *pg_targets = NULL; int rc; int rc_cleanup; - D_DEBUG(DB_MGMT, DF_UUID ": meta blob size %ld", DP_UUID(pool_uuid), meta_blob_size); + D_DEBUG(DB_MGMT, DF_UUID ": create scm/meta/nvme sizes %ld/%ld/%ld\n", DP_UUID(pool_uuid), + scm_size, meta_size, nvme_size); /* Sanity check targets versus cart's current primary group members. * If any targets not in PG, flag error before MGMT_TGT_ corpcs fail. 
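In ds_mgmt_drpc_pool_create() above, the request's tier sizes are remapped for MD-on-SSD: tier0 supplies the per-rank meta-blob size, tier0 * mem_ratio the VOS memory-file (scm) size, and tier1 the NVMe data size; a zero mem_ratio leaves scm_size equal to tier0, preserving the existing PMem behaviour. The standalone sketch below only restates that arithmetic; the helper name and struct are hypothetical and not part of the patch:

```c
#include <stddef.h>
#include <stdint.h>

/* Hypothetical container for the three sizes handed to ds_mgmt_create_pool(). */
struct create_sizes {
	size_t scm_size;  /* per-rank VOS memory-file size        */
	size_t meta_size; /* per-rank metadata blob size (on SSD) */
	size_t nvme_size; /* per-rank NVMe data size              */
};

static inline struct create_sizes
map_tier_bytes(uint64_t tier0_bytes, uint64_t tier1_bytes, float mem_ratio)
{
	struct create_sizes s;

	s.meta_size = tier0_bytes;
	/* mem_ratio == 0 (PMem mode): memory-file size equals the SCM tier size. */
	s.scm_size  = mem_ratio ? (size_t)((double)tier0_bytes * mem_ratio) : tier0_bytes;
	s.nvme_size = tier1_bytes;
	return s;
}
```

These three sizes then flow through ds_mgmt_create_pool() and ds_mgmt_tgt_pool_create_ranks() into the new tc_meta_size field added to the MGMT_TGT_CREATE RPC in rpc.h, as shown in the srv_pool.c hunks above and below.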
@@ -215,7 +217,7 @@ ds_mgmt_create_pool(uuid_t pool_uuid, const char *group, char *tgt_dev, d_rank_l } rc = ds_mgmt_tgt_pool_create_ranks(pool_uuid, tgt_dev, targets, - scm_size, nvme_size); + scm_size, nvme_size, meta_size); if (rc != 0) { D_ERROR("creating pool "DF_UUID" on ranks failed: rc "DF_RC"\n", DP_UUID(pool_uuid), DP_RC(rc)); @@ -280,8 +282,8 @@ ds_mgmt_destroy_pool(uuid_t pool_uuid, d_rank_list_t *ranks) int ds_mgmt_pool_extend(uuid_t pool_uuid, d_rank_list_t *svc_ranks, d_rank_list_t *rank_list, - char *tgt_dev, size_t scm_size, size_t nvme_size, size_t domains_nr, - uint32_t *domains) + char *tgt_dev, size_t scm_size, size_t nvme_size, size_t meta_size, + size_t domains_nr, uint32_t *domains) { d_rank_list_t *unique_add_ranks = NULL; int ntargets; @@ -294,7 +296,7 @@ ds_mgmt_pool_extend(uuid_t pool_uuid, d_rank_list_t *svc_ranks, d_rank_list_t *r D_GOTO(out, rc); rc = ds_mgmt_tgt_pool_create_ranks(pool_uuid, tgt_dev, unique_add_ranks, scm_size, - nvme_size); + nvme_size, meta_size); if (rc != 0) { D_ERROR("creating pool on ranks "DF_UUID" failed: rc "DF_RC"\n", DP_UUID(pool_uuid), DP_RC(rc)); @@ -336,7 +338,8 @@ ds_mgmt_evict_pool(uuid_t pool_uuid, d_rank_list_t *svc_ranks, uuid_t *handles, int ds_mgmt_pool_target_update_state(uuid_t pool_uuid, d_rank_list_t *svc_ranks, struct pool_target_addr_list *target_addrs, - pool_comp_state_t state, size_t scm_size, size_t nvme_size) + pool_comp_state_t state, size_t scm_size, size_t nvme_size, + size_t meta_size) { int rc; @@ -354,7 +357,7 @@ ds_mgmt_pool_target_update_state(uuid_t pool_uuid, d_rank_list_t *svc_ranks, reint_ranks.rl_ranks = &target_addrs->pta_addrs[0].pta_rank; rc = ds_mgmt_tgt_pool_create_ranks(pool_uuid, "pmem", &reint_ranks, scm_size, - nvme_size); + nvme_size, meta_size); if (rc != 0) { D_ERROR("creating pool on ranks "DF_UUID" failed: rc " DF_RC"\n", DP_UUID(pool_uuid), DP_RC(rc)); diff --git a/src/mgmt/srv_target.c b/src/mgmt/srv_target.c index 7975a2115d4..fa54a05b529 100644 --- a/src/mgmt/srv_target.c +++ b/src/mgmt/srv_target.c @@ -576,7 +576,9 @@ recreate_pooltgts() DP_UUID(pool_info->spi_id), DP_RC(rc)); goto out; } - rc = tgt_recreate(pool_info->spi_id, pool_info->spi_blob_sz[SMD_DEV_TYPE_META], + + D_ASSERT(pool_info->spi_scm_sz > 0); + rc = tgt_recreate(pool_info->spi_id, pool_info->spi_scm_sz, pool_info->spi_tgt_cnt[SMD_DEV_TYPE_META], rdb_blob_sz); if (rc) goto out; @@ -719,6 +721,7 @@ struct vos_pool_arg { uuid_t vpa_uuid; daos_size_t vpa_scm_size; daos_size_t vpa_nvme_size; + daos_size_t vpa_meta_size; }; static int @@ -735,7 +738,8 @@ tgt_vos_create_one(void *varg) return rc; rc = vos_pool_create(path, (unsigned char *)vpa->vpa_uuid, vpa->vpa_scm_size, - vpa->vpa_nvme_size, 0, 0 /* version */, NULL); + vpa->vpa_nvme_size, vpa->vpa_meta_size, 0 /* flags */, + 0 /* version */, NULL); if (rc) D_ERROR(DF_UUID": failed to init vos pool %s: %d\n", DP_UUID(vpa->vpa_uuid), path, rc); @@ -755,7 +759,8 @@ tgt_vos_preallocate(uuid_t uuid, daos_size_t scm_size, int tgt_id) if (rc) goto out; - D_DEBUG(DB_MGMT, DF_UUID": creating vos file %s\n", DP_UUID(uuid), path); + D_DEBUG(DB_MGMT, DF_UUID ": creating vos file %s (%ld bytes)\n", DP_UUID(uuid), path, + scm_size); fd = open(path, O_CREAT|O_RDWR, 0600); if (fd < 0) { @@ -1043,15 +1048,14 @@ tgt_create_preallocate(void *arg) * 16MB minimum per pmemobj file (SCM partition) */ D_ASSERT(dss_tgt_nr > 0); + D_ASSERT((tca->tca_scm_size / dss_tgt_nr) >= (1 << 24)); if (!bio_nvme_configured(SMD_DEV_TYPE_META)) { - rc = 
tgt_vos_preallocate_sequential(tca->tca_ptrec->dptr_uuid, - max(tca->tca_scm_size / dss_tgt_nr, - 1 << 24), dss_tgt_nr); + rc = tgt_vos_preallocate_sequential( + tca->tca_ptrec->dptr_uuid, tca->tca_scm_size / dss_tgt_nr, dss_tgt_nr); } else { - rc = tgt_vos_preallocate_parallel(tca->tca_ptrec->dptr_uuid, - max(tca->tca_scm_size / dss_tgt_nr, - 1 << 24), dss_tgt_nr, - &tca->tca_ptrec->cancel_create); + rc = tgt_vos_preallocate_parallel( + tca->tca_ptrec->dptr_uuid, tca->tca_scm_size / dss_tgt_nr, dss_tgt_nr, + &tca->tca_ptrec->cancel_create); } if (rc) goto out; @@ -1078,6 +1082,8 @@ ds_mgmt_hdlr_tgt_create(crt_rpc_t *tc_req) pthread_t thread; bool canceled_thread = false; int rc = 0; + size_t tgt_scm_sz; + size_t tgt_meta_sz; /** incoming request buffer */ tc_in = crt_req_get(tc_req); @@ -1114,6 +1120,17 @@ ds_mgmt_hdlr_tgt_create(crt_rpc_t *tc_req) D_DEBUG(DB_MGMT, DF_UUID": record inserted to dpt_creates_ht\n", DP_UUID(tca.tca_ptrec->dptr_uuid)); + tgt_scm_sz = tc_in->tc_scm_size / dss_tgt_nr; + tgt_meta_sz = tc_in->tc_meta_size / dss_tgt_nr; + rc = vos_pool_roundup_size(&tgt_scm_sz, &tgt_meta_sz); + if (rc) { + D_ERROR(DF_UUID": failed to roundup the vos size: "DF_RC"\n", + DP_UUID(tc_in->tc_pool_uuid), DP_RC(rc)); + goto out_rec; + } + tc_in->tc_scm_size = tgt_scm_sz * dss_tgt_nr; + tc_in->tc_meta_size = tgt_meta_sz * dss_tgt_nr; + tca.tca_scm_size = tc_in->tc_scm_size; tca.tca_nvme_size = tc_in->tc_nvme_size; tca.tca_dx = dss_current_xstream(); @@ -1178,8 +1195,9 @@ ds_mgmt_hdlr_tgt_create(crt_rpc_t *tc_req) D_ASSERT(dss_tgt_nr > 0); uuid_copy(vpa.vpa_uuid, tc_in->tc_pool_uuid); /* A zero size accommodates the existing file */ - vpa.vpa_scm_size = 0; + vpa.vpa_scm_size = 0; vpa.vpa_nvme_size = tc_in->tc_nvme_size / dss_tgt_nr; + vpa.vpa_meta_size = tc_in->tc_meta_size / dss_tgt_nr; rc = dss_thread_collective(tgt_vos_create_one, &vpa, DSS_ULT_DEEP_STACK); if (rc) { D_ERROR(DF_UUID": thread collective tgt_vos_create_one failed, "DF_RC"\n", diff --git a/src/mgmt/tests/mocks.c b/src/mgmt/tests/mocks.c index 912a36f293a..4840030bca0 100644 --- a/src/mgmt/tests/mocks.c +++ b/src/mgmt/tests/mocks.c @@ -427,7 +427,8 @@ uuid_t ds_mgmt_target_update_uuid; int ds_mgmt_pool_target_update_state(uuid_t pool_uuid, d_rank_list_t *svc_ranks, struct pool_target_addr_list *target_addrs, - pool_comp_state_t state, size_t scm_size, size_t nvme_size) + pool_comp_state_t state, size_t scm_size, size_t nvme_size, + size_t meta_blob_bytes) { uuid_copy(ds_mgmt_target_update_uuid, pool_uuid); return ds_mgmt_target_update_return; @@ -445,7 +446,7 @@ uuid_t ds_mgmt_pool_extend_uuid; int ds_mgmt_pool_extend(uuid_t pool_uuid, d_rank_list_t *svc_ranks, d_rank_list_t *rank_list, - char *tgt_dev, size_t scm_size, size_t nvme_size, + char *tgt_dev, size_t scm_size, size_t nvme_size, size_t meta_blob_bytes, size_t domains_nr, uint32_t *domains) { uuid_copy(ds_mgmt_pool_extend_uuid, pool_uuid); @@ -525,7 +526,7 @@ ds_mgmt_group_update_handler(struct mgmt_grp_up_in *in) int ds_mgmt_create_pool(uuid_t pool_uuid, const char *group, char *tgt_dev, d_rank_list_t *targets, size_t scm_size, size_t nvme_size, daos_prop_t *prop, d_rank_list_t **svcp, - int domains_nr, uint32_t *domains, size_t meta_blob_size) + int domains_nr, uint32_t *domains, size_t meta_blob_bytes) { return 0; } diff --git a/src/object/cli_coll.c b/src/object/cli_coll.c index 3285bec58b3..d517e3269d6 100644 --- a/src/object/cli_coll.c +++ b/src/object/cli_coll.c @@ -873,7 +873,7 @@ queue_coll_query_task(tse_task_t *api_task, struct obj_auxi_args *obj_auxi, 
stru 0, 0, ocdc); for (i = 0; i < ocdc->grp_nr; i++) { - obj_coll_disp_dest(ocdc, coa->coa_dcts, &tgt_ep); + obj_coll_disp_dest(ocdc, coa->coa_dcts, &tgt_ep, obj->cob_md.omd_id); tmp = coa->coa_dcts[ocdc->cur_pos].dct_shards[tgt_ep.ep_tag].dcs_idx; rc = queue_shard_query_key_task(api_task, obj_auxi, epoch, tmp, map_ver, diff --git a/src/object/cli_obj.c b/src/object/cli_obj.c index fcc4f7601f4..75d661d0665 100644 --- a/src/object/cli_obj.c +++ b/src/object/cli_obj.c @@ -1718,15 +1718,20 @@ dc_obj_retry_delay(tse_task_t *task, int err, uint16_t *retry_cnt, uint16_t *inp uint32_t timeout_sec) { uint32_t delay = 0; + uint32_t limit = 4; /* - * Randomly delay 5 - 68 us if it is not the first retry for + * Randomly delay 5 ~ 1028 us if it is not the first retry for * -DER_INPROGRESS || -DER_UPDATE_AGAIN cases. */ ++(*retry_cnt); if (err == -DER_INPROGRESS || err == -DER_UPDATE_AGAIN) { if (++(*inprogress_cnt) > 1) { - delay = (d_rand() & ((1 << 6) - 1)) + 5; + limit += *inprogress_cnt; + if (limit > 10) + limit = 10; + + delay = (d_rand() & ((1 << limit) - 1)) + 5; /* Rebuild is being established on the server side, wait a bit longer */ if (err == -DER_UPDATE_AGAIN) delay <<= 10; @@ -4856,11 +4861,14 @@ obj_comp_cb(tse_task_t *task, void *data) D_ASSERT(daos_handle_is_inval(obj_auxi->th)); D_ASSERT(obj_is_modification_opc(obj_auxi->opc)); - if (task->dt_result == -DER_TX_ID_REUSED && obj_auxi->retry_cnt != 0) - /* XXX: it is must because miss to set "RESEND" flag, that is bug. */ - D_ASSERTF(0, - "Miss 'RESEND' flag (%x) when resend the RPC for task %p: %u\n", - obj_auxi->flags, task, obj_auxi->retry_cnt); + if (task->dt_result == -DER_TX_ID_REUSED && obj_auxi->retry_cnt != 0) { + D_ERROR("TX ID maybe reused for unknown reason, " + "task %p, opc %u, flags %x, retry_cnt %u\n", + task, obj_auxi->opc, obj_auxi->flags, obj_auxi->retry_cnt); + task->dt_result = -DER_IO; + obj_auxi->io_retry = 0; + goto args_fini; + } if (obj_auxi->opc == DAOS_OBJ_RPC_UPDATE) { daos_obj_rw_t *api_args = dc_task_get_args(obj_auxi->obj_task); @@ -4886,6 +4894,7 @@ obj_comp_cb(tse_task_t *task, void *data) } } +args_fini: if (obj_auxi->opc == DAOS_OBJ_RPC_COLL_PUNCH) obj_coll_oper_args_fini(&obj_auxi->p_args.pa_coa); diff --git a/src/object/cli_shard.c b/src/object/cli_shard.c index 0c9dfc1418e..9f084140f80 100644 --- a/src/object/cli_shard.c +++ b/src/object/cli_shard.c @@ -1451,11 +1451,14 @@ obj_shard_coll_punch_cb(tse_task_t *task, void *data) shard_args->pa_auxi.obj_auxi->max_delay = timeout; } - DL_CDEBUG(task->dt_result < 0, DLOG_ERR, DB_IO, task->dt_result, - "DAOS_OBJ_RPC_COLL_PUNCH RPC %p for "DF_UOID" with DTX " - DF_DTI" for task %p, map_ver %u/%u, flags %lx/%x, %s layout", - rpc, DP_UOID(ocpi->ocpi_oid), DP_DTI(&ocpi->ocpi_xid), task, ocpi->ocpi_map_ver, - *cb_args->cpca_ver, (unsigned long)ocpi->ocpi_api_flags, ocpi->ocpi_flags, + DL_CDEBUG(task->dt_result < 0 && task->dt_result != -DER_INPROGRESS, + DLOG_ERR, DB_IO, task->dt_result, + "DAOS_OBJ_RPC_COLL_PUNCH RPC %p for "DF_UOID" in "DF_UUID"/"DF_UUID"/" + DF_UUID" with DTX "DF_DTI" for task %p, map_ver %u/%u, flags %lx/%x, %s layout", + rpc, DP_UOID(ocpi->ocpi_oid), DP_UUID(ocpi->ocpi_po_uuid), + DP_UUID(ocpi->ocpi_co_hdl), DP_UUID(ocpi->ocpi_co_uuid), DP_DTI(&ocpi->ocpi_xid), + task, ocpi->ocpi_map_ver, *cb_args->cpca_ver, + (unsigned long)ocpi->ocpi_api_flags, ocpi->ocpi_flags, cb_args->cpca_shard_args->pa_coa.coa_raw_sparse ? 
"sparse" : "continuous"); crt_req_decref(rpc); diff --git a/src/object/obj_internal.h b/src/object/obj_internal.h index 06cfdb5b195..c0df21dd009 100644 --- a/src/object/obj_internal.h +++ b/src/object/obj_internal.h @@ -1100,7 +1100,7 @@ int daos_obj_query_merge(struct obj_query_merge_args *oqma); void obj_coll_disp_init(uint32_t tgt_nr, uint32_t max_tgt_size, uint32_t inline_size, uint32_t start, uint32_t max_width, struct obj_coll_disp_cursor *ocdc); void obj_coll_disp_dest(struct obj_coll_disp_cursor *ocdc, struct daos_coll_target *tgts, - crt_endpoint_t *tgt_ep); + crt_endpoint_t *tgt_ep, daos_obj_id_t oid); void obj_coll_disp_move(struct obj_coll_disp_cursor *ocdc); int obj_utils_init(void); void obj_utils_fini(void); diff --git a/src/object/obj_utils.c b/src/object/obj_utils.c index 82d91c966ac..c01947a05a1 100644 --- a/src/object/obj_utils.c +++ b/src/object/obj_utils.c @@ -616,23 +616,22 @@ obj_coll_disp_init(uint32_t tgt_nr, uint32_t max_tgt_size, uint32_t inline_size, void obj_coll_disp_dest(struct obj_coll_disp_cursor *ocdc, struct daos_coll_target *tgts, - crt_endpoint_t *tgt_ep) + crt_endpoint_t *tgt_ep, daos_obj_id_t oid) { struct daos_coll_target *dct = &tgts[ocdc->cur_pos]; struct daos_coll_target tmp; - unsigned long rand = 0; uint32_t size; int pos; int i; if (ocdc->cur_step > 2) { - rand = d_rand(); /* - * Randomly choose an engine as the relay one for load balance. - * If the one corresponding to "pos" is former moved one, then - * use the "cur_pos" as the relay engine. + * Choose an engine (according to the given oid) as the relay one for load balance. + * If the one corresponding to "pos" is former moved one, then use the "cur_pos" as + * the relay engine. Then even if related RPC was resent without changing pool map, + * then the relay one will be the same as the original case. */ - pos = rand % (ocdc->tgt_nr - ocdc->cur_pos) + ocdc->cur_pos; + pos = oid.lo % (ocdc->tgt_nr - ocdc->cur_pos) + ocdc->cur_pos; if (pos > ocdc->cur_pos && tgts[pos].dct_rank > dct->dct_rank) { memcpy(&tmp, &tgts[pos], sizeof(tmp)); memcpy(&tgts[pos], dct, sizeof(tmp)); @@ -642,8 +641,8 @@ obj_coll_disp_dest(struct obj_coll_disp_cursor *ocdc, struct daos_coll_target *t size = dct->dct_bitmap_sz << 3; - /* Randomly choose a XS as the local leader on target engine for load balance. */ - for (i = 0, pos = (rand != 0 ? rand : d_rand()) % dct->dct_tgt_nr; i < size; i++) { + /* Choose a target as the local agent on the engine for load balance. 
*/ + for (i = 0, pos = oid.lo % dct->dct_tgt_nr; i < size; i++) { if (isset(dct->dct_bitmap, i)) { pos -= dct->dct_shards[i].dcs_nr; if (pos < 0) diff --git a/src/object/srv_ec_aggregate.c b/src/object/srv_ec_aggregate.c index 71c630fa947..3e31067f7d2 100644 --- a/src/object/srv_ec_aggregate.c +++ b/src/object/srv_ec_aggregate.c @@ -1010,7 +1010,14 @@ agg_diff_preprocess(struct ec_agg_entry *entry, unsigned char *diff, hole_off = 0; d_list_for_each_entry(extent, &entry->ae_cur_stripe.as_dextents, ae_link) { - D_ASSERT(!extent->ae_hole); + if (extent->ae_hole) { + /* valid hole processed by agg_process_holes_ult() */ + D_ASSERTF(extent->ae_epoch < entry->ae_par_extent.ape_epoch, + "hole ext epoch " DF_X64 ", parity epoch " DF_X64 "\n", + extent->ae_epoch, entry->ae_par_extent.ape_epoch); + continue; + } + if (extent->ae_epoch <= entry->ae_par_extent.ape_epoch) continue; D_ASSERT(extent->ae_recx.rx_idx >= ss); @@ -2667,8 +2674,13 @@ cont_ec_aggregate_cb(struct ds_cont_child *cont, daos_epoch_range_t *epr, struct ec_agg_param *ec_agg_param = agg_param->ap_data; vos_iter_param_t iter_param = { 0 }; struct vos_iter_anchors anchors = { 0 }; + struct dtx_handle *dth = NULL; + struct dtx_share_peer *dsp; + struct dtx_id dti = { 0 }; + struct dtx_epoch epoch = { 0 }; + daos_unit_oid_t oid = { 0 }; + int blocks = 0; int rc = 0; - int blocks = 0; /* * Avoid calling into vos_aggregate() when aborting aggregation @@ -2715,8 +2727,32 @@ cont_ec_aggregate_cb(struct ds_cont_child *cont, daos_epoch_range_t *epr, agg_reset_entry(&ec_agg_param->ap_agg_entry, NULL, NULL); retry: + epoch.oe_value = epr->epr_hi; + rc = dtx_begin(cont->sc_hdl, &dti, &epoch, 0, cont->sc_pool->spc_map_version, &oid, + NULL, 0, 0, NULL, &dth); + if (rc != 0) + goto update_hae; + rc = vos_iterate(&iter_param, VOS_ITER_OBJ, true, &anchors, agg_iterate_pre_cb, - agg_iterate_post_cb, ec_agg_param, NULL); + agg_iterate_post_cb, ec_agg_param, dth); + if (rc == -DER_INPROGRESS && !d_list_empty(&dth->dth_share_tbd_list)) { + uint64_t now = daos_gettime_coarse(); + + /* Report warning per each 10 seconds to avoid log flood. */ + if (now - cont->sc_ec_agg_busy_ts > 10) { + while ((dsp = d_list_pop_entry(&dth->dth_share_tbd_list, + struct dtx_share_peer, dsp_link)) != NULL) { + D_WARN(DF_CONT ": EC aggregate hit non-committed DTX " DF_DTI "\n", + DP_CONT(cont->sc_pool->spc_uuid, cont->sc_uuid), + DP_DTI(&dsp->dsp_xid)); + dtx_dsp_free(dsp); + } + + cont->sc_ec_agg_busy_ts = now; + } + } + + dtx_end(dth, cont, rc); /* Post_cb may not being executed in some cases */ agg_clear_extents(&ec_agg_param->ap_agg_entry); diff --git a/src/object/srv_obj.c b/src/object/srv_obj.c index 041ea903c4f..cdd6f4ffa67 100644 --- a/src/object/srv_obj.c +++ b/src/object/srv_obj.c @@ -2953,8 +2953,11 @@ ds_obj_rw_handler(crt_rpc_t *rpc) d_tm_inc_counter(opm->opm_update_resent, 1); -again1: - e = 0; +again: + if (flags & ORF_RESEND) + e = orw->orw_epoch; + else + e = 0; rc = dtx_handle_resend(ioc.ioc_vos_coh, &orw->orw_dti, &e, &version); switch (rc) { @@ -2965,8 +2968,13 @@ ds_obj_rw_handler(crt_rpc_t *rpc) orw->orw_epoch = e; /* TODO: Also recover the epoch uncertainty. 
*/ break; + case -DER_MISMATCH: + rc = vos_dtx_abort(ioc.ioc_vos_coh, &orw->orw_dti, e); + if (rc < 0 && rc != -DER_NONEXIST) + D_GOTO(out, rc); + /* Fall through */ case -DER_NONEXIST: - rc = 0; + flags = 0; break; default: D_GOTO(out, rc); @@ -2976,7 +2984,6 @@ ds_obj_rw_handler(crt_rpc_t *rpc) D_GOTO(out, rc); } -again2: /* For leader case, we need to find out the potential conflict * (or share the same non-committed object/dkey) DTX(s) in the * CoS (committable) cache, piggyback them via the dispdatched @@ -3021,7 +3028,7 @@ ds_obj_rw_handler(crt_rpc_t *rpc) exec_arg.rpc = rpc; exec_arg.ioc = &ioc; - exec_arg.flags = flags; + exec_arg.flags |= flags; exec_arg.start = orw->orw_start_shard; /* Execute the operation on all targets */ @@ -3036,28 +3043,25 @@ ds_obj_rw_handler(crt_rpc_t *rpc) case -DER_TX_RESTART: /* * If this is a standalone operation, we can restart the - * internal transaction right here. Otherwise, we have to defer - * the restart to the RPC client. + * internal transaction right here. Otherwise we have to + * defer the restart to the RPC sponsor. */ - if (opc == DAOS_OBJ_RPC_UPDATE) { - /* - * Only standalone updates use this RPC. Retry with - * newer epoch. - */ - orw->orw_epoch = d_hlc_get(); - orw->orw_flags &= ~ORF_RESEND; - flags = 0; - d_tm_inc_counter(opm->opm_update_restart, 1); - goto again2; - } + if (opc != DAOS_OBJ_RPC_UPDATE) + break; - break; + /* Only standalone updates use this RPC. Retry with newer epoch. */ + orw->orw_epoch = d_hlc_get(); + exec_arg.flags |= ORF_RESEND; + flags = ORF_RESEND; + d_tm_inc_counter(opm->opm_update_restart, 1); + goto again; case -DER_AGAIN: - orw->orw_flags |= ORF_RESEND; need_abort = true; + exec_arg.flags |= ORF_RESEND; + flags = ORF_RESEND; d_tm_inc_counter(opm->opm_update_retry, 1); ABT_thread_yield(); - goto again1; + goto again; default: break; } @@ -3875,8 +3879,11 @@ ds_obj_punch_handler(crt_rpc_t *rpc) if (opi->opi_flags & ORF_RESEND) { daos_epoch_t e; -again1: - e = 0; +again: + if (flags & ORF_RESEND) + e = opi->opi_epoch; + else + e = 0; rc = dtx_handle_resend(ioc.ioc_vos_coh, &opi->opi_dti, &e, &version); switch (rc) { @@ -3887,8 +3894,13 @@ ds_obj_punch_handler(crt_rpc_t *rpc) flags |= ORF_RESEND; /* TODO: Also recovery the epoch uncertainty. */ break; + case -DER_MISMATCH: + rc = vos_dtx_abort(ioc.ioc_vos_coh, &opi->opi_dti, e); + if (rc < 0 && rc != -DER_NONEXIST) + D_GOTO(out, rc); + /* Fall through */ case -DER_NONEXIST: - rc = 0; + flags = 0; break; default: D_GOTO(out, rc); @@ -3898,7 +3910,6 @@ ds_obj_punch_handler(crt_rpc_t *rpc) goto cleanup; } -again2: /* For leader case, we need to find out the potential conflict * (or share the same non-committed object/dkey) DTX(s) in the * CoS (committable) cache, piggyback them via the dispdatched @@ -3943,7 +3954,7 @@ ds_obj_punch_handler(crt_rpc_t *rpc) exec_arg.rpc = rpc; exec_arg.ioc = &ioc; - exec_arg.flags = flags; + exec_arg.flags |= flags; /* Execute the operation on all shards */ if (opi->opi_api_flags & DAOS_COND_PUNCH) @@ -3959,19 +3970,17 @@ ds_obj_punch_handler(crt_rpc_t *rpc) rc = dtx_leader_end(dlh, ioc.ioc_coh, rc); switch (rc) { case -DER_TX_RESTART: - /* - * Only standalone punches use this RPC. Retry with newer - * epoch. - */ + /* Only standalone punches use this RPC. Retry with newer epoch. 
*/ opi->opi_epoch = d_hlc_get(); - opi->opi_flags &= ~ORF_RESEND; - flags = 0; - goto again2; + exec_arg.flags |= ORF_RESEND; + flags = ORF_RESEND; + goto again; case -DER_AGAIN: - opi->opi_flags |= ORF_RESEND; need_abort = true; + exec_arg.flags |= ORF_RESEND; + flags = ORF_RESEND; ABT_thread_yield(); - goto again1; + goto again; default: break; } @@ -4401,6 +4410,44 @@ obj_cpd_reply(crt_rpc_t *rpc, int status, uint32_t map_version) oco->oco_sub_epochs.ca_count = 0; } +static inline void +cpd_unpin_objects(daos_handle_t coh, struct vos_pin_handle *pin_hdl) +{ + if (pin_hdl != NULL) + vos_unpin_objects(coh, pin_hdl); +} + +static int +cpd_pin_objects(daos_handle_t coh, struct daos_cpd_sub_req *dcsrs, + struct daos_cpd_req_idx *dcri, int count, struct vos_pin_handle **pin_hdl) +{ + struct daos_cpd_sub_req *dcsr; + daos_unit_oid_t *oids; + int i, rc; + + if (count == 0) + return 0; + + D_ALLOC_ARRAY(oids, count); + if (oids == NULL) + return -DER_NOMEM; + + for (i = 0; i < count; i++) { + dcsr = &dcsrs[dcri[i].dcri_req_idx]; + dcsr->dcsr_oid.id_shard = dcri[i].dcri_shard_id; + + D_ASSERT(dcsr->dcsr_opc != DCSO_READ); + oids[i] = dcsr->dcsr_oid; + } + + rc = vos_pin_objects(coh, oids, count, pin_hdl); + if (rc) + DL_ERROR(rc, "Failed to pin CPD objects."); + + D_FREE(oids); + return rc; +} + /* Locally process the operations belong to one DTX. * Common logic, shared by both leader and non-leader. */ @@ -4438,6 +4485,7 @@ ds_cpd_handle_one(crt_rpc_t *rpc, struct daos_cpd_sub_head *dcsh, struct daos_cp int i; uint64_t update_flags; uint64_t sched_seq = sched_cur_seq(); + struct vos_pin_handle *pin_hdl = NULL; if (dth->dth_flags & DTE_LEADER && DAOS_FAIL_CHECK(DAOS_DTX_RESTART)) @@ -4500,6 +4548,12 @@ ds_cpd_handle_one(crt_rpc_t *rpc, struct daos_cpd_sub_head *dcsh, struct daos_cp } } + rc = cpd_pin_objects(ioc->ioc_vos_coh, dcsrs, dcri, dcde->dcde_write_cnt, &pin_hdl); + if (rc) { + DL_ERROR(rc, "Failed to pin objects."); + goto out; + } + /* P2: vos_update_begin. */ for (i = 0; i < dcde->dcde_write_cnt; i++) { dcsr = &dcsrs[dcri[i].dcri_req_idx]; @@ -4820,6 +4874,8 @@ ds_cpd_handle_one(crt_rpc_t *rpc, struct daos_cpd_sub_head *dcsh, struct daos_cp } } + cpd_unpin_objects(ioc->ioc_vos_coh, pin_hdl); + D_FREE(iohs); D_FREE(biods); D_FREE(bulks); @@ -5663,8 +5719,11 @@ ds_obj_coll_punch_handler(crt_rpc_t *rpc) if (ocpi->ocpi_flags & ORF_RESEND) { -again1: - tmp = 0; +again: + if (!(ocpi->ocpi_flags & ORF_LEADER) || (flags & ORF_RESEND)) + tmp = ocpi->ocpi_epoch; + else + tmp = 0; rc = dtx_handle_resend(ioc.ioc_vos_coh, &ocpi->ocpi_xid, &tmp, &version); switch (rc) { case -DER_ALREADY: @@ -5674,7 +5733,13 @@ ds_obj_coll_punch_handler(crt_rpc_t *rpc) flags |= ORF_RESEND; /* TODO: Also recovery the epoch uncertainty. 
*/ break; + case -DER_MISMATCH: + rc = vos_dtx_abort(ioc.ioc_vos_coh, &ocpi->ocpi_xid, tmp); + if (rc < 0 && rc != -DER_NONEXIST) + D_GOTO(out, rc); + /* Fall through */ case -DER_NONEXIST: + flags = 0; break; default: D_GOTO(out, rc); @@ -5683,7 +5748,6 @@ ds_obj_coll_punch_handler(crt_rpc_t *rpc) dce->dce_ver = version; } -again2: epoch.oe_value = ocpi->ocpi_epoch; epoch.oe_first = epoch.oe_value; epoch.oe_flags = orf_to_dtx_epoch_flags(ocpi->ocpi_flags); @@ -5695,7 +5759,7 @@ ds_obj_coll_punch_handler(crt_rpc_t *rpc) exec_arg.rpc = rpc; exec_arg.ioc = &ioc; - exec_arg.flags = flags; + exec_arg.flags |= flags; exec_arg.coll_shards = dcts[0].dct_shards; exec_arg.coll_tgts = dcts; obj_coll_disp_init(dct_nr, ocpi->ocpi_max_tgt_sz, @@ -5728,14 +5792,15 @@ ds_obj_coll_punch_handler(crt_rpc_t *rpc) switch (rc) { case -DER_TX_RESTART: ocpi->ocpi_epoch = d_hlc_get(); - ocpi->ocpi_flags &= ~ORF_RESEND; - flags = 0; - goto again2; + exec_arg.flags |= ORF_RESEND; + flags = ORF_RESEND; + goto again; case -DER_AGAIN: - ocpi->ocpi_flags |= ORF_RESEND; need_abort = true; + exec_arg.flags |= ORF_RESEND; + flags = ORF_RESEND; ABT_thread_yield(); - goto again1; + goto again; default: break; } @@ -5755,12 +5820,14 @@ ds_obj_coll_punch_handler(crt_rpc_t *rpc) max_ver = version; DL_CDEBUG(rc != 0 && rc != -DER_INPROGRESS && rc != -DER_TX_RESTART, DLOG_ERR, DB_IO, rc, - "(%s) handled collective punch RPC %p for obj "DF_UOID" on XS %u/%u epc " - DF_X64" pmv %u/%u, with dti "DF_DTI", bulk_tgt_sz %u, bulk_tgt_nr %u, " - "tgt_nr %u, forward width %u, forward depth %u, flags %x", + "(%s) handled collective punch RPC %p for obj "DF_UOID" on XS %u/%u in "DF_UUID"/" + DF_UUID"/"DF_UUID" with epc "DF_X64", pmv %u/%u, dti "DF_DTI", bulk_tgt_sz %u, " + "bulk_tgt_nr %u, tgt_nr %u, forward width %u, forward depth %u, flags %x", (ocpi->ocpi_flags & ORF_LEADER) ? "leader" : (ocpi->ocpi_tgts.ca_count == 1 ? 
"non-leader" : "relay-engine"), rpc, - DP_UOID(ocpi->ocpi_oid), dmi->dmi_xs_id, dmi->dmi_tgt_id, ocpi->ocpi_epoch, + DP_UOID(ocpi->ocpi_oid), dmi->dmi_xs_id, dmi->dmi_tgt_id, + DP_UUID(ocpi->ocpi_po_uuid), DP_UUID(ocpi->ocpi_co_hdl), + DP_UUID(ocpi->ocpi_co_uuid), ocpi->ocpi_epoch, ocpi->ocpi_map_ver, max_ver, DP_DTI(&ocpi->ocpi_xid), ocpi->ocpi_bulk_tgt_sz, ocpi->ocpi_bulk_tgt_nr, (unsigned int)ocpi->ocpi_tgts.ca_count, ocpi->ocpi_disp_width, ocpi->ocpi_disp_depth, ocpi->ocpi_flags); diff --git a/src/object/srv_obj_remote.c b/src/object/srv_obj_remote.c index ce06723621b..f64d851e5b4 100644 --- a/src/object/srv_obj_remote.c +++ b/src/object/srv_obj_remote.c @@ -136,7 +136,7 @@ ds_obj_remote_update(struct dtx_leader_handle *dlh, void *data, int idx, *orw = *orw_parent; orw->orw_oid.id_shard = shard_tgt->st_shard_id; - orw->orw_flags |= ORF_BULK_BIND | obj_exec_arg->flags; + orw->orw_flags |= (ORF_BULK_BIND | obj_exec_arg->flags) & ~ORF_LEADER; if (shard_tgt->st_flags & DTF_DELAY_FORWARD && dlh->dlh_drop_cond) orw->orw_api_flags &= ~DAOS_COND_MASK; orw->orw_dti_cos.ca_count = dth->dth_dti_cos_count; @@ -247,7 +247,7 @@ ds_obj_remote_punch(struct dtx_leader_handle *dlh, void *data, int idx, *opi = *opi_parent; opi->opi_oid.id_shard = shard_tgt->st_shard_id; - opi->opi_flags |= obj_exec_arg->flags; + opi->opi_flags |= obj_exec_arg->flags & ~ORF_LEADER; if (shard_tgt->st_flags & DTF_DELAY_FORWARD && dlh->dlh_drop_cond) opi->opi_api_flags &= ~DAOS_COND_PUNCH; opi->opi_dti_cos.ca_count = dth->dth_dti_cos_count; @@ -495,7 +495,7 @@ ds_obj_coll_punch_remote(struct dtx_leader_handle *dlh, void *data, int idx, crt_endpoint_t tgt_ep = { 0 }; crt_rpc_t *parent_req = exec_arg->rpc; crt_rpc_t *req; - struct obj_coll_punch_in *ocpi_parent; + struct obj_coll_punch_in *ocpi_parent = crt_req_get(parent_req); struct obj_coll_punch_in *ocpi; int tag; int rc = 0; @@ -509,7 +509,7 @@ ds_obj_coll_punch_remote(struct dtx_leader_handle *dlh, void *data, int idx, if (remote_arg == NULL) D_GOTO(out, rc = -DER_NOMEM); - obj_coll_disp_dest(cursor, exec_arg->coll_tgts, &tgt_ep); + obj_coll_disp_dest(cursor, exec_arg->coll_tgts, &tgt_ep, ocpi_parent->ocpi_oid.id_pub); tag = tgt_ep.ep_tag; crt_req_addref(parent_req); @@ -524,9 +524,7 @@ ds_obj_coll_punch_remote(struct dtx_leader_handle *dlh, void *data, int idx, D_GOTO(out, rc); } - ocpi_parent = crt_req_get(parent_req); ocpi = crt_req_get(req); - ocpi->ocpi_odm = ocpi_parent->ocpi_odm; uuid_copy(ocpi->ocpi_po_uuid, ocpi_parent->ocpi_po_uuid); uuid_copy(ocpi->ocpi_co_hdl, ocpi_parent->ocpi_co_hdl); @@ -634,7 +632,7 @@ ds_obj_coll_query_remote(struct dtx_leader_handle *dlh, void *data, int idx, if (remote_arg == NULL) D_GOTO(out, rc = -DER_NOMEM); - obj_coll_disp_dest(cursor, exec_arg->coll_tgts, &tgt_ep); + obj_coll_disp_dest(cursor, exec_arg->coll_tgts, &tgt_ep, ocqi_parent->ocqi_oid.id_pub); tag = tgt_ep.ep_tag; remote_arg->dlh = dlh; diff --git a/src/pool/srv_target.c b/src/pool/srv_target.c index a3125152013..988680b3e76 100644 --- a/src/pool/srv_target.c +++ b/src/pool/srv_target.c @@ -447,8 +447,10 @@ pool_child_recreate(struct ds_pool_child *child) goto pool_info; } - rc = vos_pool_create(path, child->spc_uuid, 0, pool_info->spi_blob_sz[SMD_DEV_TYPE_DATA], - 0, 0 /* version */, NULL); + rc = vos_pool_create(path, child->spc_uuid, 0 /* scm_sz */, + pool_info->spi_blob_sz[SMD_DEV_TYPE_DATA], + pool_info->spi_blob_sz[SMD_DEV_TYPE_META], + 0 /* flags */, 0 /* version */, NULL); if (rc) DL_ERROR(rc, DF_UUID": Create VOS pool failed.", DP_UUID(child->spc_uuid)); diff 
--git a/src/proto/ctl/storage_nvme.proto b/src/proto/ctl/storage_nvme.proto index be068e5274d..81864169e33 100644 --- a/src/proto/ctl/storage_nvme.proto +++ b/src/proto/ctl/storage_nvme.proto @@ -28,7 +28,8 @@ message ScanNvmeReq { bool Basic = 3; // Strip NVMe device details to only basic uint64 MetaSize = 4; // Size of the metadata blob uint64 RdbSize = 5; // Size of the RDB blob - bool LinkStats = 6; // Populate PCIe link info in health statistics + float MemRatio = 6; // Ratio of VOS-file:meta-blob sizes + bool LinkStats = 7; // Populate PCIe link info in health statistics } message ScanNvmeResp { diff --git a/src/proto/mgmt/pool.proto b/src/proto/mgmt/pool.proto index e65374afaec..ad6920bbf6a 100644 --- a/src/proto/mgmt/pool.proto +++ b/src/proto/mgmt/pool.proto @@ -32,6 +32,7 @@ message PoolCreateReq { uint32 num_ranks = 11; // Number of target ranks to use repeated uint32 ranks = 12; // target ranks repeated uint64 tier_bytes = 13; // Size in bytes of storage tier + float mem_ratio = 14; // Fraction of meta-blob-sz to use as mem-file-sz } // PoolCreateResp returns created pool uuid and ranks. @@ -40,7 +41,8 @@ message PoolCreateResp { uint32 svc_ldr = 2; // Current service leader rank repeated uint32 svc_reps = 3; // pool service replica ranks repeated uint32 tgt_ranks = 4; // pool target ranks - repeated uint64 tier_bytes = 5; // storage tiers allocated to pool + repeated uint64 tier_bytes = 5; // per-rank storage tier sizes allocated in pool + uint64 mem_file_bytes = 6; // per-rank accumulated value of memory file sizes } // PoolDestroyReq supplies pool identifier and force flag. @@ -116,6 +118,7 @@ message PoolExtendReq { message PoolExtendResp { int32 status = 1; // DAOS error code repeated uint64 tier_bytes = 2; // storage tiers allocated to pool + uint32 meta_blob_bytes = 3; // Size in bytes of metadata blob on SSD } // PoolReintegrateReq supplies pool identifier, rank, and target_idxs. @@ -235,7 +238,8 @@ message PoolQueryResp { PoolServiceState state = 17; // pool state uint32 svc_ldr = 18; // current raft leader (2.6+) repeated uint32 svc_reps = 19; // service replica ranks - uint64 query_mask = 20; // Bitmask of pool query options used + uint64 query_mask = 20; // Bitmask of pool query options used + uint64 mem_file_bytes = 21; // per-pool accumulated value of memory file sizes } message PoolProperty { @@ -326,6 +330,7 @@ message PoolQueryTargetInfo { TargetState state = 2; // target state see enum daos_target_state_t // TODO: target performance data repeated StorageTargetUsage space = 3; // this target's usage per storage tier + uint64 mem_file_bytes = 4; // per-target value of memory file size } // PoolQueryTargetResp represents a pool target query response diff --git a/src/rdb/rdb.c b/src/rdb/rdb.c index 7ca0879ed3a..ccae3e3f647 100644 --- a/src/rdb/rdb.c +++ b/src/rdb/rdb.c @@ -58,7 +58,7 @@ rdb_create(const char *path, const uuid_t uuid, uint64_t caller_term, size_t siz * basic system memory reservation and VOS_POF_EXCL for concurrent * access protection. 
*/ - rc = vos_pool_create(path, (unsigned char *)uuid, size, 0 /* nvme_sz */, + rc = vos_pool_create(path, (unsigned char *)uuid, size, 0 /* data_sz */, 0 /* meta_sz */, VOS_POF_SMALL | VOS_POF_EXCL | VOS_POF_RDB, vos_df_version, &pool); if (rc != 0) goto out; diff --git a/src/tests/ftest/control/dmg_pool_query_test.py b/src/tests/ftest/control/dmg_pool_query_test.py index b7c83b59b55..593862a0ea0 100644 --- a/src/tests/ftest/control/dmg_pool_query_test.py +++ b/src/tests/ftest/control/dmg_pool_query_test.py @@ -97,7 +97,11 @@ def test_pool_query_basic(self): "tier_name": "NVME", "size": self.params.get("total", path="/run/exp_vals/nvme/*") } - ] + ], + "mem_file_bytes": ( + self.params.get("total", path="/run/exp_vals/scm/*") if + self.server_managers[0].manager.job.using_control_metadata else + 0) } self.assertDictEqual( diff --git a/src/tests/ftest/pool/list_verbose.py b/src/tests/ftest/pool/list_verbose.py index 370ca81ad40..a46dfb73408 100644 --- a/src/tests/ftest/pool/list_verbose.py +++ b/src/tests/ftest/pool/list_verbose.py @@ -108,7 +108,12 @@ def create_expected(self, pool, scm_free, nvme_free, scm_imbalance, "size": nvme_size, "free": nvme_free, "imbalance": nvme_imbalance - }], + }, + ], + "mem_file_bytes": ( + scm_size if + self.server_managers[0].manager.job.using_control_metadata else + 0) } @staticmethod diff --git a/src/tests/ftest/util/dfuse_utils.py b/src/tests/ftest/util/dfuse_utils.py index a26f372e76d..900da63ebf1 100644 --- a/src/tests/ftest/util/dfuse_utils.py +++ b/src/tests/ftest/util/dfuse_utils.py @@ -30,7 +30,6 @@ def __init__(self, namespace, command, path=""): self.sys_name = FormattedParameter("--sys-name {}") self.thread_count = FormattedParameter("--thread-count {}") self.eq_count = FormattedParameter("--eq-count {}") - self.singlethreaded = FormattedParameter("--singlethread", False) self.foreground = FormattedParameter("--foreground", False) self.enable_caching = FormattedParameter("--enable-caching", False) self.enable_wb_cache = FormattedParameter("--enable-wb-cache", False) diff --git a/src/tests/ftest/util/dmg_utils.py b/src/tests/ftest/util/dmg_utils.py index a48b45e59dd..271560275f5 100644 --- a/src/tests/ftest/util/dmg_utils.py +++ b/src/tests/ftest/util/dmg_utils.py @@ -601,8 +601,11 @@ def pool_create(self, scm_size, uid=None, gid=None, nvme_size=None, # 0, # 1 # ], - # "scm_bytes": 256000000, - # "nvme_bytes": 0 + # "tier_bytes": [ + # 256000000, + # 0 + # ], + # "mem_file_bytes": 0 # }, # "error": null, # "status": 0 @@ -622,6 +625,7 @@ def pool_create(self, scm_size, uid=None, gid=None, nvme_size=None, data["ranks"] = ",".join([str(r) for r in output["response"]["tgt_ranks"]]) data["scm_per_rank"] = output["response"]["tier_bytes"][0] data["nvme_per_rank"] = output["response"]["tier_bytes"][1] + data["memfile_per_rank"] = output["response"]["mem_file_bytes"] return data diff --git a/src/tests/ftest/util/telemetry_utils.py b/src/tests/ftest/util/telemetry_utils.py index 70d2352881d..fb6b37d9ef3 100644 --- a/src/tests/ftest/util/telemetry_utils.py +++ b/src/tests/ftest/util/telemetry_utils.py @@ -427,7 +427,7 @@ class TelemetryUtils(): ENGINE_NVME_CRIT_WARN_METRICS +\ ENGINE_NVME_INTEL_VENDOR_METRICS ENGINE_MEM_USAGE_METRICS = [ - "engine_mem_vos_vos_obj_360", + "engine_mem_vos_vos_obj_408", "engine_mem_vos_vos_lru_size", "engine_mem_dtx_dtx_leader_handle_360"] ENGINE_MEM_TOTAL_USAGE_METRICS = [ diff --git a/src/tests/vos_engine.c b/src/tests/vos_engine.c index 077dd4f061b..33fe7068ffc 100644 --- a/src/tests/vos_engine.c +++ 
b/src/tests/vos_engine.c @@ -32,7 +32,8 @@ engine_pool_init(struct credit_context *tsc) if (tsc_create_pool(tsc)) { /* Use pool size as blob size for this moment. */ - rc = vos_pool_create(pmem_file, tsc->tsc_pool_uuid, 0, tsc->tsc_nvme_size, 0, + rc = vos_pool_create(pmem_file, tsc->tsc_pool_uuid, 0 /* scm_sz */, + tsc->tsc_nvme_size, 0 /* meta_sz */, 0 /* flags */, 0 /* version */, &poh); if (rc) return rc; diff --git a/src/utils/ddb/ddb_vos.c b/src/utils/ddb/ddb_vos.c index b331e830fdd..ff52e052247 100644 --- a/src/utils/ddb/ddb_vos.c +++ b/src/utils/ddb/ddb_vos.c @@ -1746,7 +1746,7 @@ sync_cb(struct ddbs_sync_info *info, void *cb_args) D_WARN("delete target failed: " DF_RC "\n", DP_RC(rc)); rc = smd_pool_add_tgt(pool_id, info->dsi_hdr->bbh_vos_id, - info->dsi_hdr->bbh_blob_id, st, blob_size); + info->dsi_hdr->bbh_blob_id, st, blob_size, 0); if (!SUCCESS(rc)) { D_ERROR("add target failed: "DF_RC"\n", DP_RC(rc)); args->sync_rc = rc; diff --git a/src/utils/ddb/tests/ddb_test_driver.c b/src/utils/ddb/tests/ddb_test_driver.c index 2f9c24f931f..e88e045120f 100644 --- a/src/utils/ddb/tests/ddb_test_driver.c +++ b/src/utils/ddb/tests/ddb_test_driver.c @@ -243,7 +243,9 @@ ddb_test_pool_setup(struct dt_vos_pool_ctx *tctx) return rc; } - rc = vos_pool_create(tctx->dvt_pmem_file, tctx->dvt_pool_uuid, 0, 0, 0, 0, NULL); + rc = vos_pool_create(tctx->dvt_pmem_file, tctx->dvt_pool_uuid, 0 /* scm_sz */, + 0 /* data_sz */, 0 /* meta_sz */, 0 /* flags */, 0 /* version */, + NULL); if (rc) { close(tctx->dvt_fd); return rc; diff --git a/src/vos/evtree.c b/src/vos/evtree.c index d635453f8b2..59f8855c3c1 100644 --- a/src/vos/evtree.c +++ b/src/vos/evtree.c @@ -1443,8 +1443,9 @@ evt_node_alloc(struct evt_context *tcx, unsigned int flags, struct evt_node *nd; umem_off_t nd_off; bool leaf = (flags & EVT_NODE_LEAF); + struct vos_object *obj = tcx->tc_desc_cbs.dc_alloc_arg; - nd_off = umem_zalloc(evt_umm(tcx), evt_node_size(tcx, leaf)); + nd_off = vos_obj_alloc(evt_umm(tcx), obj, evt_node_size(tcx, leaf), true); if (UMOFF_IS_NULL(nd_off)) return -DER_NOSPACE; @@ -3249,8 +3250,9 @@ evt_common_insert(struct evt_context *tcx, struct evt_node *nd, } if (leaf) { - umem_off_t desc_off; - uint32_t csum_buf_size = 0; + umem_off_t desc_off; + uint32_t csum_buf_size = 0; + struct vos_object *obj = tcx->tc_desc_cbs.dc_alloc_arg; if (ci_is_valid(&ent->ei_csum)) csum_buf_size = ci_csums_len(ent->ei_csum); @@ -3263,7 +3265,7 @@ evt_common_insert(struct evt_context *tcx, struct evt_node *nd, D_DEBUG(DB_TRACE, "Allocating an extra %d bytes " "for checksum", csum_buf_size); } - desc_off = umem_zalloc(evt_umm(tcx), desc_size); + desc_off = vos_obj_alloc(evt_umm(tcx), obj, desc_size, true); if (UMOFF_IS_NULL(desc_off)) return -DER_NOSPACE; diff --git a/src/vos/sys_db.c b/src/vos/sys_db.c index d1f6d4bce98..afac65f2879 100644 --- a/src/vos/sys_db.c +++ b/src/vos/sys_db.c @@ -129,8 +129,8 @@ db_open_create(struct sys_db *db, bool try_create) } D_DEBUG(DB_IO, "Opening %s, try_create=%d\n", vdb->db_file, try_create); if (try_create) { - rc = vos_pool_create(vdb->db_file, vdb->db_pool, SYS_DB_SIZE, 0, VOS_POF_SYSDB, - 0 /* version */, &vdb->db_poh); + rc = vos_pool_create(vdb->db_file, vdb->db_pool, SYS_DB_SIZE, 0 /* data_sz */, + 0 /* meta_sz */, VOS_POF_SYSDB, 0 /* version */, &vdb->db_poh); if (rc) { D_CRIT("sys pool create error: "DF_RC"\n", DP_RC(rc)); goto failed; diff --git a/src/vos/tests/pool_scrubbing_tests.c b/src/vos/tests/pool_scrubbing_tests.c index c6fc20b3716..a7111045b73 100644 --- a/src/vos/tests/pool_scrubbing_tests.c 
+++ b/src/vos/tests/pool_scrubbing_tests.c @@ -225,8 +225,8 @@ sts_ctx_pool_init(struct sts_context *ctx) } /* Use pool size as blob size for this moment. */ - rc = vos_pool_create(pmem_file, ctx->tsc_pool_uuid, 0, ctx->tsc_nvme_size, 0, - 0 /* version */, &poh); + rc = vos_pool_create(pmem_file, ctx->tsc_pool_uuid, 0 /* scm_sz */, ctx->tsc_nvme_size, + 0 /* meta_sz */, 0 /* flags */, 0 /* version */, &poh); assert_success(rc); ctx->tsc_poh = poh; diff --git a/src/vos/tests/vos_cmd.c b/src/vos/tests/vos_cmd.c index fa8b9e00d0b..4cba4793c52 100644 --- a/src/vos/tests/vos_cmd.c +++ b/src/vos/tests/vos_cmd.c @@ -273,7 +273,8 @@ create_pool(struct cmd_info *cinfo) close(fd); - rc = vos_pool_create(known_pool->kp_path, known_pool->kp_uuid, 0, 0, 0, 0 /* version */, + rc = vos_pool_create(known_pool->kp_path, known_pool->kp_uuid, 0 /* scm_sz */, + 0 /* data_sz */, 0 /* meta_sz */, 0 /* flags */, 0 /* version */, NULL); if (rc != 0) { D_ERROR("Could not create vos pool at %s, rc=" DF_RC "\n", known_pool->kp_path, diff --git a/src/vos/tests/vts_common.c b/src/vos/tests/vts_common.c index 618f9fedddb..0e8902bf5b4 100644 --- a/src/vos/tests/vts_common.c +++ b/src/vos/tests/vts_common.c @@ -1,5 +1,5 @@ /** - * (C) Copyright 2016-2022 Intel Corporation. + * (C) Copyright 2016-2024 Intel Corporation. * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -89,9 +89,9 @@ vts_pool_fallocate(char **fname) } int -vts_ctx_init(struct vos_test_ctx *tcx, size_t psize) +vts_ctx_init_ex(struct vos_test_ctx *tcx, size_t psize, size_t meta_size) { - int rc; + int rc; memset(tcx, 0, sizeof(*tcx)); oid_cnt = 0; @@ -107,8 +107,8 @@ vts_ctx_init(struct vos_test_ctx *tcx, size_t psize) uuid_generate_time_safe(tcx->tc_co_uuid); /* specify @psize as both NVMe size and SCM size */ - rc = vos_pool_create(tcx->tc_po_name, tcx->tc_po_uuid, psize, psize, 0, 0 /* version */, - &tcx->tc_po_hdl); + rc = vos_pool_create(tcx->tc_po_name, tcx->tc_po_uuid, psize, psize, meta_size, + 0 /* flags */, 0 /* version */, &tcx->tc_po_hdl); if (rc) { print_error("vpool create %s failed with error : %d\n", tcx->tc_po_name, rc); @@ -139,6 +139,12 @@ vts_ctx_init(struct vos_test_ctx *tcx, size_t psize) return rc; } +int +vts_ctx_init(struct vos_test_ctx *tcx, size_t psize) +{ + return vts_ctx_init_ex(tcx, psize, 0); +} + void vts_ctx_fini(struct vos_test_ctx *tcx) { @@ -268,8 +274,8 @@ pool_init(struct credit_context *tsc) /* Use pool size as blob size for this moment. */ if (tsc_create_pool(tsc)) { - rc = vos_pool_create(pmem_file, tsc->tsc_pool_uuid, 0, tsc->tsc_nvme_size, 0, - 0 /* version */, &poh); + rc = vos_pool_create(pmem_file, tsc->tsc_pool_uuid, 0, tsc->tsc_nvme_size, + 0 /* meta_sz */, 0 /* flags */, 0 /* version */, &poh); if (rc) goto out; } else { diff --git a/src/vos/tests/vts_common.h b/src/vos/tests/vts_common.h index 2a08cbd8ff8..11529f040b7 100644 --- a/src/vos/tests/vts_common.h +++ b/src/vos/tests/vts_common.h @@ -1,5 +1,5 @@ /** - * (C) Copyright 2016-2023 Intel Corporation. + * (C) Copyright 2016-2024 Intel Corporation. 
* * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -95,6 +95,8 @@ vts_pool_fallocate(char **fname); int vts_ctx_init(struct vos_test_ctx *tcx, size_t pool_size); +int +vts_ctx_init_ex(struct vos_test_ctx *tcx, size_t pool_size, size_t meta_size); void vts_ctx_fini(struct vos_test_ctx *tcx); diff --git a/src/vos/tests/vts_container.c b/src/vos/tests/vts_container.c index 6966ae866d8..d19b11a0101 100644 --- a/src/vos/tests/vts_container.c +++ b/src/vos/tests/vts_container.c @@ -161,8 +161,8 @@ setup(void **state) uuid_generate_time_safe(test_arg->pool_uuid); vts_pool_fallocate(&test_arg->fname); - ret = vos_pool_create(test_arg->fname, test_arg->pool_uuid, 0, 0, 0, 0 /* version */, - &test_arg->poh); + ret = vos_pool_create(test_arg->fname, test_arg->pool_uuid, 0 /* scm_sz */, 0 /* data_sz */, + 0 /* meta_sz */, 0 /* flags */, 0 /* version */, &test_arg->poh); assert_rc_equal(ret, 0); *state = test_arg; return 0; diff --git a/src/vos/tests/vts_io.c b/src/vos/tests/vts_io.c index 2f084a2d99d..ff02abaf1e2 100644 --- a/src/vos/tests/vts_io.c +++ b/src/vos/tests/vts_io.c @@ -898,7 +898,7 @@ io_update_and_fetch_dkey(struct io_test_args *arg, daos_epoch_t update_epoch, static inline int hold_obj(struct vos_container *cont, daos_unit_oid_t oid, daos_epoch_range_t *epr, daos_epoch_t bound, uint64_t flags, uint32_t intent, struct vos_object **obj_p, - struct vos_ts_set *ts_set) + struct vos_ts_set *ts_set, struct umem_instance *umm) { int rc; @@ -908,7 +908,16 @@ hold_obj(struct vos_container *cont, daos_unit_oid_t oid, daos_epoch_range_t *ep if (flags & VOS_OBJ_CREATE) { assert_ptr_not_equal(*obj_p, NULL); + + if (umm != NULL) { + rc = umem_tx_begin(umm, NULL); + assert_rc_equal(rc, 0); + } + rc = vos_obj_incarnate(*obj_p, epr, bound, flags, intent, ts_set); + + if (umm != NULL) + rc = umem_tx_end(umm, rc); } return rc; @@ -926,7 +935,8 @@ hold_objects(struct vos_object **objs, daos_handle_t *coh, daos_unit_oid_t *oid, hold_flags |= VOS_OBJ_VISIBLE; for (i = start; i < end; i++) { rc = hold_obj(vos_hdl2cont(*coh), *oid, &epr, 0, hold_flags, - no_create ? DAOS_INTENT_DEFAULT : DAOS_INTENT_UPDATE, &objs[i], 0); + no_create ? 
DAOS_INTENT_DEFAULT : DAOS_INTENT_UPDATE, + &objs[i], 0, NULL); if (rc != exp_rc) return 1; } @@ -991,7 +1001,8 @@ io_obj_cache_test(void **state) assert_int_equal(rc, 0); uuid_generate_time_safe(pool_uuid); - rc = vos_pool_create(po_name, pool_uuid, VPOOL_256M, 0, 0, 0 /* version */, &l_poh); + rc = vos_pool_create(po_name, pool_uuid, VPOOL_256M, 0 /* data_sz */, 0 /* meta_sz */, + 0 /* flags */, 0 /* version */, &l_poh); assert_rc_equal(rc, 0); rc = vos_cont_create(l_poh, ctx->tc_co_uuid); @@ -1005,82 +1016,72 @@ io_obj_cache_test(void **state) ummg = vos_cont2umm(vos_hdl2cont(ctx->tc_co_hdl)); umml = vos_cont2umm(vos_hdl2cont(l_coh)); - rc = umem_tx_begin(ummg, NULL); - assert_rc_equal(rc, 0); rc = hold_obj(vos_hdl2cont(ctx->tc_co_hdl), oids[0], &epr, 0, - VOS_OBJ_CREATE | VOS_OBJ_VISIBLE, DAOS_INTENT_UPDATE, &objs[0], 0); + VOS_OBJ_CREATE | VOS_OBJ_VISIBLE, DAOS_INTENT_UPDATE, &objs[0], 0, ummg); assert_rc_equal(rc, 0); /** Hold object for discard */ rc = hold_obj(vos_hdl2cont(ctx->tc_co_hdl), oids[0], &epr, 0, VOS_OBJ_DISCARD, - DAOS_INTENT_DISCARD, &obj1, 0); + DAOS_INTENT_DISCARD, &obj1, 0, ummg); assert_rc_equal(rc, 0); /** Second discard should fail */ rc = hold_obj(vos_hdl2cont(ctx->tc_co_hdl), oids[0], &epr, 0, VOS_OBJ_DISCARD, - DAOS_INTENT_DISCARD, &obj2, 0); + DAOS_INTENT_DISCARD, &obj2, 0, ummg); assert_rc_equal(rc, -DER_BUSY); /** Should prevent simultaneous aggregation */ rc = hold_obj(vos_hdl2cont(ctx->tc_co_hdl), oids[0], &epr, 0, VOS_OBJ_AGGREGATE, - DAOS_INTENT_PURGE, &obj2, 0); + DAOS_INTENT_PURGE, &obj2, 0, ummg); assert_rc_equal(rc, -DER_BUSY); /** Should prevent simultaneous hold for create as well */ rc = hold_obj(vos_hdl2cont(ctx->tc_co_hdl), oids[0], &epr, 0, - VOS_OBJ_CREATE | VOS_OBJ_VISIBLE, DAOS_INTENT_UPDATE, &obj2, 0); + VOS_OBJ_CREATE | VOS_OBJ_VISIBLE, DAOS_INTENT_UPDATE, &obj2, + 0, ummg); assert_rc_equal(rc, -DER_UPDATE_AGAIN); /** Need to be able to hold for read though or iteration won't work */ rc = hold_obj(vos_hdl2cont(ctx->tc_co_hdl), oids[0], &epr, 0, VOS_OBJ_VISIBLE, - DAOS_INTENT_DEFAULT, &obj2, 0); + DAOS_INTENT_DEFAULT, &obj2, 0, ummg); vos_obj_release(obj2, 0, false); vos_obj_release(obj1, VOS_OBJ_DISCARD, false); /** Hold object for aggregation */ rc = hold_obj(vos_hdl2cont(ctx->tc_co_hdl), oids[0], &epr, 0, VOS_OBJ_AGGREGATE, - DAOS_INTENT_PURGE, &obj1, 0); + DAOS_INTENT_PURGE, &obj1, 0, ummg); assert_rc_equal(rc, 0); /** Discard should fail */ rc = hold_obj(vos_hdl2cont(ctx->tc_co_hdl), oids[0], &epr, 0, VOS_OBJ_DISCARD, - DAOS_INTENT_DISCARD, &obj2, 0); + DAOS_INTENT_DISCARD, &obj2, 0, ummg); assert_rc_equal(rc, -DER_BUSY); /** Second aggregation should fail */ rc = hold_obj(vos_hdl2cont(ctx->tc_co_hdl), oids[0], &epr, 0, VOS_OBJ_AGGREGATE, - DAOS_INTENT_PURGE, &obj2, 0); + DAOS_INTENT_PURGE, &obj2, 0, ummg); assert_rc_equal(rc, -DER_BUSY); /** Simultaneous create should work */ rc = hold_obj(vos_hdl2cont(ctx->tc_co_hdl), oids[0], &epr, 0, - VOS_OBJ_CREATE | VOS_OBJ_VISIBLE, DAOS_INTENT_UPDATE, &obj2, 0); + VOS_OBJ_CREATE | VOS_OBJ_VISIBLE, DAOS_INTENT_UPDATE, &obj2, 0, ummg); assert_rc_equal(rc, 0); vos_obj_release(obj2, 0, false); /** Need to be able to hold for read though or iteration won't work */ rc = hold_obj(vos_hdl2cont(ctx->tc_co_hdl), oids[0], &epr, 0, VOS_OBJ_VISIBLE, - DAOS_INTENT_DEFAULT, &obj2, 0); + DAOS_INTENT_DEFAULT, &obj2, 0, ummg); vos_obj_release(obj2, 0, false); vos_obj_release(obj1, VOS_OBJ_AGGREGATE, false); /** Now that other one is done, this should work */ rc = hold_obj(vos_hdl2cont(ctx->tc_co_hdl), oids[0], 
&epr, 0, VOS_OBJ_DISCARD, - DAOS_INTENT_DISCARD, &obj2, 0); + DAOS_INTENT_DISCARD, &obj2, 0, ummg); assert_rc_equal(rc, 0); vos_obj_release(obj2, VOS_OBJ_DISCARD, false); - rc = umem_tx_end(ummg, 0); - assert_rc_equal(rc, 0); - vos_obj_release(objs[0], 0, false); - rc = umem_tx_begin(umml, NULL); - assert_rc_equal(rc, 0); - rc = hold_obj(vos_hdl2cont(l_coh), oids[1], &epr, 0, - VOS_OBJ_CREATE | VOS_OBJ_VISIBLE, DAOS_INTENT_UPDATE, &objs[0], 0); + VOS_OBJ_CREATE | VOS_OBJ_VISIBLE, DAOS_INTENT_UPDATE, &objs[0], 0, umml); assert_rc_equal(rc, 0); vos_obj_release(objs[0], 0, false); - rc = umem_tx_end(umml, 0); - assert_rc_equal(rc, 0); - rc = hold_objects(objs, &ctx->tc_co_hdl, &oids[0], 0, 10, true, 0); assert_int_equal(rc, 0); @@ -1090,7 +1091,7 @@ io_obj_cache_test(void **state) rc = hold_objects(objs, &l_coh, &oids[1], 10, 15, true, 0); assert_int_equal(rc, 0); rc = hold_obj(vos_hdl2cont(l_coh), oids[1], &epr, 0, VOS_OBJ_VISIBLE, - DAOS_INTENT_DEFAULT, &objs[16], 0); + DAOS_INTENT_DEFAULT, &objs[16], 0, NULL); assert_rc_equal(rc, 0); vos_obj_release(objs[16], 0, false); @@ -1904,7 +1905,8 @@ pool_cont_same_uuid(void **state) uuid_generate(pool_uuid); uuid_copy(co_uuid, pool_uuid); - ret = vos_pool_create(arg->fname, pool_uuid, VPOOL_256M, 0, 0, 0 /* version */, &poh); + ret = vos_pool_create(arg->fname, pool_uuid, VPOOL_256M, 0 /* data_sz */, 0 /* meta_sz */, + 0 /* flags */, 0 /* version */, &poh); assert_rc_equal(ret, 0); ret = vos_cont_create(poh, co_uuid); diff --git a/src/vos/tests/vts_pool.c b/src/vos/tests/vts_pool.c index acfd4e46a8a..fc1e7aefef7 100644 --- a/src/vos/tests/vts_pool.c +++ b/src/vos/tests/vts_pool.c @@ -89,7 +89,8 @@ pool_ref_count_test(void **state) int num = 10; uuid_generate(uuid); - ret = vos_pool_create(arg->fname[0], uuid, VPOOL_256M, 0, 0, 0 /* version */, NULL); + ret = vos_pool_create(arg->fname[0], uuid, VPOOL_256M, 0 /* data_sz */, 0 /* meta_sz */, + 0 /* flags */, 0 /* version */, NULL); for (i = 0; i < num; i++) { ret = vos_pool_open(arg->fname[0], uuid, 0, &arg->poh[i]); assert_rc_equal(ret, 0); @@ -119,7 +120,8 @@ pool_interop(void **state) uuid_generate(uuid); daos_fail_loc_set(FLC_POOL_DF_VER | DAOS_FAIL_ONCE); - ret = vos_pool_create(arg->fname[0], uuid, VPOOL_256M, 0, 0, 0 /* version */, NULL); + ret = vos_pool_create(arg->fname[0], uuid, VPOOL_256M, 0 /* data_sz */, 0 /* meta_sz */, + 0 /* flags */, 0 /* version */, NULL); assert_rc_equal(ret, 0); ret = vos_pool_open(arg->fname[0], uuid, 0, &poh); @@ -149,15 +151,19 @@ pool_ops_run(void **state) if (arg->fcreate[j]) { ret = vts_pool_fallocate(&arg->fname[j]); assert_int_equal(ret, 0); - ret = vos_pool_create_ex(arg->fname[j], arg->uuid[j], 0, 0, - VPOOL_TEST_WAL_SZ, 0, - 0 /* version */, poh); + ret = vos_pool_create_ex(arg->fname[j], arg->uuid[j], + 0 /* scm_sz */, 0 /* data_sz */, + VPOOL_TEST_WAL_SZ, 0 /* meta_sz */, + 0 /* flags */, 0 /* version */, + poh); } else { ret = vts_alloc_gen_fname(&arg->fname[j]); assert_int_equal(ret, 0); ret = vos_pool_create_ex(arg->fname[j], arg->uuid[j], - VPOOL_256M, 0, VPOOL_TEST_WAL_SZ, - 0, 0 /* version */, poh); + VPOOL_256M, 0 /* data_sz */, + VPOOL_TEST_WAL_SZ, + 0 /* meta_sz */, 0 /* flags */, + 0 /* version */, poh); } break; case OPEN: @@ -421,7 +427,8 @@ pool_open_excl_test(void **state) uuid_generate(uuid); print_message("open EXCL shall fail upon existing create opener\n"); - ret = vos_pool_create(arg->fname[0], uuid, VPOOL_256M, 0, 0, 0 /* version */, &arg->poh[0]); + ret = vos_pool_create(arg->fname[0], uuid, VPOOL_256M, 0 /* data_sz */, 0 /* 
meta_sz */, + 0 /* flags */, 0 /* version */, &arg->poh[0]); assert_rc_equal(ret, 0); ret = vos_pool_open(arg->fname[0], uuid, VOS_POF_EXCL, &arg->poh[1]); assert_rc_equal(ret, -DER_BUSY); @@ -431,7 +438,8 @@ pool_open_excl_test(void **state) assert_rc_equal(ret, 0); print_message("open EXCL shall fail upon existing opener\n"); - ret = vos_pool_create(arg->fname[0], uuid, VPOOL_256M, 0, 0, 0 /* version */, NULL); + ret = vos_pool_create(arg->fname[0], uuid, VPOOL_256M, 0 /* data_sz */, 0 /* meta_sz */, + 0 /* flags */, 0 /* version */, NULL); assert_rc_equal(ret, 0); ret = vos_pool_open(arg->fname[0], uuid, 0, &arg->poh[0]); assert_rc_equal(ret, 0); @@ -443,7 +451,8 @@ pool_open_excl_test(void **state) assert_rc_equal(ret, 0); print_message("open EXCL shall fail upon existing EXCL opener\n"); - ret = vos_pool_create(arg->fname[0], uuid, VPOOL_256M, 0, 0, 0 /* version */, NULL); + ret = vos_pool_create(arg->fname[0], uuid, VPOOL_256M, 0 /* data_sz */, 0 /* meta_sz */, + 0 /* flags */, 0 /* version */, NULL); assert_rc_equal(ret, 0); ret = vos_pool_open(arg->fname[0], uuid, VOS_POF_EXCL, &arg->poh[0]); assert_rc_equal(ret, 0); @@ -456,8 +465,8 @@ pool_open_excl_test(void **state) print_message("open EXCL shall fail upon existing EXCL create " "opener\n"); - ret = vos_pool_create(arg->fname[0], uuid, VPOOL_256M, 0, VOS_POF_EXCL, 0 /* version */, - &arg->poh[0]); + ret = vos_pool_create(arg->fname[0], uuid, VPOOL_256M, 0 /* data_sz */, 0 /* meta_sz */, + VOS_POF_EXCL, 0 /* version */, &arg->poh[0]); assert_rc_equal(ret, 0); ret = vos_pool_open(arg->fname[0], uuid, VOS_POF_EXCL, &arg->poh[1]); assert_rc_equal(ret, -DER_BUSY); @@ -467,7 +476,8 @@ pool_open_excl_test(void **state) assert_rc_equal(ret, 0); print_message("open shall fail upon existing EXCL opener\n"); - ret = vos_pool_create(arg->fname[0], uuid, VPOOL_256M, 0, 0, 0 /* version */, NULL); + ret = vos_pool_create(arg->fname[0], uuid, VPOOL_256M, 0 /* data_sz */, 0 /* meta_sz */, + 0 /* flags */, 0 /* version */, NULL); assert_rc_equal(ret, 0); ret = vos_pool_open(arg->fname[0], uuid, VOS_POF_EXCL, &arg->poh[0]); assert_rc_equal(ret, 0); @@ -479,8 +489,8 @@ pool_open_excl_test(void **state) assert_rc_equal(ret, 0); print_message("open shall fail upon existing EXCL create opener\n"); - ret = vos_pool_create(arg->fname[0], uuid, VPOOL_256M, 0, VOS_POF_EXCL, 0 /* version */, - &arg->poh[0]); + ret = vos_pool_create(arg->fname[0], uuid, VPOOL_256M, 0 /* data_sz */, 0 /* meta_sz */, + VOS_POF_EXCL, 0 /* version */, &arg->poh[0]); assert_rc_equal(ret, 0); ret = vos_pool_open(arg->fname[0], uuid, 0, &arg->poh[1]); assert_rc_equal(ret, -DER_BUSY); @@ -503,11 +513,13 @@ pool_interop_create_old(void **state) uuid_generate(uuid); /* DF version too old. */ - ret = vos_pool_create(arg->fname[0], uuid, VPOOL_256M, 0, 0, 1 /* version */, NULL); + ret = vos_pool_create(arg->fname[0], uuid, VPOOL_256M, 0 /* data_sz */, 0 /* meta_sz */, + 0 /* flags */, 1 /* version */, NULL); assert_rc_equal(ret, -DER_INVAL); /* DF version old but supported. 
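For quick reference, every call site touched above now uses the widened pool-creation prototype with explicit data_sz, meta_sz, flags and version arguments. Below is a minimal sketch of a helper built on that prototype; the helper name is hypothetical, the argument roles are taken from the inline comments in these hunks, and VPOOL_256M is assumed to be the size macro already used by these tests.

#include <uuid/uuid.h>
#include <daos_srv/vos.h>	/* assumed header for vos_pool_create() */

static int
create_small_test_pool(const char *path, daos_handle_t *poh)
{
	uuid_t	uuid;

	uuid_generate(uuid);
	/* 256M SCM size, default data/meta sizes, no flags, current durable format */
	return vos_pool_create(path, uuid, VPOOL_256M, 0 /* data_sz */, 0 /* meta_sz */,
			       0 /* flags */, 0 /* version */, poh);
}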
*/ - ret = vos_pool_create(arg->fname[0], uuid, VPOOL_256M, 0, 0, VOS_POOL_DF_2_4, NULL); + ret = vos_pool_create(arg->fname[0], uuid, VPOOL_256M, 0 /* data_sz */, 0 /* meta_sz */, + 0 /* flags */, VOS_POOL_DF_2_4, NULL); assert_rc_equal(ret, 0); ret = vos_pool_open(arg->fname[0], uuid, 0, &poh); diff --git a/src/vos/tests/vts_wal.c b/src/vos/tests/vts_wal.c index 029be44b2bc..7506c7ffa79 100644 --- a/src/vos/tests/vts_wal.c +++ b/src/vos/tests/vts_wal.c @@ -295,7 +295,8 @@ wal_tst_pool_cont(void **state) assert_int_equal(rc, 0); /* Create pool: Create meta & WAL blobs, write meta & WAL header */ - rc = vos_pool_create(pool_name, pool_id, 0, VPOOL_1G, 0, 0 /* version */, NULL); + rc = vos_pool_create(pool_name, pool_id, 0 /* scm_sz */, VPOOL_1G, 0 /* meta_sz */, + 0 /* flags */, 0 /* version */, NULL); assert_int_equal(rc, 0); /* Create cont: write WAL */ @@ -626,6 +627,50 @@ setup_wal_io(void **state) return 0; } +static struct io_test_args test_args; + +#define MDTEST_META_BLOB_SIZE (256 * 1024 * 1024UL) +#define MDTEST_VOS_SIZE (160 * 1024 * 1024UL) +#define MDTEST_MB_SIZE (16 * 1024 * 1024UL) +#define MDTEST_MB_CNT (MDTEST_META_BLOB_SIZE / MDTEST_MB_SIZE) +#define MDTEST_MB_VOS_CNT (MDTEST_VOS_SIZE / MDTEST_MB_SIZE) +#define MDTEST_MAX_NEMB_CNT (MDTEST_MB_VOS_CNT * 8 / 10) +#define MDTEST_MAX_EMB_CNT (MDTEST_MB_CNT - MDTEST_MAX_NEMB_CNT) + +static int +setup_mb_io(void **state) +{ + int rc; + + memset(&test_args, 0, sizeof(test_args)); + rc = vts_ctx_init_ex(&test_args.ctx, MDTEST_VOS_SIZE, MDTEST_META_BLOB_SIZE); + *state = (void *)&test_args; + return rc; +} + +static int +teardown_mb_io(void **state) +{ + struct io_test_args *args = (struct io_test_args *)*state; + + vts_ctx_fini(&args->ctx); + return 0; +} + +static int +setup_mb_io_nembpct(void **state) +{ + d_setenv("DAOS_MD_ON_SSD_NEMB_PCT", "40", true); + return setup_mb_io(state); +} + +static int +teardown_mb_io_nembpct(void **state) +{ + d_unsetenv("DAOS_MD_ON_SSD_NEMB_PCT"); + return teardown_mb_io(state); +} + /* refill:true - perform the pool re-load and refill after every key update/punch */ static int wal_update_and_fetch_dkey(struct io_test_args *arg, daos_epoch_t update_epoch, @@ -1259,6 +1304,1173 @@ wal14_setup(void **state) return 0; } +static void +wal_mb_tests(void **state) +{ + struct io_test_args *arg = *state; + struct vos_container *cont; + struct umem_instance *umm; + uint32_t mb_id; + uint64_t *ptr; + umem_off_t umoff; + + cont = vos_hdl2cont(arg->ctx.tc_co_hdl); + umm = vos_cont2umm(cont); + + mb_id = umem_allot_mb_evictable(umm, 0); + assert_true(mb_id != 0); + umem_tx_begin(umm, NULL); + umoff = umem_alloc_from_bucket(umm, 1024, mb_id); + assert_false(UMOFF_IS_NULL(umoff)); + assert_true(umem_get_mb_from_offset(umm, umoff) == mb_id); + ptr = umem_off2ptr(umm, umoff); + *ptr = 0xdeadcab; + umem_tx_commit(umm); + + wal_pool_refill(arg); + + cont = vos_hdl2cont(arg->ctx.tc_co_hdl); + umm = vos_cont2umm(cont); + + ptr = umem_off2ptr(umm, umoff); + assert_true(*ptr == 0xdeadcab); + + umem_atomic_free(umm, umoff); +} + +struct bucket_alloc_info { + umem_off_t start_umoff; + uint32_t num_allocs; + uint32_t mb_id; + uint32_t alloc_size; +}; + +#define CHECKPOINT_FREQ 10000 +static void +checkpoint_fn(void *arg) +{ + struct umem_store *store; + uint64_t committed_id; + daos_handle_t phdl = *(daos_handle_t *)arg; + int rc; + + vos_pool_checkpoint_init(phdl, update_cb, wait_cb, &committed_id, &store); + rc = vos_pool_checkpoint(phdl); + assert_rc_equal(rc, 0); + vos_pool_checkpoint_fini(phdl); +} + +static void 
+alloc_bucket_to_full(struct umem_instance *umm, struct bucket_alloc_info *ainfo, + void (*chkpt_fn)(void *arg), void *arg) +{ + umem_off_t umoff, prev_umoff; + size_t alloc_size = 512; + umem_off_t *ptr; + struct umem_cache_range rg = {0}; + struct umem_pin_handle *p_hdl; + uint32_t id = ainfo->mb_id; + + if (ainfo->alloc_size) + alloc_size = ainfo->alloc_size; + else + ainfo->alloc_size = alloc_size; + + rg.cr_off = umem_get_mb_base_offset(umm, id); + rg.cr_size = 1; + assert_true(umem_cache_pin(&umm->umm_pool->up_store, &rg, 1, 0, &p_hdl) == 0); + + if (UMOFF_IS_NULL(ainfo->start_umoff)) { + umem_tx_begin(umm, NULL); + ainfo->start_umoff = umem_alloc_from_bucket(umm, alloc_size, id); + umem_tx_commit(umm); + assert_false(UMOFF_IS_NULL(ainfo->start_umoff)); + ainfo->num_allocs++; + assert_true(umem_get_mb_from_offset(umm, ainfo->start_umoff) == id); + prev_umoff = ainfo->start_umoff; + ptr = (umem_off_t *)umem_off2ptr(umm, prev_umoff); + *ptr = UMOFF_NULL; + } else + prev_umoff = ainfo->start_umoff; + + while (true) { + ptr = (umem_off_t *)umem_off2ptr(umm, prev_umoff); + umoff = *ptr; + if (UMOFF_IS_NULL(umoff)) + break; + prev_umoff = umoff; + } + + while (1) { + umem_tx_begin(umm, NULL); + umoff = umem_alloc_from_bucket(umm, alloc_size, id); + + if (UMOFF_IS_NULL(umoff) || (umem_get_mb_from_offset(umm, umoff) != id)) { + umem_tx_abort(umm, 1); + break; + } + umem_tx_add(umm, prev_umoff, sizeof(umem_off_t)); + ptr = (umem_off_t *)umem_off2ptr(umm, prev_umoff); + *ptr = umoff; + ptr = (umem_off_t *)umem_off2ptr(umm, umoff); + *ptr = UMOFF_NULL; + umem_tx_commit(umm); + prev_umoff = umoff; + if (((ainfo->num_allocs++ % CHECKPOINT_FREQ) == 0) && (chkpt_fn != NULL)) + chkpt_fn(arg); + } + if (chkpt_fn != NULL) + chkpt_fn(arg); + umem_cache_unpin(&umm->umm_pool->up_store, p_hdl); + print_message("Bulk Alloc: Bucket %d, start off %lu num_allocation %d\n", ainfo->mb_id, + ainfo->start_umoff, ainfo->num_allocs); +} + +static void +free_bucket_by_pct(struct umem_instance *umm, struct bucket_alloc_info *ainfo, int pct, + void (*chkpt_fn)(void *arg), void *arg) +{ + int num_free = (ainfo->num_allocs * pct) / 100; + umem_off_t umoff, *ptr, next_umoff; + struct umem_pin_handle *p_hdl; + struct umem_cache_range rg = {0}; + int i, rc; + + assert_true((pct >= 0) && (pct <= 100)); + + if (UMOFF_IS_NULL(ainfo->start_umoff)) + return; + print_message("Bulk Free BEFORE: Bucket %d, start off %lu num_allocation %d\n", + ainfo->mb_id, ainfo->start_umoff, ainfo->num_allocs); + + rg.cr_off = umem_get_mb_base_offset(umm, ainfo->mb_id); + rg.cr_size = 1; + rc = umem_cache_pin(&umm->umm_pool->up_store, &rg, 1, 0, &p_hdl); + assert_true(rc == 0); + + umoff = ainfo->start_umoff; + for (i = 0; i < num_free; i++) { + assert_true(umem_get_mb_from_offset(umm, umoff) == ainfo->mb_id); + ptr = (umem_off_t *)umem_off2ptr(umm, umoff); + next_umoff = *ptr; + umem_atomic_free(umm, umoff); + umoff = next_umoff; + if (((ainfo->num_allocs-- % CHECKPOINT_FREQ) == 0) && (chkpt_fn != NULL)) + chkpt_fn(arg); + if (UMOFF_IS_NULL(umoff)) + break; + } + ainfo->start_umoff = umoff; + if (chkpt_fn != NULL) + chkpt_fn(arg); + umem_cache_unpin(&umm->umm_pool->up_store, p_hdl); + print_message("Bulk Free AFTER: Bucket %d, start off %lu num_allocation %d\n", ainfo->mb_id, + ainfo->start_umoff, ainfo->num_allocs); +} + +static void +wal_mb_utilization_tests(void **state) +{ + struct io_test_args *arg = *state; + struct vos_container *cont; + struct umem_instance *umm; + struct bucket_alloc_info ainfo[MDTEST_MB_CNT + 1]; + uint32_t id; + int 
i, j; + int mb_reuse = 0; + + cont = vos_hdl2cont(arg->ctx.tc_co_hdl); + umm = vos_cont2umm(cont); + + assert_true(MDTEST_MAX_EMB_CNT >= 8); + for (i = 0; i < MDTEST_MAX_EMB_CNT - 1; i++) { + /* Create an MB and fill it with allocs */ + ainfo[i].mb_id = umem_allot_mb_evictable(umm, 0); + ainfo[i].num_allocs = 0; + ainfo[i].start_umoff = UMOFF_NULL; + ainfo[i].alloc_size = 0; + assert_true(ainfo[i].mb_id != 0); + alloc_bucket_to_full(umm, &ainfo[i], checkpoint_fn, &arg->ctx.tc_po_hdl); + } + + /* Free 5% of space for MB 2 */ + free_bucket_by_pct(umm, &ainfo[0], 5, checkpoint_fn, &arg->ctx.tc_po_hdl); /* 90+ */ + /* Free 30% of space for MB 3 */ + free_bucket_by_pct(umm, &ainfo[1], 30, checkpoint_fn, &arg->ctx.tc_po_hdl); /* 30-75 */ + /* Free 80% of space for MB 4 */ + free_bucket_by_pct(umm, &ainfo[2], 80, checkpoint_fn, &arg->ctx.tc_po_hdl); /* 0-30 */ + /* Free 15% of space for MB 5 */ + free_bucket_by_pct(umm, &ainfo[3], 20, checkpoint_fn, &arg->ctx.tc_po_hdl); /* 75-90 */ + /* Free 10% of space for MB 6 */ + free_bucket_by_pct(umm, &ainfo[4], 18, checkpoint_fn, &arg->ctx.tc_po_hdl); /* 75-90 */ + /* Free 50% of space for MB 7 */ + free_bucket_by_pct(umm, &ainfo[5], 50, checkpoint_fn, &arg->ctx.tc_po_hdl); /* 30-75 */ + /* Free 90% of space for MB 8 */ + free_bucket_by_pct(umm, &ainfo[6], 90, NULL, NULL); /* 0-30 */ + + wal_pool_refill(arg); + cont = vos_hdl2cont(arg->ctx.tc_co_hdl); + umm = vos_cont2umm(cont); + + /* Allocator should return mb with utilization 30%-75% */ + id = umem_allot_mb_evictable(umm, 0); + print_message("obtained id %d, expected is %d\n", id, ainfo[1].mb_id); + assert_true(id == ainfo[1].mb_id); + alloc_bucket_to_full(umm, &ainfo[1], checkpoint_fn, &arg->ctx.tc_po_hdl); + id = umem_allot_mb_evictable(umm, 0); + print_message("obtained id %d, expected is %d\n", id, ainfo[5].mb_id); + assert_true(id == ainfo[5].mb_id); + alloc_bucket_to_full(umm, &ainfo[5], checkpoint_fn, &arg->ctx.tc_po_hdl); + + /* Next preference should be 0%-30% */ + id = umem_allot_mb_evictable(umm, 0); + print_message("obtained id %d, expected is %d\n", id, ainfo[2].mb_id); + assert_true(id == ainfo[2].mb_id); + alloc_bucket_to_full(umm, &ainfo[2], checkpoint_fn, &arg->ctx.tc_po_hdl); + id = umem_allot_mb_evictable(umm, 0); + print_message("obtained id %d, expected is %d\n", id, ainfo[6].mb_id); + assert_true(id == ainfo[6].mb_id); + alloc_bucket_to_full(umm, &ainfo[6], checkpoint_fn, &arg->ctx.tc_po_hdl); + + /* Next is to create a new memory bucket. */ + id = umem_allot_mb_evictable(umm, 0); + for (i = 0; i < MDTEST_MAX_EMB_CNT - 1; i++) + assert_true(id != ainfo[i].mb_id); + print_message("obtained id %d\n", id); + i = MDTEST_MAX_EMB_CNT - 1; + + ainfo[i].mb_id = id; + ainfo[i].num_allocs = 0; + ainfo[i].start_umoff = UMOFF_NULL; + ainfo[i].alloc_size = 0; + assert_true(ainfo[i].mb_id != 0); + alloc_bucket_to_full(umm, &ainfo[i], checkpoint_fn, &arg->ctx.tc_po_hdl); + + /* Next preference should be 75%-90% */ + id = umem_allot_mb_evictable(umm, 0); + print_message("obtained id %d, expected is %d\n", id, ainfo[3].mb_id); + assert_true(id == ainfo[3].mb_id); + alloc_bucket_to_full(umm, &ainfo[3], checkpoint_fn, &arg->ctx.tc_po_hdl); + id = umem_allot_mb_evictable(umm, 0); + print_message("obtained id %d, expected is %d\n", id, ainfo[4].mb_id); + assert_true(id == ainfo[4].mb_id); + alloc_bucket_to_full(umm, &ainfo[4], checkpoint_fn, &arg->ctx.tc_po_hdl); + + /* If there are no more new evictable mb available it should return + * one with 90% or more utilization. 
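alloc_bucket_to_full() above bundles the complete evictable-bucket access pattern: pin the bucket's cache range, allocate from that specific bucket inside a transaction, and verify the returned offset really belongs to it. The stripped-down sketch below restates that pattern using only calls that appear in these hunks; the helper name is hypothetical and error handling is minimal.

#include <assert.h>
#include <daos/mem.h>	/* assumed header for the umem/umem_cache APIs */

static umem_off_t
alloc_one_from_evictable(struct umem_instance *umm)
{
	struct umem_cache_range	 rg = {0};
	struct umem_pin_handle	*pin;
	umem_off_t		 off = UMOFF_NULL;
	uint32_t		 id;

	id = umem_allot_mb_evictable(umm, 0);		/* pick or create a bucket */
	rg.cr_off  = umem_get_mb_base_offset(umm, id);
	rg.cr_size = 1;
	if (umem_cache_pin(&umm->umm_pool->up_store, &rg, 1, 0, &pin) != 0)
		return UMOFF_NULL;

	if (umem_tx_begin(umm, NULL) == 0) {
		off = umem_alloc_from_bucket(umm, 1024, id);
		umem_tx_commit(umm);
	}
	/* the allocation must land in the requested bucket */
	assert(UMOFF_IS_NULL(off) || umem_get_mb_from_offset(umm, off) == id);
	umem_cache_unpin(&umm->umm_pool->up_store, pin);
	return off;
}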
+ */ + id = umem_allot_mb_evictable(umm, 0); + for (j = 0; j < i; j++) { + if (id == ainfo[j].mb_id) { + print_message("reusing evictable mb %d\n", id); + mb_reuse = 1; + break; + } + } + assert_true(mb_reuse); +} + +#define ZONE_MAX_SIZE (16 * 1024 * 1024) + +static void +wal_mb_emb_evicts_emb(void **state) +{ + struct io_test_args *arg = *state; + struct vos_container *cont; + struct umem_instance *umm; + int i, j, po; + struct bucket_alloc_info ainfo[MDTEST_MB_CNT + 1]; + uint32_t id; + + cont = vos_hdl2cont(arg->ctx.tc_co_hdl); + umm = vos_cont2umm(cont); + + /* Fill non-evictable buckets. */ + ainfo[0].mb_id = 0; + ainfo[0].num_allocs = 0; + ainfo[0].start_umoff = UMOFF_NULL; + ainfo[0].alloc_size = 0; + alloc_bucket_to_full(umm, &ainfo[0], checkpoint_fn, &arg->ctx.tc_po_hdl); + + /* + * validate whether non-evictable mbs have actually consumed MDTEST_MAX_NEMB_CNT + */ + print_message("allocations in non-evictable mbs = %u\n", ainfo[0].num_allocs); + print_message("space used in non-evictable mbs = %u\n", + ainfo[0].num_allocs * ainfo[0].alloc_size); + po = (ainfo[0].num_allocs * ainfo[0].alloc_size + ZONE_MAX_SIZE - 1) / ZONE_MAX_SIZE; + assert_true(po == MDTEST_MAX_NEMB_CNT); + + /* Now free few allocation to support spill */ + free_bucket_by_pct(umm, &ainfo[0], 20, checkpoint_fn, &arg->ctx.tc_po_hdl); + + /* Create and fill MDTEST_MB_CNT evictable memory buckets. */ + for (i = 1; i < MDTEST_MB_CNT + 1; i++) { + /* Create an MB and fill it with allocs */ + id = umem_allot_mb_evictable(umm, 0); + for (j = 0; j < i; j++) { + if (id == ainfo[j].mb_id) { + print_message("evictable mb reused at iteration %d\n", id); + goto out; + } + } + ainfo[i].mb_id = id; + ainfo[i].num_allocs = 0; + ainfo[i].start_umoff = UMOFF_NULL; + ainfo[i].alloc_size = 0; + assert_true(ainfo[i].mb_id != 0); + alloc_bucket_to_full(umm, &ainfo[i], checkpoint_fn, &arg->ctx.tc_po_hdl); + } +out: + assert_true(i == MDTEST_MAX_EMB_CNT + 1); + + /* Validate and free all allocations in evictable MBs */ + for (j = 0; j < i; j++) + free_bucket_by_pct(umm, &ainfo[j], 100, checkpoint_fn, &arg->ctx.tc_po_hdl); +} + +static void +wal_mb_nemb_evicts_emb(void **state) +{ + struct io_test_args *arg = *state; + struct vos_container *cont; + struct umem_instance *umm; + int i, j, po; + struct bucket_alloc_info ainfo[MDTEST_MB_CNT + 1]; + uint32_t id; + + cont = vos_hdl2cont(arg->ctx.tc_co_hdl); + umm = vos_cont2umm(cont); + + /* Create and fill evictable memory buckets. */ + for (i = 1; i < MDTEST_MB_CNT + 1; i++) { + /* Create an MB and fill it with allocs */ + id = umem_allot_mb_evictable(umm, 0); + for (j = 1; j < i; j++) { + if (id == ainfo[j].mb_id) { + print_message("evictable mb reused at iteration %d\n", id); + goto out; + } + } + ainfo[i].mb_id = id; + ainfo[i].num_allocs = 0; + ainfo[i].start_umoff = UMOFF_NULL; + ainfo[i].alloc_size = 0; + assert_true(ainfo[i].mb_id != 0); + alloc_bucket_to_full(umm, &ainfo[i], checkpoint_fn, &arg->ctx.tc_po_hdl); + } +out: + assert_true(i == MDTEST_MAX_EMB_CNT + 1); + + /* Fill non-evictable buckets. */ + ainfo[0].mb_id = 0; + ainfo[0].num_allocs = 0; + ainfo[0].start_umoff = UMOFF_NULL; + ainfo[0].alloc_size = 0; + alloc_bucket_to_full(umm, &ainfo[0], checkpoint_fn, &arg->ctx.tc_po_hdl); + + /* + * validate whether non-evictable mbs have actually consumed MDTEST_MAX_NEMB_CNT buckets. 
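The bucket counts asserted by these tests follow directly from the MDTEST_* sizes defined earlier in this file; a quick sanity check of that arithmetic, assuming those values are unchanged:

/* 160M VOS cache / 16M per bucket = 10 resident buckets, 80% of them non-evictable;
 * 256M meta blob / 16M = 16 buckets in total, leaving 8 evictable ones. */
_Static_assert(MDTEST_MB_VOS_CNT == 10, "resident buckets");
_Static_assert(MDTEST_MAX_NEMB_CNT == 8, "non-evictable buckets");
_Static_assert(MDTEST_MB_CNT == 16, "total buckets");
_Static_assert(MDTEST_MAX_EMB_CNT == 8, "evictable buckets");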
+ */ + print_message("allocations in non-evictable mbs = %u\n", ainfo[0].num_allocs); + print_message("space used in non-evictable mbs = %u\n", + ainfo[0].num_allocs * ainfo[0].alloc_size); + po = (ainfo[0].num_allocs * ainfo[0].alloc_size + ZONE_MAX_SIZE - 1) / ZONE_MAX_SIZE; + assert_true(po == MDTEST_MAX_NEMB_CNT); + + /* Validate and free all allocations in evictable MBs */ + for (j = 0; j < i; j++) + free_bucket_by_pct(umm, &ainfo[j], 100, checkpoint_fn, &arg->ctx.tc_po_hdl); +} + +static void +wal_mb_nemb_pct(void **state) +{ + struct io_test_args *arg = *state; + struct vos_container *cont; + struct umem_instance *umm; + int i, j, rc, found = 0; + struct bucket_alloc_info ainfo[MDTEST_MB_CNT + 1]; + daos_size_t maxsz, cur_allocated1, cur_allocated; + + cont = vos_hdl2cont(arg->ctx.tc_co_hdl); + umm = vos_cont2umm(cont); + + /* + * The setup for this test would have set environment variable + * DAOS_MD_ON_SSD_NEMB_PCT to 40 before creating the pool. + */ + ainfo[0].mb_id = 0; + ainfo[0].num_allocs = 0; + ainfo[0].start_umoff = UMOFF_NULL; + ainfo[0].alloc_size = 2048; + alloc_bucket_to_full(umm, &ainfo[0], checkpoint_fn, &arg->ctx.tc_po_hdl); + rc = umempobj_get_mbusage(umm->umm_pool, 0, &cur_allocated, &maxsz); + assert_true(rc == 0); + print_message("nemb space utilization is %lu max is %lu\n", cur_allocated, maxsz); + assert_true(maxsz == MDTEST_VOS_SIZE * 40 / 100); + + /* Reopen pool after setting DAOS_MD_ON_SSD_NEMB_PCT to 80% + * It should not impact already created vos pool. + */ + d_setenv("DAOS_MD_ON_SSD_NEMB_PCT", "80", true); + wal_pool_refill(arg); + cont = vos_hdl2cont(arg->ctx.tc_co_hdl); + umm = vos_cont2umm(cont); + alloc_bucket_to_full(umm, &ainfo[0], checkpoint_fn, &arg->ctx.tc_po_hdl); + rc = umempobj_get_mbusage(umm->umm_pool, 0, &cur_allocated1, &maxsz); + assert_true(rc == 0); + print_message("nemb space utilization is %lu max is %lu\n", cur_allocated1, maxsz); + assert_true(maxsz == MDTEST_VOS_SIZE * 40 / 100); + assert_true(cur_allocated == cur_allocated1); + + /* Allocate from Evictable Buckets. 
*/ + for (i = 1; i <= MDTEST_MB_CNT; i++) { + /* Create an MB and fill it with allocs */ + ainfo[i].mb_id = umem_allot_mb_evictable(umm, 0); + for (j = 1; j < i; j++) { + if (ainfo[i].mb_id == ainfo[j].mb_id) { + found = 1; + break; + } + } + if (found) + break; + ainfo[i].num_allocs = 0; + ainfo[i].start_umoff = UMOFF_NULL; + ainfo[i].alloc_size = 2048; + assert_true(ainfo[i].mb_id != 0); + alloc_bucket_to_full(umm, &ainfo[i], checkpoint_fn, &arg->ctx.tc_po_hdl); + } + i--; + print_message("Created %d evictable buckets, expected = %ld\n", i, + (MDTEST_META_BLOB_SIZE - maxsz) / MDTEST_MB_SIZE); + assert_true(i == (MDTEST_META_BLOB_SIZE - maxsz) / MDTEST_MB_SIZE); +} + +static int +umoff_in_freelist(umem_off_t *free_list, int cnt, umem_off_t umoff, bool clear) +{ + int i; + + for (i = 0; i < cnt; i++) + if (umoff == free_list[i]) + break; + + if (i < cnt) { + if (clear) + free_list[i] = UMOFF_NULL; + return 1; + } + return 0; +} + +static void +wal_umempobj_block_reuse_internal(void **state, int restart) +{ + struct io_test_args *arg = *state; + struct vos_container *cont; + struct umem_instance *umm; + umem_off_t umoff, next_umoff, nnext_umoff; + umem_off_t *ptr_cur, *ptr_next; + umem_off_t *free_list[MDTEST_MB_CNT + 1]; + umem_off_t *free_list_bk[MDTEST_MB_CNT + 1]; + int free_num[MDTEST_MB_CNT + 1]; + struct bucket_alloc_info ainfo[MDTEST_MB_CNT + 1]; + int i, j, cnt, rc, num, total_frees; + struct umem_pin_handle *p_hdl; + struct umem_cache_range rg = {0}; + uint64_t space_used_before, space_used_after; + + cont = vos_hdl2cont(arg->ctx.tc_co_hdl); + umm = vos_cont2umm(cont); + + /* Allocate from NE Buckets. It should use 80% 360M i.e, 16 buckets */ + ainfo[0].mb_id = 0; + ainfo[0].num_allocs = 0; + ainfo[0].start_umoff = UMOFF_NULL; + ainfo[0].alloc_size = 512; + alloc_bucket_to_full(umm, &ainfo[0], checkpoint_fn, &arg->ctx.tc_po_hdl); + + /* Allocate from Evictable Buckets. 
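These reuse tests rely on the intrusive chain built by alloc_bucket_to_full(): the first word of every allocated block stores the umem_off_t of the next block. A small hypothetical walker over such a chain is sketched below, assuming the bucket holding the chain is already pinned, as the tests do.

static int
count_chain(struct umem_instance *umm, umem_off_t head)
{
	int	cnt = 0;

	while (!UMOFF_IS_NULL(head)) {
		cnt++;
		head = *(umem_off_t *)umem_off2ptr(umm, head);	/* follow the stored link */
	}
	return cnt;
}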
*/ + for (i = 1; i <= MDTEST_MAX_EMB_CNT; i++) { + /* Create an MB and fill it with allocs */ + ainfo[i].mb_id = umem_allot_mb_evictable(umm, 0); + ainfo[i].num_allocs = 0; + ainfo[i].start_umoff = UMOFF_NULL; + ainfo[i].alloc_size = 512; + assert_true(ainfo[i].mb_id != 0); + alloc_bucket_to_full(umm, &ainfo[i], checkpoint_fn, &arg->ctx.tc_po_hdl); + } + + /* Free few allocations from each NE bucket */ + umem_tx_begin(umm, NULL); + umoff = ainfo[0].start_umoff; + num = ainfo[0].num_allocs; + free_num[0] = num / 10000; + cnt = 0; + D_ALLOC_ARRAY(free_list[0], free_num[0]); + for (j = 1; j <= num; j++) { + ptr_cur = (umem_off_t *)umem_off2ptr(umm, umoff); + next_umoff = *ptr_cur; + if ((j % 10000) == 0) { + if (UMOFF_IS_NULL(next_umoff)) + break; + ptr_next = (umem_off_t *)umem_off2ptr(umm, next_umoff); + nnext_umoff = *ptr_next; + umem_tx_add_ptr(umm, ptr_cur, sizeof(umoff)); + *ptr_cur = nnext_umoff; + umem_free(umm, next_umoff); + print_message("id=0:Freeing offset %lu\n", next_umoff); + ainfo->num_allocs--; + free_list[0][cnt++] = next_umoff; + umoff = nnext_umoff; + } else + umoff = next_umoff; + if (UMOFF_IS_NULL(umoff)) + break; + } + umem_tx_commit(umm); + assert_true(cnt == free_num[0]); + print_message("id=0:Total frees %d\n", cnt); + + /* Free few allocations from each E bucket */ + for (i = 1; i <= MDTEST_MAX_EMB_CNT; i++) { + rg.cr_off = umem_get_mb_base_offset(umm, ainfo[i].mb_id); + rg.cr_size = 1; + rc = umem_cache_pin(&umm->umm_pool->up_store, &rg, 1, 0, &p_hdl); + assert_true(rc == 0); + + umem_tx_begin(umm, NULL); + umoff = ainfo[i].start_umoff; + num = ainfo[i].num_allocs; + free_num[i] = num / 10000; + cnt = 0; + D_ALLOC_ARRAY(free_list[i], free_num[i]); + for (j = 1; j <= num; j++) { + ptr_cur = (umem_off_t *)umem_off2ptr(umm, umoff); + next_umoff = *ptr_cur; + if ((j % 10000) == 0) { + if (UMOFF_IS_NULL(next_umoff)) + break; + ptr_next = (umem_off_t *)umem_off2ptr(umm, next_umoff); + nnext_umoff = *ptr_next; + umem_tx_add_ptr(umm, ptr_cur, sizeof(umoff)); + *ptr_cur = nnext_umoff; + umem_free(umm, next_umoff); + print_message("id=%d:Freeing offset %lu\n", i, next_umoff); + ainfo->num_allocs--; + free_list[i][cnt++] = next_umoff; + umoff = nnext_umoff; + } else + umoff = next_umoff; + if (UMOFF_IS_NULL(umoff)) + break; + } + umem_tx_commit(umm); + umem_cache_unpin(&umm->umm_pool->up_store, p_hdl); + assert_true(cnt == free_num[i]); + print_message("id=%d:Total frees %d\n", ainfo[i].mb_id, cnt); + } + + /* restart with or without checkpoint */ + if (restart) { + wal_pool_refill(arg); + cont = vos_hdl2cont(arg->ctx.tc_co_hdl); + umm = vos_cont2umm(cont); + } + + for (i = 0; i < MDTEST_MAX_EMB_CNT + 1; i++) { + D_ALLOC_ARRAY(free_list_bk[i], free_num[i]); + memcpy(free_list_bk[i], free_list[i], free_num[i] * sizeof(umem_off_t)); + } + + /* Allocate from NE Buckets and it should reuse the previous freed blocks */ + for (j = 0; j < free_num[0]; j++) { + umem_tx_begin(umm, NULL); + umoff = umem_alloc(umm, ainfo[0].alloc_size); + umem_tx_commit(umm); + assert_true(!UMOFF_IS_NULL(umoff)); + assert_true(umoff_in_freelist(free_list[0], free_num[0], umoff, true)); + } + + /* New allocation should fail */ + umem_tx_begin(umm, NULL); + umoff = umem_alloc(umm, ainfo[0].alloc_size); + umem_tx_abort(umm, 1); + assert_true(UMOFF_IS_NULL(umoff)); + + /* Allocate from E Buckets and it should reuse the previous freed blocks */ + for (i = 1; i <= MDTEST_MAX_EMB_CNT; i++) { + rg.cr_off = umem_get_mb_base_offset(umm, ainfo[i].mb_id); + rg.cr_size = 1; + rc = 
umem_cache_pin(&umm->umm_pool->up_store, &rg, 1, 0, &p_hdl); + assert_true(rc == 0); + + for (j = 0; j < free_num[i]; j++) { + umem_tx_begin(umm, NULL); + umoff = umem_alloc_from_bucket(umm, ainfo[i].alloc_size, ainfo[i].mb_id); + assert_true(!UMOFF_IS_NULL(umoff)); + umem_tx_commit(umm); + assert_true(umoff_in_freelist(free_list[i], free_num[i], umoff, true)); + } + umem_tx_begin(umm, NULL); + /* New allocation should fail */ + umoff = umem_alloc(umm, ainfo[i].alloc_size); + umem_tx_abort(umm, 1); + assert_true(UMOFF_IS_NULL(umoff)); + print_message("Finished reallocating for id = %d\n", ainfo[i].mb_id); + umem_cache_unpin(&umm->umm_pool->up_store, p_hdl); + } + + /* Free the allocated memory to see whether they are properly accounted */ + rc = umempobj_get_heapusage(umm->umm_pool, &space_used_before); + if (rc) { + print_message("Failed to get heap usage\n"); + assert_true(rc == 0); + } + for (j = 0; j < free_num[0]; j++) + umem_atomic_free(umm, free_list_bk[0][j]); + D_FREE(free_list[0]); + D_FREE(free_list_bk[0]); + + total_frees = free_num[0]; + + for (i = 1; i <= MDTEST_MAX_EMB_CNT; i++) { + rg.cr_off = umem_get_mb_base_offset(umm, ainfo[i].mb_id); + rg.cr_size = 1; + rc = umem_cache_pin(&umm->umm_pool->up_store, &rg, 1, 0, &p_hdl); + assert_true(rc == 0); + + for (j = 0; j < free_num[i]; j++) { + umoff = umem_atomic_free(umm, free_list_bk[i][j]); + } + umem_cache_unpin(&umm->umm_pool->up_store, p_hdl); + total_frees += free_num[i]; + D_FREE(free_list[i]); + D_FREE(free_list_bk[i]); + } + rc = umempobj_get_heapusage(umm->umm_pool, &space_used_after); + if (rc) { + print_message("Failed to get heap usage\n"); + assert_true(rc == 0); + } + print_message("Space usage: before free %lu, after free %lu, expected %lu\n", + space_used_before, space_used_after, (space_used_before - total_frees * 512)); + assert_true(space_used_after <= (space_used_before - total_frees * 512)); +} + +static void +wal_umempobj_block_reuse(void **state) +{ + wal_umempobj_block_reuse_internal(state, 0); +} + +static void +wal_umempobj_replay_block_reuse(void **state) +{ + wal_umempobj_block_reuse_internal(state, 1); +} + +static void +wal_umempobj_chkpt_block_reuse(void **state) +{ + struct io_test_args *arg = *state; + + arg->checkpoint = true; + arg->no_replay = true; + wal_umempobj_block_reuse_internal(state, 1); + arg->checkpoint = false; + arg->no_replay = false; + daos_fail_loc_set(0); +} + +static void +wal_umempobj_mbusage_test(void **state) +{ + struct io_test_args *arg = *state; + struct vos_container *cont; + struct umem_instance *umm; + struct bucket_alloc_info ainfo[2]; + struct umem_pin_handle *p_hdl; + struct umem_cache_range rg = {0}; + uint64_t allocated0, allocated1, maxsz0, maxsz1, maxsz_exp; + uint64_t allocated, maxsz; + int rc; + + cont = vos_hdl2cont(arg->ctx.tc_co_hdl); + umm = vos_cont2umm(cont); + + maxsz_exp = MDTEST_MAX_NEMB_CNT * MDTEST_MB_SIZE; + + /* Allocate from NE Buckets. 
It should use 80% 360M i.e, 16 buckets */ + ainfo[0].mb_id = 0; + ainfo[0].num_allocs = 0; + ainfo[0].start_umoff = UMOFF_NULL; + ainfo[0].alloc_size = 512; + alloc_bucket_to_full(umm, &ainfo[0], checkpoint_fn, &arg->ctx.tc_po_hdl); + + /* Create an MB and fill it with allocs */ + ainfo[1].mb_id = umem_allot_mb_evictable(umm, 0); + ainfo[1].num_allocs = 0; + ainfo[1].start_umoff = UMOFF_NULL; + ainfo[1].alloc_size = 512; + assert_true(ainfo[1].mb_id != 0); + alloc_bucket_to_full(umm, &ainfo[1], checkpoint_fn, &arg->ctx.tc_po_hdl); + free_bucket_by_pct(umm, &ainfo[1], 50, checkpoint_fn, &arg->ctx.tc_po_hdl); + + rc = umempobj_get_mbusage(umm->umm_pool, ainfo[0].mb_id, &allocated0, &maxsz0); + print_message("NE usage max_size = %lu allocated = %lu\n", maxsz0, allocated0); + assert_int_equal(rc, 0); + assert_int_equal(maxsz0, maxsz_exp); + + rc = umempobj_get_mbusage(umm->umm_pool, ainfo[1].mb_id, &allocated1, &maxsz1); + print_message("E usage max_size = %lu allocated = %lu\n", maxsz1, allocated1); + assert_int_equal(rc, 0); + assert_int_equal(maxsz1, MDTEST_MB_SIZE); + + wal_pool_refill(arg); + cont = vos_hdl2cont(arg->ctx.tc_co_hdl); + umm = vos_cont2umm(cont); + + rc = umempobj_get_mbusage(umm->umm_pool, ainfo[0].mb_id, &allocated, &maxsz); + print_message("NE usage max_size = %lu allocated = %lu\n", maxsz, allocated); + assert_int_equal(rc, 0); + assert_int_equal(maxsz, maxsz_exp); + assert_int_equal(allocated, allocated0); + + rc = umempobj_get_mbusage(umm->umm_pool, ainfo[1].mb_id, &allocated, &maxsz); + print_message("E usage max_size = %lu allocated = %lu\n", maxsz, allocated); + assert_int_equal(rc, 0); + /* allocated info is based on the hint */ + assert_true((allocated != 0) && (allocated < allocated1)); + assert_int_equal(maxsz, MDTEST_MB_SIZE); + + rg.cr_off = umem_get_mb_base_offset(umm, ainfo[1].mb_id); + rg.cr_size = 1; + rc = umem_cache_pin(&umm->umm_pool->up_store, &rg, 1, 0, &p_hdl); + assert_true(rc == 0); + rc = umempobj_get_mbusage(umm->umm_pool, ainfo[1].mb_id, &allocated, &maxsz); + umem_cache_unpin(&umm->umm_pool->up_store, p_hdl); + print_message("E usage max_size = %lu allocated = %lu\n", maxsz, allocated); + assert_int_equal(rc, 0); + /* allocated info is based on the actual stats recorded */ + assert_int_equal(allocated, allocated1); + assert_int_equal(maxsz, MDTEST_MB_SIZE); +} + +static void +dump_cache_stats(struct vos_pool *pool, char *op_str) +{ + struct umem_pool *umm_pool = vos_pool2umm(pool)->umm_pool; + struct umem_cache *cache = vos_pool2store(pool)->cache; + daos_size_t scm_used, ne_used, ne_tot; + int rc; + + rc = umempobj_get_heapusage(umm_pool, &scm_used); + assert_rc_equal(rc, 0); + + rc = umempobj_get_mbusage(umm_pool, UMEM_DEFAULT_MBKT_ID, &ne_used, &ne_tot); + assert_int_equal(rc, 0); + + print_message("==================== (dump stats %s)\n", op_str); + print_message("[Space usage] Total used:%lu, NE used:%lu, NE total:%lu\n", + scm_used, ne_used, ne_tot); + + print_message("[Page stats] NE:%u, Pinned:%u, Free:%u\n", + cache->ca_pgs_stats[UMEM_PG_STATS_NONEVICTABLE], + cache->ca_pgs_stats[UMEM_PG_STATS_PINNED], + cache->ca_pgs_stats[UMEM_PG_STATS_FREE]); + + print_message("[Swapping stats] Hit:%lu, Miss:%lu, Evict:%lu, Flush:%lu, Load:%lu\n", + cache->ca_cache_stats[UMEM_CACHE_STATS_HIT], + cache->ca_cache_stats[UMEM_CACHE_STATS_MISS], + cache->ca_cache_stats[UMEM_CACHE_STATS_EVICT], + cache->ca_cache_stats[UMEM_CACHE_STATS_FLUSH], + cache->ca_cache_stats[UMEM_CACHE_STATS_LOAD]); + print_message("====================\n"); +} + +static int 
+obj_rw(struct io_test_args *arg, daos_unit_oid_t oid, char *dkey, char *akey, + daos_iod_type_t iod_type, daos_epoch_t epoch, int io_size, char *buf, bool update) +{ + daos_recx_t recx = {.rx_idx = 0, .rx_nr = 1}; + daos_key_t dkey_iov, akey_iov; + daos_iod_t iod = { 0 }; + d_sg_list_t sgl = { 0 }; + int rc; + + arg->oid = oid; + d_iov_set(&dkey_iov, dkey, strlen(dkey)); + d_iov_set(&akey_iov, akey, strlen(akey)); + + rc = d_sgl_init(&sgl, 1); + assert_rc_equal(rc, 0); + + sgl.sg_iovs[0].iov_buf = buf; + sgl.sg_iovs[0].iov_buf_len = io_size; + sgl.sg_iovs[0].iov_len = io_size; + + iod.iod_name = akey_iov; + iod.iod_nr = 1; + iod.iod_type = iod_type; + iod.iod_size = io_size; + iod.iod_recxs = (iod_type == DAOS_IOD_SINGLE) ? NULL : &recx; + + if (update) + rc = io_test_obj_update(arg, epoch, 0, &dkey_iov, &iod, &sgl, NULL, true); + else + rc = io_test_obj_fetch(arg, epoch, 0, &dkey_iov, &iod, &sgl, true); + + d_sgl_fini(&sgl, false); + + return rc; +} + +static inline uint64_t +verify_space(struct vos_pool *pool, uint32_t bkt_id, uint64_t prev_used, int64_t delta, char *op_str) +{ + struct umem_pool *umm_pool = vos_pool2umm(pool)->umm_pool; + daos_size_t allocated, total; + int rc; + + rc = umempobj_get_mbusage(umm_pool, bkt_id, &allocated, &total); + assert_int_equal(rc, 0); + + print_message("[%s] %s %u used space: %lu/%lu\n", op_str, + bkt_id == UMEM_DEFAULT_MBKT_ID ? "Non-evictable" : "Evictable", + bkt_id, allocated, total); + + if (delta == INT64_MAX) + return allocated; + + if (delta == 0) + assert_true(allocated == prev_used); + else if (delta > 0) + assert_true(allocated > (prev_used + delta)); + else if (delta < 0) + assert_true(allocated <= (prev_used + delta)); + + return allocated; +} + +static void +reclaim_obj(struct io_test_args *arg, daos_unit_oid_t *oid, int oid_nr, daos_epoch_t *epoch) +{ + daos_epoch_range_t epr; + int i, rc; + + /* Punch object */ + for (i = 0; i < oid_nr; i++) { + rc = vos_obj_punch(arg->ctx.tc_co_hdl, *oid, (*epoch)++, 0, 0, NULL, 0, + NULL, NULL); + oid++; + assert_rc_equal(rc, 0); + } + + /* Aggregate punched object */ + epr.epr_lo = 0; + epr.epr_hi = (*epoch)++; + rc = vos_aggregate(arg->ctx.tc_co_hdl, &epr, NULL, NULL, 0); + assert_rc_equal(rc, 0); + + /* Wait GC done */ + gc_wait(); +} + +/* Update/punch object, re-open pool, verify space usage and bucket ID */ +static void +p2_basic_test(void **state) +{ + struct io_test_args *arg = *state; + struct vos_pool *pool = vos_hdl2pool(arg->ctx.tc_po_hdl); + struct vos_container *cont = vos_hdl2cont(arg->ctx.tc_co_hdl); + struct umem_cache *cache; + daos_unit_oid_t oid; + char dkey[UPDATE_DKEY_SIZE] = { 0 }; + char akey[UPDATE_AKEY_SIZE] = { 0 }; + char *buf; + daos_epoch_t epoch = 1; + daos_size_t io_size = 512; + struct vos_object *obj; + uint32_t bkt_id = 1, missed, loaded; + uint64_t used[2], ne_init; + int rc; + + dts_key_gen(dkey, UPDATE_DKEY_SIZE, UPDATE_DKEY); + dts_key_gen(akey, UPDATE_AKEY_SIZE, UPDATE_AKEY); + + D_ALLOC(buf, io_size); + assert_non_null(buf); + dts_buf_render(buf, io_size); + + /* Get initial space usage */ + used[0] = verify_space(pool, UMEM_DEFAULT_MBKT_ID, 0, INT64_MAX, "Init"); + ne_init = used[0]; + + /* Update object1 */ + oid = dts_unit_oid_gen(0, 0); + rc = obj_rw(arg, oid, dkey, akey, DAOS_IOD_SINGLE, epoch++, io_size, buf, true); + assert_rc_equal(rc, 0); + + /* Verify object1 bucket ID */ + rc = vos_obj_acquire(cont, oid, false, &obj); + assert_rc_equal(rc, 0); + + assert_int_equal(obj->obj_bkt_ids[0], bkt_id); + + vos_obj_release(obj, 0, true); + + /* Verify 
space usage */ + used[0] = verify_space(pool, UMEM_DEFAULT_MBKT_ID, used[0], 1, "Object1"); + used[1] = verify_space(pool, bkt_id, 0, INT64_MAX, "Object1"); + + /* Reclaim object1 */ + reclaim_obj(arg, &oid, 1, &epoch); + + /* Verify space usage */ + used[0] = verify_space(pool, UMEM_DEFAULT_MBKT_ID, used[0], -1, "Reclaim object1"); + used[1] = verify_space(pool, bkt_id, used[1], -used[1], "Reclaim object1"); + + /* Update object2 */ + oid = dts_unit_oid_gen(0, 0); + rc = obj_rw(arg, oid, dkey, akey, DAOS_IOD_ARRAY, epoch++, io_size, buf, true); + assert_rc_equal(rc, 0); + + /* Verify object2 bucket ID */ + rc = vos_obj_acquire(cont, oid, false, &obj); + assert_rc_equal(rc, 0); + + assert_int_equal(obj->obj_bkt_ids[0], bkt_id); + + /* Verify space usage */ + used[0] = verify_space(pool, UMEM_DEFAULT_MBKT_ID, used[0], 1, "Object2.1"); + used[1] = verify_space(pool, bkt_id, used[1], io_size, "Object2.1"); + + /* Update object2 again */ + dts_key_gen(dkey, UPDATE_DKEY_SIZE, UPDATE_DKEY); + dts_key_gen(akey, UPDATE_AKEY_SIZE, UPDATE_AKEY); + rc = obj_rw(arg, oid, dkey, akey, DAOS_IOD_SINGLE, epoch++, io_size, buf, true); + assert_rc_equal(rc, 0); + + /* Verify object2 bucket ID */ + assert_int_equal(obj->obj_bkt_ids[0], bkt_id); + + vos_obj_release(obj, 0, true); + + /* Verify space usage */ + used[0] = verify_space(pool, UMEM_DEFAULT_MBKT_ID, used[0], 0, "Object2.2"); + used[1] = verify_space(pool, bkt_id, used[1], io_size, "Object2.2"); + + /* Re-open pool */ + arg->checkpoint = true; + wal_pool_refill(arg); + pool = vos_hdl2pool(arg->ctx.tc_po_hdl); + cont = vos_hdl2cont(arg->ctx.tc_co_hdl); + cache = vos_pool2store(pool)->cache; + arg->checkpoint = false; + + missed = cache->ca_cache_stats[UMEM_CACHE_STATS_MISS]; + loaded = cache->ca_cache_stats[UMEM_CACHE_STATS_LOAD]; + + /* Verify NE space usage */ + used[0] = verify_space(pool, UMEM_DEFAULT_MBKT_ID, used[0], 0, "Re-open"); + + /* Fetch object2 */ + rc = obj_rw(arg, oid, dkey, akey, DAOS_IOD_SINGLE, DAOS_EPOCH_MAX, io_size, buf, false); + assert_rc_equal(rc, 0); + + dump_cache_stats(pool, "after re-open & fetch"); + /* Verify cache stats */ + assert_int_equal(cache->ca_cache_stats[UMEM_CACHE_STATS_MISS], missed + 1); + assert_int_equal(cache->ca_cache_stats[UMEM_CACHE_STATS_LOAD], loaded + 1); + + /* Verify E space usage */ + used[1] = verify_space(pool, bkt_id, used[1], 0, "Re-open"); + + /* Verify object2 bucket ID */ + rc = vos_obj_acquire(cont, oid, false, &obj); + assert_rc_equal(rc, 0); + + assert_int_equal(obj->obj_bkt_ids[0], bkt_id); + vos_obj_release(obj, 0, true); + + /* Reclaim object2 */ + reclaim_obj(arg, &oid, 1, &epoch); + + /* Verify space usage */ + used[0] = verify_space(pool, UMEM_DEFAULT_MBKT_ID, used[0], -1, "Reclaim object2"); + used[1] = verify_space(pool, bkt_id, used[1], -used[1], "Reclaim object2"); + assert_int_equal(used[0], ne_init); + + D_FREE(buf); +} + +static int +fill_one(struct io_test_args *arg, daos_unit_oid_t oid, char *dkey, char *akey, + daos_epoch_t *epoch, daos_size_t io_size, char *buf, uint32_t *ret_id) +{ + struct vos_object *obj; + struct vos_container *cont = vos_hdl2cont(arg->ctx.tc_co_hdl); + uint32_t bkt_id = UMEM_DEFAULT_MBKT_ID; + uint64_t used, total = 0, prev_used = 0; + daos_size_t written = 0; + int rc = 0; + + while (written < MDTEST_MB_SIZE) { + rc = obj_rw(arg, oid, dkey, akey, DAOS_IOD_ARRAY, (*epoch)++, io_size, buf, true); + if (rc != 0) + break; + written += io_size; + + if (bkt_id == UMEM_DEFAULT_MBKT_ID) { + rc = vos_obj_acquire(cont, oid, false, &obj); + 
assert_rc_equal(rc, 0); + + bkt_id = obj->obj_bkt_ids[0]; + vos_obj_release(obj, 0, false); + /* All evictable buckets are used up */ + if (bkt_id == UMEM_DEFAULT_MBKT_ID) { + rc = 1; + break; + } + } + + rc = umempobj_get_mbusage(vos_cont2umm(cont)->umm_pool, bkt_id, &used, &total); + assert_int_equal(rc, 0); + assert_int_equal(total, MDTEST_MB_SIZE); + + /* This evictable bucket is filled up */ + if (used == prev_used) + break; + + prev_used = used; + } + + print_message("Filled bucket:%u total:%lu, used:%lu/%lu, written:%lu, rc:%d\n", + bkt_id, total, used, prev_used, written, rc); + *ret_id = bkt_id; + + return rc; +} + +/* Fill all evictable buckets */ +static void +p2_fill_test(void **state) +{ + struct io_test_args *arg = *state; + struct vos_pool *pool = vos_hdl2pool(arg->ctx.tc_po_hdl); + struct umem_cache *cache = vos_pool2store(pool)->cache; + daos_unit_oid_t oids[MDTEST_MAX_EMB_CNT]; + daos_epoch_t epoch = 1; + char dkey[UPDATE_DKEY_SIZE] = { 0 }; + char akey[UPDATE_AKEY_SIZE] = { 0 }; + char *buf; + uint32_t missed, loaded, evicted; + daos_size_t io_size = 800; + uint32_t bkt_ids[MDTEST_MAX_EMB_CNT]; + uint64_t bkt_used[MDTEST_MAX_EMB_CNT]; + uint64_t ne_used, ne_init; + int i, rc, obj_cnt = 0; + + dts_key_gen(dkey, UPDATE_DKEY_SIZE, UPDATE_DKEY); + dts_key_gen(akey, UPDATE_AKEY_SIZE, UPDATE_AKEY); + + D_ALLOC(buf, io_size); + assert_non_null(buf); + dts_buf_render(buf, io_size); + + /* Get initial space usage */ + ne_init = verify_space(pool, UMEM_DEFAULT_MBKT_ID, 0, INT64_MAX, "Init"); + + /* Fill up pool */ + while (obj_cnt < MDTEST_MAX_EMB_CNT) { + oids[obj_cnt] = dts_unit_oid_gen(0, 0); + rc = fill_one(arg, oids[obj_cnt], dkey, akey, &epoch, io_size, buf, + &bkt_ids[obj_cnt]); + if (rc) + break; + bkt_used[obj_cnt] = verify_space(pool, bkt_ids[obj_cnt], 0, INT64_MAX, "Fill"); + + obj_cnt++; + print_message("%d objects are allocated.\n", obj_cnt); + + if (obj_cnt && (obj_cnt % 4 == 0)) + checkpoint_fn(&arg->ctx.tc_po_hdl); + } + assert_true(obj_cnt > 0); + + for (i = 0; i < obj_cnt; i++) + bkt_used[i] = verify_space(pool, bkt_ids[i], bkt_used[i], 0, "Filled"); + + missed = cache->ca_cache_stats[UMEM_CACHE_STATS_MISS]; + loaded = cache->ca_cache_stats[UMEM_CACHE_STATS_LOAD]; + evicted = cache->ca_cache_stats[UMEM_CACHE_STATS_EVICT]; + + /* Fetch first object to trigger cache miss and page evict */ + rc = obj_rw(arg, oids[0], dkey, akey, DAOS_IOD_ARRAY, DAOS_EPOCH_MAX, io_size, buf, false); + assert_rc_equal(rc, 0); + + dump_cache_stats(pool, "after fetch"); + assert_int_equal(cache->ca_cache_stats[UMEM_CACHE_STATS_MISS], missed + 1); + assert_int_equal(cache->ca_cache_stats[UMEM_CACHE_STATS_LOAD], loaded + 1); + assert_int_equal(cache->ca_cache_stats[UMEM_CACHE_STATS_EVICT], evicted + 1); + + /* Re-open pool */ + arg->checkpoint = true; + wal_pool_refill(arg); + pool = vos_hdl2pool(arg->ctx.tc_po_hdl); + cache = vos_pool2store(pool)->cache; + arg->checkpoint = false; + + missed = cache->ca_cache_stats[UMEM_CACHE_STATS_MISS]; + loaded = cache->ca_cache_stats[UMEM_CACHE_STATS_LOAD]; + + /* Fetch first object to trigger cache miss */ + rc = obj_rw(arg, oids[0], dkey, akey, DAOS_IOD_ARRAY, DAOS_EPOCH_MAX, io_size, buf, false); + assert_rc_equal(rc, 0); + + dump_cache_stats(pool, "after re-open & fetch"); + assert_int_equal(cache->ca_cache_stats[UMEM_CACHE_STATS_MISS], missed + 1); + assert_int_equal(cache->ca_cache_stats[UMEM_CACHE_STATS_LOAD], loaded + 1); + + ne_used = verify_space(pool, UMEM_DEFAULT_MBKT_ID, ne_init, 1, "Re-open"); + bkt_used[0] = verify_space(pool, 
bkt_ids[0], bkt_used[0], 0, "Re-open"); + + /* Reclaim all objects */ + reclaim_obj(arg, &oids[0], obj_cnt, &epoch); + dump_cache_stats(pool, "after reclaim objs"); + + /* Verify used space */ + ne_used = verify_space(pool, UMEM_DEFAULT_MBKT_ID, ne_used, -1, "Reclaim objs"); + assert_int_equal(ne_used, ne_init); + for (i = 0; i < obj_cnt; i++) + bkt_used[i] = verify_space(pool, bkt_ids[i], bkt_used[i], -bkt_used[i], + "Reclaim objs"); + + /* Close container */ + rc = vos_cont_close(arg->ctx.tc_co_hdl); + assert_rc_equal(rc, 0); + arg->ctx.tc_step = TCX_CO_CREATE; + + /* Destroy container */ + rc = vos_cont_destroy(arg->ctx.tc_po_hdl, arg->ctx.tc_co_uuid); + assert_rc_equal(rc, 0); + arg->ctx.tc_step = TCX_PO_CREATE_OPEN; + + gc_wait(); + + dump_cache_stats(pool, "after cont destroy"); + + ne_used = verify_space(pool, UMEM_DEFAULT_MBKT_ID, ne_used, -1, "Cont destroy"); + for (i = 0; i < obj_cnt; i++) + bkt_used[i] = verify_space(pool, bkt_ids[i], bkt_used[i], -bkt_used[i], + "Cont destroy"); + + D_FREE(buf); +} + static const struct CMUnitTest wal_tests[] = { {"WAL01: Basic pool/cont create/destroy test", wal_tst_pool_cont, NULL, NULL}, {"WAL02: Basic pool/cont create/destroy test with checkpointing", wal_tst_pool_cont, @@ -1285,6 +2497,25 @@ static const struct CMUnitTest wal_io_int_tests[] = { {"WAL24: Key query punch with subsequent update", wal_io_query_key_punch_update, NULL, NULL}, }; +static const struct CMUnitTest wal_MB_tests[] = { + {"WAL30: UMEM MB Basic Test", wal_mb_tests, setup_mb_io, teardown_mb_io}, + {"WAL31: UMEM MB EMB selection based on utilization Test", wal_mb_utilization_tests, + setup_mb_io, teardown_mb_io}, + {"WAL32: UMEM MB EMB eviction by other EMBs Test", wal_mb_emb_evicts_emb, setup_mb_io, + teardown_mb_io}, + {"WAL33: UMEM MB EMB eviction by NEMB expansion Test", wal_mb_nemb_evicts_emb, setup_mb_io, + teardown_mb_io}, + {"WAL34: UMEM MB garbage collection", wal_umempobj_block_reuse, setup_mb_io, teardown_mb_io}, + {"WAL35: UMEM MB checkpoint restart garbage collection", wal_umempobj_chkpt_block_reuse, + setup_mb_io, teardown_mb_io}, + {"WAL36: UMEM MB restart replay garbage collection", wal_umempobj_replay_block_reuse, + setup_mb_io, teardown_mb_io}, + {"WAL37: UMEM MB stats test ", wal_umempobj_mbusage_test, setup_mb_io, teardown_mb_io}, + {"WAL38: P2 basic", p2_basic_test, setup_mb_io, teardown_mb_io}, + {"WAL39: P2 fill evictable buckets", p2_fill_test, setup_mb_io, teardown_mb_io}, + {"WAL40: nemb pct test", wal_mb_nemb_pct, setup_mb_io_nembpct, teardown_mb_io_nembpct}, +}; + int run_wal_tests(const char *cfg) { @@ -1332,5 +2563,11 @@ run_wal_tests(const char *cfg) setup_wal_io, teardown_io); } } + + if (umempobj_get_backend_type() == DAOS_MD_BMEM_V2) { + dts_create_config(test_name, "Memory Bucket tests with WAL %s", cfg); + D_PRINT("Running %s\n", test_name); + rc += cmocka_run_group_tests_name(test_name, wal_MB_tests, NULL, NULL); + } return rc; } diff --git a/src/vos/tests/wal_ut.c b/src/vos/tests/wal_ut.c index f123a3990a0..32b4b4c9957 100644 --- a/src/vos/tests/wal_ut.c +++ b/src/vos/tests/wal_ut.c @@ -29,7 +29,7 @@ ut_mc_init(struct bio_ut_args *args, uint64_t meta_sz, uint64_t wal_sz, uint64_t int rc, ret; uuid_generate(args->bua_pool_id); - rc = bio_mc_create(args->bua_xs_ctxt, args->bua_pool_id, meta_sz, wal_sz, data_sz, 0); + rc = bio_mc_create(args->bua_xs_ctxt, args->bua_pool_id, 0, meta_sz, wal_sz, data_sz, 0, 0); if (rc) { D_ERROR("UT MC create failed. 
"DF_RC"\n", DP_RC(rc)); return rc; diff --git a/src/vos/vos_aggregate.c b/src/vos/vos_aggregate.c index 65d70dd7762..5064e74d730 100644 --- a/src/vos/vos_aggregate.c +++ b/src/vos/vos_aggregate.c @@ -984,7 +984,7 @@ reserve_segment(struct vos_object *obj, struct agg_io_context *io, if (vos_io_scm(vos_obj2pool(obj), DAOS_IOD_ARRAY, size, VOS_IOS_AGGREGATION)) { /** Store on SCM */ - off = vos_reserve_scm(obj->obj_cont, io->ic_rsrvd_scm, size); + off = vos_reserve_scm(obj->obj_cont, io->ic_rsrvd_scm, size, obj); if (UMOFF_IS_NULL(off)) { now = daos_gettime_coarse(); if (now - obj->obj_cont->vc_agg_nospc_ts > VOS_NOSPC_ERROR_INTVL) { diff --git a/src/vos/vos_common.c b/src/vos/vos_common.c index cd2f2a5a693..e19768d4c03 100644 --- a/src/vos/vos_common.c +++ b/src/vos/vos_common.c @@ -693,8 +693,7 @@ static inline int vos_metrics_count(void) { return vea_metrics_count() + - (sizeof(struct vos_agg_metrics) + sizeof(struct vos_space_metrics) + - sizeof(struct vos_chkpt_metrics)) / sizeof(struct d_tm_node_t *); + sizeof(struct vos_pool_metrics) / sizeof(struct d_tm_node_t *); } static void @@ -874,6 +873,9 @@ vos_metrics_alloc(const char *path, int tgt_id) /* Initialize metrics for WAL */ vos_wal_metrics_init(&vp_metrics->vp_wal_metrics, path, tgt_id); + /* Initialize metrcis for umem cache */ + vos_cache_metrics_init(&vp_metrics->vp_cache_metrics, path, tgt_id); + return vp_metrics; } diff --git a/src/vos/vos_container.c b/src/vos/vos_container.c index 6e6cbeeeb2a..a5a55a902b9 100644 --- a/src/vos/vos_container.c +++ b/src/vos/vos_container.c @@ -63,7 +63,7 @@ cont_df_rec_free(struct btr_instance *tins, struct btr_record *rec, void *args) cont_df = umem_off2ptr(&tins->ti_umm, rec->rec_off); vos_ts_evict(&cont_df->cd_ts_idx, VOS_TS_TYPE_CONT, vos_pool->vp_sysdb); - return gc_add_item(vos_pool, DAOS_HDL_INVAL, GC_CONT, rec->rec_off, 0); + return gc_add_item(vos_pool, DAOS_HDL_INVAL, GC_CONT, rec->rec_off, NULL); } static int @@ -92,6 +92,17 @@ cont_df_rec_alloc(struct btr_instance *tins, d_iov_t *key_iov, cont_df = umem_off2ptr(&tins->ti_umm, offset); uuid_copy(cont_df->cd_id, ukey->uuid); + cont_df->cd_ext = umem_zalloc(&tins->ti_umm, sizeof(struct vos_cont_ext_df)); + if (UMOFF_IS_NULL(cont_df->cd_ext)) { + D_ERROR("Failed to allocate cont df extension.\n"); + rc = -DER_NOSPACE; + goto failed; + } + + rc = gc_init_cont(&tins->ti_umm, cont_df); + if (rc) + goto failed; + rc = dbtree_create_inplace_ex(VOS_BTR_OBJ_TABLE, 0, VOS_OBJ_ORDER, &pool->vp_uma, &cont_df->cd_obj_root, DAOS_HDL_INVAL, pool, &hdl); @@ -101,12 +112,13 @@ cont_df_rec_alloc(struct btr_instance *tins, d_iov_t *key_iov, } dbtree_close(hdl); - gc_init_cont(&tins->ti_umm, cont_df); args->ca_cont_df = cont_df; rec->rec_off = offset; return 0; failed: /* Ignore umem_free failure. 
*/ + if (!UMOFF_IS_NULL(cont_df->cd_ext)) + umem_free(&tins->ti_umm, cont_df->cd_ext); + umem_free(&tins->ti_umm, offset); return rc; } @@ -191,6 +203,7 @@ cont_free_internal(struct vos_container *cont) if (!d_list_empty(&cont->vc_gc_link)) d_list_del(&cont->vc_gc_link); + gc_close_cont(cont); for (i = 0; i < VOS_IOS_CNT; i++) { if (cont->vc_hint_ctxt[i]) @@ -384,6 +397,9 @@ vos_cont_open(daos_handle_t poh, uuid_t co_uuid, daos_handle_t *coh) D_INIT_LIST_HEAD(&cont->vc_dtx_act_list); cont->vc_dtx_committed_count = 0; cont->vc_solo_dtx_epoch = d_hlc_get(); + rc = gc_open_cont(cont); + if (rc) + D_GOTO(exit, rc); gc_check_cont(cont); /* Cache this btr object ID in container handle */ diff --git a/src/vos/vos_dtx.c b/src/vos/vos_dtx.c index 0a325088a77..86c100f4739 100644 --- a/src/vos/vos_dtx.c +++ b/src/vos/vos_dtx.c @@ -311,16 +311,38 @@ dtx_act_ent_update(struct btr_instance *tins, struct btr_record *rec, if (unlikely(!dae_old->dae_aborted)) { /* - * XXX: There are two possible reasons for that: - * - * 1. Client resent the RPC but without set 'RESEND' flag. - * 2. Client reused the DTX ID for different modifications. - * - * Currently, the 1st case is more suspected. + * If the new entry and the old entry were for the same transaction, the RPC for + * the new one would carry the 'RESEND' flag, which would have caused the old one + * to be aborted before we get here. So it is quite possible that the new one and + * the old one belong to different transactions. */ - D_ERROR("The TX ID "DF_DTI" may be reused for epoch "DF_X64" vs "DF_X64"\n", - DP_DTI(&DAE_XID(dae_old)), DAE_EPOCH(dae_old), DAE_EPOCH(dae_new)); - return -DER_TX_ID_REUSED; + if (DAE_EPOCH(dae_old) < DAE_EPOCH(dae_new)) { + D_ERROR("The TX ID "DF_DTI" may be reused for epoch "DF_X64" vs "DF_X64"\n", + DP_DTI(&DAE_XID(dae_old)), DAE_EPOCH(dae_old), DAE_EPOCH(dae_new)); + return -DER_TX_ID_REUSED; + } + + /* + * If the old entry has a higher epoch, it is quite possible that the resent RPC + * was handled before the original RPC (corresponding to 'dae_new'). Return + * -DER_INPROGRESS to make the RPC sponsor retry with the 'RESEND' flag; the + * related RPC handler logic will then handle this case. + */ + if (DAE_EPOCH(dae_old) > DAE_EPOCH(dae_new)) { + D_ERROR("Resent RPC may be handled before original one for DTX "DF_DTI + " with epoch "DF_X64" vs "DF_X64"\n", + DP_DTI(&DAE_XID(dae_old)), DAE_EPOCH(dae_old), DAE_EPOCH(dae_new)); + return -DER_INPROGRESS; + } + + /* + * If the two entries use the same epoch, it may be caused by repeated RPCs + * from different sources, such as multiple relay engines forwarding the same + * RPC to the current target. Notify the caller of such a buggy case. + */ + D_ERROR("Receive repeated DTX "DF_DTI" with epoch "DF_X64"\n", + DP_DTI(&DAE_XID(dae_old)), DAE_EPOCH(dae_old)); + return -DER_MISC; } rec->rec_off = umem_ptr2off(&tins->ti_umm, dae_new); @@ -1171,16 +1193,20 @@ vos_dtx_check_availability(daos_handle_t coh, uint32_t entry, } if (intent == DAOS_INTENT_PURGE) { - uint32_t age = d_hlc_age2sec(DAE_XID(dae).dti_hlc); + uint64_t now = daos_gettime_coarse(); /* * The DTX entry still references related data record, * then we cannot (vos) aggregate related data record. + * Report a warning at most once per 10 seconds to avoid flooding the log.
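The three-way epoch comparison introduced in dtx_act_ent_update() above boils down to a small decision rule. The helper below merely restates that rule for clarity; the name is hypothetical, the DER_* codes are the ones used in the hunk, and this is not part of the patch.

static int
dtx_collision_rc(uint64_t old_epoch, uint64_t new_epoch)
{
	if (old_epoch < new_epoch)	/* DTX ID reused for a newer transaction */
		return -DER_TX_ID_REUSED;
	if (old_epoch > new_epoch)	/* resent RPC handled before the original one */
		return -DER_INPROGRESS;
	return -DER_MISC;		/* duplicate RPC with the same epoch */
}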
*/ - if (age >= DAOS_AGG_THRESHOLD) - D_WARN("DTX "DF_DTI" (state:%u, age:%u) still references the data, " - "cannot be (VOS) aggregated\n", - DP_DTI(&DAE_XID(dae)), vos_dtx_status(dae), age); + if (now - cont->vc_agg_busy_ts > 10) { + D_WARN("DTX "DF_DTI" (state:%u, flags:%x, age:%u) still references " + "the modification, cannot be (VOS) aggregated\n", + DP_DTI(&DAE_XID(dae)), vos_dtx_status(dae), DAE_FLAGS(dae), + (unsigned int)d_hlc_age2sec(DAE_XID(dae).dti_hlc)); + cont->vc_agg_busy_ts = now; + } return ALB_AVAILABLE_DIRTY; } @@ -1908,8 +1934,13 @@ vos_dtx_check(daos_handle_t coh, struct dtx_id *dti, daos_epoch_t *epoch, daos_epoch_t e = *epoch; *epoch = DAE_EPOCH(dae); - if (e != 0 && e != DAE_EPOCH(dae)) - return -DER_MISMATCH; + if (e != 0) { + if (e > DAE_EPOCH(dae)) + return -DER_MISMATCH; + + if (e < DAE_EPOCH(dae)) + return -DER_TX_RESTART; + } } return vos_dae_is_prepare(dae) ? DTX_ST_PREPARED : DTX_ST_INITED; @@ -2205,14 +2236,139 @@ vos_dtx_post_handle(struct vos_container *cont, } } +static inline void +dtx_unpin(struct vos_container *cont, struct umem_pin_handle *pin_hdl) +{ + struct vos_pool *pool = vos_cont2pool(cont); + + if (pin_hdl != NULL) + umem_cache_unpin(vos_pool2store(pool), pin_hdl); +} + +static inline int +bkts_add_rec(struct vos_pool *pool, struct vos_bkt_array *bkts, umem_off_t rec_off) +{ + uint32_t bkt_id; + int rc; + + if (UMOFF_IS_NULL(rec_off)) + return 0; + + bkt_id = umem_get_mb_from_offset(vos_pool2umm(pool), rec_off); + if (bkt_id == UMEM_DEFAULT_MBKT_ID) + return 0; + + rc = vos_bkt_array_add(bkts, bkt_id); + if (rc) + DL_ERROR(rc, "Failed to add %u into bucket array.", bkt_id); + + return rc; +} + +static int +bkts_add_dae(struct vos_pool *pool, struct vos_bkt_array *bkts_in, struct vos_dtx_act_ent *dae) +{ + struct vos_bkt_array local_bkts, *bkts; + umem_off_t rec_off; + int i, count, rc = 0; + + vos_bkt_array_init(&local_bkts); + bkts = bkts_in->vba_cnt == 0 ? 
bkts_in : &local_bkts; + + if (dae->dae_records != NULL) { + D_ASSERT(DAE_REC_CNT(dae) > DTX_INLINE_REC_CNT); + + for (i = DAE_REC_CNT(dae) - DTX_INLINE_REC_CNT - 1; i >= 0; i--) { + rec_off = umem_off2offset(dae->dae_records[i]); + rc = bkts_add_rec(pool, bkts, rec_off); + if (rc) + goto out; + } + count = DTX_INLINE_REC_CNT; + } else { + count = DAE_REC_CNT(dae); + } + + for (i = count - 1; i >= 0; i--) { + rec_off = umem_off2offset(DAE_REC_INLINE(dae)[i]); + rc = bkts_add_rec(pool, bkts, rec_off); + if (rc) + goto out; + } + + /* Stop adding the dae when current dae not located in the subset of @bkts_in */ + if (local_bkts.vba_cnt != 0 && !vos_bkt_array_subset(bkts_in, &local_bkts)) + rc = 1; +out: + vos_bkt_array_fini(&local_bkts); + return rc; +} + +static int +dtx_commit_pin(struct vos_container *cont, struct dtx_id dtis[], int count, int *pinned, + struct umem_pin_handle **pin_hdl) +{ + struct vos_dtx_act_ent *dae; + struct vos_bkt_array bkts; + d_iov_t kiov, riov; + int i, rc; + + *pinned = count; + *pin_hdl = NULL; + + if (!vos_pool_is_evictable(vos_cont2pool(cont))) + return 0; + + vos_bkt_array_init(&bkts); + + for (i = 0; i < count; i++) { + d_iov_set(&kiov, &dtis[i], sizeof(struct dtx_id)); + d_iov_set(&riov, NULL, 0); + + rc = dbtree_lookup(cont->vc_dtx_active_hdl, &kiov, &riov); + if (rc == -DER_NONEXIST) { + rc = 0; + continue; + } else if (rc) { + DL_ERROR(rc, "Failed to lookup DTX active table."); + goto out; + } + + dae = riov.iov_buf; + D_ASSERT(dae->dae_preparing == 0); + + if (vos_dae_is_abort(dae) || dae->dae_committed || dae->dae_committing || + dae->dae_need_release == 0) + continue; + + rc = bkts_add_dae(vos_cont2pool(cont), &bkts, dae); + if (rc) { + if (rc < 0) { + DL_ERROR(rc, "Failed to add DTX to bucket array."); + goto out; + } + *pinned = i; + break; + } + } + + rc = vos_bkt_array_pin(vos_cont2pool(cont), &bkts, pin_hdl); + if (rc) + DL_ERROR(rc, "Failed to pin buckets."); +out: + vos_bkt_array_fini(&bkts); + return rc; +} + int vos_dtx_commit(daos_handle_t coh, struct dtx_id dtis[], int count, bool rm_cos[]) { struct vos_dtx_act_ent **daes = NULL; struct vos_dtx_cmt_ent **dces = NULL; struct vos_container *cont; - int committed = 0; - int rc = 0; + struct umem_pin_handle *pin_hdl; + int tot_committed = 0, committed, pinned; + int idx = 0, rc = 0; D_ASSERT(count > 0); @@ -2227,24 +2383,73 @@ vos_dtx_commit(daos_handle_t coh, struct dtx_id dtis[], int count, bool rm_cos[] cont = vos_hdl2cont(coh); D_ASSERT(cont != NULL); +pin_objects: + rc = dtx_commit_pin(cont, &dtis[idx], count, &pinned, &pin_hdl); + if (rc) { + DL_ERROR(rc, "Pin objects failed before DTX commit."); + goto out; + } + + D_ASSERT(pinned > 0 && pinned <= count); + count -= pinned; + /* Commit multiple DTXs via single local transaction. */ rc = umem_tx_begin(vos_cont2umm(cont), NULL); if (rc == 0) { - committed = vos_dtx_commit_internal(cont, dtis, count, 0, rm_cos, daes, dces); + committed = vos_dtx_commit_internal(cont, &dtis[idx], pinned, 0, + rm_cos != NULL ? &rm_cos[idx] : NULL, + &daes[idx], &dces[idx]); if (committed >= 0) { rc = umem_tx_commit(vos_cont2umm(cont)); D_ASSERT(rc == 0); + tot_committed += committed; } else { rc = umem_tx_abort(vos_cont2umm(cont), committed); } - vos_dtx_post_handle(cont, daes, dces, count, false, rc != 0); + vos_dtx_post_handle(cont, &daes[idx], &dces[idx], pinned, false, rc != 0); } + dtx_unpin(cont, pin_hdl); + + if (count > 0) { + idx += pinned; + goto pin_objects; + } out: D_FREE(daes); D_FREE(dces); - return rc < 0 ? rc : committed; + return rc < 0 ? 
rc : tot_committed; +} + +static int +dtx_abort_pin(struct vos_container *cont, struct vos_dtx_act_ent *dae, + struct umem_pin_handle **pin_hdl) +{ + struct vos_bkt_array bkts; + int rc; + + if (!vos_pool_is_evictable(vos_cont2pool(cont))) + return 0; + + if (dae->dae_need_release == 0) + return 0; + + vos_bkt_array_init(&bkts); + rc = bkts_add_dae(vos_cont2pool(cont), &bkts, dae); + if (rc) { + D_ASSERT(rc < 0); + DL_ERROR(rc, "Failed to add DTX to buckets."); + goto out; + } + + rc = vos_bkt_array_pin(vos_cont2pool(cont), &bkts, pin_hdl); + if (rc) + DL_ERROR(rc, "Failed to pin buckets."); +out: + vos_bkt_array_fini(&bkts); + return rc; + } int @@ -2252,8 +2457,13 @@ vos_dtx_abort_internal(struct vos_container *cont, struct vos_dtx_act_ent *dae, { struct dtx_handle *dth = dae->dae_dth; struct umem_instance *umm; + struct umem_pin_handle *pin_hdl = NULL; int rc; + rc = dtx_abort_pin(cont, dae, &pin_hdl); + if (rc) + goto out; + umm = vos_cont2umm(cont); rc = umem_tx_begin(umm, NULL); if (rc != 0) @@ -2294,6 +2504,8 @@ vos_dtx_abort_internal(struct vos_container *cont, struct vos_dtx_act_ent *dae, */ out: + dtx_unpin(cont, pin_hdl); + if (rc == 0 || force) vos_dtx_post_handle(cont, &dae, NULL, 1, true, false); else if (rc != 0) @@ -3044,6 +3256,11 @@ vos_dtx_attach(struct dtx_handle *dth, bool persistent, bool exist) } } + /* + * Doesn't need to pin the object before starting tx, since the DTX commit from + * following vos_dtx_prepared() is for read-only DTX transaction, no object data + * will be accessed during DTX commit. + */ if (persistent) { rc = umem_tx_begin(umm, NULL); if (rc != 0) @@ -3298,6 +3515,13 @@ vos_dtx_local_begin(struct dtx_handle *dth, daos_handle_t poh) pool = vos_hdl2pool(poh); umm = vos_pool2umm(pool); + if (vos_pool_is_evictable(pool)) { + D_ERROR("VOS local tx doesn't support evictable pool:"DF_UUID"\n", + DP_UUID(pool->vp_id)); + rc = -DER_NOTSUPPORTED; + goto error; + } + rc = vos_tx_begin(dth, umm, pool->vp_sysdb); if (rc != 0) { D_ERROR("Failed to start transaction: rc=" DF_RC "\n", DP_RC(rc)); diff --git a/src/vos/vos_gc.c b/src/vos/vos_gc.c index 0937b883f33..5d5383ed766 100644 --- a/src/vos/vos_gc.c +++ b/src/vos/vos_gc.c @@ -1,5 +1,5 @@ /** - * (C) Copyright 2019-2023 Intel Corporation. + * (C) Copyright 2019-2024 Intel Corporation. 
* * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -11,6 +11,7 @@ #define D_LOGFAC DD_FAC(vos) #include +#include #include #include #include "vos_internal.h" @@ -74,12 +75,26 @@ struct vos_gc { */ static int gc_drain_btr(struct vos_gc *gc, struct vos_pool *pool, daos_handle_t coh, - struct btr_root *root, int *credits, bool *empty) + struct vos_gc_item *item, struct btr_root *root, int *credits, bool *empty) { - daos_handle_t toh; - int rc; + struct vos_object dummy_obj = { 0 }; + struct vos_container dummy_cont = { 0 }; + daos_handle_t toh; + void *priv; + int rc, i; + + if (gc->gc_type == GC_CONT) { + priv = pool; + } else { + dummy_cont.vc_pool = pool; + dummy_obj.obj_cont = &dummy_cont; + dummy_obj.obj_bkt_alloted = 1; + for (i = 0; i < VOS_GC_BKTS_MAX; i++) + dummy_obj.obj_bkt_ids[i] = item->it_bkt_ids[i]; + priv = &dummy_obj; + } - rc = dbtree_open_inplace_ex(root, &pool->vp_uma, coh, pool, &toh); + rc = dbtree_open_inplace_ex(root, &pool->vp_uma, coh, priv, &toh); if (rc == -DER_NONEXIST) { /* empty tree */ *empty = true; return 0; @@ -115,7 +130,7 @@ gc_drain_evt(struct vos_gc *gc, struct vos_pool *pool, daos_handle_t coh, daos_handle_t toh; int rc; - vos_evt_desc_cbs_init(&cbs, pool, coh); + vos_evt_desc_cbs_init(&cbs, pool, coh, NULL); rc = evt_open(root, &pool->vp_uma, &cbs, &toh); if (rc == -DER_NONEXIST) { *empty = true; @@ -126,7 +141,7 @@ gc_drain_evt(struct vos_gc *gc, struct vos_pool *pool, daos_handle_t coh, D_DEBUG(DB_TRACE, "drain %s evtree, creds=%d\n", gc->gc_name, *credits); rc = evt_drain(toh, credits, empty); - D_ASSERT(evt_close(toh) == 0); + evt_close(toh); if (rc) goto failed; @@ -160,7 +175,7 @@ gc_drain_key(struct vos_gc *gc, struct vos_pool *pool, daos_handle_t coh, } if (key->kr_bmap & KREC_BF_BTR) { - rc = gc_drain_btr(gc, pool, coh, &key->kr_btr, credits, empty); + rc = gc_drain_btr(gc, pool, coh, item, &key->kr_btr, credits, empty); } else if (key->kr_bmap & KREC_BF_EVT) { D_ASSERT(gc->gc_type == GC_AKEY); @@ -195,7 +210,7 @@ gc_free_dkey(struct vos_gc *gc, struct vos_pool *pool, daos_handle_t coh, struct D_ASSERT(krec->kr_bmap & KREC_BF_DKEY); if (krec->kr_bmap & KREC_BF_NO_AKEY) - gc_add_item(pool, coh, GC_AKEY, item->it_addr, item->it_args); + gc_add_item(pool, coh, GC_AKEY, item->it_addr, &item->it_bkt_ids[0]); else umem_free(&pool->vp_umm, item->it_addr); return 0; @@ -211,7 +226,7 @@ gc_drain_obj(struct vos_gc *gc, struct vos_pool *pool, daos_handle_t coh, { struct vos_obj_df *obj = umem_off2ptr(&pool->vp_umm, item->it_addr); - return gc_drain_btr(gc, pool, coh, &obj->vo_tree, credits, empty); + return gc_drain_btr(gc, pool, coh, item, &obj->vo_tree, credits, empty); } static int @@ -294,20 +309,29 @@ gc_drain_cont(struct vos_gc *gc, struct vos_pool *pool, daos_handle_t coh, return rc; /** Indicate to caller that we've taken over container bags */ - return 1; + if (!vos_pool_is_evictable(pool)) + return 1; } D_ASSERT(daos_handle_is_inval(coh)); - return gc_drain_btr(gc, pool, coh, &cont->cd_obj_root, - credits, empty); + return gc_drain_btr(gc, pool, coh, item, &cont->cd_obj_root, credits, empty); } static int gc_free_cont(struct vos_gc *gc, struct vos_pool *pool, daos_handle_t coh, struct vos_gc_item *item) { - int rc; + struct vos_cont_df *cd = umem_off2ptr(&pool->vp_umm, item->it_addr); + int rc; - rc = vos_dtx_table_destroy(&pool->vp_umm, umem_off2ptr(&pool->vp_umm, item->it_addr)); + if (!UMOFF_IS_NULL(cd->cd_ext)) { + rc = umem_free(&pool->vp_umm, cd->cd_ext); + if (rc) { + DL_ERROR(rc, "Failed to free cont_df extension"); + return rc; + } 
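/*
 * Illustrative sketch (not part of this patch): with gc_add_item() now taking a
 * bucket-ID array, callers that free object data forward the IDs recorded in the
 * durable entry (as gc_free_dkey() above does with &item->it_bkt_ids[0]), while
 * callers with no evict-able bucket association pass NULL and the new item is
 * tagged with UMEM_DEFAULT_MBKT_ID. queue_cont_for_gc() below is a hypothetical
 * helper, shown only to demonstrate the NULL case.
 */
static int
queue_cont_for_gc(struct vos_pool *pool, umem_off_t cont_off)
{
	/* Containers always live in the default (non-evictable) bucket */
	return gc_add_item(pool, DAOS_HDL_INVAL, GC_CONT, cont_off, NULL);
}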
+ } + + rc = vos_dtx_table_destroy(&pool->vp_umm, cd); if (rc == 0) rc = umem_free(&pool->vp_umm, item->it_addr); @@ -369,19 +393,102 @@ gc_type2bin(struct vos_pool *pool, struct vos_container *cont, return &cont->vc_cont_df->cd_gc_bins[type]; } +static int +gc_bkt2bins(uint32_t *bkt_id, struct vos_gc_info *gc_info, bool create, bool try_next, + struct vos_gc_bin_df **bins_ret) +{ + struct vos_gc_bin_df dummy_bins[GC_CONT]; + d_iov_t key, key_out, val, val_out; + uint64_t *new_id, key_id = *bkt_id; + int probe_op = try_next ? BTR_PROBE_FIRST : BTR_PROBE_EQ; + int i, rc; + + D_ASSERT(try_next || *bkt_id != UMEM_DEFAULT_MBKT_ID); + D_ASSERT(daos_handle_is_valid(gc_info->gi_bins_btr)); + + /* Fetch the in-tree record */ + d_iov_set(&key, &key_id, sizeof(key_id)); + d_iov_set(&key_out, NULL, 0); + d_iov_set(&val_out, NULL, 0); + + rc = dbtree_fetch(gc_info->gi_bins_btr, probe_op, DAOS_INTENT_DEFAULT, &key, + &key_out, &val_out); + if (rc && rc != -DER_NONEXIST) { + DL_ERROR(rc, "Failed to lookup GC bins for bkt_id:%u", *bkt_id); + return rc; + } + + if (rc == 0) { + *bins_ret = (struct vos_gc_bin_df *)val_out.iov_buf; + new_id = (uint64_t *)key_out.iov_buf; + D_ASSERT(new_id && (try_next || *bkt_id == *new_id)); + *bkt_id = (uint32_t)*new_id; + } else if (create) { + D_ASSERT(!try_next); + memset(&dummy_bins[0], 0, sizeof(dummy_bins)); + for (i = 0; i < GC_CONT; i++) { + dummy_bins[i].bin_bag_first = UMOFF_NULL; + dummy_bins[i].bin_bag_last = UMOFF_NULL; + dummy_bins[i].bin_bag_size = gc_bag_size; + dummy_bins[i].bin_bag_nr = 0; + } + + d_iov_set(&val, &dummy_bins[0], sizeof(dummy_bins)); + d_iov_set(&val_out, NULL, 0); + + rc = dbtree_upsert(gc_info->gi_bins_btr, BTR_PROBE_BYPASS, DAOS_INTENT_UPDATE, + &key, &val, &val_out); + if (rc != 0) { + DL_ERROR(rc, "Failed to insert GC bins for bkt_id:%u", *bkt_id); + return rc; + } + *bins_ret = (struct vos_gc_bin_df *)val_out.iov_buf; + } + + return rc; +} + +static int +gc_get_bin(struct vos_pool *pool, struct vos_container *cont, enum vos_gc_type type, + uint32_t bkt_id, struct vos_gc_bin_df **bin_df) +{ + struct vos_gc_bin_df *bins = NULL; + int rc; + + D_ASSERT(type < GC_MAX); + if (!vos_pool_is_evictable(pool) || bkt_id == UMEM_DEFAULT_MBKT_ID) { + *bin_df = gc_type2bin(pool, cont, type); + return 0; + } + + D_ASSERT(type < GC_CONT); + if (cont == NULL) + rc = gc_bkt2bins(&bkt_id, &pool->vp_gc_info, true, false, &bins); + else + rc = gc_bkt2bins(&bkt_id, &cont->vc_gc_info, true, false, &bins); + + if (rc == 0) { + D_ASSERT(bins != NULL); + *bin_df = &bins[type]; + } + + return rc; +} + /** * Free the first (oldest) garbage bag of a garbage bin unless it is also the * last (newest) bag. 
*/ static int -gc_bin_free_bag(struct umem_instance *umm, struct vos_container *cont, - struct vos_gc_bin_df *bin, umem_off_t bag_id) +gc_bin_free_bag(struct umem_instance *umm, struct vos_gc_bin_df *bin, umem_off_t bag_id, + bool free_last_bag) + { struct vos_gc_bag_df *bag = umem_off2ptr(umm, bag_id); int rc; D_ASSERT(bag_id == bin->bin_bag_first); - if (cont == NULL && bag_id == bin->bin_bag_last) { + if (!free_last_bag && bag_id == bin->bin_bag_last) { /* don't free the last bag, only reset it */ D_ASSERT(bin->bin_bag_nr == 1); rc = umem_tx_add_ptr(umm, bag, sizeof(*bag)); @@ -393,7 +500,7 @@ gc_bin_free_bag(struct umem_instance *umm, struct vos_container *cont, return rc; } - if (cont != NULL) { + if (free_last_bag) { D_ASSERT(bin->bin_bag_nr > 0); } else { D_ASSERT(bin->bin_bag_nr > 1); @@ -494,11 +601,10 @@ gc_bin_add_item(struct umem_instance *umm, struct vos_gc_bin_df *bin, return rc; } -static struct vos_gc_item * -gc_get_item(struct vos_gc *gc, struct vos_pool *pool, - struct vos_container *cont) + +static inline struct vos_gc_item * +bin_get_item(struct vos_pool *pool, struct vos_gc_bin_df *bin) { - struct vos_gc_bin_df *bin = gc_type2bin(pool, cont, gc->gc_type); struct vos_gc_bag_df *bag; bag = umem_off2ptr(&pool->vp_umm, bin->bin_bag_first); @@ -513,6 +619,14 @@ gc_get_item(struct vos_gc *gc, struct vos_pool *pool, return &bag->bag_items[bag->bag_item_first]; } +static inline struct vos_gc_item * +gc_get_item(struct vos_gc *gc, struct vos_pool *pool, struct vos_container *cont) +{ + struct vos_gc_bin_df *bin = gc_type2bin(pool, cont, gc->gc_type); + + return bin_get_item(pool, bin); +} + static int gc_drain_item(struct vos_gc *gc, struct vos_pool *pool, daos_handle_t coh, struct vos_gc_item *item, int *credits, bool *empty) @@ -554,10 +668,9 @@ gc_drain_item(struct vos_gc *gc, struct vos_pool *pool, daos_handle_t coh, } static int -gc_free_item(struct vos_gc *gc, struct vos_pool *pool, - struct vos_container *cont, struct vos_gc_item *item) +gc_free_item(struct vos_gc *gc, struct vos_pool *pool, struct vos_container *cont, + struct vos_gc_item *item, struct vos_gc_bin_df *bin) { - struct vos_gc_bin_df *bin = gc_type2bin(pool, cont, gc->gc_type); struct vos_gc_bag_df *bag; int first; struct vos_gc_item it; @@ -575,8 +688,8 @@ gc_free_item(struct vos_gc *gc, struct vos_pool *pool, if (first == bag->bag_item_last) { /* it's going to be a empty bag */ D_ASSERT(bag->bag_item_nr == 1); - rc = gc_bin_free_bag(&pool->vp_umm, cont, bin, - bin->bin_bag_first); + rc = gc_bin_free_bag(&pool->vp_umm, bin, bin->bin_bag_first, + (cont != NULL || item->it_bkt_ids[0] != UMEM_DEFAULT_MBKT_ID)); if (rc) goto failed; } else { @@ -627,12 +740,12 @@ gc_free_item(struct vos_gc *gc, struct vos_pool *pool, */ int gc_add_item(struct vos_pool *pool, daos_handle_t coh, - enum vos_gc_type type, umem_off_t item_off, uint64_t args) + enum vos_gc_type type, umem_off_t item_off, uint32_t *bkt_ids) { struct vos_container *cont = vos_hdl2cont(coh); - struct vos_gc_bin_df *bin = gc_type2bin(pool, cont, type); + struct vos_gc_bin_df *bin; struct vos_gc_item item; - int rc; + int rc, i; D_DEBUG(DB_TRACE, "Add %s addr="DF_X64"\n", gc_type2name(type), item_off); @@ -641,7 +754,16 @@ gc_add_item(struct vos_pool *pool, daos_handle_t coh, return 0; /* OK to ignore because the pool is being deleted */ item.it_addr = item_off; - item.it_args = args; + for (i = 0; i < VOS_GC_BKTS_MAX; i++) + item.it_bkt_ids[i] = bkt_ids ? 
bkt_ids[i] : UMEM_DEFAULT_MBKT_ID; + + rc = gc_get_bin(pool, cont, type, item.it_bkt_ids[0], &bin); + if (rc) { + DL_ERROR(rc, "Failed to get GC bin for type:%d, bkt_id:%u", + type, item.it_bkt_ids[0]); + return rc; + } + rc = gc_bin_add_item(&pool->vp_umm, bin, &item); if (rc) { D_ERROR("Failed to add item, pool=" DF_UUID ", rc=" DF_RC "\n", @@ -711,6 +833,7 @@ gc_reclaim_pool(struct vos_pool *pool, int *credits, bool *empty_ret) { struct vos_container *cont = gc_get_container(pool); struct vos_gc *gc = &gc_table[0]; /* start from akey */ + struct vos_gc_bin_df *bin; int creds = *credits; int rc; @@ -777,8 +900,9 @@ gc_reclaim_pool(struct vos_pool *pool, int *credits, bool *empty_ret) } if (empty && creds) { + bin = gc_type2bin(pool, cont, gc->gc_type); /* item can be released and removed from bin */ - rc = gc_free_item(gc, pool, cont, item); + rc = gc_free_item(gc, pool, cont, item, bin); if (rc) { D_ERROR("GC=%s free item error: "DF_RC"\n", gc->gc_name, DP_RC(rc)); break; @@ -812,7 +936,7 @@ gc_reclaim_pool(struct vos_pool *pool, int *credits, bool *empty_ret) "pool="DF_UUID", creds origin=%d, current=%d, rc=%s\n", DP_UUID(pool->vp_id), *credits, creds, d_errstr(rc)); - rc = umem_tx_end(&pool->vp_umm, rc); + rc = umem_tx_end(&pool->vp_umm, rc < 0 ? rc : 0); if (rc == 0) *credits = creds; @@ -833,6 +957,592 @@ gc_reclaim_pool(struct vos_pool *pool, int *credits, bool *empty_ret) return rc; } +static inline bool +bins_empty(struct vos_pool *pool, struct vos_gc_bin_df *bins) +{ + int i; + + for (i = 0; i < GC_CONT; i++) { + if (bin_get_item(pool, &bins[i]) != NULL) + return false; + } + return true; +} + +/* Add gc_bin[GC_CONT] from container bucket tree to pool bucket tree */ +static int +gc_add_bins(struct vos_pool *pool, struct vos_gc_bin_df *src_bins, uint32_t bkt_id) +{ + struct vos_gc_bin_df *dst_bins, dummy_bins[GC_CONT]; + daos_handle_t pool_btr = pool->vp_gc_info.gi_bins_btr; + d_iov_t key, val, val_out; + uint64_t key_id = bkt_id; + int i, rc, added = 0; + + D_ASSERT(daos_handle_is_valid(pool_btr)); + /* Fetch the in-tree record from pool */ + d_iov_set(&key, &key_id, sizeof(key_id)); + d_iov_set(&val_out, NULL, 0); + + rc = dbtree_fetch(pool_btr, BTR_PROBE_EQ, DAOS_INTENT_DEFAULT, &key, NULL, &val_out); + if (rc == -DER_NONEXIST) { + d_iov_set(&val, src_bins, sizeof(dummy_bins)); + rc = dbtree_upsert(pool_btr, BTR_PROBE_BYPASS, DAOS_INTENT_UPDATE, &key, &val, NULL); + if (rc) + DL_ERROR(rc, "Failed to add bins for bkt_id:%u", bkt_id); + return rc; + } else if (rc) { + DL_ERROR(rc, "Failed to fetch bins from pool bucket tree for bkt_id:%u", bkt_id); + return rc; + } + + dst_bins = (struct vos_gc_bin_df *)val_out.iov_buf; + D_ASSERT(dst_bins && !bins_empty(pool, dst_bins)); + + for (i = GC_AKEY; i < GC_CONT; i++) { + if (src_bins[i].bin_bag_first == UMOFF_NULL) + continue; + + rc = gc_bags_move(pool, &dst_bins[i], &src_bins[i]); + if (rc != 0) { + DL_ERROR(rc, "Failed to move bags for bkt_id:%u, type:%d", bkt_id, i); + return rc; + } + added++; + } + + D_ASSERT(added > 0); + return 0; +} + +static int +gc_move_bins(struct vos_pool *pool, struct vos_gc_item *item, int *credits, bool *empty_ret) +{ + struct umem_instance *umm = &pool->vp_umm; + struct umem_attr *uma = &pool->vp_uma; + struct vos_cont_df *cd = umem_off2ptr(umm, item->it_addr); + struct vos_cont_ext_df *cd_ext = umem_off2ptr(umm, cd->cd_ext); + daos_handle_t cont_btr; + d_iov_t key, key_out, val_out; + uint64_t key_id = UMEM_DEFAULT_MBKT_ID; + struct vos_gc_bin_df *bins; + uint64_t *bkt_id; + int rc, creds = *credits, 
moved = 0; + + D_ASSERT(cd_ext != NULL); + rc = dbtree_open_inplace(&cd_ext->ced_gc_bkt.gd_bins_root, uma, &cont_btr); + if (rc == -DER_NONEXIST) { + *empty_ret = true; + return 0; + } else if (rc) { + DL_ERROR(rc, "Failed to open container bucket tree."); + return rc; + } + D_ASSERT(daos_handle_is_valid(cont_btr)); + + *empty_ret = false; + while (creds > 0) { + /* Fetch the in-tree record from container */ + d_iov_set(&key, &key_id, sizeof(key_id)); + d_iov_set(&key_out, NULL, 0); + d_iov_set(&val_out, NULL, 0); + + rc = dbtree_fetch(cont_btr, BTR_PROBE_GE, DAOS_INTENT_DEFAULT, + &key, &key_out, &val_out); + if (rc == -DER_NONEXIST) { + *empty_ret = true; + rc = 0; + break; + } else if (rc) { + DL_ERROR(rc, "Failed to fetch bins from container bucket tree."); + break; + } + + bins = (struct vos_gc_bin_df *)val_out.iov_buf; + D_ASSERT(bins && !bins_empty(pool, bins)); + bkt_id = (uint64_t *)key_out.iov_buf; + D_ASSERT(bkt_id && *bkt_id != UMEM_DEFAULT_MBKT_ID); + + rc = gc_add_bins(pool, bins, (uint32_t)*bkt_id); + if (rc) + break; + + rc = dbtree_delete(cont_btr, BTR_PROBE_BYPASS, &key_out, NULL); + if (rc) { + DL_ERROR(rc, "Failed to delete bins from container bucket tree."); + break; + } + + moved++; + /* Consume 1 user credit on moving 8 gc_bin[GC_CONT] */ + if (moved % 8 == 0) + creds--; + } + + if (*empty_ret) + dbtree_destroy(cont_btr, NULL); + else + dbtree_close(cont_btr); + + if (rc == 0) + *credits = creds; + + return rc; +} + +static int +gc_flatten_cont(struct vos_pool *pool, int *credits) +{ + struct vos_gc *gc = &gc_table[GC_CONT]; + struct vos_gc_item *item; + struct vos_gc_bin_df *bin; + int creds = *credits; + int rc = 0, flattened = 0; + + while (creds > 0) { + bool empty = false; + + item = gc_get_item(gc, pool, NULL); + if (item == NULL) /* No containers to be flattened */ + break; + + /* Move all gc_bin[GC_CONT] from container to pool */ + rc = gc_move_bins(pool, item, &creds, &empty); + if (rc) { + DL_ERROR(rc, "GC move bins failed."); + break; + } + + if (!empty) { + D_ASSERT(creds == 0); + break; + } + + if (creds == 0) + break; + + empty = false; + /* Container drain doesn't consume user credits */ + rc = gc_drain_item(gc, pool, DAOS_HDL_INVAL, item, NULL, &empty); + if (rc) { + D_ASSERT(rc < 0); + DL_ERROR(rc, "GC drain %s failed.", gc->gc_name); + break; + } + + flattened++; + /* Consume 1 user credit on flattening every 8 objects */ + if (flattened % 8 == 0) + creds--; + + /* The container is flattened, free the gc_item */ + if (empty && creds) { + bin = gc_type2bin(pool, NULL, gc->gc_type); + rc = gc_free_item(gc, pool, NULL, item, bin); + if (rc) { + DL_ERROR(rc, "GC free %s item failed.", gc->gc_name); + break; + } + creds--; + } + } + + if (rc == 0) + *credits = creds; + + return rc; +} + +static int +bkt_get_bins(struct vos_pool *pool, struct vos_container *cont, uint32_t *bkt_id, bool try_next, + struct vos_gc_bin_df **bins_ret) +{ + struct vos_gc_info *gc_info; + struct vos_gc_bin_df *bins = NULL; + int rc; + + if (*bkt_id == UMEM_DEFAULT_MBKT_ID || try_next) { + if (cont != NULL) + bins = &cont->vc_cont_df->cd_gc_bins[0]; + else + bins = &pool->vp_pool_df->pd_gc_bins[0]; + + if (!bins_empty(pool, bins)) { + *bkt_id = UMEM_DEFAULT_MBKT_ID; + *bins_ret = bins; + return 0; + } else if (!try_next) { + return -DER_NONEXIST; + } + } + + gc_info = (cont != NULL) ? 
&cont->vc_gc_info : &pool->vp_gc_info; + rc = gc_bkt2bins(bkt_id, gc_info, false, try_next, &bins); + if (rc) + return rc; + + D_ASSERT(bins && !bins_empty(pool, bins)); + *bins_ret = bins; + + return 0; +} + +static inline bool +cont_bins_empty(struct vos_pool *pool, struct vos_container *cont) +{ + struct vos_gc_bin_df *bins = &cont->vc_cont_df->cd_gc_bins[0]; + + if (!bins_empty(pool, bins)) + return false; + + D_ASSERT(daos_handle_is_valid(cont->vc_gc_info.gi_bins_btr)); + if (!dbtree_is_empty(cont->vc_gc_info.gi_bins_btr)) + return false; + + return true; +} + +/* + * Return non-empty gc_bin[GC_CONT] with specified bucket ID, different bucket ID + * could be returned if there is nothing to be reclaimed on the specified bucket. + */ +static int +gc_get_bkt(struct vos_pool *pool, struct vos_container **cont_in, uint32_t *bkt_id, + struct vos_gc_bin_df **bins_ret) +{ + struct vos_container *cont, *tmp; + bool try_next = false; + int rc; + +switch_bkt: + /* Find non-empty gc_bin[GC_CONT] from containers */ + d_list_for_each_entry_safe(cont, tmp, &pool->vp_gc_cont, vc_gc_link) { + if (cont_bins_empty(pool, cont)) { + d_list_del_init(&cont->vc_gc_link); + continue; + } + + rc = bkt_get_bins(pool, cont, bkt_id, try_next, bins_ret); + if ((rc && rc != -DER_NONEXIST) || rc == 0) + goto done; + } + + /* Find satisfied gc_bin[GC_CONT] from pool */ + cont = NULL; + rc = bkt_get_bins(pool, NULL, bkt_id, try_next, bins_ret); + if ((rc && rc != -DER_NONEXIST) || rc == 0) + goto done; + + if (!try_next) { + try_next = true; + goto switch_bkt; + } +done: + if (*cont_in) { + vos_cont_decref(*cont_in); + *cont_in = NULL; + } + + if (rc == 0 && cont) { + vos_cont_addref(cont); + *cont_in = cont; + /* Keep fairness */ + d_list_del_init(&cont->vc_gc_link); + d_list_add_tail(&cont->vc_gc_link, &pool->vp_gc_cont); + } + + return rc; +} + +static int +gc_reclaim_bins(struct vos_pool *pool, struct vos_container *cont, + struct vos_gc_bin_df *bins, int *credits) +{ + struct vos_gc *gc = &gc_table[0]; /* Start from akey */ + struct vos_gc_item *item; + int rc = 0, creds = *credits; + + while (creds > 0) { + bool empty = false; + + D_ASSERT(gc->gc_type < GC_CONT); + item = bin_get_item(pool, &bins[gc->gc_type]); + if (item == NULL) { + if (gc->gc_type == GC_OBJ) /* hit the top level */ + break; + + /* Try upper level */ + gc++; + continue; + } + + rc = gc_drain_item(gc, pool, vos_cont2hdl(cont), item, &creds, &empty); + if (rc < 0) { + DL_ERROR(rc, "GC drain %s failed.", gc->gc_name); + break; + } + + if (empty && creds) { + rc = gc_free_item(gc, pool, cont, item, &bins[gc->gc_type]); + if (rc) { + DL_ERROR(rc, "GC free %s item failed.", gc->gc_name); + break; + } + creds--; + } + + /* always try to free akeys and values because they are the + * items consuming most storage space. + */ + if (gc->gc_type == GC_AKEY) + continue; + + /* should have flattened some items to the child GC, switch + * to the child GC. + */ + gc--; + } + + if (rc == 0) + *credits = creds; + + return rc; +} + +static int +gc_delete_bins(struct vos_pool *pool, struct vos_container *cont, uint32_t bkt_id) +{ + struct vos_gc_bin_df *bins; + struct vos_gc_info *gc_info; + d_iov_t key, val_out; + uint64_t key_id = bkt_id; + int rc; + + if (bkt_id == UMEM_DEFAULT_MBKT_ID) + return 0; + + gc_info = (cont != NULL) ? 
&cont->vc_gc_info : &pool->vp_gc_info; + D_ASSERT(daos_handle_is_valid(gc_info->gi_bins_btr)); + + /* Fetch the in-tree record */ + d_iov_set(&key, &key_id, sizeof(key_id)); + d_iov_set(&val_out, NULL, 0); + + rc = dbtree_fetch(gc_info->gi_bins_btr, BTR_PROBE_EQ, DAOS_INTENT_DEFAULT, &key, + NULL, &val_out); + if (rc) { + DL_ERROR(rc, "Failed to lookup GC bins for bkt_id:%u", bkt_id); + return rc; + } + + bins = (struct vos_gc_bin_df *)val_out.iov_buf; + D_ASSERT(bins && bins_empty(pool, bins)); + + rc = dbtree_delete(gc_info->gi_bins_btr, BTR_PROBE_BYPASS, &key, NULL); + if (rc) + DL_ERROR(rc, "Failed to delete GC bins for bkt_id:%u", bkt_id); + + return rc; +} + +static int +gc_reclaim_pool_p2(struct vos_pool *pool, int *credits, bool *empty_ret) +{ + struct vos_container *cont = NULL; + struct vos_gc_bin_df *bins = NULL; + struct vos_gc_info *gc_info = &pool->vp_gc_info; + uint32_t bkt = gc_info->gi_last_pinned, pinned_bkt = UMEM_DEFAULT_MBKT_ID; + struct umem_pin_handle *pin_hdl = NULL; + struct umem_cache_range rg; + bool tx_started = false; + int creds = *credits, rc = 0; + + if (pool->vp_dying) { + *empty_ret = true; + return rc; + } + + *empty_ret = false; + while(creds > 0) { + if (bkt != UMEM_DEFAULT_MBKT_ID && bkt != pinned_bkt) { + if (tx_started) { + tx_started = false; + rc = umem_tx_end(&pool->vp_umm, 0); + if (rc) { + DL_ERROR(rc, "Failed to commit GC tx."); + break; + } + } + + if (pin_hdl != NULL) { + umem_cache_unpin(vos_pool2store(pool), pin_hdl); + pin_hdl = NULL; + } + + rg.cr_off = umem_get_mb_base_offset(vos_pool2umm(pool), bkt); + rg.cr_size = vos_pool2store(pool)->cache->ca_page_sz; + + rc = vos_cache_pin(pool, &rg, 1, false, &pin_hdl); + if (rc) { + DL_ERROR(rc, "Failed to pin bucket %u.", bkt); + break; + } + pinned_bkt = bkt; + gc_info->gi_last_pinned = pinned_bkt; + } + + if (!tx_started) { + rc = umem_tx_begin(&pool->vp_umm, NULL); + if (rc) { + DL_ERROR(rc, "Failed to start tx for pool:"DF_UUID".", + DP_UUID(pool->vp_id)); + break; + } + tx_started = true; + } + + /* Flatten all containers first */ + rc = gc_flatten_cont(pool, &creds); + if (rc < 0) { + DL_ERROR(rc, "GC flatten cont failed."); + break; + } + + /* Container flattening used up all user credits */ + if (creds == 0) + break; + + /* + * Pick gc_bin[GC_CONT] by bucket ID, the bucket ID could be switched if + * there is nothing to be reclaimed for the specified ID + */ + rc = gc_get_bkt(pool, &cont, &bkt, &bins); + if (rc == -DER_NONEXIST) { + *empty_ret = true; + rc = 0; + break; + } else if (rc) { + DL_ERROR(rc, "Failed to get GC bkt bins for bkt_id:%u", bkt); + break; + } + + /* Bucket ID is switched, need to unpin current bucket then pin the new bucket */ + if (bkt != UMEM_DEFAULT_MBKT_ID && bkt != pinned_bkt) + continue; + + rc = gc_reclaim_bins(pool, cont, bins, &creds); + if (rc) { + DL_ERROR(rc, "GC reclaim bins for bkt_id:%u failed.", bkt); + break; + } + + if (bins_empty(pool, bins)) { + /* The gc_bin[GC_CONT] is empty, delete it to condense the bucket tree */ + rc = gc_delete_bins(pool, cont, bkt); + if (rc) { + DL_ERROR(rc, "GC delete bins for bkt_id:%u failed.", bkt); + break; + } + } + } + + if (tx_started) { + rc = umem_tx_end(&pool->vp_umm, rc); + if (rc) + DL_ERROR(rc, "Failed to commit GC tx."); + } + + if (pin_hdl != NULL) { + umem_cache_unpin(vos_pool2store(pool), pin_hdl); + pin_hdl = NULL; + } + + if (cont != NULL) + vos_cont_decref(cont); + + if (rc == 0) + *credits = creds; + + gc_update_stats(pool); + return rc; +} + +static inline void +gc_close_bkt(struct vos_gc_info 
*gc_info) +{ + + if (daos_handle_is_valid(gc_info->gi_bins_btr)) { + dbtree_close(gc_info->gi_bins_btr); + gc_info->gi_bins_btr = DAOS_HDL_INVAL; + } + gc_info->gi_last_pinned = UMEM_DEFAULT_MBKT_ID; +} + +static inline int +gc_open_bkt(struct umem_attr *uma, struct vos_gc_bkt_df *bkt_df, struct vos_gc_info *gc_info) +{ + int rc; + + rc = dbtree_open_inplace(&bkt_df->gd_bins_root, uma, &gc_info->gi_bins_btr); + if (rc) + DL_ERROR(rc, "Failed to open GC bin tree."); + return rc; +} + +void +gc_close_pool(struct vos_pool *pool) +{ + return gc_close_bkt(&pool->vp_gc_info); +} + +int +gc_open_pool(struct vos_pool *pool) +{ + struct vos_pool_ext_df *pd_ext = umem_off2ptr(&pool->vp_umm, pool->vp_pool_df->pd_ext); + + if (pd_ext != NULL) + return gc_open_bkt(&pool->vp_uma, &pd_ext->ped_gc_bkt, &pool->vp_gc_info); + return 0; +} + +void +gc_close_cont(struct vos_container *cont) +{ + return gc_close_bkt(&cont->vc_gc_info); +} + +int +gc_open_cont(struct vos_container *cont) +{ + struct vos_pool *pool = vos_cont2pool(cont); + struct vos_cont_ext_df *cd_ext = umem_off2ptr(&pool->vp_umm, cont->vc_cont_df->cd_ext); + + if (cd_ext != NULL) + return gc_open_bkt(&pool->vp_uma, &cd_ext->ced_gc_bkt, &cont->vc_gc_info); + return 0; +} + +static int +gc_init_bkt(struct umem_instance *umm, struct vos_gc_bkt_df *bkt_df) +{ + struct umem_attr uma; + daos_handle_t bins_btr; + int rc; + + uma.uma_id = umm->umm_id; + uma.uma_pool = umm->umm_pool; + + rc = dbtree_create_inplace(DBTREE_CLASS_IFV, BTR_FEAT_UINT_KEY, 12, &uma, + &bkt_df->gd_bins_root, &bins_btr); + if (rc) { + DL_ERROR(rc, "Failed to create GC bin tree."); + return rc; + } + dbtree_close(bins_btr); + + return 0; +} + /** * Initialize garbage bins for a pool. * @@ -842,10 +1552,9 @@ gc_reclaim_pool(struct vos_pool *pool, int *credits, bool *empty_ret) int gc_init_pool(struct umem_instance *umm, struct vos_pool_df *pd) { - int i; - umem_off_t bag_id; - int size; - int rc; + struct vos_pool_ext_df *pd_ext = umem_off2ptr(umm, pd->pd_ext); + umem_off_t bag_id; + int i, size, rc; D_DEBUG(DB_IO, "Init garbage bins for pool="DF_UUID"\n", DP_UUID(pd->pd_id)); @@ -867,6 +1576,10 @@ gc_init_pool(struct umem_instance *umm, struct vos_pool_df *pd) bin->bin_bag_last = bag_id; bin->bin_bag_nr = 1; } + + if (pd_ext != NULL) + return gc_init_bkt(umm, &pd_ext->ped_gc_bkt); + return 0; } @@ -879,7 +1592,8 @@ gc_init_pool(struct umem_instance *umm, struct vos_pool_df *pd) int gc_init_cont(struct umem_instance *umm, struct vos_cont_df *cd) { - int i; + struct vos_cont_ext_df *cd_ext = umem_off2ptr(umm, cd->cd_ext); + int i; D_DEBUG(DB_IO, "Init garbage bins for cont="DF_UUID"\n", DP_UUID(cd->cd_id)); @@ -892,6 +1606,10 @@ gc_init_cont(struct umem_instance *umm, struct vos_cont_df *cd) bin->bin_bag_size = gc_bag_size; bin->bin_bag_nr = 0; } + + if (cd_ext != NULL) + return gc_init_bkt(umm, &cd_ext->ced_gc_bkt); + return 0; } @@ -903,17 +1621,25 @@ gc_check_cont(struct vos_container *cont) { int i; struct vos_gc_bin_df *bin; + struct vos_pool *pool = cont->vc_pool; D_INIT_LIST_HEAD(&cont->vc_gc_link); for (i = 0; i < GC_CONT; i++) { - bin = gc_type2bin(cont->vc_pool, cont, i); + bin = gc_type2bin(pool, cont, i); if (bin->bin_bag_first != UMOFF_NULL) { - d_list_add_tail(&cont->vc_gc_link, - &cont->vc_pool->vp_gc_cont); + d_list_add_tail(&cont->vc_gc_link, &pool->vp_gc_cont); return; } } + + if (vos_pool_is_evictable(pool)) { + struct vos_gc_info *gc_info = &cont->vc_gc_info; + + D_ASSERT(daos_handle_is_valid(gc_info->gi_bins_btr)); + if 
(!dbtree_is_empty(gc_info->gi_bins_btr)) + d_list_add_tail(&cont->vc_gc_link, &pool->vp_gc_cont); + } } /** @@ -949,8 +1675,10 @@ gc_del_pool(struct vos_pool *pool) D_ASSERT(!d_list_empty(&pool->vp_gc_link)); pool->vp_opened--; - if (pool->vp_opened == 0) + if (pool->vp_opened == 0) { vos_pool_hash_del(pool); /* un-pin from open-hash */ + gc_close_pool(pool); + } d_list_del_init(&pool->vp_gc_link); vos_pool_decref(pool); /* -1 for the link */ @@ -1018,7 +1746,10 @@ vos_gc_run(int *credits) D_DEBUG(DB_TRACE, "GC pool="DF_UUID", creds=%d\n", DP_UUID(pool->vp_id), creds); - rc = gc_reclaim_pool(pool, &creds, &empty); + if (vos_pool_is_evictable(pool)) + rc = gc_reclaim_pool_p2(pool, &creds, &empty); + else + rc = gc_reclaim_pool(pool, &creds, &empty); if (rc) { D_ERROR("GC pool="DF_UUID" error=%s\n", DP_UUID(pool->vp_id), d_errstr(rc)); @@ -1097,7 +1828,10 @@ vos_gc_pool_tight(daos_handle_t poh, int *credits) return 0; /* nothing to reclaim for this pool */ total = *credits; - rc = gc_reclaim_pool(pool, credits, &empty); + if (vos_pool_is_evictable(pool)) + rc = gc_reclaim_pool_p2(pool, credits, &empty); + else + rc = gc_reclaim_pool(pool, credits, &empty); if (rc) { D_CRIT("gc_reclaim_pool failed " DF_RC "\n", DP_RC(rc)); return 0; /* caller can't do anything for it */ diff --git a/src/vos/vos_internal.h b/src/vos/vos_internal.h index 9441ba45265..ade36cb769a 100644 --- a/src/vos/vos_internal.h +++ b/src/vos/vos_internal.h @@ -241,6 +241,21 @@ struct vos_wal_metrics { void vos_wal_metrics_init(struct vos_wal_metrics *vw_metrics, const char *path, int tgt_id); +/* VOS pool metrics for umem cache */ +struct vos_cache_metrics { + struct d_tm_node_t *vcm_pg_ne; + struct d_tm_node_t *vcm_pg_pinned; + struct d_tm_node_t *vcm_pg_free; + struct d_tm_node_t *vcm_pg_hit; + struct d_tm_node_t *vcm_pg_miss; + struct d_tm_node_t *vcm_pg_evict; + struct d_tm_node_t *vcm_pg_flush; + struct d_tm_node_t *vcm_pg_load; + struct d_tm_node_t *vcm_obj_hit; +}; + +void vos_cache_metrics_init(struct vos_cache_metrics *vc_metrcis, const char *path, int tgt_id); + struct vos_pool_metrics { void *vp_vea_metrics; struct vos_agg_metrics vp_agg_metrics; @@ -248,9 +263,15 @@ struct vos_pool_metrics { struct vos_space_metrics vp_space_metrics; struct vos_chkpt_metrics vp_chkpt_metrics; struct vos_wal_metrics vp_wal_metrics; + struct vos_cache_metrics vp_cache_metrics; /* TODO: add more metrics for VOS */ }; +struct vos_gc_info { + daos_handle_t gi_bins_btr; + uint32_t gi_last_pinned; +}; + /** * VOS pool (DRAM) */ @@ -310,6 +331,8 @@ struct vos_pool { uint32_t vp_data_thresh; /** Space (in percentage) reserved for rebuild */ unsigned int vp_space_rb; + /* GC runtime for pool */ + struct vos_gc_info vp_gc_info; }; /** @@ -353,6 +376,8 @@ struct vos_container { daos_epoch_range_t vc_epr_aggregation; /* Current ongoing discard EPR */ daos_epoch_range_t vc_epr_discard; + /* Last timestamp when VOS aggregation reports -DER_TX_BUSY */ + uint64_t vc_agg_busy_ts; /* Last timestamp when VOS aggregation reporting ENOSPACE */ uint64_t vc_agg_nospc_ts; /* Last timestamp when IO reporting ENOSPACE */ @@ -363,7 +388,8 @@ struct vos_container { * * transaction with older epoch must have been committed. 
*/ daos_epoch_t vc_solo_dtx_epoch; - + /* GC runtime for container */ + struct vos_gc_info vc_gc_info; /* Various flags */ unsigned int vc_in_aggregation:1, vc_in_discard:1, @@ -1256,7 +1282,7 @@ vos_bio_addr_free(struct vos_pool *pool, bio_addr_t *addr, daos_size_t nob); void vos_evt_desc_cbs_init(struct evt_desc_cbs *cbs, struct vos_pool *pool, - daos_handle_t coh); + daos_handle_t coh, struct vos_object *obj); int vos_tx_begin(struct dtx_handle *dth, struct umem_instance *umm, bool is_sysdb); @@ -1307,7 +1333,7 @@ vos_dedup_invalidate(struct vos_pool *pool); umem_off_t vos_reserve_scm(struct vos_container *cont, struct umem_rsrvd_act *rsrvd_scm, - daos_size_t size); + daos_size_t size, struct vos_object *obj); int vos_publish_scm(struct umem_instance *umm, struct umem_rsrvd_act *rsrvd_scm, bool publish); int @@ -1324,6 +1350,12 @@ vos_pool2umm(struct vos_pool *pool) return &pool->vp_umm; } +static inline struct umem_store * +vos_pool2store(struct vos_pool *pool) +{ + return &pool->vp_umm.umm_pool->up_store; +} + static inline struct umem_instance * vos_cont2umm(struct vos_container *cont) { @@ -1360,11 +1392,19 @@ void gc_check_cont(struct vos_container *cont); int gc_add_item(struct vos_pool *pool, daos_handle_t coh, - enum vos_gc_type type, umem_off_t item_off, uint64_t args); + enum vos_gc_type type, umem_off_t item_off, uint32_t *bkt_ids); int vos_gc_pool_tight(daos_handle_t poh, int *credits); void gc_reserve_space(daos_size_t *rsrvd); +int +gc_open_pool(struct vos_pool *pool); +void +gc_close_pool(struct vos_pool *pool); +int +gc_open_cont(struct vos_container *cont); +void +gc_close_cont(struct vos_container *cont); /** * If the object is fully punched, bypass normal aggregation and move it to container @@ -1839,4 +1879,149 @@ vos_io_scm(struct vos_pool *pool, daos_iod_type_t type, daos_size_t size, enum v int vos_insert_oid(struct dtx_handle *dth, struct vos_container *cont, daos_unit_oid_t *oid); +static inline bool +vos_pool_is_p2(struct vos_pool *pool) +{ + struct umem_store *store = vos_pool2store(pool); + + return store->store_type == DAOS_MD_BMEM_V2; +} + +static inline bool +vos_pool_is_evictable(struct vos_pool *pool) +{ + struct umem_store *store = vos_pool2store(pool); + + if (store->store_evictable) { + D_ASSERT(store->store_type == DAOS_MD_BMEM_V2); + return true; + } + + return false; +} + +static inline umem_off_t +vos_obj_alloc(struct umem_instance *umm, struct vos_object *obj, size_t size, bool zeroing) +{ + + if (obj != NULL && vos_pool_is_evictable(vos_obj2pool(obj))) { + D_ASSERT(obj->obj_bkt_alloted == 1); + if (zeroing) + return umem_zalloc_from_bucket(umm, size, obj->obj_bkt_ids[0]); + + return umem_alloc_from_bucket(umm, size, obj->obj_bkt_ids[0]); + } + + if (zeroing) + return umem_zalloc(umm, size); + + return umem_alloc(umm, size); +} + +static inline umem_off_t +vos_obj_reserve(struct umem_instance *umm, struct vos_object *obj, + struct umem_rsrvd_act *rsrvd_scm, daos_size_t size) +{ + if (obj != NULL && vos_pool_is_evictable(vos_obj2pool(obj))) { + D_ASSERT(obj->obj_bkt_alloted == 1); + return umem_reserve_from_bucket(umm, rsrvd_scm, size, obj->obj_bkt_ids[0]); + } + + return umem_reserve(umm, rsrvd_scm, size); +} + +/* vos_obj_cache.c */ +static inline struct dtx_handle * +clear_cur_dth(struct vos_pool *pool) +{ + struct dtx_handle *dth; + + dth = vos_dth_get(pool->vp_sysdb); + vos_dth_set(NULL, pool->vp_sysdb); + + return dth; +} + +static inline void +restore_cur_dth(struct vos_pool *pool, struct dtx_handle *dth) +{ + vos_dth_set(dth, 
pool->vp_sysdb); +} + +static inline struct vos_cache_metrics * +store2cache_metrics(struct umem_store *store) +{ + struct vos_pool_metrics *vpm = (struct vos_pool_metrics *)store->stor_stats; + + return vpm != NULL ? &vpm->vp_cache_metrics : NULL; +} + +static inline void +update_page_stats(struct umem_store *store) +{ + struct vos_cache_metrics *vcm = store2cache_metrics(store); + struct umem_cache *cache = store->cache; + + if (vcm == NULL) + return; + + d_tm_set_gauge(vcm->vcm_pg_ne, cache->ca_pgs_stats[UMEM_PG_STATS_NONEVICTABLE]); + d_tm_set_gauge(vcm->vcm_pg_pinned, cache->ca_pgs_stats[UMEM_PG_STATS_PINNED]); + d_tm_set_gauge(vcm->vcm_pg_free, cache->ca_pgs_stats[UMEM_PG_STATS_FREE]); + + d_tm_set_counter(vcm->vcm_pg_hit, cache->ca_cache_stats[UMEM_CACHE_STATS_HIT]); + d_tm_set_counter(vcm->vcm_pg_miss, cache->ca_cache_stats[UMEM_CACHE_STATS_MISS]); + d_tm_set_counter(vcm->vcm_pg_evict, cache->ca_cache_stats[UMEM_CACHE_STATS_EVICT]); + d_tm_set_counter(vcm->vcm_pg_flush, cache->ca_cache_stats[UMEM_CACHE_STATS_FLUSH]); + d_tm_set_counter(vcm->vcm_pg_load, cache->ca_cache_stats[UMEM_CACHE_STATS_LOAD]); +} + +static inline int +vos_cache_pin(struct vos_pool *pool, struct umem_cache_range *ranges, int range_nr, + bool for_sys, struct umem_pin_handle **pin_handle) +{ + struct umem_store *store = vos_pool2store(pool); + struct dtx_handle *cur_dth; + int rc; + + cur_dth = clear_cur_dth(pool); + rc = umem_cache_pin(store, ranges, range_nr, for_sys, pin_handle); + restore_cur_dth(pool, cur_dth); + + update_page_stats(store); + + return rc; +} + +int vos_obj_acquire(struct vos_container *cont, daos_unit_oid_t oid, bool pin, + struct vos_object **obj_p); + +#define VOS_BKTS_INLINE_MAX 4 +struct vos_bkt_array { + uint32_t vba_tot; + uint32_t vba_cnt; + uint32_t vba_inline_bkts[VOS_BKTS_INLINE_MAX]; + uint32_t *vba_bkts; +}; + +static inline void +vos_bkt_array_fini(struct vos_bkt_array *bkts) +{ + if (bkts->vba_tot > VOS_BKTS_INLINE_MAX) + D_FREE(bkts->vba_bkts); +} + +static inline void +vos_bkt_array_init(struct vos_bkt_array *bkts) +{ + bkts->vba_tot = VOS_BKTS_INLINE_MAX; + bkts->vba_cnt = 0; + bkts->vba_bkts = &bkts->vba_inline_bkts[0]; +} + +bool vos_bkt_array_subset(struct vos_bkt_array *super, struct vos_bkt_array *sub); +int vos_bkt_array_add(struct vos_bkt_array *bkts, uint32_t bkt_id); +int vos_bkt_array_pin(struct vos_pool *pool, struct vos_bkt_array *bkts, + struct umem_pin_handle **pin_hdl); + #endif /* __VOS_INTERNAL_H__ */ diff --git a/src/vos/vos_io.c b/src/vos/vos_io.c index 7aa3c897755..efd3f9b9a49 100644 --- a/src/vos/vos_io.c +++ b/src/vos/vos_io.c @@ -37,6 +37,8 @@ struct vos_io_context { struct dcs_iod_csums *ic_iod_csums; /** reference on the object */ struct vos_object *ic_obj; + /** used only for md-on-ssd phase2 evictable pool */ + struct vos_object *ic_pinned_obj; /** BIO descriptor, has ic_iod_nr SGLs */ struct bio_desc *ic_biod; struct vos_ts_set *ic_ts_set; @@ -600,6 +602,9 @@ vos_ioc_destroy(struct vos_io_context *ioc, bool evict) if (ioc->ic_obj) vos_obj_release(ioc->ic_obj, 0, evict); + if (ioc->ic_pinned_obj) + vos_obj_release(ioc->ic_pinned_obj, 0, evict); + vos_ioc_reserve_fini(ioc); vos_ilog_fetch_finish(&ioc->ic_dkey_info); vos_ilog_fetch_finish(&ioc->ic_akey_info); @@ -2119,17 +2124,16 @@ dkey_update(struct vos_io_context *ioc, uint32_t pm_ver, daos_key_t *dkey, umem_off_t vos_reserve_scm(struct vos_container *cont, struct umem_rsrvd_act *rsrvd_scm, - daos_size_t size) + daos_size_t size, struct vos_object *obj) { - umem_off_t umoff; + umem_off_t umoff; + 
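/*
 * Illustrative sketch (not part of this patch): expected life cycle of the
 * vos_bkt_array helper declared in vos_internal.h above. vos_bkt_array_add()
 * keeps the IDs sorted and de-duplicated (IDs must not be UMEM_DEFAULT_MBKT_ID),
 * vos_bkt_array_pin() turns them into umem_cache ranges and pins them in one
 * call, and the returned pin handle is later dropped with umem_cache_unpin().
 * pin_buckets() is a hypothetical helper.
 */
static int
pin_buckets(struct vos_pool *pool, uint32_t *ids, int nr,
	    struct umem_pin_handle **pin_hdl)
{
	struct vos_bkt_array	bkts;
	int			i, rc = 0;

	vos_bkt_array_init(&bkts);	/* starts with 4 inline slots */
	for (i = 0; i < nr; i++) {
		rc = vos_bkt_array_add(&bkts, ids[i]);
		if (rc)
			break;
	}
	if (rc == 0)
		rc = vos_bkt_array_pin(pool, &bkts, pin_hdl);

	vos_bkt_array_fini(&bkts);	/* frees the array only if it grew */
	return rc;
}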
struct umem_instance *umm = vos_cont2umm(cont); D_ASSERT(size > 0); - - if (vos_cont2umm(cont)->umm_ops->mo_reserve != NULL) { - umoff = umem_reserve(vos_cont2umm(cont), rsrvd_scm, size); - } else { - umoff = umem_alloc(vos_cont2umm(cont), size); - } + if (umm->umm_ops->mo_reserve != NULL) + umoff = vos_obj_reserve(umm, obj, rsrvd_scm, size); + else + umoff = vos_obj_alloc(umm, obj, size, false); return umoff; } @@ -2175,7 +2179,7 @@ reserve_space(struct vos_io_context *ioc, uint16_t media, daos_size_t size, if (media == DAOS_MEDIA_SCM) { umem_off_t umoff; - umoff = vos_reserve_scm(ioc->ic_cont, ioc->ic_rsrvd_scm, size); + umoff = vos_reserve_scm(ioc->ic_cont, ioc->ic_rsrvd_scm, size, ioc->ic_pinned_obj); if (!UMOFF_IS_NULL(umoff)) { ioc->ic_umoffs[ioc->ic_umoffs_cnt] = umoff; ioc->ic_umoffs_cnt++; @@ -2577,7 +2581,12 @@ vos_update_end(daos_handle_t ioh, uint32_t pm_ver, daos_key_t *dkey, int err, tx_started = true; - /* Commit the CoS DTXs via the IO PMDK transaction. */ + /* + * Commit the CoS DTXs via the IO PMDK transaction. + * + * It's guaranteed that no other objects are involved in the CoS DTXs, so we don't + * need to pin extra objects here. + */ if (dtx_is_valid_handle(dth) && dth->dth_dti_cos_count > 0 && !dth->dth_cos_done) { D_ASSERT(!dth->dth_local); @@ -2745,6 +2754,20 @@ vos_update_begin(daos_handle_t coh, daos_unit_oid_t oid, daos_epoch_t epoch, goto error; } + /* Hold the object for the evictable md-on-ssd phase2 pool */ + if (vos_pool_is_evictable(vos_cont2pool(ioc->ic_cont))) { + /* + * FIXME: + * The same object will be referenced by vos_obj_acquire() and vos_obj_hold() + * (in vos_update_end()) twice, this is for avoiding the complication of adding + * object ilog to ts_set. We'll re-org vos_obj_hold() in the future to make the + * code look cleaner. + */ + rc = vos_obj_acquire(ioc->ic_cont, ioc->ic_oid, true, &ioc->ic_pinned_obj); + if (rc != 0) + goto error; + } + rc = dkey_update_begin(ioc); if (rc != 0) { D_ERROR(DF_UOID ": dkey update begin failed. " DF_RC "\n", DP_UOID(oid), DP_RC(rc)); diff --git a/src/vos/vos_layout.h b/src/vos/vos_layout.h index 902cb064e26..87d092bc882 100644 --- a/src/vos/vos_layout.h +++ b/src/vos/vos_layout.h @@ -43,6 +43,16 @@ struct vos_gc_bin_df { uint16_t bin_pad16; }; +/* + * This is smaller than the VOS_OBJ_BKTS_MAX for object durable format, because + * I don't want to increase each GC item size (the amount of GC item is massive) + * for an imagined requirement. + * + * If we really need to support more than 2 evict-able buckets per object in the + * futhure, we can enlarge the GC item then. + */ +#define VOS_GC_BKTS_MAX 2 + struct vos_gc_bag_df { /** index of the first item in FIFO */ uint16_t bag_item_first; @@ -57,19 +67,12 @@ struct vos_gc_bag_df { struct vos_gc_item { /* address of the item to be freed */ umem_off_t it_addr; - /** Reserved, argument for GC_VEA/BIO (e.g. size of extent) */ - uint64_t it_args; + /* object buckets for GC_AKEY/DKEY/OBJ of the md-on-ssd p2 pool */ + uint32_t it_bkt_ids[VOS_GC_BKTS_MAX]; } bag_items[0]; }; enum vos_gc_type { - /* XXX: we could define GC_VEA, which can free NVMe/SCM space. - * So svt_rec_free() and evt_desc_bio_free() only need to call - * gc_add_item() to register BIO address for GC. - * - * However, GC_VEA could have extra overhead of reassigning SCM - * pointers, but it also has low latency for undo changes. 
- */ GC_AKEY, GC_DKEY, GC_OBJ, @@ -77,6 +80,11 @@ enum vos_gc_type { GC_MAX, }; +struct vos_gc_bkt_df { + /* GC bins categorized by bucket number */ + struct btr_root gd_bins_root; +}; + #define POOL_DF_MAGIC 0x5ca1ab1e /** Lowest supported durable format version */ @@ -107,6 +115,16 @@ enum vos_gc_type { /** 2.8 features */ #define VOS_POOL_FEAT_2_8 (VOS_POOL_FEAT_GANG_SV) +/* VOS pool durable format extension */ +struct vos_pool_ext_df { + /* Extension for GC bucket */ + struct vos_gc_bkt_df ped_gc_bkt; + /* Paddings for other potential new feature */ + uint64_t ped_paddings[54]; + /* Reserved for future extension */ + uint64_t ped_reserve; +}; + /** * Durable format for VOS pool */ @@ -124,8 +142,8 @@ struct vos_pool_df { * a new format, containers with old format can be attached at here. */ uint64_t pd_reserv_upgrade; - /** Reserved for future usage */ - uint64_t pd_reserv; + /** Pool durable format extension */ + umem_off_t pd_ext; /** Unique PoolID for each VOS pool assigned on creation */ uuid_t pd_id; /** Total space in bytes on SCM */ @@ -249,6 +267,16 @@ enum vos_io_stream { VOS_IOS_CNT }; +/* VOS container durable format extension */ +struct vos_cont_ext_df { + /* GC bucket extension */ + struct vos_gc_bkt_df ced_gc_bkt; + /* Reserved for potential new features */ + uint64_t ced_paddings[38]; + /* Reserved for future extension */ + uint64_t ced_reserve; +}; + /* VOS Container Value */ struct vos_cont_df { uuid_t cd_id; @@ -260,8 +288,8 @@ struct vos_cont_df { struct btr_root cd_obj_root; /** reserved for placement algorithm upgrade */ uint64_t cd_reserv_upgrade; - /** reserved for future usage */ - uint64_t cd_reserv; + /** Container durable format extension */ + umem_off_t cd_ext; /** The active DTXs blob head. */ umem_off_t cd_dtx_active_head; /** The active DTXs blob tail. */ @@ -380,4 +408,18 @@ struct vos_obj_df { struct btr_root vo_tree; }; +#define VOS_OBJ_BKTS_MAX 4 +D_CASSERT(VOS_GC_BKTS_MAX <= VOS_OBJ_BKTS_MAX); + +/* + * VOS object durable format for md-on-ssd phase2. The size is fit to the 128 bytes + * slab (see slab_map[] defined in mem.c). + */ +struct vos_obj_p2_df { + struct vos_obj_df p2_obj_df; + uint32_t p2_bkt_ids[VOS_OBJ_BKTS_MAX]; + uint64_t p2_reserved; +}; +D_CASSERT(sizeof(struct vos_obj_p2_df) == D_ALIGNUP(sizeof(struct vos_obj_df), 32)); + #endif diff --git a/src/vos/vos_obj.c b/src/vos/vos_obj.c index cc72575f608..25d50ec5868 100644 --- a/src/vos/vos_obj.c +++ b/src/vos/vos_obj.c @@ -496,9 +496,12 @@ vos_obj_punch(daos_handle_t coh, daos_unit_oid_t oid, daos_epoch_t epoch, if (rc != 0) goto reset; - /* Commit the CoS DTXs via the PUNCH PMDK transaction. */ - if (dtx_is_valid_handle(dth) && dth->dth_dti_cos_count > 0 && - !dth->dth_cos_done) { + /* Commit the CoS DTXs via the PUNCH PMDK transaction. + * + * It's guaranteed that no other objects are involved in the CoS DTXs, so we don't + * need to pin extra objects here. 
+ */ + if (dtx_is_valid_handle(dth) && dth->dth_dti_cos_count > 0 && !dth->dth_cos_done) { D_ALLOC_ARRAY(daes, dth->dth_dti_cos_count); if (daes == NULL) D_GOTO(reset, rc = -DER_NOMEM); @@ -1065,7 +1068,8 @@ key_iter_fetch_root(struct vos_obj_iter *oiter, vos_iter_type_t type, * subtree */ if (krec->kr_bmap & KREC_BF_EVT) { - vos_evt_desc_cbs_init(&cbs, vos_obj2pool(obj), vos_cont2hdl(obj->obj_cont)); + vos_evt_desc_cbs_init(&cbs, vos_obj2pool(obj), vos_cont2hdl(obj->obj_cont), + obj); rc = evt_open(&krec->kr_evt, info->ii_uma, &cbs, &info->ii_tree_hdl); if (rc) { D_DEBUG(DB_TRACE, @@ -1077,7 +1081,7 @@ key_iter_fetch_root(struct vos_obj_iter *oiter, vos_iter_type_t type, info->ii_fake_akey_flag = VOS_IT_DKEY_EV; } else { rc = dbtree_open_inplace_ex(&krec->kr_btr, info->ii_uma, - vos_cont2hdl(obj->obj_cont), vos_obj2pool(obj), + vos_cont2hdl(obj->obj_cont), obj, &info->ii_tree_hdl); if (rc) { D_DEBUG(DB_TRACE, @@ -2040,7 +2044,7 @@ vos_obj_akey_iter_nested_prep(vos_iter_type_t type, struct vos_iter_info *info, } rc = dbtree_open_inplace_ex(info->ii_btr, info->ii_uma, vos_cont2hdl(obj->obj_cont), - vos_obj2pool(obj), &toh); + obj, &toh); if (rc) { D_DEBUG(DB_TRACE, "Failed to open tree for iterator:" @@ -2097,7 +2101,7 @@ vos_obj_iter_sv_nested_prep(vos_iter_type_t type, struct vos_iter_info *info, } rc = dbtree_open_inplace_ex(info->ii_btr, info->ii_uma, vos_cont2hdl(obj->obj_cont), - vos_obj2pool(obj), &toh); + obj, &toh); if (rc) { D_DEBUG(DB_TRACE, "Failed to open tree for iterator:" @@ -2157,7 +2161,7 @@ vos_obj_ev_iter_nested_prep(vos_iter_type_t type, struct vos_iter_info *info, goto prepare; } - vos_evt_desc_cbs_init(&cbs, vos_obj2pool(obj), vos_cont2hdl(obj->obj_cont)); + vos_evt_desc_cbs_init(&cbs, vos_obj2pool(obj), vos_cont2hdl(obj->obj_cont), obj); rc = evt_open(info->ii_evt, info->ii_uma, &cbs, &toh); if (rc) { D_DEBUG(DB_TRACE, diff --git a/src/vos/vos_obj.h b/src/vos/vos_obj.h index 2ccc8d71988..f572ebb03d9 100644 --- a/src/vos/vos_obj.h +++ b/src/vos/vos_obj.h @@ -47,12 +47,25 @@ struct vos_object { struct vos_obj_df *obj_df; /** backref to container */ struct vos_container *obj_cont; + /* Handle for the pinned object */ + struct umem_pin_handle *obj_pin_hdl; + /** Bucket IDs for the object */ + uint32_t obj_bkt_ids[VOS_OBJ_BKTS_MAX]; + ABT_mutex obj_mutex; + ABT_cond obj_wait_alloting; + ABT_cond obj_wait_loading; /** nobody should access this object */ bool obj_zombie; /** Object is held for discard */ uint32_t obj_discard : 1, /** If non-zero, object is held for aggregation */ - obj_aggregate : 1; + obj_aggregate : 1, + /** Evict-able bucket is already allocated */ + obj_bkt_alloted : 1, + /** Allocating evict-able bucket in in-progress */ + obj_bkt_alloting : 1, + /** Loading object is in-progress */ + obj_bkt_loading : 1; }; enum { diff --git a/src/vos/vos_obj_cache.c b/src/vos/vos_obj_cache.c index 8845eae0085..9274e219d75 100644 --- a/src/vos/vos_obj_cache.c +++ b/src/vos/vos_obj_cache.c @@ -73,13 +73,37 @@ obj_lop_alloc(void *key, unsigned int ksize, void *args, D_ALLOC_PTR(obj); if (!obj) - D_GOTO(failed, rc = -DER_NOMEM); + return -DER_NOMEM; + + rc = ABT_mutex_create(&obj->obj_mutex); + if (rc != ABT_SUCCESS) { + rc = dss_abterr2der(rc); + goto failed; + } + + rc = ABT_cond_create(&obj->obj_wait_alloting); + if (rc != ABT_SUCCESS) { + rc = dss_abterr2der(rc); + goto free_mutex; + } + + rc = ABT_cond_create(&obj->obj_wait_loading); + if (rc != ABT_SUCCESS) { + rc = dss_abterr2der(rc); + goto free_alloting; + } init_object(obj, lkey->olk_oid, cont); 
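/*
 * Illustrative sketch (not part of this patch): the mutex and the two condition
 * variables created above back a "single initializer, many waiters" pattern (see
 * obj_allot_bkt()/obj_pin_bkt() below). The patch issues a single ABT_cond_wait()
 * and relies on the broadcast being sent only after the in-progress flag is
 * cleared; the defensive, predicate-loop form of the same wait would look like
 * this hypothetical helper:
 */
static void
wait_until_bkt_alloted(struct vos_object *obj)
{
	ABT_mutex_lock(obj->obj_mutex);
	while (obj->obj_bkt_alloting && !obj->obj_bkt_alloted)
		ABT_cond_wait(obj->obj_wait_alloting, obj->obj_mutex);
	ABT_mutex_unlock(obj->obj_mutex);
}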
d_tm_inc_gauge(tls->vtl_obj_cnt, 1); *llink_p = &obj->obj_llink; - rc = 0; + return 0; + +free_alloting: + ABT_cond_free(&obj->obj_wait_alloting); +free_mutex: + ABT_mutex_free(&obj->obj_mutex); failed: + D_FREE(obj); return rc; } @@ -133,6 +157,9 @@ obj_lop_free(struct daos_llink *llink) tls = vos_tls_get(obj->obj_cont->vc_pool->vp_sysdb); d_tm_dec_gauge(tls->vtl_obj_cnt, 1); clean_object(obj); + ABT_cond_free(&obj->obj_wait_loading); + ABT_cond_free(&obj->obj_wait_alloting); + ABT_mutex_free(&obj->obj_mutex); D_FREE(obj); } @@ -245,12 +272,132 @@ obj_get(struct daos_lru_cache *occ, struct vos_container *cont, daos_unit_oid_t return rc; } +static inline void +vos_obj_unpin(struct vos_object *obj) +{ + struct vos_pool *pool = vos_obj2pool(obj); + struct umem_store *store = vos_pool2store(pool); + + if (obj->obj_pin_hdl != NULL && daos_lru_is_last_user(&obj->obj_llink)) { + umem_cache_unpin(store, obj->obj_pin_hdl); + obj->obj_pin_hdl = NULL; + } +} + +static void +obj_allot_bkt(struct vos_pool *pool, struct vos_object *obj) +{ + struct dtx_handle *cur_dth; + + D_ASSERT(umem_tx_none(vos_pool2umm(pool))); + + if (obj->obj_bkt_alloting) { + cur_dth = clear_cur_dth(pool); + + ABT_mutex_lock(obj->obj_mutex); + ABT_cond_wait(obj->obj_wait_alloting, obj->obj_mutex); + ABT_mutex_unlock(obj->obj_mutex); + + D_ASSERT(obj->obj_bkt_alloted == 1); + restore_cur_dth(pool, cur_dth); + return; + } + obj->obj_bkt_alloting = 1; + + if (!obj->obj_df) { + cur_dth = clear_cur_dth(pool); + obj->obj_bkt_ids[0] = umem_allot_mb_evictable(vos_pool2umm(pool), 0); + restore_cur_dth(pool, cur_dth); + } else { + struct vos_obj_p2_df *p2 = (struct vos_obj_p2_df *)obj->obj_df; + + obj->obj_bkt_ids[0] = p2->p2_bkt_ids[0]; + } + + obj->obj_bkt_alloted = 1; + obj->obj_bkt_alloting = 0; + + ABT_mutex_lock(obj->obj_mutex); + ABT_cond_broadcast(obj->obj_wait_alloting); + ABT_mutex_unlock(obj->obj_mutex); +} + +static int +obj_pin_bkt(struct vos_pool *pool, struct vos_object *obj) +{ + struct umem_store *store = vos_pool2store(pool); + struct dtx_handle *cur_dth; + struct umem_cache_range rg; + int rc; + + if (obj->obj_bkt_ids[0] == UMEM_DEFAULT_MBKT_ID) { + D_ASSERT(obj->obj_pin_hdl == NULL); + D_ASSERT(!obj->obj_bkt_loading); + return 0; + } + + if (obj->obj_bkt_loading) { + cur_dth = clear_cur_dth(pool); + + ABT_mutex_lock(obj->obj_mutex); + ABT_cond_wait(obj->obj_wait_loading, obj->obj_mutex); + ABT_mutex_unlock(obj->obj_mutex); + + restore_cur_dth(pool, cur_dth); + /* The loader failed on vos_cache_pin() */ + if (obj->obj_pin_hdl == NULL) { + D_ERROR("Object:"DF_UOID" isn't pinned.\n", DP_UOID(obj->obj_id)); + return -DER_BUSY; + } + } + + if (obj->obj_pin_hdl != NULL) { + struct vos_cache_metrics *vcm = store2cache_metrics(store); + + if (vcm) + d_tm_inc_counter(vcm->vcm_obj_hit, 1); + return 0; + } + + obj->obj_bkt_loading = 1; + + rg.cr_off = umem_get_mb_base_offset(vos_pool2umm(pool), obj->obj_bkt_ids[0]); + rg.cr_size = store->cache->ca_page_sz; + + rc = vos_cache_pin(pool, &rg, 1, false, &obj->obj_pin_hdl); + if (rc) + DL_ERROR(rc, "Failed to pin object:"DF_UOID".", DP_UOID(obj->obj_id)); + + obj->obj_bkt_loading = 0; + + ABT_mutex_lock(obj->obj_mutex); + ABT_cond_broadcast(obj->obj_wait_loading); + ABT_mutex_unlock(obj->obj_mutex); + + return rc; +} + +/* Support single evict-able bucket for this moment */ +static inline int +vos_obj_pin(struct vos_object *obj) +{ + struct vos_pool *pool = vos_obj2pool(obj); + + if (!vos_pool_is_evictable(pool)) + return 0; + + if (!obj->obj_bkt_alloted) + obj_allot_bkt(pool, obj); 
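/*
 * Illustrative sketch (not part of this patch): how a caller on an evictable
 * (md-on-ssd phase2) pool is expected to bracket its work with the new
 * acquire/pin path. vos_obj_acquire(..., true, ...) allots the evict-able bucket
 * if needed and pins it; vos_obj_release() drops the reference and the bucket is
 * unpinned once the last user releases the object. update_pinned_object() is a
 * hypothetical caller.
 */
static int
update_pinned_object(struct vos_container *cont, daos_unit_oid_t oid)
{
	struct vos_object	*obj;
	int			 rc;

	rc = vos_obj_acquire(cont, oid, true /* pin */, &obj);
	if (rc != 0)
		return rc;

	/* ... allocate/update against the pinned bucket here ... */

	vos_obj_release(obj, 0, false);
	return 0;
}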
+ + return obj_pin_bkt(pool, obj); +} + static inline void obj_release(struct daos_lru_cache *occ, struct vos_object *obj, bool evict) { D_ASSERT(obj != NULL); - /* TODO: Unpin the object in md-on-ssd phase II */ + vos_obj_unpin(obj); if (obj == &obj_local) { clean_object(obj); @@ -294,6 +441,8 @@ cache_object(struct daos_lru_cache *occ, struct vos_object **objp) /* This object should not be cached */ D_ASSERT(obj_new != NULL); D_ASSERT(obj_new->obj_df == NULL); + D_ASSERT(!obj_local.obj_bkt_alloting); + D_ASSERT(!obj_local.obj_bkt_loading); vos_ilog_fetch_move(&obj_new->obj_ilog_info, &obj_local.obj_ilog_info); obj_new->obj_toh = obj_local.obj_toh; @@ -301,6 +450,8 @@ cache_object(struct daos_lru_cache *occ, struct vos_object **objp) obj_new->obj_sync_epoch = obj_local.obj_sync_epoch; obj_new->obj_df = obj_local.obj_df; obj_new->obj_zombie = obj_local.obj_zombie; + obj_new->obj_bkt_alloted = obj_local.obj_bkt_alloted; + obj_new->obj_pin_hdl = obj_local.obj_pin_hdl; obj_local.obj_toh = DAOS_HDL_INVAL; obj_local.obj_ih = DAOS_HDL_INVAL; @@ -363,13 +514,11 @@ vos_obj_check_discard(struct vos_container *cont, daos_unit_oid_t oid, uint64_t if (rc) return rc; - /* TODO: Pin object in memory */ - if (check_discard(obj, flags)) /* Update request will retry with this error */ rc = (flags & VOS_OBJ_CREATE) ? -DER_UPDATE_AGAIN : -DER_BUSY; - obj_release(occ, obj, false); + obj_put(occ, obj, false); return rc; } @@ -420,6 +569,25 @@ vos_obj_incarnate(struct vos_object *obj, daos_epoch_range_t *epr, daos_epoch_t return -DER_TX_RESTART; } + if (obj->obj_bkt_ids[0] != UMEM_DEFAULT_MBKT_ID) { + struct vos_obj_p2_df *p2 = (struct vos_obj_p2_df *)obj->obj_df; + + D_ASSERT(vos_pool_is_evictable(vos_obj2pool(obj))); + D_ASSERT(obj->obj_bkt_alloted); + + if (p2->p2_bkt_ids[0] == UMEM_DEFAULT_MBKT_ID) { + p2->p2_bkt_ids[0] = obj->obj_bkt_ids[0]; + rc = umem_tx_add_ptr(vos_cont2umm(cont), &p2->p2_bkt_ids[0], + sizeof(p2->p2_bkt_ids[0])); + if (rc) { + DL_ERROR(rc, "Add bucket ID failed."); + return rc; + } + } else { + D_ASSERT(p2->p2_bkt_ids[0] == obj->obj_bkt_ids[0]); + } + } + /* It's done for DAOS_INTENT_PUNCH case */ if (intent == DAOS_INTENT_PUNCH) return 0; @@ -453,6 +621,7 @@ vos_obj_hold(struct vos_container *cont, daos_unit_oid_t oid, daos_epoch_range_t D_ASSERT(cont != NULL); D_ASSERT(cont->vc_pool); D_ASSERT(obj_p != NULL); + *obj_p = NULL; occ = vos_obj_cache_get(cont->vc_pool->vp_sysdb); @@ -507,8 +676,16 @@ vos_obj_hold(struct vos_container *cont, daos_unit_oid_t oid, daos_epoch_range_t D_ASSERT(tmprc == 0); /* Non-zero only valid for akey */ } - /* TODO: Pin the object in memory in md-on-ssd phase II. Revise the 'obj_local' implementation - * then, since this function could yield. */ + /* For md-on-ssd phase2 pool, add object to cache before yield in vos_obj_pin() */ + if (obj == &obj_local && vos_pool_is_evictable(cont->vc_pool)) { + rc = cache_object(occ, &obj); + if (rc != 0) + goto failed; + } + + rc = vos_obj_pin(obj); + if (rc) + goto failed; /* It's done for DAOS_INTENT_UPDATE or DAOS_INTENT_PUNCH or DAOS_INTENT_KILL */ if (intent == DAOS_INTENT_UPDATE || intent == DAOS_INTENT_PUNCH || @@ -608,3 +785,279 @@ vos_obj_evict_by_oid(struct vos_container *cont, daos_unit_oid_t oid) return (rc == -DER_NONEXIST || rc == -DER_SHUTDOWN)? 
0 : rc; } + +static int +bkt_cmp(void *array, int a, int b) +{ + uint32_t *bkt_arr = array; + + if (bkt_arr[a] > bkt_arr[b]) + return 1; + if (bkt_arr[a] < bkt_arr[b]) + return -1; + return 0; +} + +static int +bkt_cmp_key(void *array, int i, uint64_t key) +{ + uint32_t *bkt_arr = array; + uint32_t bkt_id = (uint32_t)key; + + if (bkt_arr[i] > bkt_id) + return 1; + if (bkt_arr[i] < bkt_id) + return -1; + return 0; +} + +static void +bkt_swap(void *array, int a, int b) +{ + uint32_t *bkt_arr = array; + uint32_t tmp; + + tmp = bkt_arr[a]; + bkt_arr[a] = bkt_arr[b]; + bkt_arr[b] = tmp; +} + +static daos_sort_ops_t bkt_sort_ops = { + .so_cmp = bkt_cmp, + .so_swap = bkt_swap, + .so_cmp_key = bkt_cmp_key, +}; + +/* if @sub is a subset of @super */ +bool +vos_bkt_array_subset(struct vos_bkt_array *super, struct vos_bkt_array *sub) +{ + int i, idx; + + D_ASSERT(sub->vba_cnt > 0); + if (sub->vba_cnt > super->vba_cnt) + return false; + + for (i = 0; i < sub->vba_cnt; i++) { + idx = daos_array_find(super, super->vba_cnt, sub->vba_bkts[i], &bkt_sort_ops); + if (idx < 0) + return false; + } + + return true; +} + +int +vos_bkt_array_add(struct vos_bkt_array *bkts, uint32_t bkt_id) +{ + int idx; + + D_ASSERT(bkt_id != UMEM_DEFAULT_MBKT_ID); + + /* The @bkt_id is already in bucket array */ + if (bkts->vba_cnt > 0) { + idx = daos_array_find(bkts->vba_bkts, bkts->vba_cnt, bkt_id, &bkt_sort_ops); + if (idx >= 0) + return 0; + } + + /* Bucket array needs be expanded */ + if (bkts->vba_cnt == bkts->vba_tot) { + uint32_t *new_bkts; + size_t new_size = bkts->vba_tot * 2; + + if (bkts->vba_tot > VOS_BKTS_INLINE_MAX) + D_REALLOC_ARRAY(new_bkts, bkts->vba_bkts, bkts->vba_tot, new_size); + else + D_ALLOC_ARRAY(new_bkts, new_size); + + if (new_bkts == NULL) + return -DER_NOMEM; + + if (bkts->vba_tot == VOS_BKTS_INLINE_MAX) + memcpy(new_bkts, bkts->vba_bkts, sizeof(uint32_t) * bkts->vba_tot); + + bkts->vba_bkts = new_bkts; + bkts->vba_tot = new_size; + } + + bkts->vba_bkts[bkts->vba_cnt] = bkt_id; + bkts->vba_cnt++; + + idx = daos_array_sort(bkts->vba_bkts, bkts->vba_cnt, true, &bkt_sort_ops); + D_ASSERT(idx == 0); + + return 0; +} + +int +vos_bkt_array_pin(struct vos_pool *pool, struct vos_bkt_array *bkts, + struct umem_pin_handle **pin_hdl) +{ + struct umem_cache_range rg_inline[VOS_BKTS_INLINE_MAX]; + struct umem_cache_range *ranges; + int i, rc; + + if (bkts->vba_cnt == 0) + return 0; + + if (bkts->vba_cnt > VOS_BKTS_INLINE_MAX) { + D_ALLOC_ARRAY(ranges, bkts->vba_cnt); + if (ranges == NULL) + return -DER_NOMEM; + } else { + ranges = &rg_inline[0]; + } + + for (i = 0; i < bkts->vba_cnt; i++) { + D_ASSERT(bkts->vba_bkts[i] != UMEM_DEFAULT_MBKT_ID); + ranges[i].cr_off = umem_get_mb_base_offset(vos_pool2umm(pool), bkts->vba_bkts[i]); + ranges[i].cr_size = vos_pool2store(pool)->cache->ca_page_sz; + } + + rc = vos_cache_pin(pool, ranges, bkts->vba_cnt, false, pin_hdl); + if (rc) + DL_ERROR(rc, "Failed to pin %u ranges.", bkts->vba_cnt); + + if (ranges != &rg_inline[0]) + D_FREE(ranges); + + return rc; +} + +int +vos_obj_acquire(struct vos_container *cont, daos_unit_oid_t oid, bool pin, + struct vos_object **obj_p) +{ + struct vos_object *obj; + struct daos_lru_cache *occ; + int rc; + + D_ASSERT(cont != NULL); + D_ASSERT(cont->vc_pool); + D_ASSERT(obj_p != NULL); + *obj_p = NULL; + + occ = vos_obj_cache_get(cont->vc_pool->vp_sysdb); + D_ASSERT(occ != NULL); + + /* Lookup object cache, create cache entry if not found */ + rc = obj_get(occ, cont, oid, true, &obj); + if (rc) { + DL_ERROR(rc, "Failed to lookup/create object 
in cache."); + return rc; + } + + if (obj->obj_zombie) { + D_ERROR("The object:"DF_UOID" is already evicted.\n", DP_UOID(oid)); + obj_put(occ, obj, true); + return -DER_AGAIN; + } + + /* Lookup OI table if the cached object is negative */ + if (obj->obj_df == NULL) { + obj->obj_sync_epoch = 0; + rc = vos_oi_find(cont, oid, &obj->obj_df, NULL); + if (rc == 0) { + obj->obj_sync_epoch = obj->obj_df->vo_sync; + } else if (rc == -DER_NONEXIST) { + rc = 0; + } else if (rc) { + DL_ERROR(rc, "Failed to lookup OI table."); + obj_put(occ, obj, false); + return rc; + } + } + + if (!obj->obj_bkt_alloted) + obj_allot_bkt(cont->vc_pool, obj); + + if (pin) { + rc = obj_pin_bkt(cont->vc_pool, obj); + if (rc) { + obj_put(occ, obj, false); + return rc; + } + } + + *obj_p = obj; + + return 0; +} + +struct vos_pin_handle { + unsigned int vph_acquired; + struct umem_pin_handle *vph_pin_hdl; + struct vos_object *vph_objs[0]; +}; + +void +vos_unpin_objects(daos_handle_t coh, struct vos_pin_handle *hdl) +{ + struct vos_container *cont = vos_hdl2cont(coh); + struct vos_pool *pool = vos_cont2pool(cont); + int i; + + if (hdl->vph_pin_hdl != NULL) + umem_cache_unpin(vos_pool2store(pool), hdl->vph_pin_hdl); + + for (i = 0; i < hdl->vph_acquired; i++) + vos_obj_release(hdl->vph_objs[i], 0, false); + + D_FREE(hdl); +} + +int +vos_pin_objects(daos_handle_t coh, daos_unit_oid_t oids[], int count, struct vos_pin_handle **hdl) +{ + struct vos_pin_handle *vos_hdl; + struct vos_object *obj; + struct vos_bkt_array bkts; + struct vos_container *cont = vos_hdl2cont(coh); + struct vos_pool *pool = vos_cont2pool(cont); + int i, rc; + + *hdl = NULL; + if (!vos_pool_is_evictable(pool)) + return 0; + + D_ASSERT(count > 0); + D_ALLOC(vos_hdl, sizeof(*vos_hdl) + sizeof(struct vos_object *) * count); + if (vos_hdl == NULL) + return -DER_NOMEM; + + vos_bkt_array_init(&bkts); + for (i = 0; i < count; i++) { + rc = vos_obj_acquire(cont, oids[i], false, &vos_hdl->vph_objs[i]); + if (rc) { + DL_ERROR(rc, "Failed to acquire object:"DF_UOID"", DP_UOID(oids[i])); + goto error; + } + vos_hdl->vph_acquired++; + + obj = vos_hdl->vph_objs[i]; + D_ASSERT(obj->obj_bkt_alloted == 1); + if (obj->obj_bkt_ids[0] != UMEM_DEFAULT_MBKT_ID) { + rc = vos_bkt_array_add(&bkts, obj->obj_bkt_ids[0]); + if (rc) { + DL_ERROR(rc, "Failed to add bucket:%u to array", + obj->obj_bkt_ids[0]); + goto error; + } + } + } + + rc = vos_bkt_array_pin(pool, &bkts, &vos_hdl->vph_pin_hdl); + if (rc) { + DL_ERROR(rc, "Failed to pin %u objects.", vos_hdl->vph_acquired); + goto error; + } + + vos_bkt_array_fini(&bkts); + *hdl = vos_hdl; + return 0; +error: + vos_bkt_array_fini(&bkts); + vos_unpin_objects(coh, vos_hdl); + return rc; +} diff --git a/src/vos/vos_obj_index.c b/src/vos/vos_obj_index.c index d5955384454..6a86b383c90 100644 --- a/src/vos/vos_obj_index.c +++ b/src/vos/vos_obj_index.c @@ -47,7 +47,8 @@ oi_hkey_size(void) static int oi_rec_msize(int alloc_overhead) { - return alloc_overhead + sizeof(struct vos_obj_df); + /* This function is only used for metadata overhead estimation. 
*/ + return alloc_overhead + D_ALIGNUP(sizeof(struct vos_obj_df), 32); } static void @@ -67,6 +68,15 @@ oi_hkey_cmp(struct btr_instance *tins, struct btr_record *rec, void *hkey) return dbtree_key_cmp_rc(memcmp(oid1, oid2, sizeof(*oid1))); } +static inline unsigned int +vos_obj_df_size(struct vos_pool *pool) +{ + if (vos_pool_is_p2(pool)) + return sizeof(struct vos_obj_p2_df); + + return sizeof(struct vos_obj_df); +} + static int oi_rec_alloc(struct btr_instance *tins, d_iov_t *key_iov, d_iov_t *val_iov, struct btr_record *rec, d_iov_t *val_out) @@ -76,10 +86,11 @@ oi_rec_alloc(struct btr_instance *tins, d_iov_t *key_iov, struct vos_obj_df *obj; daos_unit_oid_t *key; umem_off_t obj_off; + struct vos_pool *pool = (struct vos_pool *)tins->ti_priv; int rc; /* Allocate a PMEM value of type vos_obj_df */ - obj_off = umem_zalloc(&tins->ti_umm, sizeof(struct vos_obj_df)); + obj_off = umem_zalloc(&tins->ti_umm, vos_obj_df_size(pool)); if (UMOFF_IS_NULL(obj_off)) return -DER_NOSPACE; @@ -100,11 +111,11 @@ oi_rec_alloc(struct btr_instance *tins, d_iov_t *key_iov, } else { struct vos_obj_df *new_obj = val_out->iov_buf; - memcpy(obj, new_obj, sizeof(*obj)); + memcpy(obj, new_obj, vos_obj_df_size(pool)); obj->vo_id = *key; } - d_iov_set(val_iov, obj, sizeof(struct vos_obj_df)); + d_iov_set(val_iov, obj, vos_obj_df_size(pool)); rec->rec_off = obj_off; /* For new created object, commit it synchronously to reduce @@ -134,6 +145,7 @@ oi_rec_free(struct btr_instance *tins, struct btr_record *rec, void *args) daos_handle_t coh = { 0 }; int rc; struct vos_pool *pool; + uint32_t *bkt_ids = NULL; obj = umem_off2ptr(umm, rec->rec_off); @@ -162,7 +174,14 @@ oi_rec_free(struct btr_instance *tins, struct btr_record *rec, void *args) if (del_arg != NULL) coh = vos_cont2hdl((struct vos_container *)del_arg->cont); - return gc_add_item(tins->ti_priv, coh, GC_OBJ, rec->rec_off, 0); + + if (vos_pool_is_evictable(pool)) { + struct vos_obj_p2_df *p2 = (struct vos_obj_p2_df *)obj; + + bkt_ids = &p2->p2_bkt_ids[0]; + } + + return gc_add_item(tins->ti_priv, coh, GC_OBJ, rec->rec_off, bkt_ids); } static int @@ -176,7 +195,7 @@ oi_rec_fetch(struct btr_instance *tins, struct btr_record *rec, DP_UOID(obj->vo_id), rec->rec_off); D_ASSERT(val_iov != NULL); - d_iov_set(val_iov, obj, sizeof(struct vos_obj_df)); + d_iov_set(val_iov, obj, vos_obj_df_size((struct vos_pool *)tins->ti_priv)); return 0; } @@ -234,7 +253,6 @@ vos_oi_find(struct vos_container *cont, daos_unit_oid_t oid, } tmprc = vos_ilog_ts_add(ts_set, ilog, &oid, sizeof(oid)); - D_ASSERT(tmprc == 0); /* Non-zero return for akey only */ return rc; @@ -504,7 +522,7 @@ oi_iter_nested_tree_fetch(struct vos_iterator *iter, vos_iter_type_t type, return rc; } - D_ASSERT(rec_iov.iov_len == sizeof(struct vos_obj_df)); + D_ASSERT(rec_iov.iov_len == vos_obj_df_size(oiter->oit_cont->vc_pool)); obj = (struct vos_obj_df *)rec_iov.iov_buf; rc = oi_iter_ilog_check(obj, oiter, &info->ii_epr, false); @@ -610,7 +628,7 @@ oi_iter_match_probe(struct vos_iterator *iter, daos_anchor_t *anchor, uint32_t f goto failed; } - D_ASSERT(iov.iov_len == sizeof(struct vos_obj_df)); + D_ASSERT(iov.iov_len == vos_obj_df_size(oiter->oit_cont->vc_pool)); obj = (struct vos_obj_df *)iov.iov_buf; if (iter->it_filter_cb != NULL && (flags & VOS_ITER_PROBE_AGAIN) == 0) { @@ -767,7 +785,7 @@ oi_iter_fetch(struct vos_iterator *iter, vos_iter_entry_t *it_entry, return rc; } - D_ASSERT(rec_iov.iov_len == sizeof(struct vos_obj_df)); + D_ASSERT(rec_iov.iov_len == vos_obj_df_size(oiter->oit_cont->vc_pool)); return 
oi_iter_fill(rec_iov.iov_buf, oiter, false, it_entry); } @@ -818,7 +836,7 @@ oi_iter_check_punch(daos_handle_t ih) "Probe should be done before aggregation\n"); if (rc != 0) return rc; - D_ASSERT(rec_iov.iov_len == sizeof(struct vos_obj_df)); + D_ASSERT(rec_iov.iov_len == vos_obj_df_size(oiter->oit_cont->vc_pool)); obj = (struct vos_obj_df *)rec_iov.iov_buf; oid = obj->vo_id; @@ -883,7 +901,7 @@ oi_iter_aggregate(daos_handle_t ih, bool range_discard) "Probe should be done before aggregation\n"); if (rc != 0) return rc; - D_ASSERT(rec_iov.iov_len == sizeof(struct vos_obj_df)); + D_ASSERT(rec_iov.iov_len == vos_obj_df_size(oiter->oit_cont->vc_pool)); obj = (struct vos_obj_df *)rec_iov.iov_buf; oid = obj->vo_id; diff --git a/src/vos/vos_overhead.c b/src/vos/vos_overhead.c index fff55c67d2a..f0b0f0375d3 100644 --- a/src/vos/vos_overhead.c +++ b/src/vos/vos_overhead.c @@ -8,13 +8,13 @@ int vos_pool_get_msize(void) { - return sizeof(struct vos_pool_df); + return sizeof(struct vos_pool_df) + sizeof(struct vos_pool_ext_df); } int vos_container_get_msize(void) { - return sizeof(struct vos_cont_df); + return sizeof(struct vos_cont_df) + sizeof(struct vos_cont_ext_df); } int diff --git a/src/vos/vos_pool.c b/src/vos/vos_pool.c index 6c2e0120842..f0b8fa8604f 100644 --- a/src/vos/vos_pool.c +++ b/src/vos/vos_pool.c @@ -149,12 +149,12 @@ vos_meta_load_fn(void *arg) ABT_cond_signal(mlc->mlc_cond); } -static inline int -vos_meta_load(struct umem_store *store, char *start) +static int +vos_meta_load(struct umem_store *store, char *start, daos_off_t offset, daos_size_t len) { uint64_t read_size; - uint64_t remain_size = store->stor_size; - daos_off_t off = 0; + uint64_t remain_size = len; + daos_off_t off = offset; int rc = 0; struct meta_load_arg *mla; struct meta_load_control mlc; @@ -223,6 +223,74 @@ vos_meta_load(struct umem_store *store, char *start) return rc ? 
rc : mlc.mlc_rc; } +struct vos_waitqueue { + ABT_cond vw_cond; + ABT_mutex vw_mutex; +}; + +static int +vos_waitqueue_create(void **ret_wq) +{ + struct vos_waitqueue *wq; + int rc; + + D_ALLOC_PTR(wq); + if (wq == NULL) + return -DER_NOMEM; + + rc = ABT_mutex_create(&wq->vw_mutex); + if (rc != ABT_SUCCESS) { + D_FREE(wq); + return dss_abterr2der(rc); + } + rc = ABT_cond_create(&wq->vw_cond); + if (rc != ABT_SUCCESS) { + ABT_mutex_free(&wq->vw_mutex); + D_FREE(wq); + return dss_abterr2der(rc); + } + + *ret_wq = wq; + return 0; +} + +static void +vos_waitqueue_destroy(void *arg) +{ + struct vos_waitqueue *wq = arg; + + ABT_cond_free(&wq->vw_cond); + ABT_mutex_free(&wq->vw_mutex); + D_FREE(wq); +} + +static void +vos_waitqueue_wait(void *arg, bool yield_only) +{ + struct vos_waitqueue *wq = arg; + + if (yield_only) { + ABT_thread_yield(); + return; + } + ABT_mutex_lock(wq->vw_mutex); + ABT_cond_wait(wq->vw_cond, wq->vw_mutex); + ABT_mutex_unlock(wq->vw_mutex); +} + +static void +vos_waitqueue_wakeup(void *arg, bool wakeup_all) +{ + struct vos_waitqueue *wq = arg; + + ABT_mutex_lock(wq->vw_mutex); + if (wakeup_all) + ABT_cond_broadcast(wq->vw_cond); + else + ABT_cond_signal(wq->vw_cond); + ABT_mutex_unlock(wq->vw_mutex); +} + static inline int vos_meta_writev(struct umem_store *store, struct umem_store_iod *iod, d_sg_list_t *sgl) { @@ -353,13 +421,75 @@ vos_wal_metrics_init(struct vos_wal_metrics *vw_metrics, const char *path, int t D_WARN("Failed to create 'replay_entries' telemetry: "DF_RC"\n", DP_RC(rc)); } +#define VOS_CACHE_DIR "vos_cache" + +void +vos_cache_metrics_init(struct vos_cache_metrics *vc_metrics, const char *path, int tgt_id) +{ + int rc; + + rc = d_tm_add_metric(&vc_metrics->vcm_pg_ne, D_TM_GAUGE, "Non-evictable pages", + "pages", "%s/%s/page_ne/tgt_%d", path, VOS_CACHE_DIR, tgt_id); + if (rc) + DL_WARN(rc, "Failed to create non-evictable pages telemetry."); + + rc = d_tm_add_metric(&vc_metrics->vcm_pg_pinned, D_TM_GAUGE, "Pinned pages", + "pages", "%s/%s/page_pinned/tgt_%d", path, VOS_CACHE_DIR, tgt_id); + if (rc) + DL_WARN(rc, "Failed to create pinned pages telemetry."); + + rc = d_tm_add_metric(&vc_metrics->vcm_pg_free, D_TM_GAUGE, "Free pages", + "pages", "%s/%s/page_free/tgt_%d", path, VOS_CACHE_DIR, tgt_id); + if (rc) + DL_WARN(rc, "Failed to create free pages telemetry."); + + rc = d_tm_add_metric(&vc_metrics->vcm_pg_hit, D_TM_COUNTER, "Page cache hit", + "hits", "%s/%s/page_hit/tgt_%d", path, VOS_CACHE_DIR, tgt_id); + if (rc) + DL_WARN(rc, "Failed to create page hit telemetry."); + + rc = d_tm_add_metric(&vc_metrics->vcm_pg_miss, D_TM_COUNTER, "Page cache miss", + "misses", "%s/%s/page_miss/tgt_%d", path, VOS_CACHE_DIR, tgt_id); + if (rc) + DL_WARN(rc, "Failed to create page miss telemetry."); + + rc = d_tm_add_metric(&vc_metrics->vcm_pg_evict, D_TM_COUNTER, "Page cache evict", + "pages", "%s/%s/page_evict/tgt_%d", path, VOS_CACHE_DIR, tgt_id); + if (rc) + DL_WARN(rc, "Failed to create page evict telemetry."); + + rc = d_tm_add_metric(&vc_metrics->vcm_pg_flush, D_TM_COUNTER, "Page cache flush", + "pages", "%s/%s/page_flush/tgt_%d", path, VOS_CACHE_DIR, tgt_id); + if (rc) + DL_WARN(rc, "Failed to create page flush telemetry."); + + rc = d_tm_add_metric(&vc_metrics->vcm_pg_load, D_TM_COUNTER, "Page cache load", + "pages", "%s/%s/page_load/tgt_%d", path, VOS_CACHE_DIR, tgt_id); + if (rc) + DL_WARN(rc, "Failed to create page load telemetry."); + + rc = d_tm_add_metric(&vc_metrics->vcm_obj_hit, D_TM_COUNTER, "Object cache hit", + "hits", "%s/%s/obj_hit/tgt_%d", path, 
VOS_CACHE_DIR, tgt_id); + if (rc) + DL_WARN(rc, "Failed to create object hit telemetry."); + +} + +static inline struct vos_wal_metrics * +store2wal_metrics(struct umem_store *store) +{ + struct vos_pool_metrics *vpm = (struct vos_pool_metrics *)store->stor_stats; + + return vpm != NULL ? &vpm->vp_wal_metrics : NULL; +} + static inline int vos_wal_reserve(struct umem_store *store, uint64_t *tx_id) { struct bio_wal_info wal_info; struct vos_pool *pool; struct bio_wal_stats ws = { 0 }; - struct vos_wal_metrics *vwm; + struct vos_wal_metrics *vwm = store2wal_metrics(store); int rc; pool = store->vos_priv; @@ -377,7 +507,6 @@ vos_wal_reserve(struct umem_store *store, uint64_t *tx_id) reserve: D_ASSERT(store && store->stor_priv != NULL); - vwm = (struct vos_wal_metrics *)store->stor_stats; rc = bio_wal_reserve(store->stor_priv, tx_id, (vwm != NULL) ? &ws : NULL); if (rc == 0 && vwm != NULL) d_tm_set_gauge(vwm->vwm_wal_waiters, ws.ws_waiters); @@ -391,11 +520,10 @@ vos_wal_commit(struct umem_store *store, struct umem_wal_tx *wal_tx, void *data_ struct bio_wal_info wal_info; struct vos_pool *pool; struct bio_wal_stats ws = {0}; - struct vos_wal_metrics *vwm; + struct vos_wal_metrics *vwm = store2wal_metrics(store); int rc; D_ASSERT(store && store->stor_priv != NULL); - vwm = (struct vos_wal_metrics *)store->stor_stats; if (vwm != NULL) d_tm_mark_duration_start(vwm->vwm_wal_dur, D_TM_CLOCK_REALTIME); rc = bio_wal_commit(store->stor_priv, wal_tx, data_iod, (vwm != NULL) ? &ws : NULL); @@ -426,6 +554,9 @@ vos_wal_commit(struct umem_store *store, struct umem_wal_tx *wal_tx, void *data_ d_tm_set_gauge(vwm->vwm_wal_qd, ws.ws_qd); } + bio_wal_query(store->stor_priv, &wal_info); + umem_cache_commit(store, wal_info.wi_commit_id); + pool = store->vos_priv; if (unlikely(pool == NULL)) return 0; /** In case there is any race for checkpoint init. */ @@ -433,8 +564,6 @@ vos_wal_commit(struct umem_store *store, struct umem_wal_tx *wal_tx, void *data_ /** Update checkpoint state after commit in case there is an active checkpoint waiting * for this commit to finish. */ - bio_wal_query(store->stor_priv, &wal_info); - pool->vp_update_cb(pool->vp_chkpt_arg, wal_info.wi_commit_id, wal_info.wi_used_blks, wal_info.wi_tot_blks); @@ -446,18 +575,15 @@ vos_wal_replay(struct umem_store *store, int (*replay_cb)(uint64_t tx_id, struct umem_action *act, void *arg), void *arg) { - struct bio_wal_rp_stats wrs; - int rc; + struct bio_wal_rp_stats wrs; + struct vos_wal_metrics *vwm = store2wal_metrics(store); + int rc; D_ASSERT(store && store->stor_priv != NULL); - rc = bio_wal_replay(store->stor_priv, - (store->stor_stats != NULL) ? &wrs : NULL, - replay_cb, arg); + rc = bio_wal_replay(store->stor_priv, (vwm != NULL) ? 
&wrs : NULL, replay_cb, arg); /* VOS file rehydration metrics */ - if (store->stor_stats != NULL && rc >= 0) { - struct vos_wal_metrics *vwm = (struct vos_wal_metrics *)store->stor_stats; - + if (vwm != NULL && rc >= 0) { d_tm_inc_counter(vwm->vwm_replay_count, 1); d_tm_set_gauge(vwm->vwm_replay_size, wrs.wrs_sz); d_tm_set_gauge(vwm->vwm_replay_time, wrs.wrs_tm); @@ -475,6 +601,10 @@ vos_wal_id_cmp(struct umem_store *store, uint64_t id1, uint64_t id2) } struct umem_store_ops vos_store_ops = { + .so_waitqueue_create = vos_waitqueue_create, + .so_waitqueue_destroy = vos_waitqueue_destroy, + .so_waitqueue_wait = vos_waitqueue_wait, + .so_waitqueue_wakeup = vos_waitqueue_wakeup, .so_load = vos_meta_load, .so_read = vos_meta_readv, .so_write = vos_meta_writev, @@ -667,30 +797,90 @@ vos2mc_flags(unsigned int vos_flags) return mc_flags; } +static inline void +init_umem_store(struct umem_store *store, struct bio_meta_context *mc) +{ + bio_meta_get_attr(mc, &store->stor_size, &store->stor_blk_size, &store->stor_hdr_blks, + (uint8_t *)&store->store_type, &store->store_evictable); + store->stor_priv = mc; + store->stor_ops = &vos_store_ops; + + /* Legacy BMEM V1 pool without backend type stored */ + if (bio_nvme_configured(SMD_DEV_TYPE_META) && store->store_type == DAOS_MD_PMEM) + store->store_type = DAOS_MD_BMEM; +} + +static int +vos_pool_store_type(daos_size_t scm_sz, daos_size_t meta_sz) +{ + int backend; + + backend = umempobj_get_backend_type(); + D_ASSERT((meta_sz != 0) && (scm_sz != 0)); + + if (scm_sz > meta_sz) { + D_ERROR("memsize %lu is greater than metasize %lu", scm_sz, meta_sz); + return -DER_INVAL; + } + + if (scm_sz < meta_sz) { + if ((backend == DAOS_MD_BMEM) && umempobj_allow_md_bmem_v2()) + backend = DAOS_MD_BMEM_V2; + else if (backend != DAOS_MD_BMEM_V2) { + D_ERROR("scm_sz %lu is less than meta_sz %lu", scm_sz, meta_sz); + return -DER_INVAL; + } + } + + return backend; +} + +int +vos_pool_roundup_size(daos_size_t *scm_sz, daos_size_t *meta_sz) +{ + size_t alignsz; + int rc; + + D_ASSERT(*scm_sz != 0); + rc = vos_pool_store_type(*scm_sz, *meta_sz ? 
*meta_sz : *scm_sz); + if (rc < 0) + return rc; + + /* Round up the size such that it is compatible with backend */ + alignsz = umempobj_pgsz(rc); + *scm_sz = max(D_ALIGNUP(*scm_sz, alignsz), 1 << 24); + if (*meta_sz) + *meta_sz = max(D_ALIGNUP(*meta_sz, alignsz), 1 << 24); + + return 0; +} + static int vos_pmemobj_create(const char *path, uuid_t pool_id, const char *layout, - size_t scm_sz, size_t nvme_sz, size_t wal_sz, unsigned int flags, - struct umem_pool **ph) + size_t scm_sz, size_t nvme_sz, size_t wal_sz, size_t meta_sz, + unsigned int flags, struct umem_pool **ph) { struct bio_xs_context *xs_ctxt = vos_xsctxt_get(); struct umem_store store = { 0 }; struct bio_meta_context *mc; struct umem_pool *pop = NULL; enum bio_mc_flags mc_flags = vos2mc_flags(flags); - size_t meta_sz = scm_sz; int rc, ret; + size_t scm_sz_actual; *ph = NULL; /* always use PMEM mode for SMD */ - store.store_type = umempobj_get_backend_type(); if (flags & VOS_POF_SYSDB) { store.store_type = DAOS_MD_PMEM; store.store_standalone = true; + goto umem_create; } /* No NVMe is configured or current xstream doesn't have NVMe context */ - if (!bio_nvme_configured(SMD_DEV_TYPE_MAX) || xs_ctxt == NULL) + if (!bio_nvme_configured(SMD_DEV_TYPE_MAX) || xs_ctxt == NULL) { + store.store_type = DAOS_MD_PMEM; goto umem_create; + } if (!scm_sz) { struct stat lstat; @@ -698,14 +888,28 @@ vos_pmemobj_create(const char *path, uuid_t pool_id, const char *layout, rc = stat(path, &lstat); if (rc != 0) return daos_errno2der(errno); - meta_sz = lstat.st_size; + scm_sz_actual = lstat.st_size; + } else + scm_sz_actual = scm_sz; + + /* If meta_sz is set then use it, otherwise derive from VOS file size or scm_sz */ + if (!meta_sz) + meta_sz = scm_sz_actual; + + rc = vos_pool_store_type(scm_sz_actual, meta_sz); + if (rc < 0) { + D_ERROR("Failed to determine the store type for xs:%p pool:"DF_UUID". "DF_RC, + xs_ctxt, DP_UUID(pool_id), DP_RC(rc)); + return rc; + } + store.store_type = rc; D_DEBUG(DB_MGMT, "Create BIO meta context for xs:%p pool:"DF_UUID" " - "meta_sz: %zu, nvme_sz: %zu wal_sz:%zu\n", - xs_ctxt, DP_UUID(pool_id), meta_sz, nvme_sz, wal_sz); + "scm_sz: %zu meta_sz: %zu, nvme_sz: %zu wal_sz:%zu backend:%d\n", + xs_ctxt, DP_UUID(pool_id), scm_sz, meta_sz, nvme_sz, wal_sz, store.store_type); - rc = bio_mc_create(xs_ctxt, pool_id, meta_sz, wal_sz, nvme_sz, mc_flags); + rc = bio_mc_create(xs_ctxt, pool_id, scm_sz_actual, meta_sz, wal_sz, nvme_sz, mc_flags, + store.store_type); if (rc != 0) { D_ERROR("Failed to create BIO meta context for xs:%p pool:"DF_UUID". 
"DF_RC"\n", xs_ctxt, DP_UUID(pool_id), DP_RC(rc)); @@ -724,11 +928,11 @@ vos_pmemobj_create(const char *path, uuid_t pool_id, const char *layout, return rc; } - bio_meta_get_attr(mc, &store.stor_size, &store.stor_blk_size, &store.stor_hdr_blks); - store.stor_priv = mc; - store.stor_ops = &vos_store_ops; + init_umem_store(&store, mc); umem_create: + D_DEBUG(DB_MGMT, "umempobj_create sz: " DF_U64 " store_sz: " DF_U64, scm_sz, + store.stor_size); pop = umempobj_create(path, layout, UMEMPOBJ_ENABLE_STATS, scm_sz, 0600, &store); if (pop != NULL) { *ph = pop; @@ -764,15 +968,17 @@ vos_pmemobj_open(const char *path, uuid_t pool_id, const char *layout, unsigned *ph = NULL; /* always use PMEM mode for SMD */ - store.store_type = umempobj_get_backend_type(); if (flags & VOS_POF_SYSDB) { store.store_type = DAOS_MD_PMEM; store.store_standalone = true; + goto umem_open; } /* No NVMe is configured or current xstream doesn't have NVMe context */ - if (!bio_nvme_configured(SMD_DEV_TYPE_MAX) || xs_ctxt == NULL) + if (!bio_nvme_configured(SMD_DEV_TYPE_MAX) || xs_ctxt == NULL) { + store.store_type = DAOS_MD_PMEM; goto umem_open; + } D_DEBUG(DB_MGMT, "Open BIO meta context for xs:%p pool:"DF_UUID"\n", xs_ctxt, DP_UUID(pool_id)); @@ -784,14 +990,8 @@ vos_pmemobj_open(const char *path, uuid_t pool_id, const char *layout, unsigned return rc; } - bio_meta_get_attr(mc, &store.stor_size, &store.stor_blk_size, &store.stor_hdr_blks); - store.stor_priv = mc; - store.stor_ops = &vos_store_ops; - if (metrics != NULL) { - struct vos_pool_metrics *vpm = (struct vos_pool_metrics *)metrics; - - store.stor_stats = &vpm->vp_wal_metrics; - } + init_umem_store(&store, mc); + store.stor_stats = metrics; umem_open: pop = umempobj_open(path, layout, UMEMPOBJ_ENABLE_STATS, &store); @@ -1014,7 +1214,8 @@ static int pool_open(void *ph, struct vos_pool_df *pool_df, int vos_pool_create_ex(const char *path, uuid_t uuid, daos_size_t scm_sz, daos_size_t nvme_sz, - daos_size_t wal_sz, unsigned int flags, uint32_t version, daos_handle_t *poh) + daos_size_t wal_sz, daos_size_t meta_sz, unsigned int flags, uint32_t version, + daos_handle_t *poh) { struct umem_pool *ph; struct umem_attr uma = {0}; @@ -1036,9 +1237,9 @@ vos_pool_create_ex(const char *path, uuid_t uuid, daos_size_t scm_sz, daos_size_ return -DER_INVAL; D_DEBUG(DB_MGMT, - "Pool Path: %s, size: " DF_U64 ":" DF_U64 ", " + "Pool Path: %s, size: " DF_U64 ":" DF_U64 ":" DF_U64 ", " "UUID: " DF_UUID ", version: %u\n", - path, scm_sz, nvme_sz, DP_UUID(uuid), version); + path, scm_sz, nvme_sz, meta_sz, DP_UUID(uuid), version); if (flags & VOS_POF_SMALL) flags |= VOS_POF_EXCL; @@ -1054,15 +1255,16 @@ vos_pool_create_ex(const char *path, uuid_t uuid, daos_size_t scm_sz, daos_size_ } /* Path must be a file with a certain size when size argument is 0 */ - if (!scm_sz && access(path, F_OK) == -1) { + if (!scm_sz && access(path, F_OK | R_OK | W_OK) == -1) { D_ERROR("File not accessible (%d) when size is 0\n", errno); return daos_errno2der(errno); } - rc = vos_pmemobj_create(path, uuid, VOS_POOL_LAYOUT, scm_sz, nvme_sz, wal_sz, flags, &ph); + rc = vos_pmemobj_create(path, uuid, VOS_POOL_LAYOUT, scm_sz, nvme_sz, wal_sz, meta_sz, + flags, &ph); if (rc) { - D_ERROR("Failed to create pool %s, scm_sz="DF_U64", nvme_sz="DF_U64". "DF_RC"\n", - path, scm_sz, nvme_sz, DP_RC(rc)); + D_ERROR("Failed to create pool %s, scm_sz="DF_U64", nvme_sz="DF_U64", meta_sz=" + DF_U64". 
"DF_RC"\n", path, scm_sz, nvme_sz, meta_sz, DP_RC(rc)); return rc; } @@ -1096,6 +1298,18 @@ vos_pool_create_ex(const char *path, uuid_t uuid, daos_size_t scm_sz, daos_size_ goto end; memset(pool_df, 0, sizeof(*pool_df)); + + pool_df->pd_ext = umem_zalloc(&umem, sizeof(struct vos_pool_ext_df)); + if (UMOFF_IS_NULL(pool_df->pd_ext)) { + D_ERROR("Failed to allocate pool df extension.\n"); + rc = -DER_NOSPACE; + goto end; + } + + rc = gc_init_pool(&umem, pool_df); + if (rc) + goto end; + rc = dbtree_create_inplace(VOS_BTR_CONT_TABLE, 0, VOS_CONT_ORDER, &uma, &pool_df->pd_cont_root, &hdl); if (rc != 0) @@ -1104,15 +1318,14 @@ vos_pool_create_ex(const char *path, uuid_t uuid, daos_size_t scm_sz, daos_size_ dbtree_close(hdl); uuid_copy(pool_df->pd_id, uuid); - pool_df->pd_scm_sz = scm_sz; + /* Use meta-blob size as scm if present */ + pool_df->pd_scm_sz = (meta_sz) ? meta_sz : scm_sz; pool_df->pd_nvme_sz = nvme_sz; pool_df->pd_magic = POOL_DF_MAGIC; if (DAOS_FAIL_CHECK(FLC_POOL_DF_VER)) pool_df->pd_version = 0; else pool_df->pd_version = version; - - gc_init_pool(&umem, pool_df); end: /** * The transaction can in reality be aborted @@ -1172,11 +1385,11 @@ vos_pool_create_ex(const char *path, uuid_t uuid, daos_size_t scm_sz, daos_size_ } int -vos_pool_create(const char *path, uuid_t uuid, daos_size_t scm_sz, daos_size_t nvme_sz, - unsigned int flags, uint32_t version, daos_handle_t *poh) +vos_pool_create(const char *path, uuid_t uuid, daos_size_t scm_sz, daos_size_t data_sz, + daos_size_t meta_sz, unsigned int flags, uint32_t version, daos_handle_t *poh) { /* create vos pool with default WAL size */ - return vos_pool_create_ex(path, uuid, scm_sz, nvme_sz, 0, flags, version, poh); + return vos_pool_create_ex(path, uuid, scm_sz, data_sz, 0, meta_sz, flags, version, poh); } /** @@ -1399,15 +1612,8 @@ pool_open(void *ph, struct vos_pool_df *pool_df, unsigned int flags, void *metri /* Insert the opened pool to the uuid hash table */ uuid_copy(ukey.uuid, pool_df->pd_id); pool->vp_sysdb = !!(flags & VOS_POF_SYSDB); - rc = pool_link(pool, &ukey, poh); - if (rc) { - D_ERROR("Error inserting into vos DRAM hash\n"); - D_GOTO(failed, rc); - } - pool->vp_dtx_committed_count = 0; pool->vp_pool_df = pool_df; - pool->vp_opened = 1; pool->vp_excl = !!(flags & VOS_POF_EXCL); pool->vp_small = !!(flags & VOS_POF_SMALL); @@ -1425,6 +1631,16 @@ pool_open(void *ph, struct vos_pool_df *pool_df, unsigned int flags, void *metri else pool->vp_data_thresh = DAOS_PROP_PO_DATA_THRESH_DEFAULT; + rc = gc_open_pool(pool); + if (rc) + goto failed; + + rc = pool_link(pool, &ukey, poh); + if (rc) { + D_ERROR("Error inserting into vos DRAM hash\n"); + D_GOTO(failed, rc); + } + vos_space_sys_init(pool); /* Ensure GC is triggered after server restart */ gc_add_pool(pool); @@ -1616,10 +1832,12 @@ vos_pool_close(daos_handle_t poh) pool->vp_opened--; /* If the last reference is holding by GC */ - if (pool->vp_opened == 1 && gc_have_pool(pool)) + if (pool->vp_opened == 1 && gc_have_pool(pool)) { gc_del_pool(pool); - else if (pool->vp_opened == 0) + } else if (pool->vp_opened == 0) { vos_pool_hash_del(pool); + gc_close_pool(pool); + } vos_pool_decref(pool); /* -1 for myself */ return 0; diff --git a/src/vos/vos_query.c b/src/vos/vos_query.c index e924e4016b6..b4d414012e5 100644 --- a/src/vos/vos_query.c +++ b/src/vos/vos_query.c @@ -162,7 +162,7 @@ query_normal_recx(struct open_query *query, daos_recx_t *recx) uint32_t inob; - vos_evt_desc_cbs_init(&cbs, query->qt_pool, query->qt_coh); + vos_evt_desc_cbs_init(&cbs, query->qt_pool, 
query->qt_coh, query->qt_obj); rc = evt_open(query->qt_recx_root, &query->qt_pool->vp_uma, &cbs, &toh); if (rc != 0) return rc; @@ -344,7 +344,7 @@ query_ec_recx(struct open_query *query, daos_recx_t *recx) bool prefresh = true; - vos_evt_desc_cbs_init(&cbs, query->qt_pool, query->qt_coh); + vos_evt_desc_cbs_init(&cbs, query->qt_pool, query->qt_coh, query->qt_obj); rc = evt_open(query->qt_recx_root, &query->qt_pool->vp_uma, &cbs, &toh); if (rc != 0) return rc; @@ -517,7 +517,7 @@ open_and_query_key(struct open_query *query, daos_key_t *key, return -DER_NONEXIST; rc = dbtree_open_inplace_ex(to_open, &query->qt_pool->vp_uma, - query->qt_coh, query->qt_pool, toh); + query->qt_coh, query->qt_obj, toh); if (rc != 0) return rc; diff --git a/src/vos/vos_space.c b/src/vos/vos_space.c index 5763e3f8bac..35a407e2b3e 100644 --- a/src/vos/vos_space.c +++ b/src/vos/vos_space.c @@ -126,7 +126,7 @@ vos_space_query(struct vos_pool *pool, struct vos_pool_space *vps, bool slow) struct vos_pool_df *df = pool->vp_pool_df; struct vea_attr *attr = &vps->vps_vea_attr; struct vea_stat *stat = slow ? &vps->vps_vea_stat : NULL; - daos_size_t scm_used; + daos_size_t scm_used, ne_used; int rc; SCM_TOTAL(vps) = df->pd_scm_sz; @@ -143,6 +143,27 @@ vos_space_query(struct vos_pool *pool, struct vos_pool_space *vps, bool slow) return rc; } + /* Query non-evictable zones usage when the phase2 pool is evictable */ + if (vos_pool_is_evictable(pool)) { + rc = umempobj_get_mbusage(vos_pool2umm(pool)->umm_pool, UMEM_DEFAULT_MBKT_ID, + &ne_used, &vps->vps_ne_total); + if (rc) { + rc = umem_tx_errno(rc); + DL_ERROR(rc, "Query pool:"DF_UUID" NE space usage failed.", + DP_UUID(pool->vp_id)); + return rc; + } + if (ne_used > vps->vps_ne_total) { + D_ERROR("NE used:"DF_U64" > NE total:"DF_U64"\n", + ne_used, vps->vps_ne_total); + return -DER_INVAL; + } + vps->vps_ne_free = vps->vps_ne_total - ne_used; + } else { + vps->vps_ne_total = 0; + vps->vps_ne_free = 0; + } + /* * FIXME: pmemobj_ctl_get() sometimes return an insane large value, it * could be a PMDK defect. diff --git a/src/vos/vos_tree.c b/src/vos/vos_tree.c index e9dd4e94436..c7aa8b57f5e 100644 --- a/src/vos/vos_tree.c +++ b/src/vos/vos_tree.c @@ -154,8 +154,9 @@ ktr_hkey_gen(struct btr_instance *tins, d_iov_t *key_iov, void *hkey) { struct ktr_hkey *kkey = (struct ktr_hkey *)hkey; struct umem_pool *umm_pool = tins->ti_umm.umm_pool; - struct vos_pool *pool = (struct vos_pool *)tins->ti_priv; + struct vos_pool *pool; + pool = vos_obj2pool(tins->ti_priv); D_ASSERT(key_iov->iov_len < pool->vp_pool_df->pd_scm_sz); hkey_common_gen(key_iov, hkey); @@ -255,7 +256,7 @@ ktr_rec_alloc(struct btr_instance *tins, d_iov_t *key_iov, rbund = iov2rec_bundle(val_iov); - rec->rec_off = umem_zalloc(&tins->ti_umm, vos_krec_size(rbund)); + rec->rec_off = vos_obj_alloc(&tins->ti_umm, tins->ti_priv, vos_krec_size(rbund), true); if (UMOFF_IS_NULL(rec->rec_off)) return -DER_NOSPACE; @@ -286,6 +287,8 @@ ktr_rec_free(struct btr_instance *tins, struct btr_record *rec, void *args) int gc; int rc; struct vos_pool *pool; + struct vos_object *obj; + uint32_t *bkt_ids = NULL; if (UMOFF_IS_NULL(rec->rec_off)) return 0; @@ -298,14 +301,22 @@ ktr_rec_free(struct btr_instance *tins, struct btr_record *rec, void *args) if (rc != 0) return rc; - pool = (struct vos_pool *)tins->ti_priv; + D_ASSERT(tins->ti_priv); + obj = tins->ti_priv; + pool = vos_obj2pool(obj); + vos_ilog_ts_evict(&krec->kr_ilog, (krec->kr_bmap & KREC_BF_DKEY) ? 
VOS_TS_TYPE_DKEY : VOS_TS_TYPE_AKEY, pool->vp_sysdb); - D_ASSERT(tins->ti_priv); gc = (krec->kr_bmap & KREC_BF_DKEY) ? GC_DKEY : GC_AKEY; coh = vos_cont2hdl(args); - return gc_add_item(pool, coh, gc, rec->rec_off, 0); + + if (vos_pool_is_evictable(pool)) { + D_ASSERT(obj->obj_bkt_alloted == 1); + bkt_ids = &obj->obj_bkt_ids[0]; + } + + return gc_add_item(pool, coh, gc, rec->rec_off, bkt_ids); } static int @@ -351,7 +362,7 @@ ktr_rec_update(struct btr_instance *tins, struct btr_record *rec, static umem_off_t ktr_node_alloc(struct btr_instance *tins, int size) { - return umem_zalloc(&tins->ti_umm, size); + return vos_obj_alloc(&tins->ti_umm, tins->ti_priv, size, true); } static btr_ops_t key_btr_ops = { @@ -636,7 +647,7 @@ svt_free_payload(struct vos_pool *pool, bio_addr_t *addr, uint64_t rsize) } else if (addr->ba_type == DAOS_MEDIA_NVME) { rc = vos_bio_addr_free(pool, addr, rsize); if (rc) - DL_ERROR(rc, "Free SV payload on NVMe failed."); + DL_ERROR(rc, "Free SV payload on NVMe failed."); } /* Payload is allocated along with vos_iref_df when SV is stored on SCM */ @@ -670,7 +681,10 @@ svt_rec_free_internal(struct btr_instance *tins, struct btr_record *rec, return rc; if (!overwrite) { - struct vos_pool *pool = tins->ti_priv; + struct vos_pool *pool; + + D_ASSERT(tins->ti_priv != NULL); + pool = vos_obj2pool(tins->ti_priv); rc = svt_free_payload(pool, addr, irec->ir_size); if (rc) @@ -762,7 +776,7 @@ svt_check_availability(struct btr_instance *tins, struct btr_record *rec, static umem_off_t svt_node_alloc(struct btr_instance *tins, int size) { - return umem_zalloc(&tins->ti_umm, size); + return vos_obj_alloc(&tins->ti_umm, tins->ti_priv, size, true); } static btr_ops_t singv_btr_ops = { @@ -850,12 +864,13 @@ evt_dop_log_del(struct umem_instance *umm, daos_epoch_t epoch, } void -vos_evt_desc_cbs_init(struct evt_desc_cbs *cbs, struct vos_pool *pool, - daos_handle_t coh) +vos_evt_desc_cbs_init(struct evt_desc_cbs *cbs, struct vos_pool *pool, daos_handle_t coh, + struct vos_object *obj) { /* NB: coh is not required for destroy */ cbs->dc_bio_free_cb = evt_dop_bio_free; cbs->dc_bio_free_args = (void *)pool; + cbs->dc_alloc_arg = (void *)obj; cbs->dc_log_status_cb = evt_dop_log_status; cbs->dc_log_status_args = (void *)(unsigned long)coh.cookie; cbs->dc_log_add_cb = evt_dop_log_add; @@ -877,7 +892,7 @@ tree_open_create(struct vos_object *obj, enum vos_tree_class tclass, int flags, int unexpected_flag; int rc = 0; - vos_evt_desc_cbs_init(&cbs, pool, coh); + vos_evt_desc_cbs_init(&cbs, pool, coh, obj); if ((krec->kr_bmap & (KREC_BF_BTR | KREC_BF_EVT)) == 0) goto create; @@ -903,7 +918,7 @@ tree_open_create(struct vos_object *obj, enum vos_tree_class tclass, int flags, if (expected_flag == KREC_BF_EVT) { rc = evt_open(&krec->kr_evt, uma, &cbs, sub_toh); } else { - rc = dbtree_open_inplace_ex(&krec->kr_btr, uma, coh, pool, sub_toh); + rc = dbtree_open_inplace_ex(&krec->kr_btr, uma, coh, obj, sub_toh); } if (rc != 0) D_ERROR("Failed to open tree: " DF_RC "\n", DP_RC(rc)); @@ -972,7 +987,7 @@ tree_open_create(struct vos_object *obj, enum vos_tree_class tclass, int flags, rc = dbtree_create_inplace_ex(ta->ta_class, tree_feats, ta->ta_order, uma, &krec->kr_btr, - coh, pool, sub_toh); + coh, obj, sub_toh); if (rc != 0) { D_ERROR("Failed to create btree: "DF_RC"\n", DP_RC(rc)); goto out; @@ -1254,14 +1269,13 @@ obj_tree_init(struct vos_object *obj) ta->ta_order, vos_obj2uma(obj), &obj->obj_df->vo_tree, vos_cont2hdl(obj->obj_cont), - vos_obj2pool(obj), - &obj->obj_toh); + obj, &obj->obj_toh); } else { 
D_DEBUG(DB_DF, "Open btree for object\n"); rc = dbtree_open_inplace_ex(&obj->obj_df->vo_tree, vos_obj2uma(obj), vos_cont2hdl(obj->obj_cont), - vos_obj2pool(obj), &obj->obj_toh); + obj, &obj->obj_toh); } if (rc) diff --git a/utils/build.config b/utils/build.config index 5b039750a9b..55dc0b05862 100644 --- a/utils/build.config +++ b/utils/build.config @@ -8,7 +8,7 @@ pmdk=2.1.0 isal=v2.30.0 isal_crypto=v2.23.0 spdk=v22.01.2 -ofi=v1.19.1 +ofi=v1.22.0 mercury=v2.4.0rc5 protobufc=v1.3.3 ucx=v1.14.1 @@ -27,7 +27,6 @@ ucx=https://github.com/openucx/ucx.git [patch_versions] spdk=https://github.com/spdk/spdk/commit/b0aba3fcd5aceceea530a702922153bc75664978.diff,https://github.com/spdk/spdk/commit/445a4c808badbad3942696ecf16fa60e8129a747.diff -ofi=https://github.com/ofiwg/libfabric/commit/d827c6484cc5bf67dfbe395890e258860c3f0979.diff fuse=https://github.com/libfuse/libfuse/commit/c9905341ea34ff9acbc11b3c53ba8bcea35eeed8.diff mercury=https://raw.githubusercontent.com/daos-stack/mercury/f3dc286fb40ec1a3a38a2e17c45497bc2aa6290d/na_ucx.patch pmdk=https://github.com/pmem/pmdk/commit/2abe15ac0b4eed894b6768cd82a3b0a7c4336284.diff diff --git a/utils/node_local_test.py b/utils/node_local_test.py index 1394dc8182a..e49537f3098 100755 --- a/utils/node_local_test.py +++ b/utils/node_local_test.py @@ -1336,7 +1336,7 @@ def __str__(self): return f'DFuse instance at {self.dir} ({running})' - def start(self, v_hint=None, single_threaded=False, use_oopt=False): + def start(self, v_hint=None, use_oopt=False): """Start a dfuse instance""" # pylint: disable=too-many-branches dfuse_bin = join(self.conf['PREFIX'], 'bin', 'dfuse') @@ -1384,9 +1384,7 @@ def start(self, v_hint=None, single_threaded=False, use_oopt=False): if self.multi_user: cmd.append('--multi-user') - if single_threaded: - cmd.append('--singlethread') - elif not self.cores: + if not self.cores: # Use a lower default thread-count for NLT due to running tests in parallel. 
cmd.extend(['--thread-count', '4']) @@ -1979,11 +1977,9 @@ class needs_dfuse_with_opt(): wrapping_lock = threading.Lock() # pylint: disable=too-few-public-methods - def __init__(self, caching_variants=None, wbcache=True, single_threaded=False, - dfuse_inval=True, ro=False): + def __init__(self, caching_variants=None, wbcache=True, dfuse_inval=True, ro=False): self.caching_variants = caching_variants if caching_variants else [False, True] self.wbcache = wbcache - self.single_threaded = single_threaded self.dfuse_inval = dfuse_inval self.ro = ro @@ -2019,7 +2015,7 @@ def _helper(obj): caching=caching, wbcache=self.wbcache, **args) - obj.dfuse.start(v_hint=method.__name__, single_threaded=self.single_threaded) + obj.dfuse.start(v_hint=method.__name__) try: rc = method(obj) finally: @@ -2677,11 +2673,6 @@ def test_readdir_unlink(self): assert len(post_files) == len(files) - 1 assert post_files == files[:-2] + [files[-1]] - @needs_dfuse_with_opt(single_threaded=True, caching_variants=[True]) - def test_single_threaded(self): - """Test single-threaded mode""" - self.readdir_test(10) - @needs_dfuse def test_open_replaced(self): """Test that fstat works on file clobbered by rename""" @@ -5919,7 +5910,7 @@ def test_dfuse_start(server, conf, wf): cmd = [join(conf['PREFIX'], 'bin', 'dfuse'), '--mountpoint', mount_point, - '--pool', pool.id(), '--cont', container.id(), '--foreground', '--singlethread'] + '--pool', pool.id(), '--cont', container.id(), '--foreground', '--thread-count=2'] test_cmd = AllocFailTest(conf, 'dfuse', cmd) test_cmd.wf = wf diff --git a/utils/rpms/daos.rpmlintrc b/utils/rpms/daos.rpmlintrc index b1553ca5141..9912465edf4 100644 --- a/utils/rpms/daos.rpmlintrc +++ b/utils/rpms/daos.rpmlintrc @@ -44,7 +44,7 @@ addFilter("E: static-library-without-debuginfo \/usr\/lib64\/lib(dfuse|ioil)\.a" # these need to be fixed: # https://daosio.atlassian.net/browse/DAOS-11539 -addFilter("W: no-soname \/usr\/lib64\/lib(ds3|daos_(common|cmd_hdlrs|self_test|tests|serialize|common_pmem)|dfs|dfuse|duns|ioil|pil4dfs|dpar(|_mpi)).so") +addFilter("W: no-soname \/usr\/lib64\/lib(ds3|daos_(common|cmd_hdlrs|self_test|tests|serialize|common_pmem)|dfs|dfuse|duns|ioil|pil4dfs|dpar(|_mpi)|dav_v2).so") # Tests rpm needs to be able to build daos from source so pulls in build deps and is expected. addFilter("daos-client-tests\.x86_64: E: devel-dependency protobuf-c-devel") diff --git a/utils/rpms/daos.spec b/utils/rpms/daos.spec index ea49dd2d8df..12ac6bd3d5c 100644 --- a/utils/rpms/daos.spec +++ b/utils/rpms/daos.spec @@ -16,7 +16,7 @@ Name: daos Version: 2.7.100 -Release: 9%{?relval}%{?dist} +Release: 10%{?relval}%{?dist} Summary: DAOS Storage Engine License: BSD-2-Clause-Patent @@ -457,6 +457,7 @@ getent passwd daos_agent >/dev/null || useradd -s /sbin/nologin -r -g daos_agent %{_libdir}/daos_srv/libplacement.so %{_libdir}/daos_srv/libpipeline.so %{_libdir}/libdaos_common_pmem.so +%{_libdir}/libdav_v2.so %config(noreplace) %{conf_dir}/vos_size_input.yaml %{_bindir}/daos_storage_estimator.py %{python3_sitearch}/storage_estimator/*.py @@ -592,6 +593,10 @@ getent passwd daos_agent >/dev/null || useradd -s /sbin/nologin -r -g daos_agent # No files in a shim package %changelog +* Fri Nov 1 2024 Sherin T George 2.7.100-10 +- The modified DAV allocator with memory bucket support for md_on_ssd + phase-2 is delivered as dav_v2.so. + * Tue Oct 15 2024 Brian J. 
Murrell - 2.7.100-9 - Drop BRs for UCX as they were obsoleted as of e01970d diff --git a/utils/trivy/trivy.yaml b/utils/trivy/trivy.yaml index cfb13b5c40f..c6d9974456d 100644 --- a/utils/trivy/trivy.yaml +++ b/utils/trivy/trivy.yaml @@ -1,3 +1,6 @@ +# SPDX-License-Identifier: BSD-2-Clause-Patent +# Copyright (c) 2024 Intel Corporation. + cache: backend: fs dir: @@ -16,7 +19,7 @@ db: no-progress: false repository: ghcr.io/aquasecurity/trivy-db skip-update: false -debug: false +debug: true dependency-tree: true exit-code: 0 generate-default-config: false diff --git a/utils/utest.yaml b/utils/utest.yaml index d9e66e2ad1f..faf0102050d 100644 --- a/utils/utest.yaml +++ b/utils/utest.yaml @@ -130,6 +130,11 @@ sudo: True required_src: ["src/vos/tests/bio_ut.c"] tests: + - cmd: ["bin/vos_tests", "-A", "50"] + env_vars: + DAOS_MD_ON_SSD_MODE: "3" + aio: "AIO_7" + size: 13 - cmd: ["bin/vos_tests", "-A", "50"] aio: "AIO_7" size: 13
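
The vos_obj.c hunk above introduces vos_pin_objects()/vos_unpin_objects() for pinning a set of objects in an md-on-ssd phase-2 (evictable) pool. The following is a minimal, hypothetical caller sketch, not part of the patch; it assumes the declarations added in this change and elides the caller's real work and error handling.

/* Hypothetical usage sketch of the new pin/unpin API; oids[] is prepared by the caller. */
static int
do_io_with_pinned_objects(daos_handle_t coh, daos_unit_oid_t oids[], int nr)
{
	struct vos_pin_handle	*pin_hdl = NULL;
	int			 rc;

	/* For non-evictable pools this is a no-op: rc == 0 and pin_hdl stays NULL */
	rc = vos_pin_objects(coh, oids, nr, &pin_hdl);
	if (rc != 0)
		return rc;

	/* ... perform the updates/fetches against the pinned objects here ... */

	/* Releases the acquired objects and the umem cache pin, then frees the handle */
	if (pin_hdl != NULL)
		vos_unpin_objects(coh, pin_hdl);

	return 0;
}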
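
vos_bkt_array_add() keeps vba_bkts sorted with daos_array_sort(), so the duplicate check and vos_bkt_array_subset() can binary-search through daos_array_find() with bkt_sort_ops. A short illustrative sketch of that invariant follows; the bucket ids are made up and assumed to differ from UMEM_DEFAULT_MBKT_ID.

/* Illustrative only: demonstrates the sorted/unique invariant of struct vos_bkt_array. */
static void
bkt_array_example(void)
{
	struct vos_bkt_array	bkts;
	int			rc;

	vos_bkt_array_init(&bkts);

	rc = vos_bkt_array_add(&bkts, 7);	/* vba_bkts: {7} */
	D_ASSERT(rc == 0);
	rc = vos_bkt_array_add(&bkts, 3);	/* re-sorted to {3, 7} */
	D_ASSERT(rc == 0);
	rc = vos_bkt_array_add(&bkts, 7);	/* duplicate found by daos_array_find(), no-op */
	D_ASSERT(rc == 0 && bkts.vba_cnt == 2);

	vos_bkt_array_fini(&bkts);
}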
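
vos_pool_create() now takes a separate meta_sz, and vos_pool_roundup_size() aligns scm_sz/meta_sz to the backend page size before pool creation. A hedged sketch of creating a phase-2 pool whose meta blob exceeds the in-memory VOS file; the sizes and the zero flags/version arguments are placeholders chosen for illustration, not values mandated by the patch.

/* Sketch only: when meta_sz > scm_sz, vos_pool_store_type() selects the BMEM V2
 * backend (if allowed); sizes below are arbitrary placeholders. */
static int
create_phase2_pool(const char *path, uuid_t uuid, daos_handle_t *poh)
{
	daos_size_t	scm_sz  = 1ULL << 30;	/* 1 GiB memory file (placeholder) */
	daos_size_t	meta_sz = 4ULL << 30;	/* 4 GiB meta blob (placeholder) */
	daos_size_t	data_sz = 16ULL << 30;	/* 16 GiB data blob (placeholder) */
	int		rc;

	rc = vos_pool_roundup_size(&scm_sz, &meta_sz);
	if (rc != 0)
		return rc;

	return vos_pool_create(path, uuid, scm_sz, data_sz, meta_sz,
			       0 /* flags */, 0 /* df version: default, assumed */, poh);
}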