From a50b8d8032f98aca0bbe4c60546aef321d31a29e Mon Sep 17 00:00:00 2001 From: Liang Zhen Date: Wed, 8 Apr 2026 12:53:15 +0800 Subject: [PATCH 1/5] DAOS-18705 rebuild: stop refreshing aggregation epoch while rebuilding - stop refreshing aggregation epoch while rebuilding - set rebuilding flag before setting rebuild fence Signed-off-by: Liang Zhen --- src/container/srv_target.c | 52 +++++++------------------- src/include/daos_srv/container.h | 6 +-- src/include/daos_srv/pool.h | 21 +++-------- src/object/srv_obj.c | 2 +- src/rebuild/rebuild_internal.h | 13 ++++--- src/rebuild/scan.c | 29 ++++++++++----- src/rebuild/srv.c | 64 ++++++++++++-------------------- 7 files changed, 73 insertions(+), 114 deletions(-) diff --git a/src/container/srv_target.c b/src/container/srv_target.c index e172d29c4b1..50b9a323573 100644 --- a/src/container/srv_target.c +++ b/src/container/srv_target.c @@ -1,6 +1,6 @@ /** * (C) Copyright 2016-2024 Intel Corporation. - * (C) Copyright 2025 Hewlett Packard Enterprise Development LP + * (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -184,7 +184,7 @@ cont_aggregate_runnable(struct ds_cont_child *cont, struct sched_request *req, if (ds_pool_is_rebuilding(pool) && !vos_agg) { D_DEBUG(DB_EPC, DF_CONT ": skip EC aggregation during rebuild %d, %d.\n", DP_CONT(cont->sc_pool->spc_uuid, cont->sc_uuid), - atomic_load(&pool->sp_rebuilding), pool->sp_rebuild_scan); + atomic_load(&pool->sp_rebuilding), atomic_load(&pool->sp_rebuild_scanning)); return false; } @@ -293,8 +293,7 @@ cont_child_aggregate(struct ds_cont_child *cont, cont_aggregate_cb_t agg_cb, daos_epoch_t epoch_max, epoch_min; daos_epoch_range_t epoch_range; struct sched_request *req = cont2req(cont, param->ap_vos_agg); - uint64_t hlc = d_hlc_get(); - uint64_t change_hlc; + uint64_t hlc = d_hlc_get(); uint64_t interval; uint64_t snapshots_local[MAX_SNAPSHOT_LOCAL] = { 0 }; uint64_t *snapshots = NULL; @@ -303,16 +302,14 @@ 
cont_child_aggregate(struct ds_cont_child *cont, cont_aggregate_cb_t agg_cb, uint32_t flags = 0; int i, rc = 0; - change_hlc = max(cont->sc_snapshot_delete_hlc, - cont->sc_pool->spc_rebuild_end_hlc); - if (param->ap_full_scan_hlc < change_hlc) { - /* Snapshot has been deleted or rebuild happens since the last + if (param->ap_full_scan_hlc < cont->sc_snapshot_delete_hlc) { + /* Snapshot has been deleted since the last * aggregation, let's restart from 0. */ epoch_min = 0; flags |= VOS_AGG_FL_FORCE_SCAN; - D_DEBUG(DB_EPC, "change hlc "DF_X64" > full "DF_X64"\n", - change_hlc, param->ap_full_scan_hlc); + D_DEBUG(DB_EPC, "snapshot del hlc " DF_X64 " > full " DF_X64 "\n", + cont->sc_snapshot_delete_hlc, param->ap_full_scan_hlc); } else { epoch_min = get_hae(cont, param->ap_vos_agg); } @@ -352,41 +349,18 @@ cont_child_aggregate(struct ds_cont_child *cont, cont_aggregate_cb_t agg_cb, D_DEBUG(DB_EPC, "hlc "DF_X64" epoch "DF_X64"/"DF_X64" agg max "DF_X64"\n", hlc, epoch_max, epoch_min, cont->sc_aggregation_max); - if (cont->sc_snapshots_nr + 1 < MAX_SNAPSHOT_LOCAL) { + snapshots_nr = cont->sc_snapshots_nr; + if (snapshots_nr < MAX_SNAPSHOT_LOCAL) { snapshots = snapshots_local; } else { - D_ALLOC(snapshots, (cont->sc_snapshots_nr + 1) * - sizeof(daos_epoch_t)); + D_ALLOC(snapshots, snapshots_nr * sizeof(daos_epoch_t)); if (snapshots == NULL) return -DER_NOMEM; } - if (cont->sc_pool->spc_rebuild_fence != 0) { - uint64_t rebuild_fence = cont->sc_pool->spc_rebuild_fence; - int j; - int insert_idx; - - /* insert rebuild_fetch into the snapshot list */ - D_DEBUG(DB_EPC, "rebuild fence "DF_X64"\n", rebuild_fence); - for (j = 0, insert_idx = 0; j < cont->sc_snapshots_nr; j++) { - if (cont->sc_snapshots[j] < rebuild_fence) { - snapshots[j] = cont->sc_snapshots[j]; - insert_idx++; - } else { - snapshots[j + 1] = cont->sc_snapshots[j]; - } - } - snapshots[insert_idx] = rebuild_fence; - snapshots_nr = cont->sc_snapshots_nr + 1; - } else { - /* Since sc_snapshots might be freed by other 
ULT, let's - * always copy here. - */ - snapshots_nr = cont->sc_snapshots_nr; - if (snapshots_nr > 0) - memcpy(snapshots, cont->sc_snapshots, - snapshots_nr * sizeof(daos_epoch_t)); - } + /* Since sc_snapshots might be freed by other ULT, let's always copy here. */ + if (snapshots_nr > 0) + memcpy(snapshots, cont->sc_snapshots, snapshots_nr * sizeof(daos_epoch_t)); /* Find highest snapshot less than last aggregated epoch. */ for (i = 0; i < snapshots_nr && snapshots[i] < epoch_min; ++i) diff --git a/src/include/daos_srv/container.h b/src/include/daos_srv/container.h index 5cb2d466027..a731cf7e101 100644 --- a/src/include/daos_srv/container.h +++ b/src/include/daos_srv/container.h @@ -1,6 +1,6 @@ /* * (C) Copyright 2015-2024 Intel Corporation. - * (C) Copyright 2025 Hewlett Packard Enterprise Development LP + * (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -112,7 +112,7 @@ struct ds_cont_child { * VOS aggregation will use this boundary. We will optimize it later. */ uint64_t sc_ec_agg_eph_boundary; - /* The current EC aggregate epoch for this xstream */ + /* The local EC aggregation epoch for this xstream */ uint64_t sc_ec_agg_eph; /* Used by cont_ec_eph_query_ult to query the minimum EC agg epoch from all * local VOS. 
@@ -142,7 +142,7 @@ struct ds_cont_child { struct agg_param { void *ap_data; struct ds_cont_child *ap_cont; - daos_epoch_t ap_full_scan_hlc; + daos_epoch_t ap_full_scan_hlc; bool ap_vos_agg; }; diff --git a/src/include/daos_srv/pool.h b/src/include/daos_srv/pool.h index 861c488fa4a..b79a004ddf7 100644 --- a/src/include/daos_srv/pool.h +++ b/src/include/daos_srv/pool.h @@ -95,11 +95,8 @@ struct ds_pool { uint32_t sp_rebuild_gen; ATOMIC int sp_rebuilding; ATOMIC int sp_discarding; - /** - * someone has already messaged this pool to for rebuild scan, - * NB: all xstreams can do lockless-write on it but it's OK - */ - int sp_rebuild_scan; + /* someone has already messaged this pool to for rebuild scan */ + ATOMIC int sp_rebuild_scanning; int sp_discard_status; /** path to ephemeral metrics */ @@ -174,16 +171,7 @@ struct ds_pool_child { struct sched_request *spc_chkpt_req; /* Track checkpointing ULT*/ d_list_t spc_cont_list; - /* The current maxim rebuild epoch, (0 if there is no rebuild), so - * vos aggregation can not cross this epoch during rebuild to avoid - * interfering rebuild process. - */ - uint64_t spc_rebuild_fence; - - /* The HLC when current rebuild ends, which will be used to compare - * with the aggregation full scan start HLC to know whether the - * aggregation needs to be restarted from 0. */ - uint64_t spc_rebuild_end_hlc; + uint64_t spc_rebuild_start; uint32_t spc_map_version; int spc_ref; ABT_eventual spc_ref_eventual; @@ -215,7 +203,8 @@ struct ds_pool_svc_op_val { static inline bool ds_pool_is_rebuilding(struct ds_pool *pool) { - return (atomic_load(&pool->sp_rebuilding) > 0 || pool->sp_rebuild_scan > 0); + return (atomic_load(&pool->sp_rebuilding) > 0 || + atomic_load(&pool->sp_rebuild_scanning) > 0); } /* encode metadata RPC operation key: HLC time first, in network order, for keys sorted by time. 
diff --git a/src/object/srv_obj.c b/src/object/srv_obj.c index 1b6dbfff4b3..fa2824e2db5 100644 --- a/src/object/srv_obj.c +++ b/src/object/srv_obj.c @@ -3449,7 +3449,7 @@ obj_local_enum(struct obj_io_context *ioc, crt_rpc_t *rpc, * by setting this flag. * NB: it's a lockess write to shared data structure and it's harmless. */ - ioc->ioc_coc->sc_pool->spc_pool->sp_rebuild_scan = 1; + atomic_fetch_add(&ioc->ioc_coc->sc_pool->spc_pool->sp_rebuild_scanning, 1); flags = DTX_FOR_MIGRATION; } diff --git a/src/rebuild/rebuild_internal.h b/src/rebuild/rebuild_internal.h index 4eb7f8ef2b5..bb956824b0d 100644 --- a/src/rebuild/rebuild_internal.h +++ b/src/rebuild/rebuild_internal.h @@ -1,6 +1,6 @@ /** * (C) Copyright 2017-2024 Intel Corporation. - * (C) Copyright 2025 Hewlett Packard Enterprise Development LP + * (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -78,10 +78,12 @@ struct rebuild_tgt_pool_tracker { /* Only used by reclaim job to discard those half-rebuild data */ uint64_t rt_reclaim_epoch; - /* local rebuild epoch mainly to constrain the VOS aggregation - * to make sure aggregation will not cross the epoch + /* + * XX: remove this. + * rebuild_fini_one() compare this value against rt_rebuild_start to + * decide whether this rebuild still owns this vos pool's rebuild. 
*/ - uint64_t rt_rebuild_fence; + uint64_t rt_rebuild_start; uint32_t rt_leader_rank; @@ -339,7 +341,8 @@ void rebuild_tgt_status_check_ult(void *arg); int -rebuild_tgt_prepare(crt_rpc_t *rpc, struct rebuild_tgt_pool_tracker **p_rpt); +rebuild_tgt_prepare(struct ds_pool *pool, struct rebuild_scan_in *rsi, + struct rebuild_tgt_pool_tracker **p_rpt); bool rebuild_status_match(struct rebuild_tgt_pool_tracker *rpt, diff --git a/src/rebuild/scan.c b/src/rebuild/scan.c index bc445f0e20b..5201946567f 100644 --- a/src/rebuild/scan.c +++ b/src/rebuild/scan.c @@ -1204,6 +1204,7 @@ rebuild_tgt_scan_handler(crt_rpc_t *rpc) struct rebuild_scan_out *rso; struct rebuild_pool_tls *tls = NULL; struct rebuild_tgt_pool_tracker *rpt = NULL; + struct ds_pool *pool = NULL; int rc; rsi = crt_req_get(rpc); @@ -1214,6 +1215,13 @@ rebuild_tgt_scan_handler(crt_rpc_t *rpc) rsi->rsi_rebuild_ver, rsi->rsi_rebuild_gen, rsi->rsi_master_rank, rsi->rsi_leader_term, RB_OP_STR(rsi->rsi_rebuild_op)); + rc = ds_pool_lookup(rsi->rsi_pool_uuid, &pool); + if (rc) { + DL_ERROR(rc, DF_RB " cannot find pool", DP_RB_RSI(rsi)); + D_GOTO(out, rc); + } + atomic_fetch_add(&pool->sp_rebuilding, 1); + /* If PS leader has been changed, and rebuild version is also increased * due to adding new failure targets for rebuild, let's abort previous * rebuild. 
@@ -1321,7 +1329,7 @@ rebuild_tgt_scan_handler(crt_rpc_t *rpc) if (daos_fail_check(DAOS_REBUILD_TGT_START_FAIL)) D_GOTO(out, rc = -DER_INVAL); - rc = rebuild_tgt_prepare(rpc, &rpt); + rc = rebuild_tgt_prepare(pool, rsi, &rpt); if (rc) D_GOTO(out, rc); @@ -1333,8 +1341,6 @@ rebuild_tgt_scan_handler(crt_rpc_t *rpc) D_GOTO(out, rc); } - atomic_fetch_add(&rpt->rt_pool->sp_rebuilding, 1); /* reset in rebuild_tgt_fini */ - rpt_get(rpt); /* step-3: start scan leader */ rc = dss_ult_create(rebuild_scan_leader, rpt, DSS_XS_SELF, 0, 0, NULL); @@ -1344,14 +1350,19 @@ rebuild_tgt_scan_handler(crt_rpc_t *rpc) } out: - if (tls && tls->rebuild_pool_status == 0 && rc != 0) - tls->rebuild_pool_status = rc; - - if (rpt) { - if (rc) + if (rc != 0) { + if (tls && tls->rebuild_pool_status == 0) + tls->rebuild_pool_status = rc; + if (pool) + atomic_fetch_sub(&pool->sp_rebuilding, 1); + if (rpt) rpt_delete(rpt); - rpt_put(rpt); } + if (pool) + ds_pool_put(pool); + if (rpt) + rpt_put(rpt); + rso = crt_reply_get(rpc); rso->rso_status = rc; rso->rso_stable_epoch = d_hlc_get(); diff --git a/src/rebuild/srv.c b/src/rebuild/srv.c index fa54b812a2c..72b04deb65d 100644 --- a/src/rebuild/srv.c +++ b/src/rebuild/srv.c @@ -2305,21 +2305,15 @@ rebuild_fini_one(void *arg) if (dpc == NULL) return 0; - /* Reset rebuild epoch, then reset the aggregation epoch, so - * it can aggregate the rebuild epoch. 
- */ - D_ASSERT(rpt->rt_rebuild_fence != 0); - if (rpt->rt_rebuild_fence == dpc->spc_rebuild_fence) { - dpc->spc_rebuild_fence = 0; - dpc->spc_rebuild_end_hlc = d_hlc_get(); - D_DEBUG(DB_REBUILD, DF_UUID": Reset aggregation end hlc " - DF_U64"\n", DP_UUID(rpt->rt_pool_uuid), - dpc->spc_rebuild_end_hlc); + D_ASSERT(rpt->rt_rebuild_start != 0); + if (rpt->rt_rebuild_start == dpc->spc_rebuild_start) { + dpc->spc_rebuild_start = 0; + D_DEBUG(DB_REBUILD, DF_RB ": Reset rebuild start epoch\n", DP_RB_RPT(rpt)); } else { - D_DEBUG(DB_REBUILD, DF_UUID": pool is still being rebuilt" - " rt_rebuild_fence "DF_U64" spc_rebuild_fence " - DF_U64"\n", DP_UUID(rpt->rt_pool_uuid), - rpt->rt_rebuild_fence, dpc->spc_rebuild_fence); + D_DEBUG(DB_REBUILD, + DF_RB ": pool is still being rebuilt rt_rebuild_start " DF_U64 + " spc_rebuild_start " DF_U64 "\n", + DP_RB_RPT(rpt), rpt->rt_rebuild_start, dpc->spc_rebuild_start); } ds_pool_child_put(dpc); @@ -2337,7 +2331,9 @@ rebuild_tgt_fini(struct rebuild_tgt_pool_tracker *rpt) D_ASSERT(atomic_load(&rpt->rt_pool->sp_rebuilding) > 0); atomic_fetch_sub(&rpt->rt_pool->sp_rebuilding, 1); - rpt->rt_pool->sp_rebuild_scan = 0; + + D_ASSERT(atomic_load(&rpt->rt_pool->sp_rebuild_scanning) > 0); + atomic_store(&rpt->rt_pool->sp_rebuild_scanning, 0); ABT_mutex_lock(rpt->rt_lock); ABT_cond_signal(rpt->rt_global_dtx_wait_cond); @@ -2569,14 +2565,10 @@ rebuild_prepare_one(void *data) D_ASSERT(dss_get_module_info()->dmi_xs_id != 0); - /* Set the rebuild epoch per VOS container, so VOS aggregation will not - * cross the epoch to cause problem. 
- */ - D_ASSERT(rpt->rt_rebuild_fence != 0); - dpc->spc_rebuild_fence = rpt->rt_rebuild_fence; - D_DEBUG(DB_REBUILD, "open local container "DF_UUID"/"DF_UUID - " rebuild eph "DF_X64" "DF_RC"\n", DP_UUID(rpt->rt_pool_uuid), - DP_UUID(rpt->rt_coh_uuid), rpt->rt_rebuild_fence, DP_RC(rc)); + D_ASSERT(rpt->rt_rebuild_start != 0); + dpc->spc_rebuild_start = rpt->rt_rebuild_start; + D_DEBUG(DB_REBUILD, DF_RB " open local container " DF_UUID " rebuild eph " DF_X64 "\n", + DP_RB_RPT(rpt), DP_UUID(rpt->rt_coh_uuid), rpt->rt_rebuild_start); put: ds_pool_child_put(dpc); @@ -2638,10 +2630,9 @@ rpt_create(struct ds_pool *pool, uint32_t master_rank, uint32_t pm_ver, * each target get the scan rpc from the master. */ int -rebuild_tgt_prepare(crt_rpc_t *rpc, struct rebuild_tgt_pool_tracker **p_rpt) +rebuild_tgt_prepare(struct ds_pool *pool, struct rebuild_scan_in *rsi, + struct rebuild_tgt_pool_tracker **p_rpt) { - struct rebuild_scan_in *rsi = crt_req_get(rpc); - struct ds_pool *pool; struct rebuild_tgt_pool_tracker *rpt = NULL; struct rebuild_pool_tls *pool_tls; daos_prop_t prop = { 0 }; @@ -2652,12 +2643,6 @@ rebuild_tgt_prepare(crt_rpc_t *rpc, struct rebuild_tgt_pool_tracker **p_rpt) D_DEBUG(DB_REBUILD, "prepare rebuild for "DF_UUID"/%d\n", DP_UUID(rsi->rsi_pool_uuid), rsi->rsi_rebuild_ver); - rc = ds_pool_lookup(rsi->rsi_pool_uuid, &pool); - if (rc) { - D_ERROR("Can not find pool "DF_UUID": %d\n", DP_UUID(rsi->rsi_pool_uuid), rc); - return rc; - } - if (ds_pool_get_version(pool) < rsi->rsi_rebuild_ver) { D_INFO(DF_UUID" map %u < rsi_rebuild_ver %u\n", DP_UUID(rsi->rsi_pool_uuid), ds_pool_get_version(pool), @@ -2722,12 +2707,12 @@ rebuild_tgt_prepare(crt_rpc_t *rpc, struct rebuild_tgt_pool_tracker **p_rpt) if (pool_tls == NULL) D_GOTO(out, rc = -DER_NOMEM); - rpt->rt_rebuild_fence = d_hlc_get(); + rpt->rt_rebuild_start = d_hlc_get(); rc = ds_pool_task_collective(rpt->rt_pool_uuid, PO_COMP_ST_NEW | PO_COMP_ST_DOWN | PO_COMP_ST_DOWNOUT, rebuild_prepare_one, rpt, 0); if (rc) { - 
rpt->rt_rebuild_fence = 0; + rpt->rt_rebuild_start = 0; rebuild_pool_tls_destroy(pool_tls); D_GOTO(out, rc); } @@ -2738,15 +2723,12 @@ rebuild_tgt_prepare(crt_rpc_t *rpc, struct rebuild_tgt_pool_tracker **p_rpt) *p_rpt = rpt; out: - if (rc) { - if (rpt) { - if (!d_list_empty(&rpt->rt_list)) { - rpt_delete(rpt); - rpt_put(rpt); - } + if (rc && rpt) { + if (!d_list_empty(&rpt->rt_list)) { + rpt_delete(rpt); rpt_put(rpt); } - ds_pool_put(pool); + rpt_put(rpt); } daos_prop_fini(&prop); From 76897769a63a2768bfd483abd77ce1e070708190 Mon Sep 17 00:00:00 2001 From: Liang Zhen Date: Thu, 9 Apr 2026 23:53:25 +0800 Subject: [PATCH 2/5] DAOS: resolve cherry-pick conflict Signed-off-by: Liang Zhen --- src/object/srv_obj.c | 5 ++--- src/rebuild/scan.c | 2 +- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/src/object/srv_obj.c b/src/object/srv_obj.c index fa2824e2db5..0c991ce542a 100644 --- a/src/object/srv_obj.c +++ b/src/object/srv_obj.c @@ -2493,9 +2493,8 @@ obj_inflight_io_check(struct ds_cont_child *child, uint32_t opc, * which otherwise might be written duplicately, which might cause * the failure in VOS. 
*/ - if ((flags & ORF_REBUILDING_IO) && - (!child->sc_pool->spc_pool->sp_disable_rebuild && - child->sc_pool->spc_rebuild_fence == 0)) { + if ((flags & ORF_REBUILDING_IO) && (!child->sc_pool->spc_pool->sp_disable_rebuild && + child->sc_pool->spc_rebuild_start == 0)) { D_ERROR("rebuilding "DF_UUID" retry.\n", DP_UUID(child->sc_pool->spc_uuid)); return -DER_UPDATE_AGAIN; } diff --git a/src/rebuild/scan.c b/src/rebuild/scan.c index 5201946567f..2035c0d182a 100644 --- a/src/rebuild/scan.c +++ b/src/rebuild/scan.c @@ -1217,7 +1217,7 @@ rebuild_tgt_scan_handler(crt_rpc_t *rpc) rc = ds_pool_lookup(rsi->rsi_pool_uuid, &pool); if (rc) { - DL_ERROR(rc, DF_RB " cannot find pool", DP_RB_RSI(rsi)); + D_ERROR("Can not find pool " DF_UUID ": %d\n", DP_UUID(rsi->rsi_pool_uuid), rc); D_GOTO(out, rc); } atomic_fetch_add(&pool->sp_rebuilding, 1); From 20bb86c03000d7c0e10a6e8b266c4c4d648cc9ff Mon Sep 17 00:00:00 2001 From: Liang Zhen Date: Fri, 10 Apr 2026 20:49:53 +0800 Subject: [PATCH 3/5] DAOS: remove unused epoch Signed-off-by: Liang Zhen --- src/container/srv_target.c | 5 +---- src/include/daos_srv/pool.h | 1 - src/object/srv_obj.c | 13 ------------- src/rebuild/rebuild_internal.h | 9 +-------- src/rebuild/srv.c | 27 +++++++-------------------- 5 files changed, 9 insertions(+), 46 deletions(-) diff --git a/src/container/srv_target.c b/src/container/srv_target.c index 50b9a323573..75e3eba7e01 100644 --- a/src/container/srv_target.c +++ b/src/container/srv_target.c @@ -366,10 +366,7 @@ cont_child_aggregate(struct ds_cont_child *cont, cont_aggregate_cb_t agg_cb, for (i = 0; i < snapshots_nr && snapshots[i] < epoch_min; ++i) ; - if (i == 0) - epoch_range.epr_lo = 0; - else - epoch_range.epr_lo = snapshots[i - 1] + 1; + epoch_range.epr_lo = epoch_min != 0 ? 
epoch_min + 1 : 0; if (epoch_range.epr_lo >= epoch_max) D_GOTO(free, rc = 0); diff --git a/src/include/daos_srv/pool.h b/src/include/daos_srv/pool.h index b79a004ddf7..3d96771cea0 100644 --- a/src/include/daos_srv/pool.h +++ b/src/include/daos_srv/pool.h @@ -171,7 +171,6 @@ struct ds_pool_child { struct sched_request *spc_chkpt_req; /* Track checkpointing ULT*/ d_list_t spc_cont_list; - uint64_t spc_rebuild_start; uint32_t spc_map_version; int spc_ref; ABT_eventual spc_ref_eventual; diff --git a/src/object/srv_obj.c b/src/object/srv_obj.c index 0c991ce542a..cc08820a4d7 100644 --- a/src/object/srv_obj.c +++ b/src/object/srv_obj.c @@ -2486,19 +2486,6 @@ obj_inflight_io_check(struct ds_cont_child *child, uint32_t opc, D_ERROR("reintegrating " DF_UUID " retry.\n", DP_UUID(pool->sp_uuid)); return -DER_UPDATE_AGAIN; } - - /* All I/O during rebuilding, needs to wait for the rebuild fence to - * be generated (see rebuild_prepare_one()), which will create a boundary - * for rebuild, so the data after boundary(epoch) should not be rebuilt, - * which otherwise might be written duplicately, which might cause - * the failure in VOS. - */ - if ((flags & ORF_REBUILDING_IO) && (!child->sc_pool->spc_pool->sp_disable_rebuild && - child->sc_pool->spc_rebuild_start == 0)) { - D_ERROR("rebuilding "DF_UUID" retry.\n", DP_UUID(child->sc_pool->spc_uuid)); - return -DER_UPDATE_AGAIN; - } - return 0; } diff --git a/src/rebuild/rebuild_internal.h b/src/rebuild/rebuild_internal.h index bb956824b0d..70705c6f011 100644 --- a/src/rebuild/rebuild_internal.h +++ b/src/rebuild/rebuild_internal.h @@ -77,14 +77,7 @@ struct rebuild_tgt_pool_tracker { uint64_t rt_stable_epoch; /* Only used by reclaim job to discard those half-rebuild data */ - uint64_t rt_reclaim_epoch; - /* - * XX: remove this. - * rebuild_fini_one() compare this value against rt_rebuild_start to - * decide whether this rebuild still owns this vos pool's rebuild. 
- */ - uint64_t rt_rebuild_start; - + uint64_t rt_reclaim_epoch; uint32_t rt_leader_rank; /* Global dtx resync version */ diff --git a/src/rebuild/srv.c b/src/rebuild/srv.c index 72b04deb65d..9216aea8480 100644 --- a/src/rebuild/srv.c +++ b/src/rebuild/srv.c @@ -2305,17 +2305,8 @@ rebuild_fini_one(void *arg) if (dpc == NULL) return 0; - D_ASSERT(rpt->rt_rebuild_start != 0); - if (rpt->rt_rebuild_start == dpc->spc_rebuild_start) { - dpc->spc_rebuild_start = 0; - D_DEBUG(DB_REBUILD, DF_RB ": Reset rebuild start epoch\n", DP_RB_RPT(rpt)); - } else { - D_DEBUG(DB_REBUILD, - DF_RB ": pool is still being rebuilt rt_rebuild_start " DF_U64 - " spc_rebuild_start " DF_U64 "\n", - DP_RB_RPT(rpt), rpt->rt_rebuild_start, dpc->spc_rebuild_start); - } - + D_DEBUG(DB_REBUILD, DF_RB ": rebuild fini for stable epoch " DF_U64 "\n", DP_RB_RPT(rpt), + rpt->rt_stable_epoch); ds_pool_child_put(dpc); return 0; } @@ -2565,10 +2556,8 @@ rebuild_prepare_one(void *data) D_ASSERT(dss_get_module_info()->dmi_xs_id != 0); - D_ASSERT(rpt->rt_rebuild_start != 0); - dpc->spc_rebuild_start = rpt->rt_rebuild_start; - D_DEBUG(DB_REBUILD, DF_RB " open local container " DF_UUID " rebuild eph " DF_X64 "\n", - DP_RB_RPT(rpt), DP_UUID(rpt->rt_coh_uuid), rpt->rt_rebuild_start); + D_DEBUG(DB_REBUILD, DF_RB " open local container " DF_UUID " stable eph " DF_X64 "\n", + DP_RB_RPT(rpt), DP_UUID(rpt->rt_coh_uuid), rpt->rt_stable_epoch); put: ds_pool_child_put(dpc); @@ -2707,12 +2696,10 @@ rebuild_tgt_prepare(struct ds_pool *pool, struct rebuild_scan_in *rsi, if (pool_tls == NULL) D_GOTO(out, rc = -DER_NOMEM); - rpt->rt_rebuild_start = d_hlc_get(); - rc = ds_pool_task_collective(rpt->rt_pool_uuid, - PO_COMP_ST_NEW | PO_COMP_ST_DOWN | PO_COMP_ST_DOWNOUT, - rebuild_prepare_one, rpt, 0); + rc = ds_pool_task_collective(rpt->rt_pool_uuid, + PO_COMP_ST_NEW | PO_COMP_ST_DOWN | PO_COMP_ST_DOWNOUT, + rebuild_prepare_one, rpt, 0); if (rc) { - rpt->rt_rebuild_start = 0; rebuild_pool_tls_destroy(pool_tls); D_GOTO(out, rc); } 
From 8b1581ec89138924270a70eaa4f32c9582545f15 Mon Sep 17 00:00:00 2001 From: Liang Zhen Date: Sat, 11 Apr 2026 00:05:12 +0800 Subject: [PATCH 4/5] DAOS: fix the ds_pool refcount Signed-off-by: Liang Zhen --- src/object/srv_obj.c | 2 +- src/rebuild/scan.c | 4 ++-- src/rebuild/srv.c | 1 + 3 files changed, 4 insertions(+), 3 deletions(-) diff --git a/src/object/srv_obj.c b/src/object/srv_obj.c index cc08820a4d7..ecb817dbaa5 100644 --- a/src/object/srv_obj.c +++ b/src/object/srv_obj.c @@ -3435,7 +3435,7 @@ obj_local_enum(struct obj_io_context *ioc, crt_rpc_t *rpc, * by setting this flag. * NB: it's a lockess write to shared data structure and it's harmless. */ - atomic_fetch_add(&ioc->ioc_coc->sc_pool->spc_pool->sp_rebuild_scanning, 1); + atomic_store(&ioc->ioc_coc->sc_pool->spc_pool->sp_rebuild_scanning, 1); flags = DTX_FOR_MIGRATION; } diff --git a/src/rebuild/scan.c b/src/rebuild/scan.c index 2035c0d182a..198a31b5756 100644 --- a/src/rebuild/scan.c +++ b/src/rebuild/scan.c @@ -1353,10 +1353,10 @@ rebuild_tgt_scan_handler(crt_rpc_t *rpc) if (rc != 0) { if (tls && tls->rebuild_pool_status == 0) tls->rebuild_pool_status = rc; - if (pool) - atomic_fetch_sub(&pool->sp_rebuilding, 1); if (rpt) rpt_delete(rpt); + else if (pool) /* otherwise rpt_put() will decrease this for me */ + atomic_fetch_sub(&pool->sp_rebuilding, 1); } if (pool) ds_pool_put(pool); diff --git a/src/rebuild/srv.c b/src/rebuild/srv.c index 9216aea8480..25267862399 100644 --- a/src/rebuild/srv.c +++ b/src/rebuild/srv.c @@ -2705,6 +2705,7 @@ rebuild_tgt_prepare(struct ds_pool *pool, struct rebuild_scan_in *rsi, } ABT_mutex_lock(rpt->rt_lock); + ds_pool_get(pool); rpt->rt_pool = pool; /* pin it */ ABT_mutex_unlock(rpt->rt_lock); From f64e95b56e4fd14b7f22d5d0bda959c69a6a04ee Mon Sep 17 00:00:00 2001 From: Liang Zhen Date: Sat, 11 Apr 2026 11:42:42 +0800 Subject: [PATCH 5/5] DAOS: remove false assertion Signed-off-by: Liang Zhen --- src/container/srv_target.c | 2 +- src/include/daos_srv/pool.h | 9 ++++----- 
src/object/srv_obj.c | 3 +-- src/rebuild/srv.c | 3 +-- 4 files changed, 7 insertions(+), 10 deletions(-) diff --git a/src/container/srv_target.c b/src/container/srv_target.c index 75e3eba7e01..56be5221fa3 100644 --- a/src/container/srv_target.c +++ b/src/container/srv_target.c @@ -184,7 +184,7 @@ cont_aggregate_runnable(struct ds_cont_child *cont, struct sched_request *req, if (ds_pool_is_rebuilding(pool) && !vos_agg) { D_DEBUG(DB_EPC, DF_CONT ": skip EC aggregation during rebuild %d, %d.\n", DP_CONT(cont->sc_pool->spc_uuid, cont->sc_uuid), - atomic_load(&pool->sp_rebuilding), atomic_load(&pool->sp_rebuild_scanning)); + atomic_load(&pool->sp_rebuilding), atomic_load(&pool->sp_rebuild_enum)); return false; } diff --git a/src/include/daos_srv/pool.h b/src/include/daos_srv/pool.h index 3d96771cea0..f46abec202c 100644 --- a/src/include/daos_srv/pool.h +++ b/src/include/daos_srv/pool.h @@ -93,10 +93,10 @@ struct ds_pool { * rebuild job. */ uint32_t sp_rebuild_gen; - ATOMIC int sp_rebuilding; ATOMIC int sp_discarding; - /* someone has already messaged this pool to for rebuild scan */ - ATOMIC int sp_rebuild_scanning; + ATOMIC int sp_rebuilding; + /* someone has already messaged this pool for rebuild object/key enumeration */ + ATOMIC int sp_rebuild_enum; int sp_discard_status; /** path to ephemeral metrics */ @@ -202,8 +202,7 @@ struct ds_pool_svc_op_val { static inline bool ds_pool_is_rebuilding(struct ds_pool *pool) { - return (atomic_load(&pool->sp_rebuilding) > 0 || - atomic_load(&pool->sp_rebuild_scanning) > 0); + return (atomic_load(&pool->sp_rebuilding) > 0 || atomic_load(&pool->sp_rebuild_enum) > 0); } /* encode metadata RPC operation key: HLC time first, in network order, for keys sorted by time. 
diff --git a/src/object/srv_obj.c b/src/object/srv_obj.c index ecb817dbaa5..a706e2dc294 100644 --- a/src/object/srv_obj.c +++ b/src/object/srv_obj.c @@ -3433,9 +3433,8 @@ obj_local_enum(struct obj_io_context *ioc, crt_rpc_t *rpc, if (oei->oei_flags & ORF_FOR_MIGRATION) { /* just in case ds_pool::sp_rebuilding is not set, pause my local EC aggregation * by setting this flag. - * NB: it's a lockess write to shared data structure and it's harmless. */ - atomic_store(&ioc->ioc_coc->sc_pool->spc_pool->sp_rebuild_scanning, 1); + atomic_store(&ioc->ioc_coc->sc_pool->spc_pool->sp_rebuild_enum, 1); flags = DTX_FOR_MIGRATION; } diff --git a/src/rebuild/srv.c b/src/rebuild/srv.c index 25267862399..a6f3c37f378 100644 --- a/src/rebuild/srv.c +++ b/src/rebuild/srv.c @@ -2323,8 +2323,7 @@ rebuild_tgt_fini(struct rebuild_tgt_pool_tracker *rpt) D_ASSERT(atomic_load(&rpt->rt_pool->sp_rebuilding) > 0); atomic_fetch_sub(&rpt->rt_pool->sp_rebuilding, 1); - D_ASSERT(atomic_load(&rpt->rt_pool->sp_rebuild_scanning) > 0); - atomic_store(&rpt->rt_pool->sp_rebuild_scanning, 0); + atomic_store(&rpt->rt_pool->sp_rebuild_enum, 0); ABT_mutex_lock(rpt->rt_lock); ABT_cond_signal(rpt->rt_global_dtx_wait_cond);