diff --git a/src/container/srv_target.c b/src/container/srv_target.c index e172d29c4b1..56be5221fa3 100644 --- a/src/container/srv_target.c +++ b/src/container/srv_target.c @@ -1,6 +1,6 @@ /** * (C) Copyright 2016-2024 Intel Corporation. - * (C) Copyright 2025 Hewlett Packard Enterprise Development LP + * (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -184,7 +184,7 @@ cont_aggregate_runnable(struct ds_cont_child *cont, struct sched_request *req, if (ds_pool_is_rebuilding(pool) && !vos_agg) { D_DEBUG(DB_EPC, DF_CONT ": skip EC aggregation during rebuild %d, %d.\n", DP_CONT(cont->sc_pool->spc_uuid, cont->sc_uuid), - atomic_load(&pool->sp_rebuilding), pool->sp_rebuild_scan); + atomic_load(&pool->sp_rebuilding), atomic_load(&pool->sp_rebuild_enum)); return false; } @@ -293,8 +293,7 @@ cont_child_aggregate(struct ds_cont_child *cont, cont_aggregate_cb_t agg_cb, daos_epoch_t epoch_max, epoch_min; daos_epoch_range_t epoch_range; struct sched_request *req = cont2req(cont, param->ap_vos_agg); - uint64_t hlc = d_hlc_get(); - uint64_t change_hlc; + uint64_t hlc = d_hlc_get(); uint64_t interval; uint64_t snapshots_local[MAX_SNAPSHOT_LOCAL] = { 0 }; uint64_t *snapshots = NULL; @@ -303,16 +302,14 @@ cont_child_aggregate(struct ds_cont_child *cont, cont_aggregate_cb_t agg_cb, uint32_t flags = 0; int i, rc = 0; - change_hlc = max(cont->sc_snapshot_delete_hlc, - cont->sc_pool->spc_rebuild_end_hlc); - if (param->ap_full_scan_hlc < change_hlc) { - /* Snapshot has been deleted or rebuild happens since the last + if (param->ap_full_scan_hlc < cont->sc_snapshot_delete_hlc) { + /* Snapshot has been deleted since the last * aggregation, let's restart from 0. 
*/ epoch_min = 0; flags |= VOS_AGG_FL_FORCE_SCAN; - D_DEBUG(DB_EPC, "change hlc "DF_X64" > full "DF_X64"\n", - change_hlc, param->ap_full_scan_hlc); + D_DEBUG(DB_EPC, "snapshot del hlc " DF_X64 " > full " DF_X64 "\n", + cont->sc_snapshot_delete_hlc, param->ap_full_scan_hlc); } else { epoch_min = get_hae(cont, param->ap_vos_agg); } @@ -352,50 +349,24 @@ cont_child_aggregate(struct ds_cont_child *cont, cont_aggregate_cb_t agg_cb, D_DEBUG(DB_EPC, "hlc "DF_X64" epoch "DF_X64"/"DF_X64" agg max "DF_X64"\n", hlc, epoch_max, epoch_min, cont->sc_aggregation_max); - if (cont->sc_snapshots_nr + 1 < MAX_SNAPSHOT_LOCAL) { + snapshots_nr = cont->sc_snapshots_nr; + if (snapshots_nr < MAX_SNAPSHOT_LOCAL) { snapshots = snapshots_local; } else { - D_ALLOC(snapshots, (cont->sc_snapshots_nr + 1) * - sizeof(daos_epoch_t)); + D_ALLOC(snapshots, snapshots_nr * sizeof(daos_epoch_t)); if (snapshots == NULL) return -DER_NOMEM; } - if (cont->sc_pool->spc_rebuild_fence != 0) { - uint64_t rebuild_fence = cont->sc_pool->spc_rebuild_fence; - int j; - int insert_idx; - - /* insert rebuild_fetch into the snapshot list */ - D_DEBUG(DB_EPC, "rebuild fence "DF_X64"\n", rebuild_fence); - for (j = 0, insert_idx = 0; j < cont->sc_snapshots_nr; j++) { - if (cont->sc_snapshots[j] < rebuild_fence) { - snapshots[j] = cont->sc_snapshots[j]; - insert_idx++; - } else { - snapshots[j + 1] = cont->sc_snapshots[j]; - } - } - snapshots[insert_idx] = rebuild_fence; - snapshots_nr = cont->sc_snapshots_nr + 1; - } else { - /* Since sc_snapshots might be freed by other ULT, let's - * always copy here. - */ - snapshots_nr = cont->sc_snapshots_nr; - if (snapshots_nr > 0) - memcpy(snapshots, cont->sc_snapshots, - snapshots_nr * sizeof(daos_epoch_t)); - } + /* Since sc_snapshots might be freed by other ULT, let's always copy here. */ + if (snapshots_nr > 0) + memcpy(snapshots, cont->sc_snapshots, snapshots_nr * sizeof(daos_epoch_t)); /* Find highest snapshot less than last aggregated epoch. 
*/ for (i = 0; i < snapshots_nr && snapshots[i] < epoch_min; ++i) ; - if (i == 0) - epoch_range.epr_lo = 0; - else - epoch_range.epr_lo = snapshots[i - 1] + 1; + epoch_range.epr_lo = epoch_min != 0 ? epoch_min + 1 : 0; if (epoch_range.epr_lo >= epoch_max) D_GOTO(free, rc = 0); diff --git a/src/include/daos_srv/container.h b/src/include/daos_srv/container.h index 5cb2d466027..a731cf7e101 100644 --- a/src/include/daos_srv/container.h +++ b/src/include/daos_srv/container.h @@ -1,6 +1,6 @@ /* * (C) Copyright 2015-2024 Intel Corporation. - * (C) Copyright 2025 Hewlett Packard Enterprise Development LP + * (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -112,7 +112,7 @@ struct ds_cont_child { * VOS aggregation will use this boundary. We will optimize it later. */ uint64_t sc_ec_agg_eph_boundary; - /* The current EC aggregate epoch for this xstream */ + /* The local EC aggregation epoch for this xstream */ uint64_t sc_ec_agg_eph; /* Used by cont_ec_eph_query_ult to query the minimum EC agg epoch from all * local VOS. @@ -142,7 +142,7 @@ struct ds_cont_child { struct agg_param { void *ap_data; struct ds_cont_child *ap_cont; - daos_epoch_t ap_full_scan_hlc; + daos_epoch_t ap_full_scan_hlc; bool ap_vos_agg; }; diff --git a/src/include/daos_srv/pool.h b/src/include/daos_srv/pool.h index 861c488fa4a..f46abec202c 100644 --- a/src/include/daos_srv/pool.h +++ b/src/include/daos_srv/pool.h @@ -93,13 +93,10 @@ struct ds_pool { * rebuild job. 
*/ uint32_t sp_rebuild_gen; - ATOMIC int sp_rebuilding; ATOMIC int sp_discarding; - /** - * someone has already messaged this pool to for rebuild scan, - * NB: all xstreams can do lockless-write on it but it's OK - */ - int sp_rebuild_scan; + ATOMIC int sp_rebuilding; + /* someone has already messaged this pool for rebuild object/key enumeration */ + ATOMIC int sp_rebuild_enum; int sp_discard_status; /** path to ephemeral metrics */ @@ -174,16 +171,6 @@ struct ds_pool_child { struct sched_request *spc_chkpt_req; /* Track checkpointing ULT*/ d_list_t spc_cont_list; - /* The current maxim rebuild epoch, (0 if there is no rebuild), so - * vos aggregation can not cross this epoch during rebuild to avoid - * interfering rebuild process. - */ - uint64_t spc_rebuild_fence; - - /* The HLC when current rebuild ends, which will be used to compare - * with the aggregation full scan start HLC to know whether the - * aggregation needs to be restarted from 0. */ - uint64_t spc_rebuild_end_hlc; uint32_t spc_map_version; int spc_ref; ABT_eventual spc_ref_eventual; @@ -215,7 +202,7 @@ struct ds_pool_svc_op_val { static inline bool ds_pool_is_rebuilding(struct ds_pool *pool) { - return (atomic_load(&pool->sp_rebuilding) > 0 || pool->sp_rebuild_scan > 0); + return (atomic_load(&pool->sp_rebuilding) > 0 || atomic_load(&pool->sp_rebuild_enum) > 0); } /* encode metadata RPC operation key: HLC time first, in network order, for keys sorted by time. 
diff --git a/src/object/srv_obj.c b/src/object/srv_obj.c index 1b6dbfff4b3..a706e2dc294 100644 --- a/src/object/srv_obj.c +++ b/src/object/srv_obj.c @@ -2486,20 +2486,6 @@ obj_inflight_io_check(struct ds_cont_child *child, uint32_t opc, D_ERROR("reintegrating " DF_UUID " retry.\n", DP_UUID(pool->sp_uuid)); return -DER_UPDATE_AGAIN; } - - /* All I/O during rebuilding, needs to wait for the rebuild fence to - * be generated (see rebuild_prepare_one()), which will create a boundary - * for rebuild, so the data after boundary(epoch) should not be rebuilt, - * which otherwise might be written duplicately, which might cause - * the failure in VOS. - */ - if ((flags & ORF_REBUILDING_IO) && - (!child->sc_pool->spc_pool->sp_disable_rebuild && - child->sc_pool->spc_rebuild_fence == 0)) { - D_ERROR("rebuilding "DF_UUID" retry.\n", DP_UUID(child->sc_pool->spc_uuid)); - return -DER_UPDATE_AGAIN; - } - return 0; } @@ -3447,9 +3433,8 @@ obj_local_enum(struct obj_io_context *ioc, crt_rpc_t *rpc, if (oei->oei_flags & ORF_FOR_MIGRATION) { /* just in case ds_pool::sp_rebuilding is not set, pause my local EC aggregation * by setting this flag. - * NB: it's a lockess write to shared data structure and it's harmless. */ - ioc->ioc_coc->sc_pool->spc_pool->sp_rebuild_scan = 1; + atomic_store(&ioc->ioc_coc->sc_pool->spc_pool->sp_rebuild_enum, 1); flags = DTX_FOR_MIGRATION; } diff --git a/src/rebuild/rebuild_internal.h b/src/rebuild/rebuild_internal.h index 4eb7f8ef2b5..70705c6f011 100644 --- a/src/rebuild/rebuild_internal.h +++ b/src/rebuild/rebuild_internal.h @@ -1,6 +1,6 @@ /** * (C) Copyright 2017-2024 Intel Corporation. 
- * (C) Copyright 2025 Hewlett Packard Enterprise Development LP + * (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -77,12 +77,7 @@ struct rebuild_tgt_pool_tracker { uint64_t rt_stable_epoch; /* Only used by reclaim job to discard those half-rebuild data */ - uint64_t rt_reclaim_epoch; - /* local rebuild epoch mainly to constrain the VOS aggregation - * to make sure aggregation will not cross the epoch - */ - uint64_t rt_rebuild_fence; - + uint64_t rt_reclaim_epoch; uint32_t rt_leader_rank; /* Global dtx resync version */ @@ -339,7 +334,8 @@ void rebuild_tgt_status_check_ult(void *arg); int -rebuild_tgt_prepare(crt_rpc_t *rpc, struct rebuild_tgt_pool_tracker **p_rpt); +rebuild_tgt_prepare(struct ds_pool *pool, struct rebuild_scan_in *rsi, + struct rebuild_tgt_pool_tracker **p_rpt); bool rebuild_status_match(struct rebuild_tgt_pool_tracker *rpt, diff --git a/src/rebuild/scan.c b/src/rebuild/scan.c index bc445f0e20b..198a31b5756 100644 --- a/src/rebuild/scan.c +++ b/src/rebuild/scan.c @@ -1204,6 +1204,7 @@ rebuild_tgt_scan_handler(crt_rpc_t *rpc) struct rebuild_scan_out *rso; struct rebuild_pool_tls *tls = NULL; struct rebuild_tgt_pool_tracker *rpt = NULL; + struct ds_pool *pool = NULL; int rc; rsi = crt_req_get(rpc); @@ -1214,6 +1215,13 @@ rebuild_tgt_scan_handler(crt_rpc_t *rpc) rsi->rsi_rebuild_ver, rsi->rsi_rebuild_gen, rsi->rsi_master_rank, rsi->rsi_leader_term, RB_OP_STR(rsi->rsi_rebuild_op)); + rc = ds_pool_lookup(rsi->rsi_pool_uuid, &pool); + if (rc) { + D_ERROR("Can not find pool " DF_UUID ": %d\n", DP_UUID(rsi->rsi_pool_uuid), rc); + D_GOTO(out, rc); + } + atomic_fetch_add(&pool->sp_rebuilding, 1); + /* If PS leader has been changed, and rebuild version is also increased * due to adding new failure targets for rebuild, let's abort previous * rebuild. 
@@ -1321,7 +1329,7 @@ rebuild_tgt_scan_handler(crt_rpc_t *rpc) if (daos_fail_check(DAOS_REBUILD_TGT_START_FAIL)) D_GOTO(out, rc = -DER_INVAL); - rc = rebuild_tgt_prepare(rpc, &rpt); + rc = rebuild_tgt_prepare(pool, rsi, &rpt); if (rc) D_GOTO(out, rc); @@ -1333,8 +1341,6 @@ rebuild_tgt_scan_handler(crt_rpc_t *rpc) D_GOTO(out, rc); } - atomic_fetch_add(&rpt->rt_pool->sp_rebuilding, 1); /* reset in rebuild_tgt_fini */ - rpt_get(rpt); /* step-3: start scan leader */ rc = dss_ult_create(rebuild_scan_leader, rpt, DSS_XS_SELF, 0, 0, NULL); @@ -1344,14 +1350,19 @@ rebuild_tgt_scan_handler(crt_rpc_t *rpc) } out: - if (tls && tls->rebuild_pool_status == 0 && rc != 0) - tls->rebuild_pool_status = rc; - - if (rpt) { - if (rc) + if (rc != 0) { + if (tls && tls->rebuild_pool_status == 0) + tls->rebuild_pool_status = rc; + if (rpt) rpt_delete(rpt); - rpt_put(rpt); + else if (pool) /* otherwise rpt_put() will decrease this for me */ + atomic_fetch_sub(&pool->sp_rebuilding, 1); } + if (pool) + ds_pool_put(pool); + if (rpt) + rpt_put(rpt); + rso = crt_reply_get(rpc); rso->rso_status = rc; rso->rso_stable_epoch = d_hlc_get(); diff --git a/src/rebuild/srv.c b/src/rebuild/srv.c index fa54b812a2c..a6f3c37f378 100644 --- a/src/rebuild/srv.c +++ b/src/rebuild/srv.c @@ -2305,23 +2305,8 @@ rebuild_fini_one(void *arg) if (dpc == NULL) return 0; - /* Reset rebuild epoch, then reset the aggregation epoch, so - * it can aggregate the rebuild epoch. 
- */ - D_ASSERT(rpt->rt_rebuild_fence != 0); - if (rpt->rt_rebuild_fence == dpc->spc_rebuild_fence) { - dpc->spc_rebuild_fence = 0; - dpc->spc_rebuild_end_hlc = d_hlc_get(); - D_DEBUG(DB_REBUILD, DF_UUID": Reset aggregation end hlc " - DF_U64"\n", DP_UUID(rpt->rt_pool_uuid), - dpc->spc_rebuild_end_hlc); - } else { - D_DEBUG(DB_REBUILD, DF_UUID": pool is still being rebuilt" - " rt_rebuild_fence "DF_U64" spc_rebuild_fence " - DF_U64"\n", DP_UUID(rpt->rt_pool_uuid), - rpt->rt_rebuild_fence, dpc->spc_rebuild_fence); - } - + D_DEBUG(DB_REBUILD, DF_RB ": rebuild fini for stable epoch " DF_U64 "\n", DP_RB_RPT(rpt), + rpt->rt_stable_epoch); ds_pool_child_put(dpc); return 0; } @@ -2337,7 +2322,8 @@ rebuild_tgt_fini(struct rebuild_tgt_pool_tracker *rpt) D_ASSERT(atomic_load(&rpt->rt_pool->sp_rebuilding) > 0); atomic_fetch_sub(&rpt->rt_pool->sp_rebuilding, 1); - rpt->rt_pool->sp_rebuild_scan = 0; + + atomic_store(&rpt->rt_pool->sp_rebuild_enum, 0); ABT_mutex_lock(rpt->rt_lock); ABT_cond_signal(rpt->rt_global_dtx_wait_cond); @@ -2569,14 +2555,8 @@ rebuild_prepare_one(void *data) D_ASSERT(dss_get_module_info()->dmi_xs_id != 0); - /* Set the rebuild epoch per VOS container, so VOS aggregation will not - * cross the epoch to cause problem. - */ - D_ASSERT(rpt->rt_rebuild_fence != 0); - dpc->spc_rebuild_fence = rpt->rt_rebuild_fence; - D_DEBUG(DB_REBUILD, "open local container "DF_UUID"/"DF_UUID - " rebuild eph "DF_X64" "DF_RC"\n", DP_UUID(rpt->rt_pool_uuid), - DP_UUID(rpt->rt_coh_uuid), rpt->rt_rebuild_fence, DP_RC(rc)); + D_DEBUG(DB_REBUILD, DF_RB " open local container " DF_UUID " stable eph " DF_X64 "\n", + DP_RB_RPT(rpt), DP_UUID(rpt->rt_coh_uuid), rpt->rt_stable_epoch); put: ds_pool_child_put(dpc); @@ -2638,10 +2618,9 @@ rpt_create(struct ds_pool *pool, uint32_t master_rank, uint32_t pm_ver, * each target get the scan rpc from the master. 
*/ int -rebuild_tgt_prepare(crt_rpc_t *rpc, struct rebuild_tgt_pool_tracker **p_rpt) +rebuild_tgt_prepare(struct ds_pool *pool, struct rebuild_scan_in *rsi, + struct rebuild_tgt_pool_tracker **p_rpt) { - struct rebuild_scan_in *rsi = crt_req_get(rpc); - struct ds_pool *pool; struct rebuild_tgt_pool_tracker *rpt = NULL; struct rebuild_pool_tls *pool_tls; daos_prop_t prop = { 0 }; @@ -2652,12 +2631,6 @@ rebuild_tgt_prepare(crt_rpc_t *rpc, struct rebuild_tgt_pool_tracker **p_rpt) D_DEBUG(DB_REBUILD, "prepare rebuild for "DF_UUID"/%d\n", DP_UUID(rsi->rsi_pool_uuid), rsi->rsi_rebuild_ver); - rc = ds_pool_lookup(rsi->rsi_pool_uuid, &pool); - if (rc) { - D_ERROR("Can not find pool "DF_UUID": %d\n", DP_UUID(rsi->rsi_pool_uuid), rc); - return rc; - } - if (ds_pool_get_version(pool) < rsi->rsi_rebuild_ver) { D_INFO(DF_UUID" map %u < rsi_rebuild_ver %u\n", DP_UUID(rsi->rsi_pool_uuid), ds_pool_get_version(pool), @@ -2722,31 +2695,27 @@ rebuild_tgt_prepare(crt_rpc_t *rpc, struct rebuild_tgt_pool_tracker **p_rpt) if (pool_tls == NULL) D_GOTO(out, rc = -DER_NOMEM); - rpt->rt_rebuild_fence = d_hlc_get(); - rc = ds_pool_task_collective(rpt->rt_pool_uuid, - PO_COMP_ST_NEW | PO_COMP_ST_DOWN | PO_COMP_ST_DOWNOUT, - rebuild_prepare_one, rpt, 0); + rc = ds_pool_task_collective(rpt->rt_pool_uuid, + PO_COMP_ST_NEW | PO_COMP_ST_DOWN | PO_COMP_ST_DOWNOUT, + rebuild_prepare_one, rpt, 0); if (rc) { - rpt->rt_rebuild_fence = 0; rebuild_pool_tls_destroy(pool_tls); D_GOTO(out, rc); } ABT_mutex_lock(rpt->rt_lock); + ds_pool_get(pool); rpt->rt_pool = pool; /* pin it */ ABT_mutex_unlock(rpt->rt_lock); *p_rpt = rpt; out: - if (rc) { - if (rpt) { - if (!d_list_empty(&rpt->rt_list)) { - rpt_delete(rpt); - rpt_put(rpt); - } + if (rc && rpt) { + if (!d_list_empty(&rpt->rt_list)) { + rpt_delete(rpt); rpt_put(rpt); } - ds_pool_put(pool); + rpt_put(rpt); } daos_prop_fini(&prop);