|
36 | 36 | import org.apache.pinot.common.auth.AuthProviderUtils; |
37 | 37 | import org.apache.pinot.common.auth.NullAuthProvider; |
38 | 38 | import org.apache.pinot.common.restlet.resources.ValidDocIdsBitmapResponse; |
39 | | -import org.apache.pinot.common.restlet.resources.ValidDocIdsMetadataInfo; |
40 | 39 | import org.apache.pinot.common.restlet.resources.ValidDocIdsType; |
41 | 40 | import org.apache.pinot.common.utils.RoaringBitmapUtils; |
42 | 41 | import org.apache.pinot.common.utils.ServiceStatus; |
|
67 | 66 | public class MinionTaskUtils { |
68 | 67 | private static final Logger LOGGER = LoggerFactory.getLogger(MinionTaskUtils.class); |
69 | 68 |
|
| 69 | + /** Package-private for testing: parses validDocIdsComparisonMode config string. */ |
| 70 | + static MinionConstants.ValidDocIdsConsensusMode parseValidDocIdsConsensusMode(String value) { |
| 71 | + if (value == null || value.isBlank()) { |
| 72 | + return MinionConstants.ValidDocIdsConsensusMode.EQUAL; |
| 73 | + } |
| 74 | + return MinionConstants.ValidDocIdsConsensusMode.valueOf(value.toUpperCase().trim()); |
| 75 | + } |
| 76 | + |
70 | 77 | private static final String DEFAULT_DIR_PATH_TERMINATOR = "/"; |
71 | 78 |
|
72 | 79 | public static final String DATETIME_PATTERN = "yyyy-MM-dd'T'HH:mm:ss.SSS'Z'"; |
@@ -281,66 +288,114 @@ public static boolean extractMinionAllowDownloadFromServer(TableConfig tableConf |
281 | 288 | } |
282 | 289 |
|
283 | 290 | /** |
284 | | - * Returns the validDocID bitmap from the server whose local segment crc matches both crc of ZK metadata and |
285 | | - * deepstore copy (expectedCrc). |
| 291 | + * Returns the validDocIds bitmap from server(s). {@code comparisonMode} is the task config value: UNSAFE, |
| 292 | + * EQUAL (default), or MOST_VALID_DOCS. |
286 | 293 | */ |
287 | 294 | @Nullable |
288 | 295 | public static RoaringBitmap getValidDocIdFromServerMatchingCrc(String tableNameWithType, String segmentName, |
289 | | - String validDocIdsType, MinionContext minionContext, String expectedCrc) { |
| 296 | + String validDocIdsType, MinionContext minionContext, String expectedCrc, String comparisonModeStr) { |
| 297 | + MinionConstants.ValidDocIdsConsensusMode consensusMode = parseValidDocIdsConsensusMode(comparisonModeStr); |
290 | 298 | String clusterName = minionContext.getHelixManager().getClusterName(); |
291 | 299 | HelixAdmin helixAdmin = minionContext.getHelixManager().getClusterManagmentTool(); |
292 | | - RoaringBitmap validDocIds = null; |
293 | 300 | List<String> servers = getServers(segmentName, tableNameWithType, helixAdmin, clusterName); |
| 301 | + List<RoaringBitmap> matchingBitmaps = new ArrayList<>(); |
| 302 | + |
294 | 303 | for (String server : servers) { |
295 | 304 | InstanceConfig instanceConfig = helixAdmin.getInstanceConfig(clusterName, server); |
296 | 305 | String endpoint = InstanceUtils.getServerAdminEndpoint(instanceConfig); |
297 | 306 |
|
298 | | - // We only need aggregated table size and the total number of docs/rows. Skipping column related stats, by |
299 | | - // passing an empty list. |
300 | 307 | ServerSegmentMetadataReader serverSegmentMetadataReader = new ServerSegmentMetadataReader(); |
301 | 308 | ValidDocIdsBitmapResponse validDocIdsBitmapResponse; |
302 | 309 | try { |
303 | 310 | validDocIdsBitmapResponse = |
304 | 311 | serverSegmentMetadataReader.getValidDocIdsBitmapFromServer(tableNameWithType, segmentName, endpoint, |
305 | 312 | validDocIdsType, 60_000); |
306 | 313 | } catch (Exception e) { |
307 | | - LOGGER.warn("Unable to retrieve validDocIds bitmap for segment: " + segmentName + " from endpoint: " |
308 | | - + endpoint, e); |
309 | | - continue; |
| 314 | + if (consensusMode == MinionConstants.ValidDocIdsConsensusMode.UNSAFE) { |
| 315 | + LOGGER.warn( |
| 316 | + "Unable to retrieve validDocIds bitmap for segment: " + segmentName + " from endpoint: " + endpoint, e); |
| 317 | + continue; |
| 318 | + } else { |
| 319 | + throw new IllegalStateException( |
| 320 | + "Unable to retrieve validDocIds bitmap for segment: " + segmentName + " from endpoint: " + endpoint, e); |
| 321 | + } |
310 | 322 | } |
311 | 323 |
|
| 324 | + String crcFromValidDocIdsBitmap = validDocIdsBitmapResponse.getSegmentCrc(); |
312 | 325 | // Check crc from the downloaded segment against the crc returned from the server along with the valid doc id |
313 | 326 | // bitmap. If this doesn't match, this means that we are hitting the race condition where the segment has been |
314 | 327 | // uploaded successfully while the server is still reloading the segment. Reloading can take a while when the |
315 | 328 | // offheap upsert is used because we will need to delete & add all primary keys. |
316 | 329 | // `BaseSingleSegmentConversionExecutor.executeTask()` already checks for the crc from the task generator |
317 | 330 | // against the crc from the current segment zk metadata, so we don't need to check that here. |
318 | | - String crcFromValidDocIdsBitmap = validDocIdsBitmapResponse.getSegmentCrc(); |
319 | 331 | if (!expectedCrc.equals(crcFromValidDocIdsBitmap)) { |
320 | | - // In this scenario, we are hitting the other replica of the segment which did not commit to ZK or deepstore. |
321 | | - // We will skip processing this bitmap to query other server to confirm if there is a valid matching CRC. |
322 | | - String message = "CRC mismatch for segment: " + segmentName + ", expected value based on task generator: " |
323 | | - + expectedCrc + ", actual crc from validDocIdsBitmapResponse from endpoint " + endpoint + ": " |
324 | | - + crcFromValidDocIdsBitmap; |
325 | | - LOGGER.warn(message); |
326 | | - continue; |
| 332 | + if (consensusMode == MinionConstants.ValidDocIdsConsensusMode.UNSAFE) { |
| 333 | + LOGGER.warn("CRC mismatch for segment: {} from endpoint {}, skipping", segmentName, endpoint); |
| 334 | + continue; |
| 335 | + } else { |
| 336 | + throw new IllegalStateException( |
| 337 | + "CRC mismatch for segment: " + segmentName + ", expected: " + expectedCrc + ", actual from endpoint " |
| 338 | + + endpoint + ": " + crcFromValidDocIdsBitmap); |
| 339 | + } |
327 | 340 | } |
328 | 341 |
|
329 | | - // skipping servers which are not in READY state. The bitmaps would be inconsistent when |
330 | | - // server is NOT READY as UPDATING segments might be updating the ONLINE segments |
331 | 342 | if (validDocIdsBitmapResponse.getServerStatus() != null && !validDocIdsBitmapResponse.getServerStatus() |
332 | 343 | .equals(ServiceStatus.Status.GOOD)) { |
333 | | - String message = "Server " + validDocIdsBitmapResponse.getInstanceId() + " is in " |
334 | | - + validDocIdsBitmapResponse.getServerStatus() + " state, skipping it for execution for segment: " |
335 | | - + validDocIdsBitmapResponse.getSegmentName() + ". Will try other servers."; |
336 | | - LOGGER.warn(message); |
337 | | - continue; |
| 344 | + if (consensusMode == MinionConstants.ValidDocIdsConsensusMode.UNSAFE) { |
| 345 | + LOGGER.warn("Server {} not READY for segment {}, skipping", validDocIdsBitmapResponse.getInstanceId(), |
| 346 | + segmentName); |
| 347 | + continue; |
| 348 | + } else { |
| 349 | + throw new IllegalStateException("Server " + validDocIdsBitmapResponse.getInstanceId() + " is in " |
| 350 | + + validDocIdsBitmapResponse.getServerStatus() + " state for segment: " + segmentName |
| 351 | + + ". Failing task to avoid inconsistency among replicas."); |
| 352 | + } |
| 353 | + } |
| 354 | + |
| 355 | + RoaringBitmap bitmap = RoaringBitmapUtils.deserialize(validDocIdsBitmapResponse.getBitmap()); |
| 356 | + int cardinality = bitmap.getCardinality(); |
| 357 | + |
| 358 | + if (consensusMode == MinionConstants.ValidDocIdsConsensusMode.UNSAFE) { |
| 359 | + LOGGER.info("Using server {} with {} valid docs for segment {} (mode=UNSAFE)", server, cardinality, |
| 360 | + segmentName); |
| 361 | + return bitmap; |
338 | 362 | } |
339 | 363 |
|
340 | | - validDocIds = RoaringBitmapUtils.deserialize(validDocIdsBitmapResponse.getBitmap()); |
341 | | - break; |
| 364 | + matchingBitmaps.add(bitmap); |
342 | 365 | } |
343 | | - return validDocIds; |
| 366 | + |
| 367 | + if (matchingBitmaps.isEmpty()) { |
| 368 | + return null; |
| 369 | + } |
| 370 | + |
| 371 | + if (consensusMode == MinionConstants.ValidDocIdsConsensusMode.EQUAL) { |
| 372 | + RoaringBitmap consensusBitMap = matchingBitmaps.get(0); |
| 373 | + for (RoaringBitmap b : matchingBitmaps) { |
| 374 | + if (!b.equals(consensusBitMap)) { |
| 375 | + throw new IllegalStateException("No consensus on validDocs across replicas for segment: " + segmentName |
| 376 | + + ". Failing task to avoid replica inconsistency."); |
| 377 | + } |
| 378 | + } |
| 379 | + LOGGER.info("All {} servers have {} valid docs for segment {}", servers.size(), consensusBitMap.getCardinality(), |
| 380 | + segmentName); |
| 381 | + return consensusBitMap; |
| 382 | + } |
| 383 | + |
| 384 | + // MOST_VALID_DOCS: explicitly pick the bitmap with the maximum valid doc count |
| 385 | + RoaringBitmap maxCardinalityMap = null; |
| 386 | + int maxCard = -1; |
| 387 | + for (RoaringBitmap b : matchingBitmaps) { |
| 388 | + int card = b.getCardinality(); |
| 389 | + if (card > maxCard) { |
| 390 | + maxCard = card; |
| 391 | + maxCardinalityMap = b; |
| 392 | + } |
| 393 | + } |
| 394 | + if (maxCardinalityMap != null) { |
| 395 | + LOGGER.info("Selected server with {} valid docs for segment {} (mode=MOST_VALID_DOCS, checked {} servers)", |
| 396 | + maxCard, segmentName, servers.size()); |
| 397 | + } |
| 398 | + return maxCardinalityMap; |
344 | 399 | } |
345 | 400 |
|
346 | 401 | public static String toUTCString(long epochMillis) { |
@@ -396,47 +451,4 @@ public static ValidDocIdsType getValidDocIdsType(UpsertConfig upsertConfig, Map< |
396 | 451 | } |
397 | 452 | return validDocIdsType; |
398 | 453 | } |
399 | | - |
400 | | - /** |
401 | | - * Checks if all replicas have consensus on validDoc counts for a segment. |
402 | | - * SAFETY LOGIC: |
403 | | - * 1. Only proceed with operations when ALL replicas agree on totalValidDocs count |
404 | | - * 2. Skip operations if ANY server hosting the segment is not in READY state |
405 | | - * 3. Include all replicas (even those with CRC mismatches) in consensus for safety |
406 | | - * |
407 | | - * @param segmentName the name of the segment being checked |
408 | | - * @param replicaMetadataList list of metadata from all replicas of the segment |
409 | | - * @return true if all replicas have consensus on validDoc counts, false otherwise |
410 | | - */ |
411 | | - public static boolean hasValidDocConsensus(String segmentName, |
412 | | - List<ValidDocIdsMetadataInfo> replicaMetadataList) { |
413 | | - |
414 | | - if (replicaMetadataList == null || replicaMetadataList.isEmpty()) { |
415 | | - LOGGER.warn("No replica metadata available for segment: {}", segmentName); |
416 | | - return false; |
417 | | - } |
418 | | - |
419 | | - // Check server readiness and validDoc consensus |
420 | | - Long consensusValidDocs = null; |
421 | | - for (ValidDocIdsMetadataInfo metadata : replicaMetadataList) { |
422 | | - // Check server readiness - skip if ANY server is not ready |
423 | | - if (metadata.getServerStatus() != null && !metadata.getServerStatus().equals(ServiceStatus.Status.GOOD)) { |
424 | | - LOGGER.warn("Server {} is in {} state for segment: {}, skipping consensus check", |
425 | | - metadata.getInstanceId(), metadata.getServerStatus(), segmentName); |
426 | | - return false; |
427 | | - } |
428 | | - |
429 | | - // Check if all replicas have the same totalValidDocs count |
430 | | - long validDocs = metadata.getTotalValidDocs(); |
431 | | - if (consensusValidDocs == null) { |
432 | | - // First iteration, we record the value to compare against |
433 | | - consensusValidDocs = validDocs; |
434 | | - } else if (!consensusValidDocs.equals(validDocs)) { |
435 | | - LOGGER.warn("Inconsistent validDoc counts across replicas for segment: {}. Expected: {}, but found: {}", |
436 | | - segmentName, consensusValidDocs, validDocs); |
437 | | - return false; |
438 | | - } |
439 | | - } |
440 | | - return true; |
441 | | - } |
442 | 454 | } |
0 commit comments