Hi
I was trying to use the topic format annotations in the HF dataset but it is missing almost 50% of the annotation data.
Here's the script I'm using to measure this after downloading both the documents/ and domains_formats/ directories -
# check_missing_labels.py
import os
import re
# Paths
BASE = "/dais/fs/scratch/afkhan/Creativity_Project/Data/Corpus-200B-Documents"
DOCS_DIR = f"{BASE}/documents"
LABELS_DIR = f"{BASE}/domains_formats"

# File-name conventions: a document "<name>.jsonl" is expected to have a
# label file "<name>__choice.npy" in the labels directory.
DOC_SUFFIX = ".jsonl"
LABEL_SUFFIX = "__choice.npy"

# Regex to extract shard number
SHARD_RE = re.compile(r"CC_shard_(\d+)_processed\.jsonl")


def find_missing_labels(docs_dir, labels_dir):
    """Find document files that have no corresponding label file.

    Args:
        docs_dir: Directory containing the ``*.jsonl`` document files.
        labels_dir: Directory expected to contain one
            ``<name>__choice.npy`` file per ``<name>.jsonl`` document.

    Returns:
        A tuple ``(doc_files, missing_files, missing_shard_numbers)``:
        all document file names found (sorted), the subset with no label
        file (sorted), and the shard numbers parsed from those missing
        names that match ``SHARD_RE``.

    Raises:
        OSError: If ``docs_dir`` cannot be listed (e.g. it does not exist).
    """
    # os.listdir returns entries in arbitrary order; sort so that the
    # report is stable from run to run.
    doc_files = sorted(f for f in os.listdir(docs_dir) if f.endswith(DOC_SUFFIX))

    missing_files = []
    missing_shard_numbers = []
    for doc_file in doc_files:
        # Strip only the trailing extension; str.replace would also remove
        # any ".jsonl" occurring earlier in the name.
        base = doc_file[: -len(DOC_SUFFIX)]
        label_path = os.path.join(labels_dir, f"{base}{LABEL_SUFFIX}")
        if os.path.exists(label_path):
            continue

        missing_files.append(doc_file)
        # Extract shard number (names that do not follow the shard pattern
        # are still reported as missing, just without a shard number).
        match = SHARD_RE.match(doc_file)
        if match:
            missing_shard_numbers.append(int(match.group(1)))

    return doc_files, missing_files, missing_shard_numbers


def percent_missing(missing_count, total):
    """Return ``missing_count`` as a percentage of ``total``.

    Returns 0.0 when ``total`` is zero, so an empty documents directory
    does not raise ``ZeroDivisionError``.
    """
    if not total:
        return 0.0
    return (missing_count / total) * 100


def main(docs_dir=DOCS_DIR, labels_dir=LABELS_DIR):
    """Print a report of document files lacking a label file."""
    doc_files, missing_files, missing_shard_numbers = find_missing_labels(
        docs_dir, labels_dir
    )

    # Print missing files
    if missing_files:
        print("The following document files do NOT have a corresponding label file:")
        for f in missing_files:
            print(f" - {f}")
    else:
        print("All document files have label files.")

    # Percentage calculation
    total = len(doc_files)
    missing_count = len(missing_files)
    percent = percent_missing(missing_count, total)
    print(f"\nTotal documents: {total}")
    print(f"Missing label files: {missing_count} ({percent:.2f}%)")

    # Print smallest shard number missing
    if missing_shard_numbers:
        smallest_shard = min(missing_shard_numbers)
        print(f"Smallest shard number missing a label file: {smallest_shard:08d}")


if __name__ == "__main__":
    main()
Here's the output -
Total documents: 9889
Missing label files: 4937 (49.92%)
Smallest shard number missing a label file: 00000005
and if you check the dataset on HF there is indeed no label file for shard 00000005. Is this expected/am I missing something here?
Hi
I was trying to use the topic format annotations in the HF dataset but it is missing almost 50% of the annotation data.
Here's the script I'm using to measure this after downloading both the
`documents/` and `domains_formats/` directories -
Here's the output -
and if you check the dataset on HF there is indeed no label file for shard 00000005. Is this expected/am I missing something here?