-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathchroma_stats.py
More file actions
51 lines (38 loc) · 1.4 KB
/
chroma_stats.py
File metadata and controls
51 lines (38 loc) · 1.4 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
#!/usr/bin/env python3
"""
ChromaDB Stats - Shows document and chunk counts
Usage: docker exec genai-app python /app/chroma_stats.py
"""
import chromadb
import os
from collections import Counter
# Location of the on-disk Chroma store; the CHROMA_PERSIST_DIR environment
# variable overrides the built-in default path.
PERSIST_DIR = os.environ.get("CHROMA_PERSIST_DIR", "/app/chroma_db")
def _collection_name(entry):
    """Return the collection name for one item from ``list_collections()``."""
    # Some Chroma releases (0.6.x) yield bare name strings, others yield
    # Collection objects exposing ``.name``; accept both.
    return entry if isinstance(entry, str) else entry.name


def _count_sources(metadatas):
    """Tally chunks per ``source`` metadata value.

    Args:
        metadatas: Iterable of per-chunk metadata dicts. Individual entries
            may be ``None`` (chunk stored without metadata) and the whole
            argument may be ``None``.

    Returns:
        A ``Counter`` mapping source (as ``str``) to its chunk count. Chunks
        with no metadata or no ``source`` key are counted under "unknown".
    """
    tally = Counter()
    for meta in metadatas or ():
        source = (meta or {}).get("source")
        # Coerce to str so non-string metadata values cannot break the
        # os.path.basename() call made when printing.
        tally["unknown" if source is None else str(source)] += 1
    return tally


def main():
    """Print chunk and per-source document counts for every collection.

    Opens the persistent Chroma store at ``PERSIST_DIR``, then for each
    collection prints the total chunk count, the number of distinct
    ``source`` metadata values, and a per-source breakdown ordered from most
    to fewest chunks. Output goes to stdout; nothing is returned.
    """
    client = chromadb.PersistentClient(path=PERSIST_DIR)
    collections = client.list_collections()

    print("=" * 60)
    print("CHROMADB STATISTICS")
    print("=" * 60)

    if not collections:
        print("No collections found.")
        return

    for col in collections:
        name = _collection_name(col)
        collection = client.get_collection(name)
        count = collection.count()

        print(f"\nCollection: {name}")
        print("-" * 40)
        print(f" Total chunks: {count:,}")

        if count > 0:
            # Fetch only metadata (no documents/embeddings) for the whole
            # collection to count unique sources.
            results = collection.get(include=["metadatas"])
            source_counts = _count_sources(results.get("metadatas"))

            print(f" Unique documents: {len(source_counts)}")
            print("\n Documents breakdown:")
            for source, chunk_count in source_counts.most_common():
                # Show just the filename rather than the full stored path.
                filename = os.path.basename(source)
                print(f" - {filename}: {chunk_count:,} chunks")

    print("\n" + "=" * 60)
# Run the report only when this file is executed as a script, so importing
# the module has no side effects.
if __name__ == "__main__":
    main()