Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 16 additions & 0 deletions .gitattributes
Original file line number Diff line number Diff line change
Expand Up @@ -4,3 +4,19 @@
worked/**/*.html linguist-vendored=true
graphify-out/**/*.html linguist-vendored=true
*.html linguist-detectable=false

# Normalize line endings
* text=auto

# Scripts always get LF
*.sh text eol=lf
*.py text eol=lf

# Windows batch files always get CRLF
*.bat text eol=crlf
*.cmd text eol=crlf

# Binary files
*.pdf binary
*.png binary
*.jpg binary
7 changes: 6 additions & 1 deletion graphify/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2932,13 +2932,18 @@ def main() -> None:

p = _ap.ArgumentParser(prog="graphify save-result")
p.add_argument("--question", required=True)
p.add_argument("--answer", required=True)
p.add_argument("--answer", default=None)
p.add_argument("--answer-file", dest="answer_file", default=None)
p.add_argument("--type", dest="query_type", default="query")
p.add_argument("--nodes", nargs="*", default=[])
p.add_argument("--outcome", choices=("useful", "dead_end", "corrected"), default=None)
p.add_argument("--correction", default=None)
p.add_argument("--memory-dir", default=str(Path(_GRAPHIFY_OUT) / "memory"))
opts = p.parse_args(sys.argv[2:])
if opts.answer_file:
opts.answer = Path(opts.answer_file).read_text(encoding="utf-8").strip()
elif not opts.answer:
p.error("--answer or --answer-file is required")
from graphify.ingest import save_query_result as _sqr

out = _sqr(
Expand Down
9 changes: 9 additions & 0 deletions graphify/export.py
Original file line number Diff line number Diff line change
Expand Up @@ -1490,6 +1490,15 @@ def to_graphml(
for _, _, attrs in H.edges(data=True):
for k in [k for k in attrs if k.startswith("_")]:
del attrs[k]
# nx.write_graphml raises ValueError on None attribute values; replace with "".
for node_id in H.nodes():
for key, val in list(H.nodes[node_id].items()):
if val is None:
H.nodes[node_id][key] = ""
for u, v in H.edges():
for key, val in list(H.edges[u, v].items()):
if val is None:
H.edges[u, v][key] = ""
nx.write_graphml(H, output_path)


Expand Down
669 changes: 306 additions & 363 deletions graphify/skill-windows.md

Large diffs are not rendered by default.

23 changes: 23 additions & 0 deletions graphify/windows-scripts/add_transcripts_to_detect.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
import sys
import json
from pathlib import Path
# Emit UTF-8 on both standard streams regardless of the console default.
sys.stdout.reconfigure(encoding='utf-8')
sys.stderr.reconfigure(encoding='utf-8')

# Merge the transcript paths listed in .graphify_transcripts.json into the
# detection manifest (.graphify_detect.json) and bump its file count.
transcripts_file = Path('graphify-out/.graphify_transcripts.json')
detect_file = Path('graphify-out/.graphify_detect.json')

# A missing transcripts file is not an error: exit 0 so the caller continues.
if not transcripts_file.exists():
    print('No transcripts file found, skipping.')
    raise SystemExit(0)

transcript_paths = json.loads(transcripts_file.read_text(encoding='utf-8'))
if not transcript_paths:
    print('No transcripts produced.')
    raise SystemExit(0)

detect = json.loads(detect_file.read_text(encoding='utf-8'))
# File the transcripts under the 'document' category. The semantic-extraction
# cache check only reads the 'document', 'paper' and 'image' categories, so
# entries stored under any other key (previously 'docs') were never extracted.
detect.setdefault('files', {}).setdefault('document', []).extend(transcript_paths)
detect['total_files'] = detect.get('total_files', 0) + len(transcript_paths)
detect_file.write_text(json.dumps(detect, indent=2), encoding='utf-8')
print(f'Transcribed {len(transcript_paths)} video file(s) -> treating as docs')
20 changes: 20 additions & 0 deletions graphify/windows-scripts/ast_extraction.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
import sys, json
from graphify.extract import collect_files, extract
from pathlib import Path
import json
# Emit UTF-8 on both standard streams regardless of the console default.
for _stream in (sys.stdout, sys.stderr):
    _stream.reconfigure(encoding='utf-8')

# Run AST extraction over the code files recorded in the detection manifest
# and store the result in graphify-out/.graphify_ast.json.
manifest = json.loads(Path('graphify-out/.graphify_detect.json').read_text(encoding='utf-8'))
input_path = manifest.get('input_path', '.')

# Directory entries are expanded through collect_files; plain entries are
# used as-is.
code_files = []
for entry in manifest.get('files', {}).get('code', []):
    entry_path = Path(entry)
    if entry_path.is_dir():
        code_files.extend(collect_files(entry_path))
    else:
        code_files.append(entry_path)

ast_out = Path('graphify-out/.graphify_ast.json')
if not code_files:
    # Still write an empty result so the output file always exists.
    empty_result = {'nodes': [], 'edges': [], 'input_tokens': 0, 'output_tokens': 0}
    ast_out.write_text(json.dumps(empty_result, ensure_ascii=False), encoding='utf-8')
    print('No code files - skipping AST extraction')
else:
    result = extract(code_files, cache_root=Path(input_path))
    ast_out.write_text(json.dumps(result, indent=2, ensure_ascii=False), encoding='utf-8')
    node_count = len(result["nodes"])
    edge_count = len(result["edges"])
    print(f'AST: {node_count} nodes, {edge_count} edges')
47 changes: 47 additions & 0 deletions graphify/windows-scripts/build_graph.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
import sys, json
from graphify.build import build_from_json
from graphify.cluster import cluster, score_all
from graphify.analyze import god_nodes, surprising_connections, suggest_questions
from graphify.report import generate
from graphify.export import to_json
from pathlib import Path
# Emit UTF-8 on both standard streams regardless of the console default.
for _stream in (sys.stdout, sys.stderr):
    _stream.reconfigure(encoding='utf-8')

# Usage: build_graph.py [input_path] [directed]
#   input_path defaults to '.'; directed is true only for the literal
#   'true' (case-insensitive).
cli_args = sys.argv[1:]
input_path = cli_args[0] if cli_args else '.'
directed = len(cli_args) > 1 and cli_args[1].lower() == 'true'


def _load_json(path_str):
    """Parse the UTF-8 JSON file at *path_str*."""
    return json.loads(Path(path_str).read_text(encoding='utf-8'))


extraction = _load_json('graphify-out/.graphify_extract.json')
detection = _load_json('graphify-out/.graphify_detect.json')

G = build_from_json(extraction, root=input_path, directed=directed)
if G.number_of_nodes() == 0:
    print('ERROR: Graph is empty - extraction produced no nodes.')
    print('Possible causes: all files were skipped, binary-only corpus, or extraction failed.')
    sys.exit(1)

# Analysis passes; kept in the same order as they feed the report below.
communities = cluster(G)
cohesion = score_all(G, communities)
tokens = {
    'input': extraction.get('input_tokens', 0),
    'output': extraction.get('output_tokens', 0),
}
gods = god_nodes(G)
surprises = surprising_connections(G, communities)
labels = {cid: f'Community {cid!s}' for cid in communities}
questions = suggest_questions(G, communities, labels)

# A falsy return from to_json is treated as a refusal to overwrite the
# existing graph; stop before writing the report or analysis files.
if not to_json(G, communities, 'graphify-out/graph.json'):
    print('ERROR: refused to shrink graphify-out/graph.json (existing graph has more nodes; #479).')
    print('If this shrink is intentional (you deleted files), re-run a full build with --force.')
    sys.exit(1)

report = generate(
    G, communities, cohesion, labels, gods, surprises,
    detection, tokens, input_path, suggested_questions=questions,
)
Path('graphify-out/GRAPH_REPORT.md').write_text(report, encoding='utf-8')

# JSON object keys must be strings, so community ids are stringified.
analysis = {
    'communities': {str(cid): members for cid, members in communities.items()},
    'cohesion': {str(cid): score for cid, score in cohesion.items()},
    'gods': gods,
    'surprises': surprises,
    'questions': questions,
}
Path('graphify-out/.graphify_analysis.json').write_text(
    json.dumps(analysis, indent=2, ensure_ascii=False),
    encoding='utf-8',
)
print(f'Graph: {G.number_of_nodes()} nodes, {G.number_of_edges()} edges, {len(communities)} communities')
12 changes: 12 additions & 0 deletions graphify/windows-scripts/check_code_only.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
import sys
import json
from pathlib import Path
# Emit UTF-8 on both standard streams regardless of the console default.
for _stream in (sys.stdout, sys.stderr):
    _stream.reconfigure(encoding='utf-8')

# Report whether every new/changed file in the incremental manifest has a
# source-code extension. Prints 'code_only: True' or 'code_only: False'.
code_exts = {'.py', '.ts', '.js', '.go', '.rs', '.java', '.cpp', '.c', '.rb', '.swift', '.kt', '.cs', '.scala', '.php', '.cc', '.cxx', '.hpp', '.h', '.kts', '.lua', '.toc'}

incremental_file = Path('graphify-out/.graphify_incremental.json')
if incremental_file.exists():
    result = json.loads(incremental_file.read_text(encoding='utf-8'))
else:
    # No manifest: behave as if nothing changed.
    result = {}

# Flatten the per-category lists of changed files into one list.
all_changed = []
for category_files in result.get('new_files', {}).values():
    all_changed.extend(category_files)

# Starts True, so an empty change set also reports code_only: True.
code_only = True
for changed in all_changed:
    if Path(changed).suffix.lower() not in code_exts:
        code_only = False
        break
print('code_only:', code_only)
26 changes: 26 additions & 0 deletions graphify/windows-scripts/check_extraction_cache.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
import sys
import json
from graphify.cache import check_semantic_cache
from pathlib import Path
# Emit UTF-8 on both standard streams regardless of the console default.
for _stream in (sys.stdout, sys.stderr):
    _stream.reconfigure(encoding='utf-8')

# Usage: check_extraction_cache.py [input_path]   (input_path defaults to '.')
cli_args = sys.argv[1:]
input_path = cli_args[0] if cli_args else '.'
detect = json.loads(Path('graphify-out/.graphify_detect.json').read_text(encoding='utf-8'))

# Only content files go to semantic extraction; code is handled by AST (Part A).
# Video is transcribed to document in Step 2.5 first.
all_files = []
for category in ('document', 'paper', 'image'):
    all_files.extend(detect['files'].get(category, []))

cached_nodes, cached_edges, cached_hyperedges, uncached = check_semantic_cache(
    all_files, root=Path(input_path)
)

# Always (re)write or DELETE the cache file so Part C never merges stale data (#1392).
cached_file = Path('graphify-out/.graphify_cached.json')
nothing_cached = not (cached_nodes or cached_edges or cached_hyperedges)
if nothing_cached:
    cached_file.unlink(missing_ok=True)
else:
    cached_payload = {
        'nodes': cached_nodes,
        'edges': cached_edges,
        'hyperedges': cached_hyperedges,
    }
    cached_file.write_text(json.dumps(cached_payload, ensure_ascii=False), encoding='utf-8')

# One uncached path per line.
Path('graphify-out/.graphify_uncached.txt').write_text('\n'.join(uncached), encoding='utf-8')
hit_count = len(all_files) - len(uncached)
print(f'Cache: {hit_count} files hit, {len(uncached)} files need extraction')
8 changes: 8 additions & 0 deletions graphify/windows-scripts/check_graph_exists.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
import sys
from pathlib import Path
# Emit UTF-8 on both standard streams regardless of the console default.
for _stream in (sys.stdout, sys.stderr):
    _stream.reconfigure(encoding='utf-8')

# Guard script: exit with status 1 when no graph has been built yet.
graph_present = Path('graphify-out/graph.json').exists()
if not graph_present:
    print('ERROR: No graph found. Run /graphify <path> first to build the graph.')
    sys.exit(1)
23 changes: 23 additions & 0 deletions graphify/windows-scripts/check_graph_health.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
import sys, json
from pathlib import Path
from graphify.diagnostics import diagnose_extraction, format_diagnostic_report
# Emit UTF-8 on both standard streams regardless of the console default.
for _stream in (sys.stdout, sys.stderr):
    _stream.reconfigure(encoding='utf-8')

# Usage: check_graph_health.py [input_path] [directed]
#   input_path defaults to '.'; directed is true only for the literal
#   'true' (case-insensitive).
cli_args = sys.argv[1:]
input_path = cli_args[0] if cli_args else '.'
directed = len(cli_args) > 1 and cli_args[1].lower() == 'true'

extraction = json.loads(Path('graphify-out/.graphify_extract.json').read_text(encoding='utf-8'))
summary = diagnose_extraction(extraction, directed=directed, root=input_path)
print(format_diagnostic_report(summary))

# Summary counters that indicate a damaged graph, paired with the label
# used for each one in the warning line.
problem_counters = (
    ('dangling_endpoint_edges', 'dangling-endpoint edges'),
    ('missing_endpoint_edges', 'missing-endpoint edges'),
    ('self_loop_edges', 'self-loop edges'),
    ('directed_same_endpoint_collapsed_edges', 'collapsed (directed) edges'),
    ('undirected_same_endpoint_collapsed_edges', 'collapsed (undirected) edges'),
)

# Keep only the counters that are present and non-zero.
flags = []
for counter_key, label in problem_counters:
    if summary.get(counter_key, 0):
        flags.append(f'{summary[counter_key]} {label}')

if not flags:
    print('Graph health: OK (no dangling/missing/collapsed edges).')
else:
    print('GRAPH HEALTH WARNING: ' + '; '.join(flags) + ' - graph may be incomplete/corrupt.')
4 changes: 4 additions & 0 deletions graphify/windows-scripts/check_graphify_installed.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
import sys
import graphify # exits non-zero if not installed
# Emit UTF-8 on both standard streams regardless of the console default.
for _stream in (sys.stdout, sys.stderr):
    _stream.reconfigure(encoding='utf-8')
37 changes: 37 additions & 0 deletions graphify/windows-scripts/cluster_only.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
import sys
import json
from graphify.cluster import cluster, score_all
from graphify.analyze import god_nodes, surprising_connections
from graphify.report import generate
from graphify.export import to_json
from networkx.readwrite import json_graph
import networkx as nx
from pathlib import Path
# Emit UTF-8 on both standard streams regardless of the console default.
sys.stdout.reconfigure(encoding='utf-8')
sys.stderr.reconfigure(encoding='utf-8')

# Re-cluster an existing graph: load graphify-out/graph.json, recompute
# communities and analysis, then rewrite the report, graph and analysis files.
data = json.loads(Path('graphify-out/graph.json').read_text(encoding='utf-8'))
try:
    G = json_graph.node_link_graph(data, edges='links')
except TypeError:
    # networkx releases that predate the `edges` keyword reject it with a
    # TypeError; their default link key is already 'links'.
    G = json_graph.node_link_graph(data)

# No detection or extraction step runs here, so the report is given a
# placeholder detection summary and zero token counts.
detection = {'total_files': 0, 'total_words': 99999, 'needs_graph': True, 'warning': None,
             'files': {'code': [], 'document': [], 'paper': []}}
tokens = {'input': 0, 'output': 0}

communities = cluster(G)
cohesion = score_all(G, communities)
gods = god_nodes(G)
surprises = surprising_connections(G, communities)
labels = {cid: 'Community ' + str(cid) for cid in communities}

report = generate(G, communities, cohesion, labels, gods, surprises, detection, tokens, '.')
Path('graphify-out/GRAPH_REPORT.md').write_text(report, encoding='utf-8')
to_json(G, communities, 'graphify-out/graph.json')

# JSON object keys must be strings, so community ids are stringified.
analysis = {
    'communities': {str(k): v for k, v in communities.items()},
    'cohesion': {str(k): v for k, v in cohesion.items()},
    'gods': gods,
    'surprises': surprises,
}
Path('graphify-out/.graphify_analysis.json').write_text(json.dumps(analysis, indent=2, ensure_ascii=False), encoding='utf-8')
print(f'Re-clustered: {len(communities)} communities')
58 changes: 58 additions & 0 deletions graphify/windows-scripts/collect_chunk_results.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
import sys
import json, sys
from pathlib import Path
# Emit UTF-8 on both standard streams regardless of the console default.
sys.stdout.reconfigure(encoding='utf-8')
sys.stderr.reconfigure(encoding='utf-8')

# Merge every graphify-out/.graphify_chunk_*.json file into a single
# .graphify_semantic_new.json, then delete the chunk files.
out_dir = Path('graphify-out')
chunk_files = sorted(out_dir.glob('.graphify_chunk_*.json'))

if not chunk_files:
    print('ERROR: No chunk files found in graphify-out/. Subagents may not have written results.')
    print('Re-run and ensure subagent_type="general-purpose" is used.')
    sys.exit(1)

all_nodes, all_edges, all_hyperedges = [], [], []
total_input = total_output = 0
failed = []

for cf in chunk_files:
    # Stage and validate the whole chunk before touching the accumulators,
    # so a chunk that fails partway through contributes nothing to the output.
    try:
        data = json.loads(cf.read_text(encoding='utf-8'))
        if 'nodes' not in data or 'edges' not in data:
            raise ValueError('missing nodes or edges keys')
        chunk_nodes = list(data['nodes'])
        chunk_edges = list(data['edges'])
        chunk_hyperedges = list(data.get('hyperedges', []))
        # The dedupe pass below indexes n['id'], so reject chunks whose
        # nodes cannot be keyed instead of crashing later.
        if not all(isinstance(n, dict) and 'id' in n for n in chunk_nodes):
            raise ValueError('node without an id')
        next_input = total_input + data.get('input_tokens', 0)
        next_output = total_output + data.get('output_tokens', 0)
    except Exception as e:
        # Deliberately broad: any unreadable or malformed chunk is reported
        # and skipped rather than aborting the whole merge.
        print(f'WARNING: {cf.name} failed validation: {e}')
        failed.append(cf.name)
        continue
    all_nodes.extend(chunk_nodes)
    all_edges.extend(chunk_edges)
    all_hyperedges.extend(chunk_hyperedges)
    total_input, total_output = next_input, next_output

total = len(chunk_files)
# Abort when more than half of the chunks are unusable.
if len(failed) > total / 2:
    print(f'ERROR: {len(failed)}/{total} chunks failed. Re-run and ensure subagent_type="general-purpose".')
    sys.exit(1)

if failed:
    print(f'WARNING: {len(failed)} chunk(s) invalid: {", ".join(failed)}')

# Keep the first occurrence of each node id; later duplicates are dropped.
seen: set = set()
deduped_nodes = []
for n in all_nodes:
    if n['id'] not in seen:
        seen.add(n['id'])
        deduped_nodes.append(n)

Path('graphify-out/.graphify_semantic_new.json').write_text(json.dumps({
    'nodes': deduped_nodes,
    'edges': all_edges,
    'hyperedges': all_hyperedges,
    'input_tokens': total_input,
    'output_tokens': total_output,
}, indent=2), encoding='utf-8')
print(f'Collected {total - len(failed)}/{total} chunks: {len(deduped_nodes)} nodes, {len(all_edges)} edges')

# Remove all chunk files, including the ones that failed validation.
for cf in chunk_files:
    cf.unlink(missing_ok=True)
20 changes: 20 additions & 0 deletions graphify/windows-scripts/detect_files.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
import sys, json
from graphify.detect import detect
from pathlib import Path
# Emit UTF-8 on both standard streams regardless of the console default.
for _stream in (sys.stdout, sys.stderr):
    _stream.reconfigure(encoding='utf-8')

def _is_assets(path_str: str) -> bool:
return any(part.lower() == 'assets' for part in Path(path_str).parts)

# Usage: detect_files.py <input_path>
# Runs detection on the resolved input path, drops anything under an
# 'assets' folder, and writes the manifest to
# graphify-out/.graphify_detect.json.
input_path = Path(sys.argv[1]).resolve()
result = detect(input_path)
result['input_path'] = str(input_path)

# Discard files inside any folder named 'assets'
for category, paths in result.get('files', {}).items():
    result['files'][category] = [p for p in paths if not _is_assets(p)]
# Recount after filtering so total_files matches the surviving entries.
result['total_files'] = sum(map(len, result.get('files', {}).values()))

Path('graphify-out').mkdir(exist_ok=True)
manifest_text = json.dumps(result, ensure_ascii=False)
Path('graphify-out/.graphify_detect.json').write_text(manifest_text, encoding='utf-8')
39 changes: 39 additions & 0 deletions graphify/windows-scripts/detect_incremental.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
import sys, json
from graphify.detect import detect_incremental, save_manifest
from pathlib import Path
# Emit UTF-8 on both standard streams regardless of the console default.
for _stream in (sys.stdout, sys.stderr):
    _stream.reconfigure(encoding='utf-8')

def _is_assets(path_str: str) -> bool:
return any(part.lower() == 'assets' for part in Path(path_str).parts)

# Usage: detect_incremental.py <input_path>
# Detects new/changed files, records the full result in
# graphify-out/.graphify_incremental.json, and rewrites
# graphify-out/.graphify_detect.json so it covers only the changed files.
scan_root = Path(sys.argv[1])
result = detect_incremental(scan_root, kind='ast')

# Discard files inside any folder named 'assets'
for bucket in ('files', 'new_files'):
    for category, paths in result.get(bucket, {}).items():
        result[bucket][category] = [p for p in paths if not _is_assets(p)]
# Recount both totals after filtering.
result['new_total'] = sum(map(len, result.get('new_files', {}).values()))
result['total_files'] = sum(map(len, result.get('files', {}).values()))

new_total = result['new_total']
print(json.dumps(result, indent=2))
Path('graphify-out/.graphify_incremental.json').write_text(
    json.dumps(result, ensure_ascii=False),
    encoding='utf-8',
)
if new_total == 0:
    print('No files changed since last run. Nothing to update.')
    sys.exit(0)
print(f'{new_total} new/changed file(s) to re-extract.')

# Write .graphify_detect.json scoped to new files only so all downstream
# scripts (check_extraction_cache, print_timing_estimate, build_graph, etc.)
# operate on the changed file set, not the full corpus.
fallback_input_path = str(scan_root.resolve())
detect_for_update = {
    'files': result['new_files'],
    'all_files': result.get('files', {}),  # full corpus for manifest-saving
    'total_files': result['new_total'],
    'total_words': result.get('total_words', 0),
    'skipped_sensitive': result.get('skipped_sensitive', []),
    'input_path': result.get('input_path', fallback_input_path),
}
Path('graphify-out/.graphify_detect.json').write_text(
    json.dumps(detect_for_update, ensure_ascii=False),
    encoding='utf-8',
)
Loading