diff --git a/knowledge-graph-reproducibility-routes/README.md b/knowledge-graph-reproducibility-routes/README.md
new file mode 100644
index 0000000..4656010
--- /dev/null
+++ b/knowledge-graph-reproducibility-routes/README.md
@@ -0,0 +1,49 @@
+# Knowledge Graph Reproducibility Routes
+
+Self-contained milestone for SCIBASE.AI issue #17, Scientific Knowledge Graph
+Integration.
+
+This module turns typed knowledge-graph entities into reproducibility-oriented
+navigation paths. A route connects a concept or method to protocols, datasets,
+notebooks, software, and result artifacts, then scores whether the path is
+ready for a researcher to rerun or reuse.
+
+## What It Covers
+
+- Typed graph nodes for concepts, protocols, datasets, notebooks, software,
+  results, projects, and authors.
+- Evidence-backed relationships with freshness, access, citation, and
+  reproducibility metadata.
+- Route scoring that rewards strong evidence and executable rerun readiness.
+- Blocker and curator action output for weak, stale, private, or non-runnable
+  graph paths.
+- Recommendation digests for project sidebars, discovery mode, and weekly
+  knowledge graph summaries.
+- Synthetic sample data only; no network calls, credentials, or external
+  services.
+
+## Files
+
+- `index.js` - route scoring and recommendation engine.
+- `demo.js` - terminal demo for candidate route ranking.
+- `test.js` - dependency-free regression tests.
+- `demo.mp4` - short demo artifact for bounty review.
+- `demo.svg` - demo summary slide for route ranking and recommendation digest output.
+
+## Run
+
+```sh
+node knowledge-graph-reproducibility-routes/test.js
+node knowledge-graph-reproducibility-routes/demo.js
+```
+
+## Requirement Map
+
+| Issue #17 Requirement | Implementation |
+| --- | --- |
+| Entity extraction and linked data | The module consumes typed entity nodes with evidence and ontology metadata, then emits route packets with stable audit digests. |
+| Knowledge navigation | `findReproducibilityRoutes()` supports concept-to-result and notebook-reuse style graph journeys with domain/time/access filters. |
+| Dynamic node types | Sample and tests include concepts, protocols, datasets, notebooks, software, results, projects, and authors. |
+| Filters by domain, time, citation count, reproducibility | Route filtering uses `domain`, `minCitationCount`, `maxEvidenceAgeDays`, `access`, and `minScore`. |
+| AI research recommendations | `buildRecommendationDigest()` converts ranked graph paths into sidebar/discovery recommendations with reasons and blockers. |
+| Entity pages and usage contexts | Route packets include `path`, `usageContexts`, `evidence`, and `curatorActions` for entity pages and graph drilldowns. |
diff --git a/knowledge-graph-reproducibility-routes/demo.js b/knowledge-graph-reproducibility-routes/demo.js
new file mode 100644
index 0000000..c96a77f
--- /dev/null
+++ b/knowledge-graph-reproducibility-routes/demo.js
@@ -0,0 +1,126 @@
+// Terminal demo: ranks reproducibility routes from a CRISPR concept node and
+// prints the matching recommendation digest, using synthetic data only.
+const { buildRecommendationDigest, findReproducibilityRoutes } = require("./index")
+
+// Synthetic graph: an open path to a result (e1-e5) and a stale, access-limited one (e6-e7).
+const graph = {
+  nodes: [
+    { id: "concept-crispr", type: "concept", label: "CRISPR perturbation", domain: "biology" },
+    { id: "protocol-10x", type: "protocol", label: "10x guide capture protocol", domain: "biology" },
+    { id: "dataset-neuro", type: "dataset", label: "Neuroscience perturb-seq dataset", domain: "biology" },
+    { id: "notebook-cluster", type: "notebook", label: "Clustering replay notebook", domain: "biology" },
+    { id: "software-scanpy", type: "software", label: "Scanpy workflow", domain: "biology" },
+    { id: "result-cell-state", type: "result", label: "Cell-state transition map", domain: "biology" },
+    { id: "dataset-private", type: "dataset", label: "Restricted validation cohort", domain: "biology" },
+    { id: "result-private", type: "result", label: "Embargoed validation result", domain: "biology" },
+  ],
+  edges: [
+    {
+      id: "e1",
+      from: "concept-crispr",
+      to: "protocol-10x",
+      relation: "uses_protocol",
+      evidenceStrength: 0.92,
+      citationCount: 180,
+      evidenceAgeDays: 45,
+      access: "open",
+      usageContexts: ["protocol entity page", "project sidebar"],
+    },
+    {
+      id: "e2",
+      from: "protocol-10x",
+      to: "dataset-neuro",
+      relation: "generated_dataset",
+      evidenceStrength: 0.9,
+      citationCount: 120,
+      evidenceAgeDays: 80,
+      access: "open",
+      reproducibilityVerified: true,
+      usageContexts: ["dataset reuse"],
+    },
+    {
+      id: "e3",
+      from: "dataset-neuro",
+      to: "notebook-cluster",
+      relation: "reproduced_by",
+      evidenceStrength: 0.88,
+      citationCount: 90,
+      evidenceAgeDays: 22,
+      access: "open",
+      executable: true,
+      requiresRerun: true,
+      reproducibilityVerified: true,
+      usageContexts: ["run analysis"],
+    },
+    {
+      id: "e4",
+      from: "notebook-cluster",
+      to: "software-scanpy",
+      relation: "depends_on",
+      evidenceStrength: 0.8,
+      citationCount: 60,
+      evidenceAgeDays: 18,
+      access: "open",
+      executable: true,
+      usageContexts: ["environment plan"],
+    },
+    {
+      id: "e5",
+      from: "software-scanpy",
+      to: "result-cell-state",
+      relation: "produces_result",
+      evidenceStrength: 0.87,
+      citationCount: 75,
+      evidenceAgeDays: 18,
+      access: "open",
+      executable: true,
+      reproducibilityVerified: true,
+      usageContexts: ["result entity page"],
+    },
+    {
+      id: "e6",
+      from: "concept-crispr",
+      to: "dataset-private",
+      relation: "validated_by",
+      evidenceStrength: 0.56,
+      citationCount: 12,
+      evidenceAgeDays: 500,
+      access: "restricted",
+      usageContexts: ["curator review"],
+    },
+    {
+      id: "e7",
+      from: "dataset-private",
+      to: "result-private",
+      relation: "supports_result",
+      evidenceStrength: 0.48,
+      citationCount: 8,
+      evidenceAgeDays: 510,
+      access: "private",
+      requiresRerun: true,
+      executable: false,
+      usageContexts: ["curator review"],
+    },
+  ],
+}
+
+// Rank up to five routes from the concept node to result nodes, at most five hops deep.
+const routes = findReproducibilityRoutes(graph, {
+  startId: "concept-crispr",
+  targetTypes: ["result"],
+  maxDepth: 5,
+  limit: 5,
+})
+
+// Build the project-sidebar digest for the same start node and target type.
+const digest = buildRecommendationDigest(graph, {
+  startId: "concept-crispr",
+  targetTypes: ["result"],
+  maxDepth: 5,
+  context: "project_sidebar",
+})
+
+console.log("Ranked reproducibility routes")
+console.log(JSON.stringify(routes, null, 2))
+console.log("\nRecommendation digest")
+console.log(JSON.stringify(digest, null, 2))
diff --git a/knowledge-graph-reproducibility-routes/demo.mp4 b/knowledge-graph-reproducibility-routes/demo.mp4
new file mode 100644
index 0000000..179cd29
Binary files /dev/null and b/knowledge-graph-reproducibility-routes/demo.mp4 differ
diff --git a/knowledge-graph-reproducibility-routes/demo.svg b/knowledge-graph-reproducibility-routes/demo.svg
new file mode 100644
index 0000000..8a89636
--- /dev/null
+++ b/knowledge-graph-reproducibility-routes/demo.svg
@@ -0,0 +1,13 @@
+
+  Knowledge graph reproducibility routes demo slide
+  Demo summary for reproducibility route ranking and recommendation digest output.
+
+
+  SCIBASE Knowledge Graph
+  Reproducibility Route Planner
+  Ranks concept -> protocol -> dataset -> notebook -> result paths
+  Scores evidence strength, freshness, access, and rerun readiness
+  Produces sidebar and discovery-mode recommendation digests
+  Validation - node test.js - 6 tests passed
+  Dependency-free, synthetic graph data only, deterministic audit digests.
+
diff --git a/knowledge-graph-reproducibility-routes/index.js b/knowledge-graph-reproducibility-routes/index.js
new file mode 100644
index 0000000..92156e6
--- /dev/null
+++ b/knowledge-graph-reproducibility-routes/index.js
@@ -0,0 +1,366 @@
+const crypto = require("node:crypto")
+
+// Relative weight of each node type when averaging the nodes along a route.
+const NODE_WEIGHTS = {
+  concept: 0.7,
+  protocol: 0.9,
+  dataset: 1,
+  notebook: 1,
+  software: 0.85,
+  result: 1,
+  project: 0.75,
+  author: 0.55,
+}
+
+// Destination types searched when the caller names neither targetTypes nor targetId.
+const DEFAULT_TARGET_TYPES = ["result", "notebook", "dataset", "project"]
+
+/** True for values JSON cannot represent: undefined, functions, and symbols. */
+function isJsonOmitted(value) {
+  return value === undefined || typeof value === "function" || typeof value === "symbol"
+}
+
+/**
+ * Serialize a value to JSON with object keys sorted, so equal data yields the
+ * same text regardless of key insertion order.
+ *
+ * Follows JSON.stringify for unrepresentable values: object properties holding
+ * them are dropped, and array items or a bare top-level value become null, so
+ * the output is always valid JSON.
+ *
+ * @param {*} value - JSON-like data.
+ * @returns {string} Canonical JSON text.
+ */
+function stableJson(value) {
+  if (Array.isArray(value)) return `[${value.map(stableJson).join(",")}]`
+  if (value && typeof value === "object") {
+    const members = Object.keys(value)
+      .sort()
+      .filter((key) => !isJsonOmitted(value[key]))
+      .map((key) => `${JSON.stringify(key)}:${stableJson(value[key])}`)
+    return `{${members.join(",")}}`
+  }
+  return isJsonOmitted(value) ? "null" : JSON.stringify(value)
+}
+
+/**
+ * Fingerprint a JSON-like value as the SHA-256 of its canonical JSON form.
+ *
+ * @param {*} value - Data to hash.
+ * @returns {string} Hex-encoded digest.
+ */
+function digest(value) {
+  return crypto.createHash("sha256").update(stableJson(value)).digest("hex")
+}
+
+/** Build an id -> item Map; a later duplicate id replaces an earlier one. */
+function indexById(items) {
+  return new Map(items.map((item) => [item.id, item]))
+}
+
+/** Score evidence recency in [0, 1]; retracted edges score 0, unknown age scores 0.65. */
+function edgeFreshnessScore(edge) {
+  if (edge.retracted) return 0
+  if (edge.evidenceAgeDays == null) return 0.65
+  if (edge.evidenceAgeDays <= 30) return 1
+  if (edge.evidenceAgeDays <= 180) return 0.85
+  if (edge.evidenceAgeDays <= 365) return 0.65
+  return 0.4
+}
+
+/** Score how openly the evidence can be reached; unrecognized access values score 0.6. */
+function edgeAccessScore(edge) {
+  if (edge.access === "open") return 1
+  if (edge.access === "registered") return 0.8
+  if (edge.access === "restricted") return 0.45
+  if (edge.access === "private") return 0.2
+  return 0.6
+}
+
+/** Combine evidence strength with citation, verification, and executability boosts, capped at 1. */
+function edgeEvidenceScore(edge) {
+  const base = (edge.evidenceStrength ?? 0.5) * 0.72
+  const citationBoost = Math.min((edge.citationCount ?? 0) / 250, 1) * 0.1
+  const reproducibilityBoost = edge.reproducibilityVerified ? 0.11 : 0
+  const executableBoost = edge.executable ? 0.07 : 0
+  return Math.min(base + citationBoost + reproducibilityBoost + executableBoost, 1)
+}
+
+/**
+ * Score one relationship as 50% evidence, 30% freshness, and 20% access.
+ *
+ * @param {object} edge - Graph edge carrying evidence metadata.
+ * @returns {number} Score rounded to four decimal places.
+ */
+function scoreEdge(edge) {
+  const evidence = edgeEvidenceScore(edge)
+  const freshness = edgeFreshnessScore(edge)
+  const access = edgeAccessScore(edge)
+  return Number((evidence * 0.5 + freshness * 0.3 + access * 0.2).toFixed(4))
+}
+
+/** List the reasons an edge is not ready for reuse, as blocker codes. */
+function edgeBlockers(edge) {
+  const blockers = []
+  if (edge.retracted) blockers.push("evidence_retracted")
+  if (edge.evidenceAgeDays > 365) blockers.push("stale_evidence")
+  if (edge.access === "private" || edge.access === "restricted") blockers.push("access_limited")
+  if (edge.requiresRerun && !edge.executable) blockers.push("rerun_not_executable")
+  // NOTE(review): a missing evidenceStrength counts as weak here, while
+  // edgeEvidenceScore scores it as a neutral 0.5 - confirm this is intended.
+  if ((edge.evidenceStrength ?? 0) < 0.5) blockers.push("weak_evidence")
+  return blockers
+}
+
+/** Group edges by their source node id. */
+function buildAdjacency(edges) {
+  const adjacency = new Map()
+  for (const edge of edges) {
+    if (!adjacency.has(edge.from)) adjacency.set(edge.from, [])
+    adjacency.get(edge.from).push(edge)
+  }
+  return adjacency
+}
+
+/**
+ * Enumerate cycle-free paths from options.startId, breadth first, keeping each
+ * path whose end node satisfies the targetTypes/targetId constraints.
+ *
+ * @param {object} graph - Graph with nodes and edges arrays.
+ * @param {object} options - startId, plus optional targetId, targetTypes,
+ *   maxDepth (default 5), and domain.
+ * @returns {object[]} Raw routes, each carrying its ordered edges.
+ */
+function walkRoutes(graph, options) {
+  const nodes = indexById(graph.nodes)
+  const adjacency = buildAdjacency(graph.edges)
+  const maxDepth = options.maxDepth ?? 5
+  const queue = [{ nodeId: options.startId, edges: [], seen: new Set([options.startId]) }]
+  const routes = []
+
+  while (queue.length > 0) {
+    const current = queue.shift()
+    const node = nodes.get(current.nodeId)
+    if (!node) continue
+
+    const isTarget =
+      current.edges.length > 0 &&
+      (!options.targetTypes || options.targetTypes.includes(node.type)) &&
+      (!options.targetId || current.nodeId === options.targetId)
+
+    if (isTarget) routes.push(current)
+    if (current.edges.length >= maxDepth) continue
+
+    for (const edge of adjacency.get(current.nodeId) || []) {
+      if (current.seen.has(edge.to)) continue
+      const next = nodes.get(edge.to)
+      if (!next) continue
+      // Nodes without a domain are kept; only a conflicting domain is pruned.
+      if (options.domain && next.domain && next.domain !== options.domain) continue
+      queue.push({
+        nodeId: edge.to,
+        edges: [...current.edges, edge],
+        seen: new Set([...current.seen, edge.to]),
+      })
+    }
+  }
+
+  return routes
+}
+
+/**
+ * Score a route from its mean edge and node scores, minus a penalty of 0.04
+ * per hop beyond three and 0.08 per blocker (capped at 0.4).
+ *
+ * @param {object} route - Raw route with at least one edge.
+ * @param {Map<string, object>} nodes - Node lookup keyed by id.
+ * @returns {number} Non-negative score rounded to four decimal places.
+ */
+function scoreRoute(route, nodes) {
+  const edgeScores = route.edges.map(scoreEdge)
+  const nodeScores = route.edges.map((edge) => NODE_WEIGHTS[nodes.get(edge.to)?.type] ?? 0.6)
+  const meanEdgeScore = edgeScores.reduce((sum, score) => sum + score, 0) / edgeScores.length
+  const meanNodeScore = nodeScores.reduce((sum, score) => sum + score, 0) / nodeScores.length
+  const lengthPenalty = Math.max(0, (route.edges.length - 3) * 0.04)
+  const blockers = route.edges.flatMap(edgeBlockers)
+  const blockerPenalty = Math.min(blockers.length * 0.08, 0.4)
+  return Number(Math.max(meanEdgeScore * 0.72 + meanNodeScore * 0.28 - lengthPenalty - blockerPenalty, 0).toFixed(4))
+}
+
+/** Turn each blocker on a route into a de-duplicated, human-readable curator task. */
+function routeCuratorActions(route) {
+  const actions = []
+  for (const edge of route.edges) {
+    for (const blocker of edgeBlockers(edge)) {
+      if (blocker === "evidence_retracted") {
+        actions.push(`Suppress ${edge.from}->${edge.to} until retraction is reviewed.`)
+      }
+      if (blocker === "stale_evidence") {
+        actions.push(`Refresh evidence for ${edge.from}->${edge.to}.`)
+      }
+      if (blocker === "access_limited") {
+        actions.push(`Request access policy review for ${edge.to}.`)
+      }
+      if (blocker === "rerun_not_executable") {
+        actions.push(`Add executable environment metadata for ${edge.to}.`)
+      }
+      if (blocker === "weak_evidence") {
+        actions.push(`Add stronger supporting citation or artifact evidence for ${edge.from}->${edge.to}.`)
+      }
+    }
+  }
+  return [...new Set(actions)]
+}
+
+/**
+ * Expand a raw route into the packet returned to callers: path, score,
+ * blockers, usage contexts, per-edge evidence, curator actions, and digests.
+ *
+ * @param {object} route - Raw route from walkRoutes.
+ * @param {object} graph - Graph the route was found in.
+ * @param {Map<string, object>} [nodes] - Prebuilt node lookup, so callers
+ *   formatting many routes do not re-index the graph for each one.
+ * @returns {object} Route packet including routeId and auditDigest.
+ */
+function formatRoute(route, graph, nodes = indexById(graph.nodes)) {
+  const pathIds = [route.edges[0].from, ...route.edges.map((edge) => edge.to)]
+  const path = pathIds.map((id) => {
+    const node = nodes.get(id)
+    return {
+      id,
+      type: node?.type,
+      label: node?.label,
+      domain: node?.domain,
+    }
+  })
+  const blockers = [...new Set(route.edges.flatMap(edgeBlockers))]
+  const packet = {
+    routeId: digest({ pathIds, edges: route.edges.map((edge) => edge.id) }).slice(0, 16),
+    path,
+    score: scoreRoute(route, nodes),
+    blockers,
+    // De-duplicated like blockers and curatorActions; first-seen order is kept.
+    usageContexts: [...new Set(route.edges.flatMap((edge) => edge.usageContexts || []))],
+    evidence: route.edges.map((edge) => ({
+      id: edge.id,
+      relation: edge.relation,
+      from: edge.from,
+      to: edge.to,
+      score: scoreEdge(edge),
+      citationCount: edge.citationCount ?? 0,
+      evidenceAgeDays: edge.evidenceAgeDays ?? null,
+      access: edge.access || "unknown",
+      executable: Boolean(edge.executable),
+      reproducibilityVerified: Boolean(edge.reproducibilityVerified),
+    })),
+    curatorActions: routeCuratorActions(route),
+  }
+
+  return {
+    ...packet,
+    auditDigest: digest(packet),
+  }
+}
+
+/** True when a numeric filter bound was supplied; 0 is a real bound, unlike null, undefined, or NaN. */
+function hasBound(value) {
+  return value === 0 || Boolean(value)
+}
+
+/**
+ * Find, score, filter, and rank routes leading out of a start node.
+ *
+ * @param {object} graph - Graph with nodes and edges arrays.
+ * @param {object} options - Search options.
+ * @param {string} options.startId - Id of the node to start from (required).
+ * @param {string} [options.targetId] - Only keep routes ending at this node.
+ * @param {string[]} [options.targetTypes] - Only keep routes ending at these
+ *   node types. Defaults to result, notebook, dataset, and project unless
+ *   targetId is given, in which case the target may be of any type.
+ * @param {number} [options.maxDepth] - Maximum number of hops (default 5).
+ * @param {number} [options.limit] - Maximum routes returned (default 10).
+ * @param {object} [options.filters] - Optional domain, minScore, access,
+ *   minCitationCount, and maxEvidenceAgeDays filters.
+ * @returns {object[]} Route packets sorted by descending score.
+ * @throws {Error} When options.startId is missing.
+ * @throws {TypeError} When graph.nodes or graph.edges is not an array.
+ */
+function findReproducibilityRoutes(graph, options) {
+  if (!options?.startId) throw new Error("startId is required")
+  if (!Array.isArray(graph?.nodes) || !Array.isArray(graph?.edges)) {
+    throw new TypeError("graph.nodes and graph.edges must be arrays")
+  }
+  const filters = options.filters || {}
+  const nodes = indexById(graph.nodes)
+  const rawRoutes = walkRoutes(graph, {
+    startId: options.startId,
+    targetId: options.targetId,
+    // An explicit targetId must not be vetoed by the default type list.
+    targetTypes: options.targetTypes || (options.targetId ? undefined : DEFAULT_TARGET_TYPES),
+    maxDepth: options.maxDepth,
+    domain: filters.domain,
+  })
+
+  return rawRoutes
+    .map((route) => formatRoute(route, graph, nodes))
+    .filter((route) => route.score >= (filters.minScore ?? 0))
+    .filter((route) =>
+      filters.access ? route.evidence.every((evidence) => evidence.access === filters.access) : true,
+    )
+    // Passes when at least one edge on the route meets the citation floor.
+    .filter((route) =>
+      hasBound(filters.minCitationCount)
+        ? route.evidence.some((evidence) => evidence.citationCount >= filters.minCitationCount)
+        : true,
+    )
+    // Edges with unknown evidence age are not excluded by the age ceiling.
+    .filter((route) =>
+      hasBound(filters.maxEvidenceAgeDays)
+        ? route.evidence.every(
+            (evidence) =>
+              evidence.evidenceAgeDays == null || evidence.evidenceAgeDays <= filters.maxEvidenceAgeDays,
+          )
+        : true,
+    )
+    .sort((a, b) => b.score - a.score)
+    .slice(0, options.limit ?? 10)
+}
+
+/**
+ * Summarize ranked routes as recommendation cards for a sidebar or digest.
+ *
+ * @param {object} graph - Graph with nodes and edges arrays.
+ * @param {object} options - findReproducibilityRoutes options plus an optional
+ *   context label (default "discovery_mode").
+ * @returns {object} Digest with startId, generatedFor, totalRoutes,
+ *   recommendations, and an auditDigest over the included routes.
+ */
+function buildRecommendationDigest(graph, options) {
+  const routes = findReproducibilityRoutes(graph, options)
+  return {
+    startId: options.startId,
+    generatedFor: options.context || "discovery_mode",
+    totalRoutes: routes.length,
+    recommendations: routes.map((route) => ({
+      routeId: route.routeId,
+      headline: `${route.path[0].label} -> ${route.path.at(-1).label}`,
+      score: route.score,
+      reason:
+        route.blockers.length === 0
+          ? "High-confidence reproducibility path with reusable evidence."
+          : "Useful path with curator actions before full reuse.",
+      blockers: route.blockers,
+      curatorActions: route.curatorActions,
+      path: route.path.map((node) => `${node.type}:${node.label}`),
+    })),
+    auditDigest: digest(routes.map((route) => route.auditDigest)),
+  }
+}
+
+module.exports = {
+  buildRecommendationDigest,
+  digest,
+  findReproducibilityRoutes,
+  scoreEdge,
+  stableJson,
+}
diff --git a/knowledge-graph-reproducibility-routes/test.js b/knowledge-graph-reproducibility-routes/test.js
new file mode 100644
index 0000000..f3a63b3
--- /dev/null
+++ b/knowledge-graph-reproducibility-routes/test.js
@@ -0,0 +1,219 @@
+const assert = require("node:assert/strict")
+const {
+  buildRecommendationDigest,
+  digest,
+  findReproducibilityRoutes,
+  scoreEdge,
+} = require("./index")
+
+function sampleGraph() {
+  return {
+    nodes: [
+      { id: "concept-a", type: "concept", label: "Graph neural biomarker", domain: "biology" },
+      { id: "dataset-open", type: "dataset", label: "Open cohort", domain: "biology" },
+      { id: "notebook-open", type: "notebook", label: "Executable notebook", domain: "biology" },
+      { id: "result-open", type: "result", label: "Validated signature", domain: "biology" },
+      { id: "dataset-stale", type: "dataset", label: "Stale cohort", domain: "biology" },
+      { id: "result-stale", type: "result", label: "Stale finding", domain: "biology" },
+      { id: "dataset-physics", type: "dataset", label: "Physics cohort", domain: "physics" },
+      { id: "result-physics", type: "result", label: "Physics result", domain: "physics" },
+    ],
+    edges: [
+      {
+        id: "open-1",
+        from: "concept-a",
+        to: "dataset-open",
+        relation: "supported_by",
+        evidenceStrength: 0.9,
+        evidenceAgeDays: 20,
+        citationCount: 300,
+        access: "open",
+        reproducibilityVerified: true,
+      },
+      {
+        id: "open-2",
+        from: "dataset-open",
+        to: "notebook-open",
+        relation: "reproduced_by",
+        evidenceStrength: 0.88,
+        evidenceAgeDays: 15,
+        citationCount: 110,
+        access: "open",
+        executable: true,
+        requiresRerun: true,
+        reproducibilityVerified: true,
+      },
+      {
+        id: "open-3",
+        from: "notebook-open",
+        to: "result-open",
+        relation: "produces",
+        evidenceStrength: 0.86,
+        evidenceAgeDays: 10,
+        citationCount: 80,
+        access: "open",
+        executable: true,
+        reproducibilityVerified: true,
+      },
+      {
+        id: "stale-1",
+        from: "concept-a",
+        to: "dataset-stale",
+        relation: "supported_by",
+        evidenceStrength: 0.45,
+        evidenceAgeDays: 800,
+        citationCount: 4,
+        access: "restricted",
+      },
+      {
+        id: "stale-2",
+        from: "dataset-stale",
+        to: "result-stale",
+        relation: "produces",
+        evidenceStrength: 0.4,
+        evidenceAgeDays: 820,
+        citationCount: 1,
+        access: "private",
+        requiresRerun: true,
+        executable: false,
+      },
+      {
+        id: "physics-1",
+        from: "concept-a",
+        to: "dataset-physics",
+        relation: "analogous_to",
+        evidenceStrength: 0.9,
+        evidenceAgeDays: 10,
+        citationCount: 50,
+        access: "open",
+      },
+      {
+        id: "physics-2",
+        from: "dataset-physics",
+        to: "result-physics",
+        relation: "produces",
+        evidenceStrength: 0.9,
+        evidenceAgeDays: 10,
+        citationCount: 50,
+        access: "open",
+        executable: true,
+      },
+    ],
+  }
+}
+
+function testScoresStrongExecutableOpenEdgesHighest() {
+  const strong = scoreEdge({
+    evidenceStrength: 0.9,
+    evidenceAgeDays: 10,
+    citationCount: 200,
+    access: "open",
+    executable: true,
+    reproducibilityVerified: true,
+  })
+  const weak = scoreEdge({
+    evidenceStrength: 0.35,
+    evidenceAgeDays: 900,
+    citationCount: 0,
+    access: "private",
+    executable: false,
+  })
+
+  assert.ok(strong > 0.9)
+  assert.ok(weak < 0.45)
+}
+
+function testFindsRankedReproducibilityRoute() {
+  const [best] = findReproducibilityRoutes(sampleGraph(), {
+    startId: "concept-a",
+    targetTypes: ["result"],
+    maxDepth: 4,
+  })
+
+  assert.equal(best.path.at(-1).id, "result-open")
+  assert.equal(best.blockers.length, 0)
+  assert.ok(best.score > 0.85)
+
+  // A targetId lookup must reach node types outside the default target list.
+  const toolGraph = {
+    nodes: [
+      { id: "concept-x", type: "concept", label: "Concept" },
+      { id: "software-x", type: "software", label: "Tool" },
+    ],
+    edges: [{ id: "x-1", from: "concept-x", to: "software-x", evidenceStrength: 0.9, access: "open" }],
+  }
+  const byId = findReproducibilityRoutes(toolGraph, { startId: "concept-x", targetId: "software-x" })
+  assert.equal(byId.length, 1)
+  assert.equal(byId[0].path.at(-1).id, "software-x")
+}
+
+function testWeakPrivateRouteProducesCuratorActions() {
+  const routes = findReproducibilityRoutes(sampleGraph(), {
+    startId: "concept-a",
+    targetId: "result-stale",
+    targetTypes: ["result"],
+    maxDepth: 3,
+  })
+
+  assert.equal(routes.length, 1)
+  assert.ok(routes[0].blockers.includes("stale_evidence"))
+  assert.ok(routes[0].blockers.includes("access_limited"))
+  assert.ok(routes[0].blockers.includes("rerun_not_executable"))
+  assert.ok(routes[0].curatorActions.some((action) => action.includes("Refresh evidence")))
+}
+
+function testDomainFilterSuppressesOtherDomains() {
+  const routes = findReproducibilityRoutes(sampleGraph(), {
+    startId: "concept-a",
+    targetTypes: ["result"],
+    filters: { domain: "biology" },
+    maxDepth: 3,
+  })
+
+  assert.equal(routes.some((route) => route.path.at(-1).id === "result-physics"), false)
+
+  // A zero age ceiling is a real bound: every sample edge is older than 0 days.
+  const fresh = findReproducibilityRoutes(sampleGraph(), {
+    startId: "concept-a",
+    targetTypes: ["result"],
+    filters: { maxEvidenceAgeDays: 0 },
+  })
+  assert.equal(fresh.length, 0)
+}
+
+function testRecommendationDigestExplainsRoutes() {
+  const digestPacket = buildRecommendationDigest(sampleGraph(), {
+    startId: "concept-a",
+    targetTypes: ["result"],
+    maxDepth: 4,
+    context: "weekly_digest",
+  })
+
+  assert.equal(digestPacket.generatedFor, "weekly_digest")
+  assert.ok(digestPacket.recommendations.length >= 2)
+  assert.match(digestPacket.recommendations[0].headline, /Graph neural biomarker/)
+  assert.equal(typeof digestPacket.auditDigest, "string")
+}
+
+function testStableDigestIgnoresObjectKeyOrder() {
+  assert.equal(digest({ b: 2, a: 1 }), digest({ a: 1, b: 2 }))
+  // Undefined values are dropped as JSON.stringify does, and never throw.
+  assert.equal(digest({ a: 1, b: undefined }), digest({ a: 1 }))
+  assert.equal(typeof digest(undefined), "string")
+}
+
+const tests = [
+  testScoresStrongExecutableOpenEdgesHighest,
+  testFindsRankedReproducibilityRoute,
+  testWeakPrivateRouteProducesCuratorActions,
+  testDomainFilterSuppressesOtherDomains,
+  testRecommendationDigestExplainsRoutes,
+  testStableDigestIgnoresObjectKeyOrder,
+]
+
+for (const test of tests) {
+  test()
+  console.log(`ok - ${test.name}`)
+}
+
+console.log(`${tests.length} tests passed`)