SpecterOps · zinic · May 19, 2026 · May 20, 2026 · May 20, 2026 · May 20, 2026
diff --git a/Makefile b/Makefile
@@ -50,7 +50,7 @@ QUALITY_INPUTS += -mutation-report $(MUTATION_REPORT)
 endif
 QUALITY_INPUTS += -benchmark-regression $(BENCHMARK_REGRESSION)
 
-.PHONY: default all build deps tidy lint format test test_all test_integration test_neo4j test_pg test_update complexity complexity_check crap crap_check quality quality_check quality_backend quality_bench metrics metrics_check generate clean help
+.PHONY: default all build deps tidy lint format test test_all test_integration test_neo4j test_pg test_update plan_corpus complexity complexity_check crap crap_check quality quality_check quality_backend quality_bench metrics metrics_check generate clean help
 
 # Default target
 default: help
@@ -109,6 +109,10 @@ test_update:
 	@cp -fv cypher/models/pgsql/test/updated_cases/* cypher/models/pgsql/test/translation_cases
 	@rm -rf cypher/models/pgsql/test/updated_cases
 
+plan_corpus: $(METRICS_DIR) # Ensures $(METRICS_DIR) exists, then runs ./cmd/plancorpus via $(GO_CMD)
+	@echo "Capturing Cypher plan corpus..."
+	@$(GO_CMD) run ./cmd/plancorpus
+
 # Metric targets
 $(METRICS_DIR):
 	@mkdir -p $(METRICS_DIR)
@@ -218,6 +222,7 @@ help:
 	@echo "  test_bench  - Run benchmark test"
 	@echo "  test_neo4j  - Run Neo4j integration tests"
 	@echo "  test_pg     - Run PostgreSQL integration tests"
+	@echo "  plan_corpus - Capture shared corpus query plans for configured backends"
 	@echo "  test_update - Update test cases"
 	@echo "  complexity  - Report cyclomatic complexity"
 	@echo "  crap        - Report CRAP scores from unit test coverage"

diff --git a/README.md b/README.md
@@ -54,6 +54,8 @@ export CONNECTION_STRING="postgresql://dawgs:weneedbetterpasswords@localhost:654
 export CONNECTION_STRING="neo4j://neo4j:weneedbetterpasswords@localhost:7687"
 ```
 
+Neo4j connection strings may use the `neo4j://`, `neo4j+s://`, or `neo4j+ssc://` scheme; a single path segment (for example `neo4j://host:7687/mydb`) selects the Neo4j database name.
+
 Use `make test` for unit tests only and `make test_integration` for integration tests only.
 
 ### Test Metrics
@@ -95,6 +97,24 @@ make quality FUZZ_REPORT=.coverage/fuzz.json MUTATION_REPORT=.coverage/mutation.
 `PG_CONNECTION_STRING` and `NEO4J_CONNECTION_STRING`. `make quality_bench` writes benchmark markdown and JSON captures
 for later baseline comparison.
 
+`make plan_corpus` captures plan diagnostics for the shared Cypher integration corpus. It accepts either
+`CONNECTION_STRING` for one backend or `PG_CONNECTION_STRING` and `NEO4J_CONNECTION_STRING` for both backends, then
+writes JSONL captures and markdown/JSON summaries under `.coverage/`.
+
+`go run ./cmd/graphbench` captures runtime diagnostics for the scale corpus under `benchmark/testdata/scale`. The
+current modes are `postgres_sql`, `local_traversal`, and `neo4j`; Apache AGE is reference-design input only and is not a
+direct comparison mode yet. The command can emit JSONL records plus Markdown and JSON summaries, and can compare current
+timings against a previous JSONL baseline.
+
+PostgreSQL translates exact string property equality with a JSON string type guard and `properties ->>` extraction, so
+indexes created on expressions such as `properties ->> 'objectid'` and `properties ->> 'name'` can be used for selective
+anchors without matching JSON booleans or numbers. Simple relationship count fast paths depend on the schema's
+`kind_id`-first edge index for efficient typed counts.
+
+Substring, prefix, and suffix predicates are intentionally not promoted to blanket schema indexes. PostgreSQL deployments
+can request explicit `TextSearchIndex`/trigram property indexes for fields that need `CONTAINS`, `STARTS WITH`, or
+`ENDS WITH`, but default schema assertion should wait until all suffix forms share one semantics-preserving lowering.
+
 Thresholds are report-only by default. To enforce the configured thresholds, run:
 
 ```bash

diff --git a/batch_operation_plan.md b/batch_operation_plan.md
@@ -0,0 +1,129 @@
+# BatchOperation COPY Streaming Plan
+
+## Objective
+
+Move PostgreSQL `BatchOperation` toward chunked streaming writes backed by `COPY` and staging tables, while documenting that `BatchOperation` is intentionally non-transactional across the whole delegate.
+
+## Ground Rules
+
+- `BatchOperation` is a buffered, non-atomic write API.
+- Successful flushes may persist even if the delegate later returns an error.
+- PostgreSQL flushes may use short chunk-local transactions.
+- Avoid one giant transaction for large batches.
+- Use PostgreSQL `COPY` into staging tables for high-volume batch paths.
+- Keep backend-neutral integration cases backend-equivalent; PG-specific behavior belongs in PG-scoped tests.
+
+## Steps
+
+### 1. Clarify Public Semantics
+
+Update `graph.BatchOperation` documentation to state that the API is non-transactional across the whole operation. Mention that flushes may commit before the delegate returns and that delegate errors do not roll back successful flushes.
+
+Status: Complete.
+
+### 2. Introduce PG COPY Staging Helpers
+
+Add internal PostgreSQL helpers for chunk flushes:
+
+- begin a chunk-local transaction
+- create a temporary staging table
+- stream rows with `COPY`
+- merge/upsert/delete into final graph tables
+- commit or roll back the chunk transaction
+
+Status: Complete.
+
+### 3. Add Streaming `CopyFromSource` Types
+
+Implement row-source types that satisfy `pgx.CopyFromSource` without materializing full `[][]any` batches. Each source should expose only current-row state plus encoder state.
+
+Status: Complete.
+
+### 4. Convert Relationship Create/Upsert
+
+Replace relationship create array batching with staging-table `COPY`.
+
+The flush should:
+
+- stream `graph_id`, `start_id`, `end_id`, `kind_id`, and `properties` into a temporary staging table
+- coalesce duplicate `(graph_id, start_id, end_id, kind_id)` rows in SQL
+- insert into the edge partition with `ON CONFLICT ... DO UPDATE`
+
+Status: Complete.
+
+### 5. Convert Node Create
+
+Replace node create array batching with staging-table `COPY`.
+
+Preserve the existing behavior that a single flush may not mix preset node IDs with nodes that require generated IDs.
+
+Status: Complete.
+
+### 6. Convert Node Update
+
+Replace the normal parameter-array node update and the special large-update path with one staging-based implementation. The existing large-update flow is a useful starting point but should become streaming rather than pre-materialized.
+
+Status: Complete.
+
+### 7. Convert Upsert Batches
+
+Convert `UpdateNodeBy` and `UpdateRelationshipBy` after the simpler paths are stable. Preserve current identity-property semantics while moving the data transfer to staging-table `COPY`.
+
+Status: Complete.
+
+### 8. Add PG-Scoped Tests
+
+Add PostgreSQL driver-scoped tests for:
+
+- flushed data persists after the delegate returns an error
+- relationship create duplicate coalescing
+- node create with and without IDs
+- node update via staging
+- `UpdateNodeBy` and `UpdateRelationshipBy`
+- streaming source behavior
+
+Status: Complete.
+
+### 9. Validate
+
+Run formatting and targeted tests:
+
+```bash
+make format
+go test ./drivers/pg/...
+```
+
+Run PostgreSQL integration tests only when `CONNECTION_STRING` points at PostgreSQL.
+
+Status: Complete.
+
+## Evaluation Notes
+
+This plan should be updated after each step is completed. If a step exposes a simpler or safer implementation order, update this file before moving on.
+
+- Step 1 confirmed the intended contract: `BatchOperation` remains a buffered, non-atomic API. The implementation work should optimize chunk flushes without introducing whole-operation atomicity.
+- Step 2 added the transaction and staging execution boundary as PG-internal helpers. The next step should focus on row sources so batch paths can stream rows into those helpers.
+- Step 3 added a generic slice-backed `CopyFromSource`. It streams encoded rows from existing buffers without creating a second materialized row matrix; later steps can still replace the outer buffers if needed.
+- Step 4 moved relationship create/upsert to staging-table `COPY` and SQL duplicate coalescing. This removed the old in-memory relationship de-duplication path, including its ambiguous key and incorrect index lookup behavior.
+- Step 5 moved node creation to staging-table `COPY` while preserving the existing split between preset-ID and generated-ID batches. Kind assertion remains outside the COPY stream, and row streaming uses kind mapping only.
+- Step 6 unified normal and large node updates on the same staging-table `COPY` flush path. Large node update inputs now use normal batch chunking instead of a separate all-at-once row materialization path.
+- Step 7 moved `UpdateNodeBy` and `UpdateRelationshipBy` to staging-table `COPY`. Node upserts still scan returned IDs into futures in staged row order so relationship upserts can reuse the resolved endpoint IDs.
+- Step 8 added manual PostgreSQL integration coverage for non-transactional flushed chunks, node create with and without IDs, relationship duplicate coalescing, node update staging, and `UpdateNodeBy`/`UpdateRelationshipBy` staging. Existing PG unit tests cover the streaming `CopyFromSource` behavior.
+- Step 9 validation passed for `go test ./drivers/pg/...` and manual integration compilation via `go test -tags manual_integration ./integration -run '^$'`. `make format` could not run because `goimports` is not available as an executable on this PATH, so the touched Go test file was formatted with `go run golang.org/x/tools/cmd/goimports@v0.44.0 -w`. Live PostgreSQL integration tests were not run because `CONNECTION_STRING` is unset.
+
+## Findings Follow-up
+
+The review findings were addressed in this order:
+
+- Refreshed the PostgreSQL schema graph cache during schema assertion and fixed the PG batch integration helper so default graph constraints are actually asserted.
+- Made node upsert ID resolution map returned IDs back to futures by staging row ordinal instead of result position.
+- Split relationship update staging from relationship create staging so identity updates no longer inherit physical-key coalescing.
+- Coalesced duplicate node ID updates before staging to avoid matching the same target row more than once in a PostgreSQL `MERGE`.
+- Converted node and relationship delete buffers to chunk-local `COPY` staging.
+
+Latest validation:
+
+- `go test ./drivers/pg/...` passed.
+- Full tagged PostgreSQL run passed with a PostgreSQL `CONNECTION_STRING`.
+- Full tagged Neo4j run passed with a Neo4j `CONNECTION_STRING`.
+- `make format` still fails in this environment because `goimports` is not executable on `PATH`; touched Go files were formatted with `gofmt` and `go run golang.org/x/tools/cmd/goimports@v0.44.0 -w`.
diff --git a/benchmark/testdata/scale/README.md b/benchmark/testdata/scale/README.md
@@ -0,0 +1,28 @@
+# GraphBench Scale Corpus
+
+This corpus measures graph workload shapes, not general Cypher correctness.
+The shared integration corpus remains the source of backend-equivalent semantic
+coverage.
+
+Cases declare the values a query observes so benchmark reports can separate
+ID-only work from node, relationship, property, and path materialization.
+Current execution modes are `postgres_sql`, `local_traversal`, and `neo4j`.
+Apache AGE is intentionally not a benchmark mode here; it may appear only in
+`reference_design` notes as input for DAWGS design choices.
+
+Each JSON file contains a top-level `cases` array of scale cases with:
+
+- `source`: the source corpus or workload family.
+- `dataset`: the fixture dataset to load from `integration/testdata`.
+- `name` and `category`: stable identifiers used in reports.
+- `cypher` and `params`: the Cypher query under test and its named parameter values.
+- `expected`: the expected result, given as `row_count` and `result_kind`.
+- `observes`: whether the query observes paths, nodes, relationships, or
+  properties; when all are false the query uses only IDs internally.
+- `shape` and `tags`: query-shape metadata (for example `path_materialization_required`) and labels.
+- `candidate_modes`: the execution modes that should attempt the case.
+- `reference_design`: optional design notes, including AGE observations when
+  useful.
+
+Use `cmd/graphbench` to run this corpus and produce JSONL, Markdown, and JSON
+summaries.
diff --git a/benchmark/testdata/scale/cases/counts.json b/benchmark/testdata/scale/cases/counts.json
@@ -0,0 +1,70 @@
+{
+  "cases": [
+    {
+      "name": "all_node_count",
+      "dataset": "base",
+      "category": "counts",
+      "cypher": "MATCH (n) RETURN count(n)",
+      "expected": {
+        "row_count": 1,
+        "result_kind": "scalar"
+      },
+      "observes": {
+        "paths": false,
+        "nodes": false,
+        "relationships": false,
+        "properties": false
+      },
+      "shape": {
+        "path_materialization_required": false
+      },
+      "candidate_modes": ["postgres_sql", "neo4j"],
+      "tags": ["count", "count-store"]
+    },
+    {
+      "name": "typed_node_count",
+      "dataset": "base",
+      "category": "counts",
+      "cypher": "MATCH (n:NodeKind1) RETURN count(n)",
+      "expected": {
+        "row_count": 1,
+        "result_kind": "scalar"
+      },
+      "observes": {
+        "paths": false,
+        "nodes": false,
+        "relationships": false,
+        "properties": false
+      },
+      "shape": {
+        "terminal_predicate": "node_kind",
+        "path_materialization_required": false
+      },
+      "candidate_modes": ["postgres_sql", "neo4j"],
+      "tags": ["count", "typed-count", "graph-stats"]
+    },
+    {
+      "name": "typed_edge_count",
+      "dataset": "base",
+      "category": "counts",
+      "cypher": "MATCH ()-[r:EdgeKind1]->() RETURN count(r)",
+      "expected": {
+        "row_count": 1,
+        "result_kind": "scalar"
+      },
+      "observes": {
+        "paths": false,
+        "nodes": false,
+        "relationships": false,
+        "properties": false
+      },
+      "shape": {
+        "edge_kinds": ["EdgeKind1"],
+        "path_materialization_required": false
+      },
+      "candidate_modes": ["postgres_sql", "neo4j"],
+      "tags": ["count", "typed-count", "graph-stats"]
+    }
+  ]
+}
+
diff --git a/benchmark/testdata/scale/cases/lookups.json b/benchmark/testdata/scale/cases/lookups.json
@@ -0,0 +1,54 @@
+{
+  "cases": [
+    {
+      "name": "objectid_exact_string_anchor",
+      "dataset": "base",
+      "category": "lookups",
+      "cypher": "MATCH (n:NodeKind1) WHERE n.objectid = $objectid RETURN id(n)",
+      "params": {
+        "objectid": "S-1-5-21-1"
+      },
+      "expected": {
+        "row_count": 1,
+        "result_kind": "id_set"
+      },
+      "observes": {
+        "paths": false,
+        "nodes": false,
+        "relationships": false,
+        "properties": false
+      },
+      "shape": {
+        "root_predicate": "selective_property",
+        "terminal_predicate": "node_kind",
+        "path_materialization_required": false
+      },
+      "candidate_modes": ["postgres_sql", "neo4j"],
+      "tags": ["property-anchor", "expression-index"]
+    },
+    {
+      "name": "boolean_property_filter",
+      "dataset": "base",
+      "category": "lookups",
+      "cypher": "MATCH (n:NodeKind1) WHERE n.enabled = true RETURN id(n)",
+      "expected": {
+        "row_count": 1,
+        "result_kind": "id_set"
+      },
+      "observes": {
+        "paths": false,
+        "nodes": false,
+        "relationships": false,
+        "properties": false
+      },
+      "shape": {
+        "root_predicate": "boolean_property",
+        "terminal_predicate": "node_kind",
+        "path_materialization_required": false
+      },
+      "candidate_modes": ["postgres_sql", "neo4j"],
+      "tags": ["property-filter"]
+    }
+  ]
+}
+