Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
117 commits
Select commit Hold shift + click to select a range
88e8089
fix(pgsql): support aggregate operator projections
zinic May 19, 2026
c66e422
fix(cypher): prefer supported integer conversion
zinic May 20, 2026
cc8035f
docs(pgsql): capture optimizer pass plan
zinic May 20, 2026
6d74059
feat(pgsql): analyze optimizer regions
zinic May 20, 2026
1de1d5c
test(pgsql): cover optimizer path safety
zinic May 20, 2026
1570581
feat(pgsql): add optimizer pipeline hook
zinic May 20, 2026
f0e69d2
feat(pgsql): attach optimizer predicates
zinic May 20, 2026
e751a7f
test(integration): stabilize optimizer fixture coverage
zinic May 20, 2026
d850875
chore(pgsql): harden optimizer foundation
zinic May 20, 2026
244a614
feat(pgsql): prune expansion path projections
zinic May 20, 2026
f5f3a64
feat(pgsql): materialize path edges late
zinic May 20, 2026
283a4eb
docs(pgsql): sequence optimizer review followups
zinic May 21, 2026
cc57a8b
fix(pgsql): preserve optional match pruning barrier
zinic May 21, 2026
fe4393e
test(integration): assert optimized path semantics
zinic May 21, 2026
b966e46
test(pgsql): guard relationship expression materialization
zinic May 21, 2026
83ca057
docs(pgsql): capture optimizer measurement gaps
zinic May 21, 2026
47bdfa2
test(pgsql): update optional match barrier shape
zinic May 21, 2026
d2dfcbb
feat(pgsql): lower bound fixed hops as expand-into
zinic May 21, 2026
171f5a4
feat(pgsql): reorder independent node anchors
zinic May 21, 2026
89737dd
feat(pgsql): push fixed suffix checks into expansions
zinic May 21, 2026
650b108
docs(pgsql): sequence optimizer gap closure plan
zinic May 21, 2026
363b633
feat(pgsql): close optimizer suffix pushdown gaps
zinic May 21, 2026
fffcfe0
test(pgsql): complete optimizer gap closure
zinic May 21, 2026
9d185f5
test(pgsql): measure optimizer rules locally
zinic May 21, 2026
a737cd6
Add optimizer lowering metadata contract
zinic May 21, 2026
807acd5
Lift projection pruning decisions into optimizer
zinic May 21, 2026
c46ff89
Lift late path materialization decisions
zinic May 21, 2026
b67e43d
Lift expansion suffix pushdown detection
zinic May 21, 2026
2ba0f9a
Report fixed-hop expand-into decisions
zinic May 21, 2026
bf8f2cc
Wire predicate attachments into lowering metadata
zinic May 21, 2026
223e083
Prefer optimizer lowering decisions in translator
zinic May 21, 2026
930091e
Lock lowering metadata verification
zinic May 21, 2026
bfe171e
Harden optimizer lowering metadata
zinic May 21, 2026
39eb6ab
Document lowering metadata contract
zinic May 22, 2026
19b9c40
Consume expand-into lowering decisions
zinic May 22, 2026
44c8ffa
Lift projection pruning binding actions
zinic May 22, 2026
9d6fbc4
Apply late materialization decisions explicitly
zinic May 22, 2026
6e56e36
Record consumed predicate placements
zinic May 22, 2026
79502d6
Carry suffix pushdown source spans
zinic May 22, 2026
f347033
Remove targeted lowering fallbacks
zinic May 22, 2026
081358d
Plan expand-into for anonymous continuations
zinic May 22, 2026
1510c18
Lift rewrite decisions into optimizer plan
zinic May 22, 2026
67eaab7
Plan projection pruning for pattern predicates
zinic May 22, 2026
08af7f6
Plan pattern predicate placement
zinic May 22, 2026
ba0fef7
Centralize selectivity and locality planning
zinic May 22, 2026
3e5581c
fix(pgsql): address optimizer review feedback
zinic May 22, 2026
2ed6a0b
fix(pgsql): stage path nodes in tail predicates
zinic May 22, 2026
91733ad
Validate ADCS optimizer fanout rewrite
zinic May 22, 2026
7eac29a
Add Cypher plan corpus capture tooling
zinic May 22, 2026
82a75e3
Add count fast path and skipped lowering reporting
zinic May 22, 2026
05744a0
Stage repeated path projection components
zinic May 22, 2026
0a8918f
Plan traversal flips for endpoint predicates
zinic May 22, 2026
4d63285
Extend suffix pushdown to constrained bound endpoints
zinic May 22, 2026
1425caa
Stabilize integration corpus validation
zinic May 23, 2026
7bfb919
Record predicate placement consumption
zinic May 23, 2026
1852b37
Constrain predicate placement planning to clause
zinic May 23, 2026
9d9c1e8
Preserve endpoints in edge count fast path
zinic May 23, 2026
52a0513
Count partially skipped lowerings
zinic May 23, 2026
0c0ceb0
Honor Neo4j plan corpus connection URIs
zinic May 23, 2026
4c7c401
Wire count star fast path planning
zinic May 23, 2026
2c7cf03
Validate optimization gap fixes
zinic May 23, 2026
fe58aed
Use typed text lookups for string equality
zinic May 23, 2026
4a27425
Cover typed string equality translation
zinic May 23, 2026
ce73e36
Add PG string equality plan coverage
zinic May 23, 2026
4d674de
Add edge kind count index
zinic May 23, 2026
90c8384
Cover count fast path SQL shapes
zinic May 23, 2026
03168c0
Document optimizer index assumptions
zinic May 23, 2026
3188ab2
Align optimizer safety string expectations
zinic May 23, 2026
5bd5adc
Record optimizer validation status
zinic May 23, 2026
bb986fa
More cases
zinic May 23, 2026
765d2ea
Optimize typed pattern predicates
zinic May 23, 2026
095a023
Lower membership-only collects to ids
zinic May 23, 2026
09905fa
Flip bound expansions to constrained terminals
zinic May 23, 2026
532a406
Plan terminal filters for kind-only shortest paths
zinic May 23, 2026
0e74683
Defer blanket suffix indexing
zinic May 23, 2026
f557acf
Update translation snapshots for optimizer lowerings
zinic May 23, 2026
8522854
Correct bounded Azure path assertion
zinic May 23, 2026
9c5232c
further lowering and live query optimization work
zinic May 23, 2026
94b7d78
Add live aggregate traversal plan guard
zinic May 23, 2026
51fe99c
Report skipped kind-only traversal flips
zinic May 23, 2026
61f039d
Widen aggregate traversal count matching
zinic May 23, 2026
7f62b68
Respect selective bound traversal sources
zinic May 23, 2026
97c3ffa
Expand aggregate traversal baseline coverage
zinic May 23, 2026
835a26f
Widen aggregate traversal final projections
zinic May 23, 2026
0068c3e
Carry selectivity through traversal lowerings
zinic May 23, 2026
b9f7b4b
Fold terminal filters into aggregate traversal
zinic May 23, 2026
74e6b2b
Document aggregate optimizer continuation status
zinic May 23, 2026
c2e311f
Add graphbench scale corpus contract
zinic May 24, 2026
f39a579
Add graphbench PostgreSQL SQL runner
zinic May 24, 2026
4714d8b
Add graphbench Neo4j runner
zinic May 24, 2026
be785ef
Add graphbench local traversal placeholder
zinic May 24, 2026
216c900
Add graphbench comparison reports
zinic May 24, 2026
795c687
Document graphbench AGE reference workflow
zinic May 24, 2026
178e1c7
Merge remote-tracking branch 'upstream/main' into optimizer
zinic May 24, 2026
a9cfede
fixup build
zinic May 24, 2026
80df793
Harden benchmark resource handling
zinic May 24, 2026
489938a
Fix optimizer and translation edge cases
zinic May 24, 2026
618c1e1
Harden Neo4j database parsing and plan assertions
zinic May 24, 2026
df9466d
Document backend-selected integration skips
zinic May 24, 2026
b679143
Document BatchOperation COPY streaming plan
zinic May 24, 2026
ca7178e
Clarify BatchOperation non-transactional semantics
zinic May 24, 2026
8ffcc3d
Add PostgreSQL batch COPY staging helpers
zinic May 24, 2026
fdd19ab
Add streaming COPY sources for PG batches
zinic May 24, 2026
5130dbb
Stream PG relationship creates through COPY staging
zinic May 24, 2026
a29c1a9
Stream PG node creates through COPY staging
zinic May 24, 2026
03297a5
Stream PG node updates through COPY staging
zinic May 24, 2026
56db31a
Stream PG upsert batches through COPY staging
zinic May 24, 2026
7c2078e
Add PostgreSQL batch operation integration coverage
zinic May 24, 2026
457bae8
Record batch operation validation
zinic May 24, 2026
f4736c0
Refresh PG schema cache on assertion
zinic May 24, 2026
582c392
Map PG node upsert IDs by staging ordinal
zinic May 24, 2026
e42afa7
Separate PG relationship update staging merge
zinic May 24, 2026
8ba7d6f
Coalesce duplicate PG node batch updates
zinic May 24, 2026
40edfef
Stream PG batch deletes through COPY staging
zinic May 24, 2026
10ee9b6
Record batch finding follow-up validation
zinic May 24, 2026
a3b4795
Fix integration schema for query-only kinds
zinic May 24, 2026
eee8333
Group local declarations
zinic May 24, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 6 additions & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ QUALITY_INPUTS += -mutation-report $(MUTATION_REPORT)
endif
QUALITY_INPUTS += -benchmark-regression $(BENCHMARK_REGRESSION)

.PHONY: default all build deps tidy lint format test test_all test_integration test_neo4j test_pg test_update complexity complexity_check crap crap_check quality quality_check quality_backend quality_bench metrics metrics_check generate clean help
.PHONY: default all build deps tidy lint format test test_all test_integration test_neo4j test_pg test_update plan_corpus complexity complexity_check crap crap_check quality quality_check quality_backend quality_bench metrics metrics_check generate clean help

# Default target
default: help
Expand Down Expand Up @@ -109,6 +109,10 @@ test_update:
@cp -fv cypher/models/pgsql/test/updated_cases/* cypher/models/pgsql/test/translation_cases
@rm -rf cypher/models/pgsql/test/updated_cases

plan_corpus: $(METRICS_DIR)
@echo "Capturing Cypher plan corpus..."
@$(GO_CMD) run ./cmd/plancorpus

# Metric targets
$(METRICS_DIR):
@mkdir -p $(METRICS_DIR)
Expand Down Expand Up @@ -218,6 +222,7 @@ help:
@echo " test_bench - Run benchmark test"
@echo " test_neo4j - Run Neo4j integration tests"
@echo " test_pg - Run PostgreSQL integration tests"
@echo " plan_corpus - Capture shared corpus query plans for configured backends"
@echo " test_update - Update test cases"
@echo " complexity - Report cyclomatic complexity"
@echo " crap - Report CRAP scores from unit test coverage"
Expand Down
20 changes: 20 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,8 @@ export CONNECTION_STRING="postgresql://dawgs:weneedbetterpasswords@localhost:654
export CONNECTION_STRING="neo4j://neo4j:weneedbetterpasswords@localhost:7687"
```

Neo4j connection strings may use `neo4j://`, `neo4j+s://`, or `neo4j+ssc://`; a single path segment selects the Neo4j database name.

Use `make test` for unit tests only and `make test_integration` for integration tests only.

### Test Metrics
Expand Down Expand Up @@ -95,6 +97,24 @@ make quality FUZZ_REPORT=.coverage/fuzz.json MUTATION_REPORT=.coverage/mutation.
`PG_CONNECTION_STRING` and `NEO4J_CONNECTION_STRING`. `make quality_bench` writes benchmark markdown and JSON captures
for later baseline comparison.

`make plan_corpus` captures plan diagnostics for the shared Cypher integration corpus. It accepts either
`CONNECTION_STRING` for one backend or `PG_CONNECTION_STRING` and `NEO4J_CONNECTION_STRING` for both backends, then
writes JSONL captures and markdown/JSON summaries under `.coverage/`.

`go run ./cmd/graphbench` captures runtime diagnostics for the scale corpus under `benchmark/testdata/scale`. The
current modes are `postgres_sql`, `local_traversal`, and `neo4j`; AGE is reference-design input only and is not a direct
comparison mode yet. The command can emit JSONL records plus Markdown and JSON summaries, and can compare current timings
against a previous JSONL baseline.

The PostgreSQL backend translates exact string property equality into a JSON string type guard plus `properties ->>`
extraction. Indexes created on expressions such as `properties ->> 'objectid'` and `properties ->> 'name'` can therefore
serve selective anchors without also matching JSON booleans or numbers. Simple relationship count fast paths depend on
the schema's `kind_id`-first edge index for efficient typed counts.

Substring, prefix, and suffix predicates are intentionally not promoted to blanket schema indexes. PostgreSQL deployments
can request explicit `TextSearchIndex`/trigram property indexes for fields that need `CONTAINS`, `STARTS WITH`, or
`ENDS WITH`, but default schema assertion should not create such indexes until all of these forms share one
semantics-preserving lowering.

Thresholds are report-only by default. To enforce the configured thresholds, run:

```bash
Expand Down
129 changes: 129 additions & 0 deletions batch_operation_plan.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,129 @@
# BatchOperation COPY Streaming Plan

## Objective

Move PostgreSQL `BatchOperation` toward chunked streaming writes backed by `COPY` and staging tables, while documenting that `BatchOperation` is intentionally non-transactional across the whole delegate.

## Ground Rules

- `BatchOperation` is a buffered, non-atomic write API.
- Successful flushes may persist even if the delegate later returns an error.
- PostgreSQL flushes may use short chunk-local transactions.
- Avoid one giant transaction for large batches.
- Use PostgreSQL `COPY` into staging tables for high-volume batch paths.
- Keep backend-neutral integration cases backend-equivalent; PG-specific behavior belongs in PG-scoped tests.

## Steps

### 1. Clarify Public Semantics

Update `graph.BatchOperation` documentation to state that the API is non-transactional across the whole operation. Mention that flushes may commit before the delegate returns and that delegate errors do not roll back successful flushes.

Status: Complete.

### 2. Introduce PG COPY Staging Helpers

Add internal PostgreSQL helpers for chunk flushes:

- begin a chunk-local transaction
- create a temporary staging table
- stream rows with `COPY`
- merge/upsert/delete into final graph tables
- commit or roll back the chunk transaction

Status: Complete.

### 3. Add Streaming `CopyFromSource` Types

Implement row-source types that satisfy `pgx.CopyFromSource` without materializing full `[][]any` batches. Each source should expose only current-row state plus encoder state.

Status: Complete.

### 4. Convert Relationship Create/Upsert

Replace relationship create array batching with staging-table `COPY`.

The flush should:

- stream `graph_id`, `start_id`, `end_id`, `kind_id`, and `properties` into a temporary staging table
- coalesce duplicate `(graph_id, start_id, end_id, kind_id)` rows in SQL
- insert into the edge partition with `ON CONFLICT ... DO UPDATE`

Status: Complete.

### 5. Convert Node Create

Replace node create array batching with staging-table `COPY`.

Preserve the existing behavior that a single flush may not mix preset node IDs with nodes that require generated IDs.

Status: Complete.

### 6. Convert Node Update

Replace the normal parameter-array node update and the special large-update path with one staging-based implementation. The existing large-update flow is a useful starting point but should become streaming rather than pre-materialized.

Status: Complete.

### 7. Convert Upsert Batches

Convert `UpdateNodeBy` and `UpdateRelationshipBy` after the simpler paths are stable. Preserve current identity-property semantics while moving the data transfer to staging-table `COPY`.

Status: Complete.

### 8. Add PG-Scoped Tests

Add PostgreSQL driver-scoped tests for:

- flushed data persists after the delegate returns an error
- relationship create duplicate coalescing
- node create with and without IDs
- node update via staging
- `UpdateNodeBy` and `UpdateRelationshipBy`
- streaming source behavior

Status: Complete.

### 9. Validate

Run formatting and targeted tests:

```bash
make format
go test ./drivers/pg/...
```

Run PostgreSQL integration tests only when `CONNECTION_STRING` points at PostgreSQL.

Status: Complete.

## Evaluation Notes

This plan should be updated after each step is completed. If a step exposes a simpler or safer implementation order, update this file before moving on.

- Step 1 confirmed the intended contract: `BatchOperation` remains a buffered, non-atomic API. The implementation work should optimize chunk flushes without introducing whole-operation atomicity.
- Step 2 added the transaction and staging execution boundary as PG-internal helpers. The next step should focus on row sources so batch paths can stream rows into those helpers.
- Step 3 added a generic slice-backed `CopyFromSource`. It streams encoded rows from existing buffers without creating a second materialized row matrix; later steps can still replace the outer buffers if needed.
- Step 4 moved relationship create/upsert to staging-table `COPY` and SQL duplicate coalescing. This removed the old in-memory relationship de-duplication path, including its ambiguous key and incorrect index lookup behavior.
- Step 5 moved node creation to staging-table `COPY` while preserving the existing split between preset-ID and generated-ID batches. Kind assertion remains outside the COPY stream, and row streaming uses kind mapping only.
- Step 6 unified normal and large node updates on the same staging-table `COPY` flush path. Large node update inputs now use normal batch chunking instead of a separate all-at-once row materialization path.
- Step 7 moved `UpdateNodeBy` and `UpdateRelationshipBy` to staging-table `COPY`. Node upserts still scan returned IDs into futures in staged row order so relationship upserts can reuse the resolved endpoint IDs.
- Step 8 added manual PostgreSQL integration coverage for non-transactional flushed chunks, node create with and without IDs, relationship duplicate coalescing, node update staging, and `UpdateNodeBy`/`UpdateRelationshipBy` staging. Existing PG unit tests cover the streaming `CopyFromSource` behavior.
- Step 9 validation passed for `go test ./drivers/pg/...` and manual integration compilation via `go test -tags manual_integration ./integration -run '^$'`. `make format` could not run because `goimports` is not available as an executable on this PATH, so the touched Go test file was formatted with `go run golang.org/x/tools/cmd/goimports@v0.44.0 -w`. Live PostgreSQL integration tests were not run because `CONNECTION_STRING` is unset.

## Findings Follow-up

The review findings were addressed in this order:

- Refreshed the PostgreSQL schema graph cache during schema assertion and fixed the PG batch integration helper so default graph constraints are actually asserted.
- Made node upsert ID resolution map returned IDs back to futures by staging row ordinal instead of result position.
- Split relationship update staging from relationship create staging so identity updates no longer inherit physical-key coalescing.
- Coalesced duplicate node ID updates before staging to avoid matching the same target row more than once in a PostgreSQL `MERGE`.
- Converted node and relationship delete buffers to chunk-local `COPY` staging.

Latest validation:

- `go test ./drivers/pg/...` passed.
- Full tagged PostgreSQL run passed with a PostgreSQL `CONNECTION_STRING`.
- Full tagged Neo4j run passed with a Neo4j `CONNECTION_STRING`.
- `make format` still fails in this environment because `goimports` is not executable on `PATH`; touched Go files were formatted with `gofmt` and `go run golang.org/x/tools/cmd/goimports@v0.44.0 -w`.
28 changes: 28 additions & 0 deletions benchmark/testdata/scale/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
# GraphBench Scale Corpus

This corpus measures graph workload shapes, not general Cypher correctness.
The shared integration corpus remains the source of backend-equivalent semantic
coverage.

Cases declare the values a query observes so benchmark reports can separate
ID-only work from node, relationship, property, and path materialization.
Current execution modes are `postgres_sql`, `local_traversal`, and `neo4j`.
Apache AGE is intentionally not a benchmark mode here; it may appear only in
`reference_design` notes as input for DAWGS design choices.

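Each JSON file contains an object whose `cases` array lists scale cases with:

- `source`: the source corpus or workload family.
- `dataset`: the fixture dataset to load from `integration/testdata`.
- `name` and `category`: stable identifiers used in reports.
- `cypher`: the Cypher query under test.
- `params`: named parameter values, when the query takes any.
- `expected`: the expected result, given as `row_count` (the result
cardinality) and `result_kind` (for example `scalar` or `id_set`).
- `observes`: whether the query observes paths, nodes, relationships,
properties, or only IDs internally.
- `shape`: structural descriptors of the query, such as `root_predicate`,
`terminal_predicate`, `edge_kinds`, and `path_materialization_required`.
- `candidate_modes`: the execution modes that should attempt the case.
- `tags`: free-form labels for the case, such as `count` or `property-anchor`.
- `reference_design`: optional design notes, including AGE observations when
useful.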

Use `cmd/graphbench` to run this corpus and produce JSONL, Markdown, and JSON
summaries.
70 changes: 70 additions & 0 deletions benchmark/testdata/scale/cases/counts.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
{
"cases": [
{
"name": "all_node_count",
"dataset": "base",
"category": "counts",
"cypher": "MATCH (n) RETURN count(n)",
"expected": {
"row_count": 1,
"result_kind": "scalar"
},
"observes": {
"paths": false,
"nodes": false,
"relationships": false,
"properties": false
},
"shape": {
"path_materialization_required": false
},
"candidate_modes": ["postgres_sql", "neo4j"],
"tags": ["count", "count-store"]
},
{
"name": "typed_node_count",
"dataset": "base",
"category": "counts",
"cypher": "MATCH (n:NodeKind1) RETURN count(n)",
"expected": {
"row_count": 1,
"result_kind": "scalar"
},
"observes": {
"paths": false,
"nodes": false,
"relationships": false,
"properties": false
},
"shape": {
"terminal_predicate": "node_kind",
"path_materialization_required": false
},
"candidate_modes": ["postgres_sql", "neo4j"],
"tags": ["count", "typed-count", "graph-stats"]
},
{
"name": "typed_edge_count",
"dataset": "base",
"category": "counts",
"cypher": "MATCH ()-[r:EdgeKind1]->() RETURN count(r)",
"expected": {
"row_count": 1,
"result_kind": "scalar"
},
"observes": {
"paths": false,
"nodes": false,
"relationships": false,
"properties": false
},
"shape": {
"edge_kinds": ["EdgeKind1"],
"path_materialization_required": false
},
"candidate_modes": ["postgres_sql", "neo4j"],
"tags": ["count", "typed-count", "graph-stats"]
}
]
}

54 changes: 54 additions & 0 deletions benchmark/testdata/scale/cases/lookups.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
{
"cases": [
{
"name": "objectid_exact_string_anchor",
"dataset": "base",
"category": "lookups",
"cypher": "MATCH (n:NodeKind1) WHERE n.objectid = $objectid RETURN id(n)",
"params": {
"objectid": "S-1-5-21-1"
},
"expected": {
"row_count": 1,
"result_kind": "id_set"
},
"observes": {
"paths": false,
"nodes": false,
"relationships": false,
"properties": false
},
"shape": {
"root_predicate": "selective_property",
"terminal_predicate": "node_kind",
"path_materialization_required": false
},
"candidate_modes": ["postgres_sql", "neo4j"],
"tags": ["property-anchor", "expression-index"]
},
{
"name": "boolean_property_filter",
"dataset": "base",
"category": "lookups",
"cypher": "MATCH (n:NodeKind1) WHERE n.enabled = true RETURN id(n)",
"expected": {
"row_count": 1,
"result_kind": "id_set"
},
"observes": {
"paths": false,
"nodes": false,
"relationships": false,
"properties": false
},
"shape": {
"root_predicate": "boolean_property",
"terminal_predicate": "node_kind",
"path_materialization_required": false
},
"candidate_modes": ["postgres_sql", "neo4j"],
"tags": ["property-filter"]
}
]
}

Loading
Loading