diff --git a/Makefile b/Makefile index f2a0b9a62..3ea9236a6 100644 --- a/Makefile +++ b/Makefile @@ -184,7 +184,8 @@ REGRESS = scan \ security \ reserved_keyword_alias \ agtype_jsonb_cast \ - containment_selectivity + containment_selectivity \ + subgraph ifneq ($(EXTRA_TESTS),) REGRESS += $(EXTRA_TESTS) diff --git a/age--1.7.0--y.y.y.sql b/age--1.7.0--y.y.y.sql index a4cac0c5c..282eaa0f9 100644 --- a/age--1.7.0--y.y.y.sql +++ b/age--1.7.0--y.y.y.sql @@ -800,3 +800,260 @@ ALTER OPERATOR ag_catalog.?&(agtype, text[]) SET (RESTRICT = contsel, JOIN = contjoinsel); ALTER OPERATOR ag_catalog.?&(agtype, agtype) SET (RESTRICT = contsel, JOIN = contjoinsel); + +-- +-- create_subgraph(): materialized subgraph extraction (see sql/age_subgraph.sql). +-- Induced-subgraph semantics matching Neo4j GDS gds.graph.filter(): a vertex is +-- kept iff node_filter holds ('*' = all); an edge is kept iff relationship_filter +-- holds AND both endpoints are kept. Produces a persistent, Cypher-queryable graph. +-- +CREATE FUNCTION ag_catalog.create_subgraph(new_graph name, + from_graph name, + node_filter text DEFAULT '*', + relationship_filter text DEFAULT '*') + RETURNS TABLE(node_count bigint, relationship_count bigint) + LANGUAGE plpgsql + VOLATILE + SET search_path = ag_catalog, pg_catalog + AS $function$ +DECLARE + from_oid oid; + new_oid oid; + v_node_count bigint := 0; + v_rel_count bigint := 0; + rec RECORD; + cypher_q text; + where_clause text; + dst_label_id int; + dst_seq_fqn text; + dst_relation text; + inserted bigint; + has_rows boolean; +BEGIN + -- Argument validation. + IF new_graph IS NULL THEN + RAISE EXCEPTION 'new graph name must not be NULL'; + END IF; + IF from_graph IS NULL THEN + RAISE EXCEPTION 'source graph name must not be NULL'; + END IF; + IF new_graph = from_graph THEN + RAISE EXCEPTION 'cannot extract a subgraph of "%" into itself', from_graph; + END IF; + + -- NULL predicate is treated as the '*' wildcard (keep all). + IF node_filter IS NULL THEN + node_filter := '*'; + END IF; + IF relationship_filter IS NULL THEN + relationship_filter := '*'; + END IF; + + -- The predicates are embedded into a dollar-quoted cypher() query using the + -- $age_subgraph$ tag; reject predicates that contain the tag to keep the + -- quoting unambiguous. + IF position('$age_subgraph$' IN node_filter) > 0 + OR position('$age_subgraph$' IN relationship_filter) > 0 THEN + RAISE EXCEPTION 'filter predicate must not contain the reserved token $age_subgraph$'; + END IF; + + -- Validate source graph exists. + SELECT graphid INTO from_oid + FROM ag_catalog.ag_graph WHERE name = from_graph; + IF from_oid IS NULL THEN + RAISE EXCEPTION 'graph "%" does not exist', from_graph; + END IF; + + -- Validate destination graph does not exist (create_graph also enforces + -- naming rules and uniqueness, but we give a clear early error). + IF EXISTS (SELECT 1 FROM ag_catalog.ag_graph WHERE name = new_graph) THEN + RAISE EXCEPTION 'graph "%" already exists', new_graph; + END IF; + + -- Create the destination graph (default labels are created automatically). + PERFORM ag_catalog.create_graph(new_graph); + + SELECT graphid INTO new_oid + FROM ag_catalog.ag_graph WHERE name = new_graph; + + -- Working sets / mapping (uniquely named to avoid colliding with user temps). + DROP TABLE IF EXISTS _ag_sg_kept_v; + DROP TABLE IF EXISTS _ag_sg_kept_e; + DROP TABLE IF EXISTS _ag_sg_vmap; + DROP TABLE IF EXISTS _ag_sg_vstage; + DROP TABLE IF EXISTS _ag_sg_estage; + + -- + -- Kept vertices: evaluate node_filter with AGE's Cypher engine. The node + -- variable `n` is bound exactly as in the spec; '*' selects all vertices. + -- + IF node_filter IS NULL OR btrim(node_filter) = '*' THEN + where_clause := ''; + ELSE + where_clause := ' WHERE ' || node_filter; + END IF; + cypher_q := 'MATCH (n)' || where_clause || ' RETURN id(n)'; + + EXECUTE format( + 'CREATE TEMP TABLE _ag_sg_kept_v ON COMMIT DROP AS ' + 'SELECT DISTINCT ag_catalog.agtype_to_graphid(vid) AS gid ' + 'FROM ag_catalog.cypher(%L, $age_subgraph$%s$age_subgraph$) AS (vid agtype)', + from_graph, cypher_q); + CREATE INDEX ON _ag_sg_kept_v (gid); + + -- + -- Kept edges: evaluate relationship_filter with AGE's Cypher engine. The + -- relationship variable `r` is bound exactly as in the spec. + -- + IF relationship_filter IS NULL OR btrim(relationship_filter) = '*' THEN + where_clause := ''; + ELSE + where_clause := ' WHERE ' || relationship_filter; + END IF; + cypher_q := 'MATCH ()-[r]->()' || where_clause || ' RETURN id(r)'; + + EXECUTE format( + 'CREATE TEMP TABLE _ag_sg_kept_e ON COMMIT DROP AS ' + 'SELECT DISTINCT ag_catalog.agtype_to_graphid(eid) AS gid ' + 'FROM ag_catalog.cypher(%L, $age_subgraph$%s$age_subgraph$) AS (eid agtype)', + from_graph, cypher_q); + CREATE INDEX ON _ag_sg_kept_e (gid); + + -- old -> new vertex id mapping (graphid is unique within a graph). + CREATE TEMP TABLE _ag_sg_vmap (old_id graphid PRIMARY KEY, + new_id graphid NOT NULL) ON COMMIT DROP; + + -- + -- PASS 1: copy kept vertices, label by label, assigning new graphids and + -- recording the old->new mapping for edge remapping. + -- + FOR rec IN + SELECT name, id, relation, seq_name + FROM ag_catalog.ag_label + WHERE graph = from_oid AND kind = 'v' + ORDER BY id + LOOP + -- Skip labels with no surviving vertices. Read ONLY this label's own + -- rows: AGE label tables use table inheritance (custom labels inherit + -- from _ag_label_vertex), so a plain scan of a parent would also return + -- its children and copy them twice. + EXECUTE format( + 'SELECT EXISTS (SELECT 1 FROM ONLY %s t ' + 'WHERE EXISTS (SELECT 1 FROM _ag_sg_kept_v k WHERE k.gid = t.id))', + rec.relation::regclass::text) + INTO has_rows; + IF NOT has_rows THEN + CONTINUE; + END IF; + + -- Ensure the label exists in the destination graph. + IF rec.name <> '_ag_label_vertex' THEN + PERFORM 1 FROM ag_catalog.ag_label + WHERE graph = new_oid AND name = rec.name; + IF NOT FOUND THEN + EXECUTE format('SELECT ag_catalog.create_vlabel(%L, %L)', + new_graph, rec.name); + END IF; + END IF; + + SELECT id, seq_name, relation::regclass::text + INTO dst_label_id, dst_seq_fqn, dst_relation + FROM ag_catalog.ag_label + WHERE graph = new_oid AND name = rec.name; + dst_seq_fqn := format('%I.%I', new_graph, dst_seq_fqn); + + -- Stage surviving vertices with freshly generated ids in a real temp + -- table (single evaluation), then copy to the label table and record + -- the old->new mapping. A materialized stage avoids any ambiguity from + -- referencing a nextval-bearing CTE more than once. + DROP TABLE IF EXISTS _ag_sg_vstage; + EXECUTE format( + 'CREATE TEMP TABLE _ag_sg_vstage ON COMMIT DROP AS ' + 'SELECT t.id AS old_id, ' + ' ag_catalog._graphid(%s, nextval(%L)) AS new_id, ' + ' t.properties AS props ' + 'FROM ONLY %s t ' + 'WHERE EXISTS (SELECT 1 FROM _ag_sg_kept_v k WHERE k.gid = t.id)', + dst_label_id, dst_seq_fqn, rec.relation::regclass::text); + + EXECUTE format('INSERT INTO %s (id, properties) ' + 'SELECT new_id, props FROM _ag_sg_vstage', dst_relation); + + INSERT INTO _ag_sg_vmap (old_id, new_id) + SELECT old_id, new_id FROM _ag_sg_vstage; + + DROP TABLE _ag_sg_vstage; + END LOOP; + + SELECT count(*) INTO v_node_count FROM _ag_sg_vmap; + + -- + -- PASS 2: copy kept edges, remapping endpoints. The joins to _ag_sg_vmap + -- enforce the induced rule (an edge survives only if BOTH endpoints were + -- kept); membership in _ag_sg_kept_e applies relationship_filter. + -- + FOR rec IN + SELECT name, id, relation, seq_name + FROM ag_catalog.ag_label + WHERE graph = from_oid AND kind = 'e' + ORDER BY id + LOOP + -- Skip labels with no surviving edges. Read ONLY this label's own rows + -- (see the vertex pass for why inheritance requires ONLY). + EXECUTE format( + 'SELECT EXISTS (' + ' SELECT 1 FROM ONLY %s x ' + ' JOIN _ag_sg_vmap vs ON vs.old_id = x.start_id ' + ' JOIN _ag_sg_vmap ve ON ve.old_id = x.end_id ' + ' WHERE EXISTS (SELECT 1 FROM _ag_sg_kept_e k WHERE k.gid = x.id))', + rec.relation::regclass::text) + INTO has_rows; + IF NOT has_rows THEN + CONTINUE; + END IF; + + IF rec.name <> '_ag_label_edge' THEN + PERFORM 1 FROM ag_catalog.ag_label + WHERE graph = new_oid AND name = rec.name; + IF NOT FOUND THEN + EXECUTE format('SELECT ag_catalog.create_elabel(%L, %L)', + new_graph, rec.name); + END IF; + END IF; + + SELECT id, seq_name, relation::regclass::text + INTO dst_label_id, dst_seq_fqn, dst_relation + FROM ag_catalog.ag_label + WHERE graph = new_oid AND name = rec.name; + dst_seq_fqn := format('%I.%I', new_graph, dst_seq_fqn); + + -- Stage surviving edges, remapping endpoints through _ag_sg_vmap. The + -- joins enforce the induced rule (both endpoints kept); membership in + -- _ag_sg_kept_e applies relationship_filter. + DROP TABLE IF EXISTS _ag_sg_estage; + EXECUTE format( + 'CREATE TEMP TABLE _ag_sg_estage ON COMMIT DROP AS ' + 'SELECT ag_catalog._graphid(%s, nextval(%L)) AS new_id, ' + ' vs.new_id AS new_start, ve.new_id AS new_end, ' + ' x.properties AS props ' + 'FROM ONLY %s x ' + 'JOIN _ag_sg_vmap vs ON vs.old_id = x.start_id ' + 'JOIN _ag_sg_vmap ve ON ve.old_id = x.end_id ' + 'WHERE EXISTS (SELECT 1 FROM _ag_sg_kept_e k WHERE k.gid = x.id)', + dst_label_id, dst_seq_fqn, rec.relation::regclass::text); + + EXECUTE format('INSERT INTO %s (id, start_id, end_id, properties) ' + 'SELECT new_id, new_start, new_end, props ' + 'FROM _ag_sg_estage', dst_relation); + GET DIAGNOSTICS inserted = ROW_COUNT; + v_rel_count := v_rel_count + inserted; + + DROP TABLE _ag_sg_estage; + END LOOP; + + RETURN QUERY SELECT v_node_count, v_rel_count; +END; +$function$; + +COMMENT ON FUNCTION ag_catalog.create_subgraph(name, name, text, text) IS +'Materializes a new persistent graph as the induced subgraph of from_graph selected by a Cypher node predicate (on n) and relationship predicate (on r); ''*'' keeps all. An edge is kept only if its predicate holds and both endpoints are kept. Returns (node_count, relationship_count).'; diff --git a/regress/expected/subgraph.out b/regress/expected/subgraph.out new file mode 100644 index 000000000..a27569a5d --- /dev/null +++ b/regress/expected/subgraph.out @@ -0,0 +1,341 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +LOAD 'age'; +SET search_path TO ag_catalog; +-- Suppress the create_graph / create_vlabel NOTICE chatter so the assertions +-- below are the deterministic output. (The feature is exercised regardless.) +SET client_min_messages = warning; +-- +-- Build a "somewhat large" source graph with NO MATCH (fast bulk CREATE): +-- * 2000 isolated components, each (:Person{pid,age})-[:KNOWS{w}]->(:Friend{pid}) +-- => 2000 Person + 2000 Friend vertices, 2000 KNOWS edges +-- * 500 isolated :Company vertices (no edges) +-- Totals: 4500 vertices, 2000 edges, label set {Person,Friend,Company,KNOWS}. +-- +SELECT create_graph('sg_src'); + create_graph +-------------- + +(1 row) + +SELECT count(*) FROM cypher('sg_src', $$ + UNWIND range(1, 2000) AS i + CREATE (:Person {pid: i, age: i % 100})-[:KNOWS {w: i}]->(:Friend {pid: i}) +$$) AS (a agtype); + count +------- + 0 +(1 row) + +SELECT count(*) FROM cypher('sg_src', $$ + UNWIND range(1, 500) AS i CREATE (:Company {cid: i}) +$$) AS (a agtype); + count +------- + 0 +(1 row) + +-- Source baseline (printed for reference; deterministic). +SELECT + (SELECT count(*) FROM cypher('sg_src', $$ MATCH (n) RETURN n $$) AS (n agtype)) AS src_vertices, + (SELECT count(*) FROM cypher('sg_src', $$ MATCH ()-[r]->() RETURN r $$) AS (r agtype)) AS src_edges; + src_vertices | src_edges +--------------+----------- + 4500 | 2000 +(1 row) + +-- +-- 1. Full copy ('*','*'): counts equal the source, and the new graph round-trips. +-- +SELECT node_count, relationship_count +FROM create_subgraph('sg_all', 'sg_src', '*', '*'); + node_count | relationship_count +------------+-------------------- + 4500 | 2000 +(1 row) + +SELECT + (SELECT count(*) FROM cypher('sg_all', $$ MATCH (n) RETURN n $$) AS (n agtype)) + = (SELECT count(*) FROM cypher('sg_src', $$ MATCH (n) RETURN n $$) AS (n agtype)) AS nodes_match, + (SELECT count(*) FROM cypher('sg_all', $$ MATCH ()-[r]->() RETURN r $$) AS (r agtype)) + = (SELECT count(*) FROM cypher('sg_src', $$ MATCH ()-[r]->() RETURN r $$) AS (r agtype)) AS edges_match; + nodes_match | edges_match +-------------+------------- + t | t +(1 row) + +-- +-- 2. Vertex-induced (node filter only): keep pid <= 1000. An edge survives iff +-- BOTH endpoints survive (induced rule), with no relationship filter. +-- node_count is asserted against the function return; correctness is verified +-- by recomputing the induced set from the source (robust booleans). +-- +SELECT node_count, relationship_count +FROM create_subgraph('sg_v', 'sg_src', 'n.pid <= 1000', '*'); + node_count | relationship_count +------------+-------------------- + 2000 | 1000 +(1 row) + +SELECT + (SELECT count(*) FROM cypher('sg_v', $$ MATCH (n) RETURN n $$) AS (n agtype)) + = (SELECT count(*) FROM cypher('sg_src', + $$ MATCH (n) WHERE n.pid <= 1000 RETURN n $$) AS (n agtype)) AS nodes_ok, + (SELECT count(*) FROM cypher('sg_v', $$ MATCH ()-[r]->() RETURN r $$) AS (r agtype)) + = (SELECT count(*) FROM cypher('sg_src', + $$ MATCH (a)-[r]->(b) WHERE a.pid <= 1000 AND b.pid <= 1000 RETURN r $$) + AS (r agtype)) AS edges_ok; + nodes_ok | edges_ok +----------+---------- + t | t +(1 row) + +-- +-- 3. Node + relationship predicate: keep pid <= 1000 vertices and w <= 300 edges. +-- Edge survives iff w<=300 AND both endpoints pid<=1000. +-- +SELECT node_count, relationship_count +FROM create_subgraph('sg_nr', 'sg_src', 'n.pid <= 1000', 'r.w <= 300'); + node_count | relationship_count +------------+-------------------- + 2000 | 300 +(1 row) + +SELECT + (SELECT count(*) FROM cypher('sg_nr', $$ MATCH ()-[r]->() RETURN r $$) AS (r agtype)) + = (SELECT count(*) FROM cypher('sg_src', + $$ MATCH (a)-[r]->(b) WHERE r.w <= 300 AND a.pid <= 1000 AND b.pid <= 1000 + RETURN r $$) AS (r agtype)) AS edges_ok; + edges_ok +---------- + t +(1 row) + +-- +-- 4. Label filter excludes one endpoint type: keep only :Person. Every KNOWS +-- edge points Person->Friend, so all edges must be dropped (induced rule). +-- (AGE evaluates label predicates with label(n); GDS uses n:Person -- same +-- containment semantics, different predicate syntax.) +-- +SELECT node_count, relationship_count +FROM create_subgraph('sg_person', 'sg_src', $f$label(n) = 'Person'$f$, '*'); + node_count | relationship_count +------------+-------------------- + 2000 | 0 +(1 row) + +-- +-- 5. Bipartite (type filter): keep Person+Friend and KNOWS edges => all 2000. +-- +SELECT node_count, relationship_count +FROM create_subgraph('sg_bip', 'sg_src', + $f$label(n) = 'Person' OR label(n) = 'Friend'$f$, + $f$label(r) = 'KNOWS'$f$); + node_count | relationship_count +------------+-------------------- + 4000 | 2000 +(1 row) + +-- +-- 6. Empty result: a predicate matching nothing yields an empty subgraph +-- (not an error), with the default labels only. +-- +SELECT node_count, relationship_count +FROM create_subgraph('sg_empty', 'sg_src', 'n.pid < 0', '*'); + node_count | relationship_count +------------+-------------------- + 0 | 0 +(1 row) + +SELECT count(*) AS empty_vertices +FROM cypher('sg_empty', $$ MATCH (n) RETURN n $$) AS (n agtype); + empty_vertices +---------------- + 0 +(1 row) + +-- +-- 7. Composability: extract a subgraph from an already-extracted subgraph. +-- From sg_v (pid<=1000) keep pid<=500; verify against recomputation on sg_v. +-- +SELECT node_count, relationship_count +FROM create_subgraph('sg_v2', 'sg_v', 'n.pid <= 500', '*'); + node_count | relationship_count +------------+-------------------- + 1000 | 500 +(1 row) + +SELECT + (SELECT count(*) FROM cypher('sg_v2', $$ MATCH (n) RETURN n $$) AS (n agtype)) + = (SELECT count(*) FROM cypher('sg_v', + $$ MATCH (n) WHERE n.pid <= 500 RETURN n $$) AS (n agtype)) AS nodes_ok, + (SELECT count(*) FROM cypher('sg_v2', $$ MATCH ()-[r]->() RETURN r $$) AS (r agtype)) + = (SELECT count(*) FROM cypher('sg_v', + $$ MATCH (a)-[r]->(b) WHERE a.pid <= 500 AND b.pid <= 500 RETURN r $$) + AS (r agtype)) AS edges_ok; + nodes_ok | edges_ok +----------+---------- + t | t +(1 row) + +-- +-- 8. Self-loops and parallel edges (multigraph structure) are preserved. +-- +SELECT create_graph('sg_multi'); + create_graph +-------------- + +(1 row) + +SELECT * FROM cypher('sg_multi', $$ + CREATE (a:N {k: 1}) CREATE (a)-[:E {t: 1}]->(a) +$$) AS (a agtype); + a +--- +(0 rows) + +SELECT * FROM cypher('sg_multi', $$ + CREATE (a:N {k: 2}), (b:N {k: 3}), + (a)-[:E {t: 2}]->(b), (a)-[:E {t: 3}]->(b) +$$) AS (a agtype); + a +--- +(0 rows) + +SELECT node_count, relationship_count +FROM create_subgraph('sg_multi_sub', 'sg_multi', '*', '*'); + node_count | relationship_count +------------+-------------------- + 3 | 3 +(1 row) + +-- self-loop preserved (exactly one edge from a node to itself) +SELECT count(*) AS self_loops +FROM cypher('sg_multi_sub', $$ MATCH (a)-[r]->(a) RETURN r $$) AS (r agtype); + self_loops +------------ + 1 +(1 row) + +-- parallel edges preserved (two edges between k=2 and k=3) +SELECT count(*) AS parallel_edges +FROM cypher('sg_multi_sub', $$ MATCH (a {k: 2})-[r]->(b {k: 3}) RETURN r $$) AS (r agtype); + parallel_edges +---------------- + 2 +(1 row) + +-- +-- 9. Property fidelity: a copied vertex keeps its properties verbatim. +-- +SELECT count(*) AS person_500_age_ok +FROM cypher('sg_v', $$ MATCH (n:Person {pid: 500}) WHERE n.age = 0 RETURN n $$) AS (n agtype); + person_500_age_ok +------------------- + 1 +(1 row) + +-- +-- 10. Error handling / edge cases. +-- +-- NULL graph name +SELECT create_subgraph(NULL, 'sg_src', '*', '*'); +ERROR: new graph name must not be NULL +CONTEXT: PL/pgSQL function create_subgraph(name,name,text,text) line 18 at RAISE +-- source does not exist +SELECT create_subgraph('sg_x', 'no_such_graph', '*', '*'); +ERROR: graph "no_such_graph" does not exist +CONTEXT: PL/pgSQL function create_subgraph(name,name,text,text) line 47 at RAISE +-- extracting into the source itself +SELECT create_subgraph('sg_src', 'sg_src', '*', '*'); +ERROR: cannot extract a subgraph of "sg_src" into itself +CONTEXT: PL/pgSQL function create_subgraph(name,name,text,text) line 24 at RAISE +-- destination already exists +SELECT create_subgraph('sg_all', 'sg_src', '*', '*'); +ERROR: graph "sg_all" already exists +CONTEXT: PL/pgSQL function create_subgraph(name,name,text,text) line 53 at RAISE +-- invalid Cypher predicate is reported (propagated from the engine) +SELECT create_subgraph('sg_bad', 'sg_src', 'n.pid <<>> 1', '*'); +ERROR: operator does not exist: agtype <<>> agtype +LINE 1: ...her('sg_src', $age_subgraph$MATCH (n) WHERE n.pid <<>> 1 RET... + ^ +HINT: No operator matches the given name and argument types. You might need to add explicit type casts. +QUERY: CREATE TEMP TABLE _ag_sg_kept_v ON COMMIT DROP AS SELECT DISTINCT ag_catalog.agtype_to_graphid(vid) AS gid FROM ag_catalog.cypher('sg_src', $age_subgraph$MATCH (n) WHERE n.pid <<>> 1 RETURN id(n)$age_subgraph$) AS (vid agtype) +CONTEXT: PL/pgSQL function create_subgraph(name,name,text,text) line 80 at EXECUTE +-- cleanup +SELECT drop_graph('sg_v2', true); + drop_graph +------------ + +(1 row) + +SELECT drop_graph('sg_multi_sub', true); + drop_graph +------------ + +(1 row) + +SELECT drop_graph('sg_multi', true); + drop_graph +------------ + +(1 row) + +SELECT drop_graph('sg_empty', true); + drop_graph +------------ + +(1 row) + +SELECT drop_graph('sg_bip', true); + drop_graph +------------ + +(1 row) + +SELECT drop_graph('sg_person', true); + drop_graph +------------ + +(1 row) + +SELECT drop_graph('sg_nr', true); + drop_graph +------------ + +(1 row) + +SELECT drop_graph('sg_v', true); + drop_graph +------------ + +(1 row) + +SELECT drop_graph('sg_all', true); + drop_graph +------------ + +(1 row) + +SELECT drop_graph('sg_src', true); + drop_graph +------------ + +(1 row) + diff --git a/regress/sql/subgraph.sql b/regress/sql/subgraph.sql new file mode 100644 index 000000000..0d01dfe60 --- /dev/null +++ b/regress/sql/subgraph.sql @@ -0,0 +1,189 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +LOAD 'age'; +SET search_path TO ag_catalog; + +-- Suppress the create_graph / create_vlabel NOTICE chatter so the assertions +-- below are the deterministic output. (The feature is exercised regardless.) +SET client_min_messages = warning; + +-- +-- Build a "somewhat large" source graph with NO MATCH (fast bulk CREATE): +-- * 2000 isolated components, each (:Person{pid,age})-[:KNOWS{w}]->(:Friend{pid}) +-- => 2000 Person + 2000 Friend vertices, 2000 KNOWS edges +-- * 500 isolated :Company vertices (no edges) +-- Totals: 4500 vertices, 2000 edges, label set {Person,Friend,Company,KNOWS}. +-- +SELECT create_graph('sg_src'); + +SELECT count(*) FROM cypher('sg_src', $$ + UNWIND range(1, 2000) AS i + CREATE (:Person {pid: i, age: i % 100})-[:KNOWS {w: i}]->(:Friend {pid: i}) +$$) AS (a agtype); + +SELECT count(*) FROM cypher('sg_src', $$ + UNWIND range(1, 500) AS i CREATE (:Company {cid: i}) +$$) AS (a agtype); + +-- Source baseline (printed for reference; deterministic). +SELECT + (SELECT count(*) FROM cypher('sg_src', $$ MATCH (n) RETURN n $$) AS (n agtype)) AS src_vertices, + (SELECT count(*) FROM cypher('sg_src', $$ MATCH ()-[r]->() RETURN r $$) AS (r agtype)) AS src_edges; + +-- +-- 1. Full copy ('*','*'): counts equal the source, and the new graph round-trips. +-- +SELECT node_count, relationship_count +FROM create_subgraph('sg_all', 'sg_src', '*', '*'); + +SELECT + (SELECT count(*) FROM cypher('sg_all', $$ MATCH (n) RETURN n $$) AS (n agtype)) + = (SELECT count(*) FROM cypher('sg_src', $$ MATCH (n) RETURN n $$) AS (n agtype)) AS nodes_match, + (SELECT count(*) FROM cypher('sg_all', $$ MATCH ()-[r]->() RETURN r $$) AS (r agtype)) + = (SELECT count(*) FROM cypher('sg_src', $$ MATCH ()-[r]->() RETURN r $$) AS (r agtype)) AS edges_match; + +-- +-- 2. Vertex-induced (node filter only): keep pid <= 1000. An edge survives iff +-- BOTH endpoints survive (induced rule), with no relationship filter. +-- node_count is asserted against the function return; correctness is verified +-- by recomputing the induced set from the source (robust booleans). +-- +SELECT node_count, relationship_count +FROM create_subgraph('sg_v', 'sg_src', 'n.pid <= 1000', '*'); + +SELECT + (SELECT count(*) FROM cypher('sg_v', $$ MATCH (n) RETURN n $$) AS (n agtype)) + = (SELECT count(*) FROM cypher('sg_src', + $$ MATCH (n) WHERE n.pid <= 1000 RETURN n $$) AS (n agtype)) AS nodes_ok, + (SELECT count(*) FROM cypher('sg_v', $$ MATCH ()-[r]->() RETURN r $$) AS (r agtype)) + = (SELECT count(*) FROM cypher('sg_src', + $$ MATCH (a)-[r]->(b) WHERE a.pid <= 1000 AND b.pid <= 1000 RETURN r $$) + AS (r agtype)) AS edges_ok; + +-- +-- 3. Node + relationship predicate: keep pid <= 1000 vertices and w <= 300 edges. +-- Edge survives iff w<=300 AND both endpoints pid<=1000. +-- +SELECT node_count, relationship_count +FROM create_subgraph('sg_nr', 'sg_src', 'n.pid <= 1000', 'r.w <= 300'); + +SELECT + (SELECT count(*) FROM cypher('sg_nr', $$ MATCH ()-[r]->() RETURN r $$) AS (r agtype)) + = (SELECT count(*) FROM cypher('sg_src', + $$ MATCH (a)-[r]->(b) WHERE r.w <= 300 AND a.pid <= 1000 AND b.pid <= 1000 + RETURN r $$) AS (r agtype)) AS edges_ok; + +-- +-- 4. Label filter excludes one endpoint type: keep only :Person. Every KNOWS +-- edge points Person->Friend, so all edges must be dropped (induced rule). +-- (AGE evaluates label predicates with label(n); GDS uses n:Person -- same +-- containment semantics, different predicate syntax.) +-- +SELECT node_count, relationship_count +FROM create_subgraph('sg_person', 'sg_src', $f$label(n) = 'Person'$f$, '*'); + +-- +-- 5. Bipartite (type filter): keep Person+Friend and KNOWS edges => all 2000. +-- +SELECT node_count, relationship_count +FROM create_subgraph('sg_bip', 'sg_src', + $f$label(n) = 'Person' OR label(n) = 'Friend'$f$, + $f$label(r) = 'KNOWS'$f$); + +-- +-- 6. Empty result: a predicate matching nothing yields an empty subgraph +-- (not an error), with the default labels only. +-- +SELECT node_count, relationship_count +FROM create_subgraph('sg_empty', 'sg_src', 'n.pid < 0', '*'); + +SELECT count(*) AS empty_vertices +FROM cypher('sg_empty', $$ MATCH (n) RETURN n $$) AS (n agtype); + +-- +-- 7. Composability: extract a subgraph from an already-extracted subgraph. +-- From sg_v (pid<=1000) keep pid<=500; verify against recomputation on sg_v. +-- +SELECT node_count, relationship_count +FROM create_subgraph('sg_v2', 'sg_v', 'n.pid <= 500', '*'); + +SELECT + (SELECT count(*) FROM cypher('sg_v2', $$ MATCH (n) RETURN n $$) AS (n agtype)) + = (SELECT count(*) FROM cypher('sg_v', + $$ MATCH (n) WHERE n.pid <= 500 RETURN n $$) AS (n agtype)) AS nodes_ok, + (SELECT count(*) FROM cypher('sg_v2', $$ MATCH ()-[r]->() RETURN r $$) AS (r agtype)) + = (SELECT count(*) FROM cypher('sg_v', + $$ MATCH (a)-[r]->(b) WHERE a.pid <= 500 AND b.pid <= 500 RETURN r $$) + AS (r agtype)) AS edges_ok; + +-- +-- 8. Self-loops and parallel edges (multigraph structure) are preserved. +-- +SELECT create_graph('sg_multi'); +SELECT * FROM cypher('sg_multi', $$ + CREATE (a:N {k: 1}) CREATE (a)-[:E {t: 1}]->(a) +$$) AS (a agtype); +SELECT * FROM cypher('sg_multi', $$ + CREATE (a:N {k: 2}), (b:N {k: 3}), + (a)-[:E {t: 2}]->(b), (a)-[:E {t: 3}]->(b) +$$) AS (a agtype); + +SELECT node_count, relationship_count +FROM create_subgraph('sg_multi_sub', 'sg_multi', '*', '*'); + +-- self-loop preserved (exactly one edge from a node to itself) +SELECT count(*) AS self_loops +FROM cypher('sg_multi_sub', $$ MATCH (a)-[r]->(a) RETURN r $$) AS (r agtype); + +-- parallel edges preserved (two edges between k=2 and k=3) +SELECT count(*) AS parallel_edges +FROM cypher('sg_multi_sub', $$ MATCH (a {k: 2})-[r]->(b {k: 3}) RETURN r $$) AS (r agtype); + +-- +-- 9. Property fidelity: a copied vertex keeps its properties verbatim. +-- +SELECT count(*) AS person_500_age_ok +FROM cypher('sg_v', $$ MATCH (n:Person {pid: 500}) WHERE n.age = 0 RETURN n $$) AS (n agtype); + +-- +-- 10. Error handling / edge cases. +-- +-- NULL graph name +SELECT create_subgraph(NULL, 'sg_src', '*', '*'); +-- source does not exist +SELECT create_subgraph('sg_x', 'no_such_graph', '*', '*'); +-- extracting into the source itself +SELECT create_subgraph('sg_src', 'sg_src', '*', '*'); +-- destination already exists +SELECT create_subgraph('sg_all', 'sg_src', '*', '*'); +-- invalid Cypher predicate is reported (propagated from the engine) +SELECT create_subgraph('sg_bad', 'sg_src', 'n.pid <<>> 1', '*'); + +-- cleanup +SELECT drop_graph('sg_v2', true); +SELECT drop_graph('sg_multi_sub', true); +SELECT drop_graph('sg_multi', true); +SELECT drop_graph('sg_empty', true); +SELECT drop_graph('sg_bip', true); +SELECT drop_graph('sg_person', true); +SELECT drop_graph('sg_nr', true); +SELECT drop_graph('sg_v', true); +SELECT drop_graph('sg_all', true); +SELECT drop_graph('sg_src', true); diff --git a/sql/age_subgraph.sql b/sql/age_subgraph.sql new file mode 100644 index 000000000..960790ded --- /dev/null +++ b/sql/age_subgraph.sql @@ -0,0 +1,294 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +-- +-- create_subgraph(): materialized subgraph extraction. +-- +-- Builds a new, persistent AGE graph that is the subgraph of an existing graph +-- selected by a node predicate and a relationship predicate. The semantics +-- follow the graph-theory "induced subgraph" definition as operationalized by +-- Neo4j GDS gds.graph.filter(): +-- +-- * a vertex is kept iff node_filter evaluates true ('*' keeps all); +-- * an edge is kept iff relationship_filter evaluates true AND BOTH of its +-- endpoints were kept (the induced rule -- no dangling edges). +-- +-- Unlike the Neo4j in-memory projection, the result is a real, ACID, +-- fully-Cypher-queryable AGE graph; properties of any agtype are preserved, and +-- self-loops / parallel edges (multigraph structure) are kept. +-- +-- node_filter / relationship_filter are Cypher predicates bound to a single +-- entity -- the node variable is `n`, the relationship variable is `r` -- or +-- the literal '*' to keep all. They are evaluated by AGE's own Cypher engine +-- against the source graph, so the full Cypher predicate language is available. +-- +-- Internal entity ids (graphids) are reassigned in the new graph (a graphid +-- encodes the source graph's label id, which differs in the destination), and +-- edge endpoints are remapped accordingly. Properties are copied verbatim. +-- +CREATE FUNCTION ag_catalog.create_subgraph(new_graph name, + from_graph name, + node_filter text DEFAULT '*', + relationship_filter text DEFAULT '*') + RETURNS TABLE(node_count bigint, relationship_count bigint) + LANGUAGE plpgsql + VOLATILE + SET search_path = ag_catalog, pg_catalog + AS $function$ +DECLARE + from_oid oid; + new_oid oid; + v_node_count bigint := 0; + v_rel_count bigint := 0; + rec RECORD; + cypher_q text; + where_clause text; + dst_label_id int; + dst_seq_fqn text; + dst_relation text; + inserted bigint; + has_rows boolean; +BEGIN + -- Argument validation. + IF new_graph IS NULL THEN + RAISE EXCEPTION 'new graph name must not be NULL'; + END IF; + IF from_graph IS NULL THEN + RAISE EXCEPTION 'source graph name must not be NULL'; + END IF; + IF new_graph = from_graph THEN + RAISE EXCEPTION 'cannot extract a subgraph of "%" into itself', from_graph; + END IF; + + -- NULL predicate is treated as the '*' wildcard (keep all). + IF node_filter IS NULL THEN + node_filter := '*'; + END IF; + IF relationship_filter IS NULL THEN + relationship_filter := '*'; + END IF; + + -- The predicates are embedded into a dollar-quoted cypher() query using the + -- $age_subgraph$ tag; reject predicates that contain the tag to keep the + -- quoting unambiguous. + IF position('$age_subgraph$' IN node_filter) > 0 + OR position('$age_subgraph$' IN relationship_filter) > 0 THEN + RAISE EXCEPTION 'filter predicate must not contain the reserved token $age_subgraph$'; + END IF; + + -- Validate source graph exists. + SELECT graphid INTO from_oid + FROM ag_catalog.ag_graph WHERE name = from_graph; + IF from_oid IS NULL THEN + RAISE EXCEPTION 'graph "%" does not exist', from_graph; + END IF; + + -- Validate destination graph does not exist (create_graph also enforces + -- naming rules and uniqueness, but we give a clear early error). + IF EXISTS (SELECT 1 FROM ag_catalog.ag_graph WHERE name = new_graph) THEN + RAISE EXCEPTION 'graph "%" already exists', new_graph; + END IF; + + -- Create the destination graph (default labels are created automatically). + PERFORM ag_catalog.create_graph(new_graph); + + SELECT graphid INTO new_oid + FROM ag_catalog.ag_graph WHERE name = new_graph; + + -- Working sets / mapping (uniquely named to avoid colliding with user temps). + DROP TABLE IF EXISTS _ag_sg_kept_v; + DROP TABLE IF EXISTS _ag_sg_kept_e; + DROP TABLE IF EXISTS _ag_sg_vmap; + DROP TABLE IF EXISTS _ag_sg_vstage; + DROP TABLE IF EXISTS _ag_sg_estage; + + -- + -- Kept vertices: evaluate node_filter with AGE's Cypher engine. The node + -- variable `n` is bound exactly as in the spec; '*' selects all vertices. + -- + IF node_filter IS NULL OR btrim(node_filter) = '*' THEN + where_clause := ''; + ELSE + where_clause := ' WHERE ' || node_filter; + END IF; + cypher_q := 'MATCH (n)' || where_clause || ' RETURN id(n)'; + + EXECUTE format( + 'CREATE TEMP TABLE _ag_sg_kept_v ON COMMIT DROP AS ' + 'SELECT DISTINCT ag_catalog.agtype_to_graphid(vid) AS gid ' + 'FROM ag_catalog.cypher(%L, $age_subgraph$%s$age_subgraph$) AS (vid agtype)', + from_graph, cypher_q); + CREATE INDEX ON _ag_sg_kept_v (gid); + + -- + -- Kept edges: evaluate relationship_filter with AGE's Cypher engine. The + -- relationship variable `r` is bound exactly as in the spec. + -- + IF relationship_filter IS NULL OR btrim(relationship_filter) = '*' THEN + where_clause := ''; + ELSE + where_clause := ' WHERE ' || relationship_filter; + END IF; + cypher_q := 'MATCH ()-[r]->()' || where_clause || ' RETURN id(r)'; + + EXECUTE format( + 'CREATE TEMP TABLE _ag_sg_kept_e ON COMMIT DROP AS ' + 'SELECT DISTINCT ag_catalog.agtype_to_graphid(eid) AS gid ' + 'FROM ag_catalog.cypher(%L, $age_subgraph$%s$age_subgraph$) AS (eid agtype)', + from_graph, cypher_q); + CREATE INDEX ON _ag_sg_kept_e (gid); + + -- old -> new vertex id mapping (graphid is unique within a graph). + CREATE TEMP TABLE _ag_sg_vmap (old_id graphid PRIMARY KEY, + new_id graphid NOT NULL) ON COMMIT DROP; + + -- + -- PASS 1: copy kept vertices, label by label, assigning new graphids and + -- recording the old->new mapping for edge remapping. + -- + FOR rec IN + SELECT name, id, relation, seq_name + FROM ag_catalog.ag_label + WHERE graph = from_oid AND kind = 'v' + ORDER BY id + LOOP + -- Skip labels with no surviving vertices. Read ONLY this label's own + -- rows: AGE label tables use table inheritance (custom labels inherit + -- from _ag_label_vertex), so a plain scan of a parent would also return + -- its children and copy them twice. + EXECUTE format( + 'SELECT EXISTS (SELECT 1 FROM ONLY %s t ' + 'WHERE EXISTS (SELECT 1 FROM _ag_sg_kept_v k WHERE k.gid = t.id))', + rec.relation::regclass::text) + INTO has_rows; + IF NOT has_rows THEN + CONTINUE; + END IF; + + -- Ensure the label exists in the destination graph. + IF rec.name <> '_ag_label_vertex' THEN + PERFORM 1 FROM ag_catalog.ag_label + WHERE graph = new_oid AND name = rec.name; + IF NOT FOUND THEN + EXECUTE format('SELECT ag_catalog.create_vlabel(%L, %L)', + new_graph, rec.name); + END IF; + END IF; + + SELECT id, seq_name, relation::regclass::text + INTO dst_label_id, dst_seq_fqn, dst_relation + FROM ag_catalog.ag_label + WHERE graph = new_oid AND name = rec.name; + dst_seq_fqn := format('%I.%I', new_graph, dst_seq_fqn); + + -- Stage surviving vertices with freshly generated ids in a real temp + -- table (single evaluation), then copy to the label table and record + -- the old->new mapping. A materialized stage avoids any ambiguity from + -- referencing a nextval-bearing CTE more than once. + DROP TABLE IF EXISTS _ag_sg_vstage; + EXECUTE format( + 'CREATE TEMP TABLE _ag_sg_vstage ON COMMIT DROP AS ' + 'SELECT t.id AS old_id, ' + ' ag_catalog._graphid(%s, nextval(%L)) AS new_id, ' + ' t.properties AS props ' + 'FROM ONLY %s t ' + 'WHERE EXISTS (SELECT 1 FROM _ag_sg_kept_v k WHERE k.gid = t.id)', + dst_label_id, dst_seq_fqn, rec.relation::regclass::text); + + EXECUTE format('INSERT INTO %s (id, properties) ' + 'SELECT new_id, props FROM _ag_sg_vstage', dst_relation); + + INSERT INTO _ag_sg_vmap (old_id, new_id) + SELECT old_id, new_id FROM _ag_sg_vstage; + + DROP TABLE _ag_sg_vstage; + END LOOP; + + SELECT count(*) INTO v_node_count FROM _ag_sg_vmap; + + -- + -- PASS 2: copy kept edges, remapping endpoints. The joins to _ag_sg_vmap + -- enforce the induced rule (an edge survives only if BOTH endpoints were + -- kept); membership in _ag_sg_kept_e applies relationship_filter. + -- + FOR rec IN + SELECT name, id, relation, seq_name + FROM ag_catalog.ag_label + WHERE graph = from_oid AND kind = 'e' + ORDER BY id + LOOP + -- Skip labels with no surviving edges. Read ONLY this label's own rows + -- (see the vertex pass for why inheritance requires ONLY). + EXECUTE format( + 'SELECT EXISTS (' + ' SELECT 1 FROM ONLY %s x ' + ' JOIN _ag_sg_vmap vs ON vs.old_id = x.start_id ' + ' JOIN _ag_sg_vmap ve ON ve.old_id = x.end_id ' + ' WHERE EXISTS (SELECT 1 FROM _ag_sg_kept_e k WHERE k.gid = x.id))', + rec.relation::regclass::text) + INTO has_rows; + IF NOT has_rows THEN + CONTINUE; + END IF; + + IF rec.name <> '_ag_label_edge' THEN + PERFORM 1 FROM ag_catalog.ag_label + WHERE graph = new_oid AND name = rec.name; + IF NOT FOUND THEN + EXECUTE format('SELECT ag_catalog.create_elabel(%L, %L)', + new_graph, rec.name); + END IF; + END IF; + + SELECT id, seq_name, relation::regclass::text + INTO dst_label_id, dst_seq_fqn, dst_relation + FROM ag_catalog.ag_label + WHERE graph = new_oid AND name = rec.name; + dst_seq_fqn := format('%I.%I', new_graph, dst_seq_fqn); + + -- Stage surviving edges, remapping endpoints through _ag_sg_vmap. The + -- joins enforce the induced rule (both endpoints kept); membership in + -- _ag_sg_kept_e applies relationship_filter. + DROP TABLE IF EXISTS _ag_sg_estage; + EXECUTE format( + 'CREATE TEMP TABLE _ag_sg_estage ON COMMIT DROP AS ' + 'SELECT ag_catalog._graphid(%s, nextval(%L)) AS new_id, ' + ' vs.new_id AS new_start, ve.new_id AS new_end, ' + ' x.properties AS props ' + 'FROM ONLY %s x ' + 'JOIN _ag_sg_vmap vs ON vs.old_id = x.start_id ' + 'JOIN _ag_sg_vmap ve ON ve.old_id = x.end_id ' + 'WHERE EXISTS (SELECT 1 FROM _ag_sg_kept_e k WHERE k.gid = x.id)', + dst_label_id, dst_seq_fqn, rec.relation::regclass::text); + + EXECUTE format('INSERT INTO %s (id, start_id, end_id, properties) ' + 'SELECT new_id, new_start, new_end, props ' + 'FROM _ag_sg_estage', dst_relation); + GET DIAGNOSTICS inserted = ROW_COUNT; + v_rel_count := v_rel_count + inserted; + + DROP TABLE _ag_sg_estage; + END LOOP; + + RETURN QUERY SELECT v_node_count, v_rel_count; +END; +$function$; + +COMMENT ON FUNCTION ag_catalog.create_subgraph(name, name, text, text) IS +'Materializes a new persistent graph as the induced subgraph of from_graph selected by a Cypher node predicate (on n) and relationship predicate (on r); ''*'' keeps all. An edge is kept only if its predicate holds and both endpoints are kept. Returns (node_count, relationship_count).'; diff --git a/sql/sql_files b/sql/sql_files index 32f9a7099..996ad4b46 100644 --- a/sql/sql_files +++ b/sql/sql_files @@ -15,3 +15,4 @@ age_trig age_aggregate agtype_typecast age_pg_upgrade +age_subgraph