From 47c001d8b66e3f2c2c9de526e8540f650bc4c263 Mon Sep 17 00:00:00 2001 From: matt6697 <32440697+matt6697@users.noreply.github.com> Date: Fri, 10 Nov 2023 08:57:30 +0100 Subject: [PATCH 1/4] Added new recipe --- .../filtering-node-neighbors/recipe.json | 71 +++++++++++++++++++ 1 file changed, 71 insertions(+) create mode 100644 custom-recipes/filtering-node-neighbors/recipe.json diff --git a/custom-recipes/filtering-node-neighbors/recipe.json b/custom-recipes/filtering-node-neighbors/recipe.json new file mode 100644 index 0000000..a6f600a --- /dev/null +++ b/custom-recipes/filtering-node-neighbors/recipe.json @@ -0,0 +1,71 @@ +// This file is the descriptor for the Custom code recipe filtering-node-neighbors +{ + // Meta data for display purposes + "meta": { + // label: name of the recipe as displayed, should be short + "label": "Filtering node neighbors", + // description: longer string to help end users understand what this recipe does + "description": "", + // icon: must be one of the FontAwesome 3.2.1 icons, complete list here at https://fontawesome.com/v3.2.1/icons/ + "icon": "icon-puzzle-piece" + }, + + "kind": "PYTHON", + + + // Inputs and outputs are defined by roles. In the recipe's I/O tab, the user can associate one + // or more dataset to each input and output role. + + // The "arity" field indicates whether the user can associate several datasets to the role ('NARY') + // or at most one ('UNARY'). The "required" field indicates whether the user is allowed to + // associate no dataset with the role. + + "selectableFromDataset" : "Input Dataset", + "inputRoles" : [ + { + "name": "Input Dataset", + "arity": "UNARY", + "required": true, + "acceptsDataset": true + } + ], + "outputRoles" : [ + { + "name": "Output Dataset", + "arity": "UNARY", + "required": true, + "acceptsDataset": true + } + ], + + "params": [ + { + "name": "create_graph_of", + "label": "Create graph of", + "type": "COLUMN", + "columnRole":"Input Dataset" + }, + { + "name": "linked_by", + "label": "Linked by", + "type": "COLUMN", + "columnRole":"Input Dataset" + }, + { + "name": "weighted", + "label": "Weighted graph", + "type": "BOOLEAN", + "defaultValue": false, + "description": "" + } + ], + + // The field "resourceKeys" holds a list of keys that allows to limit the number + // of concurrent executions and activities triggered by this recipe. + // + // Administrators can configure the limit per resource key in the Administration > Settings > Flow build + // screen. + + "resourceKeys": [] + +} From 462fb211cce5944d733d9563259c422e547e686a Mon Sep 17 00:00:00 2001 From: matt6697 <32440697+matt6697@users.noreply.github.com> Date: Fri, 10 Nov 2023 10:14:24 +0100 Subject: [PATCH 2/4] Added new recipe to create an ego graph from nodes --- .../ego-graph-from-nodes/recipe.json | 70 +++++++++++++++++ custom-recipes/ego-graph-from-nodes/recipe.py | 75 +++++++++++++++++++ .../filtering-node-neighbors/recipe.json | 71 ------------------ 3 files changed, 145 insertions(+), 71 deletions(-) create mode 100644 custom-recipes/ego-graph-from-nodes/recipe.json create mode 100644 custom-recipes/ego-graph-from-nodes/recipe.py delete mode 100644 custom-recipes/filtering-node-neighbors/recipe.json diff --git a/custom-recipes/ego-graph-from-nodes/recipe.json b/custom-recipes/ego-graph-from-nodes/recipe.json new file mode 100644 index 0000000..f0a0f5d --- /dev/null +++ b/custom-recipes/ego-graph-from-nodes/recipe.json @@ -0,0 +1,70 @@ +{ + "meta": { + "label": "Ego graph from nodes", + "description": "Returns induced subgraph of neighbors centered at selected nodes within a given radius", + "icon": "icon-link" + }, + + "kind": "PYTHON", + + "selectableFromDataset" : "Input Dataset", + "inputRoles" : [ + { + "name": "Input Dataset", + "arity": "UNARY", + "required": true, + "acceptsDataset": true + } + ], + "outputRoles" : [ + { + "name": "Output Dataset", + "arity": "UNARY", + "required": true, + "acceptsDataset": true + } + ], + + "params": [ + { + "name": "source_nodes", + "label": "Source nodes", + "type": "COLUMN", + "columnRole":"Input Dataset" + }, + { + "name": "target_nodes", + "label": "Target nodes", + "type": "COLUMN", + "columnRole":"Input Dataset" + }, + { + "name": "edges_label", + "label": "Edges label", + "type": "COLUMN", + "columnRole":"Input Dataset" + }, + { + "name": "weighted", + "label": "Weighted graph", + "type": "BOOLEAN", + "defaultValue": false, + "description": "" + }, + { + "name": "nodes", + "label": "Nodes", + "type": "STRINGS", + "description": "" + }, + { + "name": "ego_graph_radius", + "label": "Ego graph radius", + "type": "INT", + "defaultValue": 2, + "description": "" + } + ], + + "resourceKeys": [] +} diff --git a/custom-recipes/ego-graph-from-nodes/recipe.py b/custom-recipes/ego-graph-from-nodes/recipe.py new file mode 100644 index 0000000..1d94af1 --- /dev/null +++ b/custom-recipes/ego-graph-from-nodes/recipe.py @@ -0,0 +1,75 @@ +# -*- coding: utf-8 -*- +from dataiku.customrecipe import get_recipe_config +from graph_analytics_utils import get_input_dataset, get_output_dataset +from graph_analytics_constants import Constants +import pandas as pd +import networkx as nx +from networkx.algorithms import union +import logging +import time + +def create_ego_graph(graph, node, radius=1, predecessors=False): + try: + # Create a directed ego-graph from node to get it's successors + node_ego_graph = nx.ego_graph(graph, node, radius) + + # Reverse graph to get level 1 predecessors + if predecessors: + tmp_graph = graph.reverse() + predecessors_ego_graph = nx.ego_graph(tmp_graph, node, radius=1) + predecessors_ego_graph = predecessors_ego_graph.reverse() + node_ego_graph.update(predecessors_ego_graph) + + return node_ego_graph + + # Return an empty directed graph if node is not found in graph + except: + logging.warn("Ego graph - Node " + node + " not found in graph") + return nx.DiGraph() + + +# Read recipe config +input_dataset = get_input_dataset('Input Dataset') +output_dataset = get_output_dataset('Output Dataset') + +recipe_config = get_recipe_config() + +# List of necessary columns +columns = [] +columns.append(recipe_config['source_nodes']) +columns.append(recipe_config['target_nodes']) +columns.append(recipe_config['edges_label']) + +# Recipe input +input_df = input_dataset.get_dataframe(columns=columns) +logging.info("Ego graph - Dataset loaded") + +# Delete nulls +input_df = input_df[(input_df[recipe_config['source_nodes']].notnull()) & (input_df[recipe_config['target_nodes']].notnull())] +logging.info("Ego graph - Null values removed") + +# Dedup +deduplicated_df = input_df.groupby(columns).size().reset_index().rename(columns={0: 'w'}) +logging.info("Ego graph - Deduplicated dataset created") + +# Creating the directed graph +graph = nx.from_pandas_edgelist(deduplicated_df, recipe_config['source_nodes'], recipe_config['target_nodes'], recipe_config['edges_label'], nx.DiGraph) +# graph = nx.DiGraph() +# graph.add_nodes_from(deduplicated_df[recipe_config['source_nodes']].unique()) +# graph.add_nodes_from(deduplicated_df[recipe_config['target_nodes']].unique()) +# graph.add_edges_from(zip(deduplicated_df[recipe_config['source_nodes']], deduplicated_df[recipe_config['target_nodes']], deduplicated_df[recipe_config['edges_label']])) +logging.info("Base NetworkX graph created") + +node_ego_graph = nx.DiGraph() +for node in recipe_config["nodes"]: + node_ego_graph.update(create_ego_graph(graph, node, recipe_config["ego_graph_radius"], True)) + +# Write output dataframe +output_df = pd.DataFrame(list(node_ego_graph.edges(data=True))) +output_df.columns = [recipe_config['source_nodes'], recipe_config['target_nodes'], recipe_config['edges_label']] + +logging.info("Ego graph - Ego graph computed") + +output_dataset.write_with_schema(output_df) + + diff --git a/custom-recipes/filtering-node-neighbors/recipe.json b/custom-recipes/filtering-node-neighbors/recipe.json deleted file mode 100644 index a6f600a..0000000 --- a/custom-recipes/filtering-node-neighbors/recipe.json +++ /dev/null @@ -1,71 +0,0 @@ -// This file is the descriptor for the Custom code recipe filtering-node-neighbors -{ - // Meta data for display purposes - "meta": { - // label: name of the recipe as displayed, should be short - "label": "Filtering node neighbors", - // description: longer string to help end users understand what this recipe does - "description": "", - // icon: must be one of the FontAwesome 3.2.1 icons, complete list here at https://fontawesome.com/v3.2.1/icons/ - "icon": "icon-puzzle-piece" - }, - - "kind": "PYTHON", - - - // Inputs and outputs are defined by roles. In the recipe's I/O tab, the user can associate one - // or more dataset to each input and output role. - - // The "arity" field indicates whether the user can associate several datasets to the role ('NARY') - // or at most one ('UNARY'). The "required" field indicates whether the user is allowed to - // associate no dataset with the role. - - "selectableFromDataset" : "Input Dataset", - "inputRoles" : [ - { - "name": "Input Dataset", - "arity": "UNARY", - "required": true, - "acceptsDataset": true - } - ], - "outputRoles" : [ - { - "name": "Output Dataset", - "arity": "UNARY", - "required": true, - "acceptsDataset": true - } - ], - - "params": [ - { - "name": "create_graph_of", - "label": "Create graph of", - "type": "COLUMN", - "columnRole":"Input Dataset" - }, - { - "name": "linked_by", - "label": "Linked by", - "type": "COLUMN", - "columnRole":"Input Dataset" - }, - { - "name": "weighted", - "label": "Weighted graph", - "type": "BOOLEAN", - "defaultValue": false, - "description": "" - } - ], - - // The field "resourceKeys" holds a list of keys that allows to limit the number - // of concurrent executions and activities triggered by this recipe. - // - // Administrators can configure the limit per resource key in the Administration > Settings > Flow build - // screen. - - "resourceKeys": [] - -} From 1d4da2a456fd8f4fe4673fc342592ddd467fa634 Mon Sep 17 00:00:00 2001 From: matt6697 <32440697+matt6697@users.noreply.github.com> Date: Fri, 10 Nov 2023 11:09:48 +0100 Subject: [PATCH 3/4] Use multiple columns as edge labels --- custom-recipes/ego-graph-from-nodes/recipe.json | 2 +- custom-recipes/ego-graph-from-nodes/recipe.py | 5 +++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/custom-recipes/ego-graph-from-nodes/recipe.json b/custom-recipes/ego-graph-from-nodes/recipe.json index f0a0f5d..3025633 100644 --- a/custom-recipes/ego-graph-from-nodes/recipe.json +++ b/custom-recipes/ego-graph-from-nodes/recipe.json @@ -41,7 +41,7 @@ { "name": "edges_label", "label": "Edges label", - "type": "COLUMN", + "type": "COLUMNS", "columnRole":"Input Dataset" }, { diff --git a/custom-recipes/ego-graph-from-nodes/recipe.py b/custom-recipes/ego-graph-from-nodes/recipe.py index 1d94af1..c709acb 100644 --- a/custom-recipes/ego-graph-from-nodes/recipe.py +++ b/custom-recipes/ego-graph-from-nodes/recipe.py @@ -38,7 +38,8 @@ def create_ego_graph(graph, node, radius=1, predecessors=False): columns = [] columns.append(recipe_config['source_nodes']) columns.append(recipe_config['target_nodes']) -columns.append(recipe_config['edges_label']) +for col in recipe_config['edges_label']: + columns.append(col) # Recipe input input_df = input_dataset.get_dataframe(columns=columns) @@ -66,7 +67,7 @@ def create_ego_graph(graph, node, radius=1, predecessors=False): # Write output dataframe output_df = pd.DataFrame(list(node_ego_graph.edges(data=True))) -output_df.columns = [recipe_config['source_nodes'], recipe_config['target_nodes'], recipe_config['edges_label']] +output_df.columns = [recipe_config['source_nodes'], recipe_config['target_nodes'], "Edges labels"] logging.info("Ego graph - Ego graph computed") From 10f5a431d948c6f91d172f548d35c286ad48c581 Mon Sep 17 00:00:00 2001 From: matt6697 <32440697+matt6697@users.noreply.github.com> Date: Tue, 5 Dec 2023 13:25:17 +0100 Subject: [PATCH 4/4] Code cleanup --- custom-recipes/ego-graph-from-nodes/recipe.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/custom-recipes/ego-graph-from-nodes/recipe.py b/custom-recipes/ego-graph-from-nodes/recipe.py index c709acb..0719c75 100644 --- a/custom-recipes/ego-graph-from-nodes/recipe.py +++ b/custom-recipes/ego-graph-from-nodes/recipe.py @@ -55,10 +55,6 @@ def create_ego_graph(graph, node, radius=1, predecessors=False): # Creating the directed graph graph = nx.from_pandas_edgelist(deduplicated_df, recipe_config['source_nodes'], recipe_config['target_nodes'], recipe_config['edges_label'], nx.DiGraph) -# graph = nx.DiGraph() -# graph.add_nodes_from(deduplicated_df[recipe_config['source_nodes']].unique()) -# graph.add_nodes_from(deduplicated_df[recipe_config['target_nodes']].unique()) -# graph.add_edges_from(zip(deduplicated_df[recipe_config['source_nodes']], deduplicated_df[recipe_config['target_nodes']], deduplicated_df[recipe_config['edges_label']])) logging.info("Base NetworkX graph created") node_ego_graph = nx.DiGraph()