From cbd0a5f383f2395c4a594e3003799f84f8c465f5 Mon Sep 17 00:00:00 2001 From: sama Date: Mon, 7 Jul 2025 02:27:09 -0600 Subject: [PATCH 01/12] added coauthor cs dataset --- easygraph/datasets/__init__.py | 19 +++-- easygraph/datasets/citation_graph.py | 2 +- easygraph/datasets/coauthor.py | 112 +++++++++++++++++++++++++++ 3 files changed, 124 insertions(+), 9 deletions(-) create mode 100644 easygraph/datasets/coauthor.py diff --git a/easygraph/datasets/__init__.py b/easygraph/datasets/__init__.py index d303baa9..30869616 100644 --- a/easygraph/datasets/__init__.py +++ b/easygraph/datasets/__init__.py @@ -1,3 +1,4 @@ +# risky imports try: from easygraph.datasets.get_sample_graph import * from easygraph.datasets.gnn_benchmark import * @@ -7,17 +8,19 @@ from easygraph.datasets.hypergraph.House_Committees import House_Committees from easygraph.datasets.karate import KarateClubDataset from easygraph.datasets.mathoverflow_answers import mathoverflow_answers - - from .citation_graph import CitationGraphDataset - from .citation_graph import CiteseerGraphDataset - from .citation_graph import CoraBinary - from .citation_graph import CoraGraphDataset - from .citation_graph import PubmedGraphDataset from .ppi import LegacyPPIDataset from .ppi import PPIDataset - -except: +except Exception as e: print( " Please install Pytorch before use graph-related datasets and" " hypergraph-related datasets." 
) + +from .citation_graph import ( + CitationGraphDataset, + CiteseerGraphDataset, + CoraBinary, + CoraGraphDataset, + PubmedGraphDataset, +) +from .coauthor import CoauthorCSDataset diff --git a/easygraph/datasets/citation_graph.py b/easygraph/datasets/citation_graph.py index a7e268d9..3bb2fead 100644 --- a/easygraph/datasets/citation_graph.py +++ b/easygraph/datasets/citation_graph.py @@ -55,7 +55,7 @@ class CitationGraphDataset(EasyGraphBuiltinDataset): """ _urls = { "cora_v2": "dataset/cora_v2.zip", - "citeseer": "dataset/citeSeer.zip", + "citeseer": "dataset/citeseer.zip", "pubmed": "dataset/pubmed.zip", } diff --git a/easygraph/datasets/coauthor.py b/easygraph/datasets/coauthor.py new file mode 100644 index 00000000..b9a1166b --- /dev/null +++ b/easygraph/datasets/coauthor.py @@ -0,0 +1,112 @@ +"""CoauthorCS Dataset + +This dataset contains a co-authorship network of authors who submitted papers to CS category. +Each node represents an author and edges represent co-authorships. +Node features are bag-of-words representations of keywords in the author's papers. +The task is node classification, with labels indicating the primary field of study. + +Statistics: +- Nodes: 18333 +- Edges: 81894 +- Feature Dim: 6805 +- Classes: 15 + +Source: https://github.com/dmlc/dgl/tree/master/examples/pytorch/cluster_gcn +""" + +import os +import numpy as np +import easygraph as eg +import scipy.sparse as sp + +from easygraph.classes.graph import Graph +from .graph_dataset_base import EasyGraphBuiltinDataset +from .utils import download, extract_archive, tensor, data_type_dict + + +class CoauthorCSDataset(EasyGraphBuiltinDataset): + r"""CoauthorCS citation network dataset. + + Nodes are authors, and edges indicate co-authorship relationships. Each node + has a bag-of-words feature vector and a label denoting the primary research field. + + Parameters + ---------- + raw_dir : str, optional + Directory to store the raw downloaded files. 
Default: None + force_reload : bool, optional + Whether to re-download and process the dataset. Default: False + verbose : bool, optional + Whether to print detailed processing logs. Default: True + transform : callable, optional + Transform to apply to the graph on access. + + Examples + -------- + >>> from easygraph.datasets import CoauthorCSDataset + >>> dataset = CoauthorCSDataset() + >>> g = dataset[0] + >>> print("Nodes:", g.number_of_nodes()) + >>> print("Edges:", g.number_of_edges()) + >>> print("Feature shape:", g.nodes[0]['feat'].shape) + >>> print("Label:", g.nodes[0]['label']) + >>> print("Number of classes:", dataset.num_classes) + """ + def __init__(self, raw_dir=None, force_reload=False, verbose=True, transform=None): + name = "coauthor_cs" + url = "https://data.dgl.ai/dataset/coauthor_cs.zip" + super(CoauthorCSDataset, self).__init__( + name=name, + url=url, + raw_dir=raw_dir, + force_reload=force_reload, + verbose=verbose, + transform=transform, + ) + + def process(self): + path = os.path.join(self.raw_path, "coauthor_cs.npz") + data = np.load(path) + + # Reconstruct adjacency matrix + adj = sp.csr_matrix( + (data["adj_data"], data["adj_indices"], data["adj_indptr"]), + shape=data["adj_shape"], + ) + + # Reconstruct feature matrix + features = sp.csr_matrix( + (data["attr_data"], data["attr_indices"], data["attr_indptr"]), + shape=data["attr_shape"], + ).todense() + + labels = data["labels"] + + g = eg.Graph() + g.add_edges_from(list(zip(*adj.nonzero()))) + + for i in range(features.shape[0]): + g.add_node(i, feat=np.array(features[i]).squeeze(), label=int(labels[i])) + + self._g = g + self._num_classes = len(np.unique(labels)) + + if self.verbose: + print("Finished loading CoauthorCS dataset.") + print(f" NumNodes: {g.number_of_nodes()}") + print(f" NumEdges: {g.number_of_edges()}") + print(f" NumFeats: {features.shape[1]}") + print(f" NumClasses: {self._num_classes}") + + def __getitem__(self, idx): + assert idx == 0, "CoauthorCSDataset only 
contains one graph" + if self._g is None: + raise ValueError("Graph has not been loaded or processed correctly.") + return self._g if self._transform is None else self._transform(self._g) + + def __len__(self): + return 1 + + @property + def num_classes(self): + return self._num_classes From 576eb0556a4ee959a25df98c204922295b82a02a Mon Sep 17 00:00:00 2001 From: sama Date: Mon, 7 Jul 2025 02:38:31 -0600 Subject: [PATCH 02/12] added amazon photos dataset --- easygraph/amazon_computers.py | 0 easygraph/datasets/__init__.py | 1 + easygraph/datasets/amazon_photo.py | 105 +++++++++++++++++++++++++++++ easygraph/datasets/coauthor.py | 2 +- 4 files changed, 107 insertions(+), 1 deletion(-) create mode 100644 easygraph/amazon_computers.py create mode 100644 easygraph/datasets/amazon_photo.py diff --git a/easygraph/amazon_computers.py b/easygraph/amazon_computers.py new file mode 100644 index 00000000..e69de29b diff --git a/easygraph/datasets/__init__.py b/easygraph/datasets/__init__.py index 30869616..d32afdcf 100644 --- a/easygraph/datasets/__init__.py +++ b/easygraph/datasets/__init__.py @@ -24,3 +24,4 @@ PubmedGraphDataset, ) from .coauthor import CoauthorCSDataset +from .amazon_photo import AmazonPhotoDataset \ No newline at end of file diff --git a/easygraph/datasets/amazon_photo.py b/easygraph/datasets/amazon_photo.py new file mode 100644 index 00000000..29156976 --- /dev/null +++ b/easygraph/datasets/amazon_photo.py @@ -0,0 +1,105 @@ +import os +import numpy as np +import easygraph as eg +import scipy.sparse as sp + +from easygraph.classes.graph import Graph +from .graph_dataset_base import EasyGraphBuiltinDataset +from .utils import download, extract_archive, tensor, data_type_dict + + +class AmazonPhotoDataset(EasyGraphBuiltinDataset): + r"""Amazon Electronics Photo co-purchase graph dataset. + + Nodes represent products, and edges link products frequently co-purchased. + Node features are bag-of-words of product reviews. 
The task is to classify + the product category. + + Statistics: + + - Nodes: 7,650 + - Edges: 119,081 + - Number of Classes: 8 + - Features: 745 + + Parameters + ---------- + raw_dir : str, optional + Raw file directory to download/contains the input data directory. Default: None + force_reload : bool, optional + Whether to reload the dataset. Default: False + verbose : bool, optional + Whether to print out progress information. Default: True + transform : callable, optional + A transform that takes in a :class:`~easygraph.Graph` object and returns + a transformed version. The :class:`~easygraph.Graph` object will be + transformed before every access. + + Examples + -------- + >>> from easygraph.datasets import AmazonPhotoDataset + >>> dataset = AmazonPhotoDataset() + >>> g = dataset[0] + >>> print(g.number_of_nodes()) + >>> print(g.number_of_edges()) + >>> print(g.nodes[0]['feat'].shape) + >>> print(g.nodes[0]['label']) + >>> print(dataset.num_classes) + """ + + def __init__(self, raw_dir=None, force_reload=False, verbose=True, transform=None): + name = "amazon_photo" + url = "https://data.dgl.ai/dataset/amazon_co_buy_photo.zip" + super(AmazonPhotoDataset, self).__init__( + name=name, + url=url, + raw_dir=raw_dir, + force_reload=force_reload, + verbose=verbose, + transform=transform, + ) + + def process(self): + path = os.path.join(self.raw_path, "amazon_co_buy_photo.npz") + data = np.load(path) + + adj = sp.csr_matrix( + (data["adj_data"], data["adj_indices"], data["adj_indptr"]), + shape=data["adj_shape"], + ) + + features = sp.csr_matrix( + (data["attr_data"], data["attr_indices"], data["attr_indptr"]), + shape=data["attr_shape"], + ).todense() + + labels = data["labels"] + + g = eg.Graph() + g.add_edges_from(list(zip(*adj.nonzero()))) + + for i in range(features.shape[0]): + g.add_node(i, feat=np.array(features[i]).squeeze(), label=int(labels[i])) + + self._g = g + self._num_classes = len(np.unique(labels)) + + if self.verbose: + print("Finished loading 
AmazonPhoto dataset.") + print(f" NumNodes: {g.number_of_nodes()}") + print(f" NumEdges: {g.number_of_edges()}") + print(f" NumFeats: {features.shape[1]}") + print(f" NumClasses: {self._num_classes}") + + def __getitem__(self, idx): + assert idx == 0, "AmazonPhotoDataset only contains one graph" + if self._g is None: + raise ValueError("Graph has not been loaded or processed correctly.") + return self._g if self._transform is None else self._transform(self._g) + + def __len__(self): + return 1 + + @property + def num_classes(self): + return self._num_classes diff --git a/easygraph/datasets/coauthor.py b/easygraph/datasets/coauthor.py index b9a1166b..1df6d3a9 100644 --- a/easygraph/datasets/coauthor.py +++ b/easygraph/datasets/coauthor.py @@ -25,7 +25,7 @@ class CoauthorCSDataset(EasyGraphBuiltinDataset): - r"""CoauthorCS citation network dataset. + r"""CoauthorCS citation network dataset. Nodes are authors, and edges indicate co-authorship relationships. Each node has a bag-of-words feature vector and a label denoting the primary research field. From 8355bc31d05c2ed8806cf0930ea89d554624c258 Mon Sep 17 00:00:00 2001 From: sama Date: Mon, 7 Jul 2025 03:28:31 -0600 Subject: [PATCH 03/12] added reddit dataset --- easygraph/amazon_computers.py | 103 +++++++++++++++++++++++++++++++++ easygraph/datasets/__init__.py | 3 +- easygraph/datasets/reddit.py | 82 ++++++++++++++++++++++++++ 3 files changed, 187 insertions(+), 1 deletion(-) create mode 100644 easygraph/datasets/reddit.py diff --git a/easygraph/amazon_computers.py b/easygraph/amazon_computers.py index e69de29b..06c27beb 100644 --- a/easygraph/amazon_computers.py +++ b/easygraph/amazon_computers.py @@ -0,0 +1,103 @@ +import os +import numpy as np +import easygraph as eg +import scipy.sparse as sp + +from easygraph.classes.graph import Graph +from .graph_dataset_base import EasyGraphBuiltinDataset + + +class AmazonComputersDataset(EasyGraphBuiltinDataset): + r"""Amazon Computers co-purchase graph dataset. 
+ + Nodes represent computer products, and edges link products frequently co-purchased. + Node features are bag-of-words of product reviews. The task is to classify + the product category. + + Statistics: + - Nodes: 13,752 + - Edges: 245,861 + - Number of Classes: 10 + - Features: 767 + + Parameters + ---------- + raw_dir : str, optional + Raw file directory to download/contains the input data directory. Default: None + force_reload : bool, optional + Whether to reload the dataset. Default: False + verbose : bool, optional + Whether to print out progress information. Default: True + transform : callable, optional + A transform that takes in a :class:`~easygraph.Graph` object and returns + a transformed version. The :class:`~easygraph.Graph` object will be + transformed before every access. + + Examples + -------- + >>> from easygraph.datasets import AmazonComputersDataset + >>> dataset = AmazonComputersDataset() + >>> g = dataset[0] + >>> print(g.number_of_nodes()) + >>> print(g.number_of_edges()) + >>> print(g.nodes[0]['feat'].shape) + >>> print(g.nodes[0]['label']) + >>> print(dataset.num_classes) + """ + + def __init__(self, raw_dir=None, force_reload=False, verbose=True, transform=None): + name = "amazon_computers" + url = "https://data.dgl.ai/dataset/amazon_co_buy_computers.zip" + super(AmazonComputersDataset, self).__init__( + name=name, + url=url, + raw_dir=raw_dir, + force_reload=force_reload, + verbose=verbose, + transform=transform, + ) + + def process(self): + path = os.path.join(self.raw_path, "amazon_co_buy_computers.npz") + data = np.load(path) + + adj = sp.csr_matrix( + (data["adj_data"], data["adj_indices"], data["adj_indptr"]), + shape=data["adj_shape"], + ) + + features = sp.csr_matrix( + (data["attr_data"], data["attr_indices"], data["attr_indptr"]), + shape=data["attr_shape"], + ).todense() + + labels = data["labels"] + + g = eg.Graph() + g.add_edges_from(list(zip(*adj.nonzero()))) + + for i in range(features.shape[0]): + g.add_node(i, 
feat=np.array(features[i]).squeeze(), label=int(labels[i])) + + self._g = g + self._num_classes = len(np.unique(labels)) + + if self.verbose: + print("Finished loading AmazonComputers dataset.") + print(f" NumNodes: {g.number_of_nodes()}") + print(f" NumEdges: {g.number_of_edges()}") + print(f" NumFeats: {features.shape[1]}") + print(f" NumClasses: {self._num_classes}") + + def __getitem__(self, idx): + assert idx == 0, "AmazonComputersDataset only contains one graph" + if self._g is None: + raise ValueError("Graph has not been loaded or processed correctly.") + return self._g if self._transform is None else self._transform(self._g) + + def __len__(self): + return 1 + + @property + def num_classes(self): + return self._num_classes diff --git a/easygraph/datasets/__init__.py b/easygraph/datasets/__init__.py index d32afdcf..2624c2d7 100644 --- a/easygraph/datasets/__init__.py +++ b/easygraph/datasets/__init__.py @@ -24,4 +24,5 @@ PubmedGraphDataset, ) from .coauthor import CoauthorCSDataset -from .amazon_photo import AmazonPhotoDataset \ No newline at end of file +from .amazon_photo import AmazonPhotoDataset +from .reddit import RedditDataset \ No newline at end of file diff --git a/easygraph/datasets/reddit.py b/easygraph/datasets/reddit.py new file mode 100644 index 00000000..d15bafde --- /dev/null +++ b/easygraph/datasets/reddit.py @@ -0,0 +1,82 @@ +import os +import numpy as np +import easygraph as eg +import scipy.sparse as sp + +from easygraph.classes.graph import Graph +from .graph_dataset_base import EasyGraphBuiltinDataset +from .utils import download, extract_archive, tensor, data_type_dict + +class RedditDataset(EasyGraphBuiltinDataset): + r"""Reddit posts graph (Sept 2014) for community (subreddit) classification. + + Statistics: + - Nodes: ~232,965 + - Edges: ~114 million (approx.) + - Features per node: 602 + - Classes: number of subreddit communities + + Data are split by post-day: first 20 days train, then validation (30%), test (rest). 
+ + Parameters + ---------- + self_loop : bool + Add self-loop edges if True. + raw_dir, force_reload, verbose, transform : same as EasyGraphBuiltinDataset + """ + def __init__(self, self_loop=False, raw_dir=None, force_reload=False, + verbose=True, transform=None): + name = "reddit" + url = "https://data.dgl.ai/dataset/reddit.zip" + self.self_loop = self_loop + super().__init__(name=name, url=url, raw_dir=raw_dir, + force_reload=force_reload, verbose=verbose, + transform=transform) + + def process(self): + # Expect two files extracted: reddit_data.npz & reddit_graph.npz + data = np.load(os.path.join(self.raw_path, "reddit_data.npz")) + feat = data["feature"] # shape [N, 602] + labels = data["label"] # shape [N] + split = data["node_types"] # 1=train,2=val,3=test + + # Load adjacency + adj = sp.load_npz(os.path.join(self.raw_path, "reddit_graph.npz")) + src, dst = adj.nonzero() + if self.self_loop: + self_loops = np.arange(adj.shape[0]) + src = np.concatenate([src, self_loops]) + dst = np.concatenate([dst, self_loops]) + edges = list(zip(src, dst)) + + # Build graph + g = eg.Graph() + g.add_edges_from(edges) + + # Assign node features, labels, and masks + for i in range(feat.shape[0]): + g.add_node(i, feat=feat[i], label=int(labels[i]), + train_mask=(split[i] == 1), + val_mask=(split[i] == 2), + test_mask=(split[i] == 3)) + + self._g = g + self._num_classes = int(np.max(labels) + 1) + + if self.verbose: + print("Loaded Reddit dataset:") + print(f" NumNodes: {g.number_of_nodes()}") + print(f" NumEdges: {g.number_of_edges()}") + print(f" NumFeats: {feat.shape[1]}") + print(f" NumClasses: {self._num_classes}") + + def __getitem__(self, idx): + assert idx == 0, "RedditDataset only contains one graph" + return self._g if self.transform is None else self.transform(self._g) + + def __len__(self): + return 1 + + @property + def num_classes(self): + return self._num_classes From 282e6a25069ad8dc0e9f5962c3470621f40691d5 Mon Sep 17 00:00:00 2001 From: sama Date: Mon, 7 Jul 
2025 03:36:38 -0600 Subject: [PATCH 04/12] added flickr dataset --- easygraph/datasets/__init__.py | 3 +- easygraph/datasets/flickr.py | 107 +++++++++++++++++++++++++++++++++ 2 files changed, 109 insertions(+), 1 deletion(-) create mode 100644 easygraph/datasets/flickr.py diff --git a/easygraph/datasets/__init__.py b/easygraph/datasets/__init__.py index 2624c2d7..5fa6fd4c 100644 --- a/easygraph/datasets/__init__.py +++ b/easygraph/datasets/__init__.py @@ -25,4 +25,5 @@ ) from .coauthor import CoauthorCSDataset from .amazon_photo import AmazonPhotoDataset -from .reddit import RedditDataset \ No newline at end of file +from .reddit import RedditDataset +from .flickr import FlickrDataset diff --git a/easygraph/datasets/flickr.py b/easygraph/datasets/flickr.py new file mode 100644 index 00000000..8a226f84 --- /dev/null +++ b/easygraph/datasets/flickr.py @@ -0,0 +1,107 @@ +import os +import json +import numpy as np +import scipy.sparse as sp +import easygraph as eg +from easygraph.classes.graph import Graph +from .graph_dataset_base import EasyGraphBuiltinDataset +from .utils import tensor, data_type_dict + +class FlickrDataset(EasyGraphBuiltinDataset): + r"""Flickr dataset for node classification. + + Nodes are images and edges represent social tags co-occurrence. + Node features are precomputed image embeddings. Labels indicate image categories. + + Statistics: + - Nodes: 89,250 + - Edges: 899,756 + - Classes: 7 + - Feature dim: 500 + + Source: GraphSAINT (https://arxiv.org/abs/1907.04931) + + Parameters + ---------- + raw_dir : str, optional + Custom directory to download the dataset. Default: None (uses standard cache dir). + force_reload : bool, optional + Whether to re-download and reprocess. Default: False. + verbose : bool, optional + Whether to print loading progress. Default: False. + transform : callable, optional + A transform applied to the graph on access. + reorder : bool, optional + Whether to apply graph reordering for locality (requires torch). 
Default: False. + + Examples + -------- + >>> from easygraph.datasets import FlickrDataset + >>> ds = FlickrDataset(verbose=True) + >>> g = ds[0] + >>> print(g.number_of_nodes(), g.number_of_edges(), ds.num_classes) + >>> print(g.nodes[0]['feat'].shape, g.nodes[0]['label']) + """ + def __init__(self, raw_dir=None, force_reload=False, verbose=False, transform=None, reorder=False): + name = "flickr" + url = self._get_dgl_url("dataset/flickr.zip") + self._reorder = reorder + super(FlickrDataset, self).__init__(name=name, url=url, raw_dir=raw_dir, + force_reload=force_reload, + verbose=verbose, transform=transform) + + def process(self): + # Load adjacency + coo = sp.load_npz(os.path.join(self.raw_path, "adj_full.npz")) + g = eg.Graph() + g.add_edges_from(list(zip(*coo.nonzero()))) + + # Load features + feats = np.load(os.path.join(self.raw_path, "feats.npy")) + # Load labels + with open(os.path.join(self.raw_path, "class_map.json")) as f: + class_map = json.load(f) + labels = np.array([class_map[str(i)] for i in range(feats.shape[0])]) + + # Load train/val/test splits + with open(os.path.join(self.raw_path, "role.json")) as f: + role = json.load(f) + train_mask = np.zeros(feats.shape[0], dtype=bool); train_mask[role["tr"]] = True + val_mask = np.zeros(feats.shape[0], dtype=bool); val_mask[role["va"]] = True + test_mask = np.zeros(feats.shape[0], dtype=bool); test_mask[role["te"]] = True + + # Attach node data + for i in range(feats.shape[0]): + g.add_node(i, + feat=feats[i].astype(np.float32), + label=int(labels[i])) + g.graph["train_mask"] = train_mask + g.graph["val_mask"] = val_mask + g.graph["test_mask"] = test_mask + + self._g = g + self._num_classes = int(labels.max() + 1) + if self.verbose: + print("Loaded Flickr dataset") + print(f" Nodes: {g.number_of_nodes()}, Edges: {g.number_of_edges()}, Features: {feats.shape[1]}, Classes: {self._num_classes}") + + def __getitem__(self, idx): + assert idx == 0, "FlickrDataset contains only one graph" + g = self._g + # 
transfer mask info + g.graph["train_mask"] = g.graph.pop("train_mask") + g.graph["val_mask"] = g.graph.pop("val_mask") + g.graph["test_mask"] = g.graph.pop("test_mask") + return self._transform(g) if self._transform else g + + def __len__(self): + return 1 + + @property + def num_classes(self): + return self._num_classes + + @staticmethod + def _get_dgl_url(path): + from .utils import _get_dgl_url + return _get_dgl_url(path) From fd79e025399904e11303a746887416f738d45e9a Mon Sep 17 00:00:00 2001 From: sama Date: Mon, 7 Jul 2025 20:29:04 -0600 Subject: [PATCH 05/12] added facebook ego dataset --- easygraph/datasets/__init__.py | 1 + easygraph/datasets/facebook_ego.py | 104 +++++++++++++++++++++++++++++ 2 files changed, 105 insertions(+) create mode 100644 easygraph/datasets/facebook_ego.py diff --git a/easygraph/datasets/__init__.py b/easygraph/datasets/__init__.py index 5fa6fd4c..c329c687 100644 --- a/easygraph/datasets/__init__.py +++ b/easygraph/datasets/__init__.py @@ -27,3 +27,4 @@ from .amazon_photo import AmazonPhotoDataset from .reddit import RedditDataset from .flickr import FlickrDataset +from .facebook_ego import FacebookEgoNetDataset \ No newline at end of file diff --git a/easygraph/datasets/facebook_ego.py b/easygraph/datasets/facebook_ego.py new file mode 100644 index 00000000..505a594d --- /dev/null +++ b/easygraph/datasets/facebook_ego.py @@ -0,0 +1,104 @@ +"""Facebook Ego-Net Dataset + +This dataset contains a subset of Facebook’s social network collected from +survey participants in the SNAP EgoNet project. Nodes represent users, and +edges indicate friendship links between them. + +Each ego network is centered on a user and includes their friend connections +and friend-to-friend connections. The `.circles` files contain labeled groups +(i.e., communities) of friends identified by the ego user. + +This version processes all ego-nets as a single undirected graph. Node features +are not provided. 
Labels (circles) are optional and not included by default. + +Statistics (based on merged graph): +- Nodes: ~4,000+ +- Edges: ~88,000+ +- Features: None +- Classes: None + +Reference: +J. McAuley and J. Leskovec, “Learning to Discover Social Circles in Ego Networks,” +in NIPS, 2012. [https://snap.stanford.edu/data/egonets-Facebook.html] +""" + +import os +import easygraph as eg +from easygraph.classes.graph import Graph +from .graph_dataset_base import EasyGraphBuiltinDataset +from .utils import download, extract_archive + + +class FacebookEgoNetDataset(EasyGraphBuiltinDataset): + r"""Facebook Ego-Net social network dataset. + + Each node is a user, and edges represent friendship. The dataset + includes 10 ego networks centered on different users. + + Parameters + ---------- + raw_dir : str, optional + Directory to store the raw downloaded files. Default: None + force_reload : bool, optional + Whether to re-download and process the dataset. Default: False + verbose : bool, optional + Whether to print detailed processing logs. Default: True + transform : callable, optional + Optional transform to apply on the graph. 
+ + Examples + -------- + >>> from easygraph.datasets import FacebookEgoNetDataset + >>> dataset = FacebookEgoNetDataset() + >>> g = dataset[0] + >>> print("Nodes:", g.number_of_nodes()) + >>> print("Edges:", g.number_of_edges()) + """ + + def __init__(self, raw_dir=None, force_reload=False, verbose=True, transform=None): + name = "facebook" + url = "https://snap.stanford.edu/data/facebook.tar.gz" + super(FacebookEgoNetDataset, self).__init__( + name=name, + url=url, + raw_dir=raw_dir, + force_reload=force_reload, + verbose=verbose, + transform=transform, + ) + + def process(self): + parent_dir = os.path.join(self.raw_path, "facebook") + g = eg.Graph() + + # Iterate over all .edges files in the subdirectory + for filename in os.listdir(parent_dir): + if filename.endswith(".edges"): + edge_file = os.path.join(parent_dir, filename) + + with open(edge_file, 'r') as f: + for line in f: + u, v = map(int, line.strip().split()) + g.add_edge(u, v) + + self._g = g + self._num_nodes = g.number_of_nodes() + self._num_edges = g.number_of_edges() + + if self.verbose: + print("Finished loading Facebook Ego-Net dataset.") + print(f" NumNodes: {self._num_nodes}") + print(f" NumEdges: {self._num_edges}") + + def __getitem__(self, idx): + assert idx == 0, "FacebookEgoNetDataset only contains one merged graph" + return self._g if self._transform is None else self._transform(self._g) + + def __len__(self): + return 1 + def download(self): + r"""Automatically download data and extract it.""" + if self.url is not None: + archive_path = os.path.join(self.raw_dir, self.name + ".tar.gz") + download(self.url, path=archive_path) + extract_archive(archive_path, self.raw_path) \ No newline at end of file From 517ad2e6d5e64bb45f20adfcace0f914e06661ca Mon Sep 17 00:00:00 2001 From: sama Date: Mon, 7 Jul 2025 20:49:13 -0600 Subject: [PATCH 06/12] added web-google dataset --- easygraph/datasets/graph_dataset_base.py | 1 - easygraph/datasets/web_google.py | 113 +++++++++++++++++++++++ 2 files 
changed, 113 insertions(+), 1 deletion(-) create mode 100644 easygraph/datasets/web_google.py diff --git a/easygraph/datasets/graph_dataset_base.py b/easygraph/datasets/graph_dataset_base.py index 4f433e81..1077ddf7 100644 --- a/easygraph/datasets/graph_dataset_base.py +++ b/easygraph/datasets/graph_dataset_base.py @@ -8,7 +8,6 @@ import os import sys import traceback - from ..utils import retry_method_with_fix from .utils import download from .utils import extract_archive diff --git a/easygraph/datasets/web_google.py b/easygraph/datasets/web_google.py new file mode 100644 index 00000000..68b5360e --- /dev/null +++ b/easygraph/datasets/web_google.py @@ -0,0 +1,113 @@ +"""Web-Google Dataset + +This dataset is a web graph based on Google's web pages and their hyperlink +structure, as crawled by the Stanford WebBase project in 2002. + +Each node represents a web page, and a directed edge from u to v indicates +a hyperlink from page u to page v. + +Statistics: +- Nodes: 875713 +- Edges: 5105039 +- Features: None +- Labels: None + +Reference: +J. Leskovec, A. Rajaraman, J. Ullman, “Mining of Massive Datasets.” +Dataset from SNAP: https://snap.stanford.edu/data/web-Google.html +""" + +import os +import easygraph as eg +from easygraph.classes.graph import Graph +from .graph_dataset_base import EasyGraphBuiltinDataset +from .utils import download, extract_archive +import gzip +import shutil + + +class WebGoogleDataset(EasyGraphBuiltinDataset): + r"""Web-Google hyperlink network dataset. + + Parameters + ---------- + raw_dir : str, optional + Directory to store the raw downloaded files. Default: None + force_reload : bool, optional + Whether to re-download and process the dataset. Default: False + verbose : bool, optional + Whether to print detailed processing logs. Default: True + transform : callable, optional + Optional transform to apply on the graph. 
+ + Examples + -------- + >>> from easygraph.datasets import WebGoogleDataset + >>> dataset = WebGoogleDataset() + >>> g = dataset[0] + >>> print("Nodes:", g.number_of_nodes()) + >>> print("Edges:", g.number_of_edges()) + """ + + def __init__(self, raw_dir=None, force_reload=False, verbose=True, transform=None): + name = "web-Google" + url = "https://snap.stanford.edu/data/web-Google.txt.gz" + super(WebGoogleDataset, self).__init__( + name=name, + url=url, + raw_dir=raw_dir, + force_reload=force_reload, + verbose=verbose, + transform=transform, + ) + + def download(self): + r"""Download and extract .gz edge list.""" + if self.url is not None: + file_path = os.path.join(self.raw_dir, self.name + ".txt.gz") + download(self.url, path=file_path) + extract_archive(file_path, self.raw_path) + + def process(self): + graph = eg.DiGraph() # Web-Google is directed + edge_list_path = os.path.join(self.raw_path, self.name + ".txt") + + with open(edge_list_path, 'r') as f: + for line in f: + if line.startswith('#') or line.strip() == "": + continue + u, v = map(int, line.strip().split()) + graph.add_edge(u, v) + + self._g = graph + self._num_nodes = graph.number_of_nodes() + self._num_edges = graph.number_of_edges() + + if self.verbose: + print("Finished loading Web-Google dataset.") + print(f" NumNodes: {self._num_nodes}") + print(f" NumEdges: {self._num_edges}") + + def __getitem__(self, idx): + assert idx == 0, "WebGoogleDataset only contains one graph" + return self._g if self._transform is None else self._transform(self._g) + + def __len__(self): + return 1 + def download(self): + r"""Download and decompress the .txt.gz file.""" + if self.url is not None: + compressed_path = os.path.join(self.raw_dir, self.name + ".txt.gz") + extracted_path = os.path.join(self.raw_path, self.name + ".txt") + + # Download .gz file + download(self.url, path=compressed_path) + + # Ensure output directory exists + if not os.path.exists(self.raw_path): + os.makedirs(self.raw_path) + + # 
Decompress manually + with gzip.open(compressed_path, 'rb') as f_in: + with open(extracted_path, 'wb') as f_out: + shutil.copyfileobj(f_in, f_out) \ No newline at end of file From 3d59a68e984bca041c73dfa1d586445826605a32 Mon Sep 17 00:00:00 2001 From: sama Date: Mon, 7 Jul 2025 20:53:19 -0600 Subject: [PATCH 07/12] finished roadnet dataset --- easygraph/datasets/__init__.py | 3 +- easygraph/datasets/roadnet.py | 104 +++++++++++++++++++++++++++++++++ 2 files changed, 106 insertions(+), 1 deletion(-) create mode 100644 easygraph/datasets/roadnet.py diff --git a/easygraph/datasets/__init__.py b/easygraph/datasets/__init__.py index c329c687..ab5d4dd5 100644 --- a/easygraph/datasets/__init__.py +++ b/easygraph/datasets/__init__.py @@ -27,4 +27,5 @@ from .amazon_photo import AmazonPhotoDataset from .reddit import RedditDataset from .flickr import FlickrDataset -from .facebook_ego import FacebookEgoNetDataset \ No newline at end of file +from .facebook_ego import FacebookEgoNetDataset +from .roadnet import RoadNetCADataset \ No newline at end of file diff --git a/easygraph/datasets/roadnet.py b/easygraph/datasets/roadnet.py new file mode 100644 index 00000000..62e5203f --- /dev/null +++ b/easygraph/datasets/roadnet.py @@ -0,0 +1,104 @@ +"""RoadNet-CA Dataset + +This dataset represents the road network of California. +Nodes correspond to intersections, and edges represent roads connecting them. + +The data is undirected and unweighted. No features or labels are provided. + +Statistics: +- Nodes: 1,965,206 +- Edges: 2,766,607 +- Features: None +- Labels: None + +Reference: +J. Leskovec and A. 
class RoadNetCADataset(EasyGraphBuiltinDataset):
    r"""Road network of California (RoadNet-CA).

    Nodes are road intersections/endpoints and undirected edges are the
    road segments connecting them.

    Parameters
    ----------
    raw_dir : str, optional
        Directory to store the raw downloaded files. Default: None
    force_reload : bool, optional
        Whether to re-download and process the dataset. Default: False
    verbose : bool, optional
        Whether to print detailed processing logs. Default: True
    transform : callable, optional
        Optional transform to apply on the graph.

    Examples
    --------
    >>> from easygraph.datasets import RoadNetCADataset
    >>> dataset = RoadNetCADataset()
    >>> g = dataset[0]
    >>> print("Nodes:", g.number_of_nodes())
    >>> print("Edges:", g.number_of_edges())
    """

    def __init__(self, raw_dir=None, force_reload=False, verbose=True, transform=None):
        name = "roadNet-CA"
        url = "https://snap.stanford.edu/data/roadNet-CA.txt.gz"
        super(RoadNetCADataset, self).__init__(
            name=name,
            url=url,
            raw_dir=raw_dir,
            force_reload=force_reload,
            verbose=verbose,
            transform=transform,
        )

    def download(self):
        r"""Download the .txt.gz archive and decompress it into ``raw_path``."""
        compressed_path = os.path.join(self.raw_dir, self.name + ".txt.gz")
        extracted_path = os.path.join(self.raw_path, self.name + ".txt")

        download(self.url, path=compressed_path)

        # exist_ok avoids the race between the existence check and creation
        # that the previous exists()/makedirs() pair had.
        os.makedirs(self.raw_path, exist_ok=True)

        with gzip.open(compressed_path, "rb") as f_in:
            with open(extracted_path, "wb") as f_out:
                shutil.copyfileobj(f_in, f_out)

    def process(self):
        r"""Parse the extracted edge list into an undirected easygraph ``Graph``."""
        graph = eg.Graph()  # road network is undirected
        edge_list_path = os.path.join(self.raw_path, self.name + ".txt")

        with open(edge_list_path, "r") as f:
            for line in f:
                # Skip SNAP comment headers and blank lines.
                if line.startswith("#") or line.strip() == "":
                    continue
                parts = line.split()
                if len(parts) != 2:
                    continue  # tolerate malformed lines instead of crashing
                u, v = map(int, parts)
                graph.add_edge(u, v)

        self._g = graph
        self._num_nodes = graph.number_of_nodes()
        self._num_edges = graph.number_of_edges()

        if self.verbose:
            print("Finished loading RoadNet-CA dataset.")
            print(f"  NumNodes: {self._num_nodes}")
            print(f"  NumEdges: {self._num_edges}")

    def __getitem__(self, idx):
        assert idx == 0, "RoadNetCADataset only contains one graph"
        return self._g if self._transform is None else self._transform(self._g)

    def __len__(self):
        return 1
class ArxivHEPTHDataset(EasyGraphBuiltinDataset):
    r"""Citation network of arXiv High Energy Physics - Theory (cit-HepTh).

    Nodes are papers; a directed edge from A to B means paper A cites
    paper B. The dataset carries no node features or labels.

    Parameters
    ----------
    raw_dir : str, optional
        Directory to store the raw downloaded files. Default: None
    force_reload : bool, optional
        Whether to re-download and process the dataset. Default: False
    verbose : bool, optional
        Whether to print detailed processing logs. Default: True
    transform : callable, optional
        Optional transform to apply on the graph.

    Examples
    --------
    >>> from easygraph.datasets import ArxivHEPTHDataset
    >>> dataset = ArxivHEPTHDataset()
    >>> g = dataset[0]
    >>> print("Nodes:", g.number_of_nodes())
    >>> print("Edges:", g.number_of_edges())
    """

    def __init__(self, raw_dir=None, force_reload=False, verbose=True, transform=None):
        super(ArxivHEPTHDataset, self).__init__(
            name="cit-HepTh",
            url="https://snap.stanford.edu/data/cit-HepTh.txt.gz",
            raw_dir=raw_dir,
            force_reload=force_reload,
            verbose=verbose,
            transform=transform,
        )

    def download(self):
        r"""Download and decompress the .txt.gz file."""
        src = os.path.join(self.raw_dir, self.name + ".txt.gz")
        dst = os.path.join(self.raw_path, self.name + ".txt")

        download(self.url, path=src)

        if not os.path.exists(self.raw_path):
            os.makedirs(self.raw_path)

        with gzip.open(src, "rb") as f_in, open(dst, "wb") as f_out:
            shutil.copyfileobj(f_in, f_out)

    def process(self):
        r"""Build the directed citation graph from the extracted edge list."""
        edge_list_path = os.path.join(self.raw_path, self.name + ".txt")
        graph = eg.DiGraph()  # Citation network is directed

        with open(edge_list_path, "r") as f:
            for raw_line in f:
                # Skip SNAP comment headers and blank lines.
                if raw_line.startswith("#") or raw_line.strip() == "":
                    continue
                src_id, dst_id = (int(tok) for tok in raw_line.strip().split())
                graph.add_edge(src_id, dst_id)

        self._g = graph
        self._num_nodes = graph.number_of_nodes()
        self._num_edges = graph.number_of_edges()

        if self.verbose:
            print("Finished loading Arxiv HEP-TH dataset.")
            print(f"  NumNodes: {self._num_nodes}")
            print(f"  NumEdges: {self._num_edges}")

    def __getitem__(self, idx):
        assert idx == 0, "ArxivHEPTHDataset only contains one graph"
        return self._g if self._transform is None else self._transform(self._g)

    def __len__(self):
        return 1
class GitHubUsersDataset(EasyGraphBuiltinDataset):
    r"""GitHub developers social graph (musae_git).

    Nodes are GitHub developers; a directed edge from user A to user B
    means A follows B. Each node carries a ``feat`` vector built from
    profile/activity features and a binary ``label`` (the ``ml_target``
    column: machine-learning vs. web developer).

    Parameters
    ----------
    raw_dir : str, optional
        Directory to store raw data. Default: None
    force_reload : bool, optional
        Force re-download and processing. Default: False
    verbose : bool, optional
        Print processing information. Default: True
    transform : callable, optional
        Transform to apply to the graph on load.

    Examples
    --------
    >>> from easygraph.datasets import GitHubUsersDataset
    >>> dataset = GitHubUsersDataset()
    >>> g = dataset[0]
    >>> print("Nodes:", g.number_of_nodes())
    >>> print("Edges:", g.number_of_edges())
    >>> print("Label:", g.nodes[0]['label'])
    """

    def __init__(self, raw_dir=None, force_reload=False, verbose=True, transform=None):
        super(GitHubUsersDataset, self).__init__(
            name="musae_git",
            url="https://snap.stanford.edu/data/git_web_ml.zip",
            raw_dir=raw_dir,
            force_reload=force_reload,
            verbose=verbose,
            transform=transform,
        )

    def download(self):
        r"""Download and unpack the musae_git zip archive."""
        archive = os.path.join(self.raw_dir, self.name + ".zip")
        download(self.url, path=archive)
        extract_archive(archive, self.raw_path)

    def process(self):
        r"""Build the directed follower graph and attach node features/labels."""
        g = eg.DiGraph()
        base_path = os.path.join(self.raw_path, "git_web_ml")

        # Node features: JSON mapping of node id (as string) -> feature list.
        with open(os.path.join(base_path, "musae_git_features.json"), "r") as f:
            features = json.load(f)

        # Node labels from the target CSV (id, ml_target).
        labels = {}
        with open(os.path.join(base_path, "musae_git_target.csv"), "r") as f:
            reader = csv.DictReader(f)
            for row in reader:
                labels[int(row["id"])] = int(row["ml_target"])

        # Directed follow edges.
        with open(os.path.join(base_path, "musae_git_edges.csv"), "r") as f:
            reader = csv.DictReader(f)
            for row in reader:
                g.add_edge(int(row["id_1"]), int(row["id_2"]))

        # Attach per-node attributes. Nodes missing a features entry get an
        # empty vector instead of raising KeyError (previously
        # features[str(node_id)] crashed on any such node); nodes missing a
        # label get -1, as before.
        feat_dim = 0
        for node_id in g.nodes:
            feat = np.asarray(features.get(str(node_id), []), dtype=np.float32)
            feat_dim = max(feat_dim, feat.shape[0])
            g.add_node(node_id, feat=feat, label=labels.get(node_id, -1))

        self._g = g
        self._num_classes = len(set(labels.values()))

        if self.verbose:
            print("Finished loading GitHub Users dataset.")
            print(f"  NumNodes: {g.number_of_nodes()}")
            print(f"  NumEdges: {g.number_of_edges()}")
            # feat_dim is tracked explicitly; the old code printed the
            # loop-leaked `feat`, a NameError whenever the graph was empty.
            print(f"  Feature dim: {feat_dim}")
            print(f"  NumClasses: {self._num_classes}")

    def __getitem__(self, idx):
        assert idx == 0, "GitHubUsersDataset only contains one graph"
        return self._g if self._transform is None else self._transform(self._g)

    def __len__(self):
        return 1

    @property
    def num_classes(self):
        r"""Number of distinct node labels (2 for musae_git)."""
        return self._num_classes
class TwitterEgoDataset(EasyGraphBuiltinDataset):
    r"""Twitter ego-network dataset (SNAP ego-Twitter, combined graph).

    The combined network contains roughly 81K users (nodes) and about
    1.8M edges collected from public Twitter data.

    Source: J. McAuley and J. Leskovec, Stanford SNAP, 2012
    URL: https://snap.stanford.edu/data/egonets-Twitter.html
    File used: https://snap.stanford.edu/data/twitter_combined.txt.gz

    Parameters
    ----------
    raw_dir : str, optional
        Directory to store the raw downloaded files. Default: None
    force_reload : bool, optional
        Whether to re-download and process the dataset. Default: False
    verbose : bool, optional
        Whether to print detailed processing logs. Default: True
    transform : callable, optional
        Optional transform to apply on the graph.
    """

    def __init__(self, raw_dir=None, force_reload=False, verbose=True, transform=None):
        # The new keyword arguments default to the previous hard-coded
        # values, so existing `TwitterEgoDataset()` callers are unaffected.
        super(TwitterEgoDataset, self).__init__(
            name="twitter_ego",
            url="https://snap.stanford.edu/data/twitter_combined.txt.gz",
            raw_dir=raw_dir,
            force_reload=force_reload,
            verbose=verbose,
            transform=transform,
        )

    def download(self):
        r"""Fetch the gzipped edge list; decompression happens in ``process``."""
        gz_path = os.path.join(self.raw_path, "twitter_combined.txt.gz")
        download(self.url, path=gz_path)
        # NOTE(review): the original also called extract_archive() on this
        # plain .gz file; a gzip stream is not an archive, and process()
        # already decompresses it lazily, so that call was dropped.

    def process(self):
        r"""Decompress (once) and load the combined edge list as an undirected graph."""
        gz_path = os.path.join(self.raw_path, "twitter_combined.txt.gz")
        txt_path = os.path.join(self.raw_path, "twitter_combined.txt")

        if not os.path.exists(txt_path):
            with gzip.open(gz_path, "rt") as f_in, open(txt_path, "w") as f_out:
                f_out.writelines(f_in)

        G = eg.Graph()
        with open(txt_path, "r") as f:
            for line in f:
                stripped = line.strip()
                if not stripped:
                    continue  # tolerate blank/trailing lines
                u, v = map(int, stripped.split())
                G.add_edge(u, v)

        self._graphs = [G]
        self._graph = G
        self._processed = True

    def __getitem__(self, idx):
        # Validate the index (the old code silently ignored it) and honor
        # the transform hook like the sibling datasets do.
        assert idx == 0, "TwitterEgoDataset only contains one graph"
        return self._graph if self._transform is None else self._transform(self._graph)

    def __len__(self):
        return 1
class WikiTopCatsDataset(EasyGraphBuiltinDataset):
    r"""Wikipedia Top Categories hyperlink network (SNAP wiki-topcats, 2011).

    Directed hyperlink graph over Wikipedia articles restricted to the
    largest strongly connected component of articles in top-level
    categories with at least 100 members. Each node carries a ``name``
    (page title) and a ``label`` (possibly empty, possibly overlapping
    list of category names).

    Parameters
    ----------
    raw_dir : str, optional
        Directory to store the raw downloaded files. Default: None
    force_reload : bool, optional
        Whether to re-download and process the dataset. Default: False
    verbose : bool, optional
        Whether to print detailed processing logs. Default: True
    transform : callable, optional
        Optional transform to apply on the graph.
    """

    def __init__(self, raw_dir=None, force_reload=False, verbose=True, transform=None):
        super(WikiTopCatsDataset, self).__init__(
            name="wiki_topcats",
            url="https://snap.stanford.edu/data/wiki-topcats.txt.gz",
            raw_dir=raw_dir,
            force_reload=force_reload,
            verbose=verbose,
            transform=transform,
        )

    def download(self):
        r"""Download the edge list plus the category and page-name side files."""
        gz_path = os.path.join(self.raw_dir, "wiki-topcats.txt.gz")
        download(self.url, path=gz_path)

        cat_url = "https://snap.stanford.edu/data/wiki-topcats-categories.txt.gz"
        names_url = "https://snap.stanford.edu/data/wiki-topcats-page-names.txt.gz"
        download(cat_url, path=os.path.join(self.raw_dir, "wiki-topcats-categories.txt.gz"))
        download(names_url, path=os.path.join(self.raw_dir, "wiki-topcats-page-names.txt.gz"))

    def process(self):
        r"""Build the directed graph and attach page names and category labels."""
        raw = self.raw_dir

        # Decompress the edge list once, then stream it line by line.
        edge_gz = os.path.join(raw, "wiki-topcats.txt.gz")
        edge_txt = os.path.join(raw, "wiki-topcats.txt")
        if not os.path.exists(edge_txt):
            with gzip.open(edge_gz, "rt") as fin, open(edge_txt, "w") as fout:
                fout.writelines(fin)

        G = eg.DiGraph()
        edge_count = 0
        with open(edge_txt, "r") as f:
            for line in f:
                stripped = line.strip()
                if not stripped:
                    continue
                u, v = map(int, stripped.split())
                G.add_edge(u, v)
                edge_count += 1
        if self.verbose:
            print(f"Loaded graph: {G.number_of_nodes()} nodes, {edge_count} edges")

        # Page names: each line is "<node_id> <page title>".
        # NOTE(review): assumes the SNAP layout with a leading node id per
        # line — confirm against a downloaded copy.
        names_gz = os.path.join(raw, "wiki-topcats-page-names.txt.gz")
        names = {}
        with gzip.open(names_gz, "rt") as f:
            for line in f:
                node_str, _, title = line.strip().partition(" ")
                if node_str.isdigit():
                    names[int(node_str)] = title

        # Categories: each line is "Category:<name>; <node> <node> ...".
        # The old code keyed categories by *line index*, which attached a
        # category's member list to an unrelated node id; invert the file
        # into a node -> [category name] mapping instead.
        cats_gz = os.path.join(raw, "wiki-topcats-categories.txt.gz")
        labels = {}
        with gzip.open(cats_gz, "rt") as f:
            for line in f:
                cat_name, sep, members = line.strip().partition(";")
                if not sep:
                    continue  # skip lines without a member list
                cat_name = cat_name.strip()
                for tok in members.split():
                    labels.setdefault(int(tok), []).append(cat_name)

        # Attach name/label attributes to every node in the graph.
        for n in G.nodes:
            G.add_node(n, name=names.get(n, ""), label=labels.get(n, []))

        self._graph = G
        self._graphs = [G]
        self._processed = True

    def __getitem__(self, idx):
        assert idx == 0
        return self._graph

    def __len__(self):
        return 1
.wiki_topcats import WikiTopCatsDataset \ No newline at end of file From 0a04eae6a529cc26cb5fa2a7c6cc6cb4844eaf18 Mon Sep 17 00:00:00 2001 From: sama Date: Tue, 8 Jul 2025 01:54:16 -0600 Subject: [PATCH 12/12] fixed linter errors --- easygraph/amazon_computers.py | 103 -------------------- easygraph/datasets/__init__.py | 25 +++-- easygraph/datasets/amazon_photo.py | 9 +- easygraph/datasets/arxiv.py | 15 +-- easygraph/datasets/citation_graph.py | 4 +- easygraph/datasets/coauthor.py | 10 +- easygraph/datasets/dynamic/email_enron.py | 3 +- easygraph/datasets/dynamic/email_eu.py | 3 +- easygraph/datasets/dynamic/hospital_lyon.py | 7 +- easygraph/datasets/facebook_ego.py | 23 +++-- easygraph/datasets/flickr.py | 50 +++++++--- easygraph/datasets/github.py | 14 +-- easygraph/datasets/graph_dataset_base.py | 4 +- easygraph/datasets/ppi.py | 3 +- easygraph/datasets/reddit.py | 50 +++++++--- easygraph/datasets/roadnet.py | 15 +-- easygraph/datasets/twitter_ego.py | 13 ++- easygraph/datasets/web_google.py | 21 ++-- easygraph/datasets/wiki_topcats.py | 17 +++- easygraph/model/hypergraphs/hwnn.py | 2 +- easygraph/nn/convs/hypergraphs/hwnn_conv.py | 2 +- 21 files changed, 188 insertions(+), 205 deletions(-) delete mode 100644 easygraph/amazon_computers.py diff --git a/easygraph/amazon_computers.py b/easygraph/amazon_computers.py deleted file mode 100644 index 06c27beb..00000000 --- a/easygraph/amazon_computers.py +++ /dev/null @@ -1,103 +0,0 @@ -import os -import numpy as np -import easygraph as eg -import scipy.sparse as sp - -from easygraph.classes.graph import Graph -from .graph_dataset_base import EasyGraphBuiltinDataset - - -class AmazonComputersDataset(EasyGraphBuiltinDataset): - r"""Amazon Computers co-purchase graph dataset. - - Nodes represent computer products, and edges link products frequently co-purchased. - Node features are bag-of-words of product reviews. The task is to classify - the product category. 
- - Statistics: - - Nodes: 13,752 - - Edges: 245,861 - - Number of Classes: 10 - - Features: 767 - - Parameters - ---------- - raw_dir : str, optional - Raw file directory to download/contains the input data directory. Default: None - force_reload : bool, optional - Whether to reload the dataset. Default: False - verbose : bool, optional - Whether to print out progress information. Default: True - transform : callable, optional - A transform that takes in a :class:`~easygraph.Graph` object and returns - a transformed version. The :class:`~easygraph.Graph` object will be - transformed before every access. - - Examples - -------- - >>> from easygraph.datasets import AmazonComputersDataset - >>> dataset = AmazonComputersDataset() - >>> g = dataset[0] - >>> print(g.number_of_nodes()) - >>> print(g.number_of_edges()) - >>> print(g.nodes[0]['feat'].shape) - >>> print(g.nodes[0]['label']) - >>> print(dataset.num_classes) - """ - - def __init__(self, raw_dir=None, force_reload=False, verbose=True, transform=None): - name = "amazon_computers" - url = "https://data.dgl.ai/dataset/amazon_co_buy_computers.zip" - super(AmazonComputersDataset, self).__init__( - name=name, - url=url, - raw_dir=raw_dir, - force_reload=force_reload, - verbose=verbose, - transform=transform, - ) - - def process(self): - path = os.path.join(self.raw_path, "amazon_co_buy_computers.npz") - data = np.load(path) - - adj = sp.csr_matrix( - (data["adj_data"], data["adj_indices"], data["adj_indptr"]), - shape=data["adj_shape"], - ) - - features = sp.csr_matrix( - (data["attr_data"], data["attr_indices"], data["attr_indptr"]), - shape=data["attr_shape"], - ).todense() - - labels = data["labels"] - - g = eg.Graph() - g.add_edges_from(list(zip(*adj.nonzero()))) - - for i in range(features.shape[0]): - g.add_node(i, feat=np.array(features[i]).squeeze(), label=int(labels[i])) - - self._g = g - self._num_classes = len(np.unique(labels)) - - if self.verbose: - print("Finished loading AmazonComputers dataset.") - 
print(f" NumNodes: {g.number_of_nodes()}") - print(f" NumEdges: {g.number_of_edges()}") - print(f" NumFeats: {features.shape[1]}") - print(f" NumClasses: {self._num_classes}") - - def __getitem__(self, idx): - assert idx == 0, "AmazonComputersDataset only contains one graph" - if self._g is None: - raise ValueError("Graph has not been loaded or processed correctly.") - return self._g if self._transform is None else self._transform(self._g) - - def __len__(self): - return 1 - - @property - def num_classes(self): - return self._num_classes diff --git a/easygraph/datasets/__init__.py b/easygraph/datasets/__init__.py index 9bc87c89..035ada50 100644 --- a/easygraph/datasets/__init__.py +++ b/easygraph/datasets/__init__.py @@ -8,6 +8,7 @@ from easygraph.datasets.hypergraph.House_Committees import House_Committees from easygraph.datasets.karate import KarateClubDataset from easygraph.datasets.mathoverflow_answers import mathoverflow_answers + from .ppi import LegacyPPIDataset from .ppi import PPIDataset except Exception as e: @@ -16,21 +17,19 @@ " hypergraph-related datasets." 
) -from .citation_graph import ( - CitationGraphDataset, - CiteseerGraphDataset, - CoraBinary, - CoraGraphDataset, - PubmedGraphDataset, -) -from .coauthor import CoauthorCSDataset from .amazon_photo import AmazonPhotoDataset -from .reddit import RedditDataset -from .flickr import FlickrDataset -from .facebook_ego import FacebookEgoNetDataset -from .roadnet import RoadNetCADataset from .arxiv import ArxivHEPTHDataset +from .citation_graph import CitationGraphDataset +from .citation_graph import CiteseerGraphDataset +from .citation_graph import CoraBinary +from .citation_graph import CoraGraphDataset +from .citation_graph import PubmedGraphDataset +from .coauthor import CoauthorCSDataset +from .facebook_ego import FacebookEgoNetDataset +from .flickr import FlickrDataset from .github import GitHubUsersDataset +from .reddit import RedditDataset +from .roadnet import RoadNetCADataset from .twitter_ego import TwitterEgoDataset from .web_google import WebGoogleDataset -from .wiki_topcats import WikiTopCatsDataset \ No newline at end of file +from .wiki_topcats import WikiTopCatsDataset diff --git a/easygraph/datasets/amazon_photo.py b/easygraph/datasets/amazon_photo.py index 29156976..a9295a20 100644 --- a/easygraph/datasets/amazon_photo.py +++ b/easygraph/datasets/amazon_photo.py @@ -1,11 +1,16 @@ import os -import numpy as np + import easygraph as eg +import numpy as np import scipy.sparse as sp from easygraph.classes.graph import Graph + from .graph_dataset_base import EasyGraphBuiltinDataset -from .utils import download, extract_archive, tensor, data_type_dict +from .utils import data_type_dict +from .utils import download +from .utils import extract_archive +from .utils import tensor class AmazonPhotoDataset(EasyGraphBuiltinDataset): diff --git a/easygraph/datasets/arxiv.py b/easygraph/datasets/arxiv.py index 2239e2c9..cfce499b 100644 --- a/easygraph/datasets/arxiv.py +++ b/easygraph/datasets/arxiv.py @@ -13,15 +13,18 @@ - Labels: None Reference: -J. Leskovec, J. 
Kleinberg and C. Faloutsos, "Graphs over Time: Densification Laws, Shrinking Diameters and Possible Explanations," +J. Leskovec, J. Kleinberg and C. Faloutsos, "Graphs over Time: Densification Laws, Shrinking Diameters and Possible Explanations," in KDD 2005. Dataset: https://snap.stanford.edu/data/cit-HepTh.html """ -import os import gzip +import os import shutil + import easygraph as eg + from easygraph.classes.graph import Graph + from .graph_dataset_base import EasyGraphBuiltinDataset from .utils import download @@ -71,17 +74,17 @@ def download(self): if not os.path.exists(self.raw_path): os.makedirs(self.raw_path) - with gzip.open(compressed_path, 'rb') as f_in: - with open(extracted_path, 'wb') as f_out: + with gzip.open(compressed_path, "rb") as f_in: + with open(extracted_path, "wb") as f_out: shutil.copyfileobj(f_in, f_out) def process(self): graph = eg.DiGraph() # Citation network is directed edge_list_path = os.path.join(self.raw_path, self.name + ".txt") - with open(edge_list_path, 'r') as f: + with open(edge_list_path, "r") as f: for line in f: - if line.startswith('#') or line.strip() == "": + if line.startswith("#") or line.strip() == "": continue u, v = map(int, line.strip().split()) graph.add_edge(u, v) diff --git a/easygraph/datasets/citation_graph.py b/easygraph/datasets/citation_graph.py index 3bb2fead..3795d678 100644 --- a/easygraph/datasets/citation_graph.py +++ b/easygraph/datasets/citation_graph.py @@ -1,6 +1,5 @@ -"""Cora, citeseer, pubmed dataset. +"""Cora, citeseer, pubmed dataset.""" -""" from __future__ import absolute_import import os @@ -53,6 +52,7 @@ class CitationGraphDataset(EasyGraphBuiltinDataset): reorder : bool Whether to reorder the graph using :func:`~eg.reorder_graph`. Default: False. 
""" + _urls = { "cora_v2": "dataset/cora_v2.zip", "citeseer": "dataset/citeseer.zip", diff --git a/easygraph/datasets/coauthor.py b/easygraph/datasets/coauthor.py index 1df6d3a9..fe90f734 100644 --- a/easygraph/datasets/coauthor.py +++ b/easygraph/datasets/coauthor.py @@ -15,13 +15,18 @@ """ import os -import numpy as np + import easygraph as eg +import numpy as np import scipy.sparse as sp from easygraph.classes.graph import Graph + from .graph_dataset_base import EasyGraphBuiltinDataset -from .utils import download, extract_archive, tensor, data_type_dict +from .utils import data_type_dict +from .utils import download +from .utils import extract_archive +from .utils import tensor class CoauthorCSDataset(EasyGraphBuiltinDataset): @@ -52,6 +57,7 @@ class CoauthorCSDataset(EasyGraphBuiltinDataset): >>> print("Label:", g.nodes[0]['label']) >>> print("Number of classes:", dataset.num_classes) """ + def __init__(self, raw_dir=None, force_reload=False, verbose=True, transform=None): name = "coauthor_cs" url = "https://data.dgl.ai/dataset/coauthor_cs.zip" diff --git a/easygraph/datasets/dynamic/email_enron.py b/easygraph/datasets/dynamic/email_enron.py index aad3087e..0fb24f78 100644 --- a/easygraph/datasets/dynamic/email_enron.py +++ b/easygraph/datasets/dynamic/email_enron.py @@ -73,8 +73,7 @@ def download(self): self.load_data = data def process(self): - """Loads input data from data directory and transfer to target graph for better analysis - """ + """Loads input data from data directory and transfer to target graph for better analysis""" self._g, edge_feature_list = dict_to_hypergraph(self.load_data, is_dynamic=True) diff --git a/easygraph/datasets/dynamic/email_eu.py b/easygraph/datasets/dynamic/email_eu.py index 236e6ecd..51c150ed 100644 --- a/easygraph/datasets/dynamic/email_eu.py +++ b/easygraph/datasets/dynamic/email_eu.py @@ -70,8 +70,7 @@ def download(self): self.load_data = data def process(self): - """Loads input data from data directory and transfer to 
target graph for better analysis - """ + """Loads input data from data directory and transfer to target graph for better analysis""" self._g, edge_feature_list = dict_to_hypergraph(self.load_data, is_dynamic=True) self._g.ndata["hyperedge_feature"] = tensor( range(1, len(edge_feature_list) + 1) diff --git a/easygraph/datasets/dynamic/hospital_lyon.py b/easygraph/datasets/dynamic/hospital_lyon.py index 6784d8f9..e7f93566 100644 --- a/easygraph/datasets/dynamic/hospital_lyon.py +++ b/easygraph/datasets/dynamic/hospital_lyon.py @@ -10,7 +10,9 @@ class Hospital_Lyon(EasyGraphDataset): _urls = { - "hospital_lyon": "easygraph-data-hospital-lyon/-/raw/main/hospital-lyon.json?ref_type=heads&inline=false", + "hospital_lyon": ( + "easygraph-data-hospital-lyon/-/raw/main/hospital-lyon.json?ref_type=heads&inline=false" + ), } def __init__( @@ -119,8 +121,7 @@ def download(self): self.load_data = data def process(self): - """Loads input data from data directory and transfer to target graph for better analysis - """ + """Loads input data from data directory and transfer to target graph for better analysis""" self._g, edge_feature_list = self.preprocess(self.load_data, is_dynamic=True) self._g.ndata["hyperedge_feature"] = tensor( diff --git a/easygraph/datasets/facebook_ego.py b/easygraph/datasets/facebook_ego.py index 505a594d..33eabf33 100644 --- a/easygraph/datasets/facebook_ego.py +++ b/easygraph/datasets/facebook_ego.py @@ -1,14 +1,14 @@ """Facebook Ego-Net Dataset -This dataset contains a subset of Facebook’s social network collected from -survey participants in the SNAP EgoNet project. Nodes represent users, and +This dataset contains a subset of Facebook’s social network collected from +survey participants in the SNAP EgoNet project. Nodes represent users, and edges indicate friendship links between them. -Each ego network is centered on a user and includes their friend connections -and friend-to-friend connections. 
The `.circles` files contain labeled groups +Each ego network is centered on a user and includes their friend connections +and friend-to-friend connections. The `.circles` files contain labeled groups (i.e., communities) of friends identified by the ego user. -This version processes all ego-nets as a single undirected graph. Node features +This version processes all ego-nets as a single undirected graph. Node features are not provided. Labels (circles) are optional and not included by default. Statistics (based on merged graph): @@ -18,15 +18,19 @@ - Classes: None Reference: -J. McAuley and J. Leskovec, “Learning to Discover Social Circles in Ego Networks,” +J. McAuley and J. Leskovec, “Learning to Discover Social Circles in Ego Networks,” in NIPS, 2012. [https://snap.stanford.edu/data/egonets-Facebook.html] """ import os + import easygraph as eg + from easygraph.classes.graph import Graph + from .graph_dataset_base import EasyGraphBuiltinDataset -from .utils import download, extract_archive +from .utils import download +from .utils import extract_archive class FacebookEgoNetDataset(EasyGraphBuiltinDataset): @@ -76,7 +80,7 @@ def process(self): if filename.endswith(".edges"): edge_file = os.path.join(parent_dir, filename) - with open(edge_file, 'r') as f: + with open(edge_file, "r") as f: for line in f: u, v = map(int, line.strip().split()) g.add_edge(u, v) @@ -96,9 +100,10 @@ def __getitem__(self, idx): def __len__(self): return 1 + def download(self): r"""Automatically download data and extract it.""" if self.url is not None: archive_path = os.path.join(self.raw_dir, self.name + ".tar.gz") download(self.url, path=archive_path) - extract_archive(archive_path, self.raw_path) \ No newline at end of file + extract_archive(archive_path, self.raw_path) diff --git a/easygraph/datasets/flickr.py b/easygraph/datasets/flickr.py index 8a226f84..022308a8 100644 --- a/easygraph/datasets/flickr.py +++ b/easygraph/datasets/flickr.py @@ -1,11 +1,16 @@ -import os import json 
+import os + +import easygraph as eg import numpy as np import scipy.sparse as sp -import easygraph as eg + from easygraph.classes.graph import Graph + from .graph_dataset_base import EasyGraphBuiltinDataset -from .utils import tensor, data_type_dict +from .utils import data_type_dict +from .utils import tensor + class FlickrDataset(EasyGraphBuiltinDataset): r"""Flickr dataset for node classification. @@ -42,13 +47,26 @@ class FlickrDataset(EasyGraphBuiltinDataset): >>> print(g.number_of_nodes(), g.number_of_edges(), ds.num_classes) >>> print(g.nodes[0]['feat'].shape, g.nodes[0]['label']) """ - def __init__(self, raw_dir=None, force_reload=False, verbose=False, transform=None, reorder=False): + + def __init__( + self, + raw_dir=None, + force_reload=False, + verbose=False, + transform=None, + reorder=False, + ): name = "flickr" url = self._get_dgl_url("dataset/flickr.zip") self._reorder = reorder - super(FlickrDataset, self).__init__(name=name, url=url, raw_dir=raw_dir, - force_reload=force_reload, - verbose=verbose, transform=transform) + super(FlickrDataset, self).__init__( + name=name, + url=url, + raw_dir=raw_dir, + force_reload=force_reload, + verbose=verbose, + transform=transform, + ) def process(self): # Load adjacency @@ -66,15 +84,16 @@ def process(self): # Load train/val/test splits with open(os.path.join(self.raw_path, "role.json")) as f: role = json.load(f) - train_mask = np.zeros(feats.shape[0], dtype=bool); train_mask[role["tr"]] = True - val_mask = np.zeros(feats.shape[0], dtype=bool); val_mask[role["va"]] = True - test_mask = np.zeros(feats.shape[0], dtype=bool); test_mask[role["te"]] = True + train_mask = np.zeros(feats.shape[0], dtype=bool) + train_mask[role["tr"]] = True + val_mask = np.zeros(feats.shape[0], dtype=bool) + val_mask[role["va"]] = True + test_mask = np.zeros(feats.shape[0], dtype=bool) + test_mask[role["te"]] = True # Attach node data for i in range(feats.shape[0]): - g.add_node(i, - feat=feats[i].astype(np.float32), - 
label=int(labels[i])) + g.add_node(i, feat=feats[i].astype(np.float32), label=int(labels[i])) g.graph["train_mask"] = train_mask g.graph["val_mask"] = val_mask g.graph["test_mask"] = test_mask @@ -83,7 +102,9 @@ def process(self): self._num_classes = int(labels.max() + 1) if self.verbose: print("Loaded Flickr dataset") - print(f" Nodes: {g.number_of_nodes()}, Edges: {g.number_of_edges()}, Features: {feats.shape[1]}, Classes: {self._num_classes}") + print( + f" Nodes: {g.number_of_nodes()}, Edges: {g.number_of_edges()}, Features: {feats.shape[1]}, Classes: {self._num_classes}" + ) def __getitem__(self, idx): assert idx == 0, "FlickrDataset contains only one graph" @@ -104,4 +125,5 @@ def num_classes(self): @staticmethod def _get_dgl_url(path): from .utils import _get_dgl_url + return _get_dgl_url(path) diff --git a/easygraph/datasets/github.py b/easygraph/datasets/github.py index 35f40a80..e0aebda1 100644 --- a/easygraph/datasets/github.py +++ b/easygraph/datasets/github.py @@ -14,18 +14,22 @@ - Classes: 2 Reference: -J. Leskovec et al. "SNAP Datasets: Stanford Large Network Dataset Collection", +J. Leskovec et al. 
"SNAP Datasets: Stanford Large Network Dataset Collection", https://snap.stanford.edu/data/github-social.html """ -import os import csv import json -import numpy as np +import os + import easygraph as eg +import numpy as np + from easygraph.classes.graph import Graph + from .graph_dataset_base import EasyGraphBuiltinDataset -from .utils import download, extract_archive +from .utils import download +from .utils import extract_archive class GitHubUsersDataset(EasyGraphBuiltinDataset): @@ -70,12 +74,10 @@ def download(self): download(self.url, path=archive) extract_archive(archive, self.raw_path) - def process(self): g = eg.DiGraph() base_path = os.path.join(self.raw_path, "git_web_ml") - # Load node features with open(os.path.join(base_path, "musae_git_features.json"), "r") as f: features = json.load(f) diff --git a/easygraph/datasets/graph_dataset_base.py b/easygraph/datasets/graph_dataset_base.py index 1077ddf7..b1d831be 100644 --- a/easygraph/datasets/graph_dataset_base.py +++ b/easygraph/datasets/graph_dataset_base.py @@ -1,5 +1,4 @@ -"""Basic EasyGraph Dataset -""" +"""Basic EasyGraph Dataset""" from __future__ import absolute_import @@ -8,6 +7,7 @@ import os import sys import traceback + from ..utils import retry_method_with_fix from .utils import download from .utils import extract_archive diff --git a/easygraph/datasets/ppi.py b/easygraph/datasets/ppi.py index 06c350cb..950a434c 100644 --- a/easygraph/datasets/ppi.py +++ b/easygraph/datasets/ppi.py @@ -1,4 +1,5 @@ -""" PPIDataset for inductive learning. 
""" +"""PPIDataset for inductive learning.""" + import json import os diff --git a/easygraph/datasets/reddit.py b/easygraph/datasets/reddit.py index d15bafde..a5e39493 100644 --- a/easygraph/datasets/reddit.py +++ b/easygraph/datasets/reddit.py @@ -1,11 +1,17 @@ import os -import numpy as np + import easygraph as eg +import numpy as np import scipy.sparse as sp from easygraph.classes.graph import Graph + from .graph_dataset_base import EasyGraphBuiltinDataset -from .utils import download, extract_archive, tensor, data_type_dict +from .utils import data_type_dict +from .utils import download +from .utils import extract_archive +from .utils import tensor + class RedditDataset(EasyGraphBuiltinDataset): r"""Reddit posts graph (Sept 2014) for community (subreddit) classification. @@ -24,21 +30,33 @@ class RedditDataset(EasyGraphBuiltinDataset): Add self-loop edges if True. raw_dir, force_reload, verbose, transform : same as EasyGraphBuiltinDataset """ - def __init__(self, self_loop=False, raw_dir=None, force_reload=False, - verbose=True, transform=None): + + def __init__( + self, + self_loop=False, + raw_dir=None, + force_reload=False, + verbose=True, + transform=None, + ): name = "reddit" url = "https://data.dgl.ai/dataset/reddit.zip" self.self_loop = self_loop - super().__init__(name=name, url=url, raw_dir=raw_dir, - force_reload=force_reload, verbose=verbose, - transform=transform) + super().__init__( + name=name, + url=url, + raw_dir=raw_dir, + force_reload=force_reload, + verbose=verbose, + transform=transform, + ) def process(self): # Expect two files extracted: reddit_data.npz & reddit_graph.npz data = np.load(os.path.join(self.raw_path, "reddit_data.npz")) - feat = data["feature"] # shape [N, 602] - labels = data["label"] # shape [N] - split = data["node_types"] # 1=train,2=val,3=test + feat = data["feature"] # shape [N, 602] + labels = data["label"] # shape [N] + split = data["node_types"] # 1=train,2=val,3=test # Load adjacency adj = 
sp.load_npz(os.path.join(self.raw_path, "reddit_graph.npz")) @@ -55,10 +73,14 @@ def process(self): # Assign node features, labels, and masks for i in range(feat.shape[0]): - g.add_node(i, feat=feat[i], label=int(labels[i]), - train_mask=(split[i] == 1), - val_mask=(split[i] == 2), - test_mask=(split[i] == 3)) + g.add_node( + i, + feat=feat[i], + label=int(labels[i]), + train_mask=(split[i] == 1), + val_mask=(split[i] == 2), + test_mask=(split[i] == 3), + ) self._g = g self._num_classes = int(np.max(labels) + 1) diff --git a/easygraph/datasets/roadnet.py b/easygraph/datasets/roadnet.py index 62e5203f..1d7bfa8a 100644 --- a/easygraph/datasets/roadnet.py +++ b/easygraph/datasets/roadnet.py @@ -12,15 +12,18 @@ - Labels: None Reference: -J. Leskovec and A. Krevl, “SNAP Datasets: Stanford Large Network Dataset Collection,” +J. Leskovec and A. Krevl, “SNAP Datasets: Stanford Large Network Dataset Collection,” https://snap.stanford.edu/data/roadNet-CA.html """ -import os import gzip +import os import shutil + import easygraph as eg + from easygraph.classes.graph import Graph + from .graph_dataset_base import EasyGraphBuiltinDataset from .utils import download @@ -72,17 +75,17 @@ def download(self): if not os.path.exists(self.raw_path): os.makedirs(self.raw_path) - with gzip.open(compressed_path, 'rb') as f_in: - with open(extracted_path, 'wb') as f_out: + with gzip.open(compressed_path, "rb") as f_in: + with open(extracted_path, "wb") as f_out: shutil.copyfileobj(f_in, f_out) def process(self): graph = eg.Graph() # Undirected road network edge_list_path = os.path.join(self.raw_path, self.name + ".txt") - with open(edge_list_path, 'r') as f: + with open(edge_list_path, "r") as f: for line in f: - if line.startswith('#') or line.strip() == "": + if line.startswith("#") or line.strip() == "": continue u, v = map(int, line.strip().split()) graph.add_edge(u, v) diff --git a/easygraph/datasets/twitter_ego.py b/easygraph/datasets/twitter_ego.py index d88b085a..7b631214 100644 
--- a/easygraph/datasets/twitter_ego.py +++ b/easygraph/datasets/twitter_ego.py @@ -1,8 +1,12 @@ import gzip import os + import easygraph as eg + from easygraph.datasets.graph_dataset_base import EasyGraphBuiltinDataset -from easygraph.datasets.utils import download, extract_archive +from easygraph.datasets.utils import download +from easygraph.datasets.utils import extract_archive + class TwitterEgoDataset(EasyGraphBuiltinDataset): r""" @@ -11,9 +15,9 @@ class TwitterEgoDataset(EasyGraphBuiltinDataset): The Twitter dataset was collected from public sources and contains a large ego-network of Twitter users. The combined network includes 81K edges among 81K users. - Source: J. McAuley and J. Leskovec, Stanford SNAP, 2012 - URL: https://snap.stanford.edu/data/egonets-Twitter.html - File used: https://snap.stanford.edu/data/twitter_combined.txt.gz + Source: J. McAuley and J. Leskovec, Stanford SNAP, 2012 + URL: https://snap.stanford.edu/data/egonets-Twitter.html + File used: https://snap.stanford.edu/data/twitter_combined.txt.gz """ def __init__(self): @@ -30,6 +34,7 @@ def download(self): def process(self): import gzip + import easygraph as eg gz_path = os.path.join(self.raw_path, "twitter_combined.txt.gz") diff --git a/easygraph/datasets/web_google.py b/easygraph/datasets/web_google.py index 68b5360e..97597299 100644 --- a/easygraph/datasets/web_google.py +++ b/easygraph/datasets/web_google.py @@ -17,13 +17,17 @@ Dataset from SNAP: https://snap.stanford.edu/data/web-Google.html """ +import gzip import os +import shutil + import easygraph as eg + from easygraph.classes.graph import Graph + from .graph_dataset_base import EasyGraphBuiltinDataset -from .utils import download, extract_archive -import gzip -import shutil +from .utils import download +from .utils import extract_archive class WebGoogleDataset(EasyGraphBuiltinDataset): @@ -72,9 +76,9 @@ def process(self): graph = eg.DiGraph() # Web-Google is directed edge_list_path = os.path.join(self.raw_path, self.name + 
".txt") - with open(edge_list_path, 'r') as f: + with open(edge_list_path, "r") as f: for line in f: - if line.startswith('#') or line.strip() == "": + if line.startswith("#") or line.strip() == "": continue u, v = map(int, line.strip().split()) graph.add_edge(u, v) @@ -94,6 +98,7 @@ def __getitem__(self, idx): def __len__(self): return 1 + def download(self): r"""Download and decompress the .txt.gz file.""" if self.url is not None: @@ -108,6 +113,6 @@ def download(self): os.makedirs(self.raw_path) # Decompress manually - with gzip.open(compressed_path, 'rb') as f_in: - with open(extracted_path, 'wb') as f_out: - shutil.copyfileobj(f_in, f_out) \ No newline at end of file + with gzip.open(compressed_path, "rb") as f_in: + with open(extracted_path, "wb") as f_out: + shutil.copyfileobj(f_in, f_out) diff --git a/easygraph/datasets/wiki_topcats.py b/easygraph/datasets/wiki_topcats.py index 5a4085b0..9c337d5f 100644 --- a/easygraph/datasets/wiki_topcats.py +++ b/easygraph/datasets/wiki_topcats.py @@ -16,14 +16,19 @@ Data: https://snap.stanford.edu/data/wiki-topcats.html """ -import os import gzip +import os + import easygraph as eg + from easygraph.datasets.graph_dataset_base import EasyGraphBuiltinDataset -from easygraph.datasets.utils import download, extract_archive +from easygraph.datasets.utils import download +from easygraph.datasets.utils import extract_archive + class WikiTopCatsDataset(EasyGraphBuiltinDataset): """Wikipedia Top Categories Snapshot from 2011 (SNAP)""" + def __init__(self, raw_dir=None, force_reload=False, verbose=True, transform=None): super(WikiTopCatsDataset, self).__init__( name="wiki_topcats", @@ -42,8 +47,12 @@ def download(self): # Also download category info and page names cat_url = "https://snap.stanford.edu/data/wiki-topcats-categories.txt.gz" names_url = "https://snap.stanford.edu/data/wiki-topcats-page-names.txt.gz" - download(cat_url, path=os.path.join(self.raw_dir, "wiki-topcats-categories.txt.gz")) - download(names_url, 
path=os.path.join(self.raw_dir, "wiki-topcats-page-names.txt.gz")) + download( + cat_url, path=os.path.join(self.raw_dir, "wiki-topcats-categories.txt.gz") + ) + download( + names_url, path=os.path.join(self.raw_dir, "wiki-topcats-page-names.txt.gz") + ) def process(self): raw = self.raw_dir diff --git a/easygraph/model/hypergraphs/hwnn.py b/easygraph/model/hypergraphs/hwnn.py index 37684c39..980bd39e 100644 --- a/easygraph/model/hypergraphs/hwnn.py +++ b/easygraph/model/hypergraphs/hwnn.py @@ -39,7 +39,7 @@ def __init__( def forward(self, X: torch.Tensor, hgs: list) -> torch.Tensor: r"""The forward function. - + Parameters: ``X`` (``torch.Tensor``): Input vertex feature matrix. Size :math:`(N, C_{in})`. ``hg`` (``eg.Hypergraph``): The hypergraph structure that contains :math:`N` vertices. diff --git a/easygraph/nn/convs/hypergraphs/hwnn_conv.py b/easygraph/nn/convs/hypergraphs/hwnn_conv.py index ea7ea563..7c1fa7e8 100644 --- a/easygraph/nn/convs/hypergraphs/hwnn_conv.py +++ b/easygraph/nn/convs/hypergraphs/hwnn_conv.py @@ -44,7 +44,7 @@ def init_parameters(self): def forward(self, X: torch.Tensor, hg: Hypergraph) -> torch.Tensor: r"""The forward function. - + Parameters: X (``torch.Tensor``): Input vertex feature matrix. Size :math:`(N, C_{in})`. hg (``eg.Hypergraph``): The hypergraph structure that contains :math:`N` vertices.