43 changes: 43 additions & 0 deletions configs/dataset/graph/WS1000-gamma.yaml
@@ -0,0 +1,43 @@
# Dataset loader config
loader:
_target_: topobench.data.loaders.graph.ws1000_gamma_dataset_loader.WS1000GammaDatasetLoader
parameters:
data_domain: graph # primary domain
data_type: synthetic # you can rename this if you like
data_name: WS1000-gamma # must match your dataset's 'name' argument
# WS1000_gamma generation parameters (can be overridden from CLI)
num_nodes: 1000
feature_dim: 1000
mean_degree: 4
beta: 0.5
gamma: 0.0
noise_scale: 1.0
seed: 0
# where to store processed data
data_dir: ${paths.data_dir}/${dataset.loader.parameters.data_domain}/${dataset.loader.parameters.data_type}

# Dataset parameters
parameters:
num_features: 1000 # = feature_dim
num_classes: ${dataset.parameters.num_nodes} # upper bound on possible distances
num_nodes: 1000
task: classification # we treat distance as a class label
loss_type: cross_entropy
monitor_metric: accuracy
task_level: node # node-level prediction

# Splits
split_params:
learning_setting: transductive
data_split_dir: ${paths.data_dir}/data_splits/${dataset.loader.parameters.data_name}
data_seed: 0
split_type: random # or 'k-fold'
k: 10 # used only if split_type='k-fold'
train_prop: 0.5 # used only if split_type='random'
standardize: False # standardize node features

# Dataloader parameters
dataloader_params:
batch_size: 1 # fixed for transductive single-graph setting
num_workers: 0
pin_memory: False
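
For reference, the generation parameters above (gamma, beta, seed, etc.) can be overridden when the config is composed, either from the command line or programmatically. The sketch below mirrors the compose pattern used in test/pipeline/test_pipeline.py; the config_path and config_name values are assumptions about the repository layout, not verified ones.

# Minimal sketch (assumed config root and entry config; adjust to the actual layout).
from hydra import compose, initialize

with initialize(version_base="1.3", config_path="configs"):
    cfg = compose(
        config_name="run.yaml",
        overrides=[
            "dataset=graph/WS1000-gamma",
            "model=graph/gcn",
            "dataset.loader.parameters.gamma=0.5",  # CLI-style override of a generation parameter
        ],
    )

print(cfg.dataset.loader.parameters.gamma)  # -> 0.5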
7 changes: 4 additions & 3 deletions test/pipeline/test_pipeline.py
@@ -4,8 +4,9 @@
from test._utils.simplified_pipeline import run


DATASET = "graph/MUTAG" # ADD YOUR DATASET HERE
MODELS = ["graph/gcn", "cell/topotune", "simplicial/topotune"] # ADD ONE OR SEVERAL MODELS OF YOUR CHOICE HERE
DATASET = "graph/WS1000-gamma"
# ADD YOUR DATASET HERE
MODELS = ["graph/gcn"] # ADD ONE OR SEVERAL MODELS OF YOUR CHOICE HERE


class TestPipeline:
@@ -32,4 +33,4 @@ def test_pipeline(self):
],
return_hydra_config=True
)
-        run(cfg)
+        run(cfg)
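
To exercise just this smoke test locally, something like the following works from the repository root (equivalent to invoking pytest on the file directly):

# Run only the pipeline smoke test for the new dataset/model pair.
import sys

import pytest

sys.exit(pytest.main(["test/pipeline/test_pipeline.py", "-x"]))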
246 changes: 246 additions & 0 deletions topobench/data/datasets/ws1000_gamma_dataset.py
@@ -0,0 +1,246 @@
import os
import os.path as osp
import random
from collections import deque

import torch
from torch_geometric.data import Data, InMemoryDataset


class WS1000GammaDataset(InMemoryDataset):
"""
WS1000-Gamma Synthetic Dataset
==============================

    This class implements the WS1000-Gamma dataset introduced in:

Katsman, I., Lou, E., & Gilbert, A. (2024).
*Revisiting the Necessity of Graph Learning and Common Graph Benchmarks*.
arXiv:2412.06173
https://arxiv.org/abs/2412.06173

The dataset is a synthetic Watts–Strogatz small-world graph with
BFS-dependent Gaussian node features. It is designed as a principled
benchmark that requires graph structure to perform EDGE-level tasks (see Note c).

Notes
-----
a.- This implementation follows the Watts & Strogatz (1998) construction:
1. Create a regular ring lattice with mean degree ``K``.
2. Rewire each oriented ring edge ``(i, i+j)`` with probability ``beta``.

b.- Node features are generated via **BFS parental dependence**:
``x_child = gamma * x_parent + noise_scale * z``, where ``z ~ N(0, I_d)``.

c.- The current implementation evaluates NODE-level distance classification
(predict BFS distance to the root).
EDGE prediction is NOT yet implemented.

Dataset Structure
-----------------
The output is a single :class:`torch_geometric.data.Data` object with:

- ``x`` : ``[num_nodes, feature_dim]`` float tensor
- ``edge_index`` : ``[2, 2 * num_edges]`` long tensor (undirected)
- ``y`` : ``[num_nodes]`` long tensor of BFS distances from the root node
- metadata fields: ``gamma``, ``beta``, ``mean_degree``, ``feature_dim``, ``seed``

Configuration Parameters
------------------------
The dataset accepts the following Hydra parameters:

- ``num_nodes`` : int
- ``feature_dim`` : int
- ``mean_degree`` : int (must be even)
- ``beta`` : float
- ``gamma`` : float
- ``noise_scale`` : float
- ``seed`` : int

These are typically defined in:

``configs/dataset/graph/WS1000-gamma.yaml``

"""

def __init__(
self,
root: str,
name: str = "WS1000-gamma",
parameters=None,
transform=None,
pre_transform=None,
) -> None:
self.name = name
self.parameters = parameters

# Defaults, can be overridden from Hydra DictConfig
self.num_nodes = 1000
self.feature_dim = 1000
self.mean_degree = 4 # K in WS model
self.beta = 0.5 # rewiring probability
self.gamma = 0.0 # parental coefficient
self.noise_scale = 1.0
self.seed = 0

if parameters is not None:
if "num_nodes" in parameters:
self.num_nodes = int(parameters.num_nodes)
if "feature_dim" in parameters:
self.feature_dim = int(parameters.feature_dim)
if "mean_degree" in parameters:
self.mean_degree = int(parameters.mean_degree)
if "beta" in parameters:
self.beta = float(parameters.beta)
if "gamma" in parameters:
self.gamma = float(parameters.gamma)
if "noise_scale" in parameters:
self.noise_scale = float(parameters.noise_scale)
if "seed" in parameters:
self.seed = int(parameters.seed)

super().__init__(root=root, transform=transform, pre_transform=pre_transform)

# Load processed data (super() will call process() the first time)
self.data, self.slices = torch.load(self.processed_paths[0])

# ---------------------------------------------------------------------
# Required PyG properties
# ---------------------------------------------------------------------
@property
def raw_file_names(self) -> list[str]:
# Dummy file to satisfy InMemoryDataset's bookkeeping.
return ["synthetic.done"]

@property
def processed_file_names(self) -> list[str]:
return ["data_v1.pt"]

# ---------------------------------------------------------------------
# Download: here we don't download anything.
# ---------------------------------------------------------------------
def download(self) -> None:
raw_path = osp.join(self.raw_dir, self.raw_file_names[0])
os.makedirs(self.raw_dir, exist_ok=True)
with open(raw_path, "w") as f:
f.write("synthetic ws1000_gamma marker\n")

# ---------------------------------------------------------------------
# Process: generate WS graph + WS1000_gamma features and save.
# ---------------------------------------------------------------------
def process(self) -> None:
data = self._generate_ws1000_gamma()
data_list = [data]
data, slices = self.collate(data_list)
os.makedirs(self.processed_dir, exist_ok=True)
torch.save((data, slices), self.processed_paths[0])

# ---------------------------------------------------------------------
# Helper: Watts–Strogatz graph + gamma-based features
# ---------------------------------------------------------------------
def _generate_ws1000_gamma(self) -> Data:
N = self.num_nodes
K = self.mean_degree
beta = self.beta
d = self.feature_dim
gamma = self.gamma
noise_scale = self.noise_scale
seed = self.seed

assert K % 2 == 0, "mean_degree K must be even for Watts–Strogatz ring construction."

# --- Seed everything deterministically
random.seed(seed)
torch.manual_seed(seed)

# --- 1) Build regular ring lattice
# neighbors: undirected adjacency; edges: undirected edge set
neighbors = {i: set() for i in range(N)}
edges = set()

half_k = K // 2

ring_edges_oriented = []
for j in range(1, half_k + 1): # distance layer outer
for i in range(N): # then each vertex
v = (i + j) % N
ring_edges_oriented.append((i, v))
u_min, u_max = (i, v) if i < v else (v, i)
if (u_min, u_max) not in edges:
edges.add((u_min, u_max))
neighbors[i].add(v)
neighbors[v].add(i)
# --- 2) Rewire edges in Watts–Strogatz style (exactly as in the paper)
# For each original ring edge (i, i+j) in clockwise sense, with probability beta,
# rewire the endpoint i+j to a new node w chosen uniformly at random
for (i, v) in ring_edges_oriented:
if random.random() < beta:
# Candidates: all nodes except i and current neighbours of i
possible_nodes = [w for w in range(N)
if w != i and w not in neighbors[i]]
if not possible_nodes:
# No valid candidate; skip rewiring for this edge
continue

w = random.choice(possible_nodes)

# Remove old edge (i, v) if it still exists
if v in neighbors[i]:
neighbors[i].remove(v)
neighbors[v].remove(i)
edges.discard((i, v) if i < v else (v, i))

# Add new edge (i, w)
neighbors[i].add(w)
neighbors[w].add(i)
edges.add((i, w) if i < w else (w, i))


# --- 3) Convert to undirected edge_index with both directions
edge_list = []
for (u, v) in edges:
edge_list.append((u, v))
edge_list.append((v, u))
edge_index = torch.tensor(edge_list, dtype=torch.long).t().contiguous()

# --- 4) Generate features with BFS parental dependence
# Use neighbors directly as adjacency (adj = neighbors)
x = torch.empty((N, d), dtype=torch.float)

root = 0
queue = deque([root])

# root feature
x[root] = torch.randn(d)
dist = torch.full((N,), -1, dtype=torch.long)
dist[root] = 0

while queue:
u = queue.popleft()
for v in neighbors[u]:
if dist[v] == -1:
dist[v] = dist[u] + 1
queue.append(v)
noise = torch.randn(d)
x[v] = gamma * x[u] + noise_scale * noise

        # For unvisited (disconnected) nodes: assign independent features.
        # Their BFS distance (and hence label in ``y``) remains -1.
for i in range(N):
if dist[i] == -1:
x[i] = torch.randn(d)


data = Data(
x=x,
edge_index=edge_index,
y=dist,
)
# Metadata
data.num_nodes = N
data.gamma = gamma
data.beta = beta
data.mean_degree = K
data.feature_dim = d
data.seed = seed

return data
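
As a quick sanity check, the dataset class can also be instantiated outside of Hydra. The sketch below simply restates the default parameters; the root path is purely illustrative.

# Standalone sketch: generate the single WS1000-gamma graph and inspect it.
from omegaconf import OmegaConf

from topobench.data.datasets.ws1000_gamma_dataset import WS1000GammaDataset

params = OmegaConf.create(
    {
        "num_nodes": 1000,
        "feature_dim": 1000,
        "mean_degree": 4,
        "beta": 0.5,
        "gamma": 0.0,
        "noise_scale": 1.0,
        "seed": 0,
    }
)
dataset = WS1000GammaDataset(root="datasets/graph/synthetic", parameters=params)  # illustrative root
data = dataset[0]
print(data.x.shape)           # torch.Size([1000, 1000])
print(data.edge_index.shape)  # [2, 2 * num_edges], undirected
print(int(data.y.max()))      # largest BFS distance from the root node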
85 changes: 85 additions & 0 deletions topobench/data/loaders/graph/ws1000_gamma_dataset_loader.py
@@ -0,0 +1,85 @@
# location: topobench/data/loaders/graph/ws1000_gamma_dataset_loader.py
from pathlib import Path

from omegaconf import DictConfig

from topobench.data.datasets import WS1000GammaDataset
from topobench.data.loaders.base import AbstractLoader


class WS1000GammaDatasetLoader(AbstractLoader):
"""
Loader for the WS1000-Gamma synthetic dataset.

Parameters
----------
parameters : omegaconf.DictConfig
The configuration block located at
``dataset.loader.parameters`` in the Hydra config. It must
contain at least the following fields:

- ``data_domain`` : str
- ``data_type`` : str
- ``data_name`` : str
- ``data_dir`` : str
- ``num_nodes`` : int
- ``feature_dim`` : int
- ``mean_degree`` : int
- ``beta`` : float
- ``gamma`` : float
- ``noise_scale`` : float
- ``seed`` : int
"""
def __init__(self, parameters: DictConfig) -> None:
super().__init__(parameters)

def load_dataset(self) -> WS1000GammaDataset:
"""
        Load the WS1000-Gamma dataset.

        Returns
-------
WS1000GammaDataset
The instantiated dataset containing one synthetic graph with
BFS-derived node features.
"""


dataset = self._initialize_dataset()
self.data_dir = self._redefine_data_dir(dataset)

return dataset

def _initialize_dataset(self) -> WS1000GammaDataset:
"""
        Instantiate the underlying :class:`WS1000GammaDataset`.

        Returns
-------
WS1000GammaDataset
A dataset instance that will trigger processing if the
processed data file is missing.
"""
return WS1000GammaDataset(
root=str(self.root_data_dir),
name=self.parameters.data_name,
parameters=self.parameters,
)

def _redefine_data_dir(self, dataset: WS1000GammaDataset) -> Path:
"""
Resolve the dataset directory to the processed root.

TopoBench components expect ``loader.data_dir`` to point to the
directory containing processed files. This method extracts the
correct processed directory from the dataset object.

Parameters
----------
dataset : WS1000GammaDataset
The dataset whose processed directory is being queried.

Returns
-------
pathlib.Path
Path to the processed dataset directory.
"""
return Path(dataset.processed_dir)
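
The loader is normally instantiated by Hydra through the _target_ entry in configs/dataset/graph/WS1000-gamma.yaml, but it can also be driven directly. The sketch below assumes that AbstractLoader resolves root_data_dir from parameters.data_dir, as the existing graph loaders rely on; the data_dir value is illustrative.

# Hedged sketch: drive the loader with a hand-built DictConfig.
from omegaconf import OmegaConf

from topobench.data.loaders.graph.ws1000_gamma_dataset_loader import (
    WS1000GammaDatasetLoader,
)

params = OmegaConf.create(
    {
        "data_domain": "graph",
        "data_type": "synthetic",
        "data_name": "WS1000-gamma",
        "data_dir": "datasets/graph/synthetic",  # illustrative location
        "num_nodes": 1000,
        "feature_dim": 1000,
        "mean_degree": 4,
        "beta": 0.5,
        "gamma": 0.0,
        "noise_scale": 1.0,
        "seed": 0,
    }
)
loader = WS1000GammaDatasetLoader(params)
dataset = loader.load_dataset()
print(loader.data_dir)  # processed directory resolved by _redefine_data_dir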