Merged
Commits
27 commits
1a40fd7
wip
lotif Jan 8, 2026
1d18580
wip
lotif Jan 8, 2026
e42e630
WIP moving forward with the ensemble attack code changes
lotif Jan 13, 2026
a46a010
WIP adding training and synthesizing code
lotif Jan 13, 2026
30c0ed3
More info on readme
lotif Jan 14, 2026
9464962
More ctgan changes
lotif Feb 23, 2026
e5c8fda
Adding the split data code
lotif Feb 24, 2026
8f10678
More config changes and bug fixes
lotif Feb 24, 2026
077d909
Removing ids dynamically
lotif Feb 25, 2026
b711fbd
Working!
lotif Feb 25, 2026
efdde68
Merge branch 'main' into marcelo/ensamble-ctgan
lotif Mar 3, 2026
1a38af2
Fixing indent on config file and adding some more information to the …
lotif Mar 3, 2026
af4f04e
Adding test attack model code
lotif Mar 4, 2026
5afb774
Small bug fixes
lotif Mar 5, 2026
e4ec793
Updates to readme and config file values
lotif Mar 5, 2026
1c13126
Small changes on configs and script bug fixes
lotif Mar 5, 2026
4e9a8c9
Adding the compute attack success script and fixing minor issues
lotif Mar 5, 2026
d83aabf
Cr by CodeRabbit and Sara
lotif Mar 9, 2026
a198fe9
Reducing the amount of training samples to 20k
lotif Mar 9, 2026
0416dbc
Merge branch 'main' into marcelo/ensamble-ctgan
lotif Mar 9, 2026
e69b07e
Change function name to avoid pytest thinking it's a test
lotif Mar 9, 2026
579d0f3
Merge remote-tracking branch 'origin/marcelo/ensamble-ctgan' into mar…
lotif Mar 9, 2026
5fa4fef
Fixing test assertions
lotif Mar 9, 2026
8b6bf10
Merge branch 'main' into marcelo/ensamble-ctgan
lotif Mar 9, 2026
a9369f6
Making population_all_with_challenge.csv into a constant and adding a…
lotif Mar 13, 2026
163bba8
Addressing last comments by Fatemeh
lotif Mar 16, 2026
bf805c1
Merge branch 'main' into marcelo/ensamble-ctgan
lotif Mar 16, 2026
6 changes: 3 additions & 3 deletions examples/ensemble_attack/README.md
@@ -1,11 +1,11 @@
# Ensemble Attack

## Data Processing
As the first step of the attack, we need to collect and split the data. The input data collected from all the attacks provided by the MIDST Challenge should be stored in `data_paths.data_paths` as defined by `config.yaml`. You can download and unzip the resources from [this Google Drive link](https://drive.google.com/drive/folders/1rmJ_E6IzG25eCL3foYAb2jVmAstXktJ1?usp=drive_link). Note that you can safely remove the provided shadow models with the competition resources since they are not used in this attack.
As the first step of the attack, we need to collect and split the data. The input data collected from all the attacks provided by the MIDST Challenge should be stored in `data_paths.midst_data_path` as defined by [`configs/experiment_config.yaml`](configs/experiment_config.yaml). You can download and unzip the resources from [this Google Drive link](https://drive.google.com/drive/folders/1rmJ_E6IzG25eCL3foYAb2jVmAstXktJ1?usp=drive_link). Note that you can safely remove the provided shadow models with the competition resources since they are not used in this attack.

Make sure directories and JSON files specified in `data_paths` and `data_processing_config` configurations in `examples/ensemble_attack/config.yaml` exist.
Make sure directories and JSON files specified in `data_paths` and `data_processing_config` configurations in `examples/ensemble_attack/configs/experiment_config.yaml` exist.

To run the whole data processing pipeline, run `run_attack.py` and set `pipeline.run_data_processing` to `true` in `config.yaml`. It reads data from `data_paths.midst_data_path` specified in config, and populates `data_paths.population_data` and `data_paths.processed_attack_data_path` directories.
To run the whole data processing pipeline, run `run_attack.py` and set `pipeline.run_data_processing` to `true` in [`configs/experiment_config.yaml`](configs/experiment_config.yaml). It reads data from `data_paths.midst_data_path` specified in config, and populates `data_paths.population_data` and `data_paths.processed_attack_data_path` directories.
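The config keys the paragraph above refers to can be sketched as a minimal fragment of `configs/experiment_config.yaml`. This is an illustrative sketch only — the key names come from the text, but the paths are placeholders and the surrounding structure of the real file may differ:

```yaml
# Illustrative fragment; replace the paths with your local locations.
pipeline:
  run_data_processing: true

data_paths:
  midst_data_path: /path/to/midst_challenge_resources   # input (Google Drive download)
  population_data: /path/to/population_data             # populated by the pipeline
  processed_attack_data_path: /path/to/processed_attack_data
```

With `pipeline.run_data_processing` set to `true`, running `run_attack.py` executes the data processing steps described below.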

Data processing steps for the MIDST challenge provided resources according to Ensemble attack are as follows:

16 changes: 12 additions & 4 deletions examples/ensemble_attack/compute_attack_success.py
@@ -41,7 +41,12 @@ def load_target_challenge_labels_and_probabilities(
test_prediction_probabilities = np.load(attack_result_file_path)

# Challenge labels are the true membership labels for the challenge points.
test_target = pd.read_csv(challenge_label_path).to_numpy().squeeze()
if challenge_label_path.suffix == ".npy":
test_target = np.load(challenge_label_path).squeeze()
elif challenge_label_path.suffix == ".csv":
test_target = pd.read_csv(challenge_label_path).to_numpy().squeeze()
else:
raise ValueError(f"Unsupported challenge label file type: {challenge_label_path}. Must be .npy or .csv.")
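The suffix-based dispatch introduced by this hunk can be exercised in isolation. The sketch below is a standalone version of that logic — the helper name `load_labels` is illustrative, not part of the toolkit's API:

```python
from pathlib import Path

import numpy as np
import pandas as pd


def load_labels(challenge_label_path: Path) -> np.ndarray:
    """Load membership labels from a .npy or .csv file (illustrative helper)."""
    if challenge_label_path.suffix == ".npy":
        # Binary NumPy array of labels; squeeze drops any singleton dimension.
        return np.load(challenge_label_path).squeeze()
    if challenge_label_path.suffix == ".csv":
        # Single-column CSV with a header row; convert to a flat array.
        return pd.read_csv(challenge_label_path).to_numpy().squeeze()
    raise ValueError(f"Unsupported challenge label file type: {challenge_label_path}. Must be .npy or .csv.")
```

Because the dispatch is on `Path.suffix`, any other extension fails fast with a `ValueError` before any file I/O is attempted.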

assert len(test_prediction_probabilities) == len(test_target), (
"Number of challenge labels must match number of prediction probabilities."
@@ -71,9 +76,12 @@ def compute_attack_success_for_given_targets(
predictions = []
targets = []
for target_id in target_ids:
# Override target model id in config as ``attack_probabilities_result_path`` and
# ``challenge_label_path`` are dependent on it and change in runtime.
target_model_config.target_model_id = target_id
# If there is a target model id in the config, override it with the current target id
if "target_model_id" in target_model_config:
# Override target model id in config as ``attack_probabilities_result_path`` and
# ``challenge_label_path`` are dependent on it and change in runtime.
target_model_config.target_model_id = target_id

# Load challenge labels and prediction probabilities
log(INFO, f"Loading challenge labels and prediction probabilities for target model ID {target_id}...")
test_target, test_prediction_probabilities = load_target_challenge_labels_and_probabilities(
2 changes: 1 addition & 1 deletion examples/ensemble_attack/configs/experiment_config.yaml
@@ -76,7 +76,7 @@ shadow_training:
training_json_config_paths: # Config json files used for tabddpm training on the trans table
table_domain_file_path: ${base_data_config_dir}/trans_domain.json
dataset_meta_file_path: ${base_data_config_dir}/dataset_meta.json
tabddpm_training_config_path: ${base_data_config_dir}/trans.json
training_config_path: ${base_data_config_dir}/trans.json
# Model training artifacts are saved under shadow_models_data_path/workspace_name/exp_name
# Also, training configs for each shadow model are created under shadow_models_data_path.
shadow_models_output_path: ${base_experiment_dir}/shadow_models_and_data
@@ -58,7 +58,7 @@ shadow_training:
training_json_config_paths: # Config json files used for tabddpm training on the trans table
table_domain_file_path: ${base_example_dir}/data_configs/trans_domain.json
dataset_meta_file_path: ${base_example_dir}/data_configs/dataset_meta.json
tabddpm_training_config_path: ${base_example_dir}/data_configs/trans.json
training_config_path: ${base_example_dir}/data_configs/trans.json
# Model training artifacts are saved under shadow_models_data_path/workspace_name/exp_name
# Also, training configs for each shadow model are created under shadow_models_data_path.
shadow_models_output_path: ${base_data_dir}/shadow_models_and_data
30 changes: 16 additions & 14 deletions examples/ensemble_attack/real_data_collection.py
@@ -6,7 +6,6 @@
from enum import Enum
from logging import INFO
from pathlib import Path
from typing import Literal

import pandas as pd
from omegaconf import DictConfig
@@ -32,7 +31,11 @@ class AttackType(Enum):
TABDDPM_100K = "tabddpm_trained_with_100k"


DatasetType = Literal["train", "challenge"]
class AttackDataset(Enum):
"""Enum for the different attack datasets."""

TRAIN = "train"
CHALLENGE = "challenge"


def expand_ranges(ranges: list[tuple[int, int]]) -> list[int]:
@@ -56,7 +59,7 @@ def collect_midst_attack_data(
attack_type: AttackType,
data_dir: Path,
split_folder: str,
dataset: DatasetType,
dataset: AttackDataset,
data_processing_config: DictConfig,
) -> pd.DataFrame:
"""
@@ -74,20 +77,19 @@
Returns:
pd.DataFrame: The specified dataset in this setting.
"""
assert dataset in {
"train",
"challenge",
}, "Only 'train' and 'challenge' collection is supported."
assert dataset in {AttackDataset.TRAIN, AttackDataset.CHALLENGE}, (
"Only 'train' and 'challenge' collections are supported."
)
# `data_id` is the folder numbering of each training or challenge dataset,
# and is defined with the provided config.
data_id = expand_ranges(data_processing_config.folder_ranges[split_folder])

# Get file name based on the kind of dataset to be collected (i.e. train vs challenge).
# TODO: Make the below parsing a bit more robust and less brittle
generation_name = attack_type.value.split("_")[0]
if dataset == "challenge":
if dataset == AttackDataset.CHALLENGE:
file_name = data_processing_config.challenge_data_file_name
else:
else: # dataset == AttackDataset.TRAIN
# Multi-table attacks have different file names.
file_name = (
data_processing_config.multi_table_train_data_file_name
@@ -110,7 +112,7 @@ def collect_midst_data(
midst_data_input_dir: Path,
attack_types: list[AttackType],
split_folders: list[str],
dataset: DatasetType,
dataset: AttackDataset,
data_processing_config: DictConfig,
) -> pd.DataFrame:
"""
@@ -133,7 +135,6 @@
Returns:
Collected train or challenge data as a dataframe.
"""
assert dataset in {"train", "challenge"}, "Only 'train' and 'challenge' collection is supported."
population = []
for attack_type in attack_types:
for split_folder in split_folders:
@@ -204,7 +205,7 @@ def collect_population_data_ensemble(
midst_data_input_dir,
population_attack_types,
split_folders=population_splits,
dataset="train",
dataset=AttackDataset.TRAIN,
data_processing_config=data_processing_config,
)

@@ -221,7 +222,8 @@
)

# Drop ids.
df_population_no_id = df_population.drop(columns=["trans_id", "account_id"])
id_columns = [c for c in df_population.columns if c.endswith("_id")]
df_population_no_id = df_population.drop(columns=id_columns)
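The change above replaces a hard-coded list of id columns with dynamic removal of anything ending in `_id`. A standalone sketch of that pattern (the helper name `drop_id_columns` is illustrative, not part of the toolkit):

```python
import pandas as pd


def drop_id_columns(df: pd.DataFrame) -> pd.DataFrame:
    """Drop all columns whose names end with "_id" (illustrative helper)."""
    id_columns = [c for c in df.columns if c.endswith("_id")]
    # DataFrame.drop returns a new frame; the input is left untouched.
    return df.drop(columns=id_columns)
```

This removes `trans_id` and `account_id` on the trans table without breaking if a different table carries different id columns.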
# Save the population data
save_dataframe(df_population, save_dir, "population_all.csv")
save_dataframe(df_population_no_id, save_dir, "population_all_no_id.csv")
@@ -233,7 +235,7 @@ def collect_population_data_ensemble(
midst_data_input_dir,
attack_types=challenge_attack_types,
split_folders=challenge_splits,
dataset="challenge",
dataset=AttackDataset.CHALLENGE,
data_processing_config=data_processing_config,
)
log(INFO, f"Collected challenge data length: {len(df_challenge)} from splits: {challenge_splits}")
72 changes: 54 additions & 18 deletions examples/ensemble_attack/run_shadow_model_training.py
@@ -1,6 +1,7 @@
import shutil
from logging import INFO
from pathlib import Path
from typing import cast

import pandas as pd
from omegaconf import DictConfig
@@ -10,12 +11,21 @@
train_three_sets_of_shadow_models,
)
from midst_toolkit.attacks.ensemble.shadow_model_utils import (
save_additional_tabddpm_config,
ModelType,
TrainingResult,
save_additional_training_config,
train_or_fine_tune_ctgan,
train_tabddpm_and_synthesize,
)
from midst_toolkit.common.config import ClavaDDPMTrainingConfig, CTGANTrainingConfig
from midst_toolkit.common.logger import log


DEFAULT_TABLE_NAME = "trans"
DEFAULT_ID_COLUMN_NAME = "trans_id"
DEFAULT_MODEL_TYPE = ModelType.TABDDPM


def run_target_model_training(config: DictConfig) -> Path:
"""
Function to run the target model training for RMIA attack.
@@ -39,11 +49,15 @@ def run_target_model_training(config: DictConfig) -> Path:
target_model_output_path = Path(config.shadow_training.target_model_output_path)
target_training_json_config_paths = config.shadow_training.training_json_config_paths

# TODO: Add this to config or .json files
table_name = "trans"
table_name = config.table_name if "table_name" in config else DEFAULT_TABLE_NAME

target_folder = target_model_output_path / "target_model"

model_type = DEFAULT_MODEL_TYPE
if "model_name" in config.shadow_training:
model_type = ModelType(config.shadow_training.model_name)
log(INFO, f"Training target model with model type: {model_type.value}")

target_folder.mkdir(parents=True, exist_ok=True)
shutil.copyfile(
target_training_json_config_paths.table_domain_file_path,
@@ -53,20 +67,30 @@
target_training_json_config_paths.dataset_meta_file_path,
target_folder / "dataset_meta.json",
)
configs, save_dir = save_additional_tabddpm_config(
configs, save_dir = save_additional_training_config(
data_dir=target_folder,
training_config_json_path=Path(target_training_json_config_paths.tabddpm_training_config_path),
training_config_json_path=Path(target_training_json_config_paths.training_config_path),
final_config_json_path=target_folder / f"{table_name}.json", # Path to the new json
experiment_name="trained_target_model",
model_type=model_type,
)

train_result = train_tabddpm_and_synthesize(
train_set=df_real_data,
configs=configs,
save_dir=save_dir,
synthesize=True,
number_of_points_to_synthesize=config.shadow_training.number_of_points_to_synthesize,
)
train_result: TrainingResult
if model_type == ModelType.TABDDPM:
train_result = train_tabddpm_and_synthesize(
train_set=df_real_data,
configs=cast(ClavaDDPMTrainingConfig, configs),
save_dir=save_dir,
synthesize=True,
number_of_points_to_synthesize=config.shadow_training.number_of_points_to_synthesize,
)
elif model_type == ModelType.CTGAN:
train_result = train_or_fine_tune_ctgan(
dataset=df_real_data,
configs=cast(CTGANTrainingConfig, configs),
save_dir=save_dir,
synthesize=True,
)

# To train the attack model (metaclassifier), we only need to save target's synthetic data,
# and not the entire target model's training result object.
@@ -102,11 +126,22 @@ def run_shadow_model_training(config: DictConfig, df_challenge_train: pd.DataFra
Path(config.data_paths.population_path),
"population_all_with_challenge.csv",
)
# Make sure master challenge train and population data have the "trans_id" column.
assert "trans_id" in df_challenge_train.columns, (
"trans_id column should be present in master train data for the shadow model pipeline."

table_name = config.table_name if "table_name" in config else DEFAULT_TABLE_NAME
id_column_name = config.table_id_column_name if "table_id_column_name" in config else DEFAULT_ID_COLUMN_NAME

model_type = DEFAULT_MODEL_TYPE
if "model_name" in config.shadow_training:
model_type = ModelType(config.shadow_training.model_name)
log(INFO, f"Training shadow models with model type: {model_type.value}")

# Make sure master challenge train and population data have the id column.
assert id_column_name in df_challenge_train.columns, (
f"{id_column_name} column should be present in master train data for the shadow model pipeline."
)
assert id_column_name in df_population_with_challenge.columns, (
f"{id_column_name} column should be present in population data for the shadow model pipeline."
)
assert "trans_id" in df_population_with_challenge.columns
# ``population_data`` in ensemble attack is used for shadow pre-training, and
# ``master_challenge_df`` is used for fine-tuning for half of the shadow models.
# For the other half of the shadow models, only ``master_challenge_df`` is used for training.
Expand All @@ -116,14 +151,15 @@ def run_shadow_model_training(config: DictConfig, df_challenge_train: pd.DataFra
shadow_models_output_path=Path(config.shadow_training.shadow_models_output_path),
training_json_config_paths=config.shadow_training.training_json_config_paths,
fine_tuning_config=config.shadow_training.fine_tuning_config,
table_name="trans",
id_column_name="trans_id",
table_name=table_name,
id_column_name=id_column_name,
# Number of shadow models to train in each set of shadow training (3 sets total) results in
# ``4 * n_models_per_set`` total shadow models.
n_models_per_set=4, # 4 based on the original code, must be even
n_reps=12, # Number of repetitions of challenge points in each shadow model training set. `12` based on the original code
number_of_points_to_synthesize=config.shadow_training.number_of_points_to_synthesize,
random_seed=config.random_seed,
model_type=model_type,
)
log(
INFO,
44 changes: 35 additions & 9 deletions examples/ensemble_attack/test_attack_model.py
@@ -12,7 +12,7 @@
import pandas as pd
from omegaconf import DictConfig

from examples.ensemble_attack.real_data_collection import AttackType, collect_midst_data
from examples.ensemble_attack.real_data_collection import AttackDataset, AttackType, collect_midst_data
from examples.ensemble_attack.run_shadow_model_training import run_shadow_model_training
from midst_toolkit.attacks.ensemble.blending import BlendingPlusPlus, MetaClassifierType
from midst_toolkit.attacks.ensemble.data_utils import load_dataframe
@@ -182,7 +182,7 @@ def collect_challenge_and_train_data(
attack_types=challenge_attack_types,
# For ensemble experiments, change to ``test`` for 10k, and change to ``final`` for 20k
split_folders=["final"],
dataset="challenge",
dataset=AttackDataset.CHALLENGE,
data_processing_config=data_processing_config,
)
log(
@@ -261,11 +261,28 @@ def train_rmia_shadows_for_test_phase(config: DictConfig) -> list[dict[str, list
A list containing three dictionaries, each representing a collection of shadow
models with their training data IDs and generated synthetic outputs.
"""
df_challenge_experiment, df_master_train = collect_challenge_and_train_data(
config.data_processing_config,
processed_attack_data_path=Path(config.data_paths.processed_attack_data_path),
targets_data_path=Path(config.data_processing_config.midst_data_path),
)
# Checking if challenge data exists
processed_attack_data_path = Path(config.data_paths.processed_attack_data_path)
challenge_data_file_name = "population_all_with_challenge_challenge_data.csv"

if (processed_attack_data_path / challenge_data_file_name).exists():
log(INFO, "Skipping data collection for testing phase.")
df_challenge_experiment = load_dataframe(
processed_attack_data_path,
challenge_data_file_name,
)
df_master_train = load_dataframe(
processed_attack_data_path,
"master_challenge_train.csv",
)
else:
# If challenge data does not exist, collect it from the cluster
df_challenge_experiment, df_master_train = collect_challenge_and_train_data(
config.data_processing_config,
processed_attack_data_path=Path(config.data_paths.processed_attack_data_path),
targets_data_path=Path(config.data_processing_config.midst_data_path),
)

# Load the challenge dataframe for training RMIA shadow models.
rmia_training_choice = RmiaTrainingDataChoice(config.target_model.attack_rmia_shadow_training_data_choice)
df_challenge = select_challenge_data_for_training(rmia_training_choice, df_challenge_experiment, df_master_train)
@@ -292,7 +309,10 @@ def run_metaclassifier_testing(
Args:
config: Configuration object set in ``experiments_config.yaml``.
"""
log(INFO, f"Running metaclassifier testing on target model {config.target_model.target_model_id}...")
log(
INFO,
f"Running metaclassifier testing on target synthetic data at {config.target_model.target_synthetic_data_path}...",
)

if config.random_seed is not None:
set_all_random_seeds(seed=config.random_seed)
@@ -321,7 +341,13 @@
test_data = pd.read_csv(challenge_data_path)
log(INFO, f"Challenge data loaded from {challenge_data_path} with a size of {len(test_data)}.")

test_target = pd.read_csv(challenge_label_path).to_numpy().squeeze()
if challenge_label_path.suffix == ".npy":
test_target = np.load(challenge_label_path).squeeze()
elif challenge_label_path.suffix == ".csv":
test_target = pd.read_csv(challenge_label_path).to_numpy().squeeze()
else:
raise ValueError(f"Unsupported challenge label file type: {challenge_label_path}. Must be .npy or .csv.")

assert len(test_data) == len(test_target), "Number of challenge labels must match number of challenge data points."

target_synthetic_path = Path(config.target_model.target_synthetic_data_path)
2 changes: 1 addition & 1 deletion examples/gan/README.md
@@ -8,7 +8,7 @@ some data afterwards.
## Downloading data

First, we need the data. Download it from this
[Google Drive link](https://drive.google.com/file/d/1J5qDuMHHg4dm9c3ISmb41tcTHSu1SVUC/view?usp=drive_link),
[Google Drive link](https://drive.google.com/file/d/1YbDRVn-fwfdcPnHj5eMhCa6A-YPiGnKr/view?usp=sharing),
extract the files and place them in a `/data` folder within this folder
(`examples/gan`).
