From 1a40fd7a477513dbc0eb06eec7604da17d63251a Mon Sep 17 00:00:00 2001 From: Marcelo Lotif Date: Thu, 8 Jan 2026 14:51:05 -0500 Subject: [PATCH 01/38] wip --- examples/ensemble_attack/README.md | 6 +- .../ensemble_attack/real_data_collection.py | 41 ++++++++----- .../run_shadow_model_training.py | 27 ++++++--- examples/gan/config.yaml | 20 +++++++ examples/gan/ensemble_attack/run.py | 59 +++++++++++++++++++ examples/synthesizing/multi_table/README.md | 2 +- examples/synthesizing/single_table/README.md | 2 +- examples/training/multi_table/README.md | 2 +- examples/training/single_table/README.md | 2 +- 9 files changed, 130 insertions(+), 31 deletions(-) create mode 100644 examples/gan/ensemble_attack/run.py diff --git a/examples/ensemble_attack/README.md b/examples/ensemble_attack/README.md index 030433fe..d77e63fd 100644 --- a/examples/ensemble_attack/README.md +++ b/examples/ensemble_attack/README.md @@ -1,11 +1,11 @@ # Ensemble Attack ## Data Processing -As the first step of the attack, we need to collect and split the data. The input data collected from all the attacks provided by the MIDST Challenge should be stored in `data_paths.data_paths` as defined by `config.yaml`. You can download and unzip the resources from [this Google Drive link](https://drive.google.com/drive/folders/1rmJ_E6IzG25eCL3foYAb2jVmAstXktJ1?usp=drive_link). Note that you can safely remove the provided shadow models with the competition resources since they are not used in this attack. +As the first step of the attack, we need to collect and split the data. The input data collected from all the attacks provided by the MIDST Challenge should be stored in `data_paths.midst_data_path` as defined by [`configs/experiment_config.yaml`](configs/experiment_config.yaml). You can download and unzip the resources from [this Google Drive link](https://drive.google.com/drive/folders/1rmJ_E6IzG25eCL3foYAb2jVmAstXktJ1?usp=drive_link). 
Note that you can safely remove the provided shadow models with the competition resources since they are not used in this attack. -Make sure directories and JSON files specified in `data_paths` and `data_processing_config` configurations in `examples/ensemble_attack/config.yaml` exist. +Make sure directories and JSON files specified in `data_paths` and `data_processing_config` configurations in `examples/ensemble_attack/configs/experiment_config.yaml` exist. -To run the whole data processing pipeline, run `run_attack.py` and set `pipeline.run_data_processing` to `true` in `config.yaml`. It reads data from `data_paths.midst_data_path` specified in config, and populates `data_paths.population_data` and `data_paths.processed_attack_data_path` directories. +To run the whole data processing pipeline, run `run_attack.py` and set `pipeline.run_data_processing` to `true` in [`configs/experiment_config.yaml`](configs/experiment_config.yaml). It reads data from `data_paths.midst_data_path` specified in config, and populates `data_paths.population_data` and `data_paths.processed_attack_data_path` directories. Data processing steps for the MIDST challenge provided resources according to Ensemble attack are as follows: diff --git a/examples/ensemble_attack/real_data_collection.py b/examples/ensemble_attack/real_data_collection.py index 264f71ae..a7711347 100644 --- a/examples/ensemble_attack/real_data_collection.py +++ b/examples/ensemble_attack/real_data_collection.py @@ -29,6 +29,21 @@ class AttackType(Enum): TABDDPM_100K = "tabddpm_trained_with_100k" +class AttackDataSplit(Enum): + """Enum for the different attack data splits.""" + + TRAIN = "train" + DEV = "dev" + FINAL = "final" + + +class AttackDataset(Enum): + """Enum for the different attack datasets.""" + + TRAIN = "train" + CHALLENGE = "challenge" + + def expand_ranges(ranges: list[tuple[int, int]]) -> list[int]: """ Reads a list of tuples representing ranges and expands them into a flat list of integers. 
@@ -49,8 +64,8 @@ def expand_ranges(ranges: list[tuple[int, int]]) -> list[int]: def collect_midst_attack_data( attack_type: AttackType, data_dir: Path, - data_split: str, - dataset: str, + data_split: AttackDataSplit, + dataset: AttackDataset, data_processing_config: DictConfig, ) -> pd.DataFrame: """ @@ -66,21 +81,16 @@ def collect_midst_attack_data( Returns: pd.DataFrame: The specified dataset in this setting. """ - assert data_split in [ - "train", - "dev", - "final", - ], "data_split should be one of 'train', 'dev', or 'final'." # `data_id` is the folder numbering of each training or challenge dataset, # and is defined with the provided config. - data_id = expand_ranges(data_processing_config.folder_ranges[data_split]) + data_id = expand_ranges(data_processing_config.folder_ranges[data_split.value]) # Get file name based on the kind of dataset to be collected (i.e. train vs challenge). # TODO: Make the below parsing a bit more robust and less brittle generation_name = attack_type.value.split("_")[0] - if dataset == "challenge": + if dataset == AttackDataset.CHALLENGE: file_name = data_processing_config.challenge_data_file_name - else: # dataset == "train" + else: # dataset == AttackDataset.TRAIN # Multi-table attacks have different file names. file_name = ( data_processing_config.multi_table_train_data_file_name @@ -103,7 +113,7 @@ def collect_midst_data( midst_data_input_dir: Path, attack_types: list[AttackType], data_splits: list[str], - dataset: str, + dataset: AttackDataset, data_processing_config: DictConfig, ) -> pd.DataFrame: """ @@ -121,7 +131,6 @@ def collect_midst_data( Returns: Collected train or challenge data as a dataframe. """ - assert dataset in {"train", "challenge"}, "Only 'train' and 'challenge' collection is supported." 
population = [] for attack_type in attack_types: for data_split in data_splits: @@ -170,10 +179,10 @@ def collect_population_data_ensemble( save_dir.mkdir(parents=True, exist_ok=True) if population_splits is None: - population_splits = ["train"] + population_splits = [AttackDataSplit.TRAIN] if challenge_splits is None: # Original Ensemble collects all the challenge points from train, dev and final of "tabddpm_black_box" attack. - challenge_splits = ["train", "dev", "final"] + challenge_splits = [AttackDataSplit.TRAIN, AttackDataSplit.DEV, AttackDataSplit.FINAL] # Ensemble Attack collects train data of all the attack types (black box and white box) attack_names = data_processing_config.population_attack_data_types_to_collect @@ -184,7 +193,7 @@ def collect_population_data_ensemble( midst_data_input_dir, population_attack_types, data_splits=population_splits, - dataset="train", + dataset=AttackDataset.TRAIN, data_processing_config=data_processing_config, ) # Drop ids. @@ -199,7 +208,7 @@ def collect_population_data_ensemble( midst_data_input_dir, attack_types=challenge_attack_types, data_splits=challenge_splits, - dataset="challenge", + dataset=AttackDataset.CHALLENGE, data_processing_config=data_processing_config, ) # Save the challenge points diff --git a/examples/ensemble_attack/run_shadow_model_training.py b/examples/ensemble_attack/run_shadow_model_training.py index d4a85cbc..e519f38e 100644 --- a/examples/ensemble_attack/run_shadow_model_training.py +++ b/examples/ensemble_attack/run_shadow_model_training.py @@ -15,6 +15,10 @@ from midst_toolkit.common.logger import log +DEFAULT_TABLE_NAME = "trans" +DEFAULT_ID_COLUMN_NAME = "trans_id" + + def run_target_model_training(config: DictConfig) -> Path: """ Function to run the target model training for RMIA attack. 
@@ -38,8 +42,7 @@ def run_target_model_training(config: DictConfig) -> Path: target_model_output_path = Path(config.shadow_training.target_model_output_path) target_training_json_config_paths = config.shadow_training.training_json_config_paths - # TODO: Add this to config or .json files - table_name = "trans" + table_name = config.table_name if "table_name" in config else DEFAULT_TABLE_NAME target_folder = target_model_output_path / "target_model" @@ -104,12 +107,20 @@ def run_shadow_model_training(config: DictConfig) -> list[Path]: Path(config.data_paths.population_path), "population_all_with_challenge.csv", ) + + table_name = config.table_name if "table_name" in config else DEFAULT_TABLE_NAME + id_column_name = config.table_id_column_name if "table_id_column_name" in config else DEFAULT_ID_COLUMN_NAME + # Make sure master challenge train and population data have the "trans_id" column. - assert "trans_id" in df_master_challenge_train.columns, ( - "trans_id column should be present in master train data for the shadow model pipeline." + assert id_column_name in df_master_challenge_train.columns, ( + f"{id_column_name} column should be present in master train data for the shadow model pipeline." + ) + assert id_column_name in df_population_with_challenge.columns, ( + f"{id_column_name} column should be present in population data for the shadow model pipeline." + ) + assert id_column_name in df_master_challenge_train.columns, ( + f"{id_column_name} column should be present in master train data for the shadow model pipeline." ) - assert "trans_id" in df_population_with_challenge.columns - assert "trans_id" in df_master_challenge_train.columns # ``population_data`` in ensemble attack is used for shadow pre-training, and # ``master_challenge_df`` is used for fine-tuning for half of the shadow models. # For the other half of the shadow models, only ``master_challenge_df`` is used for training. 
@@ -119,8 +130,8 @@ def run_shadow_model_training(config: DictConfig) -> list[Path]: shadow_models_output_path=Path(config.shadow_training.shadow_models_output_path), training_json_config_paths=config.shadow_training.training_json_config_paths, fine_tuning_config=config.shadow_training.fine_tuning_config, - table_name="trans", - id_column_name="trans_id", + table_name=table_name, + id_column_name=id_column_name, # Number of shadow models to train in each set of shadow training (3 sets total) results in # ``4 * n_models_per_set`` total shadow models. n_models_per_set=4, # 4 based on the original code, must be even diff --git a/examples/gan/config.yaml b/examples/gan/config.yaml index 946f1560..5349b78b 100644 --- a/examples/gan/config.yaml +++ b/examples/gan/config.yaml @@ -9,3 +9,23 @@ training: synthesizing: sample_size: 20000 + +ensemble_attack: + random_seed: null # Set this to a value if you want to set a random seed for reproducibility + table_name: "trans_id" + table_id_column_name: "trans_id" + + data_paths: + processed_attack_data_path: ${base_data_dir}/ensemble_attack + population_path: ${base_data_dir}/ensemble_attack + + shadow_training: + shadow_models_output_path: ${results_dir}/ensemble_attack/shadow_models + training_json_config_paths: # Config json files used for tabddpm training on the trans table + table_domain_file_path: ${base_data_dir}/trans_domain.json + dataset_meta_file_path: ${base_data_dir}/dataset_meta.json + tabddpm_training_config_path: ${base_data_dir}/trans.json + fine_tuning_config: + fine_tune_diffusion_iterations: 200000 + fine_tune_classifier_iterations: 20000 + pre_train_data_size: 60000 diff --git a/examples/gan/ensemble_attack/run.py b/examples/gan/ensemble_attack/run.py new file mode 100644 index 00000000..5c305737 --- /dev/null +++ b/examples/gan/ensemble_attack/run.py @@ -0,0 +1,59 @@ +import importlib +from pathlib import Path +from logging import INFO + +from omegaconf import DictConfig +import hydra + +from 
midst_toolkit.common.logger import log +from midst_toolkit.common.random import set_all_random_seeds +from midst_toolkit.attacks.ensemble.process_split_data import process_split_data +from examples.ensemble_attack.real_data_collection import collect_population_data_ensemble + + + +@hydra.main(config_path="../", config_name="config", version_base=None) +def main(config: DictConfig) -> None: + """ + Run the Ensemble Attack pipeline with the CTGAN model. + + As the first step, data processing is done. + Second step is shadow model training used for RMIA attack. + Third step is metaclassifier training and evaluation. + + Args: + config: Attack configuration as an OmegaConf DictConfig object. + """ + + import ipdb;ipdb.set_trace() + + if config.ensemble_attack.random_seed is not None: + set_all_random_seeds(seed=config.ensemble_attack.random_seed) + log(INFO, f"Training phase random seed set to {config.ensemble_attack.random_seed}.") + + # Note: Importing the following two modules causes a segmentation fault error if imported together in this file. + # A quick solution is to load modules dynamically if any of the pipelines is called. + # TODO: Investigate the source of error. + shadow_pipeline = importlib.import_module("examples.ensemble_attack.run_shadow_model_training") + shadow_data_paths = shadow_pipeline.run_shadow_model_training(config.ensemble_attack) + shadow_data_paths = [Path(path) for path in shadow_data_paths] + + target_model_synthetic_path = shadow_pipeline.run_target_model_training(config) + + if config.pipeline.run_metaclassifier_training: + if not config.pipeline.run_shadow_model_training: + # If shadow model training is skipped, we need to provide the previous shadow model and target model paths. 
+ shadow_data_paths = [Path(path) for path in config.shadow_training.final_shadow_models_path] + target_model_synthetic_path = Path(config.shadow_training.target_synthetic_data_path) + + assert len(shadow_data_paths) == 3, "The attack_data_paths list must contain exactly three elements." + assert target_model_synthetic_path is not None, ( + "The target_data_path must be provided for metaclassifier training." + ) + + meta_pipeline = importlib.import_module("examples.ensemble_attack.run_metaclassifier_training") + meta_pipeline.run_metaclassifier_training(config, shadow_data_paths, target_model_synthetic_path) + + +if __name__ == "__main__": + main() diff --git a/examples/synthesizing/multi_table/README.md b/examples/synthesizing/multi_table/README.md index 737b49ec..0396e010 100644 --- a/examples/synthesizing/multi_table/README.md +++ b/examples/synthesizing/multi_table/README.md @@ -7,7 +7,7 @@ up using the code in this toolkit. ## Downloading data First, we need the data. Download it from this -[Google Drive link](https://drive.google.com/file/d/1Ao222l4AJjG54-HDEGCWkIfzRbl9_IKa/view?usp=drive_link), +[Google Drive link](https://drive.google.com/file/d/1x2yXw824sMUJb9WKUoTkcyfPfx3zS7We/view?usp=sharing), extract the files and place them in a `/data` folder within this folder (`examples/synthesizing/multi_table`). diff --git a/examples/synthesizing/single_table/README.md b/examples/synthesizing/single_table/README.md index 5f6f1f51..924b024d 100644 --- a/examples/synthesizing/single_table/README.md +++ b/examples/synthesizing/single_table/README.md @@ -7,7 +7,7 @@ up using the code in this toolkit. ## Downloading data First, we need the data. 
Download it from this -[Google Drive link](https://drive.google.com/file/d/1J5qDuMHHg4dm9c3ISmb41tcTHSu1SVUC/view?usp=drive_link), +[Google Drive link](https://drive.google.com/file/d/1YbDRVn-fwfdcPnHj5eMhCa6A-YPiGnKr/view?usp=sharing), extract the files and place them in a `/data` folder within this folder (`examples/synthesizing/single_table`). diff --git a/examples/training/multi_table/README.md b/examples/training/multi_table/README.md index 31791112..fa374d50 100644 --- a/examples/training/multi_table/README.md +++ b/examples/training/multi_table/README.md @@ -7,7 +7,7 @@ code in this toolkit. ## Downloading data First, we need the data. Download it from this -[Google Drive link](https://drive.google.com/file/d/1Ao222l4AJjG54-HDEGCWkIfzRbl9_IKa/view?usp=drive_link), +[Google Drive link](https://drive.google.com/file/d/1x2yXw824sMUJb9WKUoTkcyfPfx3zS7We/view?usp=sharing), extract the files and place them in a `/data` folder in within this folder (`examples/training/multi_table`). diff --git a/examples/training/single_table/README.md b/examples/training/single_table/README.md index ac6fa12b..b274f733 100644 --- a/examples/training/single_table/README.md +++ b/examples/training/single_table/README.md @@ -7,7 +7,7 @@ code in this toolkit. ## Downloading data First, we need the data. Download it from this -[Google Drive link](https://drive.google.com/file/d/1J5qDuMHHg4dm9c3ISmb41tcTHSu1SVUC/view?usp=drive_link), +[Google Drive link](https://drive.google.com/file/d/1YbDRVn-fwfdcPnHj5eMhCa6A-YPiGnKr/view?usp=sharing), extract the files and place them in a `/data` folder in within this folder (`examples/training/single_table`). 
From 1d18580236f285df3b663102d41a4bf2a9ef8edb Mon Sep 17 00:00:00 2001 From: Marcelo Lotif Date: Thu, 8 Jan 2026 14:51:10 -0500 Subject: [PATCH 02/38] wip --- examples/gan/ensemble_attack/run.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/examples/gan/ensemble_attack/run.py b/examples/gan/ensemble_attack/run.py index 5c305737..f3f1da8a 100644 --- a/examples/gan/ensemble_attack/run.py +++ b/examples/gan/ensemble_attack/run.py @@ -1,15 +1,12 @@ import importlib -from pathlib import Path from logging import INFO +from pathlib import Path -from omegaconf import DictConfig import hydra +from omegaconf import DictConfig from midst_toolkit.common.logger import log from midst_toolkit.common.random import set_all_random_seeds -from midst_toolkit.attacks.ensemble.process_split_data import process_split_data -from examples.ensemble_attack.real_data_collection import collect_population_data_ensemble - @hydra.main(config_path="../", config_name="config", version_base=None) @@ -24,8 +21,9 @@ def main(config: DictConfig) -> None: Args: config: Attack configuration as an OmegaConf DictConfig object. 
""" + import ipdb - import ipdb;ipdb.set_trace() + ipdb.set_trace() if config.ensemble_attack.random_seed is not None: set_all_random_seeds(seed=config.ensemble_attack.random_seed) From e42e6307be477be8e388535a2d0dbe57c4ba7ef0 Mon Sep 17 00:00:00 2001 From: Marcelo Lotif Date: Tue, 13 Jan 2026 15:30:51 -0500 Subject: [PATCH 03/38] WIP moving forward with the ensemble attack code changes --- .../configs/experiment_config.yaml | 2 +- .../ensemble_attack/real_data_collection.py | 8 ++-- examples/ensemble_attack/run_attack.py | 8 ++-- .../run_shadow_model_training.py | 10 +++- examples/gan/config.yaml | 20 -------- examples/gan/ensemble_attack/README.md | 7 +++ examples/gan/ensemble_attack/config.yaml | 32 +++++++++++++ examples/gan/ensemble_attack/run.py | 34 +++++++++----- .../multi_table/run_synthesizing.py | 6 +-- .../single_table/run_synthesizing.py | 6 +-- examples/training/multi_table/run_training.py | 8 ++-- .../training/single_table/run_training.py | 4 +- .../attacks/ensemble/clavaddpm_fine_tuning.py | 10 ++-- .../ensemble/rmia/shadow_model_training.py | 21 ++++++--- .../attacks/ensemble/shadow_model_utils.py | 31 +++++++++--- src/midst_toolkit/common/config.py | 47 ++++++++++++++----- .../models/clavaddpm/clustering.py | 6 +-- .../models/clavaddpm/synthesizer.py | 8 ++-- src/midst_toolkit/models/clavaddpm/train.py | 16 +++---- .../ensemble/test_shadow_model_training.py | 4 +- .../models/clavaddpm/test_model.py | 8 ++-- .../models/clavaddpm/test_synthesizer.py | 20 ++++---- .../ensemble/test_shadow_model_utils.py | 4 +- 23 files changed, 204 insertions(+), 116 deletions(-) create mode 100644 examples/gan/ensemble_attack/README.md create mode 100644 examples/gan/ensemble_attack/config.yaml diff --git a/examples/ensemble_attack/configs/experiment_config.yaml b/examples/ensemble_attack/configs/experiment_config.yaml index 3803d69d..6cbc2d94 100644 --- a/examples/ensemble_attack/configs/experiment_config.yaml +++ 
b/examples/ensemble_attack/configs/experiment_config.yaml @@ -66,7 +66,7 @@ shadow_training: training_json_config_paths: # Config json files used for tabddpm training on the trans table table_domain_file_path: ${base_data_config_dir}/trans_domain.json dataset_meta_file_path: ${base_data_config_dir}/dataset_meta.json - tabddpm_training_config_path: ${base_data_config_dir}/trans.json + training_config_path: ${base_data_config_dir}/trans.json # Model training artifacts are saved under shadow_models_data_path/workspace_name/exp_name # Also, training configs for each shadow model are created under shadow_models_data_path. shadow_models_output_path: ${base_experiment_dir}/shadow_models_and_data diff --git a/examples/ensemble_attack/real_data_collection.py b/examples/ensemble_attack/real_data_collection.py index a7711347..57048b56 100644 --- a/examples/ensemble_attack/real_data_collection.py +++ b/examples/ensemble_attack/real_data_collection.py @@ -100,7 +100,7 @@ def collect_midst_attack_data( df_real = pd.DataFrame() for i in data_id: - data_path_ith = data_dir / attack_type.value / data_split / f"{generation_name}_{i}" + data_path_ith = data_dir / attack_type.value / data_split.value / f"{generation_name}_{i}" # Will raise FileNotFoundError if the file does not exist or if it is not a CSV file. 
df_real_ith = load_dataframe(data_path_ith, file_name) df_real = df_real_ith if df_real.empty else pd.concat([df_real, df_real_ith]) @@ -112,7 +112,7 @@ def collect_midst_attack_data( def collect_midst_data( midst_data_input_dir: Path, attack_types: list[AttackType], - data_splits: list[str], + data_splits: list[AttackDataSplit], dataset: AttackDataset, data_processing_config: DictConfig, ) -> pd.DataFrame: @@ -151,8 +151,8 @@ def collect_population_data_ensemble( midst_data_input_dir: Path, data_processing_config: DictConfig, save_dir: Path, - population_splits: list[str] | None = None, - challenge_splits: list[str] | None = None, + population_splits: list[AttackDataSplit] | None = None, + challenge_splits: list[AttackDataSplit] | None = None, ) -> pd.DataFrame: """ Collect the population data from the MIDST competition based on Ensemble Attack implementation. diff --git a/examples/ensemble_attack/run_attack.py b/examples/ensemble_attack/run_attack.py index 5592e08e..f90436cd 100644 --- a/examples/ensemble_attack/run_attack.py +++ b/examples/ensemble_attack/run_attack.py @@ -10,7 +10,7 @@ import hydra from omegaconf import DictConfig -from examples.ensemble_attack.real_data_collection import collect_population_data_ensemble +from examples.ensemble_attack.real_data_collection import AttackDataSplit, collect_population_data_ensemble from midst_toolkit.attacks.ensemble.process_split_data import process_split_data from midst_toolkit.common.logger import log from midst_toolkit.common.random import set_all_random_seeds @@ -25,12 +25,14 @@ def run_data_processing(config: DictConfig) -> None: """ log(INFO, "Running data processing pipeline...") # Collect the real data from the MIDST challenge resources. 
+ population_splits = [AttackDataSplit(split) for split in config.data_processing_config.population_splits] + challenge_splits = [AttackDataSplit(split) for split in config.data_processing_config.challenge_splits] population_data = collect_population_data_ensemble( midst_data_input_dir=Path(config.data_paths.midst_data_path), data_processing_config=config.data_processing_config, save_dir=Path(config.data_paths.population_path), - population_splits=config.data_processing_config.population_splits, - challenge_splits=config.data_processing_config.challenge_splits, + population_splits=population_splits, + challenge_splits=challenge_splits, ) # The following function saves the required dataframe splits in the specified processed_attack_data_path path. process_split_data( diff --git a/examples/ensemble_attack/run_shadow_model_training.py b/examples/ensemble_attack/run_shadow_model_training.py index e519f38e..17fff0e9 100644 --- a/examples/ensemble_attack/run_shadow_model_training.py +++ b/examples/ensemble_attack/run_shadow_model_training.py @@ -9,7 +9,8 @@ train_three_sets_of_shadow_models, ) from midst_toolkit.attacks.ensemble.shadow_model_utils import ( - save_additional_tabddpm_config, + ModelType, + save_additional_training_config, train_tabddpm_and_synthesize, ) from midst_toolkit.common.logger import log @@ -17,6 +18,7 @@ DEFAULT_TABLE_NAME = "trans" DEFAULT_ID_COLUMN_NAME = "trans_id" +DEFAULT_MODEL_TYPE = ModelType.TABDDPM def run_target_model_training(config: DictConfig) -> Path: @@ -55,7 +57,7 @@ def run_target_model_training(config: DictConfig) -> Path: target_training_json_config_paths.dataset_meta_file_path, target_folder / "dataset_meta.json", ) - configs, save_dir = save_additional_tabddpm_config( + configs, save_dir = save_additional_training_config( data_dir=target_folder, training_config_json_path=Path(target_training_json_config_paths.tabddpm_training_config_path), final_config_json_path=target_folder / f"{table_name}.json", # Path to the new json @@ 
-110,6 +112,9 @@ def run_shadow_model_training(config: DictConfig) -> list[Path]: table_name = config.table_name if "table_name" in config else DEFAULT_TABLE_NAME id_column_name = config.table_id_column_name if "table_id_column_name" in config else DEFAULT_ID_COLUMN_NAME + model_type = ( + ModelType(config.shadow_training.model_name) if "model_name" in config.shadow_training else DEFAULT_MODEL_TYPE + ) # Make sure master challenge train and population data have the "trans_id" column. assert id_column_name in df_master_challenge_train.columns, ( @@ -138,6 +143,7 @@ def run_shadow_model_training(config: DictConfig) -> list[Path]: n_reps=12, # Number of repetitions of challenge points in each shadow model training set. `12` based on the original code number_of_points_to_synthesize=config.shadow_training.number_of_points_to_synthesize, random_seed=config.random_seed, + model_type=model_type, ) log( INFO, diff --git a/examples/gan/config.yaml b/examples/gan/config.yaml index 5349b78b..946f1560 100644 --- a/examples/gan/config.yaml +++ b/examples/gan/config.yaml @@ -9,23 +9,3 @@ training: synthesizing: sample_size: 20000 - -ensemble_attack: - random_seed: null # Set this to a value if you want to set a random seed for reproducibility - table_name: "trans_id" - table_id_column_name: "trans_id" - - data_paths: - processed_attack_data_path: ${base_data_dir}/ensemble_attack - population_path: ${base_data_dir}/ensemble_attack - - shadow_training: - shadow_models_output_path: ${results_dir}/ensemble_attack/shadow_models - training_json_config_paths: # Config json files used for tabddpm training on the trans table - table_domain_file_path: ${base_data_dir}/trans_domain.json - dataset_meta_file_path: ${base_data_dir}/dataset_meta.json - tabddpm_training_config_path: ${base_data_dir}/trans.json - fine_tuning_config: - fine_tune_diffusion_iterations: 200000 - fine_tune_classifier_iterations: 20000 - pre_train_data_size: 60000 diff --git a/examples/gan/ensemble_attack/README.md 
b/examples/gan/ensemble_attack/README.md new file mode 100644 index 00000000..b42b940e --- /dev/null +++ b/examples/gan/ensemble_attack/README.md @@ -0,0 +1,7 @@ +# CTGAN Ensemble Attack Example + +To run, execute the following command from the root project folder: + +```bash +python -m examples.gan.ensemble_attack.run +``` diff --git a/examples/gan/ensemble_attack/config.yaml b/examples/gan/ensemble_attack/config.yaml new file mode 100644 index 00000000..4c45b8fb --- /dev/null +++ b/examples/gan/ensemble_attack/config.yaml @@ -0,0 +1,32 @@ +# Training example configuration +# Base data directory (can be overridden from command line) +base_data_dir: examples/gan/data +results_dir: examples/gan/results + +ensemble_attack: + random_seed: null # Set this to a value if you want to set a random seed for reproducibility + table_name: "trans" + table_id_column_name: "trans_id" + + data_paths: + processed_attack_data_path: ${base_data_dir}/ensemble_attack + population_path: ${base_data_dir}/ensemble_attack + + shadow_training: + model_name: ctgan + model_config: # Configurations specific for the CTGAN model + training: + epochs: 300 + verbose: True + synthesizing: + sample_size: 20000 + shadow_models_output_path: ${results_dir}/ensemble_attack/shadow_models + training_json_config_paths: # Config json files used for tabddpm training on the trans table + table_domain_file_path: ${base_data_dir}/trans_domain.json + dataset_meta_file_path: ${base_data_dir}/dataset_meta.json + training_config_path: ${base_data_dir}/trans.json # if this is not present, it will be created by copying the example config + fine_tuning_config: + fine_tune_diffusion_iterations: 200000 + fine_tune_classifier_iterations: 20000 + pre_train_data_size: 60000 + number_of_points_to_synthesize: 20000 # Number of synthetic data samples to be generated by shadow models. 
diff --git a/examples/gan/ensemble_attack/run.py b/examples/gan/ensemble_attack/run.py index f3f1da8a..f1834f0d 100644 --- a/examples/gan/ensemble_attack/run.py +++ b/examples/gan/ensemble_attack/run.py @@ -1,15 +1,17 @@ import importlib +import json from logging import INFO from pathlib import Path import hydra -from omegaconf import DictConfig +from omegaconf import DictConfig, OmegaConf +from examples.ensemble_attack.run_shadow_model_training import run_shadow_model_training, run_target_model_training from midst_toolkit.common.logger import log from midst_toolkit.common.random import set_all_random_seeds -@hydra.main(config_path="../", config_name="config", version_base=None) +@hydra.main(config_path="./", config_name="config", version_base=None) def main(config: DictConfig) -> None: """ Run the Ensemble Attack pipeline with the CTGAN model. @@ -21,22 +23,30 @@ def main(config: DictConfig) -> None: Args: config: Attack configuration as an OmegaConf DictConfig object. """ - import ipdb - - ipdb.set_trace() - if config.ensemble_attack.random_seed is not None: set_all_random_seeds(seed=config.ensemble_attack.random_seed) log(INFO, f"Training phase random seed set to {config.ensemble_attack.random_seed}.") - # Note: Importing the following two modules causes a segmentation fault error if imported together in this file. - # A quick solution is to load modules dynamically if any of the pipelines is called. - # TODO: Investigate the source of error. 
- shadow_pipeline = importlib.import_module("examples.ensemble_attack.run_shadow_model_training") - shadow_data_paths = shadow_pipeline.run_shadow_model_training(config.ensemble_attack) + # Saving the model config from the config.yaml into a json file + # because that's what the ensemble attack code will be looking for + training_config_path = Path(config.ensemble_attack.shadow_training.training_json_config_paths.training_config_path) + training_config_path.unlink(missing_ok=True) + with open(training_config_path, "w") as f: + training_config = OmegaConf.to_container(config.ensemble_attack.shadow_training.model_config) + training_config["general"] = { + "test_data_dir": config.base_data_dir, + "sample_prefix": "ctgan", + # The values below will be overriden + "exp_name": "", + "data_dir": "", + "workspace_dir": "", + } + json.dump(training_config, f) + + shadow_data_paths = run_shadow_model_training(config.ensemble_attack) shadow_data_paths = [Path(path) for path in shadow_data_paths] - target_model_synthetic_path = shadow_pipeline.run_target_model_training(config) + target_model_synthetic_path = run_target_model_training(config) if config.pipeline.run_metaclassifier_training: if not config.pipeline.run_shadow_model_training: diff --git a/examples/synthesizing/multi_table/run_synthesizing.py b/examples/synthesizing/multi_table/run_synthesizing.py index 9d845e2e..ba6916f4 100644 --- a/examples/synthesizing/multi_table/run_synthesizing.py +++ b/examples/synthesizing/multi_table/run_synthesizing.py @@ -7,7 +7,7 @@ from omegaconf import DictConfig from examples.training.multi_table import run_training -from midst_toolkit.common.config import GeneralConfig, MatchingConfig, SamplingConfig +from midst_toolkit.common.config import ClavaDDPMMatchingConfig, ClavaDDPMSamplingConfig, GeneralConfig from midst_toolkit.common.logger import TOOLKIT_LOGGER, log from midst_toolkit.models.clavaddpm.data_loaders import load_tables from midst_toolkit.models.clavaddpm.enumerations 
import Relation @@ -76,8 +76,8 @@ def main(config: DictConfig) -> None: Path(config.results_dir), models, GeneralConfig(**config.general_config), - SamplingConfig(**config.sampling_config), - MatchingConfig(**config.matching_config), + ClavaDDPMSamplingConfig(**config.sampling_config), + ClavaDDPMMatchingConfig(**config.matching_config), all_group_lengths_prob_dicts, ) diff --git a/examples/synthesizing/single_table/run_synthesizing.py b/examples/synthesizing/single_table/run_synthesizing.py index b9f6a649..fd5341f3 100644 --- a/examples/synthesizing/single_table/run_synthesizing.py +++ b/examples/synthesizing/single_table/run_synthesizing.py @@ -7,7 +7,7 @@ from omegaconf import DictConfig from examples.training.single_table import run_training -from midst_toolkit.common.config import GeneralConfig, MatchingConfig, SamplingConfig +from midst_toolkit.common.config import ClavaDDPMMatchingConfig, ClavaDDPMSamplingConfig, GeneralConfig from midst_toolkit.common.logger import TOOLKIT_LOGGER, log from midst_toolkit.models.clavaddpm.data_loaders import load_tables from midst_toolkit.models.clavaddpm.enumerations import Relation @@ -73,8 +73,8 @@ def main(config: DictConfig) -> None: Path(config.results_dir), models, GeneralConfig(**config.general_config), - SamplingConfig(**config.sampling_config), - MatchingConfig(**config.matching_config), + ClavaDDPMSamplingConfig(**config.sampling_config), + ClavaDDPMMatchingConfig(**config.matching_config), ) log(INFO, "Data synthesized successfully.") diff --git a/examples/training/multi_table/run_training.py b/examples/training/multi_table/run_training.py index 6d5548a5..fe427bb0 100644 --- a/examples/training/multi_table/run_training.py +++ b/examples/training/multi_table/run_training.py @@ -5,7 +5,7 @@ import hydra from omegaconf import DictConfig -from midst_toolkit.common.config import ClassifierConfig, ClusteringConfig, DiffusionConfig +from midst_toolkit.common.config import ClavaDDPMClassifierConfig, 
ClavaDDPMClusteringConfig, ClavaDDPMDiffusionConfig from midst_toolkit.common.logger import TOOLKIT_LOGGER, log from midst_toolkit.common.variables import DEVICE from midst_toolkit.models.clavaddpm.clustering import clava_clustering @@ -32,12 +32,12 @@ def main(config: DictConfig) -> None: tables, relation_order, _ = load_tables(Path(config.base_data_dir)) log(INFO, "Clustering data...") - clustering_config = ClusteringConfig(**config.clustering_config) + clustering_config = ClavaDDPMClusteringConfig(**config.clustering_config) tables, _ = clava_clustering(tables, relation_order, Path(config.results_dir), clustering_config) log(INFO, "Training model...") - diffusion_config = DiffusionConfig(**config.diffusion_config) - classifier_config = ClassifierConfig(**config.classifier_config) + diffusion_config = ClavaDDPMDiffusionConfig(**config.diffusion_config) + classifier_config = ClavaDDPMClassifierConfig(**config.classifier_config) tables, _ = clava_training( tables, diff --git a/examples/training/single_table/run_training.py b/examples/training/single_table/run_training.py index 74897db7..886b6b9f 100644 --- a/examples/training/single_table/run_training.py +++ b/examples/training/single_table/run_training.py @@ -5,7 +5,7 @@ import hydra from omegaconf import DictConfig -from midst_toolkit.common.config import DiffusionConfig +from midst_toolkit.common.config import ClavaDDPMDiffusionConfig from midst_toolkit.common.logger import TOOLKIT_LOGGER, log from midst_toolkit.common.variables import DEVICE from midst_toolkit.models.clavaddpm.data_loaders import load_tables @@ -31,7 +31,7 @@ def main(config: DictConfig) -> None: tables, relation_order, _ = load_tables(Path(config.base_data_dir)) log(INFO, "Training model...") - diffusion_config = DiffusionConfig(**config.diffusion_config) + diffusion_config = ClavaDDPMDiffusionConfig(**config.diffusion_config) tables, _ = clava_training( tables, diff --git a/src/midst_toolkit/attacks/ensemble/clavaddpm_fine_tuning.py 
b/src/midst_toolkit/attacks/ensemble/clavaddpm_fine_tuning.py index f4ec8aa2..51cc91c7 100644 --- a/src/midst_toolkit/attacks/ensemble/clavaddpm_fine_tuning.py +++ b/src/midst_toolkit/attacks/ensemble/clavaddpm_fine_tuning.py @@ -12,7 +12,7 @@ import torch from torch import optim -from midst_toolkit.common.config import ClassifierConfig, DiffusionConfig +from midst_toolkit.common.config import ClavaDDPMClassifierConfig, ClavaDDPMDiffusionConfig from midst_toolkit.common.enumerations import DataSplit from midst_toolkit.common.logger import KeyValueLogger, log from midst_toolkit.common.variables import DEVICE @@ -246,8 +246,8 @@ def child_fine_tuning( child_domain_dict: dict[str, Any], parent_name: str | None, child_name: str, - diffusion_config: DiffusionConfig, - classifier_config: ClassifierConfig | None, + diffusion_config: ClavaDDPMDiffusionConfig, + classifier_config: ClavaDDPMClassifierConfig | None, fine_tuning_diffusion_iterations: int, fine_tuning_classifier_iterations: int, device: torch.device = DEVICE, @@ -343,8 +343,8 @@ def clava_fine_tuning( trained_models: dict[Relation, ModelArtifacts], new_tables: Tables, relation_order: RelationOrder, - diffusion_config: DiffusionConfig, - classifier_config: ClassifierConfig, + diffusion_config: ClavaDDPMDiffusionConfig, + classifier_config: ClavaDDPMClassifierConfig, fine_tuning_diffusion_iterations: int, fine_tuning_classifier_iterations: int, ) -> dict[Relation, ModelArtifacts]: diff --git a/src/midst_toolkit/attacks/ensemble/rmia/shadow_model_training.py b/src/midst_toolkit/attacks/ensemble/rmia/shadow_model_training.py index 672d2ec8..c286119d 100644 --- a/src/midst_toolkit/attacks/ensemble/rmia/shadow_model_training.py +++ b/src/midst_toolkit/attacks/ensemble/rmia/shadow_model_training.py @@ -9,8 +9,9 @@ from omegaconf import DictConfig from midst_toolkit.attacks.ensemble.shadow_model_utils import ( + ModelType, fine_tune_tabddpm_and_synthesize, - save_additional_tabddpm_config, + 
save_additional_training_config, train_tabddpm_and_synthesize, ) from midst_toolkit.common.logger import log @@ -32,6 +33,7 @@ def train_fine_tuned_shadow_models( number_of_points_to_synthesize: int = 20000, init_data_seed: int | None = None, random_seed: int | None = None, + model_type: ModelType = ModelType.TABDDPM, ) -> Path: """ Train ``n_models`` shadow models that start from a pre-trained TabDDPM model and are fine-tuned on @@ -76,6 +78,7 @@ def train_fine_tuned_shadow_models( defaults to 20,000. init_data_seed: Random seed for the initial training set. random_seed: Random seed used for reproducibility, defaults to None. + model_type: Type of model to be used for training the shadow models. Defaults to ModelType.TABDDPM. Returns: The path where the shadow models and their artifacts are saved. @@ -112,14 +115,15 @@ def train_fine_tuned_shadow_models( ) # Train initial model with 60K data without any challenge points - # ``save_additional_tabddpm_config`` makes a personalized copy of the training config for each - # tabddpm model (here the base model). + # ``save_additional_training_config`` makes a personalized copy of the training config for each + # training model (here the base model). # All the shadow models will be saved under the base model data directory. - configs, save_dir = save_additional_tabddpm_config( + configs, save_dir = save_additional_training_config( data_dir=shadow_model_data_folder, - training_config_json_path=Path(training_json_config_paths.tabddpm_training_config_path), + training_config_json_path=Path(training_json_config_paths.training_config_path), final_config_json_path=shadow_model_data_folder / f"{table_name}.json", # Path to the new json experiment_name="pre_trained_model", + model_type=model_type, ) # Train the initial model if it is not already trained and saved. 
@@ -259,7 +263,7 @@ def train_shadow_on_half_challenge_data( training_json_config_paths.dataset_meta_file_path, shadow_folder / "dataset_meta.json", ) - configs, save_dir = save_additional_tabddpm_config( + configs, save_dir = save_additional_training_config( data_dir=shadow_folder, training_config_json_path=Path(training_json_config_paths.tabddpm_training_config_path), final_config_json_path=shadow_folder / f"{table_name}.json", # Path to the new json @@ -318,6 +322,7 @@ def train_three_sets_of_shadow_models( n_reps: int = 12, number_of_points_to_synthesize: int = 20000, random_seed: int | None = None, + model_type: ModelType = ModelType.TABDDPM, ) -> tuple[Path, Path, Path]: """ Runs the shadow model training pipeline of the ensemble attack. This pipeline trains three sets of shadow models. @@ -367,6 +372,7 @@ def train_three_sets_of_shadow_models( number_of_points_to_synthesize: Size of the synthetic data to be generated by each shadow model, defaults to 20,000. random_seed: Random seed used for reproducibility, defaults to None. + model_type: Type of model to be used for training the shadow models. Defaults to ModelType.TABDDPM. 
Returns: Paths where the shadow models and their artifacts including synthetic data are saved for each of @@ -392,6 +398,7 @@ def train_three_sets_of_shadow_models( number_of_points_to_synthesize=number_of_points_to_synthesize, init_data_seed=random_seed, random_seed=random_seed, + model_type=model_type, ) log( INFO, @@ -416,6 +423,7 @@ def train_three_sets_of_shadow_models( # Setting a different seed for the second train set init_data_seed=random_seed + 1 if random_seed is not None else None, random_seed=random_seed, + model_type=model_type, ) log( INFO, @@ -433,6 +441,7 @@ def train_three_sets_of_shadow_models( id_column_name=id_column_name, number_of_points_to_synthesize=number_of_points_to_synthesize, random_seed=random_seed, + model_type=model_type, ) log( INFO, diff --git a/src/midst_toolkit/attacks/ensemble/shadow_model_utils.py b/src/midst_toolkit/attacks/ensemble/shadow_model_utils.py index f1693c8e..52b0481e 100644 --- a/src/midst_toolkit/attacks/ensemble/shadow_model_utils.py +++ b/src/midst_toolkit/attacks/ensemble/shadow_model_utils.py @@ -2,13 +2,14 @@ import json import os from dataclasses import dataclass +from enum import Enum from logging import INFO from pathlib import Path import pandas as pd from midst_toolkit.attacks.ensemble.clavaddpm_fine_tuning import clava_fine_tuning -from midst_toolkit.common.config import TrainingConfig +from midst_toolkit.common.config import ClavaDDPMTrainingConfig, CTGANTrainingConfig, GeneralConfig, TrainingConfig from midst_toolkit.common.logger import log from midst_toolkit.common.variables import DEVICE from midst_toolkit.models.clavaddpm.clustering import clava_clustering @@ -22,10 +23,15 @@ from midst_toolkit.models.clavaddpm.train import ModelArtifacts, clava_training +class ModelType(Enum): + TABDDPM = "tabddpm" + CTGAN = "ctgan" + + @dataclass class TrainingResult: save_dir: Path - configs: TrainingConfig + configs: ClavaDDPMTrainingConfig tables: Tables relation_order: RelationOrder 
all_group_lengths_probabilities: GroupLengthsProbDicts @@ -33,12 +39,13 @@ class TrainingResult: synthetic_data: pd.DataFrame | None = None -def save_additional_tabddpm_config( +def save_additional_training_config( data_dir: Path, training_config_json_path: Path, final_config_json_path: Path, experiment_name: str = "attack_experiment", workspace_name: str = "shadow_workspace", + model_type: ModelType = ModelType.TABDDPM, ) -> tuple[TrainingConfig, Path]: """ Modifies a TabDDPM configuration JSON file with the specified data directory, experiment name and workspace name, @@ -50,14 +57,24 @@ def save_additional_tabddpm_config( final_config_json_path: Path where the modified configuration JSON file will be saved. experiment_name: Name of the experiment, used to create a unique save directory. workspace_name: Name of the workspace, used to create a unique save directory. + model_type: Type of model to be used for training the shadow models. Defaults to ModelType.TABDDPM. Returns: - configs: Loaded configuration dictionary for TabDDPM. + configs: Loaded configuration dictionary for the model type. save_dir: Directory path where results will be saved. """ # Modify the config file to give the correct training data and saving directory with open(training_config_json_path, "r") as file: - configs = TrainingConfig(**json.load(file)) + configs: TrainingConfig + if model_type == ModelType.TABDDPM: + configs = ClavaDDPMTrainingConfig(**json.load(file)) + elif model_type == ModelType.CTGAN: + configs = CTGANTrainingConfig(**json.load(file)) + else: + raise ValueError(f"Invalid model type: {model_type}") + + if configs.general is None: + configs.general = GeneralConfig() configs.general.data_dir = data_dir # Save dir is set by joining the workspace_dir and exp_name @@ -79,7 +96,7 @@ def save_additional_tabddpm_config( # TODO: This and the next function should be unified later. 
def train_tabddpm_and_synthesize( train_set: pd.DataFrame, - configs: TrainingConfig, + configs: ClavaDDPMTrainingConfig, save_dir: Path, synthesize: bool = True, number_of_points_to_synthesize: int = 20000, @@ -158,7 +175,7 @@ def train_tabddpm_and_synthesize( def fine_tune_tabddpm_and_synthesize( trained_models: dict[Relation, ModelArtifacts], fine_tune_set: pd.DataFrame, - configs: TrainingConfig, + configs: ClavaDDPMTrainingConfig, save_dir: Path, fine_tuning_diffusion_iterations: int = 100, fine_tuning_classifier_iterations: int = 10, diff --git a/src/midst_toolkit/common/config.py b/src/midst_toolkit/common/config.py index 2f5974f1..268ab567 100644 --- a/src/midst_toolkit/common/config.py +++ b/src/midst_toolkit/common/config.py @@ -18,7 +18,7 @@ class GeneralConfig(BaseModel): sample_prefix: str -class ClusteringConfig(BaseModel): +class ClavaDDPMClusteringConfig(BaseModel): """Configuration for the trainer's clustering model.""" num_clusters: int | dict[str, int] @@ -26,7 +26,7 @@ class ClusteringConfig(BaseModel): parent_scale: float -class DiffusionConfig(BaseModel): +class ClavaDDPMDiffusionConfig(BaseModel): """Configuration for the trainer's diffusion model.""" d_layers: list[int] @@ -49,7 +49,7 @@ def validate_data_split_ratios(self) -> Self: return self -class ClassifierConfig(BaseModel): +class ClavaDDPMClassifierConfig(BaseModel): """Configuration for the trainer's classifier model.""" d_layers: list[int] @@ -67,14 +67,14 @@ def validate_data_split_ratios(self) -> Self: return self -class SamplingConfig(BaseModel): +class ClavaDDPMSamplingConfig(BaseModel): """Configuration for the synthesizer's sampling process.""" batch_size: int classifier_scale: float -class MatchingConfig(BaseModel): +class ClavaDDPMMatchingConfig(BaseModel): """Configuration for the synthesizer's matching process.""" num_matching_clusters: int @@ -83,14 +83,39 @@ class MatchingConfig(BaseModel): no_matching: bool +class CTGANModelConfig(BaseModel): + """Configuration for the 
CTGAN model.""" + + epochs: int + verbose: bool + + +class CTGANSynthesizingConfig(BaseModel): + """Configuration for the CTGAN model.""" + + sample_size: int + + class TrainingConfig(BaseModel): - """All configuration settings for training, synthesizing, and fine tuning.""" + """Base configuration settings for training models.""" model_config = ConfigDict(extra="forbid") # disallow extra fields from config files general: GeneralConfig - clustering: ClusteringConfig - diffusion: DiffusionConfig - classifier: ClassifierConfig - sampling: SamplingConfig - matching: MatchingConfig + + +class ClavaDDPMTrainingConfig(TrainingConfig): + """All configuration settings for training, synthesizing, and fine tuning TabDDPM models.""" + + clustering: ClavaDDPMClusteringConfig + diffusion: ClavaDDPMDiffusionConfig + classifier: ClavaDDPMClassifierConfig + sampling: ClavaDDPMSamplingConfig + matching: ClavaDDPMMatchingConfig + + +class CTGANTrainingConfig(TrainingConfig): + """All configuration settings for training, synthesizing, and fine tuning CTGAN models.""" + + training: CTGANModelConfig + synthesizing: CTGANSynthesizingConfig diff --git a/src/midst_toolkit/models/clavaddpm/clustering.py b/src/midst_toolkit/models/clavaddpm/clustering.py index b841ec6d..98fd2887 100644 --- a/src/midst_toolkit/models/clavaddpm/clustering.py +++ b/src/midst_toolkit/models/clavaddpm/clustering.py @@ -13,7 +13,7 @@ from sklearn.mixture import BayesianGaussianMixture, GaussianMixture from sklearn.preprocessing import LabelEncoder, MinMaxScaler, OneHotEncoder, QuantileTransformer -from midst_toolkit.common.config import ClusteringConfig +from midst_toolkit.common.config import ClavaDDPMClusteringConfig from midst_toolkit.common.enumerations import DomainDataType from midst_toolkit.common.logger import log from midst_toolkit.models.clavaddpm.data_loaders import NO_PARENT_COLUMN_NAME, Tables @@ -29,7 +29,7 @@ def clava_clustering( tables: Tables, relation_order: RelationOrder, save_dir: Path, - 
configs: ClusteringConfig, + configs: ClavaDDPMClusteringConfig, ) -> tuple[dict[str, Any], GroupLengthsProbDicts]: """ Clustering function for the multi-table function of the ClavaDDPM model. @@ -96,7 +96,7 @@ def _load_clustering_info_from_checkpoint(save_dir: Path) -> dict[str, Any] | No def _run_clustering( tables: Tables, relation_order: RelationOrder, - configs: ClusteringConfig, + configs: ClavaDDPMClusteringConfig, ) -> tuple[Tables, GroupLengthsProbDicts]: """ Run the clustering process. diff --git a/src/midst_toolkit/models/clavaddpm/synthesizer.py b/src/midst_toolkit/models/clavaddpm/synthesizer.py index 44741bcd..3ed4f6fa 100644 --- a/src/midst_toolkit/models/clavaddpm/synthesizer.py +++ b/src/midst_toolkit/models/clavaddpm/synthesizer.py @@ -14,7 +14,7 @@ from torch.nn import functional from tqdm import tqdm -from midst_toolkit.common.config import GeneralConfig, MatchingConfig, SamplingConfig +from midst_toolkit.common.config import ClavaDDPMMatchingConfig, ClavaDDPMSamplingConfig, GeneralConfig from midst_toolkit.common.enumerations import DataSplit from midst_toolkit.common.logger import log from midst_toolkit.models.clavaddpm.data_loaders import NO_PARENT_COLUMN_NAME, Tables @@ -675,7 +675,7 @@ def clava_synthesizing_matching_process( synthetic_tables: dict[Relation, dict[str, Any]], tables: Tables, relation_order: RelationOrder, - matching_config: MatchingConfig, + matching_config: ClavaDDPMMatchingConfig, ) -> dict[str, pd.DataFrame]: """ Matches synthetic child tables to synthetic parent tables based on clustering information. 
@@ -713,8 +713,8 @@ def clava_synthesizing( save_dir: Path, models: dict[Relation, ModelArtifacts], general_config: GeneralConfig, - sampling_config: SamplingConfig, - matching_config: MatchingConfig, + sampling_config: ClavaDDPMSamplingConfig, + matching_config: ClavaDDPMMatchingConfig, all_group_lengths_prob_dicts: GroupLengthsProbDicts | None = None, sample_scale: float = 1.0, ) -> tuple[dict[str, pd.DataFrame], float, float]: diff --git a/src/midst_toolkit/models/clavaddpm/train.py b/src/midst_toolkit/models/clavaddpm/train.py index 3df3398a..6b854ca0 100644 --- a/src/midst_toolkit/models/clavaddpm/train.py +++ b/src/midst_toolkit/models/clavaddpm/train.py @@ -13,7 +13,7 @@ from sklearn.preprocessing import LabelEncoder from torch import Tensor, optim -from midst_toolkit.common.config import ClassifierConfig, DiffusionConfig +from midst_toolkit.common.config import ClavaDDPMClassifierConfig, ClavaDDPMDiffusionConfig from midst_toolkit.common.enumerations import DataSplit, DomainDataType, TaskType from midst_toolkit.common.logger import KeyValueLogger, log from midst_toolkit.common.variables import DEVICE @@ -58,8 +58,8 @@ def clava_training( tables: Tables, relation_order: RelationOrder, save_dir: Path, - diffusion_config: DiffusionConfig, - classifier_config: ClassifierConfig | None = None, + diffusion_config: ClavaDDPMDiffusionConfig, + classifier_config: ClavaDDPMClassifierConfig | None = None, device: torch.device = DEVICE, ) -> tuple[Tables, dict[Relation, ModelArtifacts]]: """ @@ -123,8 +123,8 @@ def child_training( child_domain: dict[str, Any], parent_name: str | None, child_name: str, - diffusion_config: DiffusionConfig, - classifier_config: ClassifierConfig | None = None, + diffusion_config: ClavaDDPMDiffusionConfig, + classifier_config: ClavaDDPMClassifierConfig | None = None, device: torch.device = DEVICE, ) -> ModelArtifacts: """ @@ -205,7 +205,7 @@ def train_model( table_metadata: TableMetadata, model_params: ModelParameters, transformations: 
Transformations, - diffusion_config: DiffusionConfig, + diffusion_config: ClavaDDPMDiffusionConfig, device: torch.device = DEVICE, ) -> ModelArtifacts: """ @@ -299,8 +299,8 @@ def train_classifier( table_metadata: TableMetadata, model_params: ModelParameters, transformations: Transformations, - diffusion_config: DiffusionConfig, - classifier_config: ClassifierConfig, + diffusion_config: ClavaDDPMDiffusionConfig, + classifier_config: ClavaDDPMClassifierConfig, device: torch.device = DEVICE, cluster_col: str = "cluster", classifier_evaluation_interval: int = 5, diff --git a/tests/integration/attacks/ensemble/test_shadow_model_training.py b/tests/integration/attacks/ensemble/test_shadow_model_training.py index 354053c3..41aaa38e 100644 --- a/tests/integration/attacks/ensemble/test_shadow_model_training.py +++ b/tests/integration/attacks/ensemble/test_shadow_model_training.py @@ -16,7 +16,7 @@ from midst_toolkit.attacks.ensemble.shadow_model_utils import ( TrainingResult, fine_tune_tabddpm_and_synthesize, - save_additional_tabddpm_config, + save_additional_training_config, train_tabddpm_and_synthesize, ) @@ -151,7 +151,7 @@ def test_train_and_fine_tune_tabddpm(cfg: DictConfig, tmp_path: Path) -> None: cfg.shadow_training.training_json_config_paths.dataset_meta_file_path, tmp_training_dir / "dataset_meta.json", ) - configs, save_dir = save_additional_tabddpm_config( + configs, save_dir = save_additional_training_config( data_dir=tmp_training_dir, training_config_json_path=tabddpm_config_path, final_config_json_path=tmp_training_dir / "trans.json", diff --git a/tests/integration/models/clavaddpm/test_model.py b/tests/integration/models/clavaddpm/test_model.py index 43689ff0..aec00323 100644 --- a/tests/integration/models/clavaddpm/test_model.py +++ b/tests/integration/models/clavaddpm/test_model.py @@ -10,7 +10,7 @@ import torch from torch.nn import functional -from midst_toolkit.common.config import ClassifierConfig, ClusteringConfig, DiffusionConfig +from 
midst_toolkit.common.config import ClavaDDPMClassifierConfig, ClavaDDPMClusteringConfig, ClavaDDPMDiffusionConfig from midst_toolkit.common.logger import log from midst_toolkit.common.random import set_all_random_seeds, unset_all_random_seeds from midst_toolkit.common.variables import DEVICE @@ -33,13 +33,13 @@ from tests.integration.utils import is_running_on_ci_environment -CLUSTERING_CONFIG = ClusteringConfig( +CLUSTERING_CONFIG = ClavaDDPMClusteringConfig( parent_scale=1.0, num_clusters=3, clustering_method=ClusteringMethod.KMEANS_AND_GMM, ) -DIFFUSION_CONFIG = DiffusionConfig( +DIFFUSION_CONFIG = ClavaDDPMDiffusionConfig( d_layers=[512, 1024, 1024, 1024, 1024, 512], dropout=0.0, num_timesteps=100, @@ -53,7 +53,7 @@ data_split_ratios=[0.99, 0.005, 0.005], ) -CLASSIFIER_CONFIG = ClassifierConfig( +CLASSIFIER_CONFIG = ClavaDDPMClassifierConfig( d_layers=[128, 256, 512, 1024, 512, 256, 128], lr=0.0001, dim_t=128, diff --git a/tests/integration/models/clavaddpm/test_synthesizer.py b/tests/integration/models/clavaddpm/test_synthesizer.py index ae88477e..46c66030 100644 --- a/tests/integration/models/clavaddpm/test_synthesizer.py +++ b/tests/integration/models/clavaddpm/test_synthesizer.py @@ -5,12 +5,12 @@ import pytest from midst_toolkit.common.config import ( - ClassifierConfig, - ClusteringConfig, - DiffusionConfig, + ClavaDDPMClassifierConfig, + ClavaDDPMClusteringConfig, + ClavaDDPMDiffusionConfig, + ClavaDDPMMatchingConfig, + ClavaDDPMSamplingConfig, GeneralConfig, - MatchingConfig, - SamplingConfig, ) from midst_toolkit.common.logger import log from midst_toolkit.common.random import set_all_random_seeds, unset_all_random_seeds @@ -25,13 +25,13 @@ from tests.integration.utils import is_running_on_ci_environment -CLUSTERING_CONFIG = ClusteringConfig( +CLUSTERING_CONFIG = ClavaDDPMClusteringConfig( parent_scale=1.0, num_clusters=3, clustering_method=ClusteringMethod.KMEANS_AND_GMM, ) -DIFFUSION_CONFIG = DiffusionConfig( +DIFFUSION_CONFIG = 
ClavaDDPMDiffusionConfig( d_layers=[512, 1024, 1024, 1024, 1024, 512], dropout=0.0, num_timesteps=100, @@ -45,7 +45,7 @@ data_split_ratios=[0.99, 0.005, 0.005], ) -CLASSIFIER_CONFIG = ClassifierConfig( +CLASSIFIER_CONFIG = ClavaDDPMClassifierConfig( d_layers=[128, 256, 512, 1024, 512, 256, 128], lr=0.0001, dim_t=128, @@ -62,12 +62,12 @@ sample_prefix="", ) -SAMPLING_CONFIG = SamplingConfig( +SAMPLING_CONFIG = ClavaDDPMSamplingConfig( batch_size=2, classifier_scale=1.0, ) -MATCHING_CONFIG = MatchingConfig( +MATCHING_CONFIG = ClavaDDPMMatchingConfig( num_matching_clusters=1, matching_batch_size=1, unique_matching=True, diff --git a/tests/unit/attacks/ensemble/test_shadow_model_utils.py b/tests/unit/attacks/ensemble/test_shadow_model_utils.py index 7222b3ff..a5f290b1 100644 --- a/tests/unit/attacks/ensemble/test_shadow_model_utils.py +++ b/tests/unit/attacks/ensemble/test_shadow_model_utils.py @@ -6,7 +6,7 @@ from omegaconf import DictConfig from midst_toolkit.attacks.ensemble.shadow_model_utils import ( - save_additional_tabddpm_config, + save_additional_training_config, ) @@ -33,7 +33,7 @@ def test_save_additional_tabddpm_config(cfg: DictConfig, tmp_path: Path) -> None new_experiment_name = "test_experiment" final_json_path = tmp_path / "modified_config.json" - configs, save_dir = save_additional_tabddpm_config( + configs, save_dir = save_additional_training_config( data_dir=new_data_dir, training_config_json_path=tabddpm_config_path, final_config_json_path=final_json_path, From a46a01074d714fe91bdb7c13e487ad669c33c797 Mon Sep 17 00:00:00 2001 From: Marcelo Lotif Date: Tue, 13 Jan 2026 17:46:07 -0500 Subject: [PATCH 04/38] WIP adding training and sythesizing code --- .../run_shadow_model_training.py | 11 ++- examples/training/multi_table/run_training.py | 4 +- .../training/single_table/run_training.py | 4 +- .../attacks/ensemble/clavaddpm_fine_tuning.py | 14 +-- .../ensemble/rmia/shadow_model_training.py | 30 +++++- .../attacks/ensemble/shadow_model_utils.py | 99 
++++++++++++++++--- .../models/clavaddpm/clustering.py | 2 +- .../models/clavaddpm/enumerations.py | 2 +- .../models/clavaddpm/synthesizer.py | 10 +- src/midst_toolkit/models/clavaddpm/train.py | 22 ++++- .../ensemble/test_shadow_model_training.py | 16 ++- 11 files changed, 164 insertions(+), 50 deletions(-) diff --git a/examples/ensemble_attack/run_shadow_model_training.py b/examples/ensemble_attack/run_shadow_model_training.py index 17fff0e9..6086ea65 100644 --- a/examples/ensemble_attack/run_shadow_model_training.py +++ b/examples/ensemble_attack/run_shadow_model_training.py @@ -1,6 +1,7 @@ import shutil from logging import INFO from pathlib import Path +from typing import cast from omegaconf import DictConfig @@ -13,6 +14,7 @@ save_additional_training_config, train_tabddpm_and_synthesize, ) +from midst_toolkit.common.config import ClavaDDPMTrainingConfig from midst_toolkit.common.logger import log @@ -66,7 +68,7 @@ def run_target_model_training(config: DictConfig) -> Path: train_result = train_tabddpm_and_synthesize( train_set=df_real_data, - configs=configs, + configs=cast(ClavaDDPMTrainingConfig, configs), save_dir=save_dir, synthesize=True, number_of_points_to_synthesize=config.shadow_training.number_of_points_to_synthesize, @@ -112,9 +114,10 @@ def run_shadow_model_training(config: DictConfig) -> list[Path]: table_name = config.table_name if "table_name" in config else DEFAULT_TABLE_NAME id_column_name = config.table_id_column_name if "table_id_column_name" in config else DEFAULT_ID_COLUMN_NAME - model_type = ( - ModelType(config.shadow_training.model_name) if "model_name" in config.shadow_training else DEFAULT_MODEL_TYPE - ) + model_type = DEFAULT_MODEL_TYPE + if "model_name" in config.shadow_training: + model_type = ModelType(config.shadow_training.model_name) + log(INFO, f"Training shadow models with model type: {model_type.value}") # Make sure master challenge train and population data have the "trans_id" column. 
assert id_column_name in df_master_challenge_train.columns, ( diff --git a/examples/training/multi_table/run_training.py b/examples/training/multi_table/run_training.py index fe427bb0..095f52f5 100644 --- a/examples/training/multi_table/run_training.py +++ b/examples/training/multi_table/run_training.py @@ -10,7 +10,7 @@ from midst_toolkit.common.variables import DEVICE from midst_toolkit.models.clavaddpm.clustering import clava_clustering from midst_toolkit.models.clavaddpm.data_loaders import Table, load_tables -from midst_toolkit.models.clavaddpm.train import ModelArtifacts, clava_training +from midst_toolkit.models.clavaddpm.train import ClavaDDPMModelArtifacts, clava_training # Preventing some excessive logging @@ -65,7 +65,7 @@ def main(config: DictConfig) -> None: result = pickle.load(f) # Asserting the results are the correct type - assert isinstance(result, ModelArtifacts) + assert isinstance(result, ClavaDDPMModelArtifacts) log(INFO, f"Result size (in bytes): {results_file.stat().st_size}") diff --git a/examples/training/single_table/run_training.py b/examples/training/single_table/run_training.py index 886b6b9f..62e6fdb8 100644 --- a/examples/training/single_table/run_training.py +++ b/examples/training/single_table/run_training.py @@ -9,7 +9,7 @@ from midst_toolkit.common.logger import TOOLKIT_LOGGER, log from midst_toolkit.common.variables import DEVICE from midst_toolkit.models.clavaddpm.data_loaders import load_tables -from midst_toolkit.models.clavaddpm.train import ModelArtifacts, clava_training +from midst_toolkit.models.clavaddpm.train import ClavaDDPMModelArtifacts, clava_training # Preventing some excessive logging @@ -49,7 +49,7 @@ def main(config: DictConfig) -> None: result = pickle.load(f) # Asserting the results are the correct type - assert isinstance(result, ModelArtifacts) + assert isinstance(result, ClavaDDPMModelArtifacts) log(INFO, f"Result size (in bytes): {results_file.stat().st_size}") diff --git 
a/src/midst_toolkit/attacks/ensemble/clavaddpm_fine_tuning.py b/src/midst_toolkit/attacks/ensemble/clavaddpm_fine_tuning.py index 51cc91c7..ec51cf4b 100644 --- a/src/midst_toolkit/attacks/ensemble/clavaddpm_fine_tuning.py +++ b/src/midst_toolkit/attacks/ensemble/clavaddpm_fine_tuning.py @@ -37,7 +37,7 @@ ) from midst_toolkit.models.clavaddpm.sampler import ScheduleSamplerType from midst_toolkit.models.clavaddpm.train import ( - ModelArtifacts, + ClavaDDPMModelArtifacts, _numerical_forward_backward_log, get_table_metadata, ) @@ -56,7 +56,7 @@ def fine_tune_model( weight_decay: float, data_split_ratios: list[float], device: torch.device = DEVICE, -) -> ModelArtifacts: +) -> ClavaDDPMModelArtifacts: """ Fine-tune a trained diffusion model on a new dataset. @@ -124,7 +124,7 @@ def fine_tune_model( if dataset.numerical_transform is not None: inverse_transform_function = dataset.numerical_transform.inverse_transform - return ModelArtifacts( + return ClavaDDPMModelArtifacts( diffusion=diffusion, label_encoders=label_encoders, dataset=dataset, @@ -241,7 +241,7 @@ def fine_tune_classifier( def child_fine_tuning( - pre_trained_model: ModelArtifacts, + pre_trained_model: ClavaDDPMModelArtifacts, child_df_with_cluster: pd.DataFrame, child_domain_dict: dict[str, Any], parent_name: str | None, @@ -251,7 +251,7 @@ def child_fine_tuning( fine_tuning_diffusion_iterations: int, fine_tuning_classifier_iterations: int, device: torch.device = DEVICE, -) -> ModelArtifacts: +) -> ClavaDDPMModelArtifacts: """ Fine-tune a child model based on the parent model. 
@@ -340,14 +340,14 @@ def child_fine_tuning( def clava_fine_tuning( - trained_models: dict[Relation, ModelArtifacts], + trained_models: dict[Relation, ClavaDDPMModelArtifacts], new_tables: Tables, relation_order: RelationOrder, diffusion_config: ClavaDDPMDiffusionConfig, classifier_config: ClavaDDPMClassifierConfig, fine_tuning_diffusion_iterations: int, fine_tuning_classifier_iterations: int, -) -> dict[Relation, ModelArtifacts]: +) -> dict[Relation, ClavaDDPMModelArtifacts]: """ Fine-tune the trained models on new tables data. diff --git a/src/midst_toolkit/attacks/ensemble/rmia/shadow_model_training.py b/src/midst_toolkit/attacks/ensemble/rmia/shadow_model_training.py index c286119d..104fa700 100644 --- a/src/midst_toolkit/attacks/ensemble/rmia/shadow_model_training.py +++ b/src/midst_toolkit/attacks/ensemble/rmia/shadow_model_training.py @@ -3,17 +3,20 @@ import shutil from logging import INFO from pathlib import Path -from typing import Any +from typing import Any, cast import pandas as pd from omegaconf import DictConfig from midst_toolkit.attacks.ensemble.shadow_model_utils import ( ModelType, + TrainingResult, fine_tune_tabddpm_and_synthesize, save_additional_training_config, + train_ctgan_and_synthesize, train_tabddpm_and_synthesize, ) +from midst_toolkit.common.config import ClavaDDPMTrainingConfig, CTGANTrainingConfig from midst_toolkit.common.logger import log @@ -129,8 +132,25 @@ def train_fine_tuned_shadow_models( # Train the initial model if it is not already trained and saved. 
initial_model_path = save_dir / f"initial_model_rmia_{init_model_id}.pkl" if not initial_model_path.exists(): - log(INFO, f"Training initial model with ID {init_model_id}...") - initial_model_training_results = train_tabddpm_and_synthesize(train, configs, save_dir, synthesize=False) + log(INFO, f"Training initial {model_type.value} model with ID {init_model_id}...") + + initial_model_training_results: TrainingResult + if model_type == ModelType.TABDDPM: + initial_model_training_results = train_tabddpm_and_synthesize( + train, + cast(ClavaDDPMTrainingConfig, configs), + save_dir, + synthesize=False, + ) + elif model_type == ModelType.CTGAN: + initial_model_training_results = train_ctgan_and_synthesize( + train, + cast(CTGANTrainingConfig, configs), + save_dir, + synthesize=False, + ) + else: + raise ValueError(f"Invalid model type: {model_type}") # Save the initial model # Pickle dump the results @@ -176,7 +196,7 @@ def train_fine_tuned_shadow_models( train_result = fine_tune_tabddpm_and_synthesize( trained_models=initial_model_training_results.models, fine_tune_set=selected_challenges, - configs=configs, + configs=cast(ClavaDDPMTrainingConfig, configs), save_dir=save_dir, fine_tuning_diffusion_iterations=fine_tuning_config.fine_tune_diffusion_iterations, fine_tuning_classifier_iterations=fine_tuning_config.fine_tune_classifier_iterations, @@ -289,7 +309,7 @@ def train_shadow_on_half_challenge_data( train_result = train_tabddpm_and_synthesize( selected_challenges, - configs, + cast(ClavaDDPMTrainingConfig, configs), save_dir, synthesize=True, number_of_points_to_synthesize=number_of_points_to_synthesize, diff --git a/src/midst_toolkit/attacks/ensemble/shadow_model_utils.py b/src/midst_toolkit/attacks/ensemble/shadow_model_utils.py index 52b0481e..f848f952 100644 --- a/src/midst_toolkit/attacks/ensemble/shadow_model_utils.py +++ b/src/midst_toolkit/attacks/ensemble/shadow_model_utils.py @@ -5,11 +5,14 @@ from enum import Enum from logging import INFO from pathlib 
import Path +from typing import Any import pandas as pd +from sdv.single_table import CTGANSynthesizer # type: ignore[import-untyped] +from examples.gan.utils import get_single_table_svd_metadata, get_table_name from midst_toolkit.attacks.ensemble.clavaddpm_fine_tuning import clava_fine_tuning -from midst_toolkit.common.config import ClavaDDPMTrainingConfig, CTGANTrainingConfig, GeneralConfig, TrainingConfig +from midst_toolkit.common.config import ClavaDDPMTrainingConfig, CTGANTrainingConfig, TrainingConfig from midst_toolkit.common.logger import log from midst_toolkit.common.variables import DEVICE from midst_toolkit.models.clavaddpm.clustering import clava_clustering @@ -20,7 +23,11 @@ RelationOrder, ) from midst_toolkit.models.clavaddpm.synthesizer import clava_synthesizing -from midst_toolkit.models.clavaddpm.train import ModelArtifacts, clava_training +from midst_toolkit.models.clavaddpm.train import ( + ClavaDDPMModelArtifacts, + CTGANModelArtifacts, + clava_training, +) class ModelType(Enum): @@ -28,15 +35,27 @@ class ModelType(Enum): CTGAN = "ctgan" -@dataclass +@dataclass(kw_only=True) # Setting kw_only=True avoids and error with default values and inheritance class TrainingResult: save_dir: Path + configs: TrainingConfig + models: Any + synthetic_data: pd.DataFrame | None = None + + +@dataclass +class CTGANTrainingResult(TrainingResult): + configs: CTGANTrainingConfig + models: dict[Relation, CTGANModelArtifacts] + + +@dataclass +class TabDDPMTrainingResult(TrainingResult): configs: ClavaDDPMTrainingConfig + models: dict[Relation, ClavaDDPMModelArtifacts] tables: Tables relation_order: RelationOrder all_group_lengths_probabilities: GroupLengthsProbDicts - models: dict[Relation, ModelArtifacts] - synthetic_data: pd.DataFrame | None = None def save_additional_training_config( @@ -73,9 +92,6 @@ def save_additional_training_config( else: raise ValueError(f"Invalid model type: {model_type}") - if configs.general is None: - configs.general = GeneralConfig() - 
configs.general.data_dir = data_dir # Save dir is set by joining the workspace_dir and exp_name configs.general.workspace_dir = data_dir / workspace_name @@ -100,7 +116,7 @@ def train_tabddpm_and_synthesize( save_dir: Path, synthesize: bool = True, number_of_points_to_synthesize: int = 20000, -) -> TrainingResult: +) -> TabDDPMTrainingResult: """ Train a TabDDPM model on the provided training set and optionally synthesize data using the trained models. @@ -137,7 +153,7 @@ def train_tabddpm_and_synthesize( classifier_config=configs.classifier, device=DEVICE, ) - result = TrainingResult( + result = TabDDPMTrainingResult( save_dir=save_dir, configs=configs, tables=tables, @@ -173,7 +189,7 @@ def train_tabddpm_and_synthesize( def fine_tune_tabddpm_and_synthesize( - trained_models: dict[Relation, ModelArtifacts], + trained_models: dict[Relation, ClavaDDPMModelArtifacts], fine_tune_set: pd.DataFrame, configs: ClavaDDPMTrainingConfig, save_dir: Path, @@ -181,7 +197,7 @@ def fine_tune_tabddpm_and_synthesize( fine_tuning_classifier_iterations: int = 10, synthesize: bool = True, number_of_points_to_synthesize: int = 20000, -) -> TrainingResult: +) -> TabDDPMTrainingResult: """ Given the trained models and a new training set, fine-tune the TabDDPM models. If ``synthesize`` is True, synthesizes data using the fine-tuned models. 
Number of @@ -230,7 +246,7 @@ def fine_tune_tabddpm_and_synthesize( fine_tuning_diffusion_iterations=fine_tuning_diffusion_iterations, fine_tuning_classifier_iterations=fine_tuning_classifier_iterations, ) - result = TrainingResult( + result = TabDDPMTrainingResult( save_dir=save_dir, configs=configs, tables=new_tables, @@ -265,6 +281,63 @@ def fine_tune_tabddpm_and_synthesize( return result +def train_ctgan_and_synthesize( + train_set: pd.DataFrame, + configs: CTGANTrainingConfig, + save_dir: Path, + synthesize: bool = True, +) -> CTGANTrainingResult: + """ + Train a CTGAN model on the provided training set and optionally synthesize data using the trained models. + + Args: + train_set: The training dataset as a pandas DataFrame. + configs: Configuration dictionary for CTGAN. + save_dir: Directory path where models and results will be saved. + synthesize: Flag indicating whether to generate synthetic data after training. Defaults to True. + + Returns: + A dataclass TrainingResult object containing: + - save_dir: Directory where results are saved. + - configs: Configuration dictionary used for training. + - models: The trained models. + - synthetic_data: The synthesized data as a pandas DataFrame, if synthesis was performed, + otherwise, None. 
+ """ + table_name = get_table_name(configs.general.data_dir) + domain_file_path = configs.general.data_dir / f"{table_name}_domain.json" + with open(domain_file_path, "r") as file: + domain_dictionary = json.load(file) + + metadata, train_data_without_ids = get_single_table_svd_metadata(train_set, domain_dictionary) + + log(INFO, "Fitting CTGAN...") + + ctgan = CTGANSynthesizer( + metadata=metadata, + epochs=configs.training.epochs, + verbose=configs.training.verbose, + ) + ctgan.fit(train_data_without_ids) + + results_file = Path(save_dir) / "trained_ctgan_model.pkl" + results_file.parent.mkdir(parents=True, exist_ok=True) + + ctgan.save(results_file) + + result = CTGANTrainingResult( + save_dir=save_dir, + configs=configs, + models={(None, table_name): CTGANModelArtifacts(model=ctgan, model_file_path=results_file)}, + ) + + if synthesize: + synthetic_data = ctgan.sample(num_rows=configs.synthesizing.sample_size) + result.synthetic_data = synthetic_data + + return result + + # TODO: The following function is directly copied from the midst reference code since # I need it to run the attack code, but, it should probably be moved to somewhere else # as it is an essential part of a working TabDDPM training pipeline. diff --git a/src/midst_toolkit/models/clavaddpm/clustering.py b/src/midst_toolkit/models/clavaddpm/clustering.py index 98fd2887..66785fc7 100644 --- a/src/midst_toolkit/models/clavaddpm/clustering.py +++ b/src/midst_toolkit/models/clavaddpm/clustering.py @@ -112,7 +112,7 @@ def _run_clustering( - The tables dictionary. - The dictionary with the group lengths probability for all the parent-child pairs. 
""" - all_group_lengths_prob_dicts = {} + all_group_lengths_prob_dicts: GroupLengthsProbDicts = {} relation_order_reversed = relation_order[::-1] for parent, child in relation_order_reversed: if parent is not None: diff --git a/src/midst_toolkit/models/clavaddpm/enumerations.py b/src/midst_toolkit/models/clavaddpm/enumerations.py index f8af7830..fc8bb60b 100644 --- a/src/midst_toolkit/models/clavaddpm/enumerations.py +++ b/src/midst_toolkit/models/clavaddpm/enumerations.py @@ -3,7 +3,7 @@ import numpy as np -Relation = tuple[str, str] +Relation = tuple[str | None, str] RelationOrder = list[Relation] GroupLengthProbDict = dict[int, dict[int, float]] GroupLengthsProbDicts = dict[Relation, GroupLengthProbDict] diff --git a/src/midst_toolkit/models/clavaddpm/synthesizer.py b/src/midst_toolkit/models/clavaddpm/synthesizer.py index 3ed4f6fa..c361615a 100644 --- a/src/midst_toolkit/models/clavaddpm/synthesizer.py +++ b/src/midst_toolkit/models/clavaddpm/synthesizer.py @@ -32,7 +32,7 @@ GaussianMultinomialDiffusion, ) from midst_toolkit.models.clavaddpm.model import Classifier, ModelParameters -from midst_toolkit.models.clavaddpm.train import ModelArtifacts, get_df_without_id +from midst_toolkit.models.clavaddpm.train import ClavaDDPMModelArtifacts, get_df_without_id def sample_from_diffusion( @@ -711,7 +711,7 @@ def clava_synthesizing( tables: Tables, relation_order: RelationOrder, save_dir: Path, - models: dict[Relation, ModelArtifacts], + models: dict[Relation, ClavaDDPMModelArtifacts], general_config: GeneralConfig, sampling_config: ClavaDDPMSamplingConfig, matching_config: ClavaDDPMMatchingConfig, @@ -827,7 +827,7 @@ def clava_synthesizing( def _synthesize_single_table( table_name: str, data: pd.DataFrame, - training_results: ModelArtifacts, + training_results: ClavaDDPMModelArtifacts, sample_scale: float, sample_batch_size: int, ) -> tuple[pd.DataFrame, list[int]]: @@ -883,8 +883,8 @@ def _synthesize_single_table( def _synthesize_multi_table( parent_name: str, 
child_name: str, - parent_training_results: ModelArtifacts, - child_training_results: ModelArtifacts, + parent_training_results: ClavaDDPMModelArtifacts, + child_training_results: ClavaDDPMModelArtifacts, parent_synthetic_data: dict[str, Any], data: pd.DataFrame, group_length_prob_dict: GroupLengthProbDict, diff --git a/src/midst_toolkit/models/clavaddpm/train.py b/src/midst_toolkit/models/clavaddpm/train.py index 6b854ca0..cc8ee580 100644 --- a/src/midst_toolkit/models/clavaddpm/train.py +++ b/src/midst_toolkit/models/clavaddpm/train.py @@ -10,6 +10,7 @@ import numpy as np import pandas as pd import torch +from sdv.single_table import CTGANSynthesizer # type: ignore[import-untyped] from sklearn.preprocessing import LabelEncoder from torch import Tensor, optim @@ -39,6 +40,17 @@ @dataclass class ModelArtifacts: + pass + + +@dataclass +class CTGANModelArtifacts(ModelArtifacts): + model: CTGANSynthesizer + model_file_path: Path + + +@dataclass +class ClavaDDPMModelArtifacts(ModelArtifacts): diffusion: GaussianMultinomialDiffusion label_encoders: dict[int, LabelEncoder] dataset: Dataset @@ -61,7 +73,7 @@ def clava_training( diffusion_config: ClavaDDPMDiffusionConfig, classifier_config: ClavaDDPMClassifierConfig | None = None, device: torch.device = DEVICE, -) -> tuple[Tables, dict[Relation, ModelArtifacts]]: +) -> tuple[Tables, dict[Relation, ClavaDDPMModelArtifacts]]: """ Training function for the ClavaDDPM model. @@ -126,7 +138,7 @@ def child_training( diffusion_config: ClavaDDPMDiffusionConfig, classifier_config: ClavaDDPMClassifierConfig | None = None, device: torch.device = DEVICE, -) -> ModelArtifacts: +) -> ClavaDDPMModelArtifacts: """ Training function for a single child table. @@ -207,7 +219,7 @@ def train_model( transformations: Transformations, diffusion_config: ClavaDDPMDiffusionConfig, device: torch.device = DEVICE, -) -> ModelArtifacts: +) -> ClavaDDPMModelArtifacts: """ Training function for the diffusion model. 
@@ -281,7 +293,7 @@ def train_model( if dataset.numerical_transform is not None: inverse_transform_function = dataset.numerical_transform.inverse_transform - return ModelArtifacts( + return ClavaDDPMModelArtifacts( diffusion=diffusion, label_encoders=label_encoders, dataset=dataset, @@ -486,7 +498,7 @@ def get_table_metadata(df: pd.DataFrame, table_domain: dict[str, Any], target_co def save_table_info( tables: Tables, relation_order: RelationOrder, - models: dict[Relation, ModelArtifacts], + models: dict[Relation, ClavaDDPMModelArtifacts], save_dir: Path, ) -> None: """ diff --git a/tests/integration/attacks/ensemble/test_shadow_model_training.py b/tests/integration/attacks/ensemble/test_shadow_model_training.py index 41aaa38e..bd4c49e9 100644 --- a/tests/integration/attacks/ensemble/test_shadow_model_training.py +++ b/tests/integration/attacks/ensemble/test_shadow_model_training.py @@ -2,6 +2,7 @@ import pickle import shutil from pathlib import Path +from typing import cast import pandas as pd import pytest @@ -14,11 +15,12 @@ train_shadow_on_half_challenge_data, ) from midst_toolkit.attacks.ensemble.shadow_model_utils import ( - TrainingResult, + TabDDPMTrainingResult, fine_tune_tabddpm_and_synthesize, save_additional_training_config, train_tabddpm_and_synthesize, ) +from midst_toolkit.common.config import ClavaDDPMTrainingConfig POPULATION_DATA = load_dataframe( @@ -67,7 +69,7 @@ def test_train_fine_tuned_shadow_models(cfg: DictConfig, tmp_path: Path) -> None assert len(shadow_data["fine_tuning_sets"]) == 2 # n_models assert len(shadow_data["fine_tuned_results"]) == 2 # n_models for result in shadow_data["fine_tuned_results"]: - assert type(result) is TrainingResult + assert type(result) is TabDDPMTrainingResult assert result.synthetic_data is not None assert result.tables is not None assert result.models is not None @@ -113,7 +115,7 @@ def test_train_shadow_on_half_challenge_data(cfg: DictConfig, tmp_path: Path) -> assert len(shadow_data["selected_sets"]) == 2 
# n_models assert len(shadow_data["trained_results"]) == 2 # n_models for result in shadow_data["trained_results"]: - assert type(result) is TrainingResult + assert type(result) is TabDDPMTrainingResult assert result.synthetic_data is not None assert result.tables is not None assert result.models is not None @@ -160,7 +162,11 @@ def test_train_and_fine_tune_tabddpm(cfg: DictConfig, tmp_path: Path) -> None: ) train_result = train_tabddpm_and_synthesize( - train_set, configs, save_dir, synthesize=True, number_of_points_to_synthesize=99 + train_set, + cast(ClavaDDPMTrainingConfig, configs), + save_dir, + synthesize=True, + number_of_points_to_synthesize=99, ) assert train_result.synthetic_data is not None assert type(train_result.synthetic_data) is pd.DataFrame @@ -174,7 +180,7 @@ def test_train_and_fine_tune_tabddpm(cfg: DictConfig, tmp_path: Path) -> None: fine_tuned_results = fine_tune_tabddpm_and_synthesize( trained_models=train_result.models, fine_tune_set=fine_tuning_set, # fine-tuning on the same data for testing purposes - configs=configs, + configs=cast(ClavaDDPMTrainingConfig, configs), save_dir=save_dir, fine_tuning_diffusion_iterations=cfg.shadow_training.fine_tuning_config.fine_tune_diffusion_iterations, fine_tuning_classifier_iterations=cfg.shadow_training.fine_tuning_config.fine_tune_classifier_iterations, From 30c0ed302e6d4d765df1136fa9b8d76019b9d191 Mon Sep 17 00:00:00 2001 From: Marcelo Lotif Date: Wed, 14 Jan 2026 13:04:47 -0500 Subject: [PATCH 05/38] More info on readme --- examples/gan/README.md | 2 +- examples/gan/ensemble_attack/README.md | 22 +++++++++++++++++++++- 2 files changed, 22 insertions(+), 2 deletions(-) diff --git a/examples/gan/README.md b/examples/gan/README.md index fd7fc530..9764e423 100644 --- a/examples/gan/README.md +++ b/examples/gan/README.md @@ -8,7 +8,7 @@ some data afterwards. ## Downloading data First, we need the data. 
Download it from this -[Google Drive link](https://drive.google.com/file/d/1J5qDuMHHg4dm9c3ISmb41tcTHSu1SVUC/view?usp=drive_link), +[Google Drive link](https://drive.google.com/file/d/1YbDRVn-fwfdcPnHj5eMhCa6A-YPiGnKr/view?usp=sharing), extract the files and place them in a `/data` folder in within this folder (`examples/gan`). diff --git a/examples/gan/ensemble_attack/README.md b/examples/gan/ensemble_attack/README.md index b42b940e..54afb0c3 100644 --- a/examples/gan/ensemble_attack/README.md +++ b/examples/gan/ensemble_attack/README.md @@ -1,6 +1,26 @@ # CTGAN Ensemble Attack Example -To run, execute the following command from the root project folder: +On this example, we demonstrate how to run the [Ensemble Attack](examples/ensemble_attack) +using the [CTGAN](https://arxiv.org/pdf/1907.00503) model. + +## Downloading data + +First, we need the data. Download it from this +[Google Drive link](https://drive.google.com/file/d/1B9z4vh51mH6ZMj5E0pJitqR8lid3EJKM/view?usp=sharing), +extract the files and place them in a `/data/ensemble_attack` folder in within this folder +(`examples/gan`). + +> [!NOTE] +> If you wish to change the data folder, you can do so by editing the `base_data_dir` attribute +> of the [`config.yaml`](config.yaml) file. 
+ +Here is a description of the files that have been extracted: +- `master_challenge_train.csv`: +- `population_all_with_challenge.csv`: + +## Running the attack + +To run, execute the following command from the prooject's root folder: ```bash python -m examples.gan.ensemble_attack.run From 9464962c0e7ca289b50ba482e9231d0429a174cc Mon Sep 17 00:00:00 2001 From: Marcelo Lotif Date: Mon, 23 Feb 2026 16:42:23 -0500 Subject: [PATCH 06/38] More ctgan changes --- .../run_shadow_model_training.py | 33 ++++++--- examples/gan/ensemble_attack/config.yaml | 7 ++ examples/gan/ensemble_attack/run.py | 21 ++++-- .../ensemble/rmia/shadow_model_training.py | 67 +++++++++++++------ .../attacks/ensemble/shadow_model_utils.py | 41 +++++++----- 5 files changed, 122 insertions(+), 47 deletions(-) diff --git a/examples/ensemble_attack/run_shadow_model_training.py b/examples/ensemble_attack/run_shadow_model_training.py index 6086ea65..e3b25780 100644 --- a/examples/ensemble_attack/run_shadow_model_training.py +++ b/examples/ensemble_attack/run_shadow_model_training.py @@ -11,10 +11,12 @@ ) from midst_toolkit.attacks.ensemble.shadow_model_utils import ( ModelType, + TrainingResult, save_additional_training_config, + train_or_fine_tune_ctgan, train_tabddpm_and_synthesize, ) -from midst_toolkit.common.config import ClavaDDPMTrainingConfig +from midst_toolkit.common.config import ClavaDDPMTrainingConfig, CTGANTrainingConfig from midst_toolkit.common.logger import log @@ -50,6 +52,11 @@ def run_target_model_training(config: DictConfig) -> Path: target_folder = target_model_output_path / "target_model" + model_type = DEFAULT_MODEL_TYPE + if "model_name" in config.shadow_training: + model_type = ModelType(config.shadow_training.model_name) + log(INFO, f"Training target model with model type: {model_type.value}") + target_folder.mkdir(parents=True, exist_ok=True) shutil.copyfile( target_training_json_config_paths.table_domain_file_path, @@ -66,13 +73,22 @@ def run_target_model_training(config: 
DictConfig) -> Path: experiment_name="trained_target_model", ) - train_result = train_tabddpm_and_synthesize( - train_set=df_real_data, - configs=cast(ClavaDDPMTrainingConfig, configs), - save_dir=save_dir, - synthesize=True, - number_of_points_to_synthesize=config.shadow_training.number_of_points_to_synthesize, - ) + train_result: TrainingResult + if model_type == ModelType.TABDDPM: + train_result = train_tabddpm_and_synthesize( + train_set=df_real_data, + configs=cast(ClavaDDPMTrainingConfig, configs), + save_dir=save_dir, + synthesize=True, + number_of_points_to_synthesize=config.shadow_training.number_of_points_to_synthesize, + ) + elif model_type == ModelType.CTGAN: + train_result = train_or_fine_tune_ctgan( + dataset=df_real_data, + configs=cast(CTGANTrainingConfig, configs), + save_dir=save_dir, + synthesize=True, + ) # To train the attack model (metaclassifier), we only need to save target's synthetic data, # and not the entire target model's training result object. @@ -114,6 +130,7 @@ def run_shadow_model_training(config: DictConfig) -> list[Path]: table_name = config.table_name if "table_name" in config else DEFAULT_TABLE_NAME id_column_name = config.table_id_column_name if "table_id_column_name" in config else DEFAULT_ID_COLUMN_NAME + model_type = DEFAULT_MODEL_TYPE if "model_name" in config.shadow_training: model_type = ModelType(config.shadow_training.model_name) diff --git a/examples/gan/ensemble_attack/config.yaml b/examples/gan/ensemble_attack/config.yaml index 4c45b8fb..50977265 100644 --- a/examples/gan/ensemble_attack/config.yaml +++ b/examples/gan/ensemble_attack/config.yaml @@ -30,3 +30,10 @@ ensemble_attack: fine_tune_classifier_iterations: 20000 pre_train_data_size: 60000 number_of_points_to_synthesize: 20000 # Number of synthetic data samples to be generated by shadow models. 
+ + final_shadow_models_path: [ + "${ensemble_attack.shadow_training.shadow_models_output_path}/initial_model_rmia_1/shadow_workspace/pre_trained_model/rmia_shadows.pkl", + "${ensemble_attack.shadow_training.shadow_models_output_path}/initial_model_rmia_2/shadow_workspace/pre_trained_model/rmia_shadows.pkl", + "${ensemble_attack.shadow_training.shadow_models_output_path}/shadow_model_rmia_third_set/shadow_workspace/trained_model/rmia_shadows_third_set.pkl", + ] + target_synthetic_data_path: ${ensemble_attack.shadow_training.target_model_output_path}/target_synthetic_data.csv diff --git a/examples/gan/ensemble_attack/run.py b/examples/gan/ensemble_attack/run.py index f1834f0d..6f3690cc 100644 --- a/examples/gan/ensemble_attack/run.py +++ b/examples/gan/ensemble_attack/run.py @@ -33,6 +33,7 @@ def main(config: DictConfig) -> None: training_config_path.unlink(missing_ok=True) with open(training_config_path, "w") as f: training_config = OmegaConf.to_container(config.ensemble_attack.shadow_training.model_config) + assert isinstance(training_config, dict), "Training config must be a dictionary." training_config["general"] = { "test_data_dir": config.base_data_dir, "sample_prefix": "ctgan", @@ -43,24 +44,36 @@ def main(config: DictConfig) -> None: } json.dump(training_config, f) + log(INFO, "Training the shadow models...") shadow_data_paths = run_shadow_model_training(config.ensemble_attack) shadow_data_paths = [Path(path) for path in shadow_data_paths] - target_model_synthetic_path = run_target_model_training(config) + log(INFO, "Training the target model...") + target_model_synthetic_path = run_target_model_training(config.ensemble_attack) if config.pipeline.run_metaclassifier_training: + log(INFO, "Training the metaclassifier...") if not config.pipeline.run_shadow_model_training: # If shadow model training is skipped, we need to provide the previous shadow model and target model paths. 
- shadow_data_paths = [Path(path) for path in config.shadow_training.final_shadow_models_path] - target_model_synthetic_path = Path(config.shadow_training.target_synthetic_data_path) + shadow_data_paths = [ + Path(path) for path in config.ensemble_attack.shadow_training.final_shadow_models_path + ] + target_model_synthetic_path = Path(config.ensemble_attack.shadow_training.target_synthetic_data_path) assert len(shadow_data_paths) == 3, "The attack_data_paths list must contain exactly three elements." assert target_model_synthetic_path is not None, ( "The target_data_path must be provided for metaclassifier training." ) + # Note: Importing the following module causes a segmentation fault error if imported at the top of this file. + # A quick solution is to load modules dynamically if any of the pipelines is called. + # TODO: Investigate the source of error. meta_pipeline = importlib.import_module("examples.ensemble_attack.run_metaclassifier_training") - meta_pipeline.run_metaclassifier_training(config, shadow_data_paths, target_model_synthetic_path) + meta_pipeline.run_metaclassifier_training( + config.ensemble_attack, + shadow_data_paths, + target_model_synthetic_path, + ) if __name__ == "__main__": diff --git a/src/midst_toolkit/attacks/ensemble/rmia/shadow_model_training.py b/src/midst_toolkit/attacks/ensemble/rmia/shadow_model_training.py index 104fa700..46347838 100644 --- a/src/midst_toolkit/attacks/ensemble/rmia/shadow_model_training.py +++ b/src/midst_toolkit/attacks/ensemble/rmia/shadow_model_training.py @@ -13,7 +13,7 @@ TrainingResult, fine_tune_tabddpm_and_synthesize, save_additional_training_config, - train_ctgan_and_synthesize, + train_or_fine_tune_ctgan, train_tabddpm_and_synthesize, ) from midst_toolkit.common.config import ClavaDDPMTrainingConfig, CTGANTrainingConfig @@ -143,7 +143,7 @@ def train_fine_tuned_shadow_models( synthesize=False, ) elif model_type == ModelType.CTGAN: - initial_model_training_results = train_ctgan_and_synthesize( + 
initial_model_training_results = train_or_fine_tune_ctgan( train, cast(CTGANTrainingConfig, configs), save_dir, @@ -193,16 +193,28 @@ def train_fine_tuned_shadow_models( # Shuffle the dataset selected_challenges = selected_challenges.sample(frac=1, random_state=random_seed).reset_index(drop=True) - train_result = fine_tune_tabddpm_and_synthesize( - trained_models=initial_model_training_results.models, - fine_tune_set=selected_challenges, - configs=cast(ClavaDDPMTrainingConfig, configs), - save_dir=save_dir, - fine_tuning_diffusion_iterations=fine_tuning_config.fine_tune_diffusion_iterations, - fine_tuning_classifier_iterations=fine_tuning_config.fine_tune_classifier_iterations, - synthesize=True, - number_of_points_to_synthesize=number_of_points_to_synthesize, - ) + if model_type == ModelType.TABDDPM: + train_result = fine_tune_tabddpm_and_synthesize( + trained_models=initial_model_training_results.models, + fine_tune_set=selected_challenges, + configs=cast(ClavaDDPMTrainingConfig, configs), + save_dir=save_dir, + fine_tuning_diffusion_iterations=fine_tuning_config.fine_tune_diffusion_iterations, + fine_tuning_classifier_iterations=fine_tuning_config.fine_tune_classifier_iterations, + synthesize=True, + number_of_points_to_synthesize=number_of_points_to_synthesize, + ) + elif model_type == ModelType.CTGAN: + train_result = train_or_fine_tune_ctgan( + dataset=selected_challenges, + configs=cast(CTGANTrainingConfig, configs), + save_dir=save_dir, + synthesize=True, + trained_model=initial_model_training_results.models[(None, table_name)].model, + ) + else: + raise ValueError(f"Invalid model type: {model_type}") + assert train_result.synthetic_data is not None, "Fine-tuned models should generate synthetic data." log( INFO, @@ -228,6 +240,7 @@ def train_shadow_on_half_challenge_data( id_column_name: str, number_of_points_to_synthesize: int = 20000, random_seed: int | None = None, + model_type: ModelType = ModelType.TABDDPM, ) -> Path: """ 1. 
Create eight training sets with exactly half of the observations included in the challenge lists @@ -253,6 +266,7 @@ def train_shadow_on_half_challenge_data( number_of_points_to_synthesize: Size of the synthetic data to be generated by each shadow model, defaults to 20,000. random_seed: Random seed used for reproducibility, defaults to None. + model_type: Type of model to be used for training the shadow models. Defaults to ModelType.TABDDPM. Returns: The path where the shadow models and their artifacts are saved. @@ -285,9 +299,10 @@ def train_shadow_on_half_challenge_data( ) configs, save_dir = save_additional_training_config( data_dir=shadow_folder, - training_config_json_path=Path(training_json_config_paths.tabddpm_training_config_path), + training_config_json_path=Path(training_json_config_paths.training_config_path), final_config_json_path=shadow_folder / f"{table_name}.json", # Path to the new json experiment_name="trained_model", + model_type=model_type, ) attack_data: dict[str, Any] = { "selected_sets": selected_id_lists, @@ -307,13 +322,25 @@ def train_shadow_on_half_challenge_data( # Shuffle the dataset selected_challenges = selected_challenges.sample(frac=1, random_state=random_seed).reset_index(drop=True) - train_result = train_tabddpm_and_synthesize( - selected_challenges, - cast(ClavaDDPMTrainingConfig, configs), - save_dir, - synthesize=True, - number_of_points_to_synthesize=number_of_points_to_synthesize, - ) + train_result: TrainingResult + if model_type == ModelType.TABDDPM: + train_result = train_tabddpm_and_synthesize( + selected_challenges, + cast(ClavaDDPMTrainingConfig, configs), + save_dir, + synthesize=True, + number_of_points_to_synthesize=number_of_points_to_synthesize, + ) + elif model_type == ModelType.CTGAN: + train_result = train_or_fine_tune_ctgan( + dataset=selected_challenges, + configs=cast(CTGANTrainingConfig, configs), + save_dir=save_dir, + synthesize=True, + ) + else: + raise ValueError(f"Invalid model type: {model_type}") + 
assert train_result.synthetic_data is not None, "Trained shadow model did not generate synthetic data." log( INFO, diff --git a/src/midst_toolkit/attacks/ensemble/shadow_model_utils.py b/src/midst_toolkit/attacks/ensemble/shadow_model_utils.py index f848f952..57abf906 100644 --- a/src/midst_toolkit/attacks/ensemble/shadow_model_utils.py +++ b/src/midst_toolkit/attacks/ensemble/shadow_model_utils.py @@ -197,7 +197,7 @@ def fine_tune_tabddpm_and_synthesize( fine_tuning_classifier_iterations: int = 10, synthesize: bool = True, number_of_points_to_synthesize: int = 20000, -) -> TabDDPMTrainingResult: +) -> TrainingResult: """ Given the trained models and a new training set, fine-tune the TabDDPM models. If ``synthesize`` is True, synthesizes data using the fine-tuned models. Number of @@ -281,20 +281,25 @@ def fine_tune_tabddpm_and_synthesize( return result -def train_ctgan_and_synthesize( - train_set: pd.DataFrame, +def train_or_fine_tune_ctgan( + dataset: pd.DataFrame, configs: CTGANTrainingConfig, save_dir: Path, synthesize: bool = True, -) -> CTGANTrainingResult: + trained_model: CTGANSynthesizer | None = None, +) -> TrainingResult: """ - Train a CTGAN model on the provided training set and optionally synthesize data using the trained models. + Train or fine tune a CTGAN model on the provided dataset and optionally synthesize data. + + If no trained model is provided, a new model will be trained. Otherwise, the + provided model will be fine tuned. Args: - train_set: The training dataset as a pandas DataFrame. + dataset: The dataset as a pandas DataFrame. configs: Configuration dictionary for CTGAN. save_dir: Directory path where models and results will be saved. synthesize: Flag indicating whether to generate synthetic data after training. Defaults to True. + trained_model: The trained model to fine tune. If None, a new model will be trained. 
Returns: A dataclass TrainingResult object containing: @@ -309,18 +314,24 @@ def train_ctgan_and_synthesize( with open(domain_file_path, "r") as file: domain_dictionary = json.load(file) - metadata, train_data_without_ids = get_single_table_svd_metadata(train_set, domain_dictionary) + metadata, dataset_without_ids = get_single_table_svd_metadata(dataset, domain_dictionary) - log(INFO, "Fitting CTGAN...") + if trained_model is None: + log(INFO, "Training new CTGAN model...") + ctgan = CTGANSynthesizer( + metadata=metadata, + epochs=configs.training.epochs, + verbose=configs.training.verbose, + ) + model_name = "trained_ctgan_model.pkl" + else: + log(INFO, "Fine tuning CTGAN model...") + ctgan = trained_model + model_name = "fine_tuned_ctgan_model.pkl" - ctgan = CTGANSynthesizer( - metadata=metadata, - epochs=configs.training.epochs, - verbose=configs.training.verbose, - ) - ctgan.fit(train_data_without_ids) + ctgan.fit(dataset_without_ids) - results_file = Path(save_dir) / "trained_ctgan_model.pkl" + results_file = Path(save_dir) / model_name results_file.parent.mkdir(parents=True, exist_ok=True) ctgan.save(results_file) From e5c8fdac2cb913c4d22d0bf4dc1896f02176f915 Mon Sep 17 00:00:00 2001 From: Marcelo Lotif Date: Tue, 24 Feb 2026 15:47:28 -0500 Subject: [PATCH 07/38] Adding the split data code --- examples/gan/ensemble_attack/config.yaml | 4 ++++ examples/gan/ensemble_attack/run.py | 16 ++++++++++++++++ 2 files changed, 20 insertions(+) diff --git a/examples/gan/ensemble_attack/config.yaml b/examples/gan/ensemble_attack/config.yaml index 50977265..b3d1922e 100644 --- a/examples/gan/ensemble_attack/config.yaml +++ b/examples/gan/ensemble_attack/config.yaml @@ -12,6 +12,10 @@ ensemble_attack: processed_attack_data_path: ${base_data_dir}/ensemble_attack population_path: ${base_data_dir}/ensemble_attack + data_processing_config: + column_to_stratify: "trans_type" # Attention: This value is not documented in the original codebase. 
+ population_sample_size: 40000 # Population size is the total data that your attack has access to. + shadow_training: model_name: ctgan model_config: # Configurations specific for the CTGAN model diff --git a/examples/gan/ensemble_attack/run.py b/examples/gan/ensemble_attack/run.py index 6f3690cc..b2351d6a 100644 --- a/examples/gan/ensemble_attack/run.py +++ b/examples/gan/ensemble_attack/run.py @@ -7,6 +7,8 @@ from omegaconf import DictConfig, OmegaConf from examples.ensemble_attack.run_shadow_model_training import run_shadow_model_training, run_target_model_training +from midst_toolkit.attacks.ensemble.data_utils import load_dataframe +from midst_toolkit.attacks.ensemble.process_split_data import process_split_data from midst_toolkit.common.logger import log from midst_toolkit.common.random import set_all_random_seeds @@ -27,6 +29,20 @@ def main(config: DictConfig) -> None: set_all_random_seeds(seed=config.ensemble_attack.random_seed) log(INFO, f"Training phase random seed set to {config.ensemble_attack.random_seed}.") + # The following function saves the required dataframe splits in the specified processed_attack_data_path path. + population_data = load_dataframe( + Path(config.ensemble_attack.data_paths.population_path), + "population_all_with_challenge.csv", + ) + process_split_data( + all_population_data=population_data, + processed_attack_data_path=Path(config.ensemble_attack.data_paths.processed_attack_data_path), + # TODO: column_to_stratify value is not documented in the original codebase. 
+ column_to_stratify=config.ensemble_attack.data_processing_config.column_to_stratify, + num_total_samples=config.ensemble_attack.data_processing_config.population_sample_size, + random_seed=config.ensemble_attack.random_seed, + ) + # Saving the model config from the config.yaml into a json file # because that's what the ensemble attack code will be looking for training_config_path = Path(config.ensemble_attack.shadow_training.training_json_config_paths.training_config_path) From 8f10678c09f4b8f4c695adab52eaeb4b313ef77a Mon Sep 17 00:00:00 2001 From: Marcelo Lotif Date: Tue, 24 Feb 2026 18:54:29 -0500 Subject: [PATCH 08/38] More config changes and bug fixes --- .../configs/original_attack_config.yaml | 2 +- .../run_shadow_model_training.py | 3 +- examples/gan/ensemble_attack/config.yaml | 20 +++++++ examples/gan/ensemble_attack/run.py | 53 +++++++++++-------- .../ensemble/rmia/shadow_model_training.py | 6 +-- .../configs/shadow_training_config.yaml | 2 +- .../ensemble/test_shadow_model_training.py | 4 +- .../configs/shadow_training_config.yaml | 2 +- .../ensemble/test_shadow_model_utils.py | 2 +- 9 files changed, 63 insertions(+), 31 deletions(-) diff --git a/examples/ensemble_attack/configs/original_attack_config.yaml b/examples/ensemble_attack/configs/original_attack_config.yaml index 4adaa181..22e6d52c 100644 --- a/examples/ensemble_attack/configs/original_attack_config.yaml +++ b/examples/ensemble_attack/configs/original_attack_config.yaml @@ -58,7 +58,7 @@ shadow_training: training_json_config_paths: # Config json files used for tabddpm training on the trans table table_domain_file_path: ${base_example_dir}/data_configs/trans_domain.json dataset_meta_file_path: ${base_example_dir}/data_configs/dataset_meta.json - tabddpm_training_config_path: ${base_example_dir}/data_configs/trans.json + training_config_path: ${base_example_dir}/data_configs/trans.json # Model training artifacts are saved under shadow_models_data_path/workspace_name/exp_name # Also, training 
configs for each shadow model are created under shadow_models_data_path. shadow_models_output_path: ${base_data_dir}/shadow_models_and_data diff --git a/examples/ensemble_attack/run_shadow_model_training.py b/examples/ensemble_attack/run_shadow_model_training.py index e3b25780..904b29d9 100644 --- a/examples/ensemble_attack/run_shadow_model_training.py +++ b/examples/ensemble_attack/run_shadow_model_training.py @@ -68,9 +68,10 @@ def run_target_model_training(config: DictConfig) -> Path: ) configs, save_dir = save_additional_training_config( data_dir=target_folder, - training_config_json_path=Path(target_training_json_config_paths.tabddpm_training_config_path), + training_config_json_path=Path(target_training_json_config_paths.training_config_path), final_config_json_path=target_folder / f"{table_name}.json", # Path to the new json experiment_name="trained_target_model", + model_type=model_type, ) train_result: TrainingResult diff --git a/examples/gan/ensemble_attack/config.yaml b/examples/gan/ensemble_attack/config.yaml index b3d1922e..353ea755 100644 --- a/examples/gan/ensemble_attack/config.yaml +++ b/examples/gan/ensemble_attack/config.yaml @@ -16,6 +16,12 @@ ensemble_attack: column_to_stratify: "trans_type" # Attention: This value is not documented in the original codebase. population_sample_size: 40000 # Population size is the total data that your attack has access to. 
+ pipeline: + # TODO: properly test these + run_data_processing: true # Set this to false if you have already saved the processed data + run_shadow_model_training: true # Set this to false if shadow models are already trained and saved + run_metaclassifier_training: true + shadow_training: model_name: ctgan model_config: # Configurations specific for the CTGAN model @@ -25,6 +31,7 @@ ensemble_attack: synthesizing: sample_size: 20000 shadow_models_output_path: ${results_dir}/ensemble_attack/shadow_models + target_model_output_path: ${results_dir}/shadow_target_model_and_data training_json_config_paths: # Config json files used for tabddpm training on the trans table table_domain_file_path: ${base_data_dir}/trans_domain.json dataset_meta_file_path: ${base_data_dir}/dataset_meta.json @@ -41,3 +48,16 @@ ensemble_attack: "${ensemble_attack.shadow_training.shadow_models_output_path}/shadow_model_rmia_third_set/shadow_workspace/trained_model/rmia_shadows_third_set.pkl", ] target_synthetic_data_path: ${ensemble_attack.shadow_training.target_model_output_path}/target_synthetic_data.csv + + # Metaclassifier settings + metaclassifier: + # Data types json file is used for xgboost model training. + data_types_file_path: ${base_data_dir}/data_types.json + model_type: "xgb" + # Model training parameters + num_optuna_trials: 100 # Original code: 100 + num_kfolds: 5 + use_gpu: false + # Temporary. Might remove having an epoch parameter. 
+ epochs: 1 + meta_classifier_model_name: ${ensemble_attach.metaclassifier.model_type}_metaclassifier_model diff --git a/examples/gan/ensemble_attack/run.py b/examples/gan/ensemble_attack/run.py index b2351d6a..09f75a09 100644 --- a/examples/gan/ensemble_attack/run.py +++ b/examples/gan/ensemble_attack/run.py @@ -7,7 +7,7 @@ from omegaconf import DictConfig, OmegaConf from examples.ensemble_attack.run_shadow_model_training import run_shadow_model_training, run_target_model_training -from midst_toolkit.attacks.ensemble.data_utils import load_dataframe +from midst_toolkit.attacks.ensemble.data_utils import load_dataframe, save_dataframe from midst_toolkit.attacks.ensemble.process_split_data import process_split_data from midst_toolkit.common.logger import log from midst_toolkit.common.random import set_all_random_seeds @@ -29,19 +29,29 @@ def main(config: DictConfig) -> None: set_all_random_seeds(seed=config.ensemble_attack.random_seed) log(INFO, f"Training phase random seed set to {config.ensemble_attack.random_seed}.") - # The following function saves the required dataframe splits in the specified processed_attack_data_path path. - population_data = load_dataframe( - Path(config.ensemble_attack.data_paths.population_path), - "population_all_with_challenge.csv", - ) - process_split_data( - all_population_data=population_data, - processed_attack_data_path=Path(config.ensemble_attack.data_paths.processed_attack_data_path), - # TODO: column_to_stratify value is not documented in the original codebase. - column_to_stratify=config.ensemble_attack.data_processing_config.column_to_stratify, - num_total_samples=config.ensemble_attack.data_processing_config.population_sample_size, - random_seed=config.ensemble_attack.random_seed, - ) + if config.ensemble_attack.pipeline.run_data_processing: + log(INFO, "Running data processing pipeline...") + # The following function saves the required dataframe splits in the specified processed_attack_data_path path. 
+ population_data = load_dataframe( + Path(config.ensemble_attack.data_paths.population_path), + "population_all_with_challenge.csv", + ) + + population_data_no_id = population_data.drop(columns=[config.ensemble_attack.table_id_column_name]) + save_dataframe( + population_data_no_id, + Path(config.ensemble_attack.data_paths.population_path), + "population_all_with_challenge_no_id.csv", + ) + + process_split_data( + all_population_data=population_data, + processed_attack_data_path=Path(config.ensemble_attack.data_paths.processed_attack_data_path), + # TODO: column_to_stratify value is not documented in the original codebase. + column_to_stratify=config.ensemble_attack.data_processing_config.column_to_stratify, + num_total_samples=config.ensemble_attack.data_processing_config.population_sample_size, + random_seed=config.ensemble_attack.random_seed, + ) # Saving the model config from the config.yaml into a json file # because that's what the ensemble attack code will be looking for @@ -60,16 +70,17 @@ def main(config: DictConfig) -> None: } json.dump(training_config, f) - log(INFO, "Training the shadow models...") - shadow_data_paths = run_shadow_model_training(config.ensemble_attack) - shadow_data_paths = [Path(path) for path in shadow_data_paths] + if config.ensemble_attack.pipeline.run_shadow_model_training: + log(INFO, "Training the shadow models...") + shadow_data_paths = run_shadow_model_training(config.ensemble_attack) + shadow_data_paths = [Path(path) for path in shadow_data_paths] - log(INFO, "Training the target model...") - target_model_synthetic_path = run_target_model_training(config.ensemble_attack) + log(INFO, "Training the target model...") + target_model_synthetic_path = run_target_model_training(config.ensemble_attack) - if config.pipeline.run_metaclassifier_training: + if config.ensemble_attack.pipeline.run_metaclassifier_training: log(INFO, "Training the metaclassifier...") - if not config.pipeline.run_shadow_model_training: + if not 
config.ensemble_attack.pipeline.run_shadow_model_training: # If shadow model training is skipped, we need to provide the previous shadow model and target model paths. shadow_data_paths = [ Path(path) for path in config.ensemble_attack.shadow_training.final_shadow_models_path diff --git a/src/midst_toolkit/attacks/ensemble/rmia/shadow_model_training.py b/src/midst_toolkit/attacks/ensemble/rmia/shadow_model_training.py index 46347838..572a4991 100644 --- a/src/midst_toolkit/attacks/ensemble/rmia/shadow_model_training.py +++ b/src/midst_toolkit/attacks/ensemble/rmia/shadow_model_training.py @@ -70,7 +70,7 @@ def train_fine_tuned_shadow_models( An example of this config is provided in ``examples/ensemble_attack/config.yaml``. Required keys are: - table_domain_file_path (str): Path to the table domain json file. - dataset_meta_file_path (str): Path to dataset meta json file. - - tabddpm_training_config_path (str): Path to table's training config json file. + - training_config_path (str): Path to table's training config json file. fine_tuning_config: Configuration dictionary containing shadow model fine-tuning specific information. init_model_id: An ID to assign to the pre-trained initial models. This can be used to save multiple pre-trained models with different IDs. @@ -260,7 +260,7 @@ def train_shadow_on_half_challenge_data( An example of this config is provided in ``examples/ensemble_attack/config.yaml``. Required keys are: - table_domain_file_path (str): Path to the table domain json file. - dataset_meta_file_path (str): Path to dataset meta json file. - - tabddpm_training_config_path (str): Path to table's training config json file. + - training_config_path (str): Path to table's training config json file. table_name: Name of the main table to be used for training the TabDDPM model. id_column_name: Name of the ID column in the data. 
number_of_points_to_synthesize: Size of the synthetic data to be generated by each shadow model, @@ -406,7 +406,7 @@ def train_three_sets_of_shadow_models( An example of this config is provided in ``examples/ensemble_attack/config.yaml``. Required keys are: - table_domain_file_path (str): Path to the table domain json file. - dataset_meta_file_path (str): Path to dataset meta json file. - - tabddpm_training_config_path (str): Path to table's training config json file. + - training_config_path (str): Path to table's training config json file. fine_tuning_config: Configuration dictionary containing shadow model fine-tuning specific information. An example of this config is provided in ``examples/ensemble_attack/config.yaml``. Required keys are: - fine_tune_diffusion_iterations (int): Number of diffusion fine-tuning iterations. diff --git a/tests/integration/attacks/ensemble/configs/shadow_training_config.yaml b/tests/integration/attacks/ensemble/configs/shadow_training_config.yaml index 717f3d82..4ece9d95 100644 --- a/tests/integration/attacks/ensemble/configs/shadow_training_config.yaml +++ b/tests/integration/attacks/ensemble/configs/shadow_training_config.yaml @@ -7,7 +7,7 @@ shadow_training: training_json_config_paths: # Config json files used for tabddpm training on the trans table table_domain_file_path: ${base_test_assets_dir}/data_configs/trans_domain.json dataset_meta_file_path: ${base_test_assets_dir}/data_configs/dataset_meta.json - tabddpm_training_config_path: ${base_test_assets_dir}/data_configs/trans.json + training_config_path: ${base_test_assets_dir}/data_configs/trans.json # Model training artifacts are saved under shadow_models_data_path/workspace_name/exp_name # Also, training configs for each shadow model are created under shadow_models_data_path. 
shadow_models_output_path: ${base_test_assets_dir}/shadow_models_data diff --git a/tests/integration/attacks/ensemble/test_shadow_model_training.py b/tests/integration/attacks/ensemble/test_shadow_model_training.py index bd4c49e9..2dbab11b 100644 --- a/tests/integration/attacks/ensemble/test_shadow_model_training.py +++ b/tests/integration/attacks/ensemble/test_shadow_model_training.py @@ -140,7 +140,7 @@ def test_train_and_fine_tune_tabddpm(cfg: DictConfig, tmp_path: Path) -> None: "tests/unit/attacks/ensemble/assets/population_data/all_population.csv" ) # For testing purposes only. fine_tuning_set = copy.deepcopy(train_set) - tabddpm_config_path = Path(cfg.shadow_training.training_json_config_paths.tabddpm_training_config_path) + training_config_path = Path(cfg.shadow_training.training_json_config_paths.training_config_path) tmp_training_dir = tmp_path # We should move ``dataset_meta.json`` and ``trans_domain.json`` files to the ``tmp_training_dir`` assert Path(cfg.shadow_training.training_json_config_paths.table_domain_file_path).exists() @@ -155,7 +155,7 @@ def test_train_and_fine_tune_tabddpm(cfg: DictConfig, tmp_path: Path) -> None: ) configs, save_dir = save_additional_training_config( data_dir=tmp_training_dir, - training_config_json_path=tabddpm_config_path, + training_config_json_path=training_config_path, final_config_json_path=tmp_training_dir / "trans.json", experiment_name="test_experiment", workspace_name="test_workspace", diff --git a/tests/unit/attacks/ensemble/configs/shadow_training_config.yaml b/tests/unit/attacks/ensemble/configs/shadow_training_config.yaml index a6319f49..ed8e5a1f 100644 --- a/tests/unit/attacks/ensemble/configs/shadow_training_config.yaml +++ b/tests/unit/attacks/ensemble/configs/shadow_training_config.yaml @@ -7,7 +7,7 @@ shadow_training: training_json_config_paths: # Config json files used for tabddpm training on the trans table table_domain_file_path: ${base_test_assets_dir}/data_configs/trans_domain.json 
dataset_meta_file_path: ${base_test_assets_dir}/data_configs/dataset_meta.json - tabddpm_training_config_path: ${base_test_assets_dir}/data_configs/trans.json + training_config_path: ${base_test_assets_dir}/data_configs/trans.json # Model training artifacts are saved under shadow_models_data_path/workspace_name/exp_name # Also, training configs for each shadow model are created under shadow_models_data_path. shadow_models_output_path: ${base_test_assets_dir}/shadow_models_data diff --git a/tests/unit/attacks/ensemble/test_shadow_model_utils.py b/tests/unit/attacks/ensemble/test_shadow_model_utils.py index a5f290b1..722918ea 100644 --- a/tests/unit/attacks/ensemble/test_shadow_model_utils.py +++ b/tests/unit/attacks/ensemble/test_shadow_model_utils.py @@ -18,7 +18,7 @@ def cfg() -> DictConfig: def test_save_additional_tabddpm_config(cfg: DictConfig, tmp_path: Path) -> None: # Input path - tabddpm_config_path = Path(cfg.shadow_training.training_json_config_paths.tabddpm_training_config_path) + tabddpm_config_path = Path(cfg.shadow_training.training_json_config_paths.training_config_path) # Extract original parameters with open(tabddpm_config_path, "r") as file: From 077d9091e3b0a1a7c0d4524c9a6b7c9b6951f0ae Mon Sep 17 00:00:00 2001 From: Marcelo Lotif Date: Wed, 25 Feb 2026 10:07:08 -0500 Subject: [PATCH 09/38] Removing ids dynamically --- examples/ensemble_attack/real_data_collection.py | 3 ++- examples/gan/ensemble_attack/run.py | 4 +++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/examples/ensemble_attack/real_data_collection.py b/examples/ensemble_attack/real_data_collection.py index 57048b56..23424ad8 100644 --- a/examples/ensemble_attack/real_data_collection.py +++ b/examples/ensemble_attack/real_data_collection.py @@ -197,7 +197,8 @@ def collect_population_data_ensemble( data_processing_config=data_processing_config, ) # Drop ids. 
- df_population_no_id = df_population.drop(columns=["trans_id", "account_id"]) + id_columns = [c for c in df_population.columns if c.endswith("_id")] + df_population_no_id = df_population.drop(columns=id_columns) # Save the population data save_dataframe(df_population, save_dir, "population_all.csv") save_dataframe(df_population_no_id, save_dir, "population_all_no_id.csv") diff --git a/examples/gan/ensemble_attack/run.py b/examples/gan/ensemble_attack/run.py index 09f75a09..7edadb44 100644 --- a/examples/gan/ensemble_attack/run.py +++ b/examples/gan/ensemble_attack/run.py @@ -37,7 +37,9 @@ def main(config: DictConfig) -> None: "population_all_with_challenge.csv", ) - population_data_no_id = population_data.drop(columns=[config.ensemble_attack.table_id_column_name]) + # Removing id columns and saving the dataset + id_columns = [c for c in population_data.columns if c.endswith("_id")] + population_data_no_id = population_data.drop(columns=id_columns) save_dataframe( population_data_no_id, Path(config.ensemble_attack.data_paths.population_path), From b711fbd4080a2ed31d853bcfdf024bbd5a64f6aa Mon Sep 17 00:00:00 2001 From: Marcelo Lotif Date: Wed, 25 Feb 2026 15:51:33 -0500 Subject: [PATCH 10/38] Working! 
--- examples/gan/ensemble_attack/config.yaml | 6 +++++- src/midst_toolkit/attacks/ensemble/blending.py | 6 +++++- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/examples/gan/ensemble_attack/config.yaml b/examples/gan/ensemble_attack/config.yaml index 353ea755..d889bb48 100644 --- a/examples/gan/ensemble_attack/config.yaml +++ b/examples/gan/ensemble_attack/config.yaml @@ -11,6 +11,7 @@ ensemble_attack: data_paths: processed_attack_data_path: ${base_data_dir}/ensemble_attack population_path: ${base_data_dir}/ensemble_attack + attack_evaluation_result_path: ${results_dir}/evaluation_results # Path where the attack evaluation results will be stored data_processing_config: column_to_stratify: "trans_type" # Attention: This value is not documented in the original codebase. @@ -60,4 +61,7 @@ ensemble_attack: use_gpu: false # Temporary. Might remove having an epoch parameter. epochs: 1 - meta_classifier_model_name: ${ensemble_attach.metaclassifier.model_type}_metaclassifier_model + meta_classifier_model_name: ${ensemble_attack.metaclassifier.model_type}_metaclassifier_model + + model_paths: + metaclassifier_model_path: ${results_dir}/trained_models # Path where the trained metaclassifier model will be saved diff --git a/src/midst_toolkit/attacks/ensemble/blending.py b/src/midst_toolkit/attacks/ensemble/blending.py index 2b7c194d..87417d37 100644 --- a/src/midst_toolkit/attacks/ensemble/blending.py +++ b/src/midst_toolkit/attacks/ensemble/blending.py @@ -251,6 +251,10 @@ def predict( score = None if y_test is not None: - score = TprAtFpr.get_tpr_at_fpr(true_membership=y_test, predictions=probabilities, max_fpr=0.1) + score = TprAtFpr.get_tpr_at_fpr( + true_membership=y_test, + predicted_membership=probabilities, + fpr_threshold=0.1, + ) return probabilities, score From 1a38af25549d160f09a57f293735797371f0e45c Mon Sep 17 00:00:00 2001 From: Marcelo Lotif Date: Tue, 3 Mar 2026 13:53:59 -0500 Subject: [PATCH 11/38] Fixing indent on config file and adding some 
more information to the README file. --- examples/gan/ensemble_attack/README.md | 26 +++++- examples/gan/ensemble_attack/config.yaml | 108 +++++++++++------------ 2 files changed, 78 insertions(+), 56 deletions(-) diff --git a/examples/gan/ensemble_attack/README.md b/examples/gan/ensemble_attack/README.md index 54afb0c3..5879aa9c 100644 --- a/examples/gan/ensemble_attack/README.md +++ b/examples/gan/ensemble_attack/README.md @@ -6,7 +6,7 @@ using the [CTGAN](https://arxiv.org/pdf/1907.00503) model. ## Downloading data First, we need the data. Download it from this -[Google Drive link](https://drive.google.com/file/d/1B9z4vh51mH6ZMj5E0pJitqR8lid3EJKM/view?usp=sharing), +[Google Drive link](https://drive.google.com/file/d/1B9z4vh51mH6ZMj5E0pJitqR8lid3EJKM/view?usp=drive_link), extract the files and place them in a `/data/ensemble_attack` folder in within this folder (`examples/gan`). @@ -17,11 +17,33 @@ extract the files and place them in a `/data/ensemble_attack` folder in within t Here is a description of the files that have been extracted: - `master_challenge_train.csv`: - `population_all_with_challenge.csv`: +- `dataset_meta.json`: Metadata about the relationship between the tables in the dataset. Since this is a +single table dataset, it will only contain information about the transaction (`trans`) table. +- `trans_domain.json`: Metadata about the columns of the transaction table, such as their size +and type (`continuous` or `discrete`). +- `data_types.json`: Additional metadata about the columns, splitting them into 4 types: + - `numerical`: a list of the columns that contain numerical information + - `categorical`: a list of the columns that contain categorical information + - `variable_to_predict`: the name of the target column that will be predicted + - `id_column_name`: the name of the column in the table that represents the rows' id. + +With the data present in the correct folder, we can proceed with running the attack. 
## Running the attack -To run, execute the following command from the prooject's root folder: +> [!NOTE] +> In the [`config.yaml`](config.yaml) file, the attribute `ensemble_attack.shadow_trainig.model_name` +> is what determines this attack will be run with the CTGAN model. + +To run the attack, execute the following command from the project's root folder: ```bash python -m examples.gan.ensemble_attack.run ``` + +This will take a long time to run, so it might be a good idea to execute it as a +background process. If you want to have a quick test run before kicking off the +full process, you can change the number of iterations, epochs, population and +sample sizes to smaller numbers. + +## Results diff --git a/examples/gan/ensemble_attack/config.yaml b/examples/gan/ensemble_attack/config.yaml index d889bb48..5b455479 100644 --- a/examples/gan/ensemble_attack/config.yaml +++ b/examples/gan/ensemble_attack/config.yaml @@ -4,64 +4,64 @@ base_data_dir: examples/gan/data results_dir: examples/gan/results ensemble_attack: - random_seed: null # Set this to a value if you want to set a random seed for reproducibility - table_name: "trans" - table_id_column_name: "trans_id" + random_seed: null # Set this to a value if you want to set a random seed for reproducibility + table_name: "trans" + table_id_column_name: "trans_id" - data_paths: - processed_attack_data_path: ${base_data_dir}/ensemble_attack - population_path: ${base_data_dir}/ensemble_attack - attack_evaluation_result_path: ${results_dir}/evaluation_results # Path where the attack evaluation results will be stored + data_paths: + processed_attack_data_path: ${base_data_dir}/ensemble_attack + population_path: ${base_data_dir}/ensemble_attack + attack_evaluation_result_path: ${results_dir}/evaluation_results # Path where the attack evaluation results will be stored - data_processing_config: - column_to_stratify: "trans_type" # Attention: This value is not documented in the original codebase. 
- population_sample_size: 40000 # Population size is the total data that your attack has access to. + data_processing_config: + column_to_stratify: "trans_type" # Attention: This value is not documented in the original codebase. + population_sample_size: 40000 # Population size is the total data that your attack has access to. - pipeline: - # TODO: properly test these - run_data_processing: true # Set this to false if you have already saved the processed data - run_shadow_model_training: true # Set this to false if shadow models are already trained and saved - run_metaclassifier_training: true + pipeline: + # TODO: properly test these + run_data_processing: true # Set this to false if you have already saved the processed data + run_shadow_model_training: true # Set this to false if shadow models are already trained and saved + run_metaclassifier_training: true - shadow_training: - model_name: ctgan - model_config: # Configurations specific for the CTGAN model - training: - epochs: 300 - verbose: True - synthesizing: - sample_size: 20000 - shadow_models_output_path: ${results_dir}/ensemble_attack/shadow_models - target_model_output_path: ${results_dir}/shadow_target_model_and_data - training_json_config_paths: # Config json files used for tabddpm training on the trans table - table_domain_file_path: ${base_data_dir}/trans_domain.json - dataset_meta_file_path: ${base_data_dir}/dataset_meta.json - training_config_path: ${base_data_dir}/trans.json # if this is not present, it will be created by copying the example config - fine_tuning_config: - fine_tune_diffusion_iterations: 200000 - fine_tune_classifier_iterations: 20000 - pre_train_data_size: 60000 - number_of_points_to_synthesize: 20000 # Number of synthetic data samples to be generated by shadow models. 
+ shadow_training: + model_name: ctgan + model_config: # Configurations specific for the CTGAN model + training: + epochs: 300 + verbose: True + synthesizing: + sample_size: 20000 + shadow_models_output_path: ${results_dir}/ensemble_attack/shadow_models + target_model_output_path: ${results_dir}/shadow_target_model_and_data + training_json_config_paths: # Config json files used for tabddpm training on the trans table + table_domain_file_path: ${base_data_dir}/trans_domain.json + dataset_meta_file_path: ${base_data_dir}/dataset_meta.json + training_config_path: ${base_data_dir}/trans.json # if this is not present, it will be created by copying the example config + fine_tuning_config: + fine_tune_diffusion_iterations: 200000 + fine_tune_classifier_iterations: 20000 + pre_train_data_size: 60000 + number_of_points_to_synthesize: 20000 # Number of synthetic data samples to be generated by shadow models. - final_shadow_models_path: [ - "${ensemble_attack.shadow_training.shadow_models_output_path}/initial_model_rmia_1/shadow_workspace/pre_trained_model/rmia_shadows.pkl", - "${ensemble_attack.shadow_training.shadow_models_output_path}/initial_model_rmia_2/shadow_workspace/pre_trained_model/rmia_shadows.pkl", - "${ensemble_attack.shadow_training.shadow_models_output_path}/shadow_model_rmia_third_set/shadow_workspace/trained_model/rmia_shadows_third_set.pkl", - ] - target_synthetic_data_path: ${ensemble_attack.shadow_training.target_model_output_path}/target_synthetic_data.csv + final_shadow_models_path: [ + "${ensemble_attack.shadow_training.shadow_models_output_path}/initial_model_rmia_1/shadow_workspace/pre_trained_model/rmia_shadows.pkl", + "${ensemble_attack.shadow_training.shadow_models_output_path}/initial_model_rmia_2/shadow_workspace/pre_trained_model/rmia_shadows.pkl", + "${ensemble_attack.shadow_training.shadow_models_output_path}/shadow_model_rmia_third_set/shadow_workspace/trained_model/rmia_shadows_third_set.pkl", + ] + target_synthetic_data_path: 
${ensemble_attack.shadow_training.target_model_output_path}/target_synthetic_data.csv - # Metaclassifier settings - metaclassifier: - # Data types json file is used for xgboost model training. - data_types_file_path: ${base_data_dir}/data_types.json - model_type: "xgb" - # Model training parameters - num_optuna_trials: 100 # Original code: 100 - num_kfolds: 5 - use_gpu: false - # Temporary. Might remove having an epoch parameter. - epochs: 1 - meta_classifier_model_name: ${ensemble_attack.metaclassifier.model_type}_metaclassifier_model + # Metaclassifier settings + metaclassifier: + # Data types json file is used for xgboost model training. + data_types_file_path: ${base_data_dir}/data_types.json + model_type: "xgb" + # Model training parameters + num_optuna_trials: 100 # Original code: 100 + num_kfolds: 5 + use_gpu: false + # Temporary. Might remove having an epoch parameter. + epochs: 1 + meta_classifier_model_name: ${ensemble_attack.metaclassifier.model_type}_metaclassifier_model - model_paths: - metaclassifier_model_path: ${results_dir}/trained_models # Path where the trained metaclassifier model will be saved + model_paths: + metaclassifier_model_path: ${results_dir}/trained_models # Path where the trained metaclassifier model will be saved From af4f04ea422ddfc31a4edcb9ccce7246d03d4564 Mon Sep 17 00:00:00 2001 From: Marcelo Lotif Date: Wed, 4 Mar 2026 17:09:31 -0500 Subject: [PATCH 12/38] Adding test attack model code --- examples/ensemble_attack/test_attack_model.py | 34 ++++++++++++++--- examples/gan/ensemble_attack/README.md | 36 +++++++++++++++--- examples/gan/ensemble_attack/config.yaml | 22 +++++++++++ .../ensemble_attack/make_challenge_dataset.py | 38 +++++++++++++++++++ .../gan/ensemble_attack/test_attack_model.py | 18 +++++++++ .../{run.py => train_attack_model.py} | 6 +-- examples/gan/synthesize.py | 8 +++- examples/gan/train.py | 19 ++++++++-- 8 files changed, 161 insertions(+), 20 deletions(-) create mode 100644 
examples/gan/ensemble_attack/make_challenge_dataset.py create mode 100644 examples/gan/ensemble_attack/test_attack_model.py rename examples/gan/ensemble_attack/{run.py => train_attack_model.py} (97%) diff --git a/examples/ensemble_attack/test_attack_model.py b/examples/ensemble_attack/test_attack_model.py index 36cad0b2..8ee08153 100644 --- a/examples/ensemble_attack/test_attack_model.py +++ b/examples/ensemble_attack/test_attack_model.py @@ -261,11 +261,29 @@ def train_rmia_shadows_for_test_phase(config: DictConfig) -> list[dict[str, list A list containing three dictionaries, each representing a collection of shadow models with their training data IDs and generated synthetic outputs. """ - df_challenge_experiment, df_master_train = collect_challenge_and_train_data( - config.data_processing_config, - processed_attack_data_path=Path(config.data_paths.processed_attack_data_path), - targets_data_path=Path(config.data_processing_config.midst_data_path), + # Checking if challenge data exists + challenge_data_path = ( + Path(config.data_paths.processed_attack_data_path) / "population_all_with_challenge_challenge.csv" ) + + if challenge_data_path.exists(): + log(INFO, "Skipping data collection for testing phase.") + df_challenge_experiment = load_dataframe( + Path(config.data_paths.processed_attack_data_path), + "population_all_with_challenge_challenge.csv", + ) + df_master_train = load_dataframe( + Path(config.data_paths.processed_attack_data_path), + "master_challenge_train.csv", + ) + else: + # If challenge data does not exist, collect it from the cluster + df_challenge_experiment, df_master_train = collect_challenge_and_train_data( + config.data_processing_config, + processed_attack_data_path=Path(config.data_paths.processed_attack_data_path), + targets_data_path=Path(config.data_processing_config.midst_data_path), + ) + # Load the challenge dataframe for training RMIA shadow models. 
rmia_training_choice = RmiaTrainingDataChoice(config.target_model.attack_rmia_shadow_training_data_choice) df_challenge = select_challenge_data_for_training(rmia_training_choice, df_challenge_experiment, df_master_train) @@ -321,7 +339,13 @@ def run_metaclassifier_testing( test_data = pd.read_csv(challenge_data_path) log(INFO, f"Challenge data loaded from {challenge_data_path} with a size of {len(test_data)}.") - test_target = pd.read_csv(challenge_label_path).to_numpy().squeeze() + if challenge_label_path.suffix == ".npy": + test_target = np.load(challenge_label_path).squeeze() + elif challenge_label_path.suffix == ".csv": + test_target = pd.read_csv(challenge_label_path).to_numpy().squeeze() + else: + raise ValueError(f"Unsupported challenge label file type: {challenge_label_path}. Must be .npy or .csv.") + assert len(test_data) == len(test_target), "Number of challenge labels must match number of challenge data points." target_synthetic_path = Path(config.target_model.target_synthetic_data_path) diff --git a/examples/gan/ensemble_attack/README.md b/examples/gan/ensemble_attack/README.md index 5879aa9c..3ba14821 100644 --- a/examples/gan/ensemble_attack/README.md +++ b/examples/gan/ensemble_attack/README.md @@ -15,8 +15,7 @@ extract the files and place them in a `/data/ensemble_attack` folder in within t > of the [`config.yaml`](config.yaml) file. Here is a description of the files that have been extracted: -- `master_challenge_train.csv`: -- `population_all_with_challenge.csv`: +- `population_all_with_challenge.csv`: The full set of training data - `dataset_meta.json`: Metadata about the relationship between the tables in the dataset. Since this is a single table dataset, it will only contain information about the transaction (`trans`) table. - `trans_domain.json`: Metadata about the columns of the transaction table, such as their size @@ -29,16 +28,37 @@ and type (`continuous` or `discrete`). 
With the data present in the correct folder, we can proceed with running the attack. -## Running the attack +## Training the real model + +To train the real model and synthetic data that will be the target of the attack, run: + +```bash +python -m examples.gan.synthesize --config-path=./ensemble_attack +``` + +## Producing the challenge points dataset + +The challenge points dataset is composed of real data points where half of them +were used in training the real model and half weren't. It is the dataset we are going +to use to evaluate how good the attack model is in differentiating between +the points used in training and the ones not used in trainig. + +To produce such dataset, run the following script: + +```bash +python -m examples.gan.ensemble_attack.make_challenge_dataset +``` + +## Training the attack model > [!NOTE] > In the [`config.yaml`](config.yaml) file, the attribute `ensemble_attack.shadow_trainig.model_name` > is what determines this attack will be run with the CTGAN model. -To run the attack, execute the following command from the project's root folder: +To train the attack models, execute the following command: ```bash -python -m examples.gan.ensemble_attack.run +python -m examples.gan.ensemble_attack.train_attack_model ``` This will take a long time to run, so it might be a good idea to execute it as a @@ -46,4 +66,8 @@ background process. If you want to have a quick test run before kicking off the full process, you can change the number of iterations, epochs, population and sample sizes to smaller numbers. 
-## Results +## Testing the attack model + +```bash +python -m examples.gan.ensemble_attack.test_attack_model +``` diff --git a/examples/gan/ensemble_attack/config.yaml b/examples/gan/ensemble_attack/config.yaml index 5b455479..894f8663 100644 --- a/examples/gan/ensemble_attack/config.yaml +++ b/examples/gan/ensemble_attack/config.yaml @@ -3,6 +3,15 @@ base_data_dir: examples/gan/data results_dir: examples/gan/results +training: + epochs: 300 + verbose: True + data_path: ${base_data_dir}/ensemble_attack/population_all_with_challenge.csv + sample_size: 100000 + +synthesizing: + sample_size: 20000 + ensemble_attack: random_seed: null # Set this to a value if you want to set a random seed for reproducibility table_name: "trans" @@ -65,3 +74,16 @@ ensemble_attack: model_paths: metaclassifier_model_path: ${results_dir}/trained_models # Path where the trained metaclassifier model will be saved + + target_model: # This is only used for testing the attack on a real target model. + target_model_directory: ${ensemble_attack.shadow_training.target_model_output_path}/target_model/shadow_workspace + target_model_id: None + target_model_name: trained_target_model + target_synthetic_data_path: ${ensemble_attack.shadow_training.target_model_output_path}/target_synthetic_data.csv + challenge_data_path: ${ensemble_attack.data_paths.processed_attack_data_path}/master_challenge_test.csv + challenge_label_path: ${ensemble_attack.data_paths.processed_attack_data_path}/master_challenge_test_labels.npy + + target_shadow_models_output_path: ${results_dir}/test_all_targets # Sub-directory to store test shadows and results + attack_probabilities_result_path: ${results_dir}/test_probabilities + attack_rmia_shadow_training_data_choice: "combined" # Options: "combined", "only_challenge", "only_train". This determines which data to use for training RMIA attack model in testing phase. + # See select_challenge_data_for_training()'s docstring for more details. 
diff --git a/examples/gan/ensemble_attack/make_challenge_dataset.py b/examples/gan/ensemble_attack/make_challenge_dataset.py new file mode 100644 index 00000000..0aa2d5b7 --- /dev/null +++ b/examples/gan/ensemble_attack/make_challenge_dataset.py @@ -0,0 +1,38 @@ +from logging import INFO +from pathlib import Path + +import hydra +import pandas as pd +from omegaconf import DictConfig + +from examples.gan.utils import get_table_name +from midst_toolkit.common.logger import log + + +@hydra.main(config_path="./", config_name="config", version_base=None) +def make_challenge_dataset(config: DictConfig) -> None: + """Main function to make the challenge dataset.""" + log(INFO, "Making challenge dataset...") + + if config.training.data_path is None: + dataset_name = get_table_name(config.base_data_dir) + real_data = pd.read_csv(Path(config.base_data_dir) / f"{dataset_name}.csv") + else: + dataset_name = Path(config.training.data_path).stem + real_data = pd.read_csv(config.training.data_path) + + training_data = pd.read_csv(Path(config.results_dir) / f"{dataset_name}_sampled.csv") + untrained_data = real_data[~real_data["trans_id"].isin(training_data["trans_id"])].sample(len(training_data)) + + challenge_data = pd.concat([training_data, untrained_data]) + challenge_data["label"] = [1] * len(training_data) + [0] * len(untrained_data) + + challenge_data_path = ( + Path(config.ensemble_attack.data_paths.processed_attack_data_path) / f"{dataset_name}_challenge.csv" + ) + log(INFO, f"Saving challenge data to {challenge_data_path}") + challenge_data.to_csv(challenge_data_path, index=False) + + +if __name__ == "__main__": + make_challenge_dataset() diff --git a/examples/gan/ensemble_attack/test_attack_model.py b/examples/gan/ensemble_attack/test_attack_model.py new file mode 100644 index 00000000..29b42b76 --- /dev/null +++ b/examples/gan/ensemble_attack/test_attack_model.py @@ -0,0 +1,18 @@ +from logging import INFO + +import hydra +from omegaconf import DictConfig + +from 
examples.ensemble_attack.test_attack_model import run_metaclassifier_testing +from midst_toolkit.common.logger import log + + +@hydra.main(config_path="./", config_name="config", version_base=None) +def test_attack_model(config: DictConfig) -> None: + """Main function to test the attack model.""" + log(INFO, f"Testing attack model at {config.ensemble_attack.target_model.target_model_directory}...") + run_metaclassifier_testing(config.ensemble_attack) + + +if __name__ == "__main__": + test_attack_model() diff --git a/examples/gan/ensemble_attack/run.py b/examples/gan/ensemble_attack/train_attack_model.py similarity index 97% rename from examples/gan/ensemble_attack/run.py rename to examples/gan/ensemble_attack/train_attack_model.py index 26b69d78..acd5503a 100644 --- a/examples/gan/ensemble_attack/run.py +++ b/examples/gan/ensemble_attack/train_attack_model.py @@ -14,9 +14,9 @@ @hydra.main(config_path="./", config_name="config", version_base=None) -def main(config: DictConfig) -> None: +def train_attack_model(config: DictConfig) -> None: """ - Run the Ensemble Attack pipeline with the CTGAN model. + Train the Ensemble Attack pipeline with CTGAN model. As the first step, data processing is done. Second step is shadow model training used for RMIA attack. 
@@ -110,4 +110,4 @@ def main(config: DictConfig) -> None: if __name__ == "__main__": - main() + train_attack_model() diff --git a/examples/gan/synthesize.py b/examples/gan/synthesize.py index 413b65e0..53a7b761 100644 --- a/examples/gan/synthesize.py +++ b/examples/gan/synthesize.py @@ -34,8 +34,12 @@ def main(config: DictConfig) -> None: log(INFO, f"Synthesizing data of size {config.synthesizing.sample_size}...") synthetic_data = ctgan.sample(num_rows=config.synthesizing.sample_size) - table_name = get_table_name(config.base_data_dir) - synthetic_data_file = Path(config.results_dir) / f"{table_name}_synthetic.csv" + if config.training.data_path is not None: + dataset_name = Path(config.training.data_path).stem + else: + dataset_name = get_table_name(config.base_data_dir) + + synthetic_data_file = Path(config.results_dir) / f"{dataset_name}_synthetic.csv" log(INFO, f"Saving synthetic data to {synthetic_data_file}...") synthetic_data.to_csv(synthetic_data_file, index=False) diff --git a/examples/gan/train.py b/examples/gan/train.py index 40bbbe02..0951a342 100644 --- a/examples/gan/train.py +++ b/examples/gan/train.py @@ -22,15 +22,26 @@ def main(config: DictConfig) -> None: Args: config: Configuration as an OmegaConf DictConfig object. 
""" - log(INFO, "Loading data...") - table_name = get_table_name(config.base_data_dir) + if config.training.data_path is None: + log(INFO, "Loading data with table name...") + dataset_name = table_name + real_data = pd.read_csv(Path(config.base_data_dir) / f"{table_name}.csv") + + else: + log(INFO, f"Loading data from {config.training.data_path}...") + dataset_name = Path(config.training.data_path).stem + real_data = pd.read_csv(config.training.data_path) + + if config.training.sample_size is not None: + log(INFO, f"Sampling {config.training.sample_size} rows from data...") + real_data = real_data.sample(n=config.training.sample_size) + real_data.to_csv(Path(config.results_dir) / f"{dataset_name}_sampled.csv", index=False) + with open(Path(config.base_data_dir) / f"{table_name}_domain.json", "r") as f: domain_info = json.load(f) - real_data = pd.read_csv(Path(config.base_data_dir) / f"{table_name}.csv") - metadata, real_data_without_ids = get_single_table_svd_metadata(real_data, domain_info) log(INFO, "Fitting CTGAN...") From 5afb7740b217da5f34c0d2d0b5eb0e87ac4769ae Mon Sep 17 00:00:00 2001 From: Marcelo Lotif Date: Thu, 5 Mar 2026 11:26:26 -0500 Subject: [PATCH 13/38] Small bug fixes --- examples/ensemble_attack/test_attack_model.py | 4 ++-- examples/gan/ensemble_attack/README.md | 16 ++++++++++------ .../ensemble_attack/make_challenge_dataset.py | 13 ++++++++++--- 3 files changed, 22 insertions(+), 11 deletions(-) diff --git a/examples/ensemble_attack/test_attack_model.py b/examples/ensemble_attack/test_attack_model.py index 8ee08153..2efd20ef 100644 --- a/examples/ensemble_attack/test_attack_model.py +++ b/examples/ensemble_attack/test_attack_model.py @@ -263,14 +263,14 @@ def train_rmia_shadows_for_test_phase(config: DictConfig) -> list[dict[str, list """ # Checking if challenge data exists challenge_data_path = ( - Path(config.data_paths.processed_attack_data_path) / "population_all_with_challenge_challenge.csv" + 
Path(config.data_paths.processed_attack_data_path) / "population_all_with_challenge_challenge_data.csv" ) if challenge_data_path.exists(): log(INFO, "Skipping data collection for testing phase.") df_challenge_experiment = load_dataframe( Path(config.data_paths.processed_attack_data_path), - "population_all_with_challenge_challenge.csv", + "population_all_with_challenge_challenge_data.csv", ) df_master_train = load_dataframe( Path(config.data_paths.processed_attack_data_path), diff --git a/examples/gan/ensemble_attack/README.md b/examples/gan/ensemble_attack/README.md index 3ba14821..42214139 100644 --- a/examples/gan/ensemble_attack/README.md +++ b/examples/gan/ensemble_attack/README.md @@ -3,7 +3,7 @@ On this example, we demonstrate how to run the [Ensemble Attack](examples/ensemble_attack) using the [CTGAN](https://arxiv.org/pdf/1907.00503) model. -## Downloading data +## 1. Downloading data First, we need the data. Download it from this [Google Drive link](https://drive.google.com/file/d/1B9z4vh51mH6ZMj5E0pJitqR8lid3EJKM/view?usp=drive_link), @@ -28,15 +28,16 @@ and type (`continuous` or `discrete`). With the data present in the correct folder, we can proceed with running the attack. -## Training the real model +## 2. Training the real model -To train the real model and synthetic data that will be the target of the attack, run: +To train the real model and produce the synthetic data that will be the target of the +attack, you can run: ```bash python -m examples.gan.synthesize --config-path=./ensemble_attack ``` -## Producing the challenge points dataset +## 3. Producing the challenge points dataset The challenge points dataset is composed of real data points where half of them were used in training the real model and half weren't. It is the dataset we are going @@ -49,7 +50,7 @@ To produce such dataset, run the following script: python -m examples.gan.ensemble_attack.make_challenge_dataset ``` -## Training the attack model +## 4. 
Training the attack model > [!NOTE] > In the [`config.yaml`](config.yaml) file, the attribute `ensemble_attack.shadow_trainig.model_name` @@ -66,7 +67,10 @@ background process. If you want to have a quick test run before kicking off the full process, you can change the number of iterations, epochs, population and sample sizes to smaller numbers. -## Testing the attack model +## 5. Testing the attack model + +To test the attack model against the model and synthetic data produced on +[step 2](#2-training-the-real-model), please run: ```bash python -m examples.gan.ensemble_attack.test_attack_model diff --git a/examples/gan/ensemble_attack/make_challenge_dataset.py b/examples/gan/ensemble_attack/make_challenge_dataset.py index 0aa2d5b7..50b0ba50 100644 --- a/examples/gan/ensemble_attack/make_challenge_dataset.py +++ b/examples/gan/ensemble_attack/make_challenge_dataset.py @@ -2,6 +2,7 @@ from pathlib import Path import hydra +import numpy as np import pandas as pd from omegaconf import DictConfig @@ -22,16 +23,22 @@ def make_challenge_dataset(config: DictConfig) -> None: real_data = pd.read_csv(config.training.data_path) training_data = pd.read_csv(Path(config.results_dir) / f"{dataset_name}_sampled.csv") - untrained_data = real_data[~real_data["trans_id"].isin(training_data["trans_id"])].sample(len(training_data)) + id_column = config.ensemble_attack.table_id_column_name + untrained_data = real_data[~real_data[id_column].isin(training_data[id_column])].sample(len(training_data)) challenge_data = pd.concat([training_data, untrained_data]) - challenge_data["label"] = [1] * len(training_data) + [0] * len(untrained_data) + challenge_data_labels = np.concatenate([np.ones(len(training_data)), np.zeros(len(untrained_data))]) challenge_data_path = ( - Path(config.ensemble_attack.data_paths.processed_attack_data_path) / f"{dataset_name}_challenge.csv" + Path(config.ensemble_attack.data_paths.processed_attack_data_path) / f"{dataset_name}_challenge_data.csv" + ) + 
challenge_label_path = ( + Path(config.ensemble_attack.data_paths.processed_attack_data_path) / f"{dataset_name}_challenge_labels.npy" ) log(INFO, f"Saving challenge data to {challenge_data_path}") challenge_data.to_csv(challenge_data_path, index=False) + log(INFO, f"Saving challenge labels to {challenge_label_path}") + np.save(challenge_label_path, challenge_data_labels) if __name__ == "__main__": From e4ec79323a26d81ffac444309fbc9155bccbea43 Mon Sep 17 00:00:00 2001 From: Marcelo Lotif Date: Thu, 5 Mar 2026 13:35:17 -0500 Subject: [PATCH 14/38] Updates to readme and config file values --- examples/gan/ensemble_attack/README.md | 16 ++++++++++------ examples/gan/ensemble_attack/config.yaml | 10 +++++----- 2 files changed, 15 insertions(+), 11 deletions(-) diff --git a/examples/gan/ensemble_attack/README.md b/examples/gan/ensemble_attack/README.md index 42214139..20a31754 100644 --- a/examples/gan/ensemble_attack/README.md +++ b/examples/gan/ensemble_attack/README.md @@ -28,9 +28,13 @@ and type (`continuous` or `discrete`). With the data present in the correct folder, we can proceed with running the attack. -## 2. Training the real model +## 2. Training the target model -To train the real model and produce the synthetic data that will be the target of the +The target model is the model being attacked. If you already have a target model and +a set of synthetic data produced by it, you can add its path to the `ensemble_attack.target_model` +variables. + +If you wish to train one and produce the synthetic data that will be the target of the attack, you can run: ```bash @@ -40,9 +44,9 @@ python -m examples.gan.synthesize --config-path=./ensemble_attack ## 3. Producing the challenge points dataset The challenge points dataset is composed of real data points where half of them -were used in training the real model and half weren't. It is the dataset we are going +were used in training the target model and half weren't. 
It is the dataset we are going to use to evaluate how good the attack model is in differentiating between -the points used in training and the ones not used in trainig. +the points used in training and the ones not used in training. To produce such dataset, run the following script: @@ -69,8 +73,8 @@ sample sizes to smaller numbers. ## 5. Testing the attack model -To test the attack model against the model and synthetic data produced on -[step 2](#2-training-the-real-model), please run: +To test the attack model against the target model and synthetic data produced on +[step 2](#2-training-the-target-model), please run: ```bash python -m examples.gan.ensemble_attack.test_attack_model diff --git a/examples/gan/ensemble_attack/config.yaml b/examples/gan/ensemble_attack/config.yaml index 894f8663..0a99056c 100644 --- a/examples/gan/ensemble_attack/config.yaml +++ b/examples/gan/ensemble_attack/config.yaml @@ -76,12 +76,12 @@ ensemble_attack: metaclassifier_model_path: ${results_dir}/trained_models # Path where the trained metaclassifier model will be saved target_model: # This is only used for testing the attack on a real target model. 
- target_model_directory: ${ensemble_attack.shadow_training.target_model_output_path}/target_model/shadow_workspace + target_model_directory: ${results_dir} target_model_id: None - target_model_name: trained_target_model - target_synthetic_data_path: ${ensemble_attack.shadow_training.target_model_output_path}/target_synthetic_data.csv - challenge_data_path: ${ensemble_attack.data_paths.processed_attack_data_path}/master_challenge_test.csv - challenge_label_path: ${ensemble_attack.data_paths.processed_attack_data_path}/master_challenge_test_labels.npy + target_model_name: trained_ctgan_model + target_synthetic_data_path: ${results_dir}/population_all_with_challenge_synthetic.csv + challenge_data_path: ${ensemble_attack.data_paths.processed_attack_data_path}/population_all_with_challenge_challenge_data.csv + challenge_label_path: ${ensemble_attack.data_paths.processed_attack_data_path}/population_all_with_challenge_challenge_labels.npy target_shadow_models_output_path: ${results_dir}/test_all_targets # Sub-directory to store test shadows and results attack_probabilities_result_path: ${results_dir}/test_probabilities From 1c1312665265a61b479335b8e804ea08ed6ea5c6 Mon Sep 17 00:00:00 2001 From: Marcelo Lotif Date: Thu, 5 Mar 2026 14:06:07 -0500 Subject: [PATCH 15/38] Small changes on configs and script bug fixes --- examples/ensemble_attack/test_attack_model.py | 13 ++++++------- examples/gan/ensemble_attack/config.yaml | 8 ++++---- .../gan/ensemble_attack/make_challenge_dataset.py | 11 +++++------ examples/gan/train.py | 1 + 4 files changed, 16 insertions(+), 17 deletions(-) diff --git a/examples/ensemble_attack/test_attack_model.py b/examples/ensemble_attack/test_attack_model.py index 2efd20ef..bfe02c94 100644 --- a/examples/ensemble_attack/test_attack_model.py +++ b/examples/ensemble_attack/test_attack_model.py @@ -262,18 +262,17 @@ def train_rmia_shadows_for_test_phase(config: DictConfig) -> list[dict[str, list models with their training data IDs and generated 
synthetic outputs. """ # Checking if challenge data exists - challenge_data_path = ( - Path(config.data_paths.processed_attack_data_path) / "population_all_with_challenge_challenge_data.csv" - ) + processed_attack_data_path = Path(config.data_paths.processed_attack_data_path) + challenge_data_file_name = "population_all_with_challenge_challenge_data.csv" - if challenge_data_path.exists(): + if (processed_attack_data_path / challenge_data_file_name).exists(): log(INFO, "Skipping data collection for testing phase.") df_challenge_experiment = load_dataframe( - Path(config.data_paths.processed_attack_data_path), - "population_all_with_challenge_challenge_data.csv", + processed_attack_data_path, + challenge_data_file_name, ) df_master_train = load_dataframe( - Path(config.data_paths.processed_attack_data_path), + processed_attack_data_path, "master_challenge_train.csv", ) else: diff --git a/examples/gan/ensemble_attack/config.yaml b/examples/gan/ensemble_attack/config.yaml index 0a99056c..8331b9d8 100644 --- a/examples/gan/ensemble_attack/config.yaml +++ b/examples/gan/ensemble_attack/config.yaml @@ -1,12 +1,12 @@ # Training example configuration # Base data directory (can be overridden from command line) -base_data_dir: examples/gan/data +base_data_dir: examples/gan/data/ensemble_attack results_dir: examples/gan/results training: epochs: 300 verbose: True - data_path: ${base_data_dir}/ensemble_attack/population_all_with_challenge.csv + data_path: ${base_data_dir}/population_all_with_challenge.csv sample_size: 100000 synthesizing: @@ -18,8 +18,8 @@ ensemble_attack: table_id_column_name: "trans_id" data_paths: - processed_attack_data_path: ${base_data_dir}/ensemble_attack - population_path: ${base_data_dir}/ensemble_attack + processed_attack_data_path: ${base_data_dir} + population_path: ${base_data_dir} attack_evaluation_result_path: ${results_dir}/evaluation_results # Path where the attack evaluation results will be stored data_processing_config: diff --git 
a/examples/gan/ensemble_attack/make_challenge_dataset.py b/examples/gan/ensemble_attack/make_challenge_dataset.py index 50b0ba50..1ab30ea9 100644 --- a/examples/gan/ensemble_attack/make_challenge_dataset.py +++ b/examples/gan/ensemble_attack/make_challenge_dataset.py @@ -29,12 +29,11 @@ def make_challenge_dataset(config: DictConfig) -> None: challenge_data = pd.concat([training_data, untrained_data]) challenge_data_labels = np.concatenate([np.ones(len(training_data)), np.zeros(len(untrained_data))]) - challenge_data_path = ( - Path(config.ensemble_attack.data_paths.processed_attack_data_path) / f"{dataset_name}_challenge_data.csv" - ) - challenge_label_path = ( - Path(config.ensemble_attack.data_paths.processed_attack_data_path) / f"{dataset_name}_challenge_labels.npy" - ) + processed_attack_data_path = Path(config.ensemble_attack.data_paths.processed_attack_data_path) + processed_attack_data_path.mkdir(parents=True, exist_ok=True) + + challenge_data_path = processed_attack_data_path / f"{dataset_name}_challenge_data.csv" + challenge_label_path = processed_attack_data_path / f"{dataset_name}_challenge_labels.npy" log(INFO, f"Saving challenge data to {challenge_data_path}") challenge_data.to_csv(challenge_data_path, index=False) log(INFO, f"Saving challenge labels to {challenge_label_path}") diff --git a/examples/gan/train.py b/examples/gan/train.py index 0951a342..379c2a7f 100644 --- a/examples/gan/train.py +++ b/examples/gan/train.py @@ -37,6 +37,7 @@ def main(config: DictConfig) -> None: if config.training.sample_size is not None: log(INFO, f"Sampling {config.training.sample_size} rows from data...") real_data = real_data.sample(n=config.training.sample_size) + Path(config.results_dir).mkdir(parents=True, exist_ok=True) real_data.to_csv(Path(config.results_dir) / f"{dataset_name}_sampled.csv", index=False) with open(Path(config.base_data_dir) / f"{table_name}_domain.json", "r") as f: From 4e9a8c9d13b252cb6edea1b0bbae4936e50a200b Mon Sep 17 00:00:00 2001 From: 
Marcelo Lotif Date: Thu, 5 Mar 2026 14:47:48 -0500 Subject: [PATCH 16/38] Adding the compute attack success script and fixing minor issues --- .../ensemble_attack/compute_attack_success.py | 16 +++++++--- examples/ensemble_attack/test_attack_model.py | 5 +++- examples/gan/ensemble_attack/README.md | 28 +++++++++++++---- .../ensemble_attack/compute_attack_success.py | 30 +++++++++++++++++++ examples/gan/ensemble_attack/config.yaml | 3 -- .../gan/ensemble_attack/test_attack_model.py | 5 +++- 6 files changed, 72 insertions(+), 15 deletions(-) create mode 100644 examples/gan/ensemble_attack/compute_attack_success.py diff --git a/examples/ensemble_attack/compute_attack_success.py b/examples/ensemble_attack/compute_attack_success.py index 56871113..4c50806b 100644 --- a/examples/ensemble_attack/compute_attack_success.py +++ b/examples/ensemble_attack/compute_attack_success.py @@ -41,7 +41,12 @@ def load_target_challenge_labels_and_probabilities( test_prediction_probabilities = np.load(attack_result_file_path) # Challenge labels are the true membership labels for the challenge points. - test_target = pd.read_csv(challenge_label_path).to_numpy().squeeze() + if challenge_label_path.suffix == ".npy": + test_target = np.load(challenge_label_path).squeeze() + elif challenge_label_path.suffix == ".csv": + test_target = pd.read_csv(challenge_label_path).to_numpy().squeeze() + else: + raise ValueError(f"Unsupported challenge label file type: {challenge_label_path}. Must be .npy or .csv.") assert len(test_prediction_probabilities) == len(test_target), ( "Number of challenge labels must match number of prediction probabilities." @@ -71,9 +76,12 @@ def compute_attack_success_for_given_targets( predictions = [] targets = [] for target_id in target_ids: - # Override target model id in config as ``attack_probabilities_result_path`` and - # ``challenge_label_path`` are dependent on it and change in runtime. 
- target_model_config.target_model_id = target_id + # If there is a target model id in the config, override it with the current target id + if "target_model_id" in target_model_config: + # Override target model id in config as ``attack_probabilities_result_path`` and + # ``challenge_label_path`` are dependent on it and change in runtime. + target_model_config.target_model_id = target_id + # Load challenge labels and prediction probabilities log(INFO, f"Loading challenge labels and prediction probabilities for target model ID {target_id}...") test_target, test_prediction_probabilities = load_target_challenge_labels_and_probabilities( diff --git a/examples/ensemble_attack/test_attack_model.py b/examples/ensemble_attack/test_attack_model.py index bfe02c94..5ab5654b 100644 --- a/examples/ensemble_attack/test_attack_model.py +++ b/examples/ensemble_attack/test_attack_model.py @@ -309,7 +309,10 @@ def run_metaclassifier_testing( Args: config: Configuration object set in ``experiments_config.yaml``. """ - log(INFO, f"Running metaclassifier testing on target model {config.target_model.target_model_id}...") + log( + INFO, + f"Running metaclassifier testing on target synthetic data at {config.target_model.target_synthetic_data_path}...", + ) if config.random_seed is not None: set_all_random_seeds(seed=config.random_seed) diff --git a/examples/gan/ensemble_attack/README.md b/examples/gan/ensemble_attack/README.md index 20a31754..9309fbbd 100644 --- a/examples/gan/ensemble_attack/README.md +++ b/examples/gan/ensemble_attack/README.md @@ -28,14 +28,18 @@ and type (`continuous` or `discrete`). With the data present in the correct folder, we can proceed with running the attack. -## 2. Training the target model +## 2. Generating target synthetic data to be tested -The target model is the model being attacked. If you already have a target model and -a set of synthetic data produced by it, you can add its path to the `ensemble_attack.target_model` -variables. 
+The **target model** is the model being attacked, and the **target synthetic data** +is the synthetic data generated by the target model that will be evaluated against +the attack. -If you wish to train one and produce the synthetic data that will be the target of the -attack, you can run: +If you already have a set of synthetic data produced by a target model, +you can add its path to the `ensemble_attack.target_model.target_synthetic_data_path` +property in the [`config.yaml`](config.yaml) file and skip this step. + +If you wish to train a new target model and produce the synthetic data that will be the +target of the attack, you can run: ```bash python -m examples.gan.synthesize --config-path=./ensemble_attack @@ -79,3 +83,15 @@ To test the attack model against the target model and synthetic data produced on ```bash python -m examples.gan.ensemble_attack.test_attack_model ``` + +## 6. Compute the attack success + +To compute the metrics about the success of the attack against the target +synthetic data, you can run the following command: + +```bash +python -m examples.gan.ensemble_attack.compute_attack_success +``` + +The results will both printed on the console and saved in the file +`examples/gan/results/attack_success_for_xgb_metaclassifier_model.txt` diff --git a/examples/gan/ensemble_attack/compute_attack_success.py b/examples/gan/ensemble_attack/compute_attack_success.py new file mode 100644 index 00000000..c8c09f99 --- /dev/null +++ b/examples/gan/ensemble_attack/compute_attack_success.py @@ -0,0 +1,30 @@ +from logging import INFO +from pathlib import Path + +import hydra +from omegaconf import DictConfig + +from examples.ensemble_attack.compute_attack_success import compute_attack_success_for_given_targets +from midst_toolkit.common.logger import log + + +@hydra.main(config_path="./", config_name="config", version_base=None) +def compute_attack_success(config: DictConfig) -> None: + """Main function to compute the attack success.""" + log( + INFO, + 
f"Computing attack success for target synthetic data at {config.ensemble_attack.target_model.target_synthetic_data_path}...", + ) + + compute_attack_success_for_given_targets( + target_model_config=config.ensemble_attack.target_model, + # TODO: refactor this to work better outside of the challenge context (i.e. no target ID) + # No target ID needed for CTGAN, but it needs at least one element in this array. The value does not matter. + target_ids=[0], + experiment_directory=Path(config.results_dir), + metaclassifier_model_name=config.ensemble_attack.metaclassifier.meta_classifier_model_name, + ) + + +if __name__ == "__main__": + compute_attack_success() diff --git a/examples/gan/ensemble_attack/config.yaml b/examples/gan/ensemble_attack/config.yaml index 8331b9d8..d7173b3e 100644 --- a/examples/gan/ensemble_attack/config.yaml +++ b/examples/gan/ensemble_attack/config.yaml @@ -76,9 +76,6 @@ ensemble_attack: metaclassifier_model_path: ${results_dir}/trained_models # Path where the trained metaclassifier model will be saved target_model: # This is only used for testing the attack on a real target model. 
- target_model_directory: ${results_dir} - target_model_id: None - target_model_name: trained_ctgan_model target_synthetic_data_path: ${results_dir}/population_all_with_challenge_synthetic.csv challenge_data_path: ${ensemble_attack.data_paths.processed_attack_data_path}/population_all_with_challenge_challenge_data.csv challenge_label_path: ${ensemble_attack.data_paths.processed_attack_data_path}/population_all_with_challenge_challenge_labels.npy diff --git a/examples/gan/ensemble_attack/test_attack_model.py b/examples/gan/ensemble_attack/test_attack_model.py index 29b42b76..e1eb655a 100644 --- a/examples/gan/ensemble_attack/test_attack_model.py +++ b/examples/gan/ensemble_attack/test_attack_model.py @@ -10,7 +10,10 @@ @hydra.main(config_path="./", config_name="config", version_base=None) def test_attack_model(config: DictConfig) -> None: """Main function to test the attack model.""" - log(INFO, f"Testing attack model at {config.ensemble_attack.target_model.target_model_directory}...") + log( + INFO, + f"Testing attack model against synthetic data at {config.ensemble_attack.target_model.target_synthetic_data_path}...", + ) run_metaclassifier_testing(config.ensemble_attack) From d83aabf37e98102d21f3cfa18cb56b0ee1b45e6f Mon Sep 17 00:00:00 2001 From: Marcelo Lotif Date: Mon, 9 Mar 2026 13:06:33 -0400 Subject: [PATCH 17/38] Cr by CodeRabbit and Sara --- examples/ensemble_attack/README.md | 2 +- examples/ensemble_attack/run_shadow_model_training.py | 4 ++-- examples/gan/ensemble_attack/README.md | 8 ++++---- .../attacks/ensemble/rmia/shadow_model_training.py | 8 ++++---- src/midst_toolkit/attacks/ensemble/shadow_model_utils.py | 2 +- 5 files changed, 12 insertions(+), 12 deletions(-) diff --git a/examples/ensemble_attack/README.md b/examples/ensemble_attack/README.md index d77e63fd..2b7c1b62 100644 --- a/examples/ensemble_attack/README.md +++ b/examples/ensemble_attack/README.md @@ -5,7 +5,7 @@ As the first step of the attack, we need to collect and split the data. 
The inpu Make sure directories and JSON files specified in `data_paths` and `data_processing_config` configurations in `examples/ensemble_attack/configs/experiment_config.yaml` exist. -To run the whole data processing pipeline, run `run_attack.py` and set `pipeline.run_data_processing` to `true` in [`configs/experiment_config.yaml`](configs/experiment_config.yaml). It reads data from `data_paths.midst_data_path` specified in config, and populates `data_paths.population_data` and `data_paths.processed_attack_data_path` directories. +To run the whole data processing pipeline, run `run_attack.py` and set `pipeline.run_data_processing` to `true` in [`configs/experiment_config.yaml`](configs/experiment_config.yaml). It reads data from `data_paths.midst_data_path` specified in config, and populates `data_paths.population_path` and `data_paths.processed_attack_data_path` directories. Data processing steps for the MIDST challenge provided resources according to Ensemble attack are as follows: diff --git a/examples/ensemble_attack/run_shadow_model_training.py b/examples/ensemble_attack/run_shadow_model_training.py index 4c852823..614eefc2 100644 --- a/examples/ensemble_attack/run_shadow_model_training.py +++ b/examples/ensemble_attack/run_shadow_model_training.py @@ -14,7 +14,7 @@ ModelType, TrainingResult, save_additional_training_config, - train_or_fine_tune_ctgan, + train_or_fine_tune_and_synthesize_with_ctgan, train_tabddpm_and_synthesize, ) from midst_toolkit.common.config import ClavaDDPMTrainingConfig, CTGANTrainingConfig @@ -85,7 +85,7 @@ def run_target_model_training(config: DictConfig) -> Path: number_of_points_to_synthesize=config.shadow_training.number_of_points_to_synthesize, ) elif model_type == ModelType.CTGAN: - train_result = train_or_fine_tune_ctgan( + train_result = train_or_fine_tune_and_synthesize_with_ctgan( dataset=df_real_data, configs=cast(CTGANTrainingConfig, configs), save_dir=save_dir, diff --git a/examples/gan/ensemble_attack/README.md 
b/examples/gan/ensemble_attack/README.md index 9309fbbd..0586fe4f 100644 --- a/examples/gan/ensemble_attack/README.md +++ b/examples/gan/ensemble_attack/README.md @@ -1,13 +1,13 @@ # CTGAN Ensemble Attack Example -On this example, we demonstrate how to run the [Ensemble Attack](examples/ensemble_attack) +On this example, we demonstrate how to run the [Ensemble Attack](../../ensemble_attack/README.md) using the [CTGAN](https://arxiv.org/pdf/1907.00503) model. ## 1. Downloading data First, we need the data. Download it from this [Google Drive link](https://drive.google.com/file/d/1B9z4vh51mH6ZMj5E0pJitqR8lid3EJKM/view?usp=drive_link), -extract the files and place them in a `/data/ensemble_attack` folder in within this folder +extract the files and place them in a `/data/ensemble_attack` folder within this folder (`examples/gan`). > [!NOTE] @@ -61,7 +61,7 @@ python -m examples.gan.ensemble_attack.make_challenge_dataset ## 4. Training the attack model > [!NOTE] -> In the [`config.yaml`](config.yaml) file, the attribute `ensemble_attack.shadow_trainig.model_name` +> In the [`config.yaml`](config.yaml) file, the attribute `ensemble_attack.shadow_training.model_name` > is what determines this attack will be run with the CTGAN model. To train the attack models, execute the following command: @@ -78,7 +78,7 @@ sample sizes to smaller numbers. ## 5. 
Testing the attack model To test the attack model against the target model and synthetic data produced on -[step 2](#2-training-the-target-model), please run: +[step 2](#2-generating-target-synthetic-data-to-be-tested), please run: ```bash python -m examples.gan.ensemble_attack.test_attack_model diff --git a/src/midst_toolkit/attacks/ensemble/rmia/shadow_model_training.py b/src/midst_toolkit/attacks/ensemble/rmia/shadow_model_training.py index 90bd7edb..92b69088 100644 --- a/src/midst_toolkit/attacks/ensemble/rmia/shadow_model_training.py +++ b/src/midst_toolkit/attacks/ensemble/rmia/shadow_model_training.py @@ -13,7 +13,7 @@ TrainingResult, fine_tune_tabddpm_and_synthesize, save_additional_training_config, - train_or_fine_tune_ctgan, + train_or_fine_tune_and_synthesize_with_ctgan, train_tabddpm_and_synthesize, ) from midst_toolkit.common.config import ClavaDDPMTrainingConfig, CTGANTrainingConfig @@ -143,7 +143,7 @@ def train_fine_tuned_shadow_models( synthesize=False, ) elif model_type == ModelType.CTGAN: - initial_model_training_results = train_or_fine_tune_ctgan( + initial_model_training_results = train_or_fine_tune_and_synthesize_with_ctgan( train, cast(CTGANTrainingConfig, configs), save_dir, @@ -205,7 +205,7 @@ def train_fine_tuned_shadow_models( number_of_points_to_synthesize=number_of_points_to_synthesize, ) elif model_type == ModelType.CTGAN: - train_result = train_or_fine_tune_ctgan( + train_result = train_or_fine_tune_and_synthesize_with_ctgan( dataset=selected_challenges, configs=cast(CTGANTrainingConfig, configs), save_dir=save_dir, @@ -332,7 +332,7 @@ def train_shadow_on_half_challenge_data( number_of_points_to_synthesize=number_of_points_to_synthesize, ) elif model_type == ModelType.CTGAN: - train_result = train_or_fine_tune_ctgan( + train_result = train_or_fine_tune_and_synthesize_with_ctgan( dataset=selected_challenges, configs=cast(CTGANTrainingConfig, configs), save_dir=save_dir, diff --git 
a/src/midst_toolkit/attacks/ensemble/shadow_model_utils.py b/src/midst_toolkit/attacks/ensemble/shadow_model_utils.py index 57abf906..c03af364 100644 --- a/src/midst_toolkit/attacks/ensemble/shadow_model_utils.py +++ b/src/midst_toolkit/attacks/ensemble/shadow_model_utils.py @@ -281,7 +281,7 @@ def fine_tune_tabddpm_and_synthesize( return result -def train_or_fine_tune_ctgan( +def train_or_fine_tune_and_synthesize_with_ctgan( dataset: pd.DataFrame, configs: CTGANTrainingConfig, save_dir: Path, From a198fe9d59a85061af9d092ce85db96791a92bee Mon Sep 17 00:00:00 2001 From: Marcelo Lotif Date: Mon, 9 Mar 2026 13:10:14 -0400 Subject: [PATCH 18/38] Reducing the amount of training samples to 20k --- examples/gan/ensemble_attack/config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/gan/ensemble_attack/config.yaml b/examples/gan/ensemble_attack/config.yaml index d7173b3e..63b2145f 100644 --- a/examples/gan/ensemble_attack/config.yaml +++ b/examples/gan/ensemble_attack/config.yaml @@ -7,7 +7,7 @@ training: epochs: 300 verbose: True data_path: ${base_data_dir}/population_all_with_challenge.csv - sample_size: 100000 + sample_size: 20000 synthesizing: sample_size: 20000 From e69b07e8411da218d32366472a51c16b6bf1e2c4 Mon Sep 17 00:00:00 2001 From: Marcelo Lotif Date: Mon, 9 Mar 2026 14:00:15 -0400 Subject: [PATCH 19/38] Change function name to avoid pytest thinking it's a test --- examples/gan/ensemble_attack/test_attack_model.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/gan/ensemble_attack/test_attack_model.py b/examples/gan/ensemble_attack/test_attack_model.py index e1eb655a..d684402a 100644 --- a/examples/gan/ensemble_attack/test_attack_model.py +++ b/examples/gan/ensemble_attack/test_attack_model.py @@ -8,7 +8,7 @@ @hydra.main(config_path="./", config_name="config", version_base=None) -def test_attack_model(config: DictConfig) -> None: +def attack_model_test(config: DictConfig) -> None: """Main function to 
test the attack model.""" log( INFO, @@ -18,4 +18,4 @@ def test_attack_model(config: DictConfig) -> None: if __name__ == "__main__": - test_attack_model() + attack_model_test() From 5fa4fefba4b933d5f6aec763345395b05abaf268 Mon Sep 17 00:00:00 2001 From: Marcelo Lotif Date: Mon, 9 Mar 2026 15:23:05 -0400 Subject: [PATCH 20/38] Fixing test assertions --- .../ensemble/test_shadow_model_training.py | 30 +++++-------------- 1 file changed, 7 insertions(+), 23 deletions(-) diff --git a/tests/integration/attacks/ensemble/test_shadow_model_training.py b/tests/integration/attacks/ensemble/test_shadow_model_training.py index 2dbab11b..8008f97d 100644 --- a/tests/integration/attacks/ensemble/test_shadow_model_training.py +++ b/tests/integration/attacks/ensemble/test_shadow_model_training.py @@ -15,7 +15,6 @@ train_shadow_on_half_challenge_data, ) from midst_toolkit.attacks.ensemble.shadow_model_utils import ( - TabDDPMTrainingResult, fine_tune_tabddpm_and_synthesize, save_additional_training_config, train_tabddpm_and_synthesize, @@ -68,17 +67,10 @@ def test_train_fine_tuned_shadow_models(cfg: DictConfig, tmp_path: Path) -> None assert len(shadow_data["fine_tuning_sets"]) == 2 # n_models assert len(shadow_data["fine_tuned_results"]) == 2 # n_models - for result in shadow_data["fine_tuned_results"]: - assert type(result) is TabDDPMTrainingResult - assert result.synthetic_data is not None - assert result.tables is not None - assert result.models is not None - assert result.configs is not None - assert result.save_dir is not None - assert result.relation_order is not None - assert result.all_group_lengths_probabilities is not None - assert type(result.synthetic_data) is pd.DataFrame - assert len(result.synthetic_data) == 5 + for synthetic_data in shadow_data["fine_tuned_results"]: + assert type(synthetic_data) is pd.DataFrame + assert synthetic_data is not None + assert len(synthetic_data) == 5 # Fine tuning sets should be disjoint assert 
set(shadow_data["fine_tuning_sets"][0]).isdisjoint(set(shadow_data["fine_tuning_sets"][1])) @@ -114,17 +106,9 @@ def test_train_shadow_on_half_challenge_data(cfg: DictConfig, tmp_path: Path) -> assert len(shadow_data["selected_sets"]) == 2 # n_models assert len(shadow_data["trained_results"]) == 2 # n_models - for result in shadow_data["trained_results"]: - assert type(result) is TabDDPMTrainingResult - assert result.synthetic_data is not None - assert result.tables is not None - assert result.models is not None - assert result.configs is not None - assert result.save_dir is not None - assert result.relation_order is not None - assert result.all_group_lengths_probabilities is not None - assert type(result.synthetic_data) is pd.DataFrame - assert len(result.synthetic_data) == 5 + for synthetic_data in shadow_data["trained_results"]: + assert type(synthetic_data) is pd.DataFrame + assert len(synthetic_data) == 5 # Training sets should be disjoint assert set(shadow_data["selected_sets"][0]).isdisjoint(set(shadow_data["selected_sets"][1])) From a9369f669481b2ba2f5de3c4bdc937c18b3b71de Mon Sep 17 00:00:00 2001 From: Marcelo Lotif Date: Fri, 13 Mar 2026 14:52:11 -0400 Subject: [PATCH 21/38] Making population_all_with_challenge.csv into a constant and adding an optional parameter to the config --- examples/ensemble_attack/real_data_collection.py | 12 ++++++++---- examples/ensemble_attack/run_attack.py | 4 ++-- .../run_metaclassifier_training.py | 4 +++- .../ensemble_attack/run_shadow_model_training.py | 14 +++++++------- examples/ensemble_attack/test_attack_model.py | 16 ++++++++++------ examples/gan/ensemble_attack/README.md | 2 +- examples/gan/ensemble_attack/config.yaml | 2 ++ .../gan/ensemble_attack/train_attack_model.py | 16 ++++------------ 8 files changed, 37 insertions(+), 33 deletions(-) diff --git a/examples/ensemble_attack/real_data_collection.py b/examples/ensemble_attack/real_data_collection.py index e912d03c..0a2b710e 100644 --- 
a/examples/ensemble_attack/real_data_collection.py +++ b/examples/ensemble_attack/real_data_collection.py @@ -14,6 +14,10 @@ from midst_toolkit.common.logger import log +COLLECTED_DATA_FILE_NAME = "population_all_with_challenge.csv" +COLLECTED_DATA_NO_CHALLENGE_FILE_NAME = "population_all_no_challenge.csv" + + class AttackType(Enum): """Enum for the different attack types.""" @@ -244,23 +248,23 @@ def collect_population_data_ensemble( # Population data without the challenge points df_population_no_challenge = df_population[~df_population["trans_id"].isin(df_challenge["trans_id"])] - save_dataframe(df_population_no_challenge, save_dir, "population_all_no_challenge.csv") + save_dataframe(df_population_no_challenge, save_dir, COLLECTED_DATA_NO_CHALLENGE_FILE_NAME) # Remove ids df_population_no_challenge_no_id = df_population_no_challenge.drop(columns=["trans_id", "account_id"]) save_dataframe( df_population_no_challenge_no_id, save_dir, - "population_all_no_challenge_no_id.csv", + f"{Path(COLLECTED_DATA_NO_CHALLENGE_FILE_NAME).stem}_no_id.csv", ) # Population data with all the challenge points df_population_with_challenge = pd.concat([df_population_no_challenge, df_challenge]) - save_dataframe(df_population_with_challenge, save_dir, "population_all_with_challenge.csv") + save_dataframe(df_population_with_challenge, save_dir, COLLECTED_DATA_FILE_NAME) # Remove ids df_population_with_challenge_no_id = df_population_with_challenge.drop(columns=["trans_id", "account_id"]) save_dataframe( df_population_with_challenge_no_id, save_dir, - "population_all_with_challenge_no_id.csv", + f"{Path(COLLECTED_DATA_FILE_NAME).stem}_no_id.csv", ) return df_population_with_challenge diff --git a/examples/ensemble_attack/run_attack.py b/examples/ensemble_attack/run_attack.py index 4e67fa50..e252be45 100644 --- a/examples/ensemble_attack/run_attack.py +++ b/examples/ensemble_attack/run_attack.py @@ -11,7 +11,7 @@ import examples.ensemble_attack.run_metaclassifier_training as meta_pipeline 
import examples.ensemble_attack.run_shadow_model_training as shadow_pipeline -from examples.ensemble_attack.real_data_collection import collect_population_data_ensemble +from examples.ensemble_attack.real_data_collection import COLLECTED_DATA_FILE_NAME, collect_population_data_ensemble from midst_toolkit.attacks.ensemble.data_utils import load_dataframe from midst_toolkit.attacks.ensemble.process_split_data import process_split_data from midst_toolkit.common.logger import log @@ -33,7 +33,7 @@ def run_data_processing(config: DictConfig) -> None: # is not enough. original_population_data = load_dataframe( Path(config.data_processing_config.original_population_data_path), - "population_all_with_challenge.csv", + COLLECTED_DATA_FILE_NAME, ) log(INFO, "Running data processing pipeline...") # Collect the real data from the MIDST challenge resources. diff --git a/examples/ensemble_attack/run_metaclassifier_training.py b/examples/ensemble_attack/run_metaclassifier_training.py index 47cfdd32..d8166be1 100644 --- a/examples/ensemble_attack/run_metaclassifier_training.py +++ b/examples/ensemble_attack/run_metaclassifier_training.py @@ -6,6 +6,7 @@ import pandas as pd from omegaconf import DictConfig +from examples.ensemble_attack.real_data_collection import COLLECTED_DATA_FILE_NAME from midst_toolkit.attacks.ensemble.blending import BlendingPlusPlus, MetaClassifierType from midst_toolkit.attacks.ensemble.data_utils import load_dataframe from midst_toolkit.common.logger import log @@ -80,9 +81,10 @@ def run_metaclassifier_training( assert target_synthetic_data is not None, "Target model's synthetic data is missing." 
target_synthetic_data = target_synthetic_data.copy() + data_file_name = config.data_file_name if "data_file_name" in config else COLLECTED_DATA_FILE_NAME df_reference = load_dataframe( Path(config.data_paths.population_path), - "population_all_with_challenge_no_id.csv", + f"{Path(data_file_name).stem}_no_id.csv", ) log( INFO, diff --git a/examples/ensemble_attack/run_shadow_model_training.py b/examples/ensemble_attack/run_shadow_model_training.py index 614eefc2..e9dde456 100644 --- a/examples/ensemble_attack/run_shadow_model_training.py +++ b/examples/ensemble_attack/run_shadow_model_training.py @@ -6,6 +6,7 @@ import pandas as pd from omegaconf import DictConfig +from examples.ensemble_attack.real_data_collection import COLLECTED_DATA_FILE_NAME from midst_toolkit.attacks.ensemble.data_utils import load_dataframe from midst_toolkit.attacks.ensemble.rmia.shadow_model_training import ( train_three_sets_of_shadow_models, @@ -118,17 +119,16 @@ def run_shadow_model_training(config: DictConfig, df_challenge_train: pd.DataFra at src/midst_toolkit/attacks/ensemble/rmia/shadow_model_training.py. """ log(INFO, "Running shadow model training...") + + table_name = config.table_name if "table_name" in config else DEFAULT_TABLE_NAME + id_column_name = config.table_id_column_name if "table_id_column_name" in config else DEFAULT_ID_COLUMN_NAME + data_file_name = config.data_file_name if "data_file_name" in config else COLLECTED_DATA_FILE_NAME + # Load the required dataframes for shadow model training. # For shadow model training we need master_challenge_train and population data. # Master challenge is the main training (or fine-tuning) data for the shadow models. # Population data is used to pre-train some of the shadow models. 
- df_population_with_challenge = load_dataframe( - Path(config.data_paths.population_path), - "population_all_with_challenge.csv", - ) - - table_name = config.table_name if "table_name" in config else DEFAULT_TABLE_NAME - id_column_name = config.table_id_column_name if "table_id_column_name" in config else DEFAULT_ID_COLUMN_NAME + df_population_with_challenge = load_dataframe(Path(config.data_paths.population_path), data_file_name) model_type = DEFAULT_MODEL_TYPE if "model_name" in config.shadow_training: diff --git a/examples/ensemble_attack/test_attack_model.py b/examples/ensemble_attack/test_attack_model.py index 5ab5654b..04018aa0 100644 --- a/examples/ensemble_attack/test_attack_model.py +++ b/examples/ensemble_attack/test_attack_model.py @@ -12,7 +12,12 @@ import pandas as pd from omegaconf import DictConfig -from examples.ensemble_attack.real_data_collection import AttackDataset, AttackType, collect_midst_data +from examples.ensemble_attack.real_data_collection import ( + COLLECTED_DATA_FILE_NAME, + AttackDataset, + AttackType, + collect_midst_data, +) from examples.ensemble_attack.run_shadow_model_training import run_shadow_model_training from midst_toolkit.attacks.ensemble.blending import BlendingPlusPlus, MetaClassifierType from midst_toolkit.attacks.ensemble.data_utils import load_dataframe @@ -263,7 +268,8 @@ def train_rmia_shadows_for_test_phase(config: DictConfig) -> list[dict[str, list """ # Checking if challenge data exists processed_attack_data_path = Path(config.data_paths.processed_attack_data_path) - challenge_data_file_name = "population_all_with_challenge_challenge_data.csv" + data_file_name = config.data_file_name if "data_file_name" in config else COLLECTED_DATA_FILE_NAME + challenge_data_file_name = f"{Path(data_file_name).stem}_challenge_data.csv" if (processed_attack_data_path / challenge_data_file_name).exists(): log(INFO, "Skipping data collection for testing phase.") @@ -404,10 +410,8 @@ def run_metaclassifier_testing( # 5) Get 
predictions on the challenge data (test set). # Load the reference population data for DOMIAS signals. - df_reference = load_dataframe( - Path(config.data_paths.population_path), - "population_all_with_challenge_no_id.csv", - ) + data_file_name = config.data_file_name if "data_file_name" in config else COLLECTED_DATA_FILE_NAME + df_reference = load_dataframe(Path(config.data_paths.population_path), f"{Path(data_file_name).stem}_no_id.csv") probabilities, pred_score = blending_attacker.predict( df_test=test_data, diff --git a/examples/gan/ensemble_attack/README.md b/examples/gan/ensemble_attack/README.md index 0586fe4f..8b3295c2 100644 --- a/examples/gan/ensemble_attack/README.md +++ b/examples/gan/ensemble_attack/README.md @@ -15,7 +15,7 @@ extract the files and place them in a `/data/ensemble_attack` folder within this > of the [`config.yaml`](config.yaml) file. Here is a description of the files that have been extracted: -- `population_all_with_challenge.csv`: The full set of training data +- `trans.csv`: The full set of training data. - `dataset_meta.json`: Metadata about the relationship between the tables in the dataset. Since this is a single table dataset, it will only contain information about the transaction (`trans`) table. 
- `trans_domain.json`: Metadata about the columns of the transaction table, such as their size diff --git a/examples/gan/ensemble_attack/config.yaml b/examples/gan/ensemble_attack/config.yaml index 63b2145f..5f139cf4 100644 --- a/examples/gan/ensemble_attack/config.yaml +++ b/examples/gan/ensemble_attack/config.yaml @@ -2,6 +2,8 @@ # Base data directory (can be overridden from command line) base_data_dir: examples/gan/data/ensemble_attack results_dir: examples/gan/results +data_name: trans +data_file_name: ${data_name}.csv training: epochs: 300 diff --git a/examples/gan/ensemble_attack/train_attack_model.py b/examples/gan/ensemble_attack/train_attack_model.py index acd5503a..84f2b5af 100644 --- a/examples/gan/ensemble_attack/train_attack_model.py +++ b/examples/gan/ensemble_attack/train_attack_model.py @@ -1,4 +1,3 @@ -import importlib import json from logging import INFO from pathlib import Path @@ -6,6 +5,7 @@ import hydra from omegaconf import DictConfig, OmegaConf +from examples.ensemble_attack.run_metaclassifier_training import run_metaclassifier_training from examples.ensemble_attack.run_shadow_model_training import run_shadow_model_training, run_target_model_training from midst_toolkit.attacks.ensemble.data_utils import load_dataframe, save_dataframe from midst_toolkit.attacks.ensemble.process_split_data import process_split_data @@ -34,7 +34,7 @@ def train_attack_model(config: DictConfig) -> None: # The following function saves the required dataframe splits in the specified processed_attack_data_path path. 
population_data = load_dataframe( Path(config.ensemble_attack.data_paths.population_path), - "population_all_with_challenge.csv", + config.data_file_name, ) # Removing id columns and saving the dataset @@ -43,7 +43,7 @@ def train_attack_model(config: DictConfig) -> None: save_dataframe( population_data_no_id, Path(config.ensemble_attack.data_paths.population_path), - "population_all_with_challenge_no_id.csv", + f"{Path(config.data_file_name).stem}_no_id.csv", ) process_split_data( @@ -98,15 +98,7 @@ def train_attack_model(config: DictConfig) -> None: "The target_data_path must be provided for metaclassifier training." ) - # Note: Importing the following module causes a segmentation fault error if imported at the top of this file. - # A quick solution is to load modules dynamically if any of the pipelines is called. - # TODO: Investigate the source of error. - meta_pipeline = importlib.import_module("examples.ensemble_attack.run_metaclassifier_training") - meta_pipeline.run_metaclassifier_training( - config.ensemble_attack, - shadow_data_paths, - target_model_synthetic_path, - ) + run_metaclassifier_training(config.ensemble_attack, shadow_data_paths, target_model_synthetic_path) if __name__ == "__main__": From 163bba8e2d6ebeabe118d105a68f8a3a0d50d9fe Mon Sep 17 00:00:00 2001 From: Marcelo Lotif Date: Mon, 16 Mar 2026 12:19:22 -0400 Subject: [PATCH 22/38] Addressing last comments by Fatemeh --- .../ensemble_attack/configs/experiment_config.yaml | 5 +---- .../configs/original_attack_config.yaml | 4 +--- .../ensemble_attack/run_metaclassifier_training.py | 2 +- examples/ensemble_attack/test_attack_model.py | 2 +- examples/gan/ensemble_attack/config.yaml | 11 +++++------ 5 files changed, 9 insertions(+), 15 deletions(-) diff --git a/examples/ensemble_attack/configs/experiment_config.yaml b/examples/ensemble_attack/configs/experiment_config.yaml index b88e7902..979e1f64 100644 --- a/examples/ensemble_attack/configs/experiment_config.yaml +++ 
b/examples/ensemble_attack/configs/experiment_config.yaml @@ -32,10 +32,6 @@ data_paths: attack_evaluation_result_path: ${base_experiment_dir}/evaluation_results # Path where the attack (train phase) evaluation results will be stored (output) -model_paths: - metaclassifier_model_path: ${base_experiment_dir}/trained_models # Path where the trained metaclassifier model will be saved - - # Dataset specific information used for processing in this example data_processing_config: midst_data_path: /projects/midst-experiments/all_tabddpms/ # Used to collect the data (input) @@ -112,6 +108,7 @@ metaclassifier: # Temporary. Might remove having an epoch parameter. epochs: 1 meta_classifier_model_name: ${metaclassifier.model_type}_metaclassifier_model + metaclassifier_model_path: ${base_experiment_dir}/trained_models # Path where the trained metaclassifier model will be saved attack_success_computation: target_ids_to_test: [21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40] # List of target model IDs to compute the attack success for. diff --git a/examples/ensemble_attack/configs/original_attack_config.yaml b/examples/ensemble_attack/configs/original_attack_config.yaml index 22e6d52c..5e549b16 100644 --- a/examples/ensemble_attack/configs/original_attack_config.yaml +++ b/examples/ensemble_attack/configs/original_attack_config.yaml @@ -10,9 +10,6 @@ data_paths: processed_attack_data_path: ${base_data_dir}/attack_data # Path where the processed attack real train and evaluation data is stored attack_evaluation_result_path: ${base_example_dir}/attack_results # Path where the attack evaluation results will be stored -model_paths: - metaclassifier_model_path: ${base_example_dir}/trained_models # Path where the trained metaclassifier model will be saved - # Pipeline control pipeline: run_data_processing: true # Set this to false if you have already saved the processed data @@ -93,6 +90,7 @@ metaclassifier: # Temporary. 
Might remove having an epoch parameter. epochs: 1 meta_classifier_model_name: ${metaclassifier.model_type}_metaclassifier_model + metaclassifier_model_path: ${base_example_dir}/trained_models # Path where the trained metaclassifier model will be saved # General settings diff --git a/examples/ensemble_attack/run_metaclassifier_training.py b/examples/ensemble_attack/run_metaclassifier_training.py index d8166be1..dd79033a 100644 --- a/examples/ensemble_attack/run_metaclassifier_training.py +++ b/examples/ensemble_attack/run_metaclassifier_training.py @@ -127,7 +127,7 @@ def run_metaclassifier_training( ) model_filename = config.metaclassifier.meta_classifier_model_name - model_path = Path(config.model_paths.metaclassifier_model_path) / f"{model_filename}.pkl" + model_path = Path(config.metaclassifier.metaclassifier_model_path) / f"{model_filename}.pkl" model_path.parent.mkdir(parents=True, exist_ok=True) with open(model_path, "wb") as f: pickle.dump(blending_attacker.trained_model, f) diff --git a/examples/ensemble_attack/test_attack_model.py b/examples/ensemble_attack/test_attack_model.py index 04018aa0..f3558c83 100644 --- a/examples/ensemble_attack/test_attack_model.py +++ b/examples/ensemble_attack/test_attack_model.py @@ -328,7 +328,7 @@ def run_metaclassifier_testing( meta_classifier_type = MetaClassifierType(config.metaclassifier.model_type) metaclassifier_model_name = config.metaclassifier.meta_classifier_model_name - mataclassifier_path = Path(config.model_paths.metaclassifier_model_path) / f"{metaclassifier_model_name}.pkl" + mataclassifier_path = Path(config.metaclassifier.metaclassifier_model_path) / f"{metaclassifier_model_name}.pkl" assert mataclassifier_path.exists(), ( f"No metaclassifier model found at {mataclassifier_path}. Make sure to run the training script first." 
) diff --git a/examples/gan/ensemble_attack/config.yaml b/examples/gan/ensemble_attack/config.yaml index 5f139cf4..a383a9f9 100644 --- a/examples/gan/ensemble_attack/config.yaml +++ b/examples/gan/ensemble_attack/config.yaml @@ -18,10 +18,11 @@ ensemble_attack: random_seed: null # Set this to a value if you want to set a random seed for reproducibility table_name: "trans" table_id_column_name: "trans_id" + data_file_name: ${data_file_name} data_paths: processed_attack_data_path: ${base_data_dir} - population_path: ${base_data_dir} + population_path: ${base_data_dir} # This is the population data that the attacker has collected or has access to. attack_evaluation_result_path: ${results_dir}/evaluation_results # Path where the attack evaluation results will be stored data_processing_config: @@ -73,14 +74,12 @@ ensemble_attack: # Temporary. Might remove having an epoch parameter. epochs: 1 meta_classifier_model_name: ${ensemble_attack.metaclassifier.model_type}_metaclassifier_model - - model_paths: metaclassifier_model_path: ${results_dir}/trained_models # Path where the trained metaclassifier model will be saved target_model: # This is only used for testing the attack on a real target model. 
- target_synthetic_data_path: ${results_dir}/population_all_with_challenge_synthetic.csv - challenge_data_path: ${ensemble_attack.data_paths.processed_attack_data_path}/population_all_with_challenge_challenge_data.csv - challenge_label_path: ${ensemble_attack.data_paths.processed_attack_data_path}/population_all_with_challenge_challenge_labels.npy + target_synthetic_data_path: ${results_dir}/${data_name}_synthetic.csv + challenge_data_path: ${ensemble_attack.data_paths.processed_attack_data_path}/${data_name}_challenge_data.csv + challenge_label_path: ${ensemble_attack.data_paths.processed_attack_data_path}/${data_name}_challenge_labels.npy target_shadow_models_output_path: ${results_dir}/test_all_targets # Sub-directory to store test shadows and results attack_probabilities_result_path: ${results_dir}/test_probabilities From ecab1e2fb5df09ad1c07266a7104590490e53141 Mon Sep 17 00:00:00 2001 From: Marcelo Lotif Date: Mon, 16 Mar 2026 16:05:33 -0400 Subject: [PATCH 23/38] WIP adding model runner class --- examples/ensemble_attack/run_attack.py | 5 + .../run_shadow_model_training.py | 59 +--- .../gan/ensemble_attack/train_attack_model.py | 25 +- src/midst_toolkit/attacks/ensemble/model.py | 269 +++++++++++++++ .../ensemble/rmia/shadow_model_training.py | 162 +++------ .../attacks/ensemble/shadow_model_utils.py | 314 +----------------- 6 files changed, 363 insertions(+), 471 deletions(-) create mode 100644 src/midst_toolkit/attacks/ensemble/model.py diff --git a/examples/ensemble_attack/run_attack.py b/examples/ensemble_attack/run_attack.py index e252be45..89ceef04 100644 --- a/examples/ensemble_attack/run_attack.py +++ b/examples/ensemble_attack/run_attack.py @@ -81,6 +81,11 @@ def main(config: DictConfig) -> None: Path(config.data_paths.processed_attack_data_path), "master_challenge_train.csv", ) + + # TODO: add these to the config + # configs.fine_tuning_diffusion_iterations = fine_tuning_config.fine_tune_diffusion_iterations + # 
configs.fine_tuning_classifier_iterations = fine_tuning_config.fine_tune_classifier_iterations + shadow_data_paths = shadow_pipeline.run_shadow_model_training(config, df_master_challenge_train) shadow_data_paths = [Path(path) for path in shadow_data_paths] diff --git a/examples/ensemble_attack/run_shadow_model_training.py b/examples/ensemble_attack/run_shadow_model_training.py index e9dde456..c7166aaa 100644 --- a/examples/ensemble_attack/run_shadow_model_training.py +++ b/examples/ensemble_attack/run_shadow_model_training.py @@ -8,30 +8,22 @@ from examples.ensemble_attack.real_data_collection import COLLECTED_DATA_FILE_NAME from midst_toolkit.attacks.ensemble.data_utils import load_dataframe -from midst_toolkit.attacks.ensemble.rmia.shadow_model_training import ( - train_three_sets_of_shadow_models, -) -from midst_toolkit.attacks.ensemble.shadow_model_utils import ( - ModelType, - TrainingResult, - save_additional_training_config, - train_or_fine_tune_and_synthesize_with_ctgan, - train_tabddpm_and_synthesize, -) -from midst_toolkit.common.config import ClavaDDPMTrainingConfig, CTGANTrainingConfig +from midst_toolkit.attacks.ensemble.rmia.shadow_model_training import train_three_sets_of_shadow_models +from midst_toolkit.attacks.ensemble.model import EnsembleAttackModelRunner from midst_toolkit.common.logger import log DEFAULT_TABLE_NAME = "trans" DEFAULT_ID_COLUMN_NAME = "trans_id" -DEFAULT_MODEL_TYPE = ModelType.TABDDPM -def run_target_model_training(config: DictConfig) -> Path: +def run_target_model_training(model_runner: EnsembleAttackModelRunner, config: DictConfig) -> Path: """ Function to run the target model training for RMIA attack. Args: + model_runner: The model runner to be used for training the target model. + Should be an instance of a subclass of `EnsembleAttackModelRunner`. config: Configuration object set in config.yaml. 
Returns: @@ -54,11 +46,6 @@ def run_target_model_training(config: DictConfig) -> Path: target_folder = target_model_output_path / "target_model" - model_type = DEFAULT_MODEL_TYPE - if "model_name" in config.shadow_training: - model_type = ModelType(config.shadow_training.model_name) - log(INFO, f"Training target model with model type: {model_type.value}") - target_folder.mkdir(parents=True, exist_ok=True) shutil.copyfile( target_training_json_config_paths.table_domain_file_path, @@ -68,30 +55,8 @@ def run_target_model_training(config: DictConfig) -> Path: target_training_json_config_paths.dataset_meta_file_path, target_folder / "dataset_meta.json", ) - configs, save_dir = save_additional_training_config( - data_dir=target_folder, - training_config_json_path=Path(target_training_json_config_paths.training_config_path), - final_config_json_path=target_folder / f"{table_name}.json", # Path to the new json - experiment_name="trained_target_model", - model_type=model_type, - ) - train_result: TrainingResult - if model_type == ModelType.TABDDPM: - train_result = train_tabddpm_and_synthesize( - train_set=df_real_data, - configs=cast(ClavaDDPMTrainingConfig, configs), - save_dir=save_dir, - synthesize=True, - number_of_points_to_synthesize=config.shadow_training.number_of_points_to_synthesize, - ) - elif model_type == ModelType.CTGAN: - train_result = train_or_fine_tune_and_synthesize_with_ctgan( - dataset=df_real_data, - configs=cast(CTGANTrainingConfig, configs), - save_dir=save_dir, - synthesize=True, - ) + train_result = model_runner.train_or_fine_tune_and_synthesize(dataset=df_real_data, synthesize=True) # To train the attack model (metaclassifier), we only need to save target's synthetic data, # and not the entire target model's training result object. 
@@ -105,11 +70,13 @@ def run_target_model_training(config: DictConfig) -> Path: return target_model_synthetic_path -def run_shadow_model_training(config: DictConfig, df_challenge_train: pd.DataFrame) -> list[Path]: +def run_shadow_model_training(model_runner: EnsembleAttackModelRunner, config: DictConfig, df_challenge_train: pd.DataFrame) -> list[Path]: """ Function to run the shadow model training for RMIA attack. Args: + model_runner: The model runner to be used for training the shadow models. Should be an instance of + a subclass of `EnsembleAttackModelRunner`. config: Configuration object set in config.yaml. df_challenge_train: DataFrame containing the data that is used to train RMIA shadow models. @@ -130,10 +97,7 @@ def run_shadow_model_training(config: DictConfig, df_challenge_train: pd.DataFra # Population data is used to pre-train some of the shadow models. df_population_with_challenge = load_dataframe(Path(config.data_paths.population_path), data_file_name) - model_type = DEFAULT_MODEL_TYPE - if "model_name" in config.shadow_training: - model_type = ModelType(config.shadow_training.model_name) - log(INFO, f"Training shadow models with model type: {model_type.value}") + log(INFO, f"Training shadow models with model runner: {model_runner}") # Make sure master challenge train and population data have the id column. assert id_column_name in df_challenge_train.columns, ( @@ -146,6 +110,7 @@ def run_shadow_model_training(config: DictConfig, df_challenge_train: pd.DataFra # ``master_challenge_df`` is used for fine-tuning for half of the shadow models. # For the other half of the shadow models, only ``master_challenge_df`` is used for training. 
first_set_result_path, second_set_result_path, third_set_result_path = train_three_sets_of_shadow_models( + model_runner=model_runner, population_data=df_population_with_challenge, master_challenge_data=df_challenge_train, shadow_models_output_path=Path(config.shadow_training.shadow_models_output_path), @@ -157,9 +122,7 @@ def run_shadow_model_training(config: DictConfig, df_challenge_train: pd.DataFra # ``4 * n_models_per_set`` total shadow models. n_models_per_set=4, # 4 based on the original code, must be even n_reps=12, # Number of repetitions of challenge points in each shadow model training set. `12` based on the original code - number_of_points_to_synthesize=config.shadow_training.number_of_points_to_synthesize, random_seed=config.random_seed, - model_type=model_type, ) log( INFO, diff --git a/examples/gan/ensemble_attack/train_attack_model.py b/examples/gan/ensemble_attack/train_attack_model.py index 84f2b5af..bfd87ab7 100644 --- a/examples/gan/ensemble_attack/train_attack_model.py +++ b/examples/gan/ensemble_attack/train_attack_model.py @@ -7,8 +7,11 @@ from examples.ensemble_attack.run_metaclassifier_training import run_metaclassifier_training from examples.ensemble_attack.run_shadow_model_training import run_shadow_model_training, run_target_model_training +from examples.gan.utils import get_single_table_svd_metadata, get_table_name from midst_toolkit.attacks.ensemble.data_utils import load_dataframe, save_dataframe +from midst_toolkit.attacks.ensemble.model import EnsembleAttackCTGANModelRunner, EnsembleAttackCTGANTrainingConfig from midst_toolkit.attacks.ensemble.process_split_data import process_split_data +from midst_toolkit.attacks.ensemble.shadow_model_utils import save_additional_training_config from midst_toolkit.common.logger import log from midst_toolkit.common.random import set_all_random_seeds @@ -78,7 +81,27 @@ def train_attack_model(config: DictConfig) -> None: Path(config.ensemble_attack.data_paths.population_path), 
"master_challenge_train.csv", ) - shadow_data_paths = run_shadow_model_training(config.ensemble_attack, master_challenge_train) + + table_name = get_table_name(config.base_data_dir) + domain_file_path = Path(config.base_data_dir) / f"{table_name}_domain.json" + with open(domain_file_path, "r") as file: + domain_dictionary = json.load(file) + + training_config, _ = save_additional_training_config( + training_config_type=EnsembleAttackCTGANTrainingConfig, + data_dir=Path(config.base_data_dir), + training_config_json_path=training_config_path, + final_config_json_path=Path(config.base_data_dir) / f"{table_name}.json", # Path to the new json + experiment_name="pre_trained_model", + ) + + metadata, _ = get_single_table_svd_metadata(master_challenge_train, domain_dictionary) + training_config.metadata = metadata + training_config.table_name = table_name + + model_runner = EnsembleAttackCTGANModelRunner(training_config=training_config) + + shadow_data_paths = run_shadow_model_training(model_runner, config.ensemble_attack, master_challenge_train) shadow_data_paths = [Path(path) for path in shadow_data_paths] log(INFO, "Training the target model...") diff --git a/src/midst_toolkit/attacks/ensemble/model.py b/src/midst_toolkit/attacks/ensemble/model.py new file mode 100644 index 00000000..a4c53791 --- /dev/null +++ b/src/midst_toolkit/attacks/ensemble/model.py @@ -0,0 +1,269 @@ +""" +Module containing the base classes and implementations for the Ensemble Attack model runner and training result. 
+"""
+
+from abc import ABC, abstractmethod
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Any
+from logging import INFO
+import copy
+import json
+
+import pandas as pd
+from pydantic import BaseModel, ConfigDict
+from sdv.single_table import CTGANSynthesizer  # type: ignore[import-untyped]
+from sdv.metadata import SingleTableMetadata  # type: ignore[import-untyped]
+
+from midst_toolkit.common.config import ClavaDDPMTrainingConfig, CTGANTrainingConfig, TrainingConfig
+from midst_toolkit.models.clavaddpm.data_loaders import Tables, load_tables
+from midst_toolkit.models.clavaddpm.enumerations import GroupLengthsProbDicts, Relation, RelationOrder
+from midst_toolkit.models.clavaddpm.train import ClavaDDPMModelArtifacts, CTGANModelArtifacts
+from midst_toolkit.models.clavaddpm.clustering import clava_clustering
+from midst_toolkit.models.clavaddpm.train import clava_training
+from midst_toolkit.models.clavaddpm.synthesizer import clava_synthesizing
+from midst_toolkit.common.variables import DEVICE
+from midst_toolkit.common.logger import log
+from midst_toolkit.attacks.ensemble.clavaddpm_fine_tuning import clava_fine_tuning
+
+
+
+
+# Base Classes
+class EnsembleAttackTrainingConfig(TrainingConfig):
+    number_of_points_to_synthesize: int = 20000
+
+class EnsembleAttackTrainingResult(BaseModel):
+    model_config = ConfigDict(arbitrary_types_allowed=True)
+
+    save_dir: Path
+    configs: EnsembleAttackTrainingConfig
+    models: Any
+    synthetic_data: pd.DataFrame | None = None
+
+
+class EnsembleAttackModelRunner(ABC):
+    def __init__(self, training_config: EnsembleAttackTrainingConfig):
+        """
+        Initialize the ensemble attack model runner with a training config.
+
+        Args:
+            training_config: The training config for the ensemble attack model.
+ """ + self.training_config = training_config + + @abstractmethod + def train_or_fine_tune_and_synthesize( + self, + dataset: pd.DataFrame, + synthesize: bool = True, + trained_model: EnsembleAttackTrainingResult | None = None, + ) -> EnsembleAttackTrainingResult: + """ + Train or fine tune a model and synthesize data. + + Args: + dataset: The dataset to train or fine tune the model on. + synthesize: Whether to synthesize data after training. + trained_model: The model to fine tune. If None, a new model should be trained. + Optional, default is None. + + Returns: + An instance of `EnsembleAttackTrainingResult` containing the training results. + """ + raise NotImplementedError("Subclasses must implement this method.") + + +# TabDDPM/ClavaDDPM implementation +class EnsembleAttackTabDDPMTrainingConfig(ClavaDDPMTrainingConfig, EnsembleAttackTrainingConfig): + fine_tuning_diffusion_iterations: int = 100 + fine_tuning_classifier_iterations: int = 10 + + +class TabDDPMTrainingResult(EnsembleAttackTrainingResult): + configs: EnsembleAttackTabDDPMTrainingConfig + models: dict[Relation, ClavaDDPMModelArtifacts] + tables: Tables + relation_order: RelationOrder + all_group_lengths_probabilities: GroupLengthsProbDicts + + +class EnsembleAttackTabDDPMModelRunner(EnsembleAttackModelRunner): + def train_or_fine_tune_and_synthesize( + self, + dataset: pd.DataFrame, + synthesize: bool = True, + trained_model: EnsembleAttackTrainingResult | None = None, + ) -> TabDDPMTrainingResult: + """ + Train or fine tune a TabDDPM model on the provided training set and optionally synthesize + data using the trained/fine-tuned models. + + Args: + dataset: The training dataset as a pandas DataFrame. + synthesize: Flag indicating whether to generate synthetic data after training. Defaults to True. + trained_model: The model to fine tune. If None, a new model should be trained. + Optional, default is None. 
+ + Returns: + A dataclass TabDDPMTrainingResult object containing: + - save_dir: Directory where results are saved. + - configs: Configuration dictionary used for training. + - tables: Loaded tables after clustering. + - relation_order: Relation order of the tables. + - all_group_lengths_probabilities: Group lengths probability dictionaries. + - models: The trained models. + - synthetic_data: The synthesized data as a pandas DataFrame, if synthesis was performed, + otherwise, None. + """ + # Load tables + tables, relation_order, _ = load_tables(self.training_config.general.data_dir, train_data={"trans": dataset}) + + save_dir = self.training_config.general.workspace_dir / self.training_config.general.exp_name + + # Clustering on the multi-table dataset + tables, all_group_lengths_prob_dicts = clava_clustering( + tables, + relation_order, + save_dir, + self.training_config.clustering, + ) + + if trained_model is None: + # Train models + models = clava_training( + tables, + relation_order, + save_dir, + diffusion_config=self.training_config.diffusion, + classifier_config=self.training_config.classifier, + device=DEVICE, + ) + + else: + # Fine-tune models + copied_models = copy.deepcopy(trained_model.models) + models = clava_fine_tuning( + copied_models, + tables, + relation_order, + diffusion_config=self.training_config.diffusion, + classifier_config=self.training_config.classifier, + fine_tuning_diffusion_iterations=self.training_config.fine_tuning_diffusion_iterations, + fine_tuning_classifier_iterations=self.training_config.fine_tuning_classifier_iterations, + ) + + result = TabDDPMTrainingResult( + save_dir=save_dir, + configs=self.training_config, + tables=tables, + relation_order=relation_order, + all_group_lengths_probabilities=all_group_lengths_prob_dicts, + models=models, + ) + + if synthesize: + # By default, Ensemble attack generates a synthetic data of length ``20,000``. 
+ # Attack's default sample_scale is set to ``20000 / len(tables["trans"]["df"])`` to + # generate 20,000 samples regardless of the training data size. But we control the + # synthetic data size directly here with ``number_of_points_to_synthesize``. + # ``sample_scale`` is later multiplied by the size of training data (no id) to determine + # the size of synthetic data. + assert len(tables["trans"].data) > 0, "Cannot synthesize: training data is empty" + sample_scale = self.training_config.number_of_points_to_synthesize / len(tables["trans"].data) + cleaned_tables, _, _ = clava_synthesizing( + tables, + relation_order, + save_dir, + models, + self.training_config.general, + self.training_config.sampling, + self.training_config.matching, + all_group_lengths_prob_dicts, + sample_scale=sample_scale, + ) + + result.synthetic_data = cleaned_tables["trans"] + + return result + + +# CTGAN implementation +class EnsembleAttackCTGANTrainingConfig(CTGANTrainingConfig, EnsembleAttackTrainingConfig): + model_config = ConfigDict(arbitrary_types_allowed=True) + + metadata: SingleTableMetadata = None + table_name: str = None + +class CTGANTrainingResult(EnsembleAttackTrainingResult): + configs: EnsembleAttackCTGANTrainingConfig + models: dict[Relation, CTGANModelArtifacts] + tables: Tables + relation_order: RelationOrder + all_group_lengths_probabilities: GroupLengthsProbDicts + + +class EnsembleAttackCTGANModelRunner(EnsembleAttackModelRunner): + def train_or_fine_tune_and_synthesize( + self, + dataset: pd.DataFrame, + synthesize: bool = True, + trained_model: EnsembleAttackTrainingResult | None = None, + ) -> CTGANTrainingResult: + """ + Train or fine tune a CTGAN model on the provided dataset and optionally synthesize data. + + If no trained model is provided, a new model will be trained. Otherwise, the + provided model will be fine tuned. + + Args: + dataset: The dataset as a pandas DataFrame. + configs: Configuration dictionary for CTGAN. 
+ synthesize: Flag indicating whether to generate synthetic data after training. Defaults to True. + trained_model: The trained model to fine tune. If None, a new model will be trained. + + Returns: + A dataclass TrainingResult object containing: + - save_dir: Directory where results are saved. + - configs: Configuration dictionary used for training. + - models: The trained models. + - synthetic_data: The synthesized data as a pandas DataFrame, if synthesis was performed, + otherwise, None. + """ + assert self.training_config.metadata is not None, "Metadata is not set" + assert self.training_config.table_name is not None, "Table name is not set" + + dataset_without_ids = dataset.drop(columns=[column_name for column_name in dataset.columns if "_id" in column_name]) + + if trained_model is None: + log(INFO, "Training new CTGAN model...") + ctgan = CTGANSynthesizer( + metadata=self.training_config.metadata, + epochs=self.training_config.training.epochs, + verbose=self.training_config.training.verbose, + ) + model_name = "trained_ctgan_model.pkl" + else: + log(INFO, "Fine tuning CTGAN model...") + ctgan = trained_model.models[(None, self.training_config.table_name)].model + model_name = "fine_tuned_ctgan_model.pkl" + + ctgan.fit(dataset_without_ids) + + save_dir = self.training_config.general.workspace_dir / self.training_config.general.exp_name + results_file = Path(save_dir) / model_name + results_file.parent.mkdir(parents=True, exist_ok=True) + + ctgan.save(results_file) + + result = CTGANTrainingResult( + save_dir=save_dir, + configs=self.training_config, + models={(None, self.training_config.table_name): CTGANModelArtifacts(model=ctgan, model_file_path=results_file)}, + ) + + if synthesize: + synthetic_data = ctgan.sample(num_rows=self.training_config.synthesizing.sample_size) + result.synthetic_data = synthetic_data + + return result diff --git a/src/midst_toolkit/attacks/ensemble/rmia/shadow_model_training.py 
b/src/midst_toolkit/attacks/ensemble/rmia/shadow_model_training.py index 92b69088..56e34717 100644 --- a/src/midst_toolkit/attacks/ensemble/rmia/shadow_model_training.py +++ b/src/midst_toolkit/attacks/ensemble/rmia/shadow_model_training.py @@ -3,40 +3,30 @@ import shutil from logging import INFO from pathlib import Path -from typing import Any, cast +from typing import Any import pandas as pd from omegaconf import DictConfig -from midst_toolkit.attacks.ensemble.shadow_model_utils import ( - ModelType, - TrainingResult, - fine_tune_tabddpm_and_synthesize, - save_additional_training_config, - train_or_fine_tune_and_synthesize_with_ctgan, - train_tabddpm_and_synthesize, -) -from midst_toolkit.common.config import ClavaDDPMTrainingConfig, CTGANTrainingConfig +from midst_toolkit.attacks.ensemble.model import EnsembleAttackModelRunner from midst_toolkit.common.logger import log # TODO: This function and the next one can be unified later. def train_fine_tuned_shadow_models( + model_runner: EnsembleAttackModelRunner, n_models: int, n_reps: int, population_data: pd.DataFrame, master_challenge_data: pd.DataFrame, shadow_models_output_path: Path, training_json_config_paths: DictConfig, - fine_tuning_config: DictConfig, init_model_id: int, table_name: str, id_column_name: str, pre_training_data_size: int = 60000, - number_of_points_to_synthesize: int = 20000, init_data_seed: int | None = None, random_seed: int | None = None, - model_type: ModelType = ModelType.TABDDPM, ) -> Path: """ Train ``n_models`` shadow models that start from a pre-trained TabDDPM model and are fine-tuned on @@ -60,6 +50,8 @@ def train_fine_tuned_shadow_models( size of fine-tuning set. Args: + model_runner: The model runner to be used for training the shadow models. Should be an instance of + a subclass of `EnsembleAttackModelRunner`. n_models: Number of shadow models to train, must be even. n_reps: Number of repetitions for each challenge point in the fine-tuning set. 
population_data: The total population data that the attacker has access to. @@ -81,7 +73,6 @@ def train_fine_tuned_shadow_models( defaults to 20,000. init_data_seed: Random seed for the initial training set. random_seed: Random seed used for reproducibility, defaults to None. - model_type: Type of model to be used for training the shadow models. Defaults to ModelType.TABDDPM. Returns: The path where the shadow models and their artifacts are saved. @@ -117,40 +108,13 @@ def train_fine_tuned_shadow_models( shadow_model_data_folder / "dataset_meta.json", ) - # Train initial model with 60K data without any challenge points - # ``save_additional_training_config`` makes a personalized copy of the training config for each - # training model (here the base model). - # All the shadow models will be saved under the base model data directory. - configs, save_dir = save_additional_training_config( - data_dir=shadow_model_data_folder, - training_config_json_path=Path(training_json_config_paths.training_config_path), - final_config_json_path=shadow_model_data_folder / f"{table_name}.json", # Path to the new json - experiment_name="pre_trained_model", - model_type=model_type, - ) - # Train the initial model if it is not already trained and saved. 
+ save_dir = model_runner.training_config.general.workspace_dir / model_runner.training_config.general.exp_name initial_model_path = save_dir / f"initial_model_rmia_{init_model_id}.pkl" if not initial_model_path.exists(): - log(INFO, f"Training initial {model_type.value} model with ID {init_model_id}...") - - initial_model_training_results: TrainingResult - if model_type == ModelType.TABDDPM: - initial_model_training_results = train_tabddpm_and_synthesize( - train, - cast(ClavaDDPMTrainingConfig, configs), - save_dir, - synthesize=False, - ) - elif model_type == ModelType.CTGAN: - initial_model_training_results = train_or_fine_tune_and_synthesize_with_ctgan( - train, - cast(CTGANTrainingConfig, configs), - save_dir, - synthesize=False, - ) - else: - raise ValueError(f"Invalid model type: {model_type}") + log(INFO, f"Training initial model with runner {model_runner}. Model ID {init_model_id}...") + + initial_model_training_results = model_runner.train_or_fine_tune_and_synthesize(dataset=train, synthesize=False) # Save the initial model # Pickle dump the results @@ -193,27 +157,12 @@ def train_fine_tuned_shadow_models( # Shuffle the dataset selected_challenges = selected_challenges.sample(frac=1, random_state=random_seed).reset_index(drop=True) - if model_type == ModelType.TABDDPM: - train_result = fine_tune_tabddpm_and_synthesize( - trained_models=initial_model_training_results.models, - fine_tune_set=selected_challenges, - configs=cast(ClavaDDPMTrainingConfig, configs), - save_dir=save_dir, - fine_tuning_diffusion_iterations=fine_tuning_config.fine_tune_diffusion_iterations, - fine_tuning_classifier_iterations=fine_tuning_config.fine_tune_classifier_iterations, - synthesize=True, - number_of_points_to_synthesize=number_of_points_to_synthesize, - ) - elif model_type == ModelType.CTGAN: - train_result = train_or_fine_tune_and_synthesize_with_ctgan( - dataset=selected_challenges, - configs=cast(CTGANTrainingConfig, configs), - save_dir=save_dir, - synthesize=True, - 
trained_model=initial_model_training_results.models[(None, table_name)].model, - ) - else: - raise ValueError(f"Invalid model type: {model_type}") + train_result = model_runner.train_or_fine_tune_and_synthesize( + dataset=selected_challenges, + save_dir=save_dir, + synthesize=True, + trained_model=initial_model_training_results, + ) assert train_result.synthetic_data is not None, "Fine-tuned models should generate synthetic data." log( @@ -231,6 +180,7 @@ def train_fine_tuned_shadow_models( def train_shadow_on_half_challenge_data( + model_runner: EnsembleAttackModelRunner, n_models: int, n_reps: int, master_challenge_data: pd.DataFrame, @@ -238,9 +188,7 @@ def train_shadow_on_half_challenge_data( training_json_config_paths: DictConfig, table_name: str, id_column_name: str, - number_of_points_to_synthesize: int = 20000, random_seed: int | None = None, - model_type: ModelType = ModelType.TABDDPM, ) -> Path: """ 1. Create eight training sets with exactly half of the observations included in the challenge lists @@ -251,22 +199,21 @@ def train_shadow_on_half_challenge_data( 3. A synthetic dataset of 20K observations is generated for each model. Args: - n_models: number of shadow models to train, must be even. - n_reps: number of repetitions for each challenge point in the fine-tuning set. - master_challenge_data: The master challenge training dataset. - shadow_models_output_path: Path where the all datasets and information necessary to train shadow models - will be saved. - training_json_config_paths: Configuration dictionary containing paths to the data JSON config files. - An example of this config is provided in ``examples/ensemble_attack/config.yaml``. Required keys are: - - table_domain_file_path (str): Path to the table domain json file. - - dataset_meta_file_path (str): Path to dataset meta json file. - - training_config_path (str): Path to table's training config json file. - table_name: Name of the main table to be used for training the TabDDPM model. 
- id_column_name: Name of the ID column in the data. - number_of_points_to_synthesize: Size of the synthetic data to be generated by each shadow model, - defaults to 20,000. - random_seed: Random seed used for reproducibility, defaults to None. - model_type: Type of model to be used for training the shadow models. Defaults to ModelType.TABDDPM. + model_runner: The model runner to be used for training the shadow models. Should be an instance of + a subclass of `EnsembleAttackModelRunner`. + n_models: number of shadow models to train, must be even. + n_reps: number of repetitions for each challenge point in the fine-tuning set. + master_challenge_data: The master challenge training dataset. + shadow_models_output_path: Path where the all datasets and information necessary to train shadow models + will be saved. + training_json_config_paths: Configuration dictionary containing paths to the data JSON config files. + An example of this config is provided in ``examples/ensemble_attack/config.yaml``. Required keys are: + - table_domain_file_path (str): Path to the table domain json file. + - dataset_meta_file_path (str): Path to dataset meta json file. + - training_config_path (str): Path to table's training config json file. + table_name: Name of the main table to be used for training the TabDDPM model. + id_column_name: Name of the ID column in the data. + random_seed: Random seed used for reproducibility, defaults to None. Returns: The path where the shadow models and their artifacts are saved. 
@@ -297,13 +244,7 @@ def train_shadow_on_half_challenge_data( training_json_config_paths.dataset_meta_file_path, shadow_folder / "dataset_meta.json", ) - configs, save_dir = save_additional_training_config( - data_dir=shadow_folder, - training_config_json_path=Path(training_json_config_paths.training_config_path), - final_config_json_path=shadow_folder / f"{table_name}.json", # Path to the new json - experiment_name="trained_model", - model_type=model_type, - ) + attack_data: dict[str, Any] = { "selected_sets": selected_id_lists, "trained_results": [], @@ -322,24 +263,7 @@ def train_shadow_on_half_challenge_data( # Shuffle the dataset selected_challenges = selected_challenges.sample(frac=1, random_state=random_seed).reset_index(drop=True) - train_result: TrainingResult - if model_type == ModelType.TABDDPM: - train_result = train_tabddpm_and_synthesize( - selected_challenges, - cast(ClavaDDPMTrainingConfig, configs), - save_dir, - synthesize=True, - number_of_points_to_synthesize=number_of_points_to_synthesize, - ) - elif model_type == ModelType.CTGAN: - train_result = train_or_fine_tune_and_synthesize_with_ctgan( - dataset=selected_challenges, - configs=cast(CTGANTrainingConfig, configs), - save_dir=save_dir, - synthesize=True, - ) - else: - raise ValueError(f"Invalid model type: {model_type}") + train_result = model_runner.train_or_fine_tune_and_synthesize(dataset=selected_challenges, synthesize=True) assert train_result.synthetic_data is not None, "Trained shadow model did not generate synthetic data." 
log( @@ -350,6 +274,7 @@ def train_shadow_on_half_challenge_data( attack_data["trained_results"].append(train_result.synthetic_data) # Pickle dump the results + save_dir = model_runner.training_config.general.workspace_dir / model_runner.training_config.general.exp_name result_path = Path(save_dir, "rmia_shadows_third_set.pkl") with open(result_path, "wb") as file: pickle.dump(attack_data, file) @@ -358,6 +283,7 @@ def train_shadow_on_half_challenge_data( def train_three_sets_of_shadow_models( + model_runner: EnsembleAttackModelRunner, population_data: pd.DataFrame, master_challenge_data: pd.DataFrame, shadow_models_output_path: Path, @@ -367,9 +293,7 @@ def train_three_sets_of_shadow_models( id_column_name: str, n_models_per_set: int = 4, n_reps: int = 12, - number_of_points_to_synthesize: int = 20000, random_seed: int | None = None, - model_type: ModelType = ModelType.TABDDPM, ) -> tuple[Path, Path, Path]: """ Runs the shadow model training pipeline of the ensemble attack. This pipeline trains three sets of shadow models. @@ -396,6 +320,8 @@ def train_three_sets_of_shadow_models( Args: + model_runner: The model runner to be used for training the shadow models. Should be an instance of + a subclass of `EnsembleAttackModelRunner`. population_data: The total population data used for pre-training some of the shadow models. master_challenge_data: The master challenge training dataset. shadow_models_output_path: Path where the all datasets and information (configs) necessary to @@ -416,10 +342,7 @@ def train_three_sets_of_shadow_models( id_column_name: Name of the ID column in the data. n_models_per_set: Number of shadow models to train by each approach. Must be an even number. Defaults to 4. n_reps: Number of repetitions for each challenge point in the fine-tuning or training sets, defaults to 12. - number_of_points_to_synthesize: Size of the synthetic data to be generated by each shadow model, - defaults to 20,000. 
random_seed: Random seed used for reproducibility, defaults to None. - model_type: Type of model to be used for training the shadow models. Defaults to ModelType.TABDDPM. Returns: Paths where the shadow models and their artifacts including synthetic data are saved for each of @@ -431,21 +354,19 @@ def train_three_sets_of_shadow_models( shadow_models_output_path.mkdir(parents=True, exist_ok=True) first_set_result_path = train_fine_tuned_shadow_models( + model_runner=model_runner, n_models=n_models_per_set, n_reps=n_reps, population_data=population_data, master_challenge_data=master_challenge_data, shadow_models_output_path=shadow_models_output_path, training_json_config_paths=training_json_config_paths, - fine_tuning_config=fine_tuning_config, init_model_id=1, # To distinguish these shadow models from the next ones table_name=table_name, id_column_name=id_column_name, pre_training_data_size=fine_tuning_config.pre_train_data_size, - number_of_points_to_synthesize=number_of_points_to_synthesize, init_data_seed=random_seed, random_seed=random_seed, - model_type=model_type, ) log( INFO, @@ -455,22 +376,20 @@ def train_three_sets_of_shadow_models( # with a new initial training set # in the hopes of increased performance (gain was minimal based on the submission comments)."" second_set_result_path = train_fine_tuned_shadow_models( + model_runner=model_runner, n_models=n_models_per_set, n_reps=n_reps, population_data=population_data, master_challenge_data=master_challenge_data, shadow_models_output_path=shadow_models_output_path, training_json_config_paths=training_json_config_paths, - fine_tuning_config=fine_tuning_config, init_model_id=2, # To distinguish these shadow models from the previous ones table_name=table_name, id_column_name=id_column_name, pre_training_data_size=fine_tuning_config.pre_train_data_size, - number_of_points_to_synthesize=number_of_points_to_synthesize, # Setting a different seed for the second train set init_data_seed=random_seed + 1 if 
random_seed is not None else None, random_seed=random_seed, - model_type=model_type, ) log( INFO, @@ -479,6 +398,7 @@ def train_three_sets_of_shadow_models( # Original codebase comment: "The following eight models are trained from scratch on the challenge points, # still in the hopes of increased performance (again the gain was minimal)."" third_set_result_path = train_shadow_on_half_challenge_data( + model_runner=model_runner, n_models=n_models_per_set * 2, n_reps=n_reps, master_challenge_data=master_challenge_data, @@ -486,9 +406,7 @@ def train_three_sets_of_shadow_models( training_json_config_paths=training_json_config_paths, table_name=table_name, id_column_name=id_column_name, - number_of_points_to_synthesize=number_of_points_to_synthesize, random_seed=random_seed, - model_type=model_type, ) log( INFO, diff --git a/src/midst_toolkit/attacks/ensemble/shadow_model_utils.py b/src/midst_toolkit/attacks/ensemble/shadow_model_utils.py index c03af364..8d1a0512 100644 --- a/src/midst_toolkit/attacks/ensemble/shadow_model_utils.py +++ b/src/midst_toolkit/attacks/ensemble/shadow_model_utils.py @@ -1,82 +1,42 @@ -import copy import json import os from dataclasses import dataclass -from enum import Enum from logging import INFO from pathlib import Path -from typing import Any +from typing import Type -import pandas as pd -from sdv.single_table import CTGANSynthesizer # type: ignore[import-untyped] - -from examples.gan.utils import get_single_table_svd_metadata, get_table_name -from midst_toolkit.attacks.ensemble.clavaddpm_fine_tuning import clava_fine_tuning -from midst_toolkit.common.config import ClavaDDPMTrainingConfig, CTGANTrainingConfig, TrainingConfig +from midst_toolkit.attacks.ensemble.model import EnsembleAttackTrainingResult +from midst_toolkit.common.config import CTGANTrainingConfig, TrainingConfig from midst_toolkit.common.logger import log -from midst_toolkit.common.variables import DEVICE -from midst_toolkit.models.clavaddpm.clustering import 
clava_clustering -from midst_toolkit.models.clavaddpm.data_loaders import Tables, load_tables -from midst_toolkit.models.clavaddpm.enumerations import ( - GroupLengthsProbDicts, - Relation, - RelationOrder, -) -from midst_toolkit.models.clavaddpm.synthesizer import clava_synthesizing -from midst_toolkit.models.clavaddpm.train import ( - ClavaDDPMModelArtifacts, - CTGANModelArtifacts, - clava_training, -) - - -class ModelType(Enum): - TABDDPM = "tabddpm" - CTGAN = "ctgan" - - -@dataclass(kw_only=True) # Setting kw_only=True avoids and error with default values and inheritance -class TrainingResult: - save_dir: Path - configs: TrainingConfig - models: Any - synthetic_data: pd.DataFrame | None = None +from midst_toolkit.models.clavaddpm.enumerations import Relation +from midst_toolkit.models.clavaddpm.train import CTGANModelArtifacts @dataclass -class CTGANTrainingResult(TrainingResult): +class CTGANTrainingResult(EnsembleAttackTrainingResult): configs: CTGANTrainingConfig models: dict[Relation, CTGANModelArtifacts] -@dataclass -class TabDDPMTrainingResult(TrainingResult): - configs: ClavaDDPMTrainingConfig - models: dict[Relation, ClavaDDPMModelArtifacts] - tables: Tables - relation_order: RelationOrder - all_group_lengths_probabilities: GroupLengthsProbDicts - - def save_additional_training_config( + training_config_type: Type[TrainingConfig], data_dir: Path, training_config_json_path: Path, final_config_json_path: Path, experiment_name: str = "attack_experiment", workspace_name: str = "shadow_workspace", - model_type: ModelType = ModelType.TABDDPM, ) -> tuple[TrainingConfig, Path]: """ Modifies a TabDDPM configuration JSON file with the specified data directory, experiment name and workspace name, and loads the resulting configuration. Args: - data_dir: Directory containing dataset_meta.json, trans_domain.json, and trans.json files. - training_config_json_path: Path to the original TabDDPM training configuration JSON file. 
- final_config_json_path: Path where the modified configuration JSON file will be saved. - experiment_name: Name of the experiment, used to create a unique save directory. - workspace_name: Name of the workspace, used to create a unique save directory. - model_type: Type of model to be used for training the shadow models. Defaults to ModelType.TABDDPM. + training_config_type: The type of the training config to be used for training the shadow models. + data_dir: Directory containing dataset_meta.json, trans_domain.json, and trans.json files. + training_config_json_path: Path to the original TabDDPM training configuration JSON file. + final_config_json_path: Path where the modified configuration JSON file will be saved. + experiment_name: Name of the experiment, used to create a unique save directory. + workspace_name: Name of the workspace, used to create a unique save directory. Returns: configs: Loaded configuration dictionary for the model type. @@ -84,13 +44,7 @@ def save_additional_training_config( """ # Modify the config file to give the correct training data and saving directory with open(training_config_json_path, "r") as file: - configs: TrainingConfig - if model_type == ModelType.TABDDPM: - configs = ClavaDDPMTrainingConfig(**json.load(file)) - elif model_type == ModelType.CTGAN: - configs = CTGANTrainingConfig(**json.load(file)) - else: - raise ValueError(f"Invalid model type: {model_type}") + configs = training_config_type(**json.load(file)) configs.general.data_dir = data_dir # Save dir is set by joining the workspace_dir and exp_name @@ -109,246 +63,6 @@ def save_additional_training_config( return configs, save_dir -# TODO: This and the next function should be unified later. 
-def train_tabddpm_and_synthesize( - train_set: pd.DataFrame, - configs: ClavaDDPMTrainingConfig, - save_dir: Path, - synthesize: bool = True, - number_of_points_to_synthesize: int = 20000, -) -> TabDDPMTrainingResult: - """ - Train a TabDDPM model on the provided training set and optionally synthesize data using the trained models. - - Args: - train_set: The training dataset as a pandas DataFrame. - configs: Configuration dictionary for TabDDPM. - save_dir: Directory path where models and results will be saved. - synthesize: Flag indicating whether to generate synthetic data after training. Defaults to True. - number_of_points_to_synthesize: Number of synthetic data samples to be generated. Defaults to 20000. - - Returns: - A dataclass TrainingResult object containing: - - save_dir: Directory where results are saved. - - configs: Configuration dictionary used for training. - - tables: Loaded tables after clustering. - - relation_order: Relation order of the tables. - - all_group_lengths_probabilities: Group lengths probability dictionaries. - - models: The trained models. - - synthetic_data: The synthesized data as a pandas DataFrame, if synthesis was performed, - otherwise, None. - """ - # Load tables - tables, relation_order, _ = load_tables(configs.general.data_dir, train_data={"trans": train_set}) - - # Clustering on the multi-table dataset - tables, all_group_lengths_prob_dicts = clava_clustering(tables, relation_order, save_dir, configs.clustering) - - # Train models - tables, models = clava_training( - tables, - relation_order, - save_dir, - diffusion_config=configs.diffusion, - classifier_config=configs.classifier, - device=DEVICE, - ) - result = TabDDPMTrainingResult( - save_dir=save_dir, - configs=configs, - tables=tables, - relation_order=relation_order, - all_group_lengths_probabilities=all_group_lengths_prob_dicts, - models=models, - ) - - if synthesize: - # By default, Ensemble attack generates a synthetic data of length ``20,000``. 
- # Attack's default sample_scale is set to ``20000 / len(tables["trans"]["df"])`` to - # generate 20,000 samples regardless of the training data size. But we control the - # synthetic data size directly here with ``number_of_points_to_synthesize``. - # ``sample_scale`` is later multiplied by the size of training data (no id) to determine - # the size of synthetic data. - assert len(tables["trans"].data) > 0, "Cannot synthesize: training data is empty" - sample_scale = number_of_points_to_synthesize / len(tables["trans"].data) - cleaned_tables, _, _ = clava_synthesizing( - tables, - relation_order, - save_dir, - models, - configs.general, - configs.sampling, - configs.matching, - all_group_lengths_prob_dicts, - sample_scale=sample_scale, - ) - - result.synthetic_data = cleaned_tables["trans"] - - return result - - -def fine_tune_tabddpm_and_synthesize( - trained_models: dict[Relation, ClavaDDPMModelArtifacts], - fine_tune_set: pd.DataFrame, - configs: ClavaDDPMTrainingConfig, - save_dir: Path, - fine_tuning_diffusion_iterations: int = 100, - fine_tuning_classifier_iterations: int = 10, - synthesize: bool = True, - number_of_points_to_synthesize: int = 20000, -) -> TrainingResult: - """ - Given the trained models and a new training set, fine-tune the TabDDPM models. - If ``synthesize`` is True, synthesizes data using the fine-tuned models. Number of - synthesized data points is determined by the ``classifier_scale`` parameter in training ``configs``. - - Args: - trained_models: The previously trained model material. - fine_tune_set: The new training dataset for fine-tuning. - configs: Configuration dictionary for TabDDPM. - save_dir: Directory path where models and results will be saved. - fine_tuning_diffusion_iterations: Diffusion iterations for fine tuning. Defaults to 100. - fine_tuning_classifier_iterations: Number of training iterations for the new classifier model. - Defaults to 10. 
- synthesize: Flag indicating whether to generate synthetic data after training. Defaults to True. - number_of_points_to_synthesize: Number of synthetic data samples to be generated. Defaults to 20000. - - - Returns: - A dataclass TrainingResult object containing: - - save_dir: Directory where results are saved. - - configs: Configuration dictionary used for training. - - tables: Loaded tables after clustering. - - relation_order: Relation order of the tables. - - all_group_lengths_probabilities: Group lengths probability dictionaries. - - models: The trained models. - - synthetic_data: The synthesized data as a pandas DataFrame, if synthesis was performed, - otherwise, None. - """ - # Load tables - new_tables, relation_order, _ = load_tables(configs.general.data_dir, train_data={"trans": fine_tune_set}) - - # Clustering on the multi-table dataset - # Original submission uses 'force_tables=True' to run the clustering even if checkpoint is found. - new_tables, all_group_lengths_prob_dicts = clava_clustering( - new_tables, relation_order, save_dir, configs.clustering - ) - - # Train models - copied_models = copy.deepcopy(trained_models) - new_models = clava_fine_tuning( - copied_models, - new_tables, - relation_order, - diffusion_config=configs.diffusion, - classifier_config=configs.classifier, - fine_tuning_diffusion_iterations=fine_tuning_diffusion_iterations, - fine_tuning_classifier_iterations=fine_tuning_classifier_iterations, - ) - result = TabDDPMTrainingResult( - save_dir=save_dir, - configs=configs, - tables=new_tables, - relation_order=relation_order, - all_group_lengths_probabilities=all_group_lengths_prob_dicts, - models=new_models, - ) - - if synthesize: - # By default, Ensemble attack generates a synthetic data of length ``20,000``. - # Attack's default sample_scale is set to ``20000 / len(tables["trans"]["df"])`` to - # generate 20,000 samples regardless of the training data size. 
But we control the - # synthetic data size directly here with ``number_of_points_to_synthesize``. - # ``sample_scale`` is later multiplied by the size of training data (no id) to determine - # the size of synthetic data. - assert len(new_tables["trans"].data) > 0, "Cannot synthesize: training data is empty" - sample_scale = number_of_points_to_synthesize / len(new_tables["trans"].data) - cleaned_tables, _, _ = clava_synthesizing( - new_tables, - relation_order, - save_dir, - new_models, - configs.general, - configs.sampling, - configs.matching, - all_group_lengths_prob_dicts, - sample_scale=sample_scale, - ) - - result.synthetic_data = cleaned_tables["trans"] - - return result - - -def train_or_fine_tune_and_synthesize_with_ctgan( - dataset: pd.DataFrame, - configs: CTGANTrainingConfig, - save_dir: Path, - synthesize: bool = True, - trained_model: CTGANSynthesizer | None = None, -) -> TrainingResult: - """ - Train or fine tune a CTGAN model on the provided dataset and optionally synthesize data. - - If no trained model is provided, a new model will be trained. Otherwise, the - provided model will be fine tuned. - - Args: - dataset: The dataset as a pandas DataFrame. - configs: Configuration dictionary for CTGAN. - save_dir: Directory path where models and results will be saved. - synthesize: Flag indicating whether to generate synthetic data after training. Defaults to True. - trained_model: The trained model to fine tune. If None, a new model will be trained. - - Returns: - A dataclass TrainingResult object containing: - - save_dir: Directory where results are saved. - - configs: Configuration dictionary used for training. - - models: The trained models. - - synthetic_data: The synthesized data as a pandas DataFrame, if synthesis was performed, - otherwise, None. 
- """ - table_name = get_table_name(configs.general.data_dir) - domain_file_path = configs.general.data_dir / f"{table_name}_domain.json" - with open(domain_file_path, "r") as file: - domain_dictionary = json.load(file) - - metadata, dataset_without_ids = get_single_table_svd_metadata(dataset, domain_dictionary) - - if trained_model is None: - log(INFO, "Training new CTGAN model...") - ctgan = CTGANSynthesizer( - metadata=metadata, - epochs=configs.training.epochs, - verbose=configs.training.verbose, - ) - model_name = "trained_ctgan_model.pkl" - else: - log(INFO, "Fine tuning CTGAN model...") - ctgan = trained_model - model_name = "fine_tuned_ctgan_model.pkl" - - ctgan.fit(dataset_without_ids) - - results_file = Path(save_dir) / model_name - results_file.parent.mkdir(parents=True, exist_ok=True) - - ctgan.save(results_file) - - result = CTGANTrainingResult( - save_dir=save_dir, - configs=configs, - models={(None, table_name): CTGANModelArtifacts(model=ctgan, model_file_path=results_file)}, - ) - - if synthesize: - synthetic_data = ctgan.sample(num_rows=configs.synthesizing.sample_size) - result.synthetic_data = synthetic_data - - return result - - # TODO: The following function is directly copied from the midst reference code since # I need it to run the attack code, but, it should probably be moved to somewhere else # as it is an essential part of a working TabDDPM training pipeline. 
From 38a20b5e19c57e3636e5333d60924ef54ec018de Mon Sep 17 00:00:00 2001 From: Marcelo Lotif Date: Wed, 18 Mar 2026 11:25:22 -0400 Subject: [PATCH 24/38] working first refactor --- examples/ensemble_attack/run_shadow_model_training.py | 7 ++++--- examples/gan/ensemble_attack/train_attack_model.py | 2 +- src/midst_toolkit/attacks/ensemble/model.py | 9 ++------- .../attacks/ensemble/rmia/shadow_model_training.py | 5 +++-- .../attacks/ensemble/shadow_model_utils.py | 11 +---------- 5 files changed, 11 insertions(+), 23 deletions(-) diff --git a/examples/ensemble_attack/run_shadow_model_training.py b/examples/ensemble_attack/run_shadow_model_training.py index c7166aaa..e9459451 100644 --- a/examples/ensemble_attack/run_shadow_model_training.py +++ b/examples/ensemble_attack/run_shadow_model_training.py @@ -1,15 +1,14 @@ import shutil from logging import INFO from pathlib import Path -from typing import cast import pandas as pd from omegaconf import DictConfig from examples.ensemble_attack.real_data_collection import COLLECTED_DATA_FILE_NAME from midst_toolkit.attacks.ensemble.data_utils import load_dataframe -from midst_toolkit.attacks.ensemble.rmia.shadow_model_training import train_three_sets_of_shadow_models from midst_toolkit.attacks.ensemble.model import EnsembleAttackModelRunner +from midst_toolkit.attacks.ensemble.rmia.shadow_model_training import train_three_sets_of_shadow_models from midst_toolkit.common.logger import log @@ -70,7 +69,9 @@ def run_target_model_training(model_runner: EnsembleAttackModelRunner, config: D return target_model_synthetic_path -def run_shadow_model_training(model_runner: EnsembleAttackModelRunner, config: DictConfig, df_challenge_train: pd.DataFrame) -> list[Path]: +def run_shadow_model_training( + model_runner: EnsembleAttackModelRunner, config: DictConfig, df_challenge_train: pd.DataFrame +) -> list[Path]: """ Function to run the shadow model training for RMIA attack. 
diff --git a/examples/gan/ensemble_attack/train_attack_model.py b/examples/gan/ensemble_attack/train_attack_model.py index bfd87ab7..b4ebebdc 100644 --- a/examples/gan/ensemble_attack/train_attack_model.py +++ b/examples/gan/ensemble_attack/train_attack_model.py @@ -105,7 +105,7 @@ def train_attack_model(config: DictConfig) -> None: shadow_data_paths = [Path(path) for path in shadow_data_paths] log(INFO, "Training the target model...") - target_model_synthetic_path = run_target_model_training(config.ensemble_attack) + target_model_synthetic_path = run_target_model_training(model_runner, config.ensemble_attack) if config.ensemble_attack.pipeline.run_metaclassifier_training: log(INFO, "Training the metaclassifier...") diff --git a/src/midst_toolkit/attacks/ensemble/model.py b/src/midst_toolkit/attacks/ensemble/model.py index a4c53791..c08029e3 100644 --- a/src/midst_toolkit/attacks/ensemble/model.py +++ b/src/midst_toolkit/attacks/ensemble/model.py @@ -3,12 +3,10 @@ """ from abc import ABC, abstractmethod -from dataclasses import dataclass from pathlib import Path from typing import Any from logging import INFO import copy -import json import pandas as pd from pydantic import BaseModel, ConfigDict @@ -27,8 +25,6 @@ from midst_toolkit.attacks.ensemble.clavaddpm_fine_tuning import clava_fine_tuning - - # Base Classes class EnsembleAttackTrainingConfig(TrainingConfig): number_of_points_to_synthesize: int = 20000 @@ -195,12 +191,11 @@ class EnsembleAttackCTGANTrainingConfig(CTGANTrainingConfig, EnsembleAttackTrain metadata: SingleTableMetadata = None table_name: str = None + class CTGANTrainingResult(EnsembleAttackTrainingResult): + save_dir: Path configs: EnsembleAttackCTGANTrainingConfig models: dict[Relation, CTGANModelArtifacts] - tables: Tables - relation_order: RelationOrder - all_group_lengths_probabilities: GroupLengthsProbDicts class EnsembleAttackCTGANModelRunner(EnsembleAttackModelRunner): diff --git 
a/src/midst_toolkit/attacks/ensemble/rmia/shadow_model_training.py b/src/midst_toolkit/attacks/ensemble/rmia/shadow_model_training.py index 56e34717..065f87b0 100644 --- a/src/midst_toolkit/attacks/ensemble/rmia/shadow_model_training.py +++ b/src/midst_toolkit/attacks/ensemble/rmia/shadow_model_training.py @@ -114,7 +114,9 @@ def train_fine_tuned_shadow_models( if not initial_model_path.exists(): log(INFO, f"Training initial model with runner {model_runner}. Model ID {init_model_id}...") - initial_model_training_results = model_runner.train_or_fine_tune_and_synthesize(dataset=train, synthesize=False) + initial_model_training_results = model_runner.train_or_fine_tune_and_synthesize( + dataset=train, synthesize=False + ) # Save the initial model # Pickle dump the results @@ -159,7 +161,6 @@ def train_fine_tuned_shadow_models( train_result = model_runner.train_or_fine_tune_and_synthesize( dataset=selected_challenges, - save_dir=save_dir, synthesize=True, trained_model=initial_model_training_results, ) diff --git a/src/midst_toolkit/attacks/ensemble/shadow_model_utils.py b/src/midst_toolkit/attacks/ensemble/shadow_model_utils.py index 8d1a0512..12ef8225 100644 --- a/src/midst_toolkit/attacks/ensemble/shadow_model_utils.py +++ b/src/midst_toolkit/attacks/ensemble/shadow_model_utils.py @@ -5,17 +5,8 @@ from pathlib import Path from typing import Type -from midst_toolkit.attacks.ensemble.model import EnsembleAttackTrainingResult -from midst_toolkit.common.config import CTGANTrainingConfig, TrainingConfig +from midst_toolkit.common.config import TrainingConfig from midst_toolkit.common.logger import log -from midst_toolkit.models.clavaddpm.enumerations import Relation -from midst_toolkit.models.clavaddpm.train import CTGANModelArtifacts - - -@dataclass -class CTGANTrainingResult(EnsembleAttackTrainingResult): - configs: CTGANTrainingConfig - models: dict[Relation, CTGANModelArtifacts] def save_additional_training_config( From ac1a0bf3fb7bdd0b52049c0fde438880c2d95d84 Mon 
Sep 17 00:00:00 2001 From: Marcelo Lotif Date: Wed, 18 Mar 2026 17:06:02 -0400 Subject: [PATCH 25/38] train attack model working --- examples/ensemble_attack/run_attack.py | 19 ++++++---- .../run_metaclassifier_training.py | 14 +++++-- examples/ensemble_attack/test_attack_model.py | 5 ++- .../gan/ensemble_attack/train_attack_model.py | 37 ++++--------------- examples/gan/ensemble_attack/utils.py | 32 ++++++++++++++++ src/midst_toolkit/attacks/ensemble/model.py | 3 +- .../attacks/ensemble/process_split_data.py | 14 +++++-- .../attacks/ensemble/shadow_model_utils.py | 1 - .../ensemble/test_process_data_split.py | 20 ++++++---- 9 files changed, 88 insertions(+), 57 deletions(-) create mode 100644 examples/gan/ensemble_attack/utils.py diff --git a/examples/ensemble_attack/run_attack.py b/examples/ensemble_attack/run_attack.py index 89ceef04..2d73b36d 100644 --- a/examples/ensemble_attack/run_attack.py +++ b/examples/ensemble_attack/run_attack.py @@ -7,13 +7,15 @@ from pathlib import Path import hydra +import json from omegaconf import DictConfig import examples.ensemble_attack.run_metaclassifier_training as meta_pipeline import examples.ensemble_attack.run_shadow_model_training as shadow_pipeline from examples.ensemble_attack.real_data_collection import COLLECTED_DATA_FILE_NAME, collect_population_data_ensemble from midst_toolkit.attacks.ensemble.data_utils import load_dataframe -from midst_toolkit.attacks.ensemble.process_split_data import process_split_data +from midst_toolkit.attacks.ensemble.model import EnsembleAttackTabDDPMModelRunner, EnsembleAttackTabDDPMTrainingConfig +from midst_toolkit.attacks.ensemble.process_split_data import process_split_data, PROCESSED_TRAIN_DATA_FILE_NAME from midst_toolkit.common.logger import log from midst_toolkit.common.random import set_all_random_seeds @@ -79,17 +81,20 @@ def main(config: DictConfig) -> None: if config.pipeline.run_shadow_model_training: df_master_challenge_train = load_dataframe( 
Path(config.data_paths.processed_attack_data_path), - "master_challenge_train.csv", + PROCESSED_TRAIN_DATA_FILE_NAME, ) - # TODO: add these to the config - # configs.fine_tuning_diffusion_iterations = fine_tuning_config.fine_tune_diffusion_iterations - # configs.fine_tuning_classifier_iterations = fine_tuning_config.fine_tune_classifier_iterations + with open(config.shadow_training.training_json_config_paths.training_config_path, "r") as file: + training_config = EnsembleAttackTabDDPMTrainingConfig(**json.load(file)) + training_config.fine_tuning_diffusion_iterations = config.shadow_training.fine_tuning_config.fine_tune_diffusion_iterations + training_config.fine_tuning_classifier_iterations = config.shadow_training.fine_tuning_config.fine_tune_classifier_iterations - shadow_data_paths = shadow_pipeline.run_shadow_model_training(config, df_master_challenge_train) + model_runner = EnsembleAttackTabDDPMModelRunner(training_config=training_config) + + shadow_data_paths = shadow_pipeline.run_shadow_model_training(model_runner, config, df_master_challenge_train) shadow_data_paths = [Path(path) for path in shadow_data_paths] - target_model_synthetic_path = shadow_pipeline.run_target_model_training(config) + target_model_synthetic_path = shadow_pipeline.run_target_model_training(model_runner, config) if config.pipeline.run_metaclassifier_training: if not config.pipeline.run_shadow_model_training: diff --git a/examples/ensemble_attack/run_metaclassifier_training.py b/examples/ensemble_attack/run_metaclassifier_training.py index dd79033a..c94e5e65 100644 --- a/examples/ensemble_attack/run_metaclassifier_training.py +++ b/examples/ensemble_attack/run_metaclassifier_training.py @@ -10,6 +10,12 @@ from midst_toolkit.attacks.ensemble.blending import BlendingPlusPlus, MetaClassifierType from midst_toolkit.attacks.ensemble.data_utils import load_dataframe from midst_toolkit.common.logger import log +from midst_toolkit.attacks.ensemble.process_split_data import ( + 
PROCESSED_TRAIN_DATA_FILE_NAME, + PROCESSED_TEST_DATA_FILE_NAME, + PROCESSED_TRAIN_LABELS_FILE_NAME, + PROCESSED_TEST_LABELS_FILE_NAME, +) def run_metaclassifier_training( @@ -32,20 +38,20 @@ def run_metaclassifier_training( # Load the processed data splits. df_meta_train = load_dataframe( Path(config.data_paths.processed_attack_data_path), - "master_challenge_train.csv", + PROCESSED_TRAIN_DATA_FILE_NAME, ) # y_meta_train consists of binary labels (0s and 1s) indicating whether each row in df_meta_train # belongs to the target model's training set. y_meta_train = np.load( - Path(config.data_paths.processed_attack_data_path) / "master_challenge_train_labels.npy", + Path(config.data_paths.processed_attack_data_path) / PROCESSED_TRAIN_LABELS_FILE_NAME, ) df_meta_test = load_dataframe( Path(config.data_paths.processed_attack_data_path), - "master_challenge_test.csv", + PROCESSED_TEST_DATA_FILE_NAME, ) y_meta_test = np.load( - Path(config.data_paths.processed_attack_data_path) / "master_challenge_test_labels.npy", + Path(config.data_paths.processed_attack_data_path) / PROCESSED_TEST_LABELS_FILE_NAME, ) # Three sets of shadow models are trained separately and their paths are provided here. 
diff --git a/examples/ensemble_attack/test_attack_model.py b/examples/ensemble_attack/test_attack_model.py index f3558c83..4b403323 100644 --- a/examples/ensemble_attack/test_attack_model.py +++ b/examples/ensemble_attack/test_attack_model.py @@ -24,6 +24,7 @@ from midst_toolkit.common.logger import log from midst_toolkit.common.random import set_all_random_seeds from midst_toolkit.models.clavaddpm.train import get_df_without_id +from midst_toolkit.attacks.ensemble.process_split_data import PROCESSED_TRAIN_DATA_FILE_NAME class RmiaTrainingDataChoice(Enum): @@ -198,7 +199,7 @@ def collect_challenge_and_train_data( # Load master challenge train data df_master_train = load_dataframe( processed_attack_data_path, - "master_challenge_train.csv", + PROCESSED_TRAIN_DATA_FILE_NAME, ) log( INFO, @@ -279,7 +280,7 @@ def train_rmia_shadows_for_test_phase(config: DictConfig) -> list[dict[str, list ) df_master_train = load_dataframe( processed_attack_data_path, - "master_challenge_train.csv", + PROCESSED_TRAIN_DATA_FILE_NAME, ) else: # If challenge data does not exist, collect it from the cluster diff --git a/examples/gan/ensemble_attack/train_attack_model.py b/examples/gan/ensemble_attack/train_attack_model.py index b4ebebdc..c255a45f 100644 --- a/examples/gan/ensemble_attack/train_attack_model.py +++ b/examples/gan/ensemble_attack/train_attack_model.py @@ -3,15 +3,15 @@ from pathlib import Path import hydra -from omegaconf import DictConfig, OmegaConf +from omegaconf import DictConfig from examples.ensemble_attack.run_metaclassifier_training import run_metaclassifier_training from examples.ensemble_attack.run_shadow_model_training import run_shadow_model_training, run_target_model_training from examples.gan.utils import get_single_table_svd_metadata, get_table_name +from examples.gan.ensemble_attack.utils import make_training_config from midst_toolkit.attacks.ensemble.data_utils import load_dataframe, save_dataframe -from midst_toolkit.attacks.ensemble.model import 
EnsembleAttackCTGANModelRunner, EnsembleAttackCTGANTrainingConfig -from midst_toolkit.attacks.ensemble.process_split_data import process_split_data -from midst_toolkit.attacks.ensemble.shadow_model_utils import save_additional_training_config +from midst_toolkit.attacks.ensemble.model import EnsembleAttackCTGANModelRunner +from midst_toolkit.attacks.ensemble.process_split_data import process_split_data, PROCESSED_TRAIN_DATA_FILE_NAME from midst_toolkit.common.logger import log from midst_toolkit.common.random import set_all_random_seeds @@ -58,28 +58,11 @@ def train_attack_model(config: DictConfig) -> None: random_seed=config.ensemble_attack.random_seed, ) - # Saving the model config from the config.yaml into a json file - # because that's what the ensemble attack code will be looking for - training_config_path = Path(config.ensemble_attack.shadow_training.training_json_config_paths.training_config_path) - training_config_path.unlink(missing_ok=True) - with open(training_config_path, "w") as f: - training_config = OmegaConf.to_container(config.ensemble_attack.shadow_training.model_config) - assert isinstance(training_config, dict), "Training config must be a dictionary." 
- training_config["general"] = { - "test_data_dir": config.base_data_dir, - "sample_prefix": "ctgan", - # The values below will be overriden - "exp_name": "", - "data_dir": "", - "workspace_dir": "", - } - json.dump(training_config, f) - if config.ensemble_attack.pipeline.run_shadow_model_training: log(INFO, "Training the shadow models...") master_challenge_train = load_dataframe( Path(config.ensemble_attack.data_paths.population_path), - "master_challenge_train.csv", + PROCESSED_TRAIN_DATA_FILE_NAME, ) table_name = get_table_name(config.base_data_dir) @@ -87,15 +70,9 @@ def train_attack_model(config: DictConfig) -> None: with open(domain_file_path, "r") as file: domain_dictionary = json.load(file) - training_config, _ = save_additional_training_config( - training_config_type=EnsembleAttackCTGANTrainingConfig, - data_dir=Path(config.base_data_dir), - training_config_json_path=training_config_path, - final_config_json_path=Path(config.base_data_dir) / f"{table_name}.json", # Path to the new json - experiment_name="pre_trained_model", - ) - metadata, _ = get_single_table_svd_metadata(master_challenge_train, domain_dictionary) + + training_config = make_training_config(config) training_config.metadata = metadata training_config.table_name = table_name diff --git a/examples/gan/ensemble_attack/utils.py b/examples/gan/ensemble_attack/utils.py new file mode 100644 index 00000000..c0e2e4f9 --- /dev/null +++ b/examples/gan/ensemble_attack/utils.py @@ -0,0 +1,32 @@ +from typing import Any +from pathlib import Path +import json + +from omegaconf import DictConfig, OmegaConf + +from midst_toolkit.attacks.ensemble.shadow_model_utils import setup_save_dir +from midst_toolkit.attacks.ensemble.model import EnsembleAttackCTGANTrainingConfig + + +def make_training_config(config: DictConfig) -> EnsembleAttackCTGANTrainingConfig: + # Saving the model config from the config.yaml into a json file + # because that's what the ensemble attack code will be looking for + 
training_config_path = Path(config.ensemble_attack.shadow_training.training_json_config_paths.training_config_path) + training_config_path.unlink(missing_ok=True) + with open(training_config_path, "w") as f: + training_config = OmegaConf.to_container(config.ensemble_attack.shadow_training.model_config, resolve=True) + assert isinstance(training_config, dict), "Training config must be a dictionary." + training_config["general"] = { + "test_data_dir": config.base_data_dir, + "sample_prefix": "ctgan", + "data_dir": config.base_data_dir, + "workspace_dir": str(Path(config.base_data_dir) / "shadow_workspace"), + "exp_name": "pre_trained_model", + } + json.dump(training_config, f) + + ctgan_training_config = EnsembleAttackCTGANTrainingConfig(**training_config) + + setup_save_dir(ctgan_training_config) + + return ctgan_training_config diff --git a/src/midst_toolkit/attacks/ensemble/model.py b/src/midst_toolkit/attacks/ensemble/model.py index c08029e3..037add81 100644 --- a/src/midst_toolkit/attacks/ensemble/model.py +++ b/src/midst_toolkit/attacks/ensemble/model.py @@ -127,7 +127,7 @@ def train_or_fine_tune_and_synthesize( if trained_model is None: # Train models - models = clava_training( + tables, models = clava_training( tables, relation_order, save_dir, @@ -193,7 +193,6 @@ class EnsembleAttackCTGANTrainingConfig(CTGANTrainingConfig, EnsembleAttackTrain class CTGANTrainingResult(EnsembleAttackTrainingResult): - save_dir: Path configs: EnsembleAttackCTGANTrainingConfig models: dict[Relation, CTGANModelArtifacts] diff --git a/src/midst_toolkit/attacks/ensemble/process_split_data.py b/src/midst_toolkit/attacks/ensemble/process_split_data.py index bbf0360f..cd566c8c 100644 --- a/src/midst_toolkit/attacks/ensemble/process_split_data.py +++ b/src/midst_toolkit/attacks/ensemble/process_split_data.py @@ -9,6 +9,12 @@ from midst_toolkit.common.logger import log +PROCESSED_TRAIN_DATA_FILE_NAME = "master_challenge_train.csv" +PROCESSED_TEST_DATA_FILE_NAME = 
"master_challenge_test.csv" +PROCESSED_TRAIN_LABELS_FILE_NAME = "master_challenge_train_labels.npy" +PROCESSED_TEST_LABELS_FILE_NAME = "master_challenge_test_labels.npy" + + def split_real_data( df_real: pd.DataFrame, column_to_stratify: str | None = None, @@ -208,14 +214,14 @@ def process_split_data( save_dataframe(df_real_val, processed_attack_data_path, "real_val.csv") save_dataframe(df_real_test, processed_attack_data_path, "real_test.csv") - save_dataframe(df_val, processed_attack_data_path, "master_challenge_train.csv") + save_dataframe(df_val, processed_attack_data_path, PROCESSED_TRAIN_DATA_FILE_NAME) np.save( - processed_attack_data_path / "master_challenge_train_labels.npy", + processed_attack_data_path / PROCESSED_TRAIN_LABELS_FILE_NAME, y_val, ) - save_dataframe(df_test, processed_attack_data_path, "master_challenge_test.csv") + save_dataframe(df_test, processed_attack_data_path, PROCESSED_TEST_DATA_FILE_NAME) np.save( - processed_attack_data_path / "master_challenge_test_labels.npy", + processed_attack_data_path / PROCESSED_TEST_LABELS_FILE_NAME, y_test, ) log(INFO, f"Data splits saved to {processed_attack_data_path}") diff --git a/src/midst_toolkit/attacks/ensemble/shadow_model_utils.py b/src/midst_toolkit/attacks/ensemble/shadow_model_utils.py index 12ef8225..e42a2e19 100644 --- a/src/midst_toolkit/attacks/ensemble/shadow_model_utils.py +++ b/src/midst_toolkit/attacks/ensemble/shadow_model_utils.py @@ -1,6 +1,5 @@ import json import os -from dataclasses import dataclass from logging import INFO from pathlib import Path from typing import Type diff --git a/tests/unit/attacks/ensemble/test_process_data_split.py b/tests/unit/attacks/ensemble/test_process_data_split.py index 80023a8b..327fcb49 100644 --- a/tests/unit/attacks/ensemble/test_process_data_split.py +++ b/tests/unit/attacks/ensemble/test_process_data_split.py @@ -5,7 +5,13 @@ from omegaconf import DictConfig from midst_toolkit.attacks.ensemble.data_utils import load_dataframe -from 
midst_toolkit.attacks.ensemble.process_split_data import process_split_data +from midst_toolkit.attacks.ensemble.process_split_data import ( + process_split_data, + PROCESSED_TRAIN_DATA_FILE_NAME, + PROCESSED_TEST_DATA_FILE_NAME, + PROCESSED_TRAIN_LABELS_FILE_NAME, + PROCESSED_TEST_LABELS_FILE_NAME, +) @pytest.fixture(scope="module") @@ -38,10 +44,10 @@ def test_process_split_data(cfg: DictConfig, tmp_path: Path) -> None: assert (output_dir / "real_test.csv").exists() # Assert that the master challenge data files are saved in the provided path - assert (output_dir / "master_challenge_train.csv").exists() - assert (output_dir / "master_challenge_train_labels.npy").exists() - assert (output_dir / "master_challenge_test.csv").exists() - assert (output_dir / "master_challenge_test_labels.npy").exists() + assert (output_dir / PROCESSED_TRAIN_DATA_FILE_NAME).exists() + assert (output_dir / PROCESSED_TRAIN_LABELS_FILE_NAME).exists() + assert (output_dir / PROCESSED_TEST_DATA_FILE_NAME).exists() + assert (output_dir / PROCESSED_TEST_LABELS_FILE_NAME).exists() # Assert that the collected data has the expected number of rows and columns real_train = load_dataframe(output_dir, "real_train.csv") @@ -57,11 +63,11 @@ def test_process_split_data(cfg: DictConfig, tmp_path: Path) -> None: # Recall that `master_challenge_train`` consists of two halves: one half (20 samples) from `real_val`` data # with their "is_train" column set to 0, and the other half (20 samples) from the real train data (`real_train``) # with their "is_train" column set to 1. Note that ["is_train"] column is dropped in the final dataframes. 
- master_challenge_train = load_dataframe(output_dir, "master_challenge_train.csv") + master_challenge_train = load_dataframe(output_dir, PROCESSED_TRAIN_DATA_FILE_NAME) assert master_challenge_train.shape == (40, 10), f" Shape is {master_challenge_train.shape}" # Recall that `master_challenge_test`` consists of two halves: one half (20 samples) from `real_test`` data # with their "is_train" column set to 0, and the other half (20 samples) from the real train data (`real_train``) # with their "is_train" column set to 1. Note that ["is_train"] column is dropped in the final dataframes. - master_challenge_test = load_dataframe(output_dir, "master_challenge_test.csv") + master_challenge_test = load_dataframe(output_dir, PROCESSED_TEST_DATA_FILE_NAME) assert master_challenge_test.shape == (40, 10), f" Shape is {master_challenge_test.shape}" From 2c3fa1ecbc5f6914a27577c7ef931cf9812092da Mon Sep 17 00:00:00 2001 From: Marcelo Lotif Date: Thu, 19 Mar 2026 12:03:59 -0400 Subject: [PATCH 26/38] Adding changes for the test model script --- .../run_shadow_model_training.py | 8 ++- examples/ensemble_attack/test_attack_model.py | 60 +++++++++++++++---- .../gan/ensemble_attack/test_attack_model.py | 15 ++++- .../gan/ensemble_attack/train_attack_model.py | 21 ++----- examples/gan/ensemble_attack/utils.py | 45 +++++++++++++- .../ensemble/rmia/shadow_model_training.py | 8 +-- 6 files changed, 117 insertions(+), 40 deletions(-) diff --git a/examples/ensemble_attack/run_shadow_model_training.py b/examples/ensemble_attack/run_shadow_model_training.py index e9459451..4792fca2 100644 --- a/examples/ensemble_attack/run_shadow_model_training.py +++ b/examples/ensemble_attack/run_shadow_model_training.py @@ -70,14 +70,16 @@ def run_target_model_training(model_runner: EnsembleAttackModelRunner, config: D def run_shadow_model_training( - model_runner: EnsembleAttackModelRunner, config: DictConfig, df_challenge_train: pd.DataFrame + model_runner: EnsembleAttackModelRunner, + config: 
DictConfig, + df_challenge_train: pd.DataFrame, ) -> list[Path]: """ Function to run the shadow model training for RMIA attack. Args: - model_runner: The model runner to be used for training the shadow models. Should be an instance of - a subclass of `EnsembleAttackModelRunner`. + model_runner: The model runner to be used for training the shadow models. + Should be an instance of `EnsembleAttackModelRunner`. config: Configuration object set in config.yaml. df_challenge_train: DataFrame containing the data that is used to train RMIA shadow models. diff --git a/examples/ensemble_attack/test_attack_model.py b/examples/ensemble_attack/test_attack_model.py index 4b403323..0b2dbf48 100644 --- a/examples/ensemble_attack/test_attack_model.py +++ b/examples/ensemble_attack/test_attack_model.py @@ -21,6 +21,11 @@ from examples.ensemble_attack.run_shadow_model_training import run_shadow_model_training from midst_toolkit.attacks.ensemble.blending import BlendingPlusPlus, MetaClassifierType from midst_toolkit.attacks.ensemble.data_utils import load_dataframe +from midst_toolkit.attacks.ensemble.model import ( + EnsembleAttackModelRunner, + EnsembleAttackTabDDPMModelRunner, + EnsembleAttackTabDDPMTrainingConfig, +) from midst_toolkit.common.logger import log from midst_toolkit.common.random import set_all_random_seeds from midst_toolkit.models.clavaddpm.train import get_df_without_id @@ -88,7 +93,11 @@ def extract_primary_id_column( return data_frame[id_column_name] -def run_rmia_shadow_training(config: DictConfig, df_challenge: pd.DataFrame) -> list[dict[str, list[Any]]]: +def run_rmia_shadow_training( + model_runner: EnsembleAttackModelRunner, + config: DictConfig, + df_challenge: pd.DataFrame, +) -> list[dict[str, list[Any]]]: """ Three sets of shadow models will be trained as a part of this attack. 
Note that shadow models need to be trained on the collection of challenge points once and used @@ -97,14 +106,16 @@ def run_rmia_shadow_training(config: DictConfig, df_challenge: pd.DataFrame) -> of the shadow models, and these shadow models are used to attack all target models. Args: - config: Configuration object set in ``experiments_config.yaml``. + model_runner: The model runner to be used for training the shadow models. + Should be an instance of `EnsembleAttackModelRunner`. + config: Configuration object set in config.yaml. df_challenge: DataFrame containing the challenge data points for shadow model training. Return: A list containing three dictionaries, each representing a collection of shadow models with their training data and generated synthetic outputs. """ - shadow_model_paths = run_shadow_model_training(config, df_challenge_train=df_challenge) + shadow_model_paths = run_shadow_model_training(model_runner, config, df_challenge_train=df_challenge) assert len(shadow_model_paths) == 3, "For testing, meta classifier needs the path to three sets of shadow models." @@ -255,7 +266,10 @@ def select_challenge_data_for_training( return df_challenge -def train_rmia_shadows_for_test_phase(config: DictConfig) -> list[dict[str, list[Any]]]: +def train_rmia_shadows_for_test_phase( + model_runner: EnsembleAttackModelRunner, + config: DictConfig, +) -> list[dict[str, list[Any]]]: """ Function to train RMIA shadow models for the testing phase using the dataset containing challenge data points. @@ -293,15 +307,10 @@ def train_rmia_shadows_for_test_phase(config: DictConfig) -> list[dict[str, list # Load the challenge dataframe for training RMIA shadow models. 
rmia_training_choice = RmiaTrainingDataChoice(config.target_model.attack_rmia_shadow_training_data_choice) df_challenge = select_challenge_data_for_training(rmia_training_choice, df_challenge_experiment, df_master_train) - return run_rmia_shadow_training(config, df_challenge=df_challenge) + return run_rmia_shadow_training(model_runner, config, df_challenge=df_challenge) -# TODO: Perform inference on all the target models sequentially in a single run instead of running this script -# multiple times. For more information, refer to https://app.clickup.com/t/868h4xk86 -@hydra.main(config_path="configs", config_name="experiment_config", version_base=None) -def run_metaclassifier_testing( - config: DictConfig, -) -> None: +def run_metaclassifier_testing(model_runner: EnsembleAttackModelRunner, config: DictConfig) -> None: """ Function to run the attack on a single target model using a trained metaclassifier. Note that RMIA shadow models need to be trained for every new set of target models on @@ -314,6 +323,8 @@ def run_metaclassifier_testing( Test prediction probabilities are saved to the specified attack result path in the config. Args: + model_runner: The model runner to be used for testing the metaclassifier. + Should be an instance of `EnsembleAttackModelRunner`. config: Configuration object set in ``experiments_config.yaml``. """ log( @@ -383,7 +394,7 @@ def run_metaclassifier_testing( if not models_exists: log(INFO, "Shadow models for testing phase do not exist. Training RMIA shadow models...") - shadow_data_collection = train_rmia_shadows_for_test_phase(config) + shadow_data_collection = train_rmia_shadows_for_test_phase(model_runner, config) else: log(INFO, "All shadow models for testing phase found. 
Using existing RMIA shadow models...") @@ -428,5 +439,28 @@ def run_metaclassifier_testing( save_results(attack_results_path, metaclassifier_model_name, probabilities, pred_score) +# TODO: Perform inference on all the target models sequentially in a single run instead of running this script +# multiple times. For more information, refer to https://app.clickup.com/t/868h4xk86 +@hydra.main(config_path="configs", config_name="experiment_config", version_base=None) +def run_metaclassifier_testing_with_tabddpm(config: DictConfig) -> None: + """ + Run the attack on a single target model using a trained metaclassifier. + RMIA shadow models will be trained using the TabDDPM model. + + Args: + config: Configuration object set in config.yaml. + """ + log(INFO, "Running metaclassifier testing with TabDDPM...") + + with open(config.shadow_training.training_json_config_paths.training_config_path, "r") as file: + training_config = EnsembleAttackTabDDPMTrainingConfig(**json.load(file)) + training_config.fine_tuning_diffusion_iterations = config.shadow_training.fine_tuning_config.fine_tune_diffusion_iterations + training_config.fine_tuning_classifier_iterations = config.shadow_training.fine_tuning_config.fine_tune_classifier_iterations + + model_runner = EnsembleAttackTabDDPMModelRunner(training_config=training_config) + + run_metaclassifier_testing(model_runner, config) + + if __name__ == "__main__": - run_metaclassifier_testing() + run_metaclassifier_testing_with_tabddpm() diff --git a/examples/gan/ensemble_attack/test_attack_model.py b/examples/gan/ensemble_attack/test_attack_model.py index d684402a..3e2551d8 100644 --- a/examples/gan/ensemble_attack/test_attack_model.py +++ b/examples/gan/ensemble_attack/test_attack_model.py @@ -4,17 +4,28 @@ from omegaconf import DictConfig from examples.ensemble_attack.test_attack_model import run_metaclassifier_testing +from examples.gan.ensemble_attack.utils import make_training_config +from midst_toolkit.attacks.ensemble.model import 
EnsembleAttackCTGANModelRunner from midst_toolkit.common.logger import log @hydra.main(config_path="./", config_name="config", version_base=None) def attack_model_test(config: DictConfig) -> None: - """Main function to test the attack model.""" + """ + Main function to test the attack model. + + Args: + config: The configuration object from the config.yaml file. + """ log( INFO, f"Testing attack model against synthetic data at {config.ensemble_attack.target_model.target_synthetic_data_path}...", ) - run_metaclassifier_testing(config.ensemble_attack) + + training_config = make_training_config(config) + model_runner = EnsembleAttackCTGANModelRunner(training_config=training_config) + + run_metaclassifier_testing(model_runner, config.ensemble_attack) if __name__ == "__main__": diff --git a/examples/gan/ensemble_attack/train_attack_model.py b/examples/gan/ensemble_attack/train_attack_model.py index c255a45f..03681b6b 100644 --- a/examples/gan/ensemble_attack/train_attack_model.py +++ b/examples/gan/ensemble_attack/train_attack_model.py @@ -7,11 +7,11 @@ from examples.ensemble_attack.run_metaclassifier_training import run_metaclassifier_training from examples.ensemble_attack.run_shadow_model_training import run_shadow_model_training, run_target_model_training -from examples.gan.utils import get_single_table_svd_metadata, get_table_name +from examples.gan.ensemble_attack.utils import get_master_challenge_train_data from examples.gan.ensemble_attack.utils import make_training_config from midst_toolkit.attacks.ensemble.data_utils import load_dataframe, save_dataframe from midst_toolkit.attacks.ensemble.model import EnsembleAttackCTGANModelRunner -from midst_toolkit.attacks.ensemble.process_split_data import process_split_data, PROCESSED_TRAIN_DATA_FILE_NAME +from midst_toolkit.attacks.ensemble.process_split_data import process_split_data from midst_toolkit.common.logger import log from midst_toolkit.common.random import set_all_random_seeds @@ -60,24 +60,11 @@ def 
train_attack_model(config: DictConfig) -> None: if config.ensemble_attack.pipeline.run_shadow_model_training: log(INFO, "Training the shadow models...") - master_challenge_train = load_dataframe( - Path(config.ensemble_attack.data_paths.population_path), - PROCESSED_TRAIN_DATA_FILE_NAME, - ) - - table_name = get_table_name(config.base_data_dir) - domain_file_path = Path(config.base_data_dir) / f"{table_name}_domain.json" - with open(domain_file_path, "r") as file: - domain_dictionary = json.load(file) - - metadata, _ = get_single_table_svd_metadata(master_challenge_train, domain_dictionary) - + training_config = make_training_config(config) - training_config.metadata = metadata - training_config.table_name = table_name - model_runner = EnsembleAttackCTGANModelRunner(training_config=training_config) + master_challenge_train = get_master_challenge_train_data(config) shadow_data_paths = run_shadow_model_training(model_runner, config.ensemble_attack, master_challenge_train) shadow_data_paths = [Path(path) for path in shadow_data_paths] diff --git a/examples/gan/ensemble_attack/utils.py b/examples/gan/ensemble_attack/utils.py index c0e2e4f9..5cf10f6a 100644 --- a/examples/gan/ensemble_attack/utils.py +++ b/examples/gan/ensemble_attack/utils.py @@ -1,14 +1,45 @@ -from typing import Any from pathlib import Path import json +import pandas as pd from omegaconf import DictConfig, OmegaConf from midst_toolkit.attacks.ensemble.shadow_model_utils import setup_save_dir +from midst_toolkit.attacks.ensemble.data_utils import load_dataframe +from midst_toolkit.attacks.ensemble.process_split_data import PROCESSED_TRAIN_DATA_FILE_NAME from midst_toolkit.attacks.ensemble.model import EnsembleAttackCTGANTrainingConfig +from examples.gan.utils import get_single_table_svd_metadata, get_table_name + + +def get_master_challenge_train_data(config: DictConfig) -> pd.DataFrame: + """ + Get the master challenge train data from the config's population path location. 
+
+    Args:
+        config: The configuration object.
+
+    Returns:
+        The dataframe containing the master challenge train data.
+    """
+    population_path = Path(config.ensemble_attack.data_paths.population_path)
+    assert population_path.exists(), f"Population path {population_path} does not exist. Please run the data processing pipeline first."
+
+    master_challenge_train = load_dataframe(population_path, PROCESSED_TRAIN_DATA_FILE_NAME)
+    return master_challenge_train
 
 
 def make_training_config(config: DictConfig) -> EnsembleAttackCTGANTrainingConfig:
+    """
+    Make the ensemble attack training config for the CTGAN model from the config.yaml file.
+
+    Saves the training config json file to the shadow training json config paths location.
+
+    Args:
+        config: The configuration object.
+
+    Returns:
+        The ensemble attack training config for the CTGAN model.
+    """
     # Saving the model config from the config.yaml into a json file
     # because that's what the ensemble attack code will be looking for
     training_config_path = Path(config.ensemble_attack.shadow_training.training_json_config_paths.training_config_path)
@@ -29,4 +60,16 @@ def make_training_config(config: DictConfig) -> EnsembleAttackCTGANTrainingConfi
     setup_save_dir(ctgan_training_config)
 
+    master_challenge_train = get_master_challenge_train_data(config)
+
+    table_name = get_table_name(config.base_data_dir)
+    domain_file_path = Path(config.base_data_dir) / f"{table_name}_domain.json"
+    with open(domain_file_path, "r") as file:
+        domain_dictionary = json.load(file)
+
+    metadata, _ = get_single_table_svd_metadata(master_challenge_train, domain_dictionary)
+
+    ctgan_training_config.metadata = metadata
+    ctgan_training_config.table_name = table_name
+
     return ctgan_training_config
diff --git a/src/midst_toolkit/attacks/ensemble/rmia/shadow_model_training.py b/src/midst_toolkit/attacks/ensemble/rmia/shadow_model_training.py
index 065f87b0..6ea9947f 100644
--- a/src/midst_toolkit/attacks/ensemble/rmia/shadow_model_training.py
+++ 
b/src/midst_toolkit/attacks/ensemble/rmia/shadow_model_training.py @@ -200,8 +200,8 @@ def train_shadow_on_half_challenge_data( 3. A synthetic dataset of 20K observations is generated for each model. Args: - model_runner: The model runner to be used for training the shadow models. Should be an instance of - a subclass of `EnsembleAttackModelRunner`. + model_runner: The model runner to be used for training the shadow models. + Should be an instance of `EnsembleAttackModelRunner`. n_models: number of shadow models to train, must be even. n_reps: number of repetitions for each challenge point in the fine-tuning set. master_challenge_data: The master challenge training dataset. @@ -321,8 +321,8 @@ def train_three_sets_of_shadow_models( Args: - model_runner: The model runner to be used for training the shadow models. Should be an instance of - a subclass of `EnsembleAttackModelRunner`. + model_runner: The model runner to be used for training the shadow models. + Should be an instance of `EnsembleAttackModelRunner`. population_data: The total population data used for pre-training some of the shadow models. master_challenge_data: The master challenge training dataset. 
shadow_models_output_path: Path where the all datasets and information (configs) necessary to From ca87ac3bf66191c3fa4315469d665fa72fa610ca Mon Sep 17 00:00:00 2001 From: Marcelo Lotif Date: Thu, 19 Mar 2026 12:04:27 -0400 Subject: [PATCH 27/38] Linter changes --- examples/ensemble_attack/test_attack_model.py | 10 +++++++--- examples/gan/ensemble_attack/test_attack_model.py | 2 +- examples/gan/ensemble_attack/train_attack_model.py | 6 ++---- examples/gan/ensemble_attack/utils.py | 12 +++++++----- 4 files changed, 17 insertions(+), 13 deletions(-) diff --git a/examples/ensemble_attack/test_attack_model.py b/examples/ensemble_attack/test_attack_model.py index 0b2dbf48..9a14bf28 100644 --- a/examples/ensemble_attack/test_attack_model.py +++ b/examples/ensemble_attack/test_attack_model.py @@ -26,10 +26,10 @@ EnsembleAttackTabDDPMModelRunner, EnsembleAttackTabDDPMTrainingConfig, ) +from midst_toolkit.attacks.ensemble.process_split_data import PROCESSED_TRAIN_DATA_FILE_NAME from midst_toolkit.common.logger import log from midst_toolkit.common.random import set_all_random_seeds from midst_toolkit.models.clavaddpm.train import get_df_without_id -from midst_toolkit.attacks.ensemble.process_split_data import PROCESSED_TRAIN_DATA_FILE_NAME class RmiaTrainingDataChoice(Enum): @@ -454,8 +454,12 @@ def run_metaclassifier_testing_with_tabddpm(config: DictConfig) -> None: with open(config.shadow_training.training_json_config_paths.training_config_path, "r") as file: training_config = EnsembleAttackTabDDPMTrainingConfig(**json.load(file)) - training_config.fine_tuning_diffusion_iterations = config.shadow_training.fine_tuning_config.fine_tune_diffusion_iterations - training_config.fine_tuning_classifier_iterations = config.shadow_training.fine_tuning_config.fine_tune_classifier_iterations + training_config.fine_tuning_diffusion_iterations = ( + config.shadow_training.fine_tuning_config.fine_tune_diffusion_iterations + ) + training_config.fine_tuning_classifier_iterations = ( + 
config.shadow_training.fine_tuning_config.fine_tune_classifier_iterations + ) model_runner = EnsembleAttackTabDDPMModelRunner(training_config=training_config) diff --git a/examples/gan/ensemble_attack/test_attack_model.py b/examples/gan/ensemble_attack/test_attack_model.py index 3e2551d8..9c093bc0 100644 --- a/examples/gan/ensemble_attack/test_attack_model.py +++ b/examples/gan/ensemble_attack/test_attack_model.py @@ -13,7 +13,7 @@ def attack_model_test(config: DictConfig) -> None: """ Main function to test the attack model. - + Args: config: The configuration object from the config.yaml file. """ diff --git a/examples/gan/ensemble_attack/train_attack_model.py b/examples/gan/ensemble_attack/train_attack_model.py index 03681b6b..a302bfa9 100644 --- a/examples/gan/ensemble_attack/train_attack_model.py +++ b/examples/gan/ensemble_attack/train_attack_model.py @@ -1,4 +1,3 @@ -import json from logging import INFO from pathlib import Path @@ -7,8 +6,7 @@ from examples.ensemble_attack.run_metaclassifier_training import run_metaclassifier_training from examples.ensemble_attack.run_shadow_model_training import run_shadow_model_training, run_target_model_training -from examples.gan.ensemble_attack.utils import get_master_challenge_train_data -from examples.gan.ensemble_attack.utils import make_training_config +from examples.gan.ensemble_attack.utils import get_master_challenge_train_data, make_training_config from midst_toolkit.attacks.ensemble.data_utils import load_dataframe, save_dataframe from midst_toolkit.attacks.ensemble.model import EnsembleAttackCTGANModelRunner from midst_toolkit.attacks.ensemble.process_split_data import process_split_data @@ -60,7 +58,7 @@ def train_attack_model(config: DictConfig) -> None: if config.ensemble_attack.pipeline.run_shadow_model_training: log(INFO, "Training the shadow models...") - + training_config = make_training_config(config) model_runner = EnsembleAttackCTGANModelRunner(training_config=training_config) diff --git 
a/examples/gan/ensemble_attack/utils.py b/examples/gan/ensemble_attack/utils.py index 5cf10f6a..0ab07446 100644 --- a/examples/gan/ensemble_attack/utils.py +++ b/examples/gan/ensemble_attack/utils.py @@ -1,14 +1,14 @@ -from pathlib import Path import json +from pathlib import Path import pandas as pd from omegaconf import DictConfig, OmegaConf -from midst_toolkit.attacks.ensemble.shadow_model_utils import setup_save_dir +from examples.gan.utils import get_single_table_svd_metadata, get_table_name from midst_toolkit.attacks.ensemble.data_utils import load_dataframe -from midst_toolkit.attacks.ensemble.process_split_data import PROCESSED_TRAIN_DATA_FILE_NAME from midst_toolkit.attacks.ensemble.model import EnsembleAttackCTGANTrainingConfig -from examples.gan.utils import get_single_table_svd_metadata, get_table_name +from midst_toolkit.attacks.ensemble.process_split_data import PROCESSED_TRAIN_DATA_FILE_NAME +from midst_toolkit.attacks.ensemble.shadow_model_utils import setup_save_dir def get_master_challenge_train_data(config: DictConfig) -> pd.DataFrame: @@ -22,7 +22,9 @@ def get_master_challenge_train_data(config: DictConfig) -> pd.DataFrame: The dataframe containing the master challenge train data. """ population_path = Path(config.ensemble_attack.data_paths.population_path) - assert population_path.exists(), f"Population path {population_path} does not exist. Please run the data processing pipeline first." + assert population_path.exists(), ( + f"Population path {population_path} does not exist. Please run the data processing pipeline first." 
+ ) master_challenge_train = load_dataframe(population_path, PROCESSED_TRAIN_DATA_FILE_NAME) return master_challenge_train From c42ee6eef187531157cad217fdda3217d78138e0 Mon Sep 17 00:00:00 2001 From: Marcelo Lotif Date: Thu, 19 Mar 2026 13:47:35 -0400 Subject: [PATCH 28/38] Fixing mypy and ruff --- examples/ensemble_attack/run_attack.py | 12 ++++-- .../run_metaclassifier_training.py | 6 +-- examples/ensemble_attack/test_attack_model.py | 2 + examples/gan/ensemble_attack/utils.py | 5 +-- src/midst_toolkit/attacks/ensemble/model.py | 38 +++++++++++-------- .../attacks/ensemble/shadow_model_utils.py | 13 +++---- .../ensemble/test_process_data_split.py | 6 +-- 7 files changed, 45 insertions(+), 37 deletions(-) diff --git a/examples/ensemble_attack/run_attack.py b/examples/ensemble_attack/run_attack.py index 2d73b36d..dff9b644 100644 --- a/examples/ensemble_attack/run_attack.py +++ b/examples/ensemble_attack/run_attack.py @@ -3,11 +3,11 @@ provided resources and data. """ +import json from logging import INFO from pathlib import Path import hydra -import json from omegaconf import DictConfig import examples.ensemble_attack.run_metaclassifier_training as meta_pipeline @@ -15,7 +15,7 @@ from examples.ensemble_attack.real_data_collection import COLLECTED_DATA_FILE_NAME, collect_population_data_ensemble from midst_toolkit.attacks.ensemble.data_utils import load_dataframe from midst_toolkit.attacks.ensemble.model import EnsembleAttackTabDDPMModelRunner, EnsembleAttackTabDDPMTrainingConfig -from midst_toolkit.attacks.ensemble.process_split_data import process_split_data, PROCESSED_TRAIN_DATA_FILE_NAME +from midst_toolkit.attacks.ensemble.process_split_data import PROCESSED_TRAIN_DATA_FILE_NAME, process_split_data from midst_toolkit.common.logger import log from midst_toolkit.common.random import set_all_random_seeds @@ -86,8 +86,12 @@ def main(config: DictConfig) -> None: with open(config.shadow_training.training_json_config_paths.training_config_path, "r") as file: 
training_config = EnsembleAttackTabDDPMTrainingConfig(**json.load(file)) - training_config.fine_tuning_diffusion_iterations = config.shadow_training.fine_tuning_config.fine_tune_diffusion_iterations - training_config.fine_tuning_classifier_iterations = config.shadow_training.fine_tuning_config.fine_tune_classifier_iterations + training_config.fine_tuning_diffusion_iterations = ( + config.shadow_training.fine_tuning_config.fine_tune_diffusion_iterations + ) + training_config.fine_tuning_classifier_iterations = ( + config.shadow_training.fine_tuning_config.fine_tune_classifier_iterations + ) model_runner = EnsembleAttackTabDDPMModelRunner(training_config=training_config) diff --git a/examples/ensemble_attack/run_metaclassifier_training.py b/examples/ensemble_attack/run_metaclassifier_training.py index c94e5e65..1c3abbf7 100644 --- a/examples/ensemble_attack/run_metaclassifier_training.py +++ b/examples/ensemble_attack/run_metaclassifier_training.py @@ -9,13 +9,13 @@ from examples.ensemble_attack.real_data_collection import COLLECTED_DATA_FILE_NAME from midst_toolkit.attacks.ensemble.blending import BlendingPlusPlus, MetaClassifierType from midst_toolkit.attacks.ensemble.data_utils import load_dataframe -from midst_toolkit.common.logger import log from midst_toolkit.attacks.ensemble.process_split_data import ( - PROCESSED_TRAIN_DATA_FILE_NAME, PROCESSED_TEST_DATA_FILE_NAME, - PROCESSED_TRAIN_LABELS_FILE_NAME, PROCESSED_TEST_LABELS_FILE_NAME, + PROCESSED_TRAIN_DATA_FILE_NAME, + PROCESSED_TRAIN_LABELS_FILE_NAME, ) +from midst_toolkit.common.logger import log def run_metaclassifier_training( diff --git a/examples/ensemble_attack/test_attack_model.py b/examples/ensemble_attack/test_attack_model.py index 9a14bf28..81024680 100644 --- a/examples/ensemble_attack/test_attack_model.py +++ b/examples/ensemble_attack/test_attack_model.py @@ -275,6 +275,8 @@ def train_rmia_shadows_for_test_phase( challenge data points. 
Args: + model_runner: The model runner to be used for training the shadow models. + Should be an instance of `EnsembleAttackModelRunner`. config: Configuration object set in ``experiments_config.yaml``. Returns: diff --git a/examples/gan/ensemble_attack/utils.py b/examples/gan/ensemble_attack/utils.py index 0ab07446..9c9d7afa 100644 --- a/examples/gan/ensemble_attack/utils.py +++ b/examples/gan/ensemble_attack/utils.py @@ -26,8 +26,7 @@ def get_master_challenge_train_data(config: DictConfig) -> pd.DataFrame: f"Population path {population_path} does not exist. Please run the data processing pipeline first." ) - master_challenge_train = load_dataframe(population_path, PROCESSED_TRAIN_DATA_FILE_NAME) - return master_challenge_train + return load_dataframe(population_path, PROCESSED_TRAIN_DATA_FILE_NAME) def make_training_config(config: DictConfig) -> EnsembleAttackCTGANTrainingConfig: @@ -58,7 +57,7 @@ def make_training_config(config: DictConfig) -> EnsembleAttackCTGANTrainingConfi } json.dump(training_config, f) - ctgan_training_config = EnsembleAttackCTGANTrainingConfig(**training_config) + ctgan_training_config = EnsembleAttackCTGANTrainingConfig(**training_config) # type: ignore[arg-type] setup_save_dir(ctgan_training_config) diff --git a/src/midst_toolkit/attacks/ensemble/model.py b/src/midst_toolkit/attacks/ensemble/model.py index 037add81..1d2f9818 100644 --- a/src/midst_toolkit/attacks/ensemble/model.py +++ b/src/midst_toolkit/attacks/ensemble/model.py @@ -1,34 +1,32 @@ -""" -Module containing the base classes and implementations for the Ensemble Attack model runner and training result. 
-""" +"""Module containing the base classes and implementations for the Ensemble Attack model runner and training result.""" +import copy from abc import ABC, abstractmethod +from logging import INFO from pathlib import Path from typing import Any -from logging import INFO -import copy import pandas as pd from pydantic import BaseModel, ConfigDict -from sdv.single_table import CTGANSynthesizer # type: ignore[import-untyped] from sdv.metadata import SingleTableMetadata # type: ignore[import-untyped] +from sdv.single_table import CTGANSynthesizer # type: ignore[import-untyped] +from midst_toolkit.attacks.ensemble.clavaddpm_fine_tuning import clava_fine_tuning from midst_toolkit.common.config import ClavaDDPMTrainingConfig, CTGANTrainingConfig, TrainingConfig +from midst_toolkit.common.logger import log +from midst_toolkit.common.variables import DEVICE +from midst_toolkit.models.clavaddpm.clustering import clava_clustering from midst_toolkit.models.clavaddpm.data_loaders import Tables, load_tables from midst_toolkit.models.clavaddpm.enumerations import GroupLengthsProbDicts, Relation, RelationOrder -from midst_toolkit.models.clavaddpm.train import ClavaDDPMModelArtifacts, CTGANModelArtifacts -from midst_toolkit.models.clavaddpm.clustering import clava_clustering -from midst_toolkit.models.clavaddpm.train import clava_training from midst_toolkit.models.clavaddpm.synthesizer import clava_synthesizing -from midst_toolkit.common.variables import DEVICE -from midst_toolkit.common.logger import log -from midst_toolkit.attacks.ensemble.clavaddpm_fine_tuning import clava_fine_tuning +from midst_toolkit.models.clavaddpm.train import ClavaDDPMModelArtifacts, CTGANModelArtifacts, clava_training # Base Classes class EnsembleAttackTrainingConfig(TrainingConfig): number_of_points_to_synthesize: int = 20000 + class EnsembleAttackTrainingResult(BaseModel): model_config = ConfigDict(arbitrary_types_allowed=True) @@ -85,6 +83,8 @@ class 
TabDDPMTrainingResult(EnsembleAttackTrainingResult): class EnsembleAttackTabDDPMModelRunner(EnsembleAttackModelRunner): + training_config: EnsembleAttackTabDDPMTrainingConfig + def train_or_fine_tune_and_synthesize( self, dataset: pd.DataFrame, @@ -188,8 +188,8 @@ def train_or_fine_tune_and_synthesize( class EnsembleAttackCTGANTrainingConfig(CTGANTrainingConfig, EnsembleAttackTrainingConfig): model_config = ConfigDict(arbitrary_types_allowed=True) - metadata: SingleTableMetadata = None - table_name: str = None + metadata: SingleTableMetadata | None = None + table_name: str | None = None class CTGANTrainingResult(EnsembleAttackTrainingResult): @@ -198,6 +198,8 @@ class CTGANTrainingResult(EnsembleAttackTrainingResult): class EnsembleAttackCTGANModelRunner(EnsembleAttackModelRunner): + training_config: EnsembleAttackCTGANTrainingConfig + def train_or_fine_tune_and_synthesize( self, dataset: pd.DataFrame, @@ -227,7 +229,9 @@ def train_or_fine_tune_and_synthesize( assert self.training_config.metadata is not None, "Metadata is not set" assert self.training_config.table_name is not None, "Table name is not set" - dataset_without_ids = dataset.drop(columns=[column_name for column_name in dataset.columns if "_id" in column_name]) + dataset_without_ids = dataset.drop( + columns=[column_name for column_name in dataset.columns if "_id" in column_name] + ) if trained_model is None: log(INFO, "Training new CTGAN model...") @@ -253,7 +257,9 @@ def train_or_fine_tune_and_synthesize( result = CTGANTrainingResult( save_dir=save_dir, configs=self.training_config, - models={(None, self.training_config.table_name): CTGANModelArtifacts(model=ctgan, model_file_path=results_file)}, + models={ + (None, self.training_config.table_name): CTGANModelArtifacts(model=ctgan, model_file_path=results_file) + }, ) if synthesize: diff --git a/src/midst_toolkit/attacks/ensemble/shadow_model_utils.py b/src/midst_toolkit/attacks/ensemble/shadow_model_utils.py index e42a2e19..0033c340 100644 --- 
a/src/midst_toolkit/attacks/ensemble/shadow_model_utils.py +++ b/src/midst_toolkit/attacks/ensemble/shadow_model_utils.py @@ -2,14 +2,12 @@ import os from logging import INFO from pathlib import Path -from typing import Type from midst_toolkit.common.config import TrainingConfig from midst_toolkit.common.logger import log def save_additional_training_config( - training_config_type: Type[TrainingConfig], data_dir: Path, training_config_json_path: Path, final_config_json_path: Path, @@ -21,7 +19,6 @@ def save_additional_training_config( and loads the resulting configuration. Args: - training_config_type: The type of the training config to be used for training the shadow models. data_dir: Directory containing dataset_meta.json, trans_domain.json, and trans.json files. training_config_json_path: Path to the original TabDDPM training configuration JSON file. final_config_json_path: Path where the modified configuration JSON file will be saved. @@ -34,16 +31,16 @@ def save_additional_training_config( """ # Modify the config file to give the correct training data and saving directory with open(training_config_json_path, "r") as file: - configs = training_config_type(**json.load(file)) + configs = json.load(file) - configs.general.data_dir = data_dir + configs["general"]["data_dir"] = data_dir # Save dir is set by joining the workspace_dir and exp_name - configs.general.workspace_dir = data_dir / workspace_name - configs.general.exp_name = experiment_name + configs["general"]["workspace_dir"] = data_dir / workspace_name + configs["general"]["exp_name"] = experiment_name # save the changed to the new json file with open(final_config_json_path, "w") as file: - json.dump(configs.model_dump(mode="json"), file, indent=4) + json.dump(configs, file, indent=4) log(INFO, f"Config saved to {final_config_json_path}") diff --git a/tests/unit/attacks/ensemble/test_process_data_split.py b/tests/unit/attacks/ensemble/test_process_data_split.py index 327fcb49..c67236d0 100644 --- 
a/tests/unit/attacks/ensemble/test_process_data_split.py +++ b/tests/unit/attacks/ensemble/test_process_data_split.py @@ -6,11 +6,11 @@ from midst_toolkit.attacks.ensemble.data_utils import load_dataframe from midst_toolkit.attacks.ensemble.process_split_data import ( - process_split_data, - PROCESSED_TRAIN_DATA_FILE_NAME, PROCESSED_TEST_DATA_FILE_NAME, - PROCESSED_TRAIN_LABELS_FILE_NAME, PROCESSED_TEST_LABELS_FILE_NAME, + PROCESSED_TRAIN_DATA_FILE_NAME, + PROCESSED_TRAIN_LABELS_FILE_NAME, + process_split_data, ) From 093b0e4ae3d8e316c3182df630fcd87e101d8c8c Mon Sep 17 00:00:00 2001 From: Marcelo Lotif Date: Thu, 19 Mar 2026 14:23:23 -0400 Subject: [PATCH 29/38] Tests passing --- .gitignore | 1 + .../ensemble/rmia/shadow_model_training.py | 3 +- .../attacks/ensemble/shadow_model_utils.py | 14 ++-- .../ensemble/assets/data_configs/trans.json | 2 +- .../ensemble/test_shadow_model_training.py | 64 ++++++++++++------- 5 files changed, 53 insertions(+), 31 deletions(-) diff --git a/.gitignore b/.gitignore index 08ea1217..b33edd6e 100644 --- a/.gitignore +++ b/.gitignore @@ -47,6 +47,7 @@ site/ # Test artifacts tests/integration/attacks/tartan_federer/assets/tabddpm_models/**/challenge_label_predictions.csv tests/integration/attacks/tartan_federer/assets/tartan_federer_attack_results +tests/integration/attacks/ensemble/assets/workspace # Training Logs *.err diff --git a/src/midst_toolkit/attacks/ensemble/rmia/shadow_model_training.py b/src/midst_toolkit/attacks/ensemble/rmia/shadow_model_training.py index 6ea9947f..3155dc30 100644 --- a/src/midst_toolkit/attacks/ensemble/rmia/shadow_model_training.py +++ b/src/midst_toolkit/attacks/ensemble/rmia/shadow_model_training.py @@ -115,7 +115,8 @@ def train_fine_tuned_shadow_models( log(INFO, f"Training initial model with runner {model_runner}. 
Model ID {init_model_id}...") initial_model_training_results = model_runner.train_or_fine_tune_and_synthesize( - dataset=train, synthesize=False + dataset=train, + synthesize=False, ) # Save the initial model diff --git a/src/midst_toolkit/attacks/ensemble/shadow_model_utils.py b/src/midst_toolkit/attacks/ensemble/shadow_model_utils.py index 0033c340..91766670 100644 --- a/src/midst_toolkit/attacks/ensemble/shadow_model_utils.py +++ b/src/midst_toolkit/attacks/ensemble/shadow_model_utils.py @@ -8,6 +8,7 @@ def save_additional_training_config( + config_type: type[TrainingConfig], data_dir: Path, training_config_json_path: Path, final_config_json_path: Path, @@ -19,6 +20,7 @@ def save_additional_training_config( and loads the resulting configuration. Args: + config_type: Type of the training configuration to load. data_dir: Directory containing dataset_meta.json, trans_domain.json, and trans.json files. training_config_json_path: Path to the original TabDDPM training configuration JSON file. final_config_json_path: Path where the modified configuration JSON file will be saved. @@ -26,21 +28,21 @@ def save_additional_training_config( workspace_name: Name of the workspace, used to create a unique save directory. Returns: - configs: Loaded configuration dictionary for the model type. + configs: Loaded configuration dictionary for the given config type. save_dir: Directory path where results will be saved. 
""" # Modify the config file to give the correct training data and saving directory with open(training_config_json_path, "r") as file: - configs = json.load(file) + configs = config_type(**json.load(file)) - configs["general"]["data_dir"] = data_dir + configs.general.data_dir = data_dir # Save dir is set by joining the workspace_dir and exp_name - configs["general"]["workspace_dir"] = data_dir / workspace_name - configs["general"]["exp_name"] = experiment_name + configs.general.workspace_dir = data_dir / workspace_name + configs.general.exp_name = experiment_name # save the changed to the new json file with open(final_config_json_path, "w") as file: - json.dump(configs, file, indent=4) + json.dump(configs.model_dump(mode="json"), file, indent=4) log(INFO, f"Config saved to {final_config_json_path}") diff --git a/tests/integration/attacks/ensemble/assets/data_configs/trans.json b/tests/integration/attacks/ensemble/assets/data_configs/trans.json index d9a786df..2a77b82a 100644 --- a/tests/integration/attacks/ensemble/assets/data_configs/trans.json +++ b/tests/integration/attacks/ensemble/assets/data_configs/trans.json @@ -1,6 +1,6 @@ { "general": { - "data_dir": "tests/integration/attacks/ensemble/assets/shadow_models_data", + "data_dir": "tests/integration/attacks/ensemble/assets/data_configs", "exp_name": "ensemble_attack", "workspace_dir": "tests/integration/attacks/ensemble/assets/workspace", "sample_prefix": "", diff --git a/tests/integration/attacks/ensemble/test_shadow_model_training.py b/tests/integration/attacks/ensemble/test_shadow_model_training.py index 8008f97d..f3bd058e 100644 --- a/tests/integration/attacks/ensemble/test_shadow_model_training.py +++ b/tests/integration/attacks/ensemble/test_shadow_model_training.py @@ -1,8 +1,8 @@ import copy +import json import pickle import shutil from pathlib import Path -from typing import cast import pandas as pd import pytest @@ -10,16 +10,12 @@ from omegaconf import DictConfig from 
midst_toolkit.attacks.ensemble.data_utils import load_dataframe +from midst_toolkit.attacks.ensemble.model import EnsembleAttackTabDDPMModelRunner, EnsembleAttackTabDDPMTrainingConfig from midst_toolkit.attacks.ensemble.rmia.shadow_model_training import ( train_fine_tuned_shadow_models, train_shadow_on_half_challenge_data, ) -from midst_toolkit.attacks.ensemble.shadow_model_utils import ( - fine_tune_tabddpm_and_synthesize, - save_additional_training_config, - train_tabddpm_and_synthesize, -) -from midst_toolkit.common.config import ClavaDDPMTrainingConfig +from midst_toolkit.attacks.ensemble.shadow_model_utils import save_additional_training_config POPULATION_DATA = load_dataframe( @@ -42,21 +38,30 @@ def test_train_fine_tuned_shadow_models(cfg: DictConfig, tmp_path: Path) -> None shadow_models_output_path = tmp_path # Input # Population data is used to pre-train some of the shadow models. + with open(cfg.shadow_training.training_json_config_paths.training_config_path, "r") as file: + training_config = EnsembleAttackTabDDPMTrainingConfig(**json.load(file)) + training_config.fine_tuning_diffusion_iterations = ( + cfg.shadow_training.fine_tuning_config.fine_tune_diffusion_iterations + ) + training_config.fine_tuning_classifier_iterations = ( + cfg.shadow_training.fine_tuning_config.fine_tune_classifier_iterations + ) + training_config.number_of_points_to_synthesize = 5 + model_runner = EnsembleAttackTabDDPMModelRunner(training_config) result_path = train_fine_tuned_shadow_models( + model_runner=model_runner, n_models=2, n_reps=1, population_data=POPULATION_DATA, master_challenge_data=POPULATION_DATA[0:20], # Limiting the data to 20 samples for faster test execution shadow_models_output_path=shadow_models_output_path, training_json_config_paths=cfg.shadow_training.training_json_config_paths, - fine_tuning_config=cfg.shadow_training.fine_tuning_config, init_model_id=1, init_data_seed=cfg.random_seed, table_name="trans", id_column_name="trans_id", 
pre_training_data_size=cfg.shadow_training.fine_tuning_config.pre_train_data_size, - number_of_points_to_synthesize=5, random_seed=cfg.random_seed, ) # Expected saved models and synthesized data: @@ -87,7 +92,19 @@ def test_train_shadow_on_half_challenge_data(cfg: DictConfig, tmp_path: Path) -> shadow_models_output_path = tmp_path # Input # Population data is loaded and used as challenge data for testing purposes. + with open(cfg.shadow_training.training_json_config_paths.training_config_path, "r") as file: + training_config = EnsembleAttackTabDDPMTrainingConfig(**json.load(file)) + training_config.fine_tuning_diffusion_iterations = ( + cfg.shadow_training.fine_tuning_config.fine_tune_diffusion_iterations + ) + training_config.fine_tuning_classifier_iterations = ( + cfg.shadow_training.fine_tuning_config.fine_tune_classifier_iterations + ) + training_config.number_of_points_to_synthesize = 5 + + model_runner = EnsembleAttackTabDDPMModelRunner(training_config) result_path = train_shadow_on_half_challenge_data( + model_runner=model_runner, n_models=2, n_reps=1, master_challenge_data=POPULATION_DATA[0:40], # Limiting the data to 40 samples for faster test execution @@ -95,7 +112,6 @@ def test_train_shadow_on_half_challenge_data(cfg: DictConfig, tmp_path: Path) -> training_json_config_paths=cfg.shadow_training.training_json_config_paths, table_name="trans", id_column_name="trans_id", - number_of_points_to_synthesize=5, random_seed=cfg.random_seed, ) # Expected saved models and synthesized data: @@ -137,7 +153,8 @@ def test_train_and_fine_tune_tabddpm(cfg: DictConfig, tmp_path: Path) -> None: cfg.shadow_training.training_json_config_paths.dataset_meta_file_path, tmp_training_dir / "dataset_meta.json", ) - configs, save_dir = save_additional_training_config( + configs, _ = save_additional_training_config( + config_type=EnsembleAttackTabDDPMTrainingConfig, data_dir=tmp_training_dir, training_config_json_path=training_config_path, final_config_json_path=tmp_training_dir / 
"trans.json", @@ -145,13 +162,14 @@ def test_train_and_fine_tune_tabddpm(cfg: DictConfig, tmp_path: Path) -> None: workspace_name="test_workspace", ) - train_result = train_tabddpm_and_synthesize( + configs.number_of_points_to_synthesize = 99 + model_runner = EnsembleAttackTabDDPMModelRunner(configs) + + train_result = model_runner.train_or_fine_tune_and_synthesize( train_set, - cast(ClavaDDPMTrainingConfig, configs), - save_dir, synthesize=True, - number_of_points_to_synthesize=99, ) + assert train_result.synthetic_data is not None assert type(train_result.synthetic_data) is pd.DataFrame assert len(train_result.synthetic_data) == 99 @@ -161,16 +179,16 @@ def test_train_and_fine_tune_tabddpm(cfg: DictConfig, tmp_path: Path) -> None: assert len(train_result.models) == 1 # Only one model (TabDDPM) is trained. # Now fine-tune the trained TabDDPM model on a small set of data - fine_tuned_results = fine_tune_tabddpm_and_synthesize( - trained_models=train_result.models, - fine_tune_set=fine_tuning_set, # fine-tuning on the same data for testing purposes - configs=cast(ClavaDDPMTrainingConfig, configs), - save_dir=save_dir, - fine_tuning_diffusion_iterations=cfg.shadow_training.fine_tuning_config.fine_tune_diffusion_iterations, - fine_tuning_classifier_iterations=cfg.shadow_training.fine_tuning_config.fine_tune_classifier_iterations, - # Number of synthetic samples is defined according to tabddpm_training_config's classifier_scale value. 
+ configs.fine_tuning_diffusion_iterations = cfg.shadow_training.fine_tuning_config.fine_tune_diffusion_iterations + configs.fine_tuning_classifier_iterations = cfg.shadow_training.fine_tuning_config.fine_tune_classifier_iterations + model_runner = EnsembleAttackTabDDPMModelRunner(configs) + + fine_tuned_results = model_runner.train_or_fine_tune_and_synthesize( + dataset=fine_tuning_set, synthesize=False, + trained_model=train_result, ) + assert fine_tuned_results.synthetic_data is None assert fine_tuned_results.models is not None assert type(fine_tuned_results.models) is dict From d50ff391b3682ba763520af40b688f57c74ecaf3 Mon Sep 17 00:00:00 2001 From: Marcelo Lotif Date: Thu, 19 Mar 2026 14:29:46 -0400 Subject: [PATCH 30/38] renaming model to models --- examples/ensemble_attack/run_attack.py | 2 +- examples/ensemble_attack/run_shadow_model_training.py | 6 ++++-- examples/ensemble_attack/test_attack_model.py | 2 +- examples/gan/ensemble_attack/test_attack_model.py | 2 +- examples/gan/ensemble_attack/train_attack_model.py | 2 +- examples/gan/ensemble_attack/utils.py | 2 +- src/midst_toolkit/attacks/ensemble/{model.py => models.py} | 0 .../attacks/ensemble/rmia/shadow_model_training.py | 2 +- .../attacks/ensemble/test_shadow_model_training.py | 2 +- 9 files changed, 11 insertions(+), 9 deletions(-) rename src/midst_toolkit/attacks/ensemble/{model.py => models.py} (100%) diff --git a/examples/ensemble_attack/run_attack.py b/examples/ensemble_attack/run_attack.py index dff9b644..c60663a8 100644 --- a/examples/ensemble_attack/run_attack.py +++ b/examples/ensemble_attack/run_attack.py @@ -14,7 +14,7 @@ import examples.ensemble_attack.run_shadow_model_training as shadow_pipeline from examples.ensemble_attack.real_data_collection import COLLECTED_DATA_FILE_NAME, collect_population_data_ensemble from midst_toolkit.attacks.ensemble.data_utils import load_dataframe -from midst_toolkit.attacks.ensemble.model import EnsembleAttackTabDDPMModelRunner, 
EnsembleAttackTabDDPMTrainingConfig +from midst_toolkit.attacks.ensemble.models import EnsembleAttackTabDDPMModelRunner, EnsembleAttackTabDDPMTrainingConfig from midst_toolkit.attacks.ensemble.process_split_data import PROCESSED_TRAIN_DATA_FILE_NAME, process_split_data from midst_toolkit.common.logger import log from midst_toolkit.common.random import set_all_random_seeds diff --git a/examples/ensemble_attack/run_shadow_model_training.py b/examples/ensemble_attack/run_shadow_model_training.py index 4792fca2..73081e19 100644 --- a/examples/ensemble_attack/run_shadow_model_training.py +++ b/examples/ensemble_attack/run_shadow_model_training.py @@ -5,9 +5,11 @@ import pandas as pd from omegaconf import DictConfig -from examples.ensemble_attack.real_data_collection import COLLECTED_DATA_FILE_NAME +from examples.ensemble_attack.real_data_collection import ( + COLLECTED_DATA_FILE_NAME, +) from midst_toolkit.attacks.ensemble.data_utils import load_dataframe -from midst_toolkit.attacks.ensemble.model import EnsembleAttackModelRunner +from midst_toolkit.attacks.ensemble.models import EnsembleAttackModelRunner from midst_toolkit.attacks.ensemble.rmia.shadow_model_training import train_three_sets_of_shadow_models from midst_toolkit.common.logger import log diff --git a/examples/ensemble_attack/test_attack_model.py b/examples/ensemble_attack/test_attack_model.py index 81024680..b55fd11e 100644 --- a/examples/ensemble_attack/test_attack_model.py +++ b/examples/ensemble_attack/test_attack_model.py @@ -21,7 +21,7 @@ from examples.ensemble_attack.run_shadow_model_training import run_shadow_model_training from midst_toolkit.attacks.ensemble.blending import BlendingPlusPlus, MetaClassifierType from midst_toolkit.attacks.ensemble.data_utils import load_dataframe -from midst_toolkit.attacks.ensemble.model import ( +from midst_toolkit.attacks.ensemble.models import ( EnsembleAttackModelRunner, EnsembleAttackTabDDPMModelRunner, EnsembleAttackTabDDPMTrainingConfig, diff --git 
a/examples/gan/ensemble_attack/test_attack_model.py b/examples/gan/ensemble_attack/test_attack_model.py index 9c093bc0..72e03de0 100644 --- a/examples/gan/ensemble_attack/test_attack_model.py +++ b/examples/gan/ensemble_attack/test_attack_model.py @@ -5,7 +5,7 @@ from examples.ensemble_attack.test_attack_model import run_metaclassifier_testing from examples.gan.ensemble_attack.utils import make_training_config -from midst_toolkit.attacks.ensemble.model import EnsembleAttackCTGANModelRunner +from midst_toolkit.attacks.ensemble.models import EnsembleAttackCTGANModelRunner from midst_toolkit.common.logger import log diff --git a/examples/gan/ensemble_attack/train_attack_model.py b/examples/gan/ensemble_attack/train_attack_model.py index a302bfa9..dd63d79f 100644 --- a/examples/gan/ensemble_attack/train_attack_model.py +++ b/examples/gan/ensemble_attack/train_attack_model.py @@ -8,7 +8,7 @@ from examples.ensemble_attack.run_shadow_model_training import run_shadow_model_training, run_target_model_training from examples.gan.ensemble_attack.utils import get_master_challenge_train_data, make_training_config from midst_toolkit.attacks.ensemble.data_utils import load_dataframe, save_dataframe -from midst_toolkit.attacks.ensemble.model import EnsembleAttackCTGANModelRunner +from midst_toolkit.attacks.ensemble.models import EnsembleAttackCTGANModelRunner from midst_toolkit.attacks.ensemble.process_split_data import process_split_data from midst_toolkit.common.logger import log from midst_toolkit.common.random import set_all_random_seeds diff --git a/examples/gan/ensemble_attack/utils.py b/examples/gan/ensemble_attack/utils.py index 9c9d7afa..9da30e3b 100644 --- a/examples/gan/ensemble_attack/utils.py +++ b/examples/gan/ensemble_attack/utils.py @@ -6,7 +6,7 @@ from examples.gan.utils import get_single_table_svd_metadata, get_table_name from midst_toolkit.attacks.ensemble.data_utils import load_dataframe -from midst_toolkit.attacks.ensemble.model import 
EnsembleAttackCTGANTrainingConfig +from midst_toolkit.attacks.ensemble.models import EnsembleAttackCTGANTrainingConfig from midst_toolkit.attacks.ensemble.process_split_data import PROCESSED_TRAIN_DATA_FILE_NAME from midst_toolkit.attacks.ensemble.shadow_model_utils import setup_save_dir diff --git a/src/midst_toolkit/attacks/ensemble/model.py b/src/midst_toolkit/attacks/ensemble/models.py similarity index 100% rename from src/midst_toolkit/attacks/ensemble/model.py rename to src/midst_toolkit/attacks/ensemble/models.py diff --git a/src/midst_toolkit/attacks/ensemble/rmia/shadow_model_training.py b/src/midst_toolkit/attacks/ensemble/rmia/shadow_model_training.py index 3155dc30..d381ca98 100644 --- a/src/midst_toolkit/attacks/ensemble/rmia/shadow_model_training.py +++ b/src/midst_toolkit/attacks/ensemble/rmia/shadow_model_training.py @@ -8,7 +8,7 @@ import pandas as pd from omegaconf import DictConfig -from midst_toolkit.attacks.ensemble.model import EnsembleAttackModelRunner +from midst_toolkit.attacks.ensemble.models import EnsembleAttackModelRunner from midst_toolkit.common.logger import log diff --git a/tests/integration/attacks/ensemble/test_shadow_model_training.py b/tests/integration/attacks/ensemble/test_shadow_model_training.py index f3bd058e..8aab0278 100644 --- a/tests/integration/attacks/ensemble/test_shadow_model_training.py +++ b/tests/integration/attacks/ensemble/test_shadow_model_training.py @@ -10,7 +10,7 @@ from omegaconf import DictConfig from midst_toolkit.attacks.ensemble.data_utils import load_dataframe -from midst_toolkit.attacks.ensemble.model import EnsembleAttackTabDDPMModelRunner, EnsembleAttackTabDDPMTrainingConfig +from midst_toolkit.attacks.ensemble.models import EnsembleAttackTabDDPMModelRunner, EnsembleAttackTabDDPMTrainingConfig from midst_toolkit.attacks.ensemble.rmia.shadow_model_training import ( train_fine_tuned_shadow_models, train_shadow_on_half_challenge_data, From 71359247836c9878d3c637545ac6adec4cdf6f26 Mon Sep 17 00:00:00 
2001 From: Marcelo Lotif Date: Thu, 19 Mar 2026 14:38:15 -0400 Subject: [PATCH 31/38] Small bug fix --- examples/ensemble_attack/run_shadow_model_training.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/examples/ensemble_attack/run_shadow_model_training.py b/examples/ensemble_attack/run_shadow_model_training.py index 73081e19..8e85ad45 100644 --- a/examples/ensemble_attack/run_shadow_model_training.py +++ b/examples/ensemble_attack/run_shadow_model_training.py @@ -57,6 +57,10 @@ def run_target_model_training(model_runner: EnsembleAttackModelRunner, config: D target_folder / "dataset_meta.json", ) + model_runner.training_config.general.data_dir = target_folder + model_runner.training_config.general.exp_name = "trained_target_model" + model_runner.training_config.general.workspace_dir = target_folder / "shadow_workspace" + train_result = model_runner.train_or_fine_tune_and_synthesize(dataset=df_real_data, synthesize=True) # To train the attack model (metaclassifier), we only need to save target's synthetic data, From 082ea7ce400d45d4b7435844fcb1914155181b9f Mon Sep 17 00:00:00 2001 From: Marcelo Lotif Date: Thu, 19 Mar 2026 14:50:59 -0400 Subject: [PATCH 32/38] Bringing back the config json saving function against my will --- .../run_shadow_model_training.py | 12 ++++++--- .../ensemble/rmia/shadow_model_training.py | 25 +++++++++++++++++-- .../attacks/ensemble/shadow_model_utils.py | 5 ++-- 3 files changed, 35 insertions(+), 7 deletions(-) diff --git a/examples/ensemble_attack/run_shadow_model_training.py b/examples/ensemble_attack/run_shadow_model_training.py index 8e85ad45..f0c6cd10 100644 --- a/examples/ensemble_attack/run_shadow_model_training.py +++ b/examples/ensemble_attack/run_shadow_model_training.py @@ -11,6 +11,7 @@ from midst_toolkit.attacks.ensemble.data_utils import load_dataframe from midst_toolkit.attacks.ensemble.models import EnsembleAttackModelRunner from midst_toolkit.attacks.ensemble.rmia.shadow_model_training import 
train_three_sets_of_shadow_models +from midst_toolkit.attacks.ensemble.shadow_model_utils import save_additional_training_config from midst_toolkit.common.logger import log @@ -57,9 +58,14 @@ def run_target_model_training(model_runner: EnsembleAttackModelRunner, config: D target_folder / "dataset_meta.json", ) - model_runner.training_config.general.data_dir = target_folder - model_runner.training_config.general.exp_name = "trained_target_model" - model_runner.training_config.general.workspace_dir = target_folder / "shadow_workspace" + configs, _ = save_additional_training_config( + config_type=model_runner.training_config.__class__, + data_dir=target_folder, + training_config_json_path=Path(target_training_json_config_paths.training_config_path), + final_config_json_path=target_folder / f"{table_name}.json", # Path to the new json + experiment_name="trained_target_model", + ) + model_runner.training_config = configs train_result = model_runner.train_or_fine_tune_and_synthesize(dataset=df_real_data, synthesize=True) diff --git a/src/midst_toolkit/attacks/ensemble/rmia/shadow_model_training.py b/src/midst_toolkit/attacks/ensemble/rmia/shadow_model_training.py index d381ca98..7b4e200c 100644 --- a/src/midst_toolkit/attacks/ensemble/rmia/shadow_model_training.py +++ b/src/midst_toolkit/attacks/ensemble/rmia/shadow_model_training.py @@ -9,6 +9,7 @@ from omegaconf import DictConfig from midst_toolkit.attacks.ensemble.models import EnsembleAttackModelRunner +from midst_toolkit.attacks.ensemble.shadow_model_utils import save_additional_training_config from midst_toolkit.common.logger import log @@ -108,8 +109,20 @@ def train_fine_tuned_shadow_models( shadow_model_data_folder / "dataset_meta.json", ) + # Train initial model with 60K data without any challenge points + # ``save_additional_training_config`` makes a personalized copy of the training config for each + # training model (here the base model). 
+ # All the shadow models will be saved under the base model data directory. + configs, save_dir = save_additional_training_config( + config_type=model_runner.training_config.__class__, + data_dir=shadow_model_data_folder, + training_config_json_path=Path(training_json_config_paths.training_config_path), + final_config_json_path=shadow_model_data_folder / f"{table_name}.json", # Path to the new json + experiment_name="pre_trained_model", + ) + model_runner.training_config = configs + # Train the initial model if it is not already trained and saved. - save_dir = model_runner.training_config.general.workspace_dir / model_runner.training_config.general.exp_name initial_model_path = save_dir / f"initial_model_rmia_{init_model_id}.pkl" if not initial_model_path.exists(): log(INFO, f"Training initial model with runner {model_runner}. Model ID {init_model_id}...") @@ -247,6 +260,15 @@ def train_shadow_on_half_challenge_data( shadow_folder / "dataset_meta.json", ) + configs, save_dir = save_additional_training_config( + config_type=model_runner.training_config.__class__, + data_dir=shadow_folder, + training_config_json_path=Path(training_json_config_paths.training_config_path), + final_config_json_path=shadow_folder / f"{table_name}.json", # Path to the new json + experiment_name="trained_model", + ) + model_runner.training_config = configs + attack_data: dict[str, Any] = { "selected_sets": selected_id_lists, "trained_results": [], @@ -276,7 +298,6 @@ def train_shadow_on_half_challenge_data( attack_data["trained_results"].append(train_result.synthetic_data) # Pickle dump the results - save_dir = model_runner.training_config.general.workspace_dir / model_runner.training_config.general.exp_name result_path = Path(save_dir, "rmia_shadows_third_set.pkl") with open(result_path, "wb") as file: pickle.dump(attack_data, file) diff --git a/src/midst_toolkit/attacks/ensemble/shadow_model_utils.py b/src/midst_toolkit/attacks/ensemble/shadow_model_utils.py index 91766670..2f48c67a 
100644 --- a/src/midst_toolkit/attacks/ensemble/shadow_model_utils.py +++ b/src/midst_toolkit/attacks/ensemble/shadow_model_utils.py @@ -3,18 +3,19 @@ from logging import INFO from pathlib import Path +from midst_toolkit.attacks.ensemble.models import EnsembleAttackTrainingConfig from midst_toolkit.common.config import TrainingConfig from midst_toolkit.common.logger import log def save_additional_training_config( - config_type: type[TrainingConfig], + config_type: type[EnsembleAttackTrainingConfig], data_dir: Path, training_config_json_path: Path, final_config_json_path: Path, experiment_name: str = "attack_experiment", workspace_name: str = "shadow_workspace", -) -> tuple[TrainingConfig, Path]: +) -> tuple[EnsembleAttackTrainingConfig, Path]: """ Modifies a TabDDPM configuration JSON file with the specified data directory, experiment name and workspace name, and loads the resulting configuration. From 94da62eb6cf5d72cb4b5d62a5fe578ab963a683a Mon Sep 17 00:00:00 2001 From: Marcelo Lotif Date: Thu, 19 Mar 2026 14:58:09 -0400 Subject: [PATCH 33/38] one more bug fix --- examples/ensemble_attack/run_shadow_model_training.py | 4 +++- .../attacks/ensemble/rmia/shadow_model_training.py | 8 ++++++-- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/examples/ensemble_attack/run_shadow_model_training.py b/examples/ensemble_attack/run_shadow_model_training.py index f0c6cd10..8614fdb2 100644 --- a/examples/ensemble_attack/run_shadow_model_training.py +++ b/examples/ensemble_attack/run_shadow_model_training.py @@ -65,7 +65,9 @@ def run_target_model_training(model_runner: EnsembleAttackModelRunner, config: D final_config_json_path=target_folder / f"{table_name}.json", # Path to the new json experiment_name="trained_target_model", ) - model_runner.training_config = configs + model_runner.training_config.general.data_dir = configs.general.data_dir + model_runner.training_config.general.workspace_dir = configs.general.workspace_dir + 
model_runner.training_config.general.exp_name = configs.general.exp_name train_result = model_runner.train_or_fine_tune_and_synthesize(dataset=df_real_data, synthesize=True) diff --git a/src/midst_toolkit/attacks/ensemble/rmia/shadow_model_training.py b/src/midst_toolkit/attacks/ensemble/rmia/shadow_model_training.py index 7b4e200c..69864bb7 100644 --- a/src/midst_toolkit/attacks/ensemble/rmia/shadow_model_training.py +++ b/src/midst_toolkit/attacks/ensemble/rmia/shadow_model_training.py @@ -120,7 +120,9 @@ def train_fine_tuned_shadow_models( final_config_json_path=shadow_model_data_folder / f"{table_name}.json", # Path to the new json experiment_name="pre_trained_model", ) - model_runner.training_config = configs + model_runner.training_config.general.data_dir = configs.general.data_dir + model_runner.training_config.general.workspace_dir = configs.general.workspace_dir + model_runner.training_config.general.exp_name = configs.general.exp_name # Train the initial model if it is not already trained and saved. 
initial_model_path = save_dir / f"initial_model_rmia_{init_model_id}.pkl" @@ -267,7 +269,9 @@ def train_shadow_on_half_challenge_data( final_config_json_path=shadow_folder / f"{table_name}.json", # Path to the new json experiment_name="trained_model", ) - model_runner.training_config = configs + model_runner.training_config.general.data_dir = configs.general.data_dir + model_runner.training_config.general.workspace_dir = configs.general.workspace_dir + model_runner.training_config.general.exp_name = configs.general.exp_name attack_data: dict[str, Any] = { "selected_sets": selected_id_lists, From cc2cb81db51fd25c0a79c5709fa5374f16208c36 Mon Sep 17 00:00:00 2001 From: Marcelo Lotif Date: Thu, 19 Mar 2026 16:28:15 -0400 Subject: [PATCH 34/38] Fixing the test --- tests/unit/attacks/ensemble/test_shadow_model_utils.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/unit/attacks/ensemble/test_shadow_model_utils.py b/tests/unit/attacks/ensemble/test_shadow_model_utils.py index 722918ea..fd0463b5 100644 --- a/tests/unit/attacks/ensemble/test_shadow_model_utils.py +++ b/tests/unit/attacks/ensemble/test_shadow_model_utils.py @@ -5,9 +5,8 @@ from hydra import compose, initialize from omegaconf import DictConfig -from midst_toolkit.attacks.ensemble.shadow_model_utils import ( - save_additional_training_config, -) +from midst_toolkit.attacks.ensemble.models import EnsembleAttackTabDDPMTrainingConfig +from midst_toolkit.attacks.ensemble.shadow_model_utils import save_additional_training_config @pytest.fixture(scope="module") @@ -34,6 +33,7 @@ def test_save_additional_tabddpm_config(cfg: DictConfig, tmp_path: Path) -> None final_json_path = tmp_path / "modified_config.json" configs, save_dir = save_additional_training_config( + config_type=EnsembleAttackTabDDPMTrainingConfig, data_dir=new_data_dir, training_config_json_path=tabddpm_config_path, final_config_json_path=final_json_path, From fe78e3473c5155d027a8bd13ed6d2c74c0022507 Mon Sep 17 00:00:00 
2001 From: Marcelo Lotif Date: Thu, 19 Mar 2026 17:34:53 -0400 Subject: [PATCH 35/38] One more refactor to make things simpler. --- .../run_shadow_model_training.py | 11 +++--- src/midst_toolkit/attacks/ensemble/models.py | 23 +++++++------ .../ensemble/rmia/shadow_model_training.py | 30 +++++++--------- .../attacks/ensemble/shadow_model_utils.py | 34 ++++++++----------- .../ensemble/test_shadow_model_training.py | 10 +++--- .../ensemble/test_shadow_model_utils.py | 16 +++++---- 6 files changed, 58 insertions(+), 66 deletions(-) diff --git a/examples/ensemble_attack/run_shadow_model_training.py b/examples/ensemble_attack/run_shadow_model_training.py index 8614fdb2..c8b5cc58 100644 --- a/examples/ensemble_attack/run_shadow_model_training.py +++ b/examples/ensemble_attack/run_shadow_model_training.py @@ -11,7 +11,7 @@ from midst_toolkit.attacks.ensemble.data_utils import load_dataframe from midst_toolkit.attacks.ensemble.models import EnsembleAttackModelRunner from midst_toolkit.attacks.ensemble.rmia.shadow_model_training import train_three_sets_of_shadow_models -from midst_toolkit.attacks.ensemble.shadow_model_utils import save_additional_training_config +from midst_toolkit.attacks.ensemble.shadow_model_utils import update_and_save_training_config from midst_toolkit.common.logger import log @@ -58,16 +58,13 @@ def run_target_model_training(model_runner: EnsembleAttackModelRunner, config: D target_folder / "dataset_meta.json", ) - configs, _ = save_additional_training_config( - config_type=model_runner.training_config.__class__, + configs = update_and_save_training_config( + config=model_runner.training_config, data_dir=target_folder, - training_config_json_path=Path(target_training_json_config_paths.training_config_path), final_config_json_path=target_folder / f"{table_name}.json", # Path to the new json experiment_name="trained_target_model", ) - model_runner.training_config.general.data_dir = configs.general.data_dir - 
model_runner.training_config.general.workspace_dir = configs.general.workspace_dir - model_runner.training_config.general.exp_name = configs.general.exp_name + model_runner.training_config = configs train_result = model_runner.train_or_fine_tune_and_synthesize(dataset=df_real_data, synthesize=True) diff --git a/src/midst_toolkit/attacks/ensemble/models.py b/src/midst_toolkit/attacks/ensemble/models.py index 1d2f9818..940dffc6 100644 --- a/src/midst_toolkit/attacks/ensemble/models.py +++ b/src/midst_toolkit/attacks/ensemble/models.py @@ -7,7 +7,7 @@ from typing import Any import pandas as pd -from pydantic import BaseModel, ConfigDict +from pydantic import BaseModel, ConfigDict, Field from sdv.metadata import SingleTableMetadata # type: ignore[import-untyped] from sdv.single_table import CTGANSynthesizer # type: ignore[import-untyped] @@ -24,6 +24,7 @@ # Base Classes class EnsembleAttackTrainingConfig(TrainingConfig): + save_dir: Path | None = None number_of_points_to_synthesize: int = 20000 @@ -112,16 +113,16 @@ def train_or_fine_tune_and_synthesize( - synthetic_data: The synthesized data as a pandas DataFrame, if synthesis was performed, otherwise, None. 
""" + assert self.training_config.save_dir is not None, "Save dir is not set" + # Load tables tables, relation_order, _ = load_tables(self.training_config.general.data_dir, train_data={"trans": dataset}) - save_dir = self.training_config.general.workspace_dir / self.training_config.general.exp_name - # Clustering on the multi-table dataset tables, all_group_lengths_prob_dicts = clava_clustering( tables, relation_order, - save_dir, + self.training_config.save_dir, self.training_config.clustering, ) @@ -130,7 +131,7 @@ def train_or_fine_tune_and_synthesize( tables, models = clava_training( tables, relation_order, - save_dir, + self.training_config.save_dir, diffusion_config=self.training_config.diffusion, classifier_config=self.training_config.classifier, device=DEVICE, @@ -150,7 +151,7 @@ def train_or_fine_tune_and_synthesize( ) result = TabDDPMTrainingResult( - save_dir=save_dir, + save_dir=self.training_config.save_dir, configs=self.training_config, tables=tables, relation_order=relation_order, @@ -170,7 +171,7 @@ def train_or_fine_tune_and_synthesize( cleaned_tables, _, _ = clava_synthesizing( tables, relation_order, - save_dir, + self.training_config.save_dir, models, self.training_config.general, self.training_config.sampling, @@ -188,7 +189,7 @@ def train_or_fine_tune_and_synthesize( class EnsembleAttackCTGANTrainingConfig(CTGANTrainingConfig, EnsembleAttackTrainingConfig): model_config = ConfigDict(arbitrary_types_allowed=True) - metadata: SingleTableMetadata | None = None + metadata: SingleTableMetadata | None = Field(default=None, exclude=True) table_name: str | None = None @@ -226,6 +227,7 @@ def train_or_fine_tune_and_synthesize( - synthetic_data: The synthesized data as a pandas DataFrame, if synthesis was performed, otherwise, None. 
""" + assert self.training_config.save_dir is not None, "Save dir is not set" assert self.training_config.metadata is not None, "Metadata is not set" assert self.training_config.table_name is not None, "Table name is not set" @@ -248,14 +250,13 @@ def train_or_fine_tune_and_synthesize( ctgan.fit(dataset_without_ids) - save_dir = self.training_config.general.workspace_dir / self.training_config.general.exp_name - results_file = Path(save_dir) / model_name + results_file = self.training_config.save_dir / model_name results_file.parent.mkdir(parents=True, exist_ok=True) ctgan.save(results_file) result = CTGANTrainingResult( - save_dir=save_dir, + save_dir=self.training_config.save_dir, configs=self.training_config, models={ (None, self.training_config.table_name): CTGANModelArtifacts(model=ctgan, model_file_path=results_file) diff --git a/src/midst_toolkit/attacks/ensemble/rmia/shadow_model_training.py b/src/midst_toolkit/attacks/ensemble/rmia/shadow_model_training.py index 69864bb7..78241179 100644 --- a/src/midst_toolkit/attacks/ensemble/rmia/shadow_model_training.py +++ b/src/midst_toolkit/attacks/ensemble/rmia/shadow_model_training.py @@ -9,7 +9,7 @@ from omegaconf import DictConfig from midst_toolkit.attacks.ensemble.models import EnsembleAttackModelRunner -from midst_toolkit.attacks.ensemble.shadow_model_utils import save_additional_training_config +from midst_toolkit.attacks.ensemble.shadow_model_utils import update_and_save_training_config from midst_toolkit.common.logger import log @@ -110,22 +110,20 @@ def train_fine_tuned_shadow_models( ) # Train initial model with 60K data without any challenge points - # ``save_additional_training_config`` makes a personalized copy of the training config for each + # ``update_and_save_training_config`` makes a personalized copy of the training config for each # training model (here the base model). # All the shadow models will be saved under the base model data directory. 
- configs, save_dir = save_additional_training_config( - config_type=model_runner.training_config.__class__, + configs = update_and_save_training_config( + config=model_runner.training_config, data_dir=shadow_model_data_folder, - training_config_json_path=Path(training_json_config_paths.training_config_path), final_config_json_path=shadow_model_data_folder / f"{table_name}.json", # Path to the new json experiment_name="pre_trained_model", ) - model_runner.training_config.general.data_dir = configs.general.data_dir - model_runner.training_config.general.workspace_dir = configs.general.workspace_dir - model_runner.training_config.general.exp_name = configs.general.exp_name + model_runner.training_config = configs # Train the initial model if it is not already trained and saved. - initial_model_path = save_dir / f"initial_model_rmia_{init_model_id}.pkl" + assert model_runner.training_config.save_dir is not None, "Save dir is not set" + initial_model_path = model_runner.training_config.save_dir / f"initial_model_rmia_{init_model_id}.pkl" if not initial_model_path.exists(): log(INFO, f"Training initial model with runner {model_runner}. 
Model ID {init_model_id}...") @@ -189,7 +187,7 @@ def train_fine_tuned_shadow_models( attack_data["fine_tuned_results"].append(train_result.synthetic_data) # Pickle dump the results - result_path = Path(save_dir / "rmia_shadows.pkl") + result_path = model_runner.training_config.save_dir / "rmia_shadows.pkl" with open(result_path, "wb") as file: pickle.dump(attack_data, file) @@ -262,16 +260,13 @@ def train_shadow_on_half_challenge_data( shadow_folder / "dataset_meta.json", ) - configs, save_dir = save_additional_training_config( - config_type=model_runner.training_config.__class__, + configs = update_and_save_training_config( + config=model_runner.training_config, data_dir=shadow_folder, - training_config_json_path=Path(training_json_config_paths.training_config_path), final_config_json_path=shadow_folder / f"{table_name}.json", # Path to the new json experiment_name="trained_model", ) - model_runner.training_config.general.data_dir = configs.general.data_dir - model_runner.training_config.general.workspace_dir = configs.general.workspace_dir - model_runner.training_config.general.exp_name = configs.general.exp_name + model_runner.training_config = configs attack_data: dict[str, Any] = { "selected_sets": selected_id_lists, @@ -302,7 +297,8 @@ def train_shadow_on_half_challenge_data( attack_data["trained_results"].append(train_result.synthetic_data) # Pickle dump the results - result_path = Path(save_dir, "rmia_shadows_third_set.pkl") + assert model_runner.training_config.save_dir is not None, "Save dir is not set" + result_path = model_runner.training_config.save_dir / "rmia_shadows_third_set.pkl" with open(result_path, "wb") as file: pickle.dump(attack_data, file) diff --git a/src/midst_toolkit/attacks/ensemble/shadow_model_utils.py b/src/midst_toolkit/attacks/ensemble/shadow_model_utils.py index 2f48c67a..029a0890 100644 --- a/src/midst_toolkit/attacks/ensemble/shadow_model_utils.py +++ b/src/midst_toolkit/attacks/ensemble/shadow_model_utils.py @@ -8,49 +8,43 @@ 
from midst_toolkit.common.logger import log -def save_additional_training_config( - config_type: type[EnsembleAttackTrainingConfig], +def update_and_save_training_config( + config: EnsembleAttackTrainingConfig, data_dir: Path, - training_config_json_path: Path, final_config_json_path: Path, experiment_name: str = "attack_experiment", workspace_name: str = "shadow_workspace", -) -> tuple[EnsembleAttackTrainingConfig, Path]: +) -> EnsembleAttackTrainingConfig: """ - Modifies a TabDDPM configuration JSON file with the specified data directory, experiment name and workspace name, - and loads the resulting configuration. + Modifies a model configuration with the specified data directory, experiment name and workspace name, + and saves it to a JSON file. Args: - config_type: Type of the training configuration to load. + config: The training configuration to update. data_dir: Directory containing dataset_meta.json, trans_domain.json, and trans.json files. - training_config_json_path: Path to the original TabDDPM training configuration JSON file. final_config_json_path: Path where the modified configuration JSON file will be saved. experiment_name: Name of the experiment, used to create a unique save directory. workspace_name: Name of the workspace, used to create a unique save directory. Returns: - configs: Loaded configuration dictionary for the given config type. - save_dir: Directory path where results will be saved. + EnsembleAttackTrainingConfig: The updated training configuration. 
""" - # Modify the config file to give the correct training data and saving directory - with open(training_config_json_path, "r") as file: - configs = config_type(**json.load(file)) - - configs.general.data_dir = data_dir + # Modify the config to have the correct training data and saving directory + config.general.data_dir = data_dir # Save dir is set by joining the workspace_dir and exp_name - configs.general.workspace_dir = data_dir / workspace_name - configs.general.exp_name = experiment_name + config.general.workspace_dir = data_dir / workspace_name + config.general.exp_name = experiment_name # save the changed to the new json file with open(final_config_json_path, "w") as file: - json.dump(configs.model_dump(mode="json"), file, indent=4) + json.dump(config.model_dump(mode="json"), file, indent=4) log(INFO, f"Config saved to {final_config_json_path}") # Set up the config - save_dir = setup_save_dir(configs) + config.save_dir = setup_save_dir(config) - return configs, save_dir + return config # TODO: The following function is directly copied from the midst reference code since diff --git a/tests/integration/attacks/ensemble/test_shadow_model_training.py b/tests/integration/attacks/ensemble/test_shadow_model_training.py index 8aab0278..1241358a 100644 --- a/tests/integration/attacks/ensemble/test_shadow_model_training.py +++ b/tests/integration/attacks/ensemble/test_shadow_model_training.py @@ -15,7 +15,7 @@ train_fine_tuned_shadow_models, train_shadow_on_half_challenge_data, ) -from midst_toolkit.attacks.ensemble.shadow_model_utils import save_additional_training_config +from midst_toolkit.attacks.ensemble.shadow_model_utils import update_and_save_training_config POPULATION_DATA = load_dataframe( @@ -153,10 +153,12 @@ def test_train_and_fine_tune_tabddpm(cfg: DictConfig, tmp_path: Path) -> None: cfg.shadow_training.training_json_config_paths.dataset_meta_file_path, tmp_training_dir / "dataset_meta.json", ) - configs, _ = save_additional_training_config( - 
config_type=EnsembleAttackTabDDPMTrainingConfig, + with open(training_config_path, "r") as file: + configs = EnsembleAttackTabDDPMTrainingConfig(**json.load(file)) + + configs = update_and_save_training_config( + config=configs, data_dir=tmp_training_dir, - training_config_json_path=training_config_path, final_config_json_path=tmp_training_dir / "trans.json", experiment_name="test_experiment", workspace_name="test_workspace", diff --git a/tests/unit/attacks/ensemble/test_shadow_model_utils.py b/tests/unit/attacks/ensemble/test_shadow_model_utils.py index fd0463b5..ff0aa474 100644 --- a/tests/unit/attacks/ensemble/test_shadow_model_utils.py +++ b/tests/unit/attacks/ensemble/test_shadow_model_utils.py @@ -6,7 +6,7 @@ from omegaconf import DictConfig from midst_toolkit.attacks.ensemble.models import EnsembleAttackTabDDPMTrainingConfig -from midst_toolkit.attacks.ensemble.shadow_model_utils import save_additional_training_config +from midst_toolkit.attacks.ensemble.shadow_model_utils import update_and_save_training_config @pytest.fixture(scope="module") @@ -32,16 +32,18 @@ def test_save_additional_tabddpm_config(cfg: DictConfig, tmp_path: Path) -> None new_experiment_name = "test_experiment" final_json_path = tmp_path / "modified_config.json" - configs, save_dir = save_additional_training_config( - config_type=EnsembleAttackTabDDPMTrainingConfig, + with open(tabddpm_config_path, "r") as file: + configs = EnsembleAttackTabDDPMTrainingConfig(**json.load(file)) + + configs = update_and_save_training_config( + config=configs, data_dir=new_data_dir, - training_config_json_path=tabddpm_config_path, final_config_json_path=final_json_path, experiment_name=new_experiment_name, workspace_name=new_workspace_name, ) - assert save_dir == new_data_dir / new_workspace_name / new_experiment_name + assert configs.save_dir == new_data_dir / new_workspace_name / new_experiment_name assert configs.general.data_dir == new_data_dir assert configs.general.workspace_dir == new_data_dir / 
new_workspace_name assert configs.general.exp_name == new_experiment_name @@ -50,5 +52,5 @@ def test_save_additional_tabddpm_config(cfg: DictConfig, tmp_path: Path) -> None assert old_workspace_dir != configs.general.workspace_dir assert old_exp_name != configs.general.exp_name # Ensure required directories are created - assert (save_dir / "models").exists() - assert (save_dir / "before_matching").exists() + assert (configs.save_dir / "models").exists() + assert (configs.save_dir / "before_matching").exists() From 26f88f65c9e114b979251a0ab5f3a8c3cd2596ca Mon Sep 17 00:00:00 2001 From: Marcelo Lotif Date: Thu, 19 Mar 2026 17:40:38 -0400 Subject: [PATCH 36/38] CR by Coderabbit --- examples/gan/ensemble_attack/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/gan/ensemble_attack/utils.py b/examples/gan/ensemble_attack/utils.py index 9da30e3b..71e92081 100644 --- a/examples/gan/ensemble_attack/utils.py +++ b/examples/gan/ensemble_attack/utils.py @@ -31,7 +31,7 @@ def get_master_challenge_train_data(config: DictConfig) -> pd.DataFrame: def make_training_config(config: DictConfig) -> EnsembleAttackCTGANTrainingConfig: """ - Make the ensemble attacktraining config for the CTGAN model from the config.yaml file. + Make the ensemble attack training config for the CTGAN model from the config.yaml file. Saves the training config json file to the shadow training json config paths location. 
From 5137a87f6e89f4f2b15c963d493aaa3f935d48ab Mon Sep 17 00:00:00 2001 From: Marcelo Lotif Date: Fri, 20 Mar 2026 12:36:10 -0400 Subject: [PATCH 37/38] Fixing a bug on the amount of shadow model samples to generate --- examples/ensemble_attack/run_attack.py | 11 +++++------ examples/ensemble_attack/test_attack_model.py | 11 +++++------ examples/gan/ensemble_attack/test_attack_model.py | 2 ++ examples/gan/ensemble_attack/train_attack_model.py | 2 ++ src/midst_toolkit/attacks/ensemble/models.py | 2 +- 5 files changed, 15 insertions(+), 13 deletions(-) diff --git a/examples/ensemble_attack/run_attack.py b/examples/ensemble_attack/run_attack.py index c60663a8..58850113 100644 --- a/examples/ensemble_attack/run_attack.py +++ b/examples/ensemble_attack/run_attack.py @@ -86,12 +86,11 @@ def main(config: DictConfig) -> None: with open(config.shadow_training.training_json_config_paths.training_config_path, "r") as file: training_config = EnsembleAttackTabDDPMTrainingConfig(**json.load(file)) - training_config.fine_tuning_diffusion_iterations = ( - config.shadow_training.fine_tuning_config.fine_tune_diffusion_iterations - ) - training_config.fine_tuning_classifier_iterations = ( - config.shadow_training.fine_tuning_config.fine_tune_classifier_iterations - ) + fine_tune_diffusion_iterations = config.shadow_training.fine_tuning_config.fine_tune_diffusion_iterations + training_config.fine_tuning_diffusion_iterations = fine_tune_diffusion_iterations + fine_tune_classifier_iterations = config.shadow_training.fine_tuning_config.fine_tune_classifier_iterations + training_config.fine_tuning_classifier_iterations = fine_tune_classifier_iterations + training_config.number_of_points_to_synthesize = config.shadow_training.number_of_points_to_synthesize model_runner = EnsembleAttackTabDDPMModelRunner(training_config=training_config) diff --git a/examples/ensemble_attack/test_attack_model.py b/examples/ensemble_attack/test_attack_model.py index b55fd11e..46e174c3 100644 --- 
a/examples/ensemble_attack/test_attack_model.py +++ b/examples/ensemble_attack/test_attack_model.py @@ -456,12 +456,11 @@ def run_metaclassifier_testing_with_tabddpm(config: DictConfig) -> None: with open(config.shadow_training.training_json_config_paths.training_config_path, "r") as file: training_config = EnsembleAttackTabDDPMTrainingConfig(**json.load(file)) - training_config.fine_tuning_diffusion_iterations = ( - config.shadow_training.fine_tuning_config.fine_tune_diffusion_iterations - ) - training_config.fine_tuning_classifier_iterations = ( - config.shadow_training.fine_tuning_config.fine_tune_classifier_iterations - ) + fine_tune_diffusion_iterations = config.shadow_training.fine_tuning_config.fine_tune_diffusion_iterations + training_config.fine_tuning_diffusion_iterations = fine_tune_diffusion_iterations + fine_tune_classifier_iterations = config.shadow_training.fine_tuning_config.fine_tune_classifier_iterations + training_config.fine_tuning_classifier_iterations = fine_tune_classifier_iterations + training_config.number_of_points_to_synthesize = config.shadow_training.number_of_points_to_synthesize model_runner = EnsembleAttackTabDDPMModelRunner(training_config=training_config) diff --git a/examples/gan/ensemble_attack/test_attack_model.py b/examples/gan/ensemble_attack/test_attack_model.py index 72e03de0..c9e161c9 100644 --- a/examples/gan/ensemble_attack/test_attack_model.py +++ b/examples/gan/ensemble_attack/test_attack_model.py @@ -23,6 +23,8 @@ def attack_model_test(config: DictConfig) -> None: ) training_config = make_training_config(config) + number_of_points_to_synthesize = config.ensemble_attack.shadow_training.number_of_points_to_synthesize + training_config.number_of_points_to_synthesize = number_of_points_to_synthesize model_runner = EnsembleAttackCTGANModelRunner(training_config=training_config) run_metaclassifier_testing(model_runner, config.ensemble_attack) diff --git a/examples/gan/ensemble_attack/train_attack_model.py 
b/examples/gan/ensemble_attack/train_attack_model.py index dd63d79f..d9b1ed84 100644 --- a/examples/gan/ensemble_attack/train_attack_model.py +++ b/examples/gan/ensemble_attack/train_attack_model.py @@ -60,6 +60,8 @@ def train_attack_model(config: DictConfig) -> None: log(INFO, "Training the shadow models...") training_config = make_training_config(config) + number_of_points_to_synthesize = config.ensemble_attack.shadow_training.number_of_points_to_synthesize + training_config.number_of_points_to_synthesize = number_of_points_to_synthesize model_runner = EnsembleAttackCTGANModelRunner(training_config=training_config) master_challenge_train = get_master_challenge_train_data(config) diff --git a/src/midst_toolkit/attacks/ensemble/models.py b/src/midst_toolkit/attacks/ensemble/models.py index 940dffc6..33bf567c 100644 --- a/src/midst_toolkit/attacks/ensemble/models.py +++ b/src/midst_toolkit/attacks/ensemble/models.py @@ -264,7 +264,7 @@ def train_or_fine_tune_and_synthesize( ) if synthesize: - synthetic_data = ctgan.sample(num_rows=self.training_config.synthesizing.sample_size) + synthetic_data = ctgan.sample(num_rows=self.training_config.number_of_points_to_synthesize) result.synthetic_data = synthetic_data return result From 068d936ba428d5ed955153eb9c870534f6159280 Mon Sep 17 00:00:00 2001 From: Marcelo Lotif Date: Mon, 23 Mar 2026 11:25:00 -0400 Subject: [PATCH 38/38] CR by David --- examples/ensemble_attack/test_attack_model.py | 1 + examples/gan/ensemble_attack/train_attack_model.py | 1 - src/midst_toolkit/attacks/ensemble/models.py | 14 +++++++++++--- 3 files changed, 12 insertions(+), 4 deletions(-) diff --git a/examples/ensemble_attack/test_attack_model.py b/examples/ensemble_attack/test_attack_model.py index 46e174c3..aff4baed 100644 --- a/examples/ensemble_attack/test_attack_model.py +++ b/examples/ensemble_attack/test_attack_model.py @@ -312,6 +312,7 @@ def train_rmia_shadows_for_test_phase( return run_rmia_shadow_training(model_runner, config, 
df_challenge=df_challenge) +# TODO: consider moving this and potentially the other functions above to the main library. def run_metaclassifier_testing(model_runner: EnsembleAttackModelRunner, config: DictConfig) -> None: """ Function to run the attack on a single target model using a trained metaclassifier. diff --git a/examples/gan/ensemble_attack/train_attack_model.py b/examples/gan/ensemble_attack/train_attack_model.py index d9b1ed84..21a6ceb6 100644 --- a/examples/gan/ensemble_attack/train_attack_model.py +++ b/examples/gan/ensemble_attack/train_attack_model.py @@ -66,7 +66,6 @@ def train_attack_model(config: DictConfig) -> None: master_challenge_train = get_master_challenge_train_data(config) shadow_data_paths = run_shadow_model_training(model_runner, config.ensemble_attack, master_challenge_train) - shadow_data_paths = [Path(path) for path in shadow_data_paths] log(INFO, "Training the target model...") target_model_synthetic_path = run_target_model_training(model_runner, config.ensemble_attack) diff --git a/src/midst_toolkit/attacks/ensemble/models.py b/src/midst_toolkit/attacks/ensemble/models.py index 33bf567c..877faf85 100644 --- a/src/midst_toolkit/attacks/ensemble/models.py +++ b/src/midst_toolkit/attacks/ensemble/models.py @@ -59,7 +59,9 @@ def train_or_fine_tune_and_synthesize( Args: dataset: The dataset to train or fine tune the model on. - synthesize: Whether to synthesize data after training. + synthesize: Whether to synthesize data after training. The number of points to synthesize + and the save directory is controlled by the `number_of_points_to_synthesize` and `save_dir` + attributes of the training config. Optional, default is True. trained_model: The model to fine tune. If None, a new model should be trained. Optional, default is None. @@ -98,7 +100,10 @@ def train_or_fine_tune_and_synthesize( Args: dataset: The training dataset as a pandas DataFrame. - synthesize: Flag indicating whether to generate synthetic data after training. 
Defaults to True. + synthesize: Flag indicating whether to generate synthetic data after training. + The number of points to synthesize and the save directory is controlled by + the `number_of_points_to_synthesize` and `save_dir` attributes of the + training config. Optional, default is True. trained_model: The model to fine tune. If None, a new model should be trained. Optional, default is None. @@ -216,7 +221,10 @@ def train_or_fine_tune_and_synthesize( Args: dataset: The dataset as a pandas DataFrame. configs: Configuration dictionary for CTGAN. - synthesize: Flag indicating whether to generate synthetic data after training. Defaults to True. + synthesize: Flag indicating whether to generate synthetic data after training. + The number of points to synthesize and the save directory is controlled by + the `number_of_points_to_synthesize` and `save_dir` attributes of the training + config. Optional, default is True. trained_model: The trained model to fine tune. If None, a new model will be trained. Returns: