diff --git a/.gitignore b/.gitignore index 08ea1217..b33edd6e 100644 --- a/.gitignore +++ b/.gitignore @@ -47,6 +47,7 @@ site/ # Test artifacts tests/integration/attacks/tartan_federer/assets/tabddpm_models/**/challenge_label_predictions.csv tests/integration/attacks/tartan_federer/assets/tartan_federer_attack_results +tests/integration/attacks/ensemble/assets/workspace # Training Logs *.err diff --git a/examples/ensemble_attack/run_attack.py b/examples/ensemble_attack/run_attack.py index e252be45..58850113 100644 --- a/examples/ensemble_attack/run_attack.py +++ b/examples/ensemble_attack/run_attack.py @@ -3,6 +3,7 @@ provided resources and data. """ +import json from logging import INFO from pathlib import Path @@ -13,7 +14,8 @@ import examples.ensemble_attack.run_shadow_model_training as shadow_pipeline from examples.ensemble_attack.real_data_collection import COLLECTED_DATA_FILE_NAME, collect_population_data_ensemble from midst_toolkit.attacks.ensemble.data_utils import load_dataframe -from midst_toolkit.attacks.ensemble.process_split_data import process_split_data +from midst_toolkit.attacks.ensemble.models import EnsembleAttackTabDDPMModelRunner, EnsembleAttackTabDDPMTrainingConfig +from midst_toolkit.attacks.ensemble.process_split_data import PROCESSED_TRAIN_DATA_FILE_NAME, process_split_data from midst_toolkit.common.logger import log from midst_toolkit.common.random import set_all_random_seeds @@ -79,12 +81,23 @@ def main(config: DictConfig) -> None: if config.pipeline.run_shadow_model_training: df_master_challenge_train = load_dataframe( Path(config.data_paths.processed_attack_data_path), - "master_challenge_train.csv", + PROCESSED_TRAIN_DATA_FILE_NAME, ) - shadow_data_paths = shadow_pipeline.run_shadow_model_training(config, df_master_challenge_train) + + with open(config.shadow_training.training_json_config_paths.training_config_path, "r") as file: + training_config = EnsembleAttackTabDDPMTrainingConfig(**json.load(file)) + fine_tune_diffusion_iterations 
= config.shadow_training.fine_tuning_config.fine_tune_diffusion_iterations + training_config.fine_tuning_diffusion_iterations = fine_tune_diffusion_iterations + fine_tune_classifier_iterations = config.shadow_training.fine_tuning_config.fine_tune_classifier_iterations + training_config.fine_tuning_classifier_iterations = fine_tune_classifier_iterations + training_config.number_of_points_to_synthesize = config.shadow_training.number_of_points_to_synthesize + + model_runner = EnsembleAttackTabDDPMModelRunner(training_config=training_config) + + shadow_data_paths = shadow_pipeline.run_shadow_model_training(model_runner, config, df_master_challenge_train) shadow_data_paths = [Path(path) for path in shadow_data_paths] - target_model_synthetic_path = shadow_pipeline.run_target_model_training(config) + target_model_synthetic_path = shadow_pipeline.run_target_model_training(model_runner, config) if config.pipeline.run_metaclassifier_training: if not config.pipeline.run_shadow_model_training: diff --git a/examples/ensemble_attack/run_metaclassifier_training.py b/examples/ensemble_attack/run_metaclassifier_training.py index dd79033a..1c3abbf7 100644 --- a/examples/ensemble_attack/run_metaclassifier_training.py +++ b/examples/ensemble_attack/run_metaclassifier_training.py @@ -9,6 +9,12 @@ from examples.ensemble_attack.real_data_collection import COLLECTED_DATA_FILE_NAME from midst_toolkit.attacks.ensemble.blending import BlendingPlusPlus, MetaClassifierType from midst_toolkit.attacks.ensemble.data_utils import load_dataframe +from midst_toolkit.attacks.ensemble.process_split_data import ( + PROCESSED_TEST_DATA_FILE_NAME, + PROCESSED_TEST_LABELS_FILE_NAME, + PROCESSED_TRAIN_DATA_FILE_NAME, + PROCESSED_TRAIN_LABELS_FILE_NAME, +) from midst_toolkit.common.logger import log @@ -32,20 +38,20 @@ def run_metaclassifier_training( # Load the processed data splits. 
df_meta_train = load_dataframe( Path(config.data_paths.processed_attack_data_path), - "master_challenge_train.csv", + PROCESSED_TRAIN_DATA_FILE_NAME, ) # y_meta_train consists of binary labels (0s and 1s) indicating whether each row in df_meta_train # belongs to the target model's training set. y_meta_train = np.load( - Path(config.data_paths.processed_attack_data_path) / "master_challenge_train_labels.npy", + Path(config.data_paths.processed_attack_data_path) / PROCESSED_TRAIN_LABELS_FILE_NAME, ) df_meta_test = load_dataframe( Path(config.data_paths.processed_attack_data_path), - "master_challenge_test.csv", + PROCESSED_TEST_DATA_FILE_NAME, ) y_meta_test = np.load( - Path(config.data_paths.processed_attack_data_path) / "master_challenge_test_labels.npy", + Path(config.data_paths.processed_attack_data_path) / PROCESSED_TEST_LABELS_FILE_NAME, ) # Three sets of shadow models are trained separately and their paths are provided here. diff --git a/examples/ensemble_attack/run_shadow_model_training.py b/examples/ensemble_attack/run_shadow_model_training.py index e9dde456..c8b5cc58 100644 --- a/examples/ensemble_attack/run_shadow_model_training.py +++ b/examples/ensemble_attack/run_shadow_model_training.py @@ -1,37 +1,31 @@ import shutil from logging import INFO from pathlib import Path -from typing import cast import pandas as pd from omegaconf import DictConfig -from examples.ensemble_attack.real_data_collection import COLLECTED_DATA_FILE_NAME -from midst_toolkit.attacks.ensemble.data_utils import load_dataframe -from midst_toolkit.attacks.ensemble.rmia.shadow_model_training import ( - train_three_sets_of_shadow_models, -) -from midst_toolkit.attacks.ensemble.shadow_model_utils import ( - ModelType, - TrainingResult, - save_additional_training_config, - train_or_fine_tune_and_synthesize_with_ctgan, - train_tabddpm_and_synthesize, +from examples.ensemble_attack.real_data_collection import ( + COLLECTED_DATA_FILE_NAME, ) -from midst_toolkit.common.config import 
ClavaDDPMTrainingConfig, CTGANTrainingConfig +from midst_toolkit.attacks.ensemble.data_utils import load_dataframe +from midst_toolkit.attacks.ensemble.models import EnsembleAttackModelRunner +from midst_toolkit.attacks.ensemble.rmia.shadow_model_training import train_three_sets_of_shadow_models +from midst_toolkit.attacks.ensemble.shadow_model_utils import update_and_save_training_config from midst_toolkit.common.logger import log DEFAULT_TABLE_NAME = "trans" DEFAULT_ID_COLUMN_NAME = "trans_id" -DEFAULT_MODEL_TYPE = ModelType.TABDDPM -def run_target_model_training(config: DictConfig) -> Path: +def run_target_model_training(model_runner: EnsembleAttackModelRunner, config: DictConfig) -> Path: """ Function to run the target model training for RMIA attack. Args: + model_runner: The model runner to be used for training the target model. + Should be an instance of a subclass of `EnsembleAttackModelRunner`. config: Configuration object set in config.yaml. Returns: @@ -54,11 +48,6 @@ def run_target_model_training(config: DictConfig) -> Path: target_folder = target_model_output_path / "target_model" - model_type = DEFAULT_MODEL_TYPE - if "model_name" in config.shadow_training: - model_type = ModelType(config.shadow_training.model_name) - log(INFO, f"Training target model with model type: {model_type.value}") - target_folder.mkdir(parents=True, exist_ok=True) shutil.copyfile( target_training_json_config_paths.table_domain_file_path, @@ -68,30 +57,16 @@ def run_target_model_training(config: DictConfig) -> Path: target_training_json_config_paths.dataset_meta_file_path, target_folder / "dataset_meta.json", ) - configs, save_dir = save_additional_training_config( + + configs = update_and_save_training_config( + config=model_runner.training_config, data_dir=target_folder, - training_config_json_path=Path(target_training_json_config_paths.training_config_path), final_config_json_path=target_folder / f"{table_name}.json", # Path to the new json 
experiment_name="trained_target_model", - model_type=model_type, ) + model_runner.training_config = configs - train_result: TrainingResult - if model_type == ModelType.TABDDPM: - train_result = train_tabddpm_and_synthesize( - train_set=df_real_data, - configs=cast(ClavaDDPMTrainingConfig, configs), - save_dir=save_dir, - synthesize=True, - number_of_points_to_synthesize=config.shadow_training.number_of_points_to_synthesize, - ) - elif model_type == ModelType.CTGAN: - train_result = train_or_fine_tune_and_synthesize_with_ctgan( - dataset=df_real_data, - configs=cast(CTGANTrainingConfig, configs), - save_dir=save_dir, - synthesize=True, - ) + train_result = model_runner.train_or_fine_tune_and_synthesize(dataset=df_real_data, synthesize=True) # To train the attack model (metaclassifier), we only need to save target's synthetic data, # and not the entire target model's training result object. @@ -105,11 +80,17 @@ def run_target_model_training(config: DictConfig) -> Path: return target_model_synthetic_path -def run_shadow_model_training(config: DictConfig, df_challenge_train: pd.DataFrame) -> list[Path]: +def run_shadow_model_training( + model_runner: EnsembleAttackModelRunner, + config: DictConfig, + df_challenge_train: pd.DataFrame, +) -> list[Path]: """ Function to run the shadow model training for RMIA attack. Args: + model_runner: The model runner to be used for training the shadow models. + Should be an instance of `EnsembleAttackModelRunner`. config: Configuration object set in config.yaml. df_challenge_train: DataFrame containing the data that is used to train RMIA shadow models. @@ -130,10 +111,7 @@ def run_shadow_model_training(config: DictConfig, df_challenge_train: pd.DataFra # Population data is used to pre-train some of the shadow models. 
df_population_with_challenge = load_dataframe(Path(config.data_paths.population_path), data_file_name) - model_type = DEFAULT_MODEL_TYPE - if "model_name" in config.shadow_training: - model_type = ModelType(config.shadow_training.model_name) - log(INFO, f"Training shadow models with model type: {model_type.value}") + log(INFO, f"Training shadow models with model runner: {model_runner}") # Make sure master challenge train and population data have the id column. assert id_column_name in df_challenge_train.columns, ( @@ -146,6 +124,7 @@ def run_shadow_model_training(config: DictConfig, df_challenge_train: pd.DataFra # ``master_challenge_df`` is used for fine-tuning for half of the shadow models. # For the other half of the shadow models, only ``master_challenge_df`` is used for training. first_set_result_path, second_set_result_path, third_set_result_path = train_three_sets_of_shadow_models( + model_runner=model_runner, population_data=df_population_with_challenge, master_challenge_data=df_challenge_train, shadow_models_output_path=Path(config.shadow_training.shadow_models_output_path), @@ -157,9 +136,7 @@ def run_shadow_model_training(config: DictConfig, df_challenge_train: pd.DataFra # ``4 * n_models_per_set`` total shadow models. n_models_per_set=4, # 4 based on the original code, must be even n_reps=12, # Number of repetitions of challenge points in each shadow model training set. 
`12` based on the original code - number_of_points_to_synthesize=config.shadow_training.number_of_points_to_synthesize, random_seed=config.random_seed, - model_type=model_type, ) log( INFO, diff --git a/examples/ensemble_attack/test_attack_model.py b/examples/ensemble_attack/test_attack_model.py index f3558c83..aff4baed 100644 --- a/examples/ensemble_attack/test_attack_model.py +++ b/examples/ensemble_attack/test_attack_model.py @@ -21,6 +21,12 @@ from examples.ensemble_attack.run_shadow_model_training import run_shadow_model_training from midst_toolkit.attacks.ensemble.blending import BlendingPlusPlus, MetaClassifierType from midst_toolkit.attacks.ensemble.data_utils import load_dataframe +from midst_toolkit.attacks.ensemble.models import ( + EnsembleAttackModelRunner, + EnsembleAttackTabDDPMModelRunner, + EnsembleAttackTabDDPMTrainingConfig, +) +from midst_toolkit.attacks.ensemble.process_split_data import PROCESSED_TRAIN_DATA_FILE_NAME from midst_toolkit.common.logger import log from midst_toolkit.common.random import set_all_random_seeds from midst_toolkit.models.clavaddpm.train import get_df_without_id @@ -87,7 +93,11 @@ def extract_primary_id_column( return data_frame[id_column_name] -def run_rmia_shadow_training(config: DictConfig, df_challenge: pd.DataFrame) -> list[dict[str, list[Any]]]: +def run_rmia_shadow_training( + model_runner: EnsembleAttackModelRunner, + config: DictConfig, + df_challenge: pd.DataFrame, +) -> list[dict[str, list[Any]]]: """ Three sets of shadow models will be trained as a part of this attack. Note that shadow models need to be trained on the collection of challenge points once and used @@ -96,14 +106,16 @@ def run_rmia_shadow_training(config: DictConfig, df_challenge: pd.DataFrame) -> of the shadow models, and these shadow models are used to attack all target models. Args: - config: Configuration object set in ``experiments_config.yaml``. + model_runner: The model runner to be used for training the shadow models. 
+ Should be an instance of `EnsembleAttackModelRunner`. + config: Configuration object set in config.yaml. df_challenge: DataFrame containing the challenge data points for shadow model training. Return: A list containing three dictionaries, each representing a collection of shadow models with their training data and generated synthetic outputs. """ - shadow_model_paths = run_shadow_model_training(config, df_challenge_train=df_challenge) + shadow_model_paths = run_shadow_model_training(model_runner, config, df_challenge_train=df_challenge) assert len(shadow_model_paths) == 3, "For testing, meta classifier needs the path to three sets of shadow models." @@ -198,7 +210,7 @@ def collect_challenge_and_train_data( # Load master challenge train data df_master_train = load_dataframe( processed_attack_data_path, - "master_challenge_train.csv", + PROCESSED_TRAIN_DATA_FILE_NAME, ) log( INFO, @@ -254,12 +266,17 @@ def select_challenge_data_for_training( return df_challenge -def train_rmia_shadows_for_test_phase(config: DictConfig) -> list[dict[str, list[Any]]]: +def train_rmia_shadows_for_test_phase( + model_runner: EnsembleAttackModelRunner, + config: DictConfig, +) -> list[dict[str, list[Any]]]: """ Function to train RMIA shadow models for the testing phase using the dataset containing challenge data points. Args: + model_runner: The model runner to be used for training the shadow models. + Should be an instance of `EnsembleAttackModelRunner`. config: Configuration object set in ``experiments_config.yaml``. 
Returns: @@ -279,7 +296,7 @@ def train_rmia_shadows_for_test_phase(config: DictConfig) -> list[dict[str, list ) df_master_train = load_dataframe( processed_attack_data_path, - "master_challenge_train.csv", + PROCESSED_TRAIN_DATA_FILE_NAME, ) else: # If challenge data does not exist, collect it from the cluster @@ -292,15 +309,11 @@ def train_rmia_shadows_for_test_phase(config: DictConfig) -> list[dict[str, list # Load the challenge dataframe for training RMIA shadow models. rmia_training_choice = RmiaTrainingDataChoice(config.target_model.attack_rmia_shadow_training_data_choice) df_challenge = select_challenge_data_for_training(rmia_training_choice, df_challenge_experiment, df_master_train) - return run_rmia_shadow_training(config, df_challenge=df_challenge) + return run_rmia_shadow_training(model_runner, config, df_challenge=df_challenge) -# TODO: Perform inference on all the target models sequentially in a single run instead of running this script -# multiple times. For more information, refer to https://app.clickup.com/t/868h4xk86 -@hydra.main(config_path="configs", config_name="experiment_config", version_base=None) -def run_metaclassifier_testing( - config: DictConfig, -) -> None: +# TODO: consider moving this and potentially the other functions above to the main library. +def run_metaclassifier_testing(model_runner: EnsembleAttackModelRunner, config: DictConfig) -> None: """ Function to run the attack on a single target model using a trained metaclassifier. Note that RMIA shadow models need to be trained for every new set of target models on @@ -313,6 +326,8 @@ def run_metaclassifier_testing( Test prediction probabilities are saved to the specified attack result path in the config. Args: + model_runner: The model runner to be used for testing the metaclassifier. + Should be an instance of `EnsembleAttackModelRunner`. config: Configuration object set in ``experiments_config.yaml``. 
""" log( @@ -382,7 +397,7 @@ def run_metaclassifier_testing( if not models_exists: log(INFO, "Shadow models for testing phase do not exist. Training RMIA shadow models...") - shadow_data_collection = train_rmia_shadows_for_test_phase(config) + shadow_data_collection = train_rmia_shadows_for_test_phase(model_runner, config) else: log(INFO, "All shadow models for testing phase found. Using existing RMIA shadow models...") @@ -427,5 +442,31 @@ def run_metaclassifier_testing( save_results(attack_results_path, metaclassifier_model_name, probabilities, pred_score) +# TODO: Perform inference on all the target models sequentially in a single run instead of running this script +# multiple times. For more information, refer to https://app.clickup.com/t/868h4xk86 +@hydra.main(config_path="configs", config_name="experiment_config", version_base=None) +def run_metaclassifier_testing_with_tabddpm(config: DictConfig) -> None: + """ + Run the attack on a single target model using a trained metaclassifier. + RMIA shadow models will be trained using the TabDDPM model. + + Args: + config: Configuration object set in config.yaml. 
+ """ + log(INFO, "Running metaclassifier testing with TabDDPM...") + + with open(config.shadow_training.training_json_config_paths.training_config_path, "r") as file: + training_config = EnsembleAttackTabDDPMTrainingConfig(**json.load(file)) + fine_tune_diffusion_iterations = config.shadow_training.fine_tuning_config.fine_tune_diffusion_iterations + training_config.fine_tuning_diffusion_iterations = fine_tune_diffusion_iterations + fine_tune_classifier_iterations = config.shadow_training.fine_tuning_config.fine_tune_classifier_iterations + training_config.fine_tuning_classifier_iterations = fine_tune_classifier_iterations + training_config.number_of_points_to_synthesize = config.shadow_training.number_of_points_to_synthesize + + model_runner = EnsembleAttackTabDDPMModelRunner(training_config=training_config) + + run_metaclassifier_testing(model_runner, config) + + if __name__ == "__main__": - run_metaclassifier_testing() + run_metaclassifier_testing_with_tabddpm() diff --git a/examples/gan/ensemble_attack/test_attack_model.py b/examples/gan/ensemble_attack/test_attack_model.py index d684402a..c9e161c9 100644 --- a/examples/gan/ensemble_attack/test_attack_model.py +++ b/examples/gan/ensemble_attack/test_attack_model.py @@ -4,17 +4,30 @@ from omegaconf import DictConfig from examples.ensemble_attack.test_attack_model import run_metaclassifier_testing +from examples.gan.ensemble_attack.utils import make_training_config +from midst_toolkit.attacks.ensemble.models import EnsembleAttackCTGANModelRunner from midst_toolkit.common.logger import log @hydra.main(config_path="./", config_name="config", version_base=None) def attack_model_test(config: DictConfig) -> None: - """Main function to test the attack model.""" + """ + Main function to test the attack model. + + Args: + config: The configuration object from the config.yaml file. 
+ """ log( INFO, f"Testing attack model against synthetic data at {config.ensemble_attack.target_model.target_synthetic_data_path}...", ) - run_metaclassifier_testing(config.ensemble_attack) + + training_config = make_training_config(config) + number_of_points_to_synthesize = config.ensemble_attack.shadow_training.number_of_points_to_synthesize + training_config.number_of_points_to_synthesize = number_of_points_to_synthesize + model_runner = EnsembleAttackCTGANModelRunner(training_config=training_config) + + run_metaclassifier_testing(model_runner, config.ensemble_attack) if __name__ == "__main__": diff --git a/examples/gan/ensemble_attack/train_attack_model.py b/examples/gan/ensemble_attack/train_attack_model.py index 84f2b5af..21a6ceb6 100644 --- a/examples/gan/ensemble_attack/train_attack_model.py +++ b/examples/gan/ensemble_attack/train_attack_model.py @@ -1,13 +1,14 @@ -import json from logging import INFO from pathlib import Path import hydra -from omegaconf import DictConfig, OmegaConf +from omegaconf import DictConfig from examples.ensemble_attack.run_metaclassifier_training import run_metaclassifier_training from examples.ensemble_attack.run_shadow_model_training import run_shadow_model_training, run_target_model_training +from examples.gan.ensemble_attack.utils import get_master_challenge_train_data, make_training_config from midst_toolkit.attacks.ensemble.data_utils import load_dataframe, save_dataframe +from midst_toolkit.attacks.ensemble.models import EnsembleAttackCTGANModelRunner from midst_toolkit.attacks.ensemble.process_split_data import process_split_data from midst_toolkit.common.logger import log from midst_toolkit.common.random import set_all_random_seeds @@ -55,34 +56,19 @@ def train_attack_model(config: DictConfig) -> None: random_seed=config.ensemble_attack.random_seed, ) - # Saving the model config from the config.yaml into a json file - # because that's what the ensemble attack code will be looking for - training_config_path = 
Path(config.ensemble_attack.shadow_training.training_json_config_paths.training_config_path) - training_config_path.unlink(missing_ok=True) - with open(training_config_path, "w") as f: - training_config = OmegaConf.to_container(config.ensemble_attack.shadow_training.model_config) - assert isinstance(training_config, dict), "Training config must be a dictionary." - training_config["general"] = { - "test_data_dir": config.base_data_dir, - "sample_prefix": "ctgan", - # The values below will be overriden - "exp_name": "", - "data_dir": "", - "workspace_dir": "", - } - json.dump(training_config, f) - if config.ensemble_attack.pipeline.run_shadow_model_training: log(INFO, "Training the shadow models...") - master_challenge_train = load_dataframe( - Path(config.ensemble_attack.data_paths.population_path), - "master_challenge_train.csv", - ) - shadow_data_paths = run_shadow_model_training(config.ensemble_attack, master_challenge_train) - shadow_data_paths = [Path(path) for path in shadow_data_paths] + + training_config = make_training_config(config) + number_of_points_to_synthesize = config.ensemble_attack.shadow_training.number_of_points_to_synthesize + training_config.number_of_points_to_synthesize = number_of_points_to_synthesize + model_runner = EnsembleAttackCTGANModelRunner(training_config=training_config) + + master_challenge_train = get_master_challenge_train_data(config) + shadow_data_paths = run_shadow_model_training(model_runner, config.ensemble_attack, master_challenge_train) log(INFO, "Training the target model...") - target_model_synthetic_path = run_target_model_training(config.ensemble_attack) + target_model_synthetic_path = run_target_model_training(model_runner, config.ensemble_attack) if config.ensemble_attack.pipeline.run_metaclassifier_training: log(INFO, "Training the metaclassifier...") diff --git a/examples/gan/ensemble_attack/utils.py b/examples/gan/ensemble_attack/utils.py new file mode 100644 index 00000000..71e92081 --- /dev/null +++ 
b/examples/gan/ensemble_attack/utils.py @@ -0,0 +1,76 @@ +import json +from pathlib import Path + +import pandas as pd +from omegaconf import DictConfig, OmegaConf + +from examples.gan.utils import get_single_table_svd_metadata, get_table_name +from midst_toolkit.attacks.ensemble.data_utils import load_dataframe +from midst_toolkit.attacks.ensemble.models import EnsembleAttackCTGANTrainingConfig +from midst_toolkit.attacks.ensemble.process_split_data import PROCESSED_TRAIN_DATA_FILE_NAME +from midst_toolkit.attacks.ensemble.shadow_model_utils import setup_save_dir + + +def get_master_challenge_train_data(config: DictConfig) -> pd.DataFrame: + """ + Get the master challenge train data from the config's population path location. + + Args: + config: The configuration object. + + Returns: + The dataframe containing the master challenge train data. + """ + population_path = Path(config.ensemble_attack.data_paths.population_path) + assert population_path.exists(), ( + f"Population path {population_path} does not exist. Please run the data processing pipeline first." + ) + + return load_dataframe(population_path, PROCESSED_TRAIN_DATA_FILE_NAME) + + +def make_training_config(config: DictConfig) -> EnsembleAttackCTGANTrainingConfig: + """ + Make the ensemble attack training config for the CTGAN model from the config.yaml file. + + Saves the training config json file to the shadow training json config paths location. + + Args: + config: The configuration object. + + Returns: + The ensemble attack training config for the CTGAN model. 
+ """ + # Saving the model config from the config.yaml into a json file + # because that's what the ensemble attack code will be looking for + training_config_path = Path(config.ensemble_attack.shadow_training.training_json_config_paths.training_config_path) + training_config_path.unlink(missing_ok=True) + with open(training_config_path, "w") as f: + training_config = OmegaConf.to_container(config.ensemble_attack.shadow_training.model_config, resolve=True) + assert isinstance(training_config, dict), "Training config must be a dictionary." + training_config["general"] = { + "test_data_dir": config.base_data_dir, + "sample_prefix": "ctgan", + "data_dir": config.base_data_dir, + "workspace_dir": str(Path(config.base_data_dir) / "shadow_workspace"), + "exp_name": "pre_trained_model", + } + json.dump(training_config, f) + + ctgan_training_config = EnsembleAttackCTGANTrainingConfig(**training_config) # type: ignore[arg-type] + + setup_save_dir(ctgan_training_config) + + master_challenge_train = get_master_challenge_train_data(config) + + table_name = get_table_name(config.base_data_dir) + domain_file_path = Path(config.base_data_dir) / f"{table_name}_domain.json" + with open(domain_file_path, "r") as file: + domain_dictionary = json.load(file) + + metadata, _ = get_single_table_svd_metadata(master_challenge_train, domain_dictionary) + + ctgan_training_config.metadata = metadata + ctgan_training_config.table_name = table_name + + return ctgan_training_config diff --git a/src/midst_toolkit/attacks/ensemble/models.py b/src/midst_toolkit/attacks/ensemble/models.py new file mode 100644 index 00000000..877faf85 --- /dev/null +++ b/src/midst_toolkit/attacks/ensemble/models.py @@ -0,0 +1,278 @@ +"""Module containing the base classes and implementations for the Ensemble Attack model runner and training result.""" + +import copy +from abc import ABC, abstractmethod +from logging import INFO +from pathlib import Path +from typing import Any + +import pandas as pd +from pydantic 
import BaseModel, ConfigDict, Field +from sdv.metadata import SingleTableMetadata # type: ignore[import-untyped] +from sdv.single_table import CTGANSynthesizer # type: ignore[import-untyped] + +from midst_toolkit.attacks.ensemble.clavaddpm_fine_tuning import clava_fine_tuning +from midst_toolkit.common.config import ClavaDDPMTrainingConfig, CTGANTrainingConfig, TrainingConfig +from midst_toolkit.common.logger import log +from midst_toolkit.common.variables import DEVICE +from midst_toolkit.models.clavaddpm.clustering import clava_clustering +from midst_toolkit.models.clavaddpm.data_loaders import Tables, load_tables +from midst_toolkit.models.clavaddpm.enumerations import GroupLengthsProbDicts, Relation, RelationOrder +from midst_toolkit.models.clavaddpm.synthesizer import clava_synthesizing +from midst_toolkit.models.clavaddpm.train import ClavaDDPMModelArtifacts, CTGANModelArtifacts, clava_training + + +# Base Classes +class EnsembleAttackTrainingConfig(TrainingConfig): + save_dir: Path | None = None + number_of_points_to_synthesize: int = 20000 + + +class EnsembleAttackTrainingResult(BaseModel): + model_config = ConfigDict(arbitrary_types_allowed=True) + + save_dir: Path + configs: EnsembleAttackTrainingConfig + models: Any + synthetic_data: pd.DataFrame | None = None + + +class EnsembleAttackModelRunner(ABC): + def __init__(self, training_config: EnsembleAttackTrainingConfig): + """ + Initialize the ensemble attack model runner with a training config. + + Args: + training_config: The training config for the ensemble attack model. + """ + self.training_config = training_config + + @abstractmethod + def train_or_fine_tune_and_synthesize( + self, + dataset: pd.DataFrame, + synthesize: bool = True, + trained_model: EnsembleAttackTrainingResult | None = None, + ) -> EnsembleAttackTrainingResult: + """ + Train or fine tune a model and synthesize data. + + Args: + dataset: The dataset to train or fine tune the model on.
+ synthesize: Whether to synthesize data after training. The number of points to synthesize + and the save directory is controlled by the `number_of_points_to_synthesize` and `save_dir` + attributes of the training config. Optional, default is True. + trained_model: The model to fine tune. If None, a new model should be trained. + Optional, default is None. + + Returns: + An instance of `EnsembleAttackTrainingResult` containing the training results. + """ + raise NotImplementedError("Subclasses must implement this method.") + + +# TabDDPM/ClavaDDPM implementation +class EnsembleAttackTabDDPMTrainingConfig(ClavaDDPMTrainingConfig, EnsembleAttackTrainingConfig): + fine_tuning_diffusion_iterations: int = 100 + fine_tuning_classifier_iterations: int = 10 + + +class TabDDPMTrainingResult(EnsembleAttackTrainingResult): + configs: EnsembleAttackTabDDPMTrainingConfig + models: dict[Relation, ClavaDDPMModelArtifacts] + tables: Tables + relation_order: RelationOrder + all_group_lengths_probabilities: GroupLengthsProbDicts + + +class EnsembleAttackTabDDPMModelRunner(EnsembleAttackModelRunner): + training_config: EnsembleAttackTabDDPMTrainingConfig + + def train_or_fine_tune_and_synthesize( + self, + dataset: pd.DataFrame, + synthesize: bool = True, + trained_model: EnsembleAttackTrainingResult | None = None, + ) -> TabDDPMTrainingResult: + """ + Train or fine tune a TabDDPM model on the provided training set and optionally synthesize + data using the trained/fine-tuned models. + + Args: + dataset: The training dataset as a pandas DataFrame. + synthesize: Flag indicating whether to generate synthetic data after training. + The number of points to synthesize and the save directory is controlled by + the `number_of_points_to_synthesize` and `save_dir` attributes of the + training config. Optional, default is True. + trained_model: The model to fine tune. If None, a new model should be trained. + Optional, default is None. 
+ + Returns: + A TabDDPMTrainingResult object containing: + - save_dir: Directory where results are saved. + - configs: Configuration dictionary used for training. + - tables: Loaded tables after clustering. + - relation_order: Relation order of the tables. + - all_group_lengths_probabilities: Group lengths probability dictionaries. + - models: The trained models. + - synthetic_data: The synthesized data as a pandas DataFrame, if synthesis was performed, + otherwise, None. + """ + assert self.training_config.save_dir is not None, "Save dir is not set" + + # Load tables + tables, relation_order, _ = load_tables(self.training_config.general.data_dir, train_data={"trans": dataset}) + + # Clustering on the multi-table dataset + tables, all_group_lengths_prob_dicts = clava_clustering( + tables, + relation_order, + self.training_config.save_dir, + self.training_config.clustering, + ) + + if trained_model is None: + # Train models + tables, models = clava_training( + tables, + relation_order, + self.training_config.save_dir, + diffusion_config=self.training_config.diffusion, + classifier_config=self.training_config.classifier, + device=DEVICE, + ) + + else: + # Fine-tune models + copied_models = copy.deepcopy(trained_model.models) + models = clava_fine_tuning( + copied_models, + tables, + relation_order, + diffusion_config=self.training_config.diffusion, + classifier_config=self.training_config.classifier, + fine_tuning_diffusion_iterations=self.training_config.fine_tuning_diffusion_iterations, + fine_tuning_classifier_iterations=self.training_config.fine_tuning_classifier_iterations, + ) + + result = TabDDPMTrainingResult( + save_dir=self.training_config.save_dir, + configs=self.training_config, + tables=tables, + relation_order=relation_order, + all_group_lengths_probabilities=all_group_lengths_prob_dicts, + models=models, + ) + + if synthesize: + # By default, Ensemble attack generates a synthetic data of length ``20,000``.
+ # Attack's default sample_scale is set to ``20000 / len(tables["trans"]["df"])`` to + # generate 20,000 samples regardless of the training data size. But we control the + # synthetic data size directly here with ``number_of_points_to_synthesize``. + # ``sample_scale`` is later multiplied by the size of training data (no id) to determine + # the size of synthetic data. + assert len(tables["trans"].data) > 0, "Cannot synthesize: training data is empty" + sample_scale = self.training_config.number_of_points_to_synthesize / len(tables["trans"].data) + cleaned_tables, _, _ = clava_synthesizing( + tables, + relation_order, + self.training_config.save_dir, + models, + self.training_config.general, + self.training_config.sampling, + self.training_config.matching, + all_group_lengths_prob_dicts, + sample_scale=sample_scale, + ) + + result.synthetic_data = cleaned_tables["trans"] + + return result + + +# CTGAN implementation +class EnsembleAttackCTGANTrainingConfig(CTGANTrainingConfig, EnsembleAttackTrainingConfig): + model_config = ConfigDict(arbitrary_types_allowed=True) + + metadata: SingleTableMetadata | None = Field(default=None, exclude=True) + table_name: str | None = None + + +class CTGANTrainingResult(EnsembleAttackTrainingResult): + configs: EnsembleAttackCTGANTrainingConfig + models: dict[Relation, CTGANModelArtifacts] + + +class EnsembleAttackCTGANModelRunner(EnsembleAttackModelRunner): + training_config: EnsembleAttackCTGANTrainingConfig + + def train_or_fine_tune_and_synthesize( + self, + dataset: pd.DataFrame, + synthesize: bool = True, + trained_model: EnsembleAttackTrainingResult | None = None, + ) -> CTGANTrainingResult: + """ + Train or fine tune a CTGAN model on the provided dataset and optionally synthesize data. + + If no trained model is provided, a new model will be trained. Otherwise, the + provided model will be fine tuned. + + Args: + dataset: The dataset as a pandas DataFrame. + ID columns (column names containing "_id") are dropped before fitting.
+ synthesize: Flag indicating whether to generate synthetic data after training. + The number of points to synthesize and the save directory is controlled by + the `number_of_points_to_synthesize` and `save_dir` attributes of the training + config. Optional, default is True. + trained_model: The trained model to fine tune. If None, a new model will be trained. + + Returns: + A CTGANTrainingResult object containing: + - save_dir: Directory where results are saved. + - configs: Configuration dictionary used for training. + - models: The trained models. + - synthetic_data: The synthesized data as a pandas DataFrame, if synthesis was performed, + otherwise, None. + """ + assert self.training_config.save_dir is not None, "Save dir is not set" + assert self.training_config.metadata is not None, "Metadata is not set" + assert self.training_config.table_name is not None, "Table name is not set" + + dataset_without_ids = dataset.drop( + columns=[column_name for column_name in dataset.columns if "_id" in column_name] + ) + + if trained_model is None: + log(INFO, "Training new CTGAN model...") + ctgan = CTGANSynthesizer( + metadata=self.training_config.metadata, + epochs=self.training_config.training.epochs, + verbose=self.training_config.training.verbose, + ) + model_name = "trained_ctgan_model.pkl" + else: + log(INFO, "Fine tuning CTGAN model...") + ctgan = trained_model.models[(None, self.training_config.table_name)].model + model_name = "fine_tuned_ctgan_model.pkl" + + ctgan.fit(dataset_without_ids) + + results_file = self.training_config.save_dir / model_name + results_file.parent.mkdir(parents=True, exist_ok=True) + + ctgan.save(results_file) + + result = CTGANTrainingResult( + save_dir=self.training_config.save_dir, + configs=self.training_config, + models={ + (None, self.training_config.table_name): CTGANModelArtifacts(model=ctgan, model_file_path=results_file) + }, + ) + + if synthesize: + synthetic_data =
ctgan.sample(num_rows=self.training_config.number_of_points_to_synthesize) + result.synthetic_data = synthetic_data + + return result diff --git a/src/midst_toolkit/attacks/ensemble/process_split_data.py b/src/midst_toolkit/attacks/ensemble/process_split_data.py index bbf0360f..cd566c8c 100644 --- a/src/midst_toolkit/attacks/ensemble/process_split_data.py +++ b/src/midst_toolkit/attacks/ensemble/process_split_data.py @@ -9,6 +9,12 @@ from midst_toolkit.common.logger import log +PROCESSED_TRAIN_DATA_FILE_NAME = "master_challenge_train.csv" +PROCESSED_TEST_DATA_FILE_NAME = "master_challenge_test.csv" +PROCESSED_TRAIN_LABELS_FILE_NAME = "master_challenge_train_labels.npy" +PROCESSED_TEST_LABELS_FILE_NAME = "master_challenge_test_labels.npy" + + def split_real_data( df_real: pd.DataFrame, column_to_stratify: str | None = None, @@ -208,14 +214,14 @@ def process_split_data( save_dataframe(df_real_val, processed_attack_data_path, "real_val.csv") save_dataframe(df_real_test, processed_attack_data_path, "real_test.csv") - save_dataframe(df_val, processed_attack_data_path, "master_challenge_train.csv") + save_dataframe(df_val, processed_attack_data_path, PROCESSED_TRAIN_DATA_FILE_NAME) np.save( - processed_attack_data_path / "master_challenge_train_labels.npy", + processed_attack_data_path / PROCESSED_TRAIN_LABELS_FILE_NAME, y_val, ) - save_dataframe(df_test, processed_attack_data_path, "master_challenge_test.csv") + save_dataframe(df_test, processed_attack_data_path, PROCESSED_TEST_DATA_FILE_NAME) np.save( - processed_attack_data_path / "master_challenge_test_labels.npy", + processed_attack_data_path / PROCESSED_TEST_LABELS_FILE_NAME, y_test, ) log(INFO, f"Data splits saved to {processed_attack_data_path}") diff --git a/src/midst_toolkit/attacks/ensemble/rmia/shadow_model_training.py b/src/midst_toolkit/attacks/ensemble/rmia/shadow_model_training.py index 92b69088..78241179 100644 --- a/src/midst_toolkit/attacks/ensemble/rmia/shadow_model_training.py +++ 
b/src/midst_toolkit/attacks/ensemble/rmia/shadow_model_training.py @@ -3,40 +3,31 @@ import shutil from logging import INFO from pathlib import Path -from typing import Any, cast +from typing import Any import pandas as pd from omegaconf import DictConfig -from midst_toolkit.attacks.ensemble.shadow_model_utils import ( - ModelType, - TrainingResult, - fine_tune_tabddpm_and_synthesize, - save_additional_training_config, - train_or_fine_tune_and_synthesize_with_ctgan, - train_tabddpm_and_synthesize, -) -from midst_toolkit.common.config import ClavaDDPMTrainingConfig, CTGANTrainingConfig +from midst_toolkit.attacks.ensemble.models import EnsembleAttackModelRunner +from midst_toolkit.attacks.ensemble.shadow_model_utils import update_and_save_training_config from midst_toolkit.common.logger import log # TODO: This function and the next one can be unified later. def train_fine_tuned_shadow_models( + model_runner: EnsembleAttackModelRunner, n_models: int, n_reps: int, population_data: pd.DataFrame, master_challenge_data: pd.DataFrame, shadow_models_output_path: Path, training_json_config_paths: DictConfig, - fine_tuning_config: DictConfig, init_model_id: int, table_name: str, id_column_name: str, pre_training_data_size: int = 60000, - number_of_points_to_synthesize: int = 20000, init_data_seed: int | None = None, random_seed: int | None = None, - model_type: ModelType = ModelType.TABDDPM, ) -> Path: """ Train ``n_models`` shadow models that start from a pre-trained TabDDPM model and are fine-tuned on @@ -60,6 +51,8 @@ def train_fine_tuned_shadow_models( size of fine-tuning set. Args: + model_runner: The model runner to be used for training the shadow models. Should be an instance of + a subclass of `EnsembleAttackModelRunner`. n_models: Number of shadow models to train, must be even. n_reps: Number of repetitions for each challenge point in the fine-tuning set. population_data: The total population data that the attacker has access to. 
@@ -81,7 +74,6 @@ def train_fine_tuned_shadow_models( defaults to 20,000. init_data_seed: Random seed for the initial training set. random_seed: Random seed used for reproducibility, defaults to None. - model_type: Type of model to be used for training the shadow models. Defaults to ModelType.TABDDPM. Returns: The path where the shadow models and their artifacts are saved. @@ -118,39 +110,27 @@ def train_fine_tuned_shadow_models( ) # Train initial model with 60K data without any challenge points - # ``save_additional_training_config`` makes a personalized copy of the training config for each + # ``update_and_save_training_config`` makes a personalized copy of the training config for each # training model (here the base model). # All the shadow models will be saved under the base model data directory. - configs, save_dir = save_additional_training_config( + configs = update_and_save_training_config( + config=model_runner.training_config, data_dir=shadow_model_data_folder, - training_config_json_path=Path(training_json_config_paths.training_config_path), final_config_json_path=shadow_model_data_folder / f"{table_name}.json", # Path to the new json experiment_name="pre_trained_model", - model_type=model_type, ) + model_runner.training_config = configs # Train the initial model if it is not already trained and saved. 
- initial_model_path = save_dir / f"initial_model_rmia_{init_model_id}.pkl" + assert model_runner.training_config.save_dir is not None, "Save dir is not set" + initial_model_path = model_runner.training_config.save_dir / f"initial_model_rmia_{init_model_id}.pkl" if not initial_model_path.exists(): - log(INFO, f"Training initial {model_type.value} model with ID {init_model_id}...") - - initial_model_training_results: TrainingResult - if model_type == ModelType.TABDDPM: - initial_model_training_results = train_tabddpm_and_synthesize( - train, - cast(ClavaDDPMTrainingConfig, configs), - save_dir, - synthesize=False, - ) - elif model_type == ModelType.CTGAN: - initial_model_training_results = train_or_fine_tune_and_synthesize_with_ctgan( - train, - cast(CTGANTrainingConfig, configs), - save_dir, - synthesize=False, - ) - else: - raise ValueError(f"Invalid model type: {model_type}") + log(INFO, f"Training initial model with runner {model_runner}. Model ID {init_model_id}...") + + initial_model_training_results = model_runner.train_or_fine_tune_and_synthesize( + dataset=train, + synthesize=False, + ) # Save the initial model # Pickle dump the results @@ -193,27 +173,11 @@ def train_fine_tuned_shadow_models( # Shuffle the dataset selected_challenges = selected_challenges.sample(frac=1, random_state=random_seed).reset_index(drop=True) - if model_type == ModelType.TABDDPM: - train_result = fine_tune_tabddpm_and_synthesize( - trained_models=initial_model_training_results.models, - fine_tune_set=selected_challenges, - configs=cast(ClavaDDPMTrainingConfig, configs), - save_dir=save_dir, - fine_tuning_diffusion_iterations=fine_tuning_config.fine_tune_diffusion_iterations, - fine_tuning_classifier_iterations=fine_tuning_config.fine_tune_classifier_iterations, - synthesize=True, - number_of_points_to_synthesize=number_of_points_to_synthesize, - ) - elif model_type == ModelType.CTGAN: - train_result = train_or_fine_tune_and_synthesize_with_ctgan( - dataset=selected_challenges, - 
configs=cast(CTGANTrainingConfig, configs), - save_dir=save_dir, - synthesize=True, - trained_model=initial_model_training_results.models[(None, table_name)].model, - ) - else: - raise ValueError(f"Invalid model type: {model_type}") + train_result = model_runner.train_or_fine_tune_and_synthesize( + dataset=selected_challenges, + synthesize=True, + trained_model=initial_model_training_results, + ) assert train_result.synthetic_data is not None, "Fine-tuned models should generate synthetic data." log( @@ -223,7 +187,7 @@ def train_fine_tuned_shadow_models( attack_data["fine_tuned_results"].append(train_result.synthetic_data) # Pickle dump the results - result_path = Path(save_dir / "rmia_shadows.pkl") + result_path = model_runner.training_config.save_dir / "rmia_shadows.pkl" with open(result_path, "wb") as file: pickle.dump(attack_data, file) @@ -231,6 +195,7 @@ def train_fine_tuned_shadow_models( def train_shadow_on_half_challenge_data( + model_runner: EnsembleAttackModelRunner, n_models: int, n_reps: int, master_challenge_data: pd.DataFrame, @@ -238,9 +203,7 @@ def train_shadow_on_half_challenge_data( training_json_config_paths: DictConfig, table_name: str, id_column_name: str, - number_of_points_to_synthesize: int = 20000, random_seed: int | None = None, - model_type: ModelType = ModelType.TABDDPM, ) -> Path: """ 1. Create eight training sets with exactly half of the observations included in the challenge lists @@ -251,22 +214,21 @@ def train_shadow_on_half_challenge_data( 3. A synthetic dataset of 20K observations is generated for each model. Args: - n_models: number of shadow models to train, must be even. - n_reps: number of repetitions for each challenge point in the fine-tuning set. - master_challenge_data: The master challenge training dataset. - shadow_models_output_path: Path where the all datasets and information necessary to train shadow models - will be saved. 
- training_json_config_paths: Configuration dictionary containing paths to the data JSON config files. - An example of this config is provided in ``examples/ensemble_attack/config.yaml``. Required keys are: - - table_domain_file_path (str): Path to the table domain json file. - - dataset_meta_file_path (str): Path to dataset meta json file. - - training_config_path (str): Path to table's training config json file. - table_name: Name of the main table to be used for training the TabDDPM model. - id_column_name: Name of the ID column in the data. - number_of_points_to_synthesize: Size of the synthetic data to be generated by each shadow model, - defaults to 20,000. - random_seed: Random seed used for reproducibility, defaults to None. - model_type: Type of model to be used for training the shadow models. Defaults to ModelType.TABDDPM. + model_runner: The model runner to be used for training the shadow models. + Should be an instance of `EnsembleAttackModelRunner`. + n_models: number of shadow models to train, must be even. + n_reps: number of repetitions for each challenge point in the fine-tuning set. + master_challenge_data: The master challenge training dataset. + shadow_models_output_path: Path where all the datasets and information necessary to train shadow models + will be saved. + training_json_config_paths: Configuration dictionary containing paths to the data JSON config files. + An example of this config is provided in ``examples/ensemble_attack/config.yaml``. Required keys are: + - table_domain_file_path (str): Path to the table domain json file. + - dataset_meta_file_path (str): Path to dataset meta json file. + - training_config_path (str): Path to table's training config json file. + table_name: Name of the main table to be used for training the TabDDPM model. + id_column_name: Name of the ID column in the data. + random_seed: Random seed used for reproducibility, defaults to None. Returns: The path where the shadow models and their artifacts are saved.
@@ -297,13 +259,15 @@ def train_shadow_on_half_challenge_data( training_json_config_paths.dataset_meta_file_path, shadow_folder / "dataset_meta.json", ) - configs, save_dir = save_additional_training_config( + + configs = update_and_save_training_config( + config=model_runner.training_config, data_dir=shadow_folder, - training_config_json_path=Path(training_json_config_paths.training_config_path), final_config_json_path=shadow_folder / f"{table_name}.json", # Path to the new json experiment_name="trained_model", - model_type=model_type, ) + model_runner.training_config = configs + attack_data: dict[str, Any] = { "selected_sets": selected_id_lists, "trained_results": [], @@ -322,24 +286,7 @@ def train_shadow_on_half_challenge_data( # Shuffle the dataset selected_challenges = selected_challenges.sample(frac=1, random_state=random_seed).reset_index(drop=True) - train_result: TrainingResult - if model_type == ModelType.TABDDPM: - train_result = train_tabddpm_and_synthesize( - selected_challenges, - cast(ClavaDDPMTrainingConfig, configs), - save_dir, - synthesize=True, - number_of_points_to_synthesize=number_of_points_to_synthesize, - ) - elif model_type == ModelType.CTGAN: - train_result = train_or_fine_tune_and_synthesize_with_ctgan( - dataset=selected_challenges, - configs=cast(CTGANTrainingConfig, configs), - save_dir=save_dir, - synthesize=True, - ) - else: - raise ValueError(f"Invalid model type: {model_type}") + train_result = model_runner.train_or_fine_tune_and_synthesize(dataset=selected_challenges, synthesize=True) assert train_result.synthetic_data is not None, "Trained shadow model did not generate synthetic data." 
log( @@ -350,7 +297,8 @@ def train_shadow_on_half_challenge_data( attack_data["trained_results"].append(train_result.synthetic_data) # Pickle dump the results - result_path = Path(save_dir, "rmia_shadows_third_set.pkl") + assert model_runner.training_config.save_dir is not None, "Save dir is not set" + result_path = model_runner.training_config.save_dir / "rmia_shadows_third_set.pkl" with open(result_path, "wb") as file: pickle.dump(attack_data, file) @@ -358,6 +306,7 @@ def train_shadow_on_half_challenge_data( def train_three_sets_of_shadow_models( + model_runner: EnsembleAttackModelRunner, population_data: pd.DataFrame, master_challenge_data: pd.DataFrame, shadow_models_output_path: Path, @@ -367,9 +316,7 @@ def train_three_sets_of_shadow_models( id_column_name: str, n_models_per_set: int = 4, n_reps: int = 12, - number_of_points_to_synthesize: int = 20000, random_seed: int | None = None, - model_type: ModelType = ModelType.TABDDPM, ) -> tuple[Path, Path, Path]: """ Runs the shadow model training pipeline of the ensemble attack. This pipeline trains three sets of shadow models. @@ -396,6 +343,8 @@ def train_three_sets_of_shadow_models( Args: + model_runner: The model runner to be used for training the shadow models. + Should be an instance of `EnsembleAttackModelRunner`. population_data: The total population data used for pre-training some of the shadow models. master_challenge_data: The master challenge training dataset. shadow_models_output_path: Path where the all datasets and information (configs) necessary to @@ -416,10 +365,7 @@ def train_three_sets_of_shadow_models( id_column_name: Name of the ID column in the data. n_models_per_set: Number of shadow models to train by each approach. Must be an even number. Defaults to 4. n_reps: Number of repetitions for each challenge point in the fine-tuning or training sets, defaults to 12. - number_of_points_to_synthesize: Size of the synthetic data to be generated by each shadow model, - defaults to 20,000. 
random_seed: Random seed used for reproducibility, defaults to None. - model_type: Type of model to be used for training the shadow models. Defaults to ModelType.TABDDPM. Returns: Paths where the shadow models and their artifacts including synthetic data are saved for each of @@ -431,21 +377,19 @@ def train_three_sets_of_shadow_models( shadow_models_output_path.mkdir(parents=True, exist_ok=True) first_set_result_path = train_fine_tuned_shadow_models( + model_runner=model_runner, n_models=n_models_per_set, n_reps=n_reps, population_data=population_data, master_challenge_data=master_challenge_data, shadow_models_output_path=shadow_models_output_path, training_json_config_paths=training_json_config_paths, - fine_tuning_config=fine_tuning_config, init_model_id=1, # To distinguish these shadow models from the next ones table_name=table_name, id_column_name=id_column_name, pre_training_data_size=fine_tuning_config.pre_train_data_size, - number_of_points_to_synthesize=number_of_points_to_synthesize, init_data_seed=random_seed, random_seed=random_seed, - model_type=model_type, ) log( INFO, @@ -455,22 +399,20 @@ def train_three_sets_of_shadow_models( # with a new initial training set # in the hopes of increased performance (gain was minimal based on the submission comments)."" second_set_result_path = train_fine_tuned_shadow_models( + model_runner=model_runner, n_models=n_models_per_set, n_reps=n_reps, population_data=population_data, master_challenge_data=master_challenge_data, shadow_models_output_path=shadow_models_output_path, training_json_config_paths=training_json_config_paths, - fine_tuning_config=fine_tuning_config, init_model_id=2, # To distinguish these shadow models from the previous ones table_name=table_name, id_column_name=id_column_name, pre_training_data_size=fine_tuning_config.pre_train_data_size, - number_of_points_to_synthesize=number_of_points_to_synthesize, # Setting a different seed for the second train set init_data_seed=random_seed + 1 if 
random_seed is not None else None, random_seed=random_seed, - model_type=model_type, ) log( INFO, @@ -479,6 +421,7 @@ def train_three_sets_of_shadow_models( # Original codebase comment: "The following eight models are trained from scratch on the challenge points, # still in the hopes of increased performance (again the gain was minimal)."" third_set_result_path = train_shadow_on_half_challenge_data( + model_runner=model_runner, n_models=n_models_per_set * 2, n_reps=n_reps, master_challenge_data=master_challenge_data, @@ -486,9 +429,7 @@ def train_three_sets_of_shadow_models( training_json_config_paths=training_json_config_paths, table_name=table_name, id_column_name=id_column_name, - number_of_points_to_synthesize=number_of_points_to_synthesize, random_seed=random_seed, - model_type=model_type, ) log( INFO, diff --git a/src/midst_toolkit/attacks/ensemble/shadow_model_utils.py b/src/midst_toolkit/attacks/ensemble/shadow_model_utils.py index c03af364..029a0890 100644 --- a/src/midst_toolkit/attacks/ensemble/shadow_model_utils.py +++ b/src/midst_toolkit/attacks/ensemble/shadow_model_utils.py @@ -1,352 +1,50 @@ -import copy import json import os -from dataclasses import dataclass -from enum import Enum from logging import INFO from pathlib import Path -from typing import Any -import pandas as pd -from sdv.single_table import CTGANSynthesizer # type: ignore[import-untyped] - -from examples.gan.utils import get_single_table_svd_metadata, get_table_name -from midst_toolkit.attacks.ensemble.clavaddpm_fine_tuning import clava_fine_tuning -from midst_toolkit.common.config import ClavaDDPMTrainingConfig, CTGANTrainingConfig, TrainingConfig +from midst_toolkit.attacks.ensemble.models import EnsembleAttackTrainingConfig +from midst_toolkit.common.config import TrainingConfig from midst_toolkit.common.logger import log -from midst_toolkit.common.variables import DEVICE -from midst_toolkit.models.clavaddpm.clustering import clava_clustering -from 
midst_toolkit.models.clavaddpm.data_loaders import Tables, load_tables -from midst_toolkit.models.clavaddpm.enumerations import ( - GroupLengthsProbDicts, - Relation, - RelationOrder, -) -from midst_toolkit.models.clavaddpm.synthesizer import clava_synthesizing -from midst_toolkit.models.clavaddpm.train import ( - ClavaDDPMModelArtifacts, - CTGANModelArtifacts, - clava_training, -) - - -class ModelType(Enum): - TABDDPM = "tabddpm" - CTGAN = "ctgan" - - -@dataclass(kw_only=True) # Setting kw_only=True avoids and error with default values and inheritance -class TrainingResult: - save_dir: Path - configs: TrainingConfig - models: Any - synthetic_data: pd.DataFrame | None = None - - -@dataclass -class CTGANTrainingResult(TrainingResult): - configs: CTGANTrainingConfig - models: dict[Relation, CTGANModelArtifacts] -@dataclass -class TabDDPMTrainingResult(TrainingResult): - configs: ClavaDDPMTrainingConfig - models: dict[Relation, ClavaDDPMModelArtifacts] - tables: Tables - relation_order: RelationOrder - all_group_lengths_probabilities: GroupLengthsProbDicts - - -def save_additional_training_config( +def update_and_save_training_config( + config: EnsembleAttackTrainingConfig, data_dir: Path, - training_config_json_path: Path, final_config_json_path: Path, experiment_name: str = "attack_experiment", workspace_name: str = "shadow_workspace", - model_type: ModelType = ModelType.TABDDPM, -) -> tuple[TrainingConfig, Path]: +) -> EnsembleAttackTrainingConfig: """ - Modifies a TabDDPM configuration JSON file with the specified data directory, experiment name and workspace name, - and loads the resulting configuration. + Modifies a model configuration with the specified data directory, experiment name and workspace name, + and saves it to a JSON file. Args: - data_dir: Directory containing dataset_meta.json, trans_domain.json, and trans.json files. - training_config_json_path: Path to the original TabDDPM training configuration JSON file. 
- final_config_json_path: Path where the modified configuration JSON file will be saved. - experiment_name: Name of the experiment, used to create a unique save directory. - workspace_name: Name of the workspace, used to create a unique save directory. - model_type: Type of model to be used for training the shadow models. Defaults to ModelType.TABDDPM. + config: The training configuration to update. + data_dir: Directory containing dataset_meta.json, trans_domain.json, and trans.json files. + final_config_json_path: Path where the modified configuration JSON file will be saved. + experiment_name: Name of the experiment, used to create a unique save directory. + workspace_name: Name of the workspace, used to create a unique save directory. Returns: - configs: Loaded configuration dictionary for the model type. - save_dir: Directory path where results will be saved. + EnsembleAttackTrainingConfig: The updated training configuration. """ - # Modify the config file to give the correct training data and saving directory - with open(training_config_json_path, "r") as file: - configs: TrainingConfig - if model_type == ModelType.TABDDPM: - configs = ClavaDDPMTrainingConfig(**json.load(file)) - elif model_type == ModelType.CTGAN: - configs = CTGANTrainingConfig(**json.load(file)) - else: - raise ValueError(f"Invalid model type: {model_type}") - - configs.general.data_dir = data_dir + # Modify the config to have the correct training data and saving directory + config.general.data_dir = data_dir # Save dir is set by joining the workspace_dir and exp_name - configs.general.workspace_dir = data_dir / workspace_name - configs.general.exp_name = experiment_name + config.general.workspace_dir = data_dir / workspace_name + config.general.exp_name = experiment_name # save the changed to the new json file with open(final_config_json_path, "w") as file: - json.dump(configs.model_dump(mode="json"), file, indent=4) + json.dump(config.model_dump(mode="json"), file, indent=4) log(INFO, 
f"Config saved to {final_config_json_path}") # Set up the config - save_dir = setup_save_dir(configs) - - return configs, save_dir - - -# TODO: This and the next function should be unified later. -def train_tabddpm_and_synthesize( - train_set: pd.DataFrame, - configs: ClavaDDPMTrainingConfig, - save_dir: Path, - synthesize: bool = True, - number_of_points_to_synthesize: int = 20000, -) -> TabDDPMTrainingResult: - """ - Train a TabDDPM model on the provided training set and optionally synthesize data using the trained models. - - Args: - train_set: The training dataset as a pandas DataFrame. - configs: Configuration dictionary for TabDDPM. - save_dir: Directory path where models and results will be saved. - synthesize: Flag indicating whether to generate synthetic data after training. Defaults to True. - number_of_points_to_synthesize: Number of synthetic data samples to be generated. Defaults to 20000. - - Returns: - A dataclass TrainingResult object containing: - - save_dir: Directory where results are saved. - - configs: Configuration dictionary used for training. - - tables: Loaded tables after clustering. - - relation_order: Relation order of the tables. - - all_group_lengths_probabilities: Group lengths probability dictionaries. - - models: The trained models. - - synthetic_data: The synthesized data as a pandas DataFrame, if synthesis was performed, - otherwise, None. 
- """ - # Load tables - tables, relation_order, _ = load_tables(configs.general.data_dir, train_data={"trans": train_set}) - - # Clustering on the multi-table dataset - tables, all_group_lengths_prob_dicts = clava_clustering(tables, relation_order, save_dir, configs.clustering) - - # Train models - tables, models = clava_training( - tables, - relation_order, - save_dir, - diffusion_config=configs.diffusion, - classifier_config=configs.classifier, - device=DEVICE, - ) - result = TabDDPMTrainingResult( - save_dir=save_dir, - configs=configs, - tables=tables, - relation_order=relation_order, - all_group_lengths_probabilities=all_group_lengths_prob_dicts, - models=models, - ) - - if synthesize: - # By default, Ensemble attack generates a synthetic data of length ``20,000``. - # Attack's default sample_scale is set to ``20000 / len(tables["trans"]["df"])`` to - # generate 20,000 samples regardless of the training data size. But we control the - # synthetic data size directly here with ``number_of_points_to_synthesize``. - # ``sample_scale`` is later multiplied by the size of training data (no id) to determine - # the size of synthetic data. 
- assert len(tables["trans"].data) > 0, "Cannot synthesize: training data is empty" - sample_scale = number_of_points_to_synthesize / len(tables["trans"].data) - cleaned_tables, _, _ = clava_synthesizing( - tables, - relation_order, - save_dir, - models, - configs.general, - configs.sampling, - configs.matching, - all_group_lengths_prob_dicts, - sample_scale=sample_scale, - ) - - result.synthetic_data = cleaned_tables["trans"] - - return result - - -def fine_tune_tabddpm_and_synthesize( - trained_models: dict[Relation, ClavaDDPMModelArtifacts], - fine_tune_set: pd.DataFrame, - configs: ClavaDDPMTrainingConfig, - save_dir: Path, - fine_tuning_diffusion_iterations: int = 100, - fine_tuning_classifier_iterations: int = 10, - synthesize: bool = True, - number_of_points_to_synthesize: int = 20000, -) -> TrainingResult: - """ - Given the trained models and a new training set, fine-tune the TabDDPM models. - If ``synthesize`` is True, synthesizes data using the fine-tuned models. Number of - synthesized data points is determined by the ``classifier_scale`` parameter in training ``configs``. - - Args: - trained_models: The previously trained model material. - fine_tune_set: The new training dataset for fine-tuning. - configs: Configuration dictionary for TabDDPM. - save_dir: Directory path where models and results will be saved. - fine_tuning_diffusion_iterations: Diffusion iterations for fine tuning. Defaults to 100. - fine_tuning_classifier_iterations: Number of training iterations for the new classifier model. - Defaults to 10. - synthesize: Flag indicating whether to generate synthetic data after training. Defaults to True. - number_of_points_to_synthesize: Number of synthetic data samples to be generated. Defaults to 20000. - - - Returns: - A dataclass TrainingResult object containing: - - save_dir: Directory where results are saved. - - configs: Configuration dictionary used for training. - - tables: Loaded tables after clustering. 
- - relation_order: Relation order of the tables. - - all_group_lengths_probabilities: Group lengths probability dictionaries. - - models: The trained models. - - synthetic_data: The synthesized data as a pandas DataFrame, if synthesis was performed, - otherwise, None. - """ - # Load tables - new_tables, relation_order, _ = load_tables(configs.general.data_dir, train_data={"trans": fine_tune_set}) - - # Clustering on the multi-table dataset - # Original submission uses 'force_tables=True' to run the clustering even if checkpoint is found. - new_tables, all_group_lengths_prob_dicts = clava_clustering( - new_tables, relation_order, save_dir, configs.clustering - ) - - # Train models - copied_models = copy.deepcopy(trained_models) - new_models = clava_fine_tuning( - copied_models, - new_tables, - relation_order, - diffusion_config=configs.diffusion, - classifier_config=configs.classifier, - fine_tuning_diffusion_iterations=fine_tuning_diffusion_iterations, - fine_tuning_classifier_iterations=fine_tuning_classifier_iterations, - ) - result = TabDDPMTrainingResult( - save_dir=save_dir, - configs=configs, - tables=new_tables, - relation_order=relation_order, - all_group_lengths_probabilities=all_group_lengths_prob_dicts, - models=new_models, - ) - - if synthesize: - # By default, Ensemble attack generates a synthetic data of length ``20,000``. - # Attack's default sample_scale is set to ``20000 / len(tables["trans"]["df"])`` to - # generate 20,000 samples regardless of the training data size. But we control the - # synthetic data size directly here with ``number_of_points_to_synthesize``. - # ``sample_scale`` is later multiplied by the size of training data (no id) to determine - # the size of synthetic data. 
- assert len(new_tables["trans"].data) > 0, "Cannot synthesize: training data is empty" - sample_scale = number_of_points_to_synthesize / len(new_tables["trans"].data) - cleaned_tables, _, _ = clava_synthesizing( - new_tables, - relation_order, - save_dir, - new_models, - configs.general, - configs.sampling, - configs.matching, - all_group_lengths_prob_dicts, - sample_scale=sample_scale, - ) - - result.synthetic_data = cleaned_tables["trans"] - - return result - - -def train_or_fine_tune_and_synthesize_with_ctgan( - dataset: pd.DataFrame, - configs: CTGANTrainingConfig, - save_dir: Path, - synthesize: bool = True, - trained_model: CTGANSynthesizer | None = None, -) -> TrainingResult: - """ - Train or fine tune a CTGAN model on the provided dataset and optionally synthesize data. - - If no trained model is provided, a new model will be trained. Otherwise, the - provided model will be fine tuned. - - Args: - dataset: The dataset as a pandas DataFrame. - configs: Configuration dictionary for CTGAN. - save_dir: Directory path where models and results will be saved. - synthesize: Flag indicating whether to generate synthetic data after training. Defaults to True. - trained_model: The trained model to fine tune. If None, a new model will be trained. - - Returns: - A dataclass TrainingResult object containing: - - save_dir: Directory where results are saved. - - configs: Configuration dictionary used for training. - - models: The trained models. - - synthetic_data: The synthesized data as a pandas DataFrame, if synthesis was performed, - otherwise, None. 
- """ - table_name = get_table_name(configs.general.data_dir) - domain_file_path = configs.general.data_dir / f"{table_name}_domain.json" - with open(domain_file_path, "r") as file: - domain_dictionary = json.load(file) - - metadata, dataset_without_ids = get_single_table_svd_metadata(dataset, domain_dictionary) - - if trained_model is None: - log(INFO, "Training new CTGAN model...") - ctgan = CTGANSynthesizer( - metadata=metadata, - epochs=configs.training.epochs, - verbose=configs.training.verbose, - ) - model_name = "trained_ctgan_model.pkl" - else: - log(INFO, "Fine tuning CTGAN model...") - ctgan = trained_model - model_name = "fine_tuned_ctgan_model.pkl" - - ctgan.fit(dataset_without_ids) - - results_file = Path(save_dir) / model_name - results_file.parent.mkdir(parents=True, exist_ok=True) - - ctgan.save(results_file) - - result = CTGANTrainingResult( - save_dir=save_dir, - configs=configs, - models={(None, table_name): CTGANModelArtifacts(model=ctgan, model_file_path=results_file)}, - ) - - if synthesize: - synthetic_data = ctgan.sample(num_rows=configs.synthesizing.sample_size) - result.synthetic_data = synthetic_data + config.save_dir = setup_save_dir(config) - return result + return config # TODO: The following function is directly copied from the midst reference code since diff --git a/tests/integration/attacks/ensemble/assets/data_configs/trans.json b/tests/integration/attacks/ensemble/assets/data_configs/trans.json index d9a786df..2a77b82a 100644 --- a/tests/integration/attacks/ensemble/assets/data_configs/trans.json +++ b/tests/integration/attacks/ensemble/assets/data_configs/trans.json @@ -1,6 +1,6 @@ { "general": { - "data_dir": "tests/integration/attacks/ensemble/assets/shadow_models_data", + "data_dir": "tests/integration/attacks/ensemble/assets/data_configs", "exp_name": "ensemble_attack", "workspace_dir": "tests/integration/attacks/ensemble/assets/workspace", "sample_prefix": "", diff --git 
a/tests/integration/attacks/ensemble/test_shadow_model_training.py b/tests/integration/attacks/ensemble/test_shadow_model_training.py index 8008f97d..1241358a 100644 --- a/tests/integration/attacks/ensemble/test_shadow_model_training.py +++ b/tests/integration/attacks/ensemble/test_shadow_model_training.py @@ -1,8 +1,8 @@ import copy +import json import pickle import shutil from pathlib import Path -from typing import cast import pandas as pd import pytest @@ -10,16 +10,12 @@ from omegaconf import DictConfig from midst_toolkit.attacks.ensemble.data_utils import load_dataframe +from midst_toolkit.attacks.ensemble.models import EnsembleAttackTabDDPMModelRunner, EnsembleAttackTabDDPMTrainingConfig from midst_toolkit.attacks.ensemble.rmia.shadow_model_training import ( train_fine_tuned_shadow_models, train_shadow_on_half_challenge_data, ) -from midst_toolkit.attacks.ensemble.shadow_model_utils import ( - fine_tune_tabddpm_and_synthesize, - save_additional_training_config, - train_tabddpm_and_synthesize, -) -from midst_toolkit.common.config import ClavaDDPMTrainingConfig +from midst_toolkit.attacks.ensemble.shadow_model_utils import update_and_save_training_config POPULATION_DATA = load_dataframe( @@ -42,21 +38,30 @@ def test_train_fine_tuned_shadow_models(cfg: DictConfig, tmp_path: Path) -> None shadow_models_output_path = tmp_path # Input # Population data is used to pre-train some of the shadow models. 
+ with open(cfg.shadow_training.training_json_config_paths.training_config_path, "r") as file: + training_config = EnsembleAttackTabDDPMTrainingConfig(**json.load(file)) + training_config.fine_tuning_diffusion_iterations = ( + cfg.shadow_training.fine_tuning_config.fine_tune_diffusion_iterations + ) + training_config.fine_tuning_classifier_iterations = ( + cfg.shadow_training.fine_tuning_config.fine_tune_classifier_iterations + ) + training_config.number_of_points_to_synthesize = 5 + model_runner = EnsembleAttackTabDDPMModelRunner(training_config) result_path = train_fine_tuned_shadow_models( + model_runner=model_runner, n_models=2, n_reps=1, population_data=POPULATION_DATA, master_challenge_data=POPULATION_DATA[0:20], # Limiting the data to 20 samples for faster test execution shadow_models_output_path=shadow_models_output_path, training_json_config_paths=cfg.shadow_training.training_json_config_paths, - fine_tuning_config=cfg.shadow_training.fine_tuning_config, init_model_id=1, init_data_seed=cfg.random_seed, table_name="trans", id_column_name="trans_id", pre_training_data_size=cfg.shadow_training.fine_tuning_config.pre_train_data_size, - number_of_points_to_synthesize=5, random_seed=cfg.random_seed, ) # Expected saved models and synthesized data: @@ -87,7 +92,19 @@ def test_train_shadow_on_half_challenge_data(cfg: DictConfig, tmp_path: Path) -> shadow_models_output_path = tmp_path # Input # Population data is loaded and used as challenge data for testing purposes. 
+ with open(cfg.shadow_training.training_json_config_paths.training_config_path, "r") as file: + training_config = EnsembleAttackTabDDPMTrainingConfig(**json.load(file)) + training_config.fine_tuning_diffusion_iterations = ( + cfg.shadow_training.fine_tuning_config.fine_tune_diffusion_iterations + ) + training_config.fine_tuning_classifier_iterations = ( + cfg.shadow_training.fine_tuning_config.fine_tune_classifier_iterations + ) + training_config.number_of_points_to_synthesize = 5 + + model_runner = EnsembleAttackTabDDPMModelRunner(training_config) result_path = train_shadow_on_half_challenge_data( + model_runner=model_runner, n_models=2, n_reps=1, master_challenge_data=POPULATION_DATA[0:40], # Limiting the data to 40 samples for faster test execution @@ -95,7 +112,6 @@ def test_train_shadow_on_half_challenge_data(cfg: DictConfig, tmp_path: Path) -> training_json_config_paths=cfg.shadow_training.training_json_config_paths, table_name="trans", id_column_name="trans_id", - number_of_points_to_synthesize=5, random_seed=cfg.random_seed, ) # Expected saved models and synthesized data: @@ -137,21 +153,25 @@ def test_train_and_fine_tune_tabddpm(cfg: DictConfig, tmp_path: Path) -> None: cfg.shadow_training.training_json_config_paths.dataset_meta_file_path, tmp_training_dir / "dataset_meta.json", ) - configs, save_dir = save_additional_training_config( + with open(training_config_path, "r") as file: + configs = EnsembleAttackTabDDPMTrainingConfig(**json.load(file)) + + configs = update_and_save_training_config( + config=configs, data_dir=tmp_training_dir, - training_config_json_path=training_config_path, final_config_json_path=tmp_training_dir / "trans.json", experiment_name="test_experiment", workspace_name="test_workspace", ) - train_result = train_tabddpm_and_synthesize( + configs.number_of_points_to_synthesize = 99 + model_runner = EnsembleAttackTabDDPMModelRunner(configs) + + train_result = model_runner.train_or_fine_tune_and_synthesize( train_set, - 
cast(ClavaDDPMTrainingConfig, configs), - save_dir, synthesize=True, - number_of_points_to_synthesize=99, ) + assert train_result.synthetic_data is not None assert type(train_result.synthetic_data) is pd.DataFrame assert len(train_result.synthetic_data) == 99 @@ -161,16 +181,16 @@ def test_train_and_fine_tune_tabddpm(cfg: DictConfig, tmp_path: Path) -> None: assert len(train_result.models) == 1 # Only one model (TabDDPM) is trained. # Now fine-tune the trained TabDDPM model on a small set of data - fine_tuned_results = fine_tune_tabddpm_and_synthesize( - trained_models=train_result.models, - fine_tune_set=fine_tuning_set, # fine-tuning on the same data for testing purposes - configs=cast(ClavaDDPMTrainingConfig, configs), - save_dir=save_dir, - fine_tuning_diffusion_iterations=cfg.shadow_training.fine_tuning_config.fine_tune_diffusion_iterations, - fine_tuning_classifier_iterations=cfg.shadow_training.fine_tuning_config.fine_tune_classifier_iterations, - # Number of synthetic samples is defined according to tabddpm_training_config's classifier_scale value. 
+ configs.fine_tuning_diffusion_iterations = cfg.shadow_training.fine_tuning_config.fine_tune_diffusion_iterations + configs.fine_tuning_classifier_iterations = cfg.shadow_training.fine_tuning_config.fine_tune_classifier_iterations + model_runner = EnsembleAttackTabDDPMModelRunner(configs) + + fine_tuned_results = model_runner.train_or_fine_tune_and_synthesize( + dataset=fine_tuning_set, synthesize=False, + trained_model=train_result, ) + assert fine_tuned_results.synthetic_data is None assert fine_tuned_results.models is not None assert type(fine_tuned_results.models) is dict diff --git a/tests/unit/attacks/ensemble/test_process_data_split.py b/tests/unit/attacks/ensemble/test_process_data_split.py index 80023a8b..c67236d0 100644 --- a/tests/unit/attacks/ensemble/test_process_data_split.py +++ b/tests/unit/attacks/ensemble/test_process_data_split.py @@ -5,7 +5,13 @@ from omegaconf import DictConfig from midst_toolkit.attacks.ensemble.data_utils import load_dataframe -from midst_toolkit.attacks.ensemble.process_split_data import process_split_data +from midst_toolkit.attacks.ensemble.process_split_data import ( + PROCESSED_TEST_DATA_FILE_NAME, + PROCESSED_TEST_LABELS_FILE_NAME, + PROCESSED_TRAIN_DATA_FILE_NAME, + PROCESSED_TRAIN_LABELS_FILE_NAME, + process_split_data, +) @pytest.fixture(scope="module") @@ -38,10 +44,10 @@ def test_process_split_data(cfg: DictConfig, tmp_path: Path) -> None: assert (output_dir / "real_test.csv").exists() # Assert that the master challenge data files are saved in the provided path - assert (output_dir / "master_challenge_train.csv").exists() - assert (output_dir / "master_challenge_train_labels.npy").exists() - assert (output_dir / "master_challenge_test.csv").exists() - assert (output_dir / "master_challenge_test_labels.npy").exists() + assert (output_dir / PROCESSED_TRAIN_DATA_FILE_NAME).exists() + assert (output_dir / PROCESSED_TRAIN_LABELS_FILE_NAME).exists() + assert (output_dir / PROCESSED_TEST_DATA_FILE_NAME).exists() + assert 
(output_dir / PROCESSED_TEST_LABELS_FILE_NAME).exists() # Assert that the collected data has the expected number of rows and columns real_train = load_dataframe(output_dir, "real_train.csv") @@ -57,11 +63,11 @@ def test_process_split_data(cfg: DictConfig, tmp_path: Path) -> None: # Recall that `master_challenge_train`` consists of two halves: one half (20 samples) from `real_val`` data # with their "is_train" column set to 0, and the other half (20 samples) from the real train data (`real_train``) # with their "is_train" column set to 1. Note that ["is_train"] column is dropped in the final dataframes. - master_challenge_train = load_dataframe(output_dir, "master_challenge_train.csv") + master_challenge_train = load_dataframe(output_dir, PROCESSED_TRAIN_DATA_FILE_NAME) assert master_challenge_train.shape == (40, 10), f" Shape is {master_challenge_train.shape}" # Recall that `master_challenge_test`` consists of two halves: one half (20 samples) from `real_test`` data # with their "is_train" column set to 0, and the other half (20 samples) from the real train data (`real_train``) # with their "is_train" column set to 1. Note that ["is_train"] column is dropped in the final dataframes. 
- master_challenge_test = load_dataframe(output_dir, "master_challenge_test.csv") + master_challenge_test = load_dataframe(output_dir, PROCESSED_TEST_DATA_FILE_NAME) assert master_challenge_test.shape == (40, 10), f" Shape is {master_challenge_test.shape}" diff --git a/tests/unit/attacks/ensemble/test_shadow_model_utils.py b/tests/unit/attacks/ensemble/test_shadow_model_utils.py index 722918ea..ff0aa474 100644 --- a/tests/unit/attacks/ensemble/test_shadow_model_utils.py +++ b/tests/unit/attacks/ensemble/test_shadow_model_utils.py @@ -5,9 +5,8 @@ from hydra import compose, initialize from omegaconf import DictConfig -from midst_toolkit.attacks.ensemble.shadow_model_utils import ( - save_additional_training_config, -) +from midst_toolkit.attacks.ensemble.models import EnsembleAttackTabDDPMTrainingConfig +from midst_toolkit.attacks.ensemble.shadow_model_utils import update_and_save_training_config @pytest.fixture(scope="module") @@ -33,15 +32,18 @@ def test_save_additional_tabddpm_config(cfg: DictConfig, tmp_path: Path) -> None new_experiment_name = "test_experiment" final_json_path = tmp_path / "modified_config.json" - configs, save_dir = save_additional_training_config( + with open(tabddpm_config_path, "r") as file: + configs = EnsembleAttackTabDDPMTrainingConfig(**json.load(file)) + + configs = update_and_save_training_config( + config=configs, data_dir=new_data_dir, - training_config_json_path=tabddpm_config_path, final_config_json_path=final_json_path, experiment_name=new_experiment_name, workspace_name=new_workspace_name, ) - assert save_dir == new_data_dir / new_workspace_name / new_experiment_name + assert configs.save_dir == new_data_dir / new_workspace_name / new_experiment_name assert configs.general.data_dir == new_data_dir assert configs.general.workspace_dir == new_data_dir / new_workspace_name assert configs.general.exp_name == new_experiment_name @@ -50,5 +52,5 @@ def test_save_additional_tabddpm_config(cfg: DictConfig, tmp_path: Path) -> None assert 
old_workspace_dir != configs.general.workspace_dir assert old_exp_name != configs.general.exp_name # Ensure required directories are created - assert (save_dir / "models").exists() - assert (save_dir / "before_matching").exists() + assert (configs.save_dir / "models").exists() + assert (configs.save_dir / "before_matching").exists()