From f61a1eaf6aee09b9e42731dd6341e50c81a373cf Mon Sep 17 00:00:00 2001 From: Lucas Jia Date: Tue, 26 May 2026 00:29:53 -0700 Subject: [PATCH 1/2] test: unskip gpu intensive tests --- .../train/common_utils/finetune_utils.py | 1 + .../src/sagemaker/train/rlvr_trainer.py | 3 ++ sagemaker-train/tests/integ/train/conftest.py | 10 ++++++ .../train/test_dpo_trainer_integration.py | 16 +++++---- .../train/test_rlaif_trainer_integration.py | 22 +++++++----- .../train/test_rlvr_trainer_integration.py | 34 +++++++++---------- .../train/test_sft_trainer_integration.py | 34 ++++++++++--------- 7 files changed, 70 insertions(+), 50 deletions(-) diff --git a/sagemaker-train/src/sagemaker/train/common_utils/finetune_utils.py b/sagemaker-train/src/sagemaker/train/common_utils/finetune_utils.py index 6f37601aa4..0ea74ee207 100644 --- a/sagemaker-train/src/sagemaker/train/common_utils/finetune_utils.py +++ b/sagemaker-train/src/sagemaker/train/common_utils/finetune_utils.py @@ -105,6 +105,7 @@ def _resolve_mlflow_resource_arn(sagemaker_session, mlflow_resource_arn: Optiona return mlflow_resource_arn try: + mlflow_apps = MlflowApp.get_all( session=sagemaker_session.boto_session, region=sagemaker_session.boto_session.region_name diff --git a/sagemaker-train/src/sagemaker/train/rlvr_trainer.py b/sagemaker-train/src/sagemaker/train/rlvr_trainer.py index 49b35f124e..333a93fc55 100644 --- a/sagemaker-train/src/sagemaker/train/rlvr_trainer.py +++ b/sagemaker-train/src/sagemaker/train/rlvr_trainer.py @@ -171,6 +171,9 @@ def _process_hyperparameters(self): if hasattr(self.hyperparameters, 'reward_lambda_arn'): delattr(self.hyperparameters, 'reward_lambda_arn') self.hyperparameters._specs.pop('reward_lambda_arn', None) + if hasattr(self.hyperparameters, 'preset_reward_function'): + delattr(self.hyperparameters, 'preset_reward_function') + self.hyperparameters._specs.pop('preset_reward_function', None) if hasattr(self.hyperparameters, 'data_path'): delattr(self.hyperparameters, 'data_path') 
self.hyperparameters._specs.pop('data_path', None) diff --git a/sagemaker-train/tests/integ/train/conftest.py b/sagemaker-train/tests/integ/train/conftest.py index 9d9e7f53e0..1857a6262d 100644 --- a/sagemaker-train/tests/integ/train/conftest.py +++ b/sagemaker-train/tests/integ/train/conftest.py @@ -38,3 +38,13 @@ def sagemaker_session(): if region_manual_set and "AWS_DEFAULT_REGION" in os.environ: del os.environ["AWS_DEFAULT_REGION"] + + +NOVA_REGION = "us-east-1" + + +@pytest.fixture(scope="module") +def sagemaker_session_us_east_1(): + """Create a SageMaker session in us-east-1 for Nova model tests.""" + boto_session = boto3.Session(region_name=NOVA_REGION) + return Session(boto_session=boto_session) diff --git a/sagemaker-train/tests/integ/train/test_dpo_trainer_integration.py b/sagemaker-train/tests/integ/train/test_dpo_trainer_integration.py index 65cbd6c246..96a1d7f4d4 100644 --- a/sagemaker-train/tests/integ/train/test_dpo_trainer_integration.py +++ b/sagemaker-train/tests/integ/train/test_dpo_trainer_integration.py @@ -22,17 +22,18 @@ import pytest -@pytest.mark.skip(reason="Skipping GPU resource intensive test") def test_dpo_trainer_lora_complete_workflow(sagemaker_session): """Test complete DPO training workflow with LORA.""" + unique_id = f"{int(time.time())}-{random.randint(1000, 9999)}" # Create DPOTrainer instance with comprehensive configuration trainer = DPOTrainer( model="meta-textgeneration-llama-3-2-1b-instruct", training_type=TrainingType.LORA, model_package_group="sdk-test-finetuned-models", - training_dataset="arn:aws:sagemaker:us-west-2:729646638167:hub-content/sdktest/DataSet/dpo-oss-test-data/0.0.1", + training_dataset="s3://mc-flows-sdk-testing/input_data/dpo/preference_dataset_train_256.jsonl", s3_output_path="s3://mc-flows-sdk-testing/output/", - accept_eula=True + accept_eula=True, + base_job_name=f"dpo-lora-integ-{unique_id}", ) # Customize hyperparameters for quick training @@ -61,18 +62,19 @@ def 
test_dpo_trainer_lora_complete_workflow(sagemaker_session): assert training_job.output_model_package_arn is not None -@pytest.mark.skip(reason="Skipping GPU resource intensive test") def test_dpo_trainer_with_validation_dataset(sagemaker_session): """Test DPO trainer with both training and validation datasets.""" + unique_id = f"{int(time.time())}-{random.randint(1000, 9999)}" dpo_trainer = DPOTrainer( model="meta-textgeneration-llama-3-2-1b-instruct", training_type=TrainingType.LORA, model_package_group="sdk-test-finetuned-models", - training_dataset="arn:aws:sagemaker:us-west-2:729646638167:hub-content/sdktest/DataSet/dpo-oss-test-data/0.0.1", - validation_dataset="arn:aws:sagemaker:us-west-2:729646638167:hub-content/sdktest/DataSet/dpo-oss-test-data/0.0.1", + training_dataset="s3://mc-flows-sdk-testing/input_data/dpo/preference_dataset_train_256.jsonl", + validation_dataset="s3://mc-flows-sdk-testing/input_data/dpo/preference_dataset_train_256.jsonl", s3_output_path="s3://mc-flows-sdk-testing/output/", - accept_eula=True + accept_eula=True, + base_job_name=f"dpo-val-integ-{unique_id}", ) # Customize hyperparameters for quick training diff --git a/sagemaker-train/tests/integ/train/test_rlaif_trainer_integration.py b/sagemaker-train/tests/integ/train/test_rlaif_trainer_integration.py index 296d62bfd8..103484c53f 100644 --- a/sagemaker-train/tests/integ/train/test_rlaif_trainer_integration.py +++ b/sagemaker-train/tests/integ/train/test_rlaif_trainer_integration.py @@ -14,6 +14,7 @@ from __future__ import absolute_import import time +import random import boto3 from sagemaker.core.helper.session_helper import Session from sagemaker.train.rlaif_trainer import RLAIFTrainer @@ -21,9 +22,9 @@ import pytest -@pytest.mark.skip(reason="Skipping GPU resource intensive test") def test_rlaif_trainer_lora_complete_workflow(sagemaker_session): """Test complete RLAIF training workflow with LORA.""" + unique_id = f"{int(time.time())}-{random.randint(1000, 9999)}" rlaif_trainer = 
RLAIFTrainer( model="meta-textgeneration-llama-3-2-1b-instruct", @@ -33,9 +34,10 @@ def test_rlaif_trainer_lora_complete_workflow(sagemaker_session): reward_prompt='Builtin.Summarize', mlflow_experiment_name="test-rlaif-finetuned-models-exp", mlflow_run_name="test-rlaif-finetuned-models-run", - training_dataset="arn:aws:sagemaker:us-west-2:729646638167:hub-content/sdktest/DataSet/rlvr-rlaif-oss-test-data/0.0.1", + training_dataset="s3://mc-flows-sdk-testing/input_data/rlvr-rlaif-test-data/train_285.jsonl", s3_output_path="s3://mc-flows-sdk-testing/output/", - accept_eula=True + accept_eula=True, + base_job_name=f"rlaif-lora-integ-{unique_id}", ) # Create training job @@ -61,9 +63,9 @@ def test_rlaif_trainer_lora_complete_workflow(sagemaker_session): assert training_job.output_model_package_arn is not None -@pytest.mark.skip(reason="Skipping GPU resource intensive test") def test_rlaif_trainer_with_custom_reward_settings(sagemaker_session): """Test RLAIF trainer with different reward model and prompt.""" + unique_id = f"{int(time.time())}-{random.randint(1000, 9999)}" rlaif_trainer = RLAIFTrainer( model="meta-textgeneration-llama-3-2-1b-instruct", @@ -73,9 +75,10 @@ def test_rlaif_trainer_with_custom_reward_settings(sagemaker_session): reward_prompt="arn:aws:sagemaker:us-west-2:729646638167:hub-content/sdktest/JsonDoc/rlaif-test-prompt/0.0.1", mlflow_experiment_name="test-rlaif-finetuned-models-exp", mlflow_run_name="test-rlaif-finetuned-models-run", - training_dataset="arn:aws:sagemaker:us-west-2:729646638167:hub-content/sdktest/DataSet/rlvr-rlaif-oss-test-data/0.0.1", + training_dataset="s3://mc-flows-sdk-testing/input_data/rlvr-rlaif-test-data/train_285.jsonl", s3_output_path="s3://mc-flows-sdk-testing/output/", - accept_eula=True + accept_eula=True, + base_job_name=f"rlaif-rwd-integ-{unique_id}", ) training_job = rlaif_trainer.train(wait=False) @@ -100,9 +103,9 @@ def test_rlaif_trainer_with_custom_reward_settings(sagemaker_session): assert 
training_job.output_model_package_arn is not None -@pytest.mark.skip(reason="Skipping GPU resource intensive test") def test_rlaif_trainer_continued_finetuning(sagemaker_session): """Test complete RLAIF training workflow with LORA.""" + unique_id = f"{int(time.time())}-{random.randint(1000, 9999)}" rlaif_trainer = RLAIFTrainer( model="arn:aws:sagemaker:us-west-2:729646638167:model-package/sdk-test-finetuned-models/1", @@ -112,9 +115,10 @@ def test_rlaif_trainer_continued_finetuning(sagemaker_session): reward_prompt='Builtin.Summarize', mlflow_experiment_name="test-rlaif-finetuned-models-exp", mlflow_run_name="test-rlaif-finetuned-models-run", - training_dataset="arn:aws:sagemaker:us-west-2:729646638167:hub-content/sdktest/DataSet/rlvr-rlaif-oss-test-data/0.0.1", + training_dataset="s3://mc-flows-sdk-testing/input_data/rlvr-rlaif-test-data/train_285.jsonl", s3_output_path="s3://mc-flows-sdk-testing/output/", - accept_eula=True + accept_eula=True, + base_job_name=f"rlaif-cont-integ-{unique_id}", ) # Create training job diff --git a/sagemaker-train/tests/integ/train/test_rlvr_trainer_integration.py b/sagemaker-train/tests/integ/train/test_rlvr_trainer_integration.py index 63d3ae3134..951f9ab35d 100644 --- a/sagemaker-train/tests/integ/train/test_rlvr_trainer_integration.py +++ b/sagemaker-train/tests/integ/train/test_rlvr_trainer_integration.py @@ -13,8 +13,8 @@ """Integration tests for RLVR trainer""" from __future__ import absolute_import -import os import time +import random import pytest import boto3 from sagemaker.core.helper.session_helper import Session @@ -22,9 +22,9 @@ from sagemaker.train.common import TrainingType -@pytest.mark.skip(reason="Skipping GPU resource intensive test") def test_rlvr_trainer_lora_complete_workflow(sagemaker_session): """Test complete RLVR training workflow with LORA.""" + unique_id = f"{int(time.time())}-{random.randint(1000, 9999)}" rlvr_trainer = RLVRTrainer( model="meta-textgeneration-llama-3-2-1b-instruct", @@ -32,9 +32,10 @@ 
def test_rlvr_trainer_lora_complete_workflow(sagemaker_session): model_package_group="sdk-test-finetuned-models", mlflow_experiment_name="test-rlvr-finetuned-models-exp", mlflow_run_name="test-rlvr-finetuned-models-run", - training_dataset="arn:aws:sagemaker:us-west-2:729646638167:hub-content/sdktest/DataSet/rlvr-rlaif-oss-test-data/0.0.1", + training_dataset="s3://mc-flows-sdk-testing/input_data/rlvr-rlaif-test-data/train_285.jsonl", s3_output_path="s3://mc-flows-sdk-testing/output/", - accept_eula=True + accept_eula=True, + base_job_name=f"rlvr-lora-integ-{unique_id}", ) # Create training job @@ -60,9 +61,9 @@ def test_rlvr_trainer_lora_complete_workflow(sagemaker_session): assert training_job.output_model_package_arn is not None -@pytest.mark.skip(reason="Skipping GPU resource intensive test") def test_rlvr_trainer_with_custom_reward_function(sagemaker_session): """Test RLVR trainer with custom reward function.""" + unique_id = f"{int(time.time())}-{random.randint(1000, 9999)}" rlvr_trainer = RLVRTrainer( model="meta-textgeneration-llama-3-2-1b-instruct", @@ -70,10 +71,11 @@ def test_rlvr_trainer_with_custom_reward_function(sagemaker_session): model_package_group="sdk-test-finetuned-models", mlflow_experiment_name="test-rlvr-finetuned-models-exp", mlflow_run_name="test-rlvr-finetuned-models-run", - training_dataset="arn:aws:sagemaker:us-west-2:729646638167:hub-content/sdktest/DataSet/rlvr-rlaif-oss-test-data/0.0.1", + training_dataset="s3://mc-flows-sdk-testing/input_data/rlvr-rlaif-test-data/train_285.jsonl", s3_output_path="s3://mc-flows-sdk-testing/output/", custom_reward_function="arn:aws:sagemaker:us-west-2:729646638167:hub-content/sdktest/JsonDoc/rlvr-test-rf/0.0.1", - accept_eula=True + accept_eula=True, + base_job_name=f"rlvr-rf-integ-{unique_id}", ) training_job = rlvr_trainer.train(wait=False) @@ -98,14 +100,12 @@ def test_rlvr_trainer_with_custom_reward_function(sagemaker_session): assert training_job.output_model_package_arn is not None -# 
@pytest.mark.skipif(os.environ.get('AWS_DEFAULT_REGION') != 'us-east-1', reason="Nova models only available in us-east-1") -@pytest.mark.skip(reason="Skipping GPU resource intensive test") -def test_rlvr_trainer_nova_workflow(sagemaker_session): +@pytest.mark.skip(reason="TODO: Nova test to be enabled in us-east-1") +def test_rlvr_trainer_nova_workflow(sagemaker_session_us_east_1): """Test RLVR training workflow with Nova model.""" - import os - os.environ['SAGEMAKER_REGION'] = 'us-east-1' + # sagemaker_session_us_east_1 fixture is defined in conftest.py (us-east-1 region) - # For fine-tuning + unique_id = f"{int(time.time())}-{random.randint(1000, 9999)}" rlvr_trainer = RLVRTrainer( model="nova-textgeneration-lite-v2", model_package_group="sdk-test-finetuned-models", @@ -115,12 +115,10 @@ def test_rlvr_trainer_nova_workflow(sagemaker_session): validation_dataset="s3://mc-flows-sdk-testing-us-east-1/input_data/rlvr-nova/grpo-64-sample.jsonl", s3_output_path="s3://mc-flows-sdk-testing-us-east-1/output/", custom_reward_function="arn:aws:sagemaker:us-east-1:729646638167:hub-content/sdktest/JsonDoc/rlvr-nova-test-rf/0.0.1", - accept_eula=True + accept_eula=True, + sagemaker_session=sagemaker_session_us_east_1, + base_job_name=f"rlvr-nova-integ-{unique_id}", ) - rlvr_trainer.hyperparameters.data_s3_path = 's3://example-bucket' - - rlvr_trainer.hyperparameters.reward_lambda_arn = 'arn:aws:lambda:us-east-1:729646638167:function:rlvr-nova-reward-function' - training_job = rlvr_trainer.train(wait=False) # Manual wait loop diff --git a/sagemaker-train/tests/integ/train/test_sft_trainer_integration.py b/sagemaker-train/tests/integ/train/test_sft_trainer_integration.py index 98dd154c3f..c4f94aba91 100644 --- a/sagemaker-train/tests/integ/train/test_sft_trainer_integration.py +++ b/sagemaker-train/tests/integ/train/test_sft_trainer_integration.py @@ -13,8 +13,8 @@ """Integration tests for SFT trainer""" from __future__ import absolute_import -import os import time +import 
random import pytest import boto3 from sagemaker.core.helper.session_helper import Session @@ -22,17 +22,18 @@ from sagemaker.train.common import TrainingType -@pytest.mark.skip(reason="Skipping GPU resource intensive test") def test_sft_trainer_lora_complete_workflow(sagemaker_session): """Test complete SFT training workflow with LORA.""" + unique_id = f"{int(time.time())}-{random.randint(1000, 9999)}" sft_trainer = SFTTrainer( model="meta-textgeneration-llama-3-2-1b-instruct", training_type=TrainingType.LORA, model_package_group="arn:aws:sagemaker:us-west-2:729646638167:model-package-group/sdk-test-finetuned-models", - training_dataset="arn:aws:sagemaker:us-west-2:729646638167:hub-content/sdktest/DataSet/sft-oss-test-data/0.0.1", + training_dataset="s3://mc-flows-sdk-testing/input_data/sft/sample_data_256_final.jsonl", s3_output_path="s3://mc-flows-sdk-testing/output/", - accept_eula=True + accept_eula=True, + base_job_name=f"sft-lora-integ-{unique_id}", ) # Create training job @@ -58,17 +59,18 @@ def test_sft_trainer_lora_complete_workflow(sagemaker_session): assert training_job.output_model_package_arn is not None -@pytest.mark.skip(reason="Skipping GPU resource intensive test") def test_sft_trainer_with_validation_dataset(sagemaker_session): """Test SFT trainer with both training and validation datasets.""" + unique_id = f"{int(time.time())}-{random.randint(1000, 9999)}" sft_trainer = SFTTrainer( model="meta-textgeneration-llama-3-2-1b-instruct", training_type=TrainingType.LORA, model_package_group="arn:aws:sagemaker:us-west-2:729646638167:model-package-group/sdk-test-finetuned-models", - training_dataset="arn:aws:sagemaker:us-west-2:729646638167:hub-content/sdktest/DataSet/sft-oss-test-data/0.0.1", - validation_dataset="arn:aws:sagemaker:us-west-2:729646638167:hub-content/sdktest/DataSet/sft-oss-test-data/0.0.1", - accept_eula=True + training_dataset="s3://mc-flows-sdk-testing/input_data/sft/sample_data_256_final.jsonl", + 
validation_dataset="s3://mc-flows-sdk-testing/input_data/sft/sample_data_256_final.jsonl", + accept_eula=True, + base_job_name=f"sft-val-integ-{unique_id}", ) training_job = sft_trainer.train(wait=False) @@ -92,22 +94,22 @@ def test_sft_trainer_with_validation_dataset(sagemaker_session): assert hasattr(training_job, 'output_model_package_arn') -# @pytest.mark.skipif(os.environ.get('AWS_DEFAULT_REGION') != 'us-east-1', reason="Nova models only available in us-east-1") -@pytest.mark.skip(reason="Skipping GPU resource intensive test") -def test_sft_trainer_nova_workflow(sagemaker_session): +@pytest.mark.skip(reason="TODO: Nova test to be enabled in us-east-1") +def test_sft_trainer_nova_workflow(sagemaker_session_us_east_1): """Test SFT trainer with Nova model.""" - import os - os.environ['SAGEMAKER_REGION'] = 'us-east-1' + # sagemaker_session_us_east_1 fixture is defined in conftest.py (us-east-1 region) - # For fine-tuning + unique_id = f"{int(time.time())}-{random.randint(1000, 9999)}" sft_trainer_nova = SFTTrainer( model="nova-textgeneration-lite-v2", training_type=TrainingType.LORA, model_package_group="sdk-test-finetuned-models", mlflow_experiment_name="test-nova-finetuned-models-exp", mlflow_run_name="test-nova-finetuned-models-run", - training_dataset="arn:aws:sagemaker:us-east-1:729646638167:hub-content/sdktest/DataSet/sft-nova-test-dataset/0.0.1", - s3_output_path="s3://mc-flows-sdk-testing-us-east-1/output/" + training_dataset="s3://mc-flows-sdk-testing-us-east-1/input_data/sft-nova/sft_200_samples.jsonl", + s3_output_path="s3://mc-flows-sdk-testing-us-east-1/output/", + sagemaker_session=sagemaker_session_us_east_1, + base_job_name=f"sft-nova-integ-{unique_id}", ) # Create training job From 181df7138b70c2919d9b549c0241996ff6dc0940 Mon Sep 17 00:00:00 2001 From: Lucas Jia Date: Tue, 26 May 2026 14:15:44 -0700 Subject: [PATCH 2/2] tests: add gpu_intensive marker and GitHub Actions workflow to run GPU-intensive tests three times per day --- .github/workflows/gpu-integ-tests.yml | 
24 +++++++++++++++++++ .../integ/train/test_benchmark_evaluator.py | 2 ++ .../train/test_custom_scorer_evaluator.py | 2 ++ .../train/test_dpo_trainer_integration.py | 2 ++ .../train/test_llm_as_judge_base_model_fix.py | 2 ++ .../train/test_llm_as_judge_evaluator.py | 2 ++ .../train/test_rlaif_trainer_integration.py | 2 ++ .../train/test_rlvr_trainer_integration.py | 2 ++ .../train/test_sft_trainer_integration.py | 2 ++ .../integ/train/test_tuner_distributed.py | 2 ++ sagemaker-train/tox.ini | 1 + 11 files changed, 43 insertions(+) create mode 100644 .github/workflows/gpu-integ-tests.yml diff --git a/.github/workflows/gpu-integ-tests.yml b/.github/workflows/gpu-integ-tests.yml new file mode 100644 index 0000000000..a53d76d8e3 --- /dev/null +++ b/.github/workflows/gpu-integ-tests.yml @@ -0,0 +1,24 @@ +name: GPU Integ Tests +on: + schedule: + - cron: "0 */8 * * *" + workflow_dispatch: + +permissions: + id-token: write # This is required for requesting the JWT + +jobs: + gpu-integ-tests: + runs-on: ubuntu-latest + steps: + - name: Configure AWS Credentials + uses: aws-actions/configure-aws-credentials@v4 + with: + role-to-assume: ${{ secrets.CI_AWS_ROLE_ARN }} + aws-region: us-west-2 + role-duration-seconds: 10800 + - name: Run GPU Integ Tests + uses: aws-actions/aws-codebuild-run-build@v1 + with: + project-name: sagemaker-python-sdk-ci-health-gpu-integ-tests + source-version: refs/heads/master diff --git a/sagemaker-train/tests/integ/train/test_benchmark_evaluator.py b/sagemaker-train/tests/integ/train/test_benchmark_evaluator.py index 0db9b856d0..9cbfc13011 100644 --- a/sagemaker-train/tests/integ/train/test_benchmark_evaluator.py +++ b/sagemaker-train/tests/integ/train/test_benchmark_evaluator.py @@ -23,6 +23,8 @@ EvaluationPipelineExecution, ) +pytestmark = pytest.mark.gpu_intensive + # Configure logging logging.basicConfig( level=logging.INFO, diff --git a/sagemaker-train/tests/integ/train/test_custom_scorer_evaluator.py 
b/sagemaker-train/tests/integ/train/test_custom_scorer_evaluator.py index 54cec846dc..a4e6044ed9 100644 --- a/sagemaker-train/tests/integ/train/test_custom_scorer_evaluator.py +++ b/sagemaker-train/tests/integ/train/test_custom_scorer_evaluator.py @@ -22,6 +22,8 @@ EvaluationPipelineExecution, ) +pytestmark = pytest.mark.gpu_intensive + # Configure logging logging.basicConfig( level=logging.INFO, diff --git a/sagemaker-train/tests/integ/train/test_dpo_trainer_integration.py b/sagemaker-train/tests/integ/train/test_dpo_trainer_integration.py index 96a1d7f4d4..34a159a2d5 100644 --- a/sagemaker-train/tests/integ/train/test_dpo_trainer_integration.py +++ b/sagemaker-train/tests/integ/train/test_dpo_trainer_integration.py @@ -21,6 +21,8 @@ from sagemaker.train.common import TrainingType import pytest +pytestmark = pytest.mark.gpu_intensive + def test_dpo_trainer_lora_complete_workflow(sagemaker_session): """Test complete DPO training workflow with LORA.""" diff --git a/sagemaker-train/tests/integ/train/test_llm_as_judge_base_model_fix.py b/sagemaker-train/tests/integ/train/test_llm_as_judge_base_model_fix.py index 1da31f71c6..1883c99be3 100644 --- a/sagemaker-train/tests/integ/train/test_llm_as_judge_base_model_fix.py +++ b/sagemaker-train/tests/integ/train/test_llm_as_judge_base_model_fix.py @@ -28,6 +28,8 @@ EvaluationPipelineExecution, ) +pytestmark = pytest.mark.gpu_intensive + # Configure logging logging.basicConfig( level=logging.INFO, diff --git a/sagemaker-train/tests/integ/train/test_llm_as_judge_evaluator.py b/sagemaker-train/tests/integ/train/test_llm_as_judge_evaluator.py index 02b528bfa3..84318b93ea 100644 --- a/sagemaker-train/tests/integ/train/test_llm_as_judge_evaluator.py +++ b/sagemaker-train/tests/integ/train/test_llm_as_judge_evaluator.py @@ -22,6 +22,8 @@ EvaluationPipelineExecution, ) +pytestmark = pytest.mark.gpu_intensive + # Configure logging logging.basicConfig( level=logging.INFO, diff --git 
a/sagemaker-train/tests/integ/train/test_rlaif_trainer_integration.py b/sagemaker-train/tests/integ/train/test_rlaif_trainer_integration.py index 103484c53f..3c52d5f8fe 100644 --- a/sagemaker-train/tests/integ/train/test_rlaif_trainer_integration.py +++ b/sagemaker-train/tests/integ/train/test_rlaif_trainer_integration.py @@ -21,6 +21,8 @@ from sagemaker.train.common import TrainingType import pytest +pytestmark = pytest.mark.gpu_intensive + def test_rlaif_trainer_lora_complete_workflow(sagemaker_session): """Test complete RLAIF training workflow with LORA.""" diff --git a/sagemaker-train/tests/integ/train/test_rlvr_trainer_integration.py b/sagemaker-train/tests/integ/train/test_rlvr_trainer_integration.py index 951f9ab35d..5d5883c307 100644 --- a/sagemaker-train/tests/integ/train/test_rlvr_trainer_integration.py +++ b/sagemaker-train/tests/integ/train/test_rlvr_trainer_integration.py @@ -21,6 +21,8 @@ from sagemaker.train.rlvr_trainer import RLVRTrainer from sagemaker.train.common import TrainingType +pytestmark = pytest.mark.gpu_intensive + def test_rlvr_trainer_lora_complete_workflow(sagemaker_session): """Test complete RLVR training workflow with LORA.""" diff --git a/sagemaker-train/tests/integ/train/test_sft_trainer_integration.py b/sagemaker-train/tests/integ/train/test_sft_trainer_integration.py index c4f94aba91..4c2688b8f3 100644 --- a/sagemaker-train/tests/integ/train/test_sft_trainer_integration.py +++ b/sagemaker-train/tests/integ/train/test_sft_trainer_integration.py @@ -21,6 +21,8 @@ from sagemaker.train.sft_trainer import SFTTrainer from sagemaker.train.common import TrainingType +pytestmark = pytest.mark.gpu_intensive + def test_sft_trainer_lora_complete_workflow(sagemaker_session): """Test complete SFT training workflow with LORA.""" diff --git a/sagemaker-train/tests/integ/train/test_tuner_distributed.py b/sagemaker-train/tests/integ/train/test_tuner_distributed.py index 733f6a48a8..876116a614 100644 --- 
a/sagemaker-train/tests/integ/train/test_tuner_distributed.py +++ b/sagemaker-train/tests/integ/train/test_tuner_distributed.py @@ -28,6 +28,8 @@ from sagemaker.train.configs import SourceCode, Compute from sagemaker.train.distributed import Torchrun from sagemaker.train.tuner import HyperparameterTuner + +pytestmark = pytest.mark.gpu_intensive from sagemaker.core.parameter import ContinuousParameter logger = logging.getLogger(__name__) diff --git a/sagemaker-train/tox.ini b/sagemaker-train/tox.ini index 136b99c69c..028925d95f 100644 --- a/sagemaker-train/tox.ini +++ b/sagemaker-train/tox.ini @@ -62,6 +62,7 @@ markers = slow_test release image_uris_unit_test + gpu_intensive: mark a test as GPU resource intensive (runs on scheduled CI, not PR checks). timeout: mark a test as a timeout. serial: marks tests that must run serially (not in parallel)