Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 24 additions & 0 deletions .github/workflows/gpu-integ-tests.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
# Workflow that obtains AWS credentials for a CI role and then starts the
# GPU integration-test CodeBuild project, on a schedule or on manual request.
name: GPU Integ Tests
on:
schedule:
# Fires at minute 0 of every 8th hour (00:00, 08:00, 16:00).
- cron: "0 */8 * * *"
# Allows the workflow to be started manually as well.
workflow_dispatch:

permissions:
id-token: write # This is required for requesting the JWT

jobs:
gpu-integ-tests:
runs-on: ubuntu-latest
steps:
# Step 1: assume the role whose ARN is stored in the CI_AWS_ROLE_ARN secret.
- name: Configure AWS Credentials
uses: aws-actions/configure-aws-credentials@v4
with:
role-to-assume: ${{ secrets.CI_AWS_ROLE_ARN }}
aws-region: us-west-2
Comment thread
lucasjia-aws marked this conversation as resolved.
# 10800 seconds = 3 hours requested for the assumed-role credentials.
role-duration-seconds: 10800
# Step 2: run the named CodeBuild project against the master branch ref.
- name: Run GPU Integ Tests
uses: aws-actions/aws-codebuild-run-build@v1
with:
project-name: sagemaker-python-sdk-ci-health-gpu-integ-tests
source-version: refs/heads/master
Original file line number Diff line number Diff line change
Expand Up @@ -105,6 +105,7 @@ def _resolve_mlflow_resource_arn(sagemaker_session, mlflow_resource_arn: Optiona
return mlflow_resource_arn

try:

mlflow_apps = MlflowApp.get_all(
session=sagemaker_session.boto_session,
region=sagemaker_session.boto_session.region_name
Expand Down
3 changes: 3 additions & 0 deletions sagemaker-train/src/sagemaker/train/rlvr_trainer.py
Original file line number Diff line number Diff line change
Expand Up @@ -171,6 +171,9 @@ def _process_hyperparameters(self):
if hasattr(self.hyperparameters, 'reward_lambda_arn'):
delattr(self.hyperparameters, 'reward_lambda_arn')
self.hyperparameters._specs.pop('reward_lambda_arn', None)
if hasattr(self.hyperparameters, 'preset_reward_function'):
delattr(self.hyperparameters, 'preset_reward_function')
self.hyperparameters._specs.pop('preset_reward_function', None)
if hasattr(self.hyperparameters, 'data_path'):
delattr(self.hyperparameters, 'data_path')
self.hyperparameters._specs.pop('data_path', None)
Expand Down
10 changes: 10 additions & 0 deletions sagemaker-train/tests/integ/train/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,3 +38,13 @@ def sagemaker_session():

if region_manual_set and "AWS_DEFAULT_REGION" in os.environ:
del os.environ["AWS_DEFAULT_REGION"]


NOVA_REGION = "us-east-1"


@pytest.fixture(scope="module")
def sagemaker_session_us_east_1():
    """Return a module-scoped SageMaker ``Session`` for Nova model tests.

    The session wraps a boto3 session whose region is ``NOVA_REGION``
    (us-east-1), independent of the region used by other fixtures.
    """
    return Session(boto_session=boto3.Session(region_name=NOVA_REGION))
2 changes: 2 additions & 0 deletions sagemaker-train/tests/integ/train/test_benchmark_evaluator.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,8 @@
EvaluationPipelineExecution,
)

pytestmark = pytest.mark.gpu_intensive

# Configure logging
logging.basicConfig(
level=logging.INFO,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,8 @@
EvaluationPipelineExecution,
)

pytestmark = pytest.mark.gpu_intensive

# Configure logging
logging.basicConfig(
level=logging.INFO,
Expand Down
18 changes: 11 additions & 7 deletions sagemaker-train/tests/integ/train/test_dpo_trainer_integration.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,18 +21,21 @@
from sagemaker.train.common import TrainingType
import pytest

pytestmark = pytest.mark.gpu_intensive


@pytest.mark.skip(reason="Skipping GPU resource intensive test")
def test_dpo_trainer_lora_complete_workflow(sagemaker_session):
"""Test complete DPO training workflow with LORA."""
unique_id = f"{int(time.time())}-{random.randint(1000, 9999)}"
# Create DPOTrainer instance with comprehensive configuration
trainer = DPOTrainer(
model="meta-textgeneration-llama-3-2-1b-instruct",
training_type=TrainingType.LORA,
model_package_group="sdk-test-finetuned-models",
training_dataset="arn:aws:sagemaker:us-west-2:729646638167:hub-content/sdktest/DataSet/dpo-oss-test-data/0.0.1",
training_dataset="s3://mc-flows-sdk-testing/input_data/dpo/preference_dataset_train_256.jsonl",
s3_output_path="s3://mc-flows-sdk-testing/output/",
accept_eula=True
accept_eula=True,
base_job_name=f"dpo-lora-integ-{unique_id}",
)

# Customize hyperparameters for quick training
Expand Down Expand Up @@ -61,18 +64,19 @@ def test_dpo_trainer_lora_complete_workflow(sagemaker_session):
assert training_job.output_model_package_arn is not None


@pytest.mark.skip(reason="Skipping GPU resource intensive test")
def test_dpo_trainer_with_validation_dataset(sagemaker_session):
"""Test DPO trainer with both training and validation datasets."""
unique_id = f"{int(time.time())}-{random.randint(1000, 9999)}"

dpo_trainer = DPOTrainer(
model="meta-textgeneration-llama-3-2-1b-instruct",
training_type=TrainingType.LORA,
model_package_group="sdk-test-finetuned-models",
training_dataset="arn:aws:sagemaker:us-west-2:729646638167:hub-content/sdktest/DataSet/dpo-oss-test-data/0.0.1",
validation_dataset="arn:aws:sagemaker:us-west-2:729646638167:hub-content/sdktest/DataSet/dpo-oss-test-data/0.0.1",
training_dataset="s3://mc-flows-sdk-testing/input_data/dpo/preference_dataset_train_256.jsonl",
validation_dataset="s3://mc-flows-sdk-testing/input_data/dpo/preference_dataset_train_256.jsonl",
s3_output_path="s3://mc-flows-sdk-testing/output/",
accept_eula=True
accept_eula=True,
base_job_name=f"dpo-val-integ-{unique_id}",
)

# Customize hyperparameters for quick training
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,8 @@
EvaluationPipelineExecution,
)

pytestmark = pytest.mark.gpu_intensive

# Configure logging
logging.basicConfig(
level=logging.INFO,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,8 @@
EvaluationPipelineExecution,
)

pytestmark = pytest.mark.gpu_intensive

# Configure logging
logging.basicConfig(
level=logging.INFO,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,16 +14,19 @@
from __future__ import absolute_import

import time
import random
import boto3
from sagemaker.core.helper.session_helper import Session
from sagemaker.train.rlaif_trainer import RLAIFTrainer
from sagemaker.train.common import TrainingType
import pytest

pytestmark = pytest.mark.gpu_intensive


@pytest.mark.skip(reason="Skipping GPU resource intensive test")
def test_rlaif_trainer_lora_complete_workflow(sagemaker_session):
"""Test complete RLAIF training workflow with LORA."""
unique_id = f"{int(time.time())}-{random.randint(1000, 9999)}"

rlaif_trainer = RLAIFTrainer(
model="meta-textgeneration-llama-3-2-1b-instruct",
Expand All @@ -33,9 +36,10 @@ def test_rlaif_trainer_lora_complete_workflow(sagemaker_session):
reward_prompt='Builtin.Summarize',
mlflow_experiment_name="test-rlaif-finetuned-models-exp",
mlflow_run_name="test-rlaif-finetuned-models-run",
training_dataset="arn:aws:sagemaker:us-west-2:729646638167:hub-content/sdktest/DataSet/rlvr-rlaif-oss-test-data/0.0.1",
training_dataset="s3://mc-flows-sdk-testing/input_data/rlvr-rlaif-test-data/train_285.jsonl",
s3_output_path="s3://mc-flows-sdk-testing/output/",
accept_eula=True
accept_eula=True,
base_job_name=f"rlaif-lora-integ-{unique_id}",
)

# Create training job
Expand All @@ -61,9 +65,9 @@ def test_rlaif_trainer_lora_complete_workflow(sagemaker_session):
assert training_job.output_model_package_arn is not None


@pytest.mark.skip(reason="Skipping GPU resource intensive test")
def test_rlaif_trainer_with_custom_reward_settings(sagemaker_session):
"""Test RLAIF trainer with different reward model and prompt."""
unique_id = f"{int(time.time())}-{random.randint(1000, 9999)}"

rlaif_trainer = RLAIFTrainer(
model="meta-textgeneration-llama-3-2-1b-instruct",
Expand All @@ -73,9 +77,10 @@ def test_rlaif_trainer_with_custom_reward_settings(sagemaker_session):
reward_prompt="arn:aws:sagemaker:us-west-2:729646638167:hub-content/sdktest/JsonDoc/rlaif-test-prompt/0.0.1",
mlflow_experiment_name="test-rlaif-finetuned-models-exp",
mlflow_run_name="test-rlaif-finetuned-models-run",
training_dataset="arn:aws:sagemaker:us-west-2:729646638167:hub-content/sdktest/DataSet/rlvr-rlaif-oss-test-data/0.0.1",
training_dataset="s3://mc-flows-sdk-testing/input_data/rlvr-rlaif-test-data/train_285.jsonl",
s3_output_path="s3://mc-flows-sdk-testing/output/",
accept_eula=True
accept_eula=True,
base_job_name=f"rlaif-rwd-integ-{unique_id}",
)

training_job = rlaif_trainer.train(wait=False)
Expand All @@ -100,9 +105,9 @@ def test_rlaif_trainer_with_custom_reward_settings(sagemaker_session):
assert training_job.output_model_package_arn is not None


@pytest.mark.skip(reason="Skipping GPU resource intensive test")
def test_rlaif_trainer_continued_finetuning(sagemaker_session):
"""Test complete RLAIF training workflow with LORA."""
unique_id = f"{int(time.time())}-{random.randint(1000, 9999)}"

rlaif_trainer = RLAIFTrainer(
model="arn:aws:sagemaker:us-west-2:729646638167:model-package/sdk-test-finetuned-models/1",
Expand All @@ -112,9 +117,10 @@ def test_rlaif_trainer_continued_finetuning(sagemaker_session):
reward_prompt='Builtin.Summarize',
mlflow_experiment_name="test-rlaif-finetuned-models-exp",
mlflow_run_name="test-rlaif-finetuned-models-run",
training_dataset="arn:aws:sagemaker:us-west-2:729646638167:hub-content/sdktest/DataSet/rlvr-rlaif-oss-test-data/0.0.1",
training_dataset="s3://mc-flows-sdk-testing/input_data/rlvr-rlaif-test-data/train_285.jsonl",
s3_output_path="s3://mc-flows-sdk-testing/output/",
accept_eula=True
accept_eula=True,
base_job_name=f"rlaif-cont-integ-{unique_id}",
)

# Create training job
Expand Down
36 changes: 18 additions & 18 deletions sagemaker-train/tests/integ/train/test_rlvr_trainer_integration.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,28 +13,31 @@
"""Integration tests for RLVR trainer"""
from __future__ import absolute_import

import os
import time
import random
import pytest
import boto3
from sagemaker.core.helper.session_helper import Session
from sagemaker.train.rlvr_trainer import RLVRTrainer
from sagemaker.train.common import TrainingType

pytestmark = pytest.mark.gpu_intensive


@pytest.mark.skip(reason="Skipping GPU resource intensive test")
def test_rlvr_trainer_lora_complete_workflow(sagemaker_session):
"""Test complete RLVR training workflow with LORA."""
unique_id = f"{int(time.time())}-{random.randint(1000, 9999)}"

rlvr_trainer = RLVRTrainer(
model="meta-textgeneration-llama-3-2-1b-instruct",
training_type=TrainingType.LORA,
model_package_group="sdk-test-finetuned-models",
mlflow_experiment_name="test-rlvr-finetuned-models-exp",
mlflow_run_name="test-rlvr-finetuned-models-run",
training_dataset="arn:aws:sagemaker:us-west-2:729646638167:hub-content/sdktest/DataSet/rlvr-rlaif-oss-test-data/0.0.1",
training_dataset="s3://mc-flows-sdk-testing/input_data/rlvr-rlaif-test-data/train_285.jsonl",
s3_output_path="s3://mc-flows-sdk-testing/output/",
accept_eula=True
accept_eula=True,
base_job_name=f"rlvr-lora-integ-{unique_id}",
)

# Create training job
Expand All @@ -60,20 +63,21 @@ def test_rlvr_trainer_lora_complete_workflow(sagemaker_session):
assert training_job.output_model_package_arn is not None


@pytest.mark.skip(reason="Skipping GPU resource intensive test")
def test_rlvr_trainer_with_custom_reward_function(sagemaker_session):
"""Test RLVR trainer with custom reward function."""
unique_id = f"{int(time.time())}-{random.randint(1000, 9999)}"

rlvr_trainer = RLVRTrainer(
model="meta-textgeneration-llama-3-2-1b-instruct",
training_type=TrainingType.LORA,
model_package_group="sdk-test-finetuned-models",
mlflow_experiment_name="test-rlvr-finetuned-models-exp",
mlflow_run_name="test-rlvr-finetuned-models-run",
training_dataset="arn:aws:sagemaker:us-west-2:729646638167:hub-content/sdktest/DataSet/rlvr-rlaif-oss-test-data/0.0.1",
training_dataset="s3://mc-flows-sdk-testing/input_data/rlvr-rlaif-test-data/train_285.jsonl",
s3_output_path="s3://mc-flows-sdk-testing/output/",
custom_reward_function="arn:aws:sagemaker:us-west-2:729646638167:hub-content/sdktest/JsonDoc/rlvr-test-rf/0.0.1",
accept_eula=True
accept_eula=True,
base_job_name=f"rlvr-rf-integ-{unique_id}",
)

training_job = rlvr_trainer.train(wait=False)
Expand All @@ -98,14 +102,12 @@ def test_rlvr_trainer_with_custom_reward_function(sagemaker_session):
assert training_job.output_model_package_arn is not None


# @pytest.mark.skipif(os.environ.get('AWS_DEFAULT_REGION') != 'us-east-1', reason="Nova models only available in us-east-1")
@pytest.mark.skip(reason="Skipping GPU resource intensive test")
def test_rlvr_trainer_nova_workflow(sagemaker_session):
@pytest.mark.skip(reason="TODO: Nova test to be enabled in us-east-1")
def test_rlvr_trainer_nova_workflow(sagemaker_session_us_east_1):
"""Test RLVR training workflow with Nova model."""
import os
os.environ['SAGEMAKER_REGION'] = 'us-east-1'
# sagemaker_session_us_east_1 fixture is defined in conftest.py (us-east-1 region)

# For fine-tuning
unique_id = f"{int(time.time())}-{random.randint(1000, 9999)}"
rlvr_trainer = RLVRTrainer(
model="nova-textgeneration-lite-v2",
model_package_group="sdk-test-finetuned-models",
Expand All @@ -115,12 +117,10 @@ def test_rlvr_trainer_nova_workflow(sagemaker_session):
validation_dataset="s3://mc-flows-sdk-testing-us-east-1/input_data/rlvr-nova/grpo-64-sample.jsonl",
s3_output_path="s3://mc-flows-sdk-testing-us-east-1/output/",
custom_reward_function="arn:aws:sagemaker:us-east-1:729646638167:hub-content/sdktest/JsonDoc/rlvr-nova-test-rf/0.0.1",
accept_eula=True
accept_eula=True,
sagemaker_session=sagemaker_session_us_east_1,
base_job_name=f"rlvr-nova-integ-{unique_id}",
)
rlvr_trainer.hyperparameters.data_s3_path = 's3://example-bucket'

rlvr_trainer.hyperparameters.reward_lambda_arn = 'arn:aws:lambda:us-east-1:729646638167:function:rlvr-nova-reward-function'

training_job = rlvr_trainer.train(wait=False)

# Manual wait loop
Expand Down
Loading
Loading