diff --git a/docs/_static/custom.css b/docs/_static/custom.css index 917774591c..f3c3ed8eca 100644 --- a/docs/_static/custom.css +++ b/docs/_static/custom.css @@ -117,6 +117,201 @@ font-size: 0.9em; } +/* Feedback Component */ +.feedback-container { + border-top: 1px solid #e1e5e9; + margin-top: 40px; + padding: 24px; + background-color: #f8f9fa; + border-radius: 8px; + border: 1px solid #e1e5e9; +} + +.feedback-question { + font-size: 20px; + font-weight: 500; + color: #374151; + margin-bottom: 20px; + text-align: left; +} + +.feedback-buttons { + display: flex; + justify-content: flex-start; + gap: 12px; +} + +.feedback-btn { + display: flex; + align-items: center; + gap: 8px; + padding: 8px 16px; + border: 1px solid #d1d5db; + border-radius: 6px; + background: white; + color: #6b7280; + font-size: 14px; + cursor: pointer; + transition: all 0.2s ease; +} + +.feedback-btn:hover { + border-color: #9ca3af; + background: #f9fafb; +} + +.feedback-btn.selected { + border-color: #3b82f6; + background: #eff6ff; + color: #1d4ed8; +} + +.feedback-btn svg { + width: 16px; + height: 16px; +} + +.feedback-thanks { + display: none; + color: #059669; + font-size: 14px; + text-align: center; + margin-top: 16px; +} + +/* Feedback Options */ +.feedback-options { + display: none; + margin-top: 20px; +} + +.feedback-options-title { + font-size: 16px; + font-weight: 500; + color: #374151; + margin-bottom: 16px; +} + +.required { + color: #ff6b6b; +} + +.feedback-checkboxes { + display: flex; + flex-direction: column; + gap: 12px; + margin-bottom: 20px; +} + +.feedback-checkbox { + display: flex; + align-items: center; + cursor: pointer; + color: #374151; + font-size: 14px; + position: relative; + padding-left: 32px; +} + +.feedback-checkbox input[type="checkbox"] { + position: absolute; + opacity: 0; + cursor: pointer; + height: 0; + width: 0; +} + +.checkmark { + position: absolute; + left: 0; + top: 2px; + height: 18px; + width: 18px; + background-color: transparent; + border: 2px solid #6b7280; + border-radius: 3px; +} + +.feedback-checkbox:hover input ~ .checkmark { + border-color: #9ca3af; +} + +.feedback-checkbox input:checked ~ .checkmark { + background-color: #3b82f6; + border-color: #3b82f6; +} + +.checkmark:after { + content: ""; + position: absolute; + display: none; +} + +.feedback-checkbox input:checked ~ .checkmark:after { + display: block; +} + +.feedback-checkbox .checkmark:after { + left: 5px; + top: 2px; + width: 4px; + height: 8px; + border: solid white; + border-width: 0 2px 2px 0; + transform: rotate(45deg); +} + +.feedback-more { + display: flex; + flex-direction: column; + gap: 12px; +} + +.feedback-more-title { + font-size: 16px; + font-weight: 500; + color: #374151; +} + +.feedback-textarea { + padding: 12px; + border: 1px solid #d1d5db; + border-radius: 6px; + font-size: 14px; + resize: vertical; + min-height: 80px; + font-family: inherit; + background-color: #ffffff; + color: #374151; +} + +.feedback-textarea::placeholder { + color: #6b7280; +} + +.feedback-textarea:focus { + outline: none; + border-color: #3b82f6; + box-shadow: 0 0 0 3px rgba(59, 130, 246, 0.1); +} + +.feedback-submit-btn { + padding: 10px 20px; + background: #3b82f6; + color: white; + border: none; + border-radius: 6px; + font-size: 14px; + font-weight: 500; + cursor: pointer; + align-self: flex-start; + transition: background 0.2s; +} + +.feedback-submit-btn:hover { + background: #2563eb; +} + header { background-color: white; diff --git a/docs/_static/feedback.js b/docs/_static/feedback.js new file mode 100644 
index 0000000000..6a4b8207f1 --- /dev/null +++ b/docs/_static/feedback.js @@ -0,0 +1,127 @@ +// Feedback functionality +document.addEventListener('DOMContentLoaded', function() { + // Add feedback component to the very bottom of each page + const article = document.querySelector('article[role="main"]') || document.querySelector('.bd-article') || document.querySelector('main'); + if (article) { + const feedbackHTML = ` +
<div class="feedback-container">
+ <div class="feedback-question">Was this page helpful?</div>
+ <div class="feedback-buttons">
+ <button class="feedback-btn" data-feedback="yes">Yes</button>
+ <button class="feedback-btn" data-feedback="no">No</button>
+ </div>
+ <div class="feedback-options">
+ <!-- options title, reason checkbox labels, and textarea placeholder text omitted here -->
+ <div class="feedback-options-title"><span class="required">*</span></div>
+ <div class="feedback-checkboxes">
+ <label class="feedback-checkbox"><input type="checkbox" data-reason=""><span class="checkmark"></span></label>
+ </div>
+ <div class="feedback-more">
+ <div class="feedback-more-title"></div>
+ <textarea class="feedback-textarea" placeholder=""></textarea>
+ <button class="feedback-submit-btn">Submit</button>
+ </div>
+ </div>
+ <div class="feedback-thanks">Thank you for your feedback!</div>
+ </div>
+ `; + + article.insertAdjacentHTML('beforeend', feedbackHTML); + + // Add click handlers + const feedbackBtns = document.querySelectorAll('.feedback-btn'); + const thanksMessage = document.querySelector('.feedback-thanks'); + const feedbackOptions = document.querySelector('.feedback-options'); + const checkboxes = document.querySelectorAll('.feedback-checkbox input[type="checkbox"]'); + const submitBtn = document.querySelector('.feedback-submit-btn'); + const textarea = document.querySelector('.feedback-textarea'); + + feedbackBtns.forEach(btn => { + btn.addEventListener('click', function() { + const feedback = this.dataset.feedback; + + // Remove selected class from all buttons + feedbackBtns.forEach(b => b.classList.remove('selected')); + + // Add selected class to clicked button + this.classList.add('selected'); + + if (feedback === 'yes') { + // Hide options and show thanks + feedbackOptions.style.display = 'none'; + thanksMessage.style.display = 'block'; + + // Send positive feedback + if (typeof gtag !== 'undefined') { + gtag('event', 'page_feedback', { + 'feedback_value': 'positive', + 'page_location': window.location.href + }); + } + } else { + // Show options for negative feedback + feedbackOptions.style.display = 'block'; + thanksMessage.style.display = 'none'; + } + }); + }); + + // Handle submit button + submitBtn.addEventListener('click', function() { + const selectedReasons = []; + checkboxes.forEach(checkbox => { + if (checkbox.checked) { + selectedReasons.push(checkbox.dataset.reason); + } + }); + const additionalFeedback = textarea.value.trim(); + + // Hide options and show thanks + feedbackOptions.style.display = 'none'; + thanksMessage.style.display = 'block'; + + // Send negative feedback with details + if (typeof gtag !== 'undefined') { + gtag('event', 'page_feedback', { + 'feedback_value': 'negative', + 'feedback_reasons': selectedReasons, + 'feedback_details': additionalFeedback, + 'page_location': window.location.href + }); + } + }); + } +}); diff --git a/docs/api/index.rst b/docs/api/index.rst index 4923bba097..cb1b9ca5a6 100644 --- a/docs/api/index.rst +++ b/docs/api/index.rst @@ -2,3 +2,12 @@ API Reference ============= Complete API documentation for SageMaker Python SDK V3. + +.. toctree:: + :maxdepth: 2 + :caption: API Reference + + sagemaker_core + sagemaker_train + sagemaker_serve + sagemaker_mlops diff --git a/docs/api/sagemaker_core.rst b/docs/api/sagemaker_core.rst new file mode 100644 index 0000000000..bbad9631ab --- /dev/null +++ b/docs/api/sagemaker_core.rst @@ -0,0 +1,60 @@ +SageMaker Core +============== + +Core SageMaker resources and utilities for managing AWS SageMaker services. + +.. currentmodule:: sagemaker.core + +Core Resources +-------------- + +.. automodule:: sagemaker.core.resources + :members: + :undoc-members: + :show-inheritance: + +Session Management +------------------ + +.. automodule:: sagemaker.core.session_settings + :members: + :undoc-members: + +Configuration +------------- + +.. automodule:: sagemaker.core.config_schema + :members: + :undoc-members: + +Processing +---------- + +.. automodule:: sagemaker.core.processing + :members: + :undoc-members: + +Transformers +------------ + +.. automodule:: sagemaker.core.transformer + :members: + :undoc-members: + +Utilities +--------- + +.. automodule:: sagemaker.core.common_utils + :members: + :undoc-members: + +.. automodule:: sagemaker.core.image_uris + :members: + :undoc-members: + +Exceptions +---------- + +.. 
automodule:: sagemaker.core.exceptions + :members: + :undoc-members: diff --git a/docs/api/sagemaker_mlops.rst b/docs/api/sagemaker_mlops.rst new file mode 100644 index 0000000000..f67879111d --- /dev/null +++ b/docs/api/sagemaker_mlops.rst @@ -0,0 +1,30 @@ +SageMaker MLOps +=============== + +MLOps capabilities including pipelines, workflows, and model management. + +.. currentmodule:: sagemaker.mlops + +Pipeline Management +------------------- + +.. automodule:: sagemaker.mlops + :members: + :undoc-members: + :show-inheritance: + +Workflow Management +------------------- + +.. automodule:: sagemaker.mlops.workflow + :members: + :undoc-members: + :show-inheritance: + +Local Development +----------------- + +.. automodule:: sagemaker.mlops.local + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/api/sagemaker_serve.rst b/docs/api/sagemaker_serve.rst new file mode 100644 index 0000000000..471b79530b --- /dev/null +++ b/docs/api/sagemaker_serve.rst @@ -0,0 +1,14 @@ +SageMaker Serve +=============== + +Model serving and inference capabilities for deploying and managing ML models. + +.. currentmodule:: sagemaker.serve + +Model Deployment +---------------- + +.. automodule:: sagemaker.serve + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/api/sagemaker_train.rst b/docs/api/sagemaker_train.rst new file mode 100644 index 0000000000..d54b720a92 --- /dev/null +++ b/docs/api/sagemaker_train.rst @@ -0,0 +1,30 @@ +SageMaker Train +=============== + +Training capabilities including model training, hyperparameter tuning, and distributed training. + +.. currentmodule:: sagemaker.train + +Model Training +-------------- + +.. automodule:: sagemaker.train + :members: + :undoc-members: + :show-inheritance: + +Distributed Training +-------------------- + +.. automodule:: sagemaker.train.distributed + :members: + :undoc-members: + :show-inheritance: + +Model Evaluation +---------------- + +.. 
automodule:: sagemaker.train.evaluate + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/conf.py b/docs/conf.py index f97b18135e..de04756c3a 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -2,7 +2,12 @@ import sys from datetime import datetime +# Add the source directories to Python path sys.path.insert(0, os.path.abspath('.')) +sys.path.insert(0, os.path.abspath('../sagemaker-core/src')) +sys.path.insert(0, os.path.abspath('../sagemaker-train/src')) +sys.path.insert(0, os.path.abspath('../sagemaker-serve/src')) +sys.path.insert(0, os.path.abspath('../sagemaker-mlops/src')) project = 'SageMaker Python SDK V3' copyright = f'{datetime.now().year}, Amazon Web Services' @@ -20,7 +25,20 @@ ] templates_path = ['_templates'] -exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store', 'sagemaker-core/docs/*', 'sagemaker-core/CHANGELOG.md', 'sagemaker-core/CONTRIBUTING.md'] +exclude_patterns = [ + '_build', + 'Thumbs.db', + '.DS_Store', + 'sagemaker-core/docs/*', + 'sagemaker-core/CHANGELOG.md', + 'sagemaker-core/CONTRIBUTING.md', +] + +# Suppress specific warnings +suppress_warnings = [ + 'myst.header', # Suppress header level warnings from notebooks + 'toc.not_readable', # Suppress toctree warnings for symlinked files +] html_theme = 'sphinx_book_theme' html_theme_options = { @@ -40,6 +58,7 @@ html_static_path = ['_static'] html_css_files = ['custom.css'] +html_js_files = ['feedback.js'] html_context = { 'display_github': True, @@ -54,6 +73,16 @@ nb_execution_mode = 'off' nb_execution_allow_errors = True -# Suppress autodoc warnings for missing modules -autodoc_mock_imports = ['sagemaker'] +# Autodoc configuration +autodoc_default_options = { + 'members': True, + 'undoc-members': True, + 'show-inheritance': True, +} + +# Generate autosummary stubs +autosummary_generate = True + +# Don't mock imports - let them fail gracefully and show what's available +autodoc_mock_imports = [] suppress_warnings = ['autodoc.import_error'] diff --git a/docs/model_customization/index.rst b/docs/model_customization/index.rst index 9023a203f9..85870d9cc3 100644 --- a/docs/model_customization/index.rst +++ b/docs/model_customization/index.rst @@ -209,3 +209,4 @@ Explore comprehensive model customization examples that demonstrate V3 capabilit ../v3-examples/model-customization-examples/bedrock-modelbuilder-deployment ../v3-examples/model-customization-examples/model_builder_deployment_notebook ../v3-examples/model-customization-examples/ai_registry_example + ../v3-examples/model-customization-examples/sm-studio-nova-training-job-sample-notebook diff --git a/v3-examples/model-customization-examples/sm-studio-nova-training-job-sample-notebook.ipynb b/v3-examples/model-customization-examples/sm-studio-nova-training-job-sample-notebook.ipynb new file mode 100644 index 0000000000..20c51e562e --- /dev/null +++ b/v3-examples/model-customization-examples/sm-studio-nova-training-job-sample-notebook.ipynb @@ -0,0 +1,1087 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "babaeb90", + "metadata": { + "editable": true, + "jumpStartAlterations": [ + "novaTrainingJobNotebookHeaderMarkdown" + ], + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "# Model Customization using SageMaker Training Job" + ] + }, + { + "cell_type": "markdown", + "id": "a16fc6c1-c18f-4a06-ae98-36b12ec72ab3", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "This notebook provides an end-to-end walkthrough for creating SageMaker Training job using a 
SageMaker Nova model and deploying it for inference." + ] + }, + { + "cell_type": "markdown", + "id": "940f9af2-cb1e-40be-839d-48db014d67f1", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "## Setup and Dependencies" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "84cf410f", + "metadata": {}, + "outputs": [], + "source": [ + "!pip install --upgrade sagemaker --quiet # restart the kernel after running this cell" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "234f7398-fd6b-4d02-a406-0491924c461d", + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "import os\n", + "import json\n", + "import boto3\n", + "from rich.pretty import pprint\n", + "from sagemaker.core.helper.session_helper import Session\n", + "\n", + "REGION = boto3.Session().region_name\n", + "sm_client = boto3.client(\"sagemaker\", region_name=REGION)\n", + "\n", + "# Create SageMaker session\n", + "sagemaker_session = Session(sagemaker_client=sm_client)\n", + "\n", + "print(f\"Region: {REGION}\")\n", + "\n", + "# For MLFlow native metrics in Trainer wait, run the line below with the appropriate region\n", + "os.environ[\"SAGEMAKER_MLFLOW_CUSTOM_ENDPOINT\"] = f\"https://mlflow.sagemaker.{REGION}.app.aws\"" + ] + }, + { + "cell_type": "markdown", + "id": "b9bf5959", + "metadata": {}, + "source": [ + "#### Create Training Dataset\n", + "The section below provides sample code to register the training dataset and obtain its ARN" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "39aaeb1d", + "metadata": {}, + "outputs": [], + "source": [ + "from sagemaker.ai_registry.dataset import DataSet\n", + "from sagemaker.ai_registry.dataset_utils import CustomizationTechnique\n", + "\n", + "# Register dataset in SageMaker AI Registry. 
This creates a versioned dataset that can be referenced by ARN\n", + "dataset = DataSet.create(\n", + " name=\"demo-sft-dataset\",\n", + " source=\"s3://your-bucket/dataset/training_dataset.jsonl\", # source can be S3 or local path\n", + " #customization_technique=CUSTOMIZATION_TECHNIQUE.SFT # or DPO or RLVR\n", + " # Optional technique name for minimal dataset format check.\n", + " wait=True\n", + ")\n", + "\n", + "print(f\"TRAINING_DATASET ARN: {dataset.arn}\")\n", + "# TRAINING_DATASET = dataset.arn" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ea22bd22", + "metadata": {}, + "outputs": [], + "source": [ + "# Required Configs\n", + "BASE_MODEL = \"\"\n", + "\n", + "# MODEL_PACKAGE_GROUP_NAME is same as CUSTOM_MODEL_NAME\n", + "MODEL_PACKAGE_GROUP_NAME = \"\"\n", + "\n", + "TRAINING_DATASET = \"\"\n", + "\n", + "S3_OUTPUT_PATH = \"\"\n", + "\n", + "ROLE_ARN = \"\"" + ] + }, + { + "cell_type": "markdown", + "id": "259aca67d3d3863b", + "metadata": {}, + "source": [ + "#### Create Model Package Group" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "90a1069d19eeee7", + "metadata": {}, + "outputs": [], + "source": [ + "from sagemaker.core.resources import ModelPackageGroup\n", + "model_package_group = ModelPackageGroup.create(\n", + " model_package_group_name=MODEL_PACKAGE_GROUP_NAME,\n", + " model_package_group_description='' # Required Description\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "1c931764", + "metadata": {}, + "source": [ + "## Part 1: Fine-tuning\n", + "\n", + "### Step 1: Creating the Trainer" + ] + }, + { + "cell_type": "markdown", + "id": "f23e67f7", + "metadata": { + "editable": true, + "jumpStartAlterations": [ + "trainerSelection" + ], + "slideshow": { + "slide_type": "" + }, + "tags": [], + "trainer_type": "" + }, + "source": [ + "#### Choose one of the following trainer techniques:\n", + "- **Option 1: SFT Trainer (Supervised Fine-Tuning)** \n", + "- **Option 2: Create RLVRTrainer (Reinforcement Learning with Verifiable Rewards)**. \n", + "- **Option 3: DPO Trainer (Direct Preference Optimization)** \n", + "\n", + "**Instructions:** Run only ONE of the trainers, not all of them." + ] + }, + { + "cell_type": "markdown", + "id": "32fd436b", + "metadata": { + "editable": true, + "jumpStartAlterations": [ + "trainerSelection" + ], + "slideshow": { + "slide_type": "" + }, + "tags": [], + "trainer_type": "SFT" + }, + "source": [ + "#### Create SFT Trainer (Supervised Fine-Tuning)\n", + "\n", + "##### Key Parameters:\n", + "* `model`: base_model id on Sagemaker Hubcontent that is available to finetune (or) ModelPackage artifacts\n", + "* `training_type`: Choose from TrainingType Enum(sagemaker.train.common) either LORA OR FULL. 
(optional)\n", + "* `model_package_group`: ModelPackage group name or ModelPackageGroup (optional)\n", + "* `mlflow_resource_arn`: MLFlow app ARN to track the training job (optional)\n", + "* `mlflow_experiment_name`: MLFlow app experiment name(str) (optional)\n", + "* `mlflow_run_name`: MLFlow app run name(str) (optional)\n", + "* `training_dataset`: Training Dataset - either Dataset ARN or S3 Path of the dataset (Please note these are required for a training job to run, can be either provided via Trainer or .train()) (optional)\n", + "* `validation_dataset`: Validation Dataset - either Dataset ARN or S3 Path of the dataset (optional)\n", + "* `s3_output_path`: S3 path for the trained model artifacts (optional)\n", + "* `base_job_name` : Unique job name (optional)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "062953d8", + "metadata": { + "editable": true, + "jumpStartAlterations": [ + "trainerSelection" + ], + "slideshow": { + "slide_type": "" + }, + "tags": [], + "trainer_type": "SFT" + }, + "outputs": [], + "source": [ + "from sagemaker.train.sft_trainer import SFTTrainer\n", + "from sagemaker.train.common import TrainingType\n", + "\n", + "trainer = SFTTrainer(\n", + " model=BASE_MODEL,\n", + " training_type=TrainingType.LORA,\n", + " model_package_group=model_package_group,\n", + " training_dataset=TRAINING_DATASET,\n", + " s3_output_path=S3_OUTPUT_PATH,\n", + " sagemaker_session=sagemaker_session,\n", + " role=ROLE_ARN\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "cd93226c", + "metadata": { + "editable": true, + "jumpStartAlterations": [ + "trainerSelection" + ], + "slideshow": { + "slide_type": "" + }, + "tags": [], + "trainer_type": "" + }, + "source": [ + "### OR" + ] + }, + { + "cell_type": "markdown", + "id": "1b5603ee", + "metadata": { + "editable": true, + "jumpStartAlterations": [ + "trainerSelection" + ], + "slideshow": { + "slide_type": "" + }, + "tags": [], + "trainer_type": "RLVR" + }, + "source": [ + "#### Create RLVRTrainer (Reinforcement Learning with Verifiable Rewards)\n", + "\n", + "##### Key Parameters:\n", + "* `model`: base_model id on Sagemaker Hubcontent that is available to finetune (or) ModelPackage artifacts\n", + "* `custom_reward_function`: Custom reward function/Evaluator ARN (optional)\n", + "* `model_package_group`: ModelPackage group name or ModelPackageGroup (optional)\n", + "* `mlflow_resource_arn`: MLFlow app ARN to track the training job (optional)\n", + "* `mlflow_experiment_name`: MLFlow app experiment name(str) (optional)\n", + "* `mlflow_run_name`: MLFlow app run name(str) (optional)\n", + "* `training_dataset`: Training Dataset - either Dataset ARN or S3 Path of the dataset (Please note these are required for a training job to run, can be either provided via Trainer or .train()) (optional)\n", + "* `validation_dataset`: Validation Dataset - either Dataset ARN or S3 Path of the dataset (optional)\n", + "* `s3_output_path`: S3 path for the trained model artifacts (optional)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5aa51a5f", + "metadata": { + "editable": true, + "jumpStartAlterations": [ + "trainerSelection" + ], + "slideshow": { + "slide_type": "" + }, + "tags": [], + "trainer_type": "RLVR" + }, + "outputs": [], + "source": [ + "from sagemaker.train.rlvr_trainer import RLVRTrainer\n", + "\n", + "trainer = RLVRTrainer(\n", + " model=BASE_MODEL,\n", + " model_package_group=model_package_group,\n", + " training_dataset=TRAINING_DATASET,\n", + " 
s3_output_path=S3_OUTPUT_PATH,\n", + " sagemaker_session=sagemaker_session,\n", + " role=ROLE_ARN\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "a61dbe9f", + "metadata": { + "editable": true, + "jumpStartAlterations": [ + "trainerSelection" + ], + "slideshow": { + "slide_type": "" + }, + "tags": [], + "trainer_type": "" + }, + "source": [ + "### OR" + ] + }, + { + "cell_type": "markdown", + "id": "88f8bfde", + "metadata": { + "editable": true, + "jumpStartAlterations": [ + "trainerSelection" + ], + "slideshow": { + "slide_type": "" + }, + "tags": [], + "trainer_type": "DPO" + }, + "source": [ + "#### Create DPO Trainer (Direct Preference Optimization)\n", + "\n", + "Direct Preference Optimization (DPO) is a method for training language models to follow human preferences. Unlike traditional RLHF (Reinforcement Learning from Human Feedback), DPO directly optimizes the model using preference pairs without needing a reward model.\n", + "\n", + "##### Key Parameters:\n", + "- `model` Base model to fine-tune (from SageMaker Hub)\n", + "- `training_type` Fine-tuning method (LoRA recommended for efficiency)\n", + "- `training_dataset` ARN of the registered preference dataset\n", + "- `model_package_group` Where to store the fine-tuned model\n", + "- `mlflow_resource_arn` MLflow tracking server for experiment logging " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b74c57f8", + "metadata": { + "editable": true, + "jumpStartAlterations": [ + "trainerSelection" + ], + "slideshow": { + "slide_type": "" + }, + "tags": [], + "trainer_type": "DPO" + }, + "outputs": [], + "source": [ + "from sagemaker.train.dpo_trainer import DPOTrainer\n", + "from sagemaker.train.common import TrainingType\n", + "\n", + "trainer = DPOTrainer(\n", + " model=BASE_MODEL,\n", + " training_type=TrainingType.LORA,\n", + " model_package_group=model_package_group,\n", + " training_dataset=TRAINING_DATASET,\n", + " s3_output_path=S3_OUTPUT_PATH,\n", + " sagemaker_session=sagemaker_session,\n", + " role=ROLE_ARN\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "514186e9", + "metadata": {}, + "source": [ + "### Step 2: Get Finetuning Options and Modify" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2f6eeb5e", + "metadata": {}, + "outputs": [], + "source": [ + "print(\"Default Finetuning Options:\")\n", + "pprint(trainer.hyperparameters.to_dict())\n", + "\n", + "# Modify options like object attributes\n", + "trainer.hyperparameters.learning_rate = 0.0002\n", + "\n", + "print(\"\\nModified/User defined Options:\")\n", + "pprint(trainer.hyperparameters.to_dict())" + ] + }, + { + "cell_type": "markdown", + "id": "18f4e5df", + "metadata": {}, + "source": [ + "### Step 3: Start Training" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "31690f41", + "metadata": {}, + "outputs": [], + "source": [ + "training_job = trainer.train(wait=True)\n", + "\n", + "TRAINING_JOB_NAME = training_job.training_job_name\n", + "\n", + "pprint(training_job)" + ] + }, + { + "cell_type": "markdown", + "id": "60b77a45", + "metadata": {}, + "source": [ + "### Step 4: Describe Training job" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9277fde0", + "metadata": {}, + "outputs": [], + "source": [ + "from sagemaker.core.resources import TrainingJob\n", + "\n", + "response = TrainingJob.get(training_job_name=TRAINING_JOB_NAME)\n", + "pprint(response)" + ] + }, + { + "cell_type": "markdown", + "id": "evaluation-section", + "metadata": {}, + 
"source": [ + "# Part 2: Model Evaluation\n", + "\n", + "This section demonstrates the basic user-facing flow for creating and managing evaluation jobs" + ] + }, + { + "cell_type": "markdown", + "id": "cleanup-pipeline", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "## Step 1: Create BenchmarkEvaluator\n", + "\n", + "Create a BenchmarkEvaluator instance with the desired benchmark. The evaluator will use Jinja2 templates to render a complete pipeline definition.\n", + "\n", + "### Key Parameters:\n", + "- `benchmark`: Benchmark type from the Benchmark enum\n", + "- `model`: Model ARN from SageMaker hub content\n", + "- `s3_output_path`: S3 location for evaluation outputs\n", + "- `mlflow_resource_arn`: MLflow tracking server ARN for experiment tracking (optional)\n", + "- `model_package_group`: Model package group ARN (optional)\n", + "- `source_model_package`: Source model package ARN (optional)\n", + "- `model_artifact`: ARN of model artifact for lineage tracking (auto-inferred from source_model_package) (optional)\n", + "\n", + "**Note:** When you call `evaluate()`, the system will start evaluation job. The evaluator will:\n", + "1. Build template context with all required parameters\n", + "2. Render the pipeline definition from `DETERMINISTIC_TEMPLATE` using Jinja2\n", + "3. Create or update the pipeline with the rendered definition\n", + "4. Start the pipeline execution with empty parameters (all values pre-substituted) " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "delete-existing-pipeline", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "from sagemaker.train.evaluate import BenchMarkEvaluator\n", + "from sagemaker.train.evaluate import get_benchmarks, get_benchmark_properties\n", + "from rich.pretty import pprint\n", + "import logging\n", + "logging.basicConfig(\n", + " level=logging.INFO,\n", + " format='%(levelname)s - %(name)s - %(message)s'\n", + ")\n", + "\n", + "# Get available benchmarks\n", + "Benchmark = get_benchmarks()\n", + "pprint(list(Benchmark))\n", + "\n", + "# Print properties for a specific benchmark\n", + "pprint(get_benchmark_properties(benchmark=Benchmark.GEN_QA))\n", + "\n", + "\n", + "# Create evaluator with GEN_QA benchmark\n", + "evaluator = BenchMarkEvaluator(\n", + " benchmark=Benchmark.GEN_QA,\n", + " model=BASE_MODEL,\n", + " s3_output_path=S3_OUTPUT_PATH,\n", + ")\n", + "\n", + "pprint(evaluator)" + ] + }, + { + "cell_type": "markdown", + "id": "run-evaluation", + "metadata": {}, + "source": [ + "## Step 2: Run Evaluation" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "start-evaluation", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "# Run evaluation\n", + "execution = evaluator.evaluate()\n", + "\n", + "print(f\"Evaluation job started!\")\n", + "print(f\"Job ARN: {execution.arn}\")\n", + "print(f\"Job Name: {execution.name}\")\n", + "print(f\"Status: {execution.status.overall_status}\")\n", + "\n", + "pprint(execution)" + ] + }, + { + "cell_type": "markdown", + "id": "a3de8255-9f98-444a-99a6-cfe7cc2584af", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "## Step 3: Monitor Execution" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "monitor-evaluation", + "metadata": { + "editable": true, + 
"slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "execution.refresh()\n", + "\n", + "print(f\"Current status: {execution.status}\")\n", + "\n", + "# Display individual step statuses\n", + "if execution.status.step_details:\n", + " print(\"\\nStep Details:\")\n", + " for step in execution.status.step_details:\n", + " print(f\" {step.name}: {step.status}\")" + ] + }, + { + "cell_type": "markdown", + "id": "2ebac85a-adee-4f18-935d-478037c7a1f3", + "metadata": {}, + "source": [ + "## Step 4: Wait for Completion\n", + "\n", + "Wait for the pipeline to complete. This provides rich progress updates in Jupyter notebooks:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "74b51cca-2024-4276-b05d-48f52e527c06", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "execution.wait(target_status=\"Succeeded\", poll=5, timeout=3600)\n", + "\n", + "print(f\"\\nFinal Status: {execution.status.overall_status}\")" + ] + }, + { + "cell_type": "markdown", + "id": "0d153370-213a-41d0-8a95-f4ffccf8f9aa", + "metadata": {}, + "source": [ + "## Step 5: View Results\n", + "\n", + "Display the evaluation results in a formatted table:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2f470824-7740-48bb-9282-a7b9d0407fff", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "execution.show_results()" + ] + }, + { + "cell_type": "markdown", + "id": "92bda96d-5be7-408f-9b47-ae46772ac03e", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "## Part 3. Deploying the Model to Bedrock for inference\n", + "\n", + "Trained model artifacts and checkpoints are stored in your designated escrow S3 bucket. You can access the training checkpoint location from the `describe_training_job` response.\n", + "\n", + "By calling `create_custom_model` API, you can create your custom model referencing the model artifacts stored in your S3 escrow bucket." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "784304f4-eb4f-48c8-b572-e5a18c5a9929", + "metadata": {}, + "outputs": [], + "source": [ + "import boto3\n", + "import json\n", + "from urllib.parse import urlparse\n", + "\n", + "bedrock_custom_model_name = \"\" # customize as needed\n", + "\n", + "describe_training_response = sm_client.describe_training_job(TrainingJobName=TRAINING_JOB_NAME)\n", + "\n", + "training_output_s3_uri = describe_training_response['OutputDataConfig']['S3OutputPath']\n", + "\n", + "def get_s3_manifest(training_output_s3_uri):\n", + " try:\n", + " s3_client = boto3.client('s3')\n", + " parsed_uri = urlparse(training_output_s3_uri)\n", + " bucket = parsed_uri.netloc\n", + " key = parsed_uri.path.lstrip('/')\n", + " manifest_key = f\"{key.rstrip('/')}/{TRAINING_JOB_NAME}/output/output/manifest.json\"\n", + "\n", + " print(f\"Fetching manifest from s3://{bucket}/{manifest_key}\")\n", + " response = s3_client.get_object(Bucket=bucket, Key=manifest_key)\n", + "\n", + " manifest_content = response['Body'].read().decode('utf-8')\n", + " manifest = json.loads(manifest_content)\n", + " if 'checkpoint_s3_bucket' not in manifest:\n", + " raise ValueError(\"Checkpoint location not found in manifest\")\n", + " print(f\"Successfully retrieved checkpoint S3 URI: {manifest['checkpoint_s3_bucket']}\")\n", + " return manifest['checkpoint_s3_bucket']\n", + " except s3_client.exceptions.NoSuchKey:\n", + " raise FileNotFoundError(f\"Manifest file not found at s3://{bucket}/{manifest_key}\")\n", + " except json.JSONDecodeError as e:\n", + " raise ValueError(f\"Failed to parse manifest JSON: {str(e)}\")\n", + " except Exception as e:\n", + " raise Exception(f\"Error fetching manifest: {str(e)}\")\n", + "\n", + "s3_checkpoint_path = get_s3_manifest(training_output_s3_uri)\n", + "\n", + "\n", + "bedrock_client = boto3.Session().client(service_name=\"bedrock\", region_name=REGION)\n", + "\n", + "\n", + "s3_checkpoint_path = describe_training_response[\"CheckpointConfig\"][\"S3Uri\"]\n", + "\n", + "try:\n", + " response = bedrock_client.create_custom_model(\n", + " modelName=bedrock_custom_model_name,\n", + " modelSourceConfig={\"s3DataSource\": {\"s3Uri\": s3_checkpoint_path}},\n", + " roleArn=ROLE_ARN,\n", + " # Optionally, add modelTags here\n", + " )\n", + " print(\"Custom model ARN:\", response[\"modelArn\"])\n", + "except Exception as e:\n", + " print(f\"An unexpected error occurred: {e}\")" + ] + }, + { + "cell_type": "markdown", + "id": "f0dc4211-788c-4e5d-844a-58176ac69cea", + "metadata": {}, + "source": [ + "To monitor the job, use the `get_custom_model` operation to retrieve the job status. Please allow some time for the job to complete as this can take upto 20 minutes." 
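, + "\n", + "The polling loop in the next cell runs until the model becomes `Active`; if you prefer a bounded wait, a sketch along these lines (the deadline value is illustrative) adds a timeout to the same `get_custom_model` check:\n", + "\n", + "```python\n",
+ "import time\n", + "deadline = time.time() + 30 * 60  # illustrative 30-minute cap\n", + "while time.time() < deadline:\n",
+ "    status = bedrock_client.get_custom_model(modelIdentifier=bedrock_custom_model_name)[\"modelStatus\"]\n", + "    if status in (\"Active\", \"Failed\"):\n", + "        break\n", + "    time.sleep(30)\n", + "```\n"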
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3795cd13-57fd-44f7-b2e2-9f51f2df74c4", + "metadata": {}, + "outputs": [], + "source": [ + "import time\n", + "while True:\n", + " custom_model_response = bedrock_client.get_custom_model(modelIdentifier=bedrock_custom_model_name)\n", + " model_status = custom_model_response[\"modelStatus\"]\n", + " print(f\"Custom model status: {model_status}\")\n", + " if model_status == \"Active\":\n", + " break\n", + " elif model_status in [\"Failed\"]:\n", + " raise Exception(f\"Custom model creation failed with status: {model_status}\")\n", + " time.sleep(30)\n", + "print(\"Custom model is ACTIVE.\")\n", + "custom_model_response" + ] + }, + { + "cell_type": "markdown", + "id": "c4ecb46f-26ac-463e-b644-c8eb65173ac2", + "metadata": {}, + "source": [ + "After you create a custom model, you can set up inference using one of the following options:\n", + "1. **Purchase Provisioned Throughput** – Purchase Provisioned Throughput for your model to set up dedicated compute capacity with guaranteed throughput for consistent performance and lower latency.\n", + "For more information about Provisioned Throughput, see [Increase model invocation capacity with Provisioned Throughput in Amazon Bedrock](https://docs.aws.amazon.com/bedrock/latest/userguide/prov-throughput.html). For more information about using custom models with Provisioned Throughput, [see Purchase Provisioned Throughput for a custom model](https://docs.aws.amazon.com/bedrock/latest/userguide/custom-model-use-pt.html).\n", + "2. **Deploy custom model for on-demand inference (only LoRA fine-tuned Amazon Nova models)** – To set up on-demand inference, you deploy the custom model with a custom model deployment. After you deploy the model, you invoke it using the ARN for the custom model deployment. With on-demand inference, you only pay for what you use and you don't need to set up provisioned compute resources.\n", + "For more information about deploying custom models for on-demand inference, see [Deploy a custom model for on-demand inference](https://docs.aws.amazon.com/bedrock/latest/userguide/deploy-custom-model-on-demand.html)." 
+ ] + }, + { + "cell_type": "markdown", + "id": "f7b52193-3624-4485-84c7-86b7f5d0e7fb", + "metadata": {}, + "source": [ + "#### Deploy custom model for inference by using Provisioned Throughput" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5f0064c4-7302-47de-a125-a0f07d4e13ad", + "metadata": {}, + "outputs": [], + "source": [ + "provisioned_model_name = \"test-provisioned-model\"\n", + "custom_model_id = custom_model_response[\"modelArn\"]\n", + "\n", + "try:\n", + " response = bedrock_client.create_provisioned_model_throughput(\n", + " modelId=custom_model_id, provisionedModelName=provisioned_model_name, modelUnits=1\n", + " )\n", + " provisioned_model_arn = response[\"provisionedModelArn\"]\n", + " print(\"Provisioned model ARN:\", provisioned_model_arn)\n", + "except Exception as e:\n", + " print(f\"An unexpected error occurred: {e}\")" + ] + }, + { + "cell_type": "markdown", + "id": "ef1671ec-81d7-4fc7-89a5-7ee5a2e8b67b", + "metadata": {}, + "source": [ + "Wait for provisioned model to become ACTIVE" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "efef8325-fade-49ef-a2bd-ec0215a56f25", + "metadata": {}, + "outputs": [], + "source": [ + "while True:\n", + " response = bedrock_client.get_provisioned_model_throughput(\n", + " provisionedModelId=provisioned_model_arn\n", + " )\n", + " model_status = response[\"status\"]\n", + " print(f\"Provisioned model status: {model_status}\")\n", + " if model_status == \"InService\":\n", + " break\n", + " elif model_status in [\"Failed\"]:\n", + " raise Exception(f\"Provisioned model failed with status: {model_status}\")\n", + " time.sleep(30)\n", + "print(\"Provisioned model is in service.\")\n", + "response" + ] + }, + { + "cell_type": "markdown", + "id": "68a1443e-ef01-4ee1-9f8d-10e9ec3a55a3", + "metadata": {}, + "source": [ + "Finally, you can invoke the model like any other Bedrock-hosted model using the invoke-model API" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "80f8ecee-9a5f-46a5-97dc-72a2e6a7c693", + "metadata": {}, + "outputs": [], + "source": [ + "# Invoke model (Inference)\n", + "bedrock_runtime = boto3.client(\"bedrock-runtime\", region_name=REGION)\n", + "\n", + "request_body = {\n", + " \"inferenceConfig\": {\"max_new_tokens\": 1000, \"temperature\": 0.7, \"top_p\": 0.9},\n", + " \"messages\": [\n", + " {\n", + " \"role\": \"user\",\n", + " \"content\": [\n", + " {\"text\": \"Tell me about Amazon Bedrock in less than 100 words.\"}\n", + " ],\n", + " }\n", + " ],\n", + "}\n", + "\n", + "response = bedrock_runtime.invoke_model(\n", + " modelId=provisioned_model_arn,\n", + " body=json.dumps(request_body),\n", + " contentType=\"application/json\",\n", + " accept=\"application/json\",\n", + ")\n", + "\n", + "response_body = json.loads(response[\"body\"].read())\n", + "print(response_body[\"output\"][\"message\"][\"content\"][0][\"text\"])" + ] + }, + { + "cell_type": "markdown", + "id": "b51edee9-6663-4863-a5f2-c72e9cfe7e9e", + "metadata": {}, + "source": [ + "#### Deploy custom model for On-Demand Inference\n", + "**Important Note:** On-demand inference is currently supported only for LoRA-based fine-tuned models.\n", + "\n", + "Once the custom model has reached Active Status, deploy it for on-demand inference by creating custom model deployment." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2d8beda2-c96d-40e5-ac86-ff2a58eadd40", + "metadata": {}, + "outputs": [], + "source": [ + "model_deployment_name = \"\"\n", + "custom_model_arn=custom_model_response[\"modelArn\"]\n", + "try:\n", + " response = bedrock_client.create_custom_model_deployment(\n", + " modelDeploymentName=model_deployment_name,\n", + " modelArn=custom_model_arn,\n", + " description=\"\",\n", + " tags=[\n", + " {\n", + " \"key\":\"\",\n", + " \"value\":\"\"\n", + " }\n", + " ]\n", + " )\n", + " custom_model_deployment_arn = response[\"customModelDeploymentArn\"]\n", + " print(\"Custom model deployment ARN:\", custom_model_deployment_arn)\n", + "except Exception as e:\n", + " print(f\"An unexpected error occurred: {e}\")\n", + "response" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "863102f7-4e5d-4f64-945d-df5be269b307", + "metadata": {}, + "outputs": [], + "source": [ + "while True:\n", + " response = bedrock_client.get_custom_model_deployment(customModelDeploymentIdentifier=custom_model_deployment_arn)\n", + " model_status = response[\"status\"]\n", + " print(f\"Custom model deployment status: {model_status}\")\n", + " if model_status == \"Active\":\n", + " break\n", + " elif model_status in [\"Failed\"]:\n", + " raise Exception(f\"Custom model deployment failed with status: {model_status}\")\n", + " time.sleep(30)\n", + "print(\"Custom model is ACTIVE.\")\n", + "response" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "79b99c7a-aa7f-409a-a988-9ed618e856e9", + "metadata": {}, + "outputs": [], + "source": [ + "bedrock_runtime = boto3.client(\"bedrock-runtime\", region_name=REGION)\n", + "\n", + "# invoke a deployed custom model using Converse API\n", + "response = bedrock_runtime.converse(\n", + " modelId=custom_model_deployment_arn,\n", + " messages=[\n", + " {\n", + " \"role\": \"user\",\n", + " \"content\": [\n", + " {\n", + " \"text\": \"Tell me about Amazon Bedrock in less than 100 words.\",\n", + " }\n", + " ]\n", + " }\n", + " ]\n", + " )\n", + "result = response.get('output')\n", + "print(result)\n", + "\n", + "# invoke a deployed custom model using InvokeModel API\n", + "request_body = {\n", + " \"schemaVersion\": \"messages-v1\",\n", + " \"messages\": [{\"role\": \"user\", \n", + " \"content\": [{\"text\": \"Tell me about Amazon Bedrock in less than 100 words.\"}]}],\n", + " \"system\": [{\"text\": \"What is amazon bedrock?\"}],\n", + " \"inferenceConfig\": {\"maxTokens\": 500, \n", + " \"topP\": 0.9, \n", + " \"temperature\": 0.0\n", + " }\n", + "}\n", + "body = json.dumps(request_body)\n", + "response = bedrock_runtime.invoke_model(\n", + " modelId=custom_model_deployment_arn,\n", + " body=body\n", + " )\n", + "\n", + "# Extract and print the response text\n", + "model_response = json.loads(response[\"body\"].read())\n", + "response_text = model_response[\"output\"][\"message\"][\"content\"][0][\"text\"]\n", + "print(response_text)" + ] + }, + { + "cell_type": "markdown", + "id": "1b80972e-7f59-4357-9b23-74c1d3877342", + "metadata": {}, + "source": [ + "### Cleanup\n", + "Delete the resources that were created to stop incurring charges." 
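, + "\n", + "The cells below delete the Bedrock resources. The SageMaker-side resources created earlier can be removed as well; as a minimal sketch, the model package group from Part 1 can be deleted with the boto3 SageMaker client once any model packages registered in it have been deleted:\n", + "\n", + "```python\n",
+ "# optional: remove the model package group created in Part 1 (delete its model packages first)\n", + "sm_client.delete_model_package_group(ModelPackageGroupName=MODEL_PACKAGE_GROUP_NAME)\n", + "```\n"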
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f83840f7-1279-4192-a13f-a05bef8fb3e4", + "metadata": {}, + "outputs": [], + "source": [ + "# Delete provisioned model throughput\n", + "print(f\"Deleting provisioned model throughput: {provisioned_model_arn}\")\n", + "try:\n", + " bedrock_client.delete_provisioned_model_throughput(\n", + " provisionedModelId=provisioned_model_name\n", + " )\n", + " print(\"Provisioned model throughput deleted successfully.\")\n", + "except Exception as e:\n", + " print(f\"Error deleting provisioned throughput: {e}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "41188290-dc41-4231-95f7-d371aa77fb1c", + "metadata": {}, + "outputs": [], + "source": [ + "# Delete custom model deployment if you have used on-demand inference.\n", + "print(f\"Deleting custom model deployment: {custom_model_deployment_arn}\")\n", + "try:\n", + " bedrock_client.delete_custom_model_deployment(\n", + " customModelDeploymentIdentifier=custom_model_deployment_arn\n", + " )\n", + " print(\"Custom model deployment deleted successfully.\")\n", + "except Exception as e:\n", + " print(f\"Error deleting custom model deployment: {e}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ff1805d7-14b6-4d6b-a331-1924fbae346b", + "metadata": {}, + "outputs": [], + "source": [ + "# Delete custom model\n", + "print(f\"Deleting custom model: {custom_model_id}\")\n", + "try:\n", + " bedrock_client.delete_custom_model(modelIdentifier=custom_model_id)\n", + " print(\"Custom model deleted successfully.\")\n", + "except Exception as e:\n", + " print(f\"Error deleting custom model: {e}\")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.11" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}