From fb98f6af604620de07c9c2b2e446968017004711 Mon Sep 17 00:00:00 2001 From: Gokul A Date: Thu, 26 Mar 2026 15:32:22 -0700 Subject: [PATCH 1/3] docs: Add detailed use case sections for training, inference, and MLOps Add comprehensive documentation extracted from V3 example notebooks covering all training, inference, and MLOps use cases with V3 code examples. New sections include local container training, distributed training, hyperparameter management, JumpStart, custom InferenceSpec, model optimization, inference pipelines, processing jobs, batch transform, model registry, Clarify, EMR Serverless, MLflow integration, and lineage tracking. Each section includes red NEW highlights for change detection and expanded V2-to-V3 migration tables. --- docs/_static/custom.css | 18 + docs/inference/index.rst | 512 +++++++++++++--- docs/ml_ops/index.rst | 373 +++++++++++- docs/ml_ops/lineage.rst | 473 +++++++++++++++ docs/training/index.rst | 546 ++++++++++++++++-- .../v3-lineage-tracking-example.ipynb | 356 ++++++++++++ 6 files changed, 2144 insertions(+), 134 deletions(-) create mode 100644 docs/ml_ops/lineage.rst create mode 100644 v3-examples/ml-ops-examples/v3-lineage-tracking-example.ipynb diff --git a/docs/_static/custom.css b/docs/_static/custom.css index f618f6c891..6aa0183058 100644 --- a/docs/_static/custom.css +++ b/docs/_static/custom.css @@ -172,3 +172,21 @@ p { font-size: 1.0rem; color: var(--pst-color-text-base); } + +/* NEW CONTENT HIGHLIGHT - remove this block to revert red highlighting */ +.new-content { + border-left: 4px solid #dc2626; + padding-left: 16px; + margin-left: 0; +} +.new-content-label::before { + content: "NEW"; + background: #dc2626; + color: white; + font-size: 0.7em; + font-weight: 700; + padding: 2px 8px; + border-radius: 3px; + margin-right: 8px; + vertical-align: middle; +} diff --git a/docs/inference/index.rst b/docs/inference/index.rst index 67f6978213..95c71f9e7b 100644 --- a/docs/inference/index.rst +++ b/docs/inference/index.rst @@ 
-58,126 +58,481 @@ Here's how inference has evolved from V2 to V3: content_type="application/json" ) -ModelBuilder Overview --------------------- - -The ``ModelBuilder`` class is the cornerstone of SageMaker Python SDK V3 inference, providing a unified interface for all deployment scenarios. This single class replaces the complex web of framework-specific model classes from V2, offering: -**Unified Deployment Interface** - One class handles PyTorch, TensorFlow, Scikit-learn, XGBoost, HuggingFace, and custom containers +.. rst-class:: new-content -**Intelligent Optimization** - Automatically optimizes model serving configuration based on your model characteristics +Custom InferenceSpec +-------------------- -**Flexible Deployment Options** - Support for real-time endpoints, batch transform, and serverless inference +.. rst-class:: new-content-label -**Seamless Integration** - Works seamlessly with SageMaker features like auto-scaling, multi-model endpoints, and A/B testing +Define custom model loading and inference logic by extending ``InferenceSpec``. Implement ``load()`` to deserialize your model and ``invoke()`` to run predictions. .. 
code-block:: python - from sagemaker.serve import ModelBuilder + from sagemaker.serve.model_builder import ModelBuilder + from sagemaker.serve.spec.inference_spec import InferenceSpec + from sagemaker.serve.builder.schema_builder import SchemaBuilder + from sagemaker.serve.utils.types import ModelServer + + class MyModelSpec(InferenceSpec): + def load(self, model_dir: str): + import torch + return torch.jit.load(f"{model_dir}/model.pth", map_location="cpu") + + def invoke(self, input_object, model): + import torch + tensor = torch.tensor(input_object, dtype=torch.float32) + with torch.no_grad(): + return model(tensor).tolist() + + schema_builder = SchemaBuilder( + [[0.1, 0.2, 0.3, 0.4]], # sample input + [[0.9, 0.1]] # sample output + ) model_builder = ModelBuilder( - model="your-model", - model_path="s3://your-bucket/model-artifacts", - role="your-sagemaker-role" + inference_spec=MyModelSpec(), + model_path="./model_artifacts", + model_server=ModelServer.TORCHSERVE, + schema_builder=schema_builder, ) - model = model_builder.build(model_name="my-model") - - endpoint = model_builder.deploy( - endpoint_name="my-endpoint", - instance_type="ml.m5.xlarge", - initial_instance_count=1 + core_model = model_builder.build(model_name="my-custom-model") + endpoint = model_builder.deploy(endpoint_name="my-endpoint") + + result = endpoint.invoke( + body=json.dumps([[0.1, 0.2, 0.3, 0.4]]), + content_type="application/json" ) - - response = endpoint.invoke( - body={"inputs": "your-input-data"}, + +:doc:`Full example notebook <../v3-examples/inference-examples/inference-spec-example>` + + +.. rst-class:: new-content + +JumpStart Model Deployment +-------------------------- + +.. rst-class:: new-content-label + +Deploy pre-trained models from the JumpStart hub using ``ModelBuilder.from_jumpstart_config()``. + +.. 
code-block:: python + + from sagemaker.serve.model_builder import ModelBuilder + from sagemaker.core.jumpstart.configs import JumpStartConfig + from sagemaker.train.configs import Compute + + compute = Compute(instance_type="ml.g5.2xlarge") + jumpstart_config = JumpStartConfig(model_id="huggingface-llm-falcon-7b-bf16") + + model_builder = ModelBuilder.from_jumpstart_config( + jumpstart_config=jumpstart_config, + compute=compute, + ) + + core_model = model_builder.build(model_name="falcon-model") + endpoint = model_builder.deploy(endpoint_name="falcon-endpoint") + + result = endpoint.invoke( + body=json.dumps({"inputs": "What are falcons?", "parameters": {"max_new_tokens": 32}}), content_type="application/json" ) -Inference Capabilities ----------------------- +:doc:`Full example notebook <../v3-examples/inference-examples/jumpstart-example>` -Model Optimization Support -~~~~~~~~~~~~~~~~~~~~~~~~~~ -V3 introduces powerful model optimization capabilities for enhanced performance: +.. rst-class:: new-content -* **SageMaker Neo** - Optimize models for specific hardware targets -* **TensorRT Integration** - Accelerate deep learning inference on NVIDIA GPUs -* **ONNX Runtime** - Cross-platform model optimization and acceleration -* **Quantization Support** - Reduce model size and improve inference speed +Model Optimization (Quantization) +---------------------------------- -**Model Optimization Example:** +.. rst-class:: new-content-label + +Optimize models with quantization (e.g., AWQ) using ``model_builder.optimize()`` before deployment. .. 
code-block:: python - from sagemaker.serve import ModelBuilder + from sagemaker.serve.model_builder import ModelBuilder + from sagemaker.serve.builder.schema_builder import SchemaBuilder + + schema_builder = SchemaBuilder( + {"inputs": "What are falcons?", "parameters": {"max_new_tokens": 32}}, + [{"generated_text": "Falcons are birds of prey."}] + ) - # Create ModelBuilder with optimization settings model_builder = ModelBuilder( - model="huggingface-bert-base", - role="your-sagemaker-role" + model="meta-textgeneration-llama-3-8b-instruct", + schema_builder=schema_builder, ) - # Build and deploy with optimization - model = model_builder.build(model_name="optimized-bert") - endpoint = model_builder.deploy( - endpoint_name="bert-endpoint", - instance_type="ml.inf1.xlarge", - initial_instance_count=1 + optimized_model = model_builder.optimize( + instance_type="ml.g5.2xlarge", + quantization_config={"OverrideEnvironment": {"OPTION_QUANTIZE": "awq"}}, + accept_eula=True, + job_name="optimize-llama", + model_name="llama-optimized", ) -Key Inference Features -~~~~~~~~~~~~~~~~~~~~~ + endpoint = model_builder.deploy(endpoint_name="llama-endpoint", initial_instance_count=1) + +:doc:`Full example notebook <../v3-examples/inference-examples/optimize-example>` + + +.. rst-class:: new-content + +Train-to-Inference End-to-End +------------------------------ + +.. rst-class:: new-content-label + +Pass a ``ModelTrainer`` directly to ``ModelBuilder`` to go from training to deployment in one flow. + +.. 
code-block:: python + + from sagemaker.train.model_trainer import ModelTrainer + from sagemaker.train.configs import SourceCode + from sagemaker.serve.model_builder import ModelBuilder + from sagemaker.serve.spec.inference_spec import InferenceSpec + from sagemaker.serve.builder.schema_builder import SchemaBuilder + from sagemaker.serve.utils.types import ModelServer + + # Train + trainer = ModelTrainer( + training_image="pytorch-training:1.13.1-cpu-py39", + source_code=SourceCode(source_dir="./src", entry_script="train.py"), + base_job_name="my-training", + ) + trainer.train() + + # Deploy from trainer + model_builder = ModelBuilder( + model=trainer, + schema_builder=SchemaBuilder([[0.1, 0.2, 0.3, 0.4]], [[0.8, 0.2]]), + model_server=ModelServer.TORCHSERVE, + inference_spec=MyInferenceSpec(), + ) + + core_model = model_builder.build(model_name="trained-model") + endpoint = model_builder.deploy(endpoint_name="trained-endpoint", initial_instance_count=1) -* **Multi-Model Endpoints** - Host multiple models on a single endpoint with automatic model loading and unloading for cost optimization -* **Auto-Scaling Integration** - Automatically scale endpoint capacity based on traffic patterns with configurable scaling policies -* **A/B Testing Support** - Deploy multiple model variants with traffic splitting for safe model updates and performance comparison -* **Batch Transform Jobs** - Process large datasets efficiently with automatic data partitioning and parallel processing -* **Serverless Inference** - Pay-per-request pricing with automatic scaling from zero to handle variable workloads +:doc:`Full example notebook <../v3-examples/inference-examples/train-inference-e2e-example>` -Supported Inference Scenarios + +.. rst-class:: new-content + +JumpStart Train-to-Inference ----------------------------- -Deployment Types -~~~~~~~~~~~~~~~ +.. 
rst-class:: new-content-label + +Train a JumpStart model with ``ModelTrainer.from_jumpstart_config()`` then deploy via ``ModelBuilder``. + +.. code-block:: python + + from sagemaker.train.model_trainer import ModelTrainer + from sagemaker.serve.model_builder import ModelBuilder + from sagemaker.core.jumpstart.configs import JumpStartConfig + + jumpstart_config = JumpStartConfig(model_id="huggingface-spc-bert-base-cased") + + trainer = ModelTrainer.from_jumpstart_config( + jumpstart_config=jumpstart_config, + base_job_name="js-training", + hyperparameters={"epochs": 1}, + ) + trainer.train() + + model_builder = ModelBuilder(model=trainer, dependencies={"auto": False}) + core_model = model_builder.build(model_name="bert-trained") + endpoint = model_builder.deploy(endpoint_name="bert-endpoint") + +:doc:`Full example notebook <../v3-examples/inference-examples/jumpstart-e2e-training-example>` + -* **Real-Time Endpoints** - Low-latency inference for interactive applications -* **Batch Transform** - High-throughput processing for large datasets -* **Serverless Inference** - Cost-effective inference for variable workloads -* **Multi-Model Endpoints** - Host multiple models on shared infrastructure +.. rst-class:: new-content -Framework Support -~~~~~~~~~~~~~~~~~ +HuggingFace Model Deployment +------------------------------ + +.. rst-class:: new-content-label + +Deploy HuggingFace models with a custom ``InferenceSpec`` using Multi Model Server (MMS). + +.. 
code-block:: python -* **PyTorch** - Deep learning models with dynamic computation graphs -* **TensorFlow** - Production-ready machine learning models at scale -* **Scikit-learn** - Classical machine learning algorithms -* **XGBoost** - Gradient boosting models for structured data -* **HuggingFace** - Pre-trained transformer models for NLP tasks -* **Custom Containers** - Bring your own inference logic and dependencies + from sagemaker.serve.model_builder import ModelBuilder + from sagemaker.serve.spec.inference_spec import InferenceSpec + from sagemaker.serve.builder.schema_builder import SchemaBuilder + from sagemaker.serve.utils.types import ModelServer + + class HFSpec(InferenceSpec): + def load(self, model_dir): + from transformers import AutoTokenizer, AutoModelForCausalLM + tokenizer = AutoTokenizer.from_pretrained("microsoft/DialoGPT-small") + model = AutoModelForCausalLM.from_pretrained("microsoft/DialoGPT-small") + return {"model": model, "tokenizer": tokenizer} + + def invoke(self, input_object, model): + text = input_object["inputs"] + inputs = model["tokenizer"].encode(text, return_tensors="pt") + outputs = model["model"].generate(inputs, max_length=inputs.shape[1] + 20) + return [{"generated_text": model["tokenizer"].decode(outputs[0], skip_special_tokens=True)}] + + model_builder = ModelBuilder( + inference_spec=HFSpec(), + model_server=ModelServer.MMS, + schema_builder=SchemaBuilder( + {"inputs": "Hello, how are you?"}, + [{"generated_text": "I'm doing well!"}] + ), + ) + + core_model = model_builder.build(model_name="hf-dialogpt") + endpoint = model_builder.deploy(endpoint_name="hf-endpoint") + +:doc:`Full example notebook <../v3-examples/inference-examples/huggingface-example>` + + +.. rst-class:: new-content + +In-Process Mode +---------------- + +.. rst-class:: new-content-label + +Run inference entirely in your Python process with no containers or AWS resources. Use ``Mode.IN_PROCESS`` and ``deploy_local()``. + +.. 
code-block:: python + + from sagemaker.serve.model_builder import ModelBuilder + from sagemaker.serve.spec.inference_spec import InferenceSpec + from sagemaker.serve.builder.schema_builder import SchemaBuilder + from sagemaker.serve.mode.function_pointers import Mode + + class MathSpec(InferenceSpec): + def load(self, model_dir): + return {"factor": 2.0} + + def invoke(self, input_object, model): + numbers = input_object["numbers"] + return {"result": [n * model["factor"] for n in numbers]} + + model_builder = ModelBuilder( + inference_spec=MathSpec(), + schema_builder=SchemaBuilder({"numbers": [1.0, 2.0]}, {"result": [2.0, 4.0]}), + mode=Mode.IN_PROCESS, + ) + + core_model = model_builder.build(model_name="math-model") + local_endpoint = model_builder.deploy_local(endpoint_name="math-local") + + result = local_endpoint.invoke(body={"numbers": [3.0, 5.0]}, content_type="application/json") + +:doc:`Full example notebook <../v3-examples/inference-examples/in-process-mode-example>` + + +.. rst-class:: new-content + +Local Container Mode +--------------------- + +.. rst-class:: new-content-label + +Test models in Docker containers locally using ``Mode.LOCAL_CONTAINER`` and ``deploy_local()``. Same container environment as SageMaker endpoints. + +.. 
code-block:: python + + from sagemaker.serve.model_builder import ModelBuilder + from sagemaker.serve.spec.inference_spec import InferenceSpec + from sagemaker.serve.builder.schema_builder import SchemaBuilder + from sagemaker.serve.utils.types import ModelServer + from sagemaker.serve.mode.function_pointers import Mode + + model_builder = ModelBuilder( + inference_spec=MyPyTorchSpec(model_path="./model"), + model_server=ModelServer.TORCHSERVE, + schema_builder=SchemaBuilder([[1.0, 2.0, 3.0, 4.0]], [[0.6, 0.4]]), + mode=Mode.LOCAL_CONTAINER, + ) + + local_model = model_builder.build(model_name="local-pytorch") + local_endpoint = model_builder.deploy_local( + endpoint_name="local-pytorch-ep", + wait=True, + container_timeout_in_seconds=1200, + ) + + response = local_endpoint.invoke( + body=json.dumps([[1.0, 2.0, 3.0, 4.0]]), + content_type="application/json" + ) + +:doc:`Full example notebook <../v3-examples/inference-examples/local-mode-example>` + + +.. rst-class:: new-content + +Inference Pipelines (Multi-Container) +-------------------------------------- + +.. rst-class:: new-content-label + +Chain multiple containers into a serial inference pipeline. Pass a list of ``Model`` objects to ``ModelBuilder``. + +.. 
code-block:: python + + from sagemaker.core.resources import Model + from sagemaker.core.shapes import ContainerDefinition + from sagemaker.core.utils import repack_model + from sagemaker.serve import ModelBuilder + + # Create individual models with primary_container + sklearn_model = Model.create( + model_name="sklearn-preprocess", + primary_container=ContainerDefinition( + image=sklearn_image, + model_data_url=sklearn_repacked_uri, + environment={"SAGEMAKER_PROGRAM": "inference.py"}, + ), + execution_role_arn=role, + ) + + xgboost_model = Model.create( + model_name="xgboost-classifier", + primary_container=ContainerDefinition( + image=xgboost_image, + model_data_url=xgboost_model_uri, + ), + execution_role_arn=role, + ) + + # Build and deploy pipeline + pipeline_builder = ModelBuilder( + model=[sklearn_model, xgboost_model], + role_arn=role, + ) + pipeline_model = pipeline_builder.build() + endpoint = pipeline_builder.deploy( + endpoint_name="pipeline-ep", + instance_type="ml.m5.large", + initial_instance_count=1, + ) + + response = endpoint.invoke(body=csv_data, content_type="text/csv", accept="text/csv") + +:doc:`Full example notebook <../v3-examples/inference-examples/inference-pipeline-modelbuilder-vs-core-example>` -Advanced Features -~~~~~~~~~~~~~~~~ -* **Model Monitoring** - Track model performance and data drift in production -* **Endpoint Security** - VPC support, encryption, and IAM-based access control -* **Multi-AZ Deployment** - High availability with automatic failover -* **Custom Inference Logic** - Implement preprocessing, postprocessing, and custom prediction logic Migration from V2 ------------------ -If you're migrating from V2, the key changes are: +.. rst-class:: new-content + +Inference Classes and Imports +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. rst-class:: new-content-label + +.. 
list-table:: + :header-rows: 1 + :widths: 50 50 + + * - V2 + - V3 + * - ``sagemaker.model.Model`` + - ``sagemaker.serve.model_builder.ModelBuilder`` + * - ``sagemaker.pytorch.PyTorchModel`` + - ``sagemaker.serve.model_builder.ModelBuilder`` + * - ``sagemaker.tensorflow.TensorFlowModel`` + - ``sagemaker.serve.model_builder.ModelBuilder`` + * - ``sagemaker.huggingface.HuggingFaceModel`` + - ``sagemaker.serve.model_builder.ModelBuilder`` + * - ``sagemaker.sklearn.SKLearnModel`` + - ``sagemaker.serve.model_builder.ModelBuilder`` + * - ``sagemaker.xgboost.XGBoostModel`` + - ``sagemaker.serve.model_builder.ModelBuilder`` + * - ``sagemaker.jumpstart.JumpStartModel`` + - ``ModelBuilder.from_jumpstart_config(JumpStartConfig(...))`` + * - ``sagemaker.predictor.Predictor`` + - ``sagemaker.core.resources.Endpoint`` + * - ``sagemaker.serializers.*`` + - Handle serialization directly (e.g., ``json.dumps()``) + * - ``sagemaker.deserializers.*`` + - Handle deserialization directly (e.g., ``json.loads()``) + +.. rst-class:: new-content + +Methods and Patterns +~~~~~~~~~~~~~~~~~~~~~ + +.. rst-class:: new-content-label + +.. list-table:: + :header-rows: 1 + :widths: 50 50 + + * - V2 + - V3 + * - ``model.deploy(instance_type=..., initial_instance_count=...)`` + - ``model_builder.deploy(endpoint_name=..., instance_type=..., initial_instance_count=...)`` + * - ``estimator.deploy()`` + - ``ModelBuilder(model=trainer).deploy()`` + * - ``predictor.predict(data)`` + - ``endpoint.invoke(body=data, content_type="application/json")`` + * - ``model = Model(image_uri=..., model_data=...)`` + - ``model_builder = ModelBuilder(model=..., model_path=...)`` + * - ``model.deploy()`` returns ``Predictor`` + - ``model_builder.deploy()`` returns ``Endpoint`` + * - ``Transformer(model_name=...).transform(...)`` + - ``sagemaker.core.resources.TransformJob.create(...)`` + +.. rst-class:: new-content + +Session and Utilities +~~~~~~~~~~~~~~~~~~~~~~ + +.. rst-class:: new-content-label + +.. 
list-table:: + :header-rows: 1 + :widths: 50 50 + + * - V2 + - V3 + * - ``sagemaker.session.Session()`` + - ``sagemaker.core.helper.session_helper.Session()`` + * - ``sagemaker.get_execution_role()`` + - ``sagemaker.core.helper.session_helper.get_execution_role()`` + * - ``sagemaker.image_uris.retrieve(...)`` + - ``sagemaker.core.image_uris.retrieve(...)`` + * - ``boto3.client('sagemaker')`` + - ``sagemaker.core.resources.*`` (Model, Endpoint, EndpointConfig, etc.) + +.. rst-class:: new-content + +V3 Package Structure +~~~~~~~~~~~~~~~~~~~~~ + +.. rst-class:: new-content-label + +.. list-table:: + :header-rows: 1 + :widths: 30 70 + + * - V3 Package + - Purpose + * - ``sagemaker-core`` + - Low-level resource management (Model, Endpoint, EndpointConfig), session, image URIs + * - ``sagemaker-train`` + - ModelTrainer for training (used with ``ModelBuilder(model=trainer)``) + * - ``sagemaker-serve`` + - ModelBuilder, InferenceSpec, SchemaBuilder, ModelServer, deployment modes + * - ``sagemaker-mlops`` + - Pipelines, processing, model registry, monitoring, Clarify -* Replace framework-specific model classes (PyTorchModel, TensorFlowModel, etc.) 
with ``ModelBuilder`` -* Use structured configuration objects instead of parameter dictionaries -* Leverage the new ``invoke()`` method instead of ``predict()`` for more consistent API -* Take advantage of built-in optimization and auto-scaling features Inference Examples ----------------- @@ -195,3 +550,4 @@ Explore comprehensive inference examples that demonstrate V3 capabilities: Local Container Mode <../v3-examples/inference-examples/local-mode-example> Deploy HuggingFace Models <../v3-examples/inference-examples/huggingface-example> ModelBuilder in In-Process mode <../v3-examples/inference-examples/in-process-mode-example> + Inference Pipeline <../v3-examples/inference-examples/inference-pipeline-modelbuilder-vs-core-example> diff --git a/docs/ml_ops/index.rst b/docs/ml_ops/index.rst index 9c7e49b025..a9738bbad9 100644 --- a/docs/ml_ops/index.rst +++ b/docs/ml_ops/index.rst @@ -199,17 +199,388 @@ If you're migrating MLOps workflows from V2, the key improvements are: * **Improved Governance**: Integrated model registry and approval workflows streamline compliance * **Better Resource Management**: Automatic resource optimization and cost management across workflows +Lineage Tracking +~~~~~~~~~~~~~~~~ + +.. rst-class:: new-content + +SageMaker Lineage enables tracing events across your ML workflow via a graph structure. V3 provides lineage tracking through ``sagemaker.core.lineage`` with support for: + +.. rst-class:: new-content-label + +- **Contexts** - Logical grouping of lineage entities under workflow contexts +- **Actions** - Recording computational steps like model builds and transformations +- **Artifacts** - Registering data inputs, labels, and trained models +- **Associations** - Directed edges linking entities to form the lineage graph +- **Traversal** - Querying relationships between entities for reporting and analysis + +.. 
code-block:: python
+
+   from sagemaker.core.lineage.context import Context
+   from sagemaker.core.lineage.action import Action
+   from sagemaker.core.lineage.artifact import Artifact
+   from sagemaker.core.lineage.association import Association
+
+   # Create a workflow context
+   context = Context.create(
+       context_name="my-ml-workflow",
+       context_type="MLWorkflow",
+       source_uri="workflow-source",
+   )
+
+   # Create an action and associate it with the context
+   action = Action.create(
+       action_name="model-build-step",
+       action_type="ModelBuild",
+       source_uri="build-source",
+   )
+
+   Association.create(
+       source_arn=context.context_arn,
+       destination_arn=action.action_arn,
+       association_type="AssociatedWith",
+   )
+
+:doc:`Learn more about Lineage Tracking <lineage>`
+
 ML Operations Examples
 ----------------------
 
-Explore comprehensive MLOps examples that demonstrate V3 capabilities:
+.. rst-class:: new-content
+
+E2E Pipeline with Model Registry
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. rst-class:: new-content-label
+
+Build a SageMaker Pipeline that preprocesses data, trains a model, and registers it to the Model Registry.
+
+.. code-block:: python
+
+   from sagemaker.mlops.workflow.pipeline import Pipeline
+   from sagemaker.mlops.workflow.steps import ProcessingStep, TrainingStep, CacheConfig
+   from sagemaker.mlops.workflow.model_step import ModelStep
+   from sagemaker.core.processing import ScriptProcessor
+   from sagemaker.core.shapes import ProcessingInput, ProcessingS3Input, ProcessingOutput, ProcessingS3Output
+   from sagemaker.core.workflow.parameters import ParameterString
+   from sagemaker.core.workflow.pipeline_context import PipelineSession
+   from sagemaker.train import ModelTrainer
+   from sagemaker.train.configs import InputData, Compute
+   from sagemaker.serve.model_builder import ModelBuilder
+
+   pipeline_session = PipelineSession()
+
+   # Processing step
+   processor = ScriptProcessor(image_uri=sklearn_image, instance_type="ml.m5.xlarge", ...)
+ step_process = ProcessingStep(name="Preprocess", step_args=processor.run(...)) + + # Training step + trainer = ModelTrainer(training_image=xgboost_image, compute=Compute(instance_type="ml.m5.xlarge"), ...) + step_train = TrainingStep(name="Train", step_args=trainer.train()) + + # Register model + model_builder = ModelBuilder( + s3_model_data_url=step_train.properties.ModelArtifacts.S3ModelArtifacts, + image_uri=xgboost_image, role_arn=role, sagemaker_session=pipeline_session, + ) + step_register = ModelStep(name="Register", step_args=model_builder.register( + model_package_group_name="my-group", approval_status="Approved", + )) + + pipeline = Pipeline(name="my-pipeline", steps=[step_process, step_train, step_register], sagemaker_session=pipeline_session) + pipeline.upsert(role_arn=role) + pipeline.start() + +:doc:`Full example notebook <../v3-examples/ml-ops-examples/v3-pipeline-train-create-registry>` + + +.. rst-class:: new-content + +Processing Jobs +~~~~~~~~~~~~~~~~ + +.. rst-class:: new-content-label + +Run data preprocessing with ``ScriptProcessor`` (sklearn) or ``FrameworkProcessor`` (PyTorch). + +.. 
code-block:: python + + from sagemaker.core.processing import ScriptProcessor + from sagemaker.core.shapes import ProcessingInput, ProcessingS3Input, ProcessingOutput, ProcessingS3Output + + processor = ScriptProcessor( + image_uri=image_uris.retrieve(framework="sklearn", region=region, version="1.2-1", py_version="py3", instance_type="ml.m5.xlarge"), + instance_type="ml.m5.xlarge", instance_count=1, role=role, + ) + + processor.run( + inputs=[ProcessingInput(input_name="input-1", s3_input=ProcessingS3Input(s3_uri=input_data, local_path="/opt/ml/processing/input", s3_data_type="S3Prefix"))], + outputs=[ProcessingOutput(output_name="train", s3_output=ProcessingS3Output(s3_uri="s3://bucket/train", local_path="/opt/ml/processing/train", s3_upload_mode="EndOfJob"))], + code="code/preprocess.py", + arguments=["--input-data", input_data], + ) + +:doc:`SKLearn example <../v3-examples/ml-ops-examples/v3-processing-job-sklearn>` · :doc:`PyTorch example <../v3-examples/ml-ops-examples/v3-processing-job-pytorch/v3-pytorch-processing-example>` + + +.. rst-class:: new-content + +Batch Transform Jobs +~~~~~~~~~~~~~~~~~~~~~ + +.. rst-class:: new-content-label + +Run batch inference on large datasets using ``Transformer``. + +.. code-block:: python + + from sagemaker.core.transformer import Transformer + from sagemaker.serve.model_builder import ModelBuilder + + model_builder = ModelBuilder(image_uri=xgboost_image, s3_model_data_url=model_url, role_arn=role) + model_builder.build(model_name="my-transform-model") + + transformer = Transformer( + model_name="my-transform-model", instance_count=1, instance_type="ml.m5.xlarge", + accept="text/csv", assemble_with="Line", output_path="s3://bucket/output", + ) + transformer.transform("s3://bucket/input", content_type="text/csv", split_type="Line", input_filter="$[1:]") + +:doc:`Full example notebook <../v3-examples/ml-ops-examples/v3-transform-job-example>` + + +.. 
rst-class:: new-content + +Hyperparameter Tuning +~~~~~~~~~~~~~~~~~~~~~~ + +.. rst-class:: new-content-label + +Optimize hyperparameters with ``HyperparameterTuner`` using ``ContinuousParameter`` and ``CategoricalParameter`` ranges. + +.. code-block:: python + + from sagemaker.train.tuner import HyperparameterTuner + from sagemaker.core.parameter import ContinuousParameter, CategoricalParameter + from sagemaker.train import ModelTrainer + from sagemaker.train.configs import InputData + + trainer = ModelTrainer(training_image=pytorch_image, source_code=source_code, compute=compute, hyperparameters={"epochs": 1}) + + tuner = HyperparameterTuner( + model_trainer=trainer, + objective_metric_name="average test loss", + hyperparameter_ranges={"lr": ContinuousParameter(0.001, 0.1), "batch-size": CategoricalParameter([32, 64, 128])}, + metric_definitions=[{"Name": "average test loss", "Regex": "Test set: Average loss: ([0-9\\.]+)"}], + max_jobs=3, max_parallel_jobs=2, strategy="Random", objective_type="Minimize", + ) + + tuner.tune(inputs=[InputData(channel_name="training", data_source=s3_data_uri)], wait=False) + +:doc:`Standalone example <../v3-examples/ml-ops-examples/v3-hyperparameter-tuning-example/v3-hyperparameter-tuning-example>` · :doc:`Pipeline example <../v3-examples/ml-ops-examples/v3-hyperparameter-tuning-example/v3-hyperparameter-tuning-pipeline>` + + +.. rst-class:: new-content + +Model Registry +~~~~~~~~~~~~~~~ + +.. rst-class:: new-content-label + +Register models, create models from registry entries, and manage approval workflows. + +.. 
code-block:: python + + from sagemaker.serve.model_builder import ModelBuilder + from sagemaker.core.resources import Model, ModelPackage + + # Register from artifact + model_builder = ModelBuilder(s3_model_data_url=s3_url, image_uri=image_uri, role_arn=role) + model_builder.build(model_name="my-model") + model_builder.register(model_package_group_name="my-group", content_types=["application/json"], response_types=["application/json"], approval_status="Approved") + + # Create model from registry + model_package = ModelPackage.get(model_package_name=registered_arn) + model_builder = ModelBuilder( + s3_model_data_url=model_package.inference_specification.containers[0].model_data_url, + image_uri=model_package.inference_specification.containers[0].image, role_arn=role, + ) + model_builder.build(model_name="model-from-registry") + +:doc:`Full example notebook <../v3-examples/ml-ops-examples/v3-model-registry-example/v3-model-registry-example>` + + +.. rst-class:: new-content + +Clarify Bias and Explainability +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. rst-class:: new-content-label + +Run pre-training bias analysis and SHAP explainability using ``SageMakerClarifyProcessor``. + +.. code-block:: python + + from sagemaker.core.clarify import SageMakerClarifyProcessor, DataConfig, BiasConfig, SHAPConfig + + data_config = DataConfig(s3_data_input_path=data_uri, s3_output_path=output_uri, label="target", headers=headers, dataset_type="text/csv") + bias_config = BiasConfig(label_values_or_threshold=[1], facet_name="gender", facet_values_or_threshold=[1]) + + clarify_processor = SageMakerClarifyProcessor(role=role, instance_count=1, instance_type="ml.m5.large") + clarify_processor.run_pre_training_bias(data_config=data_config, data_bias_config=bias_config, methods=["CI", "DPL"]) + +:doc:`Full example notebook <../v3-examples/ml-ops-examples/v3-sagemaker-clarify>` + + +.. rst-class:: new-content + +EMR Serverless Pipeline Step +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. 
rst-class:: new-content-label + +Run PySpark jobs on EMR Serverless within a SageMaker Pipeline. + +.. code-block:: python + + from sagemaker.mlops.workflow.emr_serverless_step import EMRServerlessStep, EMRServerlessJobConfig + from sagemaker.mlops.workflow.pipeline import Pipeline + + job_config = EMRServerlessJobConfig( + job_driver={"sparkSubmit": {"entryPoint": script_uri, "entryPointArguments": ["--input", input_uri, "--output", output_uri]}}, + execution_role_arn=emr_role, + ) + + step = EMRServerlessStep( + name="SparkJob", job_config=job_config, + application_config={"name": "spark-app", "releaseLabel": "emr-6.15.0", "type": "SPARK"}, + ) + + pipeline = Pipeline(name="EMRPipeline", steps=[step], sagemaker_session=pipeline_session) + pipeline.upsert(role_arn=role) + pipeline.start() + +:doc:`Full example notebook <../v3-examples/ml-ops-examples/v3-emr-serverless-step-example>` + + +.. rst-class:: new-content + +MLflow Integration +~~~~~~~~~~~~~~~~~~~ + +.. rst-class:: new-content-label + +Train with MLflow metric tracking and deploy from the MLflow model registry. + +.. code-block:: python + + from sagemaker.train.model_trainer import ModelTrainer + from sagemaker.serve.model_builder import ModelBuilder + from sagemaker.serve.mode.function_pointers import Mode + + # Train (script logs to MLflow internally) + trainer = ModelTrainer(training_image=pytorch_image, source_code=SourceCode(source_dir=code_dir, entry_script="train.py", requirements="requirements.txt")) + trainer.train() + + # Deploy from MLflow registry + model_builder = ModelBuilder( + mode=Mode.SAGEMAKER_ENDPOINT, + schema_builder=schema_builder, + model_metadata={"MLFLOW_MODEL_PATH": "models:/my-model/1", "MLFLOW_TRACKING_ARN": tracking_arn}, + ) + model_builder.build(model_name="mlflow-model") + model_builder.deploy(endpoint_name="mlflow-endpoint") + +:doc:`Full example notebook <../v3-examples/ml-ops-examples/v3-mlflow-train-inference-e2e-example>` + + +.. 
rst-class:: new-content + +Migration from V2 +------------------ + +.. rst-class:: new-content-label + +MLOps Classes and Imports +~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. list-table:: + :header-rows: 1 + :widths: 50 50 + + * - V2 + - V3 + * - ``sagemaker.workflow.pipeline.Pipeline`` + - ``sagemaker.mlops.workflow.pipeline.Pipeline`` + * - ``sagemaker.workflow.steps.ProcessingStep`` + - ``sagemaker.mlops.workflow.steps.ProcessingStep`` + * - ``sagemaker.workflow.steps.TrainingStep`` + - ``sagemaker.mlops.workflow.steps.TrainingStep`` + * - ``sagemaker.workflow.step_collections.RegisterModel`` + - ``sagemaker.mlops.workflow.model_step.ModelStep`` + ``model_builder.register()`` + * - ``sagemaker.workflow.model_step.ModelStep`` + - ``sagemaker.mlops.workflow.model_step.ModelStep`` + * - ``sagemaker.sklearn.processing.SKLearnProcessor`` + - ``sagemaker.core.processing.ScriptProcessor`` + * - ``sagemaker.processing.ScriptProcessor`` + - ``sagemaker.core.processing.ScriptProcessor`` + * - ``sagemaker.processing.FrameworkProcessor`` + - ``sagemaker.core.processing.FrameworkProcessor`` + * - ``sagemaker.processing.ProcessingInput`` + - ``sagemaker.core.shapes.ProcessingInput`` + ``ProcessingS3Input`` + * - ``sagemaker.processing.ProcessingOutput`` + - ``sagemaker.core.shapes.ProcessingOutput`` + ``ProcessingS3Output`` + * - ``sagemaker.tuner.HyperparameterTuner`` + - ``sagemaker.train.tuner.HyperparameterTuner`` + * - ``sagemaker.parameter.ContinuousParameter`` + - ``sagemaker.core.parameter.ContinuousParameter`` + * - ``sagemaker.transformer.Transformer`` + - ``sagemaker.core.transformer.Transformer`` + * - ``sagemaker.clarify.SageMakerClarifyProcessor`` + - ``sagemaker.core.clarify.SageMakerClarifyProcessor`` + * - ``sagemaker.workflow.parameters.ParameterString`` + - ``sagemaker.core.workflow.parameters.ParameterString`` + * - ``sagemaker.workflow.pipeline_context.PipelineSession`` + - ``sagemaker.core.workflow.pipeline_context.PipelineSession`` + * - 
``sagemaker.lineage.context.Context`` + - ``sagemaker.core.lineage.context.Context`` + +.. rst-class:: new-content + +V3 Package Structure +~~~~~~~~~~~~~~~~~~~~~ + +.. rst-class:: new-content-label + +.. list-table:: + :header-rows: 1 + :widths: 30 70 + + * - V3 Package + - MLOps Components + * - ``sagemaker-core`` + - ScriptProcessor, FrameworkProcessor, Transformer, Clarify, lineage, pipeline context, parameters, image URIs + * - ``sagemaker-train`` + - ModelTrainer, HyperparameterTuner, InputData, Compute, SourceCode + * - ``sagemaker-serve`` + - ModelBuilder (build, register, deploy) + * - ``sagemaker-mlops`` + - Pipeline, ProcessingStep, TrainingStep, ModelStep, TuningStep, EMRServerlessStep, CacheConfig + + +Explore comprehensive MLOps examples: .. toctree:: :maxdepth: 1 + lineage ../v3-examples/ml-ops-examples/v3-sagemaker-clarify ../v3-examples/ml-ops-examples/v3-pipeline-train-create-registry ../v3-examples/ml-ops-examples/v3-transform-job-example ../v3-examples/ml-ops-examples/v3-hyperparameter-tuning-example/v3-hyperparameter-tuning-example + ../v3-examples/ml-ops-examples/v3-hyperparameter-tuning-example/v3-hyperparameter-tuning-pipeline ../v3-examples/ml-ops-examples/v3-model-registry-example/v3-model-registry-example ../v3-examples/ml-ops-examples/v3-processing-job-pytorch/v3-pytorch-processing-example + ../v3-examples/ml-ops-examples/v3-processing-job-sklearn + ../v3-examples/ml-ops-examples/v3-emr-serverless-step-example + ../v3-examples/ml-ops-examples/v3-mlflow-train-inference-e2e-example diff --git a/docs/ml_ops/lineage.rst b/docs/ml_ops/lineage.rst new file mode 100644 index 0000000000..cc770d6587 --- /dev/null +++ b/docs/ml_ops/lineage.rst @@ -0,0 +1,473 @@ +Lineage Tracking +================ + +Amazon SageMaker Lineage enables events that happen within SageMaker to be traced via a graph structure. The data simplifies generating reports, making comparisons, or discovering relationships between events. 
For example, you can easily trace both how a model was generated and where the model was deployed. + +The lineage graph is created automatically by SageMaker and you can directly create or modify your own graphs. + +Key Concepts +------------ + +- **Lineage Graph** - A connected graph tracing your machine learning workflow end to end. +- **Artifacts** - Represents a URI addressable object or data. Artifacts are typically inputs or outputs to Actions. +- **Actions** - Represents an action taken such as a computation, transformation, or job. +- **Contexts** - Provides a method to logically group other entities. +- **Associations** - A directed edge in the lineage graph that links two entities. +- **Lineage Traversal** - Starting from an arbitrary point, trace the lineage graph to discover and analyze relationships between steps in your workflow. +- **Experiments** - Experiment entities (Experiments, Trials, and Trial Components) are also part of the lineage graph and can be associated with Artifacts, Actions, or Contexts. + +Use Cases +--------- + +The notebook from the `SageMaker examples repository <https://github.com/aws/amazon-sagemaker-examples>`_ demonstrates the following major use cases for lineage tracking: + +1. **Creating Lineage Contexts** - Group related lineage entities under a logical workflow context +2. **Listing Lineage Entities** - Query and enumerate existing contexts, actions, artifacts, and associations +3. **Creating Actions** - Record computational steps such as model builds, transformations, or training jobs +4. **Creating Artifacts** - Register data inputs (datasets, labels) and outputs (trained models) as lineage artifacts +5. **Creating Associations** - Link entities together with directed edges to form the lineage graph +6. **Traversing Associations** - Query incoming and outgoing associations to understand entity relationships +7. 
**Cleaning Up Lineage Data** - Delete associations and entities when they are no longer needed + +V3 Migration Notes +------------------ + +In SageMaker Python SDK V3, lineage classes have moved from ``sagemaker.lineage`` to ``sagemaker.core.lineage``. The old import paths still work via compatibility shims but emit deprecation warnings. + +.. list-table:: Import Path Changes + :header-rows: 1 + :widths: 50 50 + + * - V2 Import + - V3 Import + * - ``from sagemaker.lineage.context import Context`` + - ``from sagemaker.core.lineage.context import Context`` + * - ``from sagemaker.lineage.action import Action`` + - ``from sagemaker.core.lineage.action import Action`` + * - ``from sagemaker.lineage.artifact import Artifact`` + - ``from sagemaker.core.lineage.artifact import Artifact`` + * - ``from sagemaker.lineage.association import Association`` + - ``from sagemaker.core.lineage.association import Association`` + * - ``import sagemaker`` / ``sagemaker.session.Session()`` + - ``from sagemaker.core.helper.session_helper import Session`` + +The API signatures for ``create``, ``list``, ``delete``, and association management remain the same. The key change is the import path. + + +Use Case 1: Session Setup +------------------------- + +Initialize a SageMaker session and set up common variables. + +**V2 (Legacy):** + +.. code-block:: python + + import boto3 + import sagemaker + + region = boto3.Session().region_name + sagemaker_session = sagemaker.session.Session() + default_bucket = sagemaker_session.default_bucket() + +**V3:** + +.. code-block:: python + + import boto3 + from sagemaker.core.helper.session_helper import Session + + region = boto3.Session().region_name + sagemaker_session = Session() + default_bucket = sagemaker_session.default_bucket() + + +Use Case 2: Creating a Lineage Context +--------------------------------------- + +Contexts provide a method to logically group other lineage entities. Each context name must be unique across all other contexts. 
+ +**V2 (Legacy):** + +.. code-block:: python + + from datetime import datetime + from sagemaker.lineage.context import Context + + unique_id = str(int(datetime.now().replace(microsecond=0).timestamp())) + context_name = f"machine-learning-workflow-{unique_id}" + + ml_workflow_context = Context.create( + context_name=context_name, + context_type="MLWorkflow", + source_uri=unique_id, + properties={"example": "true"}, + ) + +**V3:** + +.. code-block:: python + + from datetime import datetime + from sagemaker.core.lineage.context import Context + + unique_id = str(int(datetime.now().replace(microsecond=0).timestamp())) + context_name = f"machine-learning-workflow-{unique_id}" + + ml_workflow_context = Context.create( + context_name=context_name, + context_type="MLWorkflow", + source_uri=unique_id, + properties={"example": "true"}, + ) + + +Use Case 3: Listing Contexts +----------------------------- + +Enumerate existing contexts sorted by creation time. + +**V2 (Legacy):** + +.. code-block:: python + + from sagemaker.lineage.context import Context + + contexts = Context.list(sort_by="CreationTime", sort_order="Descending") + for ctx in contexts: + print(ctx.context_name) + +**V3:** + +.. code-block:: python + + from sagemaker.core.lineage.context import Context + + contexts = Context.list(sort_by="CreationTime", sort_order="Descending") + for ctx in contexts: + print(ctx.context_name) + + +Use Case 4: Creating an Action +------------------------------- + +Actions represent computational steps such as model builds, transformations, or training jobs. + +**V2 (Legacy):** + +.. code-block:: python + + from sagemaker.lineage.action import Action + + model_build_action = Action.create( + action_name=f"model-build-step-{unique_id}", + action_type="ModelBuild", + source_uri=unique_id, + properties={"Example": "Metadata"}, + ) + +**V3:** + +.. 
code-block:: python + + from sagemaker.core.lineage.action import Action + + model_build_action = Action.create( + action_name=f"model-build-step-{unique_id}", + action_type="ModelBuild", + source_uri=unique_id, + properties={"Example": "Metadata"}, + ) + + +Use Case 5: Creating Associations +----------------------------------- + +Associations are directed edges in the lineage graph. The ``association_type`` can be ``Produced``, ``DerivedFrom``, ``AssociatedWith``, or ``ContributedTo``. + +**V2 (Legacy):** + +.. code-block:: python + + from sagemaker.lineage.association import Association + + context_action_association = Association.create( + source_arn=ml_workflow_context.context_arn, + destination_arn=model_build_action.action_arn, + association_type="AssociatedWith", + ) + +**V3:** + +.. code-block:: python + + from sagemaker.core.lineage.association import Association + + context_action_association = Association.create( + source_arn=ml_workflow_context.context_arn, + destination_arn=model_build_action.action_arn, + association_type="AssociatedWith", + ) + + + +Use Case 6: Traversing Associations +------------------------------------- + +Query incoming and outgoing associations to understand how entities are related in the lineage graph. + +**V2 (Legacy):** + +.. code-block:: python + + from sagemaker.lineage.association import Association + + # List incoming associations to an action + incoming = Association.list(destination_arn=model_build_action.action_arn) + for association in incoming: + print(f"{model_build_action.action_name} has incoming association from {association.source_name}") + + # List outgoing associations from a context + outgoing = Association.list(source_arn=ml_workflow_context.context_arn) + for association in outgoing: + print(f"{ml_workflow_context.context_name} has outgoing association to {association.destination_name}") + +**V3:** + +.. 
code-block:: python + + from sagemaker.core.lineage.association import Association + + # List incoming associations to an action + incoming = Association.list(destination_arn=model_build_action.action_arn) + for association in incoming: + print(f"{model_build_action.action_name} has incoming association from {association.source_name}") + + # List outgoing associations from a context + outgoing = Association.list(source_arn=ml_workflow_context.context_arn) + for association in outgoing: + print(f"{ml_workflow_context.context_name} has outgoing association to {association.destination_name}") + + +Use Case 7: Creating Artifacts +------------------------------- + +Artifacts represent URI-addressable objects or data, such as datasets, labels, or trained models. + +**V2 (Legacy):** + +.. code-block:: python + + from sagemaker.lineage.artifact import Artifact + + input_test_images = Artifact.create( + artifact_name="mnist-test-images", + artifact_type="TestData", + source_types=[{"SourceIdType": "Custom", "Value": unique_id}], + source_uri=f"https://sagemaker-example-files-prod-{region}.s3.amazonaws.com/datasets/image/MNIST/t10k-images-idx3-ubyte.gz", + ) + + input_test_labels = Artifact.create( + artifact_name="mnist-test-labels", + artifact_type="TestLabels", + source_types=[{"SourceIdType": "Custom", "Value": unique_id}], + source_uri=f"https://sagemaker-example-files-prod-{region}.s3.amazonaws.com/datasets/image/MNIST/t10k-labels-idx1-ubyte.gz", + ) + + output_model = Artifact.create( + artifact_name="mnist-model", + artifact_type="Model", + source_types=[{"SourceIdType": "Custom", "Value": unique_id}], + source_uri=f"https://sagemaker-example-files-prod-{region}.s3.amazonaws.com/datasets/image/MNIST/model/tensorflow-training-2020-11-20-23-57-13-077/model.tar.gz", + ) + +**V3:** + +.. 
code-block:: python + + from sagemaker.core.lineage.artifact import Artifact + + input_test_images = Artifact.create( + artifact_name="mnist-test-images", + artifact_type="TestData", + source_types=[{"SourceIdType": "Custom", "Value": unique_id}], + source_uri=f"https://sagemaker-example-files-prod-{region}.s3.amazonaws.com/datasets/image/MNIST/t10k-images-idx3-ubyte.gz", + ) + + input_test_labels = Artifact.create( + artifact_name="mnist-test-labels", + artifact_type="TestLabels", + source_types=[{"SourceIdType": "Custom", "Value": unique_id}], + source_uri=f"https://sagemaker-example-files-prod-{region}.s3.amazonaws.com/datasets/image/MNIST/t10k-labels-idx1-ubyte.gz", + ) + + output_model = Artifact.create( + artifact_name="mnist-model", + artifact_type="Model", + source_types=[{"SourceIdType": "Custom", "Value": unique_id}], + source_uri=f"https://sagemaker-example-files-prod-{region}.s3.amazonaws.com/datasets/image/MNIST/model/tensorflow-training-2020-11-20-23-57-13-077/model.tar.gz", + ) + + +Use Case 8: Linking Artifacts to Actions +------------------------------------------ + +Associate data artifacts as inputs to an action, and the action's output to a model artifact, forming a complete lineage chain. + +**V2 (Legacy):** + +.. code-block:: python + + from sagemaker.lineage.association import Association + + # Link input data to the model build action + Association.create( + source_arn=input_test_images.artifact_arn, + destination_arn=model_build_action.action_arn, + ) + Association.create( + source_arn=input_test_labels.artifact_arn, + destination_arn=model_build_action.action_arn, + ) + + # Link the action output to the model artifact + Association.create( + source_arn=model_build_action.action_arn, + destination_arn=output_model.artifact_arn, + ) + +**V3:** + +.. 
code-block:: python + + from sagemaker.core.lineage.association import Association + + # Link input data to the model build action + Association.create( + source_arn=input_test_images.artifact_arn, + destination_arn=model_build_action.action_arn, + ) + Association.create( + source_arn=input_test_labels.artifact_arn, + destination_arn=model_build_action.action_arn, + ) + + # Link the action output to the model artifact + Association.create( + source_arn=model_build_action.action_arn, + destination_arn=output_model.artifact_arn, + ) + + +Use Case 9: Cleaning Up Lineage Data +-------------------------------------- + +Delete associations first, then delete the entities themselves. Associations must be removed before their source or destination entities can be deleted. + +**V2 (Legacy):** + +.. code-block:: python + + import sagemaker + from sagemaker.lineage.association import Association + from sagemaker.lineage.context import Context + from sagemaker.lineage.action import Action + from sagemaker.lineage.artifact import Artifact + + sagemaker_session = sagemaker.session.Session() + + def delete_associations(arn): + for summary in Association.list(destination_arn=arn): + assct = Association( + source_arn=summary.source_arn, + destination_arn=summary.destination_arn, + sagemaker_session=sagemaker_session, + ) + assct.delete() + for summary in Association.list(source_arn=arn): + assct = Association( + source_arn=summary.source_arn, + destination_arn=summary.destination_arn, + sagemaker_session=sagemaker_session, + ) + assct.delete() + + # Delete context + delete_associations(ml_workflow_context.context_arn) + Context(context_name=ml_workflow_context.context_name, sagemaker_session=sagemaker_session).delete() + + # Delete action + delete_associations(model_build_action.action_arn) + Action(action_name=model_build_action.action_name, sagemaker_session=sagemaker_session).delete() + + # Delete artifacts + for artifact in [input_test_images, input_test_labels, output_model]: 
+ delete_associations(artifact.artifact_arn) + Artifact(artifact_arn=artifact.artifact_arn, sagemaker_session=sagemaker_session).delete() + +**V3:** + +.. code-block:: python + + from sagemaker.core.helper.session_helper import Session + from sagemaker.core.lineage.association import Association + from sagemaker.core.lineage.context import Context + from sagemaker.core.lineage.action import Action + from sagemaker.core.lineage.artifact import Artifact + + sagemaker_session = Session() + + def delete_associations(arn): + for summary in Association.list(destination_arn=arn): + assct = Association( + source_arn=summary.source_arn, + destination_arn=summary.destination_arn, + sagemaker_session=sagemaker_session, + ) + assct.delete() + for summary in Association.list(source_arn=arn): + assct = Association( + source_arn=summary.source_arn, + destination_arn=summary.destination_arn, + sagemaker_session=sagemaker_session, + ) + assct.delete() + + # Delete context + delete_associations(ml_workflow_context.context_arn) + Context(context_name=ml_workflow_context.context_name, sagemaker_session=sagemaker_session).delete() + + # Delete action + delete_associations(model_build_action.action_arn) + Action(action_name=model_build_action.action_name, sagemaker_session=sagemaker_session).delete() + + # Delete artifacts + for artifact in [input_test_images, input_test_labels, output_model]: + delete_associations(artifact.artifact_arn) + Artifact(artifact_arn=artifact.artifact_arn, sagemaker_session=sagemaker_session).delete() + + +Caveats +------- + +- Associations cannot be created between two experiment entities (e.g., between an Experiment and Trial). +- Associations can only be created between Action, Artifact, or Context resources. +- Maximum number of manually created lineage entities: + + - Artifacts: 6000 + - Contexts: 500 + - Actions: 3000 + - Associations: 6000 + +- There is no limit on the number of lineage entities created automatically by SageMaker. 
+ + +Lineage Tracking Example +------------------------- + +For a complete end-to-end V3 example, see the lineage tracking notebook: + +.. toctree:: + :maxdepth: 1 + + ../v3-examples/ml-ops-examples/v3-lineage-tracking-example diff --git a/docs/training/index.rst b/docs/training/index.rst index 5ba8bbd7a1..febdfa7acd 100644 --- a/docs/training/index.rst +++ b/docs/training/index.rst @@ -1,7 +1,7 @@ Model Training =============== -SageMaker Python SDK V3 revolutionizes machine learning training with the unified **ModelTrainer** class, replacing the complex framework-specific estimators from V2. This modern approach provides a consistent interface across all training scenarios while maintaining the power and flexibility you need. +SageMaker Python SDK V3 provides a unified **ModelTrainer** class that replaces the framework-specific estimators from V2. This single class handles PyTorch, TensorFlow, Scikit-learn, XGBoost, and custom containers through a consistent interface. Key Benefits of V3 Training --------------------------- @@ -13,14 +13,12 @@ Key Benefits of V3 Training Quick Start Example ------------------- -Here's how training has evolved from V2 to V3: - **SageMaker Python SDK V2:** .. code-block:: python from sagemaker.estimator import Estimator - + estimator = Estimator( image_uri="my-training-image", role="arn:aws:iam::123456789012:role/SageMakerRole", @@ -49,88 +47,526 @@ Here's how training has evolved from V2 to V3: trainer.train(input_data_config=[train_data]) -ModelTrainer Overview --------------------- -The ``ModelTrainer`` class is the cornerstone of SageMaker Python SDK V3, providing a unified interface for all training scenarios. This single class replaces the complex web of framework-specific estimators from V2, offering: +.. 
rst-class:: new-content -**Unified Training Interface** - One class handles PyTorch, TensorFlow, Scikit-learn, XGBoost, and custom containers +Local Container Training +------------------------ -**Intelligent Defaults** - Automatically configures optimal settings based on your training requirements +.. rst-class:: new-content-label -**Flexible Configuration** - Object-oriented design with structured configs that align with AWS APIs +Run training jobs in Docker containers on your local machine for rapid development and debugging before deploying to SageMaker cloud instances. Local mode requires Docker to be installed and running. -**Seamless Integration** - Works seamlessly with SageMaker features like distributed training, spot instances, and hyperparameter tuning +**Session Setup and Image Retrieval:** .. code-block:: python - from sagemaker.train import ModelTrainer - from sagemaker.train.configs import InputData, ResourceConfig + from sagemaker.core.helper.session_helper import Session + from sagemaker.core import image_uris - # Create trainer with intelligent defaults - trainer = ModelTrainer( - training_image="your-training-image", - role="your-sagemaker-role" + sagemaker_session = Session() + region = sagemaker_session.boto_region_name + + training_image = image_uris.retrieve( + framework="pytorch", + region=region, + version="2.0.0", + py_version="py310", + instance_type="ml.m5.xlarge", + image_scope="training" + ) + +**Configuring Local Container Training:** + +.. 
code-block:: python + + from sagemaker.train.model_trainer import ModelTrainer, Mode + from sagemaker.train.configs import SourceCode, Compute, InputData + + source_code = SourceCode( + source_dir="./source", + entry_script="train.py", + ) + + compute = Compute( + instance_type="local_cpu", + instance_count=1, ) - # Configure training data train_data = InputData( - channel_name="training", - data_source="s3://your-bucket/train-data" + channel_name="train", + data_source="./data/train", ) - # Start training - training_job = trainer.train( + model_trainer = ModelTrainer( + training_image=training_image, + sagemaker_session=sagemaker_session, + source_code=source_code, + compute=compute, input_data_config=[train_data], - resource_config=ResourceConfig( - instance_type="ml.m5.xlarge", - instance_count=1 - ) + base_job_name="local-training", + training_mode=Mode.LOCAL_CONTAINER, + ) + + model_trainer.train() + +Key points: + +- Use ``instance_type="local_cpu"`` or ``"local_gpu"`` for local execution +- Set ``training_mode=Mode.LOCAL_CONTAINER`` to run in Docker +- Local data paths are mounted directly into the container +- Training artifacts are saved to the current working directory + +:doc:`Full example notebook <../v3-examples/training-examples/local-training-example>` + + +.. rst-class:: new-content + +Distributed Local Training +-------------------------- + +.. rst-class:: new-content-label + +Test multi-node distributed training locally using multiple Docker containers before deploying to cloud. This uses the ``Torchrun`` distributed driver to coordinate training across containers. + +**Configuring Distributed Local Training:** + +.. 
code-block:: python + + from sagemaker.train.model_trainer import ModelTrainer, Mode + from sagemaker.train.configs import SourceCode, Compute, InputData + from sagemaker.train.distributed import Torchrun + + source_code = SourceCode( + source_dir="./source", + entry_script="train.py", + ) + + distributed = Torchrun( + process_count_per_node=1, + ) + + compute = Compute( + instance_type="local_cpu", + instance_count=2, # Two containers for distributed training + ) + + model_trainer = ModelTrainer( + training_image=training_image, + sagemaker_session=sagemaker_session, + source_code=source_code, + distributed=distributed, + compute=compute, + input_data_config=[train_data, test_data], + base_job_name="distributed-local-training", + training_mode=Mode.LOCAL_CONTAINER, + ) + + model_trainer.train() + +Key points: + +- ``instance_count=2`` launches two Docker containers +- ``Torchrun`` handles process coordination across containers +- ``process_count_per_node`` controls how many training processes run per container +- Temporary directories (``shared``, ``algo-1``, ``algo-2``) are cleaned up automatically after training + +:doc:`Full example notebook <../v3-examples/training-examples/distributed-local-training-example>` + + +.. rst-class:: new-content + +Hyperparameter Management +------------------------- + +.. rst-class:: new-content-label + +ModelTrainer supports loading hyperparameters from JSON files, YAML files, or Python dictionaries. File-based hyperparameters provide better version control and support for complex nested structures. + +**Loading Hyperparameters from JSON:** + +.. 
code-block:: python + + from sagemaker.train.model_trainer import ModelTrainer + from sagemaker.train.configs import SourceCode + + source_code = SourceCode( + source_dir="./source", + requirements="requirements.txt", + entry_script="train.py", + ) + + trainer = ModelTrainer( + training_image=training_image, + hyperparameters="hyperparameters.json", # Path to JSON file + source_code=source_code, + base_job_name="hp-json-training", + ) + + trainer.train() + +**Loading Hyperparameters from YAML:** + +.. code-block:: python + + trainer = ModelTrainer( + training_image=training_image, + hyperparameters="hyperparameters.yaml", # Path to YAML file + source_code=source_code, + base_job_name="hp-yaml-training", + ) + + trainer.train() + +**Using a Python Dictionary:** + +.. code-block:: python + + trainer = ModelTrainer( + training_image=training_image, + hyperparameters={ + "epochs": 10, + "learning_rate": 0.001, + "batch_size": 32, + "model_config": {"hidden_size": 256, "num_layers": 3}, + }, + source_code=source_code, + base_job_name="hp-dict-training", + ) + + trainer.train() + +Key points: + +- JSON and YAML files support complex nested structures (dicts, lists, booleans, floats) +- Hyperparameters are passed to the training script as command-line arguments +- They are also available via the ``SM_HPS`` environment variable as a JSON string +- All three approaches (JSON, YAML, dict) produce identical training behavior + +:doc:`Full example notebook <../v3-examples/training-examples/hyperparameter-training-example>` + + +.. rst-class:: new-content + +JumpStart Training +------------------ + +.. rst-class:: new-content-label + +Train pre-configured models from the SageMaker JumpStart hub using ``ModelTrainer.from_jumpstart_config()``. JumpStart provides optimized training scripts, default hyperparameters, and curated datasets for hundreds of models. + +**Training a HuggingFace BERT Model:** + +.. 
code-block:: python + + from sagemaker.train.model_trainer import ModelTrainer + from sagemaker.core.jumpstart import JumpStartConfig + from sagemaker.core.helper.session_helper import Session, get_execution_role + + sagemaker_session = Session() + role = get_execution_role() + + bert_config = JumpStartConfig( + model_id="huggingface-spc-bert-base-cased", + ) + + bert_trainer = ModelTrainer.from_jumpstart_config( + jumpstart_config=bert_config, + base_job_name="jumpstart-bert", + hyperparameters={ + "epochs": 1, + "learning_rate": 5e-5, + "train_batch_size": 32, + }, + sagemaker_session=sagemaker_session, + ) + + bert_trainer.train() + +**Training an XGBoost Classification Model:** + +.. code-block:: python + + xgboost_config = JumpStartConfig( + model_id="xgboost-classification-model", + ) + + xgboost_trainer = ModelTrainer.from_jumpstart_config( + jumpstart_config=xgboost_config, + base_job_name="jumpstart-xgboost", + hyperparameters={ + "num_round": 10, + "max_depth": 5, + "eta": 0.2, + "objective": "binary:logistic", + }, + sagemaker_session=sagemaker_session, + ) + + xgboost_trainer.train() + +**Discovering Available JumpStart Models:** + +.. 
code-block:: python + + from sagemaker.core.jumpstart.notebook_utils import list_jumpstart_models + from sagemaker.core.jumpstart.search import search_public_hub_models + + # List all available models + models = list_jumpstart_models() + + # Filter by framework + hf_models = list_jumpstart_models(filter="framework == huggingface") + + # Search with queries + results = search_public_hub_models(query="bert") + + # Complex queries with filters + text_gen = search_public_hub_models(query="@task:text-generation") + +Key points: + +- ``from_jumpstart_config()`` auto-configures training image, instance type, and default hyperparameters +- Override any default hyperparameters while keeping proven defaults for the rest +- JumpStart provides built-in datasets so you can start training immediately +- Supports HuggingFace, XGBoost, CatBoost, LightGBM, and many more frameworks +- Use ``list_jumpstart_models()`` and ``search_public_hub_models()`` to discover available models + +:doc:`Full example notebook <../v3-examples/training-examples/jumpstart-training-example>` + + +.. rst-class:: new-content + +Custom Distributed Training Drivers +------------------------------------ + +.. rst-class:: new-content-label + +Create custom distributed training drivers by extending ``DistributedConfig`` for specialized coordination logic, framework integration, or advanced debugging. + +**Defining a Custom Driver:** + +.. code-block:: python + + from sagemaker.train.distributed import DistributedConfig + + class CustomDriver(DistributedConfig): + process_count_per_node: int = None + + @property + def driver_dir(self) -> str: + return "./custom_drivers" + + @property + def driver_script(self) -> str: + return "driver.py" + +The driver script (``driver.py``) receives environment variables including ``SM_DISTRIBUTED_CONFIG``, ``SM_HPS``, ``SM_SOURCE_DIR``, and ``SM_ENTRY_SCRIPT`` to coordinate training. + +**Using the Custom Driver with ModelTrainer:** + +.. 
code-block:: python + + from sagemaker.train.model_trainer import ModelTrainer + from sagemaker.train.configs import SourceCode + + source_code = SourceCode( + source_dir="./scripts", + entry_script="entry_script.py", + ) + + custom_driver = CustomDriver(process_count_per_node=2) + + model_trainer = ModelTrainer( + training_image=training_image, + hyperparameters={"epochs": 10}, + source_code=source_code, + distributed=custom_driver, + base_job_name="custom-distributed", + ) + + model_trainer.train() + +Key points: + +- Extend ``DistributedConfig`` and implement ``driver_dir`` and ``driver_script`` properties +- The driver script manages process launching and coordination +- Environment variables provide access to hyperparameters, source code location, and distributed config +- Useful for custom frameworks, specialized coordination patterns, or advanced debugging + +:doc:`Full example notebook <../v3-examples/training-examples/custom-distributed-training-example>` + + +.. rst-class:: new-content + +AWS Batch Training Queues +------------------------- + +.. rst-class:: new-content-label + +Submit training jobs to AWS Batch job queues for automatic scheduling and resource management. Batch handles capacity allocation and job execution order. + +**Setting Up and Submitting Jobs:** + +.. 
code-block:: python + + from sagemaker.train.model_trainer import ModelTrainer + from sagemaker.train.configs import SourceCode, Compute, StoppingCondition + from sagemaker.train.aws_batch.training_queue import TrainingQueue + + source_code = SourceCode(command="echo 'Hello World'") + + model_trainer = ModelTrainer( + training_image=image_uri, + source_code=source_code, + base_job_name="batch-training-job", + compute=Compute(instance_type="ml.g5.xlarge", instance_count=1), + stopping_condition=StoppingCondition(max_runtime_in_seconds=300), ) -Framework Support -~~~~~~~~~~~~~~~~~ + # Create a queue reference and submit jobs + queue = TrainingQueue("my-sm-training-fifo-jq") + queued_job = queue.submit(training_job=model_trainer, inputs=None) -* **PyTorch** - Deep learning with dynamic computation graphs -* **TensorFlow** - Production-ready machine learning at scale -* **Scikit-learn** - Classical machine learning algorithms -* **XGBoost** - Gradient boosting for structured data -* **Custom Containers** - Bring your own training algorithms +**Creating Batch Resources Programmatically:** -Training Types -~~~~~~~~~~~~~~ +.. 
code-block:: python -* **Single Instance Training** - Cost-effective training for smaller models -* **Multi-Instance Training** - Distributed training for large-scale models -* **Spot Instance Training** - Cost optimization with managed spot instances -* **Local Mode Training** - Development and debugging on local infrastructure + from sagemaker.train.aws_batch.boto_client import get_batch_boto_client + from utils.aws_batch_resource_management import AwsBatchResourceManager, create_resources -Advanced Features -~~~~~~~~~~~~~~~~~ + resource_manager = AwsBatchResourceManager(get_batch_boto_client()) + resources = create_resources( + resource_manager, + job_queue_name="my-sm-training-fifo-jq", + service_environment_name="my-sm-training-fifo-se", + max_capacity=1, + ) + +Key points: + +- ``TrainingQueue`` wraps AWS Batch job queues for SageMaker training +- ``queue.submit()`` submits a ModelTrainer job to the queue +- Batch manages capacity allocation and job scheduling automatically +- Resources (Service Environments, Job Queues) can be created via console or programmatically +- Supports FIFO and priority-based scheduling + +:doc:`Full example notebook <../v3-examples/training-examples/aws_batch/sm-training-queues_getting_started_with_model_trainer>` -* **Automatic Model Tuning** - Hyperparameter optimization at scale -* **Distributed Training** - Multi-node, multi-GPU training strategies -* **Checkpointing** - Resume training from saved states -* **Early Stopping** - Prevent overfitting with intelligent stopping criteria Migration from V2 ------------------ -If you're migrating from V2, the key changes are: +.. rst-class:: new-content + +Training Classes and Imports +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. rst-class:: new-content-label + +.. 
list-table:: + :header-rows: 1 + :widths: 50 50 + + * - V2 + - V3 + * - ``sagemaker.estimator.Estimator`` + - ``sagemaker.train.model_trainer.ModelTrainer`` + * - ``sagemaker.pytorch.PyTorch`` + - ``sagemaker.train.model_trainer.ModelTrainer`` + * - ``sagemaker.tensorflow.TensorFlow`` + - ``sagemaker.train.model_trainer.ModelTrainer`` + * - ``sagemaker.huggingface.HuggingFace`` + - ``sagemaker.train.model_trainer.ModelTrainer`` + * - ``sagemaker.sklearn.SKLearn`` + - ``sagemaker.train.model_trainer.ModelTrainer`` + * - ``sagemaker.xgboost.XGBoost`` + - ``sagemaker.train.model_trainer.ModelTrainer`` + * - ``sagemaker.jumpstart.JumpStartEstimator`` + - ``ModelTrainer.from_jumpstart_config(JumpStartConfig(...))`` + * - ``sagemaker.tuner.HyperparameterTuner`` + - ``sagemaker.core.resources.HyperParameterTuningJob`` + +.. rst-class:: new-content + +Methods and Patterns +~~~~~~~~~~~~~~~~~~~~~ + +.. rst-class:: new-content-label + +.. list-table:: + :header-rows: 1 + :widths: 50 50 + + * - V2 + - V3 + * - ``estimator.fit({"train": "s3://..."})`` + - ``trainer.train(input_data_config=[InputData(...)])`` + * - ``estimator.deploy()`` + - ``ModelBuilder(model=trainer).deploy()`` + * - ``instance_type="ml.m5.xlarge"`` + - ``Compute(instance_type="ml.m5.xlarge")`` + * - ``entry_point="train.py"`` + - ``SourceCode(entry_script="train.py")`` + * - ``source_dir="./src"`` + - ``SourceCode(source_dir="./src")`` + * - ``sagemaker.inputs.TrainingInput(s3_data=...)`` + - ``InputData(channel_name=..., data_source=...)`` + * - ``hyperparameters={"lr": 0.01}`` + - ``hyperparameters={"lr": 0.01}`` or ``hyperparameters="config.json"`` + * - ``max_run=3600`` + - ``StoppingCondition(max_runtime_in_seconds=3600)`` + +.. rst-class:: new-content + +Session and Utilities +~~~~~~~~~~~~~~~~~~~~~~ + +.. rst-class:: new-content-label + +.. 
list-table:: + :header-rows: 1 + :widths: 50 50 + + * - V2 + - V3 + * - ``sagemaker.session.Session()`` + - ``sagemaker.core.helper.session_helper.Session()`` + * - ``sagemaker.get_execution_role()`` + - ``sagemaker.core.helper.session_helper.get_execution_role()`` + * - ``sagemaker.image_uris.retrieve(...)`` + - ``sagemaker.core.image_uris.retrieve(...)`` + * - ``import sagemaker`` (bare import) + - Use explicit imports from subpackages + * - ``boto3.client('sagemaker')`` + - ``sagemaker.core.resources.*`` (TrainingJob, Model, Endpoint, etc.) + +.. rst-class:: new-content + +V3 Package Structure +~~~~~~~~~~~~~~~~~~~~~ + +.. rst-class:: new-content-label + +.. list-table:: + :header-rows: 1 + :widths: 30 70 + + * - V3 Package + - Purpose + * - ``sagemaker-core`` + - Low-level resource management, session, image URIs, lineage, JumpStart + * - ``sagemaker-train`` + - ModelTrainer, Compute, SourceCode, InputData, distributed training + * - ``sagemaker-serve`` + - ModelBuilder, InferenceSpec, SchemaBuilder, deployment + * - ``sagemaker-mlops`` + - Pipelines, processing, model registry, monitoring, Clarify -* Replace framework-specific estimators (PyTorchEstimator, TensorFlowEstimator, etc.) with ``ModelTrainer`` -* Use structured ``InputData`` configs instead of dictionary-based input specifications -* Leverage the new object-oriented API for cleaner, more maintainable code Training Examples ----------------- -Explore comprehensive training examples that demonstrate V3 capabilities: - .. 
toctree:: :maxdepth: 1 diff --git a/v3-examples/ml-ops-examples/v3-lineage-tracking-example.ipynb b/v3-examples/ml-ops-examples/v3-lineage-tracking-example.ipynb new file mode 100644 index 0000000000..6088765334 --- /dev/null +++ b/v3-examples/ml-ops-examples/v3-lineage-tracking-example.ipynb @@ -0,0 +1,356 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# SageMaker Lineage Tracking - V3 SDK Example\n", + "\n", + "This notebook demonstrates how to use SageMaker Lineage Tracking with the V3 Python SDK.\n", + "\n", + "Amazon SageMaker Lineage enables events that happen within SageMaker to be traced via a graph structure. The data simplifies generating reports, making comparisons, or discovering relationships between events.\n", + "\n", + "## What you will learn\n", + "\n", + "- Create and manage lineage Contexts, Actions, and Artifacts\n", + "- Create Associations to link entities into a lineage graph\n", + "- Traverse associations to discover relationships\n", + "- Clean up lineage data" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Setup\n", + "\n", + "Initialize a SageMaker session using the V3 `Session` class." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import boto3\n", + "from sagemaker.core.helper.session_helper import Session\n", + "\n", + "region = boto3.Session().region_name\n", + "sagemaker_session = Session()\n", + "default_bucket = sagemaker_session.default_bucket()\n", + "\n", + "print(f\"Region: {region}\")\n", + "print(f\"Default bucket: {default_bucket}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from datetime import datetime\n", + "from sagemaker.core.lineage.context import Context\n", + "from sagemaker.core.lineage.action import Action\n", + "from sagemaker.core.lineage.association import Association\n", + "from sagemaker.core.lineage.artifact import Artifact\n", + "\n", + "unique_id = str(int(datetime.now().replace(microsecond=0).timestamp()))\n", + "print(f\"Unique id is {unique_id}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Use Case 1: Create a Lineage Context\n", + "\n", + "Contexts provide a method to logically group other lineage entities. The context name must be unique across all other contexts." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "context_name = f\"machine-learning-workflow-{unique_id}\"\n", + "\n", + "ml_workflow_context = Context.create(\n", + " context_name=context_name,\n", + " context_type=\"MLWorkflow\",\n", + " source_uri=unique_id,\n", + " properties={\"example\": \"true\"},\n", + ")\n", + "\n", + "print(f\"Created context: {ml_workflow_context.context_name}\")\n", + "print(f\"Context ARN: {ml_workflow_context.context_arn}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Use Case 2: List Contexts\n", + "\n", + "Enumerate existing contexts sorted by creation time." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "contexts = Context.list(sort_by=\"CreationTime\", sort_order=\"Descending\")\n", + "\n", + "for ctx in contexts:\n", + " print(ctx.context_name)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Use Case 3: Create an Action\n", + "\n", + "Actions represent computational steps such as model builds, transformations, or training jobs." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "model_build_action = Action.create(\n", + " action_name=f\"model-build-step-{unique_id}\",\n", + " action_type=\"ModelBuild\",\n", + " source_uri=unique_id,\n", + " properties={\"Example\": \"Metadata\"},\n", + ")\n", + "\n", + "print(f\"Created action: {model_build_action.action_name}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Use Case 4: Create Associations\n", + "\n", + "Associations are directed edges in the lineage graph. The `association_type` can be `Produced`, `DerivedFrom`, `AssociatedWith`, or `ContributedTo`." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "context_action_association = Association.create(\n", + " source_arn=ml_workflow_context.context_arn,\n", + " destination_arn=model_build_action.action_arn,\n", + " association_type=\"AssociatedWith\",\n", + ")\n", + "\n", + "print(\"Association created between context and action\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Use Case 5: Traverse Associations\n", + "\n", + "Query incoming and outgoing associations to understand entity relationships." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# List incoming associations to the action\n", + "incoming_associations = Association.list(destination_arn=model_build_action.action_arn)\n", + "for association in incoming_associations:\n", + " print(\n", + " f\"{model_build_action.action_name} has an incoming association from {association.source_name}\"\n", + " )\n", + "\n", + "# List outgoing associations from the context\n", + "outgoing_associations = Association.list(source_arn=ml_workflow_context.context_arn)\n", + "for association in outgoing_associations:\n", + " print(\n", + " f\"{ml_workflow_context.context_name} has an outgoing association to {association.destination_name}\"\n", + " )" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Use Case 6: Create Artifacts\n", + "\n", + "Artifacts represent URI-addressable objects or data such as datasets, labels, or trained models." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Create input data artifacts\n", + "input_test_images = Artifact.create(\n", + " artifact_name=\"mnist-test-images\",\n", + " artifact_type=\"TestData\",\n", + " source_types=[{\"SourceIdType\": \"Custom\", \"Value\": unique_id}],\n", + " source_uri=f\"https://sagemaker-example-files-prod-{region}.s3.amazonaws.com/datasets/image/MNIST/t10k-images-idx3-ubyte.gz\",\n", + ")\n", + "\n", + "input_test_labels = Artifact.create(\n", + " artifact_name=\"mnist-test-labels\",\n", + " artifact_type=\"TestLabels\",\n", + " source_types=[{\"SourceIdType\": \"Custom\", \"Value\": unique_id}],\n", + " source_uri=f\"https://sagemaker-example-files-prod-{region}.s3.amazonaws.com/datasets/image/MNIST/t10k-labels-idx1-ubyte.gz\",\n", + ")\n", + "\n", + "print(f\"Created artifact: {input_test_images.artifact_name}\")\n", + "print(f\"Created artifact: {input_test_labels.artifact_name}\")" + ] + }, + 
{ + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Create output model artifact\n", + "output_model = Artifact.create(\n", + " artifact_name=\"mnist-model\",\n", + " artifact_type=\"Model\",\n", + " source_types=[{\"SourceIdType\": \"Custom\", \"Value\": unique_id}],\n", + " source_uri=f\"https://sagemaker-example-files-prod-{region}.s3.amazonaws.com/datasets/image/MNIST/model/tensorflow-training-2020-11-20-23-57-13-077/model.tar.gz\",\n", + ")\n", + "\n", + "print(f\"Created artifact: {output_model.artifact_name}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Use Case 7: Link Artifacts to Actions\n", + "\n", + "Associate data artifacts as inputs to the action, and the action output to the model artifact." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Associate input data with the model build action\n", + "Association.create(\n", + " source_arn=input_test_images.artifact_arn,\n", + " destination_arn=model_build_action.action_arn,\n", + ")\n", + "Association.create(\n", + " source_arn=input_test_labels.artifact_arn,\n", + " destination_arn=model_build_action.action_arn,\n", + ")\n", + "\n", + "# Associate the action with the output model\n", + "Association.create(\n", + " source_arn=model_build_action.action_arn,\n", + " destination_arn=output_model.artifact_arn,\n", + ")\n", + "\n", + "print(\"Lineage graph complete: inputs -> action -> output\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Cleanup\n", + "\n", + "Delete all lineage entities created in this notebook. Associations must be removed before their source or destination entities can be deleted." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def delete_associations(arn):\n", + " \"\"\"Delete all incoming and outgoing associations for an entity.\"\"\"\n", + " for summary in Association.list(destination_arn=arn):\n", + " assct = Association(\n", + " source_arn=summary.source_arn,\n", + " destination_arn=summary.destination_arn,\n", + " sagemaker_session=sagemaker_session,\n", + " )\n", + " assct.delete()\n", + "\n", + " for summary in Association.list(source_arn=arn):\n", + " assct = Association(\n", + " source_arn=summary.source_arn,\n", + " destination_arn=summary.destination_arn,\n", + " sagemaker_session=sagemaker_session,\n", + " )\n", + " assct.delete()\n", + "\n", + "\n", + "def delete_lineage_data():\n", + " \"\"\"Delete all lineage entities created in this notebook.\"\"\"\n", + " print(f\"Deleting context {ml_workflow_context.context_name}\")\n", + " delete_associations(ml_workflow_context.context_arn)\n", + " ctx = Context(\n", + " context_name=ml_workflow_context.context_name,\n", + " sagemaker_session=sagemaker_session,\n", + " )\n", + " ctx.delete()\n", + "\n", + " print(f\"Deleting action {model_build_action.action_name}\")\n", + " delete_associations(model_build_action.action_arn)\n", + " actn = Action(\n", + " action_name=model_build_action.action_name,\n", + " sagemaker_session=sagemaker_session,\n", + " )\n", + " actn.delete()\n", + "\n", + " for artifact in [input_test_images, input_test_labels, output_model]:\n", + " print(f\"Deleting artifact {artifact.artifact_arn} {artifact.artifact_name}\")\n", + " delete_associations(artifact.artifact_arn)\n", + " artfct = Artifact(\n", + " artifact_arn=artifact.artifact_arn,\n", + " sagemaker_session=sagemaker_session,\n", + " )\n", + " artfct.delete()\n", + "\n", + "\n", + "delete_lineage_data()\n", + "print(\"Cleanup complete\")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + 
"name": "python3" + }, + "language_info": { + "name": "python", + "version": "3.10.0" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} From 9897384caddc042d1e4af3950e69d7cd4cd83d65 Mon Sep 17 00:00:00 2001 From: Gokul A Date: Thu, 26 Mar 2026 15:32:22 -0700 Subject: [PATCH 2/3] docs: Add detailed use case sections for training, inference, and MLOps Add comprehensive documentation extracted from V3 example notebooks covering all training, inference, and MLOps use cases with V3 code examples. New sections include local container training, distributed training, hyperparameter management, JumpStart, custom InferenceSpec, model optimization, inference pipelines, processing jobs, batch transform, model registry, Clarify, EMR Serverless, MLflow integration, and lineage tracking. Each section includes red NEW highlights for change detection and expanded V2-to-V3 migration tables. --- docs/_static/custom.css | 16 ++++++++++++++++ docs/ml_ops/index.rst | 20 ++++++++++---------- 2 files changed, 26 insertions(+), 10 deletions(-) diff --git a/docs/_static/custom.css b/docs/_static/custom.css index 6aa0183058..e9cda51a4b 100644 --- a/docs/_static/custom.css +++ b/docs/_static/custom.css @@ -179,6 +179,11 @@ p { padding-left: 16px; margin-left: 0; } +section.new-content { + border-left: 4px solid #dc2626; + padding-left: 16px; + margin-left: 0; +} .new-content-label::before { content: "NEW"; background: #dc2626; @@ -190,3 +195,14 @@ p { margin-right: 8px; vertical-align: middle; } +p.new-content-label::before { + content: "NEW"; + background: #dc2626; + color: white; + font-size: 0.7em; + font-weight: 700; + padding: 2px 8px; + border-radius: 3px; + margin-right: 8px; + vertical-align: middle; +} diff --git a/docs/ml_ops/index.rst b/docs/ml_ops/index.rst index a9738bbad9..eaa380e4fb 100644 --- a/docs/ml_ops/index.rst +++ b/docs/ml_ops/index.rst @@ -249,7 +249,7 @@ ML Operations Examples .. 
rst-class:: new-content E2E Pipeline with Model Registry -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +---------------------------------- .. rst-class:: new-content-label @@ -297,7 +297,7 @@ Build a SageMaker Pipeline that preprocesses data, trains a model, and registers .. rst-class:: new-content Processing Jobs -~~~~~~~~~~~~~~~~ +---------------- .. rst-class:: new-content-label @@ -326,7 +326,7 @@ Run data preprocessing with ``ScriptProcessor`` (sklearn) or ``FrameworkProcesso .. rst-class:: new-content Batch Transform Jobs -~~~~~~~~~~~~~~~~~~~~~ +--------------------- .. rst-class:: new-content-label @@ -352,7 +352,7 @@ Run batch inference on large datasets using ``Transformer``. .. rst-class:: new-content Hyperparameter Tuning -~~~~~~~~~~~~~~~~~~~~~~ +---------------------- .. rst-class:: new-content-label @@ -383,7 +383,7 @@ Optimize hyperparameters with ``HyperparameterTuner`` using ``ContinuousParamete .. rst-class:: new-content Model Registry -~~~~~~~~~~~~~~~ +--------------- .. rst-class:: new-content-label @@ -413,7 +413,7 @@ Register models, create models from registry entries, and manage approval workfl .. rst-class:: new-content Clarify Bias and Explainability -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +-------------------------------- .. rst-class:: new-content-label @@ -435,7 +435,7 @@ Run pre-training bias analysis and SHAP explainability using ``SageMakerClarifyP .. rst-class:: new-content EMR Serverless Pipeline Step -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +----------------------------- .. rst-class:: new-content-label @@ -466,7 +466,7 @@ Run PySpark jobs on EMR Serverless within a SageMaker Pipeline. .. rst-class:: new-content MLflow Integration -~~~~~~~~~~~~~~~~~~~ +------------------- .. rst-class:: new-content-label @@ -502,7 +502,7 @@ Migration from V2 .. rst-class:: new-content-label MLOps Classes and Imports -~~~~~~~~~~~~~~~~~~~~~~~~~~ +-------------------------- .. list-table:: :header-rows: 1 @@ -548,7 +548,7 @@ MLOps Classes and Imports .. 
rst-class:: new-content V3 Package Structure -~~~~~~~~~~~~~~~~~~~~~ +--------------------- .. rst-class:: new-content-label From 3b5b512d8c922a4df935df0e47b89d8d74c74fd6 Mon Sep 17 00:00:00 2001 From: Gokul A Date: Tue, 31 Mar 2026 08:44:21 -0700 Subject: [PATCH 3/3] docs: Add use case sections for training, inference, and MLOps Add comprehensive documentation extracted from V3 example notebooks covering training, inference, and MLOps use cases with V3 code examples. Includes local container training, distributed training, hyperparameter management, JumpStart, custom InferenceSpec, model optimization, inference pipelines, processing jobs, batch transform, model registry, Clarify, EMR Serverless, MLflow integration, lineage tracking, and expanded V2-to-V3 migration tables across all sections. --- docs/_static/custom.css | 33 --------------------------------- docs/inference/index.rst | 26 -------------------------- docs/ml_ops/index.rst | 22 ---------------------- docs/training/index.rst | 20 -------------------- 4 files changed, 101 deletions(-) diff --git a/docs/_static/custom.css b/docs/_static/custom.css index e9cda51a4b..66a55bb008 100644 --- a/docs/_static/custom.css +++ b/docs/_static/custom.css @@ -173,36 +173,3 @@ p { color: var(--pst-color-text-base); } -/* NEW CONTENT HIGHLIGHT - remove this block to revert red highlighting */ -.new-content { - border-left: 4px solid #dc2626; - padding-left: 16px; - margin-left: 0; -} -section.new-content { - border-left: 4px solid #dc2626; - padding-left: 16px; - margin-left: 0; -} -.new-content-label::before { - content: "NEW"; - background: #dc2626; - color: white; - font-size: 0.7em; - font-weight: 700; - padding: 2px 8px; - border-radius: 3px; - margin-right: 8px; - vertical-align: middle; -} -p.new-content-label::before { - content: "NEW"; - background: #dc2626; - color: white; - font-size: 0.7em; - font-weight: 700; - padding: 2px 8px; - border-radius: 3px; - margin-right: 8px; - vertical-align: middle; -} diff 
--git a/docs/inference/index.rst b/docs/inference/index.rst index 95c71f9e7b..675f67f090 100644 --- a/docs/inference/index.rst +++ b/docs/inference/index.rst @@ -59,12 +59,10 @@ Here's how inference has evolved from V2 to V3: ) -.. rst-class:: new-content Custom InferenceSpec -------------------- -.. rst-class:: new-content-label Define custom model loading and inference logic by extending ``InferenceSpec``. Implement ``load()`` to deserialize your model and ``invoke()`` to run predictions. @@ -109,12 +107,10 @@ Define custom model loading and inference logic by extending ``InferenceSpec``. :doc:`Full example notebook <../v3-examples/inference-examples/inference-spec-example>` -.. rst-class:: new-content JumpStart Model Deployment -------------------------- -.. rst-class:: new-content-label Deploy pre-trained models from the JumpStart hub using ``ModelBuilder.from_jumpstart_config()``. @@ -143,12 +139,10 @@ Deploy pre-trained models from the JumpStart hub using ``ModelBuilder.from_jumps :doc:`Full example notebook <../v3-examples/inference-examples/jumpstart-example>` -.. rst-class:: new-content Model Optimization (Quantization) ---------------------------------- -.. rst-class:: new-content-label Optimize models with quantization (e.g., AWQ) using ``model_builder.optimize()`` before deployment. @@ -180,12 +174,10 @@ Optimize models with quantization (e.g., AWQ) using ``model_builder.optimize()`` :doc:`Full example notebook <../v3-examples/inference-examples/optimize-example>` -.. rst-class:: new-content Train-to-Inference End-to-End ------------------------------ -.. rst-class:: new-content-label Pass a ``ModelTrainer`` directly to ``ModelBuilder`` to go from training to deployment in one flow. @@ -220,12 +212,10 @@ Pass a ``ModelTrainer`` directly to ``ModelBuilder`` to go from training to depl :doc:`Full example notebook <../v3-examples/inference-examples/train-inference-e2e-example>` -.. 
rst-class:: new-content JumpStart Train-to-Inference ----------------------------- -.. rst-class:: new-content-label Train a JumpStart model with ``ModelTrainer.from_jumpstart_config()`` then deploy via ``ModelBuilder``. @@ -251,12 +241,10 @@ Train a JumpStart model with ``ModelTrainer.from_jumpstart_config()`` then deplo :doc:`Full example notebook <../v3-examples/inference-examples/jumpstart-e2e-training-example>` -.. rst-class:: new-content HuggingFace Model Deployment ------------------------------ -.. rst-class:: new-content-label Deploy HuggingFace models with a custom ``InferenceSpec`` using Multi Model Server (MMS). @@ -295,12 +283,10 @@ Deploy HuggingFace models with a custom ``InferenceSpec`` using Multi Model Serv :doc:`Full example notebook <../v3-examples/inference-examples/huggingface-example>` -.. rst-class:: new-content In-Process Mode ---------------- -.. rst-class:: new-content-label Run inference entirely in your Python process with no containers or AWS resources. Use ``Mode.IN_PROCESS`` and ``deploy_local()``. @@ -333,12 +319,10 @@ Run inference entirely in your Python process with no containers or AWS resource :doc:`Full example notebook <../v3-examples/inference-examples/in-process-mode-example>` -.. rst-class:: new-content Local Container Mode --------------------- -.. rst-class:: new-content-label Test models in Docker containers locally using ``Mode.LOCAL_CONTAINER`` and ``deploy_local()``. Same container environment as SageMaker endpoints. @@ -372,12 +356,10 @@ Test models in Docker containers locally using ``Mode.LOCAL_CONTAINER`` and ``de :doc:`Full example notebook <../v3-examples/inference-examples/local-mode-example>` -.. rst-class:: new-content Inference Pipelines (Multi-Container) -------------------------------------- -.. rst-class:: new-content-label Chain multiple containers into a serial inference pipeline. Pass a list of ``Model`` objects to ``ModelBuilder``. 
@@ -429,12 +411,10 @@ Chain multiple containers into a serial inference pipeline. Pass a list of ``Mod Migration from V2 ------------------ -.. rst-class:: new-content Inference Classes and Imports ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -.. rst-class:: new-content-label .. list-table:: :header-rows: 1 @@ -463,12 +443,10 @@ Inference Classes and Imports * - ``sagemaker.deserializers.*`` - Handle deserialization directly (e.g., ``json.loads()``) -.. rst-class:: new-content Methods and Patterns ~~~~~~~~~~~~~~~~~~~~~ -.. rst-class:: new-content-label .. list-table:: :header-rows: 1 @@ -489,12 +467,10 @@ Methods and Patterns * - ``Transformer(model_name=...).transform(...)`` - ``sagemaker.core.resources.TransformJob.create(...)`` -.. rst-class:: new-content Session and Utilities ~~~~~~~~~~~~~~~~~~~~~~ -.. rst-class:: new-content-label .. list-table:: :header-rows: 1 @@ -511,12 +487,10 @@ Session and Utilities * - ``boto3.client('sagemaker')`` - ``sagemaker.core.resources.*`` (Model, Endpoint, EndpointConfig, etc.) -.. rst-class:: new-content V3 Package Structure ~~~~~~~~~~~~~~~~~~~~~ -.. rst-class:: new-content-label .. list-table:: :header-rows: 1 diff --git a/docs/ml_ops/index.rst b/docs/ml_ops/index.rst index eaa380e4fb..e6e4d1b6ca 100644 --- a/docs/ml_ops/index.rst +++ b/docs/ml_ops/index.rst @@ -202,11 +202,9 @@ If you're migrating MLOps workflows from V2, the key improvements are: Lineage Tracking ~~~~~~~~~~~~~~~~ -.. rst-class:: new-content SageMaker Lineage enables tracing events across your ML workflow via a graph structure. V3 provides lineage tracking through ``sagemaker.core.lineage`` with support for: -.. rst-class:: new-content-label - **Contexts** - Logical grouping of lineage entities under workflow contexts - **Actions** - Recording computational steps like model builds and transformations @@ -246,12 +244,10 @@ SageMaker Lineage enables tracing events across your ML workflow via a graph str ML Operations Examples ---------------------- -.. 
rst-class:: new-content E2E Pipeline with Model Registry ---------------------------------- -.. rst-class:: new-content-label Build a SageMaker Pipeline that preprocesses data, trains a model, and registers it to the Model Registry. @@ -294,12 +290,10 @@ Build a SageMaker Pipeline that preprocesses data, trains a model, and registers :doc:`Full example notebook <../v3-examples/ml-ops-examples/v3-pipeline-train-create-registry>` -.. rst-class:: new-content Processing Jobs ---------------- -.. rst-class:: new-content-label Run data preprocessing with ``ScriptProcessor`` (sklearn) or ``FrameworkProcessor`` (PyTorch). @@ -323,12 +317,10 @@ Run data preprocessing with ``ScriptProcessor`` (sklearn) or ``FrameworkProcesso :doc:`SKLearn example <../v3-examples/ml-ops-examples/v3-processing-job-sklearn>` · :doc:`PyTorch example <../v3-examples/ml-ops-examples/v3-processing-job-pytorch/v3-pytorch-processing-example>` -.. rst-class:: new-content Batch Transform Jobs --------------------- -.. rst-class:: new-content-label Run batch inference on large datasets using ``Transformer``. @@ -349,12 +341,10 @@ Run batch inference on large datasets using ``Transformer``. :doc:`Full example notebook <../v3-examples/ml-ops-examples/v3-transform-job-example>` -.. rst-class:: new-content Hyperparameter Tuning ---------------------- -.. rst-class:: new-content-label Optimize hyperparameters with ``HyperparameterTuner`` using ``ContinuousParameter`` and ``CategoricalParameter`` ranges. @@ -380,12 +370,10 @@ Optimize hyperparameters with ``HyperparameterTuner`` using ``ContinuousParamete :doc:`Standalone example <../v3-examples/ml-ops-examples/v3-hyperparameter-tuning-example/v3-hyperparameter-tuning-example>` · :doc:`Pipeline example <../v3-examples/ml-ops-examples/v3-hyperparameter-tuning-example/v3-hyperparameter-tuning-pipeline>` -.. rst-class:: new-content Model Registry --------------- -.. 
rst-class:: new-content-label Register models, create models from registry entries, and manage approval workflows. @@ -410,12 +398,10 @@ Register models, create models from registry entries, and manage approval workfl :doc:`Full example notebook <../v3-examples/ml-ops-examples/v3-model-registry-example/v3-model-registry-example>` -.. rst-class:: new-content Clarify Bias and Explainability -------------------------------- -.. rst-class:: new-content-label Run pre-training bias analysis and SHAP explainability using ``SageMakerClarifyProcessor``. @@ -432,12 +418,10 @@ Run pre-training bias analysis and SHAP explainability using ``SageMakerClarifyP :doc:`Full example notebook <../v3-examples/ml-ops-examples/v3-sagemaker-clarify>` -.. rst-class:: new-content EMR Serverless Pipeline Step ----------------------------- -.. rst-class:: new-content-label Run PySpark jobs on EMR Serverless within a SageMaker Pipeline. @@ -463,12 +447,10 @@ Run PySpark jobs on EMR Serverless within a SageMaker Pipeline. :doc:`Full example notebook <../v3-examples/ml-ops-examples/v3-emr-serverless-step-example>` -.. rst-class:: new-content MLflow Integration ------------------- -.. rst-class:: new-content-label Train with MLflow metric tracking and deploy from the MLflow model registry. @@ -494,12 +476,10 @@ Train with MLflow metric tracking and deploy from the MLflow model registry. :doc:`Full example notebook <../v3-examples/ml-ops-examples/v3-mlflow-train-inference-e2e-example>` -.. rst-class:: new-content Migration from V2 ------------------ -.. rst-class:: new-content-label MLOps Classes and Imports -------------------------- @@ -545,12 +525,10 @@ MLOps Classes and Imports * - ``sagemaker.lineage.context.Context`` - ``sagemaker.core.lineage.context.Context`` -.. rst-class:: new-content V3 Package Structure --------------------- -.. rst-class:: new-content-label .. 
list-table:: :header-rows: 1 diff --git a/docs/training/index.rst b/docs/training/index.rst index febdfa7acd..a69e7bf6f5 100644 --- a/docs/training/index.rst +++ b/docs/training/index.rst @@ -48,12 +48,10 @@ Quick Start Example trainer.train(input_data_config=[train_data]) -.. rst-class:: new-content Local Container Training ------------------------ -.. rst-class:: new-content-label Run training jobs in Docker containers on your local machine for rapid development and debugging before deploying to SageMaker cloud instances. Local mode requires Docker to be installed and running. @@ -120,12 +118,10 @@ Key points: :doc:`Full example notebook <../v3-examples/training-examples/local-training-example>` -.. rst-class:: new-content Distributed Local Training -------------------------- -.. rst-class:: new-content-label Test multi-node distributed training locally using multiple Docker containers before deploying to cloud. This uses the ``Torchrun`` distributed driver to coordinate training across containers. @@ -174,12 +170,10 @@ Key points: :doc:`Full example notebook <../v3-examples/training-examples/distributed-local-training-example>` -.. rst-class:: new-content Hyperparameter Management ------------------------- -.. rst-class:: new-content-label ModelTrainer supports loading hyperparameters from JSON files, YAML files, or Python dictionaries. File-based hyperparameters provide better version control and support for complex nested structures. @@ -246,12 +240,10 @@ Key points: :doc:`Full example notebook <../v3-examples/training-examples/hyperparameter-training-example>` -.. rst-class:: new-content JumpStart Training ------------------ -.. rst-class:: new-content-label Train pre-configured models from the SageMaker JumpStart hub using ``ModelTrainer.from_jumpstart_config()``. JumpStart provides optimized training scripts, default hyperparameters, and curated datasets for hundreds of models. 
@@ -335,12 +327,10 @@ Key points: :doc:`Full example notebook <../v3-examples/training-examples/jumpstart-training-example>` -.. rst-class:: new-content Custom Distributed Training Drivers ------------------------------------ -.. rst-class:: new-content-label Create custom distributed training drivers by extending ``DistributedConfig`` for specialized coordination logic, framework integration, or advanced debugging. @@ -397,12 +387,10 @@ Key points: :doc:`Full example notebook <../v3-examples/training-examples/custom-distributed-training-example>` -.. rst-class:: new-content AWS Batch Training Queues ------------------------- -.. rst-class:: new-content-label Submit training jobs to AWS Batch job queues for automatic scheduling and resource management. Batch handles capacity allocation and job execution order. @@ -457,12 +445,10 @@ Key points: Migration from V2 ------------------ -.. rst-class:: new-content Training Classes and Imports ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -.. rst-class:: new-content-label .. list-table:: :header-rows: 1 @@ -487,12 +473,10 @@ Training Classes and Imports * - ``sagemaker.tuner.HyperparameterTuner`` - ``sagemaker.core.resources.HyperParameterTuningJob`` -.. rst-class:: new-content Methods and Patterns ~~~~~~~~~~~~~~~~~~~~~ -.. rst-class:: new-content-label .. list-table:: :header-rows: 1 @@ -517,12 +501,10 @@ Methods and Patterns * - ``max_run=3600`` - ``StoppingCondition(max_runtime_in_seconds=3600)`` -.. rst-class:: new-content Session and Utilities ~~~~~~~~~~~~~~~~~~~~~~ -.. rst-class:: new-content-label .. list-table:: :header-rows: 1 @@ -541,12 +523,10 @@ Session and Utilities * - ``boto3.client('sagemaker')`` - ``sagemaker.core.resources.*`` (TrainingJob, Model, Endpoint, etc.) -.. rst-class:: new-content V3 Package Structure ~~~~~~~~~~~~~~~~~~~~~ -.. rst-class:: new-content-label .. list-table:: :header-rows: 1