diff --git a/docs/_static/custom.css b/docs/_static/custom.css index f618f6c891..66a55bb008 100644 --- a/docs/_static/custom.css +++ b/docs/_static/custom.css @@ -172,3 +172,4 @@ p { font-size: 1.0rem; color: var(--pst-color-text-base); } + diff --git a/docs/inference/index.rst b/docs/inference/index.rst index 67f6978213..675f67f090 100644 --- a/docs/inference/index.rst +++ b/docs/inference/index.rst @@ -58,126 +58,455 @@ Here's how inference has evolved from V2 to V3: content_type="application/json" ) -ModelBuilder Overview --------------------- - -The ``ModelBuilder`` class is the cornerstone of SageMaker Python SDK V3 inference, providing a unified interface for all deployment scenarios. This single class replaces the complex web of framework-specific model classes from V2, offering: -**Unified Deployment Interface** - One class handles PyTorch, TensorFlow, Scikit-learn, XGBoost, HuggingFace, and custom containers -**Intelligent Optimization** - Automatically optimizes model serving configuration based on your model characteristics +Custom InferenceSpec +-------------------- -**Flexible Deployment Options** - Support for real-time endpoints, batch transform, and serverless inference -**Seamless Integration** - Works seamlessly with SageMaker features like auto-scaling, multi-model endpoints, and A/B testing +Define custom model loading and inference logic by extending ``InferenceSpec``. Implement ``load()`` to deserialize your model and ``invoke()`` to run predictions. .. 
code-block:: python - from sagemaker.serve import ModelBuilder + from sagemaker.serve.model_builder import ModelBuilder + from sagemaker.serve.spec.inference_spec import InferenceSpec + from sagemaker.serve.builder.schema_builder import SchemaBuilder + from sagemaker.serve.utils.types import ModelServer + + class MyModelSpec(InferenceSpec): + def load(self, model_dir: str): + import torch + return torch.jit.load(f"{model_dir}/model.pth", map_location="cpu") + + def invoke(self, input_object, model): + import torch + tensor = torch.tensor(input_object, dtype=torch.float32) + with torch.no_grad(): + return model(tensor).tolist() + + schema_builder = SchemaBuilder( + [[0.1, 0.2, 0.3, 0.4]], # sample input + [[0.9, 0.1]] # sample output + ) model_builder = ModelBuilder( - model="your-model", - model_path="s3://your-bucket/model-artifacts", - role="your-sagemaker-role" + inference_spec=MyModelSpec(), + model_path="./model_artifacts", + model_server=ModelServer.TORCHSERVE, + schema_builder=schema_builder, ) - model = model_builder.build(model_name="my-model") - - endpoint = model_builder.deploy( - endpoint_name="my-endpoint", - instance_type="ml.m5.xlarge", - initial_instance_count=1 + core_model = model_builder.build(model_name="my-custom-model") + endpoint = model_builder.deploy(endpoint_name="my-endpoint") + + result = endpoint.invoke( + body=json.dumps([[0.1, 0.2, 0.3, 0.4]]), + content_type="application/json" ) - - response = endpoint.invoke( - body={"inputs": "your-input-data"}, + +:doc:`Full example notebook <../v3-examples/inference-examples/inference-spec-example>` + + + +JumpStart Model Deployment +-------------------------- + + +Deploy pre-trained models from the JumpStart hub using ``ModelBuilder.from_jumpstart_config()``. + +.. 
code-block:: python + + from sagemaker.serve.model_builder import ModelBuilder + from sagemaker.core.jumpstart.configs import JumpStartConfig + from sagemaker.train.configs import Compute + + compute = Compute(instance_type="ml.g5.2xlarge") + jumpstart_config = JumpStartConfig(model_id="huggingface-llm-falcon-7b-bf16") + + model_builder = ModelBuilder.from_jumpstart_config( + jumpstart_config=jumpstart_config, + compute=compute, + ) + + core_model = model_builder.build(model_name="falcon-model") + endpoint = model_builder.deploy(endpoint_name="falcon-endpoint") + + result = endpoint.invoke( + body=json.dumps({"inputs": "What are falcons?", "parameters": {"max_new_tokens": 32}}), content_type="application/json" ) -Inference Capabilities ----------------------- +:doc:`Full example notebook <../v3-examples/inference-examples/jumpstart-example>` + -Model Optimization Support -~~~~~~~~~~~~~~~~~~~~~~~~~~ -V3 introduces powerful model optimization capabilities for enhanced performance: +Model Optimization (Quantization) +---------------------------------- -* **SageMaker Neo** - Optimize models for specific hardware targets -* **TensorRT Integration** - Accelerate deep learning inference on NVIDIA GPUs -* **ONNX Runtime** - Cross-platform model optimization and acceleration -* **Quantization Support** - Reduce model size and improve inference speed -**Model Optimization Example:** +Optimize models with quantization (e.g., AWQ) using ``model_builder.optimize()`` before deployment. .. 
code-block:: python - from sagemaker.serve import ModelBuilder + from sagemaker.serve.model_builder import ModelBuilder + from sagemaker.serve.builder.schema_builder import SchemaBuilder + + schema_builder = SchemaBuilder( + {"inputs": "What are falcons?", "parameters": {"max_new_tokens": 32}}, + [{"generated_text": "Falcons are birds of prey."}] + ) - # Create ModelBuilder with optimization settings model_builder = ModelBuilder( - model="huggingface-bert-base", - role="your-sagemaker-role" + model="meta-textgeneration-llama-3-8b-instruct", + schema_builder=schema_builder, ) - # Build and deploy with optimization - model = model_builder.build(model_name="optimized-bert") - endpoint = model_builder.deploy( - endpoint_name="bert-endpoint", - instance_type="ml.inf1.xlarge", - initial_instance_count=1 + optimized_model = model_builder.optimize( + instance_type="ml.g5.2xlarge", + quantization_config={"OverrideEnvironment": {"OPTION_QUANTIZE": "awq"}}, + accept_eula=True, + job_name="optimize-llama", + model_name="llama-optimized", ) -Key Inference Features -~~~~~~~~~~~~~~~~~~~~~ + endpoint = model_builder.deploy(endpoint_name="llama-endpoint", initial_instance_count=1) -* **Multi-Model Endpoints** - Host multiple models on a single endpoint with automatic model loading and unloading for cost optimization -* **Auto-Scaling Integration** - Automatically scale endpoint capacity based on traffic patterns with configurable scaling policies -* **A/B Testing Support** - Deploy multiple model variants with traffic splitting for safe model updates and performance comparison -* **Batch Transform Jobs** - Process large datasets efficiently with automatic data partitioning and parallel processing -* **Serverless Inference** - Pay-per-request pricing with automatic scaling from zero to handle variable workloads +:doc:`Full example notebook <../v3-examples/inference-examples/optimize-example>` -Supported Inference Scenarios + + +Train-to-Inference End-to-End 
+------------------------------ + + +Pass a ``ModelTrainer`` directly to ``ModelBuilder`` to go from training to deployment in one flow. + +.. code-block:: python + + from sagemaker.train.model_trainer import ModelTrainer + from sagemaker.train.configs import SourceCode + from sagemaker.serve.model_builder import ModelBuilder + from sagemaker.serve.spec.inference_spec import InferenceSpec + from sagemaker.serve.builder.schema_builder import SchemaBuilder + from sagemaker.serve.utils.types import ModelServer + + # Train + trainer = ModelTrainer( + training_image="pytorch-training:1.13.1-cpu-py39", + source_code=SourceCode(source_dir="./src", entry_script="train.py"), + base_job_name="my-training", + ) + trainer.train() + + # Deploy from trainer + model_builder = ModelBuilder( + model=trainer, + schema_builder=SchemaBuilder([[0.1, 0.2, 0.3, 0.4]], [[0.8, 0.2]]), + model_server=ModelServer.TORCHSERVE, + inference_spec=MyInferenceSpec(), + ) + + core_model = model_builder.build(model_name="trained-model") + endpoint = model_builder.deploy(endpoint_name="trained-endpoint", initial_instance_count=1) + +:doc:`Full example notebook <../v3-examples/inference-examples/train-inference-e2e-example>` + + + +JumpStart Train-to-Inference ----------------------------- -Deployment Types -~~~~~~~~~~~~~~~ -* **Real-Time Endpoints** - Low-latency inference for interactive applications -* **Batch Transform** - High-throughput processing for large datasets -* **Serverless Inference** - Cost-effective inference for variable workloads -* **Multi-Model Endpoints** - Host multiple models on shared infrastructure +Train a JumpStart model with ``ModelTrainer.from_jumpstart_config()`` then deploy via ``ModelBuilder``. + +.. 
code-block:: python -Framework Support -~~~~~~~~~~~~~~~~~ + from sagemaker.train.model_trainer import ModelTrainer + from sagemaker.serve.model_builder import ModelBuilder + from sagemaker.core.jumpstart.configs import JumpStartConfig -* **PyTorch** - Deep learning models with dynamic computation graphs -* **TensorFlow** - Production-ready machine learning models at scale -* **Scikit-learn** - Classical machine learning algorithms -* **XGBoost** - Gradient boosting models for structured data -* **HuggingFace** - Pre-trained transformer models for NLP tasks -* **Custom Containers** - Bring your own inference logic and dependencies + jumpstart_config = JumpStartConfig(model_id="huggingface-spc-bert-base-cased") + + trainer = ModelTrainer.from_jumpstart_config( + jumpstart_config=jumpstart_config, + base_job_name="js-training", + hyperparameters={"epochs": 1}, + ) + trainer.train() + + model_builder = ModelBuilder(model=trainer, dependencies={"auto": False}) + core_model = model_builder.build(model_name="bert-trained") + endpoint = model_builder.deploy(endpoint_name="bert-endpoint") + +:doc:`Full example notebook <../v3-examples/inference-examples/jumpstart-e2e-training-example>` + + + +HuggingFace Model Deployment +------------------------------ + + +Deploy HuggingFace models with a custom ``InferenceSpec`` using Multi Model Server (MMS). + +.. 
code-block:: python + + from sagemaker.serve.model_builder import ModelBuilder + from sagemaker.serve.spec.inference_spec import InferenceSpec + from sagemaker.serve.builder.schema_builder import SchemaBuilder + from sagemaker.serve.utils.types import ModelServer + + class HFSpec(InferenceSpec): + def load(self, model_dir): + from transformers import AutoTokenizer, AutoModelForCausalLM + tokenizer = AutoTokenizer.from_pretrained("microsoft/DialoGPT-small") + model = AutoModelForCausalLM.from_pretrained("microsoft/DialoGPT-small") + return {"model": model, "tokenizer": tokenizer} + + def invoke(self, input_object, model): + text = input_object["inputs"] + inputs = model["tokenizer"].encode(text, return_tensors="pt") + outputs = model["model"].generate(inputs, max_length=inputs.shape[1] + 20) + return [{"generated_text": model["tokenizer"].decode(outputs[0], skip_special_tokens=True)}] + + model_builder = ModelBuilder( + inference_spec=HFSpec(), + model_server=ModelServer.MMS, + schema_builder=SchemaBuilder( + {"inputs": "Hello, how are you?"}, + [{"generated_text": "I'm doing well!"}] + ), + ) + + core_model = model_builder.build(model_name="hf-dialogpt") + endpoint = model_builder.deploy(endpoint_name="hf-endpoint") + +:doc:`Full example notebook <../v3-examples/inference-examples/huggingface-example>` + + + +In-Process Mode +---------------- + + +Run inference entirely in your Python process with no containers or AWS resources. Use ``Mode.IN_PROCESS`` and ``deploy_local()``. + +.. 
code-block:: python + + from sagemaker.serve.model_builder import ModelBuilder + from sagemaker.serve.spec.inference_spec import InferenceSpec + from sagemaker.serve.builder.schema_builder import SchemaBuilder + from sagemaker.serve.mode.function_pointers import Mode + + class MathSpec(InferenceSpec): + def load(self, model_dir): + return {"factor": 2.0} + + def invoke(self, input_object, model): + numbers = input_object["numbers"] + return {"result": [n * model["factor"] for n in numbers]} + + model_builder = ModelBuilder( + inference_spec=MathSpec(), + schema_builder=SchemaBuilder({"numbers": [1.0, 2.0]}, {"result": [2.0, 4.0]}), + mode=Mode.IN_PROCESS, + ) + + core_model = model_builder.build(model_name="math-model") + local_endpoint = model_builder.deploy_local(endpoint_name="math-local") + + result = local_endpoint.invoke(body={"numbers": [3.0, 5.0]}, content_type="application/json") + +:doc:`Full example notebook <../v3-examples/inference-examples/in-process-mode-example>` + + + +Local Container Mode +--------------------- + + +Test models in Docker containers locally using ``Mode.LOCAL_CONTAINER`` and ``deploy_local()``. Same container environment as SageMaker endpoints. + +.. 
code-block:: python + + from sagemaker.serve.model_builder import ModelBuilder + from sagemaker.serve.spec.inference_spec import InferenceSpec + from sagemaker.serve.builder.schema_builder import SchemaBuilder + from sagemaker.serve.utils.types import ModelServer + from sagemaker.serve.mode.function_pointers import Mode + + model_builder = ModelBuilder( + inference_spec=MyPyTorchSpec(model_path="./model"), + model_server=ModelServer.TORCHSERVE, + schema_builder=SchemaBuilder([[1.0, 2.0, 3.0, 4.0]], [[0.6, 0.4]]), + mode=Mode.LOCAL_CONTAINER, + ) + + local_model = model_builder.build(model_name="local-pytorch") + local_endpoint = model_builder.deploy_local( + endpoint_name="local-pytorch-ep", + wait=True, + container_timeout_in_seconds=1200, + ) + + response = local_endpoint.invoke( + body=json.dumps([[1.0, 2.0, 3.0, 4.0]]), + content_type="application/json" + ) + +:doc:`Full example notebook <../v3-examples/inference-examples/local-mode-example>` + + + +Inference Pipelines (Multi-Container) +-------------------------------------- + + +Chain multiple containers into a serial inference pipeline. Pass a list of ``Model`` objects to ``ModelBuilder``. + +.. 
code-block:: python + + from sagemaker.core.resources import Model + from sagemaker.core.shapes import ContainerDefinition + from sagemaker.core.utils import repack_model + from sagemaker.serve import ModelBuilder + + # Create individual models with primary_container + sklearn_model = Model.create( + model_name="sklearn-preprocess", + primary_container=ContainerDefinition( + image=sklearn_image, + model_data_url=sklearn_repacked_uri, + environment={"SAGEMAKER_PROGRAM": "inference.py"}, + ), + execution_role_arn=role, + ) + + xgboost_model = Model.create( + model_name="xgboost-classifier", + primary_container=ContainerDefinition( + image=xgboost_image, + model_data_url=xgboost_model_uri, + ), + execution_role_arn=role, + ) + + # Build and deploy pipeline + pipeline_builder = ModelBuilder( + model=[sklearn_model, xgboost_model], + role_arn=role, + ) + pipeline_model = pipeline_builder.build() + endpoint = pipeline_builder.deploy( + endpoint_name="pipeline-ep", + instance_type="ml.m5.large", + initial_instance_count=1, + ) + + response = endpoint.invoke(body=csv_data, content_type="text/csv", accept="text/csv") + +:doc:`Full example notebook <../v3-examples/inference-examples/inference-pipeline-modelbuilder-vs-core-example>` -Advanced Features -~~~~~~~~~~~~~~~~ -* **Model Monitoring** - Track model performance and data drift in production -* **Endpoint Security** - VPC support, encryption, and IAM-based access control -* **Multi-AZ Deployment** - High availability with automatic failover -* **Custom Inference Logic** - Implement preprocessing, postprocessing, and custom prediction logic Migration from V2 ------------------ -If you're migrating from V2, the key changes are: -* Replace framework-specific model classes (PyTorchModel, TensorFlowModel, etc.) 
with ``ModelBuilder`` -* Use structured configuration objects instead of parameter dictionaries -* Leverage the new ``invoke()`` method instead of ``predict()`` for more consistent API -* Take advantage of built-in optimization and auto-scaling features +Inference Classes and Imports +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + + +.. list-table:: + :header-rows: 1 + :widths: 50 50 + + * - V2 + - V3 + * - ``sagemaker.model.Model`` + - ``sagemaker.serve.model_builder.ModelBuilder`` + * - ``sagemaker.pytorch.PyTorchModel`` + - ``sagemaker.serve.model_builder.ModelBuilder`` + * - ``sagemaker.tensorflow.TensorFlowModel`` + - ``sagemaker.serve.model_builder.ModelBuilder`` + * - ``sagemaker.huggingface.HuggingFaceModel`` + - ``sagemaker.serve.model_builder.ModelBuilder`` + * - ``sagemaker.sklearn.SKLearnModel`` + - ``sagemaker.serve.model_builder.ModelBuilder`` + * - ``sagemaker.xgboost.XGBoostModel`` + - ``sagemaker.serve.model_builder.ModelBuilder`` + * - ``sagemaker.jumpstart.JumpStartModel`` + - ``ModelBuilder.from_jumpstart_config(JumpStartConfig(...))`` + * - ``sagemaker.predictor.Predictor`` + - ``sagemaker.core.resources.Endpoint`` + * - ``sagemaker.serializers.*`` + - Handle serialization directly (e.g., ``json.dumps()``) + * - ``sagemaker.deserializers.*`` + - Handle deserialization directly (e.g., ``json.loads()``) + + +Methods and Patterns +~~~~~~~~~~~~~~~~~~~~~ + + +.. 
list-table:: + :header-rows: 1 + :widths: 50 50 + + * - V2 + - V3 + * - ``model.deploy(instance_type=..., initial_instance_count=...)`` + - ``model_builder.deploy(endpoint_name=..., instance_type=..., initial_instance_count=...)`` + * - ``estimator.deploy()`` + - ``ModelBuilder(model=trainer).deploy()`` + * - ``predictor.predict(data)`` + - ``endpoint.invoke(body=data, content_type="application/json")`` + * - ``model = Model(image_uri=..., model_data=...)`` + - ``model_builder = ModelBuilder(model=..., model_path=...)`` + * - ``model.deploy()`` returns ``Predictor`` + - ``model_builder.deploy()`` returns ``Endpoint`` + * - ``Transformer(model_name=...).transform(...)`` + - ``sagemaker.core.resources.TransformJob.create(...)`` + + +Session and Utilities +~~~~~~~~~~~~~~~~~~~~~~ + + +.. list-table:: + :header-rows: 1 + :widths: 50 50 + + * - V2 + - V3 + * - ``sagemaker.session.Session()`` + - ``sagemaker.core.helper.session_helper.Session()`` + * - ``sagemaker.get_execution_role()`` + - ``sagemaker.core.helper.session_helper.get_execution_role()`` + * - ``sagemaker.image_uris.retrieve(...)`` + - ``sagemaker.core.image_uris.retrieve(...)`` + * - ``boto3.client('sagemaker')`` + - ``sagemaker.core.resources.*`` (Model, Endpoint, EndpointConfig, etc.) + + +V3 Package Structure +~~~~~~~~~~~~~~~~~~~~~ + + +.. 
list-table:: + :header-rows: 1 + :widths: 30 70 + + * - V3 Package + - Purpose + * - ``sagemaker-core`` + - Low-level resource management (Model, Endpoint, EndpointConfig), session, image URIs + * - ``sagemaker-train`` + - ModelTrainer for training (used with ``ModelBuilder(model=trainer)``) + * - ``sagemaker-serve`` + - ModelBuilder, InferenceSpec, SchemaBuilder, ModelServer, deployment modes + * - ``sagemaker-mlops`` + - Pipelines, processing, model registry, monitoring, Clarify + Inference Examples ----------------- @@ -195,3 +524,4 @@ Explore comprehensive inference examples that demonstrate V3 capabilities: Local Container Mode <../v3-examples/inference-examples/local-mode-example> Deploy HuggingFace Models <../v3-examples/inference-examples/huggingface-example> ModelBuilder in In-Process mode <../v3-examples/inference-examples/in-process-mode-example> + Inference Pipeline <../v3-examples/inference-examples/inference-pipeline-modelbuilder-vs-core-example> diff --git a/docs/ml_ops/index.rst b/docs/ml_ops/index.rst index 9c7e49b025..e6e4d1b6ca 100644 --- a/docs/ml_ops/index.rst +++ b/docs/ml_ops/index.rst @@ -199,17 +199,366 @@ If you're migrating MLOps workflows from V2, the key improvements are: * **Improved Governance**: Integrated model registry and approval workflows streamline compliance * **Better Resource Management**: Automatic resource optimization and cost management across workflows +Lineage Tracking +~~~~~~~~~~~~~~~~ + + +SageMaker Lineage enables tracing events across your ML workflow via a graph structure. 
V3 provides lineage tracking through ``sagemaker.core.lineage`` with support for: + + +- **Contexts** - Logical grouping of lineage entities under workflow contexts +- **Actions** - Recording computational steps like model builds and transformations +- **Artifacts** - Registering data inputs, labels, and trained models +- **Associations** - Directed edges linking entities to form the lineage graph +- **Traversal** - Querying relationships between entities for reporting and analysis + +.. code-block:: python + + from sagemaker.core.lineage.context import Context + from sagemaker.core.lineage.action import Action + from sagemaker.core.lineage.artifact import Artifact + from sagemaker.core.lineage.association import Association + + # Create a workflow context + context = Context.create( + context_name="my-ml-workflow", + context_type="MLWorkflow", + source_uri="workflow-source", + ) + + # Create an action and associate it with the context + action = Action.create( + action_name="model-build-step", + action_type="ModelBuild", + source_uri="build-source", + ) + + Association.create( + source_arn=context.context_arn, + destination_arn=action.action_arn, + association_type="AssociatedWith", + ) + +:doc:`Learn more about Lineage Tracking <lineage>` + ML Operations Examples ---------------------- -Explore comprehensive MLOps examples that demonstrate V3 capabilities: + +E2E Pipeline with Model Registry +---------------------------------- + + +Build a SageMaker Pipeline that preprocesses data, trains a model, and registers it to the Model Registry. + +.. 
code-block:: python + + from sagemaker.mlops.workflow.pipeline import Pipeline + from sagemaker.mlops.workflow.steps import ProcessingStep, TrainingStep, CacheConfig + from sagemaker.mlops.workflow.model_step import ModelStep + from sagemaker.core.processing import ScriptProcessor + from sagemaker.core.shapes import ProcessingInput, ProcessingS3Input, ProcessingOutput, ProcessingS3Output + from sagemaker.core.workflow.parameters import ParameterString + from sagemaker.core.workflow.pipeline_context import PipelineSession + from sagemaker.train import ModelTrainer + from sagemaker.train.configs import InputData, Compute + from sagemaker.serve.model_builder import ModelBuilder + + pipeline_session = PipelineSession() + + # Processing step + processor = ScriptProcessor(image_uri=sklearn_image, instance_type="ml.m5.xlarge", ...) + step_process = ProcessingStep(name="Preprocess", step_args=processor.run(...)) + + # Training step + trainer = ModelTrainer(training_image=xgboost_image, compute=Compute(instance_type="ml.m5.xlarge"), ...) + step_train = TrainingStep(name="Train", step_args=trainer.train()) + + # Register model + model_builder = ModelBuilder( + s3_model_data_url=step_train.properties.ModelArtifacts.S3ModelArtifacts, + image_uri=xgboost_image, role_arn=role, sagemaker_session=pipeline_session, + ) + step_register = ModelStep(name="Register", step_args=model_builder.register( + model_package_group_name="my-group", approval_status="Approved", + )) + + pipeline = Pipeline(name="my-pipeline", steps=[step_process, step_train, step_register], sagemaker_session=pipeline_session) + pipeline.upsert(role_arn=role) + pipeline.start() + +:doc:`Full example notebook <../v3-examples/ml-ops-examples/v3-pipeline-train-create-registry>` + + + +Processing Jobs +---------------- + + +Run data preprocessing with ``ScriptProcessor`` (sklearn) or ``FrameworkProcessor`` (PyTorch). + +.. 
code-block:: python + + from sagemaker.core.processing import ScriptProcessor + from sagemaker.core.shapes import ProcessingInput, ProcessingS3Input, ProcessingOutput, ProcessingS3Output + + processor = ScriptProcessor( + image_uri=image_uris.retrieve(framework="sklearn", region=region, version="1.2-1", py_version="py3", instance_type="ml.m5.xlarge"), + instance_type="ml.m5.xlarge", instance_count=1, role=role, + ) + + processor.run( + inputs=[ProcessingInput(input_name="input-1", s3_input=ProcessingS3Input(s3_uri=input_data, local_path="/opt/ml/processing/input", s3_data_type="S3Prefix"))], + outputs=[ProcessingOutput(output_name="train", s3_output=ProcessingS3Output(s3_uri="s3://bucket/train", local_path="/opt/ml/processing/train", s3_upload_mode="EndOfJob"))], + code="code/preprocess.py", + arguments=["--input-data", input_data], + ) + +:doc:`SKLearn example <../v3-examples/ml-ops-examples/v3-processing-job-sklearn>` · :doc:`PyTorch example <../v3-examples/ml-ops-examples/v3-processing-job-pytorch/v3-pytorch-processing-example>` + + + +Batch Transform Jobs +--------------------- + + +Run batch inference on large datasets using ``Transformer``. + +.. 
code-block:: python + + from sagemaker.core.transformer import Transformer + from sagemaker.serve.model_builder import ModelBuilder + + model_builder = ModelBuilder(image_uri=xgboost_image, s3_model_data_url=model_url, role_arn=role) + model_builder.build(model_name="my-transform-model") + + transformer = Transformer( + model_name="my-transform-model", instance_count=1, instance_type="ml.m5.xlarge", + accept="text/csv", assemble_with="Line", output_path="s3://bucket/output", + ) + transformer.transform("s3://bucket/input", content_type="text/csv", split_type="Line", input_filter="$[1:]") + +:doc:`Full example notebook <../v3-examples/ml-ops-examples/v3-transform-job-example>` + + + +Hyperparameter Tuning +---------------------- + + +Optimize hyperparameters with ``HyperparameterTuner`` using ``ContinuousParameter`` and ``CategoricalParameter`` ranges. + +.. code-block:: python + + from sagemaker.train.tuner import HyperparameterTuner + from sagemaker.core.parameter import ContinuousParameter, CategoricalParameter + from sagemaker.train import ModelTrainer + from sagemaker.train.configs import InputData + + trainer = ModelTrainer(training_image=pytorch_image, source_code=source_code, compute=compute, hyperparameters={"epochs": 1}) + + tuner = HyperparameterTuner( + model_trainer=trainer, + objective_metric_name="average test loss", + hyperparameter_ranges={"lr": ContinuousParameter(0.001, 0.1), "batch-size": CategoricalParameter([32, 64, 128])}, + metric_definitions=[{"Name": "average test loss", "Regex": "Test set: Average loss: ([0-9\\.]+)"}], + max_jobs=3, max_parallel_jobs=2, strategy="Random", objective_type="Minimize", + ) + + tuner.tune(inputs=[InputData(channel_name="training", data_source=s3_data_uri)], wait=False) + +:doc:`Standalone example <../v3-examples/ml-ops-examples/v3-hyperparameter-tuning-example/v3-hyperparameter-tuning-example>` · :doc:`Pipeline example 
<../v3-examples/ml-ops-examples/v3-hyperparameter-tuning-example/v3-hyperparameter-tuning-pipeline>` + + + +Model Registry +--------------- + + +Register models, create models from registry entries, and manage approval workflows. + +.. code-block:: python + + from sagemaker.serve.model_builder import ModelBuilder + from sagemaker.core.resources import Model, ModelPackage + + # Register from artifact + model_builder = ModelBuilder(s3_model_data_url=s3_url, image_uri=image_uri, role_arn=role) + model_builder.build(model_name="my-model") + model_builder.register(model_package_group_name="my-group", content_types=["application/json"], response_types=["application/json"], approval_status="Approved") + + # Create model from registry + model_package = ModelPackage.get(model_package_name=registered_arn) + model_builder = ModelBuilder( + s3_model_data_url=model_package.inference_specification.containers[0].model_data_url, + image_uri=model_package.inference_specification.containers[0].image, role_arn=role, + ) + model_builder.build(model_name="model-from-registry") + +:doc:`Full example notebook <../v3-examples/ml-ops-examples/v3-model-registry-example/v3-model-registry-example>` + + + +Clarify Bias and Explainability +-------------------------------- + + +Run pre-training bias analysis and SHAP explainability using ``SageMakerClarifyProcessor``. + +.. 
code-block:: python + + from sagemaker.core.clarify import SageMakerClarifyProcessor, DataConfig, BiasConfig, SHAPConfig + + data_config = DataConfig(s3_data_input_path=data_uri, s3_output_path=output_uri, label="target", headers=headers, dataset_type="text/csv") + bias_config = BiasConfig(label_values_or_threshold=[1], facet_name="gender", facet_values_or_threshold=[1]) + + clarify_processor = SageMakerClarifyProcessor(role=role, instance_count=1, instance_type="ml.m5.large") + clarify_processor.run_pre_training_bias(data_config=data_config, data_bias_config=bias_config, methods=["CI", "DPL"]) + +:doc:`Full example notebook <../v3-examples/ml-ops-examples/v3-sagemaker-clarify>` + + + +EMR Serverless Pipeline Step +----------------------------- + + +Run PySpark jobs on EMR Serverless within a SageMaker Pipeline. + +.. code-block:: python + + from sagemaker.mlops.workflow.emr_serverless_step import EMRServerlessStep, EMRServerlessJobConfig + from sagemaker.mlops.workflow.pipeline import Pipeline + + job_config = EMRServerlessJobConfig( + job_driver={"sparkSubmit": {"entryPoint": script_uri, "entryPointArguments": ["--input", input_uri, "--output", output_uri]}}, + execution_role_arn=emr_role, + ) + + step = EMRServerlessStep( + name="SparkJob", job_config=job_config, + application_config={"name": "spark-app", "releaseLabel": "emr-6.15.0", "type": "SPARK"}, + ) + + pipeline = Pipeline(name="EMRPipeline", steps=[step], sagemaker_session=pipeline_session) + pipeline.upsert(role_arn=role) + pipeline.start() + +:doc:`Full example notebook <../v3-examples/ml-ops-examples/v3-emr-serverless-step-example>` + + + +MLflow Integration +------------------- + + +Train with MLflow metric tracking and deploy from the MLflow model registry. + +.. 
code-block:: python + + from sagemaker.train.model_trainer import ModelTrainer + from sagemaker.serve.model_builder import ModelBuilder + from sagemaker.serve.mode.function_pointers import Mode + + # Train (script logs to MLflow internally) + trainer = ModelTrainer(training_image=pytorch_image, source_code=SourceCode(source_dir=code_dir, entry_script="train.py", requirements="requirements.txt")) + trainer.train() + + # Deploy from MLflow registry + model_builder = ModelBuilder( + mode=Mode.SAGEMAKER_ENDPOINT, + schema_builder=schema_builder, + model_metadata={"MLFLOW_MODEL_PATH": "models:/my-model/1", "MLFLOW_TRACKING_ARN": tracking_arn}, + ) + model_builder.build(model_name="mlflow-model") + model_builder.deploy(endpoint_name="mlflow-endpoint") + +:doc:`Full example notebook <../v3-examples/ml-ops-examples/v3-mlflow-train-inference-e2e-example>` + + + +Migration from V2 +------------------ + + +MLOps Classes and Imports +-------------------------- + +.. list-table:: + :header-rows: 1 + :widths: 50 50 + + * - V2 + - V3 + * - ``sagemaker.workflow.pipeline.Pipeline`` + - ``sagemaker.mlops.workflow.pipeline.Pipeline`` + * - ``sagemaker.workflow.steps.ProcessingStep`` + - ``sagemaker.mlops.workflow.steps.ProcessingStep`` + * - ``sagemaker.workflow.steps.TrainingStep`` + - ``sagemaker.mlops.workflow.steps.TrainingStep`` + * - ``sagemaker.workflow.step_collections.RegisterModel`` + - ``sagemaker.mlops.workflow.model_step.ModelStep`` + ``model_builder.register()`` + * - ``sagemaker.workflow.model_step.ModelStep`` + - ``sagemaker.mlops.workflow.model_step.ModelStep`` + * - ``sagemaker.sklearn.processing.SKLearnProcessor`` + - ``sagemaker.core.processing.ScriptProcessor`` + * - ``sagemaker.processing.ScriptProcessor`` + - ``sagemaker.core.processing.ScriptProcessor`` + * - ``sagemaker.processing.FrameworkProcessor`` + - ``sagemaker.core.processing.FrameworkProcessor`` + * - ``sagemaker.processing.ProcessingInput`` + - ``sagemaker.core.shapes.ProcessingInput`` + 
``ProcessingS3Input`` + * - ``sagemaker.processing.ProcessingOutput`` + - ``sagemaker.core.shapes.ProcessingOutput`` + ``ProcessingS3Output`` + * - ``sagemaker.tuner.HyperparameterTuner`` + - ``sagemaker.train.tuner.HyperparameterTuner`` + * - ``sagemaker.parameter.ContinuousParameter`` + - ``sagemaker.core.parameter.ContinuousParameter`` + * - ``sagemaker.transformer.Transformer`` + - ``sagemaker.core.transformer.Transformer`` + * - ``sagemaker.clarify.SageMakerClarifyProcessor`` + - ``sagemaker.core.clarify.SageMakerClarifyProcessor`` + * - ``sagemaker.workflow.parameters.ParameterString`` + - ``sagemaker.core.workflow.parameters.ParameterString`` + * - ``sagemaker.workflow.pipeline_context.PipelineSession`` + - ``sagemaker.core.workflow.pipeline_context.PipelineSession`` + * - ``sagemaker.lineage.context.Context`` + - ``sagemaker.core.lineage.context.Context`` + + +V3 Package Structure +--------------------- + + +.. list-table:: + :header-rows: 1 + :widths: 30 70 + + * - V3 Package + - MLOps Components + * - ``sagemaker-core`` + - ScriptProcessor, FrameworkProcessor, Transformer, Clarify, lineage, pipeline context, parameters, image URIs + * - ``sagemaker-train`` + - ModelTrainer, HyperparameterTuner, InputData, Compute, SourceCode + * - ``sagemaker-serve`` + - ModelBuilder (build, register, deploy) + * - ``sagemaker-mlops`` + - Pipeline, ProcessingStep, TrainingStep, ModelStep, TuningStep, EMRServerlessStep, CacheConfig + + +Explore comprehensive MLOps examples: .. 
toctree:: :maxdepth: 1 + lineage ../v3-examples/ml-ops-examples/v3-sagemaker-clarify ../v3-examples/ml-ops-examples/v3-pipeline-train-create-registry ../v3-examples/ml-ops-examples/v3-transform-job-example ../v3-examples/ml-ops-examples/v3-hyperparameter-tuning-example/v3-hyperparameter-tuning-example + ../v3-examples/ml-ops-examples/v3-hyperparameter-tuning-example/v3-hyperparameter-tuning-pipeline ../v3-examples/ml-ops-examples/v3-model-registry-example/v3-model-registry-example ../v3-examples/ml-ops-examples/v3-processing-job-pytorch/v3-pytorch-processing-example + ../v3-examples/ml-ops-examples/v3-processing-job-sklearn + ../v3-examples/ml-ops-examples/v3-emr-serverless-step-example + ../v3-examples/ml-ops-examples/v3-mlflow-train-inference-e2e-example diff --git a/docs/ml_ops/lineage.rst b/docs/ml_ops/lineage.rst new file mode 100644 index 0000000000..cc770d6587 --- /dev/null +++ b/docs/ml_ops/lineage.rst @@ -0,0 +1,473 @@ +Lineage Tracking +================ + +Amazon SageMaker Lineage enables events that happen within SageMaker to be traced via a graph structure. The data simplifies generating reports, making comparisons, or discovering relationships between events. For example, you can easily trace both how a model was generated and where the model was deployed. + +The lineage graph is created automatically by SageMaker and you can directly create or modify your own graphs. + +Key Concepts +------------ + +- **Lineage Graph** - A connected graph tracing your machine learning workflow end to end. +- **Artifacts** - Represents a URI addressable object or data. Artifacts are typically inputs or outputs to Actions. +- **Actions** - Represents an action taken such as a computation, transformation, or job. +- **Contexts** - Provides a method to logically group other entities. +- **Associations** - A directed edge in the lineage graph that links two entities. 
+- **Lineage Traversal** - Starting from an arbitrary point, trace the lineage graph to discover and analyze relationships between steps in your workflow. +- **Experiments** - Experiment entities (Experiments, Trials, and Trial Components) are also part of the lineage graph and can be associated with Artifacts, Actions, or Contexts. + +Use Cases +--------- + +The notebook from the `SageMaker examples repository <https://github.com/aws/amazon-sagemaker-examples>`_ demonstrates the following major use cases for lineage tracking: + +1. **Creating Lineage Contexts** - Group related lineage entities under a logical workflow context +2. **Listing Lineage Entities** - Query and enumerate existing contexts, actions, artifacts, and associations +3. **Creating Actions** - Record computational steps such as model builds, transformations, or training jobs +4. **Creating Artifacts** - Register data inputs (datasets, labels) and outputs (trained models) as lineage artifacts +5. **Creating Associations** - Link entities together with directed edges to form the lineage graph +6. **Traversing Associations** - Query incoming and outgoing associations to understand entity relationships +7. **Cleaning Up Lineage Data** - Delete associations and entities when they are no longer needed +
+V3 Migration Notes +------------------ + +In SageMaker Python SDK V3, lineage classes have moved from ``sagemaker.lineage`` to ``sagemaker.core.lineage``. The old import paths still work via compatibility shims but emit deprecation warnings. + +.. 
list-table:: Import Path Changes + :header-rows: 1 + :widths: 50 50 + + * - V2 Import + - V3 Import + * - ``from sagemaker.lineage.context import Context`` + - ``from sagemaker.core.lineage.context import Context`` + * - ``from sagemaker.lineage.action import Action`` + - ``from sagemaker.core.lineage.action import Action`` + * - ``from sagemaker.lineage.artifact import Artifact`` + - ``from sagemaker.core.lineage.artifact import Artifact`` + * - ``from sagemaker.lineage.association import Association`` + - ``from sagemaker.core.lineage.association import Association`` + * - ``import sagemaker`` / ``sagemaker.session.Session()`` + - ``from sagemaker.core.helper.session_helper import Session`` + +The API signatures for ``create``, ``list``, ``delete``, and association management remain the same. The key change is the import path. + + +Use Case 1: Session Setup +------------------------- + +Initialize a SageMaker session and set up common variables. + +**V2 (Legacy):** + +.. code-block:: python + + import boto3 + import sagemaker + + region = boto3.Session().region_name + sagemaker_session = sagemaker.session.Session() + default_bucket = sagemaker_session.default_bucket() + +**V3:** + +.. code-block:: python + + import boto3 + from sagemaker.core.helper.session_helper import Session + + region = boto3.Session().region_name + sagemaker_session = Session() + default_bucket = sagemaker_session.default_bucket() + + +Use Case 2: Creating a Lineage Context +--------------------------------------- + +Contexts provide a method to logically group other lineage entities. Each context name must be unique across all other contexts. + +**V2 (Legacy):** + +.. 
code-block:: python + + from datetime import datetime + from sagemaker.lineage.context import Context + + unique_id = str(int(datetime.now().replace(microsecond=0).timestamp())) + context_name = f"machine-learning-workflow-{unique_id}" + + ml_workflow_context = Context.create( + context_name=context_name, + context_type="MLWorkflow", + source_uri=unique_id, + properties={"example": "true"}, + ) + +**V3:** + +.. code-block:: python + + from datetime import datetime + from sagemaker.core.lineage.context import Context + + unique_id = str(int(datetime.now().replace(microsecond=0).timestamp())) + context_name = f"machine-learning-workflow-{unique_id}" + + ml_workflow_context = Context.create( + context_name=context_name, + context_type="MLWorkflow", + source_uri=unique_id, + properties={"example": "true"}, + ) + + +Use Case 3: Listing Contexts +----------------------------- + +Enumerate existing contexts sorted by creation time. + +**V2 (Legacy):** + +.. code-block:: python + + from sagemaker.lineage.context import Context + + contexts = Context.list(sort_by="CreationTime", sort_order="Descending") + for ctx in contexts: + print(ctx.context_name) + +**V3:** + +.. code-block:: python + + from sagemaker.core.lineage.context import Context + + contexts = Context.list(sort_by="CreationTime", sort_order="Descending") + for ctx in contexts: + print(ctx.context_name) + + +Use Case 4: Creating an Action +------------------------------- + +Actions represent computational steps such as model builds, transformations, or training jobs. + +**V2 (Legacy):** + +.. code-block:: python + + from sagemaker.lineage.action import Action + + model_build_action = Action.create( + action_name=f"model-build-step-{unique_id}", + action_type="ModelBuild", + source_uri=unique_id, + properties={"Example": "Metadata"}, + ) + +**V3:** + +.. 
code-block:: python + + from sagemaker.core.lineage.action import Action + + model_build_action = Action.create( + action_name=f"model-build-step-{unique_id}", + action_type="ModelBuild", + source_uri=unique_id, + properties={"Example": "Metadata"}, + ) + + +Use Case 5: Creating Associations +----------------------------------- + +Associations are directed edges in the lineage graph. The ``association_type`` can be ``Produced``, ``DerivedFrom``, ``AssociatedWith``, or ``ContributedTo``. + +**V2 (Legacy):** + +.. code-block:: python + + from sagemaker.lineage.association import Association + + context_action_association = Association.create( + source_arn=ml_workflow_context.context_arn, + destination_arn=model_build_action.action_arn, + association_type="AssociatedWith", + ) + +**V3:** + +.. code-block:: python + + from sagemaker.core.lineage.association import Association + + context_action_association = Association.create( + source_arn=ml_workflow_context.context_arn, + destination_arn=model_build_action.action_arn, + association_type="AssociatedWith", + ) + + + +Use Case 6: Traversing Associations +------------------------------------- + +Query incoming and outgoing associations to understand how entities are related in the lineage graph. + +**V2 (Legacy):** + +.. code-block:: python + + from sagemaker.lineage.association import Association + + # List incoming associations to an action + incoming = Association.list(destination_arn=model_build_action.action_arn) + for association in incoming: + print(f"{model_build_action.action_name} has incoming association from {association.source_name}") + + # List outgoing associations from a context + outgoing = Association.list(source_arn=ml_workflow_context.context_arn) + for association in outgoing: + print(f"{ml_workflow_context.context_name} has outgoing association to {association.destination_name}") + +**V3:** + +.. 
code-block:: python + + from sagemaker.core.lineage.association import Association + + # List incoming associations to an action + incoming = Association.list(destination_arn=model_build_action.action_arn) + for association in incoming: + print(f"{model_build_action.action_name} has incoming association from {association.source_name}") + + # List outgoing associations from a context + outgoing = Association.list(source_arn=ml_workflow_context.context_arn) + for association in outgoing: + print(f"{ml_workflow_context.context_name} has outgoing association to {association.destination_name}") + + +Use Case 7: Creating Artifacts +------------------------------- + +Artifacts represent URI-addressable objects or data, such as datasets, labels, or trained models. + +**V2 (Legacy):** + +.. code-block:: python + + from sagemaker.lineage.artifact import Artifact + + input_test_images = Artifact.create( + artifact_name="mnist-test-images", + artifact_type="TestData", + source_types=[{"SourceIdType": "Custom", "Value": unique_id}], + source_uri=f"https://sagemaker-example-files-prod-{region}.s3.amazonaws.com/datasets/image/MNIST/t10k-images-idx3-ubyte.gz", + ) + + input_test_labels = Artifact.create( + artifact_name="mnist-test-labels", + artifact_type="TestLabels", + source_types=[{"SourceIdType": "Custom", "Value": unique_id}], + source_uri=f"https://sagemaker-example-files-prod-{region}.s3.amazonaws.com/datasets/image/MNIST/t10k-labels-idx1-ubyte.gz", + ) + + output_model = Artifact.create( + artifact_name="mnist-model", + artifact_type="Model", + source_types=[{"SourceIdType": "Custom", "Value": unique_id}], + source_uri=f"https://sagemaker-example-files-prod-{region}.s3.amazonaws.com/datasets/image/MNIST/model/tensorflow-training-2020-11-20-23-57-13-077/model.tar.gz", + ) + +**V3:** + +.. 
code-block:: python + + from sagemaker.core.lineage.artifact import Artifact + + input_test_images = Artifact.create( + artifact_name="mnist-test-images", + artifact_type="TestData", + source_types=[{"SourceIdType": "Custom", "Value": unique_id}], + source_uri=f"https://sagemaker-example-files-prod-{region}.s3.amazonaws.com/datasets/image/MNIST/t10k-images-idx3-ubyte.gz", + ) + + input_test_labels = Artifact.create( + artifact_name="mnist-test-labels", + artifact_type="TestLabels", + source_types=[{"SourceIdType": "Custom", "Value": unique_id}], + source_uri=f"https://sagemaker-example-files-prod-{region}.s3.amazonaws.com/datasets/image/MNIST/t10k-labels-idx1-ubyte.gz", + ) + + output_model = Artifact.create( + artifact_name="mnist-model", + artifact_type="Model", + source_types=[{"SourceIdType": "Custom", "Value": unique_id}], + source_uri=f"https://sagemaker-example-files-prod-{region}.s3.amazonaws.com/datasets/image/MNIST/model/tensorflow-training-2020-11-20-23-57-13-077/model.tar.gz", + ) + + +Use Case 8: Linking Artifacts to Actions +------------------------------------------ + +Associate data artifacts as inputs to an action, and the action's output to a model artifact, forming a complete lineage chain. + +**V2 (Legacy):** + +.. code-block:: python + + from sagemaker.lineage.association import Association + + # Link input data to the model build action + Association.create( + source_arn=input_test_images.artifact_arn, + destination_arn=model_build_action.action_arn, + ) + Association.create( + source_arn=input_test_labels.artifact_arn, + destination_arn=model_build_action.action_arn, + ) + + # Link the action output to the model artifact + Association.create( + source_arn=model_build_action.action_arn, + destination_arn=output_model.artifact_arn, + ) + +**V3:** + +.. 
code-block:: python + + from sagemaker.core.lineage.association import Association + + # Link input data to the model build action + Association.create( + source_arn=input_test_images.artifact_arn, + destination_arn=model_build_action.action_arn, + ) + Association.create( + source_arn=input_test_labels.artifact_arn, + destination_arn=model_build_action.action_arn, + ) + + # Link the action output to the model artifact + Association.create( + source_arn=model_build_action.action_arn, + destination_arn=output_model.artifact_arn, + ) + + +Use Case 9: Cleaning Up Lineage Data +-------------------------------------- + +Delete associations first, then delete the entities themselves. Associations must be removed before their source or destination entities can be deleted. + +**V2 (Legacy):** + +.. code-block:: python + + import sagemaker + from sagemaker.lineage.association import Association + from sagemaker.lineage.context import Context + from sagemaker.lineage.action import Action + from sagemaker.lineage.artifact import Artifact + + sagemaker_session = sagemaker.session.Session() + + def delete_associations(arn): + for summary in Association.list(destination_arn=arn): + assct = Association( + source_arn=summary.source_arn, + destination_arn=summary.destination_arn, + sagemaker_session=sagemaker_session, + ) + assct.delete() + for summary in Association.list(source_arn=arn): + assct = Association( + source_arn=summary.source_arn, + destination_arn=summary.destination_arn, + sagemaker_session=sagemaker_session, + ) + assct.delete() + + # Delete context + delete_associations(ml_workflow_context.context_arn) + Context(context_name=ml_workflow_context.context_name, sagemaker_session=sagemaker_session).delete() + + # Delete action + delete_associations(model_build_action.action_arn) + Action(action_name=model_build_action.action_name, sagemaker_session=sagemaker_session).delete() + + # Delete artifacts + for artifact in [input_test_images, input_test_labels, output_model]: 
+ delete_associations(artifact.artifact_arn) + Artifact(artifact_arn=artifact.artifact_arn, sagemaker_session=sagemaker_session).delete() + +**V3:** + +.. code-block:: python + + from sagemaker.core.helper.session_helper import Session + from sagemaker.core.lineage.association import Association + from sagemaker.core.lineage.context import Context + from sagemaker.core.lineage.action import Action + from sagemaker.core.lineage.artifact import Artifact + + sagemaker_session = Session() + + def delete_associations(arn): + for summary in Association.list(destination_arn=arn): + assct = Association( + source_arn=summary.source_arn, + destination_arn=summary.destination_arn, + sagemaker_session=sagemaker_session, + ) + assct.delete() + for summary in Association.list(source_arn=arn): + assct = Association( + source_arn=summary.source_arn, + destination_arn=summary.destination_arn, + sagemaker_session=sagemaker_session, + ) + assct.delete() + + # Delete context + delete_associations(ml_workflow_context.context_arn) + Context(context_name=ml_workflow_context.context_name, sagemaker_session=sagemaker_session).delete() + + # Delete action + delete_associations(model_build_action.action_arn) + Action(action_name=model_build_action.action_name, sagemaker_session=sagemaker_session).delete() + + # Delete artifacts + for artifact in [input_test_images, input_test_labels, output_model]: + delete_associations(artifact.artifact_arn) + Artifact(artifact_arn=artifact.artifact_arn, sagemaker_session=sagemaker_session).delete() + + +Caveats +------- + +- Associations cannot be created between two experiment entities (e.g., between an Experiment and Trial). +- Associations can only be created between Action, Artifact, or Context resources. +- Maximum number of manually created lineage entities: + + - Artifacts: 6000 + - Contexts: 500 + - Actions: 3000 + - Associations: 6000 + +- There is no limit on the number of lineage entities created automatically by SageMaker. 
+ + +Lineage Tracking Example +------------------------- + +For a complete end-to-end V3 example, see the lineage tracking notebook: + +.. toctree:: + :maxdepth: 1 + + ../v3-examples/ml-ops-examples/v3-lineage-tracking-example diff --git a/docs/training/index.rst b/docs/training/index.rst index 5ba8bbd7a1..a69e7bf6f5 100644 --- a/docs/training/index.rst +++ b/docs/training/index.rst @@ -1,7 +1,7 @@ Model Training =============== -SageMaker Python SDK V3 revolutionizes machine learning training with the unified **ModelTrainer** class, replacing the complex framework-specific estimators from V2. This modern approach provides a consistent interface across all training scenarios while maintaining the power and flexibility you need. +SageMaker Python SDK V3 provides a unified **ModelTrainer** class that replaces the framework-specific estimators from V2. This single class handles PyTorch, TensorFlow, Scikit-learn, XGBoost, and custom containers through a consistent interface. Key Benefits of V3 Training --------------------------- @@ -13,14 +13,12 @@ Key Benefits of V3 Training Quick Start Example ------------------- -Here's how training has evolved from V2 to V3: - **SageMaker Python SDK V2:** .. code-block:: python from sagemaker.estimator import Estimator - + estimator = Estimator( image_uri="my-training-image", role="arn:aws:iam::123456789012:role/SageMakerRole", @@ -49,88 +47,506 @@ Here's how training has evolved from V2 to V3: trainer.train(input_data_config=[train_data]) -ModelTrainer Overview --------------------- -The ``ModelTrainer`` class is the cornerstone of SageMaker Python SDK V3, providing a unified interface for all training scenarios. 
This single class replaces the complex web of framework-specific estimators from V2, offering: -**Unified Training Interface** - One class handles PyTorch, TensorFlow, Scikit-learn, XGBoost, and custom containers +Local Container Training +------------------------ -**Intelligent Defaults** - Automatically configures optimal settings based on your training requirements -**Flexible Configuration** - Object-oriented design with structured configs that align with AWS APIs +Run training jobs in Docker containers on your local machine for rapid development and debugging before deploying to SageMaker cloud instances. Local mode requires Docker to be installed and running. -**Seamless Integration** - Works seamlessly with SageMaker features like distributed training, spot instances, and hyperparameter tuning +**Session Setup and Image Retrieval:** .. code-block:: python - from sagemaker.train import ModelTrainer - from sagemaker.train.configs import InputData, ResourceConfig + from sagemaker.core.helper.session_helper import Session + from sagemaker.core import image_uris - # Create trainer with intelligent defaults - trainer = ModelTrainer( - training_image="your-training-image", - role="your-sagemaker-role" + sagemaker_session = Session() + region = sagemaker_session.boto_region_name + + training_image = image_uris.retrieve( + framework="pytorch", + region=region, + version="2.0.0", + py_version="py310", + instance_type="ml.m5.xlarge", + image_scope="training" + ) + +**Configuring Local Container Training:** + +.. 
code-block:: python + + from sagemaker.train.model_trainer import ModelTrainer, Mode + from sagemaker.train.configs import SourceCode, Compute, InputData + + source_code = SourceCode( + source_dir="./source", + entry_script="train.py", + ) + + compute = Compute( + instance_type="local_cpu", + instance_count=1, ) - # Configure training data train_data = InputData( - channel_name="training", - data_source="s3://your-bucket/train-data" + channel_name="train", + data_source="./data/train", ) - # Start training - training_job = trainer.train( + model_trainer = ModelTrainer( + training_image=training_image, + sagemaker_session=sagemaker_session, + source_code=source_code, + compute=compute, input_data_config=[train_data], - resource_config=ResourceConfig( - instance_type="ml.m5.xlarge", - instance_count=1 - ) + base_job_name="local-training", + training_mode=Mode.LOCAL_CONTAINER, ) -Framework Support -~~~~~~~~~~~~~~~~~ + model_trainer.train() + +Key points: + +- Use ``instance_type="local_cpu"`` or ``"local_gpu"`` for local execution +- Set ``training_mode=Mode.LOCAL_CONTAINER`` to run in Docker +- Local data paths are mounted directly into the container +- Training artifacts are saved to the current working directory + +:doc:`Full example notebook <../v3-examples/training-examples/local-training-example>` + + + +Distributed Local Training +-------------------------- + + +Test multi-node distributed training locally using multiple Docker containers before deploying to cloud. This uses the ``Torchrun`` distributed driver to coordinate training across containers. + +**Configuring Distributed Local Training:** + +.. 
code-block:: python + + from sagemaker.train.model_trainer import ModelTrainer, Mode + from sagemaker.train.configs import SourceCode, Compute, InputData + from sagemaker.train.distributed import Torchrun + + source_code = SourceCode( + source_dir="./source", + entry_script="train.py", + ) + + distributed = Torchrun( + process_count_per_node=1, + ) + + compute = Compute( + instance_type="local_cpu", + instance_count=2, # Two containers for distributed training + ) + + model_trainer = ModelTrainer( + training_image=training_image, + sagemaker_session=sagemaker_session, + source_code=source_code, + distributed=distributed, + compute=compute, + input_data_config=[train_data, test_data], + base_job_name="distributed-local-training", + training_mode=Mode.LOCAL_CONTAINER, + ) + + model_trainer.train() + +Key points: + +- ``instance_count=2`` launches two Docker containers +- ``Torchrun`` handles process coordination across containers +- ``process_count_per_node`` controls how many training processes run per container +- Temporary directories (``shared``, ``algo-1``, ``algo-2``) are cleaned up automatically after training -* **PyTorch** - Deep learning with dynamic computation graphs -* **TensorFlow** - Production-ready machine learning at scale -* **Scikit-learn** - Classical machine learning algorithms -* **XGBoost** - Gradient boosting for structured data -* **Custom Containers** - Bring your own training algorithms +:doc:`Full example notebook <../v3-examples/training-examples/distributed-local-training-example>` -Training Types -~~~~~~~~~~~~~~ -* **Single Instance Training** - Cost-effective training for smaller models -* **Multi-Instance Training** - Distributed training for large-scale models -* **Spot Instance Training** - Cost optimization with managed spot instances -* **Local Mode Training** - Development and debugging on local infrastructure -Advanced Features -~~~~~~~~~~~~~~~~~ +Hyperparameter Management +------------------------- + + +ModelTrainer supports 
loading hyperparameters from JSON files, YAML files, or Python dictionaries. File-based hyperparameters provide better version control and support for complex nested structures. + +**Loading Hyperparameters from JSON:** + +.. code-block:: python + + from sagemaker.train.model_trainer import ModelTrainer + from sagemaker.train.configs import SourceCode + + source_code = SourceCode( + source_dir="./source", + requirements="requirements.txt", + entry_script="train.py", + ) + + trainer = ModelTrainer( + training_image=training_image, + hyperparameters="hyperparameters.json", # Path to JSON file + source_code=source_code, + base_job_name="hp-json-training", + ) + + trainer.train() + +**Loading Hyperparameters from YAML:** + +.. code-block:: python + + trainer = ModelTrainer( + training_image=training_image, + hyperparameters="hyperparameters.yaml", # Path to YAML file + source_code=source_code, + base_job_name="hp-yaml-training", + ) + + trainer.train() + +**Using a Python Dictionary:** + +.. code-block:: python + + trainer = ModelTrainer( + training_image=training_image, + hyperparameters={ + "epochs": 10, + "learning_rate": 0.001, + "batch_size": 32, + "model_config": {"hidden_size": 256, "num_layers": 3}, + }, + source_code=source_code, + base_job_name="hp-dict-training", + ) + + trainer.train() + +Key points: + +- JSON and YAML files support complex nested structures (dicts, lists, booleans, floats) +- Hyperparameters are passed to the training script as command-line arguments +- They are also available via the ``SM_HPS`` environment variable as a JSON string +- All three approaches (JSON, YAML, dict) produce identical training behavior + +:doc:`Full example notebook <../v3-examples/training-examples/hyperparameter-training-example>` + + + +JumpStart Training +------------------ + + +Train pre-configured models from the SageMaker JumpStart hub using ``ModelTrainer.from_jumpstart_config()``. 
JumpStart provides optimized training scripts, default hyperparameters, and curated datasets for hundreds of models. + +**Training a HuggingFace BERT Model:** + +.. code-block:: python + + from sagemaker.train.model_trainer import ModelTrainer + from sagemaker.core.jumpstart import JumpStartConfig + from sagemaker.core.helper.session_helper import Session, get_execution_role + + sagemaker_session = Session() + role = get_execution_role() + + bert_config = JumpStartConfig( + model_id="huggingface-spc-bert-base-cased", + ) + + bert_trainer = ModelTrainer.from_jumpstart_config( + jumpstart_config=bert_config, + base_job_name="jumpstart-bert", + hyperparameters={ + "epochs": 1, + "learning_rate": 5e-5, + "train_batch_size": 32, + }, + sagemaker_session=sagemaker_session, + ) + + bert_trainer.train() + +**Training an XGBoost Classification Model:** + +.. code-block:: python + + xgboost_config = JumpStartConfig( + model_id="xgboost-classification-model", + ) + + xgboost_trainer = ModelTrainer.from_jumpstart_config( + jumpstart_config=xgboost_config, + base_job_name="jumpstart-xgboost", + hyperparameters={ + "num_round": 10, + "max_depth": 5, + "eta": 0.2, + "objective": "binary:logistic", + }, + sagemaker_session=sagemaker_session, + ) + + xgboost_trainer.train() + +**Discovering Available JumpStart Models:** + +.. 
code-block:: python + + from sagemaker.core.jumpstart.notebook_utils import list_jumpstart_models + from sagemaker.core.jumpstart.search import search_public_hub_models + + # List all available models + models = list_jumpstart_models() + + # Filter by framework + hf_models = list_jumpstart_models(filter="framework == huggingface") + + # Search with queries + results = search_public_hub_models(query="bert") + + # Complex queries with filters + text_gen = search_public_hub_models(query="@task:text-generation") + +Key points: + +- ``from_jumpstart_config()`` auto-configures training image, instance type, and default hyperparameters +- Override any default hyperparameters while keeping proven defaults for the rest +- JumpStart provides built-in datasets so you can start training immediately +- Supports HuggingFace, XGBoost, CatBoost, LightGBM, and many more frameworks +- Use ``list_jumpstart_models()`` and ``search_public_hub_models()`` to discover available models + +:doc:`Full example notebook <../v3-examples/training-examples/jumpstart-training-example>` + + + +Custom Distributed Training Drivers +------------------------------------ + + +Create custom distributed training drivers by extending ``DistributedConfig`` for specialized coordination logic, framework integration, or advanced debugging. + +**Defining a Custom Driver:** + +.. code-block:: python + + from sagemaker.train.distributed import DistributedConfig + + class CustomDriver(DistributedConfig): + process_count_per_node: int = None + + @property + def driver_dir(self) -> str: + return "./custom_drivers" + + @property + def driver_script(self) -> str: + return "driver.py" + +The driver script (``driver.py``) receives environment variables including ``SM_DISTRIBUTED_CONFIG``, ``SM_HPS``, ``SM_SOURCE_DIR``, and ``SM_ENTRY_SCRIPT`` to coordinate training. + +**Using the Custom Driver with ModelTrainer:** + +.. 
code-block:: python + + from sagemaker.train.model_trainer import ModelTrainer + from sagemaker.train.configs import SourceCode + + source_code = SourceCode( + source_dir="./scripts", + entry_script="entry_script.py", + ) + + custom_driver = CustomDriver(process_count_per_node=2) + + model_trainer = ModelTrainer( + training_image=training_image, + hyperparameters={"epochs": 10}, + source_code=source_code, + distributed=custom_driver, + base_job_name="custom-distributed", + ) + + model_trainer.train() + +Key points: + +- Extend ``DistributedConfig`` and implement ``driver_dir`` and ``driver_script`` properties +- The driver script manages process launching and coordination +- Environment variables provide access to hyperparameters, source code location, and distributed config +- Useful for custom frameworks, specialized coordination patterns, or advanced debugging + +:doc:`Full example notebook <../v3-examples/training-examples/custom-distributed-training-example>` + + + +AWS Batch Training Queues +------------------------- + + +Submit training jobs to AWS Batch job queues for automatic scheduling and resource management. Batch handles capacity allocation and job execution order. + +**Setting Up and Submitting Jobs:** + +.. 
code-block:: python + + from sagemaker.train.model_trainer import ModelTrainer + from sagemaker.train.configs import SourceCode, Compute, StoppingCondition + from sagemaker.train.aws_batch.training_queue import TrainingQueue + + source_code = SourceCode(command="echo 'Hello World'") + + model_trainer = ModelTrainer( + training_image=image_uri, + source_code=source_code, + base_job_name="batch-training-job", + compute=Compute(instance_type="ml.g5.xlarge", instance_count=1), + stopping_condition=StoppingCondition(max_runtime_in_seconds=300), + ) + + # Create a queue reference and submit jobs + queue = TrainingQueue("my-sm-training-fifo-jq") + queued_job = queue.submit(training_job=model_trainer, inputs=None) + +**Creating Batch Resources Programmatically:** + +.. code-block:: python + + from sagemaker.train.aws_batch.boto_client import get_batch_boto_client + from utils.aws_batch_resource_management import AwsBatchResourceManager, create_resources + + resource_manager = AwsBatchResourceManager(get_batch_boto_client()) + resources = create_resources( + resource_manager, + job_queue_name="my-sm-training-fifo-jq", + service_environment_name="my-sm-training-fifo-se", + max_capacity=1, + ) + +Key points: + +- ``TrainingQueue`` wraps AWS Batch job queues for SageMaker training +- ``queue.submit()`` submits a ModelTrainer job to the queue +- Batch manages capacity allocation and job scheduling automatically +- Resources (Service Environments, Job Queues) can be created via console or programmatically +- Supports FIFO and priority-based scheduling + +:doc:`Full example notebook <../v3-examples/training-examples/aws_batch/sm-training-queues_getting_started_with_model_trainer>` -* **Automatic Model Tuning** - Hyperparameter optimization at scale -* **Distributed Training** - Multi-node, multi-GPU training strategies -* **Checkpointing** - Resume training from saved states -* **Early Stopping** - Prevent overfitting with intelligent stopping criteria Migration from V2 
------------------ -If you're migrating from V2, the key changes are: -* Replace framework-specific estimators (PyTorchEstimator, TensorFlowEstimator, etc.) with ``ModelTrainer`` -* Use structured ``InputData`` configs instead of dictionary-based input specifications -* Leverage the new object-oriented API for cleaner, more maintainable code +Training Classes and Imports +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + + +.. list-table:: + :header-rows: 1 + :widths: 50 50 + + * - V2 + - V3 + * - ``sagemaker.estimator.Estimator`` + - ``sagemaker.train.model_trainer.ModelTrainer`` + * - ``sagemaker.pytorch.PyTorch`` + - ``sagemaker.train.model_trainer.ModelTrainer`` + * - ``sagemaker.tensorflow.TensorFlow`` + - ``sagemaker.train.model_trainer.ModelTrainer`` + * - ``sagemaker.huggingface.HuggingFace`` + - ``sagemaker.train.model_trainer.ModelTrainer`` + * - ``sagemaker.sklearn.SKLearn`` + - ``sagemaker.train.model_trainer.ModelTrainer`` + * - ``sagemaker.xgboost.XGBoost`` + - ``sagemaker.train.model_trainer.ModelTrainer`` + * - ``sagemaker.jumpstart.JumpStartEstimator`` + - ``ModelTrainer.from_jumpstart_config(JumpStartConfig(...))`` + * - ``sagemaker.tuner.HyperparameterTuner`` + - ``sagemaker.core.resources.HyperParameterTuningJob`` + + +Methods and Patterns +~~~~~~~~~~~~~~~~~~~~~ + + +.. 
list-table:: + :header-rows: 1 + :widths: 50 50 + + * - V2 + - V3 + * - ``estimator.fit({"train": "s3://..."})`` + - ``trainer.train(input_data_config=[InputData(...)])`` + * - ``estimator.deploy()`` + - ``ModelBuilder(model=trainer).deploy()`` + * - ``instance_type="ml.m5.xlarge"`` + - ``Compute(instance_type="ml.m5.xlarge")`` + * - ``entry_point="train.py"`` + - ``SourceCode(entry_script="train.py")`` + * - ``source_dir="./src"`` + - ``SourceCode(source_dir="./src")`` + * - ``sagemaker.inputs.TrainingInput(s3_data=...)`` + - ``InputData(channel_name=..., data_source=...)`` + * - ``hyperparameters={"lr": 0.01}`` + - ``hyperparameters={"lr": 0.01}`` or ``hyperparameters="config.json"`` + * - ``max_run=3600`` + - ``StoppingCondition(max_runtime_in_seconds=3600)`` + + +Session and Utilities +~~~~~~~~~~~~~~~~~~~~~~ + + +.. list-table:: + :header-rows: 1 + :widths: 50 50 + + * - V2 + - V3 + * - ``sagemaker.session.Session()`` + - ``sagemaker.core.helper.session_helper.Session()`` + * - ``sagemaker.get_execution_role()`` + - ``sagemaker.core.helper.session_helper.get_execution_role()`` + * - ``sagemaker.image_uris.retrieve(...)`` + - ``sagemaker.core.image_uris.retrieve(...)`` + * - ``import sagemaker`` (bare import) + - Use explicit imports from subpackages + * - ``boto3.client('sagemaker')`` + - ``sagemaker.core.resources.*`` (TrainingJob, Model, Endpoint, etc.) + + +V3 Package Structure +~~~~~~~~~~~~~~~~~~~~~ + + +.. 
list-table:: + :header-rows: 1 + :widths: 30 70 + + * - V3 Package + - Purpose + * - ``sagemaker-core`` + - Low-level resource management, session, image URIs, lineage, JumpStart + * - ``sagemaker-train`` + - ModelTrainer, Compute, SourceCode, InputData, distributed training + * - ``sagemaker-serve`` + - ModelBuilder, InferenceSpec, SchemaBuilder, deployment + * - ``sagemaker-mlops`` + - Pipelines, processing, model registry, monitoring, Clarify + Training Examples ----------------- -Explore comprehensive training examples that demonstrate V3 capabilities: - .. toctree:: :maxdepth: 1 diff --git a/v3-examples/ml-ops-examples/v3-lineage-tracking-example.ipynb b/v3-examples/ml-ops-examples/v3-lineage-tracking-example.ipynb new file mode 100644 index 0000000000..6088765334 --- /dev/null +++ b/v3-examples/ml-ops-examples/v3-lineage-tracking-example.ipynb @@ -0,0 +1,356 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# SageMaker Lineage Tracking - V3 SDK Example\n", + "\n", + "This notebook demonstrates how to use SageMaker Lineage Tracking with the V3 Python SDK.\n", + "\n", + "Amazon SageMaker Lineage enables events that happen within SageMaker to be traced via a graph structure. The data simplifies generating reports, making comparisons, or discovering relationships between events.\n", + "\n", + "## What you will learn\n", + "\n", + "- Create and manage lineage Contexts, Actions, and Artifacts\n", + "- Create Associations to link entities into a lineage graph\n", + "- Traverse associations to discover relationships\n", + "- Clean up lineage data" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Setup\n", + "\n", + "Initialize a SageMaker session using the V3 `Session` class." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import boto3\n", + "from sagemaker.core.helper.session_helper import Session\n", + "\n", + "region = boto3.Session().region_name\n", + "sagemaker_session = Session()\n", + "default_bucket = sagemaker_session.default_bucket()\n", + "\n", + "print(f\"Region: {region}\")\n", + "print(f\"Default bucket: {default_bucket}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from datetime import datetime\n", + "from sagemaker.core.lineage.context import Context\n", + "from sagemaker.core.lineage.action import Action\n", + "from sagemaker.core.lineage.association import Association\n", + "from sagemaker.core.lineage.artifact import Artifact\n", + "\n", + "unique_id = str(int(datetime.now().replace(microsecond=0).timestamp()))\n", + "print(f\"Unique id is {unique_id}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Use Case 1: Create a Lineage Context\n", + "\n", + "Contexts provide a method to logically group other lineage entities. The context name must be unique across all other contexts." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "context_name = f\"machine-learning-workflow-{unique_id}\"\n", + "\n", + "ml_workflow_context = Context.create(\n", + " context_name=context_name,\n", + " context_type=\"MLWorkflow\",\n", + " source_uri=unique_id,\n", + " properties={\"example\": \"true\"},\n", + ")\n", + "\n", + "print(f\"Created context: {ml_workflow_context.context_name}\")\n", + "print(f\"Context ARN: {ml_workflow_context.context_arn}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Use Case 2: List Contexts\n", + "\n", + "Enumerate existing contexts sorted by creation time." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "contexts = Context.list(sort_by=\"CreationTime\", sort_order=\"Descending\")\n", + "\n", + "for ctx in contexts:\n", + " print(ctx.context_name)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Use Case 3: Create an Action\n", + "\n", + "Actions represent computational steps such as model builds, transformations, or training jobs." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "model_build_action = Action.create(\n", + " action_name=f\"model-build-step-{unique_id}\",\n", + " action_type=\"ModelBuild\",\n", + " source_uri=unique_id,\n", + " properties={\"Example\": \"Metadata\"},\n", + ")\n", + "\n", + "print(f\"Created action: {model_build_action.action_name}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Use Case 4: Create Associations\n", + "\n", + "Associations are directed edges in the lineage graph. The `association_type` can be `Produced`, `DerivedFrom`, `AssociatedWith`, or `ContributedTo`." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "context_action_association = Association.create(\n", + " source_arn=ml_workflow_context.context_arn,\n", + " destination_arn=model_build_action.action_arn,\n", + " association_type=\"AssociatedWith\",\n", + ")\n", + "\n", + "print(\"Association created between context and action\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Use Case 5: Traverse Associations\n", + "\n", + "Query incoming and outgoing associations to understand entity relationships." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# List incoming associations to the action\n", + "incoming_associations = Association.list(destination_arn=model_build_action.action_arn)\n", + "for association in incoming_associations:\n", + " print(\n", + " f\"{model_build_action.action_name} has an incoming association from {association.source_name}\"\n", + " )\n", + "\n", + "# List outgoing associations from the context\n", + "outgoing_associations = Association.list(source_arn=ml_workflow_context.context_arn)\n", + "for association in outgoing_associations:\n", + " print(\n", + " f\"{ml_workflow_context.context_name} has an outgoing association to {association.destination_name}\"\n", + " )" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Use Case 6: Create Artifacts\n", + "\n", + "Artifacts represent URI-addressable objects or data such as datasets, labels, or trained models." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Create input data artifacts\n", + "input_test_images = Artifact.create(\n", + " artifact_name=\"mnist-test-images\",\n", + " artifact_type=\"TestData\",\n", + " source_types=[{\"SourceIdType\": \"Custom\", \"Value\": unique_id}],\n", + " source_uri=f\"https://sagemaker-example-files-prod-{region}.s3.amazonaws.com/datasets/image/MNIST/t10k-images-idx3-ubyte.gz\",\n", + ")\n", + "\n", + "input_test_labels = Artifact.create(\n", + " artifact_name=\"mnist-test-labels\",\n", + " artifact_type=\"TestLabels\",\n", + " source_types=[{\"SourceIdType\": \"Custom\", \"Value\": unique_id}],\n", + " source_uri=f\"https://sagemaker-example-files-prod-{region}.s3.amazonaws.com/datasets/image/MNIST/t10k-labels-idx1-ubyte.gz\",\n", + ")\n", + "\n", + "print(f\"Created artifact: {input_test_images.artifact_name}\")\n", + "print(f\"Created artifact: {input_test_labels.artifact_name}\")" + ] + }, + 
{ +      "cell_type": "code", +      "execution_count": null, +      "metadata": {}, +      "outputs": [], +      "source": [ +        "# Create output model artifact\n", +        "output_model = Artifact.create(\n", +        "    artifact_name=\"mnist-model\",\n", +        "    artifact_type=\"Model\",\n", +        "    source_types=[{\"SourceIdType\": \"Custom\", \"Value\": unique_id}],\n", +        "    source_uri=f\"https://sagemaker-example-files-prod-{region}.s3.amazonaws.com/datasets/image/MNIST/model/tensorflow-training-2020-11-20-23-57-13-077/model.tar.gz\",\n", +        ")\n", +        "\n", +        "print(f\"Created artifact: {output_model.artifact_name}\")" +      ] +    }, +    { +      "cell_type": "markdown", +      "metadata": {}, +      "source": [ +        "## Use Case 7: Link Artifacts to Actions\n", +        "\n", +        "Associate the input data artifacts with the action, then associate the action with the output model artifact." +      ] +    }, +    { +      "cell_type": "code", +      "execution_count": null, +      "metadata": {}, +      "outputs": [], +      "source": [ +        "# Associate input data with the model build action\n", +        "Association.create(\n", +        "    source_arn=input_test_images.artifact_arn,\n", +        "    destination_arn=model_build_action.action_arn,\n", +        ")\n", +        "Association.create(\n", +        "    source_arn=input_test_labels.artifact_arn,\n", +        "    destination_arn=model_build_action.action_arn,\n", +        ")\n", +        "\n", +        "# Associate the action with the output model\n", +        "Association.create(\n", +        "    source_arn=model_build_action.action_arn,\n", +        "    destination_arn=output_model.artifact_arn,\n", +        ")\n", +        "\n", +        "print(\"Lineage graph complete: inputs -> action -> output\")" +      ] +    }, +    { +      "cell_type": "markdown", +      "metadata": {}, +      "source": [ +        "## Cleanup\n", +        "\n", +        "Delete all lineage entities created in this notebook. Associations must be removed before their source or destination entities can be deleted."
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def delete_associations(arn):\n", + " \"\"\"Delete all incoming and outgoing associations for an entity.\"\"\"\n", + " for summary in Association.list(destination_arn=arn):\n", + " assct = Association(\n", + " source_arn=summary.source_arn,\n", + " destination_arn=summary.destination_arn,\n", + " sagemaker_session=sagemaker_session,\n", + " )\n", + " assct.delete()\n", + "\n", + " for summary in Association.list(source_arn=arn):\n", + " assct = Association(\n", + " source_arn=summary.source_arn,\n", + " destination_arn=summary.destination_arn,\n", + " sagemaker_session=sagemaker_session,\n", + " )\n", + " assct.delete()\n", + "\n", + "\n", + "def delete_lineage_data():\n", + " \"\"\"Delete all lineage entities created in this notebook.\"\"\"\n", + " print(f\"Deleting context {ml_workflow_context.context_name}\")\n", + " delete_associations(ml_workflow_context.context_arn)\n", + " ctx = Context(\n", + " context_name=ml_workflow_context.context_name,\n", + " sagemaker_session=sagemaker_session,\n", + " )\n", + " ctx.delete()\n", + "\n", + " print(f\"Deleting action {model_build_action.action_name}\")\n", + " delete_associations(model_build_action.action_arn)\n", + " actn = Action(\n", + " action_name=model_build_action.action_name,\n", + " sagemaker_session=sagemaker_session,\n", + " )\n", + " actn.delete()\n", + "\n", + " for artifact in [input_test_images, input_test_labels, output_model]:\n", + " print(f\"Deleting artifact {artifact.artifact_arn} {artifact.artifact_name}\")\n", + " delete_associations(artifact.artifact_arn)\n", + " artfct = Artifact(\n", + " artifact_arn=artifact.artifact_arn,\n", + " sagemaker_session=sagemaker_session,\n", + " )\n", + " artfct.delete()\n", + "\n", + "\n", + "delete_lineage_data()\n", + "print(\"Cleanup complete\")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + 
"name": "python3" + }, + "language_info": { + "name": "python", + "version": "3.10.0" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +}