From 9a344e5e13e8315b36e3dffa5e79940e1eaa148a Mon Sep 17 00:00:00 2001 From: jzhaoqwa Date: Wed, 23 Jul 2025 16:07:52 -0700 Subject: [PATCH 1/2] Update inference SDK examples --- .../SDK/inference-fsx-model-e2e.ipynb | 29 ++++++++---- .../SDK/inference-jumpstart-e2e.ipynb | 30 ++++-------- .../SDK/inference-s3-model-e2e.ipynb | 46 +++++++++++++------ 3 files changed, 59 insertions(+), 46 deletions(-) diff --git a/examples/inference/SDK/inference-fsx-model-e2e.ipynb b/examples/inference/SDK/inference-fsx-model-e2e.ipynb index 10ae5b13..b56e8a7c 100644 --- a/examples/inference/SDK/inference-fsx-model-e2e.ipynb +++ b/examples/inference/SDK/inference-fsx-model-e2e.ipynb @@ -7,10 +7,19 @@ "metadata": {}, "outputs": [], "source": [ - "from sagemaker.hyperpod.hyperpod_manager import HyperPodManager\n", -\n", - "HyperPodManager.list_clusters(region='us-east-2')\n", - "HyperPodManager.set_context('', region='us-east-2')" + "from sagemaker.hyperpod import list_clusters, set_cluster_context\n", + "list_clusters(region='us-east-2')" ] }, + { + "cell_type": "code", + "execution_count": null, + "id": "765ef3fd", + "metadata": {}, + "outputs": [], + "source": [ + "# choose the HP cluster\n", + "set_cluster_context('', region='us-east-2')" ] }, { @@ -20,7 +29,7 @@ "metadata": {}, "outputs": [], "source": [ - "from sagemaker.hyperpod.inference.config.hp_endpoint_config import CloudWatchTrigger, PrometheusTrigger, AutoScalingSpec, ModelMetrics, Metrics, FsxStorage, ModelSourceConfig, Tags, TlsConfig, ConfigMapKeyRef, FieldRef, ResourceFieldRef, SecretKeyRef, ValueFrom, EnvironmentVariables, ModelInvocationPort, ModelVolumeMount, Claims, Resources, Worker\n", + "from sagemaker.hyperpod.inference.config.hp_endpoint_config import FsxStorage, ModelSourceConfig, TlsConfig, EnvironmentVariables, ModelInvocationPort, ModelVolumeMount, Resources, Worker\n", "from sagemaker.hyperpod.inference.hp_endpoint import HPEndpoint\n", "import yaml\n", "import time" ] }, { @@ -33,13 +42,13 @@
"metadata": {}, "outputs": [], "source": [ - "tls_config=TlsConfig(tls_certificate_output_s3_uri='s3://')\n", + "tls_config=TlsConfig(tls_certificate_output_s3_uri='s3://')\n", "\n", "model_source_config = ModelSourceConfig(\n", " model_source_type='fsx',\n", - " model_location=\"\",\n", + " model_location=\"\",\n", " fsx_storage=FsxStorage(\n", - " file_system_id=''\n", + " file_system_id=''\n", " ),\n", ")\n", "\n", @@ -73,7 +82,7 @@ "outputs": [], "source": [ "fsx_endpoint = HPEndpoint(\n", - " endpoint_name='test-endpoint-name-fsx-pysdk',\n", + " endpoint_name='',\n", " instance_type='ml.g5.8xlarge',\n", " model_name='deepseek15b-fsx-test-pysdk',\n", " tls_config=tls_config,\n", @@ -165,7 +174,7 @@ "metadata": {}, "outputs": [], "source": [ - "endpoint = HPEndpoint.get(name='')" + "endpoint = HPEndpoint.get(name='')" ] }, { diff --git a/examples/inference/SDK/inference-jumpstart-e2e.ipynb b/examples/inference/SDK/inference-jumpstart-e2e.ipynb index 1cb0b4b4..f1ff2aaf 100644 --- a/examples/inference/SDK/inference-jumpstart-e2e.ipynb +++ b/examples/inference/SDK/inference-jumpstart-e2e.ipynb @@ -8,14 +8,6 @@ "## Inference Operator PySDK E2E Expereience (JumpStart model)" ] }, - { - "cell_type": "markdown", - "id": "1b3ce5c1-3c3d-4139-b7ae-042f360f3032", - "metadata": {}, - "source": [ - "Prerequisite: Data scientists should list clusters and set cluster context" - ] - }, { "cell_type": "code", "execution_count": null, @@ -23,7 +15,7 @@ "metadata": {}, "outputs": [], "source": [ - "from sagemaker.hyperpod.hyperpod_manager import HyperPodManager" + "from sagemaker.hyperpod import list_clusters, set_cluster_context" ] }, { @@ -33,8 +25,7 @@ "metadata": {}, "outputs": [], "source": [ - "#Set region \n", - "region = \"us-west-2\"" + "list_clusters(region='us-east-2')" ] }, { @@ -44,8 +35,8 @@ "metadata": {}, "outputs": [], "source": [ - "# choose the HP cluster user works on\n", - "HyperPodManager.set_context('sagemaker-hyperpod-eks-cluster-demo-05-01', 
region=region)" + "# choose the HP cluster\n", + "set_cluster_context('', region='us-east-2')" ] }, { @@ -67,7 +58,7 @@ "from jumpstart_public_hub_visualization_utils import get_all_public_hub_model_data\n", "\n", "# Load and display SageMaker public hub models\n", - "get_all_public_hub_model_data(region=\"us-west-2\")" + "get_all_public_hub_model_data(region=\"us-east-2\")" ] }, { @@ -122,8 +113,8 @@ "server=Server(\n", " instance_type='ml.g5.8xlarge',\n", ")\n", - "endpoint_name=SageMakerEndpoint(name='deepsek7bsme-testing-jumpstart-7-1')\n", - "tls_config=TlsConfig(tls_certificate_output_s3_uri='s3://tls-bucket-inf1-beta2')\n", + "endpoint_name=SageMakerEndpoint(name='')\n", + "tls_config=TlsConfig(tls_certificate_output_s3_uri='s3://')\n", "\n", "# create spec\n", "js_endpoint=HPJumpStartEndpoint(\n", @@ -230,7 +221,7 @@ "outputs": [], "source": [ "# output is similar to kubectl describe jumpstartmodel\n", - "endpoint = HPJumpStartEndpoint.get(name='deepseek-llm-r1-distill-qwen-1-5b')\n", + "endpoint = HPJumpStartEndpoint.get(name='')\n", "print_yaml(endpoint)" ] }, @@ -265,10 +256,7 @@ "outputs": [], "source": [ "# get operator logs\n", - "print(js_endpoint.get_operator_logs(since_hours=1))\n", - "\n", - "# get specific pod log\n", - "# js_endpoint.get_logs(pod='pod-name')" + "print(js_endpoint.get_operator_logs(since_hours=0.1))" ] }, { diff --git a/examples/inference/SDK/inference-s3-model-e2e.ipynb b/examples/inference/SDK/inference-s3-model-e2e.ipynb index 2c41a11d..145f37f4 100644 --- a/examples/inference/SDK/inference-s3-model-e2e.ipynb +++ b/examples/inference/SDK/inference-s3-model-e2e.ipynb @@ -7,10 +7,19 @@ "metadata": {}, "outputs": [], "source": [ - "from sagemaker.hyperpod.hyperpod_manager import HyperPodManager\n", - "\n", - "HyperPodManager.list_clusters(region='us-east-2')\n", - "HyperPodManager.set_context('', region='us-east-2')" + "from sagemaker.hyperpod import list_clusters, set_cluster_context\n", + "list_clusters(region='us-east-2')" + ] 
+ }, + { + "cell_type": "code", + "execution_count": null, + "id": "14cd61ab", + "metadata": {}, + "outputs": [], + "source": [ + "# choose the HP cluster\n", + "set_cluster_context('', region='us-east-2')" ] }, { @@ -33,13 +42,13 @@ "metadata": {}, "outputs": [], "source": [ - "tls_config=TlsConfig(tls_certificate_output_s3_uri='s3://')\n", + "tls_config=TlsConfig(tls_certificate_output_s3_uri='s3://')\n", "\n", "model_source_config = ModelSourceConfig(\n", " model_source_type='s3',\n", - " model_location=\"\",\n", + " model_location=\"\",\n", " s3_storage=S3Storage(\n", - " bucket_name='',\n", + " bucket_name='',\n", " region='us-east-2',\n", " ),\n", ")\n", @@ -67,7 +76,7 @@ "\n", "# Create dimensions\n", "dimensions = [\n", - " Dimensions(name=\"EndpointName\", value=\"\"),\n", + " Dimensions(name=\"EndpointName\", value=\"\"),\n", " Dimensions(name=\"VariantName\", value=\"AllTraffic\")\n", "]\n", "\n", @@ -102,7 +111,7 @@ "outputs": [], "source": [ "s3_endpoint = HPEndpoint(\n", - " endpoint_name='s3-test-endpoint-name',\n", + " endpoint_name='',\n", " instance_type='ml.g5.8xlarge',\n", " model_name='deepseek15b-test-model-name', \n", " tls_config=tls_config,\n", @@ -120,7 +129,7 @@ "metadata": {}, "outputs": [], "source": [ - "s3_endpoint.create(debug=True)" + "s3_endpoint.create()" ] }, { @@ -193,7 +202,17 @@ "outputs": [], "source": [ "endpoint_list = HPEndpoint.list()\n", - "print_yaml(endpoint_list[1])" + "print_yaml(endpoint_list[0])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "660e8d47", + "metadata": {}, + "outputs": [], + "source": [ + "s3_endpoint = HPEndpoint.get(name='')" ] }, { @@ -206,10 +225,7 @@ "outputs": [], "source": [ "# get operator logs\n", - "print(s3_endpoint.get_operator_logs(since_hours=0.5))\n", - "\n", - "# get specific pod log\n", - "# js_endpoint.get_logs(pod='pod-name')" + "print(s3_endpoint.get_operator_logs(since_hours=0.1))" ] }, { From dee74e295f8277cf1454178e5ad95dbc1ec77c07 Mon Sep 17 00:00:00 
2001 From: jzhaoqwa Date: Wed, 23 Jul 2025 16:23:53 -0700 Subject: [PATCH 2/2] Update readme --- README.md | 101 +++++++++--------- .../SDK/inference-s3-model-e2e.ipynb | 34 +----- 2 files changed, 50 insertions(+), 85 deletions(-) diff --git a/README.md b/README.md index 8086e8de..02d94c38 100644 --- a/README.md +++ b/README.md @@ -337,24 +337,21 @@ Pre-trained Jumpstart models can be gotten from https://sagemaker.readthedocs.io from sagemaker.hyperpod.inference.config.hp_jumpstart_endpoint_config import Model, Server, SageMakerEndpoint, TlsConfig from sagemaker.hyperpod.inference.hp_jumpstart_endpoint import HPJumpStartEndpoint -model = Model( - model_id="deepseek-llm-r1-distill-qwen-1-5b", - model_version="2.0.4" +model=Model( + model_id='deepseek-llm-r1-distill-qwen-1-5b', + model_version='2.0.4', ) - -server = Server( - instance_type="ml.g5.8xlarge" +server=Server( + instance_type='ml.g5.8xlarge', ) +endpoint_name=SageMakerEndpoint(name='') +tls_config=TlsConfig(tls_certificate_output_s3_uri='s3://') -endpoint_name = SageMakerEndpoint(name="endpoint-jumpstart") - -tls_config = TlsConfig(tls_certificate_output_s3_uri="s3://sample-bucket") - -js_endpoint = HPJumpStartEndpoint( +js_endpoint=HPJumpStartEndpoint( model=model, server=server, sage_maker_endpoint=endpoint_name, - tls_config=tls_config + tls_config=tls_config, ) js_endpoint.create() @@ -370,51 +367,51 @@ print(response) ``` -#### Creating a Custom Inference Endpoint +#### Creating a Custom Inference Endpoint (with S3) ``` -from sagemaker.hyperpod.inference.config.hp_custom_endpoint_config import Model, Server, SageMakerEndpoint, TlsConfig, EnvironmentVariables -from sagemaker.hyperpod.inference.hp_custom_endpoint import HPCustomEndpoint +from sagemaker.hyperpod.inference.config.hp_endpoint_config import CloudWatchTrigger, Dimensions, AutoScalingSpec, Metrics, S3Storage, ModelSourceConfig, TlsConfig, EnvironmentVariables, ModelInvocationPort, ModelVolumeMount, Resources, Worker +from 
sagemaker.hyperpod.inference.hp_endpoint import HPEndpoint -model = Model( - model_source_type="s3", - model_location="test-pytorch-job/model.tar.gz", - s3_bucket_name="my-bucket", - s3_region="us-east-2", - prefetch_enabled=True +model_source_config = ModelSourceConfig( + model_source_type='s3', + model_location="", + s3_storage=S3Storage( + bucket_name='', + region='us-east-2', + ), ) -server = Server( - instance_type="ml.g5.8xlarge", - image_uri="763104351884.dkr.ecr.us-east-2.amazonaws.com/huggingface-pytorch-tgi-inference:2.4.0-tgi2.3.1-gpu-py311-cu124-ubuntu22.04-v2.0", - container_port=8080, - model_volume_mount_name="model-weights" -) +environment_variables = [ + EnvironmentVariables(name="HF_MODEL_ID", value="/opt/ml/model"), + EnvironmentVariables(name="SAGEMAKER_PROGRAM", value="inference.py"), + EnvironmentVariables(name="SAGEMAKER_SUBMIT_DIRECTORY", value="/opt/ml/model/code"), + EnvironmentVariables(name="MODEL_CACHE_ROOT", value="/opt/ml/model"), + EnvironmentVariables(name="SAGEMAKER_ENV", value="1"), +] -resources = { - "requests": {"cpu": "30000m", "nvidia.com/gpu": 1, "memory": "100Gi"}, - "limits": {"nvidia.com/gpu": 1} -} - -env = EnvironmentVariables( - HF_MODEL_ID="/opt/ml/model", - SAGEMAKER_PROGRAM="inference.py", - SAGEMAKER_SUBMIT_DIRECTORY="/opt/ml/model/code", - MODEL_CACHE_ROOT="/opt/ml/model", - SAGEMAKER_ENV="1" +worker = Worker( + image='763104351884.dkr.ecr.us-east-2.amazonaws.com/huggingface-pytorch-tgi-inference:2.4.0-tgi2.3.1-gpu-py311-cu124-ubuntu22.04-v2.0', + model_volume_mount=ModelVolumeMount( + name='model-weights', + ), + model_invocation_port=ModelInvocationPort(container_port=8080), + resources=Resources( + requests={"cpu": "30000m", "nvidia.com/gpu": 1, "memory": "100Gi"}, + limits={"nvidia.com/gpu": 1} + ), + environment_variables=environment_variables, ) -endpoint_name = SageMakerEndpoint(name="endpoint-custom-pytorch") - -tls_config = TlsConfig(tls_certificate_output_s3_uri="s3://sample-bucket") 
+tls_config=TlsConfig(tls_certificate_output_s3_uri='s3://') -custom_endpoint = HPCustomEndpoint( - model=model, - server=server, - resources=resources, - environment=env, - sage_maker_endpoint=endpoint_name, +custom_endpoint = HPEndpoint( + endpoint_name='', + instance_type='ml.g5.8xlarge', + model_name='deepseek15b-test-model-name', tls_config=tls_config, + model_source_config=model_source_config, + worker=worker, ) custom_endpoint.create() @@ -431,19 +428,17 @@ print(response) #### Managing an Endpoint ``` -endpoint_iterator = HPJumpStartEndpoint.list() -for endpoint in endpoint_iterator: - print(endpoint.name, endpoint.status) +endpoint_list = HPEndpoint.list() +print(endpoint_list[0]) -logs = js_endpoint.get_logs() -print(logs) +print(custom_endpoint.get_operator_logs(since_hours=0.5)) ``` #### Deleting an Endpoint ``` -js_endpoint.delete() +custom_endpoint.delete() ``` diff --git a/examples/inference/SDK/inference-s3-model-e2e.ipynb b/examples/inference/SDK/inference-s3-model-e2e.ipynb index 145f37f4..79810c39 100644 --- a/examples/inference/SDK/inference-s3-model-e2e.ipynb +++ b/examples/inference/SDK/inference-s3-model-e2e.ipynb @@ -29,7 +29,7 @@ "metadata": {}, "outputs": [], "source": [ - "from sagemaker.hyperpod.inference.config.hp_endpoint_config import CloudWatchTrigger, Dimensions, PrometheusTrigger, AutoScalingSpec, ModelMetrics, Metrics, FsxStorage, S3Storage, ModelSourceConfig, Tags, TlsConfig, ConfigMapKeyRef, FieldRef, ResourceFieldRef, SecretKeyRef, ValueFrom, EnvironmentVariables, ModelInvocationPort, ModelVolumeMount, Claims, Resources, Worker\n", + "from sagemaker.hyperpod.inference.config.hp_endpoint_config import CloudWatchTrigger, Dimensions, AutoScalingSpec, Metrics, S3Storage, ModelSourceConfig, TlsConfig, EnvironmentVariables, ModelInvocationPort, ModelVolumeMount, Resources, Worker\n", "from sagemaker.hyperpod.inference.hp_endpoint import HPEndpoint\n", "import yaml\n", "import time" @@ -72,35 +72,7 @@ " limits={\"nvidia.com/gpu\": 
1}\n", " ),\n", " environment_variables=environment_variables,\n", - ")\n", - "\n", - "# Create dimensions\n", - "dimensions = [\n", - " Dimensions(name=\"EndpointName\", value=\"\"),\n", - " Dimensions(name=\"VariantName\", value=\"AllTraffic\")\n", - "]\n", - "\n", - "# Create CloudWatch trigger\n", - "cloudwatch_trigger = CloudWatchTrigger(\n", - " dimensions=dimensions,\n", - " metric_collection_period=30,\n", - " metric_name=\"Invocations\",\n", - " metric_stat=\"Sum\",\n", - " metric_type=\"Average\",\n", - " min_value=0.0,\n", - " name=\"SageMaker-Invocations\",\n", - " namespace=\"AWS/SageMaker\",\n", - " target_value=10,\n", - " use_cached_metrics=False\n", - ")\n", - "\n", - "# Create autoscaling spec\n", - "auto_scaling_spec = AutoScalingSpec(\n", - " cloud_watch_trigger=cloudwatch_trigger\n", - ")\n", - "\n", - "# Create metrics\n", - "metrics = Metrics(enabled=True)" + ")" ] }, { @@ -117,8 +89,6 @@ " tls_config=tls_config,\n", " model_source_config=model_source_config,\n", " worker=worker,\n", - " auto_scaling_spec=auto_scaling_spec,\n", - " metrics=metrics,\n", ")" ] },