From 4001fa2a992ed4d9ad362fcbd12b393a39ba9b05 Mon Sep 17 00:00:00 2001 From: liyuan Date: Wed, 2 Nov 2022 17:54:50 +0800 Subject: [PATCH 01/31] update gpu agent Signed-off-by: liyuan --- gpu/install_gpu_driver.sh | 31 ++++++++++++++++++++++++++++++- 1 file changed, 30 insertions(+), 1 deletion(-) diff --git a/gpu/install_gpu_driver.sh b/gpu/install_gpu_driver.sh index 28ba6c00d..ab0cbfcc8 100644 --- a/gpu/install_gpu_driver.sh +++ b/gpu/install_gpu_driver.sh @@ -309,7 +309,7 @@ function install_nvidia_gpu_driver() { } # Collects 'gpu_utilization' and 'gpu_memory_utilization' metrics -function install_gpu_agent() { +function install_gpu_agent_bak() { if ! command -v pip; then execute_with_retries "apt-get install -y -q python-pip" fi @@ -344,6 +344,35 @@ EOF systemctl --no-reload --now enable gpu-utilization-agent.service } +function install_gpu_agent() { + downloading_agent + installing_agent_dependency + starting_agent_service +} + +function downloading_agent(){ + sudo apt-get install git -y + sudo mkdir -p /opt/google + cd /opt/google + sudo git clone https://github.com/GoogleCloudPlatform/compute-gpu-monitoring.git +} + +function installing_agent_dependency(){ + cd /opt/google/compute-gpu-monitoring/linux + sudo apt-get install python3.8-venv python3.8-dev -y + sudo python3.8 -m venv venv + sudo venv/bin/pip install wheel + cd /opt/google/compute-gpu-monitoring/linux/venv/bin + sudo ln -sf /usr/bin/python3.8 python3 + sudo venv/bin/pip install -Ur requirements.txt +} + +function starting_agent_service(){ + sudo cp /opt/google/compute-gpu-monitoring/linux/systemd/google_gpu_monitoring_agent_venv.service /lib/systemd/system + sudo systemctl daemon-reload + sudo systemctl --no-reload --now enable /lib/systemd/system/google_gpu_monitoring_agent_venv.service +} + function set_hadoop_property() { local -r config_file=$1 local -r property=$2 From 5e249b02994e2531766bdcae476165c3d300ede7 Mon Sep 17 00:00:00 2001 From: liyuan Date: Wed, 2 Nov 2022 17:59:29 +0800 Subject: 
[PATCH 02/31] update gpu agent Signed-off-by: liyuan --- gpu/install_gpu_driver.sh | 35 ----------------------------------- 1 file changed, 35 deletions(-) diff --git a/gpu/install_gpu_driver.sh b/gpu/install_gpu_driver.sh index ab0cbfcc8..2c5a9d4d7 100644 --- a/gpu/install_gpu_driver.sh +++ b/gpu/install_gpu_driver.sh @@ -309,41 +309,6 @@ function install_nvidia_gpu_driver() { } # Collects 'gpu_utilization' and 'gpu_memory_utilization' metrics -function install_gpu_agent_bak() { - if ! command -v pip; then - execute_with_retries "apt-get install -y -q python-pip" - fi - local install_dir=/opt/gpu-utilization-agent - mkdir -p "${install_dir}" - curl -fsSL --retry-connrefused --retry 10 --retry-max-time 30 \ - "${GPU_AGENT_REPO_URL}/requirements.txt" -o "${install_dir}/requirements.txt" - curl -fsSL --retry-connrefused --retry 10 --retry-max-time 30 \ - "${GPU_AGENT_REPO_URL}/report_gpu_metrics.py" -o "${install_dir}/report_gpu_metrics.py" - pip install -r "${install_dir}/requirements.txt" - - # Generate GPU service. 
- cat </lib/systemd/system/gpu-utilization-agent.service -[Unit] -Description=GPU Utilization Metric Agent - -[Service] -Type=simple -PIDFile=/run/gpu_agent.pid -ExecStart=/bin/bash --login -c 'python "${install_dir}/report_gpu_metrics.py"' -User=root -Group=root -WorkingDirectory=/ -Restart=always - -[Install] -WantedBy=multi-user.target -EOF - # Reload systemd manager configuration - systemctl daemon-reload - # Enable gpu-utilization-agent service - systemctl --no-reload --now enable gpu-utilization-agent.service -} - function install_gpu_agent() { downloading_agent installing_agent_dependency From 4293cd98367b6a2f14bd92ac3c13766a5a383e7c Mon Sep 17 00:00:00 2001 From: liyuan Date: Thu, 3 Nov 2022 11:48:13 +0800 Subject: [PATCH 03/31] rename folders and scripts and add agent for ubuntu and debian image Signed-off-by: liyuan --- BUILD | 2 +- gpu/install_gpu_driver.sh | 25 ++++++++++++------- {sparkRapids => spark-rapids}/BUILD | 4 +-- {sparkRapids => spark-rapids}/README.md | 2 +- {sparkRapids => spark-rapids}/__init__.py | 0 {sparkRapids => spark-rapids}/spark-rapids.sh | 0 .../test_spark_rapids.py | 0 .../verify_xgboost_spark_rapids.scala | 0 8 files changed, 20 insertions(+), 13 deletions(-) rename {sparkRapids => spark-rapids}/BUILD (84%) rename {sparkRapids => spark-rapids}/README.md (98%) rename {sparkRapids => spark-rapids}/__init__.py (100%) rename {sparkRapids => spark-rapids}/spark-rapids.sh (100%) rename sparkRapids/test_sparkRapids.py => spark-rapids/test_spark_rapids.py (100%) rename {sparkRapids => spark-rapids}/verify_xgboost_spark_rapids.scala (100%) diff --git a/BUILD b/BUILD index 94c38a59b..93bfb7f94 100644 --- a/BUILD +++ b/BUILD @@ -8,6 +8,7 @@ test_suite( ":test_hive_hcatalog", ":test_hive_llap", ":test_starburst_presto", + ":test_spark_rapids", "//alluxio:test_alluxio", "//atlas:test_atlas", "//bigtable:test_bigtable", @@ -34,7 +35,6 @@ test_suite( "//solr:test_solr", "//sqoop:test_sqoop", "//tony:test_tony", - 
"//sparkRapids:test_sparkRapids", ], ) diff --git a/gpu/install_gpu_driver.sh b/gpu/install_gpu_driver.sh index 2c5a9d4d7..a13024e90 100644 --- a/gpu/install_gpu_driver.sh +++ b/gpu/install_gpu_driver.sh @@ -316,20 +316,27 @@ function install_gpu_agent() { } function downloading_agent(){ - sudo apt-get install git -y + execute_with_retries "sudo apt-get install git -y" sudo mkdir -p /opt/google cd /opt/google - sudo git clone https://github.com/GoogleCloudPlatform/compute-gpu-monitoring.git + execute_with_retries "sudo git clone https://github.com/GoogleCloudPlatform/compute-gpu-monitoring.git" } function installing_agent_dependency(){ - cd /opt/google/compute-gpu-monitoring/linux - sudo apt-get install python3.8-venv python3.8-dev -y - sudo python3.8 -m venv venv - sudo venv/bin/pip install wheel - cd /opt/google/compute-gpu-monitoring/linux/venv/bin - sudo ln -sf /usr/bin/python3.8 python3 - sudo venv/bin/pip install -Ur requirements.txt + if [[ ${OS_NAME} == debian ]]; then + execute_with_retries "sudo apt-get install python3-venv python3-dev -y" + sudo python3 -m venv venv + execute_with_retries "sudo venv/bin/pip install wheel" + execute_with_retries "sudo venv/bin/pip install -Ur requirements.txt" + elif [[ ${OS_NAME} == ubuntu ]]; then + execute_with_retries "sudo apt-get install python3.8-venv python3.8-dev -y" + sudo python3.8 -m venv venv + execute_with_retries "sudo venv/bin/pip install wheel setuptools" + cd /opt/google/compute-gpu-monitoring/linux/venv/bin + sudo ln -sf /opt/conda/default/bin/python3 python3 + cd /opt/google/compute-gpu-monitoring/linux + execute_with_retries "sudo venv/bin/pip install -Ur requirements.txt" + fi } function starting_agent_service(){ diff --git a/sparkRapids/BUILD b/spark-rapids/BUILD similarity index 84% rename from sparkRapids/BUILD rename to spark-rapids/BUILD index ccc1004e0..04a43fc5b 100644 --- a/sparkRapids/BUILD +++ b/spark-rapids/BUILD @@ -3,9 +3,9 @@ package(default_visibility = ["//visibility:public"]) 
exports_files(["spark-rapids.sh"]) py_test( - name = "test_sparkRapids", + name = "test_spark_rapids", size = "enormous", - srcs = ["test_sparkRapids.py"], + srcs = ["test_spark_rapids.py"], data = [ "spark-rapids.sh", "verify_xgboost_spark_rapids.scala", diff --git a/sparkRapids/README.md b/spark-rapids/README.md similarity index 98% rename from sparkRapids/README.md rename to spark-rapids/README.md index 875fa6f87..ae4ede6fa 100644 --- a/sparkRapids/README.md +++ b/spark-rapids/README.md @@ -67,7 +67,7 @@ gcloud dataproc clusters create $CLUSTER_NAME \ --worker-accelerator type=nvidia-tesla-t4,count=$NUM_GPUS \ --worker-machine-type n1-standard-8 \ --num-worker-local-ssds 1 \ - --initialization-actions gs://goog-dataproc-initialization-actions-${REGION}/sparkRapids/spark-rapids.sh \ + --initialization-actions gs://goog-dataproc-initialization-actions-${REGION}/spark-rapids/spark-rapids.sh \ --optional-components=JUPYTER,ZEPPELIN \ --metadata gpu-driver-provider="NVIDIA",rapids-runtime="SPARK",cuda-version="$CUDA_VER" \ --bucket $GCS_BUCKET \ diff --git a/sparkRapids/__init__.py b/spark-rapids/__init__.py similarity index 100% rename from sparkRapids/__init__.py rename to spark-rapids/__init__.py diff --git a/sparkRapids/spark-rapids.sh b/spark-rapids/spark-rapids.sh similarity index 100% rename from sparkRapids/spark-rapids.sh rename to spark-rapids/spark-rapids.sh diff --git a/sparkRapids/test_sparkRapids.py b/spark-rapids/test_spark_rapids.py similarity index 100% rename from sparkRapids/test_sparkRapids.py rename to spark-rapids/test_spark_rapids.py diff --git a/sparkRapids/verify_xgboost_spark_rapids.scala b/spark-rapids/verify_xgboost_spark_rapids.scala similarity index 100% rename from sparkRapids/verify_xgboost_spark_rapids.scala rename to spark-rapids/verify_xgboost_spark_rapids.scala From 6372e430d376b421908371a22b332607fbbff036 Mon Sep 17 00:00:00 2001 From: liyuan Date: Mon, 7 Nov 2022 14:21:49 +0800 Subject: [PATCH 04/31] update reademe and fix error 
with agent on debian10 and ubuntu18 Signed-off-by: liyuan --- gpu/install_gpu_driver.sh | 25 ++++++------------ spark-rapids/README.md | 21 +++++++++++++-- spark-rapids/spark-rapids.sh | 50 +++++++++++++++--------------------- 3 files changed, 48 insertions(+), 48 deletions(-) diff --git a/gpu/install_gpu_driver.sh b/gpu/install_gpu_driver.sh index bd6ac4f2f..6b71706b2 100644 --- a/gpu/install_gpu_driver.sh +++ b/gpu/install_gpu_driver.sh @@ -318,31 +318,22 @@ function install_gpu_agent() { function downloading_agent(){ execute_with_retries "sudo apt-get install git -y" sudo mkdir -p /opt/google + sudo chmod 777 /opt/google cd /opt/google - execute_with_retries "sudo git clone https://github.com/GoogleCloudPlatform/compute-gpu-monitoring.git" + execute_with_retries "git clone https://github.com/GoogleCloudPlatform/compute-gpu-monitoring.git" } function installing_agent_dependency(){ - if [[ ${OS_NAME} == debian ]]; then - execute_with_retries "sudo apt-get install python3-venv python3-dev -y" - sudo python3 -m venv venv - execute_with_retries "sudo venv/bin/pip install wheel" - execute_with_retries "sudo venv/bin/pip install -Ur requirements.txt" - elif [[ ${OS_NAME} == ubuntu ]]; then - execute_with_retries "sudo apt-get install python3.8-venv python3.8-dev -y" - sudo python3.8 -m venv venv - execute_with_retries "sudo venv/bin/pip install wheel setuptools" - cd /opt/google/compute-gpu-monitoring/linux/venv/bin - sudo ln -sf /opt/conda/default/bin/python3 python3 - cd /opt/google/compute-gpu-monitoring/linux - execute_with_retries "sudo venv/bin/pip install -Ur requirements.txt" - fi + cd /opt/google/compute-gpu-monitoring/linux + python3 -m venv venv + venv/bin/pip install wheel + venv/bin/pip install -Ur requirements.txt } function starting_agent_service(){ sudo cp /opt/google/compute-gpu-monitoring/linux/systemd/google_gpu_monitoring_agent_venv.service /lib/systemd/system - sudo systemctl daemon-reload - sudo systemctl --no-reload --now enable 
/lib/systemd/system/google_gpu_monitoring_agent_venv.service + systemctl daemon-reload + systemctl --no-reload --now enable /lib/systemd/system/google_gpu_monitoring_agent_venv.service } function set_hadoop_property() { diff --git a/spark-rapids/README.md b/spark-rapids/README.md index ae4ede6fa..920eb441f 100644 --- a/spark-rapids/README.md +++ b/spark-rapids/README.md @@ -28,7 +28,7 @@ To use RAPIDS Accelerator For Apache Spark, XGBoost4j with Spark 3 * NVIDIA GPU driver 440.33+ * CUDA v11.5/v11.0/v10.2/v10.1 * NCCL 2.11.4+ - * Ubuntu 18.04, Ubuntu 20.04 or Rocky Linux 7, Rocky Linux8, Debian 10 + * Ubuntu 18.04, Ubuntu 20.04 or Rocky Linux 7, Rocky Linux8, Debian 10, Debian 11 This section describes how to create [Google Cloud Dataproc](https://cloud.google.com/dataproc) cluster with @@ -114,4 +114,21 @@ In some releases, you might not see that due to AQE has not finalized the plan. Or go to the Spark UI and click on the application you ran and on the "SQL" tab. If you click the operation "count at ...", you should see the graph of Spark -Executors and some of those should have the "GPU" label as well. \ No newline at end of file +Executors and some of those should have the "GPU" label as well. + +If you want to monitor GPU metrics on Dataproc, you can create the cluster with additional configs: +``` +--metadata install-gpu-agent="true" +--scopes monitoring +``` +Then you can monitor following metrics on [Web UI](https://console.cloud.google.com/monitoring/metrics-explorer), +we should be able to see "Resource & Metric" -> "VM Instance" -> "Custom": +* **custom.googleapis.com/instance/gpu/utilization** - The GPU cores utilization in %. +* **custom.googleapis.com/instance/gpu/memory_utilization** - The GPU memory bandwidth utilization in %. +* **custom.googleapis.com/instance/gpu/memory_total** - Total memory of the GPU card in MB. +* **custom.googleapis.com/instance/gpu/memory_used** - Used memory of the GPU card. 
+* **custom.googleapis.com/instance/gpu/memory_free** - Available memory of the GPU card. +* **custom.googleapis.com/instance/gpu/temperature** - Temperature of the GPU. +The metrics are sent with attached label, marking them by the gpu_type and gpu_bus_id. +This way, instances with multiple GPUs attached can report the metrics of their cards separately. +You can later aggregate or filter those metrics in the Cloud Monitoring systems. \ No newline at end of file diff --git a/spark-rapids/spark-rapids.sh b/spark-rapids/spark-rapids.sh index 948bb645c..3d9c58649 100644 --- a/spark-rapids/spark-rapids.sh +++ b/spark-rapids/spark-rapids.sh @@ -212,38 +212,30 @@ function install_nvidia_gpu_driver() { # Collects 'gpu_utilization' and 'gpu_memory_utilization' metrics function install_gpu_agent() { - if ! command -v pip; then - execute_with_retries "apt-get install -y -q python-pip" - fi - local install_dir=/opt/gpu-utilization-agent - mkdir "${install_dir}" - curl -fsSL --retry-connrefused --retry 10 --retry-max-time 30 \ - "${GPU_AGENT_REPO_URL}/requirements.txt" -o "${install_dir}/requirements.txt" - curl -fsSL --retry-connrefused --retry 10 --retry-max-time 30 \ - "${GPU_AGENT_REPO_URL}/report_gpu_metrics.py" -o "${install_dir}/report_gpu_metrics.py" - pip install -r "${install_dir}/requirements.txt" - - # Generate GPU service. 
- cat </lib/systemd/system/gpu-utilization-agent.service -[Unit] -Description=GPU Utilization Metric Agent + downloading_agent + installing_agent_dependency + starting_agent_service +} -[Service] -Type=simple -PIDFile=/run/gpu_agent.pid -ExecStart=/bin/bash --login -c 'python "${install_dir}/report_gpu_metrics.py"' -User=root -Group=root -WorkingDirectory=/ -Restart=always +function downloading_agent(){ + execute_with_retries "sudo apt-get install git -y" + sudo mkdir -p /opt/google + sudo chmod 777 /opt/google + cd /opt/google + execute_with_retries "git clone https://github.com/GoogleCloudPlatform/compute-gpu-monitoring.git" +} -[Install] -WantedBy=multi-user.target -EOF - # Reload systemd manager configuration +function installing_agent_dependency(){ + cd /opt/google/compute-gpu-monitoring/linux + python3 -m venv venv + venv/bin/pip install wheel + venv/bin/pip install -Ur requirements.txt +} + +function starting_agent_service(){ + sudo cp /opt/google/compute-gpu-monitoring/linux/systemd/google_gpu_monitoring_agent_venv.service /lib/systemd/system systemctl daemon-reload - # Enable gpu-utilization-agent service - systemctl --no-reload --now enable gpu-utilization-agent.service + systemctl --no-reload --now enable /lib/systemd/system/google_gpu_monitoring_agent_venv.service } function set_hadoop_property() { From 4352cff767647d99226dbf5a4d55be47408fb636 Mon Sep 17 00:00:00 2001 From: liyuan Date: Mon, 7 Nov 2022 16:26:56 +0800 Subject: [PATCH 05/31] update reademe and fix error with agent on debian10 and ubuntu18 Signed-off-by: liyuan --- spark-rapids/README.md | 4 +++- spark-rapids/spark-rapids.sh | 8 ++++---- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/spark-rapids/README.md b/spark-rapids/README.md index 920eb441f..4702a5471 100644 --- a/spark-rapids/README.md +++ b/spark-rapids/README.md @@ -116,7 +116,9 @@ Or go to the Spark UI and click on the application you ran and on the "SQL" tab. 
If you click the operation "count at ...", you should see the graph of Spark Executors and some of those should have the "GPU" label as well. -If you want to monitor GPU metrics on Dataproc, you can create the cluster with additional configs: +If you want to monitor GPU metrics on Dataproc, you can create the cluster with additional +[metadata](https://cloud.google.com/dataproc/docs/concepts/configuring-clusters/metadata) and +[scopes](https://cloud.google.com/sdk/gcloud/reference/dataproc/clusters/create#--scopes): ``` --metadata install-gpu-agent="true" --scopes monitoring diff --git a/spark-rapids/spark-rapids.sh b/spark-rapids/spark-rapids.sh index 3d9c58649..039859e67 100644 --- a/spark-rapids/spark-rapids.sh +++ b/spark-rapids/spark-rapids.sh @@ -12,11 +12,11 @@ OS_NAME=$(lsb_release -is | tr '[:upper:]' '[:lower:]') readonly OS_NAME readonly SPARK_VERSION_ENV=$(spark-submit --version 2>&1 | sed -n 's/.*version[[:blank:]]\+\([0-9]\+\.[0-9]\).*/\1/p' | head -n1) -readonly DEFAULT_SPARK_RAPIDS_VERSION="22.08.0" +readonly DEFAULT_SPARK_RAPIDS_VERSION="22.10.0" if [[ "${SPARK_VERSION_ENV}" == "3"* ]]; then readonly DEFAULT_CUDA_VERSION="11.5" - readonly DEFAULT_CUDF_VERSION="22.08.0" + readonly DEFAULT_CUDF_VERSION="22.10.0" readonly DEFAULT_XGBOOST_VERSION="1.6.2" readonly DEFAULT_XGBOOST_GPU_SUB_VERSION="0.3.0" readonly SPARK_VERSION="3.0" @@ -300,8 +300,8 @@ function configure_gpu_exclusive_mode() { function fetch_mig_scripts() { mkdir -p /usr/local/yarn-mig-scripts sudo chmod 755 /usr/local/yarn-mig-scripts - wget -P /usr/local/yarn-mig-scripts/ https://raw.githubusercontent.com/NVIDIA/spark-rapids-examples/branch-22.08/examples/MIG-Support/yarn-unpatched/scripts/nvidia-smi - wget -P /usr/local/yarn-mig-scripts/ https://raw.githubusercontent.com/NVIDIA/spark-rapids-examples/branch-22.08/examples/MIG-Support/yarn-unpatched/scripts/mig2gpu.sh + wget -P /usr/local/yarn-mig-scripts/ 
https://raw.githubusercontent.com/NVIDIA/spark-rapids-examples/branch-22.10/examples/MIG-Support/yarn-unpatched/scripts/nvidia-smi + wget -P /usr/local/yarn-mig-scripts/ https://raw.githubusercontent.com/NVIDIA/spark-rapids-examples/branch-22.10/examples/MIG-Support/yarn-unpatched/scripts/mig2gpu.sh sudo chmod 755 /usr/local/yarn-mig-scripts/* } From 3ef29449eb89cd009785180d72cd1046c3d25300 Mon Sep 17 00:00:00 2001 From: liyuan Date: Mon, 7 Nov 2022 16:30:48 +0800 Subject: [PATCH 06/31] suport customized driver-version in metadata Signed-off-by: liyuan --- spark-rapids/spark-rapids.sh | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/spark-rapids/spark-rapids.sh b/spark-rapids/spark-rapids.sh index 039859e67..0519b9767 100644 --- a/spark-rapids/spark-rapids.sh +++ b/spark-rapids/spark-rapids.sh @@ -35,9 +35,10 @@ readonly RUNTIME=$(get_metadata_attribute 'rapids-runtime' 'SPARK') # CUDA version and Driver version config CUDA_VERSION=$(get_metadata_attribute 'cuda-version' '11.5') -readonly DEFAULT_NVIDIA_DEBIAN_GPU_DRIVER_VERSION='495.29.05' +DEFAULT_NVIDIA_DEBIAN_GPU_DRIVER_VERSION=$(get_metadata_attribute 'driver-version' '495.29.05') readonly CUDA_VERSION +readonly DEFAULT_NVIDIA_DEBIAN_GPU_DRIVER_VERSION readonly DEFAULT_NVIDIA_DEBIAN_GPU_DRIVER_VERSION_PREFIX=${DEFAULT_NVIDIA_DEBIAN_GPU_DRIVER_VERSION%%.*} # Parameters for NVIDIA-provided Debian GPU driver From b761e0ce704bf353bdcb9877687d1577a108b456 Mon Sep 17 00:00:00 2001 From: liyuan Date: Wed, 9 Nov 2022 11:54:09 +0800 Subject: [PATCH 07/31] rename some functions Signed-off-by: liyuan --- gpu/install_gpu_driver.sh | 12 ++++++------ spark-rapids/README.md | 2 +- spark-rapids/spark-rapids.sh | 12 ++++++------ 3 files changed, 13 insertions(+), 13 deletions(-) diff --git a/gpu/install_gpu_driver.sh b/gpu/install_gpu_driver.sh index 6b71706b2..d13290ec7 100644 --- a/gpu/install_gpu_driver.sh +++ b/gpu/install_gpu_driver.sh @@ -310,12 +310,12 @@ function install_nvidia_gpu_driver() { # 
Collects 'gpu_utilization' and 'gpu_memory_utilization' metrics function install_gpu_agent() { - downloading_agent - installing_agent_dependency - starting_agent_service + download_agent + install_agent_dependency + start_agent_service } -function downloading_agent(){ +function download_agent(){ execute_with_retries "sudo apt-get install git -y" sudo mkdir -p /opt/google sudo chmod 777 /opt/google @@ -323,14 +323,14 @@ function downloading_agent(){ execute_with_retries "git clone https://github.com/GoogleCloudPlatform/compute-gpu-monitoring.git" } -function installing_agent_dependency(){ +function install_agent_dependency(){ cd /opt/google/compute-gpu-monitoring/linux python3 -m venv venv venv/bin/pip install wheel venv/bin/pip install -Ur requirements.txt } -function starting_agent_service(){ +function start_agent_service(){ sudo cp /opt/google/compute-gpu-monitoring/linux/systemd/google_gpu_monitoring_agent_venv.service /lib/systemd/system systemctl daemon-reload systemctl --no-reload --now enable /lib/systemd/system/google_gpu_monitoring_agent_venv.service diff --git a/spark-rapids/README.md b/spark-rapids/README.md index 4702a5471..1b087792c 100644 --- a/spark-rapids/README.md +++ b/spark-rapids/README.md @@ -123,7 +123,7 @@ If you want to monitor GPU metrics on Dataproc, you can create the cluster with --metadata install-gpu-agent="true" --scopes monitoring ``` -Then you can monitor following metrics on [Web UI](https://console.cloud.google.com/monitoring/metrics-explorer), +You can then monitor the following metrics on [Web UI](https://console.cloud.google.com/monitoring/metrics-explorer), we should be able to see "Resource & Metric" -> "VM Instance" -> "Custom": * **custom.googleapis.com/instance/gpu/utilization** - The GPU cores utilization in %. * **custom.googleapis.com/instance/gpu/memory_utilization** - The GPU memory bandwidth utilization in %. 
diff --git a/spark-rapids/spark-rapids.sh b/spark-rapids/spark-rapids.sh index 0519b9767..613365a21 100644 --- a/spark-rapids/spark-rapids.sh +++ b/spark-rapids/spark-rapids.sh @@ -213,12 +213,12 @@ function install_nvidia_gpu_driver() { # Collects 'gpu_utilization' and 'gpu_memory_utilization' metrics function install_gpu_agent() { - downloading_agent - installing_agent_dependency - starting_agent_service + download_agent + install_agent_dependency + start_agent_service } -function downloading_agent(){ +function download_agent(){ execute_with_retries "sudo apt-get install git -y" sudo mkdir -p /opt/google sudo chmod 777 /opt/google @@ -226,14 +226,14 @@ function downloading_agent(){ execute_with_retries "git clone https://github.com/GoogleCloudPlatform/compute-gpu-monitoring.git" } -function installing_agent_dependency(){ +function install_agent_dependency(){ cd /opt/google/compute-gpu-monitoring/linux python3 -m venv venv venv/bin/pip install wheel venv/bin/pip install -Ur requirements.txt } -function starting_agent_service(){ +function start_agent_service(){ sudo cp /opt/google/compute-gpu-monitoring/linux/systemd/google_gpu_monitoring_agent_venv.service /lib/systemd/system systemctl daemon-reload systemctl --no-reload --now enable /lib/systemd/system/google_gpu_monitoring_agent_venv.service From c0b3bb4514cdb4eb0b78058c2215607810735983 Mon Sep 17 00:00:00 2001 From: liyuan Date: Wed, 9 Nov 2022 16:52:03 +0800 Subject: [PATCH 08/31] update the BUILD script Signed-off-by: liyuan --- BUILD | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/BUILD b/BUILD index 93bfb7f94..92223f231 100644 --- a/BUILD +++ b/BUILD @@ -8,7 +8,7 @@ test_suite( ":test_hive_hcatalog", ":test_hive_llap", ":test_starburst_presto", - ":test_spark_rapids", + "spark-rapids:test_spark_rapids", "//alluxio:test_alluxio", "//atlas:test_atlas", "//bigtable:test_bigtable", From 65b5c1486b85d3af886ff94e369d8a4c716f60e5 Mon Sep 17 00:00:00 2001 From: liyuan Date: Mon, 14 Nov 2022 
16:37:15 +0800 Subject: [PATCH 09/31] remove all sudos and verified it works Signed-off-by: liyuan --- gpu/install_gpu_driver.sh | 12 ++++++------ spark-rapids/spark-rapids.sh | 12 ++++++------ 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/gpu/install_gpu_driver.sh b/gpu/install_gpu_driver.sh index d13290ec7..5b95faf6f 100644 --- a/gpu/install_gpu_driver.sh +++ b/gpu/install_gpu_driver.sh @@ -316,9 +316,9 @@ function install_gpu_agent() { } function download_agent(){ - execute_with_retries "sudo apt-get install git -y" - sudo mkdir -p /opt/google - sudo chmod 777 /opt/google + execute_with_retries "apt-get install git -y" + mkdir -p /opt/google + chmod 777 /opt/google cd /opt/google execute_with_retries "git clone https://github.com/GoogleCloudPlatform/compute-gpu-monitoring.git" } @@ -331,7 +331,7 @@ function install_agent_dependency(){ } function start_agent_service(){ - sudo cp /opt/google/compute-gpu-monitoring/linux/systemd/google_gpu_monitoring_agent_venv.service /lib/systemd/system + cp /opt/google/compute-gpu-monitoring/linux/systemd/google_gpu_monitoring_agent_venv.service /lib/systemd/system systemctl daemon-reload systemctl --no-reload --now enable /lib/systemd/system/google_gpu_monitoring_agent_venv.service } @@ -397,10 +397,10 @@ function configure_gpu_exclusive_mode() { function fetch_mig_scripts() { mkdir -p /usr/local/yarn-mig-scripts - sudo chmod 755 /usr/local/yarn-mig-scripts + chmod 755 /usr/local/yarn-mig-scripts wget -P /usr/local/yarn-mig-scripts/ https://raw.githubusercontent.com/NVIDIA/spark-rapids-examples/branch-22.10/examples/MIG-Support/yarn-unpatched/scripts/nvidia-smi wget -P /usr/local/yarn-mig-scripts/ https://raw.githubusercontent.com/NVIDIA/spark-rapids-examples/branch-22.10/examples/MIG-Support/yarn-unpatched/scripts/mig2gpu.sh - sudo chmod 755 /usr/local/yarn-mig-scripts/* + chmod 755 /usr/local/yarn-mig-scripts/* } function configure_gpu_script() { diff --git a/spark-rapids/spark-rapids.sh 
b/spark-rapids/spark-rapids.sh index 613365a21..01deae877 100644 --- a/spark-rapids/spark-rapids.sh +++ b/spark-rapids/spark-rapids.sh @@ -219,9 +219,9 @@ function install_gpu_agent() { } function download_agent(){ - execute_with_retries "sudo apt-get install git -y" - sudo mkdir -p /opt/google - sudo chmod 777 /opt/google + execute_with_retries "apt-get install git -y" + mkdir -p /opt/google + chmod 777 /opt/google cd /opt/google execute_with_retries "git clone https://github.com/GoogleCloudPlatform/compute-gpu-monitoring.git" } @@ -234,7 +234,7 @@ function install_agent_dependency(){ } function start_agent_service(){ - sudo cp /opt/google/compute-gpu-monitoring/linux/systemd/google_gpu_monitoring_agent_venv.service /lib/systemd/system + cp /opt/google/compute-gpu-monitoring/linux/systemd/google_gpu_monitoring_agent_venv.service /lib/systemd/system systemctl daemon-reload systemctl --no-reload --now enable /lib/systemd/system/google_gpu_monitoring_agent_venv.service } @@ -300,10 +300,10 @@ function configure_gpu_exclusive_mode() { function fetch_mig_scripts() { mkdir -p /usr/local/yarn-mig-scripts - sudo chmod 755 /usr/local/yarn-mig-scripts + chmod 755 /usr/local/yarn-mig-scripts wget -P /usr/local/yarn-mig-scripts/ https://raw.githubusercontent.com/NVIDIA/spark-rapids-examples/branch-22.10/examples/MIG-Support/yarn-unpatched/scripts/nvidia-smi wget -P /usr/local/yarn-mig-scripts/ https://raw.githubusercontent.com/NVIDIA/spark-rapids-examples/branch-22.10/examples/MIG-Support/yarn-unpatched/scripts/mig2gpu.sh - sudo chmod 755 /usr/local/yarn-mig-scripts/* + chmod 755 /usr/local/yarn-mig-scripts/* } function configure_gpu_script() { From c7cea82948a50d33651cc82e71e0dfbb90416539 Mon Sep 17 00:00:00 2001 From: liyuan Date: Mon, 14 Nov 2022 16:44:59 +0800 Subject: [PATCH 10/31] add the condition of installing git on rocky Signed-off-by: liyuan --- gpu/install_gpu_driver.sh | 6 +++++- spark-rapids/spark-rapids.sh | 6 +++++- 2 files changed, 10 insertions(+), 2 
deletions(-) diff --git a/gpu/install_gpu_driver.sh b/gpu/install_gpu_driver.sh index 5b95faf6f..19fc28342 100644 --- a/gpu/install_gpu_driver.sh +++ b/gpu/install_gpu_driver.sh @@ -316,7 +316,11 @@ function install_gpu_agent() { } function download_agent(){ - execute_with_retries "apt-get install git -y" + if [[ ${OS_NAME} == rocky ]]; then + execute_with_retries "dnf -y -q install git" + else + execute_with_retries "apt-get install git -y" + fi mkdir -p /opt/google chmod 777 /opt/google cd /opt/google diff --git a/spark-rapids/spark-rapids.sh b/spark-rapids/spark-rapids.sh index 01deae877..92271ef36 100644 --- a/spark-rapids/spark-rapids.sh +++ b/spark-rapids/spark-rapids.sh @@ -219,7 +219,11 @@ function install_gpu_agent() { } function download_agent(){ - execute_with_retries "apt-get install git -y" + if [[ ${OS_NAME} == rocky ]]; then + execute_with_retries "dnf -y -q install git" + else + execute_with_retries "apt-get install git -y" + fi mkdir -p /opt/google chmod 777 /opt/google cd /opt/google From c08a65e0fa51cc4e1de3aa14106b3a55889e7ac5 Mon Sep 17 00:00:00 2001 From: liyuan Date: Tue, 15 Nov 2022 18:14:14 +0800 Subject: [PATCH 11/31] add rocky8 patch and revert the changes in BUILD Signed-off-by: liyuan --- BUILD | 2 +- spark-rapids/spark-rapids-rocky.sh | 483 +++++++++++++++++++++++++++++ spark-rapids/spark-rapids.sh | 32 +- 3 files changed, 498 insertions(+), 19 deletions(-) create mode 100644 spark-rapids/spark-rapids-rocky.sh diff --git a/BUILD b/BUILD index 92223f231..93bfb7f94 100644 --- a/BUILD +++ b/BUILD @@ -8,7 +8,7 @@ test_suite( ":test_hive_hcatalog", ":test_hive_llap", ":test_starburst_presto", - "spark-rapids:test_spark_rapids", + ":test_spark_rapids", "//alluxio:test_alluxio", "//atlas:test_atlas", "//bigtable:test_bigtable", diff --git a/spark-rapids/spark-rapids-rocky.sh b/spark-rapids/spark-rapids-rocky.sh new file mode 100644 index 000000000..099f7fc07 --- /dev/null +++ b/spark-rapids/spark-rapids-rocky.sh @@ -0,0 +1,483 @@ +#!/bin/bash 
+ +set -euxo pipefail + +function get_metadata_attribute() { + local -r attribute_name=$1 + local -r default_value=$2 + /usr/share/google/get_metadata_value "attributes/${attribute_name}" || echo -n "${default_value}" +} + +OS_NAME=$(lsb_release -is | tr '[:upper:]' '[:lower:]') +readonly OS_NAME + +readonly SPARK_VERSION_ENV=$(spark-submit --version 2>&1 | sed -n 's/.*version[[:blank:]]\+\([0-9]\+\.[0-9]\).*/\1/p' | head -n1) +readonly DEFAULT_SPARK_RAPIDS_VERSION="22.10.0" + +if [[ "${SPARK_VERSION_ENV}" == "3"* ]]; then + readonly DEFAULT_CUDA_VERSION="11.5" + readonly DEFAULT_CUDF_VERSION="22.10.0" + readonly DEFAULT_XGBOOST_VERSION="1.6.2" + readonly DEFAULT_XGBOOST_GPU_SUB_VERSION="0.3.0" + readonly SPARK_VERSION="3.0" +else + readonly DEFAULT_CUDA_VERSION="10.1" + readonly DEFAULT_CUDF_VERSION="0.9.2" + readonly DEFAULT_XGBOOST_VERSION="1.0.0" + readonly DEFAULT_XGBOOST_GPU_SUB_VERSION="Beta5" + readonly SPARK_VERSION="2.x" +fi + +readonly ROLE=$(/usr/share/google/get_metadata_value attributes/dataproc-role) +readonly MASTER=$(/usr/share/google/get_metadata_value attributes/dataproc-master) + +readonly RUNTIME=$(get_metadata_attribute 'rapids-runtime' 'SPARK') + +# CUDA version and Driver version config +CUDA_VERSION=$(get_metadata_attribute 'cuda-version' '11.5') +DEFAULT_NVIDIA_DEBIAN_GPU_DRIVER_VERSION=$(get_metadata_attribute 'driver-version' '495.29.05') + +readonly CUDA_VERSION +readonly DEFAULT_NVIDIA_DEBIAN_GPU_DRIVER_VERSION +readonly DEFAULT_NVIDIA_DEBIAN_GPU_DRIVER_VERSION_PREFIX=${DEFAULT_NVIDIA_DEBIAN_GPU_DRIVER_VERSION%%.*} + +# Parameters for NVIDIA-provided Debian GPU driver +readonly DEFAULT_NVIDIA_DEBIAN_GPU_DRIVER_URL="https://download.nvidia.com/XFree86/Linux-x86_64/${DEFAULT_NVIDIA_DEBIAN_GPU_DRIVER_VERSION}/NVIDIA-Linux-x86_64-${DEFAULT_NVIDIA_DEBIAN_GPU_DRIVER_VERSION}.run" +NVIDIA_DEBIAN_GPU_DRIVER_URL=$(get_metadata_attribute 'gpu-driver-url' "${DEFAULT_NVIDIA_DEBIAN_GPU_DRIVER_URL}") +readonly NVIDIA_DEBIAN_GPU_DRIVER_URL + 
+readonly NVIDIA_BASE_DL_URL='https://developer.download.nvidia.com/compute' + +# Parameters for NVIDIA-provided Debian GPU driver +readonly -A DEFAULT_NVIDIA_DEBIAN_CUDA_URLS=( + [10.1]="${NVIDIA_BASE_DL_URL}/cuda/10.1/Prod/local_installers/cuda_10.1.243_418.87.00_linux.run" + [10.2]="${NVIDIA_BASE_DL_URL}/cuda/10.2/Prod/local_installers/cuda_10.2.89_440.33.01_linux.run" + [11.0]="${NVIDIA_BASE_DL_URL}/cuda/11.0.3/local_installers/cuda_11.0.3_450.51.06_linux.run" + [11.1]="${NVIDIA_BASE_DL_URL}/cuda/11.1.0/local_installers/cuda_11.1.0_455.23.05_linux.run" + [11.2]="${NVIDIA_BASE_DL_URL}/cuda/11.2.2/local_installers/cuda_11.2.2_460.32.03_linux.run" + [11.5]="${NVIDIA_BASE_DL_URL}/cuda/11.5.2/local_installers/cuda_11.5.2_495.29.05_linux.run" + [11.6]="${NVIDIA_BASE_DL_URL}/cuda/11.6.2/local_installers/cuda_11.6.2_510.47.03_linux.run" + [11.7]="${NVIDIA_BASE_DL_URL}/cuda/11.7.1/local_installers/cuda_11.7.1_515.65.01_linux.run") +readonly DEFAULT_NVIDIA_DEBIAN_CUDA_URL=${DEFAULT_NVIDIA_DEBIAN_CUDA_URLS["${CUDA_VERSION}"]} +NVIDIA_DEBIAN_CUDA_URL=$(get_metadata_attribute 'cuda-url' "${DEFAULT_NVIDIA_DEBIAN_CUDA_URL}") +readonly NVIDIA_DEBIAN_CUDA_URL + +# Parameters for NVIDIA-provided Ubuntu GPU driver +readonly NVIDIA_UBUNTU_REPO_URL="${NVIDIA_BASE_DL_URL}/cuda/repos/ubuntu1804/x86_64" +readonly NVIDIA_UBUNTU_REPO_KEY_PACKAGE="${NVIDIA_UBUNTU_REPO_URL}/cuda-keyring_1.0-1_all.deb" +readonly NVIDIA_UBUNTU_REPO_CUDA_PIN="${NVIDIA_UBUNTU_REPO_URL}/cuda-ubuntu1804.pin" + +# Parameter for NVIDIA-provided Rocky Linux GPU driver +readonly NVIDIA_ROCKY_REPO_URL="${NVIDIA_BASE_DL_URL}/cuda/repos/rhel8/x86_64/cuda-rhel8.repo" + +# Whether to install NVIDIA-provided or OS-provided GPU driver +GPU_DRIVER_PROVIDER=$(get_metadata_attribute 'gpu-driver-provider' 'NVIDIA') +readonly GPU_DRIVER_PROVIDER + +# Stackdriver GPU agent parameters +readonly GPU_AGENT_REPO_URL='https://raw.githubusercontent.com/GoogleCloudPlatform/ml-on-gcp/master/dlvm/gcp-gpu-utilization-metrics' +# Whether 
to install GPU monitoring agent that sends GPU metrics to Stackdriver +INSTALL_GPU_AGENT=$(get_metadata_attribute 'install-gpu-agent' 'false') +readonly INSTALL_GPU_AGENT + +# Dataproc configurations +readonly HADOOP_CONF_DIR='/etc/hadoop/conf' +readonly HIVE_CONF_DIR='/etc/hive/conf' +readonly SPARK_CONF_DIR='/etc/spark/conf' + +NVIDIA_SMI_PATH='/usr/bin' +MIG_MAJOR_CAPS=0 +IS_MIG_ENABLED=0 + +# SPARK config +readonly SPARK_RAPIDS_VERSION=$(get_metadata_attribute 'spark-rapids-version' ${DEFAULT_SPARK_RAPIDS_VERSION}) +readonly XGBOOST_VERSION=$(get_metadata_attribute 'xgboost-version' ${DEFAULT_XGBOOST_VERSION}) +readonly XGBOOST_GPU_SUB_VERSION=$(get_metadata_attribute 'spark-gpu-sub-version' ${DEFAULT_XGBOOST_GPU_SUB_VERSION}) + +function execute_with_retries() { + local -r cmd=$1 + for ((i = 0; i < 10; i++)); do + if eval "$cmd"; then + return 0 + fi + sleep 5 + done + return 1 +} + +function install_spark_rapids() { + local -r rapids_repo_url='https://repo1.maven.org/maven2/ai/rapids' + local -r nvidia_repo_url='https://repo1.maven.org/maven2/com/nvidia' + local -r dmlc_repo_url='https://repo.maven.apache.org/maven2/ml/dmlc' + + # Convert . to - for URL formatting + local cudf_cuda_version="${CUDA_VERSION//\./-}" + + # There's only one release for all CUDA 11 versions + # The version formatting does not have a '.' 
+ if [[ ${cudf_cuda_version} == 11* ]]; then + cudf_cuda_version="11" + fi + + if [[ "${SPARK_VERSION}" == "3"* ]]; then + wget -nv --timeout=30 --tries=5 --retry-connrefused \ + "${dmlc_repo_url}/xgboost4j-spark-gpu_2.12/${XGBOOST_VERSION}/xgboost4j-spark-gpu_2.12-${XGBOOST_VERSION}.jar" \ + -P /usr/lib/spark/jars/ + wget -nv --timeout=30 --tries=5 --retry-connrefused \ + "${dmlc_repo_url}/xgboost4j-gpu_2.12/${XGBOOST_VERSION}/xgboost4j-gpu_2.12-${XGBOOST_VERSION}.jar" \ + -P /usr/lib/spark/jars/ + wget -nv --timeout=30 --tries=5 --retry-connrefused \ + "${nvidia_repo_url}/rapids-4-spark_2.12/${SPARK_RAPIDS_VERSION}/rapids-4-spark_2.12-${SPARK_RAPIDS_VERSION}.jar" \ + -P /usr/lib/spark/jars/ + else + wget -nv --timeout=30 --tries=5 --retry-connrefused \ + "${rapids_repo_url}/xgboost4j-spark_${SPARK_VERSION}/${XGBOOST_VERSION}-${XGBOOST_GPU_SUB_VERSION}/xgboost4j-spark_${SPARK_VERSION}-${XGBOOST_VERSION}-${XGBOOST_GPU_SUB_VERSION}.jar" \ + -P /usr/lib/spark/jars/ + wget -nv --timeout=30 --tries=5 --retry-connrefused \ + "${rapids_repo_url}/xgboost4j_${SPARK_VERSION}/${XGBOOST_VERSION}-${XGBOOST_GPU_SUB_VERSION}/xgboost4j_${SPARK_VERSION}-${XGBOOST_VERSION}-${XGBOOST_GPU_SUB_VERSION}.jar" \ + -P /usr/lib/spark/jars/ + fi +} + +function configure_spark() { + if [[ "${SPARK_VERSION}" == "3"* ]]; then + cat >>${SPARK_CONF_DIR}/spark-defaults.conf <>${SPARK_CONF_DIR}/spark-defaults.conf <\n' >"${HADOOP_CONF_DIR}/resource-types.xml" + fi + set_hadoop_property 'resource-types.xml' 'yarn.resource-types' 'yarn.io/gpu' + + set_hadoop_property 'capacity-scheduler.xml' \ + 'yarn.scheduler.capacity.resource-calculator' \ + 'org.apache.hadoop.yarn.util.resource.DominantResourceCalculator' + + set_hadoop_property 'yarn-site.xml' 'yarn.resource-types' 'yarn.io/gpu' +} + +# This configuration should be applied only if GPU is attached to the node +function configure_yarn_nodemanager() { + set_hadoop_property 'yarn-site.xml' 'yarn.nodemanager.resource-plugins' 'yarn.io/gpu' + 
set_hadoop_property 'yarn-site.xml' \ + 'yarn.nodemanager.resource-plugins.gpu.allowed-gpu-devices' 'auto' + set_hadoop_property 'yarn-site.xml' \ + 'yarn.nodemanager.resource-plugins.gpu.path-to-discovery-executables' $NVIDIA_SMI_PATH + set_hadoop_property 'yarn-site.xml' \ + 'yarn.nodemanager.linux-container-executor.cgroups.mount' 'true' + set_hadoop_property 'yarn-site.xml' \ + 'yarn.nodemanager.linux-container-executor.cgroups.mount-path' '/sys/fs/cgroup' + set_hadoop_property 'yarn-site.xml' \ + 'yarn.nodemanager.linux-container-executor.cgroups.hierarchy' 'yarn' + set_hadoop_property 'yarn-site.xml' \ + 'yarn.nodemanager.container-executor.class' \ + 'org.apache.hadoop.yarn.server.nodemanager.LinuxContainerExecutor' + set_hadoop_property 'yarn-site.xml' 'yarn.nodemanager.linux-container-executor.group' 'yarn' + + # Fix local dirs access permissions + local yarn_local_dirs=() + readarray -d ',' yarn_local_dirs < <(bdconfig get_property_value \ + --configuration_file "${HADOOP_CONF_DIR}/yarn-site.xml" \ + --name "yarn.nodemanager.local-dirs" 2>/dev/null | tr -d '\n') + chown yarn:yarn -R "${yarn_local_dirs[@]/,/}" +} + +function configure_gpu_exclusive_mode() { + # check if running spark 3, if not, enable GPU exclusive mode + local spark_version + spark_version=$(spark-submit --version 2>&1 | sed -n 's/.*version[[:blank:]]\+\([0-9]\+\.[0-9]\).*/\1/p' | head -n1) + if [[ ${spark_version} != 3.* ]]; then + # include exclusive mode on GPU + nvidia-smi -c EXCLUSIVE_PROCESS + fi +} + +function fetch_mig_scripts() { + mkdir -p /usr/local/yarn-mig-scripts + chmod 755 /usr/local/yarn-mig-scripts + wget -P /usr/local/yarn-mig-scripts/ https://raw.githubusercontent.com/NVIDIA/spark-rapids-examples/branch-22.10/examples/MIG-Support/yarn-unpatched/scripts/nvidia-smi + wget -P /usr/local/yarn-mig-scripts/ https://raw.githubusercontent.com/NVIDIA/spark-rapids-examples/branch-22.10/examples/MIG-Support/yarn-unpatched/scripts/mig2gpu.sh + chmod 755 
/usr/local/yarn-mig-scripts/* +} + +function configure_gpu_script() { + # Download GPU discovery script + local -r spark_gpu_script_dir='/usr/lib/spark/scripts/gpu' + mkdir -p ${spark_gpu_script_dir} + # need to update the getGpusResources.sh script to look for MIG devices since if multiple GPUs nvidia-smi still + # lists those because we only disable the specific GIs via CGROUPs. Here we just create it based off of: + # https://raw.githubusercontent.com/apache/spark/master/examples/src/main/scripts/getGpusResources.sh + echo ' +#!/usr/bin/env bash + +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# +NUM_MIG_DEVICES=$(nvidia-smi -L | grep MIG | wc -l) +ADDRS=$(nvidia-smi --query-gpu=index --format=csv,noheader | sed -e '\'':a'\'' -e '\''N'\'' -e'\''$!ba'\'' -e '\''s/\n/","/g'\'') +if [ $NUM_MIG_DEVICES -gt 0 ]; then + MIG_INDEX=$(( $NUM_MIG_DEVICES - 1 )) + ADDRS=$(seq -s '\''","'\'' 0 $MIG_INDEX) +fi +echo {\"name\": \"gpu\", \"addresses\":[\"$ADDRS\"]} +' > ${spark_gpu_script_dir}/getGpusResources.sh + + chmod a+rwx -R ${spark_gpu_script_dir} +} + +function configure_gpu_isolation() { + # enable GPU isolation + sed -i "s/yarn\.nodemanager\.linux\-container\-executor\.group\=.*$/yarn\.nodemanager\.linux\-container\-executor\.group\=yarn/g" "${HADOOP_CONF_DIR}/container-executor.cfg" + if [[ $IS_MIG_ENABLED -ne 0 ]]; then + # configure the container-executor.cfg to have major caps + printf '\n[gpu]\nmodule.enabled=true\ngpu.major-device-number=%s\n\n[cgroups]\nroot=/sys/fs/cgroup\nyarn-hierarchy=yarn\n' $MIG_MAJOR_CAPS >> "${HADOOP_CONF_DIR}/container-executor.cfg" + printf 'export MIG_AS_GPU_ENABLED=1\n' >> "${HADOOP_CONF_DIR}/yarn-env.sh" + printf 'export ENABLE_MIG_GPUS_FOR_CGROUPS=1\n' >> "${HADOOP_CONF_DIR}/yarn-env.sh" + else + printf '\n[gpu]\nmodule.enabled=true\n[cgroups]\nroot=/sys/fs/cgroup\nyarn-hierarchy=yarn\n' >> "${HADOOP_CONF_DIR}/container-executor.cfg" + fi + + # Configure a systemd unit to ensure that permissions are set on restart + cat >/etc/systemd/system/dataproc-cgroup-device-permissions.service< Date: Tue, 15 Nov 2022 18:14:35 +0800 Subject: [PATCH 12/31] add rocky8 patch and revert the changes in BUILD Signed-off-by: liyuan --- spark-rapids/spark-rapids-rocky.sh | 483 ----------------------------- 1 file changed, 483 deletions(-) delete mode 100644 spark-rapids/spark-rapids-rocky.sh diff --git a/spark-rapids/spark-rapids-rocky.sh b/spark-rapids/spark-rapids-rocky.sh deleted file mode 100644 index 099f7fc07..000000000 --- a/spark-rapids/spark-rapids-rocky.sh +++ /dev/null @@ -1,483 +0,0 @@ -#!/bin/bash - -set -euxo pipefail - 
-function get_metadata_attribute() { - local -r attribute_name=$1 - local -r default_value=$2 - /usr/share/google/get_metadata_value "attributes/${attribute_name}" || echo -n "${default_value}" -} - -OS_NAME=$(lsb_release -is | tr '[:upper:]' '[:lower:]') -readonly OS_NAME - -readonly SPARK_VERSION_ENV=$(spark-submit --version 2>&1 | sed -n 's/.*version[[:blank:]]\+\([0-9]\+\.[0-9]\).*/\1/p' | head -n1) -readonly DEFAULT_SPARK_RAPIDS_VERSION="22.10.0" - -if [[ "${SPARK_VERSION_ENV}" == "3"* ]]; then - readonly DEFAULT_CUDA_VERSION="11.5" - readonly DEFAULT_CUDF_VERSION="22.10.0" - readonly DEFAULT_XGBOOST_VERSION="1.6.2" - readonly DEFAULT_XGBOOST_GPU_SUB_VERSION="0.3.0" - readonly SPARK_VERSION="3.0" -else - readonly DEFAULT_CUDA_VERSION="10.1" - readonly DEFAULT_CUDF_VERSION="0.9.2" - readonly DEFAULT_XGBOOST_VERSION="1.0.0" - readonly DEFAULT_XGBOOST_GPU_SUB_VERSION="Beta5" - readonly SPARK_VERSION="2.x" -fi - -readonly ROLE=$(/usr/share/google/get_metadata_value attributes/dataproc-role) -readonly MASTER=$(/usr/share/google/get_metadata_value attributes/dataproc-master) - -readonly RUNTIME=$(get_metadata_attribute 'rapids-runtime' 'SPARK') - -# CUDA version and Driver version config -CUDA_VERSION=$(get_metadata_attribute 'cuda-version' '11.5') -DEFAULT_NVIDIA_DEBIAN_GPU_DRIVER_VERSION=$(get_metadata_attribute 'driver-version' '495.29.05') - -readonly CUDA_VERSION -readonly DEFAULT_NVIDIA_DEBIAN_GPU_DRIVER_VERSION -readonly DEFAULT_NVIDIA_DEBIAN_GPU_DRIVER_VERSION_PREFIX=${DEFAULT_NVIDIA_DEBIAN_GPU_DRIVER_VERSION%%.*} - -# Parameters for NVIDIA-provided Debian GPU driver -readonly DEFAULT_NVIDIA_DEBIAN_GPU_DRIVER_URL="https://download.nvidia.com/XFree86/Linux-x86_64/${DEFAULT_NVIDIA_DEBIAN_GPU_DRIVER_VERSION}/NVIDIA-Linux-x86_64-${DEFAULT_NVIDIA_DEBIAN_GPU_DRIVER_VERSION}.run" -NVIDIA_DEBIAN_GPU_DRIVER_URL=$(get_metadata_attribute 'gpu-driver-url' "${DEFAULT_NVIDIA_DEBIAN_GPU_DRIVER_URL}") -readonly NVIDIA_DEBIAN_GPU_DRIVER_URL - -readonly 
NVIDIA_BASE_DL_URL='https://developer.download.nvidia.com/compute' - -# Parameters for NVIDIA-provided Debian GPU driver -readonly -A DEFAULT_NVIDIA_DEBIAN_CUDA_URLS=( - [10.1]="${NVIDIA_BASE_DL_URL}/cuda/10.1/Prod/local_installers/cuda_10.1.243_418.87.00_linux.run" - [10.2]="${NVIDIA_BASE_DL_URL}/cuda/10.2/Prod/local_installers/cuda_10.2.89_440.33.01_linux.run" - [11.0]="${NVIDIA_BASE_DL_URL}/cuda/11.0.3/local_installers/cuda_11.0.3_450.51.06_linux.run" - [11.1]="${NVIDIA_BASE_DL_URL}/cuda/11.1.0/local_installers/cuda_11.1.0_455.23.05_linux.run" - [11.2]="${NVIDIA_BASE_DL_URL}/cuda/11.2.2/local_installers/cuda_11.2.2_460.32.03_linux.run" - [11.5]="${NVIDIA_BASE_DL_URL}/cuda/11.5.2/local_installers/cuda_11.5.2_495.29.05_linux.run" - [11.6]="${NVIDIA_BASE_DL_URL}/cuda/11.6.2/local_installers/cuda_11.6.2_510.47.03_linux.run" - [11.7]="${NVIDIA_BASE_DL_URL}/cuda/11.7.1/local_installers/cuda_11.7.1_515.65.01_linux.run") -readonly DEFAULT_NVIDIA_DEBIAN_CUDA_URL=${DEFAULT_NVIDIA_DEBIAN_CUDA_URLS["${CUDA_VERSION}"]} -NVIDIA_DEBIAN_CUDA_URL=$(get_metadata_attribute 'cuda-url' "${DEFAULT_NVIDIA_DEBIAN_CUDA_URL}") -readonly NVIDIA_DEBIAN_CUDA_URL - -# Parameters for NVIDIA-provided Ubuntu GPU driver -readonly NVIDIA_UBUNTU_REPO_URL="${NVIDIA_BASE_DL_URL}/cuda/repos/ubuntu1804/x86_64" -readonly NVIDIA_UBUNTU_REPO_KEY_PACKAGE="${NVIDIA_UBUNTU_REPO_URL}/cuda-keyring_1.0-1_all.deb" -readonly NVIDIA_UBUNTU_REPO_CUDA_PIN="${NVIDIA_UBUNTU_REPO_URL}/cuda-ubuntu1804.pin" - -# Parameter for NVIDIA-provided Rocky Linux GPU driver -readonly NVIDIA_ROCKY_REPO_URL="${NVIDIA_BASE_DL_URL}/cuda/repos/rhel8/x86_64/cuda-rhel8.repo" - -# Whether to install NVIDIA-provided or OS-provided GPU driver -GPU_DRIVER_PROVIDER=$(get_metadata_attribute 'gpu-driver-provider' 'NVIDIA') -readonly GPU_DRIVER_PROVIDER - -# Stackdriver GPU agent parameters -readonly GPU_AGENT_REPO_URL='https://raw.githubusercontent.com/GoogleCloudPlatform/ml-on-gcp/master/dlvm/gcp-gpu-utilization-metrics' -# Whether to install 
GPU monitoring agent that sends GPU metrics to Stackdriver -INSTALL_GPU_AGENT=$(get_metadata_attribute 'install-gpu-agent' 'false') -readonly INSTALL_GPU_AGENT - -# Dataproc configurations -readonly HADOOP_CONF_DIR='/etc/hadoop/conf' -readonly HIVE_CONF_DIR='/etc/hive/conf' -readonly SPARK_CONF_DIR='/etc/spark/conf' - -NVIDIA_SMI_PATH='/usr/bin' -MIG_MAJOR_CAPS=0 -IS_MIG_ENABLED=0 - -# SPARK config -readonly SPARK_RAPIDS_VERSION=$(get_metadata_attribute 'spark-rapids-version' ${DEFAULT_SPARK_RAPIDS_VERSION}) -readonly XGBOOST_VERSION=$(get_metadata_attribute 'xgboost-version' ${DEFAULT_XGBOOST_VERSION}) -readonly XGBOOST_GPU_SUB_VERSION=$(get_metadata_attribute 'spark-gpu-sub-version' ${DEFAULT_XGBOOST_GPU_SUB_VERSION}) - -function execute_with_retries() { - local -r cmd=$1 - for ((i = 0; i < 10; i++)); do - if eval "$cmd"; then - return 0 - fi - sleep 5 - done - return 1 -} - -function install_spark_rapids() { - local -r rapids_repo_url='https://repo1.maven.org/maven2/ai/rapids' - local -r nvidia_repo_url='https://repo1.maven.org/maven2/com/nvidia' - local -r dmlc_repo_url='https://repo.maven.apache.org/maven2/ml/dmlc' - - # Convert . to - for URL formatting - local cudf_cuda_version="${CUDA_VERSION//\./-}" - - # There's only one release for all CUDA 11 versions - # The version formatting does not have a '.' 
- if [[ ${cudf_cuda_version} == 11* ]]; then - cudf_cuda_version="11" - fi - - if [[ "${SPARK_VERSION}" == "3"* ]]; then - wget -nv --timeout=30 --tries=5 --retry-connrefused \ - "${dmlc_repo_url}/xgboost4j-spark-gpu_2.12/${XGBOOST_VERSION}/xgboost4j-spark-gpu_2.12-${XGBOOST_VERSION}.jar" \ - -P /usr/lib/spark/jars/ - wget -nv --timeout=30 --tries=5 --retry-connrefused \ - "${dmlc_repo_url}/xgboost4j-gpu_2.12/${XGBOOST_VERSION}/xgboost4j-gpu_2.12-${XGBOOST_VERSION}.jar" \ - -P /usr/lib/spark/jars/ - wget -nv --timeout=30 --tries=5 --retry-connrefused \ - "${nvidia_repo_url}/rapids-4-spark_2.12/${SPARK_RAPIDS_VERSION}/rapids-4-spark_2.12-${SPARK_RAPIDS_VERSION}.jar" \ - -P /usr/lib/spark/jars/ - else - wget -nv --timeout=30 --tries=5 --retry-connrefused \ - "${rapids_repo_url}/xgboost4j-spark_${SPARK_VERSION}/${XGBOOST_VERSION}-${XGBOOST_GPU_SUB_VERSION}/xgboost4j-spark_${SPARK_VERSION}-${XGBOOST_VERSION}-${XGBOOST_GPU_SUB_VERSION}.jar" \ - -P /usr/lib/spark/jars/ - wget -nv --timeout=30 --tries=5 --retry-connrefused \ - "${rapids_repo_url}/xgboost4j_${SPARK_VERSION}/${XGBOOST_VERSION}-${XGBOOST_GPU_SUB_VERSION}/xgboost4j_${SPARK_VERSION}-${XGBOOST_VERSION}-${XGBOOST_GPU_SUB_VERSION}.jar" \ - -P /usr/lib/spark/jars/ - fi -} - -function configure_spark() { - if [[ "${SPARK_VERSION}" == "3"* ]]; then - cat >>${SPARK_CONF_DIR}/spark-defaults.conf <>${SPARK_CONF_DIR}/spark-defaults.conf <\n' >"${HADOOP_CONF_DIR}/resource-types.xml" - fi - set_hadoop_property 'resource-types.xml' 'yarn.resource-types' 'yarn.io/gpu' - - set_hadoop_property 'capacity-scheduler.xml' \ - 'yarn.scheduler.capacity.resource-calculator' \ - 'org.apache.hadoop.yarn.util.resource.DominantResourceCalculator' - - set_hadoop_property 'yarn-site.xml' 'yarn.resource-types' 'yarn.io/gpu' -} - -# This configuration should be applied only if GPU is attached to the node -function configure_yarn_nodemanager() { - set_hadoop_property 'yarn-site.xml' 'yarn.nodemanager.resource-plugins' 'yarn.io/gpu' - 
set_hadoop_property 'yarn-site.xml' \ - 'yarn.nodemanager.resource-plugins.gpu.allowed-gpu-devices' 'auto' - set_hadoop_property 'yarn-site.xml' \ - 'yarn.nodemanager.resource-plugins.gpu.path-to-discovery-executables' $NVIDIA_SMI_PATH - set_hadoop_property 'yarn-site.xml' \ - 'yarn.nodemanager.linux-container-executor.cgroups.mount' 'true' - set_hadoop_property 'yarn-site.xml' \ - 'yarn.nodemanager.linux-container-executor.cgroups.mount-path' '/sys/fs/cgroup' - set_hadoop_property 'yarn-site.xml' \ - 'yarn.nodemanager.linux-container-executor.cgroups.hierarchy' 'yarn' - set_hadoop_property 'yarn-site.xml' \ - 'yarn.nodemanager.container-executor.class' \ - 'org.apache.hadoop.yarn.server.nodemanager.LinuxContainerExecutor' - set_hadoop_property 'yarn-site.xml' 'yarn.nodemanager.linux-container-executor.group' 'yarn' - - # Fix local dirs access permissions - local yarn_local_dirs=() - readarray -d ',' yarn_local_dirs < <(bdconfig get_property_value \ - --configuration_file "${HADOOP_CONF_DIR}/yarn-site.xml" \ - --name "yarn.nodemanager.local-dirs" 2>/dev/null | tr -d '\n') - chown yarn:yarn -R "${yarn_local_dirs[@]/,/}" -} - -function configure_gpu_exclusive_mode() { - # check if running spark 3, if not, enable GPU exclusive mode - local spark_version - spark_version=$(spark-submit --version 2>&1 | sed -n 's/.*version[[:blank:]]\+\([0-9]\+\.[0-9]\).*/\1/p' | head -n1) - if [[ ${spark_version} != 3.* ]]; then - # include exclusive mode on GPU - nvidia-smi -c EXCLUSIVE_PROCESS - fi -} - -function fetch_mig_scripts() { - mkdir -p /usr/local/yarn-mig-scripts - chmod 755 /usr/local/yarn-mig-scripts - wget -P /usr/local/yarn-mig-scripts/ https://raw.githubusercontent.com/NVIDIA/spark-rapids-examples/branch-22.10/examples/MIG-Support/yarn-unpatched/scripts/nvidia-smi - wget -P /usr/local/yarn-mig-scripts/ https://raw.githubusercontent.com/NVIDIA/spark-rapids-examples/branch-22.10/examples/MIG-Support/yarn-unpatched/scripts/mig2gpu.sh - chmod 755 
/usr/local/yarn-mig-scripts/* -} - -function configure_gpu_script() { - # Download GPU discovery script - local -r spark_gpu_script_dir='/usr/lib/spark/scripts/gpu' - mkdir -p ${spark_gpu_script_dir} - # need to update the getGpusResources.sh script to look for MIG devices since if multiple GPUs nvidia-smi still - # lists those because we only disable the specific GIs via CGROUPs. Here we just create it based off of: - # https://raw.githubusercontent.com/apache/spark/master/examples/src/main/scripts/getGpusResources.sh - echo ' -#!/usr/bin/env bash - -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# -NUM_MIG_DEVICES=$(nvidia-smi -L | grep MIG | wc -l) -ADDRS=$(nvidia-smi --query-gpu=index --format=csv,noheader | sed -e '\'':a'\'' -e '\''N'\'' -e'\''$!ba'\'' -e '\''s/\n/","/g'\'') -if [ $NUM_MIG_DEVICES -gt 0 ]; then - MIG_INDEX=$(( $NUM_MIG_DEVICES - 1 )) - ADDRS=$(seq -s '\''","'\'' 0 $MIG_INDEX) -fi -echo {\"name\": \"gpu\", \"addresses\":[\"$ADDRS\"]} -' > ${spark_gpu_script_dir}/getGpusResources.sh - - chmod a+rwx -R ${spark_gpu_script_dir} -} - -function configure_gpu_isolation() { - # enable GPU isolation - sed -i "s/yarn\.nodemanager\.linux\-container\-executor\.group\=.*$/yarn\.nodemanager\.linux\-container\-executor\.group\=yarn/g" "${HADOOP_CONF_DIR}/container-executor.cfg" - if [[ $IS_MIG_ENABLED -ne 0 ]]; then - # configure the container-executor.cfg to have major caps - printf '\n[gpu]\nmodule.enabled=true\ngpu.major-device-number=%s\n\n[cgroups]\nroot=/sys/fs/cgroup\nyarn-hierarchy=yarn\n' $MIG_MAJOR_CAPS >> "${HADOOP_CONF_DIR}/container-executor.cfg" - printf 'export MIG_AS_GPU_ENABLED=1\n' >> "${HADOOP_CONF_DIR}/yarn-env.sh" - printf 'export ENABLE_MIG_GPUS_FOR_CGROUPS=1\n' >> "${HADOOP_CONF_DIR}/yarn-env.sh" - else - printf '\n[gpu]\nmodule.enabled=true\n[cgroups]\nroot=/sys/fs/cgroup\nyarn-hierarchy=yarn\n' >> "${HADOOP_CONF_DIR}/container-executor.cfg" - fi - - # Configure a systemd unit to ensure that permissions are set on restart - cat >/etc/systemd/system/dataproc-cgroup-device-permissions.service< Date: Wed, 16 Nov 2022 09:50:57 +0800 Subject: [PATCH 13/31] update function names in test_spark_rapids.py Signed-off-by: liyuan --- spark-rapids/test_spark_rapids.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/spark-rapids/test_spark_rapids.py b/spark-rapids/test_spark_rapids.py index fb62f8773..b813c5ac7 100644 --- a/spark-rapids/test_spark_rapids.py +++ b/spark-rapids/test_spark_rapids.py @@ -6,7 +6,7 @@ from integration_tests.dataproc_test_case import DataprocTestCase -class RapidsTestCase(DataprocTestCase): 
+class SparkRapidsTestCase(DataprocTestCase): COMPONENT = "rapids" INIT_ACTIONS = ["sparkRapids/spark-rapids.sh"] @@ -35,7 +35,7 @@ def verify_spark_job(self): @parameterized.parameters(("SINGLE", ["m"], GPU_P100), ("STANDARD", ["w-0"], GPU_P100)) - def test_rapids_spark(self, configuration, machine_suffixes, accelerator): + def test_spark_rapids(self, configuration, machine_suffixes, accelerator): if self.getImageOs() == "rocky": self.skipTest("Not supported in Rocky Linux-based images") From c476016f0c39b9d46445e9b7f5ba620024f69c70 Mon Sep 17 00:00:00 2001 From: liyuan Date: Mon, 21 Nov 2022 09:55:31 +0800 Subject: [PATCH 14/31] fix the wrong path in test_spark_rapids.py Signed-off-by: liyuan --- spark-rapids/test_spark_rapids.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spark-rapids/test_spark_rapids.py b/spark-rapids/test_spark_rapids.py index b813c5ac7..68200fd55 100644 --- a/spark-rapids/test_spark_rapids.py +++ b/spark-rapids/test_spark_rapids.py @@ -8,7 +8,7 @@ class SparkRapidsTestCase(DataprocTestCase): COMPONENT = "rapids" - INIT_ACTIONS = ["sparkRapids/spark-rapids.sh"] + INIT_ACTIONS = ["spark-rapids/spark-rapids.sh"] GPU_P100 = "type=nvidia-tesla-p100" From e9d208e0ac6c7db8810b8e5f2af2aae51afcef79 Mon Sep 17 00:00:00 2001 From: liyuan Date: Tue, 22 Nov 2022 11:54:47 +0800 Subject: [PATCH 15/31] update gpu type to t4 to avoid no sufficient quota Signed-off-by: liyuan --- gpu/test_gpu.py | 36 +++++++++++++++---------------- spark-rapids/test_spark_rapids.py | 8 +++---- 2 files changed, 22 insertions(+), 22 deletions(-) diff --git a/gpu/test_gpu.py b/gpu/test_gpu.py index 39114184a..05e3c62c1 100644 --- a/gpu/test_gpu.py +++ b/gpu/test_gpu.py @@ -9,7 +9,7 @@ class NvidiaGpuDriverTestCase(DataprocTestCase): COMPONENT = "gpu" INIT_ACTIONS = ["gpu/install_gpu_driver.sh"] - GPU_V100 = "type=nvidia-tesla-v100" + GPU_T4 = "type=nvidia-tesla-t4" GPU_A100 = "type=nvidia-tesla-a100" def verify_instance(self, name): @@ -28,10 +28,10 @@ def 
verify_instance_cudnn(self, name): name, "sudo ldconfig -p | grep -q libcudnn" ) @parameterized.parameters( - ("SINGLE", ["m"], GPU_V100, None, None), - ("STANDARD", ["m"], GPU_V100, None, None), - ("STANDARD", ["m", "w-0", "w-1"], GPU_V100, GPU_V100, "NVIDIA"), - ("STANDARD", ["w-0", "w-1"], None, GPU_V100, "NVIDIA"), + ("SINGLE", ["m"], GPU_T4, None, None), + ("STANDARD", ["m"], GPU_T4, None, None), + ("STANDARD", ["m", "w-0", "w-1"], GPU_T4, GPU_T4, "NVIDIA"), + ("STANDARD", ["w-0", "w-1"], None, GPU_T4, "NVIDIA"), ) def test_install_gpu_default_agent(self, configuration, machine_suffixes, master_accelerator, worker_accelerator, @@ -56,9 +56,9 @@ def test_install_gpu_default_agent(self, configuration, machine_suffixes, machine_suffix)) @parameterized.parameters( - ("STANDARD", ["w-0", "w-1"], None, GPU_V100, None), - ("STANDARD", ["m"], GPU_V100, None, "NVIDIA"), - ("STANDARD", ["m", "w-0", "w-1"], GPU_V100, GPU_V100, "NVIDIA"), + ("STANDARD", ["w-0", "w-1"], None, GPU_T4, None), + ("STANDARD", ["m"], GPU_T4, None, "NVIDIA"), + ("STANDARD", ["m", "w-0", "w-1"], GPU_T4, GPU_T4, "NVIDIA"), ) def test_install_gpu_without_agent(self, configuration, machine_suffixes, master_accelerator, worker_accelerator, @@ -83,9 +83,9 @@ def test_install_gpu_without_agent(self, configuration, machine_suffixes, machine_suffix)) @parameterized.parameters( - ("STANDARD", ["m", "w-0", "w-1"], GPU_V100, GPU_V100, None), - ("STANDARD", ["w-0", "w-1"], None, GPU_V100, "NVIDIA"), - ("STANDARD", ["m"], GPU_V100, None, "NVIDIA"), + ("STANDARD", ["m", "w-0", "w-1"], GPU_T4, GPU_T4, None), + ("STANDARD", ["w-0", "w-1"], None, GPU_T4, "NVIDIA"), + ("STANDARD", ["m"], GPU_T4, None, "NVIDIA"), ) def test_install_gpu_with_agent(self, configuration, machine_suffixes, master_accelerator, worker_accelerator, @@ -113,11 +113,11 @@ def test_install_gpu_with_agent(self, configuration, machine_suffixes, machine_suffix)) @parameterized.parameters( - ("SINGLE", ["m"], GPU_V100, None, "10.1"), - 
("STANDARD", ["m"], GPU_V100, None, "10.2"), - ("STANDARD", ["m", "w-0", "w-1"], GPU_V100, GPU_V100, "11.0"), - ("STANDARD", ["w-0", "w-1"], None, GPU_V100, "11.1"), - ("STANDARD", ["w-0", "w-1"], None, GPU_V100, "11.2"), + ("SINGLE", ["m"], GPU_T4, None, "10.1"), + ("STANDARD", ["m"], GPU_T4, None, "10.2"), + ("STANDARD", ["m", "w-0", "w-1"], GPU_T4, GPU_T4, "11.0"), + ("STANDARD", ["w-0", "w-1"], None, GPU_T4, "11.1"), + ("STANDARD", ["w-0", "w-1"], None, GPU_T4, "11.2"), ) def test_install_gpu_cuda_nvidia(self, configuration, machine_suffixes, master_accelerator, worker_accelerator, @@ -172,8 +172,8 @@ def test_install_gpu_with_mig(self, configuration, machine_suffixes, machine_suffix)) @parameterized.parameters( - ("SINGLE", GPU_V100, None, None), - ("STANDARD", GPU_V100, GPU_V100, "NVIDIA") + ("SINGLE", GPU_T4, None, None), + ("STANDARD", GPU_T4, GPU_T4, "NVIDIA") ) def test_gpu_allocation(self, configuration, master_accelerator, worker_accelerator, driver_provider): diff --git a/spark-rapids/test_spark_rapids.py b/spark-rapids/test_spark_rapids.py index 68200fd55..f9d36181e 100644 --- a/spark-rapids/test_spark_rapids.py +++ b/spark-rapids/test_spark_rapids.py @@ -10,7 +10,7 @@ class SparkRapidsTestCase(DataprocTestCase): COMPONENT = "rapids" INIT_ACTIONS = ["spark-rapids/spark-rapids.sh"] - GPU_P100 = "type=nvidia-tesla-p100" + GPU_T4 = "type=nvidia-tesla-t4" # Tests for RAPIDS init action XGBOOST_SPARK_TEST_SCRIPT_FILE_NAME = "verify_xgboost_spark_rapids.scala" @@ -33,8 +33,8 @@ def verify_spark_job(self): self.remove_test_script(self.XGBOOST_SPARK_TEST_SCRIPT_FILE_NAME, instance_name) - @parameterized.parameters(("SINGLE", ["m"], GPU_P100), - ("STANDARD", ["w-0"], GPU_P100)) + @parameterized.parameters(("SINGLE", ["m"], GPU_T4), + ("STANDARD", ["w-0"], GPU_T4)) def test_spark_rapids(self, configuration, machine_suffixes, accelerator): if self.getImageOs() == "rocky": self.skipTest("Not supported in Rocky Linux-based images") @@ -61,7 +61,7 @@ def 
test_spark_rapids(self, configuration, machine_suffixes, accelerator): # Only need to do this once self.verify_spark_job() - @parameterized.parameters(("STANDARD", ["w-0"], GPU_P100, "11.2")) + @parameterized.parameters(("STANDARD", ["w-0"], GPU_T4, "11.2")) def test_non_default_cuda_versions(self, configuration, machine_suffixes, accelerator, cuda_version): if self.getImageOs() == "rocky": From c798b06e8d8cac62eff9bef564ebac9b2be7add3 Mon Sep 17 00:00:00 2001 From: liyuan Date: Tue, 29 Nov 2022 14:04:04 +0800 Subject: [PATCH 16/31] skip all rocky tests for install gpu driver Signed-off-by: liyuan --- gpu/test_gpu.py | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/gpu/test_gpu.py b/gpu/test_gpu.py index 05e3c62c1..4509275ab 100644 --- a/gpu/test_gpu.py +++ b/gpu/test_gpu.py @@ -124,11 +124,10 @@ def test_install_gpu_cuda_nvidia(self, configuration, machine_suffixes, cuda_version): image_os = self.getImageOs() - if self.getImageVersion() < pkg_resources.parse_version("2.0"): - self.skipTest("Not supported in pre 2.0 images") + if self.getImageVersion() < pkg_resources.parse_version("2.0") or self.getImageOs() == "rocky": + self.skipTest("Not supported in pre 2.0 or Rocky images") - if ( image_os == "rocky" and (cuda_version < "11.2" and cuda_version != "11.0") ) or \ - ( image_os == "debian" and cuda_version < "11.1" ): + if ( image_os == "debian" and cuda_version < "11.1" ): self.skipTest(f'CUDA version {cuda_version} is not supported on os {image_os}') metadata = "gpu-driver-provider=NVIDIA,cuda-version={}".format(cuda_version) @@ -177,12 +176,9 @@ def test_install_gpu_with_mig(self, configuration, machine_suffixes, ) def test_gpu_allocation(self, configuration, master_accelerator, worker_accelerator, driver_provider): - if configuration == "SINGLE" and self.getImageOs() == "rocky": - self.skipTest("Test hangs on single-node clsuter with Rocky Linux-based images") - - if self.getImageVersion() < 
pkg_resources.parse_version("2.0"): - self.skipTest("Not supported in pre 2.0") - + if self.getImageVersion() < pkg_resources.parse_version("2.0") or self.getImageOs() == "rocky": + self.skipTest("Not supported in pre 2.0 or Rocky images") + metadata = None if driver_provider is not None: metadata = "gpu-driver-provider={}".format(driver_provider) From 4e2888e26c0685f9366c6b645f7ff676b25939d5 Mon Sep 17 00:00:00 2001 From: liyuan Date: Tue, 29 Nov 2022 21:03:32 +0800 Subject: [PATCH 17/31] revert changes to enable rocky8 tests Signed-off-by: liyuan --- gpu/test_gpu.py | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/gpu/test_gpu.py b/gpu/test_gpu.py index 4509275ab..4f6217250 100644 --- a/gpu/test_gpu.py +++ b/gpu/test_gpu.py @@ -124,10 +124,11 @@ def test_install_gpu_cuda_nvidia(self, configuration, machine_suffixes, cuda_version): image_os = self.getImageOs() - if self.getImageVersion() < pkg_resources.parse_version("2.0") or self.getImageOs() == "rocky": - self.skipTest("Not supported in pre 2.0 or Rocky images") + if self.getImageVersion() < pkg_resources.parse_version("2.0"): + self.skipTest("Not supported in pre 2.0 images") - if ( image_os == "debian" and cuda_version < "11.1" ): + if ( image_os == "rocky" and (cuda_version < "11.2" and cuda_version != "11.0") ) or \ + ( image_os == "debian" and cuda_version < "11.1" ): self.skipTest(f'CUDA version {cuda_version} is not supported on os {image_os}') metadata = "gpu-driver-provider=NVIDIA,cuda-version={}".format(cuda_version) @@ -176,8 +177,11 @@ def test_install_gpu_with_mig(self, configuration, machine_suffixes, ) def test_gpu_allocation(self, configuration, master_accelerator, worker_accelerator, driver_provider): - if self.getImageVersion() < pkg_resources.parse_version("2.0") or self.getImageOs() == "rocky": - self.skipTest("Not supported in pre 2.0 or Rocky images") + if configuration == "SINGLE" and self.getImageOs() == "rocky": + self.skipTest("Test hangs on single-node 
clsuter with Rocky Linux-based images") + + if self.getImageVersion() < pkg_resources.parse_version("2.0"): + self.skipTest("Not supported in pre 2.0") metadata = None if driver_provider is not None: From 93fa9d5f81a8846e59af3bd5d2e3defce8985e6c Mon Sep 17 00:00:00 2001 From: liyuan Date: Fri, 2 Dec 2022 10:24:03 +0800 Subject: [PATCH 18/31] revert the changes of gpu directory Signed-off-by: liyuan --- gpu/install_gpu_driver.sh | 56 +++++++++++++++++++++------------------ gpu/test_gpu.py | 42 ++++++++++++++--------------- 2 files changed, 51 insertions(+), 47 deletions(-) diff --git a/gpu/install_gpu_driver.sh b/gpu/install_gpu_driver.sh index 19fc28342..610db8c06 100644 --- a/gpu/install_gpu_driver.sh +++ b/gpu/install_gpu_driver.sh @@ -310,34 +310,38 @@ function install_nvidia_gpu_driver() { # Collects 'gpu_utilization' and 'gpu_memory_utilization' metrics function install_gpu_agent() { - download_agent - install_agent_dependency - start_agent_service -} - -function download_agent(){ - if [[ ${OS_NAME} == rocky ]]; then - execute_with_retries "dnf -y -q install git" - else - execute_with_retries "apt-get install git -y" + if ! command -v pip; then + execute_with_retries "apt-get install -y -q python-pip" fi - mkdir -p /opt/google - chmod 777 /opt/google - cd /opt/google - execute_with_retries "git clone https://github.com/GoogleCloudPlatform/compute-gpu-monitoring.git" -} + local install_dir=/opt/gpu-utilization-agent + mkdir -p "${install_dir}" + curl -fsSL --retry-connrefused --retry 10 --retry-max-time 30 \ + "${GPU_AGENT_REPO_URL}/requirements.txt" -o "${install_dir}/requirements.txt" + curl -fsSL --retry-connrefused --retry 10 --retry-max-time 30 \ + "${GPU_AGENT_REPO_URL}/report_gpu_metrics.py" -o "${install_dir}/report_gpu_metrics.py" + pip install -r "${install_dir}/requirements.txt" + + # Generate GPU service. 
+ cat </lib/systemd/system/gpu-utilization-agent.service +[Unit] +Description=GPU Utilization Metric Agent -function install_agent_dependency(){ - cd /opt/google/compute-gpu-monitoring/linux - python3 -m venv venv - venv/bin/pip install wheel - venv/bin/pip install -Ur requirements.txt -} +[Service] +Type=simple +PIDFile=/run/gpu_agent.pid +ExecStart=/bin/bash --login -c 'python "${install_dir}/report_gpu_metrics.py"' +User=root +Group=root +WorkingDirectory=/ +Restart=always -function start_agent_service(){ - cp /opt/google/compute-gpu-monitoring/linux/systemd/google_gpu_monitoring_agent_venv.service /lib/systemd/system +[Install] +WantedBy=multi-user.target +EOF + # Reload systemd manager configuration systemctl daemon-reload - systemctl --no-reload --now enable /lib/systemd/system/google_gpu_monitoring_agent_venv.service + # Enable gpu-utilization-agent service + systemctl --no-reload --now enable gpu-utilization-agent.service } function set_hadoop_property() { @@ -401,10 +405,10 @@ function configure_gpu_exclusive_mode() { function fetch_mig_scripts() { mkdir -p /usr/local/yarn-mig-scripts - chmod 755 /usr/local/yarn-mig-scripts + sudo chmod 755 /usr/local/yarn-mig-scripts wget -P /usr/local/yarn-mig-scripts/ https://raw.githubusercontent.com/NVIDIA/spark-rapids-examples/branch-22.10/examples/MIG-Support/yarn-unpatched/scripts/nvidia-smi wget -P /usr/local/yarn-mig-scripts/ https://raw.githubusercontent.com/NVIDIA/spark-rapids-examples/branch-22.10/examples/MIG-Support/yarn-unpatched/scripts/mig2gpu.sh - chmod 755 /usr/local/yarn-mig-scripts/* + sudo chmod 755 /usr/local/yarn-mig-scripts/* } function configure_gpu_script() { diff --git a/gpu/test_gpu.py b/gpu/test_gpu.py index 4f6217250..39114184a 100644 --- a/gpu/test_gpu.py +++ b/gpu/test_gpu.py @@ -9,7 +9,7 @@ class NvidiaGpuDriverTestCase(DataprocTestCase): COMPONENT = "gpu" INIT_ACTIONS = ["gpu/install_gpu_driver.sh"] - GPU_T4 = "type=nvidia-tesla-t4" + GPU_V100 = "type=nvidia-tesla-v100" GPU_A100 = 
"type=nvidia-tesla-a100" def verify_instance(self, name): @@ -28,10 +28,10 @@ def verify_instance_cudnn(self, name): name, "sudo ldconfig -p | grep -q libcudnn" ) @parameterized.parameters( - ("SINGLE", ["m"], GPU_T4, None, None), - ("STANDARD", ["m"], GPU_T4, None, None), - ("STANDARD", ["m", "w-0", "w-1"], GPU_T4, GPU_T4, "NVIDIA"), - ("STANDARD", ["w-0", "w-1"], None, GPU_T4, "NVIDIA"), + ("SINGLE", ["m"], GPU_V100, None, None), + ("STANDARD", ["m"], GPU_V100, None, None), + ("STANDARD", ["m", "w-0", "w-1"], GPU_V100, GPU_V100, "NVIDIA"), + ("STANDARD", ["w-0", "w-1"], None, GPU_V100, "NVIDIA"), ) def test_install_gpu_default_agent(self, configuration, machine_suffixes, master_accelerator, worker_accelerator, @@ -56,9 +56,9 @@ def test_install_gpu_default_agent(self, configuration, machine_suffixes, machine_suffix)) @parameterized.parameters( - ("STANDARD", ["w-0", "w-1"], None, GPU_T4, None), - ("STANDARD", ["m"], GPU_T4, None, "NVIDIA"), - ("STANDARD", ["m", "w-0", "w-1"], GPU_T4, GPU_T4, "NVIDIA"), + ("STANDARD", ["w-0", "w-1"], None, GPU_V100, None), + ("STANDARD", ["m"], GPU_V100, None, "NVIDIA"), + ("STANDARD", ["m", "w-0", "w-1"], GPU_V100, GPU_V100, "NVIDIA"), ) def test_install_gpu_without_agent(self, configuration, machine_suffixes, master_accelerator, worker_accelerator, @@ -83,9 +83,9 @@ def test_install_gpu_without_agent(self, configuration, machine_suffixes, machine_suffix)) @parameterized.parameters( - ("STANDARD", ["m", "w-0", "w-1"], GPU_T4, GPU_T4, None), - ("STANDARD", ["w-0", "w-1"], None, GPU_T4, "NVIDIA"), - ("STANDARD", ["m"], GPU_T4, None, "NVIDIA"), + ("STANDARD", ["m", "w-0", "w-1"], GPU_V100, GPU_V100, None), + ("STANDARD", ["w-0", "w-1"], None, GPU_V100, "NVIDIA"), + ("STANDARD", ["m"], GPU_V100, None, "NVIDIA"), ) def test_install_gpu_with_agent(self, configuration, machine_suffixes, master_accelerator, worker_accelerator, @@ -113,11 +113,11 @@ def test_install_gpu_with_agent(self, configuration, machine_suffixes, machine_suffix)) 
@parameterized.parameters( - ("SINGLE", ["m"], GPU_T4, None, "10.1"), - ("STANDARD", ["m"], GPU_T4, None, "10.2"), - ("STANDARD", ["m", "w-0", "w-1"], GPU_T4, GPU_T4, "11.0"), - ("STANDARD", ["w-0", "w-1"], None, GPU_T4, "11.1"), - ("STANDARD", ["w-0", "w-1"], None, GPU_T4, "11.2"), + ("SINGLE", ["m"], GPU_V100, None, "10.1"), + ("STANDARD", ["m"], GPU_V100, None, "10.2"), + ("STANDARD", ["m", "w-0", "w-1"], GPU_V100, GPU_V100, "11.0"), + ("STANDARD", ["w-0", "w-1"], None, GPU_V100, "11.1"), + ("STANDARD", ["w-0", "w-1"], None, GPU_V100, "11.2"), ) def test_install_gpu_cuda_nvidia(self, configuration, machine_suffixes, master_accelerator, worker_accelerator, @@ -128,7 +128,7 @@ def test_install_gpu_cuda_nvidia(self, configuration, machine_suffixes, self.skipTest("Not supported in pre 2.0 images") if ( image_os == "rocky" and (cuda_version < "11.2" and cuda_version != "11.0") ) or \ - ( image_os == "debian" and cuda_version < "11.1" ): + ( image_os == "debian" and cuda_version < "11.1" ): self.skipTest(f'CUDA version {cuda_version} is not supported on os {image_os}') metadata = "gpu-driver-provider=NVIDIA,cuda-version={}".format(cuda_version) @@ -172,17 +172,17 @@ def test_install_gpu_with_mig(self, configuration, machine_suffixes, machine_suffix)) @parameterized.parameters( - ("SINGLE", GPU_T4, None, None), - ("STANDARD", GPU_T4, GPU_T4, "NVIDIA") + ("SINGLE", GPU_V100, None, None), + ("STANDARD", GPU_V100, GPU_V100, "NVIDIA") ) def test_gpu_allocation(self, configuration, master_accelerator, worker_accelerator, driver_provider): if configuration == "SINGLE" and self.getImageOs() == "rocky": self.skipTest("Test hangs on single-node clsuter with Rocky Linux-based images") - + if self.getImageVersion() < pkg_resources.parse_version("2.0"): self.skipTest("Not supported in pre 2.0") - + metadata = None if driver_provider is not None: metadata = "gpu-driver-provider={}".format(driver_provider) From 9488491b8e8c53256a2da3b8789b610c617ff4df Mon Sep 17 00:00:00 2001 From: 
liyuan Date: Fri, 2 Dec 2022 10:35:11 +0800 Subject: [PATCH 19/31] merge the spark-rapids BUILD to initialization-action BUILD Signed-off-by: liyuan --- BUILD | 16 ++++++++++++++++ spark-rapids/BUILD | 20 -------------------- 2 files changed, 16 insertions(+), 20 deletions(-) delete mode 100644 spark-rapids/BUILD diff --git a/BUILD b/BUILD index 93bfb7f94..f204015ad 100644 --- a/BUILD +++ b/BUILD @@ -110,6 +110,22 @@ py_test( ], ) +py_test( + name = "test_spark_rapids", + size = "enormous", + srcs = ["spark-rapids/test_spark_rapids.py"], + data = [ + "spark-rapids/spark-rapids.sh", + "spark-rapids/verify_xgboost_spark_rapids.scala", + ], + local = True, + shard_count = 3, + deps = [ + "//integration_tests:dataproc_test_case", + "@io_abseil_py//absl/testing:parameterized", + ], +) + py_library( name = "pyspark_metastore_test", testonly = True, diff --git a/spark-rapids/BUILD b/spark-rapids/BUILD deleted file mode 100644 index 04a43fc5b..000000000 --- a/spark-rapids/BUILD +++ /dev/null @@ -1,20 +0,0 @@ -package(default_visibility = ["//visibility:public"]) - -exports_files(["spark-rapids.sh"]) - -py_test( - name = "test_spark_rapids", - size = "enormous", - srcs = ["test_spark_rapids.py"], - data = [ - "spark-rapids.sh", - "verify_xgboost_spark_rapids.scala", - ], - local = True, - shard_count = 3, - deps = [ - "//integration_tests:dataproc_test_case", - "@io_abseil_py//absl/testing:parameterized", - ], -) - From 4aee70f76a3f300d3a63f9c168c5cbc43fcb8784 Mon Sep 17 00:00:00 2001 From: liyuan Date: Fri, 2 Dec 2022 13:39:14 +0800 Subject: [PATCH 20/31] upgrade kernel as cjac comments Signed-off-by: liyuan --- spark-rapids/spark-rapids.sh | 67 ++++++++++++++++++++++++++++++- spark-rapids/test_spark_rapids.py | 10 ++--- 2 files changed, 69 insertions(+), 8 deletions(-) diff --git a/spark-rapids/spark-rapids.sh b/spark-rapids/spark-rapids.sh index cf89849db..b57182e4e 100644 --- a/spark-rapids/spark-rapids.sh +++ b/spark-rapids/spark-rapids.sh @@ -446,9 +446,72 @@ function 
setup_gpu_yarn() { fi done } + +function upgrade_kernel() { + # Determine which kernel is installed + if [[ "${OS_NAME}" == "debian" ]]; then + CURRENT_KERNEL_VERSION=`cat /proc/version | perl -ne 'print( / Debian (\S+) / )'` + elif [[ "${OS_NAME}" == "ubuntu" ]]; then + CURRENT_KERNEL_VERSION=`cat /proc/version | perl -ne 'print( /^Linux version (\S+) / )'` + elif [[ ${OS_NAME} == rocky ]]; then + KERN_VER=$(yum info --installed kernel | awk '/^Version/ {print $3}') + KERN_REL=$(yum info --installed kernel | awk '/^Release/ {print $3}') + CURRENT_KERNEL_VERSION="${KERN_VER}-${KERN_REL}" + else + echo "unsupported OS: ${OS_NAME}!" + exit -1 + fi + + # Get latest version available in repos + if [[ "${OS_NAME}" == "debian" ]]; then + apt-get -qq update + TARGET_VERSION=$(apt-cache show --no-all-versions linux-image-amd64 | awk '/^Version/ {print $2}') + elif [[ "${OS_NAME}" == "ubuntu" ]]; then + apt-get -qq update + LATEST_VERSION=$(apt-cache show --no-all-versions linux-image-gcp | awk '/^Version/ {print $2}') + TARGET_VERSION=`echo ${LATEST_VERSION} | perl -ne 'printf(q{%s-%s-gcp},/(\d+\.\d+\.\d+)\.(\d+)/)'` + elif [[ "${OS_NAME}" == "rocky" ]]; then + if yum info --available kernel ; then + KERN_VER=$(yum info --available kernel | awk '/^Version/ {print $3}') + KERN_REL=$(yum info --available kernel | awk '/^Release/ {print $3}') + TARGET_VERSION="${KERN_VER}-${KERN_REL}" + else + TARGET_VERSION="${CURRENT_KERNEL_VERSION}" + fi + fi + + # Skip this script if we are already on the target version + if [[ "${CURRENT_KERNEL_VERSION}" == "${TARGET_VERSION}" ]]; then + echo "target kernel version [${TARGET_VERSION}] is installed" + exit 0 + fi + + # Install the latest kernel + if [[ ${OS_NAME} == debian ]]; then + apt-get install -y linux-image-amd64 + elif [[ "${OS_NAME}" == "ubuntu" ]]; then + apt-get install -y linux-image-gcp + elif [[ "${OS_NAME}" == "rocky" ]]; then + dnf -y -q install kernel + fi + + # Make it possible to reboot before init actions are complete 
- #1033 + DP_ROOT=/usr/local/share/google/dataproc + STARTUP_SCRIPT="${DP_ROOT}/startup-script.sh" + POST_HDFS_STARTUP_SCRIPT="${DP_ROOT}/post-hdfs-startup-script.sh" + + for startup_script in ${STARTUP_SCRIPT} ${POST_HDFS_STARTUP_SCRIPT} ; do + sed -i -e 's:/usr/bin/env bash:/usr/bin/env bash\nexit 0:' ${startup_script} + done + + cp /var/log/dataproc-initialization-script-0.log /var/log/dataproc-initialization-script-0.log.0 + + systemctl reboot +} + function main() { + upgrade_kernel setup_gpu_yarn - if [[ "${RUNTIME}" == "SPARK" ]]; then install_spark_rapids configure_spark @@ -466,3 +529,5 @@ function main() { } main + +} \ No newline at end of file diff --git a/spark-rapids/test_spark_rapids.py b/spark-rapids/test_spark_rapids.py index f9d36181e..1967d139e 100644 --- a/spark-rapids/test_spark_rapids.py +++ b/spark-rapids/test_spark_rapids.py @@ -36,14 +36,12 @@ def verify_spark_job(self): @parameterized.parameters(("SINGLE", ["m"], GPU_T4), ("STANDARD", ["w-0"], GPU_T4)) def test_spark_rapids(self, configuration, machine_suffixes, accelerator): - if self.getImageOs() == "rocky": - self.skipTest("Not supported in Rocky Linux-based images") + + if self.getImageVersion() < pkg_resources.parse_version("2.0"): + self.skipTest("Not supported in pre 2.0 images") optional_components = None metadata = "gpu-driver-provider=NVIDIA,rapids-runtime=SPARK" - if self.getImageVersion() < pkg_resources.parse_version("2.0"): - optional_components = ["ANACONDA"] - metadata += ",cuda-version=10.1" self.createCluster( configuration, @@ -64,8 +62,6 @@ def test_spark_rapids(self, configuration, machine_suffixes, accelerator): @parameterized.parameters(("STANDARD", ["w-0"], GPU_T4, "11.2")) def test_non_default_cuda_versions(self, configuration, machine_suffixes, accelerator, cuda_version): - if self.getImageOs() == "rocky": - self.skipTest("Not supported in Rocky Linux-based images") if self.getImageVersion() < pkg_resources.parse_version("2.0"): self.skipTest("Not supported in pre 
2.0 images") From 4772eee7a84ef2f4f163c953a227e967320e801c Mon Sep 17 00:00:00 2001 From: liyuan Date: Fri, 2 Dec 2022 13:44:59 +0800 Subject: [PATCH 21/31] update exit to return 0 Signed-off-by: liyuan --- spark-rapids/spark-rapids.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spark-rapids/spark-rapids.sh b/spark-rapids/spark-rapids.sh index b57182e4e..ed64c9ab8 100644 --- a/spark-rapids/spark-rapids.sh +++ b/spark-rapids/spark-rapids.sh @@ -483,7 +483,7 @@ function upgrade_kernel() { # Skip this script if we are already on the target version if [[ "${CURRENT_KERNEL_VERSION}" == "${TARGET_VERSION}" ]]; then echo "target kernel version [${TARGET_VERSION}] is installed" - exit 0 + return 0 fi # Install the latest kernel From bf2d4425874c0b6657d667b32d105c606e75b0e2 Mon Sep 17 00:00:00 2001 From: liyuan Date: Fri, 2 Dec 2022 14:08:58 +0800 Subject: [PATCH 22/31] fix syntax error Signed-off-by: liyuan --- spark-rapids/spark-rapids.sh | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/spark-rapids/spark-rapids.sh b/spark-rapids/spark-rapids.sh index ed64c9ab8..607cdafd1 100644 --- a/spark-rapids/spark-rapids.sh +++ b/spark-rapids/spark-rapids.sh @@ -528,6 +528,4 @@ function main() { done } -main - -} \ No newline at end of file +main \ No newline at end of file From cff4bac96514e99f38ae1ae05d514fe85f3277b4 Mon Sep 17 00:00:00 2001 From: liyuan Date: Fri, 2 Dec 2022 16:01:44 +0800 Subject: [PATCH 23/31] update the driver version to latest because there are issues with 495 driver Signed-off-by: liyuan --- spark-rapids/spark-rapids.sh | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/spark-rapids/spark-rapids.sh b/spark-rapids/spark-rapids.sh index 607cdafd1..b32f0b42e 100644 --- a/spark-rapids/spark-rapids.sh +++ b/spark-rapids/spark-rapids.sh @@ -201,8 +201,9 @@ function install_nvidia_gpu_driver() { elif [[ ${OS_NAME} == rocky ]]; then execute_with_retries "dnf config-manager --add-repo 
${NVIDIA_ROCKY_REPO_URL}" execute_with_retries "dnf clean all" - execute_with_retries "dnf -y -q module install nvidia-driver:${DEFAULT_NVIDIA_DEBIAN_GPU_DRIVER_VERSION_PREFIX}-dkms" - execute_with_retries "dnf -y -q install cuda-${CUDA_VERSION//./-}" + # Always install the latest cuda/driver version because old driver version 495 has issues + execute_with_retries "dnf install -y -q nvidia-driver nvidia-settings" + execute_with_retries "dnf install -y -q cuda-driver" else echo "Unsupported OS: '${OS_NAME}'" exit 1 From e6b378145b9501d54e8a2f9b7c0b492abddc8aa1 Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Sat, 3 Dec 2022 19:51:42 -0800 Subject: [PATCH 24/31] restore sparkRapids/BUILD --- sparkRapids/BUILD | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) create mode 100644 sparkRapids/BUILD diff --git a/sparkRapids/BUILD b/sparkRapids/BUILD new file mode 100644 index 000000000..ccc1004e0 --- /dev/null +++ b/sparkRapids/BUILD @@ -0,0 +1,20 @@ +package(default_visibility = ["//visibility:public"]) + +exports_files(["spark-rapids.sh"]) + +py_test( + name = "test_sparkRapids", + size = "enormous", + srcs = ["test_sparkRapids.py"], + data = [ + "spark-rapids.sh", + "verify_xgboost_spark_rapids.scala", + ], + local = True, + shard_count = 3, + deps = [ + "//integration_tests:dataproc_test_case", + "@io_abseil_py//absl/testing:parameterized", + ], +) + From 2e98879a05bc9961cf80e4438419c104033bcd48 Mon Sep 17 00:00:00 2001 From: "C.J. 
Collier" Date: Sat, 3 Dec 2022 21:31:39 -0800 Subject: [PATCH 25/31] clean up potential package inconsistency --- spark-rapids/spark-rapids.sh | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/spark-rapids/spark-rapids.sh b/spark-rapids/spark-rapids.sh index b32f0b42e..65febdff3 100644 --- a/spark-rapids/spark-rapids.sh +++ b/spark-rapids/spark-rapids.sh @@ -484,6 +484,12 @@ function upgrade_kernel() { # Skip this script if we are already on the target version if [[ "${CURRENT_KERNEL_VERSION}" == "${TARGET_VERSION}" ]]; then echo "target kernel version [${TARGET_VERSION}] is installed" + + # Reboot may have interrupted dpkg. Bring package system to a good state + if [[ "${OS_NAME}" == "debian" || "${OS_NAME}" == "ubuntu" ]]; then + dpkg --configure -a + fi + return 0 fi @@ -511,6 +517,7 @@ function upgrade_kernel() { } function main() { + upgrade_kernel setup_gpu_yarn if [[ "${RUNTIME}" == "SPARK" ]]; then @@ -529,4 +536,4 @@ function main() { done } -main \ No newline at end of file +main From 8e28d95dd293c96fa640505bdb81db27b26c7c99 Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Sat, 3 Dec 2022 22:02:04 -0800 Subject: [PATCH 26/31] strip trailing whitespace --- spark-rapids/README.md | 8 ++++---- spark-rapids/spark-rapids.sh | 18 +++++++++--------- spark-rapids/test_spark_rapids.py | 2 +- spark-rapids/verify_xgboost_spark_rapids.scala | 2 +- 4 files changed, 15 insertions(+), 15 deletions(-) diff --git a/spark-rapids/README.md b/spark-rapids/README.md index 1b087792c..2f9fb1173 100644 --- a/spark-rapids/README.md +++ b/spark-rapids/README.md @@ -1,6 +1,6 @@ # SPARK-RAPIDS -The [RAPIDS Accelerator for Apache Spark](https://nvidia.github.io/spark-rapids/) leverages GPUs +The [RAPIDS Accelerator for Apache Spark](https://nvidia.github.io/spark-rapids/) leverages GPUs to accelerate processing via the [RAPIDS libraries](http://rapids.ai). 
This initialization action supports Spark runtimes for RAPIDS on [Google Cloud Dataproc](https://cloud.google.com/dataproc) cluster. @@ -117,7 +117,7 @@ If you click the operation "count at ...", you should see the graph of Spark Executors and some of those should have the "GPU" label as well. If you want to monitor GPU metrics on Dataproc, you can create the cluster with additional -[metadata](https://cloud.google.com/dataproc/docs/concepts/configuring-clusters/metadata) and +[metadata](https://cloud.google.com/dataproc/docs/concepts/configuring-clusters/metadata) and [scopes](https://cloud.google.com/sdk/gcloud/reference/dataproc/clusters/create#--scopes): ``` --metadata install-gpu-agent="true" @@ -131,6 +131,6 @@ we should be able to see "Resource & Metric" -> "VM Instance" -> "Custom": * **custom.googleapis.com/instance/gpu/memory_used** - Used memory of the GPU card. * **custom.googleapis.com/instance/gpu/memory_free** - Available memory of the GPU card. * **custom.googleapis.com/instance/gpu/temperature** - Temperature of the GPU. -The metrics are sent with attached label, marking them by the gpu_type and gpu_bus_id. +The metrics are sent with attached label, marking them by the gpu_type and gpu_bus_id. This way, instances with multiple GPUs attached can report the metrics of their cards separately. -You can later aggregate or filter those metrics in the Cloud Monitoring systems. \ No newline at end of file +You can later aggregate or filter those metrics in the Cloud Monitoring systems. diff --git a/spark-rapids/spark-rapids.sh b/spark-rapids/spark-rapids.sh index 65febdff3..cf4730e53 100644 --- a/spark-rapids/spark-rapids.sh +++ b/spark-rapids/spark-rapids.sh @@ -109,7 +109,7 @@ function install_spark_rapids() { local -r rapids_repo_url='https://repo1.maven.org/maven2/ai/rapids' local -r nvidia_repo_url='https://repo1.maven.org/maven2/com/nvidia' local -r dmlc_repo_url='https://repo.maven.apache.org/maven2/ml/dmlc' - + # Convert . 
to - for URL formatting local cudf_cuda_version="${CUDA_VERSION//\./-}" @@ -144,7 +144,7 @@ function configure_spark() { cat >>${SPARK_CONF_DIR}/spark-defaults.conf < Date: Sun, 4 Dec 2022 13:37:11 -0800 Subject: [PATCH 27/31] larger pd size to improve io throughput --- spark-rapids/test_spark_rapids.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/spark-rapids/test_spark_rapids.py b/spark-rapids/test_spark_rapids.py index 4218b3cd0..5a45289a4 100644 --- a/spark-rapids/test_spark_rapids.py +++ b/spark-rapids/test_spark_rapids.py @@ -51,6 +51,7 @@ def test_spark_rapids(self, configuration, machine_suffixes, accelerator): machine_type="n1-standard-4", master_accelerator=accelerator if configuration == "SINGLE" else None, worker_accelerator=accelerator, + boot_disk_size="1024GB", timeout_in_minutes=30) for machine_suffix in machine_suffixes: @@ -76,6 +77,7 @@ def test_non_default_cuda_versions(self, configuration, machine_suffixes, machine_type="n1-standard-4", master_accelerator=accelerator if configuration == "SINGLE" else None, worker_accelerator=accelerator, + boot_disk_size="1024GB", timeout_in_minutes=30) for machine_suffix in machine_suffixes: From 10181aaa6e51ff3c963d17b03488ac5c2d65dc89 Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Sun, 4 Dec 2022 14:17:33 -0800 Subject: [PATCH 28/31] only upgrade kernel if packages are not available for this version --- spark-rapids/spark-rapids.sh | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/spark-rapids/spark-rapids.sh b/spark-rapids/spark-rapids.sh index cf4730e53..62c24d4f3 100644 --- a/spark-rapids/spark-rapids.sh +++ b/spark-rapids/spark-rapids.sh @@ -518,7 +518,13 @@ function upgrade_kernel() { function main() { - upgrade_kernel + if [[ "${OS_NAME}" == "rocky" ]]; then + if dnf list kernel-devel-$(uname -r) && list kernel-headers-$(uname -r); then + echo "kernel devel and headers packages are available. Proceed without kernel upgrade." 
+ else + upgrade_kernel + fi + fi setup_gpu_yarn if [[ "${RUNTIME}" == "SPARK" ]]; then install_spark_rapids From 7f7dc2912739c8c56357a2edee35b5a9b3565fd9 Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Sun, 4 Dec 2022 14:20:51 -0800 Subject: [PATCH 29/31] we are installing the binary kernel driver ; no need for kernel headers or devel package --- spark-rapids/spark-rapids.sh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/spark-rapids/spark-rapids.sh b/spark-rapids/spark-rapids.sh index 62c24d4f3..607b7475c 100644 --- a/spark-rapids/spark-rapids.sh +++ b/spark-rapids/spark-rapids.sh @@ -202,8 +202,8 @@ function install_nvidia_gpu_driver() { execute_with_retries "dnf config-manager --add-repo ${NVIDIA_ROCKY_REPO_URL}" execute_with_retries "dnf clean all" # Always install the latest cuda/driver version because old driver version 495 has issues - execute_with_retries "dnf install -y -q nvidia-driver nvidia-settings" - execute_with_retries "dnf install -y -q cuda-driver" + execute_with_retries "dnf install -y -q nvidia-driver nvidia-settings cuda-driver" + modprobe nvidia else echo "Unsupported OS: '${OS_NAME}'" exit 1 @@ -415,7 +415,7 @@ function setup_gpu_yarn() { if [[ ${OS_NAME} == debian ]] || [[ ${OS_NAME} == ubuntu ]]; then execute_with_retries "apt-get install -y -q 'linux-headers-$(uname -r)'" elif [[ ${OS_NAME} == rocky ]]; then - execute_with_retries "dnf -y -q install kernel-devel-$(uname -r) kernel-headers-$(uname -r)" + echo "kernel devel and headers not required on rocky. installing from binary" fi # if mig is enabled drivers would have already been installed From 10c5c17130201608bef14696b4943a4a5857b647 Mon Sep 17 00:00:00 2001 From: "C.J. 
Collier" Date: Sun, 4 Dec 2022 15:54:55 -0800 Subject: [PATCH 30/31] 1.5-debian10 looks like it comes up to me --- spark-rapids/test_spark_rapids.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/spark-rapids/test_spark_rapids.py b/spark-rapids/test_spark_rapids.py index 5a45289a4..a37917812 100644 --- a/spark-rapids/test_spark_rapids.py +++ b/spark-rapids/test_spark_rapids.py @@ -37,8 +37,8 @@ def verify_spark_job(self): ("STANDARD", ["w-0"], GPU_T4)) def test_spark_rapids(self, configuration, machine_suffixes, accelerator): - if self.getImageVersion() < pkg_resources.parse_version("2.0"): - self.skipTest("Not supported in pre 2.0 images") + if self.getImageVersion() < pkg_resources.parse_version("2.0") and self.getImageOs() == "rocky": + self.skipTest("Not supported in pre 2.0 rocky images") optional_components = None metadata = "gpu-driver-provider=NVIDIA,rapids-runtime=SPARK" @@ -64,8 +64,8 @@ def test_spark_rapids(self, configuration, machine_suffixes, accelerator): def test_non_default_cuda_versions(self, configuration, machine_suffixes, accelerator, cuda_version): - if self.getImageVersion() < pkg_resources.parse_version("2.0"): - self.skipTest("Not supported in pre 2.0 images") + if self.getImageVersion() < pkg_resources.parse_version("2.0") and self.getImageOs() == "rocky": + self.skipTest("Not supported in pre 2.0 rocky images") metadata = ("gpu-driver-provider=NVIDIA,rapids-runtime=SPARK" ",cuda-version={}".format(cuda_version)) From 765400328215ad2abaa395a726492a538e08a694 Mon Sep 17 00:00:00 2001 From: "C.J. 
Collier" Date: Sun, 4 Dec 2022 16:48:42 -0800 Subject: [PATCH 31/31] no need to update or install the kernel devel package or gcc --- spark-rapids/spark-rapids.sh | 3 --- 1 file changed, 3 deletions(-) diff --git a/spark-rapids/spark-rapids.sh b/spark-rapids/spark-rapids.sh index 607b7475c..b5b3d0d74 100644 --- a/spark-rapids/spark-rapids.sh +++ b/spark-rapids/spark-rapids.sh @@ -384,10 +384,7 @@ function setup_gpu_yarn() { execute_with_retries "apt-get update" execute_with_retries "apt-get install -y -q pciutils" elif [[ ${OS_NAME} == rocky ]] ; then - execute_with_retries "dnf -y -q update" execute_with_retries "dnf -y -q install pciutils" - execute_with_retries "dnf -y -q install kernel-devel" - execute_with_retries "dnf -y -q install gcc" else echo "Unsupported OS: '${OS_NAME}'" exit 1