From 397ea29d99b44a39887724cd44ec4b3056791aee Mon Sep 17 00:00:00 2001
From: gurusai-voleti
Date: Fri, 16 Jan 2026 22:43:40 +0530
Subject: [PATCH 1/6] chore: Migrate gsutil usage to gcloud storage (#1703)

Co-authored-by: Andrew Gold <41129777+agold-rh@users.noreply.github.com>
---
 examples/dataflow-flex-python/README.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/examples/dataflow-flex-python/README.md b/examples/dataflow-flex-python/README.md
index 512671c604d..d486a7bc000 100644
--- a/examples/dataflow-flex-python/README.md
+++ b/examples/dataflow-flex-python/README.md
@@ -93,10 +93,10 @@ gcloud storage buckets create gs://$INPUT_BUCKET_NAME --location $LOCATION --pro
 # Create a bucket for dataflow staging and temp locations
 gcloud storage buckets create gs://$STAGING_BUCKET_NAME --location $LOCATION --project $PROJECT_ID

-gsutil iam ch serviceAccount:dataflow-worker-sa@$PROJECT_ID.iam.gserviceaccount.com:roles/storage.legacyBucketWriter gs://$STAGING_BUCKET_NAME
+gcloud storage buckets add-iam-policy-binding gs://$STAGING_BUCKET_NAME --member="serviceAccount:dataflow-worker-sa@$PROJECT_ID.iam.gserviceaccount.com" --role="roles/storage.legacyBucketWriter"

 # Assign Legacy Bucket Writer Role on Input bucket in order to move the object
-gsutil iam ch serviceAccount:dataflow-worker-sa@$PROJECT_ID.iam.gserviceaccount.com:roles/storage.legacyBucketWriter gs://$INPUT_BUCKET_NAME
+gcloud storage buckets add-iam-policy-binding gs://$INPUT_BUCKET_NAME --member="serviceAccount:dataflow-worker-sa@$PROJECT_ID.iam.gserviceaccount.com" --role="roles/storage.legacyBucketWriter"
 ```

 #### Create BQ Dataset

From 88b5b1c8f179dd7c6b0ff3c1e6a4e84c7e60fc77 Mon Sep 17 00:00:00 2001
From: gurusai-voleti
Date: Fri, 16 Jan 2026 22:47:19 +0530
Subject: [PATCH 2/6] chore: Migrate gsutil usage to gcloud storage (#1705)

Co-authored-by: Andrew Gold <41129777+agold-rh@users.noreply.github.com>
---
 examples/dataflow-production-ready/python/README.md | 3 +--
 .../dataflow-production-ready/run_system_integration_test.sh | 4 ++--
 examples/dataflow-production-ready/terraform/README.MD | 3 +--
 3 files changed, 4 insertions(+), 6 deletions(-)

diff --git a/examples/dataflow-production-ready/python/README.md b/examples/dataflow-production-ready/python/README.md
index 1eb7f5fec6d..03b37cc3774 100644
--- a/examples/dataflow-production-ready/python/README.md
+++ b/examples/dataflow-production-ready/python/README.md
@@ -37,7 +37,7 @@ gcloud config set project $GCP_PROJECT

 Then, create a GCS bucket for this demo
 ```
-gsutil mb -l $REGION -p $GCP_PROJECT gs://$BUCKET_NAME
+gcloud storage buckets create gs://$BUCKET_NAME --location $REGION --project $GCP_PROJECT
 ```

@@ -181,4 +181,3 @@ To trigger a build on certain actions (e.g. commits to master)
 2. Configure the trigger
 3. Point the trigger to the [cloudbuild.yaml](ml_preproc/cloudbuild.yaml) file in the repository
 4. Add the substitution variables as explained in the [Substitution variables](#substitution-variables) section.
-
diff --git a/examples/dataflow-production-ready/run_system_integration_test.sh b/examples/dataflow-production-ready/run_system_integration_test.sh
index cfee80ef6e4..5d8e468c2c2 100755
--- a/examples/dataflow-production-ready/run_system_integration_test.sh
+++ b/examples/dataflow-production-ready/run_system_integration_test.sh
@@ -53,8 +53,8 @@ SELECT r.flag AND e.flag FROM e, r"
 echo "Preparing GCP test resources.."
 gcloud config set project "${GCP_PROJECT}"

-gsutil mb -c standard -l "${REGION}" "${GCS_BUCKET}"
-gsutil cp "${LOCAL_INPUT_PATH}" "${GCS_BUCKET}/input/"
+gcloud storage buckets create --default-storage-class=standard --location="${REGION}" "${GCS_BUCKET}"
+gcloud storage cp "${LOCAL_INPUT_PATH}" "${GCS_BUCKET}/input/"
 #replace with terraform script and pass the dataset as var
 bq mk --location "${REGION}" "${DATASET}"
 bq mk --table "${RESULTS_TABLE}" schema/ml_preproc_results.json
diff --git a/examples/dataflow-production-ready/terraform/README.MD b/examples/dataflow-production-ready/terraform/README.MD
index e034d15171e..10ec580d607 100644
--- a/examples/dataflow-production-ready/terraform/README.MD
+++ b/examples/dataflow-production-ready/terraform/README.MD
@@ -1,4 +1,3 @@
-
 ## Intro

 This Terraform module will automate the creation of infrastructure components needed for the data pipeline.
@@ -22,7 +21,7 @@ export BQ_DATASET=
 * Update the Terraform backend bucket name in [backend.tf](backend.tf) (if necessary) and create a bucket with the same name
 ```
-gsutil mb -l $REGION -p $GCP_PROJECT gs://
+gcloud storage buckets create --location $REGION --project $GCP_PROJECT gs://
 ```

 * From the **repo root folder**, set up the environment via Cloud Build

From 88e642f41e8dcea57eddc268f5500f417f2c8a99 Mon Sep 17 00:00:00 2001
From: gurusai-voleti
Date: Fri, 16 Jan 2026 22:57:39 +0530
Subject: [PATCH 3/6] chore: Migrate gsutil usage to gcloud storage (#1706)

Co-authored-by: Andrew Gold <41129777+agold-rh@users.noreply.github.com>
---
 examples/dataflow-xml-pubsub-to-gcs/python/README.md | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/examples/dataflow-xml-pubsub-to-gcs/python/README.md b/examples/dataflow-xml-pubsub-to-gcs/python/README.md
index 0849cba35e2..56ec337fa3e 100644
--- a/examples/dataflow-xml-pubsub-to-gcs/python/README.md
+++ b/examples/dataflow-xml-pubsub-to-gcs/python/README.md
@@ -95,7 +95,7 @@ gcloud pubsub topics create $TOPIC_ID
 The output will write to a GCS bucket:
 ```
 export BUCKET_NAME=
-gsutil mb gs://$BUCKET_NAME
+gcloud storage buckets create gs://$BUCKET_NAME
 ```

 # 4. Run the test
@@ -151,8 +151,8 @@ monitoring screen.
 List the generated files in the GCS bucket and inspect their contents
 ```
-gsutil ls gs://${BUCKET_NAME}/output_location/
-gsutil cat gs://${BUCKET_NAME}/output_location/*
+gcloud storage ls gs://${BUCKET_NAME}/output_location/
+gcloud storage cat gs://${BUCKET_NAME}/output_location/*
 ```

 # 5. Clean up
@@ -164,11 +164,11 @@ gcloud pubsub topics delete $TOPIC_ID
 ```
 2. Delete the GCS files
 ```
-gsutil -m rm -rf "gs://${BUCKET_NAME}/output_location/*"
+gcloud storage rm --recursive --continue-on-error "gs://${BUCKET_NAME}/output_location/*"
 ```
 3. Remove the GCS bucket
 ```
-gsutil rb gs://${BUCKET_NAME}
+gcloud storage buckets delete gs://${BUCKET_NAME}
 ```
 4. **Optionally** Revoke the authentication credentials that you created, and delete the local credential file.
 ```

From 9a0ca045928b64ee6f1f764a35d35f3a0f13be90 Mon Sep 17 00:00:00 2001
From: gurusai-voleti
Date: Fri, 16 Jan 2026 23:09:31 +0530
Subject: [PATCH 4/6] chore: Migrate gsutil usage to gcloud storage (#1707)

Co-authored-by: Andrew Gold <41129777+agold-rh@users.noreply.github.com>
---
 examples/dataproc-gcs-connector/connectors.sh | 2 +-
 examples/dataproc-gcs-connector/test_gcs_connector.sh | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/examples/dataproc-gcs-connector/connectors.sh b/examples/dataproc-gcs-connector/connectors.sh
index f40087800a3..665032e8250 100644
--- a/examples/dataproc-gcs-connector/connectors.sh
+++ b/examples/dataproc-gcs-connector/connectors.sh
@@ -126,7 +126,7 @@ update_connector_url() {
  fi

  # UPDATED this line to pull correct GCS connector
- gsutil cp "gs://gcs-connector-init_actions/gcs-connector-${HADOOP_VERSION}-shaded.jar" "${vm_connectors_dir}/"
+ gcloud storage cp "gs://gcs-connector-init_actions/gcs-connector-${HADOOP_VERSION}-shaded.jar" "${vm_connectors_dir}/"

  local -r jar_name=${url##*/}

diff --git a/examples/dataproc-gcs-connector/test_gcs_connector.sh b/examples/dataproc-gcs-connector/test_gcs_connector.sh
index 39075692156..7aef25569ce 100755
--- a/examples/dataproc-gcs-connector/test_gcs_connector.sh
+++ b/examples/dataproc-gcs-connector/test_gcs_connector.sh
@@ -48,4 +48,4 @@ gcloud dataproc jobs submit hive --region=us-central1 \
 --cluster=${YOUR_CLUSTER} \
 -e="$Q1" -e="$Q2" -e="$Q3"

-gsutil cat gs://${YOUR_BUCKET}/data_files/top_ten/000000_0
+gcloud storage cat gs://${YOUR_BUCKET}/data_files/top_ten/000000_0

From c8f22bbf8cbcff9819bd9038e0f605d09a861a03 Mon Sep 17 00:00:00 2001
From: gurusai-voleti
Date: Fri, 16 Jan 2026 23:17:46 +0530
Subject: [PATCH 5/6] chore: Migrate gsutil usage to gcloud storage (#1708)

Co-authored-by: Andrew Gold <41129777+agold-rh@users.noreply.github.com>
---
 examples/dataproc-idle-shutdown/README.md | 4 ++--
 examples/dataproc-idle-shutdown/create-idlemonitoringjob.sh | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/examples/dataproc-idle-shutdown/README.md b/examples/dataproc-idle-shutdown/README.md
index 73b792056b2..2a0af476bec 100644
--- a/examples/dataproc-idle-shutdown/README.md
+++ b/examples/dataproc-idle-shutdown/README.md
@@ -36,7 +36,7 @@ git clone https://github.com/GoogleCloudPlatform/professional-services.git
 ```
 Copy all artifacts to Cloud Storage:
 ```
-gsutil cp ./professional-services/examples/dataproc-idle-check/*sh gs://
+gcloud storage cp ./professional-services/examples/dataproc-idle-check/*sh gs://
 ```

 ### Cluster start: Start the cluster specifying key parameters
 2. [Mandatory] Specify the location of the idle-check.sh script as the value of the metadata key “script_storage_location”. The location of the idle-check.sh script and the maximum idle time should be specified as metadata using the “script_storage_location” and “max-idle” keys, respectively.
 3. [Mandatory] Specify the maximum idle time to allow the cluster to be idle as the value of the metadata key “max-idle”. Similar to the parameter associated with Scheduled Cluster deletion, the max-idle duration parameter should be provided in IntegerUnit format, where the unit can be “s, m, h, d” (seconds, minutes, hours, days, respectively). Examples: “30m” or “1d” (30 minutes or 1 day from when the cluster becomes idle).
 4. [Optional] Specify, as the value of the metadata key “key_process_list”, a semi-colin separated list of process names (in addition to YARN jobs and active SSH connections) for which the cluster should be considered active.
-5. [Optional] Specify if the cluster should write diagnostic logs to the Cloud Storage staging bucket (TRUE/FALSE) as the value of the metadata key "persist_diagnostic_tarball" (TRUE). Unless specified, the default value is FALSE. The diagnostic output is saved in a folder specific to the job under which the DIAGNOSE command was run, the best way to locate the diagnostic output is " gsutil ls gs://[GCS STAGING BUCKET]/google-cloud-dataproc-metainfo/*/*/diagnostic.tar.gz".
+5. [Optional] Specify if the cluster should write diagnostic logs to the Cloud Storage staging bucket (TRUE/FALSE) as the value of the metadata key "persist_diagnostic_tarball" (TRUE). Unless specified, the default value is FALSE. The diagnostic output is saved in a folder specific to the job under which the DIAGNOSE command was run, the best way to locate the diagnostic output is " gcloud storage ls gs://[GCS STAGING BUCKET]/google-cloud-dataproc-metainfo/*/*/diagnostic.tar.gz".

 >Note: [Google APIs](https://developers.google.com/identity/protocols/googlescopes) must also be included in scopes in order for the scripts to read and write cluster metadata.

diff --git a/examples/dataproc-idle-shutdown/create-idlemonitoringjob.sh b/examples/dataproc-idle-shutdown/create-idlemonitoringjob.sh
index 6070b443841..9cbeec69cde 100644
--- a/examples/dataproc-idle-shutdown/create-idlemonitoringjob.sh
+++ b/examples/dataproc-idle-shutdown/create-idlemonitoringjob.sh
@@ -90,7 +90,7 @@ function startIdleJobChecker() {
 cd DataprocShutdown || exit

 # copy the script from GCS
- gsutil cp "${SCRIPT_STORAGE_LOCATION}/idle-check.sh" .
+ gcloud storage cp "${SCRIPT_STORAGE_LOCATION}/idle-check.sh" .
 # make it executable
 chmod 700 idle-check.sh
 # run IsIdle script

From 4fc61a468cde862ea5e8400947fe934a73838b24 Mon Sep 17 00:00:00 2001
From: gurusai-voleti
Date: Sat, 17 Jan 2026 02:08:06 +0530
Subject: [PATCH 6/6] chore: Migrate gsutil usage to gcloud storage (#1709)

Co-authored-by: Andrew Gold <41129777+agold-rh@users.noreply.github.com>
---
 .../dataproc-job-optimization-guide/README.md | 16 ++++++++--------
 .../scripts/setup.sh | 6 +++---
 2 files changed, 11 insertions(+), 11 deletions(-)

diff --git a/examples/dataproc-job-optimization-guide/README.md b/examples/dataproc-job-optimization-guide/README.md
index 586164f465e..5bbc4a33572 100755
--- a/examples/dataproc-job-optimization-guide/README.md
+++ b/examples/dataproc-job-optimization-guide/README.md
@@ -57,7 +57,7 @@ A sizing cluster can help determine the right number of workers for your applica
 ```bash
-gsutil -m rm -r gs://$BUCKET_NAME/transformed-$TIMESTAMP
+gcloud storage rm --recursive gs://$BUCKET_NAME/transformed-$TIMESTAMP
 gcloud dataproc jobs submit pyspark --region=$REGION --cluster=$CLUSTER_NAME-sizing scripts/spark_average_speed.py -- gs://$BUCKET_NAME/raw-$TIMESTAMP/ gs://$BUCKET_NAME/transformed-$TIMESTAMP/
 ```
@@ -93,7 +93,7 @@ gcloud dataproc clusters create $CLUSTER_NAME-testing-2x8-standard \
 --worker-boot-disk-size=1000GB \
 --region=$REGION

-gsutil -m rm -r gs://$BUCKET_NAME/transformed-$TIMESTAMP
+gcloud storage rm --recursive gs://$BUCKET_NAME/transformed-$TIMESTAMP
 gcloud dataproc jobs submit pyspark --region=$REGION --cluster=$CLUSTER_NAME-testing-2x8-standard scripts/spark_average_speed.py -- gs://$BUCKET_NAME/raw-$TIMESTAMP/ gs://$BUCKET_NAME/transformed-$TIMESTAMP/
 ```
@@ -114,7 +114,7 @@ gcloud dataproc clusters create $CLUSTER_NAME-testing-4x4-standard \
 --worker-boot-disk-size=1000GB \
 --region=$REGION

-gsutil -m rm -r gs://$BUCKET_NAME/transformed-$TIMESTAMP
+gcloud storage rm --recursive gs://$BUCKET_NAME/transformed-$TIMESTAMP
 gcloud dataproc jobs submit pyspark --region=$REGION --cluster=$CLUSTER_NAME-testing-4x4-standard scripts/spark_average_speed.py -- gs://$BUCKET_NAME/raw-$TIMESTAMP/ gs://$BUCKET_NAME/transformed-$TIMESTAMP/
 ```
@@ -135,7 +135,7 @@ gcloud dataproc clusters create $CLUSTER_NAME-testing-8x2-standard \
 --worker-boot-disk-size=1000GB \
 --region=$REGION

-gsutil -m rm -r gs://$BUCKET_NAME/transformed-$TIMESTAMP
+gcloud storage rm --recursive gs://$BUCKET_NAME/transformed-$TIMESTAMP
 gcloud dataproc jobs submit pyspark --region=$REGION --cluster=$CLUSTER_NAME-testing-8x2-standard scripts/spark_average_speed.py -- gs://$BUCKET_NAME/raw-$TIMESTAMP/ gs://$BUCKET_NAME/transformed-$TIMESTAMP/
 ```
@@ -169,7 +169,7 @@ gcloud dataproc clusters create $CLUSTER_NAME-testing-8x2-balanced \
 --worker-boot-disk-size=500GB \
 --region=$REGION

-gsutil -m rm -r gs://$BUCKET_NAME/transformed-$TIMESTAMP
+gcloud storage rm --recursive gs://$BUCKET_NAME/transformed-$TIMESTAMP
 gcloud dataproc jobs submit pyspark --region=$REGION --cluster=$CLUSTER_NAME-testing-8x2-balanced scripts/spark_average_speed.py -- gs://$BUCKET_NAME/raw-$TIMESTAMP/ gs://$BUCKET_NAME/transformed-$TIMESTAMP/
 ```
@@ -190,7 +190,7 @@ gcloud dataproc clusters create $CLUSTER_NAME-testing-8x2-ssd \
 --worker-boot-disk-size=250GB \
 --region=$REGION

-gsutil -m rm -r gs://$BUCKET_NAME/transformed-$TIMESTAMP
+gcloud storage rm --recursive gs://$BUCKET_NAME/transformed-$TIMESTAMP
 gcloud dataproc jobs submit pyspark --region=$REGION --cluster=$CLUSTER_NAME-testing-8x2-ssd scripts/spark_average_speed.py -- gs://$BUCKET_NAME/raw-$TIMESTAMP/ gs://$BUCKET_NAME/transformed-$TIMESTAMP/
 ```
@@ -213,7 +213,7 @@ gcloud dataproc clusters create $CLUSTER_NAME-testing-8x2-ssd-costop \
 --worker-boot-disk-size=30GB \
 --region=$REGION

-gsutil -m rm -r gs://$BUCKET_NAME/transformed-$TIMESTAMP
+gcloud storage rm --recursive gs://$BUCKET_NAME/transformed-$TIMESTAMP
 gcloud dataproc jobs submit pyspark --region=$REGION --cluster=$CLUSTER_NAME-testing-8x2-ssd-costop scripts/spark_average_speed.py -- gs://$BUCKET_NAME/raw-$TIMESTAMP/ gs://$BUCKET_NAME/transformed-$TIMESTAMP/
 ```
@@ -230,7 +230,7 @@ sample job submit:
 **2 x n2-standard-8-ssd-costop-appop = 1 min 15 seconds**
 ```bash
-gsutil -m rm -r gs://$BUCKET_NAME/transformed-$TIMESTAMP
+gcloud storage rm --recursive gs://$BUCKET_NAME/transformed-$TIMESTAMP
 gcloud dataproc jobs submit pyspark --region=$REGION --cluster=$CLUSTER_NAME-testing-8x2-ssd-costop scripts/spark_average_speed.py --properties='spark.executor.cores=5,spark.driver.cores=5,spark.executor.instances=1,spark.executor.memory=25459m,spark.driver.memory=25459m,spark.executor.memoryOverhead=2829m,spark.default.parallelism=10,spark.sql.shuffle.partitions=10,spark.shuffle.spill.compress=true,spark.checkpoint.compress=true,spark.io.compresion.codex=snappy,spark.dynamicAllocation=true,spark.shuffle.service.enabled=true' -- gs://$BUCKET_NAME/raw-$TIMESTAMP/ gs://$BUCKET_NAME/transformed-$TIMESTAMP/
 ```

diff --git a/examples/dataproc-job-optimization-guide/scripts/setup.sh b/examples/dataproc-job-optimization-guide/scripts/setup.sh
index 6e4f9c6e5e7..bf10e24e221 100755
--- a/examples/dataproc-job-optimization-guide/scripts/setup.sh
+++ b/examples/dataproc-job-optimization-guide/scripts/setup.sh
@@ -62,7 +62,7 @@
 echo "===================================================="
 echo " Removing old infrastructure ..."

-gsutil -m rm -r gs://"$bucket"
+gcloud storage rm --recursive gs://"$bucket"
 bq rm -t=true -f=true "$bucket".myTableCopy
 bq rm -t=true -f=true "$bucket".yellow_trips_copy
@@ -70,11 +70,11 @@ bq rm -t=true -f=true "$bucket".yellow_trips_copy
 echo "===================================================="
 echo " Building infrastructure ..."

-gsutil mb -c regional -l "$region" gs://"$bucket"
+gcloud storage buckets create --default-storage-class=regional --location="$region" gs://"$bucket"

 bq mk "$bucket"

-gsutil cp scripts/spark_average_speed.py gs://"$bucket"/scripts/spark_average_speed.py
+gcloud storage cp scripts/spark_average_speed.py gs://"$bucket"/scripts/spark_average_speed.py

 echo "===================================================="
 echo " Loading data ..."