From ef2ced43791808c9d67ff2b381b263c4c24d2ed4 Mon Sep 17 00:00:00 2001 From: lachezar-n Date: Sun, 18 Feb 2024 17:07:12 +0100 Subject: [PATCH 1/3] Fixing AWS scripts for emr-7.0.0 --- scripts/aws/run_systemds_script.sh | 4 ++-- scripts/aws/spinup_systemds_cluster.sh | 22 ++++++++++++++++------ scripts/aws/systemds_cluster.config | 16 ++++++++-------- 3 files changed, 26 insertions(+), 16 deletions(-) diff --git a/scripts/aws/run_systemds_script.sh b/scripts/aws/run_systemds_script.sh index 4c58fcec9bb..db2d7185e24 100755 --- a/scripts/aws/run_systemds_script.sh +++ b/scripts/aws/run_systemds_script.sh @@ -32,7 +32,7 @@ fi source systemds_cluster.config -aws s3 cp $1 s3://system-ds-bucket/ --exclude "*" --include "*.dml" +aws s3 cp $1 s3://${BUCKET} --exclude "*" --include "*.dml" if [ ! -z "$2" ] then @@ -50,7 +50,7 @@ STEP_INFO=$(aws emr add-steps --cluster-id $CLUSTER_ID --steps "Type=Spark, --driver-memory,$SPARK_DRIVER_MEMORY, --num-executors,$SPARK_NUM_EXECUTORS, --conf,spark.driver.maxResultSize=0, - $SYSTEMDS_JAR_PATH, -f, s3://system-ds-bucket/$dml_filename, -exec, $SYSTEMDS_EXEC_MODE,$args,-stats, -explain]") + $SYSTEMDS_JAR_PATH, -f, s3://$BUCKET/$dml_filename, -exec, $SYSTEMDS_EXEC_MODE,$args,-stats, -explain]") STEP_ID=$(echo $STEP_INFO | jq .StepIds | tr -d '"' | tr -d ']' | tr -d '[' | tr -d '[:space:]' ) echo "Waiting for the step to finish" diff --git a/scripts/aws/spinup_systemds_cluster.sh b/scripts/aws/spinup_systemds_cluster.sh index 8c93a636eae..7cb4a05803e 100755 --- a/scripts/aws/spinup_systemds_cluster.sh +++ b/scripts/aws/spinup_systemds_cluster.sh @@ -46,9 +46,12 @@ set_config "SPARK_EXECUTOR_MEMORY" $SPARK_EXECUTOR_MEMORY set_config "SPARK_DRIVER_MEMORY" "1G" set_config "BUCKET" $BUCKET-$(((RANDOM % 999) + 1000)) -#Create systemDS bucket -aws s3api create-bucket --bucket $BUCKET --region $REGION &> /dev/null -aws s3api create-bucket --bucket $BUCKET-logs --region $REGION &> /dev/null +#Source again to update the changes for the current 
session +source systemds_cluster.config + +#Create systemDS bucket (a LocationConstraint is required for regions outside of us-east-1 and must be omitted for us-east-1, where S3 rejects it) +aws s3api create-bucket --bucket $BUCKET --region $REGION $([ "$REGION" != "us-east-1" ] && echo "--create-bucket-configuration LocationConstraint=$REGION") &> /dev/null +aws s3api create-bucket --bucket $BUCKET-logs --region $REGION $([ "$REGION" != "us-east-1" ] && echo "--create-bucket-configuration LocationConstraint=$REGION") &> /dev/null # Upload Jar and scripts to s3 aws s3 sync $SYSTEMDS_TARGET_DIRECTORY s3://$BUCKET --exclude "*" --include "*.dml" --include "*config.xml" --include "*DS.jar*" @@ -60,11 +63,18 @@ if [ ! -f ${KEYPAIR_NAME}.pem ]; then echo "${KEYPAIR_NAME}.pem private key created!" fi +#Get the first available subnet in the default VPC of the region +DEFAULT_SUBNET=$(aws ec2 describe-subnets --region eu-central-1 \ + --filter "Name=defaultForAz,Values=true" --query "Subnets[0].SubnetId" --output text) + #Create the cluster +#Note: Ganglia not available since emr-6.15.0: exchange with AmazonCloudWatchAgent +#Note: '--availability-zone ANY' enforce assigning a default subnet to the cluster CLUSTER_INFO=$(aws emr create-cluster \ - --applications Name=Ganglia Name=Spark \ + --applications Name=AmazonCloudWatchAgent Name=Spark \ --ec2-attributes '{"KeyName":"'${KEYPAIR_NAME}'", - "InstanceProfile":"EMR_EC2_DefaultRole"}'\ + "InstanceProfile":"EMR_EC2_DefaultRole", + "SubnetId": "'${DEFAULT_SUBNET}'"}'\ --service-role EMR_DefaultRole \ --enable-debugging \ --release-label $EMR_VERSION \ @@ -104,6 +114,6 @@ echo "Cluster info:" export CLUSTER_URL=$(aws emr describe-cluster --cluster-id $CLUSTER_ID | jq .Cluster.MasterPublicDnsName | tr -d '"') aws emr ssh --cluster-id $CLUSTER_ID --key-pair-file ${KEYPAIR_NAME}.pem --region $REGION \ - --command 'aws s3 cp s3://system-ds-bucket/target . --recursive --exclude "*" --include "*DS.jar*"' + --command 'aws s3 cp s3://'${BUCKET}' . --recursive --exclude "*" --include "*DS.jar*"' echo "Spinup finished." 
diff --git a/scripts/aws/systemds_cluster.config b/scripts/aws/systemds_cluster.config index a254bbc8649..731290cceda 100644 --- a/scripts/aws/systemds_cluster.config +++ b/scripts/aws/systemds_cluster.config @@ -22,13 +22,13 @@ # Configuration KEYPAIR_NAME="SystemDSkeynamex" -REGION="us-east-1" -BUCKET="systemds-bucket" -EMR_VERSION="emr-5.28.0" +REGION="eu-central-1" +BUCKET=systemds-bucket-1612-1480-1991-1609 +EMR_VERSION="emr-7.0.0" INSTANCES_TYPE="m5.xlarge" MASTER_INSTANCES_COUNT=1 -CORE_INSTANCES_COUNT=5 +CORE_INSTANCES_COUNT=2 SPARK_DEPLOY_MODE="client" # SystemDS specific @@ -38,10 +38,10 @@ SYSTEMDS_ARGS="" # Readonly # -CLUSTER_ID=j-cluster_id -SPARK_NUM_EXECUTORS=4 -SPARK_EXECUTOR_CORES=2 -SPARK_EXECUTOR_MEMORY=4800.00MB +CLUSTER_ID=j-1EJM2QSXSF1Z +SPARK_NUM_EXECUTORS=2 +SPARK_EXECUTOR_CORES=4 +SPARK_EXECUTOR_MEMORY=9600.00MB SPARK_DRIVER_MEMORY=1G SYSTEMDS_JAR_PATH="/home/hadoop/SystemDS.jar" # End - Readonly # From f3b58efb09abbe941ea2978c05cc00caf980d22b Mon Sep 17 00:00:00 2001 From: lachezar-n Date: Sun, 25 Feb 2024 12:02:29 +0100 Subject: [PATCH 2/3] AWS scripts for emr-7.0.0 update --- scripts/aws/spinup_systemds_cluster.sh | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/scripts/aws/spinup_systemds_cluster.sh b/scripts/aws/spinup_systemds_cluster.sh index 7cb4a05803e..58f9f2db059 100755 --- a/scripts/aws/spinup_systemds_cluster.sh +++ b/scripts/aws/spinup_systemds_cluster.sh @@ -63,13 +63,12 @@ if [ ! -f ${KEYPAIR_NAME}.pem ]; then echo "${KEYPAIR_NAME}.pem private key created!" 
fi -#Get the first available subnet in the default VPC of the region -DEFAULT_SUBNET=$(aws ec2 describe-subnets --region eu-central-1 \ +#Get the first available subnet in the default VPC of the configured region +DEFAULT_SUBNET=$(aws ec2 describe-subnets --region $REGION \ --filter "Name=defaultForAz,Values=true" --query "Subnets[0].SubnetId" --output text) #Create the cluster -#Note: Ganglia not available since emr-6.15.0: exchange with AmazonCloudWatchAgent -#Note: '--availability-zone ANY' enforce assigning a default subnet to the cluster +#Note: Ganglia not available since emr-6.15.0: exchanged with AmazonCloudWatchAgent CLUSTER_INFO=$(aws emr create-cluster \ --applications Name=AmazonCloudWatchAgent Name=Spark \ --ec2-attributes '{"KeyName":"'${KEYPAIR_NAME}'", From 2e6d542b792e1b0bdde625dc51e1a70a04090e30 Mon Sep 17 00:00:00 2001 From: lachezar-n Date: Sun, 25 Feb 2024 12:20:36 +0100 Subject: [PATCH 3/3] AWS scripts: rollback for cluster configs --- scripts/aws/systemds_cluster.config | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/scripts/aws/systemds_cluster.config b/scripts/aws/systemds_cluster.config index 731290cceda..8afed8d2bba 100644 --- a/scripts/aws/systemds_cluster.config +++ b/scripts/aws/systemds_cluster.config @@ -22,13 +22,13 @@ # Configuration KEYPAIR_NAME="SystemDSkeynamex" -REGION="eu-central-1" -BUCKET=systemds-bucket-1612-1480-1991-1609 +REGION="us-east-1" +BUCKET=systemds-bucket EMR_VERSION="emr-7.0.0" INSTANCES_TYPE="m5.xlarge" MASTER_INSTANCES_COUNT=1 -CORE_INSTANCES_COUNT=2 +CORE_INSTANCES_COUNT=5 SPARK_DEPLOY_MODE="client" # SystemDS specific @@ -38,10 +38,10 @@ SYSTEMDS_ARGS="" # Readonly # -CLUSTER_ID=j-1EJM2QSXSF1Z -SPARK_NUM_EXECUTORS=2 -SPARK_EXECUTOR_CORES=4 -SPARK_EXECUTOR_MEMORY=9600.00MB +CLUSTER_ID=j-cluster_id +SPARK_NUM_EXECUTORS=4 +SPARK_EXECUTOR_CORES=2 +SPARK_EXECUTOR_MEMORY=4800.00MB SPARK_DRIVER_MEMORY=1G SYSTEMDS_JAR_PATH="/home/hadoop/SystemDS.jar" # End - Readonly #