From 6858bd4b9f5e46c7e6bc6eb8737e470517b2550e Mon Sep 17 00:00:00 2001 From: Sheypex Date: Tue, 20 Jun 2023 20:07:15 +0200 Subject: [PATCH 01/19] set up file structure for nn perf tests --- scripts/perftest/datagen/genNNData.sh | 68 ++++++++++++++ scripts/perftest/runAll.sh | 30 +++--- scripts/perftest/runAllNN.sh | 83 +++++++++++++++++ scripts/perftest/runNNNesterovClassify.sh | 57 ++++++++++++ scripts/perftest/runNNSimpleSGD.sh | 57 ++++++++++++ .../scripts/nnNesterovClassify-predict.dml | 92 +++++++++++++++++++ .../scripts/nnNesterovClassify-train.dml | 92 +++++++++++++++++++ .../perftest/scripts/nnSimpleSGD-predict.dml | 61 ++++++++++++ .../perftest/scripts/nnSimpleSGD-train.dml | 59 ++++++++++++ 9 files changed, 585 insertions(+), 14 deletions(-) create mode 100644 scripts/perftest/datagen/genNNData.sh create mode 100755 scripts/perftest/runAllNN.sh create mode 100755 scripts/perftest/runNNNesterovClassify.sh create mode 100755 scripts/perftest/runNNSimpleSGD.sh create mode 100644 scripts/perftest/scripts/nnNesterovClassify-predict.dml create mode 100644 scripts/perftest/scripts/nnNesterovClassify-train.dml create mode 100644 scripts/perftest/scripts/nnSimpleSGD-predict.dml create mode 100644 scripts/perftest/scripts/nnSimpleSGD-train.dml diff --git a/scripts/perftest/datagen/genNNData.sh b/scripts/perftest/datagen/genNNData.sh new file mode 100644 index 00000000000..255bfaff53a --- /dev/null +++ b/scripts/perftest/datagen/genNNData.sh @@ -0,0 +1,68 @@ +#!/bin/bash +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +#------------------------------------------------------------- +if [ "$(basename $PWD)" != "perftest" ]; +then + echo "Please execute scripts from directory 'perftest'" + exit 1; +fi + +CMD=$1 +DATADIR=$2/nn +MAXMEM=$3 + +FORMAT="text" # can be csv, mm, text, binary +DENSE_SP=0.9 +SPARSE_SP=0.01 + +echo "-- Generating NN data." >> results/times.txt; + +#generate XS scenarios (80MB) +if [ $MAXMEM -ge 80 ]; then + ${CMD} -f ../datagen/genRandData4ALS.dml --nvargs X=${DATADIR}/X10k_1k_dense rows=10000 cols=1000 rank=10 nnz=`echo "scale=0; 10000 * 1000 * $DENSE_SP" | bc` sigma=0.01 fmt=$FORMAT & + ${CMD} -f ../datagen/genRandData4ALS.dml --nvargs X=${DATADIR}/X10k_1k_sparse rows=10000 cols=1000 rank=10 nnz=`echo "scale=0; 10000 * 1000 * $SPARSE_SP" | bc` sigma=0.01 fmt=$FORMAT & +fi + +#generate S scenarios (800MB) +if [ $MAXMEM -ge 800 ]; then + ${CMD} -f ../datagen/genRandData4ALS.dml --nvargs X=${DATADIR}/X100k_1k_dense rows=100000 cols=1000 rank=10 nnz=`echo "scale=0; 100000 * 1000 * $DENSE_SP" | bc` sigma=0.01 fmt=$FORMAT & + ${CMD} -f ../datagen/genRandData4ALS.dml --nvargs X=${DATADIR}/X100k_1k_sparse rows=100000 cols=1000 rank=10 nnz=`echo "scale=0; 100000 * 1000 * $SPARSE_SP" | bc` sigma=0.01 fmt=$FORMAT & +fi + +#generate M scenarios (8GB) +if [ $MAXMEM -ge 8000 ]; then + ${CMD} -f ../datagen/genRandData4ALS.dml --nvargs X=${DATADIR}/X1M_1k_dense rows=1000000 cols=1000 rank=10 nnz=`echo "scale=0; 1000000 * 1000 * $DENSE_SP" | bc` sigma=0.01 fmt=$FORMAT & + ${CMD} -f ../datagen/genRandData4ALS.dml --nvargs X=${DATADIR}/X1M_1k_sparse rows=1000000 
cols=1000 rank=10 nnz=`echo "scale=0; 1000000 * 1000 * $SPARSE_SP" | bc` sigma=0.01 fmt=$FORMAT & +fi + +#generate L scenarios (80GB) +if [ $MAXMEM -ge 80000 ]; then + ${CMD} -f ../datagen/genRandData4ALS.dml --nvargs X=${DATADIR}/X10M_1k_dense rows=10000000 cols=1000 rank=10 nnz=`echo "scale=0; 10000000 * 1000 * $DENSE_SP" | bc` sigma=0.01 fmt=$FORMAT + ${CMD} -f ../datagen/genRandData4ALS.dml --nvargs X=${DATADIR}/X10M_1k_sparse rows=10000000 cols=1000 rank=10 nnz=`echo "scale=0; 10000000 * 1000 * $SPARSE_SP" | bc` sigma=0.01 fmt=$FORMAT +fi + +#generate XL scenarios (800GB) +if [ $MAXMEM -ge 800000 ]; then + ${CMD} -f ../datagen/genRandData4ALS.dml --nvargs X=${DATADIR}/X100M_1k_dense rows=100000000 cols=1000 rank=10 nnz=`echo "scale=0; 100000000 * 1000 * $DENSE_SP" | bc` sigma=0.01 fmt=$FORMAT + ${CMD} -f ../datagen/genRandData4ALS.dml --nvargs X=${DATADIR}/X100M_1k_sparse rows=100000000 cols=1000 rank=10 nnz=`echo "scale=0; 100000000 * 1000 * $SPARSE_SP" | bc` sigma=0.01 fmt=$FORMAT +fi + +wait \ No newline at end of file diff --git a/scripts/perftest/runAll.sh b/scripts/perftest/runAll.sh index db315597bf4..9a7d1337d12 100755 --- a/scripts/perftest/runAll.sh +++ b/scripts/perftest/runAll.sh @@ -103,13 +103,14 @@ echo -e "\n$HOSTNAME" >> results/times.txt echo -e "\n\n" >> results/times.txt ## Data Gen -# ./datagen/genBinomialData.sh ${CMD} ${TEMPFOLDER} ${MAXMEM} &> logs/genBinomialData.out -# ./datagen/genMultinomialData.sh ${CMD} ${TEMPFOLDER} ${MAXMEM} &> logs/genMultinomialData.out -# ./datagen/genDescriptiveStatisticsData.sh ${CMD} ${TEMPFOLDER} ${MAXMEM} &> logs/genStatsData.out -# ./datagen/genStratStatisticsData.sh ${CMD} ${TEMPFOLDER} ${MAXMEM} &> logs/genStratStatsData.out -# ./datagen/genClusteringData.sh ${CMD} ${TEMPFOLDER} ${MAXMEM} &> logs/genClusteringData.out -# ./datagen/genDimensionReductionData.sh ${CMD} ${TEMPFOLDER} ${MAXMEM} &> logs/genDimensionReductionData.out -# ./datagen/genALSData.sh ${CMD} ${TEMPFOLDER} ${MAXMEM} &> 
logs/genALSData.out +#./datagen/genBinomialData.sh ${CMD} ${TEMPFOLDER} ${MAXMEM} &> logs/genBinomialData.out +#./datagen/genMultinomialData.sh ${CMD} ${TEMPFOLDER} ${MAXMEM} &> logs/genMultinomialData.out +#./datagen/genDescriptiveStatisticsData.sh ${CMD} ${TEMPFOLDER} ${MAXMEM} &> logs/genStatsData.out +#./datagen/genStratStatisticsData.sh ${CMD} ${TEMPFOLDER} ${MAXMEM} &> logs/genStratStatsData.out +#./datagen/genClusteringData.sh ${CMD} ${TEMPFOLDER} ${MAXMEM} &> logs/genClusteringData.out +#./datagen/genDimensionReductionData.sh ${CMD} ${TEMPFOLDER} ${MAXMEM} &> logs/genDimensionReductionData.out +#./datagen/genALSData.sh ${CMD} ${TEMPFOLDER} ${MAXMEM} &> logs/genALSData.out +./datagen/genNNData.sh ${CMD} ${TEMPFOLDER} ${MAXMEM} &> logs/genNNData.out ### Micro Benchmarks: #./MatrixMult.sh ${CMD} @@ -119,13 +120,14 @@ echo -e "\n\n" >> results/times.txt #./fed/runAllFed.sh ${CMD} ${TEMPFOLDER} ${MAXMEM} ### Algorithms Benchmarks: -./runAllBinomial.sh ${CMD} ${TEMPFOLDER} ${MAXMEM} -./runAllMultinomial.sh ${CMD} ${TEMPFOLDER} ${MAXMEM} -./runAllRegression.sh ${CMD} ${TEMPFOLDER} ${MAXMEM} -./runAllStats.sh ${CMD} ${TEMPFOLDER} ${MAXMEM} -./runAllClustering.sh ${CMD} ${TEMPFOLDER} ${MAXMEM} -./runAllDimensionReduction.sh ${CMD} ${TEMPFOLDER} ${MAXMEM} -./runAllALS.sh ${CMD} ${TEMPFOLDER} ${MAXMEM} +#./runAllBinomial.sh ${CMD} ${TEMPFOLDER} ${MAXMEM} +#./runAllMultinomial.sh ${CMD} ${TEMPFOLDER} ${MAXMEM} +#./runAllRegression.sh ${CMD} ${TEMPFOLDER} ${MAXMEM} +#./runAllStats.sh ${CMD} ${TEMPFOLDER} ${MAXMEM} +#./runAllClustering.sh ${CMD} ${TEMPFOLDER} ${MAXMEM} +#./runAllDimensionReduction.sh ${CMD} ${TEMPFOLDER} ${MAXMEM} +#./runAllALS.sh ${CMD} ${TEMPFOLDER} ${MAXMEM} +./runAllNN.sh ${CMD} ${TEMPFOLDER} ${MAXMEM} # TODO The following benchmarks have yet to be written. The decision tree algorithms additionally need to be fixed. 
# add stepwise Linear diff --git a/scripts/perftest/runAllNN.sh b/scripts/perftest/runAllNN.sh new file mode 100755 index 00000000000..c8d3eb3a514 --- /dev/null +++ b/scripts/perftest/runAllNN.sh @@ -0,0 +1,83 @@ +#!/bin/bash +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +#------------------------------------------------------------- +if [ "$(basename $PWD)" != "perftest" ]; then + echo "Please execute scripts from directory 'perftest'" + exit 1 +fi + +COMMAND=$1 +TEMPFOLDER=$2 +MAXMEM=$3 + +if [ "$TEMPFOLDER" == "" ]; then TEMPFOLDER=temp; fi +BASE=${TEMPFOLDER}/nn +MAXITR=200 + +FILENAME=$0 +err_report() { + echo "Error in $FILENAME on line $1" +} +trap 'err_report $LINENO' ERR + +DATA=() # todo .. which data is needed? does the ALS data work? 
+if [ $MAXMEM -ge 80 ]; then DATA+=("10k_1k_dense" "10k_1k_sparse"); fi +if [ $MAXMEM -ge 800 ]; then DATA+=("100k_1k_dense" "100k_1k_sparse"); fi +if [ $MAXMEM -ge 8000 ]; then DATA+=("1M_1k_dense" "1M_1k_sparse"); fi +if [ $MAXMEM -ge 80000 ]; then DATA+=("10M_1k_dense" "10M_1k_sparse"); fi +if [ $MAXMEM -ge 800000 ]; then DATA+=("100M_1k_dense" "100M_1k_sparse"); fi + +echo "RUN NEURAL NETWORK EXPERIMENTS" $(date) >>results/times.txt + +for d in ${DATA[@]}; do #"_KDD" + + # ------------------------------------------------------------------------------------------------------------------- + # TODO return an additional output to preserve the internal scaling from training (for the built-in functions lmCG and lmDS). + # The original scripts algorithms/LinearRegCG.dml and algorithms/LinearRegDS.dml do have that additional output column, but the respective built-in functions do not. + # ------------------------------------------------------------------------------------------------------------------- + + # for f in "runLinearRegDS" + # do + # echo "-- Running "$f" on "$d" (all configs)" >> results/times.txt; + # ./${f}.sh ${BASE}/X${d} ${BASE}/y${d} ${BASE} ${COMMAND} &> logs/${f}_${d}.out; + # done + # + # # run with the parameter setting maximum of iterations + # for f in "runLinearRegCG" "runGLM_poisson_log" "runGLM_gamma_log" "runGLM_binomial_probit" + # do + # echo "-- Running "$f" on "$d" (all configs)" >> results/times.txt; + # ./${f}.sh ${BASE}/X${d} ${BASE}/y${d} ${BASE} ${MAXITR} ${COMMAND} &> logs/${f}_${d}.out; + # done + + # Regression tasks + for f in "runNNSimpleSGD"; do + echo "-- Running "$f" on "$d" (all configs)" >>results/times.txt + ./${f}.sh ${BASE}/X${d} ${BASE}/y${d} ${BASE} ${COMMAND} &>logs/${f}_${d}.out + done + + # Classification tasks + for f in "runNNNesterovClassify"; do + echo "-- Running "$f" on "$d" (all configs)" >>results/times.txt + ./${f}.sh ${BASE}/X${d} ${BASE}/y${d} ${BASE} ${COMMAND} &>logs/${f}_${d}.out + done +done + +echo 
-e "\n\n" >>results/times.txt diff --git a/scripts/perftest/runNNNesterovClassify.sh b/scripts/perftest/runNNNesterovClassify.sh new file mode 100755 index 00000000000..2f71a8911f7 --- /dev/null +++ b/scripts/perftest/runNNNesterovClassify.sh @@ -0,0 +1,57 @@ +#!/bin/bash +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+# +#------------------------------------------------------------- +# params: +# 1) X data +# 2) Y data +# 3) path of base temp dir +# 4) command for systemds +set -e + +if [ "$(basename $PWD)" != "perftest" ]; then + echo "Please execute scripts from directory 'perftest'" + exit 1 +fi + +CMD=$4 +BASE=$3 + +echo "running sgd nn classifier with nesterov momentum" + +#training +tstart=$(date +%s.%N) +${CMD} -f scripts/nnNesterovClassify-train.dml \ + --config conf/SystemDS-config.xml \ + --stats \ + --nvargs X=$1 Y=$2 B=${BASE}/b fmt="csv" + +ttrain=$(echo "$(date +%s.%N) - $tstart - .4" | bc) +echo "nesterov momentum neural network trained with SGD on "$1": "$ttrain >>results/times.txt + +#predict +tstart=$(date +%s.%N) +${CMD} -f scripts/nnNesterovClassify-predict.dml \ + --config conf/SystemDS-config.xml \ + --stats \ + --nvargs fmt=csv X=$1_test B=${BASE}/b Y=$2_test M=${BASE}/m O=${BASE}/out.csv + +tpredict=$(echo "$(date +%s.%N) - $tstart - .4" | bc) +echo "nesterov momentum neural network trained with SGD predicted on "$1": "$tpredict >>results/times.txt diff --git a/scripts/perftest/runNNSimpleSGD.sh b/scripts/perftest/runNNSimpleSGD.sh new file mode 100755 index 00000000000..2ab5365de07 --- /dev/null +++ b/scripts/perftest/runNNSimpleSGD.sh @@ -0,0 +1,57 @@ +#!/bin/bash +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +#------------------------------------------------------------- +# params: +# 1) X data +# 2) Y data +# 3) path of base temp dir +# 4) command for systemds +set -e + +if [ "$(basename $PWD)" != "perftest" ]; then + echo "Please execute scripts from directory 'perftest'" + exit 1 +fi + +CMD=$4 +BASE=$3 + +echo "running simple sgd neural network" + +#training +tstart=$(date +%s.%N) +${CMD} -f scripts/nnSimpleSGD-train.dml \ + --config conf/SystemDS-config.xml \ + --stats \ + --nvargs X=$1 Y=$2 B=${BASE}/b fmt="csv" + +ttrain=$(echo "$(date +%s.%N) - $tstart - .4" | bc) +echo "simple neural network trained with SGD on "$1": "$ttrain >>results/times.txt + +#predict +tstart=$(date +%s.%N) +${CMD} -f scripts/nnSimpleSGD-predict.dml \ + --config conf/SystemDS-config.xml \ + --stats \ + --nvargs fmt=csv X=$1_test B=${BASE}/b Y=$2_test M=${BASE}/m O=${BASE}/out.csv + +tpredict=$(echo "$(date +%s.%N) - $tstart - .4" | bc) +echo "simple neural network trained with SGD predicted on "$1": "$tpredict >>results/times.txt diff --git a/scripts/perftest/scripts/nnNesterovClassify-predict.dml b/scripts/perftest/scripts/nnNesterovClassify-predict.dml new file mode 100644 index 00000000000..74da3abdb0d --- /dev/null +++ b/scripts/perftest/scripts/nnNesterovClassify-predict.dml @@ -0,0 +1,92 @@ +# Imports +source("../../nn/layers/affine.dml") as affine +source("../../nn/layers/cross_entropy_loss.dml") as cross_entropy_loss +source("../../nn/layers/dropout.dml") as dropout +source("../../nn/layers/relu.dml") as relu +source("../../nn/layers/softmax.dml") as softmax 
+source("../../nn/optim/sgd_nesterov.dml") as sgd_nesterov + +# Generate input data +N = 1024 # num examples +D = 100 # num features +t = 5 # num targets +X = rand(rows=N, cols=D, pdf="normal") +classes = round(rand(rows=N, cols=1, min=1, max=t, pdf="uniform")) +y = matrix(0, rows=N, cols=t) +parfor (i in 1:N) { + y[i, as.scalar(classes[i,1])] = 1 # one-hot encoding +} + +# Create network: +# affine1 -> relu1 -> dropout1 -> affine2 -> relu2 -> dropout2 -> affine3 -> softmax +H1 = 64 # number of neurons in 1st hidden layer +H2 = 64 # number of neurons in 2nd hidden layer +p = 0.5 # dropout probability +[W1, b1] = affine::init(D, H1, -1) +[W2, b2] = affine::init(H1, H2, -1) +[W3, b3] = affine::init(H2, t, -1) + +# Initialize SGD w/ Nesterov momentum optimizer +lr = 0.05 # learning rate +mu = 0.5 # momentum +decay = 0.99 # learning rate decay constant +vW1 = sgd_nesterov::init(W1); vb1 = sgd_nesterov::init(b1) +vW2 = sgd_nesterov::init(W2); vb2 = sgd_nesterov::init(b2) +vW3 = sgd_nesterov::init(W3); vb3 = sgd_nesterov::init(b3) + +# Optimize +print("Starting optimization") +batch_size = 64 +epochs = 10 +iters = 1024 / batch_size +for (e in 1:epochs) { + for(i in 1:iters) { + # Get next batch + X_batch = X[i:i+batch_size-1,] + y_batch = y[i:i+batch_size-1,] + + # Compute forward pass + ## layer 1: + out1 = affine::forward(X_batch, W1, b1) + outr1 = relu::forward(out1) + [outd1, maskd1] = dropout::forward(outr1, p, -1) + ## layer 2: + out2 = affine::forward(outd1, W2, b2) + outr2 = relu::forward(out2) + [outd2, maskd2] = dropout::forward(outr2, p, -1) + ## layer 3: + out3 = affine::forward(outd2, W3, b3) + probs = softmax::forward(out3) + + # Compute loss + loss = cross_entropy_loss::forward(probs, y_batch) + print("Cross entropy loss: " + loss) + + # Compute backward pass + ## loss: + dprobs = cross_entropy_loss::backward(probs, y_batch) + ## layer 3: + dout3 = softmax::backward(dprobs, out3) + [doutd2, dW3, db3] = affine::backward(dout3, outd2, W3, b3) + ## layer 2: + 
doutr2 = dropout::backward(doutd2, outr2, p, maskd2) + dout2 = relu::backward(doutr2, out2) + [doutd1, dW2, db2] = affine::backward(dout2, outd1, W2, b2) + ## layer 1: + doutr1 = dropout::backward(doutd1, outr1, p, maskd1) + dout1 = relu::backward(doutr1, out1) + [dX_batch, dW1, db1] = affine::backward(dout1, X_batch, W1, b1) + + # Optimize with SGD w/ Nesterov momentum + [W1, vW1] = sgd_nesterov::update(W1, dW1, lr, mu, vW1) + [b1, vb1] = sgd_nesterov::update(b1, db1, lr, mu, vb1) + [W2, vW2] = sgd_nesterov::update(W2, dW2, lr, mu, vW2) + [b2, vb2] = sgd_nesterov::update(b2, db2, lr, mu, vb2) + [W3, vW3] = sgd_nesterov::update(W3, dW3, lr, mu, vW3) + [b3, vb3] = sgd_nesterov::update(b3, db3, lr, mu, vb3) + } + # Anneal momentum towards 0.999 + mu = mu + (0.999 - mu)/(1+epochs-e) + # Decay learning rate + lr = lr * decay +} \ No newline at end of file diff --git a/scripts/perftest/scripts/nnNesterovClassify-train.dml b/scripts/perftest/scripts/nnNesterovClassify-train.dml new file mode 100644 index 00000000000..74da3abdb0d --- /dev/null +++ b/scripts/perftest/scripts/nnNesterovClassify-train.dml @@ -0,0 +1,92 @@ +# Imports +source("../../nn/layers/affine.dml") as affine +source("../../nn/layers/cross_entropy_loss.dml") as cross_entropy_loss +source("../../nn/layers/dropout.dml") as dropout +source("../../nn/layers/relu.dml") as relu +source("../../nn/layers/softmax.dml") as softmax +source("../../nn/optim/sgd_nesterov.dml") as sgd_nesterov + +# Generate input data +N = 1024 # num examples +D = 100 # num features +t = 5 # num targets +X = rand(rows=N, cols=D, pdf="normal") +classes = round(rand(rows=N, cols=1, min=1, max=t, pdf="uniform")) +y = matrix(0, rows=N, cols=t) +parfor (i in 1:N) { + y[i, as.scalar(classes[i,1])] = 1 # one-hot encoding +} + +# Create network: +# affine1 -> relu1 -> dropout1 -> affine2 -> relu2 -> dropout2 -> affine3 -> softmax +H1 = 64 # number of neurons in 1st hidden layer +H2 = 64 # number of neurons in 2nd hidden layer +p = 0.5 # 
dropout probability +[W1, b1] = affine::init(D, H1, -1) +[W2, b2] = affine::init(H1, H2, -1) +[W3, b3] = affine::init(H2, t, -1) + +# Initialize SGD w/ Nesterov momentum optimizer +lr = 0.05 # learning rate +mu = 0.5 # momentum +decay = 0.99 # learning rate decay constant +vW1 = sgd_nesterov::init(W1); vb1 = sgd_nesterov::init(b1) +vW2 = sgd_nesterov::init(W2); vb2 = sgd_nesterov::init(b2) +vW3 = sgd_nesterov::init(W3); vb3 = sgd_nesterov::init(b3) + +# Optimize +print("Starting optimization") +batch_size = 64 +epochs = 10 +iters = 1024 / batch_size +for (e in 1:epochs) { + for(i in 1:iters) { + # Get next batch + X_batch = X[i:i+batch_size-1,] + y_batch = y[i:i+batch_size-1,] + + # Compute forward pass + ## layer 1: + out1 = affine::forward(X_batch, W1, b1) + outr1 = relu::forward(out1) + [outd1, maskd1] = dropout::forward(outr1, p, -1) + ## layer 2: + out2 = affine::forward(outd1, W2, b2) + outr2 = relu::forward(out2) + [outd2, maskd2] = dropout::forward(outr2, p, -1) + ## layer 3: + out3 = affine::forward(outd2, W3, b3) + probs = softmax::forward(out3) + + # Compute loss + loss = cross_entropy_loss::forward(probs, y_batch) + print("Cross entropy loss: " + loss) + + # Compute backward pass + ## loss: + dprobs = cross_entropy_loss::backward(probs, y_batch) + ## layer 3: + dout3 = softmax::backward(dprobs, out3) + [doutd2, dW3, db3] = affine::backward(dout3, outd2, W3, b3) + ## layer 2: + doutr2 = dropout::backward(doutd2, outr2, p, maskd2) + dout2 = relu::backward(doutr2, out2) + [doutd1, dW2, db2] = affine::backward(dout2, outd1, W2, b2) + ## layer 1: + doutr1 = dropout::backward(doutd1, outr1, p, maskd1) + dout1 = relu::backward(doutr1, out1) + [dX_batch, dW1, db1] = affine::backward(dout1, X_batch, W1, b1) + + # Optimize with SGD w/ Nesterov momentum + [W1, vW1] = sgd_nesterov::update(W1, dW1, lr, mu, vW1) + [b1, vb1] = sgd_nesterov::update(b1, db1, lr, mu, vb1) + [W2, vW2] = sgd_nesterov::update(W2, dW2, lr, mu, vW2) + [b2, vb2] = sgd_nesterov::update(b2, db2, 
lr, mu, vb2) + [W3, vW3] = sgd_nesterov::update(W3, dW3, lr, mu, vW3) + [b3, vb3] = sgd_nesterov::update(b3, db3, lr, mu, vb3) + } + # Anneal momentum towards 0.999 + mu = mu + (0.999 - mu)/(1+epochs-e) + # Decay learning rate + lr = lr * decay +} \ No newline at end of file diff --git a/scripts/perftest/scripts/nnSimpleSGD-predict.dml b/scripts/perftest/scripts/nnSimpleSGD-predict.dml new file mode 100644 index 00000000000..f673d556955 --- /dev/null +++ b/scripts/perftest/scripts/nnSimpleSGD-predict.dml @@ -0,0 +1,61 @@ +# Imports +source("../../nn/layers/affine.dml") as affine +source("../../nn/layers/l2_loss.dml") as l2_loss +source("../../nn/layers/relu.dml") as relu +source("../../nn/optim/sgd.dml") as sgd + +# todo scrap this and write it to use given weights to make a prediction + +# Generate input data +N = 1024 # num examples # todo parameterize this based on given data +D = 100 # num features +t = 1 # num targets +X = rand(rows=N, cols=D, pdf="normal") # todo get this data from outside +y = rand(rows=N, cols=t) + +# Create 2-layer network: +## affine1 -> relu1 -> affine2 +M = 64 # number of neurons # todo parameterize this +[W1, b1] = affine::init(D, M, -1) +[W2, b2] = affine::init(M, t, -1) + +# Initialize optimizer +lr = 0.05 # learning rate +mu = 0.9 # momentum +decay = 0.99 # learning rate decay constant + +# Optimize +print("Starting optimization") +batch_size = 32 +epochs = 5 +iters = 1024 / batch_size +for (e in 1:epochs) { + for(i in 1:iters) { + # Get next batch + X_batch = X[i:i+batch_size-1,] + y_batch = y[i:i+batch_size-1,] + + # Compute forward pass + out1 = affine::forward(X_batch, W1, b1) + outr1 = relu::forward(out1) + out2 = affine::forward(outr1, W2, b2) + + # Compute loss + loss = l2_loss::forward(out2, y_batch) + print("L2 loss: " + loss) + + # Compute backward pass + dout2 = l2_loss::backward(out2, y_batch) + [doutr1, dW2, db2] = affine::backward(dout2, outr1, W2, b2) + dout1 = relu::backward(doutr1, out1) + [dX_batch, dW1, db1] = 
affine::backward(dout1, X_batch, W1, b1) + + # Optimize with vanilla SGD + W1 = sgd::update(W1, dW1, lr) + b1 = sgd::update(b1, db1, lr) + W2 = sgd::update(W2, dW2, lr) + b2 = sgd::update(b2, db2, lr) + } + # Decay learning rate + lr = lr * decay +} # todo save learned params \ No newline at end of file diff --git a/scripts/perftest/scripts/nnSimpleSGD-train.dml b/scripts/perftest/scripts/nnSimpleSGD-train.dml new file mode 100644 index 00000000000..fd66c9d31ea --- /dev/null +++ b/scripts/perftest/scripts/nnSimpleSGD-train.dml @@ -0,0 +1,59 @@ +# Imports +source("../../nn/layers/affine.dml") as affine +source("../../nn/layers/l2_loss.dml") as l2_loss +source("../../nn/layers/relu.dml") as relu +source("../../nn/optim/sgd.dml") as sgd + +# Generate input data +N = 1024 # num examples # todo parameterize this based on given data +D = 100 # num features +t = 1 # num targets +X = rand(rows=N, cols=D, pdf="normal") # todo get this data from outside +y = rand(rows=N, cols=t) + +# Create 2-layer network: +## affine1 -> relu1 -> affine2 +M = 64 # number of neurons # todo parameterize this +[W1, b1] = affine::init(D, M, -1) +[W2, b2] = affine::init(M, t, -1) + +# Initialize optimizer +lr = 0.05 # learning rate +mu = 0.9 # momentum +decay = 0.99 # learning rate decay constant + +# Optimize +print("Starting optimization") +batch_size = 32 +epochs = 5 +iters = 1024 / batch_size +for (e in 1:epochs) { + for(i in 1:iters) { + # Get next batch + X_batch = X[i:i+batch_size-1,] + y_batch = y[i:i+batch_size-1,] + + # Compute forward pass + out1 = affine::forward(X_batch, W1, b1) + outr1 = relu::forward(out1) + out2 = affine::forward(outr1, W2, b2) + + # Compute loss + loss = l2_loss::forward(out2, y_batch) + print("L2 loss: " + loss) + + # Compute backward pass + dout2 = l2_loss::backward(out2, y_batch) + [doutr1, dW2, db2] = affine::backward(dout2, outr1, W2, b2) + dout1 = relu::backward(doutr1, out1) + [dX_batch, dW1, db1] = affine::backward(dout1, X_batch, W1, b1) + + # Optimize 
with vanilla SGD + W1 = sgd::update(W1, dW1, lr) + b1 = sgd::update(b1, db1, lr) + W2 = sgd::update(W2, dW2, lr) + b2 = sgd::update(b2, db2, lr) + } + # Decay learning rate + lr = lr * decay +} # todo save learned params \ No newline at end of file From 361b340420ebf520b6f41c7717862c27c77b88dc Mon Sep 17 00:00:00 2001 From: Sheypex Date: Thu, 22 Jun 2023 18:09:49 +0200 Subject: [PATCH 02/19] got datagen for nn tests running --- .../datagen/genRandData4NNClassification.dml | 67 +++++++++++++++++++ scripts/datagen/genRandData4NNRegression.dml | 63 +++++++++++++++++ scripts/perftest/datagen/genNNData.sh | 27 ++++---- 3 files changed, 143 insertions(+), 14 deletions(-) create mode 100644 scripts/datagen/genRandData4NNClassification.dml create mode 100644 scripts/datagen/genRandData4NNRegression.dml mode change 100644 => 100755 scripts/perftest/datagen/genNNData.sh diff --git a/scripts/datagen/genRandData4NNClassification.dml b/scripts/datagen/genRandData4NNClassification.dml new file mode 100644 index 00000000000..5632877f35e --- /dev/null +++ b/scripts/datagen/genRandData4NNClassification.dml @@ -0,0 +1,67 @@ +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+# +#------------------------------------------------------------- + +# +# Generates random data to test neural network classification: features X drawn with pdf="normal" and one-hot encoded class labels Y +# +# INPUT PARAMETERS: +# ---------------------------------------------------------------------------- +# NAME TYPE DEFAULT MEANING +# ---------------------------------------------------------------------------- +# nr Int 1024 Number of examples +# nf Int 100 Number of features +# nt Int 5 Number of targets +# X String "X" Location to write X data +# Y String "Y" Location to write Y data +# fmt Format specifier csv Format of output data +# ---------------------------------------------------------------------------- +# +# Example: +# hadoop jar SystemDS.jar -f genRandData4NNClassification.dml -nvargs nr=1024 nf=100 nt=5 + +print ("BEGIN NN CLASSIFICATION GENERATOR SCRIPT"); + +num_records = ifdef($nr, 1024); +num_features = ifdef($nf, 100); +num_targets = ifdef($nt, 5); + +fileX = ifdef ($X, "X"); +fileY = ifdef ($Y, "Y"); +fmt = ifdef ($fmt, "csv"); + +# Generate input data +N = num_records # num examples +D = num_features # num features +t = num_targets # num targets +X = rand(rows=N, cols=D, pdf="normal") +classes = round(rand(rows=N, cols=1, min=1, max=t, pdf="uniform")) +Y = matrix(0, rows=N, cols=t) +parfor (i in 1:N) { + Y[i, as.scalar(classes[i,1])] = 1 # one-hot encoding +} + +print ("Writing out the resulting dataset..."); + +write (X, fileX, format = fmt); +write (Y, fileY, format = fmt); + +print ("DONE: NN CLASSIFICATION GENERATOR SCRIPT"); + diff --git a/scripts/datagen/genRandData4NNRegression.dml b/scripts/datagen/genRandData4NNRegression.dml new file mode 100644 index 00000000000..313a31d17f4 --- /dev/null +++ b/scripts/datagen/genRandData4NNRegression.dml @@ -0,0 +1,63 @@ +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements.
See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +#------------------------------------------------------------- + +# +# Generates random data to test neural network regression: features X drawn with pdf="normal" and targets Y drawn with rand() defaults +# +# INPUT PARAMETERS: +# ---------------------------------------------------------------------------- +# NAME TYPE DEFAULT MEANING +# ---------------------------------------------------------------------------- +# nr Int 1024 Number of examples +# nf Int 100 Number of features +# nt Int 1 Number of targets +# X String "X" Location to write X data +# Y String "Y" Location to write Y data +# fmt Format specifier csv Format of output data +# ---------------------------------------------------------------------------- +# +# Example: +# hadoop jar SystemDS.jar -f genRandData4NNRegression.dml -nvargs nr=1024 nf=100 nt=1 + +print ("BEGIN NN REGRESSION GENERATOR SCRIPT"); + +num_records = ifdef($nr, 1024); +num_features = ifdef($nf, 100); +num_targets = ifdef($nt, 1); + +fileX = ifdef ($X, "X"); +fileY = ifdef ($Y, "Y"); +fmt = ifdef ($fmt, "csv"); + +# Generate input data +N = num_records # num examples +D = num_features # num features +t = num_targets # num targets +X = rand(rows=N, cols=D, pdf="normal") +Y = rand(rows=N, cols=t) + +print ("Writing out the resulting dataset..."); + +write (X, fileX, format = fmt); +write (Y, fileY, format = fmt); +
+print ("DONE: NN REGRESSION GENERATOR SCRIPT"); + diff --git a/scripts/perftest/datagen/genNNData.sh b/scripts/perftest/datagen/genNNData.sh old mode 100644 new mode 100755 index 255bfaff53a..236d0bfeed9 --- a/scripts/perftest/datagen/genNNData.sh +++ b/scripts/perftest/datagen/genNNData.sh @@ -29,40 +29,39 @@ CMD=$1 DATADIR=$2/nn MAXMEM=$3 -FORMAT="text" # can be csv, mm, text, binary -DENSE_SP=0.9 -SPARSE_SP=0.01 +FORMAT="csv" # can be csv, mm, text, binary echo "-- Generating NN data." >> results/times.txt; - +# the scaling of nr and nf is to just multiply them by 3 each .. since sqrt(10) is about 3 and the data size should scale by a factor of 10 ..... needs to be tested for applicability +# for now only t=1 and t=5 are generated for regression and classification respectively .. may want to add more variety #generate XS scenarios (80MB) if [ $MAXMEM -ge 80 ]; then - ${CMD} -f ../datagen/genRandData4ALS.dml --nvargs X=${DATADIR}/X10k_1k_dense rows=10000 cols=1000 rank=10 nnz=`echo "scale=0; 10000 * 1000 * $DENSE_SP" | bc` sigma=0.01 fmt=$FORMAT & - ${CMD} -f ../datagen/genRandData4ALS.dml --nvargs X=${DATADIR}/X10k_1k_sparse rows=10000 cols=1000 rank=10 nnz=`echo "scale=0; 10000 * 1000 * $SPARSE_SP" | bc` sigma=0.01 fmt=$FORMAT & + ${CMD} -f ../datagen/genRandData4NNRegression.dml --nvargs X=${DATADIR}/X1024_100_1_reg Y=${DATADIR}/Y1024_100_1_reg nr=1024 nf=100 nt=1 fmt=$FORMAT & + ${CMD} -f ../datagen/genRandData4NNClassification.dml --nvargs X=${DATADIR}/X1024_100_1_class Y=${DATADIR}/Y1024_100_1_class nr=1024 nf=100 nt=5 fmt=$FORMAT & fi #generate S scenarios (800MB) if [ $MAXMEM -ge 800 ]; then - ${CMD} -f ../datagen/genRandData4ALS.dml --nvargs X=${DATADIR}/X100k_1k_dense rows=100000 cols=1000 rank=10 nnz=`echo "scale=0; 100000 * 1000 * $DENSE_SP" | bc` sigma=0.01 fmt=$FORMAT & - ${CMD} -f ../datagen/genRandData4ALS.dml --nvargs X=${DATADIR}/X100k_1k_sparse rows=100000 cols=1000 rank=10 nnz=`echo "scale=0; 100000 * 1000 * $SPARSE_SP" | bc` sigma=0.01 
fmt=$FORMAT & + ${CMD} -f ../datagen/genRandData4NNRegression.dml --nvargs X=${DATADIR}/X3072_300_1_reg Y=${DATADIR}/Y3072_300_1_reg=3072 nf=300 nt=1 fmt=$FORMAT & + ${CMD} -f ../datagen/genRandData4NNClassification.dml --nvargs X=${DATADIR}/X3072_300_1_class Y=${DATADIR}/Y3072_300_1_class=3072 nf=300 nt=5 fmt=$FORMAT & fi #generate M scenarios (8GB) if [ $MAXMEM -ge 8000 ]; then - ${CMD} -f ../datagen/genRandData4ALS.dml --nvargs X=${DATADIR}/X1M_1k_dense rows=1000000 cols=1000 rank=10 nnz=`echo "scale=0; 1000000 * 1000 * $DENSE_SP" | bc` sigma=0.01 fmt=$FORMAT & - ${CMD} -f ../datagen/genRandData4ALS.dml --nvargs X=${DATADIR}/X1M_1k_sparse rows=1000000 cols=1000 rank=10 nnz=`echo "scale=0; 1000000 * 1000 * $SPARSE_SP" | bc` sigma=0.01 fmt=$FORMAT & + ${CMD} -f ../datagen/genRandData4NNRegression.dml --nvargs X=${DATADIR}/X9216_900_1_reg Y=${DATADIR}/Y9216_900_1_reg=9216 nf=900 nt=1 fmt=$FORMAT & + ${CMD} -f ../datagen/genRandData4NNClassification.dml --nvargs X=${DATADIR}/X9216_900_1_class Y=${DATADIR}/Y9216_900_1_class=9216 nf=900 nt=5 fmt=$FORMAT & fi #generate L scenarios (80GB) if [ $MAXMEM -ge 80000 ]; then - ${CMD} -f ../datagen/genRandData4ALS.dml --nvargs X=${DATADIR}/X10M_1k_dense rows=10000000 cols=1000 rank=10 nnz=`echo "scale=0; 10000000 * 1000 * $DENSE_SP" | bc` sigma=0.01 fmt=$FORMAT - ${CMD} -f ../datagen/genRandData4ALS.dml --nvargs X=${DATADIR}/X10M_1k_sparse rows=10000000 cols=1000 rank=10 nnz=`echo "scale=0; 10000000 * 1000 * $SPARSE_SP" | bc` sigma=0.01 fmt=$FORMAT + ${CMD} -f ../datagen/genRandData4NNRegression.dml --nvargs X=${DATADIR}/X27648_2700_1_reg Y=${DATADIR}/Y27648_2700_1_reg=27648 nf=2700 nt=1 fmt=$FORMAT & + ${CMD} -f ../datagen/genRandData4NNClassification.dml --nvargs X=${DATADIR}/X27648_2700_1_class Y=${DATADIR}/Y27648_2700_1_class=27648 nf=2700 nt=5 fmt=$FORMAT & fi #generate XL scenarios (800GB) if [ $MAXMEM -ge 800000 ]; then - ${CMD} -f ../datagen/genRandData4ALS.dml --nvargs X=${DATADIR}/X100M_1k_dense rows=100000000 
cols=1000 rank=10 nnz=`echo "scale=0; 100000000 * 1000 * $DENSE_SP" | bc` sigma=0.01 fmt=$FORMAT - ${CMD} -f ../datagen/genRandData4ALS.dml --nvargs X=${DATADIR}/X100M_1k_sparse rows=100000000 cols=1000 rank=10 nnz=`echo "scale=0; 100000000 * 1000 * $SPARSE_SP" | bc` sigma=0.01 fmt=$FORMAT + ${CMD} -f ../datagen/genRandData4NNRegression.dml --nvargs X=${DATADIR}/X82944_8200_1_reg Y=${DATADIR}/Y82944_8200_1_reg=82944 nf=8200 nt=1 fmt=$FORMAT & + ${CMD} -f ../datagen/genRandData4NNClassification.dml --nvargs X=${DATADIR}/X82944_8200_1_class Y=${DATADIR}/Y82944_8200_1_class=82944 nf=8200 nt=5 fmt=$FORMAT & fi wait \ No newline at end of file From b9f59d9d388c2b444636b0e24b9d84630372bc19 Mon Sep 17 00:00:00 2001 From: Sheypex Date: Thu, 22 Jun 2023 18:29:39 +0200 Subject: [PATCH 03/19] got datagen for nn tests running --- scripts/datagen/genRandData4NNClassification.dml | 2 +- scripts/perftest/runAllNN.sh | 10 +++++----- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/scripts/datagen/genRandData4NNClassification.dml b/scripts/datagen/genRandData4NNClassification.dml index 5632877f35e..295025aea32 100644 --- a/scripts/datagen/genRandData4NNClassification.dml +++ b/scripts/datagen/genRandData4NNClassification.dml @@ -54,7 +54,7 @@ t = num_targets # num targets X = rand(rows=N, cols=D, pdf="normal") classes = round(rand(rows=N, cols=1, min=1, max=t, pdf="uniform")) Y = matrix(0, rows=N, cols=t) -parfor (i in 1:N) { +for (i in 1:N) { # todo: using parfor here should be fine, but crashes? Y[i, as.scalar(classes[i,1])] = 1 # one-hot encoding } diff --git a/scripts/perftest/runAllNN.sh b/scripts/perftest/runAllNN.sh index c8d3eb3a514..edf2963d93f 100755 --- a/scripts/perftest/runAllNN.sh +++ b/scripts/perftest/runAllNN.sh @@ -39,11 +39,11 @@ err_report() { trap 'err_report $LINENO' ERR DATA=() # todo .. which data is needed? does the ALS data work? 
-if [ $MAXMEM -ge 80 ]; then DATA+=("10k_1k_dense" "10k_1k_sparse"); fi -if [ $MAXMEM -ge 800 ]; then DATA+=("100k_1k_dense" "100k_1k_sparse"); fi -if [ $MAXMEM -ge 8000 ]; then DATA+=("1M_1k_dense" "1M_1k_sparse"); fi -if [ $MAXMEM -ge 80000 ]; then DATA+=("10M_1k_dense" "10M_1k_sparse"); fi -if [ $MAXMEM -ge 800000 ]; then DATA+=("100M_1k_dense" "100M_1k_sparse"); fi +if [ $MAXMEM -ge 80 ]; then DATA+=("X1024_100_1_reg" "Y1024_100_1_reg" "X1024_100_1_class"); fi +if [ $MAXMEM -ge 800 ]; then DATA+=("X1024_100_1_reg" "Y1024_100_1_reg"); fi +if [ $MAXMEM -ge 8000 ]; then DATA+=("X1024_100_1_reg" "Y1024_100_1_reg"); fi +if [ $MAXMEM -ge 80000 ]; then DATA+=("X1024_100_1_reg" "Y1024_100_1_reg"); fi +if [ $MAXMEM -ge 800000 ]; then DATA+=("X1024_100_1_reg" "Y1024_100_1_reg"); fi echo "RUN NEURAL NETWORK EXPERIMENTS" $(date) >>results/times.txt From 0bb5a489b6518718b9a5bc97151035511d4450ca Mon Sep 17 00:00:00 2001 From: Sheypex Date: Thu, 22 Jun 2023 19:57:09 +0200 Subject: [PATCH 04/19] nn simple sgd running --- scripts/perftest/datagen/genNNData.sh | 18 +++--- scripts/perftest/runAllNN.sh | 18 +++--- scripts/perftest/runNNSimpleSGD.sh | 10 ++- .../perftest/scripts/nnSimpleSGD-predict.dml | 63 +++++-------------- .../perftest/scripts/nnSimpleSGD-train.dml | 30 +++++---- 5 files changed, 58 insertions(+), 81 deletions(-) diff --git a/scripts/perftest/datagen/genNNData.sh b/scripts/perftest/datagen/genNNData.sh index 236d0bfeed9..06e4241961c 100755 --- a/scripts/perftest/datagen/genNNData.sh +++ b/scripts/perftest/datagen/genNNData.sh @@ -34,6 +34,8 @@ FORMAT="csv" # can be csv, mm, text, binary echo "-- Generating NN data." >> results/times.txt; # the scaling of nr and nf is to just multiply them by 3 each .. since sqrt(10) is about 3 and the data size should scale by a factor of 10 ..... needs to be tested for applicability # for now only t=1 and t=5 are generated for regression and classification respectively .. 
may want to add more variety +# todo make test data +# todo generated data is too small with current parameters .. X data for xs is 2mb, s is 18mb -> pump it up #generate XS scenarios (80MB) if [ $MAXMEM -ge 80 ]; then ${CMD} -f ../datagen/genRandData4NNRegression.dml --nvargs X=${DATADIR}/X1024_100_1_reg Y=${DATADIR}/Y1024_100_1_reg nr=1024 nf=100 nt=1 fmt=$FORMAT & @@ -42,26 +44,26 @@ fi #generate S scenarios (800MB) if [ $MAXMEM -ge 800 ]; then - ${CMD} -f ../datagen/genRandData4NNRegression.dml --nvargs X=${DATADIR}/X3072_300_1_reg Y=${DATADIR}/Y3072_300_1_reg=3072 nf=300 nt=1 fmt=$FORMAT & - ${CMD} -f ../datagen/genRandData4NNClassification.dml --nvargs X=${DATADIR}/X3072_300_1_class Y=${DATADIR}/Y3072_300_1_class=3072 nf=300 nt=5 fmt=$FORMAT & + ${CMD} -f ../datagen/genRandData4NNRegression.dml --nvargs X=${DATADIR}/X3072_300_1_reg Y=${DATADIR}/Y3072_300_1_reg nr=3072 nf=300 nt=1 fmt=$FORMAT & + ${CMD} -f ../datagen/genRandData4NNClassification.dml --nvargs X=${DATADIR}/X3072_300_1_class Y=${DATADIR}/Y3072_300_1_class nr=3072 nf=300 nt=5 fmt=$FORMAT & fi #generate M scenarios (8GB) if [ $MAXMEM -ge 8000 ]; then - ${CMD} -f ../datagen/genRandData4NNRegression.dml --nvargs X=${DATADIR}/X9216_900_1_reg Y=${DATADIR}/Y9216_900_1_reg=9216 nf=900 nt=1 fmt=$FORMAT & - ${CMD} -f ../datagen/genRandData4NNClassification.dml --nvargs X=${DATADIR}/X9216_900_1_class Y=${DATADIR}/Y9216_900_1_class=9216 nf=900 nt=5 fmt=$FORMAT & + ${CMD} -f ../datagen/genRandData4NNRegression.dml --nvargs X=${DATADIR}/X9216_900_1_reg Y=${DATADIR}/Y9216_900_1_reg nr=9216 nf=900 nt=1 fmt=$FORMAT & + ${CMD} -f ../datagen/genRandData4NNClassification.dml --nvargs X=${DATADIR}/X9216_900_1_class Y=${DATADIR}/Y9216_900_1_class nr=9216 nf=900 nt=5 fmt=$FORMAT & fi #generate L scenarios (80GB) if [ $MAXMEM -ge 80000 ]; then - ${CMD} -f ../datagen/genRandData4NNRegression.dml --nvargs X=${DATADIR}/X27648_2700_1_reg Y=${DATADIR}/Y27648_2700_1_reg=27648 nf=2700 nt=1 fmt=$FORMAT & - ${CMD} -f 
../datagen/genRandData4NNClassification.dml --nvargs X=${DATADIR}/X27648_2700_1_class Y=${DATADIR}/Y27648_2700_1_class=27648 nf=2700 nt=5 fmt=$FORMAT & + ${CMD} -f ../datagen/genRandData4NNRegression.dml --nvargs X=${DATADIR}/X27648_2700_1_reg Y=${DATADIR}/Y27648_2700_1_reg nr=27648 nf=2700 nt=1 fmt=$FORMAT & + ${CMD} -f ../datagen/genRandData4NNClassification.dml --nvargs X=${DATADIR}/X27648_2700_1_class Y=${DATADIR}/Y27648_2700_1_class nr=27648 nf=2700 nt=5 fmt=$FORMAT & fi #generate XL scenarios (800GB) if [ $MAXMEM -ge 800000 ]; then - ${CMD} -f ../datagen/genRandData4NNRegression.dml --nvargs X=${DATADIR}/X82944_8200_1_reg Y=${DATADIR}/Y82944_8200_1_reg=82944 nf=8200 nt=1 fmt=$FORMAT & - ${CMD} -f ../datagen/genRandData4NNClassification.dml --nvargs X=${DATADIR}/X82944_8200_1_class Y=${DATADIR}/Y82944_8200_1_class=82944 nf=8200 nt=5 fmt=$FORMAT & + ${CMD} -f ../datagen/genRandData4NNRegression.dml --nvargs X=${DATADIR}/X82944_8200_1_reg Y=${DATADIR}/Y82944_8200_1_reg nr=82944 nf=8200 nt=1 fmt=$FORMAT & + ${CMD} -f ../datagen/genRandData4NNClassification.dml --nvargs X=${DATADIR}/X82944_8200_1_class Y=${DATADIR}/Y82944_8200_1_class nr=82944 nf=8200 nt=5 fmt=$FORMAT & fi wait \ No newline at end of file diff --git a/scripts/perftest/runAllNN.sh b/scripts/perftest/runAllNN.sh index edf2963d93f..31c50181fae 100755 --- a/scripts/perftest/runAllNN.sh +++ b/scripts/perftest/runAllNN.sh @@ -38,12 +38,12 @@ err_report() { } trap 'err_report $LINENO' ERR -DATA=() # todo .. which data is needed? does the ALS data work? -if [ $MAXMEM -ge 80 ]; then DATA+=("X1024_100_1_reg" "Y1024_100_1_reg" "X1024_100_1_class"); fi -if [ $MAXMEM -ge 800 ]; then DATA+=("X1024_100_1_reg" "Y1024_100_1_reg"); fi -if [ $MAXMEM -ge 8000 ]; then DATA+=("X1024_100_1_reg" "Y1024_100_1_reg"); fi -if [ $MAXMEM -ge 80000 ]; then DATA+=("X1024_100_1_reg" "Y1024_100_1_reg"); fi -if [ $MAXMEM -ge 800000 ]; then DATA+=("X1024_100_1_reg" "Y1024_100_1_reg"); fi +DATA=() # todo .. which data is needed? 
+if [ $MAXMEM -ge 80 ]; then DATA+=("1024_100_1"); fi +if [ $MAXMEM -ge 800 ]; then DATA+=("3072_300_1" ); fi +if [ $MAXMEM -ge 8000 ]; then DATA+=("9216_900_1"); fi +if [ $MAXMEM -ge 80000 ]; then DATA+=("27648_2700_1"); fi +if [ $MAXMEM -ge 800000 ]; then DATA+=("82944_8200_1"); fi echo "RUN NEURAL NETWORK EXPERIMENTS" $(date) >>results/times.txt @@ -69,14 +69,14 @@ for d in ${DATA[@]}; do #"_KDD" # Regression tasks for f in "runNNSimpleSGD"; do - echo "-- Running "$f" on "$d" (all configs)" >>results/times.txt - ./${f}.sh ${BASE}/X${d} ${BASE}/y${d} ${BASE} ${COMMAND} &>logs/${f}_${d}.out + echo "-- Running "$f" on "$d"" >>results/times.txt + ./${f}.sh ${BASE}/X${d}_reg ${BASE}/Y${d}_reg ${BASE} ${COMMAND} ${d} &>logs/${f}_${d}.out done # Classification tasks for f in "runNNNesterovClassify"; do echo "-- Running "$f" on "$d" (all configs)" >>results/times.txt - ./${f}.sh ${BASE}/X${d} ${BASE}/y${d} ${BASE} ${COMMAND} &>logs/${f}_${d}.out + ./${f}.sh ${BASE}/X${d}_class ${BASE}/Y${d}_class ${BASE} ${COMMAND} &>logs/${f}_${d}.out done done diff --git a/scripts/perftest/runNNSimpleSGD.sh b/scripts/perftest/runNNSimpleSGD.sh index 2ab5365de07..13aebb3d62a 100755 --- a/scripts/perftest/runNNSimpleSGD.sh +++ b/scripts/perftest/runNNSimpleSGD.sh @@ -31,8 +31,11 @@ if [ "$(basename $PWD)" != "perftest" ]; then exit 1 fi -CMD=$4 +X=$1 +Y=$2 BASE=$3 +CMD=$4 +LOGIDENTIFIER=$5 echo "running simple sgd neural network" @@ -41,7 +44,7 @@ tstart=$(date +%s.%N) ${CMD} -f scripts/nnSimpleSGD-train.dml \ --config conf/SystemDS-config.xml \ --stats \ - --nvargs X=$1 Y=$2 B=${BASE}/b fmt="csv" + --nvargs X=${X} Y=${Y} B=${BASE} fmt="csv" &>logs/nnSimpleSGD-train_${LOGIDENTIFIER}.out ttrain=$(echo "$(date +%s.%N) - $tstart - .4" | bc) echo "simple neural network trained with SGD on "$1": "$ttrain >>results/times.txt @@ -51,7 +54,8 @@ tstart=$(date +%s.%N) ${CMD} -f scripts/nnSimpleSGD-predict.dml \ --config conf/SystemDS-config.xml \ --stats \ - --nvargs fmt=csv X=$1_test B=${BASE}/b 
Y=$2_test M=${BASE}/m O=${BASE}/out.csv + --nvargs fmt="csv" X=${X} Y=${Y} B=${BASE} &>logs/nnSimpleSGD-predict_${LOGIDENTIFIER}.out + # --nvargs fmt=csv X=$1_test B=${BASE} Y=$2_test tpredict=$(echo "$(date +%s.%N) - $tstart - .4" | bc) echo "simple neural network trained with SGD predicted on "$1": "$tpredict >>results/times.txt diff --git a/scripts/perftest/scripts/nnSimpleSGD-predict.dml b/scripts/perftest/scripts/nnSimpleSGD-predict.dml index f673d556955..807006e1f1f 100644 --- a/scripts/perftest/scripts/nnSimpleSGD-predict.dml +++ b/scripts/perftest/scripts/nnSimpleSGD-predict.dml @@ -4,58 +4,23 @@ source("../../nn/layers/l2_loss.dml") as l2_loss source("../../nn/layers/relu.dml") as relu source("../../nn/optim/sgd.dml") as sgd -# todo scrap this and write it to use given weights to make a prediction +X = read($X) +Y = read($Y) -# Generate input data -N = 1024 # num examples # todo parameterize this based on given data -D = 100 # num features -t = 1 # num targets -X = rand(rows=N, cols=D, pdf="normal") # todo get this data from outside -y = rand(rows=N, cols=t) -# Create 2-layer network: +# Create 2-layer network based on read params: ## affine1 -> relu1 -> affine2 -M = 64 # number of neurons # todo parameterize this -[W1, b1] = affine::init(D, M, -1) -[W2, b2] = affine::init(M, t, -1) +W1 = read(""+$B+"/w1_simple_sgd") +W2 = read(""+$B+"/w2_simple_sgd") +b1 = read(""+$B+"/b1_simple_sgd") +b2 = read(""+$B+"/b2_simple_sgd") -# Initialize optimizer -lr = 0.05 # learning rate -mu = 0.9 # momentum -decay = 0.99 # learning rate decay constant +# make prediction for given X +out1 = affine::forward(X, W1, b1) +outr1 = relu::forward(out1) +out2 = affine::forward(outr1, W2, b2) -# Optimize -print("Starting optimization") -batch_size = 32 -epochs = 5 -iters = 1024 / batch_size -for (e in 1:epochs) { - for(i in 1:iters) { - # Get next batch - X_batch = X[i:i+batch_size-1,] - y_batch = y[i:i+batch_size-1,] +# check accuracy +loss = l2_loss::forward(out2, Y) - # Compute 
forward pass - out1 = affine::forward(X_batch, W1, b1) - outr1 = relu::forward(out1) - out2 = affine::forward(outr1, W2, b2) - - # Compute loss - loss = l2_loss::forward(out2, y_batch) - print("L2 loss: " + loss) - - # Compute backward pass - dout2 = l2_loss::backward(out2, y_batch) - [doutr1, dW2, db2] = affine::backward(dout2, outr1, W2, b2) - dout1 = relu::backward(doutr1, out1) - [dX_batch, dW1, db1] = affine::backward(dout1, X_batch, W1, b1) - - # Optimize with vanilla SGD - W1 = sgd::update(W1, dW1, lr) - b1 = sgd::update(b1, db1, lr) - W2 = sgd::update(W2, dW2, lr) - b2 = sgd::update(b2, db2, lr) - } - # Decay learning rate - lr = lr * decay -} # todo save learned params \ No newline at end of file +print("Got loss of " + loss) \ No newline at end of file diff --git a/scripts/perftest/scripts/nnSimpleSGD-train.dml b/scripts/perftest/scripts/nnSimpleSGD-train.dml index fd66c9d31ea..0388070f7b6 100644 --- a/scripts/perftest/scripts/nnSimpleSGD-train.dml +++ b/scripts/perftest/scripts/nnSimpleSGD-train.dml @@ -4,16 +4,17 @@ source("../../nn/layers/l2_loss.dml") as l2_loss source("../../nn/layers/relu.dml") as relu source("../../nn/optim/sgd.dml") as sgd -# Generate input data -N = 1024 # num examples # todo parameterize this based on given data -D = 100 # num features -t = 1 # num targets -X = rand(rows=N, cols=D, pdf="normal") # todo get this data from outside -y = rand(rows=N, cols=t) +X = read($X) +Y = read($Y) +fmt = ifdef($fmt, "csv") + +N = nrow(X) +D = ncol(X) +t = ncol(Y) # Create 2-layer network: ## affine1 -> relu1 -> affine2 -M = 64 # number of neurons # todo parameterize this +M = ifdef($M, 64) # number of neurons # todo parameterize this [W1, b1] = affine::init(D, M, -1) [W2, b2] = affine::init(M, t, -1) @@ -24,14 +25,14 @@ decay = 0.99 # learning rate decay constant # Optimize print("Starting optimization") -batch_size = 32 -epochs = 5 -iters = 1024 / batch_size +batch_size = ifdef($batch_size, 32) +epochs = ifdef($epochs, 5) +iters = N / 
batch_size for (e in 1:epochs) { for(i in 1:iters) { # Get next batch X_batch = X[i:i+batch_size-1,] - y_batch = y[i:i+batch_size-1,] + y_batch = Y[i:i+batch_size-1,] # Compute forward pass out1 = affine::forward(X_batch, W1, b1) @@ -56,4 +57,9 @@ for (e in 1:epochs) { } # Decay learning rate lr = lr * decay -} # todo save learned params \ No newline at end of file +} +# save params +write(W1,""+$B+"/w1_simple_sgd",format=fmt) +write(W2,""+$B+"/w2_simple_sgd",format=fmt) +write(b1,""+$B+"/b1_simple_sgd",format=fmt) +write(b2,""+$B+"/b2_simple_sgd",format=fmt) \ No newline at end of file From c218449cdea2811b8b832080e0d54b4b2e7e5083 Mon Sep 17 00:00:00 2001 From: Sheypex Date: Thu, 22 Jun 2023 20:24:26 +0200 Subject: [PATCH 05/19] nn simple sgd running --- scripts/perftest/runAllNN.sh | 4 +- scripts/perftest/runNNNesterovClassify.sh | 10 +- .../scripts/nnNesterovClassify-predict.dml | 126 ++++++------------ .../scripts/nnNesterovClassify-train.dml | 43 +++--- .../perftest/scripts/nnSimpleSGD-predict.dml | 1 + .../perftest/scripts/nnSimpleSGD-train.dml | 1 + 6 files changed, 78 insertions(+), 107 deletions(-) diff --git a/scripts/perftest/runAllNN.sh b/scripts/perftest/runAllNN.sh index 31c50181fae..d23f2993344 100755 --- a/scripts/perftest/runAllNN.sh +++ b/scripts/perftest/runAllNN.sh @@ -75,8 +75,8 @@ for d in ${DATA[@]}; do #"_KDD" # Classification tasks for f in "runNNNesterovClassify"; do - echo "-- Running "$f" on "$d" (all configs)" >>results/times.txt - ./${f}.sh ${BASE}/X${d}_class ${BASE}/Y${d}_class ${BASE} ${COMMAND} &>logs/${f}_${d}.out + echo "-- Running "$f" on "$d"" >>results/times.txt + ./${f}.sh ${BASE}/X${d}_class ${BASE}/Y${d}_class ${BASE} ${COMMAND} ${d} &>logs/${f}_${d}.out done done diff --git a/scripts/perftest/runNNNesterovClassify.sh b/scripts/perftest/runNNNesterovClassify.sh index 2f71a8911f7..5a0173ada87 100755 --- a/scripts/perftest/runNNNesterovClassify.sh +++ b/scripts/perftest/runNNNesterovClassify.sh @@ -31,8 +31,11 @@ if [ 
"$(basename $PWD)" != "perftest" ]; then exit 1 fi -CMD=$4 +X=$1 +Y=$2 BASE=$3 +CMD=$4 +LOGIDENTIFIER=$5 echo "running sgd nn classifier with nesterov momentum" @@ -41,7 +44,7 @@ tstart=$(date +%s.%N) ${CMD} -f scripts/nnNesterovClassify-train.dml \ --config conf/SystemDS-config.xml \ --stats \ - --nvargs X=$1 Y=$2 B=${BASE}/b fmt="csv" + --nvargs X=${X} Y=${Y} B=${BASE} fmt="csv" &>logs/nnNesterovClassify-train_${LOGIDENTIFIER}.out ttrain=$(echo "$(date +%s.%N) - $tstart - .4" | bc) echo "nesterov momentum neural network trained with SGD on "$1": "$ttrain >>results/times.txt @@ -51,7 +54,8 @@ tstart=$(date +%s.%N) ${CMD} -f scripts/nnNesterovClassify-predict.dml \ --config conf/SystemDS-config.xml \ --stats \ - --nvargs fmt=csv X=$1_test B=${BASE}/b Y=$2_test M=${BASE}/m O=${BASE}/out.csv + --nvargs X=${X} Y=${Y} B=${BASE} fmt="csv" &>logs/nnNesterovClassify-predict_${LOGIDENTIFIER}.out + #--nvargs fmt=csv X=$1_test B=${BASE}/b Y=$2_test M=${BASE}/m O=${BASE}/out.csv &>logs/nnNesterovClassify-predict_${LOGIDENTIFIER}.out tpredict=$(echo "$(date +%s.%N) - $tstart - .4" | bc) echo "nesterov momentum neural network trained with SGD predicted on "$1": "$tpredict >>results/times.txt diff --git a/scripts/perftest/scripts/nnNesterovClassify-predict.dml b/scripts/perftest/scripts/nnNesterovClassify-predict.dml index 74da3abdb0d..f97aedbf178 100644 --- a/scripts/perftest/scripts/nnNesterovClassify-predict.dml +++ b/scripts/perftest/scripts/nnNesterovClassify-predict.dml @@ -6,87 +6,45 @@ source("../../nn/layers/relu.dml") as relu source("../../nn/layers/softmax.dml") as softmax source("../../nn/optim/sgd_nesterov.dml") as sgd_nesterov -# Generate input data -N = 1024 # num examples -D = 100 # num features -t = 5 # num targets -X = rand(rows=N, cols=D, pdf="normal") -classes = round(rand(rows=N, cols=1, min=1, max=t, pdf="uniform")) -y = matrix(0, rows=N, cols=t) -parfor (i in 1:N) { - y[i, as.scalar(classes[i,1])] = 1 # one-hot encoding -} - -# Create network: -# affine1 
-> relu1 -> dropout1 -> affine2 -> relu2 -> dropout2 -> affine3 -> softmax -H1 = 64 # number of neurons in 1st hidden layer -H2 = 64 # number of neurons in 2nd hidden layer -p = 0.5 # dropout probability -[W1, b1] = affine::init(D, H1, -1) -[W2, b2] = affine::init(H1, H2, -1) -[W3, b3] = affine::init(H2, t, -1) - -# Initialize SGD w/ Nesterov momentum optimizer -lr = 0.05 # learning rate -mu = 0.5 # momentum -decay = 0.99 # learning rate decay constant -vW1 = sgd_nesterov::init(W1); vb1 = sgd_nesterov::init(b1) -vW2 = sgd_nesterov::init(W2); vb2 = sgd_nesterov::init(b2) -vW3 = sgd_nesterov::init(W3); vb3 = sgd_nesterov::init(b3) - -# Optimize -print("Starting optimization") -batch_size = 64 -epochs = 10 -iters = 1024 / batch_size -for (e in 1:epochs) { - for(i in 1:iters) { - # Get next batch - X_batch = X[i:i+batch_size-1,] - y_batch = y[i:i+batch_size-1,] - - # Compute forward pass - ## layer 1: - out1 = affine::forward(X_batch, W1, b1) - outr1 = relu::forward(out1) - [outd1, maskd1] = dropout::forward(outr1, p, -1) - ## layer 2: - out2 = affine::forward(outd1, W2, b2) - outr2 = relu::forward(out2) - [outd2, maskd2] = dropout::forward(outr2, p, -1) - ## layer 3: - out3 = affine::forward(outd2, W3, b3) - probs = softmax::forward(out3) - - # Compute loss - loss = cross_entropy_loss::forward(probs, y_batch) - print("Cross entropy loss: " + loss) - - # Compute backward pass - ## loss: - dprobs = cross_entropy_loss::backward(probs, y_batch) - ## layer 3: - dout3 = softmax::backward(dprobs, out3) - [doutd2, dW3, db3] = affine::backward(dout3, outd2, W3, b3) - ## layer 2: - doutr2 = dropout::backward(doutd2, outr2, p, maskd2) - dout2 = relu::backward(doutr2, out2) - [doutd1, dW2, db2] = affine::backward(dout2, outd1, W2, b2) - ## layer 1: - doutr1 = dropout::backward(doutd1, outr1, p, maskd1) - dout1 = relu::backward(doutr1, out1) - [dX_batch, dW1, db1] = affine::backward(dout1, X_batch, W1, b1) - - # Optimize with SGD w/ Nesterov momentum - [W1, vW1] = 
sgd_nesterov::update(W1, dW1, lr, mu, vW1) - [b1, vb1] = sgd_nesterov::update(b1, db1, lr, mu, vb1) - [W2, vW2] = sgd_nesterov::update(W2, dW2, lr, mu, vW2) - [b2, vb2] = sgd_nesterov::update(b2, db2, lr, mu, vb2) - [W3, vW3] = sgd_nesterov::update(W3, dW3, lr, mu, vW3) - [b3, vb3] = sgd_nesterov::update(b3, db3, lr, mu, vb3) - } - # Anneal momentum towards 0.999 - mu = mu + (0.999 - mu)/(1+epochs-e) - # Decay learning rate - lr = lr * decay -} \ No newline at end of file +X = read($X) +Y = read($Y) + +W1 = read(""+$B+"/w1_nesterov_classify") +W2 = read(""+$B+"/w2_nesterov_classify") +W3 = read(""+$B+"/w3_nesterov_classify") +b1 = read(""+$B+"/b1_nesterov_classify") +b2 = read(""+$B+"/b2_nesterov_classify") +b3 = read(""+$B+"/b3_nesterov_classify") +p = read(""+$B+"/p_nesterov_classify") + +# Compute forward pass with dropout +## layer 1: +out1 = affine::forward(X, W1, b1) +outr1 = relu::forward(out1) +[outd1, maskd1] = dropout::forward(outr1, p, -1) +## layer 2: +out2 = affine::forward(outd1, W2, b2) +outr2 = relu::forward(out2) +[outd2, maskd2] = dropout::forward(outr2, p, -1) +## layer 3: +out3 = affine::forward(outd2, W3, b3) +probs = softmax::forward(out3) + +# Compute loss +loss = cross_entropy_loss::forward(probs, Y) +print("Cross entropy loss with dropout: " + loss) + +# repeat without dropout +## layer 1: +out1 = affine::forward(X, W1, b1) +outr1 = relu::forward(out1) +## layer 2: +out2 = affine::forward(outr1, W2, b2) +outr2 = relu::forward(out2) +## layer 3: +out3 = affine::forward(outr2, W3, b3) +probs = softmax::forward(out3) + +# Compute loss +loss = cross_entropy_loss::forward(probs, Y) +print("Cross entropy loss without dropout: " + loss) \ No newline at end of file diff --git a/scripts/perftest/scripts/nnNesterovClassify-train.dml b/scripts/perftest/scripts/nnNesterovClassify-train.dml index 74da3abdb0d..01e25ded808 100644 --- a/scripts/perftest/scripts/nnNesterovClassify-train.dml +++ b/scripts/perftest/scripts/nnNesterovClassify-train.dml @@ 
-6,22 +6,20 @@ source("../../nn/layers/relu.dml") as relu source("../../nn/layers/softmax.dml") as softmax source("../../nn/optim/sgd_nesterov.dml") as sgd_nesterov -# Generate input data -N = 1024 # num examples -D = 100 # num features -t = 5 # num targets -X = rand(rows=N, cols=D, pdf="normal") -classes = round(rand(rows=N, cols=1, min=1, max=t, pdf="uniform")) -y = matrix(0, rows=N, cols=t) -parfor (i in 1:N) { - y[i, as.scalar(classes[i,1])] = 1 # one-hot encoding -} +# read input data +X = read($X) +Y = read($Y) +fmt = ifdef($fmt, "csv") + +N = nrow(X) +D = ncol(X) +t = ncol(Y) # Create network: # affine1 -> relu1 -> dropout1 -> affine2 -> relu2 -> dropout2 -> affine3 -> softmax -H1 = 64 # number of neurons in 1st hidden layer -H2 = 64 # number of neurons in 2nd hidden layer -p = 0.5 # dropout probability +H1 = ifdef($H1, 64) # number of neurons in 1st hidden layer +H2 = ifdef($H2, 64) # number of neurons in 2nd hidden layer +p = ifdef($dropout_prob, 0.5) # dropout probability [W1, b1] = affine::init(D, H1, -1) [W2, b2] = affine::init(H1, H2, -1) [W3, b3] = affine::init(H2, t, -1) @@ -36,14 +34,14 @@ vW3 = sgd_nesterov::init(W3); vb3 = sgd_nesterov::init(b3) # Optimize print("Starting optimization") -batch_size = 64 -epochs = 10 -iters = 1024 / batch_size +batch_size = ifdef($batch_size, 64) +epochs = ifdef($epochs, 10) +iters = N / batch_size for (e in 1:epochs) { for(i in 1:iters) { # Get next batch X_batch = X[i:i+batch_size-1,] - y_batch = y[i:i+batch_size-1,] + y_batch = Y[i:i+batch_size-1,] # Compute forward pass ## layer 1: @@ -89,4 +87,13 @@ for (e in 1:epochs) { mu = mu + (0.999 - mu)/(1+epochs-e) # Decay learning rate lr = lr * decay -} \ No newline at end of file +} + +# save params +write(W1,""+$B+"/w1_nesterov_classify",format=fmt) +write(W2,""+$B+"/w2_nesterov_classify",format=fmt) +write(W3,""+$B+"/w3_nesterov_classify",format=fmt) +write(b1,""+$B+"/b1_nesterov_classify",format=fmt) +write(b2,""+$B+"/b2_nesterov_classify",format=fmt) 
+write(b3,""+$B+"/b3_nesterov_classify",format=fmt) +write(p,""+$B+"/p_nesterov_classify",format=fmt) \ No newline at end of file diff --git a/scripts/perftest/scripts/nnSimpleSGD-predict.dml b/scripts/perftest/scripts/nnSimpleSGD-predict.dml index 807006e1f1f..44142402c89 100644 --- a/scripts/perftest/scripts/nnSimpleSGD-predict.dml +++ b/scripts/perftest/scripts/nnSimpleSGD-predict.dml @@ -4,6 +4,7 @@ source("../../nn/layers/l2_loss.dml") as l2_loss source("../../nn/layers/relu.dml") as relu source("../../nn/optim/sgd.dml") as sgd +# read input data X = read($X) Y = read($Y) diff --git a/scripts/perftest/scripts/nnSimpleSGD-train.dml b/scripts/perftest/scripts/nnSimpleSGD-train.dml index 0388070f7b6..936b76625bd 100644 --- a/scripts/perftest/scripts/nnSimpleSGD-train.dml +++ b/scripts/perftest/scripts/nnSimpleSGD-train.dml @@ -4,6 +4,7 @@ source("../../nn/layers/l2_loss.dml") as l2_loss source("../../nn/layers/relu.dml") as relu source("../../nn/optim/sgd.dml") as sgd +# read input data X = read($X) Y = read($Y) fmt = ifdef($fmt, "csv") From c32bfc41e35acc9d734fe43a4ba9cc622231bc43 Mon Sep 17 00:00:00 2001 From: Sheypex Date: Thu, 22 Jun 2023 20:24:43 +0200 Subject: [PATCH 06/19] nn simple nesterov running --- scripts/perftest/scripts/nnNesterovClassify-predict.dml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/perftest/scripts/nnNesterovClassify-predict.dml b/scripts/perftest/scripts/nnNesterovClassify-predict.dml index f97aedbf178..52e10243a5f 100644 --- a/scripts/perftest/scripts/nnNesterovClassify-predict.dml +++ b/scripts/perftest/scripts/nnNesterovClassify-predict.dml @@ -7,7 +7,7 @@ source("../../nn/layers/softmax.dml") as softmax source("../../nn/optim/sgd_nesterov.dml") as sgd_nesterov X = read($X) -Y = read($Y) +Y = read($Y) W1 = read(""+$B+"/w1_nesterov_classify") W2 = read(""+$B+"/w2_nesterov_classify") From 4da031db71458da72954d82a79588c10467e2bcf Mon Sep 17 00:00:00 2001 From: Sheypex Date: Thu, 22 Jun 2023 20:45:31 +0200 
Subject: [PATCH 07/19] fixed batching in nesterov and simpleSGD training .. also updated README.md in nn as this is the source for simpleSGD and nesterov --- scripts/nn/README.md | 8 ++++---- scripts/perftest/runAll.sh | 2 +- scripts/perftest/runAllNN.sh | 12 ++++++++---- scripts/perftest/runNNNesterovClassify.sh | 5 +++-- scripts/perftest/runNNSimpleSGD.sh | 5 +++-- .../perftest/scripts/nnNesterovClassify-predict.dml | 2 +- .../perftest/scripts/nnNesterovClassify-train.dml | 4 ++-- scripts/perftest/scripts/nnSimpleSGD-train.dml | 4 ++-- 8 files changed, 24 insertions(+), 18 deletions(-) diff --git a/scripts/nn/README.md b/scripts/nn/README.md index e9a59a707f0..c73955576e0 100644 --- a/scripts/nn/README.md +++ b/scripts/nn/README.md @@ -55,8 +55,8 @@ iters = 1024 / batch_size for (e in 1:epochs) { for(i in 1:iters) { # Get next batch - X_batch = X[i:i+batch_size-1,] - y_batch = y[i:i+batch_size-1,] + X_batch = X[(i-1)*batch_size+1:i*batch_size,] + y_batch = Y[(i-1)*batch_size+1:i*batch_size,] # Compute forward pass out1 = affine::forward(X_batch, W1, b1) @@ -131,8 +131,8 @@ iters = 1024 / batch_size for (e in 1:epochs) { for(i in 1:iters) { # Get next batch - X_batch = X[i:i+batch_size-1,] - y_batch = y[i:i+batch_size-1,] + X_batch = X[(i-1)*batch_size+1:i*batch_size,] + y_batch = Y[(i-1)*batch_size+1:i*batch_size,] # Compute forward pass ## layer 1: diff --git a/scripts/perftest/runAll.sh b/scripts/perftest/runAll.sh index 9a7d1337d12..0b45eccb1b1 100755 --- a/scripts/perftest/runAll.sh +++ b/scripts/perftest/runAll.sh @@ -32,7 +32,7 @@ TEMPFOLDER="temp" # Max memory of data to be benchmarked # Possible values: 80/80MB, 800/800MB, 8000/8000MB/8GB, 80000/80000MB/80GB, 800000/800000MB/800GB -MAXMEM=80 +MAXMEM=800 # Set properties export LOG4JPROP='conf/log4j-off.properties' diff --git a/scripts/perftest/runAllNN.sh b/scripts/perftest/runAllNN.sh index d23f2993344..0abce52eb25 100755 --- a/scripts/perftest/runAllNN.sh +++ b/scripts/perftest/runAllNN.sh @@ -69,14 
+69,18 @@ for d in ${DATA[@]}; do #"_KDD" # Regression tasks for f in "runNNSimpleSGD"; do - echo "-- Running "$f" on "$d"" >>results/times.txt - ./${f}.sh ${BASE}/X${d}_reg ${BASE}/Y${d}_reg ${BASE} ${COMMAND} ${d} &>logs/${f}_${d}.out + echo "-- Running "$f" on "$d" for 5 epochs" >>results/times.txt + ./${f}.sh ${BASE}/X${d}_reg ${BASE}/Y${d}_reg ${BASE} ${COMMAND} ${d} 5 &>logs/${f}_${d}.out + echo "-- Running "$f" on "$d" for 50 epochs" >>results/times.txt + ./${f}.sh ${BASE}/X${d}_reg ${BASE}/Y${d}_reg ${BASE} ${COMMAND} ${d} 50 &>logs/${f}_${d}.out done # Classification tasks for f in "runNNNesterovClassify"; do - echo "-- Running "$f" on "$d"" >>results/times.txt - ./${f}.sh ${BASE}/X${d}_class ${BASE}/Y${d}_class ${BASE} ${COMMAND} ${d} &>logs/${f}_${d}.out + echo "-- Running "$f" on "$d" for 10 epochs" >>results/times.txt + ./${f}.sh ${BASE}/X${d}_class ${BASE}/Y${d}_class ${BASE} ${COMMAND} ${d} 10 &>logs/${f}_${d}.out + echo "-- Running "$f" on "$d" for 100 epochs" >>results/times.txt + ./${f}.sh ${BASE}/X${d}_class ${BASE}/Y${d}_class ${BASE} ${COMMAND} ${d} 100 &>logs/${f}_${d}.out done done diff --git a/scripts/perftest/runNNNesterovClassify.sh b/scripts/perftest/runNNNesterovClassify.sh index 5a0173ada87..83feb2f2393 100755 --- a/scripts/perftest/runNNNesterovClassify.sh +++ b/scripts/perftest/runNNNesterovClassify.sh @@ -36,6 +36,7 @@ Y=$2 BASE=$3 CMD=$4 LOGIDENTIFIER=$5 +EPOCHS=$6 echo "running sgd nn classifier with nesterov momentum" @@ -44,7 +45,7 @@ tstart=$(date +%s.%N) ${CMD} -f scripts/nnNesterovClassify-train.dml \ --config conf/SystemDS-config.xml \ --stats \ - --nvargs X=${X} Y=${Y} B=${BASE} fmt="csv" &>logs/nnNesterovClassify-train_${LOGIDENTIFIER}.out + --nvargs X=${X} Y=${Y} B=${BASE} fmt="csv" epochs=${EPOCHS} &>logs/nnNesterovClassify-train_${LOGIDENTIFIER}_${EPOCHS}.out ttrain=$(echo "$(date +%s.%N) - $tstart - .4" | bc) echo "nesterov momentum neural network trained with SGD on "$1": "$ttrain >>results/times.txt @@ -54,7 +55,7 @@ 
tstart=$(date +%s.%N) ${CMD} -f scripts/nnNesterovClassify-predict.dml \ --config conf/SystemDS-config.xml \ --stats \ - --nvargs X=${X} Y=${Y} B=${BASE} fmt="csv" &>logs/nnNesterovClassify-predict_${LOGIDENTIFIER}.out + --nvargs X=${X} Y=${Y} B=${BASE} fmt="csv" &>logs/nnNesterovClassify-predict_${LOGIDENTIFIER}_${EPOCHS}.out #--nvargs fmt=csv X=$1_test B=${BASE}/b Y=$2_test M=${BASE}/m O=${BASE}/out.csv &>logs/nnNesterovClassify-predict_${LOGIDENTIFIER}.out tpredict=$(echo "$(date +%s.%N) - $tstart - .4" | bc) diff --git a/scripts/perftest/runNNSimpleSGD.sh b/scripts/perftest/runNNSimpleSGD.sh index 13aebb3d62a..5887086e023 100755 --- a/scripts/perftest/runNNSimpleSGD.sh +++ b/scripts/perftest/runNNSimpleSGD.sh @@ -36,6 +36,7 @@ Y=$2 BASE=$3 CMD=$4 LOGIDENTIFIER=$5 +EPOCHS=$6 echo "running simple sgd neural network" @@ -44,7 +45,7 @@ tstart=$(date +%s.%N) ${CMD} -f scripts/nnSimpleSGD-train.dml \ --config conf/SystemDS-config.xml \ --stats \ - --nvargs X=${X} Y=${Y} B=${BASE} fmt="csv" &>logs/nnSimpleSGD-train_${LOGIDENTIFIER}.out + --nvargs X=${X} Y=${Y} B=${BASE} fmt="csv" epochs=${EPOCHS} &>logs/nnSimpleSGD-train_${LOGIDENTIFIER}_${EPOCHS}.out ttrain=$(echo "$(date +%s.%N) - $tstart - .4" | bc) echo "simple neural network trained with SGD on "$1": "$ttrain >>results/times.txt @@ -54,7 +55,7 @@ tstart=$(date +%s.%N) ${CMD} -f scripts/nnSimpleSGD-predict.dml \ --config conf/SystemDS-config.xml \ --stats \ - --nvargs fmt="csv" X=${X} Y=${Y} B=${BASE} &>logs/nnSimpleSGD-predict_${LOGIDENTIFIER}.out + --nvargs fmt="csv" X=${X} Y=${Y} B=${BASE} &>logs/nnSimpleSGD-predict_${LOGIDENTIFIER}_${EPOCHS}.out # --nvargs fmt=csv X=$1_test B=${BASE} Y=$2_test tpredict=$(echo "$(date +%s.%N) - $tstart - .4" | bc) diff --git a/scripts/perftest/scripts/nnNesterovClassify-predict.dml b/scripts/perftest/scripts/nnNesterovClassify-predict.dml index 52e10243a5f..f97aedbf178 100644 --- a/scripts/perftest/scripts/nnNesterovClassify-predict.dml +++ 
b/scripts/perftest/scripts/nnNesterovClassify-predict.dml @@ -7,7 +7,7 @@ source("../../nn/layers/softmax.dml") as softmax source("../../nn/optim/sgd_nesterov.dml") as sgd_nesterov X = read($X) -Y = read($Y) +Y = read($Y) W1 = read(""+$B+"/w1_nesterov_classify") W2 = read(""+$B+"/w2_nesterov_classify") diff --git a/scripts/perftest/scripts/nnNesterovClassify-train.dml b/scripts/perftest/scripts/nnNesterovClassify-train.dml index 01e25ded808..ad21a2f7123 100644 --- a/scripts/perftest/scripts/nnNesterovClassify-train.dml +++ b/scripts/perftest/scripts/nnNesterovClassify-train.dml @@ -40,8 +40,8 @@ iters = N / batch_size for (e in 1:epochs) { for(i in 1:iters) { # Get next batch - X_batch = X[i:i+batch_size-1,] - y_batch = Y[i:i+batch_size-1,] + X_batch = X[(i-1)*batch_size+1:i*batch_size,] + y_batch = Y[(i-1)*batch_size+1:i*batch_size,] # Compute forward pass ## layer 1: diff --git a/scripts/perftest/scripts/nnSimpleSGD-train.dml b/scripts/perftest/scripts/nnSimpleSGD-train.dml index 936b76625bd..188e09dce6a 100644 --- a/scripts/perftest/scripts/nnSimpleSGD-train.dml +++ b/scripts/perftest/scripts/nnSimpleSGD-train.dml @@ -32,8 +32,8 @@ iters = N / batch_size for (e in 1:epochs) { for(i in 1:iters) { # Get next batch - X_batch = X[i:i+batch_size-1,] - y_batch = Y[i:i+batch_size-1,] + X_batch = X[(i-1)*batch_size+1:i*batch_size,] + y_batch = Y[(i-1)*batch_size+1:i*batch_size,] # Compute forward pass out1 = affine::forward(X_batch, W1, b1) From 580caf6e6f8de8c563076d47c3c6709343552a34 Mon Sep 17 00:00:00 2001 From: Sheypex Date: Fri, 23 Jun 2023 00:02:04 +0200 Subject: [PATCH 08/19] weirdness with execution NCF.dml in staging .. 
otherwise NCF perftest should be ready --- scripts/datagen/genRandData4NCF.dml | 84 +++++++++++++ scripts/nn/examples/ncf-dummy-data.dml | 2 +- scripts/perftest/datagen/genNCFData.sh | 144 ++++++++++++++++++++++ scripts/perftest/runAll.sh | 6 +- scripts/perftest/runAllNCF.sh | 61 +++++++++ scripts/perftest/runAllNN.sh | 8 +- scripts/perftest/runNCF.sh | 105 ++++++++++++++++ scripts/perftest/runNNNesterovClassify.sh | 4 +- scripts/perftest/runNNSimpleSGD.sh | 4 +- scripts/perftest/scripts/NCF-predict.dml | 35 ++++++ scripts/perftest/scripts/NCF-train.dml | 45 +++++++ scripts/staging/NCF.dml | 12 +- 12 files changed, 493 insertions(+), 17 deletions(-) create mode 100644 scripts/datagen/genRandData4NCF.dml create mode 100755 scripts/perftest/datagen/genNCFData.sh create mode 100755 scripts/perftest/runAllNCF.sh create mode 100755 scripts/perftest/runNCF.sh create mode 100755 scripts/perftest/scripts/NCF-predict.dml create mode 100755 scripts/perftest/scripts/NCF-train.dml diff --git a/scripts/datagen/genRandData4NCF.dml b/scripts/datagen/genRandData4NCF.dml new file mode 100644 index 00000000000..272e4ff4a74 --- /dev/null +++ b/scripts/datagen/genRandData4NCF.dml @@ -0,0 +1,84 @@ +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. 
See the License for the +# specific language governing permissions and limitations +# under the License. +# +#------------------------------------------------------------- + +# +# Generates random data to test neural collaborative filtering (NCF): binary targets plus one-hot-encoded user and item matrices +# +# INPUT PARAMETERS: +# ---------------------------------------------------------------------------- +# NAME TYPE DEFAULT MEANING +# ---------------------------------------------------------------------------- +# file_xyz String --- Respective output files name/path +# ktrain Int 1000 Number of training samples +# kval Int 100 Number of validation samples +# nitems Int 50 Number of items +# nusers Int 60 Number of users +# fmt Format specifier csv Format of output data +# ---------------------------------------------------------------------------- +# +# Example: +# hadoop jar SystemDS.jar -f genRandData4NCF.dml -nvargs ktrain=1000 kval=100 nitems=50 nusers=60 + +print ("BEGIN NCF GENERATOR SCRIPT"); + +file_users_train = ifdef ($users_train, "users_train"); +file_items_train = ifdef ($items_train, "items_train"); +file_targets_train = ifdef ($targets_train, "targets_train"); +file_users_val = ifdef ($users_val, "users_val"); +file_items_val = ifdef ($items_val, "items_val"); +file_targets_val = ifdef ($targets_val, "targets_val"); +fmt = ifdef ($fmt, "csv"); + +# Generate input data +K_train = ifdef($ktrain, 1000); # number of training samples +K_val = ifdef($kval, 100); # number of validation samples + +N = ifdef($nitems, 50); # number items +M = ifdef($nusers, 60); # number users + +# targets +targets_train = round(rand(rows=K_train, cols=1)); +targets_val = round(rand(rows=K_val, cols=1)); + +# user/items integer-encoded vectors +items_train_int_encoded = round(rand(rows=K_train, cols=1, min=1, max=N)); +users_train_int_encoded = round(rand(rows=K_train, cols=1, min=1, max=M)); +items_val_int_encoded = round(rand(rows=K_val, cols=1, min=1, max=N)); +users_val_int_encoded = round(rand(rows=K_val, cols=1, min=1, max=M)); + +#
user/items matrices by applying one-hot-encoding +items_train = toOneHot(items_train_int_encoded, N); +items_val = toOneHot(items_val_int_encoded, N); +users_train = toOneHot(users_train_int_encoded, M); +users_val = toOneHot(users_val_int_encoded, M); + + +print ("Writing out the resulting dataset..."); + +write (users_train, file_users_train, format=fmt); +write (items_train, file_items_train, format=fmt); +write (targets_train, file_targets_train, format=fmt); +write (users_val, file_users_val, format=fmt); +write (items_val, file_items_val, format=fmt); +write (targets_val, file_targets_val, format=fmt); + +print ("DONE: NCF GENERATOR SCRIPT"); + diff --git a/scripts/nn/examples/ncf-dummy-data.dml b/scripts/nn/examples/ncf-dummy-data.dml index fff5f63042d..eeb420437f1 100644 --- a/scripts/nn/examples/ncf-dummy-data.dml +++ b/scripts/nn/examples/ncf-dummy-data.dml @@ -20,7 +20,7 @@ #------------------------------------------------------------- # Imports -source("staging/NCF.dml") as NCF +source("../../staging/NCF.dml") as NCF K_train = 1000; # number of training samples K_val = 100; # number of validation samples diff --git a/scripts/perftest/datagen/genNCFData.sh b/scripts/perftest/datagen/genNCFData.sh new file mode 100755 index 00000000000..2e57546a82c --- /dev/null +++ b/scripts/perftest/datagen/genNCFData.sh @@ -0,0 +1,144 @@ +#!/bin/bash +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +#------------------------------------------------------------- +if [ "$(basename $PWD)" != "perftest" ]; +then + echo "Please execute scripts from directory 'perftest'" + exit 1; +fi + +CMD=$1 +DATADIR=$2/ncf +MAXMEM=$3 + +FORMAT="csv" # can be csv, mm, text, binary + +echo "-- Generating NCF data." >> results/times.txt; +#generate XS scenarios (80MB) +BASE_ktrain=1000 +BASE_kval=100 +BASE_nitems=50 +BASE_nusers=60 +if [ $MAXMEM -ge 80 ]; then + MULTIPLIER=1 + KTRAIN=$(echo "$BASE_ktrain * $MULTIPLIER" | bc) + KVAL=$(echo "$BASE_kval * $MULTIPLIER" | bc) + NITEMS=$(echo "$BASE_nitems * $MULTIPLIER" | bc) + NUSERS=$(echo "$BASE_nusers * $MULTIPLIER" | bc) + ${CMD} -f ../datagen/genRandData4NCF.dml --nvargs \ + users_train=${DATADIR}/Ut${KTRAIN}_${KVAL}_${NITEMS}_${NUSERS} \ + items_train=${DATADIR}/It${KTRAIN}_${KVAL}_${NITEMS}_${NUSERS} \ + targets_train=${DATADIR}/Tt${KTRAIN}_${KVAL}_${NITEMS}_${NUSERS} \ + users_val=${DATADIR}/Uv${KTRAIN}_${KVAL}_${NITEMS}_${NUSERS} \ + items_val=${DATADIR}/Iv${KTRAIN}_${KVAL}_${NITEMS}_${NUSERS} \ + targets_val=${DATADIR}/Tv${KTRAIN}_${KVAL}_${NITEMS}_${NUSERS} \ + fmt="csv" \ + ktrain=${KTRAIN} \ + kval=${KVAL} \ + nitems=${NITEMS} \ + nusers=${NUSERS} & +fi + +#generate S scenarios (800MB) +if [ $MAXMEM -ge 800 ]; then + MULTIPLIER=3 + KTRAIN=$(echo "$BASE_ktrain * $MULTIPLIER" | bc) + KVAL=$(echo "$BASE_kval * $MULTIPLIER" | bc) + NITEMS=$(echo "$BASE_nitems * $MULTIPLIER" | bc) + NUSERS=$(echo "$BASE_nusers * $MULTIPLIER" | bc) + ${CMD} -f ../datagen/genRandData4NCF.dml --nvargs \ + 
users_train=${DATADIR}/Ut${KTRAIN}_${KVAL}_${NITEMS}_${NUSERS} \ + items_train=${DATADIR}/It${KTRAIN}_${KVAL}_${NITEMS}_${NUSERS} \ + targets_train=${DATADIR}/Tt${KTRAIN}_${KVAL}_${NITEMS}_${NUSERS} \ + users_val=${DATADIR}/Uv${KTRAIN}_${KVAL}_${NITEMS}_${NUSERS} \ + items_val=${DATADIR}/Iv${KTRAIN}_${KVAL}_${NITEMS}_${NUSERS} \ + targets_val=${DATADIR}/Tv${KTRAIN}_${KVAL}_${NITEMS}_${NUSERS} \ + fmt="csv" \ + ktrain=${KTRAIN} \ + kval=${KVAL} \ + nitems=${NITEMS} \ + nusers=${NUSERS} & +fi + +#generate M scenarios (8GB) +if [ $MAXMEM -ge 8000 ]; then + MULTIPLIER=9 + KTRAIN=$(echo "$BASE_ktrain * $MULTIPLIER" | bc) + KVAL=$(echo "$BASE_kval * $MULTIPLIER" | bc) + NITEMS=$(echo "$BASE_nitems * $MULTIPLIER" | bc) + NUSERS=$(echo "$BASE_nusers * $MULTIPLIER" | bc) + ${CMD} -f ../datagen/genRandData4NCF.dml --nvargs \ + users_train=${DATADIR}/Ut${KTRAIN}_${KVAL}_${NITEMS}_${NUSERS} \ + items_train=${DATADIR}/It${KTRAIN}_${KVAL}_${NITEMS}_${NUSERS} \ + targets_train=${DATADIR}/Tt${KTRAIN}_${KVAL}_${NITEMS}_${NUSERS} \ + users_val=${DATADIR}/Uv${KTRAIN}_${KVAL}_${NITEMS}_${NUSERS} \ + items_val=${DATADIR}/Iv${KTRAIN}_${KVAL}_${NITEMS}_${NUSERS} \ + targets_val=${DATADIR}/Tv${KTRAIN}_${KVAL}_${NITEMS}_${NUSERS} \ + fmt="csv" \ + ktrain=${KTRAIN} \ + kval=${KVAL} \ + nitems=${NITEMS} \ + nusers=${NUSERS} & +fi + +#generate L scenarios (80GB) +if [ $MAXMEM -ge 80000 ]; then + MULTIPLIER=27 + KTRAIN=$(echo "$BASE_ktrain * $MULTIPLIER" | bc) + KVAL=$(echo "$BASE_kval * $MULTIPLIER" | bc) + NITEMS=$(echo "$BASE_nitems * $MULTIPLIER" | bc) + NUSERS=$(echo "$BASE_nusers * $MULTIPLIER" | bc) + ${CMD} -f ../datagen/genRandData4NCF.dml --nvargs \ + users_train=${DATADIR}/Ut${KTRAIN}_${KVAL}_${NITEMS}_${NUSERS} \ + items_train=${DATADIR}/It${KTRAIN}_${KVAL}_${NITEMS}_${NUSERS} \ + targets_train=${DATADIR}/Tt${KTRAIN}_${KVAL}_${NITEMS}_${NUSERS} \ + users_val=${DATADIR}/Uv${KTRAIN}_${KVAL}_${NITEMS}_${NUSERS} \ + items_val=${DATADIR}/Iv${KTRAIN}_${KVAL}_${NITEMS}_${NUSERS} \ + 
targets_val=${DATADIR}/Tv${KTRAIN}_${KVAL}_${NITEMS}_${NUSERS} \ + fmt="csv" \ + ktrain=${KTRAIN} \ + kval=${KVAL} \ + nitems=${NITEMS} \ + nusers=${NUSERS} & +fi + +#generate XL scenarios (800GB) +if [ $MAXMEM -ge 800000 ]; then + MULTIPLIER=81 + KTRAIN=$(echo "$BASE_ktrain * $MULTIPLIER" | bc) + KVAL=$(echo "$BASE_kval * $MULTIPLIER" | bc) + NITEMS=$(echo "$BASE_nitems * $MULTIPLIER" | bc) + NUSERS=$(echo "$BASE_nusers * $MULTIPLIER" | bc) + ${CMD} -f ../datagen/genRandData4NCF.dml --nvargs \ + users_train=${DATADIR}/Ut${KTRAIN}_${KVAL}_${NITEMS}_${NUSERS} \ + items_train=${DATADIR}/It${KTRAIN}_${KVAL}_${NITEMS}_${NUSERS} \ + targets_train=${DATADIR}/Tt${KTRAIN}_${KVAL}_${NITEMS}_${NUSERS} \ + users_val=${DATADIR}/Uv${KTRAIN}_${KVAL}_${NITEMS}_${NUSERS} \ + items_val=${DATADIR}/Iv${KTRAIN}_${KVAL}_${NITEMS}_${NUSERS} \ + targets_val=${DATADIR}/Tv${KTRAIN}_${KVAL}_${NITEMS}_${NUSERS} \ + fmt="csv" \ + ktrain=${KTRAIN} \ + kval=${KVAL} \ + nitems=${NITEMS} \ + nusers=${NUSERS} & +fi + +wait \ No newline at end of file diff --git a/scripts/perftest/runAll.sh b/scripts/perftest/runAll.sh index 0b45eccb1b1..e80b4444540 100755 --- a/scripts/perftest/runAll.sh +++ b/scripts/perftest/runAll.sh @@ -110,7 +110,8 @@ echo -e "\n\n" >> results/times.txt #./datagen/genClusteringData.sh ${CMD} ${TEMPFOLDER} ${MAXMEM} &> logs/genClusteringData.out #./datagen/genDimensionReductionData.sh ${CMD} ${TEMPFOLDER} ${MAXMEM} &> logs/genDimensionReductionData.out #./datagen/genALSData.sh ${CMD} ${TEMPFOLDER} ${MAXMEM} &> logs/genALSData.out -./datagen/genNNData.sh ${CMD} ${TEMPFOLDER} ${MAXMEM} &> logs/genNNData.out +#./datagen/genNNData.sh ${CMD} ${TEMPFOLDER} ${MAXMEM} &> logs/genNNData.out +./datagen/genNCFData.sh ${CMD} ${TEMPFOLDER} ${MAXMEM} &> logs/genNCFData.out ### Micro Benchmarks: #./MatrixMult.sh ${CMD} @@ -127,7 +128,8 @@ echo -e "\n\n" >> results/times.txt #./runAllClustering.sh ${CMD} ${TEMPFOLDER} ${MAXMEM} #./runAllDimensionReduction.sh ${CMD} ${TEMPFOLDER} ${MAXMEM} 
#./runAllALS.sh ${CMD} ${TEMPFOLDER} ${MAXMEM} -./runAllNN.sh ${CMD} ${TEMPFOLDER} ${MAXMEM} +#./runAllNN.sh ${CMD} ${TEMPFOLDER} ${MAXMEM} +./runAllNCF.sh ${CMD} ${TEMPFOLDER} ${MAXMEM} # TODO The following benchmarks have yet to be written. The decision tree algorithms additionally need to be fixed. # add stepwise Linear diff --git a/scripts/perftest/runAllNCF.sh b/scripts/perftest/runAllNCF.sh new file mode 100755 index 00000000000..e8ca5a94e62 --- /dev/null +++ b/scripts/perftest/runAllNCF.sh @@ -0,0 +1,61 @@ +#!/bin/bash +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +#------------------------------------------------------------- +if [ "$(basename $PWD)" != "perftest" ]; then + echo "Please execute scripts from directory 'perftest'" + exit 1 +fi + +COMMAND=$1 +TEMPFOLDER=$2 +MAXMEM=$3 + +if [ "$TEMPFOLDER" == "" ]; then TEMPFOLDER=temp; fi +BASE=${TEMPFOLDER}/ncf +MAXITR=200 + +FILENAME=$0 +err_report() { + echo "Error in $FILENAME on line $1" +} +trap 'err_report $LINENO' ERR + +DATA=() # todo .. which data is needed? 
+if [ $MAXMEM -ge 80 ]; then DATA+=("1000_100_50_60"); fi +if [ $MAXMEM -ge 800 ]; then DATA+=("3000_300_150_180"); fi +if [ $MAXMEM -ge 8000 ]; then DATA+=("9000_900_450_540"); fi +if [ $MAXMEM -ge 80000 ]; then DATA+=("27000_2700_1350_1620"); fi +if [ $MAXMEM -ge 800000 ]; then DATA+=("81000_8100_4050_4860"); fi + +echo "RUN NEURAL COLLABORATIVE FILTERING EXPERIMENTS" $(date) >>results/times.txt + +for d in ${DATA[@]}; do #"_KDD" + for f in "runNCF"; do + echo "-- Running "$f" on "$d" for 5 epochs" >>results/times.txt + ./${f}.sh ${BASE}/Tt${d} ${BASE}/Tv${d} ${BASE}/It${d} ${BASE}/Iv${d} ${BASE}/Ut${d} ${BASE}/Uv${d} \ + ${BASE} ${COMMAND} ${d} 5 &>logs/${f}_${d}_5.out + echo "-- Running "$f" on "$d" for 50 epochs" >>results/times.txt + ./${f}.sh ${BASE}/Tt${d} ${BASE}/Tv${d} ${BASE}/It${d} ${BASE}/Iv${d} ${BASE}/Ut${d} ${BASE}/Uv${d} \ + ${BASE} ${COMMAND} ${d} 50 &>logs/${f}_${d}_50.out + done +done + +echo -e "\n\n" >>results/times.txt diff --git a/scripts/perftest/runAllNN.sh b/scripts/perftest/runAllNN.sh index 0abce52eb25..a628685811d 100755 --- a/scripts/perftest/runAllNN.sh +++ b/scripts/perftest/runAllNN.sh @@ -70,17 +70,17 @@ for d in ${DATA[@]}; do #"_KDD" # Regression tasks for f in "runNNSimpleSGD"; do echo "-- Running "$f" on "$d" for 5 epochs" >>results/times.txt - ./${f}.sh ${BASE}/X${d}_reg ${BASE}/Y${d}_reg ${BASE} ${COMMAND} ${d} 5 &>logs/${f}_${d}.out + ./${f}.sh ${BASE}/X${d}_reg ${BASE}/Y${d}_reg ${BASE} ${COMMAND} ${d} 5 &>logs/${f}_${d}_5.out echo "-- Running "$f" on "$d" for 50 epochs" >>results/times.txt - ./${f}.sh ${BASE}/X${d}_reg ${BASE}/Y${d}_reg ${BASE} ${COMMAND} ${d} 50 &>logs/${f}_${d}.out + ./${f}.sh ${BASE}/X${d}_reg ${BASE}/Y${d}_reg ${BASE} ${COMMAND} ${d} 50 &>logs/${f}_${d}_50.out done # Classification tasks for f in "runNNNesterovClassify"; do echo "-- Running "$f" on "$d" for 10 epochs" >>results/times.txt - ./${f}.sh ${BASE}/X${d}_class ${BASE}/Y${d}_class ${BASE} ${COMMAND} ${d} 10 &>logs/${f}_${d}.out + ./${f}.sh 
${BASE}/X${d}_class ${BASE}/Y${d}_class ${BASE} ${COMMAND} ${d} 10 &>logs/${f}_${d}_10.out echo "-- Running "$f" on "$d" for 100 epochs" >>results/times.txt - ./${f}.sh ${BASE}/X${d}_class ${BASE}/Y${d}_class ${BASE} ${COMMAND} ${d} 100 &>logs/${f}_${d}.out + ./${f}.sh ${BASE}/X${d}_class ${BASE}/Y${d}_class ${BASE} ${COMMAND} ${d} 100 &>logs/${f}_${d}_100.out done done diff --git a/scripts/perftest/runNCF.sh b/scripts/perftest/runNCF.sh new file mode 100755 index 00000000000..4638707c526 --- /dev/null +++ b/scripts/perftest/runNCF.sh @@ -0,0 +1,105 @@ +#!/bin/bash +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+# +#------------------------------------------------------------- +# params: +# 1-6) train/val data paths: targets ($1,$2), items ($3,$4), users ($5,$6) +# 7) path of base temp dir +# 8) command for systemds +# 9-10) log identifier, number of epochs +set -e + +if [ "$(basename $PWD)" != "perftest" ]; then + echo "Please execute scripts from directory 'perftest'" + exit 1 +fi + +TTrain=$1 +TVal=$2 +ITrain=$3 +IVal=$4 +UTrain=$5 +UVal=$6 +BASE=$7 +CMD=$8 +LOGIDENTIFIER=$9 +EPOCHS=${10} + +echo "running NCF" +echo \ +${CMD} -f scripts/NCF-train.dml \ + --config conf/SystemDS-config.xml \ + --stats \ + --nvargs B=${BASE} fmt="csv" \ + targets_train=${TTrain} \ + targets_val=${TVal} \ + items_train=${ITrain} \ + items_val=${IVal} \ + users_train=${UTrain} \ + users_val=${UVal} \ + epochs=${EPOCHS} +#training +tstart=$(date +%s.%N) +${CMD} -f scripts/NCF-train.dml \ + --config conf/SystemDS-config.xml \ + --stats \ + --nvargs B=${BASE} fmt="csv" \ + targets_train=${TTrain} \ + targets_val=${TVal} \ + items_train=${ITrain} \ + items_val=${IVal} \ + users_train=${UTrain} \ + users_val=${UVal} \ + epochs=${EPOCHS} \ + &>logs/NCF-train_${LOGIDENTIFIER}_${EPOCHS}.out + +ttrain=$(echo "$(date +%s.%N) - $tstart - .4" | bc) +echo "NCF trained on "$9": "$ttrain >>results/times.txt + +#predict +tstart=$(date +%s.%N) +${CMD} -f scripts/NCF-predict.dml \ + --config conf/SystemDS-config.xml \ + --stats \ + --nvargs B=${BASE} fmt="csv" epochs=${EPOCHS} \ + items=${ITrain} \ + users=${UTrain} \ + target=${TTrain} \ + biases=${BASE}/ncf_biases \ + weights=${BASE}/ncf_weights \ +
&>logs/NCF-predict_val_${LOGIDENTIFIER}_${EPOCHS}.out + +tpredict=$(echo "$(date +%s.%N) - $tstart - .4" | bc) +echo "NCF predicted on validation data "$9": "$tpredict >>results/times.txt diff --git a/scripts/perftest/runNNNesterovClassify.sh b/scripts/perftest/runNNNesterovClassify.sh index 83feb2f2393..011ddbf3d01 100755 --- a/scripts/perftest/runNNNesterovClassify.sh +++ b/scripts/perftest/runNNNesterovClassify.sh @@ -48,7 +48,7 @@ ${CMD} -f scripts/nnNesterovClassify-train.dml \ --nvargs X=${X} Y=${Y} B=${BASE} fmt="csv" epochs=${EPOCHS} &>logs/nnNesterovClassify-train_${LOGIDENTIFIER}_${EPOCHS}.out ttrain=$(echo "$(date +%s.%N) - $tstart - .4" | bc) -echo "nesterov momentum neural network trained with SGD on "$1": "$ttrain >>results/times.txt +echo "nesterov momentum neural network trained with SGD on "$5": "$ttrain >>results/times.txt #predict tstart=$(date +%s.%N) @@ -59,4 +59,4 @@ ${CMD} -f scripts/nnNesterovClassify-predict.dml \ #--nvargs fmt=csv X=$1_test B=${BASE}/b Y=$2_test M=${BASE}/m O=${BASE}/out.csv &>logs/nnNesterovClassify-predict_${LOGIDENTIFIER}.out tpredict=$(echo "$(date +%s.%N) - $tstart - .4" | bc) -echo "nesterov momentum neural network trained with SGD predicted on "$1": "$tpredict >>results/times.txt +echo "nesterov momentum neural network trained with SGD predicted on "$5": "$tpredict >>results/times.txt diff --git a/scripts/perftest/runNNSimpleSGD.sh b/scripts/perftest/runNNSimpleSGD.sh index 5887086e023..7950625d37e 100755 --- a/scripts/perftest/runNNSimpleSGD.sh +++ b/scripts/perftest/runNNSimpleSGD.sh @@ -48,7 +48,7 @@ ${CMD} -f scripts/nnSimpleSGD-train.dml \ --nvargs X=${X} Y=${Y} B=${BASE} fmt="csv" epochs=${EPOCHS} &>logs/nnSimpleSGD-train_${LOGIDENTIFIER}_${EPOCHS}.out ttrain=$(echo "$(date +%s.%N) - $tstart - .4" | bc) -echo "simple neural network trained with SGD on "$1": "$ttrain >>results/times.txt +echo "simple neural network trained with SGD on "$5": "$ttrain >>results/times.txt #predict tstart=$(date +%s.%N) @@ -59,4 
+59,4 @@ ${CMD} -f scripts/nnSimpleSGD-predict.dml \ # --nvargs fmt=csv X=$1_test B=${BASE} Y=$2_test tpredict=$(echo "$(date +%s.%N) - $tstart - .4" | bc) -echo "simple neural network trained with SGD predicted on "$1": "$tpredict >>results/times.txt +echo "simple neural network trained with SGD predicted on "$5": "$tpredict >>results/times.txt diff --git a/scripts/perftest/scripts/NCF-predict.dml b/scripts/perftest/scripts/NCF-predict.dml new file mode 100755 index 00000000000..339f24294ab --- /dev/null +++ b/scripts/perftest/scripts/NCF-predict.dml @@ -0,0 +1,35 @@ +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+# +#------------------------------------------------------------- +source("../../staging/NCF.dml") as NCF +fmt=ifdef($fmt, "csv") + +biases = read($biases) +weights = read($weights) + +items = read($items); +users = read($users); +target = read($target) + +[out_FA, out_F, out_D1A, out_D1, out_D2A, out_D2, out_D3A, out_D3, out_concat, out_U, out_I] = NCF::predict(users, items, biases, weights); + +[loss, accuracy] = NCF::eval(out_FA, target); + +print("got loss and accuracy of: " + loss + ", " + accuracy) diff --git a/scripts/perftest/scripts/NCF-train.dml b/scripts/perftest/scripts/NCF-train.dml new file mode 100755 index 00000000000..f92535af9c8 --- /dev/null +++ b/scripts/perftest/scripts/NCF-train.dml @@ -0,0 +1,45 @@ +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+# +#------------------------------------------------------------- +source("../../staging/NCF.dml") as NCF + +fmt=ifdef($fmt, "csv") + +targets_train = read($targets_train); +targets_val = read($targets_val); +items_train = read($items_train); +items_val = read($items_val); +users_train = read($users_train); +users_val = read($users_val); + +# Train +epochs = ifdef($epochs, 50); +batch_size = ifdef($batch_size, 16); + +# layer dimensions +E = ifdef($embedding, 8); # embedding +D1 = ifdef($d1, 64); # dense layer 1 +D2 = ifdef($d2, 32); # dense layer 2 +D3 = ifdef($d3, 16); # dense layer 3 + +[biases, weights] = NCF::train(users_train, items_train, targets_train, users_val, items_val, targets_val, epochs, batch_size, E, D1, D2, D3); + +write(biases, ""+$B+"/ncf_biases",format=fmt) +write(weights, ""+$B+"/ncf_weights",format=fmt) \ No newline at end of file diff --git a/scripts/staging/NCF.dml b/scripts/staging/NCF.dml index 0719b585e2f..3a8ba62f882 100644 --- a/scripts/staging/NCF.dml +++ b/scripts/staging/NCF.dml @@ -24,12 +24,12 @@ # # Imports -source("nn/optim/adam.dml") as adam -source("nn/layers/relu.dml") as relu -source("nn/layers/sigmoid.dml") as sigmoid -source("nn/layers/affine.dml") as affine -source("nn/layers/log_loss.dml") as log_loss -source("nn/layers/l2_reg.dml") as l2_reg +source("../nn/optim/adam.dml") as adam +source("../nn/layers/relu.dml") as relu +source("../nn/layers/sigmoid.dml") as sigmoid +source("../nn/layers/affine.dml") as affine +source("../nn/layers/log_loss.dml") as log_loss +source("../nn/layers/l2_reg.dml") as l2_reg train = function( matrix[double] users_train, matrix[double] items_train, From f9da87caf989700b0abe48bf915f02759e9a2710 Mon Sep 17 00:00:00 2001 From: Sheypex Date: Fri, 23 Jun 2023 19:38:54 +0200 Subject: [PATCH 09/19] added flags in runAll.sh to toggle execution of nn tests and use of gpu for nn tests --- scripts/perftest/runAll.sh | 50 ++++++++++++++++++++++++----------- scripts/perftest/runAllNCF.sh | 4 +-- 
scripts/perftest/runAllNN.sh | 29 ++++---------------- 3 files changed, 41 insertions(+), 42 deletions(-) diff --git a/scripts/perftest/runAll.sh b/scripts/perftest/runAll.sh index e80b4444540..8c727f9bfc6 100755 --- a/scripts/perftest/runAll.sh +++ b/scripts/perftest/runAll.sh @@ -20,10 +20,9 @@ # #------------------------------------------------------------- -if [ "$(basename $PWD)" != "perftest" ]; -then +if [ "$(basename $PWD)" != "perftest" ]; then echo "Please execute scripts from directory 'perftest'" - exit 1; + exit 1 fi # Command to be executed @@ -32,7 +31,7 @@ TEMPFOLDER="temp" # Max memory of data to be benchmarked # Possible values: 80/80MB, 800/800MB, 8000/8000MB/8GB, 80000/80000MB/80GB, 800000/800000MB/800GB -MAXMEM=800 +MAXMEM=80 # Set properties export LOG4JPROP='conf/log4j-off.properties' @@ -85,22 +84,26 @@ MAXMEM=${MAXMEM%"MB"}; MAXMEM=${MAXMEM/GB/"000"} # Possible lines to initialize Intel MKL, depending on version and install location if [ -d ~/intel ] && [ -d ~/intel/bin ] && [ -f ~/intel/bin/compilervars.sh ]; then - . ~/intel/bin/compilervars.sh intel64 + . ~/intel/bin/compilervars.sh intel64 elif [ -d /opt ] && [ -d /opt/intel ] && [ -d /opt/intel/bin ]; then - . /opt/intel/bin/compilervars.sh intel64 + . 
/opt/intel/bin/compilervars.sh intel64 fi # make dirs if not exsisting -mkdir -p logs -mkdir -p results +mkdir -p logs +mkdir -p results mkdir -p temp +# Flags for tests of components in nn +DO_TESTS_FOR_NN=true # toggle execution of datagen for as well as tests of nn components themselves +USE_GPU_FOR_NN=true # toggle gpu usage for nn tests + # init time measurement rm -f results/times.txt -date +"%Y-%m-%d-%T" >> results/times.txt -echo -e "\n$HOSTNAME" >> results/times.txt -echo -e "\n\n" >> results/times.txt +date +"%Y-%m-%d-%T" >>results/times.txt +echo -e "\n$HOSTNAME" >>results/times.txt +echo -e "\n\n" >>results/times.txt ## Data Gen #./datagen/genBinomialData.sh ${CMD} ${TEMPFOLDER} ${MAXMEM} &> logs/genBinomialData.out @@ -110,8 +113,12 @@ echo -e "\n\n" >> results/times.txt #./datagen/genClusteringData.sh ${CMD} ${TEMPFOLDER} ${MAXMEM} &> logs/genClusteringData.out #./datagen/genDimensionReductionData.sh ${CMD} ${TEMPFOLDER} ${MAXMEM} &> logs/genDimensionReductionData.out #./datagen/genALSData.sh ${CMD} ${TEMPFOLDER} ${MAXMEM} &> logs/genALSData.out -#./datagen/genNNData.sh ${CMD} ${TEMPFOLDER} ${MAXMEM} &> logs/genNNData.out -./datagen/genNCFData.sh ${CMD} ${TEMPFOLDER} ${MAXMEM} &> logs/genNCFData.out + +# Data for tests of nn components +if [ "$DO_TESTS_FOR_NN" = true ]; then + ./datagen/genNNData.sh ${CMD} ${TEMPFOLDER} ${MAXMEM} &>logs/genNNData.out + ./datagen/genNCFData.sh ${CMD} ${TEMPFOLDER} ${MAXMEM} &>logs/genNCFData.out +fi ### Micro Benchmarks: #./MatrixMult.sh ${CMD} @@ -128,11 +135,22 @@ echo -e "\n\n" >> results/times.txt #./runAllClustering.sh ${CMD} ${TEMPFOLDER} ${MAXMEM} #./runAllDimensionReduction.sh ${CMD} ${TEMPFOLDER} ${MAXMEM} #./runAllALS.sh ${CMD} ${TEMPFOLDER} ${MAXMEM} -#./runAllNN.sh ${CMD} ${TEMPFOLDER} ${MAXMEM} -./runAllNCF.sh ${CMD} ${TEMPFOLDER} ${MAXMEM} + +# Tests of nn components +if [ "$DO_TESTS_FOR_NN" = true ]; then + # take care of toggle to use/not to use gpu for nn tests by conditionally adding the -gpu 
execution flag onto the CMD variable + NN_CMD=$CMD + if [ "$USE_GPU_FOR_NN" = true ]; then + NN_CMD="${CMD} -gpu" + fi + # NOTICE: remember to pass the command variable as a quoted string! + # otherwise the command (eg. `systemds -gpu` without quotes) will be split into two variables in subscripts when USE_GPU_FOR_NN is set + ./runAllNN.sh "${NN_CMD}" ${TEMPFOLDER} ${MAXMEM} + ./runAllNCF.sh "${NN_CMD}" ${TEMPFOLDER} ${MAXMEM} # currently broken: staging/NCF.dml and any dml that sources it die on launch +fi # TODO The following benchmarks have yet to be written. The decision tree algorithms additionally need to be fixed. -# add stepwise Linear +# add stepwise Linear # add stepwise GLM #./runAllTrees.sh $CMD $TEMPFOLDER # add randomForest diff --git a/scripts/perftest/runAllNCF.sh b/scripts/perftest/runAllNCF.sh index e8ca5a94e62..7114af7c2d4 100755 --- a/scripts/perftest/runAllNCF.sh +++ b/scripts/perftest/runAllNCF.sh @@ -51,10 +51,10 @@ for d in ${DATA[@]}; do #"_KDD" for f in "runNCF"; do echo "-- Running "$f" on "$d" for 5 epochs" >>results/times.txt ./${f}.sh ${BASE}/Tt${d} ${BASE}/Tv${d} ${BASE}/It${d} ${BASE}/Iv${d} ${BASE}/Ut${d} ${BASE}/Uv${d} \ - ${BASE} ${COMMAND} ${d} 5 &>logs/${f}_${d}_5.out + ${BASE} "${COMMAND}" ${d} 5 &>logs/${f}_${d}_5.out echo "-- Running "$f" on "$d" for 50 epochs" >>results/times.txt ./${f}.sh ${BASE}/Tt${d} ${BASE}/Tv${d} ${BASE}/It${d} ${BASE}/Iv${d} ${BASE}/Ut${d} ${BASE}/Uv${d} \ - ${BASE} ${COMMAND} ${d} 50 &>logs/${f}_${d}_50.out + ${BASE} "${COMMAND}" ${d} 50 &>logs/${f}_${d}_50.out done done diff --git a/scripts/perftest/runAllNN.sh b/scripts/perftest/runAllNN.sh index a628685811d..b56441f7a75 100755 --- a/scripts/perftest/runAllNN.sh +++ b/scripts/perftest/runAllNN.sh @@ -40,7 +40,7 @@ trap 'err_report $LINENO' ERR DATA=() # todo .. which data is needed? 
if [ $MAXMEM -ge 80 ]; then DATA+=("1024_100_1"); fi -if [ $MAXMEM -ge 800 ]; then DATA+=("3072_300_1" ); fi +if [ $MAXMEM -ge 800 ]; then DATA+=("3072_300_1"); fi if [ $MAXMEM -ge 8000 ]; then DATA+=("9216_900_1"); fi if [ $MAXMEM -ge 80000 ]; then DATA+=("27648_2700_1"); fi if [ $MAXMEM -ge 800000 ]; then DATA+=("82944_8200_1"); fi @@ -48,39 +48,20 @@ if [ $MAXMEM -ge 800000 ]; then DATA+=("82944_8200_1"); fi echo "RUN NEURAL NETWORK EXPERIMENTS" $(date) >>results/times.txt for d in ${DATA[@]}; do #"_KDD" - - # ------------------------------------------------------------------------------------------------------------------- - # TODO return an additional output to preserve the internal scaling from training (for the built-in functions lmCG and lmDS). - # The original scripts algorithms/LinearRegCG.dml and algorithms/LinearRegDS.dml do have that additional output column, but the respective built-in functions do not. - # ------------------------------------------------------------------------------------------------------------------- - - # for f in "runLinearRegDS" - # do - # echo "-- Running "$f" on "$d" (all configs)" >> results/times.txt; - # ./${f}.sh ${BASE}/X${d} ${BASE}/y${d} ${BASE} ${COMMAND} &> logs/${f}_${d}.out; - # done - # - # # run with the parameter setting maximum of iterations - # for f in "runLinearRegCG" "runGLM_poisson_log" "runGLM_gamma_log" "runGLM_binomial_probit" - # do - # echo "-- Running "$f" on "$d" (all configs)" >> results/times.txt; - # ./${f}.sh ${BASE}/X${d} ${BASE}/y${d} ${BASE} ${MAXITR} ${COMMAND} &> logs/${f}_${d}.out; - # done - # Regression tasks for f in "runNNSimpleSGD"; do echo "-- Running "$f" on "$d" for 5 epochs" >>results/times.txt - ./${f}.sh ${BASE}/X${d}_reg ${BASE}/Y${d}_reg ${BASE} ${COMMAND} ${d} 5 &>logs/${f}_${d}_5.out + ./${f}.sh ${BASE}/X${d}_reg ${BASE}/Y${d}_reg ${BASE} "${COMMAND}" ${d} 5 &>logs/${f}_${d}_5.out echo "-- Running "$f" on "$d" for 50 epochs" >>results/times.txt - ./${f}.sh ${BASE}/X${d}_reg 
${BASE}/Y${d}_reg ${BASE} ${COMMAND} ${d} 50 &>logs/${f}_${d}_50.out + ./${f}.sh ${BASE}/X${d}_reg ${BASE}/Y${d}_reg ${BASE} "${COMMAND}" ${d} 50 &>logs/${f}_${d}_50.out done # Classification tasks for f in "runNNNesterovClassify"; do echo "-- Running "$f" on "$d" for 10 epochs" >>results/times.txt - ./${f}.sh ${BASE}/X${d}_class ${BASE}/Y${d}_class ${BASE} ${COMMAND} ${d} 10 &>logs/${f}_${d}_10.out + ./${f}.sh ${BASE}/X${d}_class ${BASE}/Y${d}_class ${BASE} "${COMMAND}" ${d} 10 &>logs/${f}_${d}_10.out echo "-- Running "$f" on "$d" for 100 epochs" >>results/times.txt - ./${f}.sh ${BASE}/X${d}_class ${BASE}/Y${d}_class ${BASE} ${COMMAND} ${d} 100 &>logs/${f}_${d}_100.out + ./${f}.sh ${BASE}/X${d}_class ${BASE}/Y${d}_class ${BASE} "${COMMAND}" ${d} 100 &>logs/${f}_${d}_100.out done done From cd83bd459ec66f52267fe97ce0d596d39100bb44 Mon Sep 17 00:00:00 2001 From: Sheypex Date: Mon, 26 Jun 2023 16:47:27 +0200 Subject: [PATCH 10/19] fixed -gpu flag --- scripts/perftest/runAll.sh | 13 ++++-------- scripts/perftest/runAllNCF.sh | 5 +++-- scripts/perftest/runAllNN.sh | 9 +++++---- scripts/perftest/runNCF.sh | 16 +++++++++++---- scripts/perftest/runNNNesterovClassify.sh | 10 ++++++++-- scripts/perftest/runNNSimpleSGD.sh | 10 ++++++++-- .../scripts/nnNesterovClassify-predict.dml | 20 +++++++++++++++++++ .../scripts/nnNesterovClassify-train.dml | 20 +++++++++++++++++++ .../perftest/scripts/nnSimpleSGD-predict.dml | 20 +++++++++++++++++++ .../perftest/scripts/nnSimpleSGD-train.dml | 20 +++++++++++++++++++ 10 files changed, 120 insertions(+), 23 deletions(-) diff --git a/scripts/perftest/runAll.sh b/scripts/perftest/runAll.sh index 8c727f9bfc6..c843ab3b6f4 100755 --- a/scripts/perftest/runAll.sh +++ b/scripts/perftest/runAll.sh @@ -31,7 +31,7 @@ TEMPFOLDER="temp" # Max memory of data to be benchmarked # Possible values: 80/80MB, 800/800MB, 8000/8000MB/8GB, 80000/80000MB/80GB, 800000/800000MB/800GB -MAXMEM=80 +MAXMEM=800 # Set properties export 
LOG4JPROP='conf/log4j-off.properties' @@ -96,7 +96,7 @@ mkdir -p temp # Flags for tests of components in nn DO_TESTS_FOR_NN=true # toggle execution of datagen for as well as tests of nn components themselves -USE_GPU_FOR_NN=true # toggle gpu usage for nn tests +USE_GPU_FOR_NN=false # toggle gpu usage for nn tests # init time measurement @@ -138,15 +138,10 @@ fi # Tests of nn components if [ "$DO_TESTS_FOR_NN" = true ]; then - # take care of toggle to use/not to use gpu for nn tests by conditionally adding the -gpu execution flag onto the CMD variable - NN_CMD=$CMD - if [ "$USE_GPU_FOR_NN" = true ]; then - NN_CMD="${CMD} -gpu" - fi # NOTICE: remember to pass the command variable as a quoted string! # otherwise the command (eg. `systemds -gpu` without quotes) will be split into two variables in subscripts when USE_GPU_FOR_NN is set - ./runAllNN.sh "${NN_CMD}" ${TEMPFOLDER} ${MAXMEM} - ./runAllNCF.sh "${NN_CMD}" ${TEMPFOLDER} ${MAXMEM} # currently broken: staging/NCF.dml and any dml that sources it die on launch + ./runAllNN.sh "${CMD}" ${TEMPFOLDER} ${MAXMEM} ${USE_GPU_FOR_NN} + ./runAllNCF.sh "${CMD}" ${TEMPFOLDER} ${MAXMEM} ${USE_GPU_FOR_NN} # currently broken: staging/NCF.dml and any dml that sources it die on launch fi # TODO The following benchmarks have yet to be written. The decision tree algorithms additionally need to be fixed. 
diff --git a/scripts/perftest/runAllNCF.sh b/scripts/perftest/runAllNCF.sh index 7114af7c2d4..e2639012612 100755 --- a/scripts/perftest/runAllNCF.sh +++ b/scripts/perftest/runAllNCF.sh @@ -27,6 +27,7 @@ fi COMMAND=$1 TEMPFOLDER=$2 MAXMEM=$3 +USEGPU=$4 if [ "$TEMPFOLDER" == "" ]; then TEMPFOLDER=temp; fi BASE=${TEMPFOLDER}/ncf @@ -51,10 +52,10 @@ for d in ${DATA[@]}; do #"_KDD" for f in "runNCF"; do echo "-- Running "$f" on "$d" for 5 epochs" >>results/times.txt ./${f}.sh ${BASE}/Tt${d} ${BASE}/Tv${d} ${BASE}/It${d} ${BASE}/Iv${d} ${BASE}/Ut${d} ${BASE}/Uv${d} \ - ${BASE} "${COMMAND}" ${d} 5 &>logs/${f}_${d}_5.out + ${BASE} "${COMMAND}" ${d} 5 ${USEGPU} &>logs/${f}_${d}_5.out echo "-- Running "$f" on "$d" for 50 epochs" >>results/times.txt ./${f}.sh ${BASE}/Tt${d} ${BASE}/Tv${d} ${BASE}/It${d} ${BASE}/Iv${d} ${BASE}/Ut${d} ${BASE}/Uv${d} \ - ${BASE} "${COMMAND}" ${d} 50 &>logs/${f}_${d}_50.out + ${BASE} "${COMMAND}" ${d} 50 ${USEGPU} &>logs/${f}_${d}_50.out done done diff --git a/scripts/perftest/runAllNN.sh b/scripts/perftest/runAllNN.sh index b56441f7a75..c7285c362a9 100755 --- a/scripts/perftest/runAllNN.sh +++ b/scripts/perftest/runAllNN.sh @@ -27,6 +27,7 @@ fi COMMAND=$1 TEMPFOLDER=$2 MAXMEM=$3 +USEGPU=$4 if [ "$TEMPFOLDER" == "" ]; then TEMPFOLDER=temp; fi BASE=${TEMPFOLDER}/nn @@ -51,17 +52,17 @@ for d in ${DATA[@]}; do #"_KDD" # Regression tasks for f in "runNNSimpleSGD"; do echo "-- Running "$f" on "$d" for 5 epochs" >>results/times.txt - ./${f}.sh ${BASE}/X${d}_reg ${BASE}/Y${d}_reg ${BASE} "${COMMAND}" ${d} 5 &>logs/${f}_${d}_5.out + ./${f}.sh ${BASE}/X${d}_reg ${BASE}/Y${d}_reg ${BASE} "${COMMAND}" ${d} 5 ${USEGPU} &>logs/${f}_${d}_5.out echo "-- Running "$f" on "$d" for 50 epochs" >>results/times.txt - ./${f}.sh ${BASE}/X${d}_reg ${BASE}/Y${d}_reg ${BASE} "${COMMAND}" ${d} 50 &>logs/${f}_${d}_50.out + ./${f}.sh ${BASE}/X${d}_reg ${BASE}/Y${d}_reg ${BASE} "${COMMAND}" ${d} 50 ${USEGPU} &>logs/${f}_${d}_50.out done # Classification tasks for f in 
"runNNNesterovClassify"; do echo "-- Running "$f" on "$d" for 10 epochs" >>results/times.txt - ./${f}.sh ${BASE}/X${d}_class ${BASE}/Y${d}_class ${BASE} "${COMMAND}" ${d} 10 &>logs/${f}_${d}_10.out + ./${f}.sh ${BASE}/X${d}_class ${BASE}/Y${d}_class ${BASE} "${COMMAND}" ${d} 10 ${USEGPU} &>logs/${f}_${d}_10.out echo "-- Running "$f" on "$d" for 100 epochs" >>results/times.txt - ./${f}.sh ${BASE}/X${d}_class ${BASE}/Y${d}_class ${BASE} "${COMMAND}" ${d} 100 &>logs/${f}_${d}_100.out + ./${f}.sh ${BASE}/X${d}_class ${BASE}/Y${d}_class ${BASE} "${COMMAND}" ${d} 100 ${USEGPU} &>logs/${f}_${d}_100.out done done diff --git a/scripts/perftest/runNCF.sh b/scripts/perftest/runNCF.sh index 4638707c526..d15149a61e1 100755 --- a/scripts/perftest/runNCF.sh +++ b/scripts/perftest/runNCF.sh @@ -41,12 +41,20 @@ BASE=$7 CMD=$8 LOGIDENTIFIER=$9 EPOCHS=${10} +USEGPU=${11} + +FLAGS="--stats" +if [ "$USEGPU" = true ]; then + FLAGS="${FLAGS} --gpu" +fi + + echo "running NCF" echo \ ${CMD} -f scripts/NCF-train.dml \ --config conf/SystemDS-config.xml \ - --stats \ + ${FLAGS} \ --nvargs B=${BASE} fmt="csv" \ targets_train=${TTrain} \ targets_val=${TVal} \ @@ -59,7 +67,7 @@ ${CMD} -f scripts/NCF-train.dml \ tstart=$(date +%s.%N) ${CMD} -f scripts/NCF-train.dml \ --config conf/SystemDS-config.xml \ - --stats \ + ${FLAGS} \ --nvargs B=${BASE} fmt="csv" \ targets_train=${TTrain} \ targets_val=${TVal} \ @@ -77,7 +85,7 @@ echo "NCF trained on "$9": "$ttrain >>results/times.txt tstart=$(date +%s.%N) ${CMD} -f scripts/NCF-predict.dml \ --config conf/SystemDS-config.xml \ - --stats \ + ${FLAGS} \ --nvargs B=${BASE} fmt="csv" epochs=${EPOCHS} \ items=${ITrain} \ users=${UTrain} \ @@ -92,7 +100,7 @@ echo "NCF predicted on training data "$9": "$tpredict >>results/times.txt tstart=$(date +%s.%N) ${CMD} -f scripts/NCF-predict.dml \ --config conf/SystemDS-config.xml \ - --stats \ + ${FLAGS} \ --nvargs B=${BASE} fmt="csv" epochs=${EPOCHS} \ items=${IVal} \ users=${UVal} \ diff --git 
a/scripts/perftest/runNNNesterovClassify.sh b/scripts/perftest/runNNNesterovClassify.sh index 011ddbf3d01..4fe90085be7 100755 --- a/scripts/perftest/runNNNesterovClassify.sh +++ b/scripts/perftest/runNNNesterovClassify.sh @@ -37,6 +37,12 @@ BASE=$3 CMD=$4 LOGIDENTIFIER=$5 EPOCHS=$6 +USEGPU=$7 + +FLAGS="--stats" +if [ "$USEGPU" = true ]; then + FLAGS="${FLAGS} --gpu" +fi echo "running sgd nn classifier with nesterov momentum" @@ -44,7 +50,7 @@ echo "running sgd nn classifier with nesterov momentum" tstart=$(date +%s.%N) ${CMD} -f scripts/nnNesterovClassify-train.dml \ --config conf/SystemDS-config.xml \ - --stats \ + ${FLAGS} \ --nvargs X=${X} Y=${Y} B=${BASE} fmt="csv" epochs=${EPOCHS} &>logs/nnNesterovClassify-train_${LOGIDENTIFIER}_${EPOCHS}.out ttrain=$(echo "$(date +%s.%N) - $tstart - .4" | bc) @@ -54,7 +60,7 @@ echo "nesterov momentum neural network trained with SGD on "$5": "$ttrain >>resu tstart=$(date +%s.%N) ${CMD} -f scripts/nnNesterovClassify-predict.dml \ --config conf/SystemDS-config.xml \ - --stats \ + ${FLAGS} \ --nvargs X=${X} Y=${Y} B=${BASE} fmt="csv" &>logs/nnNesterovClassify-predict_${LOGIDENTIFIER}_${EPOCHS}.out #--nvargs fmt=csv X=$1_test B=${BASE}/b Y=$2_test M=${BASE}/m O=${BASE}/out.csv &>logs/nnNesterovClassify-predict_${LOGIDENTIFIER}.out diff --git a/scripts/perftest/runNNSimpleSGD.sh b/scripts/perftest/runNNSimpleSGD.sh index 7950625d37e..8c78e4c7af6 100755 --- a/scripts/perftest/runNNSimpleSGD.sh +++ b/scripts/perftest/runNNSimpleSGD.sh @@ -37,6 +37,12 @@ BASE=$3 CMD=$4 LOGIDENTIFIER=$5 EPOCHS=$6 +USEGPU=$7 + +FLAGS="--stats" +if [ "$USEGPU" = true ]; then + FLAGS="${FLAGS} --gpu" +fi echo "running simple sgd neural network" @@ -44,7 +50,7 @@ echo "running simple sgd neural network" tstart=$(date +%s.%N) ${CMD} -f scripts/nnSimpleSGD-train.dml \ --config conf/SystemDS-config.xml \ - --stats \ + ${FLAGS} \ --nvargs X=${X} Y=${Y} B=${BASE} fmt="csv" epochs=${EPOCHS} &>logs/nnSimpleSGD-train_${LOGIDENTIFIER}_${EPOCHS}.out ttrain=$(echo 
"$(date +%s.%N) - $tstart - .4" | bc) @@ -54,7 +60,7 @@ echo "simple neural network trained with SGD on "$5": "$ttrain >>results/times.t tstart=$(date +%s.%N) ${CMD} -f scripts/nnSimpleSGD-predict.dml \ --config conf/SystemDS-config.xml \ - --stats \ + ${FLAGS} \ --nvargs fmt="csv" X=${X} Y=${Y} B=${BASE} &>logs/nnSimpleSGD-predict_${LOGIDENTIFIER}_${EPOCHS}.out # --nvargs fmt=csv X=$1_test B=${BASE} Y=$2_test diff --git a/scripts/perftest/scripts/nnNesterovClassify-predict.dml b/scripts/perftest/scripts/nnNesterovClassify-predict.dml index f97aedbf178..378c5612883 100644 --- a/scripts/perftest/scripts/nnNesterovClassify-predict.dml +++ b/scripts/perftest/scripts/nnNesterovClassify-predict.dml @@ -1,3 +1,23 @@ +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+# +#------------------------------------------------------------- # Imports source("../../nn/layers/affine.dml") as affine source("../../nn/layers/cross_entropy_loss.dml") as cross_entropy_loss diff --git a/scripts/perftest/scripts/nnNesterovClassify-train.dml b/scripts/perftest/scripts/nnNesterovClassify-train.dml index ad21a2f7123..8d09dcb291a 100644 --- a/scripts/perftest/scripts/nnNesterovClassify-train.dml +++ b/scripts/perftest/scripts/nnNesterovClassify-train.dml @@ -1,3 +1,23 @@ +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +#------------------------------------------------------------- # Imports source("../../nn/layers/affine.dml") as affine source("../../nn/layers/cross_entropy_loss.dml") as cross_entropy_loss diff --git a/scripts/perftest/scripts/nnSimpleSGD-predict.dml b/scripts/perftest/scripts/nnSimpleSGD-predict.dml index 44142402c89..7f12b65e270 100644 --- a/scripts/perftest/scripts/nnSimpleSGD-predict.dml +++ b/scripts/perftest/scripts/nnSimpleSGD-predict.dml @@ -1,3 +1,23 @@ +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. 
See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +#------------------------------------------------------------- # Imports source("../../nn/layers/affine.dml") as affine source("../../nn/layers/l2_loss.dml") as l2_loss diff --git a/scripts/perftest/scripts/nnSimpleSGD-train.dml b/scripts/perftest/scripts/nnSimpleSGD-train.dml index 188e09dce6a..91ef4fa314f 100644 --- a/scripts/perftest/scripts/nnSimpleSGD-train.dml +++ b/scripts/perftest/scripts/nnSimpleSGD-train.dml @@ -1,3 +1,23 @@ +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+# +#------------------------------------------------------------- # Imports source("../../nn/layers/affine.dml") as affine source("../../nn/layers/l2_loss.dml") as l2_loss From 68e7184ec74436523592fffd05e361fda3414649 Mon Sep 17 00:00:00 2001 From: Sheypex Date: Mon, 26 Jun 2023 18:29:49 +0200 Subject: [PATCH 11/19] moved to using existing datagen scripts genRandData4LogisticRegression and genRandData4Multinomial. now also running tests for sparse and dense data. not yet utilizing generated test data sets --- .../datagen/genRandData4NNClassification.dml | 67 --- scripts/datagen/genRandData4NNRegression.dml | 63 --- scripts/perftest/datagen/genNCFData.sh | 4 +- scripts/perftest/datagen/genNNData.sh | 492 +++++++++++++++++- scripts/perftest/runAllNN.sh | 76 ++- 5 files changed, 544 insertions(+), 158 deletions(-) delete mode 100644 scripts/datagen/genRandData4NNClassification.dml delete mode 100644 scripts/datagen/genRandData4NNRegression.dml diff --git a/scripts/datagen/genRandData4NNClassification.dml b/scripts/datagen/genRandData4NNClassification.dml deleted file mode 100644 index 295025aea32..00000000000 --- a/scripts/datagen/genRandData4NNClassification.dml +++ /dev/null @@ -1,67 +0,0 @@ -#------------------------------------------------------------- -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. 
See the License for the -# specific language governing permissions and limitations -# under the License. -# -#------------------------------------------------------------- - -# -# Generates random Gaussian-mixture data to test k-Means clustering algorithms -# -# INPUT PARAMETERS: -# ---------------------------------------------------------------------------- -# NAME TYPE DEFAULT MEANING -# ---------------------------------------------------------------------------- -# nr Int 1024 Number of examples -# nf Int 100 Number of features -# nt Int 5 Number of targets -# X String --- Location to write X data -# Y String --- Location to write Y data -# fmt Format specifier csv Format of output data -# ---------------------------------------------------------------------------- -# -# Example: -# hadoop jar SystemDS.jar -f genRandData4Kmeans.dml -nvargs nr=1024 nf=100 nt=1 - -print ("BEGIN NN CLASSIFICATION GENERATOR SCRIPT"); - -num_records = ifdef($nr, 1024); -num_features = ifdef($nf, 100); -num_targets = ifdef($nt, 5); - -fileX = ifdef ($X, "X"); -fileY = ifdef ($Y, "Y"); -fmt = ifdef ($fmt, "csv"); - -# Generate input data -N = num_records # num examples -D = num_features # num features -t = num_targets # num targets -X = rand(rows=N, cols=D, pdf="normal") -classes = round(rand(rows=N, cols=1, min=1, max=t, pdf="uniform")) -Y = matrix(0, rows=N, cols=t) -for (i in 1:N) { # todo: using parfor here should be fine, but crashes? 
- Y[i, as.scalar(classes[i,1])] = 1 # one-hot encoding -} - -print ("Writing out the resulting dataset..."); - -write (X, fileX, format = fmt); -write (Y, fileY, format = fmt); - -print ("DONE: NN CLASSIFICATION GENERATOR SCRIPT"); - diff --git a/scripts/datagen/genRandData4NNRegression.dml b/scripts/datagen/genRandData4NNRegression.dml deleted file mode 100644 index 313a31d17f4..00000000000 --- a/scripts/datagen/genRandData4NNRegression.dml +++ /dev/null @@ -1,63 +0,0 @@ -#------------------------------------------------------------- -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
-# -#------------------------------------------------------------- - -# -# Generates random Gaussian-mixture data to test k-Means clustering algorithms -# -# INPUT PARAMETERS: -# ---------------------------------------------------------------------------- -# NAME TYPE DEFAULT MEANING -# ---------------------------------------------------------------------------- -# nr Int 1024 Number of examples -# nf Int 100 Number of features -# nt Int 1 Number of targets -# X String --- Location to write X data -# Y String --- Location to write Y data -# fmt Format specifier csv Format of output data -# ---------------------------------------------------------------------------- -# -# Example: -# hadoop jar SystemDS.jar -f genRandData4Kmeans.dml -nvargs nr=1024 nf=100 nt=1 - -print ("BEGIN NN REGRESSION GENERATOR SCRIPT"); - -num_records = ifdef($nr, 1024); -num_features = ifdef($nf, 100); -num_targets = ifdef($nt, 1); - -fileX = ifdef ($X, "X"); -fileY = ifdef ($Y, "Y"); -fmt = ifdef ($fmt, "csv"); - -# Generate input data -N = num_records # num examples -D = num_features # num features -t = num_targets # num targets -X = rand(rows=N, cols=D, pdf="normal") -Y = rand(rows=N, cols=t) - -print ("Writing out the resulting dataset..."); - -write (X, fileX, format = fmt); -write (Y, fileY, format = fmt); - -print ("DONE: NN REGRESSION GENERATOR SCRIPT"); - diff --git a/scripts/perftest/datagen/genNCFData.sh b/scripts/perftest/datagen/genNCFData.sh index 2e57546a82c..ae41ac828bb 100755 --- a/scripts/perftest/datagen/genNCFData.sh +++ b/scripts/perftest/datagen/genNCFData.sh @@ -31,12 +31,12 @@ MAXMEM=$3 FORMAT="csv" # can be csv, mm, text, binary -echo "-- Generating NCF data." >> results/times.txt; -#generate XS scenarios (80MB) BASE_ktrain=1000 BASE_kval=100 BASE_nitems=50 BASE_nusers=60 +echo "-- Generating NCF data." 
>> results/times.txt; +#generate XS scenarios (80MB) if [ $MAXMEM -ge 80 ]; then MULTIPLIER=1 KTRAIN=$(echo "$BASE_ktrain * $MULTIPLIER" | bc) diff --git a/scripts/perftest/datagen/genNNData.sh b/scripts/perftest/datagen/genNNData.sh index 06e4241961c..0a4fa6c50f4 100755 --- a/scripts/perftest/datagen/genNNData.sh +++ b/scripts/perftest/datagen/genNNData.sh @@ -19,10 +19,9 @@ # under the License. # #------------------------------------------------------------- -if [ "$(basename $PWD)" != "perftest" ]; -then +if [ "$(basename $PWD)" != "perftest" ]; then echo "Please execute scripts from directory 'perftest'" - exit 1; + exit 1 fi CMD=$1 @@ -31,39 +30,502 @@ MAXMEM=$3 FORMAT="csv" # can be csv, mm, text, binary -echo "-- Generating NN data." >> results/times.txt; +DENSE_SP=0.9 +SPARSE_SP=0.01 +BASE_REG_SAMPLES=1024 +BASE_REG_FEATRUES=100 +BASE_CLASS_SAMPLES=1024 +BASE_CLASS_FEATURES=100 +BASE_CLASS_CLASSES=5 + # the scaling of nr and nf is to just multiply them by 3 each .. since sqrt(10) is about 3 and the data size should scale by a factor of 10 ..... needs to be tested for applicability # for now only t=1 and t=5 are generated for regression and classification respectively .. may want to add more variety # todo make test data # todo generated data is too small with current parameters .. X data for xs is 2mb, s is 18mb -> pump it up +echo "-- Generating NN data." 
>>results/times.txt #generate XS scenarios (80MB) if [ $MAXMEM -ge 80 ]; then - ${CMD} -f ../datagen/genRandData4NNRegression.dml --nvargs X=${DATADIR}/X1024_100_1_reg Y=${DATADIR}/Y1024_100_1_reg nr=1024 nf=100 nt=1 fmt=$FORMAT & - ${CMD} -f ../datagen/genRandData4NNClassification.dml --nvargs X=${DATADIR}/X1024_100_1_class Y=${DATADIR}/Y1024_100_1_class nr=1024 nf=100 nt=5 fmt=$FORMAT & + # set multiplier and calculate resulting parameters + MULTIPLIER=1 + REG_SAMPLES=$(echo "$BASE_REG_SAMPLES * $MULTIPLIER" | bc) + REG_FEATURES=$(echo "$BASE_REG_FEATRUES * $MULTIPLIER" | bc) + CLASS_SAMPLES=$(echo "$BASE_CLASS_SAMPLES * $MULTIPLIER" | bc) + CLASS_FEATURES=$(echo "$BASE_CLASS_FEATURES * $MULTIPLIER" | bc) + CLASS_CLASSES=$(echo "$BASE_CLASS_CLASSES * $MULTIPLIER" | bc) + + ## generate regression data + ${CMD} -f ../datagen/genRandData4LogisticRegression.dml --args \ + ${REG_SAMPLES} \ + ${REG_FEATURES} \ + 5 \ + 5 \ + ${DATADIR}/w${REG_SAMPLES}_${REG_FEATURES}_reg_dense \ + ${DATADIR}/X${REG_SAMPLES}_${REG_FEATURES}_reg_dense \ + ${DATADIR}/Y${REG_SAMPLES}_${REG_FEATURES}_reg_dense \ + 1 \ + 0 \ + ${DENSE_SP} \ + ${FORMAT} \ + 0 & + pidDense80=$! + + ${CMD} -f ../datagen/genRandData4LogisticRegression.dml --args \ + ${REG_SAMPLES} \ + ${REG_FEATURES} \ + 5 \ + 5 \ + ${DATADIR}/w${REG_SAMPLES}_${REG_FEATURES}_reg_sparse \ + ${DATADIR}/X${REG_SAMPLES}_${REG_FEATURES}_reg_sparse \ + ${DATADIR}/Y${REG_SAMPLES}_${REG_FEATURES}_reg_sparse \ + 1 \ + 0 \ + ${SPARSE_SP} \ + ${FORMAT} \ + 0 & + pidSparse80=$! 
+ + wait $pidDense80 + ${CMD} -f scripts/extractTestData.dml --args \ + ${DATADIR}/X${REG_SAMPLES}_${REG_FEATURES}_reg_dense \ + ${DATADIR}/Y${REG_SAMPLES}_${REG_FEATURES}_reg_dense \ + ${DATADIR}/X${REG_SAMPLES}_${REG_FEATURES}_reg_dense_test \ + ${DATADIR}/Y${REG_SAMPLES}_${REG_FEATURES}_reg_dense_test \ + ${FORMAT} & + + wait $pidSparse80 + ${CMD} -f scripts/extractTestData.dml --args \ + ${DATADIR}/X${REG_SAMPLES}_${REG_FEATURES}_reg_sparse \ + ${DATADIR}/Y${REG_SAMPLES}_${REG_FEATURES}_reg_sparse \ + ${DATADIR}/X${REG_SAMPLES}_${REG_FEATURES}_reg_sparse_test \ + ${DATADIR}/Y${REG_SAMPLES}_${REG_FEATURES}_reg_sparse_test \ + ${FORMAT} & + + ## generate classification data + ${CMD} -f ../datagen/genRandData4Multinomial.dml --args \ + ${CLASS_SAMPLES} \ + ${CLASS_FEATURES} \ + ${DENSE_SP} \ + ${CLASS_CLASSES} \ + 0 \ + ${DATADIR}/X${CLASS_SAMPLES}_${CLASS_FEATURES}_${CLASS_CLASSES}_class_dense \ + ${DATADIR}/Y${CLASS_SAMPLES}_${CLASS_FEATURES}_${CLASS_CLASSES}_class_dense \ + ${FORMAT} & + pidDense80=$! + + ${CMD} -f ../datagen/genRandData4Multinomial.dml --args \ + ${CLASS_SAMPLES} \ + ${CLASS_FEATURES} \ + ${SPARSE_SP} \ + ${CLASS_CLASSES} \ + 0 \ + ${DATADIR}/X${CLASS_SAMPLES}_${CLASS_FEATURES}_${CLASS_CLASSES}_class_sparse \ + ${DATADIR}/Y${CLASS_SAMPLES}_${CLASS_FEATURES}_${CLASS_CLASSES}_class_sparse \ + ${FORMAT} & + pidSparse80=$! 
+ + wait $pidDense80 + ${CMD} -f scripts/extractTestData.dml --args \ + ${DATADIR}/X${CLASS_SAMPLES}_${CLASS_FEATURES}_${CLASS_CLASSES}_class_dense \ + ${DATADIR}/Y${CLASS_SAMPLES}_${CLASS_FEATURES}_${CLASS_CLASSES}_class_dense \ + ${DATADIR}/X${CLASS_SAMPLES}_${CLASS_FEATURES}_${CLASS_CLASSES}_class_dense_test \ + ${DATADIR}/Y${CLASS_SAMPLES}_${CLASS_FEATURES}_${CLASS_CLASSES}_class_dense_test \ + ${FORMAT} & + + wait $pidSparse80 + ${CMD} -f scripts/extractTestData.dml --args \ + ${DATADIR}/X${CLASS_SAMPLES}_${CLASS_FEATURES}_${CLASS_CLASSES}_class_sparse \ + ${DATADIR}/Y${CLASS_SAMPLES}_${CLASS_FEATURES}_${CLASS_CLASSES}_class_sparse \ + ${DATADIR}/X${CLASS_SAMPLES}_${CLASS_FEATURES}_${CLASS_CLASSES}_class_sparse_test \ + ${DATADIR}/Y${CLASS_SAMPLES}_${CLASS_FEATURES}_${CLASS_CLASSES}_class_sparse_test \ + ${FORMAT} & fi #generate S scenarios (800MB) if [ $MAXMEM -ge 800 ]; then - ${CMD} -f ../datagen/genRandData4NNRegression.dml --nvargs X=${DATADIR}/X3072_300_1_reg Y=${DATADIR}/Y3072_300_1_reg nr=3072 nf=300 nt=1 fmt=$FORMAT & - ${CMD} -f ../datagen/genRandData4NNClassification.dml --nvargs X=${DATADIR}/X3072_300_1_class Y=${DATADIR}/Y3072_300_1_class nr=3072 nf=300 nt=5 fmt=$FORMAT & + # set multiplier and calculate resulting parameters + MULTIPLIER=3 + REG_SAMPLES=$(echo "$BASE_REG_SAMPLES * $MULTIPLIER" | bc) + REG_FEATURES=$(echo "$BASE_REG_FEATRUES * $MULTIPLIER" | bc) + CLASS_SAMPLES=$(echo "$BASE_CLASS_SAMPLES * $MULTIPLIER" | bc) + CLASS_FEATURES=$(echo "$BASE_CLASS_FEATURES * $MULTIPLIER" | bc) + CLASS_CLASSES=$(echo "$BASE_CLASS_CLASSES * $MULTIPLIER" | bc) + + ## generate regression data + ${CMD} -f ../datagen/genRandData4LogisticRegression.dml --args \ + ${REG_SAMPLES} \ + ${REG_FEATURES} \ + 5 \ + 5 \ + ${DATADIR}/w${REG_SAMPLES}_${REG_FEATURES}_reg_dense \ + ${DATADIR}/X${REG_SAMPLES}_${REG_FEATURES}_reg_dense \ + ${DATADIR}/Y${REG_SAMPLES}_${REG_FEATURES}_reg_dense \ + 1 \ + 0 \ + ${DENSE_SP} \ + ${FORMAT} \ + 0 & + pidDense80=$! 
+ + ${CMD} -f ../datagen/genRandData4LogisticRegression.dml --args \ + ${REG_SAMPLES} \ + ${REG_FEATURES} \ + 5 \ + 5 \ + ${DATADIR}/w${REG_SAMPLES}_${REG_FEATURES}_reg_sparse \ + ${DATADIR}/X${REG_SAMPLES}_${REG_FEATURES}_reg_sparse \ + ${DATADIR}/Y${REG_SAMPLES}_${REG_FEATURES}_reg_sparse \ + 1 \ + 0 \ + ${SPARSE_SP} \ + ${FORMAT} \ + 0 & + pidSparse80=$! + + wait $pidDense80 + ${CMD} -f scripts/extractTestData.dml --args \ + ${DATADIR}/X${REG_SAMPLES}_${REG_FEATURES}_reg_dense \ + ${DATADIR}/Y${REG_SAMPLES}_${REG_FEATURES}_reg_dense \ + ${DATADIR}/X${REG_SAMPLES}_${REG_FEATURES}_reg_dense_test \ + ${DATADIR}/Y${REG_SAMPLES}_${REG_FEATURES}_reg_dense_test \ + ${FORMAT} & + + wait $pidSparse80 + ${CMD} -f scripts/extractTestData.dml --args \ + ${DATADIR}/X${REG_SAMPLES}_${REG_FEATURES}_reg_sparse \ + ${DATADIR}/Y${REG_SAMPLES}_${REG_FEATURES}_reg_sparse \ + ${DATADIR}/X${REG_SAMPLES}_${REG_FEATURES}_reg_sparse_test \ + ${DATADIR}/Y${REG_SAMPLES}_${REG_FEATURES}_reg_sparse_test \ + ${FORMAT} & + + ## generate classification data + ${CMD} -f ../datagen/genRandData4Multinomial.dml --args \ + ${CLASS_SAMPLES} \ + ${CLASS_FEATURES} \ + ${DENSE_SP} \ + ${CLASS_CLASSES} \ + 0 \ + ${DATADIR}/X${CLASS_SAMPLES}_${CLASS_FEATURES}_${CLASS_CLASSES}_class_dense \ + ${DATADIR}/Y${CLASS_SAMPLES}_${CLASS_FEATURES}_${CLASS_CLASSES}_class_dense \ + ${FORMAT} & + pidDense80=$! + + ${CMD} -f ../datagen/genRandData4Multinomial.dml --args \ + ${CLASS_SAMPLES} \ + ${CLASS_FEATURES} \ + ${SPARSE_SP} \ + ${CLASS_CLASSES} \ + 0 \ + ${DATADIR}/X${CLASS_SAMPLES}_${CLASS_FEATURES}_${CLASS_CLASSES}_class_sparse \ + ${DATADIR}/Y${CLASS_SAMPLES}_${CLASS_FEATURES}_${CLASS_CLASSES}_class_sparse \ + ${FORMAT} & + pidSparse80=$! 
+ + wait $pidDense80 + ${CMD} -f scripts/extractTestData.dml --args \ + ${DATADIR}/X${CLASS_SAMPLES}_${CLASS_FEATURES}_${CLASS_CLASSES}_class_dense \ + ${DATADIR}/Y${CLASS_SAMPLES}_${CLASS_FEATURES}_${CLASS_CLASSES}_class_dense \ + ${DATADIR}/X${CLASS_SAMPLES}_${CLASS_FEATURES}_${CLASS_CLASSES}_class_dense_test \ + ${DATADIR}/Y${CLASS_SAMPLES}_${CLASS_FEATURES}_${CLASS_CLASSES}_class_dense_test \ + ${FORMAT} & + + wait $pidSparse80 + ${CMD} -f scripts/extractTestData.dml --args \ + ${DATADIR}/X${CLASS_SAMPLES}_${CLASS_FEATURES}_${CLASS_CLASSES}_class_sparse \ + ${DATADIR}/Y${CLASS_SAMPLES}_${CLASS_FEATURES}_${CLASS_CLASSES}_class_sparse \ + ${DATADIR}/X${CLASS_SAMPLES}_${CLASS_FEATURES}_${CLASS_CLASSES}_class_sparse_test \ + ${DATADIR}/Y${CLASS_SAMPLES}_${CLASS_FEATURES}_${CLASS_CLASSES}_class_sparse_test \ + ${FORMAT} & fi #generate M scenarios (8GB) if [ $MAXMEM -ge 8000 ]; then - ${CMD} -f ../datagen/genRandData4NNRegression.dml --nvargs X=${DATADIR}/X9216_900_1_reg Y=${DATADIR}/Y9216_900_1_reg nr=9216 nf=900 nt=1 fmt=$FORMAT & - ${CMD} -f ../datagen/genRandData4NNClassification.dml --nvargs X=${DATADIR}/X9216_900_1_class Y=${DATADIR}/Y9216_900_1_class nr=9216 nf=900 nt=5 fmt=$FORMAT & + # set multiplier and calculate resulting parameters + MULTIPLIER=9 + REG_SAMPLES=$(echo "$BASE_REG_SAMPLES * $MULTIPLIER" | bc) + REG_FEATURES=$(echo "$BASE_REG_FEATRUES * $MULTIPLIER" | bc) + CLASS_SAMPLES=$(echo "$BASE_CLASS_SAMPLES * $MULTIPLIER" | bc) + CLASS_FEATURES=$(echo "$BASE_CLASS_FEATURES * $MULTIPLIER" | bc) + CLASS_CLASSES=$(echo "$BASE_CLASS_CLASSES * $MULTIPLIER" | bc) + + ## generate regression data + ${CMD} -f ../datagen/genRandData4LogisticRegression.dml --args \ + ${REG_SAMPLES} \ + ${REG_FEATURES} \ + 5 \ + 5 \ + ${DATADIR}/w${REG_SAMPLES}_${REG_FEATURES}_reg_dense \ + ${DATADIR}/X${REG_SAMPLES}_${REG_FEATURES}_reg_dense \ + ${DATADIR}/Y${REG_SAMPLES}_${REG_FEATURES}_reg_dense \ + 1 \ + 0 \ + ${DENSE_SP} \ + ${FORMAT} \ + 0 & + pidDense80=$! 
+ + ${CMD} -f ../datagen/genRandData4LogisticRegression.dml --args \ + ${REG_SAMPLES} \ + ${REG_FEATURES} \ + 5 \ + 5 \ + ${DATADIR}/w${REG_SAMPLES}_${REG_FEATURES}_reg_sparse \ + ${DATADIR}/X${REG_SAMPLES}_${REG_FEATURES}_reg_sparse \ + ${DATADIR}/Y${REG_SAMPLES}_${REG_FEATURES}_reg_sparse \ + 1 \ + 0 \ + ${SPARSE_SP} \ + ${FORMAT} \ + 0 & + pidSparse80=$! + + wait $pidDense80 + ${CMD} -f scripts/extractTestData.dml --args \ + ${DATADIR}/X${REG_SAMPLES}_${REG_FEATURES}_reg_dense \ + ${DATADIR}/Y${REG_SAMPLES}_${REG_FEATURES}_reg_dense \ + ${DATADIR}/X${REG_SAMPLES}_${REG_FEATURES}_reg_dense_test \ + ${DATADIR}/Y${REG_SAMPLES}_${REG_FEATURES}_reg_dense_test \ + ${FORMAT} & + + wait $pidSparse80 + ${CMD} -f scripts/extractTestData.dml --args \ + ${DATADIR}/X${REG_SAMPLES}_${REG_FEATURES}_reg_sparse \ + ${DATADIR}/Y${REG_SAMPLES}_${REG_FEATURES}_reg_sparse \ + ${DATADIR}/X${REG_SAMPLES}_${REG_FEATURES}_reg_sparse_test \ + ${DATADIR}/Y${REG_SAMPLES}_${REG_FEATURES}_reg_sparse_test \ + ${FORMAT} & + + ## generate classification data + ${CMD} -f ../datagen/genRandData4Multinomial.dml --args \ + ${CLASS_SAMPLES} \ + ${CLASS_FEATURES} \ + ${DENSE_SP} \ + ${CLASS_CLASSES} \ + 0 \ + ${DATADIR}/X${CLASS_SAMPLES}_${CLASS_FEATURES}_${CLASS_CLASSES}_class_dense \ + ${DATADIR}/Y${CLASS_SAMPLES}_${CLASS_FEATURES}_${CLASS_CLASSES}_class_dense \ + ${FORMAT} & + pidDense80=$! + + ${CMD} -f ../datagen/genRandData4Multinomial.dml --args \ + ${CLASS_SAMPLES} \ + ${CLASS_FEATURES} \ + ${SPARSE_SP} \ + ${CLASS_CLASSES} \ + 0 \ + ${DATADIR}/X${CLASS_SAMPLES}_${CLASS_FEATURES}_${CLASS_CLASSES}_class_sparse \ + ${DATADIR}/Y${CLASS_SAMPLES}_${CLASS_FEATURES}_${CLASS_CLASSES}_class_sparse \ + ${FORMAT} & + pidSparse80=$! 
+ + wait $pidDense80 + ${CMD} -f scripts/extractTestData.dml --args \ + ${DATADIR}/X${CLASS_SAMPLES}_${CLASS_FEATURES}_${CLASS_CLASSES}_class_dense \ + ${DATADIR}/Y${CLASS_SAMPLES}_${CLASS_FEATURES}_${CLASS_CLASSES}_class_dense \ + ${DATADIR}/X${CLASS_SAMPLES}_${CLASS_FEATURES}_${CLASS_CLASSES}_class_dense_test \ + ${DATADIR}/Y${CLASS_SAMPLES}_${CLASS_FEATURES}_${CLASS_CLASSES}_class_dense_test \ + ${FORMAT} & + + wait $pidSparse80 + ${CMD} -f scripts/extractTestData.dml --args \ + ${DATADIR}/X${CLASS_SAMPLES}_${CLASS_FEATURES}_${CLASS_CLASSES}_class_sparse \ + ${DATADIR}/Y${CLASS_SAMPLES}_${CLASS_FEATURES}_${CLASS_CLASSES}_class_sparse \ + ${DATADIR}/X${CLASS_SAMPLES}_${CLASS_FEATURES}_${CLASS_CLASSES}_class_sparse_test \ + ${DATADIR}/Y${CLASS_SAMPLES}_${CLASS_FEATURES}_${CLASS_CLASSES}_class_sparse_test \ + ${FORMAT} & fi #generate L scenarios (80GB) if [ $MAXMEM -ge 80000 ]; then - ${CMD} -f ../datagen/genRandData4NNRegression.dml --nvargs X=${DATADIR}/X27648_2700_1_reg Y=${DATADIR}/Y27648_2700_1_reg nr=27648 nf=2700 nt=1 fmt=$FORMAT & - ${CMD} -f ../datagen/genRandData4NNClassification.dml --nvargs X=${DATADIR}/X27648_2700_1_class Y=${DATADIR}/Y27648_2700_1_class nr=27648 nf=2700 nt=5 fmt=$FORMAT & + # set multiplier and calculate resulting parameters + MULTIPLIER=27 + REG_SAMPLES=$(echo "$BASE_REG_SAMPLES * $MULTIPLIER" | bc) + REG_FEATURES=$(echo "$BASE_REG_FEATRUES * $MULTIPLIER" | bc) + CLASS_SAMPLES=$(echo "$BASE_CLASS_SAMPLES * $MULTIPLIER" | bc) + CLASS_FEATURES=$(echo "$BASE_CLASS_FEATURES * $MULTIPLIER" | bc) + CLASS_CLASSES=$(echo "$BASE_CLASS_CLASSES * $MULTIPLIER" | bc) + + ## generate regression data + ${CMD} -f ../datagen/genRandData4LogisticRegression.dml --args \ + ${REG_SAMPLES} \ + ${REG_FEATURES} \ + 5 \ + 5 \ + ${DATADIR}/w${REG_SAMPLES}_${REG_FEATURES}_reg_dense \ + ${DATADIR}/X${REG_SAMPLES}_${REG_FEATURES}_reg_dense \ + ${DATADIR}/Y${REG_SAMPLES}_${REG_FEATURES}_reg_dense \ + 1 \ + 0 \ + ${DENSE_SP} \ + ${FORMAT} \ + 0 & + pidDense80=$! 
+ + ${CMD} -f ../datagen/genRandData4LogisticRegression.dml --args \ + ${REG_SAMPLES} \ + ${REG_FEATURES} \ + 5 \ + 5 \ + ${DATADIR}/w${REG_SAMPLES}_${REG_FEATURES}_reg_sparse \ + ${DATADIR}/X${REG_SAMPLES}_${REG_FEATURES}_reg_sparse \ + ${DATADIR}/Y${REG_SAMPLES}_${REG_FEATURES}_reg_sparse \ + 1 \ + 0 \ + ${SPARSE_SP} \ + ${FORMAT} \ + 0 & + pidSparse80=$! + + wait $pidDense80 + ${CMD} -f scripts/extractTestData.dml --args \ + ${DATADIR}/X${REG_SAMPLES}_${REG_FEATURES}_reg_dense \ + ${DATADIR}/Y${REG_SAMPLES}_${REG_FEATURES}_reg_dense \ + ${DATADIR}/X${REG_SAMPLES}_${REG_FEATURES}_reg_dense_test \ + ${DATADIR}/Y${REG_SAMPLES}_${REG_FEATURES}_reg_dense_test \ + ${FORMAT} & + + wait $pidSparse80 + ${CMD} -f scripts/extractTestData.dml --args \ + ${DATADIR}/X${REG_SAMPLES}_${REG_FEATURES}_reg_sparse \ + ${DATADIR}/Y${REG_SAMPLES}_${REG_FEATURES}_reg_sparse \ + ${DATADIR}/X${REG_SAMPLES}_${REG_FEATURES}_reg_sparse_test \ + ${DATADIR}/Y${REG_SAMPLES}_${REG_FEATURES}_reg_sparse_test \ + ${FORMAT} & + + ## generate classification data + ${CMD} -f ../datagen/genRandData4Multinomial.dml --args \ + ${CLASS_SAMPLES} \ + ${CLASS_FEATURES} \ + ${DENSE_SP} \ + ${CLASS_CLASSES} \ + 0 \ + ${DATADIR}/X${CLASS_SAMPLES}_${CLASS_FEATURES}_${CLASS_CLASSES}_class_dense \ + ${DATADIR}/Y${CLASS_SAMPLES}_${CLASS_FEATURES}_${CLASS_CLASSES}_class_dense \ + ${FORMAT} & + pidDense80=$! + + ${CMD} -f ../datagen/genRandData4Multinomial.dml --args \ + ${CLASS_SAMPLES} \ + ${CLASS_FEATURES} \ + ${SPARSE_SP} \ + ${CLASS_CLASSES} \ + 0 \ + ${DATADIR}/X${CLASS_SAMPLES}_${CLASS_FEATURES}_${CLASS_CLASSES}_class_sparse \ + ${DATADIR}/Y${CLASS_SAMPLES}_${CLASS_FEATURES}_${CLASS_CLASSES}_class_sparse \ + ${FORMAT} & + pidSparse80=$! 
+ + wait $pidDense80 + ${CMD} -f scripts/extractTestData.dml --args \ + ${DATADIR}/X${CLASS_SAMPLES}_${CLASS_FEATURES}_${CLASS_CLASSES}_class_dense \ + ${DATADIR}/Y${CLASS_SAMPLES}_${CLASS_FEATURES}_${CLASS_CLASSES}_class_dense \ + ${DATADIR}/X${CLASS_SAMPLES}_${CLASS_FEATURES}_${CLASS_CLASSES}_class_dense_test \ + ${DATADIR}/Y${CLASS_SAMPLES}_${CLASS_FEATURES}_${CLASS_CLASSES}_class_dense_test \ + ${FORMAT} & + + wait $pidSparse80 + ${CMD} -f scripts/extractTestData.dml --args \ + ${DATADIR}/X${CLASS_SAMPLES}_${CLASS_FEATURES}_${CLASS_CLASSES}_class_sparse \ + ${DATADIR}/Y${CLASS_SAMPLES}_${CLASS_FEATURES}_${CLASS_CLASSES}_class_sparse \ + ${DATADIR}/X${CLASS_SAMPLES}_${CLASS_FEATURES}_${CLASS_CLASSES}_class_sparse_test \ + ${DATADIR}/Y${CLASS_SAMPLES}_${CLASS_FEATURES}_${CLASS_CLASSES}_class_sparse_test \ + ${FORMAT} & fi #generate XL scenarios (800GB) if [ $MAXMEM -ge 800000 ]; then - ${CMD} -f ../datagen/genRandData4NNRegression.dml --nvargs X=${DATADIR}/X82944_8200_1_reg Y=${DATADIR}/Y82944_8200_1_reg nr=82944 nf=8200 nt=1 fmt=$FORMAT & - ${CMD} -f ../datagen/genRandData4NNClassification.dml --nvargs X=${DATADIR}/X82944_8200_1_class Y=${DATADIR}/Y82944_8200_1_class nr=82944 nf=8200 nt=5 fmt=$FORMAT & + # set multiplier and calculate resulting parameters + MULTIPLIER=81 + REG_SAMPLES=$(echo "$BASE_REG_SAMPLES * $MULTIPLIER" | bc) + REG_FEATURES=$(echo "$BASE_REG_FEATRUES * $MULTIPLIER" | bc) + CLASS_SAMPLES=$(echo "$BASE_CLASS_SAMPLES * $MULTIPLIER" | bc) + CLASS_FEATURES=$(echo "$BASE_CLASS_FEATURES * $MULTIPLIER" | bc) + CLASS_CLASSES=$(echo "$BASE_CLASS_CLASSES * $MULTIPLIER" | bc) + + ## generate regression data + ${CMD} -f ../datagen/genRandData4LogisticRegression.dml --args \ + ${REG_SAMPLES} \ + ${REG_FEATURES} \ + 5 \ + 5 \ + ${DATADIR}/w${REG_SAMPLES}_${REG_FEATURES}_reg_dense \ + ${DATADIR}/X${REG_SAMPLES}_${REG_FEATURES}_reg_dense \ + ${DATADIR}/Y${REG_SAMPLES}_${REG_FEATURES}_reg_dense \ + 1 \ + 0 \ + ${DENSE_SP} \ + ${FORMAT} \ + 0 & + 
pidDense80=$! + + ${CMD} -f ../datagen/genRandData4LogisticRegression.dml --args \ + ${REG_SAMPLES} \ + ${REG_FEATURES} \ + 5 \ + 5 \ + ${DATADIR}/w${REG_SAMPLES}_${REG_FEATURES}_reg_sparse \ + ${DATADIR}/X${REG_SAMPLES}_${REG_FEATURES}_reg_sparse \ + ${DATADIR}/Y${REG_SAMPLES}_${REG_FEATURES}_reg_sparse \ + 1 \ + 0 \ + ${SPARSE_SP} \ + ${FORMAT} \ + 0 & + pidSparse80=$! + + wait $pidDense80 + ${CMD} -f scripts/extractTestData.dml --args \ + ${DATADIR}/X${REG_SAMPLES}_${REG_FEATURES}_reg_dense \ + ${DATADIR}/Y${REG_SAMPLES}_${REG_FEATURES}_reg_dense \ + ${DATADIR}/X${REG_SAMPLES}_${REG_FEATURES}_reg_dense_test \ + ${DATADIR}/Y${REG_SAMPLES}_${REG_FEATURES}_reg_dense_test \ + ${FORMAT} & + + wait $pidSparse80 + ${CMD} -f scripts/extractTestData.dml --args \ + ${DATADIR}/X${REG_SAMPLES}_${REG_FEATURES}_reg_sparse \ + ${DATADIR}/Y${REG_SAMPLES}_${REG_FEATURES}_reg_sparse \ + ${DATADIR}/X${REG_SAMPLES}_${REG_FEATURES}_reg_sparse_test \ + ${DATADIR}/Y${REG_SAMPLES}_${REG_FEATURES}_reg_sparse_test \ + ${FORMAT} & + + ## generate classification data + ${CMD} -f ../datagen/genRandData4Multinomial.dml --args \ + ${CLASS_SAMPLES} \ + ${CLASS_FEATURES} \ + ${DENSE_SP} \ + ${CLASS_CLASSES} \ + 0 \ + ${DATADIR}/X${CLASS_SAMPLES}_${CLASS_FEATURES}_${CLASS_CLASSES}_class_dense \ + ${DATADIR}/Y${CLASS_SAMPLES}_${CLASS_FEATURES}_${CLASS_CLASSES}_class_dense \ + ${FORMAT} & + pidDense80=$! + + ${CMD} -f ../datagen/genRandData4Multinomial.dml --args \ + ${CLASS_SAMPLES} \ + ${CLASS_FEATURES} \ + ${SPARSE_SP} \ + ${CLASS_CLASSES} \ + 0 \ + ${DATADIR}/X${CLASS_SAMPLES}_${CLASS_FEATURES}_${CLASS_CLASSES}_class_sparse \ + ${DATADIR}/Y${CLASS_SAMPLES}_${CLASS_FEATURES}_${CLASS_CLASSES}_class_sparse \ + ${FORMAT} & + pidSparse80=$! 
+ + wait $pidDense80 + ${CMD} -f scripts/extractTestData.dml --args \ + ${DATADIR}/X${CLASS_SAMPLES}_${CLASS_FEATURES}_${CLASS_CLASSES}_class_dense \ + ${DATADIR}/Y${CLASS_SAMPLES}_${CLASS_FEATURES}_${CLASS_CLASSES}_class_dense \ + ${DATADIR}/X${CLASS_SAMPLES}_${CLASS_FEATURES}_${CLASS_CLASSES}_class_dense_test \ + ${DATADIR}/Y${CLASS_SAMPLES}_${CLASS_FEATURES}_${CLASS_CLASSES}_class_dense_test \ + ${FORMAT} & + + wait $pidSparse80 + ${CMD} -f scripts/extractTestData.dml --args \ + ${DATADIR}/X${CLASS_SAMPLES}_${CLASS_FEATURES}_${CLASS_CLASSES}_class_sparse \ + ${DATADIR}/Y${CLASS_SAMPLES}_${CLASS_FEATURES}_${CLASS_CLASSES}_class_sparse \ + ${DATADIR}/X${CLASS_SAMPLES}_${CLASS_FEATURES}_${CLASS_CLASSES}_class_sparse_test \ + ${DATADIR}/Y${CLASS_SAMPLES}_${CLASS_FEATURES}_${CLASS_CLASSES}_class_sparse_test \ + ${FORMAT} & fi -wait \ No newline at end of file +wait diff --git a/scripts/perftest/runAllNN.sh b/scripts/perftest/runAllNN.sh index c7285c362a9..1977c24cf44 100755 --- a/scripts/perftest/runAllNN.sh +++ b/scripts/perftest/runAllNN.sh @@ -39,30 +39,84 @@ err_report() { } trap 'err_report $LINENO' ERR -DATA=() # todo .. which data is needed? -if [ $MAXMEM -ge 80 ]; then DATA+=("1024_100_1"); fi -if [ $MAXMEM -ge 800 ]; then DATA+=("3072_300_1"); fi -if [ $MAXMEM -ge 8000 ]; then DATA+=("9216_900_1"); fi -if [ $MAXMEM -ge 80000 ]; then DATA+=("27648_2700_1"); fi -if [ $MAXMEM -ge 800000 ]; then DATA+=("82944_8200_1"); fi +BASE_REG_SAMPLES=1024 # these should be kept in sync with the ones set in genNNData, so that file names are in sync! +BASE_REG_FEATRUES=100 +BASE_CLASS_SAMPLES=1024 +BASE_CLASS_FEATURES=100 +BASE_CLASS_CLASSES=5 + +REG_DATA=() # todo .. which data is needed? +CLASS_DATA=() # todo .. which data is needed? 
+if [ $MAXMEM -ge 80 ]; then + MULTIPLIER=1 + REG_SAMPLES=$(echo "$BASE_REG_SAMPLES * $MULTIPLIER" | bc) + REG_FEATURES=$(echo "$BASE_REG_FEATRUES * $MULTIPLIER" | bc) + CLASS_SAMPLES=$(echo "$BASE_CLASS_SAMPLES * $MULTIPLIER" | bc) + CLASS_FEATURES=$(echo "$BASE_CLASS_FEATURES * $MULTIPLIER" | bc) + CLASS_CLASSES=$(echo "$BASE_CLASS_CLASSES * $MULTIPLIER" | bc) + REG_DATA+=(${REG_SAMPLES}_${REG_FEATURES}_reg_dense ${REG_SAMPLES}_${REG_FEATURES}_reg_sparse) + CLASS_DATA+=(${CLASS_SAMPLES}_${CLASS_FEATURES}_${CLASS_CLASSES}_class_dense ${CLASS_SAMPLES}_${CLASS_FEATURES}_${CLASS_CLASSES}_class_sparse) +fi +if [ $MAXMEM -ge 800 ]; then + MULTIPLIER=3 + REG_SAMPLES=$(echo "$BASE_REG_SAMPLES * $MULTIPLIER" | bc) + REG_FEATURES=$(echo "$BASE_REG_FEATRUES * $MULTIPLIER" | bc) + CLASS_SAMPLES=$(echo "$BASE_CLASS_SAMPLES * $MULTIPLIER" | bc) + CLASS_FEATURES=$(echo "$BASE_CLASS_FEATURES * $MULTIPLIER" | bc) + CLASS_CLASSES=$(echo "$BASE_CLASS_CLASSES * $MULTIPLIER" | bc) + REG_DATA+=(${REG_SAMPLES}_${REG_FEATURES}_reg_dense ${REG_SAMPLES}_${REG_FEATURES}_reg_sparse) + CLASS_DATA+=(${CLASS_SAMPLES}_${CLASS_FEATURES}_${CLASS_CLASSES}_class_dense ${CLASS_SAMPLES}_${CLASS_FEATURES}_${CLASS_CLASSES}_class_sparse) +fi +if [ $MAXMEM -ge 8000 ]; then + MULTIPLIER=9 + REG_SAMPLES=$(echo "$BASE_REG_SAMPLES * $MULTIPLIER" | bc) + REG_FEATURES=$(echo "$BASE_REG_FEATRUES * $MULTIPLIER" | bc) + CLASS_SAMPLES=$(echo "$BASE_CLASS_SAMPLES * $MULTIPLIER" | bc) + CLASS_FEATURES=$(echo "$BASE_CLASS_FEATURES * $MULTIPLIER" | bc) + CLASS_CLASSES=$(echo "$BASE_CLASS_CLASSES * $MULTIPLIER" | bc) + REG_DATA+=(${REG_SAMPLES}_${REG_FEATURES}_reg_dense ${REG_SAMPLES}_${REG_FEATURES}_reg_sparse) + CLASS_DATA+=(${CLASS_SAMPLES}_${CLASS_FEATURES}_${CLASS_CLASSES}_class_dense ${CLASS_SAMPLES}_${CLASS_FEATURES}_${CLASS_CLASSES}_class_sparse) +fi +if [ $MAXMEM -ge 80000 ]; then + MULTIPLIER=27 + REG_SAMPLES=$(echo "$BASE_REG_SAMPLES * $MULTIPLIER" | bc) + REG_FEATURES=$(echo "$BASE_REG_FEATRUES * 
$MULTIPLIER" | bc) + CLASS_SAMPLES=$(echo "$BASE_CLASS_SAMPLES * $MULTIPLIER" | bc) + CLASS_FEATURES=$(echo "$BASE_CLASS_FEATURES * $MULTIPLIER" | bc) + CLASS_CLASSES=$(echo "$BASE_CLASS_CLASSES * $MULTIPLIER" | bc) + REG_DATA+=(${REG_SAMPLES}_${REG_FEATURES}_reg_dense ${REG_SAMPLES}_${REG_FEATURES}_reg_sparse) + CLASS_DATA+=(${CLASS_SAMPLES}_${CLASS_FEATURES}_${CLASS_CLASSES}_class_dense ${CLASS_SAMPLES}_${CLASS_FEATURES}_${CLASS_CLASSES}_class_sparse) +fi +if [ $MAXMEM -ge 800000 ]; then + MULTIPLIER=81 + REG_SAMPLES=$(echo "$BASE_REG_SAMPLES * $MULTIPLIER" | bc) + REG_FEATURES=$(echo "$BASE_REG_FEATRUES * $MULTIPLIER" | bc) + CLASS_SAMPLES=$(echo "$BASE_CLASS_SAMPLES * $MULTIPLIER" | bc) + CLASS_FEATURES=$(echo "$BASE_CLASS_FEATURES * $MULTIPLIER" | bc) + CLASS_CLASSES=$(echo "$BASE_CLASS_CLASSES * $MULTIPLIER" | bc) + REG_DATA+=(${REG_SAMPLES}_${REG_FEATURES}_reg_dense ${REG_SAMPLES}_${REG_FEATURES}_reg_sparse) + CLASS_DATA+=(${CLASS_SAMPLES}_${CLASS_FEATURES}_${CLASS_CLASSES}_class_dense ${CLASS_SAMPLES}_${CLASS_FEATURES}_${CLASS_CLASSES}_class_sparse) +fi echo "RUN NEURAL NETWORK EXPERIMENTS" $(date) >>results/times.txt -for d in ${DATA[@]}; do #"_KDD" +for d in ${REG_DATA[@]}; do #"_KDD" # Regression tasks for f in "runNNSimpleSGD"; do echo "-- Running "$f" on "$d" for 5 epochs" >>results/times.txt - ./${f}.sh ${BASE}/X${d}_reg ${BASE}/Y${d}_reg ${BASE} "${COMMAND}" ${d} 5 ${USEGPU} &>logs/${f}_${d}_5.out + ./${f}.sh ${BASE}/X${d} ${BASE}/Y${d} ${BASE} "${COMMAND}" ${d} 5 ${USEGPU} &>logs/${f}_${d}_5.out echo "-- Running "$f" on "$d" for 50 epochs" >>results/times.txt - ./${f}.sh ${BASE}/X${d}_reg ${BASE}/Y${d}_reg ${BASE} "${COMMAND}" ${d} 50 ${USEGPU} &>logs/${f}_${d}_50.out + ./${f}.sh ${BASE}/X${d} ${BASE}/Y${d} ${BASE} "${COMMAND}" ${d} 50 ${USEGPU} &>logs/${f}_${d}_50.out done +done +for d in ${CLASS_DATA[@]}; do # Classification tasks for f in "runNNNesterovClassify"; do echo "-- Running "$f" on "$d" for 10 epochs" >>results/times.txt - ./${f}.sh 
${BASE}/X${d}_class ${BASE}/Y${d}_class ${BASE} "${COMMAND}" ${d} 10 ${USEGPU} &>logs/${f}_${d}_10.out + ./${f}.sh ${BASE}/X${d} ${BASE}/Y${d} ${BASE} "${COMMAND}" ${d} 10 ${USEGPU} &>logs/${f}_${d}_10.out echo "-- Running "$f" on "$d" for 100 epochs" >>results/times.txt - ./${f}.sh ${BASE}/X${d}_class ${BASE}/Y${d}_class ${BASE} "${COMMAND}" ${d} 100 ${USEGPU} &>logs/${f}_${d}_100.out + ./${f}.sh ${BASE}/X${d} ${BASE}/Y${d} ${BASE} "${COMMAND}" ${d} 100 ${USEGPU} &>logs/${f}_${d}_100.out done done From 6a7f7a3bb370e409e6edef3928ca6eadb0d64564 Mon Sep 17 00:00:00 2001 From: Sheypex Date: Mon, 26 Jun 2023 18:36:18 +0200 Subject: [PATCH 12/19] fixed NN datagen parameters --- scripts/perftest/datagen/genNNData.sh | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/scripts/perftest/datagen/genNNData.sh b/scripts/perftest/datagen/genNNData.sh index 0a4fa6c50f4..95cd3ac4d54 100755 --- a/scripts/perftest/datagen/genNNData.sh +++ b/scripts/perftest/datagen/genNNData.sh @@ -57,8 +57,8 @@ if [ $MAXMEM -ge 80 ]; then ${CMD} -f ../datagen/genRandData4LogisticRegression.dml --args \ ${REG_SAMPLES} \ ${REG_FEATURES} \ - 5 \ - 5 \ + 1 \ + 1 \ ${DATADIR}/w${REG_SAMPLES}_${REG_FEATURES}_reg_dense \ ${DATADIR}/X${REG_SAMPLES}_${REG_FEATURES}_reg_dense \ ${DATADIR}/Y${REG_SAMPLES}_${REG_FEATURES}_reg_dense \ @@ -72,8 +72,8 @@ if [ $MAXMEM -ge 80 ]; then ${CMD} -f ../datagen/genRandData4LogisticRegression.dml --args \ ${REG_SAMPLES} \ ${REG_FEATURES} \ - 5 \ - 5 \ + 1 \ + 1 \ ${DATADIR}/w${REG_SAMPLES}_${REG_FEATURES}_reg_sparse \ ${DATADIR}/X${REG_SAMPLES}_${REG_FEATURES}_reg_sparse \ ${DATADIR}/Y${REG_SAMPLES}_${REG_FEATURES}_reg_sparse \ From 58eb33f54cba410fb59115de767e12121a0ce092 Mon Sep 17 00:00:00 2001 From: Sheypex Date: Mon, 26 Jun 2023 19:09:57 +0200 Subject: [PATCH 13/19] downloader for MNIST Dataset --- scripts/datagen/getMNISTDataset.sh | 37 ++++++++++++++ scripts/perftest/datagen/genMNISTData.sh | 63 ++++++++++++++++++++++++ 2 files changed, 100 
insertions(+) create mode 100755 scripts/datagen/getMNISTDataset.sh create mode 100755 scripts/perftest/datagen/genMNISTData.sh diff --git a/scripts/datagen/getMNISTDataset.sh b/scripts/datagen/getMNISTDataset.sh new file mode 100755 index 00000000000..bacd2a38d21 --- /dev/null +++ b/scripts/datagen/getMNISTDataset.sh @@ -0,0 +1,37 @@ +#!/bin/bash +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +#------------------------------------------------------------- +# Downloads the MNIST train/test CSV archives into the directory given as $1 (default: current directory) and unzips them there. +BASE=$1 +if [ "$BASE" = "" ]; then BASE=$PWD; fi +RET=$PWD +mkdir -p "$BASE" && cd "$BASE" || exit 1 # create the target directory on a fresh run instead of failing in cd +# archives that are already present are not downloaded again; a failed download aborts before unzip runs +echo "Downloading" +if [ ! -f "mnist_train.csv.zip" ]; then wget https://github.com/phoebetronic/mnist/raw/main/mnist_train.csv.zip || exit 1; fi +if [ !
-f "mnist_test.csv.zip" ]; then wget https://github.com/phoebetronic/mnist/raw/main/mnist_test.csv.zip || exit 1; fi + +echo "Unzipping" +unzip -u mnist_train.csv.zip +unzip -u mnist_test.csv.zip + +cd "$RET" || exit 1 +echo "Done" diff --git a/scripts/perftest/datagen/genMNISTData.sh b/scripts/perftest/datagen/genMNISTData.sh new file mode 100755 index 00000000000..2cbf116e364 --- /dev/null +++ b/scripts/perftest/datagen/genMNISTData.sh @@ -0,0 +1,63 @@ +#!/bin/bash +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +#------------------------------------------------------------- +if [ "$(basename $PWD)" != "perftest" ]; +then + echo "Please execute scripts from directory 'perftest'" + exit 1; +fi + +CMD=$1 +DATADIR=$2/mnist +MAXMEM=$3 + +FORMAT="text" # can be csv, mm, text, binary + +echo "-- Generating MNIST data."
>> results/times.txt; +#make sure whole MNIST is available +../datagen/getMNISTDataset.sh ${DATADIR} + +#generate XS scenarios (80MB) by producing a subset of MNIST +if [ $MAXMEM -ge 80 ]; then + echo "placeholder" +fi + +#generate S scenarios (800MB) +if [ $MAXMEM -ge 800 ]; then + echo "placeholder" +fi + +#generate M scenarios (8GB) +if [ $MAXMEM -ge 8000 ]; then + echo "placeholder" +fi + +#generate L scenarios (80GB) +if [ $MAXMEM -ge 80000 ]; then + echo "placeholder" +fi + +#generate XL scenarios (800GB) +if [ $MAXMEM -ge 800000 ]; then + echo "placeholder" +fi + +wait \ No newline at end of file From a116a46e6173512d7ce0550e53ebb2e183e10a19 Mon Sep 17 00:00:00 2001 From: Sheypex Date: Mon, 3 Jul 2023 19:51:48 +0200 Subject: [PATCH 14/19] MNIST "datagen" done by producing smaller version of the whole MNIST dataset based on MAXMEM setting, and using whole MNIST only for biggest MAXMEM --- scripts/datagen/extractMNISTData.dml | 68 +++++++++++++++++++++ scripts/perftest/datagen/genMNISTData.sh | 77 ++++++++++++++++++++++-- 2 files changed, 139 insertions(+), 6 deletions(-) create mode 100644 scripts/datagen/extractMNISTData.dml diff --git a/scripts/datagen/extractMNISTData.dml b/scripts/datagen/extractMNISTData.dml new file mode 100644 index 00000000000..359387233b5 --- /dev/null +++ b/scripts/datagen/extractMNISTData.dml @@ -0,0 +1,68 @@ +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +#------------------------------------------------------------- + +# +# Extracts the first num_train / num_test rows of the MNIST train / test files and writes them out as smaller datasets +# +# INPUT PARAMETERS: +# ---------------------------------------------------------------------------- +# NAME TYPE DEFAULT MEANING +# ---------------------------------------------------------------------------- +# file_mnist_train String --- Path of the full MNIST training set (input) +# file_mnist_test String --- Path of the full MNIST test set (input) +# file_out_train String --- Path the reduced training set is written to +# file_out_test String --- Path the reduced test set is written to +# num_train Int 60000 Number of training rows to keep (capped at the rows available) +# num_test Int 10000 Number of test rows to keep (capped at the rows available) +# fmt Format specifier csv Format of output data +# ---------------------------------------------------------------------------- +# Example: +# hadoop jar SystemDS.jar -f extractMNISTData.dml -nvargs file_mnist_train=mnist_train.csv file_mnist_test=mnist_test.csv file_out_train=MNIST_train_12000 file_out_test=MNIST_test_2000 num_train=12000 num_test=2000 + +print ("BEGIN MNIST EXTRACTOR SCRIPT"); + +file_mnist_train = $file_mnist_train; +file_mnist_test = $file_mnist_test; +file_out_train = $file_out_train; +file_out_test = $file_out_test; +num_train = ifdef ($num_train, 60000); +num_test = ifdef ($num_test, 10000); +fmt = ifdef ($fmt, "csv"); + +mnist_train = read(file_mnist_train); +mnist_test = read(file_mnist_test); + +num_train = min(num_train, nrow(mnist_train)); +num_test = min(num_test, nrow(mnist_test)); + +# keep only the first num_train / num_test rows (all columns) +# todo add shuffle?
+out_train = mnist_train[1:num_train,]; +out_test = mnist_test[1:num_test,]; + + +print ("Writing out the resulting dataset..."); + +write (out_train, file_out_train, format=fmt); +write (out_test, file_out_test, format=fmt); + +print ("DONE: MNIST EXTRACTOR GENERATOR SCRIPT"); + diff --git a/scripts/perftest/datagen/genMNISTData.sh b/scripts/perftest/datagen/genMNISTData.sh index 2cbf116e364..c7bd263ece6 100755 --- a/scripts/perftest/datagen/genMNISTData.sh +++ b/scripts/perftest/datagen/genMNISTData.sh @@ -29,35 +29,100 @@ CMD=$1 DATADIR=$2/mnist MAXMEM=$3 -FORMAT="text" # can be csv, mm, text, binary +FORMAT="csv" # can be csv, mm, text, binary echo "-- Generating MNIST data." >> results/times.txt; #make sure whole MNIST is available ../datagen/getMNISTDataset.sh ${DATADIR} +MNIST_train_filename="mnist_train.csv" +MNIST_test_filename="mnist_test.csv" + +max_size_ordinal=4 +min_num_examples_train=12000 +max_num_examples_train=60000 +span_num_examples_train=$(echo "${max_num_examples_train} - ${min_num_examples_train}" | bc) +min_num_examples_test=2000 +max_num_examples_test=10000 +span_num_examples_test=$(echo "${max_num_examples_test} - ${min_num_examples_test}" | bc) #generate XS scenarios (80MB) by producing a subset of MNIST if [ $MAXMEM -ge 80 ]; then - echo "placeholder" + size_ordinal=0 + percent_size=$(echo "${size_ordinal} / ${max_size_ordinal}" | bc) + target_num_train=$(python -c "from math import floor; print( ${min_num_examples_train} + floor(${span_num_examples_train} * ${percent_size}))") # todo couldn't work out how to do this using bc so using slower python calls instead + target_num_test=$(python -c "from math import floor; print( ${min_num_examples_test} + floor(${span_num_examples_test} * ${percent_size}))") + ${CMD} -f ../datagen/extractMNISTData.dml --nvargs \ + file_mnist_train=${DATADIR}/${MNIST_train_filename} \ + file_mnist_test=${DATADIR}/${MNIST_test_filename} \ + file_out_train=${DATADIR}/MNIST_train_${target_num_train} \ +
file_out_test=${DATADIR}/MNIST_test_${target_num_test} \ + num_train=${target_num_train} \ + num_test=${target_num_test} \ + fmt=${FORMAT} & fi #generate S scenarios (800MB) if [ $MAXMEM -ge 800 ]; then - echo "placeholder" + size_ordinal=1 + percent_size=$(echo "scale=2; ${size_ordinal} / ${max_size_ordinal}" | bc) + target_num_train=$(python -c "from math import floor; print( ${min_num_examples_train} + floor(${span_num_examples_train} * ${percent_size}))") + target_num_test=$(python -c "from math import floor; print( ${min_num_examples_test} + floor(${span_num_examples_test} * ${percent_size}))") + ${CMD} -f ../datagen/extractMNISTData.dml --nvargs \ + file_mnist_train=${DATADIR}/${MNIST_train_filename} \ + file_mnist_test=${DATADIR}/${MNIST_test_filename} \ + file_out_train=${DATADIR}/MNIST_train_${target_num_train} \ + file_out_test=${DATADIR}/MNIST_test_${target_num_test} \ + num_train=${target_num_train} \ + num_test=${target_num_test} \ + fmt=${FORMAT} & fi #generate M scenarios (8GB) if [ $MAXMEM -ge 8000 ]; then - echo "placeholder" + size_ordinal=2 + percent_size=$(echo "scale=2; ${size_ordinal} / ${max_size_ordinal}" | bc) + target_num_train=$(python -c "from math import floor; print( ${min_num_examples_train} + floor(${span_num_examples_train} * ${percent_size}))") + target_num_test=$(python -c "from math import floor; print( ${min_num_examples_test} + floor(${span_num_examples_test} * ${percent_size}))") + ${CMD} -f ../datagen/extractMNISTData.dml --nvargs \ + file_mnist_train=${DATADIR}/${MNIST_train_filename} \ + file_mnist_test=${DATADIR}/${MNIST_test_filename} \ + file_out_train=${DATADIR}/MNIST_train_${target_num_train} \ + file_out_test=${DATADIR}/MNIST_test_${target_num_test} \ + num_train=${target_num_train} \ + num_test=${target_num_test} \ + fmt=${FORMAT} & fi #generate L scenarios (80GB) if [ $MAXMEM -ge 80000 ]; then - echo "placeholder" + size_ordinal=3 + percent_size=$(echo "scale=2; ${size_ordinal} / ${max_size_ordinal}" | bc) + target_num_train=$(python -c "from
math import floor; print( ${min_num_examples_train} + floor(${span_num_examples_train} * ${percent_size}))") + target_num_test=$(python -c "from math import floor; print( ${min_num_examples_test} + floor(${span_num_examples_test} * ${percent_size}))") + ${CMD} -f ../datagen/extractMNISTData.dml --nvargs \ + file_mnist_train=${DATADIR}/${MNIST_train_filename} \ + file_mnist_test=${DATADIR}/${MNIST_test_filename} \ + file_out_train=${DATADIR}/MNIST_train_${target_num_train} \ + file_out_test=${DATADIR}/MNIST_test_${target_num_test} \ + num_train=${target_num_train} \ + num_test=${target_num_test} \ + fmt=${FORMAT} & fi #generate XL scenarios (800GB) if [ $MAXMEM -ge 800000 ]; then - echo "placeholder" + size_ordinal=4 + percent_size=$(echo "scale=2; ${size_ordinal} / ${max_size_ordinal}" | bc) + target_num_train=$(python -c "from math import floor; print( ${min_num_examples_train} + floor(${span_num_examples_train} * ${percent_size}))") + target_num_test=$(python -c "from math import floor; print( ${min_num_examples_test} + floor(${span_num_examples_test} * ${percent_size}))") + ${CMD} -f ../datagen/extractMNISTData.dml --nvargs \ + file_mnist_train=${DATADIR}/${MNIST_train_filename} \ + file_mnist_test=${DATADIR}/${MNIST_test_filename} \ + file_out_train=${DATADIR}/MNIST_train_${target_num_train} \ + file_out_test=${DATADIR}/MNIST_test_${target_num_test} \ + num_train=${target_num_train} \ + num_test=${target_num_test} \ + fmt=${FORMAT} & fi wait \ No newline at end of file From fb6b15eb4e6bfd8c7a2d3d93b821e805cc6529e8 Mon Sep 17 00:00:00 2001 From: Sheypex Date: Tue, 4 Jul 2023 21:16:48 +0200 Subject: [PATCH 15/19] replaced inline python calls in genMNISTData.sh --- scripts/perftest/datagen/genMNISTData.sh | 40 ++++++++++++++---------- 1 file changed, 23 insertions(+), 17 deletions(-) diff --git a/scripts/perftest/datagen/genMNISTData.sh b/scripts/perftest/datagen/genMNISTData.sh index c7bd263ece6..6c2660a894b 100755 --- a/scripts/perftest/datagen/genMNISTData.sh +++
b/scripts/perftest/datagen/genMNISTData.sh @@ -8,9 +8,9 @@ # to you under the Apache License, Version 2.0 (the # "License"); you may not use this file except in compliance # with the License. You may obtain a copy of the License at -# +# # http://www.apache.org/licenses/LICENSE-2.0 -# +# # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY @@ -19,19 +19,22 @@ # under the License. # #------------------------------------------------------------- -if [ "$(basename $PWD)" != "perftest" ]; -then +if [ "$(basename $PWD)" != "perftest" ]; then echo "Please execute scripts from directory 'perftest'" - exit 1; + exit 1 fi +# this sets the dot as the separating character in floating point numbers ie. their string representation +# this avoids an error where bc outputs results dot-separated but printf may expect floats comma-separated if the system default says so +export LC_NUMERIC="en_US.UTF-8" + CMD=$1 DATADIR=$2/mnist MAXMEM=$3 FORMAT="csv" # can be csv, mm, text, binary -echo "-- Generating MNIST data." >> results/times.txt; +echo "-- Generating MNIST data." >>results/times.txt #make sure whole MNIST is available ../datagen/getMNISTDataset.sh ${DATADIR} @@ -49,8 +52,11 @@ span_num_examples_test=$(echo "${max_num_examples_test} - ${min_num_examples_tes if [ $MAXMEM -ge 80 ]; then size_ordinal=0 percent_size=$(echo "${size_ordinal} / ${max_size_ordinal}" | bc) - target_num_train=$(python -c "from math import floor; print( ${min_num_examples_train} + floor(${span_num_examples_train} * ${percent_size}))") # todo couldn't work out how to do this using bc so using slower python calls instead - target_num_test=$(python -c "from math import floor; print( ${min_num_examples_test} + floor(${span_num_examples_test} * ${percent_size}))") + # these python calls are here to show what the equivalent computations for the target_num variables do ..
the only difference is that printf "%.0f" rounds the float value to the nearest integer instead of flooring it; the results are identical here because percent_size is a multiple of 0.25 (computed with bc scale=2 in the larger scenarios, where integer division would yield 0) and the products are whole numbers + # target_num_train=$(python -c "from math import floor; print(${min_num_examples_train} + floor(${span_num_examples_train} * ${percent_size}))") + target_num_train=$(echo "${min_num_examples_train} + $(printf "%.0f" "$(echo "${span_num_examples_train} * ${percent_size}" | bc)")" | bc) + # target_num_test=$(python -c "from math import floor; print(${min_num_examples_test} + floor(${span_num_examples_test} * ${percent_size}))") + target_num_test=$(echo "${min_num_examples_test} + $(printf "%.0f" "$(echo "${span_num_examples_test} * ${percent_size}" | bc)")" | bc) ${CMD} -f ../datagen/extractMNISTData.dml --nvargs \ file_mnist_train=${DATADIR}/${MNIST_train_filename} \ file_mnist_test=${DATADIR}/${MNIST_test_filename} \ @@ -65,8 +71,8 @@ fi if [ $MAXMEM -ge 800 ]; then size_ordinal=1 percent_size=$(echo "scale=2; ${size_ordinal} / ${max_size_ordinal}" | bc) - target_num_train=$(python -c "from math import floor; print( ${min_num_examples_train} + floor(${span_num_examples_train} * ${percent_size}))") - target_num_test=$(python -c "from math import floor; print( ${min_num_examples_test} + floor(${span_num_examples_test} * ${percent_size}))") + target_num_train=$(echo "${min_num_examples_train} + $(printf "%.0f" "$(echo "${span_num_examples_train} * ${percent_size}" | bc)")" | bc) + target_num_test=$(echo "${min_num_examples_test} + $(printf "%.0f" "$(echo "${span_num_examples_test} * ${percent_size}" | bc)")" | bc) ${CMD} -f ../datagen/extractMNISTData.dml --nvargs \ file_mnist_train=${DATADIR}/${MNIST_train_filename} \ file_mnist_test=${DATADIR}/${MNIST_test_filename} \ @@ -81,8 +87,8 @@ fi if [ $MAXMEM -ge 8000 ]; then size_ordinal=2 percent_size=$(echo "scale=2; ${size_ordinal} / ${max_size_ordinal}" | bc) - target_num_train=$(python -c "from math import floor; print( ${min_num_examples_train} + floor(${span_num_examples_train} * ${percent_size}))") -
target_num_test=$(python -c "from math import floor; print( ${min_num_examples_test} + floor(${span_num_examples_test} * ${percent_size}))") + target_num_train=$(echo "${min_num_examples_train} + $(printf "%.0f" "$(echo "${span_num_examples_train} * ${percent_size}" | bc)")" | bc) + target_num_test=$(echo "${min_num_examples_test} + $(printf "%.0f" "$(echo "${span_num_examples_test} * ${percent_size}" | bc)")" | bc) ${CMD} -f ../datagen/extractMNISTData.dml --nvargs \ file_mnist_train=${DATADIR}/${MNIST_train_filename} \ file_mnist_test=${DATADIR}/${MNIST_test_filename} \ @@ -97,8 +103,8 @@ fi if [ $MAXMEM -ge 80000 ]; then size_ordinal=3 percent_size=$(echo "scale=2; ${size_ordinal} / ${max_size_ordinal}" | bc) - target_num_train=$(python -c "from math import floor; print( ${min_num_examples_train} + floor(${span_num_examples_train} * ${percent_size}))") - target_num_test=$(python -c "from math import floor; print( ${min_num_examples_test} + floor(${span_num_examples_test} * ${percent_size}))") + target_num_train=$(echo "${min_num_examples_train} + $(printf "%.0f" "$(echo "${span_num_examples_train} * ${percent_size}" | bc)")" | bc) + target_num_test=$(echo "${min_num_examples_test} + $(printf "%.0f" "$(echo "${span_num_examples_test} * ${percent_size}" | bc)")" | bc) ${CMD} -f ../datagen/extractMNISTData.dml --nvargs \ file_mnist_train=${DATADIR}/${MNIST_train_filename} \ file_mnist_test=${DATADIR}/${MNIST_test_filename} \ @@ -113,8 +119,8 @@ fi if [ $MAXMEM -ge 800000 ]; then size_ordinal=4 percent_size=$(echo "scale=2; ${size_ordinal} / ${max_size_ordinal}" | bc) - target_num_train=$(python -c "from math import floor; print( ${min_num_examples_train} + floor(${span_num_examples_train} * ${percent_size}))") - target_num_test=$(python -c "from math import floor; print( ${min_num_examples_test} + floor(${span_num_examples_test} * ${percent_size}))") + target_num_train=$(echo "${min_num_examples_train} + $(printf "%.0f" "$(echo "${span_num_examples_train} * ${percent_size}" | bc)")"
| bc) + target_num_test=$(echo "${min_num_examples_test} + $(printf "%.0f" "$(echo "${span_num_examples_test} * ${percent_size}" | bc)")" | bc) ${CMD} -f ../datagen/extractMNISTData.dml --nvargs \ file_mnist_train=${DATADIR}/${MNIST_train_filename} \ file_mnist_test=${DATADIR}/${MNIST_test_filename} \ @@ -125,4 +131,4 @@ if [ $MAXMEM -ge 800000 ]; then fmt=${FORMAT} & fi -wait \ No newline at end of file +wait From 0e3e754d6ee787eedd2131a6cc514a90fd2925a8 Mon Sep 17 00:00:00 2001 From: Sheypex Date: Tue, 4 Jul 2023 22:36:59 +0200 Subject: [PATCH 16/19] debugged mnist "datagen" pipeline .. works now :) --- scripts/datagen/extractMNISTData.dml | 7 +-- scripts/datagen/getMNISTDataset.sh | 33 +++++++++++++- scripts/perftest/datagen/genMNISTData.sh | 58 +++++++++++++----------- 3 files changed, 66 insertions(+), 32 deletions(-) diff --git a/scripts/datagen/extractMNISTData.dml b/scripts/datagen/extractMNISTData.dml index 359387233b5..17de958d53e 100644 --- a/scripts/datagen/extractMNISTData.dml +++ b/scripts/datagen/extractMNISTData.dml @@ -50,13 +50,14 @@ fmt = ifdef ($fmt, "csv"); mnist_train = read(file_mnist_train); mnist_test = read(file_mnist_test); +# stay in bounds num_train = min(num_train, nrow(mnist_train)); num_test = min(num_test, nrow(mnist_test)); # targets # todo add shuffle? 
-out_train = mnist_train[,1:num_train]; -out_test = mnist_test[,1:num_test]; +out_train = mnist_train[1:num_train,]; +out_test = mnist_test[1:num_test,]; print ("Writing out the resulting dataset..."); @@ -64,5 +65,5 @@ print ("Writing out the resulting dataset..."); write (out_train, file_out_train, format=fmt); write (out_test, file_out_test, format=fmt); -print ("DONE: MNIST EXTRACTOR GENERATOR SCRIPT"); +print ("DONE: MNIST EXTRACTOR SCRIPT"); diff --git a/scripts/datagen/getMNISTDataset.sh b/scripts/datagen/getMNISTDataset.sh index bacd2a38d21..e972a7cd0e0 100755 --- a/scripts/datagen/getMNISTDataset.sh +++ b/scripts/datagen/getMNISTDataset.sh @@ -23,7 +23,8 @@ BASE=$1 if [ "$BASE" = "" ]; then BASE=$PWD; fi RET=$PWD -cd $BASE || exit +if [ ! -d "$BASE" ]; then mkdir "$BASE"; fi +cd "$BASE" || exit echo "Downloading" if [ ! -f "mnist_train.csv.zip" ]; then wget --no-check-certificate https://github.com/phoebetronic/mnist/raw/main/mnist_train.csv.zip; fi @@ -33,5 +34,33 @@ echo "Unzipping" unzip -u mnist_train.csv.zip unzip -u mnist_test.csv.zip -cd $RET || exit +# have to create metadata for these external csv files + +echo '{ + "data_type": "matrix", + "value_type": "double", + "rows": 60000, + "cols": 785, + "nnz": 0, + "format": "csv", + "author": "anon", + "header": false, + "sep": ",", + "created": "2023-06-26 18:35:22 CEST" + }' > mnist_train.csv.mtd + +echo '{ + "data_type": "matrix", + "value_type": "double", + "rows": 10000, + "cols": 785, + "nnz": 0, + "format": "csv", + "author": "nobody", + "header": false, + "sep": ",", + "created": "2023-06-26 18:35:22 CEST" + }' > mnist_test.csv.mtd + +cd "$RET" || exit echo "Done" diff --git a/scripts/perftest/datagen/genMNISTData.sh b/scripts/perftest/datagen/genMNISTData.sh index 6c2660a894b..81a1535202d 100755 --- a/scripts/perftest/datagen/genMNISTData.sh +++ b/scripts/perftest/datagen/genMNISTData.sh @@ -38,8 +38,8 @@ echo "-- Generating MNIST data." 
>>results/times.txt #make sure whole MNIST is available ../datagen/getMNISTDataset.sh ${DATADIR} -MNIST_train_filename="mnist_train.csv" -MNIST_test_filename="mnist_test.csv" +mnist_train_filename="mnist_train.csv" +mnist_test_filename="mnist_test.csv" max_size_ordinal=4 min_num_examples_train=12000 @@ -50,18 +50,20 @@ max_num_examples_test=10000 span_num_examples_test=$(echo "${max_num_examples_test} - ${min_num_examples_test}" | bc) #generate XS scenarios (80MB) by producing a subset of MNIST if [ $MAXMEM -ge 80 ]; then + echo "doing size one" size_ordinal=0 - percent_size=$(echo "${size_ordinal} / ${max_size_ordinal}" | bc) + percent_size=$(echo "scale=10; ${size_ordinal} / ${max_size_ordinal}" | bc) # these python calls are here to show what the equivalent computations for the target_num variables do .. only difference is that printf $0.f doesnt round the float value down like floor but just truncates it to produce an integer value # target_num_train=$(python -c "from math import floor; print(${min_num_examples_train} + floor(${span_num_examples_train} * ${percent_size}))") target_num_train=$(echo "${min_num_examples_train} + $(printf "%.0f" "$(echo "${span_num_examples_train} * ${percent_size}" | bc)")" | bc) # target_num_test=$(python -c "from math import floor; print(${min_num_examples_test} + floor(${span_num_examples_test} * ${percent_size}))") target_num_test=$(echo "${min_num_examples_test} + $(printf "%.0f" "$(echo "${span_num_examples_test} * ${percent_size}" | bc)")" | bc) + echo $size_ordinal $percent_size $target_num_train $target_num_test ${CMD} -f ../datagen/extractMNISTData.dml --nvargs \ - file_mnist_train=${DATADIR}/${MNIST_train_filename} \ - file_mnist_test=${DATADIR}/${MNIST_test_filename} \ - file_out_train=${DATADIR}/MNIST_train_${target_num_train} \ - file_out_test=${DATADIR}/MNIST_test_${target_num_test} \ + mnist_train=${DATADIR}/${mnist_train_filename} \ + mnist_test=${DATADIR}/${mnist_test_filename} \ + 
out_train=${DATADIR}/mnist_train_${target_num_train} \ + out_test=${DATADIR}/mnist_test_${target_num_test} \ num_train=${target_num_train} \ num_test=${target_num_test} \ fmt=${FORMAT} & @@ -69,15 +71,17 @@ fi #generate S scenarios (800MB) if [ $MAXMEM -ge 800 ]; then + echo "doing size two" size_ordinal=1 - percent_size=$(echo "${size_ordinal} / ${max_size_ordinal}" | bc) + percent_size=$(echo "scale=10; ${size_ordinal} / ${max_size_ordinal}" | bc) target_num_train=$(echo "${min_num_examples_train} + $(printf "%.0f" "$(echo "${span_num_examples_train} * ${percent_size}" | bc)")" | bc) target_num_test=$(echo "${min_num_examples_test} + $(printf "%.0f" "$(echo "${span_num_examples_test} * ${percent_size}" | bc)")" | bc) + echo $size_ordinal $percent_size $target_num_train $target_num_test ${CMD} -f ../datagen/extractMNISTData.dml --nvargs \ - file_mnist_train=${DATADIR}/${MNIST_train_filename} \ - file_mnist_test=${DATADIR}/${MNIST_test_filename} \ - file_out_train=${DATADIR}/MNIST_train_${target_num_train} \ - file_out_test=${DATADIR}/MNIST_test_${target_num_test} \ + mnist_train=${DATADIR}/${mnist_train_filename} \ + mnist_test=${DATADIR}/${mnist_test_filename} \ + out_train=${DATADIR}/mnist_train_${target_num_train} \ + out_test=${DATADIR}/mnist_test_${target_num_test} \ num_train=${target_num_train} \ num_test=${target_num_test} \ fmt=${FORMAT} & @@ -86,14 +90,14 @@ fi #generate M scenarios (8GB) if [ $MAXMEM -ge 8000 ]; then size_ordinal=2 - percent_size=$(echo "${size_ordinal} / ${max_size_ordinal}" | bc) + percent_size=$(echo "scale=10; ${size_ordinal} / ${max_size_ordinal}" | bc) target_num_train=$(echo "${min_num_examples_train} + $(printf "%.0f" "$(echo "${span_num_examples_train} * ${percent_size}" | bc)")" | bc) target_num_test=$(echo "${min_num_examples_test} + $(printf "%.0f" "$(echo "${span_num_examples_test} * ${percent_size}" | bc)")" | bc) ${CMD} -f ../datagen/extractMNISTData.dml --nvargs \ - file_mnist_train=${DATADIR}/${MNIST_train_filename} \ - 
file_mnist_test=${DATADIR}/${MNIST_test_filename} \ - file_out_train=${DATADIR}/MNIST_train_${target_num_train} \ - file_out_test=${DATADIR}/MNIST_test_${target_num_test} \ + mnist_train=${DATADIR}/${mnist_train_filename} \ + mnist_test=${DATADIR}/${mnist_test_filename} \ + out_train=${DATADIR}/mnist_train_${target_num_train} \ + out_test=${DATADIR}/mnist_test_${target_num_test} \ num_train=${target_num_train} \ num_test=${target_num_test} \ fmt=${FORMAT} & @@ -102,14 +106,14 @@ fi #generate L scenarios (80GB) if [ $MAXMEM -ge 80000 ]; then size_ordinal=3 - percent_size=$(echo "${size_ordinal} / ${max_size_ordinal}" | bc) + percent_size=$(echo "scale=10; ${size_ordinal} / ${max_size_ordinal}" | bc) target_num_train=$(echo "${min_num_examples_train} + $(printf "%.0f" "$(echo "${span_num_examples_train} * ${percent_size}" | bc)")" | bc) target_num_test=$(echo "${min_num_examples_test} + $(printf "%.0f" "$(echo "${span_num_examples_test} * ${percent_size}" | bc)")" | bc) ${CMD} -f ../datagen/extractMNISTData.dml --nvargs \ - file_mnist_train=${DATADIR}/${MNIST_train_filename} \ - file_mnist_test=${DATADIR}/${MNIST_test_filename} \ - file_out_train=${DATADIR}/MNIST_train_${target_num_train} \ - file_out_test=${DATADIR}/MNIST_test_${target_num_test} \ + mnist_train=${DATADIR}/${mnist_train_filename} \ + mnist_test=${DATADIR}/${mnist_test_filename} \ + out_train=${DATADIR}/mnist_train_${target_num_train} \ + out_test=${DATADIR}/mnist_test_${target_num_test} \ num_train=${target_num_train} \ num_test=${target_num_test} \ fmt=${FORMAT} & @@ -118,14 +122,14 @@ fi #generate XL scenarios (800GB) if [ $MAXMEM -ge 800000 ]; then size_ordinal=4 - percent_size=$(echo "${size_ordinal} / ${max_size_ordinal}" | bc) + percent_size=$(echo "scale=10; ${size_ordinal} / ${max_size_ordinal}" | bc) target_num_train=$(echo "${min_num_examples_train} + $(printf "%.0f" "$(echo "${span_num_examples_train} * ${percent_size}" | bc)")" | bc) target_num_test=$(echo "${min_num_examples_test} + 
$(printf "%.0f" "$(echo "${span_num_examples_test} * ${percent_size}" | bc)")" | bc) ${CMD} -f ../datagen/extractMNISTData.dml --nvargs \ - file_mnist_train=${DATADIR}/${MNIST_train_filename} \ - file_mnist_test=${DATADIR}/${MNIST_test_filename} \ - file_out_train=${DATADIR}/MNIST_train_${target_num_train} \ - file_out_test=${DATADIR}/MNIST_test_${target_num_test} \ + mnist_train=${DATADIR}/${mnist_train_filename} \ + mnist_test=${DATADIR}/${mnist_test_filename} \ + out_train=${DATADIR}/mnist_train_${target_num_train} \ + out_test=${DATADIR}/mnist_test_${target_num_test} \ num_train=${target_num_train} \ num_test=${target_num_test} \ fmt=${FORMAT} & From 472e3c1abd03d994980dea9c5bcd507766b6f236 Mon Sep 17 00:00:00 2001 From: Sheypex Date: Thu, 6 Jul 2023 19:22:32 +0200 Subject: [PATCH 17/19] mnist perftest done .. but lenet implementation faulty? --- scripts/perftest/datagen/genMNISTData.sh | 26 +++--- scripts/perftest/runAll.sh | 12 +-- scripts/perftest/runAllConv2d.sh | 93 +++++++++++++++++++ scripts/perftest/runMNISTLeNet.sh | 68 ++++++++++++++ .../perftest/scripts/mnist_lenet-predict.dml | 57 ++++++++++++ .../perftest/scripts/mnist_lenet-train.dml | 60 ++++++++++++ 6 files changed, 295 insertions(+), 21 deletions(-) create mode 100755 scripts/perftest/runAllConv2d.sh create mode 100755 scripts/perftest/runMNISTLeNet.sh create mode 100644 scripts/perftest/scripts/mnist_lenet-predict.dml create mode 100644 scripts/perftest/scripts/mnist_lenet-train.dml diff --git a/scripts/perftest/datagen/genMNISTData.sh b/scripts/perftest/datagen/genMNISTData.sh index 81a1535202d..7d34879b8bb 100755 --- a/scripts/perftest/datagen/genMNISTData.sh +++ b/scripts/perftest/datagen/genMNISTData.sh @@ -50,20 +50,18 @@ max_num_examples_test=10000 span_num_examples_test=$(echo "${max_num_examples_test} - ${min_num_examples_test}" | bc) #generate XS scenarios (80MB) by producing a subset of MNIST if [ $MAXMEM -ge 80 ]; then - echo "doing size one" size_ordinal=0 percent_size=$(echo 
"scale=10; ${size_ordinal} / ${max_size_ordinal}" | bc) - # these python calls are here to show what the equivalent computations for the target_num variables do .. only difference is that printf $0.f doesnt round the float value down like floor but just truncates it to produce an integer value + # these python calls are here to show what the equivalent computations for the target_num variables do .. only difference is that printf $0.f doesnt round the float value down like floor but rounds it to produce an integer value instead # target_num_train=$(python -c "from math import floor; print(${min_num_examples_train} + floor(${span_num_examples_train} * ${percent_size}))") target_num_train=$(echo "${min_num_examples_train} + $(printf "%.0f" "$(echo "${span_num_examples_train} * ${percent_size}" | bc)")" | bc) # target_num_test=$(python -c "from math import floor; print(${min_num_examples_test} + floor(${span_num_examples_test} * ${percent_size}))") target_num_test=$(echo "${min_num_examples_test} + $(printf "%.0f" "$(echo "${span_num_examples_test} * ${percent_size}" | bc)")" | bc) - echo $size_ordinal $percent_size $target_num_train $target_num_test ${CMD} -f ../datagen/extractMNISTData.dml --nvargs \ mnist_train=${DATADIR}/${mnist_train_filename} \ mnist_test=${DATADIR}/${mnist_test_filename} \ - out_train=${DATADIR}/mnist_train_${target_num_train} \ - out_test=${DATADIR}/mnist_test_${target_num_test} \ + out_train=${DATADIR}/mnist_${target_num_train}_train \ + out_test=${DATADIR}/mnist_${target_num_train}_test \ num_train=${target_num_train} \ num_test=${target_num_test} \ fmt=${FORMAT} & @@ -71,17 +69,15 @@ fi #generate S scenarios (800MB) if [ $MAXMEM -ge 800 ]; then - echo "doing size two" size_ordinal=1 percent_size=$(echo "scale=10; ${size_ordinal} / ${max_size_ordinal}" | bc) target_num_train=$(echo "${min_num_examples_train} + $(printf "%.0f" "$(echo "${span_num_examples_train} * ${percent_size}" | bc)")" | bc) target_num_test=$(echo 
"${min_num_examples_test} + $(printf "%.0f" "$(echo "${span_num_examples_test} * ${percent_size}" | bc)")" | bc) - echo $size_ordinal $percent_size $target_num_train $target_num_test ${CMD} -f ../datagen/extractMNISTData.dml --nvargs \ mnist_train=${DATADIR}/${mnist_train_filename} \ mnist_test=${DATADIR}/${mnist_test_filename} \ - out_train=${DATADIR}/mnist_train_${target_num_train} \ - out_test=${DATADIR}/mnist_test_${target_num_test} \ + out_train=${DATADIR}/mnist_${target_num_train}_train \ + out_test=${DATADIR}/mnist_${target_num_train}_test \ num_train=${target_num_train} \ num_test=${target_num_test} \ fmt=${FORMAT} & @@ -96,8 +92,8 @@ if [ $MAXMEM -ge 8000 ]; then ${CMD} -f ../datagen/extractMNISTData.dml --nvargs \ mnist_train=${DATADIR}/${mnist_train_filename} \ mnist_test=${DATADIR}/${mnist_test_filename} \ - out_train=${DATADIR}/mnist_train_${target_num_train} \ - out_test=${DATADIR}/mnist_test_${target_num_test} \ + out_train=${DATADIR}/mnist_${target_num_train}_train \ + out_test=${DATADIR}/mnist_${target_num_train}_test \ num_train=${target_num_train} \ num_test=${target_num_test} \ fmt=${FORMAT} & @@ -112,8 +108,8 @@ if [ $MAXMEM -ge 80000 ]; then ${CMD} -f ../datagen/extractMNISTData.dml --nvargs \ mnist_train=${DATADIR}/${mnist_train_filename} \ mnist_test=${DATADIR}/${mnist_test_filename} \ - out_train=${DATADIR}/mnist_train_${target_num_train} \ - out_test=${DATADIR}/mnist_test_${target_num_test} \ + out_train=${DATADIR}/mnist_${target_num_train}_train \ + out_test=${DATADIR}/mnist_${target_num_train}_test \ num_train=${target_num_train} \ num_test=${target_num_test} \ fmt=${FORMAT} & @@ -128,8 +124,8 @@ if [ $MAXMEM -ge 800000 ]; then ${CMD} -f ../datagen/extractMNISTData.dml --nvargs \ mnist_train=${DATADIR}/${mnist_train_filename} \ mnist_test=${DATADIR}/${mnist_test_filename} \ - out_train=${DATADIR}/mnist_train_${target_num_train} \ - out_test=${DATADIR}/mnist_test_${target_num_test} \ + out_train=${DATADIR}/mnist_${target_num_train}_train 
\ + out_test=${DATADIR}/mnist_${target_num_train}_test \ num_train=${target_num_train} \ num_test=${target_num_test} \ fmt=${FORMAT} & diff --git a/scripts/perftest/runAll.sh b/scripts/perftest/runAll.sh index c843ab3b6f4..19a3ab96dc1 100755 --- a/scripts/perftest/runAll.sh +++ b/scripts/perftest/runAll.sh @@ -116,8 +116,9 @@ echo -e "\n\n" >>results/times.txt # Data for tests of nn components if [ "$DO_TESTS_FOR_NN" = true ]; then - ./datagen/genNNData.sh ${CMD} ${TEMPFOLDER} ${MAXMEM} &>logs/genNNData.out - ./datagen/genNCFData.sh ${CMD} ${TEMPFOLDER} ${MAXMEM} &>logs/genNCFData.out + #./datagen/genNNData.sh ${CMD} ${TEMPFOLDER} ${MAXMEM} &>logs/genNNData.out + #./datagen/genNCFData.sh ${CMD} ${TEMPFOLDER} ${MAXMEM} &>logs/genNCFData.out + ./datagen/genMNISTData.sh ${CMD} ${TEMPFOLDER} ${MAXMEM} &>logs/genMNISTData.out fi ### Micro Benchmarks: @@ -138,10 +139,9 @@ fi # Tests of nn components if [ "$DO_TESTS_FOR_NN" = true ]; then - # NOTICE: remember to pass the command variable as a quoted string! - # otherwise the command (eg. `systemds -gpu` without quotes) will be split into two variables in subscripts when USE_GPU_FOR_NN is set - ./runAllNN.sh "${CMD}" ${TEMPFOLDER} ${MAXMEM} ${USE_GPU_FOR_NN} - ./runAllNCF.sh "${CMD}" ${TEMPFOLDER} ${MAXMEM} ${USE_GPU_FOR_NN} # currently broken: staging/NCF.dml and any dml that sources it die on launch + #./runAllNN.sh "${CMD}" ${TEMPFOLDER} ${MAXMEM} ${USE_GPU_FOR_NN} + #./runAllNCF.sh "${CMD}" ${TEMPFOLDER} ${MAXMEM} ${USE_GPU_FOR_NN} # currently broken: staging/NCF.dml and any dml that sources it die on launch + ./runAllConv2d.sh "${CMD}" ${TEMPFOLDER} ${MAXMEM} ${USE_GPU_FOR_NN} fi # TODO The following benchmarks have yet to be written. The decision tree algorithms additionally need to be fixed. 
diff --git a/scripts/perftest/runAllConv2d.sh b/scripts/perftest/runAllConv2d.sh new file mode 100755 index 00000000000..517e3b2b110 --- /dev/null +++ b/scripts/perftest/runAllConv2d.sh @@ -0,0 +1,93 @@ +#!/bin/bash +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +#------------------------------------------------------------- +if [ "$(basename $PWD)" != "perftest" ]; then + echo "Please execute scripts from directory 'perftest'" + exit 1 +fi + +# this sets the dot as the separating character in floating point numbers ie. their string representation +# this avoids an error where bc outputs results dot-separated but printf may expect floats comma-separated if the system default says so +export LC_NUMERIC="en_US.UTF-8" + +COMMAND=$1 +TEMPFOLDER=$2 +MAXMEM=$3 +USEGPU=$4 + +if [ "$TEMPFOLDER" == "" ]; then TEMPFOLDER=temp; fi +BASE=${TEMPFOLDER}/mnist +MAXITR=200 + +FILENAME=$0 +err_report() { + echo "Error in $FILENAME on line $1" +} +trap 'err_report $LINENO' ERR + +max_size_ordinal=4 # these should be kept in sync with the ones set in genMNISTData, so that file names are in sync! 
+min_num_examples_train=12000 +max_num_examples_train=60000 +span_num_examples_train=$(echo "${max_num_examples_train} - ${min_num_examples_train}" | bc) +DATA=() +if [ $MAXMEM -ge 80 ]; then + size_ordinal=0 + percent_size=$(echo "scale=10; ${size_ordinal} / ${max_size_ordinal}" | bc) + target_num_train=$(echo "${min_num_examples_train} + $(printf "%.0f" "$(echo "${span_num_examples_train} * ${percent_size}" | bc)")" | bc) + DATA+=(mnist_${target_num_train}) +fi +if [ $MAXMEM -ge 800 ]; then + size_ordinal=1 + percent_size=$(echo "scale=10; ${size_ordinal} / ${max_size_ordinal}" | bc) + target_num_train=$(echo "${min_num_examples_train} + $(printf "%.0f" "$(echo "${span_num_examples_train} * ${percent_size}" | bc)")" | bc) + DATA+=(mnist_${target_num_train}) +fi +if [ $MAXMEM -ge 8000 ]; then + size_ordinal=2 + percent_size=$(echo "scale=10; ${size_ordinal} / ${max_size_ordinal}" | bc) + target_num_train=$(echo "${min_num_examples_train} + $(printf "%.0f" "$(echo "${span_num_examples_train} * ${percent_size}" | bc)")" | bc) + DATA+=(mnist_${target_num_train}) +fi +if [ $MAXMEM -ge 80000 ]; then + size_ordinal=3 + percent_size=$(echo "scale=10; ${size_ordinal} / ${max_size_ordinal}" | bc) + target_num_train=$(echo "${min_num_examples_train} + $(printf "%.0f" "$(echo "${span_num_examples_train} * ${percent_size}" | bc)")" | bc) + DATA+=(mnist_${target_num_train}) +fi +if [ $MAXMEM -ge 800000 ]; then + size_ordinal=4 + percent_size=$(echo "scale=10; ${size_ordinal} / ${max_size_ordinal}" | bc) + target_num_train=$(echo "${min_num_examples_train} + $(printf "%.0f" "$(echo "${span_num_examples_train} * ${percent_size}" | bc)")" | bc) + DATA+=(mnist_${target_num_train}) +fi + +echo "RUN CONV2D EXPERIMENTS" $(date) >>results/times.txt + +for d in ${DATA[@]}; do #"_KDD" + for f in "runMNISTLeNet"; do + echo "-- Running "$f" on "$d" for 10 epochs" >>results/times.txt + ./${f}.sh ${BASE}/${d}_train ${BASE}/${d}_test ${BASE} "${COMMAND}" ${d} 10 ${USEGPU} 
&>logs/${f}_${d}_10.out + echo "-- Running "$f" on "$d" for 100 epochs" >>results/times.txt + ./${f}.sh ${BASE}/${d}_train ${BASE}/${d}_test ${BASE} "${COMMAND}" ${d} 100 ${USEGPU} &>logs/${f}_${d}_100.out + done +done + +echo -e "\n\n" >>results/times.txt diff --git a/scripts/perftest/runMNISTLeNet.sh b/scripts/perftest/runMNISTLeNet.sh new file mode 100755 index 00000000000..7799dc567f5 --- /dev/null +++ b/scripts/perftest/runMNISTLeNet.sh @@ -0,0 +1,68 @@ +#!/bin/bash +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+# +#------------------------------------------------------------- +# params: +# 1) X data +# 2) Y data +# 3) path of base temp dir +# 4) command for systemds +set -e + +if [ "$(basename $PWD)" != "perftest" ]; then + echo "Please execute scripts from directory 'perftest'" + exit 1 +fi + +Train_data=$1 +Test_data=$2 +BASE=$3 +CMD=$4 +LOGIDENTIFIER=$5 +EPOCHS=$6 +USEGPU=$7 + +FLAGS="--stats" +if [ "$USEGPU" = true ]; then + FLAGS="${FLAGS} --gpu" +fi + +echo "running mnist lenet" + +#training +tstart=$(date +%s.%N) +${CMD} -f scripts/mnist_lenet-train.dml \ + --config conf/SystemDS-config.xml \ + ${FLAGS} \ + --nvargs train=${Train_data} C=1 Hin=28 Win=28 epochs=${EPOCHS} base_dir=${BASE} fmt="csv" &>logs/mnist_lenet-train_${LOGIDENTIFIER}_${EPOCHS}.out + + +ttrain=$(echo "$(date +%s.%N) - $tstart - .4" | bc) +echo "mnist lenet trained on "$5": "$ttrain >>results/times.txt + +#predict +tstart=$(date +%s.%N) +${CMD} -f scripts/mnist_lenet-predict.dml \ + --config conf/SystemDS-config.xml \ + ${FLAGS} \ + --nvargs input=${Test_data} C=1 Hin=28 Win=28 model_dir=${BASE} fmt="csv" &>logs/mnist_lenet-predict_${LOGIDENTIFIER}_${EPOCHS}.out + +tpredict=$(echo "$(date +%s.%N) - $tstart - .4" | bc) +echo "mnist lenet predicted on "$5": "$tpredict >>results/times.txt diff --git a/scripts/perftest/scripts/mnist_lenet-predict.dml b/scripts/perftest/scripts/mnist_lenet-predict.dml new file mode 100644 index 00000000000..e1c71931f28 --- /dev/null +++ b/scripts/perftest/scripts/mnist_lenet-predict.dml @@ -0,0 +1,57 @@ +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +#------------------------------------------------------------- +source("../../nn/examples/mnist_lenet.dml") as mnist_lenet + +# Read training data +fmt = ifdef($fmt, "csv") +test = read($input, format=fmt) +C = $C +Hin = $Hin +Win = $Win + +# Extract images and labels +X_test = test[,2:ncol(test)] +Y_test = test[,1] + +# Scale images to [-1,1], and one-hot encode the labels +n_test = nrow(test) +X_test = (X_test / 255.0) * 2 - 1 +Y_test = table(seq(1, n_test), Y_test+1, n_test, 10) + + +# Read model coefficients +W1 = read($model_dir+"/W1") +b1 = read($model_dir+"/b1") +W2 = read($model_dir+"/W2") +b2 = read($model_dir+"/b2") +W3 = read($model_dir+"/W3") +b3 = read($model_dir+"/b3") +W4 = read($model_dir+"/W4") +b4 = read($model_dir+"/b4") + + +# Eval on test set +probs = mnist_lenet::predict(X_test, C, Hin, Win, W1, b1, W2, b2, W3, b3, W4, b4) +[loss, accuracy] = mnist_lenet::eval(probs, Y_test) + +# Output results +print("Test Accuracy: " + accuracy) +print("Test Loss: " + loss) diff --git a/scripts/perftest/scripts/mnist_lenet-train.dml b/scripts/perftest/scripts/mnist_lenet-train.dml new file mode 100644 index 00000000000..6bcf418706b --- /dev/null +++ b/scripts/perftest/scripts/mnist_lenet-train.dml @@ -0,0 +1,60 @@ +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. 
The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +#------------------------------------------------------------- +# The following is an adaptation of the script nn/examples/mnist_lenet_predict.dml +source("../../nn/examples/mnist_lenet.dml") as mnist_lenet + +# Read training data & settings +fmt = ifdef($fmt, "csv") +train = read($train, format=fmt) +C = $C +Hin = $Hin +Win = $Win +epochs = ifdef($epochs, 10) +out_dir = ifdef($base_dir, ".") + +# Extract images and labels +images = train[,2:ncol(train)] +labels = train[,1] + +# Scale images to [-1,1], and one-hot encode the labels +n = nrow(train) +images = (images / 255.0) * 2 - 1 +labels = table(seq(1, n), labels+1, n, 10) + +# Split into 80/20 training/validation data +split_idx = floor(.8 * nrow(images)) +X = images[1:split_idx,] +X_val = images[split_idx+1:nrow(images),] +Y = labels[1:split_idx,] +Y_val = labels[split_idx+1:nrow(images),] + +# Train +[W1, b1, W2, b2, W3, b3, W4, b4] = mnist_lenet::train(X, Y, X_val, Y_val, C, Hin, Win, epochs) + +# Write model out +write(W1, out_dir+"/W1", format=fmt) +write(b1, out_dir+"/b1", format=fmt) +write(W2, out_dir+"/W2", format=fmt) +write(b2, out_dir+"/b2", format=fmt) +write(W3, out_dir+"/W3", format=fmt) +write(b3, out_dir+"/b3", format=fmt) +write(W4, out_dir+"/W4", format=fmt) +write(b4, out_dir+"/b4", format=fmt) \ No newline at end of file From 9ebe54b2deb79b4a4186579d6cf88c611d9927a9 Mon Sep 17 00:00:00 2001 From: Sheypex Date: Mon, 10 
Jul 2023 16:58:10 +0200 Subject: [PATCH 18/19] lowered number of epochs in MNIST perftest --- scripts/perftest/runAllConv2d.sh | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/scripts/perftest/runAllConv2d.sh b/scripts/perftest/runAllConv2d.sh index 517e3b2b110..c817c435243 100755 --- a/scripts/perftest/runAllConv2d.sh +++ b/scripts/perftest/runAllConv2d.sh @@ -83,10 +83,10 @@ echo "RUN CONV2D EXPERIMENTS" $(date) >>results/times.txt for d in ${DATA[@]}; do #"_KDD" for f in "runMNISTLeNet"; do - echo "-- Running "$f" on "$d" for 10 epochs" >>results/times.txt - ./${f}.sh ${BASE}/${d}_train ${BASE}/${d}_test ${BASE} "${COMMAND}" ${d} 10 ${USEGPU} &>logs/${f}_${d}_10.out - echo "-- Running "$f" on "$d" for 100 epochs" >>results/times.txt - ./${f}.sh ${BASE}/${d}_train ${BASE}/${d}_test ${BASE} "${COMMAND}" ${d} 100 ${USEGPU} &>logs/${f}_${d}_100.out + echo "-- Running "$f" on "$d" for 5 epochs" >>results/times.txt + ./${f}.sh ${BASE}/${d}_train ${BASE}/${d}_test ${BASE} "${COMMAND}" ${d} 5 ${USEGPU} &>logs/${f}_${d}_5.out + echo "-- Running "$f" on "$d" for 50 epochs" >>results/times.txt + ./${f}.sh ${BASE}/${d}_train ${BASE}/${d}_test ${BASE} "${COMMAND}" ${d} 50 ${USEGPU} &>logs/${f}_${d}_50.out done done From 53d13535e4f23bb526409a94b411c6e8ece8493b Mon Sep 17 00:00:00 2001 From: Sheypex Date: Mon, 10 Jul 2023 18:11:28 +0200 Subject: [PATCH 19/19] cleanup and reduced epochs for MNIST tests --- scripts/perftest/runAll.sh | 14 +++++++------- scripts/perftest/runAllConv2d.sh | 4 ++-- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/scripts/perftest/runAll.sh b/scripts/perftest/runAll.sh index 19a3ab96dc1..60fd0140ae5 100755 --- a/scripts/perftest/runAll.sh +++ b/scripts/perftest/runAll.sh @@ -31,7 +31,11 @@ TEMPFOLDER="temp" # Max memory of data to be benchmarked # Possible values: 80/80MB, 800/800MB, 8000/8000MB/8GB, 80000/80000MB/80GB, 800000/800000MB/800GB -MAXMEM=800 +MAXMEM=80 + +# Flags for tests of components in nn 
+DO_TESTS_FOR_NN=true # toggle both the data generation for and the execution of the nn component tests +USE_GPU_FOR_NN=false # toggle gpu usage for nn tests # Set properties export LOG4JPROP='conf/log4j-off.properties' @@ -94,10 +98,6 @@ mkdir -p logs mkdir -p results mkdir -p temp -# Flags for tests of components in nn -DO_TESTS_FOR_NN=true # toggle execution of datagen for as well as tests of nn components themselves -USE_GPU_FOR_NN=false # toggle gpu usage for nn tests - # init time measurement rm -f results/times.txt @@ -116,7 +116,7 @@ echo -e "\n\n" >>results/times.txt # Data for tests of nn components if [ "$DO_TESTS_FOR_NN" = true ]; then - #./datagen/genNNData.sh ${CMD} ${TEMPFOLDER} ${MAXMEM} &>logs/genNNData.out + ./datagen/genNNData.sh ${CMD} ${TEMPFOLDER} ${MAXMEM} &>logs/genNNData.out #./datagen/genNCFData.sh ${CMD} ${TEMPFOLDER} ${MAXMEM} &>logs/genNCFData.out ./datagen/genMNISTData.sh ${CMD} ${TEMPFOLDER} ${MAXMEM} &>logs/genMNISTData.out fi @@ -139,7 +139,7 @@ fi # Tests of nn components if [ "$DO_TESTS_FOR_NN" = true ]; then - #./runAllNN.sh "${CMD}" ${TEMPFOLDER} ${MAXMEM} ${USE_GPU_FOR_NN} + ./runAllNN.sh "${CMD}" ${TEMPFOLDER} ${MAXMEM} ${USE_GPU_FOR_NN} #./runAllNCF.sh "${CMD}" ${TEMPFOLDER} ${MAXMEM} ${USE_GPU_FOR_NN} # currently broken: staging/NCF.dml and any dml that sources it die on launch ./runAllConv2d.sh "${CMD}" ${TEMPFOLDER} ${MAXMEM} ${USE_GPU_FOR_NN} fi diff --git a/scripts/perftest/runAllConv2d.sh b/scripts/perftest/runAllConv2d.sh index c817c435243..6d762cc2171 100755 --- a/scripts/perftest/runAllConv2d.sh +++ b/scripts/perftest/runAllConv2d.sh @@ -85,8 +85,8 @@ for d in ${DATA[@]}; do #"_KDD" for f in "runMNISTLeNet"; do echo "-- Running "$f" on "$d" for 5 epochs" >>results/times.txt ./${f}.sh ${BASE}/${d}_train ${BASE}/${d}_test ${BASE} "${COMMAND}" ${d} 5 ${USEGPU} &>logs/${f}_${d}_5.out - echo "-- Running "$f" on "$d" for 50 epochs" >>results/times.txt - ./${f}.sh ${BASE}/${d}_train ${BASE}/${d}_test ${BASE} "${COMMAND}" ${d} 
50 ${USEGPU} &>logs/${f}_${d}_50.out + echo "-- Running "$f" on "$d" for 25 epochs" >>results/times.txt + ./${f}.sh ${BASE}/${d}_train ${BASE}/${d}_test ${BASE} "${COMMAND}" ${d} 25 ${USEGPU} &>logs/${f}_${d}_25.out done done