diff --git a/scripts/datagen/extractMNISTData.dml b/scripts/datagen/extractMNISTData.dml new file mode 100644 index 00000000000..17de958d53e --- /dev/null +++ b/scripts/datagen/extractMNISTData.dml @@ -0,0 +1,69 @@ +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+# +#------------------------------------------------------------- + +# +# Extracts the leading num_train / num_test rows of the MNIST training / test matrices +# +# INPUT PARAMETERS: +# ---------------------------------------------------------------------------- +# NAME TYPE DEFAULT MEANING +# ---------------------------------------------------------------------------- +# mnist_train String --- Input path of the MNIST training matrix +# mnist_test String --- Input path of the MNIST test matrix +# out_train String --- Output path of the extracted training rows +# out_test String --- Output path of the extracted test rows +# num_train Int 60000 Number of training rows to keep (capped at the rows available) +# num_test Int 10000 Number of test rows to keep (capped at the rows available) +# fmt Format specifier csv Format of output data +# ---------------------------------------------------------------------------- +# Example: +# hadoop jar SystemDS.jar -f extractMNISTData.dml -nvargs mnist_train=mnist_train.csv mnist_test=mnist_test.csv out_train=mnist_12000_train out_test=mnist_12000_test num_train=12000 num_test=2000 + +print ("BEGIN MNIST EXTRACTOR SCRIPT"); + +file_mnist_train = $mnist_train; +file_mnist_test = $mnist_test; +file_out_train = $out_train; +file_out_test = $out_test; +num_train = ifdef ($num_train, 60000); +num_test = ifdef ($num_test, 10000); +fmt = ifdef ($fmt, "csv"); + +mnist_train = read(file_mnist_train); +mnist_test = read(file_mnist_test); + +# stay in bounds +num_train = min(num_train, nrow(mnist_train)); +num_test = min(num_test, nrow(mnist_test)); + +# keep only the leading rows +# todo add shuffle? +out_train = mnist_train[1:num_train,]; +out_test = mnist_test[1:num_test,]; + + +print ("Writing out the resulting dataset..."); + +write (out_train, file_out_train, format=fmt); +write (out_test, file_out_test, format=fmt); + +print ("DONE: MNIST EXTRACTOR SCRIPT"); + diff --git a/scripts/datagen/genRandData4NCF.dml b/scripts/datagen/genRandData4NCF.dml new file mode 100644 index 00000000000..272e4ff4a74 --- /dev/null +++ b/scripts/datagen/genRandData4NCF.dml @@ -0,0 +1,84 @@ +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. 
See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +#------------------------------------------------------------- + +# +# Generates random dummy data for NCF (neural collaborative filtering): one-hot encoded user/item matrices and 0/1 targets +# +# INPUT PARAMETERS: +# ---------------------------------------------------------------------------- +# NAME TYPE DEFAULT MEANING +# ---------------------------------------------------------------------------- +# <part>_<split> String <part>_<split> Output file name/path, for part in {users, items, targets} and split in {train, val} +# ktrain Int 1000 Number of training samples +# kval Int 100 Number of validation samples +# nitems Int 50 Number of items +# nusers Int 60 Number of users +# fmt Format specifier csv Format of output data +# ---------------------------------------------------------------------------- +# +# Example: +# hadoop jar SystemDS.jar -f genRandData4NCF.dml -nvargs ktrain=1000 kval=100 nitems=50 nusers=60 + +print ("BEGIN NCF GENERATOR SCRIPT"); + +file_users_train = ifdef ($users_train, "users_train"); +file_items_train = ifdef ($items_train, "items_train"); +file_targets_train = ifdef ($targets_train, "targets_train"); +file_users_val = ifdef ($users_val, "users_val"); +file_items_val = ifdef ($items_val, "items_val"); +file_targets_val = ifdef ($targets_val, "targets_val"); +fmt = ifdef ($fmt, "csv"); + +# Generate input data +K_train = ifdef($ktrain, 1000); # number of training samples +K_val = 
ifdef($kval, 100); # number of validation samples + +N = ifdef($nitems, 50); # number of items +M = ifdef($nusers, 60); # number of users + +# targets: random values rounded to the nearest integer (rand is called without min/max here) +targets_train = round(rand(rows=K_train, cols=1)); +targets_val = round(rand(rows=K_val, cols=1)); + +# integer-encoded item / user ids: random values between 1 and N (items) or 1 and M (users), rounded +items_train_int_encoded = round(rand(rows=K_train, cols=1, min=1, max=N)); +users_train_int_encoded = round(rand(rows=K_train, cols=1, min=1, max=M)); +items_val_int_encoded = round(rand(rows=K_val, cols=1, min=1, max=N)); +users_val_int_encoded = round(rand(rows=K_val, cols=1, min=1, max=M)); + +# item / user matrices obtained by one-hot encoding the ids with N / M classes +items_train = toOneHot(items_train_int_encoded, N); +items_val = toOneHot(items_val_int_encoded, N); +users_train = toOneHot(users_train_int_encoded, M); +users_val = toOneHot(users_val_int_encoded, M); + + +print ("Writing out the resulting dataset..."); + +write (users_train, file_users_train, format=fmt); +write (items_train, file_items_train, format=fmt); +write (targets_train, file_targets_train, format=fmt); +write (users_val, file_users_val, format=fmt); +write (items_val, file_items_val, format=fmt); +write (targets_val, file_targets_val, format=fmt); + +print ("DONE: NCF GENERATOR SCRIPT"); + diff --git a/scripts/datagen/getMNISTDataset.sh b/scripts/datagen/getMNISTDataset.sh new file mode 100755 index 00000000000..e972a7cd0e0 --- /dev/null +++ b/scripts/datagen/getMNISTDataset.sh @@ -0,0 +1,66 @@ +#!/bin/bash +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +#------------------------------------------------------------- +# usage: getMNISTDataset.sh [target_dir] - downloads and unzips the MNIST csv files into target_dir (default: current directory) +BASE=$1 +if [ "$BASE" = "" ]; then BASE=$PWD; fi +RET=$PWD +if [ ! -d "$BASE" ]; then mkdir -p "$BASE"; fi +cd "$BASE" || exit + +echo "Downloading" +if [ ! -f "mnist_train.csv.zip" ]; then wget --no-check-certificate https://github.com/phoebetronic/mnist/raw/main/mnist_train.csv.zip; fi +if [ ! -f "mnist_test.csv.zip" ]; then wget --no-check-certificate https://github.com/phoebetronic/mnist/raw/main/mnist_test.csv.zip; fi + +echo "Unzipping" +unzip -u mnist_train.csv.zip +unzip -u mnist_test.csv.zip + +# the csv files come from an external source, so their .mtd metadata files have to be written by hand + +echo '{ + "data_type": "matrix", + "value_type": "double", + "rows": 60000, + "cols": 785, + "nnz": 0, + "format": "csv", + "author": "anon", + "header": false, + "sep": ",", + "created": "2023-06-26 18:35:22 CEST" + }' > mnist_train.csv.mtd + +echo '{ + "data_type": "matrix", + "value_type": "double", + "rows": 10000, + "cols": 785, + "nnz": 0, + "format": "csv", + "author": "nobody", + "header": false, + "sep": ",", + "created": "2023-06-26 18:35:22 CEST" + }' > mnist_test.csv.mtd + +cd "$RET" || exit +echo "Done" diff --git a/scripts/nn/README.md b/scripts/nn/README.md index e9a59a707f0..c73955576e0 100644 --- a/scripts/nn/README.md +++ b/scripts/nn/README.md @@ -55,8 +55,8 @@ iters = 1024 / batch_size for (e in 1:epochs) { for(i in 1:iters) { # Get next batch - X_batch = X[i:i+batch_size-1,] - y_batch = y[i:i+batch_size-1,] + X_batch = X[(i-1)*batch_size+1:i*batch_size,] + y_batch = Y[(i-1)*batch_size+1:i*batch_size,] # Compute 
forward pass out1 = affine::forward(X_batch, W1, b1) @@ -131,8 +131,8 @@ iters = 1024 / batch_size for (e in 1:epochs) { for(i in 1:iters) { # Get next batch - X_batch = X[i:i+batch_size-1,] - y_batch = y[i:i+batch_size-1,] + X_batch = X[(i-1)*batch_size+1:i*batch_size,] + y_batch = Y[(i-1)*batch_size+1:i*batch_size,] # Compute forward pass ## layer 1: diff --git a/scripts/nn/examples/ncf-dummy-data.dml b/scripts/nn/examples/ncf-dummy-data.dml index fff5f63042d..eeb420437f1 100644 --- a/scripts/nn/examples/ncf-dummy-data.dml +++ b/scripts/nn/examples/ncf-dummy-data.dml @@ -20,7 +20,7 @@ #------------------------------------------------------------- # Imports -source("staging/NCF.dml") as NCF +source("../../staging/NCF.dml") as NCF K_train = 1000; # number of training samples K_val = 100; # number of validation samples diff --git a/scripts/perftest/datagen/genMNISTData.sh b/scripts/perftest/datagen/genMNISTData.sh new file mode 100755 index 00000000000..7d34879b8bb --- /dev/null +++ b/scripts/perftest/datagen/genMNISTData.sh @@ -0,0 +1,134 @@ +#!/bin/bash +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+# +#------------------------------------------------------------- +if [ "$(basename "$PWD")" != "perftest" ]; then + echo "Please execute scripts from directory 'perftest'" + exit 1 +fi + +# force the dot as the decimal separator in the string representation of floating point numbers (the C locale is always available) +# this avoids an error where bc outputs results dot-separated but printf may expect floats comma-separated if the system default says so +export LC_NUMERIC=C + +CMD=$1 +DATADIR=$2/mnist +MAXMEM=$3 + +FORMAT="csv" # can be csv, mm, text, binary + +echo "-- Generating MNIST data." >>results/times.txt +#make sure whole MNIST is available +../datagen/getMNISTDataset.sh "${DATADIR}" + +mnist_train_filename="mnist_train.csv" +mnist_test_filename="mnist_test.csv" + +max_size_ordinal=4 +min_num_examples_train=12000 +max_num_examples_train=60000 +span_num_examples_train=$(echo "${max_num_examples_train} - ${min_num_examples_train}" | bc) +min_num_examples_test=2000 +max_num_examples_test=10000 +span_num_examples_test=$(echo "${max_num_examples_test} - ${min_num_examples_test}" | bc) +#generate XS scenarios (80MB) by producing a subset of MNIST (12000 train / 2000 test rows) +if [ $MAXMEM -ge 80 ]; then + size_ordinal=0 + percent_size=$(echo "scale=10; ${size_ordinal} / ${max_size_ordinal}" | bc) + # these python calls are here to show what the equivalent computations for the target_num variables do .. 
only difference is that printf "%.0f" doesn't round the float value down like floor but rounds it to the nearest integer value instead + # target_num_train=$(python -c "from math import floor; print(${min_num_examples_train} + floor(${span_num_examples_train} * ${percent_size}))") + target_num_train=$(echo "${min_num_examples_train} + $(printf "%.0f" "$(echo "${span_num_examples_train} * ${percent_size}" | bc)")" | bc) + # target_num_test=$(python -c "from math import floor; print(${min_num_examples_test} + floor(${span_num_examples_test} * ${percent_size}))") + target_num_test=$(echo "${min_num_examples_test} + $(printf "%.0f" "$(echo "${span_num_examples_test} * ${percent_size}" | bc)")" | bc) + ${CMD} -f ../datagen/extractMNISTData.dml --nvargs \ + mnist_train=${DATADIR}/${mnist_train_filename} \ + mnist_test=${DATADIR}/${mnist_test_filename} \ + out_train=${DATADIR}/mnist_${target_num_train}_train \ + out_test=${DATADIR}/mnist_${target_num_train}_test \ + num_train=${target_num_train} \ + num_test=${target_num_test} \ + fmt=${FORMAT} & +fi + +#generate S scenarios (800MB): 24000 train / 4000 test rows +if [ $MAXMEM -ge 800 ]; then + size_ordinal=1 + percent_size=$(echo "scale=10; ${size_ordinal} / ${max_size_ordinal}" | bc) + target_num_train=$(echo "${min_num_examples_train} + $(printf "%.0f" "$(echo "${span_num_examples_train} * ${percent_size}" | bc)")" | bc) + target_num_test=$(echo "${min_num_examples_test} + $(printf "%.0f" "$(echo "${span_num_examples_test} * ${percent_size}" | bc)")" | bc) + ${CMD} -f ../datagen/extractMNISTData.dml --nvargs \ + mnist_train=${DATADIR}/${mnist_train_filename} \ + mnist_test=${DATADIR}/${mnist_test_filename} \ + out_train=${DATADIR}/mnist_${target_num_train}_train \ + out_test=${DATADIR}/mnist_${target_num_train}_test \ + num_train=${target_num_train} \ + num_test=${target_num_test} \ + fmt=${FORMAT} & +fi + +#generate M scenarios (8GB): 36000 train / 6000 test rows +if [ $MAXMEM -ge 8000 ]; then + size_ordinal=2 + percent_size=$(echo "scale=10; ${size_ordinal} / ${max_size_ordinal}" 
| bc) + target_num_train=$(echo "${min_num_examples_train} + $(printf "%.0f" "$(echo "${span_num_examples_train} * ${percent_size}" | bc)")" | bc) + target_num_test=$(echo "${min_num_examples_test} + $(printf "%.0f" "$(echo "${span_num_examples_test} * ${percent_size}" | bc)")" | bc) + ${CMD} -f ../datagen/extractMNISTData.dml --nvargs \ + mnist_train=${DATADIR}/${mnist_train_filename} \ + mnist_test=${DATADIR}/${mnist_test_filename} \ + out_train=${DATADIR}/mnist_${target_num_train}_train \ + out_test=${DATADIR}/mnist_${target_num_train}_test \ + num_train=${target_num_train} \ + num_test=${target_num_test} \ + fmt=${FORMAT} & +fi + +#generate L scenarios (80GB): 48000 train / 8000 test rows +if [ $MAXMEM -ge 80000 ]; then + size_ordinal=3 + percent_size=$(echo "scale=10; ${size_ordinal} / ${max_size_ordinal}" | bc) + target_num_train=$(echo "${min_num_examples_train} + $(printf "%.0f" "$(echo "${span_num_examples_train} * ${percent_size}" | bc)")" | bc) + target_num_test=$(echo "${min_num_examples_test} + $(printf "%.0f" "$(echo "${span_num_examples_test} * ${percent_size}" | bc)")" | bc) + ${CMD} -f ../datagen/extractMNISTData.dml --nvargs \ + mnist_train=${DATADIR}/${mnist_train_filename} \ + mnist_test=${DATADIR}/${mnist_test_filename} \ + out_train=${DATADIR}/mnist_${target_num_train}_train \ + out_test=${DATADIR}/mnist_${target_num_train}_test \ + num_train=${target_num_train} \ + num_test=${target_num_test} \ + fmt=${FORMAT} & +fi + +#generate XL scenarios (800GB): 60000 train / 10000 test rows +if [ $MAXMEM -ge 800000 ]; then + size_ordinal=4 + percent_size=$(echo "scale=10; ${size_ordinal} / ${max_size_ordinal}" | bc) + target_num_train=$(echo "${min_num_examples_train} + $(printf "%.0f" "$(echo "${span_num_examples_train} * ${percent_size}" | bc)")" | bc) + target_num_test=$(echo "${min_num_examples_test} + $(printf "%.0f" "$(echo "${span_num_examples_test} * ${percent_size}" | bc)")" | bc) + ${CMD} -f ../datagen/extractMNISTData.dml --nvargs \ + mnist_train=${DATADIR}/${mnist_train_filename} \ + 
mnist_test=${DATADIR}/${mnist_test_filename} \ + out_train=${DATADIR}/mnist_${target_num_train}_train \ + out_test=${DATADIR}/mnist_${target_num_train}_test \ + num_train=${target_num_train} \ + num_test=${target_num_test} \ + fmt=${FORMAT} & +fi +# block until all extraction jobs started in the background have finished +wait diff --git a/scripts/perftest/datagen/genNCFData.sh b/scripts/perftest/datagen/genNCFData.sh new file mode 100755 index 00000000000..ae41ac828bb --- /dev/null +++ b/scripts/perftest/datagen/genNCFData.sh @@ -0,0 +1,144 @@ +#!/bin/bash +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +#------------------------------------------------------------- +if [ "$(basename "$PWD")" != "perftest" ]; +then + echo "Please execute scripts from directory 'perftest'" + exit 1; +fi + +CMD=$1 +DATADIR=$2/ncf +MAXMEM=$3 + +FORMAT="csv" # can be csv, mm, text, binary +# base sizes; every scenario multiplies each of them by its MULTIPLIER +BASE_ktrain=1000 +BASE_kval=100 +BASE_nitems=50 +BASE_nusers=60 +echo "-- Generating NCF data." 
>> results/times.txt; +#generate XS scenarios (80MB): 1000 train / 100 val samples, 50 items, 60 users +if [ $MAXMEM -ge 80 ]; then + MULTIPLIER=1 + KTRAIN=$(echo "$BASE_ktrain * $MULTIPLIER" | bc) + KVAL=$(echo "$BASE_kval * $MULTIPLIER" | bc) + NITEMS=$(echo "$BASE_nitems * $MULTIPLIER" | bc) + NUSERS=$(echo "$BASE_nusers * $MULTIPLIER" | bc) + ${CMD} -f ../datagen/genRandData4NCF.dml --nvargs \ + users_train=${DATADIR}/Ut${KTRAIN}_${KVAL}_${NITEMS}_${NUSERS} \ + items_train=${DATADIR}/It${KTRAIN}_${KVAL}_${NITEMS}_${NUSERS} \ + targets_train=${DATADIR}/Tt${KTRAIN}_${KVAL}_${NITEMS}_${NUSERS} \ + users_val=${DATADIR}/Uv${KTRAIN}_${KVAL}_${NITEMS}_${NUSERS} \ + items_val=${DATADIR}/Iv${KTRAIN}_${KVAL}_${NITEMS}_${NUSERS} \ + targets_val=${DATADIR}/Tv${KTRAIN}_${KVAL}_${NITEMS}_${NUSERS} \ + fmt=${FORMAT} \ + ktrain=${KTRAIN} \ + kval=${KVAL} \ + nitems=${NITEMS} \ + nusers=${NUSERS} & +fi + +#generate S scenarios (800MB): 3000 train / 300 val samples, 150 items, 180 users +if [ $MAXMEM -ge 800 ]; then + MULTIPLIER=3 + KTRAIN=$(echo "$BASE_ktrain * $MULTIPLIER" | bc) + KVAL=$(echo "$BASE_kval * $MULTIPLIER" | bc) + NITEMS=$(echo "$BASE_nitems * $MULTIPLIER" | bc) + NUSERS=$(echo "$BASE_nusers * $MULTIPLIER" | bc) + ${CMD} -f ../datagen/genRandData4NCF.dml --nvargs \ + users_train=${DATADIR}/Ut${KTRAIN}_${KVAL}_${NITEMS}_${NUSERS} \ + items_train=${DATADIR}/It${KTRAIN}_${KVAL}_${NITEMS}_${NUSERS} \ + targets_train=${DATADIR}/Tt${KTRAIN}_${KVAL}_${NITEMS}_${NUSERS} \ + users_val=${DATADIR}/Uv${KTRAIN}_${KVAL}_${NITEMS}_${NUSERS} \ + items_val=${DATADIR}/Iv${KTRAIN}_${KVAL}_${NITEMS}_${NUSERS} \ + targets_val=${DATADIR}/Tv${KTRAIN}_${KVAL}_${NITEMS}_${NUSERS} \ + fmt=${FORMAT} \ + ktrain=${KTRAIN} \ + kval=${KVAL} \ + nitems=${NITEMS} \ + nusers=${NUSERS} & +fi + +#generate M scenarios (8GB): 9000 train / 900 val samples, 450 items, 540 users +if [ $MAXMEM -ge 8000 ]; then + MULTIPLIER=9 + KTRAIN=$(echo "$BASE_ktrain * $MULTIPLIER" | bc) + KVAL=$(echo "$BASE_kval * $MULTIPLIER" | bc) + NITEMS=$(echo "$BASE_nitems * $MULTIPLIER" | bc) + NUSERS=$(echo "$BASE_nusers * $MULTIPLIER" | bc) + ${CMD} -f ../datagen/genRandData4NCF.dml 
--nvargs \ + users_train=${DATADIR}/Ut${KTRAIN}_${KVAL}_${NITEMS}_${NUSERS} \ + items_train=${DATADIR}/It${KTRAIN}_${KVAL}_${NITEMS}_${NUSERS} \ + targets_train=${DATADIR}/Tt${KTRAIN}_${KVAL}_${NITEMS}_${NUSERS} \ + users_val=${DATADIR}/Uv${KTRAIN}_${KVAL}_${NITEMS}_${NUSERS} \ + items_val=${DATADIR}/Iv${KTRAIN}_${KVAL}_${NITEMS}_${NUSERS} \ + targets_val=${DATADIR}/Tv${KTRAIN}_${KVAL}_${NITEMS}_${NUSERS} \ + fmt=${FORMAT} \ + ktrain=${KTRAIN} \ + kval=${KVAL} \ + nitems=${NITEMS} \ + nusers=${NUSERS} & +fi + +#generate L scenarios (80GB): 27000 train / 2700 val samples, 1350 items, 1620 users +if [ $MAXMEM -ge 80000 ]; then + MULTIPLIER=27 + KTRAIN=$(echo "$BASE_ktrain * $MULTIPLIER" | bc) + KVAL=$(echo "$BASE_kval * $MULTIPLIER" | bc) + NITEMS=$(echo "$BASE_nitems * $MULTIPLIER" | bc) + NUSERS=$(echo "$BASE_nusers * $MULTIPLIER" | bc) + ${CMD} -f ../datagen/genRandData4NCF.dml --nvargs \ + users_train=${DATADIR}/Ut${KTRAIN}_${KVAL}_${NITEMS}_${NUSERS} \ + items_train=${DATADIR}/It${KTRAIN}_${KVAL}_${NITEMS}_${NUSERS} \ + targets_train=${DATADIR}/Tt${KTRAIN}_${KVAL}_${NITEMS}_${NUSERS} \ + users_val=${DATADIR}/Uv${KTRAIN}_${KVAL}_${NITEMS}_${NUSERS} \ + items_val=${DATADIR}/Iv${KTRAIN}_${KVAL}_${NITEMS}_${NUSERS} \ + targets_val=${DATADIR}/Tv${KTRAIN}_${KVAL}_${NITEMS}_${NUSERS} \ + fmt=${FORMAT} \ + ktrain=${KTRAIN} \ + kval=${KVAL} \ + nitems=${NITEMS} \ + nusers=${NUSERS} & +fi + +#generate XL scenarios (800GB): 81000 train / 8100 val samples, 4050 items, 4860 users +if [ $MAXMEM -ge 800000 ]; then + MULTIPLIER=81 + KTRAIN=$(echo "$BASE_ktrain * $MULTIPLIER" | bc) + KVAL=$(echo "$BASE_kval * $MULTIPLIER" | bc) + NITEMS=$(echo "$BASE_nitems * $MULTIPLIER" | bc) + NUSERS=$(echo "$BASE_nusers * $MULTIPLIER" | bc) + ${CMD} -f ../datagen/genRandData4NCF.dml --nvargs \ + users_train=${DATADIR}/Ut${KTRAIN}_${KVAL}_${NITEMS}_${NUSERS} \ + items_train=${DATADIR}/It${KTRAIN}_${KVAL}_${NITEMS}_${NUSERS} \ + targets_train=${DATADIR}/Tt${KTRAIN}_${KVAL}_${NITEMS}_${NUSERS} \ + users_val=${DATADIR}/Uv${KTRAIN}_${KVAL}_${NITEMS}_${NUSERS} \ + 
items_val=${DATADIR}/Iv${KTRAIN}_${KVAL}_${NITEMS}_${NUSERS} \ + targets_val=${DATADIR}/Tv${KTRAIN}_${KVAL}_${NITEMS}_${NUSERS} \ + fmt=${FORMAT} \ + ktrain=${KTRAIN} \ + kval=${KVAL} \ + nitems=${NITEMS} \ + nusers=${NUSERS} & +fi +# block until all generator jobs started in the background have finished +wait diff --git a/scripts/perftest/datagen/genNNData.sh b/scripts/perftest/datagen/genNNData.sh new file mode 100755 index 00000000000..95cd3ac4d54 --- /dev/null +++ b/scripts/perftest/datagen/genNNData.sh @@ -0,0 +1,531 @@ +#!/bin/bash +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +#------------------------------------------------------------- +if [ "$(basename $PWD)" != "perftest" ]; then + echo "Please execute scripts from directory 'perftest'" + exit 1 +fi + +CMD=$1 +DATADIR=$2/nn +MAXMEM=$3 + +FORMAT="csv" # can be csv, mm, text, binary + +DENSE_SP=0.9 +SPARSE_SP=0.01 +BASE_REG_SAMPLES=1024 +BASE_REG_FEATRUES=100 +BASE_CLASS_SAMPLES=1024 +BASE_CLASS_FEATURES=100 +BASE_CLASS_CLASSES=5 + +# the scaling of nr and nf is to just multiply them by 3 each .. since sqrt(10) is about 3 and the data size should scale by a factor of 10 ..... 
needs to be tested for applicability +# for now only t=1 and t=5 are generated for regression and classification respectively .. may want to add more variety +# todo make test data +# todo generated data is too small with current parameters .. X data for xs is 2mb, s is 18mb -> pump it up +echo "-- Generating NN data." >>results/times.txt +#generate XS scenarios (80MB) +if [ $MAXMEM -ge 80 ]; then + # set multiplier and calculate resulting parameters + MULTIPLIER=1 + REG_SAMPLES=$(echo "$BASE_REG_SAMPLES * $MULTIPLIER" | bc) + REG_FEATURES=$(echo "$BASE_REG_FEATRUES * $MULTIPLIER" | bc) + CLASS_SAMPLES=$(echo "$BASE_CLASS_SAMPLES * $MULTIPLIER" | bc) + CLASS_FEATURES=$(echo "$BASE_CLASS_FEATURES * $MULTIPLIER" | bc) + CLASS_CLASSES=$(echo "$BASE_CLASS_CLASSES * $MULTIPLIER" | bc) + + ## generate regression data + ${CMD} -f ../datagen/genRandData4LogisticRegression.dml --args \ + ${REG_SAMPLES} \ + ${REG_FEATURES} \ + 1 \ + 1 \ + ${DATADIR}/w${REG_SAMPLES}_${REG_FEATURES}_reg_dense \ + ${DATADIR}/X${REG_SAMPLES}_${REG_FEATURES}_reg_dense \ + ${DATADIR}/Y${REG_SAMPLES}_${REG_FEATURES}_reg_dense \ + 1 \ + 0 \ + ${DENSE_SP} \ + ${FORMAT} \ + 0 & + pidDense80=$! + + ${CMD} -f ../datagen/genRandData4LogisticRegression.dml --args \ + ${REG_SAMPLES} \ + ${REG_FEATURES} \ + 1 \ + 1 \ + ${DATADIR}/w${REG_SAMPLES}_${REG_FEATURES}_reg_sparse \ + ${DATADIR}/X${REG_SAMPLES}_${REG_FEATURES}_reg_sparse \ + ${DATADIR}/Y${REG_SAMPLES}_${REG_FEATURES}_reg_sparse \ + 1 \ + 0 \ + ${SPARSE_SP} \ + ${FORMAT} \ + 0 & + pidSparse80=$! 
+ + wait $pidDense80 + ${CMD} -f scripts/extractTestData.dml --args \ + ${DATADIR}/X${REG_SAMPLES}_${REG_FEATURES}_reg_dense \ + ${DATADIR}/Y${REG_SAMPLES}_${REG_FEATURES}_reg_dense \ + ${DATADIR}/X${REG_SAMPLES}_${REG_FEATURES}_reg_dense_test \ + ${DATADIR}/Y${REG_SAMPLES}_${REG_FEATURES}_reg_dense_test \ + ${FORMAT} & + + wait $pidSparse80 + ${CMD} -f scripts/extractTestData.dml --args \ + ${DATADIR}/X${REG_SAMPLES}_${REG_FEATURES}_reg_sparse \ + ${DATADIR}/Y${REG_SAMPLES}_${REG_FEATURES}_reg_sparse \ + ${DATADIR}/X${REG_SAMPLES}_${REG_FEATURES}_reg_sparse_test \ + ${DATADIR}/Y${REG_SAMPLES}_${REG_FEATURES}_reg_sparse_test \ + ${FORMAT} & + + ## generate classification data + ${CMD} -f ../datagen/genRandData4Multinomial.dml --args \ + ${CLASS_SAMPLES} \ + ${CLASS_FEATURES} \ + ${DENSE_SP} \ + ${CLASS_CLASSES} \ + 0 \ + ${DATADIR}/X${CLASS_SAMPLES}_${CLASS_FEATURES}_${CLASS_CLASSES}_class_dense \ + ${DATADIR}/Y${CLASS_SAMPLES}_${CLASS_FEATURES}_${CLASS_CLASSES}_class_dense \ + ${FORMAT} & + pidDense80=$! + + ${CMD} -f ../datagen/genRandData4Multinomial.dml --args \ + ${CLASS_SAMPLES} \ + ${CLASS_FEATURES} \ + ${SPARSE_SP} \ + ${CLASS_CLASSES} \ + 0 \ + ${DATADIR}/X${CLASS_SAMPLES}_${CLASS_FEATURES}_${CLASS_CLASSES}_class_sparse \ + ${DATADIR}/Y${CLASS_SAMPLES}_${CLASS_FEATURES}_${CLASS_CLASSES}_class_sparse \ + ${FORMAT} & + pidSparse80=$! 
+ + wait $pidDense80 + ${CMD} -f scripts/extractTestData.dml --args \ + ${DATADIR}/X${CLASS_SAMPLES}_${CLASS_FEATURES}_${CLASS_CLASSES}_class_dense \ + ${DATADIR}/Y${CLASS_SAMPLES}_${CLASS_FEATURES}_${CLASS_CLASSES}_class_dense \ + ${DATADIR}/X${CLASS_SAMPLES}_${CLASS_FEATURES}_${CLASS_CLASSES}_class_dense_test \ + ${DATADIR}/Y${CLASS_SAMPLES}_${CLASS_FEATURES}_${CLASS_CLASSES}_class_dense_test \ + ${FORMAT} & + + wait $pidSparse80 + ${CMD} -f scripts/extractTestData.dml --args \ + ${DATADIR}/X${CLASS_SAMPLES}_${CLASS_FEATURES}_${CLASS_CLASSES}_class_sparse \ + ${DATADIR}/Y${CLASS_SAMPLES}_${CLASS_FEATURES}_${CLASS_CLASSES}_class_sparse \ + ${DATADIR}/X${CLASS_SAMPLES}_${CLASS_FEATURES}_${CLASS_CLASSES}_class_sparse_test \ + ${DATADIR}/Y${CLASS_SAMPLES}_${CLASS_FEATURES}_${CLASS_CLASSES}_class_sparse_test \ + ${FORMAT} & +fi + +#generate S scenarios (800MB) +if [ $MAXMEM -ge 800 ]; then + # set multiplier and calculate resulting parameters + MULTIPLIER=3 + REG_SAMPLES=$(echo "$BASE_REG_SAMPLES * $MULTIPLIER" | bc) + REG_FEATURES=$(echo "$BASE_REG_FEATRUES * $MULTIPLIER" | bc) + CLASS_SAMPLES=$(echo "$BASE_CLASS_SAMPLES * $MULTIPLIER" | bc) + CLASS_FEATURES=$(echo "$BASE_CLASS_FEATURES * $MULTIPLIER" | bc) + CLASS_CLASSES=$(echo "$BASE_CLASS_CLASSES * $MULTIPLIER" | bc) + + ## generate regression data + ${CMD} -f ../datagen/genRandData4LogisticRegression.dml --args \ + ${REG_SAMPLES} \ + ${REG_FEATURES} \ + 5 \ + 5 \ + ${DATADIR}/w${REG_SAMPLES}_${REG_FEATURES}_reg_dense \ + ${DATADIR}/X${REG_SAMPLES}_${REG_FEATURES}_reg_dense \ + ${DATADIR}/Y${REG_SAMPLES}_${REG_FEATURES}_reg_dense \ + 1 \ + 0 \ + ${DENSE_SP} \ + ${FORMAT} \ + 0 & + pidDense80=$! 
+ + ${CMD} -f ../datagen/genRandData4LogisticRegression.dml --args \ + ${REG_SAMPLES} \ + ${REG_FEATURES} \ + 5 \ + 5 \ + ${DATADIR}/w${REG_SAMPLES}_${REG_FEATURES}_reg_sparse \ + ${DATADIR}/X${REG_SAMPLES}_${REG_FEATURES}_reg_sparse \ + ${DATADIR}/Y${REG_SAMPLES}_${REG_FEATURES}_reg_sparse \ + 1 \ + 0 \ + ${SPARSE_SP} \ + ${FORMAT} \ + 0 & + pidSparse80=$! + + wait $pidDense80 + ${CMD} -f scripts/extractTestData.dml --args \ + ${DATADIR}/X${REG_SAMPLES}_${REG_FEATURES}_reg_dense \ + ${DATADIR}/Y${REG_SAMPLES}_${REG_FEATURES}_reg_dense \ + ${DATADIR}/X${REG_SAMPLES}_${REG_FEATURES}_reg_dense_test \ + ${DATADIR}/Y${REG_SAMPLES}_${REG_FEATURES}_reg_dense_test \ + ${FORMAT} & + + wait $pidSparse80 + ${CMD} -f scripts/extractTestData.dml --args \ + ${DATADIR}/X${REG_SAMPLES}_${REG_FEATURES}_reg_sparse \ + ${DATADIR}/Y${REG_SAMPLES}_${REG_FEATURES}_reg_sparse \ + ${DATADIR}/X${REG_SAMPLES}_${REG_FEATURES}_reg_sparse_test \ + ${DATADIR}/Y${REG_SAMPLES}_${REG_FEATURES}_reg_sparse_test \ + ${FORMAT} & + + ## generate classification data + ${CMD} -f ../datagen/genRandData4Multinomial.dml --args \ + ${CLASS_SAMPLES} \ + ${CLASS_FEATURES} \ + ${DENSE_SP} \ + ${CLASS_CLASSES} \ + 0 \ + ${DATADIR}/X${CLASS_SAMPLES}_${CLASS_FEATURES}_${CLASS_CLASSES}_class_dense \ + ${DATADIR}/Y${CLASS_SAMPLES}_${CLASS_FEATURES}_${CLASS_CLASSES}_class_dense \ + ${FORMAT} & + pidDense80=$! + + ${CMD} -f ../datagen/genRandData4Multinomial.dml --args \ + ${CLASS_SAMPLES} \ + ${CLASS_FEATURES} \ + ${SPARSE_SP} \ + ${CLASS_CLASSES} \ + 0 \ + ${DATADIR}/X${CLASS_SAMPLES}_${CLASS_FEATURES}_${CLASS_CLASSES}_class_sparse \ + ${DATADIR}/Y${CLASS_SAMPLES}_${CLASS_FEATURES}_${CLASS_CLASSES}_class_sparse \ + ${FORMAT} & + pidSparse80=$! 
+ + wait $pidDense80 + ${CMD} -f scripts/extractTestData.dml --args \ + ${DATADIR}/X${CLASS_SAMPLES}_${CLASS_FEATURES}_${CLASS_CLASSES}_class_dense \ + ${DATADIR}/Y${CLASS_SAMPLES}_${CLASS_FEATURES}_${CLASS_CLASSES}_class_dense \ + ${DATADIR}/X${CLASS_SAMPLES}_${CLASS_FEATURES}_${CLASS_CLASSES}_class_dense_test \ + ${DATADIR}/Y${CLASS_SAMPLES}_${CLASS_FEATURES}_${CLASS_CLASSES}_class_dense_test \ + ${FORMAT} & + + wait $pidSparse80 + ${CMD} -f scripts/extractTestData.dml --args \ + ${DATADIR}/X${CLASS_SAMPLES}_${CLASS_FEATURES}_${CLASS_CLASSES}_class_sparse \ + ${DATADIR}/Y${CLASS_SAMPLES}_${CLASS_FEATURES}_${CLASS_CLASSES}_class_sparse \ + ${DATADIR}/X${CLASS_SAMPLES}_${CLASS_FEATURES}_${CLASS_CLASSES}_class_sparse_test \ + ${DATADIR}/Y${CLASS_SAMPLES}_${CLASS_FEATURES}_${CLASS_CLASSES}_class_sparse_test \ + ${FORMAT} & +fi + +#generate M scenarios (8GB) +if [ $MAXMEM -ge 8000 ]; then + # set multiplier and calculate resulting parameters + MULTIPLIER=9 + REG_SAMPLES=$(echo "$BASE_REG_SAMPLES * $MULTIPLIER" | bc) + REG_FEATURES=$(echo "$BASE_REG_FEATRUES * $MULTIPLIER" | bc) + CLASS_SAMPLES=$(echo "$BASE_CLASS_SAMPLES * $MULTIPLIER" | bc) + CLASS_FEATURES=$(echo "$BASE_CLASS_FEATURES * $MULTIPLIER" | bc) + CLASS_CLASSES=$(echo "$BASE_CLASS_CLASSES * $MULTIPLIER" | bc) + + ## generate regression data + ${CMD} -f ../datagen/genRandData4LogisticRegression.dml --args \ + ${REG_SAMPLES} \ + ${REG_FEATURES} \ + 5 \ + 5 \ + ${DATADIR}/w${REG_SAMPLES}_${REG_FEATURES}_reg_dense \ + ${DATADIR}/X${REG_SAMPLES}_${REG_FEATURES}_reg_dense \ + ${DATADIR}/Y${REG_SAMPLES}_${REG_FEATURES}_reg_dense \ + 1 \ + 0 \ + ${DENSE_SP} \ + ${FORMAT} \ + 0 & + pidDense80=$! 
+ + ${CMD} -f ../datagen/genRandData4LogisticRegression.dml --args \ + ${REG_SAMPLES} \ + ${REG_FEATURES} \ + 5 \ + 5 \ + ${DATADIR}/w${REG_SAMPLES}_${REG_FEATURES}_reg_sparse \ + ${DATADIR}/X${REG_SAMPLES}_${REG_FEATURES}_reg_sparse \ + ${DATADIR}/Y${REG_SAMPLES}_${REG_FEATURES}_reg_sparse \ + 1 \ + 0 \ + ${SPARSE_SP} \ + ${FORMAT} \ + 0 & + pidSparse80=$! + + wait $pidDense80 + ${CMD} -f scripts/extractTestData.dml --args \ + ${DATADIR}/X${REG_SAMPLES}_${REG_FEATURES}_reg_dense \ + ${DATADIR}/Y${REG_SAMPLES}_${REG_FEATURES}_reg_dense \ + ${DATADIR}/X${REG_SAMPLES}_${REG_FEATURES}_reg_dense_test \ + ${DATADIR}/Y${REG_SAMPLES}_${REG_FEATURES}_reg_dense_test \ + ${FORMAT} & + + wait $pidSparse80 + ${CMD} -f scripts/extractTestData.dml --args \ + ${DATADIR}/X${REG_SAMPLES}_${REG_FEATURES}_reg_sparse \ + ${DATADIR}/Y${REG_SAMPLES}_${REG_FEATURES}_reg_sparse \ + ${DATADIR}/X${REG_SAMPLES}_${REG_FEATURES}_reg_sparse_test \ + ${DATADIR}/Y${REG_SAMPLES}_${REG_FEATURES}_reg_sparse_test \ + ${FORMAT} & + + ## generate classification data + ${CMD} -f ../datagen/genRandData4Multinomial.dml --args \ + ${CLASS_SAMPLES} \ + ${CLASS_FEATURES} \ + ${DENSE_SP} \ + ${CLASS_CLASSES} \ + 0 \ + ${DATADIR}/X${CLASS_SAMPLES}_${CLASS_FEATURES}_${CLASS_CLASSES}_class_dense \ + ${DATADIR}/Y${CLASS_SAMPLES}_${CLASS_FEATURES}_${CLASS_CLASSES}_class_dense \ + ${FORMAT} & + pidDense80=$! + + ${CMD} -f ../datagen/genRandData4Multinomial.dml --args \ + ${CLASS_SAMPLES} \ + ${CLASS_FEATURES} \ + ${SPARSE_SP} \ + ${CLASS_CLASSES} \ + 0 \ + ${DATADIR}/X${CLASS_SAMPLES}_${CLASS_FEATURES}_${CLASS_CLASSES}_class_sparse \ + ${DATADIR}/Y${CLASS_SAMPLES}_${CLASS_FEATURES}_${CLASS_CLASSES}_class_sparse \ + ${FORMAT} & + pidSparse80=$! 
+ + wait $pidDense80 + ${CMD} -f scripts/extractTestData.dml --args \ + ${DATADIR}/X${CLASS_SAMPLES}_${CLASS_FEATURES}_${CLASS_CLASSES}_class_dense \ + ${DATADIR}/Y${CLASS_SAMPLES}_${CLASS_FEATURES}_${CLASS_CLASSES}_class_dense \ + ${DATADIR}/X${CLASS_SAMPLES}_${CLASS_FEATURES}_${CLASS_CLASSES}_class_dense_test \ + ${DATADIR}/Y${CLASS_SAMPLES}_${CLASS_FEATURES}_${CLASS_CLASSES}_class_dense_test \ + ${FORMAT} & + + wait $pidSparse80 + ${CMD} -f scripts/extractTestData.dml --args \ + ${DATADIR}/X${CLASS_SAMPLES}_${CLASS_FEATURES}_${CLASS_CLASSES}_class_sparse \ + ${DATADIR}/Y${CLASS_SAMPLES}_${CLASS_FEATURES}_${CLASS_CLASSES}_class_sparse \ + ${DATADIR}/X${CLASS_SAMPLES}_${CLASS_FEATURES}_${CLASS_CLASSES}_class_sparse_test \ + ${DATADIR}/Y${CLASS_SAMPLES}_${CLASS_FEATURES}_${CLASS_CLASSES}_class_sparse_test \ + ${FORMAT} & +fi + +#generate L scenarios (80GB) +if [ $MAXMEM -ge 80000 ]; then + # set multiplier and calculate resulting parameters + MULTIPLIER=27 + REG_SAMPLES=$(echo "$BASE_REG_SAMPLES * $MULTIPLIER" | bc) + REG_FEATURES=$(echo "$BASE_REG_FEATRUES * $MULTIPLIER" | bc) + CLASS_SAMPLES=$(echo "$BASE_CLASS_SAMPLES * $MULTIPLIER" | bc) + CLASS_FEATURES=$(echo "$BASE_CLASS_FEATURES * $MULTIPLIER" | bc) + CLASS_CLASSES=$(echo "$BASE_CLASS_CLASSES * $MULTIPLIER" | bc) + + ## generate regression data + ${CMD} -f ../datagen/genRandData4LogisticRegression.dml --args \ + ${REG_SAMPLES} \ + ${REG_FEATURES} \ + 5 \ + 5 \ + ${DATADIR}/w${REG_SAMPLES}_${REG_FEATURES}_reg_dense \ + ${DATADIR}/X${REG_SAMPLES}_${REG_FEATURES}_reg_dense \ + ${DATADIR}/Y${REG_SAMPLES}_${REG_FEATURES}_reg_dense \ + 1 \ + 0 \ + ${DENSE_SP} \ + ${FORMAT} \ + 0 & + pidDense80=$! 
+ + ${CMD} -f ../datagen/genRandData4LogisticRegression.dml --args \ + ${REG_SAMPLES} \ + ${REG_FEATURES} \ + 5 \ + 5 \ + ${DATADIR}/w${REG_SAMPLES}_${REG_FEATURES}_reg_sparse \ + ${DATADIR}/X${REG_SAMPLES}_${REG_FEATURES}_reg_sparse \ + ${DATADIR}/Y${REG_SAMPLES}_${REG_FEATURES}_reg_sparse \ + 1 \ + 0 \ + ${SPARSE_SP} \ + ${FORMAT} \ + 0 & + pidSparse80=$! + + wait $pidDense80 + ${CMD} -f scripts/extractTestData.dml --args \ + ${DATADIR}/X${REG_SAMPLES}_${REG_FEATURES}_reg_dense \ + ${DATADIR}/Y${REG_SAMPLES}_${REG_FEATURES}_reg_dense \ + ${DATADIR}/X${REG_SAMPLES}_${REG_FEATURES}_reg_dense_test \ + ${DATADIR}/Y${REG_SAMPLES}_${REG_FEATURES}_reg_dense_test \ + ${FORMAT} & + + wait $pidSparse80 + ${CMD} -f scripts/extractTestData.dml --args \ + ${DATADIR}/X${REG_SAMPLES}_${REG_FEATURES}_reg_sparse \ + ${DATADIR}/Y${REG_SAMPLES}_${REG_FEATURES}_reg_sparse \ + ${DATADIR}/X${REG_SAMPLES}_${REG_FEATURES}_reg_sparse_test \ + ${DATADIR}/Y${REG_SAMPLES}_${REG_FEATURES}_reg_sparse_test \ + ${FORMAT} & + + ## generate classification data + ${CMD} -f ../datagen/genRandData4Multinomial.dml --args \ + ${CLASS_SAMPLES} \ + ${CLASS_FEATURES} \ + ${DENSE_SP} \ + ${CLASS_CLASSES} \ + 0 \ + ${DATADIR}/X${CLASS_SAMPLES}_${CLASS_FEATURES}_${CLASS_CLASSES}_class_dense \ + ${DATADIR}/Y${CLASS_SAMPLES}_${CLASS_FEATURES}_${CLASS_CLASSES}_class_dense \ + ${FORMAT} & + pidDense80=$! + + ${CMD} -f ../datagen/genRandData4Multinomial.dml --args \ + ${CLASS_SAMPLES} \ + ${CLASS_FEATURES} \ + ${SPARSE_SP} \ + ${CLASS_CLASSES} \ + 0 \ + ${DATADIR}/X${CLASS_SAMPLES}_${CLASS_FEATURES}_${CLASS_CLASSES}_class_sparse \ + ${DATADIR}/Y${CLASS_SAMPLES}_${CLASS_FEATURES}_${CLASS_CLASSES}_class_sparse \ + ${FORMAT} & + pidSparse80=$! 
+ + wait $pidDense80 + ${CMD} -f scripts/extractTestData.dml --args \ + ${DATADIR}/X${CLASS_SAMPLES}_${CLASS_FEATURES}_${CLASS_CLASSES}_class_dense \ + ${DATADIR}/Y${CLASS_SAMPLES}_${CLASS_FEATURES}_${CLASS_CLASSES}_class_dense \ + ${DATADIR}/X${CLASS_SAMPLES}_${CLASS_FEATURES}_${CLASS_CLASSES}_class_dense_test \ + ${DATADIR}/Y${CLASS_SAMPLES}_${CLASS_FEATURES}_${CLASS_CLASSES}_class_dense_test \ + ${FORMAT} & + + wait $pidSparse80 + ${CMD} -f scripts/extractTestData.dml --args \ + ${DATADIR}/X${CLASS_SAMPLES}_${CLASS_FEATURES}_${CLASS_CLASSES}_class_sparse \ + ${DATADIR}/Y${CLASS_SAMPLES}_${CLASS_FEATURES}_${CLASS_CLASSES}_class_sparse \ + ${DATADIR}/X${CLASS_SAMPLES}_${CLASS_FEATURES}_${CLASS_CLASSES}_class_sparse_test \ + ${DATADIR}/Y${CLASS_SAMPLES}_${CLASS_FEATURES}_${CLASS_CLASSES}_class_sparse_test \ + ${FORMAT} & +fi + +#generate XL scenarios (800GB) +if [ $MAXMEM -ge 800000 ]; then + # set multiplier and calculate resulting parameters + MULTIPLIER=81 + REG_SAMPLES=$(echo "$BASE_REG_SAMPLES * $MULTIPLIER" | bc) + REG_FEATURES=$(echo "$BASE_REG_FEATRUES * $MULTIPLIER" | bc) + CLASS_SAMPLES=$(echo "$BASE_CLASS_SAMPLES * $MULTIPLIER" | bc) + CLASS_FEATURES=$(echo "$BASE_CLASS_FEATURES * $MULTIPLIER" | bc) + CLASS_CLASSES=$(echo "$BASE_CLASS_CLASSES * $MULTIPLIER" | bc) + + ## generate regression data + ${CMD} -f ../datagen/genRandData4LogisticRegression.dml --args \ + ${REG_SAMPLES} \ + ${REG_FEATURES} \ + 5 \ + 5 \ + ${DATADIR}/w${REG_SAMPLES}_${REG_FEATURES}_reg_dense \ + ${DATADIR}/X${REG_SAMPLES}_${REG_FEATURES}_reg_dense \ + ${DATADIR}/Y${REG_SAMPLES}_${REG_FEATURES}_reg_dense \ + 1 \ + 0 \ + ${DENSE_SP} \ + ${FORMAT} \ + 0 & + pidDense80=$! 
+ + ${CMD} -f ../datagen/genRandData4LogisticRegression.dml --args \ + ${REG_SAMPLES} \ + ${REG_FEATURES} \ + 5 \ + 5 \ + ${DATADIR}/w${REG_SAMPLES}_${REG_FEATURES}_reg_sparse \ + ${DATADIR}/X${REG_SAMPLES}_${REG_FEATURES}_reg_sparse \ + ${DATADIR}/Y${REG_SAMPLES}_${REG_FEATURES}_reg_sparse \ + 1 \ + 0 \ + ${SPARSE_SP} \ + ${FORMAT} \ + 0 & + pidSparse80=$! + + wait $pidDense80 + ${CMD} -f scripts/extractTestData.dml --args \ + ${DATADIR}/X${REG_SAMPLES}_${REG_FEATURES}_reg_dense \ + ${DATADIR}/Y${REG_SAMPLES}_${REG_FEATURES}_reg_dense \ + ${DATADIR}/X${REG_SAMPLES}_${REG_FEATURES}_reg_dense_test \ + ${DATADIR}/Y${REG_SAMPLES}_${REG_FEATURES}_reg_dense_test \ + ${FORMAT} & + + wait $pidSparse80 + ${CMD} -f scripts/extractTestData.dml --args \ + ${DATADIR}/X${REG_SAMPLES}_${REG_FEATURES}_reg_sparse \ + ${DATADIR}/Y${REG_SAMPLES}_${REG_FEATURES}_reg_sparse \ + ${DATADIR}/X${REG_SAMPLES}_${REG_FEATURES}_reg_sparse_test \ + ${DATADIR}/Y${REG_SAMPLES}_${REG_FEATURES}_reg_sparse_test \ + ${FORMAT} & + + ## generate classification data + ${CMD} -f ../datagen/genRandData4Multinomial.dml --args \ + ${CLASS_SAMPLES} \ + ${CLASS_FEATURES} \ + ${DENSE_SP} \ + ${CLASS_CLASSES} \ + 0 \ + ${DATADIR}/X${CLASS_SAMPLES}_${CLASS_FEATURES}_${CLASS_CLASSES}_class_dense \ + ${DATADIR}/Y${CLASS_SAMPLES}_${CLASS_FEATURES}_${CLASS_CLASSES}_class_dense \ + ${FORMAT} & + pidDense80=$! + + ${CMD} -f ../datagen/genRandData4Multinomial.dml --args \ + ${CLASS_SAMPLES} \ + ${CLASS_FEATURES} \ + ${SPARSE_SP} \ + ${CLASS_CLASSES} \ + 0 \ + ${DATADIR}/X${CLASS_SAMPLES}_${CLASS_FEATURES}_${CLASS_CLASSES}_class_sparse \ + ${DATADIR}/Y${CLASS_SAMPLES}_${CLASS_FEATURES}_${CLASS_CLASSES}_class_sparse \ + ${FORMAT} & + pidSparse80=$! 
+ + wait $pidDense80 + ${CMD} -f scripts/extractTestData.dml --args \ + ${DATADIR}/X${CLASS_SAMPLES}_${CLASS_FEATURES}_${CLASS_CLASSES}_class_dense \ + ${DATADIR}/Y${CLASS_SAMPLES}_${CLASS_FEATURES}_${CLASS_CLASSES}_class_dense \ + ${DATADIR}/X${CLASS_SAMPLES}_${CLASS_FEATURES}_${CLASS_CLASSES}_class_dense_test \ + ${DATADIR}/Y${CLASS_SAMPLES}_${CLASS_FEATURES}_${CLASS_CLASSES}_class_dense_test \ + ${FORMAT} & + + wait $pidSparse80 + ${CMD} -f scripts/extractTestData.dml --args \ + ${DATADIR}/X${CLASS_SAMPLES}_${CLASS_FEATURES}_${CLASS_CLASSES}_class_sparse \ + ${DATADIR}/Y${CLASS_SAMPLES}_${CLASS_FEATURES}_${CLASS_CLASSES}_class_sparse \ + ${DATADIR}/X${CLASS_SAMPLES}_${CLASS_FEATURES}_${CLASS_CLASSES}_class_sparse_test \ + ${DATADIR}/Y${CLASS_SAMPLES}_${CLASS_FEATURES}_${CLASS_CLASSES}_class_sparse_test \ + ${FORMAT} & +fi + +wait diff --git a/scripts/perftest/runAll.sh b/scripts/perftest/runAll.sh index db315597bf4..60fd0140ae5 100755 --- a/scripts/perftest/runAll.sh +++ b/scripts/perftest/runAll.sh @@ -20,10 +20,9 @@ # #------------------------------------------------------------- -if [ "$(basename $PWD)" != "perftest" ]; -then +if [ "$(basename $PWD)" != "perftest" ]; then echo "Please execute scripts from directory 'perftest'" - exit 1; + exit 1 fi # Command to be executed @@ -34,6 +33,10 @@ TEMPFOLDER="temp" # Possible values: 80/80MB, 800/800MB, 8000/8000MB/8GB, 80000/80000MB/80GB, 800000/800000MB/800GB MAXMEM=80 +# Flags for tests of components in nn +DO_TESTS_FOR_NN=true # toggle execution of datagen for as well as tests of nn components themselves +USE_GPU_FOR_NN=false # toggle gpu usage for nn tests + # Set properties export LOG4JPROP='conf/log4j-off.properties' export SYSDS_QUIET=1 @@ -85,31 +88,38 @@ MAXMEM=${MAXMEM%"MB"}; MAXMEM=${MAXMEM/GB/"000"} # Possible lines to initialize Intel MKL, depending on version and install location if [ -d ~/intel ] && [ -d ~/intel/bin ] && [ -f ~/intel/bin/compilervars.sh ]; then - . 
~/intel/bin/compilervars.sh intel64
+  . ~/intel/bin/compilervars.sh intel64
 elif [ -d /opt ] && [ -d /opt/intel ] && [ -d /opt/intel/bin ]; then
-    . /opt/intel/bin/compilervars.sh intel64
+  . /opt/intel/bin/compilervars.sh intel64
 fi

 # make dirs if not exsisting
-mkdir -p logs
-mkdir -p results
+mkdir -p logs
+mkdir -p results
 mkdir -p temp

 # init time measurement
 rm -f results/times.txt
-date +"%Y-%m-%d-%T" >> results/times.txt
-echo -e "\n$HOSTNAME" >> results/times.txt
-echo -e "\n\n" >> results/times.txt
+date +"%Y-%m-%d-%T" >>results/times.txt
+echo -e "\n$HOSTNAME" >>results/times.txt
+echo -e "\n\n" >>results/times.txt

 ## Data Gen
-# ./datagen/genBinomialData.sh ${CMD} ${TEMPFOLDER} ${MAXMEM} &> logs/genBinomialData.out
-# ./datagen/genMultinomialData.sh ${CMD} ${TEMPFOLDER} ${MAXMEM} &> logs/genMultinomialData.out
-# ./datagen/genDescriptiveStatisticsData.sh ${CMD} ${TEMPFOLDER} ${MAXMEM} &> logs/genStatsData.out
-# ./datagen/genStratStatisticsData.sh ${CMD} ${TEMPFOLDER} ${MAXMEM} &> logs/genStratStatsData.out
-# ./datagen/genClusteringData.sh ${CMD} ${TEMPFOLDER} ${MAXMEM} &> logs/genClusteringData.out
-# ./datagen/genDimensionReductionData.sh ${CMD} ${TEMPFOLDER} ${MAXMEM} &> logs/genDimensionReductionData.out
-# ./datagen/genALSData.sh ${CMD} ${TEMPFOLDER} ${MAXMEM} &> logs/genALSData.out
+#./datagen/genBinomialData.sh ${CMD} ${TEMPFOLDER} ${MAXMEM} &> logs/genBinomialData.out
+#./datagen/genMultinomialData.sh ${CMD} ${TEMPFOLDER} ${MAXMEM} &> logs/genMultinomialData.out
+#./datagen/genDescriptiveStatisticsData.sh ${CMD} ${TEMPFOLDER} ${MAXMEM} &> logs/genStatsData.out
+#./datagen/genStratStatisticsData.sh ${CMD} ${TEMPFOLDER} ${MAXMEM} &> logs/genStratStatsData.out
+#./datagen/genClusteringData.sh ${CMD} ${TEMPFOLDER} ${MAXMEM} &> logs/genClusteringData.out
+#./datagen/genDimensionReductionData.sh ${CMD} ${TEMPFOLDER} ${MAXMEM} &> logs/genDimensionReductionData.out
+#./datagen/genALSData.sh ${CMD} ${TEMPFOLDER} ${MAXMEM} &> logs/genALSData.out
+
+# Data for tests of nn components (only generated when DO_TESTS_FOR_NN is true);
+# each generator writes its own log file under logs/
+if [ "$DO_TESTS_FOR_NN" = true ]; then
+  ./datagen/genNNData.sh ${CMD} ${TEMPFOLDER} ${MAXMEM} &>logs/genNNData.out
+  #./datagen/genNCFData.sh ${CMD} ${TEMPFOLDER} ${MAXMEM} &>logs/genNCFData.out
+  ./datagen/genMNISTData.sh ${CMD} ${TEMPFOLDER} ${MAXMEM} &>logs/genMNISTData.out
+fi

 ### Micro Benchmarks:
 #./MatrixMult.sh ${CMD}
@@ -119,16 +129,23 @@ echo -e "\n\n" >> results/times.txt
 #./fed/runAllFed.sh ${CMD} ${TEMPFOLDER} ${MAXMEM}

 ### Algorithms Benchmarks:
 ./runAllBinomial.sh ${CMD} ${TEMPFOLDER} ${MAXMEM}
 ./runAllMultinomial.sh ${CMD} ${TEMPFOLDER} ${MAXMEM}
 ./runAllRegression.sh ${CMD} ${TEMPFOLDER} ${MAXMEM}
 ./runAllStats.sh ${CMD} ${TEMPFOLDER} ${MAXMEM}
 ./runAllClustering.sh ${CMD} ${TEMPFOLDER} ${MAXMEM}
 ./runAllDimensionReduction.sh ${CMD} ${TEMPFOLDER} ${MAXMEM}
 ./runAllALS.sh ${CMD} ${TEMPFOLDER} ${MAXMEM}
+
+# Tests of nn components: run in addition to the algorithm benchmarks above,
+# toggled by DO_TESTS_FOR_NN; USE_GPU_FOR_NN is forwarded as the 4th argument
+if [ "$DO_TESTS_FOR_NN" = true ]; then
+  ./runAllNN.sh "${CMD}" ${TEMPFOLDER} ${MAXMEM} ${USE_GPU_FOR_NN}
+  #./runAllNCF.sh "${CMD}" ${TEMPFOLDER} ${MAXMEM} ${USE_GPU_FOR_NN} # currently broken: staging/NCF.dml and any dml that sources it die on launch
+  ./runAllConv2d.sh "${CMD}" ${TEMPFOLDER} ${MAXMEM} ${USE_GPU_FOR_NN}
+fi

 # TODO The following benchmarks have yet to be written. The decision tree algorithms additionally need to be fixed.
-# add stepwise Linear
+# add stepwise Linear
 # add stepwise GLM
 #./runAllTrees.sh $CMD $TEMPFOLDER
 # add randomForest
diff --git a/scripts/perftest/runAllConv2d.sh b/scripts/perftest/runAllConv2d.sh
new file mode 100755
index 00000000000..6d762cc2171
--- /dev/null
+++ b/scripts/perftest/runAllConv2d.sh
@@ -0,0 +1,93 @@
+#!/bin/bash
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+if [ "$(basename $PWD)" != "perftest" ]; then
+  echo "Please execute scripts from directory 'perftest'"
+  exit 1
+fi
+
+# this sets the dot as the separating character in floating point numbers ie.
their string representation
+# this avoids an error where bc outputs results dot-separated but printf may expect floats comma-separated if the system default says so
+export LC_NUMERIC="en_US.UTF-8"
+
+# params:
+# 1) command for systemds
+# 2) temp folder (defaults to "temp" when empty)
+# 3) max memory in MB; selects which data set sizes are run
+# 4) use gpu flag, forwarded unchanged to the run script
+COMMAND=$1
+TEMPFOLDER=$2
+MAXMEM=$3
+USEGPU=$4
+
+if [ "$TEMPFOLDER" == "" ]; then TEMPFOLDER=temp; fi
+BASE=${TEMPFOLDER}/mnist
+
+FILENAME=$0
+err_report() {
+  echo "Error in $FILENAME on line $1"
+}
+trap 'err_report $LINENO' ERR
+
+# these should be kept in sync with the ones set in genMNISTData, so that file names are in sync!
+max_size_ordinal=4
+min_num_examples_train=12000
+max_num_examples_train=60000
+span_num_examples_train=$(echo "${max_num_examples_train} - ${min_num_examples_train}" | bc)
+
+# MAXMEM thresholds in MB; the index of a threshold is its size ordinal
+# (0 selects min_num_examples_train, max_size_ordinal selects max_num_examples_train)
+MEM_THRESHOLDS=(80 800 8000 80000 800000)
+
+DATA=()
+for size_ordinal in "${!MEM_THRESHOLDS[@]}"; do
+  if [ "$MAXMEM" -ge "${MEM_THRESHOLDS[$size_ordinal]}" ]; then
+    # linear interpolation between min and max number of training examples;
+    # printf rounds the bc result to an integer before it goes into the file name
+    percent_size=$(echo "scale=10; ${size_ordinal} / ${max_size_ordinal}" | bc)
+    target_num_train=$(echo "${min_num_examples_train} + $(printf "%.0f" "$(echo "${span_num_examples_train} * ${percent_size}" | bc)")" | bc)
+    DATA+=("mnist_${target_num_train}")
+  fi
+done
+
+echo "RUN CONV2D EXPERIMENTS" $(date) >>results/times.txt
+
+for d in "${DATA[@]}"; do
+  for f in "runMNISTLeNet"; do
+    for epochs in 5 25; do
+      echo "-- Running "$f" on "$d" for ${epochs} epochs" >>results/times.txt
+      ./${f}.sh ${BASE}/${d}_train ${BASE}/${d}_test ${BASE} "${COMMAND}" ${d} ${epochs} ${USEGPU} &>logs/${f}_${d}_${epochs}.out
+    done
+  done
+done
+
+echo -e "\n\n" >>results/times.txt
diff --git a/scripts/perftest/runAllNCF.sh b/scripts/perftest/runAllNCF.sh
new file mode 100755
index 00000000000..e2639012612
--- /dev/null
+++ b/scripts/perftest/runAllNCF.sh
@@ -0,0 +1,62 @@
+#!/bin/bash
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+# Runs runNCF.sh for 5 and for 50 epochs on every data set size that fits MAXMEM.
+#
+# params:
+# 1) command for systemds
+# 2) temp folder (defaults to "temp" when empty)
+# 3) max memory in MB; selects which data set sizes are run
+# 4) use gpu flag, forwarded unchanged to the run script
+if [ "$(basename "$PWD")" != "perftest" ]; then
+  echo "Please execute scripts from directory 'perftest'"
+  exit 1
+fi
+
+COMMAND=$1
+TEMPFOLDER=$2
+MAXMEM=$3
+USEGPU=$4
+
+if [ "$TEMPFOLDER" == "" ]; then TEMPFOLDER=temp; fi
+BASE=${TEMPFOLDER}/ncf
+
+FILENAME=$0
+err_report() {
+  echo "Error in $FILENAME on line $1"
+}
+trap 'err_report $LINENO' ERR
+
+# "<min MAXMEM in MB>:<data set suffix>" pairs; the suffix is appended to the
+# Tt/Tv/It/Iv/Ut/Uv file name prefixes below
+SCENARIOS=(
+  "80:1000_100_50_60"
+  "800:3000_300_150_180"
+  "8000:9000_900_450_540"
+  "80000:27000_2700_1350_1620"
+  "800000:81000_8100_4050_4860"
+)
+
+DATA=() # todo .. which data is needed?
+for scenario in "${SCENARIOS[@]}"; do
+  if [ "$MAXMEM" -ge "${scenario%%:*}" ]; then DATA+=("${scenario#*:}"); fi
+done
+
+echo "RUN NEURAL COLLABORATIVE FILTERING EXPERIMENTS" $(date) >>results/times.txt
+
+for d in "${DATA[@]}"; do
+  for f in "runNCF"; do
+    for epochs in 5 50; do
+      echo "-- Running "$f" on "$d" for ${epochs} epochs" >>results/times.txt
+      ./${f}.sh ${BASE}/Tt${d} ${BASE}/Tv${d} ${BASE}/It${d} ${BASE}/Iv${d} ${BASE}/Ut${d} ${BASE}/Uv${d} \
+        ${BASE} "${COMMAND}" ${d} ${epochs} ${USEGPU} &>logs/${f}_${d}_${epochs}.out
+    done
+  done
+done
+
+echo -e "\n\n" >>results/times.txt
diff --git a/scripts/perftest/runAllNN.sh b/scripts/perftest/runAllNN.sh
new file mode 100755
index 00000000000..1977c24cf44
--- /dev/null
+++ b/scripts/perftest/runAllNN.sh
@@ -0,0 +1,123 @@
+#!/bin/bash
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.
The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+# Runs runNNSimpleSGD.sh on the regression data sets and runNNNesterovClassify.sh
+# on the classification data sets, for every data set size that fits MAXMEM.
+#
+# params:
+# 1) command for systemds
+# 2) temp folder (defaults to "temp" when empty)
+# 3) max memory in MB; selects which data set sizes are run
+# 4) use gpu flag, forwarded unchanged to the run scripts
+if [ "$(basename "$PWD")" != "perftest" ]; then
+  echo "Please execute scripts from directory 'perftest'"
+  exit 1
+fi
+
+COMMAND=$1
+TEMPFOLDER=$2
+MAXMEM=$3
+USEGPU=$4
+
+if [ "$TEMPFOLDER" == "" ]; then TEMPFOLDER=temp; fi
+BASE=${TEMPFOLDER}/nn
+
+FILENAME=$0
+err_report() {
+  echo "Error in $FILENAME on line $1"
+}
+trap 'err_report $LINENO' ERR
+
+# these should be kept in sync with the ones set in genNNData, so that file names are in sync!
+BASE_REG_SAMPLES=1024
+BASE_REG_FEATURES=100
+BASE_CLASS_SAMPLES=1024
+BASE_CLASS_FEATURES=100
+BASE_CLASS_CLASSES=5
+
+# "<min MAXMEM in MB>:<multiplier applied to all base sizes>" pairs
+SCENARIOS=("80:1" "800:3" "8000:9" "80000:27" "800000:81")
+
+REG_DATA=()   # todo .. which data is needed?
+CLASS_DATA=() # todo .. which data is needed?
+for scenario in "${SCENARIOS[@]}"; do
+  MIN_MEM=${scenario%%:*}
+  MULTIPLIER=${scenario#*:}
+  if [ "$MAXMEM" -ge "$MIN_MEM" ]; then
+    REG_SAMPLES=$((BASE_REG_SAMPLES * MULTIPLIER))
+    REG_FEATURES=$((BASE_REG_FEATURES * MULTIPLIER))
+    CLASS_SAMPLES=$((BASE_CLASS_SAMPLES * MULTIPLIER))
+    CLASS_FEATURES=$((BASE_CLASS_FEATURES * MULTIPLIER))
+    CLASS_CLASSES=$((BASE_CLASS_CLASSES * MULTIPLIER))
+    # one dense and one sparse variant per size
+    REG_DATA+=("${REG_SAMPLES}_${REG_FEATURES}_reg_dense" "${REG_SAMPLES}_${REG_FEATURES}_reg_sparse")
+    CLASS_DATA+=("${CLASS_SAMPLES}_${CLASS_FEATURES}_${CLASS_CLASSES}_class_dense" "${CLASS_SAMPLES}_${CLASS_FEATURES}_${CLASS_CLASSES}_class_sparse")
+  fi
+done
+
+echo "RUN NEURAL NETWORK EXPERIMENTS" $(date) >>results/times.txt
+
+for d in "${REG_DATA[@]}"; do
+  # Regression tasks
+  for f in "runNNSimpleSGD"; do
+    for epochs in 5 50; do
+      echo "-- Running "$f" on "$d" for ${epochs} epochs" >>results/times.txt
+      ./${f}.sh ${BASE}/X${d} ${BASE}/Y${d} ${BASE} "${COMMAND}" ${d} ${epochs} ${USEGPU} &>logs/${f}_${d}_${epochs}.out
+    done
+  done
+done
+
+for d in "${CLASS_DATA[@]}"; do
+  # Classification tasks
+  for f in "runNNNesterovClassify"; do
+    for epochs in 10 100; do
+      echo "-- Running "$f" on "$d" for ${epochs} epochs" >>results/times.txt
+      ./${f}.sh ${BASE}/X${d} ${BASE}/Y${d} ${BASE} "${COMMAND}" ${d} ${epochs} ${USEGPU} &>logs/${f}_${d}_${epochs}.out
+    done
+  done
+done
+
+echo -e "\n\n" >>results/times.txt
diff --git a/scripts/perftest/runMNISTLeNet.sh b/scripts/perftest/runMNISTLeNet.sh
new file mode 100755
index 00000000000..7799dc567f5
--- /dev/null
+++ b/scripts/perftest/runMNISTLeNet.sh
@@ -0,0 +1,68 @@
+#!/bin/bash
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+# Runs scripts/mnist_lenet-train.dml and then scripts/mnist_lenet-predict.dml,
+# appending the measured time of each phase to results/times.txt.
+#
+# params:
+# 1) path of training data
+# 2) path of test data
+# 3) path of base temp dir (passed as base_dir to training, model_dir to prediction)
+# 4) command for systemds
+# 5) identifier used in log file names and in results/times.txt
+# 6) number of epochs
+# 7) use gpu ("true" adds --gpu to the systemds flags)
+set -e
+
+if [ "$(basename "$PWD")" != "perftest" ]; then
+  echo "Please execute scripts from directory 'perftest'"
+  exit 1
+fi
+
+Train_data=$1
+Test_data=$2
+BASE=$3
+CMD=$4
+LOGIDENTIFIER=$5
+EPOCHS=$6
+USEGPU=$7
+
+FLAGS="--stats"
+if [ "$USEGPU" = true ]; then
+  FLAGS="${FLAGS} --gpu"
+fi
+
+echo "running mnist lenet"
+
+#training
+# CMD and FLAGS stay unquoted on purpose so that they split into separate words
+tstart=$(date +%s.%N)
+${CMD} -f scripts/mnist_lenet-train.dml \
+  --config conf/SystemDS-config.xml \
+  ${FLAGS} \
+  --nvargs train=${Train_data} C=1 Hin=28 Win=28 epochs=${EPOCHS} base_dir=${BASE} fmt="csv" &>logs/mnist_lenet-train_${LOGIDENTIFIER}_${EPOCHS}.out
+
+# NOTE(review): a constant .4 s is subtracted from the elapsed time, presumably
+# to compensate for start-up overhead - confirm
+ttrain=$(echo "$(date +%s.%N) - $tstart - .4" | bc)
+echo "mnist lenet trained on ${LOGIDENTIFIER}: ${ttrain}" >>results/times.txt
+
+#predict
+tstart=$(date +%s.%N)
+${CMD} -f scripts/mnist_lenet-predict.dml \
+  --config conf/SystemDS-config.xml \
+  ${FLAGS} \
+  --nvargs input=${Test_data} C=1 Hin=28 Win=28 model_dir=${BASE} fmt="csv" &>logs/mnist_lenet-predict_${LOGIDENTIFIER}_${EPOCHS}.out
+
+tpredict=$(echo "$(date +%s.%N) - $tstart - .4" | bc)
+echo "mnist lenet predicted on ${LOGIDENTIFIER}: ${tpredict}" >>results/times.txt
diff --git a/scripts/perftest/runNCF.sh b/scripts/perftest/runNCF.sh
new file mode 100755
index 00000000000..d15149a61e1
--- /dev/null
+++ b/scripts/perftest/runNCF.sh
@@ -0,0 +1,113 @@
+#!/bin/bash
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.
You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+# Runs scripts/NCF-train.dml, then scripts/NCF-predict.dml once on the training
+# split and once on the validation split; the measured time of each phase is
+# appended to results/times.txt.
+#
+# params:
+#  1) targets, training split
+#  2) targets, validation split
+#  3) items, training split
+#  4) items, validation split
+#  5) users, training split
+#  6) users, validation split
+#  7) path of base temp dir
+#  8) command for systemds
+#  9) identifier used in log file names and in results/times.txt
+# 10) number of epochs
+# 11) use gpu ("true" adds --gpu to the systemds flags)
+set -e
+
+if [ "$(basename "$PWD")" != "perftest" ]; then
+  echo "Please execute scripts from directory 'perftest'"
+  exit 1
+fi
+
+TTrain=$1
+TVal=$2
+ITrain=$3
+IVal=$4
+UTrain=$5
+UVal=$6
+BASE=$7
+CMD=$8
+LOGIDENTIFIER=$9
+EPOCHS=${10}
+USEGPU=${11}
+
+FLAGS="--stats"
+if [ "$USEGPU" = true ]; then
+  FLAGS="${FLAGS} --gpu"
+fi
+
+# Arguments of the training run, kept in one place so that the command that is
+# echoed and the command that is executed cannot drift apart.
+# FLAGS stays unquoted on purpose so that it splits into separate words.
+TRAIN_ARGS=(
+  -f scripts/NCF-train.dml
+  --config conf/SystemDS-config.xml
+  ${FLAGS}
+  --nvargs B=${BASE} fmt="csv"
+  targets_train=${TTrain}
+  targets_val=${TVal}
+  items_train=${ITrain}
+  items_val=${IVal}
+  users_train=${UTrain}
+  users_val=${UVal}
+  epochs=${EPOCHS}
+)
+
+# Runs NCF-predict.dml on one data split and records the elapsed time.
+# $1 split tag used in the log file name, $2 split label used in times.txt,
+# $3 items file, $4 users file, $5 targets file
+run_predict() {
+  local split=$1 label=$2 items=$3 users=$4 target=$5
+  local tstart tpredict
+  tstart=$(date +%s.%N)
+  ${CMD} -f scripts/NCF-predict.dml \
+    --config conf/SystemDS-config.xml \
+    ${FLAGS} \
+    --nvargs B=${BASE} fmt="csv" epochs=${EPOCHS} \
+    items=${items} \
+    users=${users} \
+    target=${target} \
+    biases=${BASE}/ncf_biases \
+    weights=${BASE}/ncf_weights \
+    &>logs/NCF-predict_${split}_${LOGIDENTIFIER}_${EPOCHS}.out
+
+  tpredict=$(echo "$(date +%s.%N) - $tstart - .4" | bc)
+  echo "NCF predicted on ${label} data ${LOGIDENTIFIER}: ${tpredict}" >>results/times.txt
+}
+
+echo "running NCF"
+# print the training command line before running it
+echo ${CMD} "${TRAIN_ARGS[@]}"
+
+#training
+tstart=$(date +%s.%N)
+${CMD} "${TRAIN_ARGS[@]}" \
+  &>logs/NCF-train_${LOGIDENTIFIER}_${EPOCHS}.out
+
+# NOTE(review): a constant .4 s is subtracted from the elapsed time, presumably
+# to compensate for start-up overhead - confirm
+ttrain=$(echo "$(date +%s.%N) - $tstart - .4" | bc)
+echo "NCF trained on ${LOGIDENTIFIER}: ${ttrain}" >>results/times.txt
+
+#predict
+run_predict train training "${ITrain}" "${UTrain}" "${TTrain}"
+run_predict val validation "${IVal}" "${UVal}" "${TVal}"
diff --git a/scripts/perftest/runNNNesterovClassify.sh b/scripts/perftest/runNNNesterovClassify.sh
new file mode 100755
index 00000000000..4fe90085be7
--- /dev/null
+++ b/scripts/perftest/runNNNesterovClassify.sh
@@ -0,0 +1,68 @@
+#!/bin/bash
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+# Runs scripts/nnNesterovClassify-train.dml and then
+# scripts/nnNesterovClassify-predict.dml, appending the measured time of each
+# phase to results/times.txt.
+#
+# params:
+# 1) X data
+# 2) Y data
+# 3) path of base temp dir
+# 4) command for systemds
+# 5) identifier used in log file names and in results/times.txt
+# 6) number of epochs
+# 7) use gpu ("true" adds --gpu to the systemds flags)
+set -e
+
+if [ "$(basename "$PWD")" != "perftest" ]; then
+  echo "Please execute scripts from directory 'perftest'"
+  exit 1
+fi
+
+X=$1
+Y=$2
+BASE=$3
+CMD=$4
+LOGIDENTIFIER=$5
+EPOCHS=$6
+USEGPU=$7
+
+FLAGS="--stats"
+if [ "$USEGPU" = true ]; then
+  FLAGS="${FLAGS} --gpu"
+fi
+
+echo "running sgd nn classifier with nesterov momentum"
+
+#training
+# CMD and FLAGS stay unquoted on purpose so that they split into separate words
+tstart=$(date +%s.%N)
+${CMD} -f scripts/nnNesterovClassify-train.dml \
+  --config conf/SystemDS-config.xml \
+  ${FLAGS} \
+  --nvargs X=${X} Y=${Y} B=${BASE} fmt="csv" epochs=${EPOCHS} &>logs/nnNesterovClassify-train_${LOGIDENTIFIER}_${EPOCHS}.out
+
+# NOTE(review): a constant .4 s is subtracted from the elapsed time, presumably
+# to compensate for start-up overhead - confirm
+ttrain=$(echo "$(date +%s.%N) - $tstart - .4" | bc)
+echo "nesterov momentum neural network trained with SGD on ${LOGIDENTIFIER}: ${ttrain}" >>results/times.txt
+
+#predict
+# NOTE(review): prediction receives the same X/Y as training; the commented-out
+# nvargs line below suggests the *_test files were intended - confirm
+tstart=$(date +%s.%N)
+${CMD} -f scripts/nnNesterovClassify-predict.dml \
+  --config conf/SystemDS-config.xml \
+  ${FLAGS} \
+  --nvargs X=${X} Y=${Y} B=${BASE} fmt="csv" &>logs/nnNesterovClassify-predict_${LOGIDENTIFIER}_${EPOCHS}.out
+  #--nvargs fmt=csv X=$1_test B=${BASE}/b Y=$2_test M=${BASE}/m O=${BASE}/out.csv &>logs/nnNesterovClassify-predict_${LOGIDENTIFIER}.out
+
+tpredict=$(echo "$(date +%s.%N) - $tstart - .4" | bc)
+echo "nesterov momentum neural network trained with SGD predicted on ${LOGIDENTIFIER}: ${tpredict}" >>results/times.txt
diff --git a/scripts/perftest/runNNSimpleSGD.sh b/scripts/perftest/runNNSimpleSGD.sh
new file mode 100755
index 00000000000..8c78e4c7af6
--- /dev/null
+++ b/scripts/perftest/runNNSimpleSGD.sh
@@ -0,0 +1,68 @@
+#!/bin/bash
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.
The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +#------------------------------------------------------------- +# params: +# 1) X data +# 2) Y data +# 3) path of base temp dir +# 4) command for systemds (also read below: 5 = log identifier, 6 = number of epochs, 7 = GPU flag, "true" adds --gpu) +set -e + +if [ "$(basename $PWD)" != "perftest" ]; then + echo "Please execute scripts from directory 'perftest'" + exit 1 +fi + +X=$1 +Y=$2 +BASE=$3 +CMD=$4 +LOGIDENTIFIER=$5 +EPOCHS=$6 +USEGPU=$7 + +FLAGS="--stats" +if [ "$USEGPU" = true ]; then + FLAGS="${FLAGS} --gpu" +fi + +echo "running simple sgd neural network" + +#training +tstart=$(date +%s.%N) +${CMD} -f scripts/nnSimpleSGD-train.dml \ + --config conf/SystemDS-config.xml \ + ${FLAGS} \ + --nvargs X=${X} Y=${Y} B=${BASE} fmt="csv" epochs=${EPOCHS} &>logs/nnSimpleSGD-train_${LOGIDENTIFIER}_${EPOCHS}.out + +ttrain=$(echo "$(date +%s.%N) - $tstart - .4" | bc) +echo "simple neural network trained with SGD on "$5": "$ttrain >>results/times.txt + +#predict +tstart=$(date +%s.%N) +${CMD} -f scripts/nnSimpleSGD-predict.dml \ + --config conf/SystemDS-config.xml \ + ${FLAGS} \ + --nvargs fmt="csv" X=${X} Y=${Y} B=${BASE} &>logs/nnSimpleSGD-predict_${LOGIDENTIFIER}_${EPOCHS}.out + # --nvargs fmt=csv X=$1_test B=${BASE} Y=$2_test + +tpredict=$(echo "$(date +%s.%N) - $tstart - .4" | bc) +echo "simple neural network trained with SGD predicted on "$5": "$tpredict >>results/times.txt diff --git a/scripts/perftest/scripts/NCF-predict.dml b/scripts/perftest/scripts/NCF-predict.dml new file mode 
100755 index 00000000000..339f24294ab --- /dev/null +++ b/scripts/perftest/scripts/NCF-predict.dml @@ -0,0 +1,35 @@ +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +#------------------------------------------------------------- +source("../../staging/NCF.dml") as NCF +fmt=ifdef($fmt, "csv") # NOTE(review): fmt is not referenced again in this script + +biases = read($biases) +weights = read($weights) + +items = read($items); +users = read($users); +target = read($target) + +[out_FA, out_F, out_D1A, out_D1, out_D2A, out_D2, out_D3A, out_D3, out_concat, out_U, out_I] = NCF::predict(users, items, biases, weights); + +[loss, accuracy] = NCF::eval(out_FA, target); # of the predict outputs, only out_FA is evaluated against target + +print("got loss and accuracy of: " + loss + ", " + accuracy) diff --git a/scripts/perftest/scripts/NCF-train.dml b/scripts/perftest/scripts/NCF-train.dml new file mode 100755 index 00000000000..f92535af9c8 --- /dev/null +++ b/scripts/perftest/scripts/NCF-train.dml @@ -0,0 +1,45 @@ +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. 
The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +#------------------------------------------------------------- +source("../../staging/NCF.dml") as NCF + +fmt=ifdef($fmt, "csv") + +targets_train = read($targets_train); +targets_val = read($targets_val); +items_train = read($items_train); +items_val = read($items_val); +users_train = read($users_train); +users_val = read($users_val); + +# Training settings (the training call itself follows the layer dimensions) +epochs = ifdef($epochs, 50); +batch_size = ifdef($batch_size, 16); + +# layer dimensions +E = ifdef($embedding, 8); # embedding +D1 = ifdef($d1, 64); # dense layer 1 +D2 = ifdef($d2, 32); # dense layer 2 +D3 = ifdef($d3, 16); # dense layer 3 + +[biases, weights] = NCF::train(users_train, items_train, targets_train, users_val, items_val, targets_val, epochs, batch_size, E, D1, D2, D3); + +write(biases, ""+$B+"/ncf_biases",format=fmt) +write(weights, ""+$B+"/ncf_weights",format=fmt) \ No newline at end of file diff --git a/scripts/perftest/scripts/mnist_lenet-predict.dml b/scripts/perftest/scripts/mnist_lenet-predict.dml new file mode 100644 index 00000000000..e1c71931f28 --- /dev/null +++ b/scripts/perftest/scripts/mnist_lenet-predict.dml @@ -0,0 +1,57 @@ +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. 
The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +#------------------------------------------------------------- +source("../../nn/examples/mnist_lenet.dml") as mnist_lenet + +# Read test data & settings +fmt = ifdef($fmt, "csv") +test = read($input, format=fmt) +C = $C +Hin = $Hin +Win = $Win + +# Extract images and labels +X_test = test[,2:ncol(test)] +Y_test = test[,1] + +# Scale images to [-1,1], and one-hot encode the labels +n_test = nrow(test) +X_test = (X_test / 255.0) * 2 - 1 +Y_test = table(seq(1, n_test), Y_test+1, n_test, 10) + + +# Read model coefficients +W1 = read($model_dir+"/W1") +b1 = read($model_dir+"/b1") +W2 = read($model_dir+"/W2") +b2 = read($model_dir+"/b2") +W3 = read($model_dir+"/W3") +b3 = read($model_dir+"/b3") +W4 = read($model_dir+"/W4") +b4 = read($model_dir+"/b4") + + +# Eval on test set +probs = mnist_lenet::predict(X_test, C, Hin, Win, W1, b1, W2, b2, W3, b3, W4, b4) +[loss, accuracy] = mnist_lenet::eval(probs, Y_test) + +# Output results +print("Test Accuracy: " + accuracy) +print("Test Loss: " + loss) diff --git a/scripts/perftest/scripts/mnist_lenet-train.dml b/scripts/perftest/scripts/mnist_lenet-train.dml new file mode 100644 index 00000000000..6bcf418706b --- /dev/null +++ b/scripts/perftest/scripts/mnist_lenet-train.dml @@ -0,0 +1,60 @@ +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. 
See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +#------------------------------------------------------------- +# The following is an adaptation of the script nn/examples/mnist_lenet-train.dml +source("../../nn/examples/mnist_lenet.dml") as mnist_lenet + +# Read training data & settings +fmt = ifdef($fmt, "csv") +train = read($train, format=fmt) +C = $C +Hin = $Hin +Win = $Win +epochs = ifdef($epochs, 10) +out_dir = ifdef($base_dir, ".") + +# Extract images and labels +images = train[,2:ncol(train)] +labels = train[,1] + +# Scale images to [-1,1], and one-hot encode the labels +n = nrow(train) +images = (images / 255.0) * 2 - 1 +labels = table(seq(1, n), labels+1, n, 10) + +# Split into 80/20 training/validation data +split_idx = floor(.8 * nrow(images)) +X = images[1:split_idx,] +X_val = images[split_idx+1:nrow(images),] +Y = labels[1:split_idx,] +Y_val = labels[split_idx+1:nrow(images),] + +# Train +[W1, b1, W2, b2, W3, b3, W4, b4] = mnist_lenet::train(X, Y, X_val, Y_val, C, Hin, Win, epochs) + +# Write model out +write(W1, out_dir+"/W1", format=fmt) +write(b1, out_dir+"/b1", format=fmt) +write(W2, out_dir+"/W2", format=fmt) +write(b2, out_dir+"/b2", format=fmt) +write(W3, out_dir+"/W3", format=fmt) +write(b3, out_dir+"/b3", format=fmt) +write(W4, out_dir+"/W4", format=fmt) +write(b4, out_dir+"/b4", format=fmt) \ No newline at 
end of file diff --git a/scripts/perftest/scripts/nnNesterovClassify-predict.dml b/scripts/perftest/scripts/nnNesterovClassify-predict.dml new file mode 100644 index 00000000000..378c5612883 --- /dev/null +++ b/scripts/perftest/scripts/nnNesterovClassify-predict.dml @@ -0,0 +1,70 @@ +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+# +#------------------------------------------------------------- +# Imports +source("../../nn/layers/affine.dml") as affine +source("../../nn/layers/cross_entropy_loss.dml") as cross_entropy_loss +source("../../nn/layers/dropout.dml") as dropout +source("../../nn/layers/relu.dml") as relu +source("../../nn/layers/softmax.dml") as softmax +source("../../nn/optim/sgd_nesterov.dml") as sgd_nesterov # NOTE(review): sgd_nesterov is not called anywhere in this prediction script + +X = read($X) +Y = read($Y) + +W1 = read(""+$B+"/w1_nesterov_classify") +W2 = read(""+$B+"/w2_nesterov_classify") +W3 = read(""+$B+"/w3_nesterov_classify") +b1 = read(""+$B+"/b1_nesterov_classify") +b2 = read(""+$B+"/b2_nesterov_classify") +b3 = read(""+$B+"/b3_nesterov_classify") +p = read(""+$B+"/p_nesterov_classify") + +# Compute forward pass with dropout +## layer 1: +out1 = affine::forward(X, W1, b1) +outr1 = relu::forward(out1) +[outd1, maskd1] = dropout::forward(outr1, p, -1) +## layer 2: +out2 = affine::forward(outd1, W2, b2) +outr2 = relu::forward(out2) +[outd2, maskd2] = dropout::forward(outr2, p, -1) +## layer 3: +out3 = affine::forward(outd2, W3, b3) +probs = softmax::forward(out3) + +# Compute loss +loss = cross_entropy_loss::forward(probs, Y) +print("Cross entropy loss with dropout: " + loss) + +# repeat without dropout +## layer 1: +out1 = affine::forward(X, W1, b1) +outr1 = relu::forward(out1) +## layer 2: +out2 = affine::forward(outr1, W2, b2) +outr2 = relu::forward(out2) +## layer 3: +out3 = affine::forward(outr2, W3, b3) +probs = softmax::forward(out3) + +# Compute loss +loss = cross_entropy_loss::forward(probs, Y) +print("Cross entropy loss without dropout: " + loss) \ No newline at end of file diff --git a/scripts/perftest/scripts/nnNesterovClassify-train.dml b/scripts/perftest/scripts/nnNesterovClassify-train.dml new file mode 100644 index 00000000000..8d09dcb291a --- /dev/null +++ b/scripts/perftest/scripts/nnNesterovClassify-train.dml @@ -0,0 +1,119 @@ +#------------------------------------------------------------- +# +# Licensed to the Apache 
Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +#------------------------------------------------------------- +# Imports +source("../../nn/layers/affine.dml") as affine +source("../../nn/layers/cross_entropy_loss.dml") as cross_entropy_loss +source("../../nn/layers/dropout.dml") as dropout +source("../../nn/layers/relu.dml") as relu +source("../../nn/layers/softmax.dml") as softmax +source("../../nn/optim/sgd_nesterov.dml") as sgd_nesterov + +# read input data +X = read($X) +Y = read($Y) +fmt = ifdef($fmt, "csv") + +N = nrow(X) +D = ncol(X) +t = ncol(Y) + +# Create network: +# affine1 -> relu1 -> dropout1 -> affine2 -> relu2 -> dropout2 -> affine3 -> softmax +H1 = ifdef($H1, 64) # number of neurons in 1st hidden layer +H2 = ifdef($H2, 64) # number of neurons in 2nd hidden layer +p = ifdef($dropout_prob, 0.5) # dropout parameter p (NOTE(review): nn/layers/dropout.dml appears to define p as the keep probability - confirm) +[W1, b1] = affine::init(D, H1, -1) +[W2, b2] = affine::init(H1, H2, -1) +[W3, b3] = affine::init(H2, t, -1) + +# Initialize SGD w/ Nesterov momentum optimizer +lr = 0.05 # learning rate +mu = 0.5 # momentum +decay = 0.99 # learning rate decay constant +vW1 = sgd_nesterov::init(W1); vb1 = sgd_nesterov::init(b1) +vW2 = sgd_nesterov::init(W2); vb2 = sgd_nesterov::init(b2) +vW3 = sgd_nesterov::init(W3); vb3 = 
sgd_nesterov::init(b3) + +# Optimize +print("Starting optimization") +batch_size = ifdef($batch_size, 64) +epochs = ifdef($epochs, 10) +iters = N / batch_size # NOTE(review): fractional when N is not a multiple of batch_size - confirm how the loop bound below is rounded +for (e in 1:epochs) { + for(i in 1:iters) { + # Get next batch + X_batch = X[(i-1)*batch_size+1:i*batch_size,] + y_batch = Y[(i-1)*batch_size+1:i*batch_size,] + + # Compute forward pass + ## layer 1: + out1 = affine::forward(X_batch, W1, b1) + outr1 = relu::forward(out1) + [outd1, maskd1] = dropout::forward(outr1, p, -1) + ## layer 2: + out2 = affine::forward(outd1, W2, b2) + outr2 = relu::forward(out2) + [outd2, maskd2] = dropout::forward(outr2, p, -1) + ## layer 3: + out3 = affine::forward(outd2, W3, b3) + probs = softmax::forward(out3) + + # Compute loss + loss = cross_entropy_loss::forward(probs, y_batch) + print("Cross entropy loss: " + loss) + + # Compute backward pass + ## loss: + dprobs = cross_entropy_loss::backward(probs, y_batch) + ## layer 3: + dout3 = softmax::backward(dprobs, out3) + [doutd2, dW3, db3] = affine::backward(dout3, outd2, W3, b3) + ## layer 2: + doutr2 = dropout::backward(doutd2, outr2, p, maskd2) + dout2 = relu::backward(doutr2, out2) + [doutd1, dW2, db2] = affine::backward(dout2, outd1, W2, b2) + ## layer 1: + doutr1 = dropout::backward(doutd1, outr1, p, maskd1) + dout1 = relu::backward(doutr1, out1) + [dX_batch, dW1, db1] = affine::backward(dout1, X_batch, W1, b1) + + # Optimize with SGD w/ Nesterov momentum + [W1, vW1] = sgd_nesterov::update(W1, dW1, lr, mu, vW1) + [b1, vb1] = sgd_nesterov::update(b1, db1, lr, mu, vb1) + [W2, vW2] = sgd_nesterov::update(W2, dW2, lr, mu, vW2) + [b2, vb2] = sgd_nesterov::update(b2, db2, lr, mu, vb2) + [W3, vW3] = sgd_nesterov::update(W3, dW3, lr, mu, vW3) + [b3, vb3] = sgd_nesterov::update(b3, db3, lr, mu, vb3) + } + # Anneal momentum towards 0.999 + mu = mu + (0.999 - mu)/(1+epochs-e) + # Decay learning rate + lr = lr * decay +} + +# save params +write(W1,""+$B+"/w1_nesterov_classify",format=fmt) 
+write(W2,""+$B+"/w2_nesterov_classify",format=fmt) +write(W3,""+$B+"/w3_nesterov_classify",format=fmt) +write(b1,""+$B+"/b1_nesterov_classify",format=fmt) +write(b2,""+$B+"/b2_nesterov_classify",format=fmt) +write(b3,""+$B+"/b3_nesterov_classify",format=fmt) +write(p,""+$B+"/p_nesterov_classify",format=fmt) \ No newline at end of file diff --git a/scripts/perftest/scripts/nnSimpleSGD-predict.dml b/scripts/perftest/scripts/nnSimpleSGD-predict.dml new file mode 100644 index 00000000000..7f12b65e270 --- /dev/null +++ b/scripts/perftest/scripts/nnSimpleSGD-predict.dml @@ -0,0 +1,47 @@ +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+# +#------------------------------------------------------------- +# Imports +source("../../nn/layers/affine.dml") as affine +source("../../nn/layers/l2_loss.dml") as l2_loss +source("../../nn/layers/relu.dml") as relu +source("../../nn/optim/sgd.dml") as sgd + +# read input data +X = read($X) +Y = read($Y) + + +# Create 2-layer network based on read params: +## affine1 -> relu1 -> affine2 +W1 = read(""+$B+"/w1_simple_sgd") +W2 = read(""+$B+"/w2_simple_sgd") +b1 = read(""+$B+"/b1_simple_sgd") +b2 = read(""+$B+"/b2_simple_sgd") + +# make prediction for given X +out1 = affine::forward(X, W1, b1) +outr1 = relu::forward(out1) +out2 = affine::forward(outr1, W2, b2) + +# compute the L2 loss of the predictions against Y +loss = l2_loss::forward(out2, Y) + +print("Got loss of " + loss) \ No newline at end of file diff --git a/scripts/perftest/scripts/nnSimpleSGD-train.dml b/scripts/perftest/scripts/nnSimpleSGD-train.dml new file mode 100644 index 00000000000..91ef4fa314f --- /dev/null +++ b/scripts/perftest/scripts/nnSimpleSGD-train.dml @@ -0,0 +1,86 @@ +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+# +#------------------------------------------------------------- +# Imports +source("../../nn/layers/affine.dml") as affine +source("../../nn/layers/l2_loss.dml") as l2_loss +source("../../nn/layers/relu.dml") as relu +source("../../nn/optim/sgd.dml") as sgd + +# read input data +X = read($X) +Y = read($Y) +fmt = ifdef($fmt, "csv") + +N = nrow(X) +D = ncol(X) +t = ncol(Y) + +# Create 2-layer network: +## affine1 -> relu1 -> affine2 +M = ifdef($M, 64) # number of neurons in the hidden layer (overridable via $M) +[W1, b1] = affine::init(D, M, -1) +[W2, b2] = affine::init(M, t, -1) + +# Initialize optimizer +lr = 0.05 # learning rate +mu = 0.9 # momentum (not referenced by the vanilla SGD updates below) +decay = 0.99 # learning rate decay constant + +# Optimize +print("Starting optimization") +batch_size = ifdef($batch_size, 32) +epochs = ifdef($epochs, 5) +iters = N / batch_size # NOTE(review): fractional when N is not a multiple of batch_size - confirm how the loop bound below is rounded +for (e in 1:epochs) { + for(i in 1:iters) { + # Get next batch + X_batch = X[(i-1)*batch_size+1:i*batch_size,] + y_batch = Y[(i-1)*batch_size+1:i*batch_size,] + + # Compute forward pass + out1 = affine::forward(X_batch, W1, b1) + outr1 = relu::forward(out1) + out2 = affine::forward(outr1, W2, b2) + + # Compute loss + loss = l2_loss::forward(out2, y_batch) + print("L2 loss: " + loss) + + # Compute backward pass + dout2 = l2_loss::backward(out2, y_batch) + [doutr1, dW2, db2] = affine::backward(dout2, outr1, W2, b2) + dout1 = relu::backward(doutr1, out1) + [dX_batch, dW1, db1] = affine::backward(dout1, X_batch, W1, b1) + + # Optimize with vanilla SGD + W1 = sgd::update(W1, dW1, lr) + b1 = sgd::update(b1, db1, lr) + W2 = sgd::update(W2, dW2, lr) + b2 = sgd::update(b2, db2, lr) + } + # Decay learning rate + lr = lr * decay +} +# save params +write(W1,""+$B+"/w1_simple_sgd",format=fmt) +write(W2,""+$B+"/w2_simple_sgd",format=fmt) +write(b1,""+$B+"/b1_simple_sgd",format=fmt) +write(b2,""+$B+"/b2_simple_sgd",format=fmt) \ No newline at end of file diff --git a/scripts/staging/NCF.dml b/scripts/staging/NCF.dml index 0719b585e2f..3a8ba62f882 100644 --- 
a/scripts/staging/NCF.dml +++ b/scripts/staging/NCF.dml @@ -24,12 +24,12 @@ # # Imports -source("nn/optim/adam.dml") as adam -source("nn/layers/relu.dml") as relu -source("nn/layers/sigmoid.dml") as sigmoid -source("nn/layers/affine.dml") as affine -source("nn/layers/log_loss.dml") as log_loss -source("nn/layers/l2_reg.dml") as l2_reg +source("../nn/optim/adam.dml") as adam +source("../nn/layers/relu.dml") as relu +source("../nn/layers/sigmoid.dml") as sigmoid +source("../nn/layers/affine.dml") as affine +source("../nn/layers/log_loss.dml") as log_loss +source("../nn/layers/l2_reg.dml") as l2_reg train = function( matrix[double] users_train, matrix[double] items_train,