apache · Sheypex · Jun 20, 2023 · Jun 22, 2023 · Jun 22, 2023 · Jun 22, 2023
diff --git a/scripts/datagen/extractMNISTData.dml b/scripts/datagen/extractMNISTData.dml
@@ -0,0 +1,69 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+#
+# Generates random Gaussian-mixture data to test k-Means clustering algorithms
+#
+# INPUT PARAMETERS:
+# ----------------------------------------------------------------------------
+# NAME  TYPE   DEFAULT  MEANING
+# ----------------------------------------------------------------------------
+# file_xyz  String            ---     Respective output files name/path
+# ktrain    Int               1000    Number of training samples
+# kval      Int               100     Number of validation samples
+# nitems    Int               50      Number of items
+# nusers    Int               60      Number of users
+# fmt       Format specifier  csv     Format of output data
+# ----------------------------------------------------------------------------
+#
+# Example:
+# hadoop jar SystemDS.jar -f genRandData4Kmeans.dml -nvargs nr=1024 nf=100 nt=1
+
+print ("BEGIN MNIST EXTRACTOR SCRIPT");
+
+file_mnist_train      = $mnist_train;
+file_mnist_test       = $mnist_test;
+file_out_train        = $out_train;
+file_out_test         = $out_test;
+num_train             = ifdef ($num_train, 60000);
+num_test              = ifdef ($num_test, 10000);
+fmt                   = ifdef ($fmt, "csv");
+
+mnist_train = read(file_mnist_train);
+mnist_test  = read(file_mnist_test);
+
+# stay in bounds
+num_train = min(num_train, nrow(mnist_train));
+num_test  = min(num_test, nrow(mnist_test));
+
+# targets
+# todo add shuffle?
+out_train = mnist_train[1:num_train,];
+out_test  = mnist_test[1:num_test,];
+
+
+print ("Writing out the resulting dataset...");
+
+write (out_train, file_out_train, format=fmt);
+write (out_test, file_out_test, format=fmt);
+
+print ("DONE: MNIST EXTRACTOR SCRIPT");
+
diff --git a/scripts/datagen/genRandData4NCF.dml b/scripts/datagen/genRandData4NCF.dml
@@ -0,0 +1,84 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+#
+# Generates random Gaussian-mixture data to test k-Means clustering algorithms
+#
+# INPUT PARAMETERS:
+# ----------------------------------------------------------------------------
+# NAME  TYPE   DEFAULT  MEANING
+# ----------------------------------------------------------------------------
+# file_xyz  String            ---     Respective output files name/path
+# ktrain    Int               1000    Number of training samples
+# kval      Int               100     Number of validation samples
+# nitems    Int               50      Number of items
+# nusers    Int               60      Number of users
+# fmt       Format specifier  csv     Format of output data
+# ----------------------------------------------------------------------------
+#
+# Example:
+# hadoop jar SystemDS.jar -f genRandData4Kmeans.dml -nvargs nr=1024 nf=100 nt=1
+
+print ("BEGIN NCF GENERATOR SCRIPT");
+
+file_users_train      = ifdef ($users_train, "users_train");
+file_items_train      = ifdef ($items_train, "items_train");
+file_targets_train    = ifdef ($targets_train, "targets_train");
+file_users_val        = ifdef ($users_val, "users_val");
+file_items_val        = ifdef ($items_val, "items_val");
+file_targets_val      = ifdef ($targets_val, "targets_val");
+fmt                   = ifdef ($fmt, "csv");
+
+# Generate input data
+K_train = ifdef($ktrain, 1000); # number of training samples
+K_val = ifdef($kval, 100); # number of validation samples
+
+N = ifdef($nitems, 50); # number items
+M = ifdef($nusers, 60); # number users
+
+# targets
+targets_train = round(rand(rows=K_train, cols=1));
+targets_val = round(rand(rows=K_val, cols=1));
+
+# user/items integer-encoded vectors
+items_train_int_encoded = round(rand(rows=K_train, cols=1, min=1, max=N));
+users_train_int_encoded = round(rand(rows=K_train, cols=1, min=1, max=M));
+items_val_int_encoded = round(rand(rows=K_val, cols=1, min=1, max=N));
+users_val_int_encoded = round(rand(rows=K_val, cols=1, min=1, max=M));
+
+# user/items matrices by applying one-hot-encoding
+items_train = toOneHot(items_train_int_encoded, N);
+items_val = toOneHot(items_val_int_encoded, N);
+users_train = toOneHot(users_train_int_encoded, M);
+users_val = toOneHot(users_val_int_encoded, M);
+
+
+print ("Writing out the resulting dataset...");
+
+write (users_train, file_users_train, format=fmt);
+write (items_train, file_items_train, format=fmt);
+write (targets_train, file_targets_train, format=fmt);
+write (users_val, file_users_val, format=fmt);
+write (items_val, file_items_val, format=fmt);
+write (targets_val, file_targets_val, format=fmt);
+
+print ("DONE: NCF GENERATOR SCRIPT");
+
diff --git a/scripts/datagen/getMNISTDataset.sh b/scripts/datagen/getMNISTDataset.sh
@@ -0,0 +1,66 @@
+#!/bin/bash
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+BASE=$1
+if [ "$BASE" = "" ]; then BASE=$PWD; fi
+RET=$PWD
+if [ ! -d "$BASE" ]; then mkdir "$BASE"; fi
+cd "$BASE" || exit
+
+echo "Downloading"
+if [ ! -f "mnist_train.csv.zip" ]; then wget --no-check-certificate https://github.com/phoebetronic/mnist/raw/main/mnist_train.csv.zip; fi
+if [ ! -f "mnist_test.csv.zip" ]; then wget --no-check-certificate https://github.com/phoebetronic/mnist/raw/main/mnist_test.csv.zip; fi
+
+echo "Unzipping"
+unzip -u mnist_train.csv.zip
+unzip -u mnist_test.csv.zip
+
+# have to create metadata for these external csv files
+
+echo '{
+          "data_type": "matrix",
+          "value_type": "double",
+          "rows": 60000,
+          "cols": 785,
+          "nnz": 0,
+          "format": "csv",
+          "author": "anon",
+          "header": false,
+          "sep": ",",
+          "created": "2023-06-26 18:35:22 CEST"
+      }' > mnist_train.csv.mtd
+
+echo '{
+          "data_type": "matrix",
+          "value_type": "double",
+          "rows": 10000,
+          "cols": 785,
+          "nnz": 0,
+          "format": "csv",
+          "author": "nobody",
+          "header": false,
+          "sep": ",",
+          "created": "2023-06-26 18:35:22 CEST"
+      }' > mnist_test.csv.mtd
+
+cd "$RET" || exit
+echo "Done"
diff --git a/scripts/nn/README.md b/scripts/nn/README.md
@@ -55,8 +55,8 @@ iters = 1024 / batch_size
 for (e in 1:epochs) {
   for(i in 1:iters) {
     # Get next batch
-    X_batch = X[i:i+batch_size-1,]
-    y_batch = y[i:i+batch_size-1,]
+    X_batch = X[(i-1)*batch_size+1:i*batch_size,]
+    y_batch = Y[(i-1)*batch_size+1:i*batch_size,]
 
     # Compute forward pass
     out1 = affine::forward(X_batch, W1, b1)
@@ -131,8 +131,8 @@ iters = 1024 / batch_size
 for (e in 1:epochs) {
   for(i in 1:iters) {
     # Get next batch
-    X_batch = X[i:i+batch_size-1,]
-    y_batch = y[i:i+batch_size-1,]
+    X_batch = X[(i-1)*batch_size+1:i*batch_size,]
+    y_batch = Y[(i-1)*batch_size+1:i*batch_size,]
 
     # Compute forward pass
     ## layer 1:

diff --git a/scripts/nn/examples/ncf-dummy-data.dml b/scripts/nn/examples/ncf-dummy-data.dml
@@ -20,7 +20,7 @@
 #-------------------------------------------------------------
 
 # Imports
-source("staging/NCF.dml") as NCF
+source("../../staging/NCF.dml") as NCF
 
 K_train = 1000; # number of training samples
 K_val = 100; # number of validation samples

diff --git a/scripts/perftest/datagen/genMNISTData.sh b/scripts/perftest/datagen/genMNISTData.sh
@@ -0,0 +1,134 @@
+#!/bin/bash
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+if [ "$(basename $PWD)" != "perftest" ]; then
+  echo "Please execute scripts from directory 'perftest'"
+  exit 1
+fi
+
+# this sets the dot as the separating character in floating point numbers ie. their string representation
+# this avoids an error where bc outputs results dot-separated but printf may expect floats comma-separated if the system default says so
+export LC_NUMERIC="en_US.UTF-8"
+
+CMD=$1
+DATADIR=$2/mnist
+MAXMEM=$3
+
+FORMAT="csv" # can be csv, mm, text, binary
+
+echo "-- Generating MNIST data." >>results/times.txt
+#make sure whole MNIST is available
+../datagen/getMNISTDataset.sh ${DATADIR}
+
+mnist_train_filename="mnist_train.csv"
+mnist_test_filename="mnist_test.csv"
+
+max_size_ordinal=4
+min_num_examples_train=12000
+max_num_examples_train=60000
+span_num_examples_train=$(echo "${max_num_examples_train} - ${min_num_examples_train}" | bc)
+min_num_examples_test=2000
+max_num_examples_test=10000
+span_num_examples_test=$(echo "${max_num_examples_test} - ${min_num_examples_test}" | bc)
+#generate XS scenarios (80MB) by producing a subset of MNIST
+if [ $MAXMEM -ge 80 ]; then
+  size_ordinal=0
+  percent_size=$(echo "scale=10; ${size_ordinal} / ${max_size_ordinal}" | bc)
+  # these python calls are here to show what the equivalent computations for the target_num variables do .. only difference is that printf $0.f doesnt round the float value down like floor but rounds it to produce an integer value instead
+  # target_num_train=$(python -c "from math import floor; print(${min_num_examples_train} + floor(${span_num_examples_train} * ${percent_size}))")
+  target_num_train=$(echo "${min_num_examples_train} + $(printf "%.0f" "$(echo "${span_num_examples_train} * ${percent_size}" | bc)")" | bc)
+  # target_num_test=$(python -c "from math import floor; print(${min_num_examples_test} + floor(${span_num_examples_test} * ${percent_size}))")
+  target_num_test=$(echo "${min_num_examples_test} + $(printf "%.0f" "$(echo "${span_num_examples_test} * ${percent_size}" | bc)")" | bc)
+  ${CMD} -f ../datagen/extractMNISTData.dml --nvargs \
+    mnist_train=${DATADIR}/${mnist_train_filename} \
+    mnist_test=${DATADIR}/${mnist_test_filename} \
+    out_train=${DATADIR}/mnist_${target_num_train}_train \
+    out_test=${DATADIR}/mnist_${target_num_train}_test \
+    num_train=${target_num_train} \
+    num_test=${target_num_test} \
+    fmt=${FORMAT} &
+fi
+
+#generate S scenarios (800MB)
+if [ $MAXMEM -ge 800 ]; then
+  size_ordinal=1
+  percent_size=$(echo "scale=10; ${size_ordinal} / ${max_size_ordinal}" | bc)
+  target_num_train=$(echo "${min_num_examples_train} + $(printf "%.0f" "$(echo "${span_num_examples_train} * ${percent_size}" | bc)")" | bc)
+  target_num_test=$(echo "${min_num_examples_test} + $(printf "%.0f" "$(echo "${span_num_examples_test} * ${percent_size}" | bc)")" | bc)
+  ${CMD} -f ../datagen/extractMNISTData.dml --nvargs \
+    mnist_train=${DATADIR}/${mnist_train_filename} \
+    mnist_test=${DATADIR}/${mnist_test_filename} \
+    out_train=${DATADIR}/mnist_${target_num_train}_train \
+    out_test=${DATADIR}/mnist_${target_num_train}_test \
+    num_train=${target_num_train} \
+    num_test=${target_num_test} \
+    fmt=${FORMAT} &
+fi
+
+#generate M scenarios (8GB)
+if [ $MAXMEM -ge 8000 ]; then
+  size_ordinal=2
+  percent_size=$(echo "scale=10; ${size_ordinal} / ${max_size_ordinal}" | bc)
+  target_num_train=$(echo "${min_num_examples_train} + $(printf "%.0f" "$(echo "${span_num_examples_train} * ${percent_size}" | bc)")" | bc)
+  target_num_test=$(echo "${min_num_examples_test} + $(printf "%.0f" "$(echo "${span_num_examples_test} * ${percent_size}" | bc)")" | bc)
+  ${CMD} -f ../datagen/extractMNISTData.dml --nvargs \
+    mnist_train=${DATADIR}/${mnist_train_filename} \
+    mnist_test=${DATADIR}/${mnist_test_filename} \
+    out_train=${DATADIR}/mnist_${target_num_train}_train \
+    out_test=${DATADIR}/mnist_${target_num_train}_test \
+    num_train=${target_num_train} \
+    num_test=${target_num_test} \
+    fmt=${FORMAT} &
+fi
+
+#generate L scenarios (80GB)
+if [ $MAXMEM -ge 80000 ]; then
+  size_ordinal=3
+  percent_size=$(echo "scale=10; ${size_ordinal} / ${max_size_ordinal}" | bc)
+  target_num_train=$(echo "${min_num_examples_train} + $(printf "%.0f" "$(echo "${span_num_examples_train} * ${percent_size}" | bc)")" | bc)
+  target_num_test=$(echo "${min_num_examples_test} + $(printf "%.0f" "$(echo "${span_num_examples_test} * ${percent_size}" | bc)")" | bc)
+  ${CMD} -f ../datagen/extractMNISTData.dml --nvargs \
+    mnist_train=${DATADIR}/${mnist_train_filename} \
+    mnist_test=${DATADIR}/${mnist_test_filename} \
+    out_train=${DATADIR}/mnist_${target_num_train}_train \
+    out_test=${DATADIR}/mnist_${target_num_train}_test \
+    num_train=${target_num_train} \
+    num_test=${target_num_test} \
+    fmt=${FORMAT} &
+fi
+
+#generate XL scenarios (800GB)
+if [ $MAXMEM -ge 800000 ]; then
+  size_ordinal=4
+  percent_size=$(echo "scale=10; ${size_ordinal} / ${max_size_ordinal}" | bc)
+  target_num_train=$(echo "${min_num_examples_train} + $(printf "%.0f" "$(echo "${span_num_examples_train} * ${percent_size}" | bc)")" | bc)
+  target_num_test=$(echo "${min_num_examples_test} + $(printf "%.0f" "$(echo "${span_num_examples_test} * ${percent_size}" | bc)")" | bc)
+  ${CMD} -f ../datagen/extractMNISTData.dml --nvargs \
+    mnist_train=${DATADIR}/${mnist_train_filename} \
+    mnist_test=${DATADIR}/${mnist_test_filename} \
+    out_train=${DATADIR}/mnist_${target_num_train}_train \
+    out_test=${DATADIR}/mnist_${target_num_train}_test \
+    num_train=${target_num_train} \
+    num_test=${target_num_test} \
+    fmt=${FORMAT} &
+fi
+
+wait