Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
69 changes: 69 additions & 0 deletions scripts/datagen/extractMNISTData.dml
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
#-------------------------------------------------------------
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
#
#-------------------------------------------------------------

#
# Extracts the leading rows of the MNIST training and test sets into smaller datasets
#
# INPUT PARAMETERS:
# ----------------------------------------------------------------------------
# NAME TYPE DEFAULT MEANING
# ----------------------------------------------------------------------------
# mnist_train String --- Path of the full MNIST training data to read
# mnist_test String --- Path of the full MNIST test data to read
# out_train String --- Output path of the extracted training subset
# out_test String --- Output path of the extracted test subset
# num_train Int 60000 Number of training rows to keep (capped at the rows available)
# num_test Int 10000 Number of test rows to keep (capped at the rows available)
# fmt Format specifier csv Format of output data
# ----------------------------------------------------------------------------
#
# Example:
# hadoop jar SystemDS.jar -f extractMNISTData.dml -nvargs mnist_train=mnist_train.csv mnist_test=mnist_test.csv out_train=mnist_12000_train out_test=mnist_12000_test num_train=12000 num_test=2000

print ("BEGIN MNIST EXTRACTOR SCRIPT");

# Output format and requested subset sizes (defaults: 60000 training rows, 10000 test rows).
out_format = ifdef ($fmt, "csv");
want_train = ifdef ($num_train, 60000);
want_test = ifdef ($num_test, 10000);

# Input and output locations are mandatory named arguments.
path_in_train = $mnist_train;
path_in_test = $mnist_test;
path_out_train = $out_train;
path_out_test = $out_test;

# Load the complete source datasets.
full_train = read(path_in_train);
full_test = read(path_in_test);

# Never request more rows than the inputs actually contain.
rows_train = min(want_train, nrow(full_train));
rows_test = min(want_test, nrow(full_test));

# Take the leading rows as-is; no shuffling is applied.
subset_train = full_train[1:rows_train,];
subset_test = full_test[1:rows_test,];


print ("Writing out the resulting dataset...");

write (subset_train, path_out_train, format=out_format);
write (subset_test, path_out_test, format=out_format);

print ("DONE: MNIST EXTRACTOR SCRIPT");

84 changes: 84 additions & 0 deletions scripts/datagen/genRandData4NCF.dml
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
#-------------------------------------------------------------
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
#
#-------------------------------------------------------------

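#
# Generates random dummy data (one-hot encoded users and items plus 0/1 targets)
# to test NCF (neural collaborative filtering) training
#
# INPUT PARAMETERS:
# ----------------------------------------------------------------------------
# NAME TYPE DEFAULT MEANING
# ----------------------------------------------------------------------------
# users_train String users_train Output path of the training user matrix
# items_train String items_train Output path of the training item matrix
# targets_train String targets_train Output path of the training targets
# users_val String users_val Output path of the validation user matrix
# items_val String items_val Output path of the validation item matrix
# targets_val String targets_val Output path of the validation targets
# ktrain Int 1000 Number of training samples
# kval Int 100 Number of validation samples
# nitems Int 50 Number of items
# nusers Int 60 Number of users
# fmt Format specifier csv Format of output data
# ----------------------------------------------------------------------------
#
# Example:
# hadoop jar SystemDS.jar -f genRandData4NCF.dml -nvargs ktrain=1000 kval=100 nitems=50 nusers=60 fmt=csv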

print ("BEGIN NCF GENERATOR SCRIPT");

# Output locations; each falls back to a file named after its content.
path_users_train = ifdef ($users_train, "users_train");
path_items_train = ifdef ($items_train, "items_train");
path_targets_train = ifdef ($targets_train, "targets_train");
path_users_val = ifdef ($users_val, "users_val");
path_items_val = ifdef ($items_val, "items_val");
path_targets_val = ifdef ($targets_val, "targets_val");
out_format = ifdef ($fmt, "csv");

# Problem dimensions
n_train = ifdef($ktrain, 1000); # training samples
n_val = ifdef($kval, 100); # validation samples
n_items = ifdef($nitems, 50); # distinct items
n_users = ifdef($nusers, 60); # distinct users

# Targets: one rounded random draw per sample.
# NOTE(review): rand is called without min/max here; presumably that yields
# values in [0, 1] and therefore 0/1 labels - confirm against the rand defaults.
y_train = round(rand(rows=n_train, cols=1));
y_val = round(rand(rows=n_val, cols=1));

# Item/user ids: rounded random draws between 1 and the respective count,
# expanded into one-hot rows right away.
I_train = toOneHot(round(rand(rows=n_train, cols=1, min=1, max=n_items)), n_items);
U_train = toOneHot(round(rand(rows=n_train, cols=1, min=1, max=n_users)), n_users);
I_val = toOneHot(round(rand(rows=n_val, cols=1, min=1, max=n_items)), n_items);
U_val = toOneHot(round(rand(rows=n_val, cols=1, min=1, max=n_users)), n_users);


print ("Writing out the resulting dataset...");

write (U_train, path_users_train, format=out_format);
write (I_train, path_items_train, format=out_format);
write (y_train, path_targets_train, format=out_format);
write (U_val, path_users_val, format=out_format);
write (I_val, path_items_val, format=out_format);
write (y_val, path_targets_val, format=out_format);

print ("DONE: NCF GENERATOR SCRIPT");

66 changes: 66 additions & 0 deletions scripts/datagen/getMNISTDataset.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
#!/bin/bash
#-------------------------------------------------------------
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
#
#-------------------------------------------------------------

# Downloads the MNIST CSV archives into a target directory, unpacks them and
# writes the metadata (.mtd) files that accompany the extracted CSV files.
#
# Usage: getMNISTDataset.sh [TARGET_DIR]
#   TARGET_DIR  directory to place the data in (default: current directory);
#               created, including missing parents, when it does not exist.
# Exits non-zero when the directory, a download or an extraction fails.

BASE=${1:-$PWD}
RET=$PWD
# -p: the target may be nested below a directory that does not exist yet
mkdir -p "$BASE" || exit 1
cd "$BASE" || exit 1

MNIST_URL="https://github.com/phoebetronic/mnist/raw/main"

# Downloads archive $1 from MNIST_URL unless it is already present.
# The data goes to a ".part" file first, so an interrupted download is never
# mistaken for a complete archive on the next run.
fetch_archive() {
  local archive=$1
  if [ -f "${archive}" ]; then
    return 0
  fi
  # NOTE(review): --no-check-certificate disables TLS certificate validation;
  # kept from the original for compatibility, consider dropping it.
  if ! wget --no-check-certificate -O "${archive}.part" "${MNIST_URL}/${archive}"; then
    rm -f "${archive}.part"
    echo "Download of ${archive} failed" >&2
    return 1
  fi
  mv "${archive}.part" "${archive}"
}

# Unpacks archive $1 and verifies that the expected CSV file $2 exists afterwards.
extract_archive() {
  unzip -u "$1"
  if [ ! -f "$2" ]; then
    echo "Extraction of $1 did not produce $2" >&2
    return 1
  fi
}

# Writes the metadata file $1 describing a headerless CSV matrix with $2 rows;
# $3 is stored as the author field.
# NOTE(review): "nnz": 0 is carried over unchanged from the original metadata;
# the actual number of non-zeros is not computed here - confirm the reader ignores it.
write_metadata() {
  cat > "$1" <<EOF
{
"data_type": "matrix",
"value_type": "double",
"rows": $2,
"cols": 785,
"nnz": 0,
"format": "csv",
"author": "$3",
"header": false,
"sep": ",",
"created": "2023-06-26 18:35:22 CEST"
}
EOF
}

echo "Downloading"
fetch_archive "mnist_train.csv.zip" || exit 1
fetch_archive "mnist_test.csv.zip" || exit 1

echo "Unzipping"
extract_archive "mnist_train.csv.zip" "mnist_train.csv" || exit 1
extract_archive "mnist_test.csv.zip" "mnist_test.csv" || exit 1

# have to create metadata for these external csv files
write_metadata "mnist_train.csv.mtd" 60000 "anon"
write_metadata "mnist_test.csv.mtd" 10000 "nobody"

cd "$RET" || exit 1
echo "Done"
8 changes: 4 additions & 4 deletions scripts/nn/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -55,8 +55,8 @@ iters = 1024 / batch_size
for (e in 1:epochs) {
for(i in 1:iters) {
# Get next batch
X_batch = X[i:i+batch_size-1,]
y_batch = y[i:i+batch_size-1,]
X_batch = X[(i-1)*batch_size+1:i*batch_size,]
y_batch = Y[(i-1)*batch_size+1:i*batch_size,]

# Compute forward pass
out1 = affine::forward(X_batch, W1, b1)
Expand Down Expand Up @@ -131,8 +131,8 @@ iters = 1024 / batch_size
for (e in 1:epochs) {
for(i in 1:iters) {
# Get next batch
X_batch = X[i:i+batch_size-1,]
y_batch = y[i:i+batch_size-1,]
X_batch = X[(i-1)*batch_size+1:i*batch_size,]
y_batch = Y[(i-1)*batch_size+1:i*batch_size,]

# Compute forward pass
## layer 1:
Expand Down
2 changes: 1 addition & 1 deletion scripts/nn/examples/ncf-dummy-data.dml
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
#-------------------------------------------------------------

# Imports
source("staging/NCF.dml") as NCF
source("../../staging/NCF.dml") as NCF

K_train = 1000; # number of training samples
K_val = 100; # number of validation samples
Expand Down
134 changes: 134 additions & 0 deletions scripts/perftest/datagen/genMNISTData.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,134 @@
#!/bin/bash
#-------------------------------------------------------------
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
#
#-------------------------------------------------------------
if [ "$(basename "$PWD")" != "perftest" ]; then
  echo "Please execute scripts from directory 'perftest'"
  exit 1
fi

# Generates MNIST subsets of increasing size for the performance tests.
#
# Usage: genMNISTData.sh <CMD> <DATA_ROOT> <MAXMEM>
#   CMD        command used to run DML scripts (expanded unquoted on purpose,
#              so it may carry additional words)
#   DATA_ROOT  base data directory; all files are placed in <DATA_ROOT>/mnist
#   MAXMEM     non-negative integer; a scenario is generated when MAXMEM is at
#              least the scenario's threshold
# Exits non-zero on invalid arguments, a failed download or a failed extraction job.
if [ "$#" -lt 3 ] || [ -z "$1" ] || [ -z "$2" ]; then
  echo "Usage: $0 <cmd> <data_root> <maxmem>" >&2
  exit 1
fi

# Use the dot as decimal separator in the string representation of floats:
# bc prints dot-separated results, while printf parses floats according to the
# numeric locale. The "C" locale is available on every system.
export LC_NUMERIC=C

CMD=$1
DATADIR=$2/mnist
MAXMEM=$3

case "${MAXMEM}" in
  '' | *[!0-9]*)
    echo "MAXMEM must be a non-negative integer, got '${MAXMEM}'" >&2
    exit 1
    ;;
esac

FORMAT="csv" # can be csv, mm, text, binary

mkdir -p results
echo "-- Generating MNIST data." >>results/times.txt
# make sure whole MNIST is available
if ! ../datagen/getMNISTDataset.sh "${DATADIR}"; then
  echo "Could not obtain the MNIST dataset in ${DATADIR}" >&2
  exit 1
fi

mnist_train_filename="mnist_train.csv"
mnist_test_filename="mnist_test.csv"

# MAXMEM thresholds of the scenarios XS (80MB), S (800MB), M (8GB), L (80GB)
# and XL (800GB); the array index is the scenario's size ordinal.
size_thresholds=(80 800 8000 80000 800000)
max_size_ordinal=$((${#size_thresholds[@]} - 1))

min_num_examples_train=12000
max_num_examples_train=60000
span_num_examples_train=$((max_num_examples_train - min_num_examples_train))
min_num_examples_test=2000
max_num_examples_test=10000
span_num_examples_test=$((max_num_examples_test - min_num_examples_test))

# Prints min + round(span * ordinal / max_size_ordinal).
#   $1 minimum count, $2 span (maximum - minimum), $3 size ordinal
# printf "%.0f" rounds to an integer instead of flooring the float value.
scaled_count() {
  local fraction offset
  fraction=$(echo "scale=10; $3 / ${max_size_ordinal}" | bc)
  offset=$(printf "%.0f" "$(echo "$2 * ${fraction}" | bc)")
  echo $(($1 + offset))
}

# Runs the extraction script for the scenario with size ordinal $1.
extract_subset() {
  local target_num_train target_num_test
  target_num_train=$(scaled_count "${min_num_examples_train}" "${span_num_examples_train}" "$1")
  target_num_test=$(scaled_count "${min_num_examples_test}" "${span_num_examples_test}" "$1")
  # NOTE(review): the test output is named after the training count, exactly as
  # before; presumably consumers locate both files by that one number - confirm.
  ${CMD} -f ../datagen/extractMNISTData.dml --nvargs \
    "mnist_train=${DATADIR}/${mnist_train_filename}" \
    "mnist_test=${DATADIR}/${mnist_test_filename}" \
    "out_train=${DATADIR}/mnist_${target_num_train}_train" \
    "out_test=${DATADIR}/mnist_${target_num_train}_test" \
    "num_train=${target_num_train}" \
    "num_test=${target_num_test}" \
    "fmt=${FORMAT}"
}

# Launch every scenario that fits into MAXMEM as a background job.
job_pids=()
for size_ordinal in "${!size_thresholds[@]}"; do
  if [ "${MAXMEM}" -ge "${size_thresholds[size_ordinal]}" ]; then
    extract_subset "${size_ordinal}" &
    job_pids+=("$!")
  fi
done

# Wait for each job individually; a bare "wait" would discard their exit codes.
status=0
for pid in "${job_pids[@]}"; do
  if ! wait "${pid}"; then
    echo "MNIST extraction job ${pid} failed" >&2
    status=1
  fi
done
exit "${status}"
Loading