From f5d4f22bb2b9e30079cc69bbd2fa522844cbf088 Mon Sep 17 00:00:00 2001 From: maxrankl Date: Sun, 29 Jun 2025 23:46:12 +0200 Subject: [PATCH 1/2] Current status, unfortunately, it does not insert the values correctly in raGroupby_exp1.dml --- .gitignore | 1 + scripts/builtin/raGroupby_exp1.dml | 171 +++++++++++++++++++++++++++++ testing/groupby_new_time.dml | 9 ++ testing/groupby_old_time.dml | 21 ++++ testing/testing.dml | 51 +++++++++ 5 files changed, 253 insertions(+) create mode 100644 scripts/builtin/raGroupby_exp1.dml create mode 100644 testing/groupby_new_time.dml create mode 100644 testing/groupby_old_time.dml create mode 100644 testing/testing.dml diff --git a/.gitignore b/.gitignore index f3c28571bdf..b2210ba22c1 100644 --- a/.gitignore +++ b/.gitignore @@ -150,3 +150,4 @@ venv/* # resource optimization scripts/resource/output *.pem +ADDED_Testing/hello.dml diff --git a/scripts/builtin/raGroupby_exp1.dml b/scripts/builtin/raGroupby_exp1.dml new file mode 100644 index 00000000000..7f5fef1815a --- /dev/null +++ b/scripts/builtin/raGroupby_exp1.dml @@ -0,0 +1,171 @@ +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+# +#------------------------------------------------------------- + +# This raGroupby-function takes a matrix dataset as input from where it performs +# relational operations : groupby +# +# INPUT: +# ------------------------------------------------------------------------------ +# X Matrix of input data [shape: N x M] +# col Integer indicating the column index to execute groupby command +# method Groupby implementation method (nested-loop, permutation-matrix) +# ------------------------------------------------------------------------------ +# +# OUTPUT: +# ------------------------------------------------------------------------------ +# Y Matrix of selected data [shape N' x M] with N' <= N +# ------------------------------------------------------------------------------ + +m_raGroupby_new = function (Matrix[Double] X, Integer col, String method) + return (Matrix[Double] Y) +{ + if (method == "nested-loop") { + # Extract and sort unique values from the specified column (1-based index) + uniqueValues = unique(X[, col]) + order_uniqueValues = order(target = uniqueValues, by = 1); + + # Calculate the number of groups + numGroups = nrow(uniqueValues) + + # Determine the maximum number of rows in any group + maxRowsInGroup = max(table(X[,col],1)); + + # Define a zero matrix to put the group data into + Y = matrix(0,numGroups,maxRowsInGroup*(ncol(X)-1)+1) + + # Put the uniqueValues into the first column of Y as group_id + Y[,1] = uniqueValues + + # create matrix to store the amount of rows for each group + rows_per_group = matrix(0, numGroups, 1) + + # order the initial matrix + #Xordered = order(target = X[,col], by = col) + Xordered = order(target = X, by = col, decreasing = FALSE, index.return = FALSE) + + # the amount of unique groups that are remaining + restingGroups = nrow(uniqueValues) + + # intial group + Ypos = 1 + currentGroup = as.scalar(Y[Ypos,1]) + + # intial positon in the row final matrix + YrowStart = 1 + + #loop over the initial matrix + 
while(restingGroups > 0){ + + current_group = as.scalar(Y[Ypos,1]) + + #amount of rows that need to be copied + amountRows = 0 + + i = YrowStart + + GROUP = 1 + + while(GROUP > 0){ + if(i > nrow(Xordered)){ + GROUP = 0 + } + else if(as.scalar(Xordered[i, col]) == currentGroup){ + amountRows = amountRows + 1 + i = i + 1 + } + else{ + GROUP = 0 + } + + } + + # row position in the final matrix + restingGroups = restingGroups - 1 + + if (amountRows > 0){ + # copy the values into the final matrix + YrowEnd = YrowStart + amountRows - 1 + newMatrix = cbind(Xordered[YrowStart:YrowEnd, 1:(col-1)], Xordered[YrowStart:YrowEnd, (col+1):ncol(X)]) + #flatten the new row + newRow = matrix(newMatrix, rows = 1, cols = nrow(newMatrix) * ncol(newMatrix)) + newRowColumns = amountRows * (ncol(X)-1) + Y[Ypos, 2: (newRowColumns + 1)] = newRow + } + + # continue with the next group + Ypos = Ypos + 1 + YrowStart = YrowStart + amountRows + + } + + } + else if (method == "permutation-matrix") { + # Extract the grouping column and create unique groups + key = X[,col] + key_unique = unique(X[, col]) + numGroups = nrow(key_unique) + + # Matrix for comparison + key_compare = key_unique %*% matrix(1, rows=1, cols=nrow(X)) + key_matrix = matrix(1, rows=nrow(key_unique), cols=1) %*% t(key) + + # Find group index + groupIndex = rowIndexMax(t(key_compare == key_matrix)) + + # Determine the maximum number of rows in any group + maxRowsInGroup = max(table(X[,col],1)) + totalCells = (maxRowsInGroup) * (ncol(X)-1) +1 + + # Create permutation matrix P copy relevant tuples with a single matrix multiplication + P = matrix(0, rows=nrow(X), cols=numGroups * maxRowsInGroup) + # Create offsets to store the first column of each group + offsets = matrix(seq(0, (numGroups-1)*maxRowsInGroup, maxRowsInGroup), rows=numGroups, cols=1) + + # Create row and column index for the permutation matrix + rowIndex = seq(1, nrow(X)) + indexWithInGroups = cumsum(t(table(groupIndex, seq(1, nrow(X)), numGroups, nrow(X)))) + 
selectedMatrix = table(seq(1, nrow(indexWithInGroups)), groupIndex) + colIndex = groupIndex * maxRowsInGroup - maxRowsInGroup + rowSums(indexWithInGroups * selectedMatrix) + + # Set values in P + P = table(seq(1, nrow(X)), colIndex) + + # Perform matrix multiplication + Y_temp = t(P) %*% X + + # Remove the selected column from Y_temp + if( col == 1 ) { + Y_temp_reduce = Y_temp[, col+1:ncol(Y_temp)] + } + else if( col == ncol(X) ) { + Y_temp_reduce = Y_temp[, 1:col-1] + } + else{ + Y_temp_reduce = cbind(Y_temp[, 1:col-1],Y_temp[, col+1:ncol(Y_temp)]) + } + + # Set value of final output + Y = matrix(0, rows=numGroups, cols=totalCells) + Y[,1] = key_unique + Y[,2:ncol(Y)] = matrix(Y_temp_reduce, rows=numGroups, cols=totalCells-1) + } +} + diff --git a/testing/groupby_new_time.dml b/testing/groupby_new_time.dml new file mode 100644 index 00000000000..fba92564b82 --- /dev/null +++ b/testing/groupby_new_time.dml @@ -0,0 +1,9 @@ +# create an input matrix +input_matrix = round(rand(rows = 1000, cols = 10, min = 1)) + +# load functions +source("scripts/builtin/raGroupby_exp1.dml") as ra_new + +# use the new ra_groupby function +new_func = ra_new::m_raGroupby_new(input_matrix, 2, "nested-loop") + diff --git a/testing/groupby_old_time.dml b/testing/groupby_old_time.dml new file mode 100644 index 00000000000..015a5797253 --- /dev/null +++ b/testing/groupby_old_time.dml @@ -0,0 +1,21 @@ +# create an input matrix +input_matrix = round(rand(rows = 1000, cols = 10, min = 1)) +print("The amount of rows in the input matrix") +print(1000*10) + +# load functions +source("scripts/builtin/raGroupby.dml") as ra_old + +# use the initial ra_groupby function +old_func = ra_old::m_raGroupby(input_matrix, 2, "nested-loop") + +print(input_matrix[1:10, 1:ncol(input_matrix)]) + +print("The amount of rows in the final matrix") +print(nrow(old_func)) + +print("The amount of rows in the final matrix") +print(ncol(old_func)) + +print("The amount of rows * columns in the final matrix") 
+print(ncol(old_func)*nrow(old_func)) \ No newline at end of file diff --git a/testing/testing.dml b/testing/testing.dml new file mode 100644 index 00000000000..39ac856b2f2 --- /dev/null +++ b/testing/testing.dml @@ -0,0 +1,51 @@ +# create an input matrix +input_matrix = round(rand(rows = 10, cols = 10, min = 1, max = 5)) + +# load functions +source("scripts/builtin/raGroupby.dml") as ra_old +source("scripts/builtin/raGroupby_exp1.dml") as ra_new + +# use the initial ra_groupby function +old_func = ra_old::m_raGroupby(input_matrix, 2, "nested-loop") + +# use the new ra_groupby function +new_func = ra_new::m_raGroupby_new(input_matrix, 2, "nested-loop") + +# check if the new function still operates correctly +for(i in 1:nrow(old_func)){ + for(j in 1:ncol(old_func)){ + old_val = as.scalar(old_func[i,j]) + new_val = as.scalar(new_func[i,j]) + if (old_val != new_val){ + print("The values are not identical") + print("The index is i x j") + print(i) + print(j) + }else{ + if(j == 1){ + print("The first value was correct") + } + if(j == ncol(old_func)){ + print("The last value was correct") + } + } + } +} + +print("The amount of rows in the old matrix") +print(nrow(old_func)) + +print("The amount of col in the old matrix") +print(ncol(old_func)) + +print("The amount of rows * columns in the old matrix") +print(ncol(old_func)*nrow(old_func)) + +print("The amount of rows in the new matrix") +print(nrow(new_func)) + +print("The amount of col in the new matrix") +print(ncol(new_func)) + +print("The amount of rows * columns in the new matrix") +print(ncol(new_func)*nrow(new_func)) From a9617597c9d22d06d157f94a86c9d01206ac76d5 Mon Sep 17 00:00:00 2001 From: maxrankl Date: Mon, 30 Jun 2025 15:11:59 +0200 Subject: [PATCH 2/2] Found the error, should beat the performance of nested loop, sorry for the last pull request --- scripts/builtin/raGroupby_exp1.dml | 26 ++++++++++++++++++-------- testing/groupby_new_time.dml | 2 +- testing/groupby_old_time.dml | 15 +-------------- 
testing/testing.dml | 2 +- 4 files changed, 21 insertions(+), 24 deletions(-) diff --git a/scripts/builtin/raGroupby_exp1.dml b/scripts/builtin/raGroupby_exp1.dml index 7f5fef1815a..3e0b4abec4d 100644 --- a/scripts/builtin/raGroupby_exp1.dml +++ b/scripts/builtin/raGroupby_exp1.dml @@ -58,29 +58,29 @@ m_raGroupby_new = function (Matrix[Double] X, Integer col, String method) rows_per_group = matrix(0, numGroups, 1) # order the initial matrix - #Xordered = order(target = X[,col], by = col) Xordered = order(target = X, by = col, decreasing = FALSE, index.return = FALSE) + # create a Matrix with the ordered groups + Yordered = order(target = Y, by = 1, decreasing = FALSE, index.return = FALSE) + # the amount of unique groups that are remaining restingGroups = nrow(uniqueValues) # intial group Ypos = 1 - currentGroup = as.scalar(Y[Ypos,1]) # intial positon in the row final matrix YrowStart = 1 + i = 1 #loop over the initial matrix while(restingGroups > 0){ - current_group = as.scalar(Y[Ypos,1]) + currentGroup = as.scalar(Yordered[Ypos,1]) #amount of rows that need to be copied amountRows = 0 - i = YrowStart - GROUP = 1 while(GROUP > 0){ @@ -94,28 +94,38 @@ m_raGroupby_new = function (Matrix[Double] X, Integer col, String method) else{ GROUP = 0 } - } # row position in the final matrix restingGroups = restingGroups - 1 if (amountRows > 0){ + # copy the values into the final matrix YrowEnd = YrowStart + amountRows - 1 newMatrix = cbind(Xordered[YrowStart:YrowEnd, 1:(col-1)], Xordered[YrowStart:YrowEnd, (col+1):ncol(X)]) + #flatten the new row newRow = matrix(newMatrix, rows = 1, cols = nrow(newMatrix) * ncol(newMatrix)) newRowColumns = amountRows * (ncol(X)-1) - Y[Ypos, 2: (newRowColumns + 1)] = newRow - } + # sort the ordered Y matrix back to the initial order + rowFinder = 1 + while(currentGroup != as.scalar(Y[rowFinder, 1])){ + rowFinder = rowFinder + 1 + } + Y[rowFinder, 2: (newRowColumns + 1)] = newRow + } # continue with the next group Ypos = Ypos + 1 YrowStart = 
YrowStart + amountRows } + + + + } else if (method == "permutation-matrix") { # Extract the grouping column and create unique groups diff --git a/testing/groupby_new_time.dml b/testing/groupby_new_time.dml index fba92564b82..011d7ab048a 100644 --- a/testing/groupby_new_time.dml +++ b/testing/groupby_new_time.dml @@ -1,5 +1,5 @@ # create an input matrix -input_matrix = round(rand(rows = 1000, cols = 10, min = 1)) +input_matrix = round(rand(rows = 10000, cols = 5, min = 1, max = 5)) # load functions source("scripts/builtin/raGroupby_exp1.dml") as ra_new diff --git a/testing/groupby_old_time.dml b/testing/groupby_old_time.dml index 015a5797253..25ae4bfb416 100644 --- a/testing/groupby_old_time.dml +++ b/testing/groupby_old_time.dml @@ -1,21 +1,8 @@ # create an input matrix -input_matrix = round(rand(rows = 1000, cols = 10, min = 1)) -print("The amount of rows in the input matrix") -print(1000*10) +input_matrix = round(rand(rows = 10000, cols = 5, min = 1, max = 5)) # load functions source("scripts/builtin/raGroupby.dml") as ra_old # use the initial ra_groupby function old_func = ra_old::m_raGroupby(input_matrix, 2, "nested-loop") - -print(input_matrix[1:10, 1:ncol(input_matrix)]) - -print("The amount of rows in the final matrix") -print(nrow(old_func)) - -print("The amount of rows in the final matrix") -print(ncol(old_func)) - -print("The amount of rows * columns in the final matrix") -print(ncol(old_func)*nrow(old_func)) \ No newline at end of file diff --git a/testing/testing.dml b/testing/testing.dml index 39ac856b2f2..d19951f095d 100644 --- a/testing/testing.dml +++ b/testing/testing.dml @@ -1,5 +1,5 @@ # create an input matrix -input_matrix = round(rand(rows = 10, cols = 10, min = 1, max = 5)) +input_matrix = round(rand(rows = 5, cols = 5, min = 1, max = 5)) # load functions source("scripts/builtin/raGroupby.dml") as ra_old