From f5d4f22bb2b9e30079cc69bbd2fa522844cbf088 Mon Sep 17 00:00:00 2001 From: maxrankl Date: Sun, 29 Jun 2025 23:46:12 +0200 Subject: [PATCH 01/25] Current status, unfortunately, it does not insert the values correctly in raGroupby_exp1.dml --- .gitignore | 1 + scripts/builtin/raGroupby_exp1.dml | 171 +++++++++++++++++++++++++++++ testing/groupby_new_time.dml | 9 ++ testing/groupby_old_time.dml | 21 ++++ testing/testing.dml | 51 +++++++++ 5 files changed, 253 insertions(+) create mode 100644 scripts/builtin/raGroupby_exp1.dml create mode 100644 testing/groupby_new_time.dml create mode 100644 testing/groupby_old_time.dml create mode 100644 testing/testing.dml diff --git a/.gitignore b/.gitignore index f3c28571bdf..b2210ba22c1 100644 --- a/.gitignore +++ b/.gitignore @@ -150,3 +150,4 @@ venv/* # resource optimization scripts/resource/output *.pem +ADDED_Testing/hello.dml diff --git a/scripts/builtin/raGroupby_exp1.dml b/scripts/builtin/raGroupby_exp1.dml new file mode 100644 index 00000000000..7f5fef1815a --- /dev/null +++ b/scripts/builtin/raGroupby_exp1.dml @@ -0,0 +1,171 @@ +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+# +#------------------------------------------------------------- + +# This raGroupby-function takes a matrix dataset as input from where it performs +# relational operations : groupby +# +# INPUT: +# ------------------------------------------------------------------------------ +# X Matrix of input data [shape: N x M] +# col Integer indicating the column index to execute grupby command +# method Groupby implemention method (nested-loop, permutation-matrix) +# ------------------------------------------------------------------------------ +# +# OUTPUT: +# ------------------------------------------------------------------------------ +# Y Matrix of selected data [shape N' x M] with N' <= N +# ------------------------------------------------------------------------------ + +m_raGroupby_new = function (Matrix[Double] X, Integer col, String method) + return (Matrix[Double] Y) +{ + if (method == "nested-loop") { + # Extract and sort unique values from the specified column (1-based index) + uniqueValues = unique(X[, col]) + order_uniqueValues = order(target = uniqueValues, by = 1); + + # Calcute the number of groups + numGroups = nrow(uniqueValues) + + # Determine the maximum number of rows in any group + maxRowsInGroup = max(table(X[,col],1)); + + # Define a zero matrix to put the group data into + Y = matrix(0,numGroups,maxRowsInGroup*(ncol(X)-1)+1) + + # Put the ordered uniqueValues into first column of Y as group_id + Y[,1] = uniqueValues + + # create matrix to store the amount of rows for each group + rows_per_group = matrix(0, numGroups, 1) + + # order the initial matrix + #Xordered = order(target = X[,col], by = col) + Xordered = order(target = X, by = col, decreasing = FALSE, index.return = FALSE) + + # the amount of unique groups that are remaining + restingGroups = nrow(uniqueValues) + + # intial group + Ypos = 1 + currentGroup = as.scalar(Y[Ypos,1]) + + # intial positon in the row final matrix + YrowStart = 1 + + #loop over the initial matrix + 
while(restingGroups > 0){ + + current_group = as.scalar(Y[Ypos,1]) + + #amount of rows that need to be copied + amountRows = 0 + + i = YrowStart + + GROUP = 1 + + while(GROUP > 0){ + if(i > nrow(Xordered)){ + GROUP = 0 + } + else if(as.scalar(Xordered[i, col]) == currentGroup){ + amountRows = amountRows + 1 + i = i + 1 + } + else{ + GROUP = 0 + } + + } + + # row position in the final matrix + restingGroups = restingGroups - 1 + + if (amountRows > 0){ + # copy the values into the final matrix + YrowEnd = YrowStart + amountRows - 1 + newMatrix = cbind(Xordered[YrowStart:YrowEnd, 1:(col-1)], Xordered[YrowStart:YrowEnd, (col+1):ncol(X)]) + #flatten the new row + newRow = matrix(newMatrix, rows = 1, cols = nrow(newMatrix) * ncol(newMatrix)) + newRowColumns = amountRows * (ncol(X)-1) + Y[Ypos, 2: (newRowColumns + 1)] = newRow + } + + # continue with the next group + Ypos = Ypos + 1 + YrowStart = YrowStart + amountRows + + } + + } + else if (method == "permutation-matrix") { + # Extract the grouping column and create unique groups + key = X[,col] + key_unique = unique(X[, col]) + numGroups = nrow(key_unique) + + # Matrix for comparison + key_compare = key_unique %*% matrix(1, rows=1, cols=nrow(X)) + key_matrix = matrix(1, rows=nrow(key_unique), cols=1) %*% t(key) + + # Find group index + groupIndex = rowIndexMax(t(key_compare == key_matrix)) + + # Determine the maximum number of rows in any group + maxRowsInGroup = max(table(X[,col],1)) + totalCells = (maxRowsInGroup) * (ncol(X)-1) +1 + + # Create permutation matrix P copy relevant tuples with a single matrix multiplication + P = matrix(0, rows=nrow(X), cols=numGroups * maxRowsInGroup) + # Create offsets to store the first column of each group + offsets = matrix(seq(0, (numGroups-1)*maxRowsInGroup, maxRowsInGroup), rows=numGroups, cols=1) + + # Create row and column index for the permutation matrix + rowIndex = seq(1, nrow(X)) + indexWithInGroups = cumsum(t(table(groupIndex, seq(1, nrow(X)), numGroups, nrow(X)))) + 
selectedMatrix = table(seq(1, nrow(indexWithInGroups)), groupIndex) + colIndex = groupIndex * maxRowsInGroup - maxRowsInGroup + rowSums(indexWithInGroups * selectedMatrix) + + # Set values in P + P = table(seq(1, nrow(X)), colIndex) + + # Perform matrix multiplication + Y_temp = t(P) %*% X + + # Remove the selected column from Y_temp + if( col == 1 ) { + Y_temp_reduce = Y_temp[, col+1:ncol(Y_temp)] + } + else if( col == ncol(X) ) { + Y_temp_reduce = Y_temp[, 1:col-1] + } + else{ + Y_temp_reduce = cbind(Y_temp[, 1:col-1],Y_temp[, col+1:ncol(Y_temp)]) + } + + # Set value of final output + Y = matrix(0, rows=numGroups, cols=totalCells) + Y[,1] = key_unique + Y[,2:ncol(Y)] = matrix(Y_temp_reduce, rows=numGroups, cols=totalCells-1) + } +} + diff --git a/testing/groupby_new_time.dml b/testing/groupby_new_time.dml new file mode 100644 index 00000000000..fba92564b82 --- /dev/null +++ b/testing/groupby_new_time.dml @@ -0,0 +1,9 @@ +# create an input matrix +input_matrix = round(rand(rows = 1000, cols = 10, min = 1)) + +# load functions +source("scripts/builtin/raGroupby_exp1.dml") as ra_new + +# use the new ra_groupby function +new_func = ra_new::m_raGroupby_new(input_matrix, 2, "nested-loop") + diff --git a/testing/groupby_old_time.dml b/testing/groupby_old_time.dml new file mode 100644 index 00000000000..015a5797253 --- /dev/null +++ b/testing/groupby_old_time.dml @@ -0,0 +1,21 @@ +# create an input matrix +input_matrix = round(rand(rows = 1000, cols = 10, min = 1)) +print("The amount of rows in the input matrix") +print(1000*10) + +# load functions +source("scripts/builtin/raGroupby.dml") as ra_old + +# use the initial ra_groupby function +old_func = ra_old::m_raGroupby(input_matrix, 2, "nested-loop") + +print(input_matrix[1:10, 1:ncol(input_matrix)]) + +print("The amount of rows in the final matrix") +print(nrow(old_func)) + +print("The amount of rows in the final matrix") +print(ncol(old_func)) + +print("The amount of rows * columns in the final matrix") 
+print(ncol(old_func)*nrow(old_func)) \ No newline at end of file diff --git a/testing/testing.dml b/testing/testing.dml new file mode 100644 index 00000000000..39ac856b2f2 --- /dev/null +++ b/testing/testing.dml @@ -0,0 +1,51 @@ +# create an input matrix +input_matrix = round(rand(rows = 10, cols = 10, min = 1, max = 5)) + +# load functions +source("scripts/builtin/raGroupby.dml") as ra_old +source("scripts/builtin/raGroupby_exp1.dml") as ra_new + +# use the initial ra_groupby function +old_func = ra_old::m_raGroupby(input_matrix, 2, "nested-loop") + +# use the new ra_groupby function +new_func = ra_new::m_raGroupby_new(input_matrix, 2, "nested-loop") + +# check if the new function still operates correctly +for(i in 1:nrow(old_func)){ + for(j in 1:ncol(old_func)){ + old_val = as.scalar(old_func[i,j]) + new_val = as.scalar(new_func[i,j]) + if (old_val != new_val){ + print("The values are not identical") + print("The index is i x j") + print(i) + print(j) + }else{ + if(j == 1){ + print("The first value was correct") + } + if(j == ncol(old_func)){ + print("The last value was correct") + } + } + } +} + +print("The amount of rows in the old matrix") +print(nrow(old_func)) + +print("The amount of col in the old matrix") +print(ncol(old_func)) + +print("The amount of rows * columns in the old matrix") +print(ncol(old_func)*nrow(old_func)) + +print("The amount of rows in the new matrix") +print(nrow(new_func)) + +print("The amount of col in the new matrix") +print(ncol(new_func)) + +print("The amount of rows * columns in the new matrix") +print(ncol(new_func)*nrow(new_func)) From caaa93d48d85e1acd61b4a7ceedb04aa078f1099 Mon Sep 17 00:00:00 2001 From: maxrankl Date: Sun, 29 Jun 2025 23:51:58 +0200 Subject: [PATCH 02/25] Added the Benchmarking framework (Python) --- Benchmarking | 1 + 1 file changed, 1 insertion(+) create mode 160000 Benchmarking diff --git a/Benchmarking b/Benchmarking new file mode 160000 index 00000000000..10eab1f884f --- /dev/null +++ b/Benchmarking @@ 
-0,0 +1 @@ +Subproject commit 10eab1f884fd23e1bf4452e424dd7fe739c142ec From a9617597c9d22d06d157f94a86c9d01206ac76d5 Mon Sep 17 00:00:00 2001 From: maxrankl Date: Mon, 30 Jun 2025 15:11:59 +0200 Subject: [PATCH 03/25] Found the error, should beat the performance of nested loop, sorry for the last pull request --- scripts/builtin/raGroupby_exp1.dml | 26 ++++++++++++++++++-------- testing/groupby_new_time.dml | 2 +- testing/groupby_old_time.dml | 15 +-------------- testing/testing.dml | 2 +- 4 files changed, 21 insertions(+), 24 deletions(-) diff --git a/scripts/builtin/raGroupby_exp1.dml b/scripts/builtin/raGroupby_exp1.dml index 7f5fef1815a..3e0b4abec4d 100644 --- a/scripts/builtin/raGroupby_exp1.dml +++ b/scripts/builtin/raGroupby_exp1.dml @@ -58,29 +58,29 @@ m_raGroupby_new = function (Matrix[Double] X, Integer col, String method) rows_per_group = matrix(0, numGroups, 1) # order the initial matrix - #Xordered = order(target = X[,col], by = col) Xordered = order(target = X, by = col, decreasing = FALSE, index.return = FALSE) + # create a Matrix with the ordered groups + Yordered = order(target = Y, by = 1, decreasing = FALSE, index.return = FALSE) + # the amount of unique groups that are remaining restingGroups = nrow(uniqueValues) # intial group Ypos = 1 - currentGroup = as.scalar(Y[Ypos,1]) # intial positon in the row final matrix YrowStart = 1 + i = 1 #loop over the initial matrix while(restingGroups > 0){ - current_group = as.scalar(Y[Ypos,1]) + currentGroup = as.scalar(Yordered[Ypos,1]) #amount of rows that need to be copied amountRows = 0 - i = YrowStart - GROUP = 1 while(GROUP > 0){ @@ -94,28 +94,38 @@ m_raGroupby_new = function (Matrix[Double] X, Integer col, String method) else{ GROUP = 0 } - } # row position in the final matrix restingGroups = restingGroups - 1 if (amountRows > 0){ + # copy the values into the final matrix YrowEnd = YrowStart + amountRows - 1 newMatrix = cbind(Xordered[YrowStart:YrowEnd, 1:(col-1)], Xordered[YrowStart:YrowEnd, 
(col+1):ncol(X)]) + #flatten the new row newRow = matrix(newMatrix, rows = 1, cols = nrow(newMatrix) * ncol(newMatrix)) newRowColumns = amountRows * (ncol(X)-1) - Y[Ypos, 2: (newRowColumns + 1)] = newRow - } + # sort the ordered Y matrix back to the initial order + rowFinder = 1 + while(currentGroup != as.scalar(Y[rowFinder, 1])){ + rowFinder = rowFinder + 1 + } + Y[rowFinder, 2: (newRowColumns + 1)] = newRow + } # continue with the next group Ypos = Ypos + 1 YrowStart = YrowStart + amountRows } + + + + } else if (method == "permutation-matrix") { # Extract the grouping column and create unique groups diff --git a/testing/groupby_new_time.dml b/testing/groupby_new_time.dml index fba92564b82..011d7ab048a 100644 --- a/testing/groupby_new_time.dml +++ b/testing/groupby_new_time.dml @@ -1,5 +1,5 @@ # create an input matrix -input_matrix = round(rand(rows = 1000, cols = 10, min = 1)) +input_matrix = round(rand(rows = 10000, cols = 5, min = 1, max = 5)) # load functions source("scripts/builtin/raGroupby_exp1.dml") as ra_new diff --git a/testing/groupby_old_time.dml b/testing/groupby_old_time.dml index 015a5797253..25ae4bfb416 100644 --- a/testing/groupby_old_time.dml +++ b/testing/groupby_old_time.dml @@ -1,21 +1,8 @@ # create an input matrix -input_matrix = round(rand(rows = 1000, cols = 10, min = 1)) -print("The amount of rows in the input matrix") -print(1000*10) +input_matrix = round(rand(rows = 10000, cols = 5, min = 1, max = 5)) # load functions source("scripts/builtin/raGroupby.dml") as ra_old # use the initial ra_groupby function old_func = ra_old::m_raGroupby(input_matrix, 2, "nested-loop") - -print(input_matrix[1:10, 1:ncol(input_matrix)]) - -print("The amount of rows in the final matrix") -print(nrow(old_func)) - -print("The amount of rows in the final matrix") -print(ncol(old_func)) - -print("The amount of rows * columns in the final matrix") -print(ncol(old_func)*nrow(old_func)) \ No newline at end of file diff --git a/testing/testing.dml 
b/testing/testing.dml index 39ac856b2f2..d19951f095d 100644 --- a/testing/testing.dml +++ b/testing/testing.dml @@ -1,5 +1,5 @@ # create an input matrix -input_matrix = round(rand(rows = 10, cols = 10, min = 1, max = 5)) +input_matrix = round(rand(rows = 5, cols = 5, min = 1, max = 5)) # load functions source("scripts/builtin/raGroupby.dml") as ra_old From fed2aa4e703db3f8f0baae14fd8aa361247b2c23 Mon Sep 17 00:00:00 2001 From: maxrankl Date: Mon, 30 Jun 2025 15:14:11 +0200 Subject: [PATCH 04/25] Found the error, should beat the performance of nested loop, sorry for the last pull request --- Benchmarking | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Benchmarking b/Benchmarking index 10eab1f884f..acc4ca7e9f1 160000 --- a/Benchmarking +++ b/Benchmarking @@ -1 +1 @@ -Subproject commit 10eab1f884fd23e1bf4452e424dd7fe739c142ec +Subproject commit acc4ca7e9f1f906a63fb3b19374bd398d8d3fd42 From 7275695212e57aed9776a5f4915c0c3cdb5dfd93 Mon Sep 17 00:00:00 2001 From: maxrankl Date: Fri, 4 Jul 2025 19:36:43 +0200 Subject: [PATCH 05/25] Avoided the additional loop over Y to copy the rows into Y by saving the initial order of Y and then restoring it after copying. In order to copy the values it was necessary to also order X. 
Before a copy matrix was needed, which can be avoided as well by saving and restoring the initial order of X --- scripts/builtin/raGroupby_exp1.dml | 67 ++++++++++++++++-------------- testing/testing.dml | 22 ++++------ 2 files changed, 43 insertions(+), 46 deletions(-) diff --git a/scripts/builtin/raGroupby_exp1.dml b/scripts/builtin/raGroupby_exp1.dml index 3e0b4abec4d..47f3ee7eb0c 100644 --- a/scripts/builtin/raGroupby_exp1.dml +++ b/scripts/builtin/raGroupby_exp1.dml @@ -54,78 +54,83 @@ m_raGroupby_new = function (Matrix[Double] X, Integer col, String method) # Put the ordered uniqueValues into first column of Y as group_id Y[,1] = uniqueValues - # create matrix to store the amount of rows for each group - rows_per_group = matrix(0, numGroups, 1) + # save the initial order of the groups in Y + Yindexes = order(target = Y, by = 1, index.return = TRUE) - # order the initial matrix - Xordered = order(target = X, by = col, decreasing = FALSE, index.return = FALSE) + # order Y by the groups increasing + Y = order(target = Y, by = 1, decreasing = FALSE, index.return = FALSE) - # create a Matrix with the ordered groups - Yordered = order(target = Y, by = 1, decreasing = FALSE, index.return = FALSE) + # order the initial matrix + Xindexes = order(target = X, by = col, index.return = TRUE) + X = order(target = X, by = col, decreasing = FALSE, index.return = FALSE) # the amount of unique groups that are remaining restingGroups = nrow(uniqueValues) - # intial group + # intial group to copy the values into Y Ypos = 1 - # intial positon in the row final matrix + # start positon in the row final matrix YrowStart = 1 i = 1 #loop over the initial matrix while(restingGroups > 0){ - currentGroup = as.scalar(Yordered[Ypos,1]) + currentGroup = as.scalar(Y[Ypos,1]) - #amount of rows that need to be copied + # amount of rows that need to be copied amountRows = 0 + # find the rows for the current group GROUP = 1 - while(GROUP > 0){ - if(i > nrow(Xordered)){ + # break if there are no 
more row left in X + if(i > nrow(X)){ GROUP = 0 } - else if(as.scalar(Xordered[i, col]) == currentGroup){ + # check if the row belongs to the current group + else if(as.scalar(X[i, col]) == currentGroup){ amountRows = amountRows + 1 i = i + 1 } + # break if the row does not belong to the current group else{ GROUP = 0 } } - # row position in the final matrix - restingGroups = restingGroups - 1 - + # copy the values into the final matrix if (amountRows > 0){ - - # copy the values into the final matrix + # create a matrix of the row that should be copied without the group column YrowEnd = YrowStart + amountRows - 1 - newMatrix = cbind(Xordered[YrowStart:YrowEnd, 1:(col-1)], Xordered[YrowStart:YrowEnd, (col+1):ncol(X)]) + newMatrix = cbind(X[YrowStart:YrowEnd, 1:(col-1)], X[YrowStart:YrowEnd, (col+1):ncol(X)]) - #flatten the new row + # flatten the new row newRow = matrix(newMatrix, rows = 1, cols = nrow(newMatrix) * ncol(newMatrix)) newRowColumns = amountRows * (ncol(X)-1) - # sort the ordered Y matrix back to the initial order - rowFinder = 1 - while(currentGroup != as.scalar(Y[rowFinder, 1])){ - rowFinder = rowFinder + 1 - } - Y[rowFinder, 2: (newRowColumns + 1)] = newRow + # add the new row into Y at the current group + Y[Ypos, 2: (newRowColumns + 1)] = newRow } + # continue with the next group Ypos = Ypos + 1 YrowStart = YrowStart + amountRows - + restingGroups = restingGroups - 1 } - - - - + #restore the initial order of X + X = cbind(X, Xindexes) + ncol_X = ncol(X) + X = order(target = X, by= ncol_X) + X = X[, 1:ncol_X-1] + + #restore the initial order of Y + Y = cbind(Y, Yindexes) + ncol_Y = ncol(Y) + Y = order(target = Y, by= ncol_Y) + Y = Y[, 1:ncol_Y-1] } else if (method == "permutation-matrix") { # Extract the grouping column and create unique groups diff --git a/testing/testing.dml b/testing/testing.dml index d19951f095d..2bea74ff08f 100644 --- a/testing/testing.dml +++ b/testing/testing.dml @@ -1,5 +1,5 @@ # create an input matrix -input_matrix = 
round(rand(rows = 5, cols = 5, min = 1, max = 5)) +input_matrix = round(rand(rows = 100, cols = 100, min = 1, max = 50)) # load functions source("scripts/builtin/raGroupby.dml") as ra_old @@ -32,20 +32,12 @@ for(i in 1:nrow(old_func)){ } } -print("The amount of rows in the old matrix") -print(nrow(old_func)) -print("The amount of col in the old matrix") -print(ncol(old_func)) +print("This is the input matrix") +print(toString(input_matrix)) -print("The amount of rows * columns in the old matrix") -print(ncol(old_func)*nrow(old_func)) +print("This is the old func") +print(toString(old_func)) -print("The amount of rows in the new matrix") -print(nrow(new_func)) - -print("The amount of col in the new matrix") -print(ncol(new_func)) - -print("The amount of rows * columns in the new matrix") -print(ncol(new_func)*nrow(new_func)) +print("This is the new func") +print(toString(new_func)) \ No newline at end of file From 80293ce88c698323e05e22b098bf1bd5de414b04 Mon Sep 17 00:00:00 2001 From: maxrankl Date: Sat, 5 Jul 2025 16:20:50 +0200 Subject: [PATCH 06/25] Removed additional files. Copied content into the correct file. Added a switch for the cases where the first or the last column is selected. Now passes all provided tests. 
--- scripts/builtin/raGroupby.dml | 209 +++++++++++++++++++---------- scripts/builtin/raGroupby_exp1.dml | 186 ------------------------- testing/groupby_new_time.dml | 9 -- testing/groupby_old_time.dml | 8 -- testing/testing.dml | 43 ------ 5 files changed, 136 insertions(+), 319 deletions(-) delete mode 100644 scripts/builtin/raGroupby_exp1.dml delete mode 100644 testing/groupby_new_time.dml delete mode 100644 testing/groupby_old_time.dml delete mode 100644 testing/testing.dml diff --git a/scripts/builtin/raGroupby.dml b/scripts/builtin/raGroupby.dml index 7d7035c0ff8..13e08f84789 100644 --- a/scripts/builtin/raGroupby.dml +++ b/scripts/builtin/raGroupby.dml @@ -38,6 +38,7 @@ m_raGroupby = function (Matrix[Double] X, Integer col, String method) return (Matrix[Double] Y) { if (method == "nested-loop") { + # print(toString(X)) # Extract and sort unique values from the specified column (1-based index) uniqueValues = unique(X[, col]) order_uniqueValues = order(target = uniqueValues, by = 1); @@ -52,87 +53,149 @@ m_raGroupby = function (Matrix[Double] X, Integer col, String method) Y = matrix(0,numGroups,maxRowsInGroup*(ncol(X)-1)+1) # Put the ordered uniqueValues into first column of Y as group_id - #Y[,1] = order_uniqueValues Y[,1] = uniqueValues - # Loop for each group - for(i in 1:numGroups){ - index = 0 - - # Iterate each row in matrix X to deal with group data - for ( j in 1:nrow(X) ) { - if ( as.scalar( X[j,col] == uniqueValues[i,1] )) { - # Define the formula of the start and end column position - startCol = index*(ncol(X)-1) +2 - endCol = startCol + (ncol(X)-2) - - if (col == 1) { - # Case when the selected column is the first column - Y[i,startCol:endCol] = X[j,2:ncol(X)] - } - else if (col == ncol(X)) { - # Case when the selected column is the last column - Y[i,startCol:endCol] = X[j,1:(ncol(X)-1)] - } - else { - # General case - newRow = cbind(X[j, 1:(col-1)], X[j, (col+1):ncol(X)]) - Y[i,startCol:endCol] = newRow - } - index = index +1 + # save the initial 
order of the groups in Y + Yindexes = order(target = Y, by = 1, index.return = TRUE) + + # order Y by the groups increasing + Y = order(target = Y, by = 1, decreasing = FALSE, index.return = FALSE) + + # order the initial matrix + Xindexes = order(target = X, by = col, index.return = TRUE) + X = order(target = X, by = col, decreasing = FALSE, index.return = FALSE) + + # the amount of unique groups that are remaining + restingGroups = nrow(uniqueValues) + + # intial group to copy the values into Y + Ypos = 1 + + # start positon in the row final matrix + YrowStart = 1 + i = 1 + + #loop over the initial matrix + while(restingGroups > 0){ + + currentGroup = as.scalar(Y[Ypos,1]) + + # amount of rows that need to be copied + amountRows = 0 + + # find the rows for the current group + GROUP = 1 + while(GROUP > 0){ + # break if there are no more row left in X + if(i > nrow(X)){ + GROUP = 0 + } + # check if the row belongs to the current group + else if(as.scalar(X[i, col]) == currentGroup){ + amountRows = amountRows + 1 + i = i + 1 + } + # break if the row does not belong to the current group + else{ + GROUP = 0 + } } - } - } - } - else if (method == "permutation-matrix") { - # Extract the grouping column and create unique groups - key = X[,col] - key_unique = unique(X[, col]) - numGroups = nrow(key_unique) - - # Matrix for comparison - key_compare = key_unique %*% matrix(1, rows=1, cols=nrow(X)) - key_matrix = matrix(1, rows=nrow(key_unique), cols=1) %*% t(key) - - # Find group index - groupIndex = rowIndexMax(t(key_compare == key_matrix)) - - # Determine the maximum number of rows in any group - maxRowsInGroup = max(table(X[,col],1)) - totalCells = (maxRowsInGroup) * (ncol(X)-1) +1 - # Create permutation matrix P copy relevant tuples with a single matrix multiplication - P = matrix(0, rows=nrow(X), cols=numGroups * maxRowsInGroup) - # Create offsets to store the first column of each group - offsets = matrix(seq(0, (numGroups-1)*maxRowsInGroup, maxRowsInGroup), 
rows=numGroups, cols=1) + # copy the values into the final matrix + if (amountRows > 0){ + # create a matrix of the row that should be copied without the group column + YrowEnd = YrowStart + amountRows - 1 + + # case selected column is first column + if (col == 1){ + newMatrix = X[YrowStart:YrowEnd, (col+1):ncol(X)] + } + # case selected column is last column + else if (col == ncol(X)) { + newMatrix = X [YrowStart:YrowEnd, 1:col-1] + } + # normal case + else { + newMatrix = cbind(X[YrowStart:YrowEnd, 1:(col-1)], X[YrowStart:YrowEnd, (col+1):ncol(X)]) + } + + # flatten the new row + newRow = matrix(newMatrix, rows = 1, cols = nrow(newMatrix) * ncol(newMatrix)) + newRowColumns = amountRows * (ncol(X)-1) + + # add the new row into Y at the current group + Y[Ypos, 2: (newRowColumns + 1)] = newRow + } - # Create row and column index for the permutation matrix - rowIndex = seq(1, nrow(X)) - indexWithInGroups = cumsum(t(table(groupIndex, seq(1, nrow(X)), numGroups, nrow(X)))) - selectedMatrix = table(seq(1, nrow(indexWithInGroups)), groupIndex) - colIndex = groupIndex * maxRowsInGroup - maxRowsInGroup + rowSums(indexWithInGroups * selectedMatrix) + # continue with the next group + Ypos = Ypos + 1 + YrowStart = YrowStart + amountRows + restingGroups = restingGroups - 1 + } - # Set values in P - P = table(seq(1, nrow(X)), colIndex) + #restore the initial order of X + X = cbind(X, Xindexes) + ncol_X = ncol(X) + X = order(target = X, by= ncol_X) + X = X[, 1:ncol_X-1] - # Perform matrix multiplication - Y_temp = t(P) %*% X + #restore the initial order of Y + Y = cbind(Y, Yindexes) + ncol_Y = ncol(Y) + Y = order(target = Y, by= ncol_Y) + Y = Y[, 1:ncol_Y-1] - # Remove the selected column from Y_temp - if( col == 1 ) { - Y_temp_reduce = Y_temp[, col+1:ncol(Y_temp)] - } - else if( col == ncol(X) ) { - Y_temp_reduce = Y_temp[, 1:col-1] - } - else{ - Y_temp_reduce = cbind(Y_temp[, 1:col-1],Y_temp[, col+1:ncol(Y_temp)]) - } - - # Set value of final output - Y = matrix(0, 
rows=numGroups, cols=totalCells) - Y[,1] = key_unique - Y[,2:ncol(Y)] = matrix(Y_temp_reduce, rows=numGroups, cols=totalCells-1) + print(toString(Y)) } + else if (method == "permutation-matrix") { + # Extract the grouping column and create unique groups + key = X[,col] + key_unique = unique(X[, col]) + numGroups = nrow(key_unique) + + # Matrix for comparison + key_compare = key_unique %*% matrix(1, rows=1, cols=nrow(X)) + key_matrix = matrix(1, rows=nrow(key_unique), cols=1) %*% t(key) + + # Find group index + groupIndex = rowIndexMax(t(key_compare == key_matrix)) + + # Determine the maximum number of rows in any group + maxRowsInGroup = max(table(X[,col],1)) + totalCells = (maxRowsInGroup) * (ncol(X)-1) +1 + + # Create permutation matrix P copy relevant tuples with a single matrix multiplication + P = matrix(0, rows=nrow(X), cols=numGroups * maxRowsInGroup) + # Create offsets to store the first column of each group + offsets = matrix(seq(0, (numGroups-1)*maxRowsInGroup, maxRowsInGroup), rows=numGroups, cols=1) + + # Create row and column index for the permutation matrix + rowIndex = seq(1, nrow(X)) + indexWithInGroups = cumsum(t(table(groupIndex, seq(1, nrow(X)), numGroups, nrow(X)))) + selectedMatrix = table(seq(1, nrow(indexWithInGroups)), groupIndex) + colIndex = groupIndex * maxRowsInGroup - maxRowsInGroup + rowSums(indexWithInGroups * selectedMatrix) + + # Set values in P + P = table(seq(1, nrow(X)), colIndex) + + # Perform matrix multiplication + Y_temp = t(P) %*% X + + # Remove the selected column from Y_temp + if( col == 1 ) { + Y_temp_reduce = Y_temp[, col+1:ncol(Y_temp)] + } + else if( col == ncol(X) ) { + Y_temp_reduce = Y_temp[, 1:col-1] + } + else{ + Y_temp_reduce = cbind(Y_temp[, 1:col-1],Y_temp[, col+1:ncol(Y_temp)]) + } + + # Set value of final output + Y = matrix(0, rows=numGroups, cols=totalCells) + Y[,1] = key_unique + Y[,2:ncol(Y)] = matrix(Y_temp_reduce, rows=numGroups, cols=totalCells-1) + } } diff --git a/scripts/builtin/raGroupby_exp1.dml 
b/scripts/builtin/raGroupby_exp1.dml deleted file mode 100644 index 47f3ee7eb0c..00000000000 --- a/scripts/builtin/raGroupby_exp1.dml +++ /dev/null @@ -1,186 +0,0 @@ -#------------------------------------------------------------- -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
-# -#------------------------------------------------------------- - -# This raGroupby-function takes a matrix dataset as input from where it performs -# relational operations : groupby -# -# INPUT: -# ------------------------------------------------------------------------------ -# X Matrix of input data [shape: N x M] -# col Integer indicating the column index to execute grupby command -# method Groupby implemention method (nested-loop, permutation-matrix) -# ------------------------------------------------------------------------------ -# -# OUTPUT: -# ------------------------------------------------------------------------------ -# Y Matrix of selected data [shape N' x M] with N' <= N -# ------------------------------------------------------------------------------ - -m_raGroupby_new = function (Matrix[Double] X, Integer col, String method) - return (Matrix[Double] Y) -{ - if (method == "nested-loop") { - # Extract and sort unique values from the specified column (1-based index) - uniqueValues = unique(X[, col]) - order_uniqueValues = order(target = uniqueValues, by = 1); - - # Calcute the number of groups - numGroups = nrow(uniqueValues) - - # Determine the maximum number of rows in any group - maxRowsInGroup = max(table(X[,col],1)); - - # Define a zero matrix to put the group data into - Y = matrix(0,numGroups,maxRowsInGroup*(ncol(X)-1)+1) - - # Put the ordered uniqueValues into first column of Y as group_id - Y[,1] = uniqueValues - - # save the initial order of the groups in Y - Yindexes = order(target = Y, by = 1, index.return = TRUE) - - # order Y by the groups increasing - Y = order(target = Y, by = 1, decreasing = FALSE, index.return = FALSE) - - # order the initial matrix - Xindexes = order(target = X, by = col, index.return = TRUE) - X = order(target = X, by = col, decreasing = FALSE, index.return = FALSE) - - # the amount of unique groups that are remaining - restingGroups = nrow(uniqueValues) - - # intial group to copy the values into Y - Ypos = 1 - 
- # start positon in the row final matrix - YrowStart = 1 - i = 1 - - #loop over the initial matrix - while(restingGroups > 0){ - - currentGroup = as.scalar(Y[Ypos,1]) - - # amount of rows that need to be copied - amountRows = 0 - - # find the rows for the current group - GROUP = 1 - while(GROUP > 0){ - # break if there are no more row left in X - if(i > nrow(X)){ - GROUP = 0 - } - # check if the row belongs to the current group - else if(as.scalar(X[i, col]) == currentGroup){ - amountRows = amountRows + 1 - i = i + 1 - } - # break if the row does not belong to the current group - else{ - GROUP = 0 - } - } - - # copy the values into the final matrix - if (amountRows > 0){ - # create a matrix of the row that should be copied without the group column - YrowEnd = YrowStart + amountRows - 1 - newMatrix = cbind(X[YrowStart:YrowEnd, 1:(col-1)], X[YrowStart:YrowEnd, (col+1):ncol(X)]) - - # flatten the new row - newRow = matrix(newMatrix, rows = 1, cols = nrow(newMatrix) * ncol(newMatrix)) - newRowColumns = amountRows * (ncol(X)-1) - - # add the new row into Y at the current group - Y[Ypos, 2: (newRowColumns + 1)] = newRow - } - - # continue with the next group - Ypos = Ypos + 1 - YrowStart = YrowStart + amountRows - restingGroups = restingGroups - 1 - } - - #restore the initial order of X - X = cbind(X, Xindexes) - ncol_X = ncol(X) - X = order(target = X, by= ncol_X) - X = X[, 1:ncol_X-1] - - #restore the initial order of Y - Y = cbind(Y, Yindexes) - ncol_Y = ncol(Y) - Y = order(target = Y, by= ncol_Y) - Y = Y[, 1:ncol_Y-1] - } - else if (method == "permutation-matrix") { - # Extract the grouping column and create unique groups - key = X[,col] - key_unique = unique(X[, col]) - numGroups = nrow(key_unique) - - # Matrix for comparison - key_compare = key_unique %*% matrix(1, rows=1, cols=nrow(X)) - key_matrix = matrix(1, rows=nrow(key_unique), cols=1) %*% t(key) - - # Find group index - groupIndex = rowIndexMax(t(key_compare == key_matrix)) - - # Determine the maximum 
number of rows in any group - maxRowsInGroup = max(table(X[,col],1)) - totalCells = (maxRowsInGroup) * (ncol(X)-1) +1 - - # Create permutation matrix P copy relevant tuples with a single matrix multiplication - P = matrix(0, rows=nrow(X), cols=numGroups * maxRowsInGroup) - # Create offsets to store the first column of each group - offsets = matrix(seq(0, (numGroups-1)*maxRowsInGroup, maxRowsInGroup), rows=numGroups, cols=1) - - # Create row and column index for the permutation matrix - rowIndex = seq(1, nrow(X)) - indexWithInGroups = cumsum(t(table(groupIndex, seq(1, nrow(X)), numGroups, nrow(X)))) - selectedMatrix = table(seq(1, nrow(indexWithInGroups)), groupIndex) - colIndex = groupIndex * maxRowsInGroup - maxRowsInGroup + rowSums(indexWithInGroups * selectedMatrix) - - # Set values in P - P = table(seq(1, nrow(X)), colIndex) - - # Perform matrix multiplication - Y_temp = t(P) %*% X - - # Remove the selected column from Y_temp - if( col == 1 ) { - Y_temp_reduce = Y_temp[, col+1:ncol(Y_temp)] - } - else if( col == ncol(X) ) { - Y_temp_reduce = Y_temp[, 1:col-1] - } - else{ - Y_temp_reduce = cbind(Y_temp[, 1:col-1],Y_temp[, col+1:ncol(Y_temp)]) - } - - # Set value of final output - Y = matrix(0, rows=numGroups, cols=totalCells) - Y[,1] = key_unique - Y[,2:ncol(Y)] = matrix(Y_temp_reduce, rows=numGroups, cols=totalCells-1) - } -} - diff --git a/testing/groupby_new_time.dml b/testing/groupby_new_time.dml deleted file mode 100644 index 011d7ab048a..00000000000 --- a/testing/groupby_new_time.dml +++ /dev/null @@ -1,9 +0,0 @@ -# create an input matrix -input_matrix = round(rand(rows = 10000, cols = 5, min = 1, max = 5)) - -# load functions -source("scripts/builtin/raGroupby_exp1.dml") as ra_new - -# use the new ra_groupby function -new_func = ra_new::m_raGroupby_new(input_matrix, 2, "nested-loop") - diff --git a/testing/groupby_old_time.dml b/testing/groupby_old_time.dml deleted file mode 100644 index 25ae4bfb416..00000000000 --- a/testing/groupby_old_time.dml +++ 
/dev/null @@ -1,8 +0,0 @@ -# create an input matrix -input_matrix = round(rand(rows = 10000, cols = 5, min = 1, max = 5)) - -# load functions -source("scripts/builtin/raGroupby.dml") as ra_old - -# use the initial ra_groupby function -old_func = ra_old::m_raGroupby(input_matrix, 2, "nested-loop") diff --git a/testing/testing.dml b/testing/testing.dml deleted file mode 100644 index 2bea74ff08f..00000000000 --- a/testing/testing.dml +++ /dev/null @@ -1,43 +0,0 @@ -# create an input matrix -input_matrix = round(rand(rows = 100, cols = 100, min = 1, max = 50)) - -# load functions -source("scripts/builtin/raGroupby.dml") as ra_old -source("scripts/builtin/raGroupby_exp1.dml") as ra_new - -# use the initial ra_groupby function -old_func = ra_old::m_raGroupby(input_matrix, 2, "nested-loop") - -# use the new ra_groupby function -new_func = ra_new::m_raGroupby_new(input_matrix, 2, "nested-loop") - -# check if the new function still operates correctly -for(i in 1:nrow(old_func)){ - for(j in 1:ncol(old_func)){ - old_val = as.scalar(old_func[i,j]) - new_val = as.scalar(new_func[i,j]) - if (old_val != new_val){ - print("The values are not identical") - print("The index is i x j") - print(i) - print(j) - }else{ - if(j == 1){ - print("The first value was correct") - } - if(j == ncol(old_func)){ - print("The last value was correct") - } - } - } -} - - -print("This is the input matrix") -print(toString(input_matrix)) - -print("This is the old func") -print(toString(old_func)) - -print("This is the new func") -print(toString(new_func)) \ No newline at end of file From a79420e628ee6a06217e76b131f2d31d682e910a Mon Sep 17 00:00:00 2001 From: maxrankl Date: Sun, 6 Jul 2025 22:19:53 +0200 Subject: [PATCH 07/25] Removed print statement for debugging --- scripts/builtin/raGroupby.dml | 1 - 1 file changed, 1 deletion(-) diff --git a/scripts/builtin/raGroupby.dml b/scripts/builtin/raGroupby.dml index 13e08f84789..33390f00447 100644 --- a/scripts/builtin/raGroupby.dml +++ 
b/scripts/builtin/raGroupby.dml @@ -38,7 +38,6 @@ m_raGroupby = function (Matrix[Double] X, Integer col, String method) return (Matrix[Double] Y) { if (method == "nested-loop") { - # print(toString(X)) # Extract and sort unique values from the specified column (1-based index) uniqueValues = unique(X[, col]) order_uniqueValues = order(target = uniqueValues, by = 1); From de96b3e90cf90fdd478c7800f151212e4a056669 Mon Sep 17 00:00:00 2001 From: maxrankl Date: Mon, 7 Jul 2025 12:07:20 +0200 Subject: [PATCH 08/25] Removed print statement for debugging, forgot one --- scripts/builtin/raGroupby.dml | 2 -- 1 file changed, 2 deletions(-) diff --git a/scripts/builtin/raGroupby.dml b/scripts/builtin/raGroupby.dml index 33390f00447..7d51b06ca33 100644 --- a/scripts/builtin/raGroupby.dml +++ b/scripts/builtin/raGroupby.dml @@ -143,8 +143,6 @@ m_raGroupby = function (Matrix[Double] X, Integer col, String method) ncol_Y = ncol(Y) Y = order(target = Y, by= ncol_Y) Y = Y[, 1:ncol_Y-1] - - print(toString(Y)) } else if (method == "permutation-matrix") { # Extract the grouping column and create unique groups From 3ffaa3427b29ea2ed5c2cb6444f90b31831ba529 Mon Sep 17 00:00:00 2001 From: maxrankl Date: Tue, 8 Jul 2025 22:30:28 +0200 Subject: [PATCH 09/25] commit to merge the fix of permutation matrix --- scripts/builtin/raGroupby.dml | 1 + 1 file changed, 1 insertion(+) diff --git a/scripts/builtin/raGroupby.dml b/scripts/builtin/raGroupby.dml index 7d51b06ca33..dbe67a8f306 100644 --- a/scripts/builtin/raGroupby.dml +++ b/scripts/builtin/raGroupby.dml @@ -143,6 +143,7 @@ m_raGroupby = function (Matrix[Double] X, Integer col, String method) ncol_Y = ncol(Y) Y = order(target = Y, by= ncol_Y) Y = Y[, 1:ncol_Y-1] + } else if (method == "permutation-matrix") { # Extract the grouping column and create unique groups From f07c3878e43f1c7a5b73c72858b72ee9621ab228 Mon Sep 17 00:00:00 2001 From: maxrankl Date: Sun, 29 Jun 2025 23:51:58 +0200 Subject: [PATCH 10/25] Added the Benchmarking framework 
(Python) --- Benchmarking | 1 + 1 file changed, 1 insertion(+) create mode 160000 Benchmarking diff --git a/Benchmarking b/Benchmarking new file mode 160000 index 00000000000..10eab1f884f --- /dev/null +++ b/Benchmarking @@ -0,0 +1 @@ +Subproject commit 10eab1f884fd23e1bf4452e424dd7fe739c142ec From a93c2f5921b6e74892d888049a65448fe1a43542 Mon Sep 17 00:00:00 2001 From: maxrankl Date: Sun, 29 Jun 2025 23:46:12 +0200 Subject: [PATCH 11/25] Current status, unfortunately, it does not insert the values correctly in raGroupby_exp1.dml --- .gitignore | 1 + scripts/builtin/raGroupby_exp1.dml | 171 +++++++++++++++++++++++++++++ testing/groupby_new_time.dml | 9 ++ testing/groupby_old_time.dml | 21 ++++ testing/testing.dml | 51 +++++++++ 5 files changed, 253 insertions(+) create mode 100644 scripts/builtin/raGroupby_exp1.dml create mode 100644 testing/groupby_new_time.dml create mode 100644 testing/groupby_old_time.dml create mode 100644 testing/testing.dml diff --git a/.gitignore b/.gitignore index f3c28571bdf..b2210ba22c1 100644 --- a/.gitignore +++ b/.gitignore @@ -150,3 +150,4 @@ venv/* # resource optimization scripts/resource/output *.pem +ADDED_Testing/hello.dml diff --git a/scripts/builtin/raGroupby_exp1.dml b/scripts/builtin/raGroupby_exp1.dml new file mode 100644 index 00000000000..7f5fef1815a --- /dev/null +++ b/scripts/builtin/raGroupby_exp1.dml @@ -0,0 +1,171 @@ +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +#------------------------------------------------------------- + +# This raGroupby-function takes a matrix dataset as input from where it performs +# relational operations : groupby +# +# INPUT: +# ------------------------------------------------------------------------------ +# X Matrix of input data [shape: N x M] +# col Integer indicating the column index to execute grupby command +# method Groupby implemention method (nested-loop, permutation-matrix) +# ------------------------------------------------------------------------------ +# +# OUTPUT: +# ------------------------------------------------------------------------------ +# Y Matrix of selected data [shape N' x M] with N' <= N +# ------------------------------------------------------------------------------ + +m_raGroupby_new = function (Matrix[Double] X, Integer col, String method) + return (Matrix[Double] Y) +{ + if (method == "nested-loop") { + # Extract and sort unique values from the specified column (1-based index) + uniqueValues = unique(X[, col]) + order_uniqueValues = order(target = uniqueValues, by = 1); + + # Calcute the number of groups + numGroups = nrow(uniqueValues) + + # Determine the maximum number of rows in any group + maxRowsInGroup = max(table(X[,col],1)); + + # Define a zero matrix to put the group data into + Y = matrix(0,numGroups,maxRowsInGroup*(ncol(X)-1)+1) + + # Put the ordered uniqueValues into first column of Y as group_id + Y[,1] = uniqueValues + + # create matrix to store the amount of rows for each group + rows_per_group = matrix(0, numGroups, 1) + + # order the 
initial matrix + #Xordered = order(target = X[,col], by = col) + Xordered = order(target = X, by = col, decreasing = FALSE, index.return = FALSE) + + # the amount of unique groups that are remaining + restingGroups = nrow(uniqueValues) + + # intial group + Ypos = 1 + currentGroup = as.scalar(Y[Ypos,1]) + + # intial positon in the row final matrix + YrowStart = 1 + + #loop over the initial matrix + while(restingGroups > 0){ + + current_group = as.scalar(Y[Ypos,1]) + + #amount of rows that need to be copied + amountRows = 0 + + i = YrowStart + + GROUP = 1 + + while(GROUP > 0){ + if(i > nrow(Xordered)){ + GROUP = 0 + } + else if(as.scalar(Xordered[i, col]) == currentGroup){ + amountRows = amountRows + 1 + i = i + 1 + } + else{ + GROUP = 0 + } + + } + + # row position in the final matrix + restingGroups = restingGroups - 1 + + if (amountRows > 0){ + # copy the values into the final matrix + YrowEnd = YrowStart + amountRows - 1 + newMatrix = cbind(Xordered[YrowStart:YrowEnd, 1:(col-1)], Xordered[YrowStart:YrowEnd, (col+1):ncol(X)]) + #flatten the new row + newRow = matrix(newMatrix, rows = 1, cols = nrow(newMatrix) * ncol(newMatrix)) + newRowColumns = amountRows * (ncol(X)-1) + Y[Ypos, 2: (newRowColumns + 1)] = newRow + } + + # continue with the next group + Ypos = Ypos + 1 + YrowStart = YrowStart + amountRows + + } + + } + else if (method == "permutation-matrix") { + # Extract the grouping column and create unique groups + key = X[,col] + key_unique = unique(X[, col]) + numGroups = nrow(key_unique) + + # Matrix for comparison + key_compare = key_unique %*% matrix(1, rows=1, cols=nrow(X)) + key_matrix = matrix(1, rows=nrow(key_unique), cols=1) %*% t(key) + + # Find group index + groupIndex = rowIndexMax(t(key_compare == key_matrix)) + + # Determine the maximum number of rows in any group + maxRowsInGroup = max(table(X[,col],1)) + totalCells = (maxRowsInGroup) * (ncol(X)-1) +1 + + # Create permutation matrix P copy relevant tuples with a single matrix multiplication + P 
= matrix(0, rows=nrow(X), cols=numGroups * maxRowsInGroup) + # Create offsets to store the first column of each group + offsets = matrix(seq(0, (numGroups-1)*maxRowsInGroup, maxRowsInGroup), rows=numGroups, cols=1) + + # Create row and column index for the permutation matrix + rowIndex = seq(1, nrow(X)) + indexWithInGroups = cumsum(t(table(groupIndex, seq(1, nrow(X)), numGroups, nrow(X)))) + selectedMatrix = table(seq(1, nrow(indexWithInGroups)), groupIndex) + colIndex = groupIndex * maxRowsInGroup - maxRowsInGroup + rowSums(indexWithInGroups * selectedMatrix) + + # Set values in P + P = table(seq(1, nrow(X)), colIndex) + + # Perform matrix multiplication + Y_temp = t(P) %*% X + + # Remove the selected column from Y_temp + if( col == 1 ) { + Y_temp_reduce = Y_temp[, col+1:ncol(Y_temp)] + } + else if( col == ncol(X) ) { + Y_temp_reduce = Y_temp[, 1:col-1] + } + else{ + Y_temp_reduce = cbind(Y_temp[, 1:col-1],Y_temp[, col+1:ncol(Y_temp)]) + } + + # Set value of final output + Y = matrix(0, rows=numGroups, cols=totalCells) + Y[,1] = key_unique + Y[,2:ncol(Y)] = matrix(Y_temp_reduce, rows=numGroups, cols=totalCells-1) + } +} + diff --git a/testing/groupby_new_time.dml b/testing/groupby_new_time.dml new file mode 100644 index 00000000000..fba92564b82 --- /dev/null +++ b/testing/groupby_new_time.dml @@ -0,0 +1,9 @@ +# create an input matrix +input_matrix = round(rand(rows = 1000, cols = 10, min = 1)) + +# load functions +source("scripts/builtin/raGroupby_exp1.dml") as ra_new + +# use the new ra_groupby function +new_func = ra_new::m_raGroupby_new(input_matrix, 2, "nested-loop") + diff --git a/testing/groupby_old_time.dml b/testing/groupby_old_time.dml new file mode 100644 index 00000000000..015a5797253 --- /dev/null +++ b/testing/groupby_old_time.dml @@ -0,0 +1,21 @@ +# create an input matrix +input_matrix = round(rand(rows = 1000, cols = 10, min = 1)) +print("The amount of rows in the input matrix") +print(1000*10) + +# load functions 
+source("scripts/builtin/raGroupby.dml") as ra_old + +# use the initial ra_groupby function +old_func = ra_old::m_raGroupby(input_matrix, 2, "nested-loop") + +print(input_matrix[1:10, 1:ncol(input_matrix)]) + +print("The amount of rows in the final matrix") +print(nrow(old_func)) + +print("The amount of rows in the final matrix") +print(ncol(old_func)) + +print("The amount of rows * columns in the final matrix") +print(ncol(old_func)*nrow(old_func)) \ No newline at end of file diff --git a/testing/testing.dml b/testing/testing.dml new file mode 100644 index 00000000000..39ac856b2f2 --- /dev/null +++ b/testing/testing.dml @@ -0,0 +1,51 @@ +# create an input matrix +input_matrix = round(rand(rows = 10, cols = 10, min = 1, max = 5)) + +# load functions +source("scripts/builtin/raGroupby.dml") as ra_old +source("scripts/builtin/raGroupby_exp1.dml") as ra_new + +# use the initial ra_groupby function +old_func = ra_old::m_raGroupby(input_matrix, 2, "nested-loop") + +# use the new ra_groupby function +new_func = ra_new::m_raGroupby_new(input_matrix, 2, "nested-loop") + +# check if the new function still operates correctly +for(i in 1:nrow(old_func)){ + for(j in 1:ncol(old_func)){ + old_val = as.scalar(old_func[i,j]) + new_val = as.scalar(new_func[i,j]) + if (old_val != new_val){ + print("The values are not identical") + print("The index is i x j") + print(i) + print(j) + }else{ + if(j == 1){ + print("The first value was correct") + } + if(j == ncol(old_func)){ + print("The last value was correct") + } + } + } +} + +print("The amount of rows in the old matrix") +print(nrow(old_func)) + +print("The amount of col in the old matrix") +print(ncol(old_func)) + +print("The amount of rows * columns in the old matrix") +print(ncol(old_func)*nrow(old_func)) + +print("The amount of rows in the new matrix") +print(nrow(new_func)) + +print("The amount of col in the new matrix") +print(ncol(new_func)) + +print("The amount of rows * columns in the new matrix") 
+print(ncol(new_func)*nrow(new_func)) From 4ac826826d03eb82ecbfdcf35e06001080767136 Mon Sep 17 00:00:00 2001 From: maxrankl Date: Mon, 30 Jun 2025 15:11:59 +0200 Subject: [PATCH 12/25] Found the error, should beat the performance of nested loop, sorry for the last pull request --- scripts/builtin/raGroupby_exp1.dml | 26 ++++++++++++++++++-------- testing/groupby_new_time.dml | 2 +- testing/groupby_old_time.dml | 15 +-------------- testing/testing.dml | 2 +- 4 files changed, 21 insertions(+), 24 deletions(-) diff --git a/scripts/builtin/raGroupby_exp1.dml b/scripts/builtin/raGroupby_exp1.dml index 7f5fef1815a..3e0b4abec4d 100644 --- a/scripts/builtin/raGroupby_exp1.dml +++ b/scripts/builtin/raGroupby_exp1.dml @@ -58,29 +58,29 @@ m_raGroupby_new = function (Matrix[Double] X, Integer col, String method) rows_per_group = matrix(0, numGroups, 1) # order the initial matrix - #Xordered = order(target = X[,col], by = col) Xordered = order(target = X, by = col, decreasing = FALSE, index.return = FALSE) + # create a Matrix with the ordered groups + Yordered = order(target = Y, by = 1, decreasing = FALSE, index.return = FALSE) + # the amount of unique groups that are remaining restingGroups = nrow(uniqueValues) # intial group Ypos = 1 - currentGroup = as.scalar(Y[Ypos,1]) # intial positon in the row final matrix YrowStart = 1 + i = 1 #loop over the initial matrix while(restingGroups > 0){ - current_group = as.scalar(Y[Ypos,1]) + currentGroup = as.scalar(Yordered[Ypos,1]) #amount of rows that need to be copied amountRows = 0 - i = YrowStart - GROUP = 1 while(GROUP > 0){ @@ -94,28 +94,38 @@ m_raGroupby_new = function (Matrix[Double] X, Integer col, String method) else{ GROUP = 0 } - } # row position in the final matrix restingGroups = restingGroups - 1 if (amountRows > 0){ + # copy the values into the final matrix YrowEnd = YrowStart + amountRows - 1 newMatrix = cbind(Xordered[YrowStart:YrowEnd, 1:(col-1)], Xordered[YrowStart:YrowEnd, (col+1):ncol(X)]) + #flatten the new row 
newRow = matrix(newMatrix, rows = 1, cols = nrow(newMatrix) * ncol(newMatrix)) newRowColumns = amountRows * (ncol(X)-1) - Y[Ypos, 2: (newRowColumns + 1)] = newRow - } + # sort the ordered Y matrix back to the initial order + rowFinder = 1 + while(currentGroup != as.scalar(Y[rowFinder, 1])){ + rowFinder = rowFinder + 1 + } + Y[rowFinder, 2: (newRowColumns + 1)] = newRow + } # continue with the next group Ypos = Ypos + 1 YrowStart = YrowStart + amountRows } + + + + } else if (method == "permutation-matrix") { # Extract the grouping column and create unique groups diff --git a/testing/groupby_new_time.dml b/testing/groupby_new_time.dml index fba92564b82..011d7ab048a 100644 --- a/testing/groupby_new_time.dml +++ b/testing/groupby_new_time.dml @@ -1,5 +1,5 @@ # create an input matrix -input_matrix = round(rand(rows = 1000, cols = 10, min = 1)) +input_matrix = round(rand(rows = 10000, cols = 5, min = 1, max = 5)) # load functions source("scripts/builtin/raGroupby_exp1.dml") as ra_new diff --git a/testing/groupby_old_time.dml b/testing/groupby_old_time.dml index 015a5797253..25ae4bfb416 100644 --- a/testing/groupby_old_time.dml +++ b/testing/groupby_old_time.dml @@ -1,21 +1,8 @@ # create an input matrix -input_matrix = round(rand(rows = 1000, cols = 10, min = 1)) -print("The amount of rows in the input matrix") -print(1000*10) +input_matrix = round(rand(rows = 10000, cols = 5, min = 1, max = 5)) # load functions source("scripts/builtin/raGroupby.dml") as ra_old # use the initial ra_groupby function old_func = ra_old::m_raGroupby(input_matrix, 2, "nested-loop") - -print(input_matrix[1:10, 1:ncol(input_matrix)]) - -print("The amount of rows in the final matrix") -print(nrow(old_func)) - -print("The amount of rows in the final matrix") -print(ncol(old_func)) - -print("The amount of rows * columns in the final matrix") -print(ncol(old_func)*nrow(old_func)) \ No newline at end of file diff --git a/testing/testing.dml b/testing/testing.dml index 39ac856b2f2..d19951f095d 100644 
--- a/testing/testing.dml +++ b/testing/testing.dml @@ -1,5 +1,5 @@ # create an input matrix -input_matrix = round(rand(rows = 10, cols = 10, min = 1, max = 5)) +input_matrix = round(rand(rows = 5, cols = 5, min = 1, max = 5)) # load functions source("scripts/builtin/raGroupby.dml") as ra_old From 59305fcc168749fd1556bc8875624bb673e11e8b Mon Sep 17 00:00:00 2001 From: maxrankl Date: Mon, 30 Jun 2025 15:14:11 +0200 Subject: [PATCH 13/25] Found the error, should beat the performance of nested loop, sorry for the last pull request --- Benchmarking | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Benchmarking b/Benchmarking index 10eab1f884f..acc4ca7e9f1 160000 --- a/Benchmarking +++ b/Benchmarking @@ -1 +1 @@ -Subproject commit 10eab1f884fd23e1bf4452e424dd7fe739c142ec +Subproject commit acc4ca7e9f1f906a63fb3b19374bd398d8d3fd42 From 6894877a4afe28c847e3195dd3e50ccc2df1f711 Mon Sep 17 00:00:00 2001 From: maxrankl Date: Fri, 4 Jul 2025 19:36:43 +0200 Subject: [PATCH 14/25] Avoided the additional loop over Y to copy the rows into Y by saving the initial order of Y and then restoring it after copying. In order to copy the values it was necessary to also order X. 
Before a copy matrix was needed, which can be avoided as well by saving and restoring the initial order of X --- scripts/builtin/raGroupby_exp1.dml | 67 ++++++++++++++++-------------- testing/testing.dml | 22 ++++------ 2 files changed, 43 insertions(+), 46 deletions(-) diff --git a/scripts/builtin/raGroupby_exp1.dml b/scripts/builtin/raGroupby_exp1.dml index 3e0b4abec4d..47f3ee7eb0c 100644 --- a/scripts/builtin/raGroupby_exp1.dml +++ b/scripts/builtin/raGroupby_exp1.dml @@ -54,78 +54,83 @@ m_raGroupby_new = function (Matrix[Double] X, Integer col, String method) # Put the ordered uniqueValues into first column of Y as group_id Y[,1] = uniqueValues - # create matrix to store the amount of rows for each group - rows_per_group = matrix(0, numGroups, 1) + # save the initial order of the groups in Y + Yindexes = order(target = Y, by = 1, index.return = TRUE) - # order the initial matrix - Xordered = order(target = X, by = col, decreasing = FALSE, index.return = FALSE) + # order Y by the groups increasing + Y = order(target = Y, by = 1, decreasing = FALSE, index.return = FALSE) - # create a Matrix with the ordered groups - Yordered = order(target = Y, by = 1, decreasing = FALSE, index.return = FALSE) + # order the initial matrix + Xindexes = order(target = X, by = col, index.return = TRUE) + X = order(target = X, by = col, decreasing = FALSE, index.return = FALSE) # the amount of unique groups that are remaining restingGroups = nrow(uniqueValues) - # intial group + # intial group to copy the values into Y Ypos = 1 - # intial positon in the row final matrix + # start positon in the row final matrix YrowStart = 1 i = 1 #loop over the initial matrix while(restingGroups > 0){ - currentGroup = as.scalar(Yordered[Ypos,1]) + currentGroup = as.scalar(Y[Ypos,1]) - #amount of rows that need to be copied + # amount of rows that need to be copied amountRows = 0 + # find the rows for the current group GROUP = 1 - while(GROUP > 0){ - if(i > nrow(Xordered)){ + # break if there are no 
more row left in X + if(i > nrow(X)){ GROUP = 0 } - else if(as.scalar(Xordered[i, col]) == currentGroup){ + # check if the row belongs to the current group + else if(as.scalar(X[i, col]) == currentGroup){ amountRows = amountRows + 1 i = i + 1 } + # break if the row does not belong to the current group else{ GROUP = 0 } } - # row position in the final matrix - restingGroups = restingGroups - 1 - + # copy the values into the final matrix if (amountRows > 0){ - - # copy the values into the final matrix + # create a matrix of the row that should be copied without the group column YrowEnd = YrowStart + amountRows - 1 - newMatrix = cbind(Xordered[YrowStart:YrowEnd, 1:(col-1)], Xordered[YrowStart:YrowEnd, (col+1):ncol(X)]) + newMatrix = cbind(X[YrowStart:YrowEnd, 1:(col-1)], X[YrowStart:YrowEnd, (col+1):ncol(X)]) - #flatten the new row + # flatten the new row newRow = matrix(newMatrix, rows = 1, cols = nrow(newMatrix) * ncol(newMatrix)) newRowColumns = amountRows * (ncol(X)-1) - # sort the ordered Y matrix back to the initial order - rowFinder = 1 - while(currentGroup != as.scalar(Y[rowFinder, 1])){ - rowFinder = rowFinder + 1 - } - Y[rowFinder, 2: (newRowColumns + 1)] = newRow + # add the new row into Y at the current group + Y[Ypos, 2: (newRowColumns + 1)] = newRow } + # continue with the next group Ypos = Ypos + 1 YrowStart = YrowStart + amountRows - + restingGroups = restingGroups - 1 } - - - - + #restore the initial order of X + X = cbind(X, Xindexes) + ncol_X = ncol(X) + X = order(target = X, by= ncol_X) + X = X[, 1:ncol_X-1] + + #restore the initial order of Y + Y = cbind(Y, Yindexes) + ncol_Y = ncol(Y) + Y = order(target = Y, by= ncol_Y) + Y = Y[, 1:ncol_Y-1] } else if (method == "permutation-matrix") { # Extract the grouping column and create unique groups diff --git a/testing/testing.dml b/testing/testing.dml index d19951f095d..2bea74ff08f 100644 --- a/testing/testing.dml +++ b/testing/testing.dml @@ -1,5 +1,5 @@ # create an input matrix -input_matrix = 
round(rand(rows = 5, cols = 5, min = 1, max = 5)) +input_matrix = round(rand(rows = 100, cols = 100, min = 1, max = 50)) # load functions source("scripts/builtin/raGroupby.dml") as ra_old @@ -32,20 +32,12 @@ for(i in 1:nrow(old_func)){ } } -print("The amount of rows in the old matrix") -print(nrow(old_func)) -print("The amount of col in the old matrix") -print(ncol(old_func)) +print("This is the input matrix") +print(toString(input_matrix)) -print("The amount of rows * columns in the old matrix") -print(ncol(old_func)*nrow(old_func)) +print("This is the old func") +print(toString(old_func)) -print("The amount of rows in the new matrix") -print(nrow(new_func)) - -print("The amount of col in the new matrix") -print(ncol(new_func)) - -print("The amount of rows * columns in the new matrix") -print(ncol(new_func)*nrow(new_func)) +print("This is the new func") +print(toString(new_func)) \ No newline at end of file From a85d3d23ff0986e7bfc604c9ae6ef5511dbe62d2 Mon Sep 17 00:00:00 2001 From: maxrankl Date: Sat, 5 Jul 2025 16:20:50 +0200 Subject: [PATCH 15/25] Removed additional files. Copied content into the correct file. Added switch if the first or the last column is selected. Now passes all provided tests. 
--- scripts/builtin/raGroupby_exp1.dml | 186 ----------------------------- testing/groupby_new_time.dml | 9 -- testing/groupby_old_time.dml | 8 -- testing/testing.dml | 43 ------- 4 files changed, 246 deletions(-) delete mode 100644 scripts/builtin/raGroupby_exp1.dml delete mode 100644 testing/groupby_new_time.dml delete mode 100644 testing/groupby_old_time.dml delete mode 100644 testing/testing.dml diff --git a/scripts/builtin/raGroupby_exp1.dml b/scripts/builtin/raGroupby_exp1.dml deleted file mode 100644 index 47f3ee7eb0c..00000000000 --- a/scripts/builtin/raGroupby_exp1.dml +++ /dev/null @@ -1,186 +0,0 @@ -#------------------------------------------------------------- -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
-# -#------------------------------------------------------------- - -# This raGroupby-function takes a matrix dataset as input from where it performs -# relational operations : groupby -# -# INPUT: -# ------------------------------------------------------------------------------ -# X Matrix of input data [shape: N x M] -# col Integer indicating the column index to execute grupby command -# method Groupby implemention method (nested-loop, permutation-matrix) -# ------------------------------------------------------------------------------ -# -# OUTPUT: -# ------------------------------------------------------------------------------ -# Y Matrix of selected data [shape N' x M] with N' <= N -# ------------------------------------------------------------------------------ - -m_raGroupby_new = function (Matrix[Double] X, Integer col, String method) - return (Matrix[Double] Y) -{ - if (method == "nested-loop") { - # Extract and sort unique values from the specified column (1-based index) - uniqueValues = unique(X[, col]) - order_uniqueValues = order(target = uniqueValues, by = 1); - - # Calcute the number of groups - numGroups = nrow(uniqueValues) - - # Determine the maximum number of rows in any group - maxRowsInGroup = max(table(X[,col],1)); - - # Define a zero matrix to put the group data into - Y = matrix(0,numGroups,maxRowsInGroup*(ncol(X)-1)+1) - - # Put the ordered uniqueValues into first column of Y as group_id - Y[,1] = uniqueValues - - # save the initial order of the groups in Y - Yindexes = order(target = Y, by = 1, index.return = TRUE) - - # order Y by the groups increasing - Y = order(target = Y, by = 1, decreasing = FALSE, index.return = FALSE) - - # order the initial matrix - Xindexes = order(target = X, by = col, index.return = TRUE) - X = order(target = X, by = col, decreasing = FALSE, index.return = FALSE) - - # the amount of unique groups that are remaining - restingGroups = nrow(uniqueValues) - - # intial group to copy the values into Y - Ypos = 1 - 
- # start positon in the row final matrix - YrowStart = 1 - i = 1 - - #loop over the initial matrix - while(restingGroups > 0){ - - currentGroup = as.scalar(Y[Ypos,1]) - - # amount of rows that need to be copied - amountRows = 0 - - # find the rows for the current group - GROUP = 1 - while(GROUP > 0){ - # break if there are no more row left in X - if(i > nrow(X)){ - GROUP = 0 - } - # check if the row belongs to the current group - else if(as.scalar(X[i, col]) == currentGroup){ - amountRows = amountRows + 1 - i = i + 1 - } - # break if the row does not belong to the current group - else{ - GROUP = 0 - } - } - - # copy the values into the final matrix - if (amountRows > 0){ - # create a matrix of the row that should be copied without the group column - YrowEnd = YrowStart + amountRows - 1 - newMatrix = cbind(X[YrowStart:YrowEnd, 1:(col-1)], X[YrowStart:YrowEnd, (col+1):ncol(X)]) - - # flatten the new row - newRow = matrix(newMatrix, rows = 1, cols = nrow(newMatrix) * ncol(newMatrix)) - newRowColumns = amountRows * (ncol(X)-1) - - # add the new row into Y at the current group - Y[Ypos, 2: (newRowColumns + 1)] = newRow - } - - # continue with the next group - Ypos = Ypos + 1 - YrowStart = YrowStart + amountRows - restingGroups = restingGroups - 1 - } - - #restore the initial order of X - X = cbind(X, Xindexes) - ncol_X = ncol(X) - X = order(target = X, by= ncol_X) - X = X[, 1:ncol_X-1] - - #restore the initial order of Y - Y = cbind(Y, Yindexes) - ncol_Y = ncol(Y) - Y = order(target = Y, by= ncol_Y) - Y = Y[, 1:ncol_Y-1] - } - else if (method == "permutation-matrix") { - # Extract the grouping column and create unique groups - key = X[,col] - key_unique = unique(X[, col]) - numGroups = nrow(key_unique) - - # Matrix for comparison - key_compare = key_unique %*% matrix(1, rows=1, cols=nrow(X)) - key_matrix = matrix(1, rows=nrow(key_unique), cols=1) %*% t(key) - - # Find group index - groupIndex = rowIndexMax(t(key_compare == key_matrix)) - - # Determine the maximum 
number of rows in any group - maxRowsInGroup = max(table(X[,col],1)) - totalCells = (maxRowsInGroup) * (ncol(X)-1) +1 - - # Create permutation matrix P copy relevant tuples with a single matrix multiplication - P = matrix(0, rows=nrow(X), cols=numGroups * maxRowsInGroup) - # Create offsets to store the first column of each group - offsets = matrix(seq(0, (numGroups-1)*maxRowsInGroup, maxRowsInGroup), rows=numGroups, cols=1) - - # Create row and column index for the permutation matrix - rowIndex = seq(1, nrow(X)) - indexWithInGroups = cumsum(t(table(groupIndex, seq(1, nrow(X)), numGroups, nrow(X)))) - selectedMatrix = table(seq(1, nrow(indexWithInGroups)), groupIndex) - colIndex = groupIndex * maxRowsInGroup - maxRowsInGroup + rowSums(indexWithInGroups * selectedMatrix) - - # Set values in P - P = table(seq(1, nrow(X)), colIndex) - - # Perform matrix multiplication - Y_temp = t(P) %*% X - - # Remove the selected column from Y_temp - if( col == 1 ) { - Y_temp_reduce = Y_temp[, col+1:ncol(Y_temp)] - } - else if( col == ncol(X) ) { - Y_temp_reduce = Y_temp[, 1:col-1] - } - else{ - Y_temp_reduce = cbind(Y_temp[, 1:col-1],Y_temp[, col+1:ncol(Y_temp)]) - } - - # Set value of final output - Y = matrix(0, rows=numGroups, cols=totalCells) - Y[,1] = key_unique - Y[,2:ncol(Y)] = matrix(Y_temp_reduce, rows=numGroups, cols=totalCells-1) - } -} - diff --git a/testing/groupby_new_time.dml b/testing/groupby_new_time.dml deleted file mode 100644 index 011d7ab048a..00000000000 --- a/testing/groupby_new_time.dml +++ /dev/null @@ -1,9 +0,0 @@ -# create an input matrix -input_matrix = round(rand(rows = 10000, cols = 5, min = 1, max = 5)) - -# load functions -source("scripts/builtin/raGroupby_exp1.dml") as ra_new - -# use the new ra_groupby function -new_func = ra_new::m_raGroupby_new(input_matrix, 2, "nested-loop") - diff --git a/testing/groupby_old_time.dml b/testing/groupby_old_time.dml deleted file mode 100644 index 25ae4bfb416..00000000000 --- a/testing/groupby_old_time.dml +++ 
/dev/null @@ -1,8 +0,0 @@ -# create an input matrix -input_matrix = round(rand(rows = 10000, cols = 5, min = 1, max = 5)) - -# load functions -source("scripts/builtin/raGroupby.dml") as ra_old - -# use the initial ra_groupby function -old_func = ra_old::m_raGroupby(input_matrix, 2, "nested-loop") diff --git a/testing/testing.dml b/testing/testing.dml deleted file mode 100644 index 2bea74ff08f..00000000000 --- a/testing/testing.dml +++ /dev/null @@ -1,43 +0,0 @@ -# create an input matrix -input_matrix = round(rand(rows = 100, cols = 100, min = 1, max = 50)) - -# load functions -source("scripts/builtin/raGroupby.dml") as ra_old -source("scripts/builtin/raGroupby_exp1.dml") as ra_new - -# use the initial ra_groupby function -old_func = ra_old::m_raGroupby(input_matrix, 2, "nested-loop") - -# use the new ra_groupby function -new_func = ra_new::m_raGroupby_new(input_matrix, 2, "nested-loop") - -# check if the new function still operates correctly -for(i in 1:nrow(old_func)){ - for(j in 1:ncol(old_func)){ - old_val = as.scalar(old_func[i,j]) - new_val = as.scalar(new_func[i,j]) - if (old_val != new_val){ - print("The values are not identical") - print("The index is i x j") - print(i) - print(j) - }else{ - if(j == 1){ - print("The first value was correct") - } - if(j == ncol(old_func)){ - print("The last value was correct") - } - } - } -} - - -print("This is the input matrix") -print(toString(input_matrix)) - -print("This is the old func") -print(toString(old_func)) - -print("This is the new func") -print(toString(new_func)) \ No newline at end of file From a669e42e405c3ee93023d625a12ecb98bcef8830 Mon Sep 17 00:00:00 2001 From: maxrankl Date: Mon, 7 Jul 2025 12:07:20 +0200 Subject: [PATCH 16/25] Finished merge for the ra_groupby permutation matrix fix --- scripts/builtin/raGroupby.dml | 114 ++++++++++++++++++++++++++-------- 1 file changed, 87 insertions(+), 27 deletions(-) diff --git a/scripts/builtin/raGroupby.dml b/scripts/builtin/raGroupby.dml index 
0a23bf51ef0..ed7b8b37d87 100644 --- a/scripts/builtin/raGroupby.dml +++ b/scripts/builtin/raGroupby.dml @@ -52,37 +52,97 @@ m_raGroupby = function (Matrix[Double] X, Integer col, String method) Y = matrix(0,numGroups,maxRowsInGroup*(ncol(X)-1)+1) # Put the ordered uniqueValues into first column of Y as group_id - #Y[,1] = order_uniqueValues Y[,1] = uniqueValues - # Loop for each group - for(i in 1:numGroups){ - index = 0 - - # Iterate each row in matrix X to deal with group data - for ( j in 1:nrow(X) ) { - if ( as.scalar( X[j,col] == uniqueValues[i,1] )) { - # Define the formula of the start and end column position - startCol = index*(ncol(X)-1) +2 - endCol = startCol + (ncol(X)-2) - - if (col == 1) { - # Case when the selected column is the first column - Y[i,startCol:endCol] = X[j,2:ncol(X)] - } - else if (col == ncol(X)) { - # Case when the selected column is the last column - Y[i,startCol:endCol] = X[j,1:(ncol(X)-1)] - } - else { - # General case - newRow = cbind(X[j, 1:(col-1)], X[j, (col+1):ncol(X)]) - Y[i,startCol:endCol] = newRow - } - index = index +1 + # save the initial order of the groups in Y + Yindexes = order(target = Y, by = 1, index.return = TRUE) + + # order Y by the groups increasing + Y = order(target = Y, by = 1, decreasing = FALSE, index.return = FALSE) + + # order the initial matrix + Xindexes = order(target = X, by = col, index.return = TRUE) + X = order(target = X, by = col, decreasing = FALSE, index.return = FALSE) + + # the amount of unique groups that are remaining + restingGroups = nrow(uniqueValues) + + # intial group to copy the values into Y + Ypos = 1 + + # start positon in the row final matrix + YrowStart = 1 + i = 1 + + #loop over the initial matrix + while(restingGroups > 0){ + + currentGroup = as.scalar(Y[Ypos,1]) + + # amount of rows that need to be copied + amountRows = 0 + + # find the rows for the current group + GROUP = 1 + while(GROUP > 0){ + # break if there are no more row left in X + if(i > nrow(X)){ + GROUP = 0 + } + 
# check if the row belongs to the current group + else if(as.scalar(X[i, col]) == currentGroup){ + amountRows = amountRows + 1 + i = i + 1 + } + # break if the row does not belong to the current group + else{ + GROUP = 0 + } } - } + + # copy the values into the final matrix + if (amountRows > 0){ + # create a matrix of the row that should be copied without the group column + YrowEnd = YrowStart + amountRows - 1 + + # case selected column is first column + if (col == 1){ + newMatrix = X[YrowStart:YrowEnd, (col+1):ncol(X)] + } + # case selected column is last column + else if (col == ncol(X)) { + newMatrix = X [YrowStart:YrowEnd, 1:col-1] + } + # normal case + else { + newMatrix = cbind(X[YrowStart:YrowEnd, 1:(col-1)], X[YrowStart:YrowEnd, (col+1):ncol(X)]) + } + + # flatten the new row + newRow = matrix(newMatrix, rows = 1, cols = nrow(newMatrix) * ncol(newMatrix)) + newRowColumns = amountRows * (ncol(X)-1) + + # add the new row into Y at the current group + Y[Ypos, 2: (newRowColumns + 1)] = newRow + } + + # continue with the next group + Ypos = Ypos + 1 + YrowStart = YrowStart + amountRows + restingGroups = restingGroups - 1 } + + #restore the initial order of X + X = cbind(X, Xindexes) + ncol_X = ncol(X) + X = order(target = X, by= ncol_X) + X = X[, 1:ncol_X-1] + + #restore the initial order of Y + Y = cbind(Y, Yindexes) + ncol_Y = ncol(Y) + Y = order(target = Y, by= ncol_Y) + Y = Y[, 1:ncol_Y-1] } else if (method == "permutation-matrix") { # Extract the grouping column and create unique groups From 66656c70ba5a045040a5063cfb8e43c5f91ef83a Mon Sep 17 00:00:00 2001 From: maxrankl Date: Tue, 8 Jul 2025 22:30:28 +0200 Subject: [PATCH 17/25] commit to merge the fix of permutation matrix --- scripts/builtin/raGroupby.dml | 1 + 1 file changed, 1 insertion(+) diff --git a/scripts/builtin/raGroupby.dml b/scripts/builtin/raGroupby.dml index ed7b8b37d87..d116aee113e 100644 --- a/scripts/builtin/raGroupby.dml +++ b/scripts/builtin/raGroupby.dml @@ -143,6 +143,7 @@ 
m_raGroupby = function (Matrix[Double] X, Integer col, String method) ncol_Y = ncol(Y) Y = order(target = Y, by= ncol_Y) Y = Y[, 1:ncol_Y-1] + } else if (method == "permutation-matrix") { # Extract the grouping column and create unique groups From 09cc1d8cf84327a1546b443f64c2a30ea45d634f Mon Sep 17 00:00:00 2001 From: maxrankl Date: Thu, 24 Jul 2025 11:08:13 +0200 Subject: [PATCH 18/25] alternative version of permuatation works except edge cases --- scripts/builtin/raGroupby.dml | 117 +++++++++++++++++++++++++++++++++- 1 file changed, 114 insertions(+), 3 deletions(-) diff --git a/scripts/builtin/raGroupby.dml b/scripts/builtin/raGroupby.dml index d116aee113e..ba42df8af17 100644 --- a/scripts/builtin/raGroupby.dml +++ b/scripts/builtin/raGroupby.dml @@ -146,6 +146,8 @@ m_raGroupby = function (Matrix[Double] X, Integer col, String method) } else if (method == "permutation-matrix") { + print("X") + print(toString(X)) # Extract the grouping column and create unique groups key = X[,col] key_unique = unique(X[, col]) @@ -160,25 +162,112 @@ m_raGroupby = function (Matrix[Double] X, Integer col, String method) # Determine the maximum number of rows in any group maxRowsInGroup = max(table(X[,col],1)) + print("maxRowsin group") + print(toString(maxRowsInGroup)) totalCells = (maxRowsInGroup) * (ncol(X)-1) +1 # Create permutation matrix P copy relevant tuples with a single matrix multiplication P = matrix(0, rows=nrow(X), cols=numGroups * maxRowsInGroup) - # Create offsets to store the first column of each group - offsets = matrix(seq(0, (numGroups-1)*maxRowsInGroup, maxRowsInGroup), rows=numGroups, cols=1) # Create row and column index for the permutation matrix rowIndex = seq(1, nrow(X)) indexWithInGroups = cumsum(t(table(groupIndex, seq(1, nrow(X)), numGroups, nrow(X)))) selectedMatrix = table(seq(1, nrow(indexWithInGroups)), groupIndex) + print("selected Matrix") + print(toString(selectedMatrix)) + colIndex = groupIndex * maxRowsInGroup - maxRowsInGroup + 
rowSums(indexWithInGroups * selectedMatrix) + print(toString(indexWithInGroups)) + print(toString(colIndex)) # Set values in P P = table(seq(1, nrow(X)), colIndex) - + print("P") + print(toString(t(P))) # Perform matrix multiplication Y_temp = t(P) %*% X + + #Y_temp = X + print("Y_temp") + print(toString(Y_temp)) + + + # new implementation + # order the initial matrix + Xindexes = order(target = X, by = col, index.return = TRUE) + X = order(target = X, by = col, decreasing = FALSE, index.return = FALSE) + + rowPerGroup = cumsum(selectedMatrix) + rowPerGroup = rowPerGroup[nrow(rowPerGroup):nrow(rowPerGroup),] + print("rowsPerGroup") + print(toString(rowPerGroup)) + + longestRowY = maxRowsInGroup*(ncol(X)-1)+1 + biggestGroup = max(rowPerGroup) + print("longest row") + print(longestRowY) + print("biggest group") + print(biggestGroup) + + n_rpg = ncol(rowPerGroup) + + #create matrix with the missing padding + biggroupseq = matrix(biggestGroup, nrow(rowPerGroup), ncol(rowPerGroup)) + print("bi9ggroupsq") + print(toString(biggroupseq)) + missingpadding = biggroupseq - rowPerGroup + print("missing padding") + print(toString(missingpadding)) + amountofzerorows = sum(missingpadding) + print(amountofzerorows) + paddingwithoutgroups = matrix(0, rows=amountofzerorows, cols=(ncol(X))) + print("padding without groups") + print(toString(paddingwithoutgroups)) + + # group col for padding + print("key unique") + print(toString(key_unique)) + missingpadding_t = t(missingpadding) + print("missingpadding t") + print(toString(missingpadding_t)) + + r = missingpadding_t + g = key_unique + + n_g = nrow(g) + n_r = nrow(r) + total = sum(r) + + r_sliced = r[1:n_r-1,] + starts = cumsum(rbind(matrix(0, rows=n_r-nrow(r_sliced), cols=1), r_sliced)) + output_positions = seq(0, total-1) + output_positions_row = t(output_positions %*% matrix(1, rows=1, cols=n_g)) + + starts_col = starts %*% matrix(1, rows=1, cols=n_r) + starts_col = starts_col[, 1:total] + + ends_col = (starts + r ) %*% matrix(1, 
rows=1, cols=n_r) + ends_col = ends_col[, 1:total] + + in_group = (output_positions_row >= starts_col) & (output_positions_row < ends_col) + result = t(in_group) %*% g + + print(toString(result)) + + # insert group into the missing padding + paddingwithoutgroups [, col] = result + print("paddingwithoutgroups") + print(toString(paddingwithoutgroups)) + Y_temp = X + Y_temp = rbind(Y_temp, paddingwithoutgroups) + Y_temp = order(target = Y_temp, by = col, decreasing = FALSE, index.return = FALSE) + + print("Y_temp") + print(toString(Y_temp)) + # end new implementation + + # Remove the selected column from Y_temp if( col == 1 ) { Y_temp_reduce = Y_temp[, col+1:ncol(Y_temp)] @@ -190,10 +279,20 @@ m_raGroupby = function (Matrix[Double] X, Integer col, String method) Y_temp_reduce = cbind(Y_temp[, 1:col-1],Y_temp[, col+1:ncol(Y_temp)]) } + print("Y_temp reduce") + print(toString(Y_temp_reduce)) + # Set value of final output Y = matrix(0, rows=numGroups, cols=totalCells) Y[,1] = key_unique + # save the initial order of the groups in Y + Yindexes = order(target = Y, by = 1, index.return = TRUE) + + # order Y by the groups increasing + Y = order(target = Y, by = 1, decreasing = FALSE, index.return = FALSE) + + # The permutation matrix creates a structure where each group's data # may not fill exactly maxRowsInGroup rows. # If needed, we need to pad to the expected size first. 
@@ -207,8 +306,20 @@ m_raGroupby = function (Matrix[Double] X, Integer col, String method) } else { Y_tmp_padded = Y_temp_reduce } + print("Y_temp padded ") + print(toString(Y_tmp_padded)) Y[,2:ncol(Y)] = matrix(Y_tmp_padded, rows=numGroups, cols=totalCells-1) + + #restore the initial order of Y + Y = cbind(Y, Yindexes) + ncol_Y = ncol(Y) + Y = order(target = Y, by= ncol_Y) + Y = Y[, 1:ncol_Y-1] + + print("Y") + print(toString(Y)) + } } From 81d4785760cbf615d1434a6a854e2f2fb3f094d4 Mon Sep 17 00:00:00 2001 From: maxrankl Date: Thu, 24 Jul 2025 11:19:32 +0200 Subject: [PATCH 19/25] alternative version of permuatation works with edge cases, but needs cleanup --- scripts/builtin/raGroupby.dml | 91 ++++++++++++++++++----------------- 1 file changed, 48 insertions(+), 43 deletions(-) diff --git a/scripts/builtin/raGroupby.dml b/scripts/builtin/raGroupby.dml index ba42df8af17..2f70d6e3f35 100644 --- a/scripts/builtin/raGroupby.dml +++ b/scripts/builtin/raGroupby.dml @@ -220,49 +220,54 @@ m_raGroupby = function (Matrix[Double] X, Integer col, String method) print("missing padding") print(toString(missingpadding)) amountofzerorows = sum(missingpadding) - print(amountofzerorows) - paddingwithoutgroups = matrix(0, rows=amountofzerorows, cols=(ncol(X))) - print("padding without groups") - print(toString(paddingwithoutgroups)) - - # group col for padding - print("key unique") - print(toString(key_unique)) - missingpadding_t = t(missingpadding) - print("missingpadding t") - print(toString(missingpadding_t)) - - r = missingpadding_t - g = key_unique - - n_g = nrow(g) - n_r = nrow(r) - total = sum(r) - - r_sliced = r[1:n_r-1,] - starts = cumsum(rbind(matrix(0, rows=n_r-nrow(r_sliced), cols=1), r_sliced)) - output_positions = seq(0, total-1) - output_positions_row = t(output_positions %*% matrix(1, rows=1, cols=n_g)) - - starts_col = starts %*% matrix(1, rows=1, cols=n_r) - starts_col = starts_col[, 1:total] - - ends_col = (starts + r ) %*% matrix(1, rows=1, cols=n_r) - ends_col = 
ends_col[, 1:total] - - in_group = (output_positions_row >= starts_col) & (output_positions_row < ends_col) - result = t(in_group) %*% g - - print(toString(result)) - - # insert group into the missing padding - paddingwithoutgroups [, col] = result - print("paddingwithoutgroups") - print(toString(paddingwithoutgroups)) - Y_temp = X - Y_temp = rbind(Y_temp, paddingwithoutgroups) - Y_temp = order(target = Y_temp, by = col, decreasing = FALSE, index.return = FALSE) - + if (amountofzerorows > 0){ + print(amountofzerorows) + paddingwithoutgroups = matrix(0, rows=amountofzerorows, cols=(ncol(X))) + print("padding without groups") + print(toString(paddingwithoutgroups)) + + # group col for padding + print("key unique") + print(toString(key_unique)) + missingpadding_t = t(missingpadding) + print("missingpadding t") + print(toString(missingpadding_t)) + + r = missingpadding_t + g = key_unique + + n_g = nrow(g) + n_r = nrow(r) + total = sum(r) + + r_sliced = r[1:n_r-1,] + starts = cumsum(rbind(matrix(0, rows=n_r-nrow(r_sliced), cols=1), r_sliced)) + output_positions = seq(0, total-1) + output_positions_row = t(output_positions %*% matrix(1, rows=1, cols=n_g)) + + starts_col = starts %*% matrix(1, rows=1, cols=n_r) + starts_col = starts_col[, 1:total] + + ends_col = (starts + r ) %*% matrix(1, rows=1, cols=n_r) + ends_col = ends_col[, 1:total] + + in_group = (output_positions_row >= starts_col) & (output_positions_row < ends_col) + result = t(in_group) %*% g + + print(toString(result)) + + # insert group into the missing padding + paddingwithoutgroups [, col] = result + print("paddingwithoutgroups") + print(toString(paddingwithoutgroups)) + Y_temp = X + Y_temp = rbind(Y_temp, paddingwithoutgroups) + Y_temp = order(target = Y_temp, by = col, decreasing = FALSE, index.return = FALSE) + } + else{ + Y_temp = X + Y_temp = order(target = Y_temp, by = col, decreasing = FALSE, index.return = FALSE) + } print("Y_temp") print(toString(Y_temp)) # end new implementation From 
effc3237f85619e71d25b5d6a7051c409a2d9ccd Mon Sep 17 00:00:00 2001 From: maxrankl Date: Fri, 25 Jul 2025 01:08:51 +0200 Subject: [PATCH 20/25] permutation matrix is not a real permutation amtrix anymore because it uses one loop to avoid heap overload for bigger data sets --- scripts/builtin/raGroupby.dml | 159 +++++++++------------------------- 1 file changed, 39 insertions(+), 120 deletions(-) diff --git a/scripts/builtin/raGroupby.dml b/scripts/builtin/raGroupby.dml index 2f70d6e3f35..a1bb54c79cf 100644 --- a/scripts/builtin/raGroupby.dml +++ b/scripts/builtin/raGroupby.dml @@ -146,149 +146,86 @@ m_raGroupby = function (Matrix[Double] X, Integer col, String method) } else if (method == "permutation-matrix") { - print("X") - print(toString(X)) # Extract the grouping column and create unique groups key = X[,col] key_unique = unique(X[, col]) numGroups = nrow(key_unique) - # Matrix for comparison - key_compare = key_unique %*% matrix(1, rows=1, cols=nrow(X)) - key_matrix = matrix(1, rows=nrow(key_unique), cols=1) %*% t(key) - - # Find group index - groupIndex = rowIndexMax(t(key_compare == key_matrix)) - # Determine the maximum number of rows in any group maxRowsInGroup = max(table(X[,col],1)) - print("maxRowsin group") - print(toString(maxRowsInGroup)) - totalCells = (maxRowsInGroup) * (ncol(X)-1) +1 - - # Create permutation matrix P copy relevant tuples with a single matrix multiplication - P = matrix(0, rows=nrow(X), cols=numGroups * maxRowsInGroup) - - # Create row and column index for the permutation matrix - rowIndex = seq(1, nrow(X)) - indexWithInGroups = cumsum(t(table(groupIndex, seq(1, nrow(X)), numGroups, nrow(X)))) - selectedMatrix = table(seq(1, nrow(indexWithInGroups)), groupIndex) - print("selected Matrix") - print(toString(selectedMatrix)) - - colIndex = groupIndex * maxRowsInGroup - maxRowsInGroup + rowSums(indexWithInGroups * selectedMatrix) - print(toString(indexWithInGroups)) - print(toString(colIndex)) - - # Set values in P - P = table(seq(1, 
nrow(X)), colIndex) - print("P") - print(toString(t(P))) - # Perform matrix multiplication - Y_temp = t(P) %*% X + #calculate the frequency of each group + freqPerGroup = table(key, 1) + freqPerGroup = removeEmpty(target = freqPerGroup, margin = "rows") + freqPerGroupIdx = order(target = key_unique, by = 1, index.return = TRUE) - #Y_temp = X - print("Y_temp") - print(toString(Y_temp)) - - - # new implementation - # order the initial matrix - Xindexes = order(target = X, by = col, index.return = TRUE) - X = order(target = X, by = col, decreasing = FALSE, index.return = FALSE) + # match the length of freqPerGroup to key_uniqe and sort it accordingly + freqPerGroup = cbind(freqPerGroup, freqPerGroupIdx) + ncol_fpg = ncol(freqPerGroup) + freqPerGroup = order(target = freqPerGroup, by= ncol_fpg) + freqPerGroup = freqPerGroup[, 1:ncol_fpg-1] - rowPerGroup = cumsum(selectedMatrix) - rowPerGroup = rowPerGroup[nrow(rowPerGroup):nrow(rowPerGroup),] - print("rowsPerGroup") - print(toString(rowPerGroup)) + rowPerGroup = t(freqPerGroup) longestRowY = maxRowsInGroup*(ncol(X)-1)+1 biggestGroup = max(rowPerGroup) - print("longest row") - print(longestRowY) - print("biggest group") - print(biggestGroup) - - n_rpg = ncol(rowPerGroup) #create matrix with the missing padding - biggroupseq = matrix(biggestGroup, nrow(rowPerGroup), ncol(rowPerGroup)) - print("bi9ggroupsq") - print(toString(biggroupseq)) - missingpadding = biggroupseq - rowPerGroup - print("missing padding") - print(toString(missingpadding)) - amountofzerorows = sum(missingpadding) - if (amountofzerorows > 0){ - print(amountofzerorows) - paddingwithoutgroups = matrix(0, rows=amountofzerorows, cols=(ncol(X))) - print("padding without groups") - print(toString(paddingwithoutgroups)) + biggestGroupSeq = matrix(biggestGroup, nrow(rowPerGroup), ncol(rowPerGroup)) + missingPadding = biggestGroupSeq - rowPerGroup + amountOfZeroRows = sum(missingPadding) - # group col for padding - print("key unique") - 
print(toString(key_unique)) - missingpadding_t = t(missingpadding) - print("missingpadding t") - print(toString(missingpadding_t)) + if (amountOfZeroRows > 0){ + paddingwithoutgroups = matrix(0, rows=amountOfZeroRows, cols=(ncol(X))) - r = missingpadding_t + # group col for padding + r = t(missingPadding) g = key_unique - n_g = nrow(g) - n_r = nrow(r) - total = sum(r) + #loop start + total_repeats = sum(r) + A = matrix(0, rows=total_repeats, cols=1) - r_sliced = r[1:n_r-1,] - starts = cumsum(rbind(matrix(0, rows=n_r-nrow(r_sliced), cols=1), r_sliced)) - output_positions = seq(0, total-1) - output_positions_row = t(output_positions %*% matrix(1, rows=1, cols=n_g)) + row_idx = 1 + A_idx_s = 1 + for(i in 1:nrow(r)) { + repeat_count = as.scalar(r[i,1]) + if(repeat_count > 0){ + temp = matrix(as.scalar(g[i, 1]), rows=repeat_count, cols = 1) - starts_col = starts %*% matrix(1, rows=1, cols=n_r) - starts_col = starts_col[, 1:total] + A_idx_e = A_idx_s + repeat_count - 1 - ends_col = (starts + r ) %*% matrix(1, rows=1, cols=n_r) - ends_col = ends_col[, 1:total] - - in_group = (output_positions_row >= starts_col) & (output_positions_row < ends_col) - result = t(in_group) %*% g - - print(toString(result)) + A[A_idx_s:A_idx_e, 1] = temp + A_idx_s = A_idx_e + 1 + } + } # insert group into the missing padding - paddingwithoutgroups [, col] = result - print("paddingwithoutgroups") - print(toString(paddingwithoutgroups)) - Y_temp = X - Y_temp = rbind(Y_temp, paddingwithoutgroups) + paddingwithoutgroups [, col] = A + Y_temp = rbind(X, paddingwithoutgroups) Y_temp = order(target = Y_temp, by = col, decreasing = FALSE, index.return = FALSE) } else{ Y_temp = X Y_temp = order(target = Y_temp, by = col, decreasing = FALSE, index.return = FALSE) } - print("Y_temp") - print(toString(Y_temp)) # end new implementation # Remove the selected column from Y_temp if( col == 1 ) { - Y_temp_reduce = Y_temp[, col+1:ncol(Y_temp)] + Y_temp = Y_temp[, col+1:ncol(Y_temp)] } else if( col == ncol(X) ) { 
- Y_temp_reduce = Y_temp[, 1:col-1] + Y_temp = Y_temp[, 1:col-1] } else{ - Y_temp_reduce = cbind(Y_temp[, 1:col-1],Y_temp[, col+1:ncol(Y_temp)]) + Y_temp = cbind(Y_temp[, 1:col-1],Y_temp[, col+1:ncol(Y_temp)]) } - print("Y_temp reduce") - print(toString(Y_temp_reduce)) # Set value of final output - Y = matrix(0, rows=numGroups, cols=totalCells) + Y = matrix(0, rows=numGroups, cols=longestRowY) Y[,1] = key_unique # save the initial order of the groups in Y @@ -297,24 +234,8 @@ m_raGroupby = function (Matrix[Double] X, Integer col, String method) # order Y by the groups increasing Y = order(target = Y, by = 1, decreasing = FALSE, index.return = FALSE) - - # The permutation matrix creates a structure where each group's data - # may not fill exactly maxRowsInGroup rows. - # If needed, we need to pad to the expected size first. - expectedRows = numGroups * maxRowsInGroup - actualRows = nrow(Y_temp_reduce) - - if(actualRows < expectedRows) { - # Pad Y_temp_reduce with zeros to match expected structure - Y_tmp_padded = matrix(0, rows=expectedRows, cols=ncol(Y_temp_reduce)) - Y_tmp_padded[1:actualRows,] = Y_temp_reduce - } else { - Y_tmp_padded = Y_temp_reduce - } - print("Y_temp padded ") - print(toString(Y_tmp_padded)) - - Y[,2:ncol(Y)] = matrix(Y_tmp_padded, rows=numGroups, cols=totalCells-1) + # copy the values into Y + Y[,2:ncol(Y)] = matrix(Y_temp, rows=numGroups, cols=longestRowY-1) #restore the initial order of Y Y = cbind(Y, Yindexes) @@ -322,8 +243,6 @@ m_raGroupby = function (Matrix[Double] X, Integer col, String method) Y = order(target = Y, by= ncol_Y) Y = Y[, 1:ncol_Y-1] - print("Y") - print(toString(Y)) } } From debdffcea12a1ee0413bc15e5123543e20b328ff Mon Sep 17 00:00:00 2001 From: maxrankl Date: Sat, 26 Jul 2025 17:50:26 +0200 Subject: [PATCH 21/25] alternative version of permutation matrix sorts X and calculates the padding via matrix multiplikation. binding the padding to the existing X and sorting it according to their group. 
Then X gets reshaped to Y --- scripts/builtin/raGroupby.dml | 58 ++++++++++++++++++++++++----------- 1 file changed, 40 insertions(+), 18 deletions(-) diff --git a/scripts/builtin/raGroupby.dml b/scripts/builtin/raGroupby.dml index a1bb54c79cf..e084c0150c3 100644 --- a/scripts/builtin/raGroupby.dml +++ b/scripts/builtin/raGroupby.dml @@ -172,36 +172,58 @@ m_raGroupby = function (Matrix[Double] X, Integer col, String method) #create matrix with the missing padding biggestGroupSeq = matrix(biggestGroup, nrow(rowPerGroup), ncol(rowPerGroup)) + missingPadding = biggestGroupSeq - rowPerGroup + amountOfZeroRows = sum(missingPadding) + if (amountOfZeroRows > 0){ - paddingwithoutgroups = matrix(0, rows=amountOfZeroRows, cols=(ncol(X))) - # group col for padding - r = t(missingPadding) + paddingwithoutgroups = matrix(0, rows = amountOfZeroRows, cols = (ncol(X))) + missingPadding = t(missingPadding) + g = key_unique - #loop start - total_repeats = sum(r) - A = matrix(0, rows=total_repeats, cols=1) + length_g = nrow(key_unique) + max_mp = sum(missingPadding) - row_idx = 1 - A_idx_s = 1 - for(i in 1:nrow(r)) { - repeat_count = as.scalar(r[i,1]) - if(repeat_count > 0){ - temp = matrix(as.scalar(g[i, 1]), rows=repeat_count, cols = 1) + removemask = (missingPadding != 0) - A_idx_e = A_idx_s + repeat_count - 1 + missingPadding = cbind(key_unique, missingPadding) + + missingPadding = removeEmpty(target = missingPadding, margin = "rows", select = removemask) + + + keysPadding = missingPadding[,1] + missingPadding = missingPadding[,2] + + lenKeysPadding = nrow(keysPadding) + + + missingPadding = missingPadding %*% matrix(1, rows=1, cols = amountOfZeroRows) + + + mask = t(seq(1, amountOfZeroRows)) + + mask = matrix(1, rows=lenKeysPadding, cols = 1) %*% mask + + + mask = mask - missingPadding + mask = (mask <= 0) + + keysPadding = keysPadding %*% matrix(1, rows=1, cols = amountOfZeroRows) + + + missingPadding = keysPadding * mask + total_cells = 
nrow(missingPadding)*ncol(missingPadding) + + missingPadding = matrix(missingPadding, rows = total_cells, cols =1) + missingPadding = removeEmpty(target = missingPadding, margin = "rows") - A[A_idx_s:A_idx_e, 1] = temp - A_idx_s = A_idx_e + 1 - } - } # insert group into the missing padding - paddingwithoutgroups [, col] = A + paddingwithoutgroups [, col] = missingPadding Y_temp = rbind(X, paddingwithoutgroups) Y_temp = order(target = Y_temp, by = col, decreasing = FALSE, index.return = FALSE) } From 54a7b2ccc90d8e88aeaf6e5b481c46d98c2f6a88 Mon Sep 17 00:00:00 2001 From: maxrankl Date: Sat, 26 Jul 2025 23:24:35 +0200 Subject: [PATCH 22/25] Alternative version of a permutation matrix that really uses a permutation matrix. It uses a loop to generate the padding to ensure robustness compared to the original version that fails for bigger data set. It beats performance in most cases. It still needs some cleanup and renaming of variables for better comprehension. --- scripts/builtin/raGroupby.dml | 78 ++++++++++++++++++++++------------- 1 file changed, 50 insertions(+), 28 deletions(-) diff --git a/scripts/builtin/raGroupby.dml b/scripts/builtin/raGroupby.dml index e084c0150c3..b7a641fbeeb 100644 --- a/scripts/builtin/raGroupby.dml +++ b/scripts/builtin/raGroupby.dml @@ -179,15 +179,9 @@ m_raGroupby = function (Matrix[Double] X, Integer col, String method) if (amountOfZeroRows > 0){ - - paddingwithoutgroups = matrix(0, rows = amountOfZeroRows, cols = (ncol(X))) + #paddingwithoutgroups = matrix(0, rows = amountOfZeroRows, cols = (ncol(X))) missingPadding = t(missingPadding) - g = key_unique - - length_g = nrow(key_unique) - max_mp = sum(missingPadding) - removemask = (missingPadding != 0) missingPadding = cbind(key_unique, missingPadding) @@ -198,34 +192,46 @@ m_raGroupby = function (Matrix[Double] X, Integer col, String method) keysPadding = missingPadding[,1] missingPadding = missingPadding[,2] - lenKeysPadding = nrow(keysPadding) - + g = keysPadding + r = missingPadding 
- missingPadding = missingPadding %*% matrix(1, rows=1, cols = amountOfZeroRows) + total_repeats = sum(r) + A = matrix(0, rows=total_repeats, cols=1) + row_idx = 1 + A_idx_s = 1 + for(i in 1:nrow(r)) { + repeat_count = as.scalar(r[i,1]) + if(repeat_count > 0){ + temp = matrix(as.scalar(g[i, 1]), rows=repeat_count, cols = 1) + A_idx_e = A_idx_s + repeat_count - 1 + A[A_idx_s:A_idx_e, 1] = temp + A_idx_s = A_idx_e + 1 + } + } - mask = t(seq(1, amountOfZeroRows)) + #paddingwithoutgroups [, col] = A + #paddingwithoutgroups = paddingwithoutgroups [, col:col+1] + padding = matrix(0, rows = nrow(A), cols = 1) + paddingwithoutgroups = cbind(A, padding) + key_e = key %*% matrix(1, rows = 1, cols = 2) - mask = matrix(1, rows=lenKeysPadding, cols = 1) %*% mask + Y_temp = rbind(key_e, paddingwithoutgroups) + Y_temp = order(target = Y_temp, by = 1, decreasing = FALSE, index.return = FALSE) + zero = Y_temp[, 2] - mask = mask - missingPadding - mask = (mask <= 0) - keysPadding = keysPadding %*% matrix(1, rows=1, cols = amountOfZeroRows) + Y_temp_idx = order(target = Y_temp, by = 1, decreasing = FALSE, index.return = TRUE) + Y_temp_idx = removeEmpty(target = Y_temp_idx, margin = "rows", select = (zero!=0)) + P = table(seq(1, nrow(X)), Y_temp_idx) - missingPadding = keysPadding * mask - total_cells = nrow(missingPadding)*ncol(missingPadding) + # order the initial matrix + Xindexes = order(target = X, by = col, index.return = TRUE) + X = order(target = X, by = col, decreasing = FALSE, index.return = FALSE) - missingPadding = matrix(missingPadding, rows = total_cells, cols =1) - missingPadding = removeEmpty(target = missingPadding, margin = "rows") - - - # insert group into the missing padding - paddingwithoutgroups [, col] = missingPadding - Y_temp = rbind(X, paddingwithoutgroups) - Y_temp = order(target = Y_temp, by = col, decreasing = FALSE, index.return = FALSE) + Y_temp = t(P) %*% X } else{ Y_temp = X @@ -250,6 +256,22 @@ m_raGroupby = function (Matrix[Double] X, Integer col, 
String method) Y = matrix(0, rows=numGroups, cols=longestRowY) Y[,1] = key_unique + # The permutation matrix creates a structure where each group's data + # may not fill exactly maxRowsInGroup rows. + # If needed, we need to pad to the expected size first. + expectedRows = numGroups * maxRowsInGroup + actualRows = nrow(Y_temp) + + if(actualRows < expectedRows) { + # Pad Y_temp with zeros to match expected structure + Y_tmp_padded = matrix(0, rows=expectedRows, cols=ncol(Y_temp)) + Y_tmp_padded[1:actualRows,] = Y_temp + } else { + Y_tmp_padded = Y_temp + } + + + # save the initial order of the groups in Y Yindexes = order(target = Y, by = 1, index.return = TRUE) @@ -257,15 +279,15 @@ m_raGroupby = function (Matrix[Double] X, Integer col, String method) Y = order(target = Y, by = 1, decreasing = FALSE, index.return = FALSE) # copy the values into Y - Y[,2:ncol(Y)] = matrix(Y_temp, rows=numGroups, cols=longestRowY-1) - + #Y[,2:ncol(Y)] = matrix(Y_temp, rows=numGroups, cols=longestRowY-1) + Y[,2:ncol(Y)] = matrix(Y_tmp_padded, rows=numGroups, cols=longestRowY-1) + #Y = X #restore the initial order of Y Y = cbind(Y, Yindexes) ncol_Y = ncol(Y) Y = order(target = Y, by= ncol_Y) Y = Y[, 1:ncol_Y-1] - } } From 098c387ff8e80826a5a9d92abb7bc55de4432b92 Mon Sep 17 00:00:00 2001 From: maxrankl Date: Sun, 27 Jul 2025 13:29:02 +0200 Subject: [PATCH 23/25] Alternative version of a permutation matrix that really uses a permutation matrix. It uses a loop to generate the padding to ensure robustness compared to the original version that fails for bigger data set. It beats performance in most cases. I added the comments and cleaned up the naming of the variables. 
--- scripts/builtin/raGroupby.dml | 153 +++++++++++++++++----------------- 1 file changed, 76 insertions(+), 77 deletions(-) diff --git a/scripts/builtin/raGroupby.dml b/scripts/builtin/raGroupby.dml index b7a641fbeeb..9fd66735bee 100644 --- a/scripts/builtin/raGroupby.dml +++ b/scripts/builtin/raGroupby.dml @@ -148,146 +148,145 @@ m_raGroupby = function (Matrix[Double] X, Integer col, String method) else if (method == "permutation-matrix") { # Extract the grouping column and create unique groups key = X[,col] - key_unique = unique(X[, col]) - numGroups = nrow(key_unique) + keyUnique = unique(X[, col]) + numGroups = nrow(keyUnique) # Determine the maximum number of rows in any group maxRowsInGroup = max(table(X[,col],1)) - #calculate the frequency of each group - freqPerGroup = table(key, 1) - freqPerGroup = removeEmpty(target = freqPerGroup, margin = "rows") - freqPerGroupIdx = order(target = key_unique, by = 1, index.return = TRUE) + # calculate the frequency of each group + freqPerKey = table(key, 1) + freqPerKey = removeEmpty(target = freqPerKey, margin = "rows") + freqPerKeyIndexes = order(target = keyUnique, by = 1, index.return = TRUE) - # match the length of freqPerGroup to key_uniqe and sort it accordingly - freqPerGroup = cbind(freqPerGroup, freqPerGroupIdx) - ncol_fpg = ncol(freqPerGroup) - freqPerGroup = order(target = freqPerGroup, by= ncol_fpg) - freqPerGroup = freqPerGroup[, 1:ncol_fpg-1] - - rowPerGroup = t(freqPerGroup) + # match the length of freqPerKey to key_uniqe and sort it accordingly + freqPerKey = cbind(freqPerKey, freqPerKeyIndexes) + ncolFpk = ncol(freqPerKey) + freqPerKey = order(target = freqPerKey, by= ncolFpk) + freqPerKey = freqPerKey[, 1:ncolFpk-1] + freqPerKey = t(freqPerKey) + # calculate the longest row in Y, which is the biggest key in X longestRowY = maxRowsInGroup*(ncol(X)-1)+1 - biggestGroup = max(rowPerGroup) - - #create matrix with the missing padding - biggestGroupSeq = matrix(biggestGroup, nrow(rowPerGroup), 
ncol(rowPerGroup)) - - missingPadding = biggestGroupSeq - rowPerGroup + biggestKey = max(freqPerKey) + # caclulate the amount of rows that need padding and the amount of padding per key + biggestKeySeq = matrix(biggestKey, nrow(freqPerKey), ncol(freqPerKey)) + missingPadding = biggestKeySeq - freqPerKey amountOfZeroRows = sum(missingPadding) - + # case padding is needed if (amountOfZeroRows > 0){ - #paddingwithoutgroups = matrix(0, rows = amountOfZeroRows, cols = (ncol(X))) missingPadding = t(missingPadding) + # remove the keys that dont need padding removemask = (missingPadding != 0) - - missingPadding = cbind(key_unique, missingPadding) - + missingPadding = cbind(keyUnique, missingPadding) missingPadding = removeEmpty(target = missingPadding, margin = "rows", select = removemask) - + # save the keys that need padding keysPadding = missingPadding[,1] + + # save the amount of padding per group missingPadding = missingPadding[,2] - g = keysPadding - r = missingPadding + repeatKeys = matrix(0, rows=amountOfZeroRows, cols=1) - total_repeats = sum(r) - A = matrix(0, rows=total_repeats, cols=1) + # uses a loop to generate the repeting keys + # loop to heap space problem with larger data sets - row_idx = 1 - A_idx_s = 1 - for(i in 1:nrow(r)) { - repeat_count = as.scalar(r[i,1]) - if(repeat_count > 0){ - temp = matrix(as.scalar(g[i, 1]), rows=repeat_count, cols = 1) - A_idx_e = A_idx_s + repeat_count - 1 - A[A_idx_s:A_idx_e, 1] = temp - A_idx_s = A_idx_e + 1 - } - } + #start index for the repeatKeys + repeatKeysIdxS = 1 - #paddingwithoutgroups [, col] = A - #paddingwithoutgroups = paddingwithoutgroups [, col:col+1] - padding = matrix(0, rows = nrow(A), cols = 1) - paddingwithoutgroups = cbind(A, padding) - key_e = key %*% matrix(1, rows = 1, cols = 2) + for(i in 1:nrow(missingPadding)) { + repeat_count = as.scalar(missingPadding[i,1]) + if(repeat_count > 0){ + temp = matrix(as.scalar(keysPadding[i, 1]), rows=repeat_count, cols = 1) + repeatKeysIdxE = repeatKeysIdxS + 
repeat_count - 1 + repeatKeys[repeatKeysIdxS:repeatKeysIdxE, 1] = temp + repeatKeysIdxS = repeatKeysIdxE + 1 + } + } - Y_temp = rbind(key_e, paddingwithoutgroups) + # combine the keys that need padding with the actual padding + padding = matrix(0, rows = nrow(repeatKeys), cols = 1) + padding = cbind(repeatKeys, padding) - Y_temp = order(target = Y_temp, by = 1, decreasing = FALSE, index.return = FALSE) - zero = Y_temp[, 2] + # extend the existing keys to a second column to match the padded keys + key = key %*% matrix(1, rows = 1, cols = 2) + # combine the keys with the padded keys and sort them increasingly + tempY = rbind(key, padding) + tempY = order(target = tempY, by = 1, decreasing = FALSE, index.return = FALSE) - Y_temp_idx = order(target = Y_temp, by = 1, decreasing = FALSE, index.return = TRUE) - Y_temp_idx = removeEmpty(target = Y_temp_idx, margin = "rows", select = (zero!=0)) + # remove the padded rows and save the indexes of the combined keys for the permutation matrix + paddedRows = tempY[, 2] + tempYindexes = order(target = tempY, by = 1, decreasing = FALSE, index.return = TRUE) + tempYindexes = removeEmpty(target = tempYindexes, margin = "rows", select = (paddedRows!=0)) - P = table(seq(1, nrow(X)), Y_temp_idx) + # create the permutation matrix by using the indexes of the combined keys + P = table(seq(1, nrow(X)), tempYindexes) - # order the initial matrix + # order the initial matrix to match the sorted keys with padding Xindexes = order(target = X, by = col, index.return = TRUE) X = order(target = X, by = col, decreasing = FALSE, index.return = FALSE) - Y_temp = t(P) %*% X + # perform the matrix multiplication + tempY = t(P) %*% X } + # case no padding is needed else{ - Y_temp = X - Y_temp = order(target = Y_temp, by = col, decreasing = FALSE, index.return = FALSE) + tempY = X + tempY = order(target = tempY, by = col, decreasing = FALSE, index.return = FALSE) } - # end new implementation - - # Remove the selected column from Y_temp + # Remove the 
selected column from tempY if( col == 1 ) { - Y_temp = Y_temp[, col+1:ncol(Y_temp)] + tempY = tempY[, col+1:ncol(tempY)] } else if( col == ncol(X) ) { - Y_temp = Y_temp[, 1:col-1] + tempY = tempY[, 1:col-1] } else{ - Y_temp = cbind(Y_temp[, 1:col-1],Y_temp[, col+1:ncol(Y_temp)]) + tempY = cbind(tempY[, 1:col-1],tempY[, col+1:ncol(tempY)]) } - # Set value of final output Y = matrix(0, rows=numGroups, cols=longestRowY) - Y[,1] = key_unique + Y[,1] = keyUnique # The permutation matrix creates a structure where each group's data # may not fill exactly maxRowsInGroup rows. # If needed, we need to pad to the expected size first. expectedRows = numGroups * maxRowsInGroup - actualRows = nrow(Y_temp) + actualRows = nrow(tempY) if(actualRows < expectedRows) { - # Pad Y_temp with zeros to match expected structure - Y_tmp_padded = matrix(0, rows=expectedRows, cols=ncol(Y_temp)) - Y_tmp_padded[1:actualRows,] = Y_temp + # Pad tempY with zeros to match expected structure + tempYpadded = matrix(0, rows=expectedRows, cols=ncol(tempY)) + tempYpadded[1:actualRows,] = tempY } else { - Y_tmp_padded = Y_temp + tempYpadded = tempY } - - - # save the initial order of the groups in Y + # save the initial order of the groups in Y and order Y to match the sorted tempYpadded Yindexes = order(target = Y, by = 1, index.return = TRUE) - - # order Y by the groups increasing Y = order(target = Y, by = 1, decreasing = FALSE, index.return = FALSE) # copy the values into Y - #Y[,2:ncol(Y)] = matrix(Y_temp, rows=numGroups, cols=longestRowY-1) - Y[,2:ncol(Y)] = matrix(Y_tmp_padded, rows=numGroups, cols=longestRowY-1) - #Y = X + Y[,2:ncol(Y)] = matrix(tempYpadded, rows=numGroups, cols=longestRowY-1) + + #restore the initial order of X + X = cbind(X, Xindexes) + ncol_X = ncol(X) + X = order(target = X, by= ncol_X) + X = X[, 1:ncol_X-1] + #restore the initial order of Y Y = cbind(Y, Yindexes) ncol_Y = ncol(Y) Y = order(target = Y, by= ncol_Y) Y = Y[, 1:ncol_Y-1] - } } From 
f1ce29cd1d06670d5e09451b3328403af338ca07 Mon Sep 17 00:00:00 2001 From: maxrankl Date: Sun, 27 Jul 2025 13:34:38 +0200 Subject: [PATCH 24/25] Removed the extra files that had been added. Both implementations of the methods of ra_groupby should perform better in most cases and be more stable than the original implementation --- Benchmarking | 1 - 1 file changed, 1 deletion(-) delete mode 160000 Benchmarking diff --git a/Benchmarking b/Benchmarking deleted file mode 160000 index a7c69739332..00000000000 --- a/Benchmarking +++ /dev/null @@ -1 +0,0 @@ -Subproject commit a7c697393322ce2a843fa1bd8ff4fea4ad49b0b4 From a3adfbce58a0c4f2e47f32cc85ef1f264ebe8b96 Mon Sep 17 00:00:00 2001 From: Grigorii Turchenko Date: Fri, 8 Aug 2025 14:59:26 +0200 Subject: [PATCH 25/25] chore(raGroupBy): edits for the pull request --- .gitignore | 1 - scripts/builtin/raGroupby.dml | 274 +++++++++++++++------------------- 2 files changed, 124 insertions(+), 151 deletions(-) diff --git a/.gitignore b/.gitignore index b2210ba22c1..f3c28571bdf 100644 --- a/.gitignore +++ b/.gitignore @@ -150,4 +150,3 @@ venv/* # resource optimization scripts/resource/output *.pem -ADDED_Testing/hello.dml diff --git a/scripts/builtin/raGroupby.dml b/scripts/builtin/raGroupby.dml index 9fd66735bee..cd3039902c4 100644 --- a/scripts/builtin/raGroupby.dml +++ b/scripts/builtin/raGroupby.dml @@ -37,170 +37,142 @@ m_raGroupby = function (Matrix[Double] X, Integer col, String method) return (Matrix[Double] Y) { - if (method == "nested-loop") { - # Extract and sort unique values from the specified column (1-based index) - uniqueValues = unique(X[, col]) - order_uniqueValues = order(target = uniqueValues, by = 1); - - # Calcute the number of groups - numGroups = nrow(uniqueValues) - - # Determine the maximum number of rows in any group + if (method == "nested-loop") { + # Extract and sort unique group values from the specified column (1-based index) + groupsUnique = unique(X[, col]) + groupsUniqueOrdered = order(target = 
groupsUnique, by = 1) + numGroups = nrow(groupsUnique) maxRowsInGroup = max(table(X[,col],1)); - # Define a zero matrix to put the group data into - Y = matrix(0,numGroups,maxRowsInGroup*(ncol(X)-1)+1) - - # Put the ordered uniqueValues into first column of Y as group_id - Y[,1] = uniqueValues - - # save the initial order of the groups in Y - Yindexes = order(target = Y, by = 1, index.return = TRUE) - - # order Y by the groups increasing + # Define a zero output matrix, save the initial order of the groups, and sort increasingly + Y = matrix(0, numGroups, maxRowsInGroup*(ncol(X) - 1) + 1) + Y[,1] = groupsUnique + indicesY = order(target = Y, by = 1, index.return = TRUE) Y = order(target = Y, by = 1, decreasing = FALSE, index.return = FALSE) - # order the initial matrix - Xindexes = order(target = X, by = col, index.return = TRUE) + # Order the input matrix by the grouping column + indicesX = order(target = X, by = col, index.return = TRUE) X = order(target = X, by = col, decreasing = FALSE, index.return = FALSE) - # the amount of unique groups that are remaining - restingGroups = nrow(uniqueValues) - - # intial group to copy the values into Y - Ypos = 1 - - # start positon in the row final matrix - YrowStart = 1 + currentGroupX = 1 + currentGroupY = 1 i = 1 - #loop over the initial matrix - while(restingGroups > 0){ - - currentGroup = as.scalar(Y[Ypos,1]) - - # amount of rows that need to be copied - amountRows = 0 - - # find the rows for the current group - GROUP = 1 - while(GROUP > 0){ - # break if there are no more row left in X - if(i > nrow(X)){ - GROUP = 0 + # Iterate over the input matrix + while (numGroups > 0) { + currentGroup = as.scalar(Y[currentGroupX,1]) + nRowsToCopy = 0 + + # Find the rows for the current group + group = 1 + while (group > 0) { + # Break if there are no more rows left in X + if (i > nrow(X)) { + group = 0 } - # check if the row belongs to the current group - else if(as.scalar(X[i, col]) == currentGroup){ - amountRows = amountRows + 1 
+ # Check if the row belongs to the current group + else if (as.scalar(X[i, col]) == currentGroup) { + nRowsToCopy = nRowsToCopy + 1 i = i + 1 } - # break if the row does not belong to the current group - else{ - GROUP = 0 + # Break if the row does not belong to the current group + else { + group = 0 } } - # copy the values into the final matrix - if (amountRows > 0){ - # create a matrix of the row that should be copied without the group column - YrowEnd = YrowStart + amountRows - 1 + # Copy the values into the output matrix + if (nRowsToCopy > 0) { + nRowsCurrentGroup = currentGroupY + nRowsToCopy - 1 - # case selected column is first column - if (col == 1){ - newMatrix = X[YrowStart:YrowEnd, (col+1):ncol(X)] + # 1. Grouping column is the first column + if (col == 1) { + newMatrix = X[currentGroupY:nRowsCurrentGroup, (col+1):ncol(X)] } - # case selected column is last column + # 2. Grouping column is the last column else if (col == ncol(X)) { - newMatrix = X [YrowStart:YrowEnd, 1:col-1] + newMatrix = X [currentGroupY:nRowsCurrentGroup, 1:col-1] } - # normal case + # 3. 
Grouping column has an intermediate position else { - newMatrix = cbind(X[YrowStart:YrowEnd, 1:(col-1)], X[YrowStart:YrowEnd, (col+1):ncol(X)]) + newMatrix = cbind(X[currentGroupY:nRowsCurrentGroup, 1:(col-1)], X[currentGroupY:nRowsCurrentGroup, (col+1):ncol(X)]) } - # flatten the new row + # Flatten the new row newRow = matrix(newMatrix, rows = 1, cols = nrow(newMatrix) * ncol(newMatrix)) - newRowColumns = amountRows * (ncol(X)-1) + newRowColIdx = nRowsToCopy * (ncol(X)-1) - # add the new row into Y at the current group - Y[Ypos, 2: (newRowColumns + 1)] = newRow + # Add the new row into Y at the current group + Y[currentGroupX, 2: (newRowColIdx + 1)] = newRow } - # continue with the next group - Ypos = Ypos + 1 - YrowStart = YrowStart + amountRows - restingGroups = restingGroups - 1 + # Continue with the next group + currentGroupX = currentGroupX + 1 + currentGroupY = currentGroupY + nRowsToCopy + numGroups = numGroups - 1 } - #restore the initial order of X - X = cbind(X, Xindexes) - ncol_X = ncol(X) - X = order(target = X, by= ncol_X) - X = X[, 1:ncol_X-1] - - #restore the initial order of Y - Y = cbind(Y, Yindexes) - ncol_Y = ncol(Y) - Y = order(target = Y, by= ncol_Y) - Y = Y[, 1:ncol_Y-1] - + # Restore the initial order of X + X = cbind(X, indicesX) + nColX = ncol(X) + X = order(target = X, by= nColX) + X = X[, 1:nColX-1] + + # Restore the initial order of Y + Y = cbind(Y, indicesY) + nColY = ncol(Y) + Y = order(target = Y, by= nColY) + Y = Y[, 1:nColY-1] } + else if (method == "permutation-matrix") { # Extract the grouping column and create unique groups key = X[,col] keyUnique = unique(X[, col]) numGroups = nrow(keyUnique) - - # Determine the maximum number of rows in any group maxRowsInGroup = max(table(X[,col],1)) - # calculate the frequency of each group + # Calculate the frequency of each group freqPerKey = table(key, 1) freqPerKey = removeEmpty(target = freqPerKey, margin = "rows") - freqPerKeyIndexes = order(target = keyUnique, by = 1, index.return = 
TRUE) + freqPerKeyIndices = order(target = keyUnique, by = 1, index.return = TRUE) - # match the length of freqPerKey to key_uniqe and sort it accordingly - freqPerKey = cbind(freqPerKey, freqPerKeyIndexes) - ncolFpk = ncol(freqPerKey) - freqPerKey = order(target = freqPerKey, by= ncolFpk) - freqPerKey = freqPerKey[, 1:ncolFpk-1] + # Match the length of freqPerKey to keyUnique and sort it accordingly + freqPerKey = cbind(freqPerKey, freqPerKeyIndices) + nColFpk = ncol(freqPerKey) + freqPerKey = order(target = freqPerKey, by= nColFpk) + freqPerKey = freqPerKey[, 1:nColFpk-1] freqPerKey = t(freqPerKey) - # calculate the longest row in Y, which is the biggest key in X - longestRowY = maxRowsInGroup*(ncol(X)-1)+1 - biggestKey = max(freqPerKey) + # Find the group with the most values + groupMaxVal = maxRowsInGroup*(ncol(X)-1)+1 + groupMaxValKey = max(freqPerKey) - # caclulate the amount of rows that need padding and the amount of padding per key - biggestKeySeq = matrix(biggestKey, nrow(freqPerKey), ncol(freqPerKey)) - missingPadding = biggestKeySeq - freqPerKey + # Calculate the amount of rows that need padding and the amount of padding per key + groupMaxValKeySeq = matrix(groupMaxValKey, nrow(freqPerKey), ncol(freqPerKey)) + missingPadding = groupMaxValKeySeq - freqPerKey amountOfZeroRows = sum(missingPadding) - # case padding is needed - if (amountOfZeroRows > 0){ + # 1. 
Padding is required + if (amountOfZeroRows > 0) { missingPadding = t(missingPadding) - # remove the keys that dont need padding - removemask = (missingPadding != 0) + # Remove the keys that dont need padding + removeMask = (missingPadding != 0) missingPadding = cbind(keyUnique, missingPadding) - missingPadding = removeEmpty(target = missingPadding, margin = "rows", select = removemask) + missingPadding = removeEmpty(target = missingPadding, margin = "rows", select = removeMask) - # save the keys that need padding + # Keys that need padding and padding length per group keysPadding = missingPadding[,1] - - # save the amount of padding per group missingPadding = missingPadding[,2] - repeatKeys = matrix(0, rows=amountOfZeroRows, cols=1) - # uses a loop to generate the repeting keys - # loop to heap space problem with larger data sets - - #start index for the repeatKeys + # Generate the repeating keys repeatKeysIdxS = 1 - for(i in 1:nrow(missingPadding)) { + for (i in 1:nrow(missingPadding)) { repeat_count = as.scalar(missingPadding[i,1]) - if(repeat_count > 0){ + if (repeat_count > 0) { temp = matrix(as.scalar(keysPadding[i, 1]), rows=repeat_count, cols = 1) repeatKeysIdxE = repeatKeysIdxS + repeat_count - 1 repeatKeys[repeatKeysIdxS:repeatKeysIdxE, 1] = temp @@ -208,85 +180,87 @@ m_raGroupby = function (Matrix[Double] X, Integer col, String method) } } - # combine the keys that need padding with the actual padding + # Combine the keys that need padding with the actual padding padding = matrix(0, rows = nrow(repeatKeys), cols = 1) padding = cbind(repeatKeys, padding) - # extend the existing keys to a second column to match the padded keys + # Extend the existing keys to a second column to match the padded keys key = key %*% matrix(1, rows = 1, cols = 2) - # combine the keys with the padded keys and sort them increasingly + # Combine the keys with the padded keys and sort them increasingly tempY = rbind(key, padding) tempY = order(target = tempY, by = 1, decreasing = 
FALSE, index.return = FALSE) - # remove the padded rows and save the indexes of the combined keys for the permutation matrix + # Remove the padded rows and save the Indices of the combined keys for the permutation matrix paddedRows = tempY[, 2] - tempYindexes = order(target = tempY, by = 1, decreasing = FALSE, index.return = TRUE) - tempYindexes = removeEmpty(target = tempYindexes, margin = "rows", select = (paddedRows!=0)) + tempIndicesY = order(target = tempY, by = 1, decreasing = FALSE, index.return = TRUE) + tempIndicesY = removeEmpty(target = tempIndicesY, margin = "rows", select = (paddedRows!=0)) - # create the permutation matrix by using the indexes of the combined keys - P = table(seq(1, nrow(X)), tempYindexes) + # Create the permutation matrix by using the Indices of the combined keys + P = table(seq(1, nrow(X)), tempIndicesY) - # order the initial matrix to match the sorted keys with padding - Xindexes = order(target = X, by = col, index.return = TRUE) + # Order the initial matrix to match the sorted keys with padding + indicesX = order(target = X, by = col, index.return = TRUE) + X = order(target = X, by = col, decreasing = FALSE, index.return = FALSE) X = order(target = X, by = col, decreasing = FALSE, index.return = FALSE) - # perform the matrix multiplication + # Perform the matrix multiplication tempY = t(P) %*% X } - # case no padding is needed - else{ + + # 2. 
Padding is not required + else { tempY = X tempY = order(target = tempY, by = col, decreasing = FALSE, index.return = FALSE) } # Remove the selected column from tempY - if( col == 1 ) { + if (col == 1) { tempY = tempY[, col+1:ncol(tempY)] } - else if( col == ncol(X) ) { + else if (col == ncol(X)) { tempY = tempY[, 1:col-1] } - else{ + else { tempY = cbind(tempY[, 1:col-1],tempY[, col+1:ncol(tempY)]) } - # Set value of final output - Y = matrix(0, rows=numGroups, cols=longestRowY) + # Set the value of the final output + Y = matrix(0, rows=numGroups, cols=groupMaxVal) Y[,1] = keyUnique - # The permutation matrix creates a structure where each group's data - # may not fill exactly maxRowsInGroup rows. - # If needed, we need to pad to the expected size first. + # Each group's data may not fill exactly maxRowsInGroup rows + # If needed, we need to pad to the expected size first expectedRows = numGroups * maxRowsInGroup actualRows = nrow(tempY) - if(actualRows < expectedRows) { + if (actualRows < expectedRows) { # Pad tempY with zeros to match expected structure - tempYpadded = matrix(0, rows=expectedRows, cols=ncol(tempY)) - tempYpadded[1:actualRows,] = tempY - } else { - tempYpadded = tempY + tempYPadded = matrix(0, rows=expectedRows, cols=ncol(tempY)) + tempYPadded[1:actualRows,] = tempY + } + else { + tempYPadded = tempY } - # save the initial order of the groups in Y and order Y to match the sorted tempYpadded - Yindexes = order(target = Y, by = 1, index.return = TRUE) + # Save the initial order of the groups in Y and order Y to match the sorted tempYPadded + indicesY = order(target = Y, by = 1, index.return = TRUE) Y = order(target = Y, by = 1, decreasing = FALSE, index.return = FALSE) - # copy the values into Y - Y[,2:ncol(Y)] = matrix(tempYpadded, rows=numGroups, cols=longestRowY-1) + # Copy the values into Y + Y[,2:ncol(Y)] = matrix(tempYPadded, rows=numGroups, cols=groupMaxVal-1) - #restore the initial order of X - X = cbind(X, Xindexes) - ncol_X = ncol(X) - X 
= order(target = X, by= ncol_X) - X = X[, 1:ncol_X-1] + # Restore the initial order of X + X = cbind(X, indicesX) + nColX = ncol(X) + X = order(target = X, by= nColX) + X = X[, 1:nColX-1] - #restore the initial order of Y - Y = cbind(Y, Yindexes) - ncol_Y = ncol(Y) - Y = order(target = Y, by= ncol_Y) - Y = Y[, 1:ncol_Y-1] + # Restore the initial order of Y + Y = cbind(Y, indicesY) + nColY = ncol(Y) + Y = order(target = Y, by= nColY) + Y = Y[, 1:nColY-1] } }