From f5d4f22bb2b9e30079cc69bbd2fa522844cbf088 Mon Sep 17 00:00:00 2001
From: maxrankl <max.rankl@hotmail.de>
Date: Sun, 29 Jun 2025 23:46:12 +0200
Subject: [PATCH 1/2] Current status, unfortunately, it does not insert the
 values correctly in raGroupby_exp1.dml

---
 .gitignore                         |   1 +
 scripts/builtin/raGroupby_exp1.dml | 171 +++++++++++++++++++++++++++++
 testing/groupby_new_time.dml       |   9 ++
 testing/groupby_old_time.dml       |  21 ++++
 testing/testing.dml                |  51 +++++++++
 5 files changed, 253 insertions(+)
 create mode 100644 scripts/builtin/raGroupby_exp1.dml
 create mode 100644 testing/groupby_new_time.dml
 create mode 100644 testing/groupby_old_time.dml
 create mode 100644 testing/testing.dml

diff --git a/.gitignore b/.gitignore
index f3c28571bdf..b2210ba22c1 100644
--- a/.gitignore
+++ b/.gitignore
@@ -150,3 +150,4 @@ venv/*
 # resource optimization
 scripts/resource/output
 *.pem
+ADDED_Testing/hello.dml
diff --git a/scripts/builtin/raGroupby_exp1.dml b/scripts/builtin/raGroupby_exp1.dml
new file mode 100644
index 00000000000..7f5fef1815a
--- /dev/null
+++ b/scripts/builtin/raGroupby_exp1.dml
@@ -0,0 +1,171 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+# This raGroupby function takes a matrix dataset as input and performs the
+# relational groupby operation on the selected column
+#
+# INPUT:
+# ------------------------------------------------------------------------------
+# X         Matrix of input data [shape: N x M]
+# col       Integer indicating the column index to execute groupby command
+# method    Groupby implementation method (nested-loop, permutation-matrix)
+# ------------------------------------------------------------------------------
+#
+# OUTPUT:
+# ------------------------------------------------------------------------------
+# Y         Grouped data [shape G x (R*(M-1)+1)]; G = #groups, R = max rows per group
+# ------------------------------------------------------------------------------
+
+m_raGroupby_new = function (Matrix[Double] X, Integer col, String method)
+  return (Matrix[Double] Y)
+{
+  if (method == "nested-loop") {
+    # Extract and sort unique values from the specified column (1-based index)
+    uniqueValues = unique(X[, col])
+    order_uniqueValues = order(target = uniqueValues, by = 1);
+
+    # Calculate the number of groups
+    numGroups = nrow(uniqueValues)
+
+    # Determine the maximum number of rows in any group
+    maxRowsInGroup = max(table(X[,col],1));
+
+    # Define a zero matrix to put the group data into
+    Y = matrix(0,numGroups,maxRowsInGroup*(ncol(X)-1)+1)
+
+    # Put uniqueValues (as returned by unique(), not order_uniqueValues) into first column of Y as group_id
+    Y[,1] = uniqueValues
+
+    # create matrix to store the amount of rows for each group
+    rows_per_group = matrix(0, numGroups, 1)
+
+    # order the initial matrix
+    #Xordered = order(target = X[,col], by = col)
+    Xordered = order(target = X, by = col, decreasing = FALSE, index.return = FALSE)
+
+    # the amount of unique groups that are remaining
+    restingGroups = nrow(uniqueValues)
+
+    # intial group
+    Ypos = 1
+    currentGroup = as.scalar(Y[Ypos,1])
+
+    # intial positon in the row final matrix
+    YrowStart = 1
+
+    #loop over the initial matrix
+    while(restingGroups > 0){
+
+        current_group = as.scalar(Y[Ypos,1])
+
+        #amount of rows that need to be copied
+        amountRows = 0
+
+        i = YrowStart
+
+        GROUP = 1
+
+        while(GROUP > 0){
+            if(i > nrow(Xordered)){
+                GROUP = 0
+            }
+            else if(as.scalar(Xordered[i, col]) == currentGroup){
+                amountRows = amountRows + 1
+                i = i + 1
+            }
+            else{
+               GROUP = 0
+            }
+
+        }
+
+        # row position in the final matrix
+        restingGroups = restingGroups - 1
+
+        if (amountRows > 0){
+            # copy the values into the final matrix
+            YrowEnd = YrowStart + amountRows - 1
+            newMatrix = cbind(Xordered[YrowStart:YrowEnd, 1:(col-1)], Xordered[YrowStart:YrowEnd, (col+1):ncol(X)])
+            #flatten the new row
+            newRow = matrix(newMatrix, rows = 1, cols = nrow(newMatrix) * ncol(newMatrix))
+            newRowColumns = amountRows * (ncol(X)-1)
+            Y[Ypos, 2: (newRowColumns + 1)] = newRow
+        }
+
+        # continue with the next group
+        Ypos = Ypos + 1
+        YrowStart = YrowStart + amountRows
+
+    }
+
+  }
+  else if (method == "permutation-matrix") {
+    # Extract the grouping column and create unique groups
+    key = X[,col]
+    key_unique = unique(X[, col])
+    numGroups = nrow(key_unique)
+
+    # Matrix for comparison
+    key_compare = key_unique %*% matrix(1, rows=1, cols=nrow(X))
+    key_matrix = matrix(1, rows=nrow(key_unique), cols=1) %*% t(key)
+
+    # Find group index
+    groupIndex = rowIndexMax(t(key_compare == key_matrix))
+
+    # Determine the maximum number of rows in any group
+    maxRowsInGroup = max(table(X[,col],1))
+    totalCells = (maxRowsInGroup) * (ncol(X)-1) +1
+
+    # Create permutation matrix P to copy relevant tuples with a single matrix multiplication
+    P = matrix(0, rows=nrow(X), cols=numGroups * maxRowsInGroup)
+    # Create offsets to store the first column of each group
+    offsets = matrix(seq(0, (numGroups-1)*maxRowsInGroup, maxRowsInGroup), rows=numGroups, cols=1)
+
+    # Create row and column index for the permutation matrix
+    rowIndex = seq(1, nrow(X))
+    indexWithInGroups = cumsum(t(table(groupIndex, seq(1, nrow(X)), numGroups, nrow(X))))
+    selectedMatrix = table(seq(1, nrow(indexWithInGroups)), groupIndex)
+    colIndex = groupIndex * maxRowsInGroup - maxRowsInGroup + rowSums(indexWithInGroups * selectedMatrix)
+
+    # Set values in P
+    P = table(seq(1, nrow(X)), colIndex)
+
+    # Perform matrix multiplication
+    Y_temp = t(P) %*% X
+
+    # Remove the selected column from Y_temp
+    if( col == 1 ) {
+        Y_temp_reduce = Y_temp[, col+1:ncol(Y_temp)]
+    }
+    else if( col == ncol(X) ) {
+        Y_temp_reduce = Y_temp[, 1:col-1]
+    }
+    else{
+        Y_temp_reduce = cbind(Y_temp[, 1:col-1],Y_temp[, col+1:ncol(Y_temp)])
+    }
+
+    # Set value of final output
+    Y = matrix(0, rows=numGroups, cols=totalCells)
+    Y[,1] = key_unique
+    Y[,2:ncol(Y)] = matrix(Y_temp_reduce, rows=numGroups, cols=totalCells-1)
+  }
+}
+
diff --git a/testing/groupby_new_time.dml b/testing/groupby_new_time.dml
new file mode 100644
index 00000000000..fba92564b82
--- /dev/null
+++ b/testing/groupby_new_time.dml
@@ -0,0 +1,9 @@
+# create an input matrix
+input_matrix = round(rand(rows = 1000, cols = 10, min = 1))
+
+# load functions
+source("scripts/builtin/raGroupby_exp1.dml") as ra_new
+
+# use the new ra_groupby function
+new_func = ra_new::m_raGroupby_new(input_matrix, 2, "nested-loop")
+
diff --git a/testing/groupby_old_time.dml b/testing/groupby_old_time.dml
new file mode 100644
index 00000000000..015a5797253
--- /dev/null
+++ b/testing/groupby_old_time.dml
@@ -0,0 +1,21 @@
+# create an input matrix
+input_matrix = round(rand(rows = 1000, cols = 10, min = 1))
+print("The amount of rows in the input matrix")
+print(1000*10)
+
+# load functions
+source("scripts/builtin/raGroupby.dml") as ra_old
+
+# use the initial ra_groupby function
+old_func = ra_old::m_raGroupby(input_matrix, 2, "nested-loop")
+
+print(input_matrix[1:10, 1:ncol(input_matrix)])
+
+print("The amount of rows in the final matrix")
+print(nrow(old_func))
+
+print("The amount of rows in the final matrix")
+print(ncol(old_func))
+
+print("The amount of rows * columns in the final matrix")
+print(ncol(old_func)*nrow(old_func))
\ No newline at end of file
diff --git a/testing/testing.dml b/testing/testing.dml
new file mode 100644
index 00000000000..39ac856b2f2
--- /dev/null
+++ b/testing/testing.dml
@@ -0,0 +1,51 @@
+# create an input matrix
+input_matrix = round(rand(rows = 10, cols = 10, min = 1, max = 5))
+
+# load functions
+source("scripts/builtin/raGroupby.dml") as ra_old
+source("scripts/builtin/raGroupby_exp1.dml") as ra_new
+
+# use the initial ra_groupby function
+old_func = ra_old::m_raGroupby(input_matrix, 2, "nested-loop")
+
+# use the new ra_groupby function
+new_func = ra_new::m_raGroupby_new(input_matrix, 2, "nested-loop")
+
+# check if the new function still operates correctly
+for(i in 1:nrow(old_func)){
+    for(j in 1:ncol(old_func)){
+        old_val = as.scalar(old_func[i,j])
+        new_val = as.scalar(new_func[i,j])
+        if (old_val != new_val){
+            print("The values are not identical")
+            print("The index is i x j")
+            print(i)
+            print(j)
+        }else{
+            if(j == 1){
+                print("The first value was correct")
+            }
+            if(j == ncol(old_func)){
+                print("The last value was correct")
+            }
+        }
+    }
+}
+
+print("The amount of rows in the old matrix")
+print(nrow(old_func))
+
+print("The amount of col in the old matrix")
+print(ncol(old_func))
+
+print("The amount of rows * columns in the old matrix")
+print(ncol(old_func)*nrow(old_func))
+
+print("The amount of rows in the new matrix")
+print(nrow(new_func))
+
+print("The amount of col in the new matrix")
+print(ncol(new_func))
+
+print("The amount of rows * columns in the new matrix")
+print(ncol(new_func)*nrow(new_func))

From a9617597c9d22d06d157f94a86c9d01206ac76d5 Mon Sep 17 00:00:00 2001
From: maxrankl <max.rankl@hotmail.de>
Date: Mon, 30 Jun 2025 15:11:59 +0200
Subject: [PATCH 2/2] Found the error, should beat the performance of nested
 loop, sorry for the last pull request

---
 scripts/builtin/raGroupby_exp1.dml | 26 ++++++++++++++++++--------
 testing/groupby_new_time.dml       |  2 +-
 testing/groupby_old_time.dml       | 15 +--------------
 testing/testing.dml                |  2 +-
 4 files changed, 21 insertions(+), 24 deletions(-)

diff --git a/scripts/builtin/raGroupby_exp1.dml b/scripts/builtin/raGroupby_exp1.dml
index 7f5fef1815a..3e0b4abec4d 100644
--- a/scripts/builtin/raGroupby_exp1.dml
+++ b/scripts/builtin/raGroupby_exp1.dml
@@ -58,29 +58,29 @@ m_raGroupby_new = function (Matrix[Double] X, Integer col, String method)
     rows_per_group = matrix(0, numGroups, 1)
 
     # order the initial matrix
-    #Xordered = order(target = X[,col], by = col)
     Xordered = order(target = X, by = col, decreasing = FALSE, index.return = FALSE)
 
+    # create a Matrix with the ordered groups
+    Yordered = order(target = Y, by = 1, decreasing = FALSE, index.return = FALSE)
+
     # the amount of unique groups that are remaining
     restingGroups = nrow(uniqueValues)
 
     # intial group
     Ypos = 1
-    currentGroup = as.scalar(Y[Ypos,1])
 
     # intial positon in the row final matrix
     YrowStart = 1
+    i = 1
 
     #loop over the initial matrix
     while(restingGroups > 0){
 
-        current_group = as.scalar(Y[Ypos,1])
+        currentGroup = as.scalar(Yordered[Ypos,1])
 
         #amount of rows that need to be copied
         amountRows = 0
 
-        i = YrowStart
-
         GROUP = 1
 
         while(GROUP > 0){
@@ -94,28 +94,38 @@ m_raGroupby_new = function (Matrix[Double] X, Integer col, String method)
             else{
                GROUP = 0
             }
-
         }
 
         # row position in the final matrix
         restingGroups = restingGroups - 1
 
         if (amountRows > 0){
+
             # copy the values into the final matrix
             YrowEnd = YrowStart + amountRows - 1
             newMatrix = cbind(Xordered[YrowStart:YrowEnd, 1:(col-1)], Xordered[YrowStart:YrowEnd, (col+1):ncol(X)])
+
             #flatten the new row
             newRow = matrix(newMatrix, rows = 1, cols = nrow(newMatrix) * ncol(newMatrix))
             newRowColumns = amountRows * (ncol(X)-1)
-            Y[Ypos, 2: (newRowColumns + 1)] = newRow
-        }
 
+            # sort the ordered Y matrix back to the initial order
+            rowFinder = 1
+            while(currentGroup != as.scalar(Y[rowFinder, 1])){
+                rowFinder = rowFinder + 1
+            }
+            Y[rowFinder, 2: (newRowColumns + 1)] = newRow
+        }
         # continue with the next group
         Ypos = Ypos + 1
         YrowStart = YrowStart + amountRows
 
     }
 
+
+
+
+
   }
   else if (method == "permutation-matrix") {
     # Extract the grouping column and create unique groups
diff --git a/testing/groupby_new_time.dml b/testing/groupby_new_time.dml
index fba92564b82..011d7ab048a 100644
--- a/testing/groupby_new_time.dml
+++ b/testing/groupby_new_time.dml
@@ -1,5 +1,5 @@
 # create an input matrix
-input_matrix = round(rand(rows = 1000, cols = 10, min = 1))
+input_matrix = round(rand(rows = 10000, cols = 5, min = 1, max = 5))
 
 # load functions
 source("scripts/builtin/raGroupby_exp1.dml") as ra_new
diff --git a/testing/groupby_old_time.dml b/testing/groupby_old_time.dml
index 015a5797253..25ae4bfb416 100644
--- a/testing/groupby_old_time.dml
+++ b/testing/groupby_old_time.dml
@@ -1,21 +1,8 @@
 # create an input matrix
-input_matrix = round(rand(rows = 1000, cols = 10, min = 1))
-print("The amount of rows in the input matrix")
-print(1000*10)
+input_matrix = round(rand(rows = 10000, cols = 5, min = 1, max = 5))
 
 # load functions
 source("scripts/builtin/raGroupby.dml") as ra_old
 
 # use the initial ra_groupby function
 old_func = ra_old::m_raGroupby(input_matrix, 2, "nested-loop")
-
-print(input_matrix[1:10, 1:ncol(input_matrix)])
-
-print("The amount of rows in the final matrix")
-print(nrow(old_func))
-
-print("The amount of rows in the final matrix")
-print(ncol(old_func))
-
-print("The amount of rows * columns in the final matrix")
-print(ncol(old_func)*nrow(old_func))
\ No newline at end of file
diff --git a/testing/testing.dml b/testing/testing.dml
index 39ac856b2f2..d19951f095d 100644
--- a/testing/testing.dml
+++ b/testing/testing.dml
@@ -1,5 +1,5 @@
 # create an input matrix
-input_matrix = round(rand(rows = 10, cols = 10, min = 1, max = 5))
+input_matrix = round(rand(rows = 5, cols = 5, min = 1, max = 5))
 
 # load functions
 source("scripts/builtin/raGroupby.dml") as ra_old