Skip to content
Closed
Changes from all commits
Commits
Show all changes
30 commits
Select commit Hold shift + click to select a range
f5d4f22
Current status, unfortunately, it does not insert the values correctl…
maxrankl Jun 29, 2025
caaa93d
Added the Benchmarking framework (Python)
maxrankl Jun 29, 2025
a961759
Found the error, should beat the performance of nested loop, sorry fo…
maxrankl Jun 30, 2025
cc85eaf
Merge branch 'experiment1'
maxrankl Jun 30, 2025
fed2aa4
Found the error, should beat the performance of nested loop, sorry fo…
maxrankl Jun 30, 2025
243ff1c
Merge branch 'apache:main' into main
maxrankl Jun 30, 2025
7275695
Avoided the additional loop over Y to copy the rows into Y by saving th…
maxrankl Jul 4, 2025
80293ce
Removed additional files. Copied content into the correct file. Added…
maxrankl Jul 5, 2025
a79420e
Removed print statement for debugging
maxrankl Jul 6, 2025
de96b3e
Removed print statement for debugging, forgot one
maxrankl Jul 7, 2025
a086101
Merge branch 'apache:main' into main
maxrankl Jul 7, 2025
3ffaa34
commit to merge the fix of permutation matrix
maxrankl Jul 8, 2025
f07c387
Added the Benchmarking framework (Python)
maxrankl Jun 29, 2025
a93c2f5
Current status, unfortunately, it does not insert the values correctl…
maxrankl Jun 29, 2025
4ac8268
Found the error, should beat the performance of nested loop, sorry fo…
maxrankl Jun 30, 2025
59305fc
Found the error, should beat the performance of nested loop, sorry fo…
maxrankl Jun 30, 2025
6894877
Avoided the additional loop over Y to copy the rows into Y by saving th…
maxrankl Jul 4, 2025
a85d3d2
Removed additional files. Copied content into the correct file. Added…
maxrankl Jul 5, 2025
a669e42
Finished merge for the ra_groupby permutation matrix fix
maxrankl Jul 7, 2025
66656c7
commit to merge the fix of permutation matrix
maxrankl Jul 8, 2025
2e4a6ad
merged the changes of apache main into the main branch of this project
maxrankl Jul 20, 2025
09cc1d8
alternative version of permutation works except edge cases
maxrankl Jul 24, 2025
81d4785
alternative version of permutation works with edge cases, but needs …
maxrankl Jul 24, 2025
effc323
permutation matrix is not a real permutation matrix anymore because i…
maxrankl Jul 24, 2025
debdffc
alternative version of permutation matrix sorts X and calculates the …
maxrankl Jul 26, 2025
54a7b2c
Alternative version of a permutation matrix that really uses a permut…
maxrankl Jul 26, 2025
098c387
Alternative version of a permutation matrix that really uses a permut…
maxrankl Jul 27, 2025
f1ce29c
Removed additional added files. Both implementations of the methods o…
maxrankl Jul 27, 2025
a3adfbc
chore(raGroupBy): edits for the pull request
gaturchenko Aug 8, 2025
0188b4c
Merge pull request #2 from gaturchenko/grigorii-revision
maxrankl Aug 8, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
285 changes: 199 additions & 86 deletions scripts/builtin/raGroupby.dml
Original file line number Diff line number Diff line change
Expand Up @@ -37,117 +37,230 @@
m_raGroupby = function (Matrix[Double] X, Integer col, String method)
return (Matrix[Double] Y)
{
if (method == "nested-loop") {
# Extract and sort unique values from the specified column (1-based index)
uniqueValues = unique(X[, col])
order_uniqueValues = order(target = uniqueValues, by = 1);
if (method == "nested-loop") {
# Extract and sort unique group values from the specified column (1-based index)
groupsUnique = unique(X[, col])
groupsUniqueOrdered = order(target = groupsUnique, by = 1)
numGroups = nrow(groupsUnique)
maxRowsInGroup = max(table(X[,col],1));

# Calculate the number of groups
numGroups = nrow(uniqueValues)
# Define a zero output matrix, save the initial order of the groups, and sort increasingly
Y = matrix(0, numGroups, maxRowsInGroup*(ncol(X) - 1) + 1)
Y[,1] = groupsUnique
indicesY = order(target = Y, by = 1, index.return = TRUE)
Y = order(target = Y, by = 1, decreasing = FALSE, index.return = FALSE)

# Determine the maximum number of rows in any group
maxRowsInGroup = max(table(X[,col],1));
# Order the input matrix by the grouping column
indicesX = order(target = X, by = col, index.return = TRUE)
X = order(target = X, by = col, decreasing = FALSE, index.return = FALSE)

currentGroupX = 1
currentGroupY = 1
i = 1

# Iterate over the input matrix
while (numGroups > 0) {
currentGroup = as.scalar(Y[currentGroupX,1])
nRowsToCopy = 0

# Find the rows for the current group
group = 1
while (group > 0) {
# Break if there are no more rows left in X
if (i > nrow(X)) {
group = 0
}
# Check if the row belongs to the current group
else if (as.scalar(X[i, col]) == currentGroup) {
nRowsToCopy = nRowsToCopy + 1
i = i + 1
}
# Break if the row does not belong to the current group
else {
group = 0
}
}

# Copy the values into the output matrix
if (nRowsToCopy > 0) {
nRowsCurrentGroup = currentGroupY + nRowsToCopy - 1

# 1. Grouping column is the first column
if (col == 1) {
newMatrix = X[currentGroupY:nRowsCurrentGroup, (col+1):ncol(X)]
}
# 2. Grouping column is the last column
else if (col == ncol(X)) {
newMatrix = X [currentGroupY:nRowsCurrentGroup, 1:col-1]
}
# 3. Grouping column has an intermediate position
else {
newMatrix = cbind(X[currentGroupY:nRowsCurrentGroup, 1:(col-1)], X[currentGroupY:nRowsCurrentGroup, (col+1):ncol(X)])
}

# Flatten the new row
newRow = matrix(newMatrix, rows = 1, cols = nrow(newMatrix) * ncol(newMatrix))
newRowColIdx = nRowsToCopy * (ncol(X)-1)

# Define a zero matrix to put the group data into
Y = matrix(0,numGroups,maxRowsInGroup*(ncol(X)-1)+1)

# Put the ordered uniqueValues into first column of Y as group_id
#Y[,1] = order_uniqueValues
Y[,1] = uniqueValues

# Loop for each group
for(i in 1:numGroups){
index = 0

# Iterate each row in matrix X to deal with group data
for ( j in 1:nrow(X) ) {
if ( as.scalar( X[j,col] == uniqueValues[i,1] )) {
# Define the formula of the start and end column position
startCol = index*(ncol(X)-1) +2
endCol = startCol + (ncol(X)-2)

if (col == 1) {
# Case when the selected column is the first column
Y[i,startCol:endCol] = X[j,2:ncol(X)]
}
else if (col == ncol(X)) {
# Case when the selected column is the last column
Y[i,startCol:endCol] = X[j,1:(ncol(X)-1)]
}
else {
# General case
newRow = cbind(X[j, 1:(col-1)], X[j, (col+1):ncol(X)])
Y[i,startCol:endCol] = newRow
}
index = index +1
# Add the new row into Y at the current group
Y[currentGroupX, 2: (newRowColIdx + 1)] = newRow
}
}

# Continue with the next group
currentGroupX = currentGroupX + 1
currentGroupY = currentGroupY + nRowsToCopy
numGroups = numGroups - 1
}

# Restore the initial order of X
X = cbind(X, indicesX)
nColX = ncol(X)
X = order(target = X, by= nColX)
X = X[, 1:nColX-1]

# Restore the initial order of Y
Y = cbind(Y, indicesY)
nColY = ncol(Y)
Y = order(target = Y, by= nColY)
Y = Y[, 1:nColY-1]
}

else if (method == "permutation-matrix") {
# Extract the grouping column and create unique groups
key = X[,col]
key_unique = unique(X[, col])
numGroups = nrow(key_unique)
keyUnique = unique(X[, col])
numGroups = nrow(keyUnique)
maxRowsInGroup = max(table(X[,col],1))

# Matrix for comparison
key_compare = key_unique %*% matrix(1, rows=1, cols=nrow(X))
key_matrix = matrix(1, rows=nrow(key_unique), cols=1) %*% t(key)
# Calculate the frequency of each group
freqPerKey = table(key, 1)
freqPerKey = removeEmpty(target = freqPerKey, margin = "rows")
freqPerKeyIndices = order(target = keyUnique, by = 1, index.return = TRUE)

# Find group index
groupIndex = rowIndexMax(t(key_compare == key_matrix))
# Match the length of freqPerKey to keyUnique and sort it accordingly
freqPerKey = cbind(freqPerKey, freqPerKeyIndices)
nColFpk = ncol(freqPerKey)
freqPerKey = order(target = freqPerKey, by= nColFpk)
freqPerKey = freqPerKey[, 1:nColFpk-1]
freqPerKey = t(freqPerKey)

# Determine the maximum number of rows in any group
maxRowsInGroup = max(table(X[,col],1))
totalCells = (maxRowsInGroup) * (ncol(X)-1) +1
# Find the group with the most values
groupMaxVal = maxRowsInGroup*(ncol(X)-1)+1
groupMaxValKey = max(freqPerKey)

# Create permutation matrix P to copy relevant tuples with a single matrix multiplication
P = matrix(0, rows=nrow(X), cols=numGroups * maxRowsInGroup)
# Create offsets to store the first column of each group
offsets = matrix(seq(0, (numGroups-1)*maxRowsInGroup, maxRowsInGroup), rows=numGroups, cols=1)
# Calculate the amount of rows that need padding and the amount of padding per key
groupMaxValKeySeq = matrix(groupMaxValKey, nrow(freqPerKey), ncol(freqPerKey))
missingPadding = groupMaxValKeySeq - freqPerKey
amountOfZeroRows = sum(missingPadding)

# Create row and column index for the permutation matrix
rowIndex = seq(1, nrow(X))
indexWithInGroups = cumsum(t(table(groupIndex, seq(1, nrow(X)), numGroups, nrow(X))))
selectedMatrix = table(seq(1, nrow(indexWithInGroups)), groupIndex)
colIndex = groupIndex * maxRowsInGroup - maxRowsInGroup + rowSums(indexWithInGroups * selectedMatrix)
# 1. Padding is required
if (amountOfZeroRows > 0) {
missingPadding = t(missingPadding)

# Set values in P
P = table(seq(1, nrow(X)), colIndex)
# Remove the keys that don't need padding
removeMask = (missingPadding != 0)
missingPadding = cbind(keyUnique, missingPadding)
missingPadding = removeEmpty(target = missingPadding, margin = "rows", select = removeMask)

# Perform matrix multiplication
Y_temp = t(P) %*% X
# Keys that need padding and padding length per group
keysPadding = missingPadding[,1]
missingPadding = missingPadding[,2]
repeatKeys = matrix(0, rows=amountOfZeroRows, cols=1)

# Remove the selected column from Y_temp
if( col == 1 ) {
Y_temp_reduce = Y_temp[, col+1:ncol(Y_temp)]
# Generate the repeating keys
repeatKeysIdxS = 1

for (i in 1:nrow(missingPadding)) {
repeat_count = as.scalar(missingPadding[i,1])
if (repeat_count > 0) {
temp = matrix(as.scalar(keysPadding[i, 1]), rows=repeat_count, cols = 1)
repeatKeysIdxE = repeatKeysIdxS + repeat_count - 1
repeatKeys[repeatKeysIdxS:repeatKeysIdxE, 1] = temp
repeatKeysIdxS = repeatKeysIdxE + 1
}
}

# Combine the keys that need padding with the actual padding
padding = matrix(0, rows = nrow(repeatKeys), cols = 1)
padding = cbind(repeatKeys, padding)

# Extend the existing keys to a second column to match the padded keys
key = key %*% matrix(1, rows = 1, cols = 2)

# Combine the keys with the padded keys and sort them increasingly
tempY = rbind(key, padding)
tempY = order(target = tempY, by = 1, decreasing = FALSE, index.return = FALSE)

# Remove the padded rows and save the Indices of the combined keys for the permutation matrix
paddedRows = tempY[, 2]
tempIndicesY = order(target = tempY, by = 1, decreasing = FALSE, index.return = TRUE)
tempIndicesY = removeEmpty(target = tempIndicesY, margin = "rows", select = (paddedRows!=0))

# Create the permutation matrix by using the Indices of the combined keys
P = table(seq(1, nrow(X)), tempIndicesY)

# Order the initial matrix to match the sorted keys with padding
indicesX = order(target = X, by = col, index.return = TRUE)
X = order(target = X, by = col, decreasing = FALSE, index.return = FALSE)
X = order(target = X, by = col, decreasing = FALSE, index.return = FALSE)

# Perform the matrix multiplication
tempY = t(P) %*% X
}

# 2. Padding is not required
else {
tempY = X
tempY = order(target = tempY, by = col, decreasing = FALSE, index.return = FALSE)
}

# Remove the selected column from tempY
if (col == 1) {
tempY = tempY[, col+1:ncol(tempY)]
}
else if( col == ncol(X) ) {
Y_temp_reduce = Y_temp[, 1:col-1]
else if (col == ncol(X)) {
tempY = tempY[, 1:col-1]
}
else{
Y_temp_reduce = cbind(Y_temp[, 1:col-1],Y_temp[, col+1:ncol(Y_temp)])
else {
tempY = cbind(tempY[, 1:col-1],tempY[, col+1:ncol(tempY)])
}

# Set value of final output
Y = matrix(0, rows=numGroups, cols=totalCells)
Y[,1] = key_unique
# Set the value of the final output
Y = matrix(0, rows=numGroups, cols=groupMaxVal)
Y[,1] = keyUnique

# The permutation matrix creates a structure where each group's data
# may not fill exactly maxRowsInGroup rows.
# If needed, we need to pad to the expected size first.
# Each group's data may not fill exactly maxRowsInGroup rows
# If needed, we need to pad to the expected size first
expectedRows = numGroups * maxRowsInGroup
actualRows = nrow(Y_temp_reduce)

if(actualRows < expectedRows) {
# Pad Y_temp_reduce with zeros to match expected structure
Y_tmp_padded = matrix(0, rows=expectedRows, cols=ncol(Y_temp_reduce))
Y_tmp_padded[1:actualRows,] = Y_temp_reduce
} else {
Y_tmp_padded = Y_temp_reduce
actualRows = nrow(tempY)

if (actualRows < expectedRows) {
# Pad tempY with zeros to match expected structure
tempYPadded = matrix(0, rows=expectedRows, cols=ncol(tempY))
tempYPadded[1:actualRows,] = tempY
}
else {
tempYPadded = tempY
}

# Save the initial order of the groups in Y and order Y to match the sorted tempYPadded
indicesY = order(target = Y, by = 1, index.return = TRUE)
Y = order(target = Y, by = 1, decreasing = FALSE, index.return = FALSE)

# Copy the values into Y
Y[,2:ncol(Y)] = matrix(tempYPadded, rows=numGroups, cols=groupMaxVal-1)

# Restore the initial order of X
X = cbind(X, indicesX)
nColX = ncol(X)
X = order(target = X, by= nColX)
X = X[, 1:nColX-1]

Y[,2:ncol(Y)] = matrix(Y_tmp_padded, rows=numGroups, cols=totalCells-1)
# Restore the initial order of Y
Y = cbind(Y, indicesY)
nColY = ncol(Y)
Y = order(target = Y, by= nColY)
Y = Y[, 1:nColY-1]
}
}

Loading