diff --git a/scripts/builtin/raGroupby.dml b/scripts/builtin/raGroupby.dml index 0a23bf51ef0..cd3039902c4 100644 --- a/scripts/builtin/raGroupby.dml +++ b/scripts/builtin/raGroupby.dml @@ -37,117 +37,230 @@ m_raGroupby = function (Matrix[Double] X, Integer col, String method) return (Matrix[Double] Y) { - if (method == "nested-loop") { - # Extract and sort unique values from the specified column (1-based index) - uniqueValues = unique(X[, col]) - order_uniqueValues = order(target = uniqueValues, by = 1); + if (method == "nested-loop") { + # Extract and sort unique group values from the specified column (1-based index) + groupsUnique = unique(X[, col]) + groupsUniqueOrdered = order(target = groupsUnique, by = 1) + numGroups = nrow(groupsUnique) + maxRowsInGroup = max(table(X[,col],1)); - # Calcute the number of groups - numGroups = nrow(uniqueValues) + # Define a zero output matrix, save the initial order of the groups, and sort increasingly + Y = matrix(0, numGroups, maxRowsInGroup*(ncol(X) - 1) + 1) + Y[,1] = groupsUnique + indicesY = order(target = Y, by = 1, index.return = TRUE) + Y = order(target = Y, by = 1, decreasing = FALSE, index.return = FALSE) - # Determine the maximum number of rows in any group - maxRowsInGroup = max(table(X[,col],1)); + # Order the input matrix by the grouping column + indicesX = order(target = X, by = col, index.return = TRUE) + X = order(target = X, by = col, decreasing = FALSE, index.return = FALSE) + + currentGroupX = 1 + currentGroupY = 1 + i = 1 + + # Iterate over the input matrix + while (numGroups > 0) { + currentGroup = as.scalar(Y[currentGroupX,1]) + nRowsToCopy = 0 + + # Find the rows for the current group + group = 1 + while (group > 0) { + # Break if there are no more rows left in X + if (i > nrow(X)) { + group = 0 + } + # Check if the row belongs to the current group + else if (as.scalar(X[i, col]) == currentGroup) { + nRowsToCopy = nRowsToCopy + 1 + i = i + 1 + } + # Break if the row does not belong to the current group + else { + group = 0 + } + } + + # Copy the values into the output matrix + if (nRowsToCopy > 0) { + nRowsCurrentGroup = currentGroupY + nRowsToCopy - 1 + + # 1. Grouping column is the first column + if (col == 1) { + newMatrix = X[currentGroupY:nRowsCurrentGroup, (col+1):ncol(X)] + } + # 2. Grouping column is the last column + else if (col == ncol(X)) { + newMatrix = X [currentGroupY:nRowsCurrentGroup, 1:col-1] + } + # 3. Grouping column has an intermediate position + else { + newMatrix = cbind(X[currentGroupY:nRowsCurrentGroup, 1:(col-1)], X[currentGroupY:nRowsCurrentGroup, (col+1):ncol(X)]) + } + + # Flatten the new row + newRow = matrix(newMatrix, rows = 1, cols = nrow(newMatrix) * ncol(newMatrix)) + newRowColIdx = nRowsToCopy * (ncol(X)-1) - # Define a zero matrix to put the group data into - Y = matrix(0,numGroups,maxRowsInGroup*(ncol(X)-1)+1) - - # Put the ordered uniqueValues into first column of Y as group_id - #Y[,1] = order_uniqueValues - Y[,1] = uniqueValues - - # Loop for each group - for(i in 1:numGroups){ - index = 0 - - # Iterate each row in matrix X to deal with group data - for ( j in 1:nrow(X) ) { - if ( as.scalar( X[j,col] == uniqueValues[i,1] )) { - # Define the formula of the start and end column position - startCol = index*(ncol(X)-1) +2 - endCol = startCol + (ncol(X)-2) - - if (col == 1) { - # Case when the selected column is the first column - Y[i,startCol:endCol] = X[j,2:ncol(X)] - } - else if (col == ncol(X)) { - # Case when the selected column is the last column - Y[i,startCol:endCol] = X[j,1:(ncol(X)-1)] - } - else { - # General case - newRow = cbind(X[j, 1:(col-1)], X[j, (col+1):ncol(X)]) - Y[i,startCol:endCol] = newRow - } - index = index +1 + # Add the new row into Y at the current group + Y[currentGroupX, 2: (newRowColIdx + 1)] = newRow } - } + + # Continue with the next group + currentGroupX = currentGroupX + 1 + currentGroupY = currentGroupY + nRowsToCopy + numGroups = numGroups - 1 } + + # Restore the initial order of X + X = cbind(X, indicesX) + nColX = ncol(X) + X = order(target = X, by= nColX) + X = X[, 1:nColX-1] + + # Restore the initial order of Y + Y = cbind(Y, indicesY) + nColY = ncol(Y) + Y = order(target = Y, by= nColY) + Y = Y[, 1:nColY-1] } + else if (method == "permutation-matrix") { # Extract the grouping column and create unique groups key = X[,col] - key_unique = unique(X[, col]) - numGroups = nrow(key_unique) + keyUnique = unique(X[, col]) + numGroups = nrow(keyUnique) + maxRowsInGroup = max(table(X[,col],1)) - # Matrix for comparison - key_compare = key_unique %*% matrix(1, rows=1, cols=nrow(X)) - key_matrix = matrix(1, rows=nrow(key_unique), cols=1) %*% t(key) + # Calculate the frequency of each group + freqPerKey = table(key, 1) + freqPerKey = removeEmpty(target = freqPerKey, margin = "rows") + freqPerKeyIndices = order(target = keyUnique, by = 1, index.return = TRUE) - # Find group index - groupIndex = rowIndexMax(t(key_compare == key_matrix)) + # Match the length of freqPerKey to keyUnique and sort it accordingly + freqPerKey = cbind(freqPerKey, freqPerKeyIndices) + nColFpk = ncol(freqPerKey) + freqPerKey = order(target = freqPerKey, by= nColFpk) + freqPerKey = freqPerKey[, 1:nColFpk-1] + freqPerKey = t(freqPerKey) - # Determine the maximum number of rows in any group - maxRowsInGroup = max(table(X[,col],1)) - totalCells = (maxRowsInGroup) * (ncol(X)-1) +1 + # Find the group with the most values + groupMaxVal = maxRowsInGroup*(ncol(X)-1)+1 + groupMaxValKey = max(freqPerKey) - # Create permutation matrix P copy relevant tuples with a single matrix multiplication - P = matrix(0, rows=nrow(X), cols=numGroups * maxRowsInGroup) - # Create offsets to store the first column of each group - offsets = matrix(seq(0, (numGroups-1)*maxRowsInGroup, maxRowsInGroup), rows=numGroups, cols=1) + # Calculate the amount of rows that need padding and the amount of padding per key + groupMaxValKeySeq = matrix(groupMaxValKey, nrow(freqPerKey), ncol(freqPerKey)) + missingPadding = groupMaxValKeySeq - freqPerKey + amountOfZeroRows = sum(missingPadding) - # Create row and column index for the permutation matrix - rowIndex = seq(1, nrow(X)) - indexWithInGroups = cumsum(t(table(groupIndex, seq(1, nrow(X)), numGroups, nrow(X)))) - selectedMatrix = table(seq(1, nrow(indexWithInGroups)), groupIndex) - colIndex = groupIndex * maxRowsInGroup - maxRowsInGroup + rowSums(indexWithInGroups * selectedMatrix) + # 1. Padding is required + if (amountOfZeroRows > 0) { + missingPadding = t(missingPadding) - # Set values in P - P = table(seq(1, nrow(X)), colIndex) + # Remove the keys that dont need padding + removeMask = (missingPadding != 0) + missingPadding = cbind(keyUnique, missingPadding) + missingPadding = removeEmpty(target = missingPadding, margin = "rows", select = removeMask) - # Perform matrix multiplication - Y_temp = t(P) %*% X + # Keys that need padding and padding length per group + keysPadding = missingPadding[,1] + missingPadding = missingPadding[,2] + repeatKeys = matrix(0, rows=amountOfZeroRows, cols=1) - # Remove the selected column from Y_temp - if( col == 1 ) { - Y_temp_reduce = Y_temp[, col+1:ncol(Y_temp)] + # Generate the repeating keys + repeatKeysIdxS = 1 + + for (i in 1:nrow(missingPadding)) { + repeat_count = as.scalar(missingPadding[i,1]) + if (repeat_count > 0) { + temp = matrix(as.scalar(keysPadding[i, 1]), rows=repeat_count, cols = 1) + repeatKeysIdxE = repeatKeysIdxS + repeat_count - 1 + repeatKeys[repeatKeysIdxS:repeatKeysIdxE, 1] = temp + repeatKeysIdxS = repeatKeysIdxE + 1 + } + } + + # Combine the keys that need padding with the actual padding + padding = matrix(0, rows = nrow(repeatKeys), cols = 1) + padding = cbind(repeatKeys, padding) + + # Extend the existing keys to a second column to match the padded keys + key = key %*% matrix(1, rows = 1, cols = 2) + + # Combine the keys with the padded keys and sort them increasingly + tempY = rbind(key, padding) + tempY = order(target = tempY, by = 1, decreasing = FALSE, index.return = FALSE) + + # Remove the padded rows and save the Indices of the combined keys for the permutation matrix + paddedRows = tempY[, 2] + tempIndicesY = order(target = tempY, by = 1, decreasing = FALSE, index.return = TRUE) + tempIndicesY = removeEmpty(target = tempIndicesY, margin = "rows", select = (paddedRows!=0)) + + # Create the permutation matrix by using the Indices of the combined keys + P = table(seq(1, nrow(X)), tempIndicesY) + + # Order the initial matrix to match the sorted keys with padding + indicesX = order(target = X, by = col, index.return = TRUE) + X = order(target = X, by = col, decreasing = FALSE, index.return = FALSE) + X = order(target = X, by = col, decreasing = FALSE, index.return = FALSE) + + # Perform the matrix multiplication + tempY = t(P) %*% X + } + + # 2. Padding is not required + else { + tempY = X + tempY = order(target = tempY, by = col, decreasing = FALSE, index.return = FALSE) + } + + # Remove the selected column from tempY + if (col == 1) { + tempY = tempY[, col+1:ncol(tempY)] } - else if( col == ncol(X) ) { - Y_temp_reduce = Y_temp[, 1:col-1] + else if (col == ncol(X)) { + tempY = tempY[, 1:col-1] } - else{ - Y_temp_reduce = cbind(Y_temp[, 1:col-1],Y_temp[, col+1:ncol(Y_temp)]) + else { + tempY = cbind(tempY[, 1:col-1],tempY[, col+1:ncol(tempY)]) } - # Set value of final output - Y = matrix(0, rows=numGroups, cols=totalCells) - Y[,1] = key_unique + # Set the value of the final output + Y = matrix(0, rows=numGroups, cols=groupMaxVal) + Y[,1] = keyUnique - # The permutation matrix creates a structure where each group's data - # may not fill exactly maxRowsInGroup rows. - # If needed, we need to pad to the expected size first. + # Each group's data may not fill exactly maxRowsInGroup rows + # If needed, we need to pad to the expected size first expectedRows = numGroups * maxRowsInGroup - actualRows = nrow(Y_temp_reduce) - - if(actualRows < expectedRows) { - # Pad Y_temp_reduce with zeros to match expected structure - Y_tmp_padded = matrix(0, rows=expectedRows, cols=ncol(Y_temp_reduce)) - Y_tmp_padded[1:actualRows,] = Y_temp_reduce - } else { - Y_tmp_padded = Y_temp_reduce + actualRows = nrow(tempY) + + if (actualRows < expectedRows) { + # Pad tempY with zeros to match expected structure + tempYPadded = matrix(0, rows=expectedRows, cols=ncol(tempY)) + tempYPadded[1:actualRows,] = tempY } + else { + tempYPadded = tempY + } + + # Save the initial order of the groups in Y and order Y to match the sorted tempYPadded + indicesY = order(target = Y, by = 1, index.return = TRUE) + Y = order(target = Y, by = 1, decreasing = FALSE, index.return = FALSE) + + # Copy the values into Y + Y[,2:ncol(Y)] = matrix(tempYPadded, rows=numGroups, cols=groupMaxVal-1) + + # Restore the initial order of X + X = cbind(X, indicesX) + nColX = ncol(X) + X = order(target = X, by= nColX) + X = X[, 1:nColX-1] - Y[,2:ncol(Y)] = matrix(Y_tmp_padded, rows=numGroups, cols=totalCells-1) + # Restore the initial order of Y + Y = cbind(Y, indicesY) + nColY = ncol(Y) + Y = order(target = Y, by= nColY) + Y = Y[, 1:nColY-1] } }