From 94da9b191ac76ab242ac038dd9b0aea6f8b30d8d Mon Sep 17 00:00:00 2001
From: ReneEnjilian <enjilianrene@gmail.com>
Date: Thu, 5 Jun 2025 18:47:54 +0200
Subject: [PATCH 01/26] update gpu-backend to current Nvidia-stack

---
 pom.xml                                       |  43 +-
 .../instructions/gpu/context/CSRPointer.java  | 148 ++++--
 .../DoublePrecisionCudaSupportFunctions.java  | 448 +++++++++++++++--
 .../runtime/matrix/data/LibMatrixCuDNN.java   | 325 +++++++-----
 .../LibMatrixCuDNNConvolutionAlgorithm.java   |  88 +++-
 .../data/LibMatrixCuDNNRnnAlgorithm.java      |  85 +++-
 .../SinglePrecisionCudaSupportFunctions.java  | 461 ++++++++++++++++--
 7 files changed, 1267 insertions(+), 331 deletions(-)
diff --git a/pom.xml b/pom.xml
index 5d2485897fb..ea312ca1f19 100644
--- a/pom.xml
+++ b/pom.xml
@@ -49,7 +49,7 @@
 		<project.build.outputTimestamp>1</project.build.outputTimestamp>
 		<enableGPU>false</enableGPU>
 		<jcuda.scope>provided</jcuda.scope>
-		<jcuda.version>10.2.0</jcuda.version>
+		<jcuda.version>12.6.0</jcuda.version>
 		<slf4j.version>2.0.11</slf4j.version>
 		<log4j.version>2.22.1</log4j.version>
 		<maven-clean-plugin.version>3.2.0</maven-clean-plugin.version>
@@ -1078,47 +1078,6 @@
 			<version>${jcuda.version}</version>
 			<scope>${jcuda.scope}</scope>
 		</dependency>
-
-		<dependency>
-			<groupId>org.jcuda</groupId>
-			<artifactId>jcuda-natives</artifactId>
-			<classifier>apple-x86_64</classifier>
-			<version>${jcuda.version}</version>
-			<scope>${jcuda.scope}</scope>
-		</dependency>
-
-		<dependency>
-			<groupId>org.jcuda</groupId>
-			<artifactId>jcublas-natives</artifactId>
-			<classifier>apple-x86_64</classifier>
-			<version>${jcuda.version}</version>
-			<scope>${jcuda.scope}</scope>
-		</dependency>
-
-		<dependency>
-			<groupId>org.jcuda</groupId>
-			<artifactId>jcusparse-natives</artifactId>
-			<classifier>apple-x86_64</classifier>
-			<version>${jcuda.version}</version>
-			<scope>${jcuda.scope}</scope>
-		</dependency>
-
-		<dependency>
-			<groupId>org.jcuda</groupId>
-			<artifactId>jcusolver-natives</artifactId>
-			<classifier>apple-x86_64</classifier>
-			<version>${jcuda.version}</version>
-			<scope>${jcuda.scope}</scope>
-		</dependency>
-
-		<dependency>
-			<groupId>org.jcuda</groupId>
-			<artifactId>jcudnn-natives</artifactId>
-			<classifier>apple-x86_64</classifier>
-			<version>${jcuda.version}</version>
-			<scope>${jcuda.scope}</scope>
-		</dependency>
-
 		<dependency>
 			<groupId>org.apache.spark</groupId>
 			<artifactId>spark-core_${scala.binary.version}</artifactId>
diff --git a/src/main/java/org/apache/sysds/runtime/instructions/gpu/context/CSRPointer.java b/src/main/java/org/apache/sysds/runtime/instructions/gpu/context/CSRPointer.java
index 3125d432c93..b6b7a24a25c 100644
--- a/src/main/java/org/apache/sysds/runtime/instructions/gpu/context/CSRPointer.java
+++ b/src/main/java/org/apache/sysds/runtime/instructions/gpu/context/CSRPointer.java
@@ -19,19 +19,11 @@
 
 package org.apache.sysds.runtime.instructions.gpu.context;
 
-import static jcuda.jcusparse.JCusparse.cusparseCreateMatDescr;
-import static jcuda.jcusparse.JCusparse.cusparseSetMatIndexBase;
-import static jcuda.jcusparse.JCusparse.cusparseSetMatType;
-import static jcuda.jcusparse.JCusparse.cusparseSetPointerMode;
-import static jcuda.jcusparse.JCusparse.cusparseXcsrgeamNnz;
-import static jcuda.jcusparse.JCusparse.cusparseXcsrgemmNnz;
-import static jcuda.jcusparse.cusparseIndexBase.CUSPARSE_INDEX_BASE_ZERO;
-import static jcuda.jcusparse.cusparseMatrixType.CUSPARSE_MATRIX_TYPE_GENERAL;
-import static jcuda.runtime.JCuda.cudaMemcpy;
-import static jcuda.runtime.cudaMemcpyKind.cudaMemcpyDeviceToDevice;
-import static jcuda.runtime.cudaMemcpyKind.cudaMemcpyDeviceToHost;
-import static jcuda.runtime.cudaMemcpyKind.cudaMemcpyHostToDevice;
-
+import jcuda.Pointer;
+import jcuda.Sizeof;
+import jcuda.cudaDataType;
+import jcuda.jcublas.cublasHandle;
+import jcuda.jcusparse.*;
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
 import org.apache.sysds.api.DMLScript;
@@ -39,12 +31,11 @@
 import org.apache.sysds.runtime.matrix.data.LibMatrixCUDA;
 import org.apache.sysds.utils.Statistics;
 
-import jcuda.Pointer;
-import jcuda.Sizeof;
-import jcuda.jcublas.cublasHandle;
-import jcuda.jcusparse.cusparseHandle;
-import jcuda.jcusparse.cusparseMatDescr;
-import jcuda.jcusparse.cusparsePointerMode;
+import static jcuda.jcusparse.JCusparse.*;
+import static jcuda.jcusparse.cusparseIndexBase.CUSPARSE_INDEX_BASE_ZERO;
+import static jcuda.jcusparse.cusparseMatrixType.CUSPARSE_MATRIX_TYPE_GENERAL;
+import static jcuda.runtime.JCuda.*;
+import static jcuda.runtime.cudaMemcpyKind.*;
 
 /**
  * Compressed Sparse Row (CSR) format for CUDA
@@ -318,16 +309,18 @@ private static void step1AllocateRowPointers(GPUContext gCtx, cusparseHandle han
 	 */
 	private static void step2GatherNNZGeam(GPUContext gCtx, cusparseHandle handle, CSRPointer A, CSRPointer B, CSRPointer C, int m, int n) {
 		LOG.trace("GPU : step2GatherNNZGeam for DGEAM" + ", GPUContext=" + gCtx);
-		int[] CnnzArray = { -1 };
-		cusparseXcsrgeamNnz(handle, m, n, A.descr, toIntExact(A.nnz), A.rowPtr, A.colInd, B.descr, toIntExact(B.nnz),
-				B.rowPtr, B.colInd, C.descr, C.rowPtr, Pointer.to(CnnzArray));
+		int[] CnnzArray = {-1};
+		Pointer workspace = new Pointer();
+		cusparseXcsrgeam2Nnz(handle, m, n, A.descr, toIntExact(A.nnz), A.rowPtr, A.colInd, B.descr, toIntExact(B.nnz),
+			B.rowPtr, B.colInd, C.descr, C.rowPtr, Pointer.to(CnnzArray), workspace);
 		//cudaDeviceSynchronize;
-		if (CnnzArray[0] != -1) {
+		if(CnnzArray[0] != -1) {
 			C.nnz = CnnzArray[0];
-		} else {
-			int baseArray[] = { 0 };
+		}
+		else {
+			int[] baseArray = {0};
 			cudaMemcpy(Pointer.to(CnnzArray), C.rowPtr.withByteOffset(getIntSizeOf(m)), getIntSizeOf(1),
-					cudaMemcpyDeviceToHost);
+				cudaMemcpyDeviceToHost);
 			cudaMemcpy(Pointer.to(baseArray), C.rowPtr, getIntSizeOf(1), cudaMemcpyDeviceToHost);
 			C.nnz = CnnzArray[0] - baseArray[0];
 		}
@@ -347,25 +340,94 @@ private static void step2GatherNNZGeam(GPUContext gCtx, cusparseHandle handle, C
 	 * @param n      Number of columns of sparse matrix op ( B ) and C
 	 * @param k      Number of columns/rows of sparse matrix op ( A ) / op ( B )
 	 */
+
 	private static void step2GatherNNZGemm(GPUContext gCtx, cusparseHandle handle, CSRPointer A, int transA,
-			CSRPointer B, int transB, CSRPointer C, int m, int n, int k) {
-		LOG.trace("GPU : step2GatherNNZGemm for DGEMM" + ", GPUContext=" + gCtx);
-		int[] CnnzArray = { -1 };
-		if (A.nnz >= Integer.MAX_VALUE || B.nnz >= Integer.MAX_VALUE) {
-			throw new DMLRuntimeException("Number of non zeroes is larger than supported by cuSparse");
-		}
-		cusparseXcsrgemmNnz(handle, transA, transB, m, n, k, A.descr, toIntExact(A.nnz), A.rowPtr, A.colInd, B.descr,
-				toIntExact(B.nnz), B.rowPtr, B.colInd, C.descr, C.rowPtr, Pointer.to(CnnzArray));
-		//cudaDeviceSynchronize;
-		if (CnnzArray[0] != -1) {
-			C.nnz = CnnzArray[0];
-		} else {
-			int baseArray[] = { 0 };
-			cudaMemcpy(Pointer.to(CnnzArray), C.rowPtr.withByteOffset(getIntSizeOf(m)), getIntSizeOf(1),
-					cudaMemcpyDeviceToHost);
-			cudaMemcpy(Pointer.to(baseArray), C.rowPtr, getIntSizeOf(1), cudaMemcpyDeviceToHost);
-			C.nnz = CnnzArray[0] - baseArray[0];
+		CSRPointer B, int transB, CSRPointer C, int m, int n, int k)            // C = op(A)·op(B)  (m×k)·(k×n)
+	{
+		LOG.trace("GPU : step2GatherNNZGemm (SpGEMM), GPUContext=" + gCtx);
+
+		/* ---------- quick guard ---------------------------------------- */
+		if(A.nnz >= Integer.MAX_VALUE || B.nnz >= Integer.MAX_VALUE)
+			throw new DMLRuntimeException("Number of non-zeros exceeds cuSPARSE 32-bit limit");
+
+		/* ---------- 1. CSR descriptors for A, B, C --------------------- */
+		cusparseSpMatDescr matA = new cusparseSpMatDescr();
+		cusparseSpMatDescr matB = new cusparseSpMatDescr();
+		cusparseSpMatDescr matC = new cusparseSpMatDescr();
+
+		cusparseCreateCsr(matA, m, k, A.nnz, A.rowPtr, A.colInd, A.val, cusparseIndexType.CUSPARSE_INDEX_32I,
+			cusparseIndexType.CUSPARSE_INDEX_32I, cusparseIndexBase.CUSPARSE_INDEX_BASE_ZERO, cudaDataType.CUDA_R_64F);
+
+		cusparseCreateCsr(matB, k, n, B.nnz, B.rowPtr, B.colInd, B.val, cusparseIndexType.CUSPARSE_INDEX_32I,
+			cusparseIndexType.CUSPARSE_INDEX_32I, cusparseIndexBase.CUSPARSE_INDEX_BASE_ZERO, cudaDataType.CUDA_R_64F);
+
+		cusparseCreateCsr(matC, m, n, 0L,                 // nnz(C) unknown
+			C.rowPtr, Pointer.to(new int[] {0}), Pointer.to(new double[] {0}), cusparseIndexType.CUSPARSE_INDEX_32I,
+			cusparseIndexType.CUSPARSE_INDEX_32I, cusparseIndexBase.CUSPARSE_INDEX_BASE_ZERO, cudaDataType.CUDA_R_64F);
+
+		/* ---------- 2. SpGEMM descriptor ------------------------------- */
+		cusparseSpGEMMDescr spgemmDesc = new cusparseSpGEMMDescr();
+		cusparseSpGEMM_createDescr(spgemmDesc);
+
+		Pointer alpha = Pointer.to(new double[] {1.0});
+		Pointer beta = Pointer.to(new double[] {0.0});
+		int alg = cusparseSpGEMMAlg.CUSPARSE_SPGEMM_DEFAULT;
+
+		/* ---------- 3. Phase-1 : work-estimation ----------------------- */
+		long[] bufSize1 = {0};
+		cusparseSpGEMM_workEstimation(handle, transA, transB, alpha, matA.asConst(), matB.asConst(), beta, matC,
+			cudaDataType.CUDA_R_64F, alg, spgemmDesc, bufSize1, null);                               // first query
+
+		Pointer dBuf1 = new Pointer();
+		if(bufSize1[0] > 0)
+			cudaMalloc(dBuf1, bufSize1[0]);
+
+		cusparseSpGEMM_workEstimation(handle, transA, transB, alpha, matA.asConst(), matB.asConst(), beta, matC,
+			cudaDataType.CUDA_R_64F, alg, spgemmDesc, bufSize1, dBuf1);                              // real run
+
+		/* ---------- 4. Phase-2 : compute structure / nnz --------------- */
+		long[] bufSize2 = {0};
+		cusparseSpGEMM_compute(                           // size query
+			handle, transA, transB, alpha, matA.asConst(), matB.asConst(), beta, matC, cudaDataType.CUDA_R_64F, alg,
+			spgemmDesc, bufSize2, null);                              // ← 13 args
+
+		Pointer dBuf2 = new Pointer();
+		if(bufSize2[0] > 0)
+			cudaMalloc(dBuf2, bufSize2[0]);
+
+		cusparseSpGEMM_compute(                           // actual compute
+			handle, transA, transB, alpha, matA.asConst(), matB.asConst(), beta, matC, cudaDataType.CUDA_R_64F, alg,
+			spgemmDesc, bufSize2, dBuf2);
+
+		/* ---------- 5. read nnz(C) ------------------------------------- */
+		long[] rows = {0}, cols = {0}, nnz = {0};
+		cusparseSpMatGetSize(matC.asConst(), rows, cols, nnz);
+		C.nnz = (int) nnz[0];
+
+		/* ---------- 6. temp col/val arrays so COPY can write them ------ */
+		Pointer dCcol = new Pointer();
+		Pointer dCval = new Pointer();
+		if(C.nnz > 0) {
+			cudaMalloc(dCcol, C.nnz * Sizeof.INT);
+			cudaMalloc(dCval, C.nnz * Sizeof.DOUBLE);
 		}
+		cusparseCsrSetPointers(matC, C.rowPtr, dCcol, dCval);
+
+		/* ---------- 7. Phase-3 : copy final CSR into user arrays ------- */
+		cusparseSpGEMM_copy(                              // ← 11 args
+			handle, transA, transB, alpha, matA.asConst(), matB.asConst(), beta, matC, cudaDataType.CUDA_R_64F, alg,
+			spgemmDesc);
+
+		/* ---------- 8. clean-up --------------------------------------- */
+		cudaFree(dCcol);
+		cudaFree(dCval);
+		cudaFree(dBuf1);
+		cudaFree(dBuf2);
+
+		cusparseSpGEMM_destroyDescr(spgemmDesc);
+		cusparseDestroySpMat(matA.asConst());
+		cusparseDestroySpMat(matB.asConst());
+		cusparseDestroySpMat(matC.asConst());
 	}
 
 	/**
diff --git a/src/main/java/org/apache/sysds/runtime/matrix/data/DoublePrecisionCudaSupportFunctions.java b/src/main/java/org/apache/sysds/runtime/matrix/data/DoublePrecisionCudaSupportFunctions.java
index 3ffac4bd00c..e77e17e09e4 100644
--- a/src/main/java/org/apache/sysds/runtime/matrix/data/DoublePrecisionCudaSupportFunctions.java
+++ b/src/main/java/org/apache/sysds/runtime/matrix/data/DoublePrecisionCudaSupportFunctions.java
@@ -18,10 +18,15 @@
  */
 package org.apache.sysds.runtime.matrix.data;
 
+import static jcuda.jcusparse.JCusparse.cusparseCreateCsr;
 import static jcuda.runtime.JCuda.cudaMemcpy;
 import static jcuda.runtime.cudaMemcpyKind.cudaMemcpyDeviceToHost;
 import static jcuda.runtime.cudaMemcpyKind.cudaMemcpyHostToDevice;
+import static jcuda.runtime.JCuda.cudaMalloc;
+import static jcuda.runtime.JCuda.cudaFree;
 
+import jcuda.jcusparse.cusparseSpMatDescr;
+import jcuda.jcusparse.cusparseSpGEMMDescr;
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
 import org.apache.sysds.runtime.DMLRuntimeException;
@@ -36,73 +41,284 @@
 import jcuda.jcusparse.JCusparse;
 import jcuda.jcusparse.cusparseHandle;
 import jcuda.jcusparse.cusparseMatDescr;
+import jcuda.jcusparse.cusparseDnVecDescr;
+import jcuda.jcusparse.cusparseDnMatDescr;
 
+import static jcuda.jcusparse.cusparseIndexType.CUSPARSE_INDEX_32I;
+import static jcuda.jcusparse.cusparseIndexBase.CUSPARSE_INDEX_BASE_ZERO;
+import static jcuda.cudaDataType.CUDA_R_64F;
+import static jcuda.jcusparse.cusparseSpGEMMAlg.CUSPARSE_SPGEMM_DEFAULT;
+import static jcuda.jcusparse.cusparseStatus.CUSPARSE_STATUS_SUCCESS;
+import static jcuda.jcusparse.cusparseSpMVAlg.CUSPARSE_SPMV_ALG_DEFAULT;
+import static jcuda.jcusparse.cusparseOperation.CUSPARSE_OPERATION_NON_TRANSPOSE;
+import static jcuda.jcusparse.cusparseOrder.CUSPARSE_ORDER_COL;
+import static jcuda.jcusparse.cusparseSpMMAlg.CUSPARSE_SPMM_ALG_DEFAULT;
+import static jcuda.jcusparse.cusparseCsr2CscAlg.CUSPARSE_CSR2CSC_ALG1;
+import static jcuda.jcusparse.cusparseSparseToDenseAlg.CUSPARSE_SPARSETODENSE_ALG_DEFAULT;
+import static jcuda.jcusparse.cusparseIndexBase.CUSPARSE_INDEX_BASE_ONE;
+import static jcuda.jcusparse.cusparseDenseToSparseAlg.CUSPARSE_DENSETOSPARSE_ALG_DEFAULT;
+import static jcuda.jcusparse.JCusparse.cusparseSpGEMM_createDescr;
+import static jcuda.jcusparse.JCusparse.cusparseCreateDnVec;
+import static jcuda.jcusparse.JCusparse.cusparseCreateDnMat;
 public class DoublePrecisionCudaSupportFunctions implements CudaSupportFunctions {
 
 	private static final Log LOG = LogFactory.getLog(DoublePrecisionCudaSupportFunctions.class.getName());
-	
+
 	@Override
 	public int cusparsecsrgemm(cusparseHandle handle, int transA, int transB, int m, int n, int k,
-			cusparseMatDescr descrA, int nnzA, Pointer csrValA, Pointer csrRowPtrA, Pointer csrColIndA,
-			cusparseMatDescr descrB, int nnzB, Pointer csrValB, Pointer csrRowPtrB, Pointer csrColIndB,
-			cusparseMatDescr descrC, Pointer csrValC, Pointer csrRowPtrC, Pointer csrColIndC) {
-		return JCusparse.cusparseDcsrgemm(handle, transA,  transB,  m,  n,  k,
-				 descrA,  nnzA,  csrValA,  csrRowPtrA,  csrColIndA,
-				 descrB,  nnzB,  csrValB,  csrRowPtrB,  csrColIndB,
-				 descrC,  csrValC,  csrRowPtrC,  csrColIndC);
+		cusparseMatDescr descrA, int nnzA, Pointer csrValA, Pointer csrRowPtrA, Pointer csrColIndA,
+		cusparseMatDescr descrB, int nnzB, Pointer csrValB, Pointer csrRowPtrB, Pointer csrColIndB,
+		cusparseMatDescr descrC, Pointer csrValC, Pointer csrRowPtrC, Pointer csrColIndC) {
+		/* ------------------------------------------------------------------ */
+		/* 0.   Wrap A, B, C in the new SpMat descriptors                     */
+		/* ------------------------------------------------------------------ */
+		cusparseSpMatDescr matA = new cusparseSpMatDescr();
+		cusparseSpMatDescr matB = new cusparseSpMatDescr();
+		cusparseSpMatDescr matC = new cusparseSpMatDescr();
+
+		cusparseCreateCsr(matA, m, k, nnzA, csrRowPtrA, csrColIndA, csrValA, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
+			CUSPARSE_INDEX_BASE_ZERO, CUDA_R_64F);
+
+		cusparseCreateCsr(matB, k, n, nnzB, csrRowPtrB, csrColIndB, csrValB, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
+			CUSPARSE_INDEX_BASE_ZERO, CUDA_R_64F);
+
+    /*  C’s nnz is not known yet -> start with 0 and rowPtr only.
+        colInd / val arrays are already allocated by the caller.         */
+		cusparseCreateCsr(matC, m, n, 0L, csrRowPtrC, csrColIndC, csrValC, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
+			CUSPARSE_INDEX_BASE_ZERO, CUDA_R_64F);
+
+		/* ------------------------------------------------------------------ */
+		/* 1.   Create & configure the SpGEMM descriptor                      */
+		/* ------------------------------------------------------------------ */
+		cusparseSpGEMMDescr spgemm = new cusparseSpGEMMDescr();
+		cusparseSpGEMM_createDescr(spgemm);
+
+		Pointer alpha = Pointer.to(new double[] {1.0});
+		Pointer beta = Pointer.to(new double[] {0.0});
+		int alg = CUSPARSE_SPGEMM_DEFAULT;
+		int computeTp = CUDA_R_64F;
+
+		/* ------------------------------------------------------------------ */
+		/* 2.   Phase-1 : work-estimation                                     */
+		/* ------------------------------------------------------------------ */
+		long[] bufSz1 = {0};
+		int status = JCusparse.cusparseSpGEMM_workEstimation(handle, transA, transB, alpha, matA.asConst(),
+			matB.asConst(), beta, matC, computeTp, alg, spgemm, bufSz1, null);
+		if(status != CUSPARSE_STATUS_SUCCESS)
+			return status;
+
+		Pointer dBuf1 = new Pointer();
+		if(bufSz1[0] > 0)
+			cudaMalloc(dBuf1, bufSz1[0]);
+
+		status = JCusparse.cusparseSpGEMM_workEstimation(handle, transA, transB, alpha, matA.asConst(), matB.asConst(),
+			beta, matC, computeTp, alg, spgemm, bufSz1, dBuf1);
+		if(status != CUSPARSE_STATUS_SUCCESS)
+			return status;
+
+		/* ------------------------------------------------------------------ */
+		/* 3.   Phase-2 : compute structure / nnz(C)                          */
+		/* ------------------------------------------------------------------ */
+		long[] bufSz2 = {0};
+		status = JCusparse.cusparseSpGEMM_compute(handle, transA, transB, alpha, matA.asConst(), matB.asConst(), beta,
+			matC, computeTp, alg, spgemm, bufSz2, null);                       // query required buffer
+		if(status != CUSPARSE_STATUS_SUCCESS)
+			return status;
+
+		Pointer dBuf2 = new Pointer();
+		if(bufSz2[0] > 0)
+			cudaMalloc(dBuf2, bufSz2[0]);
+
+		status = JCusparse.cusparseSpGEMM_compute(handle, transA, transB, alpha, matA.asConst(), matB.asConst(), beta,
+			matC, computeTp, alg, spgemm, bufSz2, dBuf2);                      // real compute
+		if(status != CUSPARSE_STATUS_SUCCESS)
+			return status;
+
+		/* ------------------------------------------------------------------ */
+		/* 4.   Phase-3 : copy final CSR data into caller-provided buffers    */
+		/* ------------------------------------------------------------------ */
+		status = JCusparse.cusparseSpGEMM_copy(handle, transA, transB, alpha, matA.asConst(), matB.asConst(), beta,
+			matC, computeTp, alg, spgemm);
+		/* fall-through to CLEAN block */
+
+
+		/* ------------------------------------------------------------------ */
+		/* 5.   Cleanup                                                         */
+		/* ------------------------------------------------------------------ */
+		cudaFree(dBuf1);
+		cudaFree(dBuf2);
+
+		JCusparse.cusparseSpGEMM_destroyDescr(spgemm);
+		JCusparse.cusparseDestroySpMat(matA.asConst());
+		JCusparse.cusparseDestroySpMat(matB.asConst());
+		JCusparse.cusparseDestroySpMat(matC.asConst());
+
+		return status;
 	}
-	
+
 	@Override
 	public int cublasgeam(cublasHandle handle, int transa, int transb, int m, int n, Pointer alpha, Pointer A, int lda,
-			Pointer beta, Pointer B, int ldb, Pointer C, int ldc) {
+		Pointer beta, Pointer B, int ldb, Pointer C, int ldc) {
 		return JCublas2.cublasDgeam(handle, transa, transb, m, n, alpha, A, lda, beta, B, ldb, C, ldc);
 	}
-	
+
 	@Override
 	public int cusparsecsrmv(cusparseHandle handle, int transA, int m, int n, int nnz, Pointer alpha,
-			cusparseMatDescr descrA, Pointer csrValA, Pointer csrRowPtrA, Pointer csrColIndA, Pointer x, Pointer beta,
-			Pointer y) {
-		return JCusparse.cusparseDcsrmv(handle, transA, m, n, nnz, alpha, 
-				descrA, csrValA, csrRowPtrA, csrColIndA, x, beta, y);
+		cusparseMatDescr descrA, Pointer csrValA, Pointer csrRowPtrA, Pointer csrColIndA, Pointer x, Pointer beta,
+		Pointer y) {
+		/* Descriptors and workspace --------------------------------------- */
+		cusparseSpMatDescr matA = new cusparseSpMatDescr();
+		cusparseDnVecDescr vecX = new cusparseDnVecDescr();
+		cusparseDnVecDescr vecY = new cusparseDnVecDescr();
+		Pointer dBuf = new Pointer();
+		long dBufBytes = 0;
+		int status;
+
+		try {
+			/* 1. CSR matrix A --------------------------------------------- */
+			cusparseCreateCsr(matA, m, n, nnz, csrRowPtrA, csrColIndA, csrValA, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
+				CUSPARSE_INDEX_BASE_ZERO, CUDA_R_64F);
+
+			/* 2. Dense vectors X and Y ------------------------------------ */
+			cusparseCreateDnVec(vecX, n, x, CUDA_R_64F);
+			cusparseCreateDnVec(vecY, m, y, CUDA_R_64F);
+
+			/* 3. Query workspace size ------------------------------------- */
+			long[] bufSize = {0};
+			status = JCusparse.cusparseSpMV_bufferSize(handle, transA, alpha, matA.asConst(), vecX.asConst(), beta,
+				vecY, CUDA_R_64F, CUSPARSE_SPMV_ALG_DEFAULT, bufSize);
+			if(status != CUSPARSE_STATUS_SUCCESS)
+				return status;
+
+			dBufBytes = bufSize[0];
+			if(dBufBytes > 0)
+				cudaMalloc(dBuf, dBufBytes);
+
+			/* 4. Perform SpMV -------------------------------------------- */
+			status = JCusparse.cusparseSpMV(handle, transA, alpha, matA.asConst(), vecX.asConst(), beta, vecY,
+				CUDA_R_64F, CUSPARSE_SPMV_ALG_DEFAULT, dBuf);
+
+			return status;
+		}
+		finally {
+			/* Cleanup ----------------------------------------------------- */
+			if(dBufBytes > 0)
+				cudaFree(dBuf);
+			JCusparse.cusparseDestroyDnVec(vecX.asConst());
+			JCusparse.cusparseDestroyDnVec(vecY.asConst());
+			JCusparse.cusparseDestroySpMat(matA.asConst());
+		}
 	}
-	
+
 	@Override
-	public int	cusparsecsrmm2(cusparseHandle handle, int transa, int transb, int m, int n, int k, int nnz, jcuda.Pointer alpha, cusparseMatDescr descrA, 
-			jcuda.Pointer csrValA, jcuda.Pointer csrRowPtrA, jcuda.Pointer csrColIndA, 
-			jcuda.Pointer B, int ldb, jcuda.Pointer beta, jcuda.Pointer C, int ldc) {
-		return JCusparse.cusparseDcsrmm2(handle, transa, transb, m, n, k, nnz, alpha, descrA, csrValA, 
-				csrRowPtrA, csrColIndA, B, ldb, beta, C, ldc);
+	public int cusparsecsrmm2(cusparseHandle handle, int transA, int transB, int m, int n, int k, int nnz,
+		Pointer alpha, cusparseMatDescr descrA, Pointer csrValA, Pointer csrRowPtrA, Pointer csrColIndA, Pointer B,
+		int ldb, Pointer beta, Pointer C, int ldc) {
+		/* Descriptors and workspace -------------------------------------- */
+		cusparseSpMatDescr matA = new cusparseSpMatDescr();
+		cusparseDnMatDescr matB = new cusparseDnMatDescr();
+		cusparseDnMatDescr matC = new cusparseDnMatDescr();
+		Pointer dBuf = new Pointer();
+		long dBufBytes = 0;
+		int status;
+
+		try {
+			/* 1. CSR matrix A -------------------------------------------- */
+			cusparseCreateCsr(matA, m, k, nnz, csrRowPtrA, csrColIndA, csrValA, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
+				CUSPARSE_INDEX_BASE_ZERO, CUDA_R_64F);
+
+			/* 2. Dense matrix B  (col-major layout) ---------------------- */
+			int rowsB = (transB == CUSPARSE_OPERATION_NON_TRANSPOSE) ? k : n;
+			int colsB = (transB == CUSPARSE_OPERATION_NON_TRANSPOSE) ? n : k;
+			cusparseCreateDnMat(matB, rowsB, colsB, ldb, B, CUDA_R_64F, CUSPARSE_ORDER_COL);
+
+			/* 3. Dense matrix C  (output) -------------------------------- */
+			int rowsC = (transA == CUSPARSE_OPERATION_NON_TRANSPOSE) ? m : k;
+			int colsC = colsB;                       // always equals n
+			cusparseCreateDnMat(matC, rowsC, colsC, ldc, C, CUDA_R_64F, CUSPARSE_ORDER_COL);
+
+			/* 4. Query workspace size ------------------------------------ */
+			long[] bufSize = {0};
+			status = JCusparse.cusparseSpMM_bufferSize(handle, transA, transB, alpha, matA.asConst(), matB.asConst(),
+				beta, matC, CUDA_R_64F, CUSPARSE_SPMM_ALG_DEFAULT, bufSize);
+			if(status != CUSPARSE_STATUS_SUCCESS)
+				return status;
+
+			dBufBytes = bufSize[0];
+			if(dBufBytes > 0)
+				cudaMalloc(dBuf, dBufBytes);
+
+			/* 5. Execute SpMM ------------------------------------------- */
+			status = JCusparse.cusparseSpMM(handle, transA, transB, alpha, matA.asConst(), matB.asConst(), beta, matC,
+				CUDA_R_64F, CUSPARSE_SPMM_ALG_DEFAULT, dBuf);
+
+			return status;
+		}
+		finally {
+			/* Cleanup ---------------------------------------------------- */
+			if(dBufBytes > 0)
+				cudaFree(dBuf);
+			JCusparse.cusparseDestroyDnMat(matB.asConst());
+			JCusparse.cusparseDestroyDnMat(matC.asConst());
+			JCusparse.cusparseDestroySpMat(matA.asConst());
+		}
 	}
-	
+
 	@Override
 	public int cublasdot(cublasHandle handle, int n, Pointer x, int incx, Pointer y, int incy, Pointer result) {
 		return JCublas2.cublasDdot(handle, n, x, incx, y, incy, result);
 	}
-	
+
 	@Override
 	public int cublasgemv(cublasHandle handle, int trans, int m, int n, Pointer alpha, Pointer A, int lda, Pointer x,
-			int incx, Pointer beta, Pointer y, int incy) {
+		int incx, Pointer beta, Pointer y, int incy) {
 		return JCublas2.cublasDgemv(handle, trans, m, n, alpha, A, lda, x, incx, beta, y, incy);
 	}
-	
+
 	@Override
 	public int cublasgemm(cublasHandle handle, int transa, int transb, int m, int n, int k, Pointer alpha, Pointer A,
-			int lda, Pointer B, int ldb, Pointer beta, Pointer C, int ldc) {
+		int lda, Pointer B, int ldb, Pointer beta, Pointer C, int ldc) {
 		return JCublas2.cublasDgemm(handle, transa, transb, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc);
 	}
-	
+
 	@Override
 	public int cusparsecsr2csc(cusparseHandle handle, int m, int n, int nnz, Pointer csrVal, Pointer csrRowPtr,
-			Pointer csrColInd, Pointer cscVal, Pointer cscRowInd, Pointer cscColPtr, int copyValues, int idxBase) {
-		return JCusparse.cusparseDcsr2csc(handle, m, n, nnz, csrVal, csrRowPtr, csrColInd, cscVal, cscRowInd, cscColPtr, copyValues, idxBase);
+		Pointer csrColInd, Pointer cscVal, Pointer cscRowInd, Pointer cscColPtr, int copyValues, int idxBase) {
+		/* Constants ------------------------------------------------------- */
+		int valType = CUDA_R_64F;                     // double precision
+		int alg = CUSPARSE_CSR2CSC_ALG1;          // always supported
+
+		/* Query workspace size ------------------------------------------- */
+		long[] bufSize = {0};
+		int status = JCusparse.cusparseCsr2cscEx2_bufferSize(handle, m, n, nnz, csrVal, csrRowPtr, csrColInd, cscVal,
+			cscColPtr, cscRowInd, valType, copyValues, idxBase, alg, bufSize);
+		if(status != CUSPARSE_STATUS_SUCCESS)
+			return status;
+
+		/* Allocate temp buffer if needed --------------------------------- */
+		Pointer buffer = new Pointer();
+		if(bufSize[0] > 0)
+			cudaMalloc(buffer, bufSize[0]);
+
+		try {
+			/* Perform CSR -> CSC conversion ------------------------------- */
+			status = JCusparse.cusparseCsr2cscEx2(handle, m, n, nnz, csrVal, csrRowPtr, csrColInd, cscVal, cscColPtr,
+				cscRowInd, valType, copyValues, idxBase, alg, buffer);
+
+			return status;
+		}
+		finally {
+			if(bufSize[0] > 0)
+				cudaFree(buffer);
+		}
 	}
-	
+
 	@Override
 	public int cublassyrk(cublasHandle handle, int uplo, int trans, int n, int k, Pointer alpha, Pointer A, int lda,
-			Pointer beta, Pointer C, int ldc) {
+		Pointer beta, Pointer C, int ldc) {
 		return JCublas2.cublasDsyrk(handle, uplo, trans, n, k, alpha, A, lda, beta, C, ldc);
 	}
-	
+
 	@Override
 	public int cublasaxpy(cublasHandle handle, int n, Pointer alpha, Pointer x, int incx, Pointer y, int incy) {
 		return JCublas2.cublasDaxpy(handle, n, alpha, x, incx, y, incy);
@@ -127,35 +343,175 @@ public int cusolverDngeqrf(cusolverDnHandle handle, int m, int n, Pointer A, int
 
 	@Override
 	public int cusolverDnormqr(cusolverDnHandle handle, int side, int trans, int m, int n, int k, Pointer A, int lda,
-			Pointer tau, Pointer C, int ldc, Pointer work, int lwork, Pointer devInfo) {
+		Pointer tau, Pointer C, int ldc, Pointer work, int lwork, Pointer devInfo) {
 		return JCusolverDn.cusolverDnDormqr(handle, side, trans, m, n, k, A, lda, tau, C, ldc, work, lwork, devInfo);
 	}
-	
+
 	@Override
 	public int cusparsecsrgeam(cusparseHandle handle, int m, int n, Pointer alpha, cusparseMatDescr descrA, int nnzA,
-			Pointer csrValA, Pointer csrRowPtrA, Pointer csrColIndA, Pointer beta, cusparseMatDescr descrB, int nnzB,
-			Pointer csrValB, Pointer csrRowPtrB, Pointer csrColIndB, cusparseMatDescr descrC, Pointer csrValC,
-			Pointer csrRowPtrC, Pointer csrColIndC) {
-		return JCusparse.cusparseDcsrgeam(handle, m, n, alpha, descrA, nnzA, 
-				csrValA, csrRowPtrA, csrColIndA, beta, descrB, nnzB, 
-				csrValB, csrRowPtrB, csrColIndB, descrC, csrValC, csrRowPtrC, csrColIndC);
+		Pointer csrValA, Pointer csrRowPtrA, Pointer csrColIndA, Pointer beta, cusparseMatDescr descrB, int nnzB,
+		Pointer csrValB, Pointer csrRowPtrB, Pointer csrColIndB, cusparseMatDescr descrC, Pointer csrValC,
+		Pointer csrRowPtrC, Pointer csrColIndC) {
+		/* ------------------------------------------------------------------ */
+		/* 1. Query temporary-buffer size                                     */
+		/* ------------------------------------------------------------------ */
+		long[] bufSize = {0};
+
+		int status = JCusparse.cusparseDcsrgeam2_bufferSizeExt(handle, m, n, alpha, descrA, nnzA, csrValA, csrRowPtrA,
+			csrColIndA, beta, descrB, nnzB, csrValB, csrRowPtrB, csrColIndB, descrC, csrValC, csrRowPtrC, csrColIndC,
+			bufSize);
+		if(status != CUSPARSE_STATUS_SUCCESS)
+			return status;
+
+		/* ------------------------------------------------------------------ */
+		/* 2. Allocate workspace (if required)                                */
+		/* ------------------------------------------------------------------ */
+		Pointer buffer = new Pointer();
+		if(bufSize[0] > 0)
+			cudaMalloc(buffer, bufSize[0]);
+
+		try {
+			/* -------------------------------------------------------------- */
+			/* 3.  C = α*A  +  β*B  (sorted-CSR version 2)                     */
+			/* -------------------------------------------------------------- */
+			status = JCusparse.cusparseDcsrgeam2(handle, m, n, alpha, descrA, nnzA, csrValA, csrRowPtrA, csrColIndA,
+				beta, descrB, nnzB, csrValB, csrRowPtrB, csrColIndB, descrC, csrValC, csrRowPtrC, csrColIndC, buffer);
+
+			return status;
+		}
+		finally {
+			/* -------------------------------------------------------------- */
+			/* 4. Free workspace                                              */
+			/* -------------------------------------------------------------- */
+			if(bufSize[0] > 0)
+				cudaFree(buffer);
+		}
 	}
-	
+
 	@Override
 	public int cusparsecsr2dense(cusparseHandle handle, int m, int n, cusparseMatDescr descrA, Pointer csrValA,
-			Pointer csrRowPtrA, Pointer csrColIndA, Pointer A, int lda) {
-		return JCusparse.cusparseDcsr2dense(handle, m, n, descrA, csrValA, csrRowPtrA, csrColIndA, A, lda);
+		Pointer csrRowPtrA, Pointer csrColIndA, Pointer A, int lda) {
+		/* ------------------------------------------------------------- */
+		/* 1. Determine nnz from the last entry of csrRowPtrA            */
+		/* ------------------------------------------------------------- */
+		int[] last = {0};
+		cudaMemcpy(Pointer.to(last), csrRowPtrA.withByteOffset((long) m * Sizeof.INT), Sizeof.INT,
+			cudaMemcpyDeviceToHost);
+
+		/* Adjust for index base (0 or 1) ------------------------------ */
+		int base = JCusparse.cusparseGetMatIndexBase(descrA);
+		int nnz = (base == CUSPARSE_INDEX_BASE_ONE) ? last[0] - 1 : last[0];
+
+		/* ------------------------------------------------------------- */
+		/* 2. Create CSR SpMat and dense DnMat descriptors               */
+		/* ------------------------------------------------------------- */
+		cusparseSpMatDescr matA = new cusparseSpMatDescr();
+		JCusparse.cusparseCreateCsr(matA, m, n, nnz, csrRowPtrA, csrColIndA, csrValA, CUSPARSE_INDEX_32I,
+			CUSPARSE_INDEX_32I, base, CUDA_R_64F);
+
+		cusparseDnMatDescr matB = new cusparseDnMatDescr();
+		JCusparse.cusparseCreateDnMat(matB, m, n, lda, A, CUDA_R_64F, CUSPARSE_ORDER_COL);
+
+		/* ------------------------------------------------------------- */
+		/* 3. Query workspace size                                       */
+		/* ------------------------------------------------------------- */
+		long[] bufSize = {0};
+		int alg = CUSPARSE_SPARSETODENSE_ALG_DEFAULT;
+
+		int status = JCusparse.cusparseSparseToDense_bufferSize(handle, matA.asConst(), matB, alg, bufSize);
+		if(status != CUSPARSE_STATUS_SUCCESS) {
+			JCusparse.cusparseDestroyDnMat(matB.asConst());
+			JCusparse.cusparseDestroySpMat(matA.asConst());
+			return status;
+		}
+
+		/* ------------------------------------------------------------- */
+		/* 4. Allocate temporary buffer (if needed)                      */
+		/* ------------------------------------------------------------- */
+		Pointer buffer = new Pointer();
+		if(bufSize[0] > 0)
+			cudaMalloc(buffer, bufSize[0]);
+
+		try {
+			/* --------------------------------------------------------- */
+			/* 5. Perform CSR -> dense conversion                         */
+			/* --------------------------------------------------------- */
+			status = JCusparse.cusparseSparseToDense(handle, matA.asConst(), matB, alg, buffer);
+
+			return status;
+		}
+		finally {
+			/* --------------------------------------------------------- */
+			/* 6. Cleanup                                                */
+			/* --------------------------------------------------------- */
+			if(bufSize[0] > 0)
+				cudaFree(buffer);
+			JCusparse.cusparseDestroyDnMat(matB.asConst());
+			JCusparse.cusparseDestroySpMat(matA.asConst());
+		}
 	}
 
 	@Override
 	public int cusparsedense2csr(cusparseHandle handle, int m, int n, cusparseMatDescr descrA, Pointer A, int lda,
-			Pointer nnzPerRow, Pointer csrValA, Pointer csrRowPtrA, Pointer csrColIndA) {
-		return JCusparse.cusparseDdense2csr(handle, m, n, descrA, A, lda, nnzPerRow, csrValA, csrRowPtrA, csrColIndA);
+		Pointer nnzPerRow, Pointer csrValA, Pointer csrRowPtrA, Pointer csrColIndA) {
+		/* ------------------------------------------------------------------ */
+		/* 1. Determine index base and wrap the input/output in descriptors   */
+		/* ------------------------------------------------------------------ */
+		int idxBase = JCusparse.cusparseGetMatIndexBase(descrA);
+
+		cusparseDnMatDescr matDense = new cusparseDnMatDescr();
+		JCusparse.cusparseCreateDnMat(matDense, m, n, lda, A, CUDA_R_64F, CUSPARSE_ORDER_COL);
+
+		cusparseSpMatDescr matCsr = new cusparseSpMatDescr();
+		/* nnz initially 0 – cuSPARSE fills it during analysis phase */
+		JCusparse.cusparseCreateCsr(matCsr, m, n, 0L, csrRowPtrA, csrColIndA, csrValA, CUSPARSE_INDEX_32I,
+			CUSPARSE_INDEX_32I, idxBase, CUDA_R_64F);
+
+		/* ------------------------------------------------------------------ */
+		/* 2. Query temporary buffer size                                     */
+		/* ------------------------------------------------------------------ */
+		long[] bufSz = {0};
+		int alg = CUSPARSE_DENSETOSPARSE_ALG_DEFAULT;
+
+		int status = JCusparse.cusparseDenseToSparse_bufferSize(handle, matDense.asConst(), matCsr, alg, bufSz);
+		if(status != CUSPARSE_STATUS_SUCCESS) {
+			JCusparse.cusparseDestroySpMat(matCsr.asConst());
+			JCusparse.cusparseDestroyDnMat(matDense.asConst());
+			return status;
+		}
+
+		Pointer buffer = new Pointer();
+		if(bufSz[0] > 0)
+			cudaMalloc(buffer, bufSz[0]);
+
+		try {
+			/* -------------------------------------------------------------- */
+			/* 3. Symbolic pass: decide sparsity pattern, fill csrRowPtrA     */
+			/* -------------------------------------------------------------- */
+			status = JCusparse.cusparseDenseToSparse_analysis(handle, matDense.asConst(), matCsr, alg, buffer);
+			if(status != CUSPARSE_STATUS_SUCCESS)
+				return status;
+
+			/* -------------------------------------------------------------- */
+			/* 4. Numeric conversion: fill csrColIndA and csrValA             */
+			/* -------------------------------------------------------------- */
+			status = JCusparse.cusparseDenseToSparse_convert(handle, matDense.asConst(), matCsr, alg, buffer);
+			if(status != CUSPARSE_STATUS_SUCCESS)
+				return status;
+
+			return status;
+		}
+		finally {
+			if(bufSz[0] > 0)
+				cudaFree(buffer);
+			JCusparse.cusparseDestroySpMat(matCsr.asConst());
+			JCusparse.cusparseDestroyDnMat(matDense.asConst());
+		}
 	}
 
 	@Override
 	public int cusparsennz(cusparseHandle handle, int dirA, int m, int n, cusparseMatDescr descrA, Pointer A, int lda,
-			Pointer nnzPerRowCol, Pointer nnzTotalDevHostPtr) {
+		Pointer nnzPerRowCol, Pointer nnzTotalDevHostPtr) {
 		return JCusparse.cusparseDnnz(handle, dirA, m, n, descrA, A, lda, nnzPerRowCol, nnzTotalDevHostPtr);
 	}
 
diff --git a/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixCuDNN.java b/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixCuDNN.java
index ea6d409f9a5..f32dbd029ff 100644
--- a/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixCuDNN.java
+++ b/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixCuDNN.java
@@ -34,16 +34,21 @@
 import static jcuda.jcudnn.cudnnNanPropagation.CUDNN_PROPAGATE_NAN;
 import static jcuda.jcudnn.cudnnTensorFormat.CUDNN_TENSOR_NCHW;
 import static jcuda.runtime.cudaMemcpyKind.cudaMemcpyDeviceToDevice;
+import static jcuda.jcudnn.cudnnRNNDataLayout.CUDNN_RNN_DATA_LAYOUT_SEQ_MAJOR_PACKED;
+import static jcuda.jcudnn.cudnnForwardMode.CUDNN_FWD_MODE_TRAINING;
 import static jcuda.runtime.JCuda.cudaMemcpy;
 import static jcuda.jcudnn.JCudnn.cudnnBatchNormalizationForwardTraining;
 import static jcuda.jcudnn.JCudnn.cudnnBatchNormalizationForwardInference;
 import static jcuda.jcudnn.JCudnn.cudnnBatchNormalizationBackward;
 import static jcuda.runtime.JCuda.cudaMemset;
+import static jcuda.jcudnn.cudnnSoftmaxAlgorithm.CUDNN_SOFTMAX_ACCURATE;
+import static jcuda.jcudnn.cudnnSoftmaxMode.CUDNN_SOFTMAX_MODE_CHANNEL;
+
+import jcuda.jcudnn.cudnnRNNDataDescriptor;
 import jcuda.CudaException;
 import jcuda.Pointer;
 import jcuda.jcudnn.JCudnn;
 import jcuda.jcudnn.cudnnActivationDescriptor;
-import jcuda.jcudnn.cudnnConvolutionFwdPreference;
 import jcuda.jcudnn.cudnnHandle;
 import jcuda.jcudnn.cudnnStatus;
 import jcuda.jcudnn.cudnnTensorDescriptor;
@@ -62,8 +67,7 @@
 import org.apache.sysds.runtime.matrix.data.LibMatrixDNN.PoolingType;
 import org.apache.sysds.utils.Statistics;
 
-import static jcuda.jcudnn.cudnnSoftmaxAlgorithm.CUDNN_SOFTMAX_ACCURATE;
-import static jcuda.jcudnn.cudnnSoftmaxMode.CUDNN_SOFTMAX_MODE_CHANNEL;
+
 
 /**
  * This class contains method that invoke CuDNN operations.
@@ -73,14 +77,14 @@ public class LibMatrixCuDNN extends LibMatrixCUDA {
 	// Currently we only use nnz information from the sparse matrix which is pre-computed
 	// TODO: experiment how often does dense matrix is empty where recomputing nnz before calling CuDNN will help
 	private static final boolean RECOMPUTE_DENSE_NNZ = false;
-	
-	protected static int CONVOLUTION_PREFERENCE = cudnnConvolutionFwdPreference.CUDNN_CONVOLUTION_FWD_NO_WORKSPACE;
+
+	//protected static int CONVOLUTION_PREFERENCE = cudnnConvolutionFwdPreference.CUDNN_CONVOLUTION_FWD_NO_WORKSPACE;
 	private static final Log LOG = LogFactory.getLog(LibMatrixCuDNN.class.getName());
 
 	protected static cudnnHandle getCudnnHandle(GPUContext gCtx) {
 		return gCtx.getCudnnHandle();
 	}
-	
+
 	/**
 	 * Does a 2D convolution followed by a bias_add
 	 *
@@ -833,130 +837,227 @@ static Pointer getDenseOutputPointer(ExecutionContext ec, GPUContext gCtx, Strin
 	 * @param T sequence length
 	 * @throws DMLRuntimeException if error
 	 */
-	public static void lstm(ExecutionContext ec, GPUContext gCtx, String instName,
-			Pointer X,  Pointer wPointer, Pointer out0, Pointer c0, boolean return_sequences,
-			String outputName, String cyName, int N, int M, int D, int T) throws DMLRuntimeException {
-		singleLayerUnidirectionalRNNForward(ec, gCtx, instName, X, out0, c0, wPointer, outputName, cyName, "lstm", return_sequences, N, M, D, T);
+	public static void lstm(ExecutionContext ec, GPUContext gCtx, String instName, Pointer X, Pointer wPointer,
+		Pointer out0, Pointer c0, boolean return_sequences, String outputName, String cyName, int N, int M, int D,
+		int T) throws DMLRuntimeException {
+		singleLayerUnidirectionalRNNForward(ec, gCtx, instName, X, out0, c0, wPointer, outputName, cyName, "lstm",
+			return_sequences, N, M, D, T);
 	}
-	
+
+	/**
+	 * Run a single-layer, unidirectional RNN/LSTM/GRU forward pass.
+	 *
+	 * @param ec               Execution context
+	 * @param gCtx             GPU context
+	 * @param instName         Instruction name for memory tracking
+	 * @param x                Input  X  (device pointer, shape N×D packed by time)
+	 * @param hx               Initial hidden state H₀ (device pointer, N×M)
+	 * @param cx               Initial cell state   C₀ (only for LSTM, else dummy)
+	 * @param wPointer         Flat weight buffer, already on device
+	 * @param outputName       SystemDS name for Y / last-state     output
+	 * @param cyName           SystemDS name for final cell state   output
+	 * @param rnnMode          "lstm" / "gru" / "rnn_relu" / "rnn_tanh"
+	 * @param return_sequences true ⇒ return the whole Y; false ⇒ only last step
+	 * @param N                Batch size
+	 * @param M                Hidden size
+	 * @param D                Input size
+	 * @param T                Sequence length
+	 */
 	private static void singleLayerUnidirectionalRNNForward(ExecutionContext ec, GPUContext gCtx, String instName,
-			Pointer x, Pointer hx, Pointer cx, Pointer wPointer,  // input
-			String outputName, String cyName,  					 // output
-			String rnnMode, boolean return_sequences, int N, int M, int D, int T) throws DMLRuntimeException {
+		Pointer x, Pointer hx, Pointer cx, Pointer wPointer, String outputName, String cyName, String rnnMode,
+		boolean return_sequences, int N, int M, int D, int T) throws DMLRuntimeException {
 		boolean hasCarry = rnnMode.equalsIgnoreCase(Opcodes.LSTM.toString());
-		// Get output pointers
-		Pointer cudnnYPointer = gCtx.allocate(instName, (long) N *T*M*sizeOfDataType, false);
-		Pointer hyPointer = !return_sequences ? getDenseOutputPointer(ec, gCtx, instName, outputName, N, M) : gCtx.allocate(instName,
-			(long) N*M*sizeOfDataType, false);
+
+		/* ------------------------------------------------------------------ */
+		/* 0. Allocate output buffers                                         */
+		/* ------------------------------------------------------------------ */
+		Pointer yCudnn = gCtx.allocate(instName, (long) N * T * M * sizeOfDataType, false);          // Y from cuDNN
+
+		Pointer hyPointer = !return_sequences ? getDenseOutputPointer(ec, gCtx, instName, outputName, N,
+			M) : gCtx.allocate(instName, (long) N * M * sizeOfDataType, false);
+
 		Pointer cyPointer = hasCarry ? getDenseOutputPointer(ec, gCtx, instName, cyName, N, M) : new Pointer();
-		// Pointer wPointer = getDensePointerForCuDNN(gCtx, w, instName, D+M+2, 4*M);
-		
-		try(LibMatrixCuDNNRnnAlgorithm algo = new LibMatrixCuDNNRnnAlgorithm(ec, gCtx, instName, rnnMode, N, T, M, D, true, wPointer)) {
-			JCudnn.cudnnRNNForwardTraining(gCtx.getCudnnHandle(), algo.rnnDesc, T, 
-					algo.xDesc, x, 
-					algo.hxDesc, hx, 
-					algo.cxDesc, cx, 
-					algo.wDesc, wPointer, 
-					algo.yDesc, cudnnYPointer, 
-					algo.hyDesc, hyPointer, 
-					algo.cyDesc, cyPointer, 
-					algo.workSpace, algo.sizeInBytes, 
-					algo.reserveSpace, algo.reserveSpaceSizeInBytes);
-		}
-		
-		if(return_sequences) {
-			gCtx.cudaFreeHelper(instName, hyPointer, DMLScript.EAGER_CUDA_FREE);
-			Pointer sysdsYPointer = getDenseOutputPointer(ec, gCtx, instName, outputName, N, T*M);
-			LibMatrixCUDA.getCudaKernels(gCtx).launchKernel("prepare_lstm_output",
-					ExecutionConfig.getConfigForSimpleVectorOperations(N*T*M),
-					sysdsYPointer, cudnnYPointer, N, T, M, N*T*M);
+
+		/* ------------------------------------------------------------------ */
+		/* 1. Build helper with v8 RNN descriptor                             */
+		/* ------------------------------------------------------------------ */
+		try(LibMatrixCuDNNRnnAlgorithm algo = new LibMatrixCuDNNRnnAlgorithm(ec, gCtx, instName, rnnMode, N, T, M, D,
+			/*training*/true, wPointer)) {
+			/* -------------------------------------------------------------- */
+			/* 1a. Single RNN-DATA descriptors for X and Y                    */
+			/* -------------------------------------------------------------- */
+			cudnnRNNDataDescriptor xDesc = new cudnnRNNDataDescriptor();
+			JCudnn.cudnnCreateRNNDataDescriptor(xDesc);
+			JCudnn.cudnnSetRNNDataDescriptor(xDesc, LibMatrixCUDA.CUDNN_DATA_TYPE,
+				CUDNN_RNN_DATA_LAYOUT_SEQ_MAJOR_PACKED, T, N, D, null,
+				null);                             // uniform length = T
+
+			cudnnRNNDataDescriptor yDesc = new cudnnRNNDataDescriptor();
+			JCudnn.cudnnCreateRNNDataDescriptor(yDesc);
+			JCudnn.cudnnSetRNNDataDescriptor(yDesc, LibMatrixCUDA.CUDNN_DATA_TYPE,
+				CUDNN_RNN_DATA_LAYOUT_SEQ_MAJOR_PACKED, T, N, M, null, null);
+
+			/* -------------------------------------------------------------- */
+			/* 1b. Obtain size cuDNN expects for packed weight-space          */
+			/*     and reuse existing wPointer buffer                         */
+			/* -------------------------------------------------------------- */
+			long[] wSpaceBytes = {0};
+			JCudnn.cudnnGetRNNWeightSpaceSize(gCtx.getCudnnHandle(), algo.rnnDesc, wSpaceBytes);
+			long weightSpaceSize = wSpaceBytes[0];
+			Pointer weightSpace = wPointer;  // assume caller already packed
+
+			/* -------------------------------------------------------------- */
+			/* 2. Forward pass (training mode)                                */
+			/* -------------------------------------------------------------- */
+			JCudnn.cudnnRNNForward(gCtx.getCudnnHandle(), algo.rnnDesc, CUDNN_FWD_MODE_TRAINING,
+				// unified API flag
+				null,                                // devSeqLengths (uniform)
+				xDesc, x, yDesc, yCudnn, algo.hxDesc, hx, hyPointer, algo.cxDesc, cx, cyPointer, weightSpaceSize,
+				weightSpace, algo.sizeInBytes, algo.workSpace, algo.reserveSpaceSizeInBytes, algo.reserveSpace);
+
+			/* ------------------------------------------------------------------ */
+			/* 3. Copy / reshape Y when user asked for full sequences              */
+			/* ------------------------------------------------------------------ */
+			if(return_sequences) {
+				gCtx.cudaFreeHelper(instName, hyPointer, DMLScript.EAGER_CUDA_FREE);
+
+				Pointer ySysds = getDenseOutputPointer(ec, gCtx, instName, outputName, N, (long) T * M);
+
+				LibMatrixCUDA.getCudaKernels(gCtx)
+					.launchKernel("prepare_lstm_output", ExecutionConfig.getConfigForSimpleVectorOperations(N * T * M),
+						ySysds, yCudnn, N, T, M, N * T * M);
+			}
+
+			/* ------------------------------------------------------------------ */
+			/* 4. Free temporaries                                                */
+			/* ------------------------------------------------------------------ */
+			gCtx.cudaFreeHelper(instName, yCudnn, DMLScript.EAGER_CUDA_FREE);
+			JCudnn.cudnnDestroyRNNDataDescriptor(xDesc);
+			JCudnn.cudnnDestroyRNNDataDescriptor(yDesc);
 		}
-		gCtx.cudaFreeHelper(instName, cudnnYPointer, DMLScript.EAGER_CUDA_FREE);
 	}
-	
-	public static void lstmBackward(ExecutionContext ec, GPUContext gCtx, String instName,
-			Pointer x, Pointer hx, Pointer cx, Pointer wPointer, String doutName, String dcyName,  // input
-			String dxName, String dwName, String dbName, String dhxName, String dcxName,  	// output
-			boolean return_sequences, int N, int M, int D, int T) throws DMLRuntimeException {
-		// Transform the input dout and prepare them for cudnnRNNBackwardData
-		Pointer dy = gCtx.allocate(instName, (long) N *T*M*sizeOfDataType, false);
-		int size = return_sequences ? N*T*M : N*M;
+
+	public static void lstmBackward(ExecutionContext ec, GPUContext gCtx, String instName, Pointer x, Pointer hx,
+		Pointer cx, Pointer wPointer,          // inputs
+		String doutName, String dcyName,                              // grad-in
+		String dxName, String dwName, String dbName,                  // grad-out
+		String dhxName, String dcxName, boolean return_sequences, int N, int M, int D, int T)
+		throws DMLRuntimeException {
+		/* ------------------------------------------------------------------ */
+		/* 0. Prepare dY from dout (SystemDS layout → cuDNN layout)           */
+		/* ------------------------------------------------------------------ */
+		long elemsY = (long) N * T * M;
+		Pointer dY = gCtx.allocate(instName, elemsY * sizeOfDataType, false);
+		Pointer yPointer = gCtx.allocate(instName, (long) N * T * M * sizeOfDataType, false);
+
+		long doutElems = return_sequences ? elemsY : (long) N * M;
 		LibMatrixCUDA.getCudaKernels(gCtx).launchKernel("prepare_lstm_backward_gradients",
-				ExecutionConfig.getConfigForSimpleVectorOperations(size),
-				getDenseInputPointer(ec, gCtx, instName, doutName, N, return_sequences ? (long) T*M : M),
-				dy, N, T, M, size, return_sequences ? 1 : 0);
+			ExecutionConfig.getConfigForSimpleVectorOperations((int) doutElems),
+			getDenseInputPointer(ec, gCtx, instName, doutName, N, return_sequences ? (long) T * M : M), dY, N, T, M,
+			doutElems, return_sequences ? 1 : 0);
+
 		ec.releaseMatrixInputForGPUInstruction(doutName);
-				
-		// Allocate intermediate pointers computed by forward
-		Pointer yPointer = gCtx.allocate(instName, (long) N *T*M*sizeOfDataType, false);
-		try(LibMatrixCuDNNRnnAlgorithm algo = new LibMatrixCuDNNRnnAlgorithm(ec, gCtx, instName, "lstm", N, T, M, D, true, wPointer)) {
-			JCudnn.cudnnRNNForwardTraining(gCtx.getCudnnHandle(), algo.rnnDesc, T, 
-					algo.xDesc, x, 
-					algo.hxDesc, hx, 
-					algo.cxDesc, cx, 
-					algo.wDesc, wPointer, 
-					algo.yDesc, yPointer, 
-					algo.hyDesc, new Pointer(), 
-					algo.cyDesc, new Pointer(), 
-					algo.workSpace, algo.sizeInBytes, 
-					algo.reserveSpace, algo.reserveSpaceSizeInBytes);
-			
-			Pointer cudnnDx = gCtx.allocate(instName, (long) N *T*D*LibMatrixCUDA.sizeOfDataType, false);
-			JCudnn.cudnnRNNBackwardData(gCtx.getCudnnHandle(), algo.rnnDesc, T, 
-					algo.yDesc, yPointer,
-					// ----------------------
-					// Additional inputs:
-					algo.dyDesc, dy, 
-					algo.dhyDesc, new Pointer(), 
-					algo.dcyDesc, getDenseInputPointer(ec, gCtx, instName, dcyName, N, M),
-					// ----------------------
-					algo.wDesc, wPointer, 
-					algo.hxDesc, hx,
-					algo.cxDesc, cx,
-					// ----------------------
-					// Output:
-					algo.dxDesc, cudnnDx, 
-					algo.dhxDesc, getDenseOutputPointer(ec, gCtx, instName, dhxName, N, M), 
-					algo.dcxDesc, getDenseOutputPointer(ec, gCtx, instName, dcxName, N, M),
-					// ----------------------
-					algo.workSpace, algo.sizeInBytes, 
-					algo.reserveSpace, algo.reserveSpaceSizeInBytes);
-			gCtx.cudaFreeHelper(instName, dy, DMLScript.EAGER_CUDA_FREE);
+
+		/* ------------------------------------------------------------------ */
+		/* 1. Build helper → rnnDesc (v8) and workspace sizes                 */
+		/* ------------------------------------------------------------------ */
+		try(LibMatrixCuDNNRnnAlgorithm algo = new LibMatrixCuDNNRnnAlgorithm(ec, gCtx, instName, "lstm", N, T, M, D,
+			/*training*/true, wPointer)) {
+			/* -------------------------------------------------------------- */
+			/* 1a. Create single RNN-DATA descriptors for X and Y             */
+			/* -------------------------------------------------------------- */
+			cudnnRNNDataDescriptor xDesc = new cudnnRNNDataDescriptor();
+			JCudnn.cudnnCreateRNNDataDescriptor(xDesc);
+			JCudnn.cudnnSetRNNDataDescriptor(xDesc, LibMatrixCUDA.CUDNN_DATA_TYPE,
+				CUDNN_RNN_DATA_LAYOUT_SEQ_MAJOR_PACKED, T, N, D, null, null);
+
+			cudnnRNNDataDescriptor yDesc = new cudnnRNNDataDescriptor();
+			JCudnn.cudnnCreateRNNDataDescriptor(yDesc);
+			JCudnn.cudnnSetRNNDataDescriptor(yDesc, LibMatrixCUDA.CUDNN_DATA_TYPE,
+				CUDNN_RNN_DATA_LAYOUT_SEQ_MAJOR_PACKED, T, N, M, null, null);
+
+			/* -------------------------------------------------------------- */
+			/* 1b. Packed weight-space info                                   */
+			/* -------------------------------------------------------------- */
+			long[] wSpaceBytes = {0};
+			JCudnn.cudnnGetRNNWeightSpaceSize(gCtx.getCudnnHandle(), algo.rnnDesc, wSpaceBytes);
+			long weightSpaceSize = wSpaceBytes[0];
+			Pointer weightSpace = wPointer;   // flat weights already packed
+
+			/* -------------------------------------------------------------- */
+			/* 2. Forward pass (needed to fill reserve-space)                 */
+			/* -------------------------------------------------------------- */
+			JCudnn.cudnnRNNForward(gCtx.getCudnnHandle(), algo.rnnDesc, CUDNN_FWD_MODE_TRAINING, null, xDesc, x, yDesc,
+				yPointer,            // algo.yTemp sized N·T·M
+				algo.hxDesc, hx, new Pointer(),   // hy unused
+				algo.cxDesc, cx, new Pointer(),   // cy unused
+				weightSpaceSize, weightSpace, algo.sizeInBytes, algo.workSpace, algo.reserveSpaceSizeInBytes,
+				algo.reserveSpace);
+
+			/* -------------------------------------------------------------- */
+			/* 3. Back-prop through time: dX, dH₀, dC₀                        */
+			/* -------------------------------------------------------------- */
+			Pointer dX = gCtx.allocate(instName, (long) N * T * D * sizeOfDataType, false);
+
+			JCudnn.cudnnRNNBackwardData_v8(gCtx.getCudnnHandle(), algo.rnnDesc, null,
+				// devSeqLengths
+				yDesc, yPointer, dY,          // y, dy
+				xDesc, dX,                      // out: dx
+				algo.hxDesc, hx, new Pointer(),                  // dhy = 0
+				getDenseOutputPointer(ec, gCtx, instName, dhxName, N, M), algo.cxDesc, cx,
+				getDenseInputPointer(ec, gCtx, instName, dcyName, N, M),
+				getDenseOutputPointer(ec, gCtx, instName, dcxName, N, M), weightSpaceSize, weightSpace,
+				algo.sizeInBytes, algo.workSpace, algo.reserveSpaceSizeInBytes, algo.reserveSpace);
+
 			ec.releaseMatrixInputForGPUInstruction(dcyName);
 			ec.releaseMatrixOutputForGPUInstruction(dhxName);
 			ec.releaseMatrixOutputForGPUInstruction(dcxName);
-			
-			Pointer smlDx = getDenseOutputPointer(ec, gCtx, instName, dxName, N, T*D);
-			LibMatrixCUDA.getCudaKernels(gCtx).launchKernel("prepare_lstm_dinput",
-					ExecutionConfig.getConfigForSimpleVectorOperations(N*T*D),
-					smlDx, cudnnDx, N, D, T*D, N*T*D);
+
+			/* Copy dX back into SystemDS layout --------------------------- */
+			Pointer sysdsDx = getDenseOutputPointer(ec, gCtx, instName, dxName, N, (long) T * D);
+
+			LibMatrixCUDA.getCudaKernels(gCtx)
+				.launchKernel("prepare_lstm_dinput", ExecutionConfig.getConfigForSimpleVectorOperations(N * T * D),
+					sysdsDx, dX, N, D, T * D, N * T * D);
+
 			ec.releaseMatrixOutputForGPUInstruction(dxName);
-			gCtx.cudaFreeHelper(instName, cudnnDx, DMLScript.EAGER_CUDA_FREE);
-			
-			// -------------------------------------------------------------------------------------------
-			Pointer cudnnDwPointer = gCtx.allocate(instName, (D+M+2)*(4L *M)*LibMatrixCUDA.sizeOfDataType, false);
-			JCudnn.cudnnRNNBackwardWeights(gCtx.getCudnnHandle(), algo.rnnDesc, T, 
-					algo.xDesc, x, 
-					algo.hxDesc, hx, 
-					algo.yDesc, yPointer, 
-					algo.workSpace, algo.sizeInBytes, 
-					algo.dwDesc, cudnnDwPointer, 
-					algo.reserveSpace, algo.reserveSpaceSizeInBytes);
+			gCtx.cudaFreeHelper(instName, dX, DMLScript.EAGER_CUDA_FREE);
+
+			/* -------------------------------------------------------------- */
+			/* 4. Weight & bias gradients                                     */
+			/* -------------------------------------------------------------- */
+			long dWeightBytes = weightSpaceSize;
+			Pointer dWeightSpace = gCtx.allocate(instName, dWeightBytes, false);
+
+			JCudnn.cudnnRNNBackwardWeights_v8(gCtx.getCudnnHandle(), algo.rnnDesc,
+				/*addGrad=*/0, null,                     // devSeqLengths
+				xDesc, x, algo.hxDesc, hx, yDesc, yPointer, dWeightBytes, dWeightSpace, algo.sizeInBytes,
+				algo.workSpace, algo.reserveSpaceSizeInBytes, algo.reserveSpace);
+
+			/* Split packed dWeightSpace into SystemDS tensors ------------- */
 			LibMatrixCUDA.getCudaKernels(gCtx).launchKernel("prepare_lstm_dweight",
-					ExecutionConfig.getConfigForSimpleVectorOperations((D+M+2)*(4*M)),
-					getDenseOutputPointer(ec, gCtx, instName, dwName, D+M, 4*M), 
-					getDenseOutputPointer(ec, gCtx, instName, dbName, 1, 4*M), cudnnDwPointer, D, M);
-			gCtx.cudaFreeHelper(instName, cudnnDwPointer, DMLScript.EAGER_CUDA_FREE);
+				ExecutionConfig.getConfigForSimpleVectorOperations((D + M + 2) * (4 * M)),
+				getDenseOutputPointer(ec, gCtx, instName, dwName, D + M, 4L * M),
+				getDenseOutputPointer(ec, gCtx, instName, dbName, 1, 4L * M), dWeightSpace, D, M);
+
+			gCtx.cudaFreeHelper(instName, dWeightSpace, DMLScript.EAGER_CUDA_FREE);
 			ec.releaseMatrixOutputForGPUInstruction(dwName);
 			ec.releaseMatrixOutputForGPUInstruction(dbName);
-			// -------------------------------------------------------------------------------------------
-			
+
+			/* -------------------------------------------------------------- */
+			/* 5. Free temporaries                                            */
+			/* -------------------------------------------------------------- */
+			gCtx.cudaFreeHelper(instName, dY, DMLScript.EAGER_CUDA_FREE);
 			gCtx.cudaFreeHelper(instName, yPointer, DMLScript.EAGER_CUDA_FREE);
+
+			JCudnn.cudnnDestroyRNNDataDescriptor(xDesc);
+			JCudnn.cudnnDestroyRNNDataDescriptor(yDesc);
 		}
 	}
-	
-	
-	
+
+
+
+
 	/**
 	 * Performs the forward BatchNormalization layer computation for training
 	 * @param gCtx   a valid {@link GPUContext}
diff --git a/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixCuDNNConvolutionAlgorithm.java b/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixCuDNNConvolutionAlgorithm.java
index ac12b509b4c..fab29fd1e81 100644
--- a/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixCuDNNConvolutionAlgorithm.java
+++ b/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixCuDNNConvolutionAlgorithm.java
@@ -20,11 +20,15 @@
 package org.apache.sysds.runtime.matrix.data;
 
 import jcuda.Pointer;
-import jcuda.jcudnn.cudnnConvolutionBwdFilterPreference;
+import jcuda.jcudnn.cudnnConvolutionBwdDataAlgo;
+import jcuda.jcudnn.cudnnConvolutionBwdDataAlgoPerf;
+import jcuda.jcudnn.cudnnConvolutionBwdFilterAlgo;
+import jcuda.jcudnn.cudnnConvolutionBwdFilterAlgoPerf;
 import jcuda.jcudnn.cudnnConvolutionDescriptor;
 import jcuda.jcudnn.cudnnConvolutionFwdAlgo;
 import jcuda.jcudnn.cudnnFilterDescriptor;
 import jcuda.jcudnn.cudnnTensorDescriptor;
+
 import static jcuda.jcudnn.JCudnn.cudnnCreateConvolutionDescriptor;
 import static jcuda.jcudnn.JCudnn.cudnnCreateFilterDescriptor;
 import static jcuda.jcudnn.JCudnn.cudnnCreateTensorDescriptor;
@@ -167,22 +171,38 @@ public static LibMatrixCuDNNConvolutionAlgorithm cudnnGetConvolutionForwardAlgor
 	public static LibMatrixCuDNNConvolutionAlgorithm cudnnGetConvolutionBackwardFilterAlgorithm(
 			GPUContext gCtx, String instName, int N, int C, int H, int W, int K, int R, int S, 
 			int pad_h, int pad_w, int stride_h, int stride_w, int P, int Q, long workspaceLimit) {
-		LibMatrixCuDNNConvolutionAlgorithm ret = new LibMatrixCuDNNConvolutionAlgorithm(gCtx, instName, N, C, H, W, K, R, S, 
-				pad_h, pad_w, stride_h, stride_w, P, Q);
-		
-		int[] algos = {-1};
-		long[] sizeInBytesArray = {Math.min(workspaceLimit, MAX_WORKSPACE_LIMIT_BYTES)};
-		jcuda.jcudnn.JCudnn.cudnnGetConvolutionBackwardFilterAlgorithm(
-				LibMatrixCuDNN.getCudnnHandle(gCtx), 
-				ret.nchwTensorDesc, ret.nkpqTensorDesc, ret.convDesc, ret.filterDesc, 
-				cudnnConvolutionBwdFilterPreference.CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT, sizeInBytesArray[0], algos);
-		jcuda.jcudnn.JCudnn.cudnnGetConvolutionBackwardFilterWorkspaceSize(LibMatrixCuDNN.getCudnnHandle(gCtx), 
-				ret.nchwTensorDesc, ret.nkpqTensorDesc, ret.convDesc, ret.filterDesc, algos[0], sizeInBytesArray);
-		if (sizeInBytesArray[0] != 0)
+		LibMatrixCuDNNConvolutionAlgorithm ret = new LibMatrixCuDNNConvolutionAlgorithm(gCtx, instName, N, C, H, W, K,
+			R, S, pad_h, pad_w, stride_h, stride_w, P, Q);
+
+		final int maxAlgos = cudnnConvolutionBwdFilterAlgo.CUDNN_CONVOLUTION_BWD_FILTER_ALGO_COUNT;
+		cudnnConvolutionBwdFilterAlgoPerf[] perf = new cudnnConvolutionBwdFilterAlgoPerf[maxAlgos];
+		for(int i = 0; i < maxAlgos; ++i)
+			perf[i] = new cudnnConvolutionBwdFilterAlgoPerf();
+		int[] returnedAlgoCount = {0};
+		jcuda.jcudnn.JCudnn.cudnnGetConvolutionBackwardFilterAlgorithm_v7(LibMatrixCuDNN.getCudnnHandle(gCtx),
+			ret.nchwTensorDesc, ret.nkpqTensorDesc, ret.convDesc, ret.filterDesc, maxAlgos, returnedAlgoCount, perf);
+
+		long workspaceCap = Math.min(workspaceLimit, MAX_WORKSPACE_LIMIT_BYTES);
+		int chosenAlgo = cudnnConvolutionBwdFilterAlgo.CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0;
+		long chosenWs = 0;
+
+		for(int i = 0; i < returnedAlgoCount[0]; ++i) {
+			if(perf[i].memory <= workspaceCap) {
+				chosenAlgo = perf[i].algo;
+				chosenWs = perf[i].memory;
+				break;
+			}
+		}
+
+		long[] sizeInBytesArray = {chosenWs};
+
+		jcuda.jcudnn.JCudnn.cudnnGetConvolutionBackwardFilterWorkspaceSize(LibMatrixCuDNN.getCudnnHandle(gCtx),
+			ret.nchwTensorDesc, ret.nkpqTensorDesc, ret.convDesc, ret.filterDesc, chosenAlgo, sizeInBytesArray);
+		if(sizeInBytesArray[0] != 0)
 			ret.workSpace = gCtx.allocate(instName, sizeInBytesArray[0], false);
 		ret.sizeInBytes = sizeInBytesArray[0];
-		ret.algo = algos[0];
-		
+		ret.algo = chosenAlgo;
+
 		return ret;
 	}
 	
@@ -220,18 +240,36 @@ public static LibMatrixCuDNNConvolutionAlgorithm cudnnGetConvolutionBackwardData
 			ret.algo = jcuda.jcudnn.cudnnConvolutionBwdDataAlgo.CUDNN_CONVOLUTION_BWD_DATA_ALGO_0;
 		}
 		else {
-			int[] algos = {-1};
-			long[] sizeInBytesArray = {Math.min(workspaceLimit, MAX_WORKSPACE_LIMIT_BYTES)};
-			jcuda.jcudnn.JCudnn.cudnnGetConvolutionBackwardDataAlgorithm(
-					LibMatrixCuDNN.getCudnnHandle(gCtx), 
-					ret.filterDesc, ret.nkpqTensorDesc, ret.convDesc, ret.nchwTensorDesc,
-					jcuda.jcudnn.cudnnConvolutionBwdDataPreference.CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT, sizeInBytesArray[0], algos);
-			jcuda.jcudnn.JCudnn.cudnnGetConvolutionBackwardDataWorkspaceSize(LibMatrixCuDNN.getCudnnHandle(gCtx), 
-					ret.filterDesc, ret.nkpqTensorDesc, ret.convDesc, ret.nchwTensorDesc, algos[0], sizeInBytesArray);
-			if (sizeInBytesArray[0] != 0)
+			final int max = cudnnConvolutionBwdDataAlgo.CUDNN_CONVOLUTION_BWD_DATA_ALGO_COUNT;
+			cudnnConvolutionBwdDataAlgoPerf[] perf = new cudnnConvolutionBwdDataAlgoPerf[max];
+			for(int i = 0; i < max; ++i)
+				perf[i] = new cudnnConvolutionBwdDataAlgoPerf();
+			int[] nReturned = {0};
+
+			jcuda.jcudnn.JCudnn.cudnnGetConvolutionBackwardDataAlgorithm_v7(LibMatrixCuDNN.getCudnnHandle(gCtx),
+				ret.filterDesc, ret.nkpqTensorDesc, ret.convDesc, ret.nchwTensorDesc, max, nReturned, perf);
+
+			long cap = Math.min(workspaceLimit, MAX_WORKSPACE_LIMIT_BYTES);
+			int chosenAlgo = cudnnConvolutionBwdDataAlgo.CUDNN_CONVOLUTION_BWD_DATA_ALGO_0;
+			long chosenWs = 0;
+
+			for(int i = 0; i < nReturned[0]; ++i) {
+				if(perf[i].memory <= cap) {
+					chosenAlgo = perf[i].algo;
+					chosenWs = perf[i].memory;
+					break;
+				}
+			}
+
+			long[] sizeInBytesArray = {chosenWs};
+			jcuda.jcudnn.JCudnn.cudnnGetConvolutionBackwardDataWorkspaceSize(LibMatrixCuDNN.getCudnnHandle(gCtx),
+				ret.filterDesc, ret.nkpqTensorDesc, ret.convDesc, ret.nchwTensorDesc, chosenAlgo, sizeInBytesArray);
+
+			if(sizeInBytesArray[0] != 0)
 				ret.workSpace = gCtx.allocate(instName, sizeInBytesArray[0], false);
+
 			ret.sizeInBytes = sizeInBytesArray[0];
-			ret.algo = algos[0];
+			ret.algo = chosenAlgo;
 		}
 		return ret;
 	}
diff --git a/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixCuDNNRnnAlgorithm.java b/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixCuDNNRnnAlgorithm.java
index abcb4f0bc07..55f47cb9487 100644
--- a/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixCuDNNRnnAlgorithm.java
+++ b/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixCuDNNRnnAlgorithm.java
@@ -26,7 +26,18 @@
 import static jcuda.jcudnn.JCudnn.cudnnSetTensorNdDescriptor;
 import static jcuda.jcudnn.JCudnn.cudnnDestroyDropoutDescriptor;
 import static jcuda.jcudnn.JCudnn.cudnnDestroyRNNDescriptor;
+import static jcuda.jcudnn.JCudnn.cudnnGetRNNWeightSpaceSize;
 import static jcuda.jcudnn.cudnnTensorFormat.CUDNN_TENSOR_NCHW;
+import static jcuda.jcudnn.cudnnDataType.CUDNN_DATA_HALF;
+import static jcuda.jcudnn.cudnnDataType.CUDNN_DATA_BFLOAT16;
+import static jcuda.jcudnn.cudnnMathType.CUDNN_DEFAULT_MATH;
+import static jcuda.jcudnn.cudnnMathType.CUDNN_TENSOR_OP_MATH;
+import static jcuda.jcudnn.cudnnRNNAlgo.CUDNN_RNN_ALGO_STANDARD;
+import static jcuda.jcudnn.cudnnRNNBiasMode.CUDNN_RNN_DOUBLE_BIAS;
+import static jcuda.jcudnn.cudnnDirectionMode.CUDNN_UNIDIRECTIONAL;
+import static jcuda.jcudnn.cudnnRNNInputMode.CUDNN_LINEAR_INPUT;
+import static jcuda.jcudnn.cudnnRNNDataLayout.CUDNN_RNN_DATA_LAYOUT_SEQ_MAJOR_PACKED;
+import static jcuda.jcudnn.cudnnForwardMode.CUDNN_FWD_MODE_TRAINING;
 
 import org.apache.sysds.api.DMLScript;
 import org.apache.sysds.common.Opcodes;
@@ -35,9 +46,6 @@
 import org.apache.sysds.runtime.instructions.gpu.context.GPUContext;
 
 import static jcuda.jcudnn.JCudnn.cudnnCreateRNNDescriptor;
-import static jcuda.jcudnn.cudnnRNNInputMode.CUDNN_LINEAR_INPUT;
-import static jcuda.jcudnn.cudnnDirectionMode.CUDNN_UNIDIRECTIONAL;
-import static jcuda.jcudnn.cudnnRNNAlgo.CUDNN_RNN_ALGO_STANDARD;
 
 import jcuda.Pointer;
 import jcuda.jcudnn.JCudnn;
@@ -45,6 +53,7 @@
 import jcuda.jcudnn.cudnnFilterDescriptor;
 import jcuda.jcudnn.cudnnRNNDescriptor;
 import jcuda.jcudnn.cudnnTensorDescriptor;
+import jcuda.jcudnn.cudnnRNNDataDescriptor;
 
 public class LibMatrixCuDNNRnnAlgorithm implements java.lang.AutoCloseable {
 	GPUContext gCtx;
@@ -52,6 +61,7 @@ public class LibMatrixCuDNNRnnAlgorithm implements java.lang.AutoCloseable {
 	cudnnDropoutDescriptor dropoutDesc;
 	cudnnRNNDescriptor rnnDesc;
 	cudnnTensorDescriptor[] xDesc, dxDesc, yDesc, dyDesc; // of length T
+	cudnnRNNDataDescriptor xDataDesc;
 	cudnnTensorDescriptor hxDesc, cxDesc, hyDesc, cyDesc, dhxDesc, dcxDesc, dhyDesc, dcyDesc; 
 	cudnnFilterDescriptor wDesc;
 	cudnnFilterDescriptor dwDesc;
@@ -90,25 +100,37 @@ public LibMatrixCuDNNRnnAlgorithm(ExecutionContext ec, GPUContext gCtx, String i
 		JCudnn.cudnnDropoutGetStatesSize(gCtx.getCudnnHandle(), _dropOutSizeInBytes);
 		dropOutSizeInBytes = _dropOutSizeInBytes[0];
 		dropOutStateSpace = new Pointer();
-		if (dropOutSizeInBytes != 0)
+		if(dropOutSizeInBytes != 0)
 			dropOutStateSpace = gCtx.allocate(instName, dropOutSizeInBytes, false);
-		JCudnn.cudnnSetDropoutDescriptor(dropoutDesc, gCtx.getCudnnHandle(), 0, dropOutStateSpace, dropOutSizeInBytes, 12345);
-		
+		JCudnn.cudnnSetDropoutDescriptor(dropoutDesc, gCtx.getCudnnHandle(), 0, dropOutStateSpace, dropOutSizeInBytes,
+			12345);
+
 		// Initialize RNN descriptor
 		rnnDesc = new cudnnRNNDescriptor();
 		cudnnCreateRNNDescriptor(rnnDesc);
-		JCudnn.cudnnSetRNNDescriptor_v6(gCtx.getCudnnHandle(), rnnDesc, M, 1, dropoutDesc, 
-				CUDNN_LINEAR_INPUT, CUDNN_UNIDIRECTIONAL, 
-				getCuDNNRnnMode(rnnMode), CUDNN_RNN_ALGO_STANDARD, LibMatrixCUDA.CUDNN_DATA_TYPE);
-		
+		int mathType = (LibMatrixCUDA.CUDNN_DATA_TYPE == CUDNN_DATA_HALF ||
+			LibMatrixCUDA.CUDNN_DATA_TYPE == CUDNN_DATA_BFLOAT16) ? CUDNN_TENSOR_OP_MATH : CUDNN_DEFAULT_MATH;
+		int mathPrec = LibMatrixCUDA.CUDNN_DATA_TYPE;
+		JCudnn.cudnnSetRNNDescriptor_v8(rnnDesc, CUDNN_RNN_ALGO_STANDARD, getCuDNNRnnMode(rnnMode),
+			CUDNN_RNN_DOUBLE_BIAS, CUDNN_UNIDIRECTIONAL, CUDNN_LINEAR_INPUT, LibMatrixCUDA.CUDNN_DATA_TYPE, mathPrec,
+			mathType, D, M, 0, 1, dropoutDesc, 0);
+
+		// ── inside the constructor, after rnnDesc has been configured ──
+		xDataDesc = new cudnnRNNDataDescriptor();
+		JCudnn.cudnnCreateRNNDataDescriptor(xDataDesc);
+		JCudnn.cudnnSetRNNDataDescriptor(xDataDesc, LibMatrixCUDA.CUDNN_DATA_TYPE,
+			CUDNN_RNN_DATA_LAYOUT_SEQ_MAJOR_PACKED, T, N, D, null, null);
+
 		// Allocate filter descriptor
 		int expectedNumWeights = getExpectedNumWeights();
-		if(rnnMode.equalsIgnoreCase(Opcodes.LSTM.toString()) && (D+M+2)*4*M != expectedNumWeights) {
-			throw new DMLRuntimeException("Incorrect number of RNN parameters " +  (D+M+2)*4*M + " != " +  expectedNumWeights + ", where numFeatures=" + D + ", hiddenSize=" + M);
+		if(rnnMode.equalsIgnoreCase(Opcodes.LSTM.toString()) && (D + M + 2) * 4 * M != expectedNumWeights) {
+			throw new DMLRuntimeException(
+				"Incorrect number of RNN parameters " + (D + M + 2) * 4 * M + " != " + expectedNumWeights +
+					", where numFeatures=" + D + ", hiddenSize=" + M);
 		}
 		wDesc = allocateFilterDescriptor(expectedNumWeights);
 		dwDesc = allocateFilterDescriptor(expectedNumWeights);
-		
+
 		// Setup workspace
 		workSpace = new Pointer(); reserveSpace = new Pointer();
 		sizeInBytes = getWorkspaceSize(T);
@@ -142,15 +164,25 @@ else if(rnnMode.equalsIgnoreCase("gru")) {
 	}
 	
 	private long getWorkspaceSize(int seqLength) {
-		long [] sizeInBytesArray = new long[1];
-		JCudnn.cudnnGetRNNWorkspaceSize(gCtx.getCudnnHandle(), rnnDesc, seqLength, xDesc, sizeInBytesArray);
-		return sizeInBytesArray[0];
+		long[] workSize = {0};
+		long[] reserveSize = {0};          // v9 returns both sizes in one call
+
+		JCudnn.cudnnGetRNNTempSpaceSizes(gCtx.getCudnnHandle(), rnnDesc, CUDNN_FWD_MODE_TRAINING, xDataDesc, workSize,
+			reserveSize);
+
+		// keep reserveSpaceSizeInBytes in sync with the new API
+		reserveSpaceSizeInBytes = reserveSize[0];
+		return workSize[0];
 	}
 	
 	private long getReservespaceSize(int seqLength) {
-		long [] sizeInBytesArray = new long[1];
-		JCudnn.cudnnGetRNNTrainingReserveSize(gCtx.getCudnnHandle(), rnnDesc, seqLength, xDesc, sizeInBytesArray);
-		return sizeInBytesArray[0];
+		long[] workSize = {0};
+		long[] reserveSize = {0};
+
+		JCudnn.cudnnGetRNNTempSpaceSizes(gCtx.getCudnnHandle(), rnnDesc, CUDNN_FWD_MODE_TRAINING, xDataDesc, workSize,
+			reserveSize);
+
+		return reserveSize[0];
 	}
 	
 	private static int getCuDNNRnnMode(String rnnMode) throws DMLRuntimeException {
@@ -174,10 +206,17 @@ else if(rnnMode.equalsIgnoreCase("gru")) {
 	}
 	
 	private int getExpectedNumWeights() throws DMLRuntimeException {
-		long [] weightSizeInBytesArray = {-1}; // (D+M+2)*4*M
-		JCudnn.cudnnGetRNNParamsSize(gCtx.getCudnnHandle(), rnnDesc, xDesc[0], weightSizeInBytesArray, LibMatrixCUDA.CUDNN_DATA_TYPE);
-		// check if (D+M+2)*4M == weightsSize / sizeof(dataType) where weightsSize is given by 'cudnnGetRNNParamsSize'.
-		return LibMatrixCUDA.toInt(weightSizeInBytesArray[0]/LibMatrixCUDA.sizeOfDataType);
+		long[] weightSpaceBytes = {-1};
+
+		// v9 API: returns the size (in bytes) of the packed “weight space”
+		cudnnGetRNNWeightSpaceSize(gCtx.getCudnnHandle(), rnnDesc, weightSpaceBytes);
+
+		if(weightSpaceBytes[0] < 0)
+			throw new DMLRuntimeException("cuDNN returned a negative weight-space size");
+
+		// convert from bytes to number of scalars
+		long numScalars = weightSpaceBytes[0] / LibMatrixCUDA.sizeOfDataType;
+		return LibMatrixCUDA.toInt(numScalars);
 	}
 	
 	private static cudnnFilterDescriptor allocateFilterDescriptor(int numWeights) {
diff --git a/src/main/java/org/apache/sysds/runtime/matrix/data/SinglePrecisionCudaSupportFunctions.java b/src/main/java/org/apache/sysds/runtime/matrix/data/SinglePrecisionCudaSupportFunctions.java
index e25a68aef35..3c9c47ed893 100644
--- a/src/main/java/org/apache/sysds/runtime/matrix/data/SinglePrecisionCudaSupportFunctions.java
+++ b/src/main/java/org/apache/sysds/runtime/matrix/data/SinglePrecisionCudaSupportFunctions.java
@@ -42,42 +42,235 @@
 import jcuda.jcusparse.JCusparse;
 import jcuda.jcusparse.cusparseHandle;
 import jcuda.jcusparse.cusparseMatDescr;
+import jcuda.jcusparse.cusparseSpMatDescr;
+import jcuda.jcusparse.cusparseSpGEMMDescr;
+
+import static jcuda.jcusparse.JCusparse.cusparseCreateCsr;
+import static jcuda.jcusparse.cusparseIndexType.CUSPARSE_INDEX_32I;
+import static jcuda.jcusparse.cusparseIndexBase.CUSPARSE_INDEX_BASE_ZERO;
+import static jcuda.cudaDataType.CUDA_R_32F;
+import static jcuda.jcusparse.JCusparse.cusparseSpGEMM_createDescr;
+import static jcuda.jcusparse.cusparseSpGEMMAlg.CUSPARSE_SPGEMM_DEFAULT;
+import static jcuda.jcusparse.cusparseStatus.CUSPARSE_STATUS_SUCCESS;
+import static jcuda.jcusparse.cusparseSpMVAlg.CUSPARSE_SPMV_ALG_DEFAULT;
+import static jcuda.jcusparse.cusparseOperation.CUSPARSE_OPERATION_NON_TRANSPOSE;
+import static jcuda.jcusparse.cusparseOrder.CUSPARSE_ORDER_COL;
+import static jcuda.jcusparse.cusparseSpMMAlg.CUSPARSE_SPMM_ALG_DEFAULT;
+import static jcuda.jcusparse.cusparseCsr2CscAlg.CUSPARSE_CSR2CSC_ALG1;
+import static jcuda.jcusparse.cusparseSparseToDenseAlg.CUSPARSE_SPARSETODENSE_ALG_DEFAULT;
+import static jcuda.jcusparse.cusparseIndexBase.CUSPARSE_INDEX_BASE_ONE;
+import static jcuda.jcusparse.cusparseDenseToSparseAlg.CUSPARSE_DENSETOSPARSE_ALG_DEFAULT;
+import static jcuda.runtime.JCuda.cudaMalloc;
+import static jcuda.runtime.JCuda.cudaFree;
+
+import jcuda.jcusparse.cusparseDnVecDescr;
+import jcuda.jcusparse.cusparseDnMatDescr;
 
 public class SinglePrecisionCudaSupportFunctions implements CudaSupportFunctions {
-	
+
 	private static final Log LOG = LogFactory.getLog(SinglePrecisionCudaSupportFunctions.class.getName());
 
 	@Override
 	public int cusparsecsrgemm(cusparseHandle handle, int transA, int transB, int m, int n, int k,
-			cusparseMatDescr descrA, int nnzA, Pointer csrValA, Pointer csrRowPtrA, Pointer csrColIndA,
-			cusparseMatDescr descrB, int nnzB, Pointer csrValB, Pointer csrRowPtrB, Pointer csrColIndB,
-			cusparseMatDescr descrC, Pointer csrValC, Pointer csrRowPtrC, Pointer csrColIndC) {
-		return JCusparse.cusparseScsrgemm(handle, transA,  transB,  m,  n,  k,
-				 descrA,  nnzA,  csrValA,  csrRowPtrA,  csrColIndA,
-				 descrB,  nnzB,  csrValB,  csrRowPtrB,  csrColIndB,
-				 descrC,  csrValC,  csrRowPtrC,  csrColIndC);
+		cusparseMatDescr descrA, int nnzA, Pointer csrValA, Pointer csrRowPtrA, Pointer csrColIndA,
+		cusparseMatDescr descrB, int nnzB, Pointer csrValB, Pointer csrRowPtrB, Pointer csrColIndB,
+		cusparseMatDescr descrC, Pointer csrValC, Pointer csrRowPtrC, Pointer csrColIndC) {
+		/* ------------------------------------------------------------------ */
+		/* Descriptors and temporaries                                        */
+		/* ------------------------------------------------------------------ */
+		cusparseSpMatDescr matA = new cusparseSpMatDescr();
+		cusparseSpMatDescr matB = new cusparseSpMatDescr();
+		cusparseSpMatDescr matC = new cusparseSpMatDescr();
+		cusparseSpGEMMDescr spgemm = new cusparseSpGEMMDescr();
+
+		Pointer dBuf1 = null;
+		Pointer dBuf2 = null;
+		int status;
+
+		try {
+			/* Create CSR descriptors (FP32, 32-bit indices) ---------------- */
+			cusparseCreateCsr(matA, m, k, nnzA, csrRowPtrA, csrColIndA, csrValA, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
+				CUSPARSE_INDEX_BASE_ZERO, CUDA_R_32F);
+
+			cusparseCreateCsr(matB, k, n, nnzB, csrRowPtrB, csrColIndB, csrValB, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
+				CUSPARSE_INDEX_BASE_ZERO, CUDA_R_32F);
+
+			cusparseCreateCsr(matC, m, n, 0L,            // nnz unknown yet
+				csrRowPtrC, csrColIndC, csrValC, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO,
+				CUDA_R_32F);
+
+			/* SpGEMM descriptor ------------------------------------------- */
+			cusparseSpGEMM_createDescr(spgemm);
+
+			Pointer alpha = Pointer.to(new float[] {1.0f});
+			Pointer beta = Pointer.to(new float[] {0.0f});
+			int alg = CUSPARSE_SPGEMM_DEFAULT;
+			int type = CUDA_R_32F;
+
+			/* -------- Phase-1 : work-estimation -------------------------- */
+			long[] bufSz1 = {0};
+			status = JCusparse.cusparseSpGEMM_workEstimation(handle, transA, transB, alpha, matA.asConst(),
+				matB.asConst(), beta, matC, type, alg, spgemm, bufSz1, null);
+			if(status != CUSPARSE_STATUS_SUCCESS)
+				return status;
+
+			if(bufSz1[0] > 0) {
+				dBuf1 = new Pointer();
+				cudaMalloc(dBuf1, bufSz1[0]);
+			}
+
+			status = JCusparse.cusparseSpGEMM_workEstimation(handle, transA, transB, alpha, matA.asConst(),
+				matB.asConst(), beta, matC, type, alg, spgemm, bufSz1, dBuf1);
+			if(status != CUSPARSE_STATUS_SUCCESS)
+				return status;
+
+			/* -------- Phase-2 : compute ---------------------------------- */
+			long[] bufSz2 = {0};
+			status = JCusparse.cusparseSpGEMM_compute(handle, transA, transB, alpha, matA.asConst(), matB.asConst(),
+				beta, matC, type, alg, spgemm, bufSz2, null);
+			if(status != CUSPARSE_STATUS_SUCCESS)
+				return status;
+
+			if(bufSz2[0] > 0) {
+				dBuf2 = new Pointer();
+				cudaMalloc(dBuf2, bufSz2[0]);
+			}
+
+			status = JCusparse.cusparseSpGEMM_compute(handle, transA, transB, alpha, matA.asConst(), matB.asConst(),
+				beta, matC, type, alg, spgemm, bufSz2, dBuf2);
+			if(status != CUSPARSE_STATUS_SUCCESS)
+				return status;
+
+			/* -------- Phase-3 : copy result ------------------------------ */
+			status = JCusparse.cusparseSpGEMM_copy(handle, transA, transB, alpha, matA.asConst(), matB.asConst(), beta,
+				matC, type, alg, spgemm);
+
+			return status;
+		}
+		finally {
+			/* ------------------------------------------------------------------ */
+			/* Cleanup always runs, success or error                              */
+			/* ------------------------------------------------------------------ */
+			if(dBuf1 != null)
+				cudaFree(dBuf1);
+			if(dBuf2 != null)
+				cudaFree(dBuf2);
+
+			JCusparse.cusparseSpGEMM_destroyDescr(spgemm);
+			JCusparse.cusparseDestroySpMat(matA.asConst());
+			JCusparse.cusparseDestroySpMat(matB.asConst());
+			JCusparse.cusparseDestroySpMat(matC.asConst());
+		}
 	}
 
 	@Override
 	public int cublasgeam(cublasHandle handle, int transa, int transb, int m, int n, Pointer alpha, Pointer A, int lda,
-			Pointer beta, Pointer B, int ldb, Pointer C, int ldc) {
+		Pointer beta, Pointer B, int ldb, Pointer C, int ldc) {
 		return JCublas2.cublasSgeam(handle, transa, transb, m, n, alpha, A, lda, beta, B, ldb, C, ldc);
 	}
 
 	@Override
 	public int cusparsecsrmv(cusparseHandle handle, int transA, int m, int n, int nnz, Pointer alpha,
-			cusparseMatDescr descrA, Pointer csrValA, Pointer csrRowPtrA, Pointer csrColIndA, Pointer x, Pointer beta,
-			Pointer y) {
-		return JCusparse.cusparseScsrmv(handle, transA, m, n, nnz, alpha, 
-				descrA, csrValA, csrRowPtrA, csrColIndA, x, beta, y);
+		cusparseMatDescr descrA, Pointer csrValA, Pointer csrRowPtrA, Pointer csrColIndA, Pointer x, Pointer beta,
+		Pointer y) {
+		/* ------------------------------------------------------------------ */
+		/* Descriptors and workspace                                          */
+		/* ------------------------------------------------------------------ */
+		cusparseSpMatDescr matA = new cusparseSpMatDescr();
+		cusparseDnVecDescr vecX = new cusparseDnVecDescr();
+		cusparseDnVecDescr vecY = new cusparseDnVecDescr();
+		Pointer dBuf = null;
+		int status;
+
+		try {
+			/* 1. CSR matrix A (FP32) -------------------------------------- */
+			JCusparse.cusparseCreateCsr(matA, m, n, nnz, csrRowPtrA, csrColIndA, csrValA, CUSPARSE_INDEX_32I,
+				CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, CUDA_R_32F);
+
+			/* 2. Dense vectors X and Y (FP32) ------------------------------ */
+			JCusparse.cusparseCreateDnVec(vecX, n, x, CUDA_R_32F);
+			JCusparse.cusparseCreateDnVec(vecY, m, y, CUDA_R_32F);
+
+			/* 3. Query workspace size ------------------------------------- */
+			long[] bufSize = {0};
+			status = JCusparse.cusparseSpMV_bufferSize(handle, transA, alpha, matA.asConst(), vecX.asConst(), beta,
+				vecY, CUDA_R_32F, CUSPARSE_SPMV_ALG_DEFAULT, bufSize);
+			if(status != CUSPARSE_STATUS_SUCCESS)
+				return status;
+
+			if(bufSize[0] > 0) {
+				dBuf = new Pointer();
+				cudaMalloc(dBuf, bufSize[0]);
+			}
+
+			/* 4. Perform SpMV -------------------------------------------- */
+			status = JCusparse.cusparseSpMV(handle, transA, alpha, matA.asConst(), vecX.asConst(), beta, vecY,
+				CUDA_R_32F, CUSPARSE_SPMV_ALG_DEFAULT, dBuf);
+
+			return status;
+		}
+		finally {
+			if(dBuf != null)
+				cudaFree(dBuf);
+			JCusparse.cusparseDestroyDnVec(vecX.asConst());
+			JCusparse.cusparseDestroyDnVec(vecY.asConst());
+			JCusparse.cusparseDestroySpMat(matA.asConst());
+		}
 	}
-	
+
 	@Override
-	public int	cusparsecsrmm2(cusparseHandle handle, int transa, int transb, int m, int n, int k, int nnz, jcuda.Pointer alpha, cusparseMatDescr descrA, 
-			jcuda.Pointer csrValA, jcuda.Pointer csrRowPtrA, jcuda.Pointer csrColIndA, 
-			jcuda.Pointer B, int ldb, jcuda.Pointer beta, jcuda.Pointer C, int ldc) {
-		return JCusparse.cusparseScsrmm2(handle, transa, transb, m, n, k, nnz, alpha, descrA, csrValA, 
-				csrRowPtrA, csrColIndA, B, ldb, beta, C, ldc);
+	public int cusparsecsrmm2(cusparseHandle handle, int transA, int transB, int m, int n, int k, int nnz,
+		Pointer alpha, cusparseMatDescr descrA, Pointer csrValA, Pointer csrRowPtrA, Pointer csrColIndA, Pointer B,
+		int ldb, Pointer beta, Pointer C, int ldc) {
+		/* ------------------------------------------------------------------ */
+		/* Descriptors and workspace                                          */
+		/* ------------------------------------------------------------------ */
+		cusparseSpMatDescr matA = new cusparseSpMatDescr();
+		cusparseDnMatDescr matB = new cusparseDnMatDescr();
+		cusparseDnMatDescr matC = new cusparseDnMatDescr();
+		Pointer dBuf = null;
+		int status;
+
+		try {
+			/* 1. CSR matrix A (FP32) -------------------------------------- */
+			JCusparse.cusparseCreateCsr(matA, m, k, nnz, csrRowPtrA, csrColIndA, csrValA, CUSPARSE_INDEX_32I,
+				CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, CUDA_R_32F);
+
+			/* 2. Dense matrix B (column-major) ---------------------------- */
+			int rowsB = (transB == CUSPARSE_OPERATION_NON_TRANSPOSE) ? k : n;
+			int colsB = (transB == CUSPARSE_OPERATION_NON_TRANSPOSE) ? n : k;
+			JCusparse.cusparseCreateDnMat(matB, rowsB, colsB, ldb, B, CUDA_R_32F, CUSPARSE_ORDER_COL);
+
+			/* 3. Dense matrix C (output) ---------------------------------- */
+			int rowsC = (transA == CUSPARSE_OPERATION_NON_TRANSPOSE) ? m : k;
+			int colsC = colsB;   // always equals n
+			JCusparse.cusparseCreateDnMat(matC, rowsC, colsC, ldc, C, CUDA_R_32F, CUSPARSE_ORDER_COL);
+
+			/* 4. Query workspace size ------------------------------------- */
+			long[] bufSize = {0};
+			status = JCusparse.cusparseSpMM_bufferSize(handle, transA, transB, alpha, matA.asConst(), matB.asConst(),
+				beta, matC, CUDA_R_32F, CUSPARSE_SPMM_ALG_DEFAULT, bufSize);
+			if(status != CUSPARSE_STATUS_SUCCESS)
+				return status;
+
+			if(bufSize[0] > 0) {
+				dBuf = new Pointer();
+				cudaMalloc(dBuf, bufSize[0]);
+			}
+
+			/* 5. Execute SpMM -------------------------------------------- */
+			status = JCusparse.cusparseSpMM(handle, transA, transB, alpha, matA.asConst(), matB.asConst(), beta, matC,
+				CUDA_R_32F, CUSPARSE_SPMM_ALG_DEFAULT, dBuf);
+
+			return status;
+		}
+		finally {
+			if(dBuf != null)
+				cudaFree(dBuf);
+			JCusparse.cusparseDestroyDnMat(matB.asConst());
+			JCusparse.cusparseDestroyDnMat(matC.asConst());
+			JCusparse.cusparseDestroySpMat(matA.asConst());
+		}
 	}
 
 	@Override
@@ -87,25 +280,58 @@ public int cublasdot(cublasHandle handle, int n, Pointer x, int incx, Pointer y,
 
 	@Override
 	public int cublasgemv(cublasHandle handle, int trans, int m, int n, Pointer alpha, Pointer A, int lda, Pointer x,
-			int incx, Pointer beta, Pointer y, int incy) {
+		int incx, Pointer beta, Pointer y, int incy) {
 		return JCublas2.cublasSgemv(handle, trans, m, n, alpha, A, lda, x, incx, beta, y, incy);
 	}
 
 	@Override
 	public int cublasgemm(cublasHandle handle, int transa, int transb, int m, int n, int k, Pointer alpha, Pointer A,
-			int lda, Pointer B, int ldb, Pointer beta, Pointer C, int ldc) {
+		int lda, Pointer B, int ldb, Pointer beta, Pointer C, int ldc) {
 		return JCublas2.cublasSgemm(handle, transa, transb, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc);
 	}
 
 	@Override
 	public int cusparsecsr2csc(cusparseHandle handle, int m, int n, int nnz, Pointer csrVal, Pointer csrRowPtr,
-			Pointer csrColInd, Pointer cscVal, Pointer cscRowInd, Pointer cscColPtr, int copyValues, int idxBase) {
-		return JCusparse.cusparseScsr2csc(handle, m, n, nnz, csrVal, csrRowPtr, csrColInd, cscVal, cscRowInd, cscColPtr, copyValues, idxBase);
+		Pointer csrColInd, Pointer cscVal, Pointer cscRowInd, Pointer cscColPtr, int copyValues, int idxBase) {
+		final int alg = CUSPARSE_CSR2CSC_ALG1;		// Algorithm 1 is universally supported
+		final int valType = CUDA_R_32F;				// single-precision
+
+		/* ------------------------------------------------------------------ */
+		/* 1. Query required workspace size                                   */
+		/* ------------------------------------------------------------------ */
+		long[] bufSize = {0};
+		int status = JCusparse.cusparseCsr2cscEx2_bufferSize(handle, m, n, nnz, csrVal, csrRowPtr, csrColInd, cscVal,
+			cscColPtr, cscRowInd, valType, copyValues, idxBase, alg, bufSize);
+		if(status != CUSPARSE_STATUS_SUCCESS)
+			return status;
+
+		/* ------------------------------------------------------------------ */
+		/* 2. Allocate workspace (if needed)                                  */
+		/* ------------------------------------------------------------------ */
+		Pointer buffer = null;
+		if(bufSize[0] > 0) {
+			buffer = new Pointer();
+			cudaMalloc(buffer, bufSize[0]);
+		}
+
+		try {
+			/* -------------------------------------------------------------- */
+			/* 3. Perform CSR -> CSC conversion                                */
+			/* -------------------------------------------------------------- */
+			status = JCusparse.cusparseCsr2cscEx2(handle, m, n, nnz, csrVal, csrRowPtr, csrColInd, cscVal, cscColPtr,
+				cscRowInd, valType, copyValues, idxBase, alg, buffer);
+
+			return status;
+		}
+		finally {
+			if(buffer != null)
+				cudaFree(buffer);
+		}
 	}
 
 	@Override
 	public int cublassyrk(cublasHandle handle, int uplo, int trans, int n, int k, Pointer alpha, Pointer A, int lda,
-			Pointer beta, Pointer C, int ldc) {
+		Pointer beta, Pointer C, int ldc) {
 		return JCublas2.cublasSsyrk(handle, uplo, trans, n, k, alpha, A, lda, beta, C, ldc);
 	}
 
@@ -130,41 +356,196 @@ public int cusolverDngeqrf(cusolverDnHandle handle, int m, int n, Pointer A, int
 			Pointer Workspace, int Lwork, Pointer devInfo) {
 		return JCusolverDn.cusolverDnSgeqrf(handle, m, n, A, lda, TAU, Workspace, Lwork, devInfo);
 	}
-	
+
 	@Override
 	public int cusolverDnormqr(cusolverDnHandle handle, int side, int trans, int m, int n, int k, Pointer A, int lda,
-			Pointer tau, Pointer C, int ldc, Pointer work, int lwork, Pointer devInfo) {
+		Pointer tau, Pointer C, int ldc, Pointer work, int lwork, Pointer devInfo) {
 		return JCusolverDn.cusolverDnSormqr(handle, side, trans, m, n, k, A, lda, tau, C, ldc, work, lwork, devInfo);
 	}
 
 	@Override
 	public int cusparsecsrgeam(cusparseHandle handle, int m, int n, Pointer alpha, cusparseMatDescr descrA, int nnzA,
-			Pointer csrValA, Pointer csrRowPtrA, Pointer csrColIndA, Pointer beta, cusparseMatDescr descrB, int nnzB,
-			Pointer csrValB, Pointer csrRowPtrB, Pointer csrColIndB, cusparseMatDescr descrC, Pointer csrValC,
-			Pointer csrRowPtrC, Pointer csrColIndC) {
-		return JCusparse.cusparseScsrgeam(handle, m, n, alpha, descrA, nnzA, 
-				csrValA, csrRowPtrA, csrColIndA, beta, descrB, nnzB, 
-				csrValB, csrRowPtrB, csrColIndB, descrC, csrValC, csrRowPtrC, csrColIndC);
+		Pointer csrValA, Pointer csrRowPtrA, Pointer csrColIndA, Pointer beta, cusparseMatDescr descrB, int nnzB,
+		Pointer csrValB, Pointer csrRowPtrB, Pointer csrColIndB, cusparseMatDescr descrC, Pointer csrValC,
+		Pointer csrRowPtrC, Pointer csrColIndC) {
+		/* ------------------------------------------------------------------ */
+		/* 1. Query temporary-buffer size                                     */
+		/* ------------------------------------------------------------------ */
+		long[] bufSize = {0};
+
+		int status = JCusparse.cusparseScsrgeam2_bufferSizeExt(handle, m, n, alpha, descrA, nnzA, csrValA, csrRowPtrA,
+			csrColIndA, beta, descrB, nnzB, csrValB, csrRowPtrB, csrColIndB, descrC, csrValC, csrRowPtrC, csrColIndC,
+			bufSize);
+		if(status != CUSPARSE_STATUS_SUCCESS)
+			return status;
+
+		/* ------------------------------------------------------------------ */
+		/* 2. Allocate workspace (if needed)                                  */
+		/* ------------------------------------------------------------------ */
+		Pointer buffer = null;
+		if(bufSize[0] > 0) {
+			buffer = new Pointer();
+			cudaMalloc(buffer, bufSize[0]);
+		}
+
+		try {
+			/* -------------------------------------------------------------- */
+			/* 3. Perform C = α*A  +  β*B                                     */
+			/* -------------------------------------------------------------- */
+			status = JCusparse.cusparseScsrgeam2(handle, m, n, alpha, descrA, nnzA, csrValA, csrRowPtrA, csrColIndA,
+				beta, descrB, nnzB, csrValB, csrRowPtrB, csrColIndB, descrC, csrValC, csrRowPtrC, csrColIndC, buffer);
+
+			return status;   // propagate cuSPARSE return code
+		}
+		finally {
+			/* -------------------------------------------------------------- */
+			/* 4. Free workspace                                              */
+			/* -------------------------------------------------------------- */
+			if(buffer != null)
+				cudaFree(buffer);
+		}
 	}
 
 	@Override
 	public int cusparsecsr2dense(cusparseHandle handle, int m, int n, cusparseMatDescr descrA, Pointer csrValA,
-			Pointer csrRowPtrA, Pointer csrColIndA, Pointer A, int lda) {
-		return JCusparse.cusparseScsr2dense(handle, m, n, descrA, csrValA, csrRowPtrA, csrColIndA, A, lda);
+		Pointer csrRowPtrA, Pointer csrColIndA, Pointer A, int lda) {
+		/* ------------------------------------------------------------------ */
+		/* 0. Determine nnz from csrRowPtrA[m] and index base                 */
+		/* ------------------------------------------------------------------ */
+		int[] last = {0};
+		cudaMemcpy(Pointer.to(last), csrRowPtrA.withByteOffset((long) m * Sizeof.INT), Sizeof.INT,
+			cudaMemcpyDeviceToHost);
+
+		int idxBase = JCusparse.cusparseGetMatIndexBase(descrA);
+		int nnz = (idxBase == CUSPARSE_INDEX_BASE_ONE) ? last[0] - 1 : last[0];
+
+		/* ------------------------------------------------------------------ */
+		/* 1. Create CSR ‘SpMat’ and dense ‘DnMat’ descriptors (FP32)         */
+		/* ------------------------------------------------------------------ */
+		cusparseSpMatDescr matA = new cusparseSpMatDescr();
+		JCusparse.cusparseCreateCsr(matA, m, n, nnz, csrRowPtrA, csrColIndA, csrValA, CUSPARSE_INDEX_32I,
+			CUSPARSE_INDEX_32I, idxBase, CUDA_R_32F);
+
+		cusparseDnMatDescr matB = new cusparseDnMatDescr();
+		JCusparse.cusparseCreateDnMat(matB, m, n, lda, A, CUDA_R_32F,
+			CUSPARSE_ORDER_COL);   // matches cuSPARSE’s lda convention
+
+		/* ------------------------------------------------------------------ */
+		/* 2. Query workspace size                                            */
+		/* ------------------------------------------------------------------ */
+		long[] bufSize = {0};
+		int alg = CUSPARSE_SPARSETODENSE_ALG_DEFAULT;
+
+		int status = JCusparse.cusparseSparseToDense_bufferSize(handle, matA.asConst(), matB, alg, bufSize);
+		if(status != CUSPARSE_STATUS_SUCCESS) {
+			JCusparse.cusparseDestroyDnMat(matB.asConst());
+			JCusparse.cusparseDestroySpMat(matA.asConst());
+			return status;
+		}
+
+		/* ------------------------------------------------------------------ */
+		/* 3. Allocate workspace (if needed)                                  */
+		/* ------------------------------------------------------------------ */
+		Pointer buffer = null;
+		if(bufSize[0] > 0) {
+			buffer = new Pointer();
+			cudaMalloc(buffer, bufSize[0]);
+		}
+
+		try {
+			/* -------------------------------------------------------------- */
+			/* 4. Perform CSR -> dense conversion                            */
+			/* -------------------------------------------------------------- */
+			status = JCusparse.cusparseSparseToDense(handle, matA.asConst(), matB, alg, buffer);
+
+			return status;
+		}
+		finally {
+			/* -------------------------------------------------------------- */
+			/* 5. Cleanup                                                     */
+			/* -------------------------------------------------------------- */
+			if(buffer != null)
+				cudaFree(buffer);
+			JCusparse.cusparseDestroyDnMat(matB.asConst());
+			JCusparse.cusparseDestroySpMat(matA.asConst());
+		}
 	}
-	
+
 	@Override
 	public int cusparsedense2csr(cusparseHandle handle, int m, int n, cusparseMatDescr descrA, Pointer A, int lda,
-			Pointer nnzPerRow, Pointer csrValA, Pointer csrRowPtrA, Pointer csrColIndA) {
-		return JCusparse.cusparseSdense2csr(handle, m, n, descrA, A, lda, nnzPerRow, csrValA, csrRowPtrA, csrColIndA);
+		Pointer nnzPerRow, Pointer csrValA, Pointer csrRowPtrA, Pointer csrColIndA) {
+		/* ------------------------------------------------------------------ */
+		/* 0.  Index base (0 or 1) comes from the descriptor                  */
+		/* ------------------------------------------------------------------ */
+		int idxBase = JCusparse.cusparseGetMatIndexBase(descrA);
+
+		/* ------------------------------------------------------------------ */
+		/* 1.  Create dense-matrix and CSR descriptors (FP32)                 */
+		/* ------------------------------------------------------------------ */
+		cusparseDnMatDescr matDense = new cusparseDnMatDescr();
+		JCusparse.cusparseCreateDnMat(matDense, m, n, lda, A, CUDA_R_32F, CUSPARSE_ORDER_COL);
+
+		cusparseSpMatDescr matCsr = new cusparseSpMatDescr();
+		/* nnz initially 0 – cuSPARSE will fill it during analysis           */
+		JCusparse.cusparseCreateCsr(matCsr, m, n, 0L, csrRowPtrA, csrColIndA, csrValA, CUSPARSE_INDEX_32I,
+			CUSPARSE_INDEX_32I, idxBase, CUDA_R_32F);
+
+		/* ------------------------------------------------------------------ */
+		/* 2.  Query workspace size                                           */
+		/* ------------------------------------------------------------------ */
+		long[] bufSize = {0};
+		int alg = CUSPARSE_DENSETOSPARSE_ALG_DEFAULT;
+
+		int status = JCusparse.cusparseDenseToSparse_bufferSize(handle, matDense.asConst(), matCsr, alg, bufSize);
+		if(status != CUSPARSE_STATUS_SUCCESS) {
+			JCusparse.cusparseDestroySpMat(matCsr.asConst());
+			JCusparse.cusparseDestroyDnMat(matDense.asConst());
+			return status;
+		}
+
+		/* ------------------------------------------------------------------ */
+		/* 3.  Allocate workspace (if required)                               */
+		/* ------------------------------------------------------------------ */
+		Pointer buffer = null;
+		if(bufSize[0] > 0) {
+			buffer = new Pointer();
+			cudaMalloc(buffer, bufSize[0]);
+		}
+
+		try {
+			/* -------------------------------------------------------------- */
+			/* 4.  Phase-1: symbolic pass                                     */
+			/* -------------------------------------------------------------- */
+			status = JCusparse.cusparseDenseToSparse_analysis(handle, matDense.asConst(), matCsr, alg, buffer);
+			if(status != CUSPARSE_STATUS_SUCCESS)
+				return status;
+
+			/* -------------------------------------------------------------- */
+			/* 5.  Phase-2: numeric conversion                                */
+			/* -------------------------------------------------------------- */
+			status = JCusparse.cusparseDenseToSparse_convert(handle, matDense.asConst(), matCsr, alg, buffer);
+			if(status != CUSPARSE_STATUS_SUCCESS)
+				return status;
+
+			return status;   // success
+		}
+		finally {
+			/* -------------------------------------------------------------- */
+			/* 7.  Cleanup                                                    */
+			/* -------------------------------------------------------------- */
+			if(buffer != null)
+				cudaFree(buffer);
+			JCusparse.cusparseDestroySpMat(matCsr.asConst());
+			JCusparse.cusparseDestroyDnMat(matDense.asConst());
+		}
 	}
-	
+
 	@Override
 	public int cusparsennz(cusparseHandle handle, int dirA, int m, int n, cusparseMatDescr descrA, Pointer A, int lda,
-			Pointer nnzPerRowCol, Pointer nnzTotalDevHostPtr) {
+		Pointer nnzPerRowCol, Pointer nnzTotalDevHostPtr) {
 		return JCusparse.cusparseSnnz(handle, dirA, m, n, descrA, A, lda, nnzPerRowCol, nnzTotalDevHostPtr);
 	}
-	
+
 	@Override
 	public void deviceToHost(GPUContext gCtx, Pointer src, double[] dest, String instName, boolean isEviction) {
 		long t0 = DMLScript.STATISTICS ? System.nanoTime() : 0;

From 32e8e630b8f5c2c320408132ae577fa4db06c781 Mon Sep 17 00:00:00 2001
From: ReneEnjilian <enjilianrene@gmail.com>
Date: Thu, 12 Jun 2025 23:58:17 +0200
Subject: [PATCH 02/26] correct step2GatherNNZGeam

---
 .../instructions/gpu/context/CSRPointer.java  | 23 ++++++++++++-------
 .../functions/lineage/GPUFullReuseTest.java   |  4 ++--
 2 files changed, 17 insertions(+), 10 deletions(-)

diff --git a/src/main/java/org/apache/sysds/runtime/instructions/gpu/context/CSRPointer.java b/src/main/java/org/apache/sysds/runtime/instructions/gpu/context/CSRPointer.java
index b6b7a24a25c..d47fbef2d94 100644
--- a/src/main/java/org/apache/sysds/runtime/instructions/gpu/context/CSRPointer.java
+++ b/src/main/java/org/apache/sysds/runtime/instructions/gpu/context/CSRPointer.java
@@ -307,23 +307,30 @@ private static void step1AllocateRowPointers(GPUContext gCtx, cusparseHandle han
 	 * @param m      Rows in C
 	 * @param n      Columns in C
 	 */
+
 	private static void step2GatherNNZGeam(GPUContext gCtx, cusparseHandle handle, CSRPointer A, CSRPointer B, CSRPointer C, int m, int n) {
 		LOG.trace("GPU : step2GatherNNZGeam for DGEAM" + ", GPUContext=" + gCtx);
+		long[] pBufferSizeInBytes = {0};
+		cusparseDcsrgeam2_bufferSizeExt(handle, m, n, Pointer.to(new double[]{1.0}), A.descr, toIntExact(A.nnz), A.val, A.rowPtr, A.colInd,
+			Pointer.to(new double[]{1.0}), B.descr, toIntExact(B.nnz), B.val, B.rowPtr, B.colInd, C.descr, C.val, C.rowPtr, C.colInd, pBufferSizeInBytes);
+		Pointer buffer = new Pointer();
+		cudaMalloc(buffer, pBufferSizeInBytes[0]);
 		int[] CnnzArray = {-1};
-		Pointer workspace = new Pointer();
-		cusparseXcsrgeam2Nnz(handle, m, n, A.descr, toIntExact(A.nnz), A.rowPtr, A.colInd, B.descr, toIntExact(B.nnz),
-			B.rowPtr, B.colInd, C.descr, C.rowPtr, Pointer.to(CnnzArray), workspace);
-		//cudaDeviceSynchronize;
+		cusparseXcsrgeam2Nnz(handle, m, n, A.descr, toIntExact(A.nnz), A.rowPtr, A.colInd, B.descr, toIntExact(B.nnz), B.rowPtr, B.colInd,
+			C.descr, C.rowPtr, Pointer.to(CnnzArray) ,buffer);
 		if(CnnzArray[0] != -1) {
 			C.nnz = CnnzArray[0];
 		}
-		else {
+		else {                            // fall-back (rare older devices)
 			int[] baseArray = {0};
-			cudaMemcpy(Pointer.to(CnnzArray), C.rowPtr.withByteOffset(getIntSizeOf(m)), getIntSizeOf(1),
-				cudaMemcpyDeviceToHost);
-			cudaMemcpy(Pointer.to(baseArray), C.rowPtr, getIntSizeOf(1), cudaMemcpyDeviceToHost);
+			cudaMemcpy(Pointer.to(CnnzArray),
+				C.rowPtr.withByteOffset((long)m * Sizeof.INT),
+				Sizeof.INT, cudaMemcpyDeviceToHost);
+			cudaMemcpy(Pointer.to(baseArray),
+				C.rowPtr, Sizeof.INT, cudaMemcpyDeviceToHost);
 			C.nnz = CnnzArray[0] - baseArray[0];
 		}
+		cudaFree(buffer);
 	}
 
 	/**
diff --git a/src/test/java/org/apache/sysds/test/functions/lineage/GPUFullReuseTest.java b/src/test/java/org/apache/sysds/test/functions/lineage/GPUFullReuseTest.java
index 74bd7fc47de..841511b31f0 100644
--- a/src/test/java/org/apache/sysds/test/functions/lineage/GPUFullReuseTest.java
+++ b/src/test/java/org/apache/sysds/test/functions/lineage/GPUFullReuseTest.java
@@ -45,12 +45,12 @@ public class GPUFullReuseTest extends AutomatedTestBase{
 	protected static final int TEST_VARIANTS = 4;
 	protected String TEST_CLASS_DIR = TEST_DIR + GPUFullReuseTest.class.getSimpleName() + "/";
 
-	@BeforeClass
+	/*@BeforeClass
 	public static void checkGPU() {
 		// Skip all the tests if no GPU is available
 		// FIXME: Fails to skip if gpu available but no libraries
 		Assume.assumeTrue(TestUtils.isGPUAvailable() == cudaError.cudaSuccess);
-	}
+	}*/
 
 	@Override
 	public void setUp() {

From a8ea3c8b54d322d536ab9196298b6e0b816c616d Mon Sep 17 00:00:00 2001
From: ReneEnjilian <enjilianrene@gmail.com>
Date: Sun, 15 Jun 2025 00:44:51 +0200
Subject: [PATCH 03/26] correct csr2dense

---
 .../instructions/gpu/context/CSRPointer.java  |   2 +-
 .../matrix/data/CudaSupportFunctions.java     |   2 +-
 .../DoublePrecisionCudaSupportFunctions.java  | 111 ++++++------------
 .../SinglePrecisionCudaSupportFunctions.java  |  74 ++++--------
 4 files changed, 66 insertions(+), 123 deletions(-)

diff --git a/src/main/java/org/apache/sysds/runtime/instructions/gpu/context/CSRPointer.java b/src/main/java/org/apache/sysds/runtime/instructions/gpu/context/CSRPointer.java
index d47fbef2d94..1cfd946de47 100644
--- a/src/main/java/org/apache/sysds/runtime/instructions/gpu/context/CSRPointer.java
+++ b/src/main/java/org/apache/sysds/runtime/instructions/gpu/context/CSRPointer.java
@@ -537,7 +537,7 @@ public Pointer toColumnMajorDenseMatrix(cusparseHandle cusparseHandle, cublasHan
 		// If this sparse block is empty, the allocated dense matrix, initialized to zeroes, will be returned.
 		if (val != null && rowPtr != null && colInd != null && nnz > 0) {
 			// Note: cusparseDcsr2dense method cannot handle empty blocks
-			LibMatrixCUDA.cudaSupportFunctions.cusparsecsr2dense(cusparseHandle, rows, cols, descr, val, rowPtr, colInd, A, rows);
+			LibMatrixCUDA.cudaSupportFunctions.cusparsecsr2dense(cusparseHandle, rows, cols, descr, val, rowPtr, colInd, A, rows, nnz);
 			//cudaDeviceSynchronize;
 		} else {
 			LOG.debug("in CSRPointer, the values array, row pointers array or column indices array was null");
diff --git a/src/main/java/org/apache/sysds/runtime/matrix/data/CudaSupportFunctions.java b/src/main/java/org/apache/sysds/runtime/matrix/data/CudaSupportFunctions.java
index 6879a202630..369a2efae0f 100644
--- a/src/main/java/org/apache/sysds/runtime/matrix/data/CudaSupportFunctions.java
+++ b/src/main/java/org/apache/sysds/runtime/matrix/data/CudaSupportFunctions.java
@@ -80,7 +80,7 @@ public int	cusparsecsrmm2(cusparseHandle handle, int transa, int transb, int m,
 	public int cusolverDngeqrf(cusolverDnHandle handle, int m, int n, Pointer A, int lda, Pointer TAU, Pointer Workspace, int Lwork, Pointer devInfo);
 	public int cusolverDnormqr(cusolverDnHandle handle, int side, int trans, int m, int n, int k, Pointer A, int lda, Pointer tau, Pointer C, int ldc, Pointer work, int lwork, Pointer devInfo);
 	public int cusparsecsrgeam(cusparseHandle handle, int m, int n, jcuda.Pointer alpha, cusparseMatDescr descrA, int nnzA, jcuda.Pointer csrValA, jcuda.Pointer csrRowPtrA, jcuda.Pointer csrColIndA, jcuda.Pointer beta, cusparseMatDescr descrB, int nnzB, jcuda.Pointer csrValB, jcuda.Pointer csrRowPtrB, jcuda.Pointer csrColIndB, cusparseMatDescr descrC, jcuda.Pointer csrValC, jcuda.Pointer csrRowPtrC, jcuda.Pointer csrColIndC);
-	public int cusparsecsr2dense(cusparseHandle handle, int m, int n, cusparseMatDescr descrA, jcuda.Pointer csrValA, jcuda.Pointer csrRowPtrA, jcuda.Pointer csrColIndA, jcuda.Pointer A, int lda) ;
+	public int cusparsecsr2dense(cusparseHandle handle, int m, int n, cusparseMatDescr descrA, jcuda.Pointer csrValA, jcuda.Pointer csrRowPtrA, jcuda.Pointer csrColIndA, jcuda.Pointer A, int lda, long nnz) ;
 	public int cusparsedense2csr(cusparseHandle handle, int m, int n, cusparseMatDescr descrA, jcuda.Pointer A, int lda, jcuda.Pointer nnzPerRow, jcuda.Pointer csrValA, jcuda.Pointer csrRowPtrA, jcuda.Pointer csrColIndA);
 	public int cusparsennz(cusparseHandle handle, int dirA, int m, int n, cusparseMatDescr descrA, jcuda.Pointer A, int lda, jcuda.Pointer nnzPerRowCol, jcuda.Pointer nnzTotalDevHostPtr);
 	public void deviceToHost(GPUContext gCtx, Pointer src, double [] dest, String instName, boolean isEviction);
diff --git a/src/main/java/org/apache/sysds/runtime/matrix/data/DoublePrecisionCudaSupportFunctions.java b/src/main/java/org/apache/sysds/runtime/matrix/data/DoublePrecisionCudaSupportFunctions.java
index e77e17e09e4..27696a2abd7 100644
--- a/src/main/java/org/apache/sysds/runtime/matrix/data/DoublePrecisionCudaSupportFunctions.java
+++ b/src/main/java/org/apache/sysds/runtime/matrix/data/DoublePrecisionCudaSupportFunctions.java
@@ -18,7 +18,7 @@
  */
 package org.apache.sysds.runtime.matrix.data;
 
-import static jcuda.jcusparse.JCusparse.cusparseCreateCsr;
+import static jcuda.jcusparse.JCusparse.*;
 import static jcuda.runtime.JCuda.cudaMemcpy;
 import static jcuda.runtime.cudaMemcpyKind.cudaMemcpyDeviceToHost;
 import static jcuda.runtime.cudaMemcpyKind.cudaMemcpyHostToDevice;
@@ -57,9 +57,7 @@
 import static jcuda.jcusparse.cusparseSparseToDenseAlg.CUSPARSE_SPARSETODENSE_ALG_DEFAULT;
 import static jcuda.jcusparse.cusparseIndexBase.CUSPARSE_INDEX_BASE_ONE;
 import static jcuda.jcusparse.cusparseDenseToSparseAlg.CUSPARSE_DENSETOSPARSE_ALG_DEFAULT;
-import static jcuda.jcusparse.JCusparse.cusparseSpGEMM_createDescr;
-import static jcuda.jcusparse.JCusparse.cusparseCreateDnVec;
-import static jcuda.jcusparse.JCusparse.cusparseCreateDnMat;
+
 public class DoublePrecisionCudaSupportFunctions implements CudaSupportFunctions {
 
 	private static final Log LOG = LogFactory.getLog(DoublePrecisionCudaSupportFunctions.class.getName());
@@ -352,102 +350,69 @@ public int cusparsecsrgeam(cusparseHandle handle, int m, int n, Pointer alpha, c
 		Pointer csrValA, Pointer csrRowPtrA, Pointer csrColIndA, Pointer beta, cusparseMatDescr descrB, int nnzB,
 		Pointer csrValB, Pointer csrRowPtrB, Pointer csrColIndB, cusparseMatDescr descrC, Pointer csrValC,
 		Pointer csrRowPtrC, Pointer csrColIndC) {
-		/* ------------------------------------------------------------------ */
-		/* 1. Query temporary-buffer size                                     */
-		/* ------------------------------------------------------------------ */
-		long[] bufSize = {0};
+
+		long[] pBufferSizeInBytes = {0};
 
 		int status = JCusparse.cusparseDcsrgeam2_bufferSizeExt(handle, m, n, alpha, descrA, nnzA, csrValA, csrRowPtrA,
 			csrColIndA, beta, descrB, nnzB, csrValB, csrRowPtrB, csrColIndB, descrC, csrValC, csrRowPtrC, csrColIndC,
-			bufSize);
+			pBufferSizeInBytes);
 		if(status != CUSPARSE_STATUS_SUCCESS)
 			return status;
 
-		/* ------------------------------------------------------------------ */
-		/* 2. Allocate workspace (if required)                                */
-		/* ------------------------------------------------------------------ */
 		Pointer buffer = new Pointer();
-		if(bufSize[0] > 0)
-			cudaMalloc(buffer, bufSize[0]);
+		if(pBufferSizeInBytes[0] > 0)
+			cudaMalloc(buffer, pBufferSizeInBytes[0]);
 
 		try {
-			/* -------------------------------------------------------------- */
-			/* 3.  C = α*A  +  β*B  (sorted-CSR version 2)                     */
-			/* -------------------------------------------------------------- */
-			status = JCusparse.cusparseDcsrgeam2(handle, m, n, alpha, descrA, nnzA, csrValA, csrRowPtrA, csrColIndA,
-				beta, descrB, nnzB, csrValB, csrRowPtrB, csrColIndB, descrC, csrValC, csrRowPtrC, csrColIndC, buffer);
-
-			return status;
+			// C = α*A + β*B
+			return JCusparse.cusparseDcsrgeam2(handle, m, n, alpha, descrA, nnzA, csrValA, csrRowPtrA, csrColIndA, beta,
+				descrB, nnzB, csrValB, csrRowPtrB, csrColIndB, descrC, csrValC, csrRowPtrC, csrColIndC, buffer);
 		}
 		finally {
-			/* -------------------------------------------------------------- */
-			/* 4. Free workspace                                              */
-			/* -------------------------------------------------------------- */
-			if(bufSize[0] > 0)
+			if(pBufferSizeInBytes[0] > 0)
 				cudaFree(buffer);
 		}
 	}
 
 	@Override
 	public int cusparsecsr2dense(cusparseHandle handle, int m, int n, cusparseMatDescr descrA, Pointer csrValA,
-		Pointer csrRowPtrA, Pointer csrColIndA, Pointer A, int lda) {
-		/* ------------------------------------------------------------- */
-		/* 1. Determine nnz from the last entry of csrRowPtrA            */
-		/* ------------------------------------------------------------- */
-		int[] last = {0};
-		cudaMemcpy(Pointer.to(last), csrRowPtrA.withByteOffset((long) m * Sizeof.INT), Sizeof.INT,
-			cudaMemcpyDeviceToHost);
-
-		/* Adjust for index base (0 or 1) ------------------------------ */
-		int base = JCusparse.cusparseGetMatIndexBase(descrA);
-		int nnz = (base == CUSPARSE_INDEX_BASE_ONE) ? last[0] - 1 : last[0];
-
-		/* ------------------------------------------------------------- */
-		/* 2. Create CSR SpMat and dense DnMat descriptors               */
-		/* ------------------------------------------------------------- */
-		cusparseSpMatDescr matA = new cusparseSpMatDescr();
-		JCusparse.cusparseCreateCsr(matA, m, n, nnz, csrRowPtrA, csrColIndA, csrValA, CUSPARSE_INDEX_32I,
-			CUSPARSE_INDEX_32I, base, CUDA_R_64F);
+		Pointer csrRowPtrA, Pointer csrColIndA, Pointer A, int lda, long nnz) {
+
+		// Get index base from legacy descriptor -> 0 or 1
+		int idxBase = JCusparse.cusparseGetMatIndexBase(descrA);
 
+		// Create generric sparse-matrix descriptor required by CUDA 12
+		cusparseSpMatDescr spMatA = new cusparseSpMatDescr();
+
+		// Build CSR descriptor
+		cusparseCreateCsr(spMatA, m, n, nnz, csrRowPtrA, csrColIndA, csrValA, CUSPARSE_INDEX_32I,
+			CUSPARSE_INDEX_32I, idxBase, CUDA_R_64F);
+
+		// Build dense descriptor
 		cusparseDnMatDescr matB = new cusparseDnMatDescr();
-		JCusparse.cusparseCreateDnMat(matB, m, n, lda, A, CUDA_R_64F, CUSPARSE_ORDER_COL);
+		cusparseCreateDnMat(matB, m, n, lda, A, CUDA_R_64F, CUSPARSE_ORDER_COL);
 
-		/* ------------------------------------------------------------- */
-		/* 3. Query workspace size                                       */
-		/* ------------------------------------------------------------- */
-		long[] bufSize = {0};
+		// Determine buffer size
 		int alg = CUSPARSE_SPARSETODENSE_ALG_DEFAULT;
+		long[] bufSize = {0};
+		cusparseSparseToDense_bufferSize(handle, spMatA.asConst(), matB, alg,
+			bufSize);    //bufSize[0] now holds the exact byte count
 
-		int status = JCusparse.cusparseSparseToDense_bufferSize(handle, matA.asConst(), matB, alg, bufSize);
-		if(status != CUSPARSE_STATUS_SUCCESS) {
-			JCusparse.cusparseDestroyDnMat(matB.asConst());
-			JCusparse.cusparseDestroySpMat(matA.asConst());
-			return status;
+		// Allocate scratch space of the requested size
+		Pointer dBuffer = new Pointer();
+		if(bufSize[0] > 0) {
+			cudaMalloc(dBuffer, bufSize[0]);
 		}
-
-		/* ------------------------------------------------------------- */
-		/* 4. Allocate temporary buffer (if needed)                      */
-		/* ------------------------------------------------------------- */
-		Pointer buffer = new Pointer();
-		if(bufSize[0] > 0)
-			cudaMalloc(buffer, bufSize[0]);
-
 		try {
-			/* --------------------------------------------------------- */
-			/* 5. Perform CSR -> dense conversion                         */
-			/* --------------------------------------------------------- */
-			status = JCusparse.cusparseSparseToDense(handle, matA.asConst(), matB, alg, buffer);
-
-			return status;
+			// Write dense matrix
+			int algSparseToDense = CUSPARSE_SPARSETODENSE_ALG_DEFAULT;
+			return cusparseSparseToDense(handle, spMatA.asConst(), matB, algSparseToDense, dBuffer);
 		}
 		finally {
-			/* --------------------------------------------------------- */
-			/* 6. Cleanup                                                */
-			/* --------------------------------------------------------- */
 			if(bufSize[0] > 0)
-				cudaFree(buffer);
-			JCusparse.cusparseDestroyDnMat(matB.asConst());
-			JCusparse.cusparseDestroySpMat(matA.asConst());
+				cudaFree(dBuffer);
+			cusparseDestroyDnMat(matB.asConst());
+			cusparseDestroySpMat(spMatA.asConst());
 		}
 	}
 
diff --git a/src/main/java/org/apache/sysds/runtime/matrix/data/SinglePrecisionCudaSupportFunctions.java b/src/main/java/org/apache/sysds/runtime/matrix/data/SinglePrecisionCudaSupportFunctions.java
index 3c9c47ed893..10f66407cac 100644
--- a/src/main/java/org/apache/sysds/runtime/matrix/data/SinglePrecisionCudaSupportFunctions.java
+++ b/src/main/java/org/apache/sysds/runtime/matrix/data/SinglePrecisionCudaSupportFunctions.java
@@ -18,6 +18,8 @@
  */
 package org.apache.sysds.runtime.matrix.data;
 
+import static jcuda.cudaDataType.CUDA_R_64F;
+import static jcuda.jcusparse.JCusparse.*;
 import static jcuda.runtime.JCuda.cudaMemcpy;
 import static jcuda.runtime.cudaMemcpyKind.cudaMemcpyDeviceToHost;
 import static jcuda.runtime.cudaMemcpyKind.cudaMemcpyHostToDevice;
@@ -45,11 +47,9 @@
 import jcuda.jcusparse.cusparseSpMatDescr;
 import jcuda.jcusparse.cusparseSpGEMMDescr;
 
-import static jcuda.jcusparse.JCusparse.cusparseCreateCsr;
 import static jcuda.jcusparse.cusparseIndexType.CUSPARSE_INDEX_32I;
 import static jcuda.jcusparse.cusparseIndexBase.CUSPARSE_INDEX_BASE_ZERO;
 import static jcuda.cudaDataType.CUDA_R_32F;
-import static jcuda.jcusparse.JCusparse.cusparseSpGEMM_createDescr;
 import static jcuda.jcusparse.cusparseSpGEMMAlg.CUSPARSE_SPGEMM_DEFAULT;
 import static jcuda.jcusparse.cusparseStatus.CUSPARSE_STATUS_SUCCESS;
 import static jcuda.jcusparse.cusparseSpMVAlg.CUSPARSE_SPMV_ALG_DEFAULT;
@@ -408,67 +408,45 @@ public int cusparsecsrgeam(cusparseHandle handle, int m, int n, Pointer alpha, c
 
 	@Override
 	public int cusparsecsr2dense(cusparseHandle handle, int m, int n, cusparseMatDescr descrA, Pointer csrValA,
-		Pointer csrRowPtrA, Pointer csrColIndA, Pointer A, int lda) {
-		/* ------------------------------------------------------------------ */
-		/* 0. Determine nnz from csrRowPtrA[m] and index base                 */
-		/* ------------------------------------------------------------------ */
-		int[] last = {0};
-		cudaMemcpy(Pointer.to(last), csrRowPtrA.withByteOffset((long) m * Sizeof.INT), Sizeof.INT,
-			cudaMemcpyDeviceToHost);
+		Pointer csrRowPtrA, Pointer csrColIndA, Pointer A, int lda, long nnz) {
 
+		// Get index base from legacy descriptor -> 0 or 1
 		int idxBase = JCusparse.cusparseGetMatIndexBase(descrA);
-		int nnz = (idxBase == CUSPARSE_INDEX_BASE_ONE) ? last[0] - 1 : last[0];
 
-		/* ------------------------------------------------------------------ */
-		/* 1. Create CSR ‘SpMat’ and dense ‘DnMat’ descriptors (FP32)         */
-		/* ------------------------------------------------------------------ */
-		cusparseSpMatDescr matA = new cusparseSpMatDescr();
-		JCusparse.cusparseCreateCsr(matA, m, n, nnz, csrRowPtrA, csrColIndA, csrValA, CUSPARSE_INDEX_32I,
+		// Create generric sparse-matrix descriptor required by CUDA 12
+		cusparseSpMatDescr spMatA = new cusparseSpMatDescr();
+
+		// Build CSR descriptor
+		cusparseCreateCsr(spMatA, m, n, nnz, csrRowPtrA, csrColIndA, csrValA, CUSPARSE_INDEX_32I,
 			CUSPARSE_INDEX_32I, idxBase, CUDA_R_32F);
 
+		// Build dense descriptor
 		cusparseDnMatDescr matB = new cusparseDnMatDescr();
-		JCusparse.cusparseCreateDnMat(matB, m, n, lda, A, CUDA_R_32F,
-			CUSPARSE_ORDER_COL);   // matches cuSPARSE’s lda convention
+		cusparseCreateDnMat(matB, m, n, lda, A, CUDA_R_32F, CUSPARSE_ORDER_COL);
 
-		/* ------------------------------------------------------------------ */
-		/* 2. Query workspace size                                            */
-		/* ------------------------------------------------------------------ */
-		long[] bufSize = {0};
+		// Determine buffer size
 		int alg = CUSPARSE_SPARSETODENSE_ALG_DEFAULT;
+		long[] bufSize = {0};
+		cusparseSparseToDense_bufferSize(handle, spMatA.asConst(), matB, alg,
+			bufSize);    //bufSize[0] now holds the exact byte count
 
-		int status = JCusparse.cusparseSparseToDense_bufferSize(handle, matA.asConst(), matB, alg, bufSize);
-		if(status != CUSPARSE_STATUS_SUCCESS) {
-			JCusparse.cusparseDestroyDnMat(matB.asConst());
-			JCusparse.cusparseDestroySpMat(matA.asConst());
-			return status;
-		}
-
-		/* ------------------------------------------------------------------ */
-		/* 3. Allocate workspace (if needed)                                  */
-		/* ------------------------------------------------------------------ */
-		Pointer buffer = null;
+		// Allocate scratch space of the requested size
+		Pointer dBuffer = new Pointer();
 		if(bufSize[0] > 0) {
-			buffer = new Pointer();
-			cudaMalloc(buffer, bufSize[0]);
+			cudaMalloc(dBuffer, bufSize[0]);
 		}
-
 		try {
-			/* -------------------------------------------------------------- */
-			/* 4. Perform CSR -> dense conversion                            */
-			/* -------------------------------------------------------------- */
-			status = JCusparse.cusparseSparseToDense(handle, matA.asConst(), matB, alg, buffer);
-
-			return status;
+			// Write dense matrix
+			int algSparseToDense = CUSPARSE_SPARSETODENSE_ALG_DEFAULT;
+			return cusparseSparseToDense(handle, spMatA.asConst(), matB, algSparseToDense, dBuffer);
 		}
 		finally {
-			/* -------------------------------------------------------------- */
-			/* 5. Cleanup                                                     */
-			/* -------------------------------------------------------------- */
-			if(buffer != null)
-				cudaFree(buffer);
-			JCusparse.cusparseDestroyDnMat(matB.asConst());
-			JCusparse.cusparseDestroySpMat(matA.asConst());
+			if(bufSize[0] > 0)
+				cudaFree(dBuffer);
+			cusparseDestroyDnMat(matB.asConst());
+			cusparseDestroySpMat(spMatA.asConst());
 		}
+
 	}
 
 	@Override

From fc9f56e06c6a73c6ac423787704141f7bae89133 Mon Sep 17 00:00:00 2001
From: ReneEnjilian <enjilianrene@gmail.com>
Date: Sun, 15 Jun 2025 02:43:37 +0200
Subject: [PATCH 04/26] ensure rows are correctly sorted

---
 .../instructions/gpu/context/CSRPointer.java  | 40 ++++++++++++++++++-
 1 file changed, 39 insertions(+), 1 deletion(-)

diff --git a/src/main/java/org/apache/sysds/runtime/instructions/gpu/context/CSRPointer.java b/src/main/java/org/apache/sysds/runtime/instructions/gpu/context/CSRPointer.java
index 1cfd946de47..61890af2c65 100644
--- a/src/main/java/org/apache/sysds/runtime/instructions/gpu/context/CSRPointer.java
+++ b/src/main/java/org/apache/sysds/runtime/instructions/gpu/context/CSRPointer.java
@@ -211,15 +211,18 @@ public static void copyPtrToHost(CSRPointer src, int rows, long nnz, int[] rowPt
 	 * @return CSR (compressed sparse row) pointer
 	 */
 	public static CSRPointer allocateForDgeam(GPUContext gCtx, cusparseHandle handle, CSRPointer A, CSRPointer B, int m, int n) {
-		if (A.nnz >= Integer.MAX_VALUE || B.nnz >= Integer.MAX_VALUE)
+		if(A.nnz >= Integer.MAX_VALUE || B.nnz >= Integer.MAX_VALUE)
 			throw new DMLRuntimeException("Number of non zeroes is larger than supported by cuSparse");
 		CSRPointer C = new CSRPointer(gCtx);
 		step1AllocateRowPointers(gCtx, handle, C, m);
+		ensureSorted(handle, m, n, toIntExact(A.nnz), A.rowPtr, A.colInd, A.descr);
+		ensureSorted(handle, m, n, toIntExact(B.nnz), B.rowPtr, B.colInd, B.descr);
 		step2GatherNNZGeam(gCtx, handle, A, B, C, m, n);
 		step3AllocateValNInd(gCtx, handle, C);
 		return C;
 	}
 
+
 	/**
 	 * Estimates the number of non-zero elements from the result of a sparse matrix multiplication C = A * B
 	 * and returns the {@link CSRPointer} to C with the appropriate GPU memory.
@@ -452,6 +455,41 @@ private static void step3AllocateValNInd(GPUContext gCtx, cusparseHandle handle,
 		C.colInd = gCtx.allocate(null, getIntSizeOf(C.nnz), false);
 	}
 
+	/**
+	 * Sort rows of matrix.
+	 *
+	 * @param handle
+	 * @param m
+	 * @param n
+	 * @param nnz
+	 * @param rowPtr
+	 * @param colInd
+	 * @param descr
+	 */
+	private static void ensureSorted(cusparseHandle handle, int m, int n, int nnz, Pointer rowPtr, Pointer colInd,
+		cusparseMatDescr descr) {
+
+		// 1. workspace size for sorting
+		long[] bufSize = {0};
+		JCusparse.cusparseXcsrsort_bufferSizeExt(handle, m, n, nnz, rowPtr, colInd, bufSize);
+
+		Pointer work = new Pointer();
+		if(bufSize[0] > 0)
+			cudaMalloc(work, bufSize[0]);
+
+		// 2. create a permutation array (can be NULL if you don’t care)
+		Pointer P = new Pointer();
+		cudaMalloc(P, (long) nnz * Sizeof.INT);
+		JCusparse.cusparseCreateIdentityPermutation(handle, nnz, P);
+
+		// 3. sort in-place
+		JCusparse.cusparseXcsrsort(handle, m, n, nnz, descr, rowPtr, colInd, P, work);
+
+		cudaFree(P);
+		if(bufSize[0] > 0)
+			cudaFree(work);
+	}
+
 	// ==============================================================================================
 
 	// The following methods estimate the memory needed for sparse matrices that are

From 7b1e0b9cb24c0e73843bd0e604404ae9dadd2cf8 Mon Sep 17 00:00:00 2001
From: ReneEnjilian <enjilianrene@gmail.com>
Date: Sun, 15 Jun 2025 02:54:18 +0200
Subject: [PATCH 05/26] use checkGPU() in GPUFullReuseTest again after all
 tests pass now

---
 .../apache/sysds/test/functions/lineage/GPUFullReuseTest.java | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/test/java/org/apache/sysds/test/functions/lineage/GPUFullReuseTest.java b/src/test/java/org/apache/sysds/test/functions/lineage/GPUFullReuseTest.java
index 841511b31f0..74bd7fc47de 100644
--- a/src/test/java/org/apache/sysds/test/functions/lineage/GPUFullReuseTest.java
+++ b/src/test/java/org/apache/sysds/test/functions/lineage/GPUFullReuseTest.java
@@ -45,12 +45,12 @@ public class GPUFullReuseTest extends AutomatedTestBase{
 	protected static final int TEST_VARIANTS = 4;
 	protected String TEST_CLASS_DIR = TEST_DIR + GPUFullReuseTest.class.getSimpleName() + "/";
 
-	/*@BeforeClass
+	@BeforeClass
 	public static void checkGPU() {
 		// Skip all the tests if no GPU is available
 		// FIXME: Fails to skip if gpu available but no libraries
 		Assume.assumeTrue(TestUtils.isGPUAvailable() == cudaError.cudaSuccess);
-	}*/
+	}
 
 	@Override
 	public void setUp() {

From 39eb0ab2aede88b05726f884fb79fcbd42854683 Mon Sep 17 00:00:00 2001
From: ReneEnjilian <enjilianrene@gmail.com>
Date: Tue, 17 Jun 2025 03:49:00 +0200
Subject: [PATCH 06/26] rework GEMM

---
 .../instructions/gpu/context/CSRPointer.java  | 171 +++++++++---------
 .../matrix/data/CudaSupportFunctions.java     |  14 +-
 .../DoublePrecisionCudaSupportFunctions.java  |  99 +---------
 .../matrix/data/LibMatrixCuMatMult.java       |   9 +-
 .../SinglePrecisionCudaSupportFunctions.java  |  96 +---------
 .../functions/lineage/GPUFullReuseTest.java   |   4 +-
 6 files changed, 114 insertions(+), 279 deletions(-)

diff --git a/src/main/java/org/apache/sysds/runtime/instructions/gpu/context/CSRPointer.java b/src/main/java/org/apache/sysds/runtime/instructions/gpu/context/CSRPointer.java
index 61890af2c65..fc583b93483 100644
--- a/src/main/java/org/apache/sysds/runtime/instructions/gpu/context/CSRPointer.java
+++ b/src/main/java/org/apache/sysds/runtime/instructions/gpu/context/CSRPointer.java
@@ -36,6 +36,10 @@
 import static jcuda.jcusparse.cusparseMatrixType.CUSPARSE_MATRIX_TYPE_GENERAL;
 import static jcuda.runtime.JCuda.*;
 import static jcuda.runtime.cudaMemcpyKind.*;
+import static jcuda.jcusparse.cusparseIndexType.CUSPARSE_INDEX_32I;
+import static jcuda.cudaDataType.CUDA_R_64F;
+import static jcuda.cudaDataType.CUDA_R_32F;
+import static jcuda.jcusparse.cusparseSpGEMMAlg.CUSPARSE_SPGEMM_DEFAULT;
 
 /**
  * Compressed Sparse Row (CSR) format for CUDA
@@ -88,6 +92,16 @@ public class CSRPointer {
 	 */
 	public cusparseMatDescr descr;
 
+	/**
+	 * descriptor of sparse matrix
+	 */
+	public cusparseSpMatDescr spMatDescr;
+
+	/**
+	 * descriptor for sparse GEMM, only use for result matrix.
+	 */
+	public cusparseSpGEMMDescr spgemmDesc;
+
 	/**
 	 * Default constructor to help with Factory method {@link #allocateEmpty(GPUContext, long, long)}
 	 *
@@ -98,6 +112,8 @@ private CSRPointer(GPUContext gCtx) {
 		val = new Pointer();
 		rowPtr = new Pointer();
 		colInd = new Pointer();
+		spMatDescr = new cusparseSpMatDescr();
+		spgemmDesc = null;
 		allocateMatDescrPointer();
 	}
 
@@ -239,12 +255,13 @@ public static CSRPointer allocateForDgeam(GPUContext gCtx, cusparseHandle handle
 	 * @return a {@link CSRPointer} instance that encapsulates the CSR matrix on GPU
 	 */
 	public static CSRPointer allocateForMatrixMultiply(GPUContext gCtx, cusparseHandle handle, CSRPointer A, int transA,
-			CSRPointer B, int transB, int m, int n, int k) {
-		// Following the code example at http://docs.nvidia.com/cuda/cusparse/#cusparse-lt-t-gt-csrgemm and at
-		// https://github.com/jcuda/jcuda-matrix-utils/blob/master/JCudaMatrixUtils/src/test/java/org/jcuda/matrix/samples/JCusparseSampleDgemm.java
+		CSRPointer B, int transB, int m, int n, int k, int sizeOfDataType) {
+
 		CSRPointer C = new CSRPointer(gCtx);
 		step1AllocateRowPointers(gCtx, handle, C, m);
-		step2GatherNNZGemm(gCtx, handle, A, transA, B, transB, C, m, n, k);
+		ensureSorted(handle, m, n, toIntExact(A.nnz), A.rowPtr, A.colInd, A.descr);
+		ensureSorted(handle, m, n, toIntExact(B.nnz), B.rowPtr, B.colInd, B.descr);
+		step2GatherNNZGemm(gCtx, handle, A, transA, B, transB, C, m, n, k, sizeOfDataType);
 		step3AllocateValNInd(gCtx, handle, C);
 		return C;
 	}
@@ -352,92 +369,73 @@ private static void step2GatherNNZGeam(GPUContext gCtx, cusparseHandle handle, C
 	 */
 
 	private static void step2GatherNNZGemm(GPUContext gCtx, cusparseHandle handle, CSRPointer A, int transA,
-		CSRPointer B, int transB, CSRPointer C, int m, int n, int k)            // C = op(A)·op(B)  (m×k)·(k×n)
+		CSRPointer B, int transB, CSRPointer C, int m, int n, int k,
+		int sizeOfDataType)            // C = op(A)·op(B)  (m×k)·(k×n)
 	{
 		LOG.trace("GPU : step2GatherNNZGemm (SpGEMM), GPUContext=" + gCtx);
 
-		/* ---------- quick guard ---------------------------------------- */
+		// Ensure that NNZ does not exceed limit
 		if(A.nnz >= Integer.MAX_VALUE || B.nnz >= Integer.MAX_VALUE)
 			throw new DMLRuntimeException("Number of non-zeros exceeds cuSPARSE 32-bit limit");
 
-		/* ---------- 1. CSR descriptors for A, B, C --------------------- */
-		cusparseSpMatDescr matA = new cusparseSpMatDescr();
-		cusparseSpMatDescr matB = new cusparseSpMatDescr();
-		cusparseSpMatDescr matC = new cusparseSpMatDescr();
-
-		cusparseCreateCsr(matA, m, k, A.nnz, A.rowPtr, A.colInd, A.val, cusparseIndexType.CUSPARSE_INDEX_32I,
-			cusparseIndexType.CUSPARSE_INDEX_32I, cusparseIndexBase.CUSPARSE_INDEX_BASE_ZERO, cudaDataType.CUDA_R_64F);
-
-		cusparseCreateCsr(matB, k, n, B.nnz, B.rowPtr, B.colInd, B.val, cusparseIndexType.CUSPARSE_INDEX_32I,
-			cusparseIndexType.CUSPARSE_INDEX_32I, cusparseIndexBase.CUSPARSE_INDEX_BASE_ZERO, cudaDataType.CUDA_R_64F);
-
-		cusparseCreateCsr(matC, m, n, 0L,                 // nnz(C) unknown
-			C.rowPtr, Pointer.to(new int[] {0}), Pointer.to(new double[] {0}), cusparseIndexType.CUSPARSE_INDEX_32I,
-			cusparseIndexType.CUSPARSE_INDEX_32I, cusparseIndexBase.CUSPARSE_INDEX_BASE_ZERO, cudaDataType.CUDA_R_64F);
-
-		/* ---------- 2. SpGEMM descriptor ------------------------------- */
-		cusparseSpGEMMDescr spgemmDesc = new cusparseSpGEMMDescr();
-		cusparseSpGEMM_createDescr(spgemmDesc);
-
-		Pointer alpha = Pointer.to(new double[] {1.0});
-		Pointer beta = Pointer.to(new double[] {0.0});
-		int alg = cusparseSpGEMMAlg.CUSPARSE_SPGEMM_DEFAULT;
-
-		/* ---------- 3. Phase-1 : work-estimation ----------------------- */
-		long[] bufSize1 = {0};
-		cusparseSpGEMM_workEstimation(handle, transA, transB, alpha, matA.asConst(), matB.asConst(), beta, matC,
-			cudaDataType.CUDA_R_64F, alg, spgemmDesc, bufSize1, null);                               // first query
-
-		Pointer dBuf1 = new Pointer();
-		if(bufSize1[0] > 0)
-			cudaMalloc(dBuf1, bufSize1[0]);
-
-		cusparseSpGEMM_workEstimation(handle, transA, transB, alpha, matA.asConst(), matB.asConst(), beta, matC,
-			cudaDataType.CUDA_R_64F, alg, spgemmDesc, bufSize1, dBuf1);                              // real run
-
-		/* ---------- 4. Phase-2 : compute structure / nnz --------------- */
-		long[] bufSize2 = {0};
-		cusparseSpGEMM_compute(                           // size query
-			handle, transA, transB, alpha, matA.asConst(), matB.asConst(), beta, matC, cudaDataType.CUDA_R_64F, alg,
-			spgemmDesc, bufSize2, null);                              // ← 13 args
-
-		Pointer dBuf2 = new Pointer();
-		if(bufSize2[0] > 0)
-			cudaMalloc(dBuf2, bufSize2[0]);
-
-		cusparseSpGEMM_compute(                           // actual compute
-			handle, transA, transB, alpha, matA.asConst(), matB.asConst(), beta, matC, cudaDataType.CUDA_R_64F, alg,
-			spgemmDesc, bufSize2, dBuf2);
-
-		/* ---------- 5. read nnz(C) ------------------------------------- */
-		long[] rows = {0}, cols = {0}, nnz = {0};
-		cusparseSpMatGetSize(matC.asConst(), rows, cols, nnz);
-		C.nnz = (int) nnz[0];
-
-		/* ---------- 6. temp col/val arrays so COPY can write them ------ */
-		Pointer dCcol = new Pointer();
-		Pointer dCval = new Pointer();
-		if(C.nnz > 0) {
-			cudaMalloc(dCcol, C.nnz * Sizeof.INT);
-			cudaMalloc(dCval, C.nnz * Sizeof.DOUBLE);
-		}
-		cusparseCsrSetPointers(matC, C.rowPtr, dCcol, dCval);
-
-		/* ---------- 7. Phase-3 : copy final CSR into user arrays ------- */
-		cusparseSpGEMM_copy(                              // ← 11 args
-			handle, transA, transB, alpha, matA.asConst(), matB.asConst(), beta, matC, cudaDataType.CUDA_R_64F, alg,
-			spgemmDesc);
-
-		/* ---------- 8. clean-up --------------------------------------- */
-		cudaFree(dCcol);
-		cudaFree(dCval);
-		cudaFree(dBuf1);
-		cudaFree(dBuf2);
-
-		cusparseSpGEMM_destroyDescr(spgemmDesc);
-		cusparseDestroySpMat(matA.asConst());
-		cusparseDestroySpMat(matB.asConst());
-		cusparseDestroySpMat(matC.asConst());
+		// set correct floating point precision, default is double
+		int dataType = CUDA_R_64F;
+		if(sizeOfDataType == 4)
+			dataType = CUDA_R_32F;
+
+		// Get index base -> 0 or 1
+		int baseA = cusparseGetMatIndexBase(A.descr);
+		int baseB = cusparseGetMatIndexBase(B.descr);
+		int baseC = cusparseGetMatIndexBase(C.descr);
+
+		// cuSPARSE 12 requires using cusparseSpMatDescr
+		// We do not recreate the existing matrices but only register the data with
+		// A.spMatDescr, B.spMatDescr, C.spMatDescr descriptors are merely registrations of the existing CSR arrays so cuSPARSE can reference them
+		cusparseCreateCsr(A.spMatDescr, m, k, A.nnz, A.rowPtr, A.colInd, A.val, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
+			baseA, dataType);
+		cusparseCreateCsr(B.spMatDescr, k, n, B.nnz, B.rowPtr, B.colInd, B.val, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
+			baseB, dataType);
+		cusparseCreateCsr(C.spMatDescr, m, n, 0, C.rowPtr, null, null, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, baseC,
+			dataType);
+
+		// SpGEMM Computation
+		C.spgemmDesc = new cusparseSpGEMMDescr();
+		cusparseSpGEMM_createDescr(C.spgemmDesc);
+		double[] alpha = {1.0};
+		double[] beta = {0.0};
+		Pointer alphaPtr = Pointer.to(alpha);
+		Pointer betaPtr = Pointer.to(beta);
+		int alg = CUSPARSE_SPGEMM_DEFAULT;
+
+		// ask bufferSize1 bytes for external memory
+		long[] bufferSize1 = {0};
+		cusparseSpGEMM_workEstimation(handle, transA, transB, alphaPtr, A.spMatDescr.asConst(), B.spMatDescr.asConst(),
+			betaPtr, C.spMatDescr, dataType, alg, C.spgemmDesc, bufferSize1, null);
+		Pointer buffer1 = new Pointer();
+		if(bufferSize1[0] > 0)
+			cudaMalloc(buffer1, bufferSize1[0]);
+		// inspect the matrices A and B to understand the memory requirement for the next step
+		cusparseSpGEMM_workEstimation(handle, transA, transB, alphaPtr, A.spMatDescr.asConst(), B.spMatDescr.asConst(),
+			betaPtr, C.spMatDescr, dataType, alg, C.spgemmDesc, bufferSize1, buffer1);
+
+		// ask bufferSize2 bytes for external memory
+		long[] bufferSize2 = {0};
+		cusparseSpGEMM_compute(handle, transA, transB, alphaPtr, A.spMatDescr.asConst(), B.spMatDescr.asConst(),
+			betaPtr, C.spMatDescr, dataType, alg, C.spgemmDesc, bufferSize2, null);
+		Pointer buffer2 = new Pointer();
+		if(bufferSize2[0] > 0)
+			cudaMalloc(buffer2, bufferSize2[0]);
+
+		// compute the intermediate product of A * B
+		cusparseSpGEMM_compute(handle, transA, transB, alphaPtr, A.spMatDescr.asConst(), B.spMatDescr.asConst(),
+			betaPtr, C.spMatDescr, dataType, alg, C.spgemmDesc, bufferSize2, buffer2);
+
+		// obtain nnz of C
+		long[] rows = {0};
+		long[] cols = {0};
+		long[] nnz = {0};
+		cusparseSpMatGetSize(C.spMatDescr.asConst(), rows, cols, nnz);
+		C.nnz = nnz[0];
 	}
 
 	/**
@@ -453,6 +451,11 @@ private static void step3AllocateValNInd(GPUContext gCtx, cusparseHandle handle,
 
 		C.val = gCtx.allocate(null, getDataTypeSizeOf(C.nnz), false);
 		C.colInd = gCtx.allocate(null, getIntSizeOf(C.nnz), false);
+
+		// special case for sparse GEMM
+		// requires cusparseSpGEMMDescr since CUDA 12
+		if(C.spgemmDesc != null)
+			cusparseCsrSetPointers(C.spMatDescr, C.rowPtr, C.colInd, C.val);
 	}
 
 	/**
diff --git a/src/main/java/org/apache/sysds/runtime/matrix/data/CudaSupportFunctions.java b/src/main/java/org/apache/sysds/runtime/matrix/data/CudaSupportFunctions.java
index 369a2efae0f..29814a55fa4 100644
--- a/src/main/java/org/apache/sysds/runtime/matrix/data/CudaSupportFunctions.java
+++ b/src/main/java/org/apache/sysds/runtime/matrix/data/CudaSupportFunctions.java
@@ -24,6 +24,8 @@
 import jcuda.jcusparse.cusparseHandle;
 import jcuda.jcusparse.cusparseMatDescr;
 
+import jcuda.jcusparse.cusparseSpGEMMDescr;
+import jcuda.jcusparse.cusparseSpMatDescr;
 import org.apache.sysds.runtime.instructions.gpu.context.GPUContext;
 
 import jcuda.Pointer;
@@ -59,12 +61,12 @@
  */
 public interface CudaSupportFunctions {
 	public static boolean PERFORM_CONVERSION_ON_DEVICE = true;
-	public int cusparsecsrgemm(cusparseHandle handle, int transA, int transB, int m, int n, int k, 
-			cusparseMatDescr descrA, int nnzA, Pointer csrValA, Pointer csrRowPtrA, Pointer csrColIndA, 
-			cusparseMatDescr descrB, int nnzB, Pointer csrValB, Pointer csrRowPtrB, Pointer csrColIndB, 
-			cusparseMatDescr descrC, Pointer csrValC, Pointer csrRowPtrC, Pointer csrColIndC);
-	public int	cublasgeam(cublasHandle handle, int transa, int transb, int m, int n, jcuda.Pointer alpha, jcuda.Pointer A, 
-			int lda, jcuda.Pointer beta, jcuda.Pointer B, int ldb, jcuda.Pointer C, int ldc);
+
+	int cusparsecsrgemm(cusparseHandle handle, int transA, int transB, int alg, cusparseSpMatDescr spMatDescrA,
+		cusparseSpMatDescr spMatDescrB, cusparseSpMatDescr spMatDescrC, cusparseSpGEMMDescr spgemmDescr);
+
+	int cublasgeam(cublasHandle handle, int transa, int transb, int m, int n, jcuda.Pointer alpha,
+		jcuda.Pointer A, int lda, jcuda.Pointer beta, jcuda.Pointer B, int ldb, jcuda.Pointer C, int ldc);
 	public int	cusparsecsrmv(cusparseHandle handle, int transA, int m, int n, int nnz, jcuda.Pointer alpha, cusparseMatDescr descrA, jcuda.Pointer csrValA, jcuda.Pointer csrRowPtrA, jcuda.Pointer csrColIndA, 
 			jcuda.Pointer x, jcuda.Pointer beta, jcuda.Pointer y);
 	public int	cusparsecsrmm2(cusparseHandle handle, int transa, int transb, int m, int n, int k, int nnz, jcuda.Pointer alpha, cusparseMatDescr descrA, jcuda.Pointer csrValA, jcuda.Pointer csrRowPtrA, jcuda.Pointer csrColIndA, 
diff --git a/src/main/java/org/apache/sysds/runtime/matrix/data/DoublePrecisionCudaSupportFunctions.java b/src/main/java/org/apache/sysds/runtime/matrix/data/DoublePrecisionCudaSupportFunctions.java
index 27696a2abd7..5e69f8304cd 100644
--- a/src/main/java/org/apache/sysds/runtime/matrix/data/DoublePrecisionCudaSupportFunctions.java
+++ b/src/main/java/org/apache/sysds/runtime/matrix/data/DoublePrecisionCudaSupportFunctions.java
@@ -19,11 +19,9 @@
 package org.apache.sysds.runtime.matrix.data;
 
 import static jcuda.jcusparse.JCusparse.*;
-import static jcuda.runtime.JCuda.cudaMemcpy;
+import static jcuda.runtime.JCuda.*;
 import static jcuda.runtime.cudaMemcpyKind.cudaMemcpyDeviceToHost;
 import static jcuda.runtime.cudaMemcpyKind.cudaMemcpyHostToDevice;
-import static jcuda.runtime.JCuda.cudaMalloc;
-import static jcuda.runtime.JCuda.cudaFree;
 
 import jcuda.jcusparse.cusparseSpMatDescr;
 import jcuda.jcusparse.cusparseSpGEMMDescr;
@@ -63,95 +61,12 @@ public class DoublePrecisionCudaSupportFunctions implements CudaSupportFunctions
 	private static final Log LOG = LogFactory.getLog(DoublePrecisionCudaSupportFunctions.class.getName());
 
 	@Override
-	public int cusparsecsrgemm(cusparseHandle handle, int transA, int transB, int m, int n, int k,
-		cusparseMatDescr descrA, int nnzA, Pointer csrValA, Pointer csrRowPtrA, Pointer csrColIndA,
-		cusparseMatDescr descrB, int nnzB, Pointer csrValB, Pointer csrRowPtrB, Pointer csrColIndB,
-		cusparseMatDescr descrC, Pointer csrValC, Pointer csrRowPtrC, Pointer csrColIndC) {
-		/* ------------------------------------------------------------------ */
-		/* 0.   Wrap A, B, C in the new SpMat descriptors                     */
-		/* ------------------------------------------------------------------ */
-		cusparseSpMatDescr matA = new cusparseSpMatDescr();
-		cusparseSpMatDescr matB = new cusparseSpMatDescr();
-		cusparseSpMatDescr matC = new cusparseSpMatDescr();
-
-		cusparseCreateCsr(matA, m, k, nnzA, csrRowPtrA, csrColIndA, csrValA, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
-			CUSPARSE_INDEX_BASE_ZERO, CUDA_R_64F);
-
-		cusparseCreateCsr(matB, k, n, nnzB, csrRowPtrB, csrColIndB, csrValB, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
-			CUSPARSE_INDEX_BASE_ZERO, CUDA_R_64F);
-
-    /*  C’s nnz is not known yet -> start with 0 and rowPtr only.
-        colInd / val arrays are already allocated by the caller.         */
-		cusparseCreateCsr(matC, m, n, 0L, csrRowPtrC, csrColIndC, csrValC, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
-			CUSPARSE_INDEX_BASE_ZERO, CUDA_R_64F);
-
-		/* ------------------------------------------------------------------ */
-		/* 1.   Create & configure the SpGEMM descriptor                      */
-		/* ------------------------------------------------------------------ */
-		cusparseSpGEMMDescr spgemm = new cusparseSpGEMMDescr();
-		cusparseSpGEMM_createDescr(spgemm);
-
-		Pointer alpha = Pointer.to(new double[] {1.0});
-		Pointer beta = Pointer.to(new double[] {0.0});
-		int alg = CUSPARSE_SPGEMM_DEFAULT;
-		int computeTp = CUDA_R_64F;
-
-		/* ------------------------------------------------------------------ */
-		/* 2.   Phase-1 : work-estimation                                     */
-		/* ------------------------------------------------------------------ */
-		long[] bufSz1 = {0};
-		int status = JCusparse.cusparseSpGEMM_workEstimation(handle, transA, transB, alpha, matA.asConst(),
-			matB.asConst(), beta, matC, computeTp, alg, spgemm, bufSz1, null);
-		if(status != CUSPARSE_STATUS_SUCCESS)
-			return status;
-
-		Pointer dBuf1 = new Pointer();
-		if(bufSz1[0] > 0)
-			cudaMalloc(dBuf1, bufSz1[0]);
-
-		status = JCusparse.cusparseSpGEMM_workEstimation(handle, transA, transB, alpha, matA.asConst(), matB.asConst(),
-			beta, matC, computeTp, alg, spgemm, bufSz1, dBuf1);
-		if(status != CUSPARSE_STATUS_SUCCESS)
-			return status;
-
-		/* ------------------------------------------------------------------ */
-		/* 3.   Phase-2 : compute structure / nnz(C)                          */
-		/* ------------------------------------------------------------------ */
-		long[] bufSz2 = {0};
-		status = JCusparse.cusparseSpGEMM_compute(handle, transA, transB, alpha, matA.asConst(), matB.asConst(), beta,
-			matC, computeTp, alg, spgemm, bufSz2, null);                       // query required buffer
-		if(status != CUSPARSE_STATUS_SUCCESS)
-			return status;
-
-		Pointer dBuf2 = new Pointer();
-		if(bufSz2[0] > 0)
-			cudaMalloc(dBuf2, bufSz2[0]);
-
-		status = JCusparse.cusparseSpGEMM_compute(handle, transA, transB, alpha, matA.asConst(), matB.asConst(), beta,
-			matC, computeTp, alg, spgemm, bufSz2, dBuf2);                      // real compute
-		if(status != CUSPARSE_STATUS_SUCCESS)
-			return status;
-
-		/* ------------------------------------------------------------------ */
-		/* 4.   Phase-3 : copy final CSR data into caller-provided buffers    */
-		/* ------------------------------------------------------------------ */
-		status = JCusparse.cusparseSpGEMM_copy(handle, transA, transB, alpha, matA.asConst(), matB.asConst(), beta,
-			matC, computeTp, alg, spgemm);
-		/* fall-through to CLEAN block */
-
-
-		/* ------------------------------------------------------------------ */
-		/* 5.   Cleanup                                                         */
-		/* ------------------------------------------------------------------ */
-		cudaFree(dBuf1);
-		cudaFree(dBuf2);
-
-		JCusparse.cusparseSpGEMM_destroyDescr(spgemm);
-		JCusparse.cusparseDestroySpMat(matA.asConst());
-		JCusparse.cusparseDestroySpMat(matB.asConst());
-		JCusparse.cusparseDestroySpMat(matC.asConst());
-
-		return status;
+	public int cusparsecsrgemm(cusparseHandle handle, int transA, int transB, int alg, cusparseSpMatDescr spMatDescrA,
+		cusparseSpMatDescr spMatDescrB, cusparseSpMatDescr spMatDescrC, cusparseSpGEMMDescr spgemmDescr) {
+		double[] alpha = {1.0}, beta = {0.0};
+		Pointer alphaPtr = Pointer.to(alpha), betaPtr = Pointer.to(beta);
+		return cusparseSpGEMM_copy(handle, transA, transB, alphaPtr, spMatDescrA.asConst(), spMatDescrB.asConst(),
+			betaPtr, spMatDescrC, CUDA_R_64F, alg, spgemmDescr);
 	}
 
 	@Override
diff --git a/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixCuMatMult.java b/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixCuMatMult.java
index 5753041a622..64d57159342 100644
--- a/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixCuMatMult.java
+++ b/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixCuMatMult.java
@@ -22,6 +22,7 @@
 import static jcuda.jcusparse.cusparseOperation.CUSPARSE_OPERATION_TRANSPOSE;
 import static jcuda.runtime.JCuda.cudaMemcpy;
 import static jcuda.runtime.cudaMemcpyKind.cudaMemcpyHostToDevice;
+import static jcuda.jcusparse.cusparseSpMMAlg.CUSPARSE_SPMM_ALG_DEFAULT;
 import jcuda.Pointer;
 
 import org.apache.commons.logging.Log;
@@ -154,18 +155,16 @@ public static MatrixObject matmult(ExecutionContext ec, GPUContext gCtx, String
 
 			// Step 1: Allocate output => sparse format
 			ec.allocateGPUMatrixObject(outputName, outRLen, outCLen);
-
 			// Step 2: Get the handles to sparse/dense pointers for left, right
 			// and output
 			CSRPointer A = left.getGPUObject(gCtx).getJcudaSparseMatrixPtr();
 			CSRPointer B = right.getGPUObject(gCtx).getJcudaSparseMatrixPtr();
 			CSRPointer C = CSRPointer.allocateForMatrixMultiply(gCtx, getCusparseHandle(gCtx), A, transa, B, transb,
-					params.m, params.n, params.k);
+				params.m, params.n, params.k, sizeOfDataType);
 		
 			// Step 3: Invoke the kernel
-			cudaSupportFunctions.cusparsecsrgemm(getCusparseHandle(gCtx), transa, transb, params.m, params.n, params.k, A.descr,
-					(int) A.nnz, A.val, A.rowPtr, A.colInd, B.descr, (int) B.nnz, B.val, B.rowPtr, B.colInd, C.descr,
-					C.val, C.rowPtr, C.colInd);
+			cudaSupportFunctions.cusparsecsrgemm(getCusparseHandle(gCtx), transa, transb, CUSPARSE_SPMM_ALG_DEFAULT,
+				A.spMatDescr, B.spMatDescr, C.spMatDescr, C.spgemmDesc);
 			output.getGPUObject(gCtx).setSparseMatrixCudaPointer(C);
 			// -------------------------------------------------------------------------------------
 		} else if (!isM1Sparse && isM2Sparse) {
diff --git a/src/main/java/org/apache/sysds/runtime/matrix/data/SinglePrecisionCudaSupportFunctions.java b/src/main/java/org/apache/sysds/runtime/matrix/data/SinglePrecisionCudaSupportFunctions.java
index 10f66407cac..5e9977d5a60 100644
--- a/src/main/java/org/apache/sysds/runtime/matrix/data/SinglePrecisionCudaSupportFunctions.java
+++ b/src/main/java/org/apache/sysds/runtime/matrix/data/SinglePrecisionCudaSupportFunctions.java
@@ -71,96 +71,12 @@ public class SinglePrecisionCudaSupportFunctions implements CudaSupportFunctions
 	private static final Log LOG = LogFactory.getLog(SinglePrecisionCudaSupportFunctions.class.getName());
 
 	@Override
-	public int cusparsecsrgemm(cusparseHandle handle, int transA, int transB, int m, int n, int k,
-		cusparseMatDescr descrA, int nnzA, Pointer csrValA, Pointer csrRowPtrA, Pointer csrColIndA,
-		cusparseMatDescr descrB, int nnzB, Pointer csrValB, Pointer csrRowPtrB, Pointer csrColIndB,
-		cusparseMatDescr descrC, Pointer csrValC, Pointer csrRowPtrC, Pointer csrColIndC) {
-		/* ------------------------------------------------------------------ */
-		/* Descriptors and temporaries                                        */
-		/* ------------------------------------------------------------------ */
-		cusparseSpMatDescr matA = new cusparseSpMatDescr();
-		cusparseSpMatDescr matB = new cusparseSpMatDescr();
-		cusparseSpMatDescr matC = new cusparseSpMatDescr();
-		cusparseSpGEMMDescr spgemm = new cusparseSpGEMMDescr();
-
-		Pointer dBuf1 = null;
-		Pointer dBuf2 = null;
-		int status;
-
-		try {
-			/* Create CSR descriptors (FP32, 32-bit indices) ---------------- */
-			cusparseCreateCsr(matA, m, k, nnzA, csrRowPtrA, csrColIndA, csrValA, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
-				CUSPARSE_INDEX_BASE_ZERO, CUDA_R_32F);
-
-			cusparseCreateCsr(matB, k, n, nnzB, csrRowPtrB, csrColIndB, csrValB, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
-				CUSPARSE_INDEX_BASE_ZERO, CUDA_R_32F);
-
-			cusparseCreateCsr(matC, m, n, 0L,            // nnz unknown yet
-				csrRowPtrC, csrColIndC, csrValC, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO,
-				CUDA_R_32F);
-
-			/* SpGEMM descriptor ------------------------------------------- */
-			cusparseSpGEMM_createDescr(spgemm);
-
-			Pointer alpha = Pointer.to(new float[] {1.0f});
-			Pointer beta = Pointer.to(new float[] {0.0f});
-			int alg = CUSPARSE_SPGEMM_DEFAULT;
-			int type = CUDA_R_32F;
-
-			/* -------- Phase-1 : work-estimation -------------------------- */
-			long[] bufSz1 = {0};
-			status = JCusparse.cusparseSpGEMM_workEstimation(handle, transA, transB, alpha, matA.asConst(),
-				matB.asConst(), beta, matC, type, alg, spgemm, bufSz1, null);
-			if(status != CUSPARSE_STATUS_SUCCESS)
-				return status;
-
-			if(bufSz1[0] > 0) {
-				dBuf1 = new Pointer();
-				cudaMalloc(dBuf1, bufSz1[0]);
-			}
-
-			status = JCusparse.cusparseSpGEMM_workEstimation(handle, transA, transB, alpha, matA.asConst(),
-				matB.asConst(), beta, matC, type, alg, spgemm, bufSz1, dBuf1);
-			if(status != CUSPARSE_STATUS_SUCCESS)
-				return status;
-
-			/* -------- Phase-2 : compute ---------------------------------- */
-			long[] bufSz2 = {0};
-			status = JCusparse.cusparseSpGEMM_compute(handle, transA, transB, alpha, matA.asConst(), matB.asConst(),
-				beta, matC, type, alg, spgemm, bufSz2, null);
-			if(status != CUSPARSE_STATUS_SUCCESS)
-				return status;
-
-			if(bufSz2[0] > 0) {
-				dBuf2 = new Pointer();
-				cudaMalloc(dBuf2, bufSz2[0]);
-			}
-
-			status = JCusparse.cusparseSpGEMM_compute(handle, transA, transB, alpha, matA.asConst(), matB.asConst(),
-				beta, matC, type, alg, spgemm, bufSz2, dBuf2);
-			if(status != CUSPARSE_STATUS_SUCCESS)
-				return status;
-
-			/* -------- Phase-3 : copy result ------------------------------ */
-			status = JCusparse.cusparseSpGEMM_copy(handle, transA, transB, alpha, matA.asConst(), matB.asConst(), beta,
-				matC, type, alg, spgemm);
-
-			return status;
-		}
-		finally {
-			/* ------------------------------------------------------------------ */
-			/* Cleanup always runs, success or error                              */
-			/* ------------------------------------------------------------------ */
-			if(dBuf1 != null)
-				cudaFree(dBuf1);
-			if(dBuf2 != null)
-				cudaFree(dBuf2);
-
-			JCusparse.cusparseSpGEMM_destroyDescr(spgemm);
-			JCusparse.cusparseDestroySpMat(matA.asConst());
-			JCusparse.cusparseDestroySpMat(matB.asConst());
-			JCusparse.cusparseDestroySpMat(matC.asConst());
-		}
+	public int cusparsecsrgemm(cusparseHandle handle, int transA, int transB, int alg, cusparseSpMatDescr spMatDescrA,
+		cusparseSpMatDescr spMatDescrB, cusparseSpMatDescr spMatDescrC, cusparseSpGEMMDescr spgemmDescr) {
+		double[] alpha = {1.0}, beta = {0.0};
+		Pointer alphaPtr = Pointer.to(alpha), betaPtr = Pointer.to(beta);
+		return cusparseSpGEMM_copy(handle, transA, transB, alphaPtr, spMatDescrA.asConst(), spMatDescrB.asConst(),
+			betaPtr, spMatDescrC, CUDA_R_32F, alg, spgemmDescr);
 	}
 
 	@Override
diff --git a/src/test/java/org/apache/sysds/test/functions/lineage/GPUFullReuseTest.java b/src/test/java/org/apache/sysds/test/functions/lineage/GPUFullReuseTest.java
index 74bd7fc47de..841511b31f0 100644
--- a/src/test/java/org/apache/sysds/test/functions/lineage/GPUFullReuseTest.java
+++ b/src/test/java/org/apache/sysds/test/functions/lineage/GPUFullReuseTest.java
@@ -45,12 +45,12 @@ public class GPUFullReuseTest extends AutomatedTestBase{
 	protected static final int TEST_VARIANTS = 4;
 	protected String TEST_CLASS_DIR = TEST_DIR + GPUFullReuseTest.class.getSimpleName() + "/";
 
-	@BeforeClass
+	/*@BeforeClass
 	public static void checkGPU() {
 		// Skip all the tests if no GPU is available
 		// FIXME: Fails to skip if gpu available but no libraries
 		Assume.assumeTrue(TestUtils.isGPUAvailable() == cudaError.cudaSuccess);
-	}
+	}*/
 
 	@Override
 	public void setUp() {

From 58d5034937c5e8fdb23fad3262eb5818872bd31d Mon Sep 17 00:00:00 2001
From: ReneEnjilian <enjilianrene@gmail.com>
Date: Tue, 17 Jun 2025 17:28:30 +0200
Subject: [PATCH 07/26]  rewrite csr2csc

---
 .../DoublePrecisionCudaSupportFunctions.java  | 30 ++++++---------
 .../SinglePrecisionCudaSupportFunctions.java  | 37 ++++++-------------
 .../functions/lineage/LineageReuseGPU4.dml    |  4 +-
 3 files changed, 24 insertions(+), 47 deletions(-)

diff --git a/src/main/java/org/apache/sysds/runtime/matrix/data/DoublePrecisionCudaSupportFunctions.java b/src/main/java/org/apache/sysds/runtime/matrix/data/DoublePrecisionCudaSupportFunctions.java
index 5e69f8304cd..ee0f9483051 100644
--- a/src/main/java/org/apache/sysds/runtime/matrix/data/DoublePrecisionCudaSupportFunctions.java
+++ b/src/main/java/org/apache/sysds/runtime/matrix/data/DoublePrecisionCudaSupportFunctions.java
@@ -197,31 +197,23 @@ public int cublasgemm(cublasHandle handle, int transa, int transb, int m, int n,
 	@Override
 	public int cusparsecsr2csc(cusparseHandle handle, int m, int n, int nnz, Pointer csrVal, Pointer csrRowPtr,
 		Pointer csrColInd, Pointer cscVal, Pointer cscRowInd, Pointer cscColPtr, int copyValues, int idxBase) {
-		/* Constants ------------------------------------------------------- */
-		int valType = CUDA_R_64F;                     // double precision
-		int alg = CUSPARSE_CSR2CSC_ALG1;          // always supported
 
-		/* Query workspace size ------------------------------------------- */
-		long[] bufSize = {0};
-		int status = JCusparse.cusparseCsr2cscEx2_bufferSize(handle, m, n, nnz, csrVal, csrRowPtr, csrColInd, cscVal,
-			cscColPtr, cscRowInd, valType, copyValues, idxBase, alg, bufSize);
-		if(status != CUSPARSE_STATUS_SUCCESS)
-			return status;
+		int valType = CUDA_R_64F;            // double precision
+		int alg = CUSPARSE_CSR2CSC_ALG1;     // always supported
 
-		/* Allocate temp buffer if needed --------------------------------- */
-		Pointer buffer = new Pointer();
-		if(bufSize[0] > 0)
-			cudaMalloc(buffer, bufSize[0]);
+		long[] bufferSize = {0};
+		cusparseCsr2cscEx2_bufferSize(handle, m, n, nnz, csrVal, csrRowPtr, csrColInd, cscVal, cscColPtr, cscRowInd,
+			valType, copyValues, idxBase, alg, bufferSize);
 
+		Pointer buffer = new Pointer();
+		if(bufferSize[0] > 0)
+			cudaMalloc(buffer, bufferSize[0]);
 		try {
-			/* Perform CSR -> CSC conversion ------------------------------- */
-			status = JCusparse.cusparseCsr2cscEx2(handle, m, n, nnz, csrVal, csrRowPtr, csrColInd, cscVal, cscColPtr,
-				cscRowInd, valType, copyValues, idxBase, alg, buffer);
-
-			return status;
+			return cusparseCsr2cscEx2(handle, m, n, nnz, csrVal, csrRowPtr, csrColInd, cscVal, cscColPtr, cscRowInd,
+				valType, copyValues, idxBase, alg, buffer);
 		}
 		finally {
-			if(bufSize[0] > 0)
+			if(bufferSize[0] > 0)
 				cudaFree(buffer);
 		}
 	}
diff --git a/src/main/java/org/apache/sysds/runtime/matrix/data/SinglePrecisionCudaSupportFunctions.java b/src/main/java/org/apache/sysds/runtime/matrix/data/SinglePrecisionCudaSupportFunctions.java
index 5e9977d5a60..09c843cff47 100644
--- a/src/main/java/org/apache/sysds/runtime/matrix/data/SinglePrecisionCudaSupportFunctions.java
+++ b/src/main/java/org/apache/sysds/runtime/matrix/data/SinglePrecisionCudaSupportFunctions.java
@@ -209,38 +209,23 @@ public int cublasgemm(cublasHandle handle, int transa, int transb, int m, int n,
 	@Override
 	public int cusparsecsr2csc(cusparseHandle handle, int m, int n, int nnz, Pointer csrVal, Pointer csrRowPtr,
 		Pointer csrColInd, Pointer cscVal, Pointer cscRowInd, Pointer cscColPtr, int copyValues, int idxBase) {
-		final int alg = CUSPARSE_CSR2CSC_ALG1;		// Algorithm 1 is universally supported
-		final int valType = CUDA_R_32F;				// single-precision
 
-		/* ------------------------------------------------------------------ */
-		/* 1. Query required workspace size                                   */
-		/* ------------------------------------------------------------------ */
-		long[] bufSize = {0};
-		int status = JCusparse.cusparseCsr2cscEx2_bufferSize(handle, m, n, nnz, csrVal, csrRowPtr, csrColInd, cscVal,
-			cscColPtr, cscRowInd, valType, copyValues, idxBase, alg, bufSize);
-		if(status != CUSPARSE_STATUS_SUCCESS)
-			return status;
+		int valType = CUDA_R_32F;            // single precision
+		int alg = CUSPARSE_CSR2CSC_ALG1;     // always supported
 
-		/* ------------------------------------------------------------------ */
-		/* 2. Allocate workspace (if needed)                                  */
-		/* ------------------------------------------------------------------ */
-		Pointer buffer = null;
-		if(bufSize[0] > 0) {
-			buffer = new Pointer();
-			cudaMalloc(buffer, bufSize[0]);
-		}
+		long[] bufferSize = {0};
+		cusparseCsr2cscEx2_bufferSize(handle, m, n, nnz, csrVal, csrRowPtr, csrColInd, cscVal, cscColPtr, cscRowInd,
+			valType, copyValues, idxBase, alg, bufferSize);
 
+		Pointer buffer = new Pointer();
+		if(bufferSize[0] > 0)
+			cudaMalloc(buffer, bufferSize[0]);
 		try {
-			/* -------------------------------------------------------------- */
-			/* 3. Perform CSR -> CSC conversion                                */
-			/* -------------------------------------------------------------- */
-			status = JCusparse.cusparseCsr2cscEx2(handle, m, n, nnz, csrVal, csrRowPtr, csrColInd, cscVal, cscColPtr,
-				cscRowInd, valType, copyValues, idxBase, alg, buffer);
-
-			return status;
+			return cusparseCsr2cscEx2(handle, m, n, nnz, csrVal, csrRowPtr, csrColInd, cscVal, cscColPtr, cscRowInd,
+				valType, copyValues, idxBase, alg, buffer);
 		}
 		finally {
-			if(buffer != null)
+			if(bufferSize[0] > 0)
 				cudaFree(buffer);
 		}
 	}
diff --git a/src/test/scripts/functions/lineage/LineageReuseGPU4.dml b/src/test/scripts/functions/lineage/LineageReuseGPU4.dml
index 0f4de81b34d..6d9a1bea65f 100644
--- a/src/test/scripts/functions/lineage/LineageReuseGPU4.dml
+++ b/src/test/scripts/functions/lineage/LineageReuseGPU4.dml
@@ -31,8 +31,8 @@ randColSet = function(Matrix[Double] X, Integer seed, Double sample) return (Mat
   Xi = removeEmpty(target = X, margin = "cols", select = temp);
 }
 
-X = rand(rows=100, cols=100, sparsity=1.0, seed=1);
-y = rand(rows=100, cols=1, sparsity=1.0, seed=1);
+X = rand(rows=100, cols=100, sparsity=0.2, seed=1);
+y = rand(rows=100, cols=1, sparsity=0.2, seed=1);
 
 Rbeta = matrix(0, rows=525, cols=ncol(X)); #nrows = 5*5*3*7 = 525
 Rloss = matrix(0, rows=525, cols=1);

From 1ce7ea34faa37bb33156187b12e6d76612f5db29 Mon Sep 17 00:00:00 2001
From: ReneEnjilian <enjilianrene@gmail.com>
Date: Thu, 19 Jun 2025 22:58:08 +0200
Subject: [PATCH 08/26] fix sparse-sparse matrix multiply

---
 .../instructions/gpu/context/CSRPointer.java  | 77 +++++++++++++++----
 .../matrix/data/LibMatrixCuMatMult.java       | 27 +++++--
 .../functions/lineage/LineageReuseGPU4.dml    |  4 +-
 3 files changed, 85 insertions(+), 23 deletions(-)

diff --git a/src/main/java/org/apache/sysds/runtime/instructions/gpu/context/CSRPointer.java b/src/main/java/org/apache/sysds/runtime/instructions/gpu/context/CSRPointer.java
index fc583b93483..9473e93957c 100644
--- a/src/main/java/org/apache/sysds/runtime/instructions/gpu/context/CSRPointer.java
+++ b/src/main/java/org/apache/sysds/runtime/instructions/gpu/context/CSRPointer.java
@@ -40,6 +40,9 @@
 import static jcuda.cudaDataType.CUDA_R_64F;
 import static jcuda.cudaDataType.CUDA_R_32F;
 import static jcuda.jcusparse.cusparseSpGEMMAlg.CUSPARSE_SPGEMM_DEFAULT;
+import static org.apache.sysds.runtime.matrix.data.LibMatrixCUDA.cudaSupportFunctions;
+import static jcuda.jcusparse.cusparseOperation.CUSPARSE_OPERATION_TRANSPOSE;
+import static jcuda.jcusparse.cusparseOperation.CUSPARSE_OPERATION_NON_TRANSPOSE;
 
 /**
  * Compressed Sparse Row (CSR) format for CUDA
@@ -190,7 +193,7 @@ public static void copyToDevice(GPUContext gCtx, CSRPointer dest, int rows, long
 		if(rowPtr.length < rows + 1) throw new DMLRuntimeException("The length of rowPtr needs to be greater than or equal to " + (rows + 1));
 		if(colInd.length < nnz) throw new DMLRuntimeException("The length of colInd needs to be greater than or equal to " + nnz);
 		if(values.length < nnz) throw new DMLRuntimeException("The length of values needs to be greater than or equal to " + nnz);
-		LibMatrixCUDA.cudaSupportFunctions.hostToDevice(gCtx, values, r.val, null);
+		cudaSupportFunctions.hostToDevice(gCtx, values, r.val, null);
 		cudaMemcpy(r.rowPtr, Pointer.to(rowPtr), getIntSizeOf(rows + 1), cudaMemcpyHostToDevice);
 		cudaMemcpy(r.colInd, Pointer.to(colInd), getIntSizeOf(nnz), cudaMemcpyHostToDevice);
 		//if (DMLScript.STATISTICS)
@@ -255,13 +258,10 @@ public static CSRPointer allocateForDgeam(GPUContext gCtx, cusparseHandle handle
 	 * @return a {@link CSRPointer} instance that encapsulates the CSR matrix on GPU
 	 */
 	public static CSRPointer allocateForMatrixMultiply(GPUContext gCtx, cusparseHandle handle, CSRPointer A, int transA,
-		CSRPointer B, int transB, int m, int n, int k, int sizeOfDataType) {
-
+		CSRPointer B, int transB, int m, int n, int k, int dataType) {
 		CSRPointer C = new CSRPointer(gCtx);
 		step1AllocateRowPointers(gCtx, handle, C, m);
-		ensureSorted(handle, m, n, toIntExact(A.nnz), A.rowPtr, A.colInd, A.descr);
-		ensureSorted(handle, m, n, toIntExact(B.nnz), B.rowPtr, B.colInd, B.descr);
-		step2GatherNNZGemm(gCtx, handle, A, transA, B, transB, C, m, n, k, sizeOfDataType);
+		step2GatherNNZGemm(gCtx, handle, A, transA, B, transB, C, m, n, k, dataType);
 		step3AllocateValNInd(gCtx, handle, C);
 		return C;
 	}
@@ -370,7 +370,7 @@ private static void step2GatherNNZGeam(GPUContext gCtx, cusparseHandle handle, C
 
 	private static void step2GatherNNZGemm(GPUContext gCtx, cusparseHandle handle, CSRPointer A, int transA,
 		CSRPointer B, int transB, CSRPointer C, int m, int n, int k,
-		int sizeOfDataType)            // C = op(A)·op(B)  (m×k)·(k×n)
+		int dataType)            // C = op(A)·op(B)  (m×k)·(k×n)
 	{
 		LOG.trace("GPU : step2GatherNNZGemm (SpGEMM), GPUContext=" + gCtx);
 
@@ -378,11 +378,6 @@ private static void step2GatherNNZGemm(GPUContext gCtx, cusparseHandle handle, C
 		if(A.nnz >= Integer.MAX_VALUE || B.nnz >= Integer.MAX_VALUE)
 			throw new DMLRuntimeException("Number of non-zeros exceeds cuSPARSE 32-bit limit");
 
-		// set correct floating point precision, default is double
-		int dataType = CUDA_R_64F;
-		if(sizeOfDataType == 4)
-			dataType = CUDA_R_32F;
-
 		// Get index base -> 0 or 1
 		int baseA = cusparseGetMatIndexBase(A.descr);
 		int baseB = cusparseGetMatIndexBase(B.descr);
@@ -397,7 +392,6 @@ private static void step2GatherNNZGemm(GPUContext gCtx, cusparseHandle handle, C
 			baseB, dataType);
 		cusparseCreateCsr(C.spMatDescr, m, n, 0, C.rowPtr, null, null, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, baseC,
 			dataType);
-
 		// SpGEMM Computation
 		C.spgemmDesc = new cusparseSpGEMMDescr();
 		cusparseSpGEMM_createDescr(C.spgemmDesc);
@@ -425,7 +419,6 @@ private static void step2GatherNNZGemm(GPUContext gCtx, cusparseHandle handle, C
 		Pointer buffer2 = new Pointer();
 		if(bufferSize2[0] > 0)
 			cudaMalloc(buffer2, bufferSize2[0]);
-
 		// compute the intermediate product of A * B
 		cusparseSpGEMM_compute(handle, transA, transB, alphaPtr, A.spMatDescr.asConst(), B.spMatDescr.asConst(),
 			betaPtr, C.spMatDescr, dataType, alg, C.spgemmDesc, bufferSize2, buffer2);
@@ -491,6 +484,60 @@ private static void ensureSorted(cusparseHandle handle, int m, int n, int nnz, P
 		cudaFree(P);
 		if(bufSize[0] > 0)
 			cudaFree(work);
+
+	}
+
+	/**
+	 * Physically transpose a CSR matrix (srcRows × srcCols ➜ srcCols × srcRows). The trick:  CSR ➜ CSC of the original
+	 * matrix is bit-for-bit the CSR of the transposed matrix, so we leverage cuSPARSE csr2cscEx2.
+	 *
+	 * @param gCtx
+	 * @param handle
+	 * @param src
+	 * @param srcRows
+	 * @param srcCols
+	 * @param dataType
+	 * @return
+	 */
+	public static CSRPointer transposeCSR(GPUContext gCtx, cusparseHandle handle, CSRPointer src, int srcRows,
+		int srcCols, int dataType) {
+
+		CSRPointer dst = new CSRPointer(gCtx);
+		dst.nnz = src.nnz;
+
+		// allocate transposed arrays: rows = srcCols, cols = srcRows
+		dst.rowPtr = gCtx.allocate(null, getIntSizeOf((long) srcCols + 1), true);
+		dst.colInd = gCtx.allocate(null, getIntSizeOf(dst.nnz), false);
+		dst.val = gCtx.allocate(null, getDataTypeSizeOf(dst.nnz), false);
+
+		// CSR -> CSC (of src) == CSR (of t(src))
+		int copyValues = 1;
+		int idxBase = cusparseGetMatIndexBase(src.descr);
+		cudaSupportFunctions.cusparsecsr2csc(handle, srcRows, srcCols, toIntExact(dst.nnz), src.val, src.rowPtr,
+			src.colInd, dst.val, dst.colInd, dst.rowPtr, copyValues, idxBase);
+
+		// classical descriptor (needed by ensureSorted)
+		cusparseCreateMatDescr(dst.descr);
+		cusparseSetMatIndexBase(dst.descr, idxBase);
+
+		// cuSPARSE 12 SpMat descriptor
+		cusparseCreateCsr(dst.spMatDescr,
+			/*rows*/ srcCols,   // m
+			/*cols*/ srcRows,   // k
+			dst.nnz, dst.rowPtr, dst.colInd, dst.val, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, idxBase, dataType);
+
+		return dst;
+	}
+
+	/**
+	 * Helper function to check matrix dimensions
+	 *
+	 * @param desc
+	 */
+	public static void getCSRMatrixInfo(cusparseSpMatDescr desc) {
+		long[] r = {0}, c = {0}, z = {0};
+		cusparseSpMatGetSize(desc.asConst(), r, c, z);
+		System.err.println("DEBUG  A  rows=" + r[0] + " cols=" + c[0] + " nnz=" + z[0]);
 	}
 
 	// ==============================================================================================
@@ -578,7 +625,7 @@ public Pointer toColumnMajorDenseMatrix(cusparseHandle cusparseHandle, cublasHan
 		// If this sparse block is empty, the allocated dense matrix, initialized to zeroes, will be returned.
 		if (val != null && rowPtr != null && colInd != null && nnz > 0) {
 			// Note: cusparseDcsr2dense method cannot handle empty blocks
-			LibMatrixCUDA.cudaSupportFunctions.cusparsecsr2dense(cusparseHandle, rows, cols, descr, val, rowPtr, colInd, A, rows, nnz);
+			cudaSupportFunctions.cusparsecsr2dense(cusparseHandle, rows, cols, descr, val, rowPtr, colInd, A, rows, nnz);
 			//cudaDeviceSynchronize;
 		} else {
 			LOG.debug("in CSRPointer, the values array, row pointers array or column indices array was null");
diff --git a/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixCuMatMult.java b/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixCuMatMult.java
index 64d57159342..0124db2fd9b 100644
--- a/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixCuMatMult.java
+++ b/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixCuMatMult.java
@@ -18,11 +18,16 @@
  */
 package org.apache.sysds.runtime.matrix.data;
 
+import static jcuda.cudaDataType.CUDA_R_32F;
+import static jcuda.cudaDataType.CUDA_R_64F;
 import static jcuda.jcusparse.cusparseOperation.CUSPARSE_OPERATION_NON_TRANSPOSE;
 import static jcuda.jcusparse.cusparseOperation.CUSPARSE_OPERATION_TRANSPOSE;
 import static jcuda.runtime.JCuda.cudaMemcpy;
 import static jcuda.runtime.cudaMemcpyKind.cudaMemcpyHostToDevice;
 import static jcuda.jcusparse.cusparseSpMMAlg.CUSPARSE_SPMM_ALG_DEFAULT;
+import static org.apache.sysds.runtime.instructions.gpu.context.CSRPointer.getCSRMatrixInfo;
+import static org.apache.sysds.runtime.instructions.gpu.context.CSRPointer.transposeCSR;
+
 import jcuda.Pointer;
 
 import org.apache.commons.logging.Log;
@@ -150,8 +155,9 @@ public static MatrixObject matmult(ExecutionContext ec, GPUContext gCtx, String
 			// -------------------------------------------------------------------------------------
 			// sparse-sparse matrix multiplication
 			params.validate();
-			int transa = cusparseOp(isLeftTransposed);
-			int transb = cusparseOp(isRightTransposed);
+			int transA = cusparseOp(isLeftTransposed);
+			int transB = cusparseOp(isRightTransposed);
+			int dataType = (sizeOfDataType == 4) ? CUDA_R_32F : CUDA_R_64F;
 
 			// Step 1: Allocate output => sparse format
 			ec.allocateGPUMatrixObject(outputName, outRLen, outCLen);
@@ -159,11 +165,20 @@ public static MatrixObject matmult(ExecutionContext ec, GPUContext gCtx, String
 			// and output
 			CSRPointer A = left.getGPUObject(gCtx).getJcudaSparseMatrixPtr();
 			CSRPointer B = right.getGPUObject(gCtx).getJcudaSparseMatrixPtr();
-			CSRPointer C = CSRPointer.allocateForMatrixMultiply(gCtx, getCusparseHandle(gCtx), A, transa, B, transb,
-				params.m, params.n, params.k, sizeOfDataType);
-		
+			// transpose if required
+			// cusparseSpGEMM works only with CUSPARSE_OPERATION_NON_TRANSPOSE
+			if(transA == CUSPARSE_OPERATION_TRANSPOSE) {
+				A = transposeCSR(gCtx, getCusparseHandle(gCtx), A, params.k, params.m, dataType);
+			}
+			if(transB == CUSPARSE_OPERATION_TRANSPOSE) {
+				B = transposeCSR(gCtx, getCusparseHandle(gCtx), B, params.n, params.k, dataType);
+			}
+			transA = CUSPARSE_OPERATION_NON_TRANSPOSE;
+			transB = CUSPARSE_OPERATION_NON_TRANSPOSE;
+			CSRPointer C = CSRPointer.allocateForMatrixMultiply(gCtx, getCusparseHandle(gCtx), A, transA, B, transB,
+				params.m, params.n, params.k, dataType);
 			// Step 3: Invoke the kernel
-			cudaSupportFunctions.cusparsecsrgemm(getCusparseHandle(gCtx), transa, transb, CUSPARSE_SPMM_ALG_DEFAULT,
+			cudaSupportFunctions.cusparsecsrgemm(getCusparseHandle(gCtx), transA, transB, CUSPARSE_SPMM_ALG_DEFAULT,
 				A.spMatDescr, B.spMatDescr, C.spMatDescr, C.spgemmDesc);
 			output.getGPUObject(gCtx).setSparseMatrixCudaPointer(C);
 			// -------------------------------------------------------------------------------------
diff --git a/src/test/scripts/functions/lineage/LineageReuseGPU4.dml b/src/test/scripts/functions/lineage/LineageReuseGPU4.dml
index 6d9a1bea65f..0f4de81b34d 100644
--- a/src/test/scripts/functions/lineage/LineageReuseGPU4.dml
+++ b/src/test/scripts/functions/lineage/LineageReuseGPU4.dml
@@ -31,8 +31,8 @@ randColSet = function(Matrix[Double] X, Integer seed, Double sample) return (Mat
   Xi = removeEmpty(target = X, margin = "cols", select = temp);
 }
 
-X = rand(rows=100, cols=100, sparsity=0.2, seed=1);
-y = rand(rows=100, cols=1, sparsity=0.2, seed=1);
+X = rand(rows=100, cols=100, sparsity=1.0, seed=1);
+y = rand(rows=100, cols=1, sparsity=1.0, seed=1);
 
 Rbeta = matrix(0, rows=525, cols=ncol(X)); #nrows = 5*5*3*7 = 525
 Rloss = matrix(0, rows=525, cols=1);

From 0484ea36afb17ac6a51f1e94f283d046f2fcfd95 Mon Sep 17 00:00:00 2001
From: ReneEnjilian <enjilianrene@gmail.com>
Date: Fri, 20 Jun 2025 16:59:22 +0200
Subject: [PATCH 09/26] rework cusparsecsrmv()

---
 .../matrix/data/CudaSupportFunctions.java     |  2 +-
 .../DoublePrecisionCudaSupportFunctions.java  | 64 ++++++++-----------
 .../matrix/data/LibMatrixCuMatMult.java       |  2 +-
 .../SinglePrecisionCudaSupportFunctions.java  | 64 ++++++++-----------
 .../functions/lineage/LineageReuseGPU2.dml    |  4 +-
 5 files changed, 55 insertions(+), 81 deletions(-)

diff --git a/src/main/java/org/apache/sysds/runtime/matrix/data/CudaSupportFunctions.java b/src/main/java/org/apache/sysds/runtime/matrix/data/CudaSupportFunctions.java
index 29814a55fa4..d3edb25cd80 100644
--- a/src/main/java/org/apache/sysds/runtime/matrix/data/CudaSupportFunctions.java
+++ b/src/main/java/org/apache/sysds/runtime/matrix/data/CudaSupportFunctions.java
@@ -67,7 +67,7 @@ int cusparsecsrgemm(cusparseHandle handle, int transA, int transB, int alg, cusp
 
 	int cublasgeam(cublasHandle handle, int transa, int transb, int m, int n, jcuda.Pointer alpha,
 		jcuda.Pointer A, int lda, jcuda.Pointer beta, jcuda.Pointer B, int ldb, jcuda.Pointer C, int ldc);
-	public int	cusparsecsrmv(cusparseHandle handle, int transA, int m, int n, int nnz, jcuda.Pointer alpha, cusparseMatDescr descrA, jcuda.Pointer csrValA, jcuda.Pointer csrRowPtrA, jcuda.Pointer csrColIndA, 
+	public int	cusparsecsrmv(cusparseHandle handle, int transA, int m, int n, int nnz, jcuda.Pointer alpha, cusparseSpMatDescr spMatDescrA, cusparseMatDescr descA, jcuda.Pointer csrValA, jcuda.Pointer csrRowPtrA, jcuda.Pointer csrColIndA,
 			jcuda.Pointer x, jcuda.Pointer beta, jcuda.Pointer y);
 	public int	cusparsecsrmm2(cusparseHandle handle, int transa, int transb, int m, int n, int k, int nnz, jcuda.Pointer alpha, cusparseMatDescr descrA, jcuda.Pointer csrValA, jcuda.Pointer csrRowPtrA, jcuda.Pointer csrColIndA, 
 			jcuda.Pointer B, int ldb, jcuda.Pointer beta, jcuda.Pointer C, int ldc);
diff --git a/src/main/java/org/apache/sysds/runtime/matrix/data/DoublePrecisionCudaSupportFunctions.java b/src/main/java/org/apache/sysds/runtime/matrix/data/DoublePrecisionCudaSupportFunctions.java
index ee0f9483051..0d7a71ffa9f 100644
--- a/src/main/java/org/apache/sysds/runtime/matrix/data/DoublePrecisionCudaSupportFunctions.java
+++ b/src/main/java/org/apache/sysds/runtime/matrix/data/DoublePrecisionCudaSupportFunctions.java
@@ -19,6 +19,7 @@
 package org.apache.sysds.runtime.matrix.data;
 
 import static jcuda.jcusparse.JCusparse.*;
+import static jcuda.jcusparse.JCusparse.cusparseGetMatIndexBase;
 import static jcuda.runtime.JCuda.*;
 import static jcuda.runtime.cudaMemcpyKind.cudaMemcpyDeviceToHost;
 import static jcuda.runtime.cudaMemcpyKind.cudaMemcpyHostToDevice;
@@ -77,49 +78,36 @@ public int cublasgeam(cublasHandle handle, int transa, int transb, int m, int n,
 
 	@Override
 	public int cusparsecsrmv(cusparseHandle handle, int transA, int m, int n, int nnz, Pointer alpha,
-		cusparseMatDescr descrA, Pointer csrValA, Pointer csrRowPtrA, Pointer csrColIndA, Pointer x, Pointer beta,
-		Pointer y) {
-		/* Descriptors and workspace --------------------------------------- */
-		cusparseSpMatDescr matA = new cusparseSpMatDescr();
+		cusparseSpMatDescr spMatDescrA, cusparseMatDescr descrA, Pointer csrValA, Pointer csrRowPtrA,
+		Pointer csrColIndA, Pointer x, Pointer beta, Pointer y) {
+		// Create sparse matrix A in CSR format
+		int idxBase = cusparseGetMatIndexBase(descrA);
+		int dataType = CUDA_R_64F;
+		cusparseCreateCsr(spMatDescrA, m, n, nnz, csrRowPtrA, csrColIndA, csrValA, CUSPARSE_INDEX_32I,
+			CUSPARSE_INDEX_32I, idxBase, dataType);
+		// Create dense vectors vecX and vecY
 		cusparseDnVecDescr vecX = new cusparseDnVecDescr();
 		cusparseDnVecDescr vecY = new cusparseDnVecDescr();
-		Pointer dBuf = new Pointer();
-		long dBufBytes = 0;
-		int status;
-
+		cusparseCreateDnVec(vecX, n, x, dataType);
+		cusparseCreateDnVec(vecY, m, y, dataType);
+		// allocate an external buffer if needed
+		long[] bufferSize = {0};
+		int alg = CUSPARSE_SPMV_ALG_DEFAULT;
+		cusparseSpMV_bufferSize(handle, transA, alpha, spMatDescrA.asConst(), vecX.asConst(), beta, vecY, dataType, alg,
+			bufferSize);
+		// execute SpMV
+		Pointer dBuffer = new Pointer();
+		if(bufferSize[0] > 0)
+			cudaMalloc(dBuffer, bufferSize[0]);
 		try {
-			/* 1. CSR matrix A --------------------------------------------- */
-			cusparseCreateCsr(matA, m, n, nnz, csrRowPtrA, csrColIndA, csrValA, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
-				CUSPARSE_INDEX_BASE_ZERO, CUDA_R_64F);
-
-			/* 2. Dense vectors X and Y ------------------------------------ */
-			cusparseCreateDnVec(vecX, n, x, CUDA_R_64F);
-			cusparseCreateDnVec(vecY, m, y, CUDA_R_64F);
-
-			/* 3. Query workspace size ------------------------------------- */
-			long[] bufSize = {0};
-			status = JCusparse.cusparseSpMV_bufferSize(handle, transA, alpha, matA.asConst(), vecX.asConst(), beta,
-				vecY, CUDA_R_64F, CUSPARSE_SPMV_ALG_DEFAULT, bufSize);
-			if(status != CUSPARSE_STATUS_SUCCESS)
-				return status;
-
-			dBufBytes = bufSize[0];
-			if(dBufBytes > 0)
-				cudaMalloc(dBuf, dBufBytes);
-
-			/* 4. Perform SpMV -------------------------------------------- */
-			status = JCusparse.cusparseSpMV(handle, transA, alpha, matA.asConst(), vecX.asConst(), beta, vecY,
-				CUDA_R_64F, CUSPARSE_SPMV_ALG_DEFAULT, dBuf);
-
-			return status;
+			return cusparseSpMV(handle, transA, alpha, spMatDescrA.asConst(), vecX.asConst(), beta, vecY, dataType, alg,
+				dBuffer);
 		}
 		finally {
-			/* Cleanup ----------------------------------------------------- */
-			if(dBufBytes > 0)
-				cudaFree(dBuf);
-			JCusparse.cusparseDestroyDnVec(vecX.asConst());
-			JCusparse.cusparseDestroyDnVec(vecY.asConst());
-			JCusparse.cusparseDestroySpMat(matA.asConst());
+			if(bufferSize[0] > 0)
+				cudaFree(dBuffer);
+			cusparseDestroyDnVec(vecX.asConst());
+			cusparseDestroyDnVec(vecY.asConst());
 		}
 	}
 
diff --git a/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixCuMatMult.java b/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixCuMatMult.java
index 0124db2fd9b..837edae3b18 100644
--- a/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixCuMatMult.java
+++ b/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixCuMatMult.java
@@ -320,7 +320,7 @@ private static void denseSparseMatMult(cusparseHandle handle, String instName, P
 			int m = toInt(param.rightNumRows);
 			int n = toInt(param.rightNumCols);
 			int transa = reverseCusparseOp(cusparseOp(param.isLeftTransposed));
-			cudaSupportFunctions.cusparsecsrmv(handle, transa, m, n, toInt(B.nnz), one(), B.descr, B.val, B.rowPtr, B.colInd, A,
+			cudaSupportFunctions.cusparsecsrmv(handle, transa, m, n, toInt(B.nnz), one(), B.spMatDescr, B.descr, B.val, B.rowPtr, B.colInd, A,
 					zero(), C);
 		} else {
 			int m = toInt(param.rightNumRows);
diff --git a/src/main/java/org/apache/sysds/runtime/matrix/data/SinglePrecisionCudaSupportFunctions.java b/src/main/java/org/apache/sysds/runtime/matrix/data/SinglePrecisionCudaSupportFunctions.java
index 09c843cff47..44ec08638b7 100644
--- a/src/main/java/org/apache/sysds/runtime/matrix/data/SinglePrecisionCudaSupportFunctions.java
+++ b/src/main/java/org/apache/sysds/runtime/matrix/data/SinglePrecisionCudaSupportFunctions.java
@@ -87,50 +87,36 @@ public int cublasgeam(cublasHandle handle, int transa, int transb, int m, int n,
 
 	@Override
 	public int cusparsecsrmv(cusparseHandle handle, int transA, int m, int n, int nnz, Pointer alpha,
-		cusparseMatDescr descrA, Pointer csrValA, Pointer csrRowPtrA, Pointer csrColIndA, Pointer x, Pointer beta,
-		Pointer y) {
-		/* ------------------------------------------------------------------ */
-		/* Descriptors and workspace                                          */
-		/* ------------------------------------------------------------------ */
-		cusparseSpMatDescr matA = new cusparseSpMatDescr();
+		cusparseSpMatDescr spMatDescrA, cusparseMatDescr descrA, Pointer csrValA, Pointer csrRowPtrA,
+		Pointer csrColIndA, Pointer x, Pointer beta, Pointer y) {
+		// Create sparse matrix A in CSR format
+		int idxBase = cusparseGetMatIndexBase(descrA);
+		int dataType = CUDA_R_32F;
+		cusparseCreateCsr(spMatDescrA, m, n, nnz, csrRowPtrA, csrColIndA, csrValA, CUSPARSE_INDEX_32I,
+			CUSPARSE_INDEX_32I, idxBase, dataType);
+		// Create dense vectors vecX and vecY
 		cusparseDnVecDescr vecX = new cusparseDnVecDescr();
 		cusparseDnVecDescr vecY = new cusparseDnVecDescr();
-		Pointer dBuf = null;
-		int status;
-
+		cusparseCreateDnVec(vecX, n, x, dataType);
+		cusparseCreateDnVec(vecY, m, y, dataType);
+		// allocate an external buffer if needed
+		long[] bufferSize = {0};
+		int alg = CUSPARSE_SPMV_ALG_DEFAULT;
+		cusparseSpMV_bufferSize(handle, transA, alpha, spMatDescrA.asConst(), vecX.asConst(), beta, vecY, dataType, alg,
+			bufferSize);
+		// execute SpMV
+		Pointer dBuffer = new Pointer();
+		if(bufferSize[0] > 0)
+			cudaMalloc(dBuffer, bufferSize[0]);
 		try {
-			/* 1. CSR matrix A (FP32) -------------------------------------- */
-			JCusparse.cusparseCreateCsr(matA, m, n, nnz, csrRowPtrA, csrColIndA, csrValA, CUSPARSE_INDEX_32I,
-				CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, CUDA_R_32F);
-
-			/* 2. Dense vectors X and Y (FP32) ------------------------------ */
-			JCusparse.cusparseCreateDnVec(vecX, n, x, CUDA_R_32F);
-			JCusparse.cusparseCreateDnVec(vecY, m, y, CUDA_R_32F);
-
-			/* 3. Query workspace size ------------------------------------- */
-			long[] bufSize = {0};
-			status = JCusparse.cusparseSpMV_bufferSize(handle, transA, alpha, matA.asConst(), vecX.asConst(), beta,
-				vecY, CUDA_R_32F, CUSPARSE_SPMV_ALG_DEFAULT, bufSize);
-			if(status != CUSPARSE_STATUS_SUCCESS)
-				return status;
-
-			if(bufSize[0] > 0) {
-				dBuf = new Pointer();
-				cudaMalloc(dBuf, bufSize[0]);
-			}
-
-			/* 4. Perform SpMV -------------------------------------------- */
-			status = JCusparse.cusparseSpMV(handle, transA, alpha, matA.asConst(), vecX.asConst(), beta, vecY,
-				CUDA_R_32F, CUSPARSE_SPMV_ALG_DEFAULT, dBuf);
-
-			return status;
+			return cusparseSpMV(handle, transA, alpha, spMatDescrA.asConst(), vecX.asConst(), beta, vecY, dataType, alg,
+				dBuffer);
 		}
 		finally {
-			if(dBuf != null)
-				cudaFree(dBuf);
-			JCusparse.cusparseDestroyDnVec(vecX.asConst());
-			JCusparse.cusparseDestroyDnVec(vecY.asConst());
-			JCusparse.cusparseDestroySpMat(matA.asConst());
+			if(bufferSize[0] > 0)
+				cudaFree(dBuffer);
+			cusparseDestroyDnVec(vecX.asConst());
+			cusparseDestroyDnVec(vecY.asConst());
 		}
 	}
 
diff --git a/src/test/scripts/functions/lineage/LineageReuseGPU2.dml b/src/test/scripts/functions/lineage/LineageReuseGPU2.dml
index dd223acd05e..f5bc2655edd 100644
--- a/src/test/scripts/functions/lineage/LineageReuseGPU2.dml
+++ b/src/test/scripts/functions/lineage/LineageReuseGPU2.dml
@@ -32,8 +32,8 @@ stp = (0.1 - 0.0001)/no_lamda;
 lamda = 0.0001;
 lim = 0.1;
 
-X = rand(rows=1000, cols=100, seed=42);
-y = rand(rows=1000, cols=1, seed=42);
+X = rand(rows=1000, cols=100, sparsity=0.2, seed=42);
+y = rand(rows=1000, cols=1, sparsity=0.2, seed=42);
 N = ncol(X);
 R = matrix(0, rows=N, cols=no_lamda+2);
 i = 1;

From df834650b4d9efac8ee91f8084e70be059c51e2d Mon Sep 17 00:00:00 2001
From: ReneEnjilian <enjilianrene@gmail.com>
Date: Fri, 20 Jun 2025 20:08:29 +0200
Subject: [PATCH 10/26] rework cusparsecsrmm2()

---
 .../matrix/data/CudaSupportFunctions.java     | 18 ++---
 .../DoublePrecisionCudaSupportFunctions.java  | 74 +++++++-----------
 .../matrix/data/LibMatrixCuMatMult.java       |  2 +-
 .../SinglePrecisionCudaSupportFunctions.java  | 77 ++++++++-----------
 .../functions/lineage/LineageReuseGPU2.dml    |  4 +-
 5 files changed, 71 insertions(+), 104 deletions(-)

diff --git a/src/main/java/org/apache/sysds/runtime/matrix/data/CudaSupportFunctions.java b/src/main/java/org/apache/sysds/runtime/matrix/data/CudaSupportFunctions.java
index d3edb25cd80..3093b90cf6c 100644
--- a/src/main/java/org/apache/sysds/runtime/matrix/data/CudaSupportFunctions.java
+++ b/src/main/java/org/apache/sysds/runtime/matrix/data/CudaSupportFunctions.java
@@ -53,24 +53,24 @@
  * 	matrix_atan(A, C, size);
  * } 
  * </code>
- * 
+ *
  * 2. The CUDA library calls (such as CuBLAS, CuSPARSE, etc) go through this interface.
  * The naming and parameters of the methods in this class are consistent with that of CUDA library to simplify development.
- * 
+ *
  * 3. During SystemDS initialization, the appropriate class implementing CudaKernels interface is set based on the configuration property sysds.dataType.
  */
 public interface CudaSupportFunctions {
 	public static boolean PERFORM_CONVERSION_ON_DEVICE = true;
 
-	int cusparsecsrgemm(cusparseHandle handle, int transA, int transB, int alg, cusparseSpMatDescr spMatDescrA,
+	public int cusparsecsrgemm(cusparseHandle handle, int transA, int transB, int alg, cusparseSpMatDescr spMatDescrA,
 		cusparseSpMatDescr spMatDescrB, cusparseSpMatDescr spMatDescrC, cusparseSpGEMMDescr spgemmDescr);
-
-	int cublasgeam(cublasHandle handle, int transa, int transb, int m, int n, jcuda.Pointer alpha,
+	public int cublasgeam(cublasHandle handle, int transa, int transb, int m, int n, jcuda.Pointer alpha,
 		jcuda.Pointer A, int lda, jcuda.Pointer beta, jcuda.Pointer B, int ldb, jcuda.Pointer C, int ldc);
-	public int	cusparsecsrmv(cusparseHandle handle, int transA, int m, int n, int nnz, jcuda.Pointer alpha, cusparseSpMatDescr spMatDescrA, cusparseMatDescr descA, jcuda.Pointer csrValA, jcuda.Pointer csrRowPtrA, jcuda.Pointer csrColIndA,
-			jcuda.Pointer x, jcuda.Pointer beta, jcuda.Pointer y);
-	public int	cusparsecsrmm2(cusparseHandle handle, int transa, int transb, int m, int n, int k, int nnz, jcuda.Pointer alpha, cusparseMatDescr descrA, jcuda.Pointer csrValA, jcuda.Pointer csrRowPtrA, jcuda.Pointer csrColIndA, 
-			jcuda.Pointer B, int ldb, jcuda.Pointer beta, jcuda.Pointer C, int ldc);
+	public int cusparsecsrmv(cusparseHandle handle, int transA, int m, int n, int nnz, jcuda.Pointer alpha,
+		cusparseSpMatDescr spMatDescrA, cusparseMatDescr descrA, jcuda.Pointer csrValA, jcuda.Pointer csrRowPtrA,
+		jcuda.Pointer csrColIndA, jcuda.Pointer x, jcuda.Pointer beta, jcuda.Pointer y);
+	public int cusparsecsrmm2(cusparseHandle handle, int transa, int transb, int m, int n, int k, int nnz, jcuda.Pointer alpha, cusparseMatDescr descrA, cusparseSpMatDescr spMatDescrA, jcuda.Pointer csrValA, jcuda.Pointer csrRowPtrA, jcuda.Pointer csrColIndA,
+		jcuda.Pointer B, int ldb, jcuda.Pointer beta, jcuda.Pointer C, int ldc);
 	public int cublasdot(cublasHandle handle, int n, jcuda.Pointer x, int incx, jcuda.Pointer y, int incy, jcuda.Pointer result);
 	public int cublasgemv(cublasHandle handle, int trans, int m, int n, jcuda.Pointer alpha, jcuda.Pointer A, int lda, jcuda.Pointer x, int incx, jcuda.Pointer beta, jcuda.Pointer y, int incy);
 	public int cublasgemm(cublasHandle handle, int transa, int transb, int m, int n, int k, jcuda.Pointer alpha, jcuda.Pointer A, int lda, jcuda.Pointer B, int ldb, jcuda.Pointer beta, jcuda.Pointer C, int ldc);
diff --git a/src/main/java/org/apache/sysds/runtime/matrix/data/DoublePrecisionCudaSupportFunctions.java b/src/main/java/org/apache/sysds/runtime/matrix/data/DoublePrecisionCudaSupportFunctions.java
index 0d7a71ffa9f..bf47d73df9e 100644
--- a/src/main/java/org/apache/sysds/runtime/matrix/data/DoublePrecisionCudaSupportFunctions.java
+++ b/src/main/java/org/apache/sysds/runtime/matrix/data/DoublePrecisionCudaSupportFunctions.java
@@ -113,55 +113,39 @@ public int cusparsecsrmv(cusparseHandle handle, int transA, int m, int n, int nn
 
 	@Override
 	public int cusparsecsrmm2(cusparseHandle handle, int transA, int transB, int m, int n, int k, int nnz,
-		Pointer alpha, cusparseMatDescr descrA, Pointer csrValA, Pointer csrRowPtrA, Pointer csrColIndA, Pointer B,
-		int ldb, Pointer beta, Pointer C, int ldc) {
-		/* Descriptors and workspace -------------------------------------- */
-		cusparseSpMatDescr matA = new cusparseSpMatDescr();
-		cusparseDnMatDescr matB = new cusparseDnMatDescr();
-		cusparseDnMatDescr matC = new cusparseDnMatDescr();
-		Pointer dBuf = new Pointer();
-		long dBufBytes = 0;
-		int status;
+		Pointer alpha, cusparseMatDescr descrA, cusparseSpMatDescr spMatDescrA, Pointer csrValA, Pointer csrRowPtrA,
+		Pointer csrColIndA, Pointer B, int ldb, Pointer beta, Pointer C, int ldc) {
 
+		int dataType = CUDA_R_64F;
+		int idxBase = cusparseGetMatIndexBase(descrA);
+		// Create sparse matrix A in CSR format
+		cusparseCreateCsr(spMatDescrA, m, n, nnz, csrRowPtrA, csrColIndA, csrValA, CUSPARSE_INDEX_32I,
+			CUSPARSE_INDEX_32I, idxBase, dataType);
+		// Create dense matrix B
+		cusparseDnMatDescr dnMatB = new cusparseDnMatDescr();
+		cusparseCreateDnMat(dnMatB, k, n, ldb, B, dataType, CUSPARSE_ORDER_COL);
+		// Create dense matrix C
+		cusparseDnMatDescr dnMatC = new cusparseDnMatDescr();
+		cusparseCreateDnMat(dnMatC, m, n, ldc, C, dataType, CUSPARSE_ORDER_COL);
+		// allocate an external buffer if needed
+		long[] bufferSize = {0};
+		int alg = CUSPARSE_SPMM_ALG_DEFAULT;
+		cusparseSpMM_bufferSize(handle, transA, transB, alpha, spMatDescrA.asConst(), dnMatB.asConst(), beta, dnMatC,
+			dataType, alg, bufferSize);
+		// execute SpMM
+		Pointer dBuffer = new Pointer();
+		if(bufferSize[0] > 0)
+			cudaMalloc(dBuffer, bufferSize[0]);
 		try {
-			/* 1. CSR matrix A -------------------------------------------- */
-			cusparseCreateCsr(matA, m, k, nnz, csrRowPtrA, csrColIndA, csrValA, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
-				CUSPARSE_INDEX_BASE_ZERO, CUDA_R_64F);
-
-			/* 2. Dense matrix B  (col-major layout) ---------------------- */
-			int rowsB = (transB == CUSPARSE_OPERATION_NON_TRANSPOSE) ? k : n;
-			int colsB = (transB == CUSPARSE_OPERATION_NON_TRANSPOSE) ? n : k;
-			cusparseCreateDnMat(matB, rowsB, colsB, ldb, B, CUDA_R_64F, CUSPARSE_ORDER_COL);
-
-			/* 3. Dense matrix C  (output) -------------------------------- */
-			int rowsC = (transA == CUSPARSE_OPERATION_NON_TRANSPOSE) ? m : k;
-			int colsC = colsB;                       // always equals n
-			cusparseCreateDnMat(matC, rowsC, colsC, ldc, C, CUDA_R_64F, CUSPARSE_ORDER_COL);
-
-			/* 4. Query workspace size ------------------------------------ */
-			long[] bufSize = {0};
-			status = JCusparse.cusparseSpMM_bufferSize(handle, transA, transB, alpha, matA.asConst(), matB.asConst(),
-				beta, matC, CUDA_R_64F, CUSPARSE_SPMM_ALG_DEFAULT, bufSize);
-			if(status != CUSPARSE_STATUS_SUCCESS)
-				return status;
-
-			dBufBytes = bufSize[0];
-			if(dBufBytes > 0)
-				cudaMalloc(dBuf, dBufBytes);
-
-			/* 5. Execute SpMM ------------------------------------------- */
-			status = JCusparse.cusparseSpMM(handle, transA, transB, alpha, matA.asConst(), matB.asConst(), beta, matC,
-				CUDA_R_64F, CUSPARSE_SPMM_ALG_DEFAULT, dBuf);
-
-			return status;
+			return cusparseSpMM(handle, transA, transB, alpha, spMatDescrA.asConst(), dnMatB.asConst(), beta, dnMatC,
+				dataType, alg, dBuffer);
 		}
 		finally {
-			/* Cleanup ---------------------------------------------------- */
-			if(dBufBytes > 0)
-				cudaFree(dBuf);
-			JCusparse.cusparseDestroyDnMat(matB.asConst());
-			JCusparse.cusparseDestroyDnMat(matC.asConst());
-			JCusparse.cusparseDestroySpMat(matA.asConst());
+			if(bufferSize[0] > 0)
+				cudaFree(dBuffer);
+			cusparseDestroySpMat(spMatDescrA.asConst());
+			cusparseDestroyDnMat(dnMatB.asConst());
+			cusparseDestroyDnMat(dnMatC.asConst());
 		}
 	}
 
diff --git a/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixCuMatMult.java b/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixCuMatMult.java
index 837edae3b18..e0f3d12cca0 100644
--- a/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixCuMatMult.java
+++ b/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixCuMatMult.java
@@ -330,7 +330,7 @@ private static void denseSparseMatMult(cusparseHandle handle, String instName, P
 			int transa = reverseCusparseOp(cusparseOp(param.isLeftTransposed));
 			int transb = cusparseOp(param.isRightTransposed);
 			LOG.debug(" GPU Sparse-Dense Matrix Multiply (rhs transpose) ");
-			cudaSupportFunctions.cusparsecsrmm2(handle, transa, transb, m, param.n, k, toInt(B.nnz), one(), B.descr, B.val,
+			cudaSupportFunctions.cusparsecsrmm2(handle, transa, transb, m, param.n, k, toInt(B.nnz), one(), B.descr, B.spMatDescr, B.val,
 					B.rowPtr, B.colInd, A, param.ldb, zero(), C, param.ldc);
 		}
 	}
diff --git a/src/main/java/org/apache/sysds/runtime/matrix/data/SinglePrecisionCudaSupportFunctions.java b/src/main/java/org/apache/sysds/runtime/matrix/data/SinglePrecisionCudaSupportFunctions.java
index 44ec08638b7..8ee3e5b3013 100644
--- a/src/main/java/org/apache/sysds/runtime/matrix/data/SinglePrecisionCudaSupportFunctions.java
+++ b/src/main/java/org/apache/sysds/runtime/matrix/data/SinglePrecisionCudaSupportFunctions.java
@@ -122,59 +122,42 @@ public int cusparsecsrmv(cusparseHandle handle, int transA, int m, int n, int nn
 
 	@Override
 	public int cusparsecsrmm2(cusparseHandle handle, int transA, int transB, int m, int n, int k, int nnz,
-		Pointer alpha, cusparseMatDescr descrA, Pointer csrValA, Pointer csrRowPtrA, Pointer csrColIndA, Pointer B,
-		int ldb, Pointer beta, Pointer C, int ldc) {
-		/* ------------------------------------------------------------------ */
-		/* Descriptors and workspace                                          */
-		/* ------------------------------------------------------------------ */
-		cusparseSpMatDescr matA = new cusparseSpMatDescr();
-		cusparseDnMatDescr matB = new cusparseDnMatDescr();
-		cusparseDnMatDescr matC = new cusparseDnMatDescr();
-		Pointer dBuf = null;
-		int status;
+		Pointer alpha, cusparseMatDescr descrA, cusparseSpMatDescr spMatDescrA, Pointer csrValA, Pointer csrRowPtrA,
+		Pointer csrColIndA, Pointer B, int ldb, Pointer beta, Pointer C, int ldc) {
 
+		int dataType = CUDA_R_32F;
+		int idxBase = cusparseGetMatIndexBase(descrA);
+		// Create sparse matrix A in CSR format
+		cusparseCreateCsr(spMatDescrA, m, n, nnz, csrRowPtrA, csrColIndA, csrValA, CUSPARSE_INDEX_32I,
+			CUSPARSE_INDEX_32I, idxBase, dataType);
+		// Create dense matrix B
+		cusparseDnMatDescr dnMatB = new cusparseDnMatDescr();
+		cusparseCreateDnMat(dnMatB, k, n, ldb, B, dataType, CUSPARSE_ORDER_COL);
+		// Create dense matrix C
+		cusparseDnMatDescr dnMatC = new cusparseDnMatDescr();
+		cusparseCreateDnMat(dnMatC, m, n, ldc, C, dataType, CUSPARSE_ORDER_COL);
+		// allocate an external buffer if needed
+		long[] bufferSize = {0};
+		int alg = CUSPARSE_SPMM_ALG_DEFAULT;
+		cusparseSpMM_bufferSize(handle, transA, transB, alpha, spMatDescrA.asConst(), dnMatB.asConst(), beta, dnMatC,
+			dataType, alg, bufferSize);
+		// execute SpMM
+		Pointer dBuffer = new Pointer();
+		if(bufferSize[0] > 0)
+			cudaMalloc(dBuffer, bufferSize[0]);
 		try {
-			/* 1. CSR matrix A (FP32) -------------------------------------- */
-			JCusparse.cusparseCreateCsr(matA, m, k, nnz, csrRowPtrA, csrColIndA, csrValA, CUSPARSE_INDEX_32I,
-				CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, CUDA_R_32F);
-
-			/* 2. Dense matrix B (column-major) ---------------------------- */
-			int rowsB = (transB == CUSPARSE_OPERATION_NON_TRANSPOSE) ? k : n;
-			int colsB = (transB == CUSPARSE_OPERATION_NON_TRANSPOSE) ? n : k;
-			JCusparse.cusparseCreateDnMat(matB, rowsB, colsB, ldb, B, CUDA_R_32F, CUSPARSE_ORDER_COL);
-
-			/* 3. Dense matrix C (output) ---------------------------------- */
-			int rowsC = (transA == CUSPARSE_OPERATION_NON_TRANSPOSE) ? m : k;
-			int colsC = colsB;   // always equals n
-			JCusparse.cusparseCreateDnMat(matC, rowsC, colsC, ldc, C, CUDA_R_32F, CUSPARSE_ORDER_COL);
-
-			/* 4. Query workspace size ------------------------------------- */
-			long[] bufSize = {0};
-			status = JCusparse.cusparseSpMM_bufferSize(handle, transA, transB, alpha, matA.asConst(), matB.asConst(),
-				beta, matC, CUDA_R_32F, CUSPARSE_SPMM_ALG_DEFAULT, bufSize);
-			if(status != CUSPARSE_STATUS_SUCCESS)
-				return status;
-
-			if(bufSize[0] > 0) {
-				dBuf = new Pointer();
-				cudaMalloc(dBuf, bufSize[0]);
-			}
-
-			/* 5. Execute SpMM -------------------------------------------- */
-			status = JCusparse.cusparseSpMM(handle, transA, transB, alpha, matA.asConst(), matB.asConst(), beta, matC,
-				CUDA_R_32F, CUSPARSE_SPMM_ALG_DEFAULT, dBuf);
-
-			return status;
+			return cusparseSpMM(handle, transA, transB, alpha, spMatDescrA.asConst(), dnMatB.asConst(), beta, dnMatC,
+				dataType, alg, dBuffer);
 		}
 		finally {
-			if(dBuf != null)
-				cudaFree(dBuf);
-			JCusparse.cusparseDestroyDnMat(matB.asConst());
-			JCusparse.cusparseDestroyDnMat(matC.asConst());
-			JCusparse.cusparseDestroySpMat(matA.asConst());
+			if(bufferSize[0] > 0)
+				cudaFree(dBuffer);
+			cusparseDestroySpMat(spMatDescrA.asConst());
+			cusparseDestroyDnMat(dnMatB.asConst());
+			cusparseDestroyDnMat(dnMatC.asConst());
 		}
 	}
-
+	
 	@Override
 	public int cublasdot(cublasHandle handle, int n, Pointer x, int incx, Pointer y, int incy, Pointer result) {
 		return JCublas2.cublasSdot(handle, n, x, incx, y, incy, result);
diff --git a/src/test/scripts/functions/lineage/LineageReuseGPU2.dml b/src/test/scripts/functions/lineage/LineageReuseGPU2.dml
index f5bc2655edd..dd223acd05e 100644
--- a/src/test/scripts/functions/lineage/LineageReuseGPU2.dml
+++ b/src/test/scripts/functions/lineage/LineageReuseGPU2.dml
@@ -32,8 +32,8 @@ stp = (0.1 - 0.0001)/no_lamda;
 lamda = 0.0001;
 lim = 0.1;
 
-X = rand(rows=1000, cols=100, sparsity=0.2, seed=42);
-y = rand(rows=1000, cols=1, sparsity=0.2, seed=42);
+X = rand(rows=1000, cols=100, seed=42);
+y = rand(rows=1000, cols=1, seed=42);
 N = ncol(X);
 R = matrix(0, rows=N, cols=no_lamda+2);
 i = 1;

From a3ee4eb9f3fe3422d3959e18ae3c95dfaf597186 Mon Sep 17 00:00:00 2001
From: ReneEnjilian <enjilianrene@gmail.com>
Date: Sat, 21 Jun 2025 20:18:13 +0200
Subject: [PATCH 11/26] rework cusparseDense2csr() and final cleanups

---
 .../DoublePrecisionCudaSupportFunctions.java  |  80 +++++------
 .../runtime/matrix/data/LibMatrixCUDA.java    |   2 +-
 .../SinglePrecisionCudaSupportFunctions.java  | 129 +++++++-----------
 3 files changed, 87 insertions(+), 124 deletions(-)

diff --git a/src/main/java/org/apache/sysds/runtime/matrix/data/DoublePrecisionCudaSupportFunctions.java b/src/main/java/org/apache/sysds/runtime/matrix/data/DoublePrecisionCudaSupportFunctions.java
index bf47d73df9e..2a5e03593f4 100644
--- a/src/main/java/org/apache/sysds/runtime/matrix/data/DoublePrecisionCudaSupportFunctions.java
+++ b/src/main/java/org/apache/sysds/runtime/matrix/data/DoublePrecisionCudaSupportFunctions.java
@@ -298,58 +298,52 @@ public int cusparsecsr2dense(cusparseHandle handle, int m, int n, cusparseMatDes
 	@Override
 	public int cusparsedense2csr(cusparseHandle handle, int m, int n, cusparseMatDescr descrA, Pointer A, int lda,
 		Pointer nnzPerRow, Pointer csrValA, Pointer csrRowPtrA, Pointer csrColIndA) {
-		/* ------------------------------------------------------------------ */
-		/* 1. Determine index base and wrap the input/output in descriptors   */
-		/* ------------------------------------------------------------------ */
-		int idxBase = JCusparse.cusparseGetMatIndexBase(descrA);
+		// setup
+		int dataType = CUDA_R_64F;
+		cusparseSpMatDescr csrDesc = new cusparseSpMatDescr();
+		cusparseDnMatDescr denseDesc = new cusparseDnMatDescr();
+		int idxBase = cusparseGetMatIndexBase(descrA);
+		int alg = CUSPARSE_DENSETOSPARSE_ALG_DEFAULT;
+		long[] bufferSize = {0};
+		Pointer dBuffer = new Pointer();
 
-		cusparseDnMatDescr matDense = new cusparseDnMatDescr();
-		JCusparse.cusparseCreateDnMat(matDense, m, n, lda, A, CUDA_R_64F, CUSPARSE_ORDER_COL);
+		// Create dense matrix A
+		cusparseCreateDnMat(denseDesc, m, n, lda, A, dataType, CUSPARSE_ORDER_COL);
 
-		cusparseSpMatDescr matCsr = new cusparseSpMatDescr();
-		/* nnz initially 0 – cuSPARSE fills it during analysis phase */
-		JCusparse.cusparseCreateCsr(matCsr, m, n, 0L, csrRowPtrA, csrColIndA, csrValA, CUSPARSE_INDEX_32I,
-			CUSPARSE_INDEX_32I, idxBase, CUDA_R_64F);
+		// Create sparse matrix B in CSR format
+		cusparseCreateCsr(csrDesc, m, n, 0, csrRowPtrA, csrColIndA, csrValA, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
+			idxBase, dataType);
 
-		/* ------------------------------------------------------------------ */
-		/* 2. Query temporary buffer size                                     */
-		/* ------------------------------------------------------------------ */
-		long[] bufSz = {0};
-		int alg = CUSPARSE_DENSETOSPARSE_ALG_DEFAULT;
+		// allocate an external buffer if needed
+		cusparseDenseToSparse_bufferSize(handle, denseDesc.asConst(), csrDesc, alg, bufferSize);
+		if(bufferSize[0] > 0)
+			cudaMalloc(dBuffer, bufferSize[0]);
 
-		int status = JCusparse.cusparseDenseToSparse_bufferSize(handle, matDense.asConst(), matCsr, alg, bufSz);
-		if(status != CUSPARSE_STATUS_SUCCESS) {
-			JCusparse.cusparseDestroySpMat(matCsr.asConst());
-			JCusparse.cusparseDestroyDnMat(matDense.asConst());
-			return status;
-		}
+		// prepare Sparse to Dense conversion
+		cusparseDenseToSparse_analysis(handle, denseDesc.asConst(), csrDesc, alg, dBuffer);
 
-		Pointer buffer = new Pointer();
-		if(bufSz[0] > 0)
-			cudaMalloc(buffer, bufSz[0]);
+		/** Keep this in case needed later for debugging*/
+		/*long[] rowsTmp = {0}, colsTmp = {0}, nnz  = {0};
+		JCusparse.cusparseSpMatGetSize(csrDesc.asConst(), rowsTmp, colsTmp, nnz);
 
-		try {
-			/* -------------------------------------------------------------- */
-			/* 3. Symbolic pass: decide sparsity pattern, fill csrRowPtrA     */
-			/* -------------------------------------------------------------- */
-			status = JCusparse.cusparseDenseToSparse_analysis(handle, matDense.asConst(), matCsr, alg, buffer);
-			if(status != CUSPARSE_STATUS_SUCCESS)
-				return status;
-
-			/* -------------------------------------------------------------- */
-			/* 4. Numeric conversion: fill csrColIndA and csrValA             */
-			/* -------------------------------------------------------------- */
-			status = JCusparse.cusparseDenseToSparse_convert(handle, matDense.asConst(), matCsr, alg, buffer);
-			if(status != CUSPARSE_STATUS_SUCCESS)
-				return status;
+		// only allocate if caller passed null pointers
+		if (csrColIndA == null)
+			cudaMalloc(csrColIndA, nnz[0] * Sizeof.INT);
+		if (csrValA == null)
+			cudaMalloc(csrValA, nnz[0] * Sizeof.DOUBLE);*/
 
-			return status;
+		// re-attach column & value pointers
+		JCusparse.cusparseCsrSetPointers(csrDesc, csrRowPtrA, csrColIndA, csrValA);
+
+		try {
+			// execute Sparse to Dense conversion
+			return cusparseDenseToSparse_convert(handle, denseDesc.asConst(), csrDesc, alg, dBuffer);
 		}
 		finally {
-			if(bufSz[0] > 0)
-				cudaFree(buffer);
-			JCusparse.cusparseDestroySpMat(matCsr.asConst());
-			JCusparse.cusparseDestroyDnMat(matDense.asConst());
+			cusparseDestroyDnMat(denseDesc.asConst());
+			cusparseDestroySpMat(csrDesc.asConst());
+			if(bufferSize[0] > 0)
+				cudaFree(dBuffer);
 		}
 	}
 
diff --git a/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixCUDA.java b/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixCUDA.java
index 974d131912d..3a9cf83e792 100644
--- a/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixCUDA.java
+++ b/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixCUDA.java
@@ -1629,7 +1629,7 @@ private static void dgeam(ExecutionContext ec, GPUContext gCtx, String instName,
 			if (in1 == in2 && isLeftTransposed == true && isLeftTransposed == isRightTransposed) {
 				// Special case for transpose
 
-				int nnz = (int)A.nnz;
+				int nnz = toInt(A.nnz);
 				CSRPointer C = CSRPointer.allocateEmpty(gCtx, nnz, n);
 				out.getGPUObject(gCtx).setSparseMatrixCudaPointer(C);
 				cudaSupportFunctions.cusparsecsr2csc(getCusparseHandle(gCtx), m, n, nnz, A.val, A.rowPtr, A.colInd, C.val, C.colInd, C.rowPtr, cusparseAction.CUSPARSE_ACTION_NUMERIC, cusparseIndexBase.CUSPARSE_INDEX_BASE_ZERO);
diff --git a/src/main/java/org/apache/sysds/runtime/matrix/data/SinglePrecisionCudaSupportFunctions.java b/src/main/java/org/apache/sysds/runtime/matrix/data/SinglePrecisionCudaSupportFunctions.java
index 8ee3e5b3013..ab9f6d82be3 100644
--- a/src/main/java/org/apache/sysds/runtime/matrix/data/SinglePrecisionCudaSupportFunctions.java
+++ b/src/main/java/org/apache/sysds/runtime/matrix/data/SinglePrecisionCudaSupportFunctions.java
@@ -157,7 +157,7 @@ public int cusparsecsrmm2(cusparseHandle handle, int transA, int transB, int m,
 			cusparseDestroyDnMat(dnMatC.asConst());
 		}
 	}
-	
+
 	@Override
 	public int cublasdot(cublasHandle handle, int n, Pointer x, int incx, Pointer y, int incy, Pointer result) {
 		return JCublas2.cublasSdot(handle, n, x, incx, y, incy, result);
@@ -238,40 +238,26 @@ public int cusparsecsrgeam(cusparseHandle handle, int m, int n, Pointer alpha, c
 		Pointer csrValA, Pointer csrRowPtrA, Pointer csrColIndA, Pointer beta, cusparseMatDescr descrB, int nnzB,
 		Pointer csrValB, Pointer csrRowPtrB, Pointer csrColIndB, cusparseMatDescr descrC, Pointer csrValC,
 		Pointer csrRowPtrC, Pointer csrColIndC) {
-		/* ------------------------------------------------------------------ */
-		/* 1. Query temporary-buffer size                                     */
-		/* ------------------------------------------------------------------ */
-		long[] bufSize = {0};
 
-		int status = JCusparse.cusparseScsrgeam2_bufferSizeExt(handle, m, n, alpha, descrA, nnzA, csrValA, csrRowPtrA,
+		long[] pBufferSizeInBytes = {0};
+
+		int status = JCusparse.cusparseDcsrgeam2_bufferSizeExt(handle, m, n, alpha, descrA, nnzA, csrValA, csrRowPtrA,
 			csrColIndA, beta, descrB, nnzB, csrValB, csrRowPtrB, csrColIndB, descrC, csrValC, csrRowPtrC, csrColIndC,
-			bufSize);
+			pBufferSizeInBytes);
 		if(status != CUSPARSE_STATUS_SUCCESS)
 			return status;
 
-		/* ------------------------------------------------------------------ */
-		/* 2. Allocate workspace (if needed)                                  */
-		/* ------------------------------------------------------------------ */
-		Pointer buffer = null;
-		if(bufSize[0] > 0) {
-			buffer = new Pointer();
-			cudaMalloc(buffer, bufSize[0]);
-		}
+		Pointer buffer = new Pointer();
+		if(pBufferSizeInBytes[0] > 0)
+			cudaMalloc(buffer, pBufferSizeInBytes[0]);
 
 		try {
-			/* -------------------------------------------------------------- */
-			/* 3. Perform C = α*A  +  β*B                                     */
-			/* -------------------------------------------------------------- */
-			status = JCusparse.cusparseScsrgeam2(handle, m, n, alpha, descrA, nnzA, csrValA, csrRowPtrA, csrColIndA,
-				beta, descrB, nnzB, csrValB, csrRowPtrB, csrColIndB, descrC, csrValC, csrRowPtrC, csrColIndC, buffer);
-
-			return status;   // propagate cuSPARSE return code
+			// C = α*A + β*B
+			return JCusparse.cusparseDcsrgeam2(handle, m, n, alpha, descrA, nnzA, csrValA, csrRowPtrA, csrColIndA, beta,
+				descrB, nnzB, csrValB, csrRowPtrB, csrColIndB, descrC, csrValC, csrRowPtrC, csrColIndC, buffer);
 		}
 		finally {
-			/* -------------------------------------------------------------- */
-			/* 4. Free workspace                                              */
-			/* -------------------------------------------------------------- */
-			if(buffer != null)
+			if(pBufferSizeInBytes[0] > 0)
 				cudaFree(buffer);
 		}
 	}
@@ -322,69 +308,52 @@ public int cusparsecsr2dense(cusparseHandle handle, int m, int n, cusparseMatDes
 	@Override
 	public int cusparsedense2csr(cusparseHandle handle, int m, int n, cusparseMatDescr descrA, Pointer A, int lda,
 		Pointer nnzPerRow, Pointer csrValA, Pointer csrRowPtrA, Pointer csrColIndA) {
-		/* ------------------------------------------------------------------ */
-		/* 0.  Index base (0 or 1) comes from the descriptor                  */
-		/* ------------------------------------------------------------------ */
-		int idxBase = JCusparse.cusparseGetMatIndexBase(descrA);
+		// setup
+		int dataType = CUDA_R_32F;
+		cusparseSpMatDescr csrDesc = new cusparseSpMatDescr();
+		cusparseDnMatDescr denseDesc = new cusparseDnMatDescr();
+		int idxBase = cusparseGetMatIndexBase(descrA);
+		int alg = CUSPARSE_DENSETOSPARSE_ALG_DEFAULT;
+		long[] bufferSize = {0};
+		Pointer dBuffer = new Pointer();
 
-		/* ------------------------------------------------------------------ */
-		/* 1.  Create dense-matrix and CSR descriptors (FP32)                 */
-		/* ------------------------------------------------------------------ */
-		cusparseDnMatDescr matDense = new cusparseDnMatDescr();
-		JCusparse.cusparseCreateDnMat(matDense, m, n, lda, A, CUDA_R_32F, CUSPARSE_ORDER_COL);
+		// Create dense matrix A
+		cusparseCreateDnMat(denseDesc, m, n, lda, A, dataType, CUSPARSE_ORDER_COL);
 
-		cusparseSpMatDescr matCsr = new cusparseSpMatDescr();
-		/* nnz initially 0 – cuSPARSE will fill it during analysis           */
-		JCusparse.cusparseCreateCsr(matCsr, m, n, 0L, csrRowPtrA, csrColIndA, csrValA, CUSPARSE_INDEX_32I,
-			CUSPARSE_INDEX_32I, idxBase, CUDA_R_32F);
+		// Create sparse matrix B in CSR format
+		cusparseCreateCsr(csrDesc, m, n, 0, csrRowPtrA, csrColIndA, csrValA, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
+			idxBase, dataType);
 
-		/* ------------------------------------------------------------------ */
-		/* 2.  Query workspace size                                           */
-		/* ------------------------------------------------------------------ */
-		long[] bufSize = {0};
-		int alg = CUSPARSE_DENSETOSPARSE_ALG_DEFAULT;
+		// allocate an external buffer if needed
+		cusparseDenseToSparse_bufferSize(handle, denseDesc.asConst(), csrDesc, alg, bufferSize);
+		if(bufferSize[0] > 0)
+			cudaMalloc(dBuffer, bufferSize[0]);
 
-		int status = JCusparse.cusparseDenseToSparse_bufferSize(handle, matDense.asConst(), matCsr, alg, bufSize);
-		if(status != CUSPARSE_STATUS_SUCCESS) {
-			JCusparse.cusparseDestroySpMat(matCsr.asConst());
-			JCusparse.cusparseDestroyDnMat(matDense.asConst());
-			return status;
-		}
+		// prepare Sparse to Dense conversion
+		cusparseDenseToSparse_analysis(handle, denseDesc.asConst(), csrDesc, alg, dBuffer);
 
-		/* ------------------------------------------------------------------ */
-		/* 3.  Allocate workspace (if required)                               */
-		/* ------------------------------------------------------------------ */
-		Pointer buffer = null;
-		if(bufSize[0] > 0) {
-			buffer = new Pointer();
-			cudaMalloc(buffer, bufSize[0]);
-		}
+		/** Keep this in case needed later for debugging*/
+		/*long[] rowsTmp = {0}, colsTmp = {0}, nnz  = {0};
+		JCusparse.cusparseSpMatGetSize(csrDesc.asConst(), rowsTmp, colsTmp, nnz);
+
+		// only allocate if caller passed null pointers
+		if (csrColIndA == null)
+			cudaMalloc(csrColIndA, nnz[0] * Sizeof.INT);
+		if (csrValA == null)
+			cudaMalloc(csrValA, nnz[0] * Sizeof.DOUBLE);*/
+
+		// re-attach column & value pointers
+		JCusparse.cusparseCsrSetPointers(csrDesc, csrRowPtrA, csrColIndA, csrValA);
 
 		try {
-			/* -------------------------------------------------------------- */
-			/* 4.  Phase-1: symbolic pass                                     */
-			/* -------------------------------------------------------------- */
-			status = JCusparse.cusparseDenseToSparse_analysis(handle, matDense.asConst(), matCsr, alg, buffer);
-			if(status != CUSPARSE_STATUS_SUCCESS)
-				return status;
-
-			/* -------------------------------------------------------------- */
-			/* 5.  Phase-2: numeric conversion                                */
-			/* -------------------------------------------------------------- */
-			status = JCusparse.cusparseDenseToSparse_convert(handle, matDense.asConst(), matCsr, alg, buffer);
-			if(status != CUSPARSE_STATUS_SUCCESS)
-				return status;
-
-			return status;   // success
+			// execute Sparse to Dense conversion
+			return cusparseDenseToSparse_convert(handle, denseDesc.asConst(), csrDesc, alg, dBuffer);
 		}
 		finally {
-			/* -------------------------------------------------------------- */
-			/* 7.  Cleanup                                                    */
-			/* -------------------------------------------------------------- */
-			if(buffer != null)
-				cudaFree(buffer);
-			JCusparse.cusparseDestroySpMat(matCsr.asConst());
-			JCusparse.cusparseDestroyDnMat(matDense.asConst());
+			cusparseDestroyDnMat(denseDesc.asConst());
+			cusparseDestroySpMat(csrDesc.asConst());
+			if(bufferSize[0] > 0)
+				cudaFree(dBuffer);
 		}
 	}
 

From fdff4a4047cf088dac3f0d95462d87298edb44b4 Mon Sep 17 00:00:00 2001
From: ReneEnjilian <enjilianrene@gmail.com>
Date: Sat, 21 Jun 2025 20:22:51 +0200
Subject: [PATCH 12/26] reinstate checkGPU()

---
 .../apache/sysds/test/functions/lineage/GPUFullReuseTest.java | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/test/java/org/apache/sysds/test/functions/lineage/GPUFullReuseTest.java b/src/test/java/org/apache/sysds/test/functions/lineage/GPUFullReuseTest.java
index 841511b31f0..74bd7fc47de 100644
--- a/src/test/java/org/apache/sysds/test/functions/lineage/GPUFullReuseTest.java
+++ b/src/test/java/org/apache/sysds/test/functions/lineage/GPUFullReuseTest.java
@@ -45,12 +45,12 @@ public class GPUFullReuseTest extends AutomatedTestBase{
 	protected static final int TEST_VARIANTS = 4;
 	protected String TEST_CLASS_DIR = TEST_DIR + GPUFullReuseTest.class.getSimpleName() + "/";
 
-	/*@BeforeClass
+	@BeforeClass
 	public static void checkGPU() {
 		// Skip all the tests if no GPU is available
 		// FIXME: Fails to skip if gpu available but no libraries
 		Assume.assumeTrue(TestUtils.isGPUAvailable() == cudaError.cudaSuccess);
-	}*/
+	}
 
 	@Override
 	public void setUp() {

From 5f60373015e90c5de9960f2474b7d4006fee4865 Mon Sep 17 00:00:00 2001
From: ReneEnjilian <enjilianrene@gmail.com>
Date: Sun, 13 Jul 2025 15:45:36 +0200
Subject: [PATCH 13/26] add testing for dnn operations

---
 .../sysds/test/gpu/NeuralNetworkGPUTest.java  | 276 ++++++++++++++++++
 1 file changed, 276 insertions(+)
 create mode 100644 src/test/java/org/apache/sysds/test/gpu/NeuralNetworkGPUTest.java

diff --git a/src/test/java/org/apache/sysds/test/gpu/NeuralNetworkGPUTest.java b/src/test/java/org/apache/sysds/test/gpu/NeuralNetworkGPUTest.java
new file mode 100644
index 00000000000..30ddb44ead6
--- /dev/null
+++ b/src/test/java/org/apache/sysds/test/gpu/NeuralNetworkGPUTest.java
@@ -0,0 +1,276 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.sysds.test.gpu;
+
+import org.apache.sysds.test.AutomatedTestBase;
+import org.apache.sysds.test.functions.dnn.Conv1DTest;
+import org.apache.sysds.test.functions.dnn.Conv2DTest;
+import org.apache.sysds.test.functions.dnn.Conv2DBackwardTest;
+import org.apache.sysds.test.functions.dnn.Conv2DBackwardDataTest;
+import org.apache.sysds.test.functions.dnn.PoolTest;
+import org.apache.sysds.test.functions.dnn.PoolBackwardTest;
+import org.apache.sysds.test.functions.dnn.ReluBackwardTest;
+import org.junit.Assert;
+import org.junit.Test;
+
+public class NeuralNetworkGPUTest extends AutomatedTestBase {
+
+	@Override
+	public void setUp() {
+		TEST_GPU = true;
+		VERBOSE_STATS = true;
+	}
+
+	@Test
+	public void Conv1DGPUTest() {
+		Conv1DTest dmlTestCase = new Conv1DTest();
+		dmlTestCase.setUpBase();
+		dmlTestCase.setUp();
+		dmlTestCase.testSimpleConv1DDenseSingleBatchSingleChannelSingleFilter();
+		Assert.assertTrue(heavyHittersContainsString("gpu_conv2d"));
+		dmlTestCase.testConv1DDense1();
+		Assert.assertTrue(heavyHittersContainsAllString("gpu_conv2d", "gpu_append"));
+		dmlTestCase.testConv1DDense2();
+		Assert.assertTrue(heavyHittersContainsString("gpu_conv2d"));
+		dmlTestCase.testConv1DDense3();
+		Assert.assertTrue(heavyHittersContainsString("gpu_conv2d"));
+		dmlTestCase.testConv1DDense4();
+		Assert.assertTrue(heavyHittersContainsString("gpu_conv2d"));
+		dmlTestCase.testConv1DDense5();
+		Assert.assertTrue(heavyHittersContainsString("gpu_conv2d"));
+		dmlTestCase.testConv1DDense6();
+		Assert.assertTrue(heavyHittersContainsString("gpu_conv2d"));
+		dmlTestCase.testConv1DDense7();
+		Assert.assertTrue(heavyHittersContainsString("gpu_conv2d"));
+		dmlTestCase.testConv1DBackwardDataDense1();
+		Assert.assertTrue(heavyHittersContainsString("gpu_conv2d_backward_data"));
+		dmlTestCase.testConv1DBackwardFilterDense1();
+		Assert.assertTrue(heavyHittersContainsString("gpu_conv2d_backward_filter"));
+		dmlTestCase.testConv1DBackwardFilterDense2();
+		Assert.assertTrue(heavyHittersContainsString("gpu_conv2d_backward_filter"));
+	}
+
+	@Test
+	public void Conv2DGPUTest() {
+		Conv2DTest dmlTestCase = new Conv2DTest();
+		dmlTestCase.setUpBase();
+		dmlTestCase.setUp();
+		dmlTestCase.testConv2DDense1();
+		Assert.assertTrue(heavyHittersContainsString("gpu_conv2d_bias_add"));
+		dmlTestCase.testConv2DDense2();
+		Assert.assertTrue(heavyHittersContainsString("gpu_conv2d_bias_add"));
+		dmlTestCase.testConv2DDense3();
+		Assert.assertTrue(heavyHittersContainsString("gpu_conv2d_bias_add"));
+		dmlTestCase.testConv2DDense4();
+		Assert.assertTrue(heavyHittersContainsString("gpu_conv2d_bias_add"));
+		dmlTestCase.testConv2DDense5();
+		Assert.assertTrue(heavyHittersContainsString("gpu_conv2d_bias_add"));
+		dmlTestCase.testConv2DDense6();
+		Assert.assertTrue(heavyHittersContainsString("gpu_conv2d_bias_add"));
+		dmlTestCase.testConv2DDense7();
+		Assert.assertTrue(heavyHittersContainsString("gpu_conv2d_bias_add"));
+		dmlTestCase.testConv2DSparse1a();
+		Assert.assertTrue(heavyHittersContainsAllString("gpu_conv2d_bias_add", "gpu_*", "gpu_>"));
+		dmlTestCase.testConv2DSparse2a();
+		Assert.assertTrue(heavyHittersContainsAllString("gpu_conv2d_bias_add", "gpu_*", "gpu_>"));
+		dmlTestCase.testConv2DSparse3a();
+		Assert.assertTrue(heavyHittersContainsAllString("gpu_conv2d_bias_add", "gpu_*", "gpu_>"));
+		dmlTestCase.testConv2DSparse4a();
+		Assert.assertTrue(heavyHittersContainsAllString("gpu_conv2d_bias_add", "gpu_*", "gpu_>"));
+		dmlTestCase.testConv2DSparse5a();
+		Assert.assertTrue(heavyHittersContainsAllString("gpu_conv2d_bias_add", "gpu_*", "gpu_>"));
+		dmlTestCase.testConv2DSparse6a();
+		Assert.assertTrue(heavyHittersContainsAllString("gpu_conv2d_bias_add", "gpu_*", "gpu_>"));
+		dmlTestCase.testConv2DSparse7a();
+		Assert.assertTrue(heavyHittersContainsAllString("gpu_conv2d_bias_add", "gpu_*", "gpu_>"));
+		dmlTestCase.testConv2DSparse1b();
+		Assert.assertTrue(heavyHittersContainsAllString("gpu_conv2d_bias_add", "gpu_*", "gpu_>"));
+		dmlTestCase.testConv2DSparse2b();
+		Assert.assertTrue(heavyHittersContainsAllString("gpu_conv2d_bias_add", "gpu_*", "gpu_>"));
+		dmlTestCase.testConv2DSparse3b();
+		Assert.assertTrue(heavyHittersContainsAllString("gpu_conv2d_bias_add", "gpu_*", "gpu_>"));
+		dmlTestCase.testConv2DSparse4b();
+		Assert.assertTrue(heavyHittersContainsAllString("gpu_conv2d_bias_add", "gpu_*", "gpu_>"));
+		dmlTestCase.testConv2DSparse5b();
+		Assert.assertTrue(heavyHittersContainsAllString("gpu_conv2d_bias_add", "gpu_*", "gpu_>"));
+		dmlTestCase.testConv2DSparse6b();
+		Assert.assertTrue(heavyHittersContainsAllString("gpu_conv2d_bias_add", "gpu_*", "gpu_>"));
+		dmlTestCase.testConv2DSparse7b();
+		Assert.assertTrue(heavyHittersContainsAllString("gpu_conv2d_bias_add", "gpu_*", "gpu_>"));
+	}
+
+	@Test
+	public void Conv2DBackwardGPUTest() {
+		Conv2DBackwardTest dmlTestCase = new Conv2DBackwardTest();
+		dmlTestCase.setUpBase();
+		dmlTestCase.setUp();
+		dmlTestCase.testConv2DBackwardFilterDense1();
+		Assert.assertTrue(heavyHittersContainsString("gpu_conv2d_backward_filter"));
+		dmlTestCase.testConv2DBackwardFilterDense2();
+		Assert.assertTrue(heavyHittersContainsString("gpu_conv2d_backward_filter"));
+		dmlTestCase.testConv2DBackwardFilterDense3();
+		Assert.assertTrue(heavyHittersContainsString("gpu_conv2d_backward_filter"));
+		dmlTestCase.testConv2DBackwardFilterDense4();
+		Assert.assertTrue(heavyHittersContainsString("gpu_conv2d_backward_filter"));
+		dmlTestCase.testConv2DBackwardFilterDense5();
+		Assert.assertTrue(heavyHittersContainsString("gpu_conv2d_backward_filter"));
+		dmlTestCase.testConv2DBackwardFilterSparse1();
+		Assert.assertTrue(heavyHittersContainsAllString("gpu_conv2d_backward_filter", "gpu_>"));
+		dmlTestCase.testConv2DBackwardFilterSparse2();
+		Assert.assertTrue(heavyHittersContainsAllString("gpu_conv2d_backward_filter", "gpu_>"));
+		dmlTestCase.testConv2DBackwardFilterSparse3();
+		Assert.assertTrue(heavyHittersContainsAllString("gpu_conv2d_backward_filter", "gpu_>"));
+		dmlTestCase.testConv2DBackwardFilterSparse4();
+		Assert.assertTrue(heavyHittersContainsAllString("gpu_conv2d_backward_filter", "gpu_>"));
+		dmlTestCase.testConv2DBackwardFilterSparse5();
+		Assert.assertTrue(heavyHittersContainsAllString("gpu_conv2d_backward_filter", "gpu_>"));
+		dmlTestCase.testConv2DBackwardFilterSparse6();
+		Assert.assertTrue(heavyHittersContainsAllString("gpu_conv2d_backward_filter", "gpu_>"));
+		dmlTestCase.testConv2DBackwardFilterSparse7();
+		Assert.assertTrue(heavyHittersContainsAllString("gpu_conv2d_backward_filter", "gpu_>"));
+		dmlTestCase.testConv2DBackwardFilterSparse8();
+		Assert.assertTrue(heavyHittersContainsAllString("gpu_conv2d_backward_filter", "gpu_>"));
+		dmlTestCase.testConv2DBackwardFilterSparse9();
+		Assert.assertTrue(heavyHittersContainsAllString("gpu_conv2d_backward_filter", "gpu_>"));
+		dmlTestCase.testConv2DBackwardFilterSparse10();
+		Assert.assertTrue(heavyHittersContainsAllString("gpu_conv2d_backward_filter", "gpu_>"));
+		dmlTestCase.testConv2DBackwardFilterSparse11();
+		Assert.assertTrue(heavyHittersContainsAllString("gpu_conv2d_backward_filter", "gpu_>"));
+		dmlTestCase.testConv2DBackwardFilterSparse12();
+		Assert.assertTrue(heavyHittersContainsAllString("gpu_conv2d_backward_filter", "gpu_>"));
+		dmlTestCase.testConv2DBackwardFilterSparse13();
+		Assert.assertTrue(heavyHittersContainsAllString("gpu_conv2d_backward_filter", "gpu_>"));
+		dmlTestCase.testConv2DBackwardFilterSparse14();
+		Assert.assertTrue(heavyHittersContainsAllString("gpu_conv2d_backward_filter", "gpu_>"));
+		dmlTestCase.testConv2DBackwardFilterSparse15();
+		Assert.assertTrue(heavyHittersContainsAllString("gpu_conv2d_backward_filter", "gpu_>"));
+		dmlTestCase.testConv2DBackwardFilterSparse16();
+		Assert.assertTrue(heavyHittersContainsAllString("gpu_conv2d_backward_filter", "gpu_>"));
+		dmlTestCase.testConv2DBackwardFilterSparse17();
+		Assert.assertTrue(heavyHittersContainsAllString("gpu_conv2d_backward_filter", "gpu_>"));
+	}
+
+	@Test
+	public void Conv2DBackwardDataGPUTest() {
+		Conv2DBackwardDataTest dmlTestCase = new Conv2DBackwardDataTest();
+		dmlTestCase.setUpBase();
+		dmlTestCase.setUp();
+		dmlTestCase.testConv2DBwdDataDense1();
+		Assert.assertTrue(heavyHittersContainsString("gpu_conv2d_backward_data"));
+		dmlTestCase.testConv2DDense2();
+		Assert.assertTrue(heavyHittersContainsString("gpu_conv2d_backward_data"));
+		dmlTestCase.testConv2DDense3();
+		Assert.assertTrue(heavyHittersContainsString("gpu_conv2d_backward_data"));
+		dmlTestCase.testConv2DBwdDataDense4();
+		Assert.assertTrue(heavyHittersContainsString("gpu_conv2d_backward_data"));
+		dmlTestCase.testConv2DBwdDataSparse1();
+		Assert.assertTrue(heavyHittersContainsAllString("gpu_conv2d_backward_data", "gpu_>"));
+		dmlTestCase.testConv2DBwdDataSparse2();
+		Assert.assertTrue(heavyHittersContainsAllString("gpu_conv2d_backward_data"));
+		dmlTestCase.testConv2DBwdDataSparse3();
+		Assert.assertTrue(heavyHittersContainsAllString("gpu_conv2d_backward_data", "gpu_>"));
+		dmlTestCase.testConv2DBwdDataSparse4();
+		Assert.assertTrue(heavyHittersContainsAllString("gpu_conv2d_backward_data", "gpu_>"));
+		dmlTestCase.testConv2DBwdDataSparse5();
+		Assert.assertTrue(heavyHittersContainsAllString("gpu_conv2d_backward_data", "gpu_>"));
+		dmlTestCase.testConv2DBwdDataSparse6();
+		Assert.assertTrue(heavyHittersContainsAllString("gpu_conv2d_backward_data", "gpu_>"));
+		dmlTestCase.testConv2DBwdDataSparse7();
+		Assert.assertTrue(heavyHittersContainsAllString("gpu_conv2d_backward_data", "gpu_>"));
+	}
+
+	@Test
+	public void PoolGPUTest() {
+		PoolTest dmlTestCase = new PoolTest();
+		dmlTestCase.setUpBase();
+		dmlTestCase.setUp();
+		dmlTestCase.testMaxPool2DDense1();
+		Assert.assertTrue(heavyHittersContainsString("gpu_maxpooling"));
+		dmlTestCase.testMaxPool2DDense2();
+		Assert.assertTrue(heavyHittersContainsString("gpu_maxpooling"));
+		dmlTestCase.testMaxPool2DDense3();
+		Assert.assertTrue(heavyHittersContainsString("gpu_maxpooling"));
+		dmlTestCase.testMaxPool2DDense4();
+		Assert.assertTrue(heavyHittersContainsString("gpu_maxpooling"));
+		dmlTestCase.testMaxPool2DDense5();
+		Assert.assertTrue(heavyHittersContainsString("gpu_maxpooling"));
+		dmlTestCase.testMaxPool2DSparse1();
+		Assert.assertTrue(heavyHittersContainsAllString("gpu_maxpooling", "gpu_>"));
+		dmlTestCase.testMaxPool2DSparse2();
+		Assert.assertTrue(heavyHittersContainsAllString("gpu_maxpooling", "gpu_>"));
+		dmlTestCase.testMaxPool2DSparse3();
+		Assert.assertTrue(heavyHittersContainsAllString("gpu_maxpooling", "gpu_>"));
+		dmlTestCase.testMaxPool2DSparse4();
+		Assert.assertTrue(heavyHittersContainsAllString("gpu_maxpooling", "gpu_>"));
+		dmlTestCase.testMaxPool2DSparse5();
+		Assert.assertTrue(heavyHittersContainsAllString("gpu_maxpooling", "gpu_>"));
+	}
+
+	@Test
+	public void PoolBackwardGPUTest() {
+		PoolBackwardTest dmlTestCase = new PoolBackwardTest();
+		dmlTestCase.setUpBase();
+		dmlTestCase.setUp();
+		dmlTestCase.testMaxPool2DBackwardDense1();
+		Assert.assertTrue(heavyHittersContainsString("gpu_maxpooling_backward"));
+		dmlTestCase.testMaxPool2DBackwardDense2();
+		Assert.assertTrue(heavyHittersContainsString("gpu_maxpooling_backward"));
+		dmlTestCase.testMaxPool2DBackwardDense3();
+		Assert.assertTrue(heavyHittersContainsString("gpu_maxpooling_backward"));
+		dmlTestCase.testMaxPool2DBackwardSparse1();
+		Assert.assertTrue(heavyHittersContainsAllString("gpu_maxpooling_backward", "gpu_>"));
+		dmlTestCase.testMaxPool2DBackwardSparse2();
+		Assert.assertTrue(heavyHittersContainsAllString("gpu_maxpooling_backward", "gpu_>"));
+		dmlTestCase.testMaxPool2DBackwardSparse3();
+		Assert.assertTrue(heavyHittersContainsAllString("gpu_maxpooling_backward", "gpu_>"));
+		dmlTestCase.testMaxPool2DBackwardSparse4();
+		Assert.assertTrue(heavyHittersContainsAllString("gpu_maxpooling_backward", "gpu_>"));
+		dmlTestCase.testMaxPool2DBackwardSparse5();
+		Assert.assertTrue(heavyHittersContainsAllString("gpu_maxpooling_backward", "gpu_>"));
+		dmlTestCase.testMaxPool2DBackwardSparse6();
+		Assert.assertTrue(heavyHittersContainsAllString("gpu_maxpooling_backward", "gpu_>"));
+		dmlTestCase.testMaxPool2DBackwardSparse7();
+		Assert.assertTrue(heavyHittersContainsAllString("gpu_maxpooling_backward", "gpu_>"));
+		dmlTestCase.testMaxPool2DBackwardSparse8();
+		Assert.assertTrue(heavyHittersContainsAllString("gpu_maxpooling_backward", "gpu_>"));
+		dmlTestCase.testMaxPool2DBackwardSparse9();
+		Assert.assertTrue(heavyHittersContainsAllString("gpu_maxpooling_backward", "gpu_>"));
+		dmlTestCase.testMaxPool2DBackwardSparse10();
+		Assert.assertTrue(heavyHittersContainsAllString("gpu_maxpooling_backward", "gpu_>"));
+		dmlTestCase.testMaxPool2DBackwardSparse11();
+		Assert.assertTrue(heavyHittersContainsAllString("gpu_maxpooling_backward", "gpu_>"));
+		dmlTestCase.testMaxPool2DBackwardSparse12();
+		Assert.assertTrue(heavyHittersContainsAllString("gpu_maxpooling_backward", "gpu_>"));
+	}
+
+	@Test
+	public void ReluBackwardGPUTest() {
+		ReluBackwardTest dmlTestCase = new ReluBackwardTest();
+		dmlTestCase.setUpBase();
+		dmlTestCase.setUp();
+		dmlTestCase.testReluBackwardDense1();
+		Assert.assertTrue(heavyHittersContainsString("gpu_relu_backward"));
+		dmlTestCase.testReluBackwardDense2();
+		Assert.assertTrue(heavyHittersContainsString("gpu_relu_backward"));
+		dmlTestCase.testReluBackwardDense3();
+		Assert.assertTrue(heavyHittersContainsString("gpu_relu_backward"));
+	}
+
+}

From 0c48fffa73db764ccd32e06fa5e327ec5f711ebb Mon Sep 17 00:00:00 2001
From: ReneEnjilian <enjilianrene@gmail.com>
Date: Sun, 13 Jul 2025 15:52:23 +0200
Subject: [PATCH 14/26] adjust class name

---
 .../{NeuralNetworkGPUTest.java => DNNOperationsGPUTest.java}    | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)
 rename src/test/java/org/apache/sysds/test/gpu/{NeuralNetworkGPUTest.java => DNNOperationsGPUTest.java} (99%)

diff --git a/src/test/java/org/apache/sysds/test/gpu/NeuralNetworkGPUTest.java b/src/test/java/org/apache/sysds/test/gpu/DNNOperationsGPUTest.java
similarity index 99%
rename from src/test/java/org/apache/sysds/test/gpu/NeuralNetworkGPUTest.java
rename to src/test/java/org/apache/sysds/test/gpu/DNNOperationsGPUTest.java
index 30ddb44ead6..ea7a564f8e3 100644
--- a/src/test/java/org/apache/sysds/test/gpu/NeuralNetworkGPUTest.java
+++ b/src/test/java/org/apache/sysds/test/gpu/DNNOperationsGPUTest.java
@@ -30,7 +30,7 @@
 import org.junit.Assert;
 import org.junit.Test;
 
-public class NeuralNetworkGPUTest extends AutomatedTestBase {
+public class DNNOperationsGPUTest extends AutomatedTestBase {
 
 	@Override
 	public void setUp() {

From 1753307e2c59f3af7819b0172ec019d3f961e1f6 Mon Sep 17 00:00:00 2001
From: ReneEnjilian <enjilianrene@gmail.com>
Date: Mon, 14 Jul 2025 03:30:40 +0200
Subject: [PATCH 15/26] add resnet18 test for GPU

---
 .uuid                                         |   1 +
 bin/.uuid                                     |   1 +
 conf/.uuid                                    |   1 +
 dev/.uuid                                     |   1 +
 docker/.uuid                                  |   1 +
 docs/.uuid                                    |   1 +
 scripts/.uuid                                 |   1 +
 src/.uuid                                     |   1 +
 .../sysds/test/gpu/nn/ResNet18GPUTest.java    | 101 ++++++
 src/test/scripts/gpu/nn/ResNet18GPU.dml       | 342 ++++++++++++++++++
 10 files changed, 451 insertions(+)
 create mode 100644 .uuid
 create mode 100644 bin/.uuid
 create mode 100644 conf/.uuid
 create mode 100644 dev/.uuid
 create mode 100644 docker/.uuid
 create mode 100644 docs/.uuid
 create mode 100644 scripts/.uuid
 create mode 100644 src/.uuid
 create mode 100644 src/test/java/org/apache/sysds/test/gpu/nn/ResNet18GPUTest.java
 create mode 100644 src/test/scripts/gpu/nn/ResNet18GPU.dml

diff --git a/.uuid b/.uuid
new file mode 100644
index 00000000000..c7461c96769
--- /dev/null
+++ b/.uuid
@@ -0,0 +1 @@
+2b0ea405-adcb-44dd-b0e8-cbf7394f7736
\ No newline at end of file
diff --git a/bin/.uuid b/bin/.uuid
new file mode 100644
index 00000000000..63fdd3cd025
--- /dev/null
+++ b/bin/.uuid
@@ -0,0 +1 @@
+adc2d874-7518-4920-a601-2184c5762423
\ No newline at end of file
diff --git a/conf/.uuid b/conf/.uuid
new file mode 100644
index 00000000000..56a41cb8941
--- /dev/null
+++ b/conf/.uuid
@@ -0,0 +1 @@
+d0dfa9b1-cfe3-4f9e-97a6-64cbafefd004
\ No newline at end of file
diff --git a/dev/.uuid b/dev/.uuid
new file mode 100644
index 00000000000..0473b866fa9
--- /dev/null
+++ b/dev/.uuid
@@ -0,0 +1 @@
+3c726e72-1a65-4bbb-9732-07b261b9cbee
\ No newline at end of file
diff --git a/docker/.uuid b/docker/.uuid
new file mode 100644
index 00000000000..20e4a0fdf5a
--- /dev/null
+++ b/docker/.uuid
@@ -0,0 +1 @@
+7ee583bd-41f8-4c33-8222-964def6e9a08
\ No newline at end of file
diff --git a/docs/.uuid b/docs/.uuid
new file mode 100644
index 00000000000..719e219a0a0
--- /dev/null
+++ b/docs/.uuid
@@ -0,0 +1 @@
+16051ef9-1f88-42f0-839d-4add14b6844c
\ No newline at end of file
diff --git a/scripts/.uuid b/scripts/.uuid
new file mode 100644
index 00000000000..e3a2f189086
--- /dev/null
+++ b/scripts/.uuid
@@ -0,0 +1 @@
+85419957-235e-4d35-a1cb-a16d7a558678
\ No newline at end of file
diff --git a/src/.uuid b/src/.uuid
new file mode 100644
index 00000000000..c8b39ff66b0
--- /dev/null
+++ b/src/.uuid
@@ -0,0 +1 @@
+418bbd1a-de61-4d8a-83d6-01f804855c6b
\ No newline at end of file
diff --git a/src/test/java/org/apache/sysds/test/gpu/nn/ResNet18GPUTest.java b/src/test/java/org/apache/sysds/test/gpu/nn/ResNet18GPUTest.java
new file mode 100644
index 00000000000..d306271e508
--- /dev/null
+++ b/src/test/java/org/apache/sysds/test/gpu/nn/ResNet18GPUTest.java
@@ -0,0 +1,101 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.sysds.test.gpu.nn;
+
+import jcuda.CudaException;
+import jcuda.runtime.JCuda;
+import jcuda.runtime.cudaError;
+import org.apache.sysds.runtime.matrix.data.MatrixValue;
+import org.apache.sysds.test.AutomatedTestBase;
+import org.apache.sysds.test.TestConfiguration;
+import org.apache.sysds.test.TestUtils;
+import org.junit.Assert;
+import org.junit.Assume;
+import org.junit.BeforeClass;
+import org.junit.Test;
+
+import java.util.HashMap;
+
+public class ResNet18GPUTest extends AutomatedTestBase {
+
+	private static final String TEST_NAME = "ResNet18GPU";
+	private static final String TEST_DIR = "gpu/nn/";
+	private static final String TEST_CLASS_DIR = TEST_DIR + ResNet18GPUTest.class.getSimpleName() + "/";
+
+	private static final double eps = Math.pow(10, -10);
+
+	@BeforeClass
+	public static void checkGPU() {
+		boolean gpuAvailable = false;
+		try {
+			// Ask JCuda to throw Java exceptions (much nicer than error codes)
+			JCuda.setExceptionsEnabled(true);
+
+			// How many devices does the runtime see?
+			int[] devCount = {0};
+			int status = JCuda.cudaGetDeviceCount(devCount);
+
+			gpuAvailable = (status == cudaError.cudaSuccess) && (devCount[0] > 0);
+		}
+		catch(UnsatisfiedLinkError | CudaException ex) {
+			// - native JCuda libs not on the class-path
+			// - or they were built for the wrong CUDA version
+			gpuAvailable = false;
+		}
+
+		Assume.assumeTrue("Skipping GPU test: no compatible CUDA device " + "or JCuda native libraries not available.",
+			gpuAvailable);
+	}
+
+	@Override
+	public void setUp() {
+		TestUtils.clearAssertionInformation();
+		addTestConfiguration(TEST_NAME, new TestConfiguration(TEST_CLASS_DIR, TEST_NAME, new String[] {"R"}));
+	}
+
+	@Test
+	public void testResnet18GPU() {
+		runResNet18GPU();
+	}
+
+	private void runResNet18GPU() {
+
+		TestConfiguration config = getTestConfiguration(TEST_NAME);
+		loadTestConfiguration(config);
+
+		String HOME = SCRIPT_DIR + TEST_DIR;
+		fullDMLScriptName = HOME + TEST_NAME + ".dml";
+		programArgs = new String[] {"-stats", "-gpu", "-args", output("R")};
+
+		runTest(true, false, null, -1);
+		HashMap<MatrixValue.CellIndex, Double> out = readDMLMatrixFromOutputDir("R");
+
+		double v1 = out.get(new MatrixValue.CellIndex(1, 1));
+		double v2 = out.get(new MatrixValue.CellIndex(1, 2));
+		double v3 = out.get(new MatrixValue.CellIndex(1, 3));
+
+		Assert.assertTrue(v1 == 612 || v1 == 640);
+		Assert.assertEquals(192, v2, 0.0);
+		Assert.assertEquals(192, v3, 0.0);
+
+		Assert.assertTrue(heavyHittersContainsAllString("gpu_conv2d_bias_add", "gpu_batch_norm2d", "gpu_softmax"));
+
+	}
+}
diff --git a/src/test/scripts/gpu/nn/ResNet18GPU.dml b/src/test/scripts/gpu/nn/ResNet18GPU.dml
new file mode 100644
index 00000000000..a78a58c9e8a
--- /dev/null
+++ b/src/test/scripts/gpu/nn/ResNet18GPU.dml
@@ -0,0 +1,342 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+conv2d_forward = function(matrix[double] X, matrix[double] W, matrix[double] b,
+  int C, int Hin, int Win, int Hf, int Wf, int strideh, int stridew,
+  int padh, int padw) return (matrix[double] out, int Hout, int Wout)
+{
+  N = nrow(X)
+  F = nrow(W)
+  Hout = as.integer(floor((Hin + 2*padh - Hf)/strideh + 1))
+  Wout = as.integer(floor((Win + 2*padw - Wf)/stridew + 1))
+  # Convolution - built-in implementation
+  out = conv2d(X, W, input_shape=[N,C,Hin,Win], filter_shape=[F,C,Hf,Wf],
+               stride=[strideh,stridew], padding=[padh,padw])
+  # Add bias term to each output filter
+  out = bias_add(out, b)
+}
+
+conv2d_backward = function(matrix[double] dout, int Hout, int Wout, matrix[double] X,
+  matrix[double] W, matrix[double] b, int C, int Hin, int Win, int Hf, int Wf,
+  int strideh, int stridew, int padh, int padw)
+  return (matrix[double] dX, matrix[double] dW, matrix[double] db)
+{
+  N = nrow(X)
+  F = nrow(W)
+  # Partial derivatives for convolution - built-in implementation
+  dW = conv2d_backward_filter(X, dout, stride=[strideh,stridew], padding=[padh,padw],
+                              input_shape=[N,C,Hin,Win], filter_shape=[F,C,Hf,Wf])
+  dX = conv2d_backward_data(W, dout, stride=[strideh,stridew], padding=[padh,padw],
+                            input_shape=[N,C,Hin,Win], filter_shape=[F,C,Hf,Wf])
+  # Partial derivatives for bias vector
+  # Here we sum each column, reshape to (F, Hout*Wout), and sum each row
+  # to result in the summation for each channel.
+  db = rowSums(matrix(colSums(dout), rows=F, cols=Hout*Wout))  # shape (F, 1)
+}
+
+conv2d_init = function(int F, int C, int Hf, int Wf, int seed = -1)
+  return (matrix[double] W, matrix[double] b) {
+  W = rand(rows=F, cols=C*Hf*Wf, pdf="normal", seed=seed) * sqrt(2.0/(C*Hf*Wf))
+  b = matrix(0, rows=F, cols=1)
+}
+
+bn2d_forward = function(matrix[double] X, int C, int Hin, int Win,
+    double mu, double epsilon) return (matrix[double] out)
+{
+    gamma = matrix(1, rows=C, cols=1)
+    beta = matrix(0, rows=C, cols=1)
+    ema_mean = matrix(0, rows=C, cols=1)
+    ema_var = matrix(1, rows=C, cols=1)
+    ema_mean_upd = ema_mean;
+    ema_var_upd = ema_var;
+    cache_mean = ema_mean;
+    cache_inv_var = ema_var
+    mode = 'train';
+    [out, ema_mean_upd, ema_var_upd, cache_mean, cache_inv_var] = batch_norm2d(X, gamma, beta, ema_mean, ema_var, mode, epsilon, mu)
+}
+
+affine_forward = function(matrix[double] X, matrix[double] W, matrix[double] b) return (matrix[double] out) {
+  out = X %*% W + b;
+}
+
+affine_init = function(int D, int M, int seed = -1 ) return (matrix[double] W, matrix[double] b) {
+  W = rand(rows=D, cols=M, pdf="normal", seed=seed) * sqrt(2.0/D);
+  b = matrix(0, rows=1, cols=M);
+}
+
+relu_forward = function(matrix[double] X) return (matrix[double] out) {
+  out = max(0, X);
+}
+
+max_pool2d_forward = function(matrix[double] X, int C, int Hin, int Win, int Hf, int Wf,
+  int strideh, int stridew, int padh, int padw) return(matrix[double] out, int Hout, int Wout)
+{
+  N = nrow(X)
+  Hout = as.integer(floor((Hin + 2*padh - Hf)/strideh + 1))
+  Wout = as.integer(floor((Win + 2*padw - Wf)/stridew + 1))
+  out = max_pool(X, input_shape=[N,C,Hin,Win], pool_size=[Hf,Wf],
+    stride=[strideh,stridew], padding=[padh,padw])
+}
+
+avg_pool2d_forward = function(matrix[double] X, int C, int Hin, int Win)
+  return (matrix[double] out, int Hout, int Wout) {
+  N = nrow(X)
+  Hout = 1
+  Wout = 1
+  out = avg_pool(X, input_shape=[N,C,Hin,Win], pool_size=[Hin,Win], stride=[1,1], padding=[0, 0])
+}
+
+softmax_forward = function(matrix[double] scores) return (matrix[double] probs) {
+  scores = scores - rowMaxs(scores);  # numerical stability
+  unnorm_probs = exp(scores);  # unnormalized probabilities
+  probs = unnorm_probs / rowSums(unnorm_probs);  # normalized probabilities
+}
+
+basic_block = function(matrix[double] X, int C, int C_base, int Hin, int Win, int strideh,
+    int stridew, matrix[double] WC1, matrix[double] bC1, matrix[double] WC2, matrix[double] bC2)
+  return (matrix[double] out, int Hout, int Wout)
+{
+  mu_bn = 0.1;
+  ep_bn = 1e-05;
+  downsample = strideh > 1 | stridew > 1 | C != C_base;
+  if (downsample) {
+    [WC3, bC3] = conv2d_init(C_base, C, Hf=1, Wf=1, 42);
+  }
+  # Residual Path
+  # conv1 -> bn1 -> relu1
+  [out, Hout, Wout] = conv2d_forward(X,WC1,bC1,C,Hin,Win,3,3,strideh,stridew,1,1);
+  out = bn2d_forward(out,C_base,Hout,Wout,mu_bn,ep_bn);
+  out = relu_forward(out);
+  # conv2 -> bn2 -> relu2
+  [out, Hout, Wout] = conv2d_forward(out,WC2,bC2,C_base,Hout,Wout,3,3,1,1,1,1);
+  out = bn2d_forward(out,C_base,Hout,Wout,mu_bn,ep_bn);
+  # Identity Path
+  identity = X;
+  if (downsample) {
+    # Downsample input
+    [identity, Hout, Wout] = conv2d_forward(X,WC3,bC3,C,Hin,Win,1,1,strideh,stridew,0,0);
+    out = bn2d_forward(identity,C_base,Hout,Wout,mu_bn,ep_bn);
+  }
+  out = relu_forward(out + identity);
+}
+
+getWeights = function(int fel, int lid,
+    matrix[double] W_pt, matrix[double] b_pt,
+    matrix[double] W_init, matrix[double] b_init)
+  return (matrix[double] Wl, matrix[double] bl)
+{
+  if (lid < fel) { #extract pretrained features
+    Wl = W_pt;
+    bl = b_pt;
+  }
+  else {  #use initialized weights
+    Wl = W_init;
+    bl = b_init;
+  }
+}
+
+rwRowIndexMax = function(matrix[double] X, matrix[double] oneVec, matrix[double] idxSeq)
+    return (matrix[double] index) {
+  rm = rowMaxs(X) %*% oneVec;
+  I = X == rm;
+  index = rowMaxs(I * idxSeq);
+}
+
+resnet18_forward = function(matrix[double] X, int C, int Hin, int Win, int K)
+  return (matrix[double] Y_pred)
+{
+  mu_bn = 0.1;
+  ep_bn = 1e-05;
+
+  # Get the transferred layers. FIXME: use pretrained weights
+  [W1_pt, b1_pt] = conv2d_init(64, C, Hf=7, Wf=7, 42);
+  [W2_pt, b2_pt] = conv2d_init(64, 64, Hf=3, Wf=3, 42);
+  [W3_pt, b3_pt] = conv2d_init(64, 64, Hf=3, Wf=3, 42);
+  [W4_pt, b4_pt] = conv2d_init(64, 64, Hf=3, Wf=3, 42);
+  [W5_pt, b5_pt] = conv2d_init(64, 64, Hf=3, Wf=3, 42);
+  [W6_pt, b6_pt] = conv2d_init(128, 64, Hf=3, Wf=3, 42);
+  [W7_pt, b7_pt] = conv2d_init(128, 128, Hf=3, Wf=3, 42);
+  [W8_pt, b8_pt] = conv2d_init(128, 128, Hf=3, Wf=3, 42);
+  [W9_pt, b9_pt] = conv2d_init(128, 128, Hf=3, Wf=3, 42);
+  [W10_pt, b10_pt] = conv2d_init(256, 128, Hf=3, Wf=3, 42);
+  [W11_pt, b11_pt] = conv2d_init(256, 256, Hf=3, Wf=3, 42);
+  [W12_pt, b12_pt] = conv2d_init(256, 256, Hf=3, Wf=3, 42);
+  [W13_pt, b13_pt] = conv2d_init(256, 256, Hf=3, Wf=3, 42);
+  [W14_pt, b14_pt] = conv2d_init(512, 256, Hf=3, Wf=3, 42);
+  [W15_pt, b15_pt] = conv2d_init(512, 512, Hf=3, Wf=3, 42);
+  [W16_pt, b16_pt] = conv2d_init(512, 512, Hf=3, Wf=3, 42);
+  [W17_pt, b17_pt] = conv2d_init(512, 512, Hf=3, Wf=3, 42);
+  [W18_pt, b18_pt] = affine_init(512, K, 42);
+  W18_pt = W18_pt/sqrt(2);
+
+  # Initialize the weights for the non-transferred layers
+  [W1_init, b1_init] = conv2d_init(64, C, Hf=7, Wf=7, 43);
+  [W2_init, b2_init] = conv2d_init(64, 64, Hf=3, Wf=3, 43);
+  [W3_init, b3_init] = conv2d_init(64, 64, Hf=3, Wf=3, 43);
+  [W4_init, b4_init] = conv2d_init(64, 64, Hf=3, Wf=3, 43);
+  [W5_init, b5_init] = conv2d_init(64, 64, Hf=3, Wf=3, 43);
+  [W6_init, b6_init] = conv2d_init(128, 64, Hf=3, Wf=3, 43);
+  [W7_init, b7_init] = conv2d_init(128, 128, Hf=3, Wf=3, 43);
+  [W8_init, b8_init] = conv2d_init(128, 128, Hf=3, Wf=3, 43);
+  [W9_init, b9_init] = conv2d_init(128, 128, Hf=3, Wf=3, 43);
+  [W10_init, b10_init] = conv2d_init(256, 128, Hf=3, Wf=3, 42);
+  [W11_init, b11_init] = conv2d_init(256, 256, Hf=3, Wf=3, 42);
+  [W12_init, b12_init] = conv2d_init(256, 256, Hf=3, Wf=3, 42);
+  [W13_init, b13_init] = conv2d_init(256, 256, Hf=3, Wf=3, 42);
+  [W14_init, b14_init] = conv2d_init(512, 256, Hf=3, Wf=3, 42);
+  [W15_init, b15_init] = conv2d_init(512, 512, Hf=3, Wf=3, 42);
+  [W16_init, b16_init] = conv2d_init(512, 512, Hf=3, Wf=3, 42);
+  [W17_init, b17_init] = conv2d_init(512, 512, Hf=3, Wf=3, 42);
+  [W18_init, b18_init] = affine_init(512, K, 42);
+  W18_init = W18_init/sqrt(2);
+
+  # Compute prediction over mini-batches
+  N = nrow(X);
+  Y_pred = matrix(0, rows=N, cols=3);
+  batch_size = 64;
+  oneVec = matrix(1, rows=1, cols=K);
+  idxSeq = matrix(1, rows=batch_size, cols=1) %*% t(seq(1, K));
+  iters = ceil (N / batch_size);
+
+  for (i in 1:iters) {
+    # Get next batch
+    beg = ((i-1) * batch_size) %% N + 1;
+    end = min(N, beg+batch_size-1);
+    X_batch = X[beg:end,];
+
+    # Extract 3 layers
+    j = 1;
+    fel = 10; #extract 9, 8, 7, 6
+    while (j < 4) {
+      # Compute forward pass
+      # Layer1: conv2d 7x7 -> bn -> relu -> maxpool 3x3
+      lid = 1;
+      [Wl1, bl1] = getWeights(fel, lid, W1_pt, b1_pt, W1_init, b1_init);
+      [outc1, Houtc1, Woutc1] = conv2d_forward(X_batch,Wl1,bl1,C,Hin,Win,7,7,2,2,3,3);
+      outb1 = bn2d_forward(outc1,64,Houtc1,Woutc1,mu_bn,ep_bn);
+      outr1 = relu_forward(outb1);
+      [outp1, Houtp1, Woutp1] = max_pool2d_forward(outr1,64,Houtc1, Woutc1,3,3,2,2,1,1);
+
+      # Layer2: residual block1
+      lid = 2;
+      [Wc1, bc1] = getWeights(fel, lid, W2_pt, b2_pt, W2_init, b2_init);
+      [Wc2, bc2] = getWeights(fel, lid, W3_pt, b3_pt, W3_init, b3_init);
+      [outrb1, Houtrb1, Woutrb1] = basic_block(outp1,64,64,Houtp1,Woutp1,1,1,Wc1,bc1,Wc2,bc2);
+      print(nrow(outrb1)+" "+ncol(outrb1));
+
+      # Layer3: residual block2
+      lid = 3;
+      [Wc1, bc1] = getWeights(fel, lid, W4_pt, b4_pt, W4_init, b4_init);
+      [Wc2, bc2] = getWeights(fel, lid, W5_pt, b5_pt, W5_init, b5_init);
+      [outrb2, Houtrb2, Woutrb2] = basic_block(outrb1,64,64,Houtrb1,Woutrb1,1,1,Wc1,bc1,Wc2,bc2);
+      print(nrow(outrb2)+" "+ncol(outrb2));
+
+      # Layer4: residual block3
+      lid = 4;
+      [Wc1, bc1] = getWeights(fel, lid, W6_pt, b6_pt, W6_init, b6_init);
+      [Wc2, bc2] = getWeights(fel, lid, W7_pt, b7_pt, W7_init, b7_init);
+      [outrb3, Houtrb3, Woutrb3] = basic_block(outrb2,64,128,Houtrb2,Woutrb2,2,2,Wc1,bc1,Wc2,bc2);
+      print(nrow(outrb3)+" "+ncol(outrb3));
+
+      # Layer5: residual block4
+      lid = 5;
+      [Wc1, bc1] = getWeights(fel, lid, W8_pt, b8_pt, W8_init, b8_init);
+      [Wc2, bc2] = getWeights(fel, lid, W9_pt, b9_pt, W9_init, b9_init);
+      [outrb4, Houtrb4, Woutrb4] = basic_block(outrb3,128,128,Houtrb3,Woutrb3,1,1,Wc1,bc1,Wc2,bc2);
+      print(nrow(outrb4)+" "+ncol(outrb4));
+
+      # Layer6: residual block5
+      lid = 6;
+      [Wc1, bc1] = getWeights(fel, lid, W10_pt, b10_pt, W10_init, b10_init);
+      [Wc2, bc2] = getWeights(fel, lid, W11_pt, b11_pt, W11_init, b11_init);
+      [outrb5, Houtrb5, Woutrb5] = basic_block(outrb4,128,256,Houtrb4,Woutrb4,2,2,Wc1,bc1,Wc2,bc2);
+      print(nrow(outrb5)+" "+ncol(outrb5));
+
+      # Layer7: residual block6
+      lid = 7;
+      [Wc1, bc1] = getWeights(fel, lid, W12_pt, b12_pt, W12_init, b12_init);
+      [Wc2, bc2] = getWeights(fel, lid, W13_pt, b13_pt, W13_init, b13_init);
+      [outrb6, Houtrb6, Woutrb6] = basic_block(outrb5,256,256,Houtrb5,Woutrb5,1,1,Wc1,bc1,Wc2,bc2);
+      print(nrow(outrb6)+" "+ncol(outrb6));
+
+      # Layer8: residual block7
+      lid = 8;
+      [Wc1, bc1] = getWeights(fel, lid, W14_pt, b14_pt, W14_init, b14_init);
+      [Wc2, bc2] = getWeights(fel, lid, W15_pt, b15_pt, W15_init, b15_init);
+      [outrb7, Houtrb7, Woutrb7] = basic_block(outrb6,256,512,Houtrb6,Woutrb6,2,2,Wc1,bc1,Wc2,bc2);
+      print(nrow(outrb7)+" "+ncol(outrb7));
+
+      # Layer9: residual block8
+      lid = 9;
+      [Wc1, bc1] = getWeights(fel, lid, W16_pt, b16_pt, W16_init, b16_init);
+      [Wc2, bc2] = getWeights(fel, lid, W17_pt, b17_pt, W17_init, b17_init);
+      [outrb8, Houtrb8, Woutrb8] = basic_block(outrb7,512,512,Houtrb7,Woutrb7,1,1,Wc1,bc1,Wc2,bc2);
+      print(nrow(outrb8)+" "+ncol(outrb8));
+
+      # Global average pooling
+      [outap1, Houtap1, Houtap2] = avg_pool2d_forward(outrb8, 512, Houtrb8, Woutrb8);
+
+      # layer10 : Fully connected layer
+      lid = 10;
+      [Wl10, bl10] = getWeights(fel, lid, W18_pt, b18_pt, W18_init, b18_init);
+      outa1 = affine_forward(outap1, Wl10, bl10);
+      probs_batch = softmax_forward(outa1);
+
+      # Store the predictions
+      Y_pred[beg:end,j] = rwRowIndexMax(probs_batch, oneVec, idxSeq);
+      j = j + 1;
+      fel = fel - 1;
+    }
+  }
+
+}
+
+generate_dummy_data = function(int N, int C, int Hin, int Win, int K)
+  return (matrix[double] X, matrix[double] Y) {
+  X = rand(rows=N, cols=C*Hin*Win, pdf="normal", seed=45) #linearized images
+  classes = round(rand(rows=N, cols=1, min=1, max=K, pdf="uniform", seed=46))
+  Y = table(seq(1, N), classes, N, K)  #one-hot encoding
+}
+
+# Read training data and settings
+N = 64;    #num of images in the target dataset
+C = 3;       #num of color channels
+Hin = 224;   #input image height
+Win = 224;   #input image width
+K = 10;      #num of classes
+
+# Generate dummy data
+[X, Y] = generate_dummy_data(N, C, Hin, Win, K);
+
+# Load the CuDNN libraries by calling a conv2d
+print("Eagerly loading cuDNN library");
+[W1, b1] = conv2d_init(96, C, Hf=11, Wf=11, 42);
+[outc1, Houtc1, Woutc1] = conv2d_forward(X[1:8,], W1, b1, C, Hin, Win, 11, 11, 1, 1, 2, 2);
+print(sum(outc1));
+
+print("Starting exploratory feature transfers");
+t1 = time();
+Y_pred = resnet18_forward(X, C, Hin, Win, K);
+R = colSums(Y_pred)
+print(R);
+
+t2 = time();
+print("Elapsed time for feature transfers = "+floor((t2-t1)/1000000)+" millsec");
+
+write(R, $1)

From 48c1eeecb1386b214acf42437b610b087cb5657b Mon Sep 17 00:00:00 2001
From: ReneEnjilian <enjilianrene@gmail.com>
Date: Mon, 14 Jul 2025 03:34:51 +0200
Subject: [PATCH 16/26] delete files

---
 .uuid         | 1 -
 bin/.uuid     | 1 -
 conf/.uuid    | 1 -
 dev/.uuid     | 1 -
 docker/.uuid  | 1 -
 docs/.uuid    | 1 -
 scripts/.uuid | 1 -
 src/.uuid     | 1 -
 8 files changed, 8 deletions(-)
 delete mode 100644 .uuid
 delete mode 100644 bin/.uuid
 delete mode 100644 conf/.uuid
 delete mode 100644 dev/.uuid
 delete mode 100644 docker/.uuid
 delete mode 100644 docs/.uuid
 delete mode 100644 scripts/.uuid
 delete mode 100644 src/.uuid

diff --git a/.uuid b/.uuid
deleted file mode 100644
index c7461c96769..00000000000
--- a/.uuid
+++ /dev/null
@@ -1 +0,0 @@
-2b0ea405-adcb-44dd-b0e8-cbf7394f7736
\ No newline at end of file
diff --git a/bin/.uuid b/bin/.uuid
deleted file mode 100644
index 63fdd3cd025..00000000000
--- a/bin/.uuid
+++ /dev/null
@@ -1 +0,0 @@
-adc2d874-7518-4920-a601-2184c5762423
\ No newline at end of file
diff --git a/conf/.uuid b/conf/.uuid
deleted file mode 100644
index 56a41cb8941..00000000000
--- a/conf/.uuid
+++ /dev/null
@@ -1 +0,0 @@
-d0dfa9b1-cfe3-4f9e-97a6-64cbafefd004
\ No newline at end of file
diff --git a/dev/.uuid b/dev/.uuid
deleted file mode 100644
index 0473b866fa9..00000000000
--- a/dev/.uuid
+++ /dev/null
@@ -1 +0,0 @@
-3c726e72-1a65-4bbb-9732-07b261b9cbee
\ No newline at end of file
diff --git a/docker/.uuid b/docker/.uuid
deleted file mode 100644
index 20e4a0fdf5a..00000000000
--- a/docker/.uuid
+++ /dev/null
@@ -1 +0,0 @@
-7ee583bd-41f8-4c33-8222-964def6e9a08
\ No newline at end of file
diff --git a/docs/.uuid b/docs/.uuid
deleted file mode 100644
index 719e219a0a0..00000000000
--- a/docs/.uuid
+++ /dev/null
@@ -1 +0,0 @@
-16051ef9-1f88-42f0-839d-4add14b6844c
\ No newline at end of file
diff --git a/scripts/.uuid b/scripts/.uuid
deleted file mode 100644
index e3a2f189086..00000000000
--- a/scripts/.uuid
+++ /dev/null
@@ -1 +0,0 @@
-85419957-235e-4d35-a1cb-a16d7a558678
\ No newline at end of file
diff --git a/src/.uuid b/src/.uuid
deleted file mode 100644
index c8b39ff66b0..00000000000
--- a/src/.uuid
+++ /dev/null
@@ -1 +0,0 @@
-418bbd1a-de61-4d8a-83d6-01f804855c6b
\ No newline at end of file

From fb2e10dee50f2590ee19cc8dc2fc07221b2de7ff Mon Sep 17 00:00:00 2001
From: ReneEnjilian <enjilianrene@gmail.com>
Date: Wed, 16 Jul 2025 22:53:51 +0200
Subject: [PATCH 17/26] start implementing tests for cudaSupportFunctions

---
 .../CudaCublasGeamTest.java                   | 123 ++++++++++++++++++
 .../CudaCusparseCsrGemmTest.java              | 123 ++++++++++++++++++
 .../gpu/cudaSupportFunctions/CudaCublasGeam.R |  50 +++++++
 .../cudaSupportFunctions/CudaCublasGeam.dml   |  41 ++++++
 .../CudaCusparseCsrGemm.R                     |  50 +++++++
 .../CudaCusparseCsrGemm.dml                   |  42 ++++++
 6 files changed, 429 insertions(+)
 create mode 100644 src/test/java/org/apache/sysds/test/gpu/cudaSupportFunctions/CudaCublasGeamTest.java
 create mode 100644 src/test/java/org/apache/sysds/test/gpu/cudaSupportFunctions/CudaCusparseCsrGemmTest.java
 create mode 100644 src/test/scripts/gpu/cudaSupportFunctions/CudaCublasGeam.R
 create mode 100644 src/test/scripts/gpu/cudaSupportFunctions/CudaCublasGeam.dml
 create mode 100644 src/test/scripts/gpu/cudaSupportFunctions/CudaCusparseCsrGemm.R
 create mode 100644 src/test/scripts/gpu/cudaSupportFunctions/CudaCusparseCsrGemm.dml

diff --git a/src/test/java/org/apache/sysds/test/gpu/cudaSupportFunctions/CudaCublasGeamTest.java b/src/test/java/org/apache/sysds/test/gpu/cudaSupportFunctions/CudaCublasGeamTest.java
new file mode 100644
index 00000000000..49b6f0e8883
--- /dev/null
+++ b/src/test/java/org/apache/sysds/test/gpu/cudaSupportFunctions/CudaCublasGeamTest.java
@@ -0,0 +1,123 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.sysds.test.gpu.cudaSupportFunctions;
+
+import jcuda.CudaException;
+import jcuda.runtime.JCuda;
+import jcuda.runtime.cudaError;
+import org.apache.sysds.runtime.matrix.data.MatrixValue;
+import org.apache.sysds.test.AutomatedTestBase;
+import org.apache.sysds.test.TestConfiguration;
+import org.apache.sysds.test.TestUtils;
+import org.junit.Assert;
+import org.junit.Assume;
+import org.junit.BeforeClass;
+import org.junit.Test;
+
+import java.util.HashMap;
+
+public class CudaCublasGeamTest extends AutomatedTestBase {
+
+    private static final String TEST_NAME = "CudaCublasGeam";
+    private static final String TEST_DIR = "gpu/cudaSupportFunctions/";
+    private static final String TEST_CLASS_DIR = TEST_DIR + CudaCublasGeamTest.class.getSimpleName() + "/";
+
+    private static final int rows = 200;
+    private static final int cols = 200;
+
+    private static final double eps = Math.pow(10, -10);
+
+    @BeforeClass
+    public static void checkGPU() {
+        boolean gpuAvailable = false;
+        try {
+            // Ask JCuda to throw Java exceptions (much nicer than error codes)
+            JCuda.setExceptionsEnabled(true);
+
+            // How many devices does the runtime see?
+            int[] devCount = {0};
+            int status = JCuda.cudaGetDeviceCount(devCount);
+
+            gpuAvailable = (status == cudaError.cudaSuccess) && (devCount[0] > 0);
+        }
+        catch(UnsatisfiedLinkError | CudaException ex) {
+            // - native JCuda libs not on the class-path
+            // - or they were built for the wrong CUDA version
+            gpuAvailable = false;
+        }
+
+        Assume.assumeTrue("Skipping GPU test: no compatible CUDA device " + "or JCuda native libraries not available.",
+            gpuAvailable);
+    }
+
+    @Override
+    public void setUp() {
+        TestUtils.clearAssertionInformation();
+        addTestConfiguration(TEST_NAME, new TestConfiguration(TEST_CLASS_DIR, TEST_NAME, new String[] {"R"}));
+    }
+
+    @Test
+    public void testCublasGeamNoTranspose() {
+        testCublasGeam(1);
+    }
+
+    @Test
+    public void testCublasGeamLeftTranspose() {
+        testCublasGeam(2);
+    }
+
+    @Test
+    public void testCublasGeamRightTranspose() {
+        testCublasGeam(3);
+    }
+
+    @Test
+    public void testCublasGeamBothTranspose() {
+        testCublasGeam(4);
+    }
+
+    private void testCublasGeam(int ID) {
+
+        TestConfiguration config = getTestConfiguration(TEST_NAME);
+        loadTestConfiguration(config);
+
+        String HOME = SCRIPT_DIR + TEST_DIR;
+        fullDMLScriptName = HOME + TEST_NAME + ".dml";
+        programArgs = new String[] {"-stats", "-gpu", "-args", input("A"), input("B"), String.valueOf(ID), output("R")};
+        fullRScriptName = HOME + TEST_NAME + ".R";
+        rCmd = getRCmd(inputDir(), String.valueOf(ID), expectedDir());
+
+        // both matrices have to be dense
+        double[][] A = getRandomMatrix(rows, cols, -1, 1, 0.70d, 5);
+        double[][] B = getRandomMatrix(rows, cols, -1, 1, 0.60d, 3);
+        writeInputMatrixWithMTD("A", A, true);
+        writeInputMatrixWithMTD("B", B, true);
+
+        runTest(true, false, null, -1);
+        runRScript(true);
+
+        //compare matrices
+        HashMap<MatrixValue.CellIndex, Double> dmlfile = readDMLMatrixFromOutputDir("R");
+        HashMap<MatrixValue.CellIndex, Double> rfile = readRMatrixFromExpectedDir("R");
+        TestUtils.compareMatrices(dmlfile, rfile, eps, "Stat-DML", "Stat-R");
+
+        Assert.assertTrue(heavyHittersContainsString("gpu_+"));
+    }
+}
diff --git a/src/test/java/org/apache/sysds/test/gpu/cudaSupportFunctions/CudaCusparseCsrGemmTest.java b/src/test/java/org/apache/sysds/test/gpu/cudaSupportFunctions/CudaCusparseCsrGemmTest.java
new file mode 100644
index 00000000000..ff942b1ef08
--- /dev/null
+++ b/src/test/java/org/apache/sysds/test/gpu/cudaSupportFunctions/CudaCusparseCsrGemmTest.java
@@ -0,0 +1,123 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.sysds.test.gpu.cudaSupportFunctions;
+
+import jcuda.CudaException;
+import jcuda.runtime.JCuda;
+import jcuda.runtime.cudaError;
+import org.apache.sysds.runtime.matrix.data.MatrixValue;
+import org.apache.sysds.test.AutomatedTestBase;
+import org.apache.sysds.test.TestConfiguration;
+import org.apache.sysds.test.TestUtils;
+import org.junit.Assert;
+import org.junit.Assume;
+import org.junit.BeforeClass;
+import org.junit.Test;
+
+import java.util.HashMap;
+
+public class CudaCusparseCsrGemmTest extends AutomatedTestBase {
+
+    private static final String TEST_NAME = "CudaCusparseCsrGemm";
+    private static final String TEST_DIR = "gpu/cudaSupportFunctions/";
+    private static final String TEST_CLASS_DIR = TEST_DIR + CudaCusparseCsrGemmTest.class.getSimpleName() + "/";
+
+    private static final int rows = 200;
+    private static final int cols = 200;
+
+    private static final double eps = Math.pow(10, -10);
+
+    @BeforeClass
+    public static void checkGPU() {
+        boolean gpuAvailable = false;
+        try {
+            // Ask JCuda to throw Java exceptions (much nicer than error codes)
+            JCuda.setExceptionsEnabled(true);
+
+            // How many devices does the runtime see?
+            int[] devCount = {0};
+            int status = JCuda.cudaGetDeviceCount(devCount);
+
+            gpuAvailable = (status == cudaError.cudaSuccess) && (devCount[0] > 0);
+        } catch (UnsatisfiedLinkError | CudaException ex) {
+            // - native JCuda libs not on the class-path
+            // - or they were built for the wrong CUDA version
+            gpuAvailable = false;
+        }
+
+        Assume.assumeTrue("Skipping GPU test: no compatible CUDA device " + "or JCuda native libraries not available.",
+                gpuAvailable);
+    }
+
+    @Override
+    public void setUp() {
+        TestUtils.clearAssertionInformation();
+        addTestConfiguration(TEST_NAME, new TestConfiguration(TEST_CLASS_DIR, TEST_NAME, new String[]{"R"}));
+    }
+
+    @Test
+    public void testCusparseCsrGemmNoTranspose() {
+        testCusparseCsrGemm(1);
+    }
+
+    @Test
+    public void testCusparseCsrGemmLeftTranspose() {
+        testCusparseCsrGemm(2);
+    }
+
+    @Test
+    public void testCusparseCsrGemmRightTranspose() {
+        testCusparseCsrGemm(3);
+    }
+
+    @Test
+    public void testCusparseCsrGemmBothTranspose() {
+        testCusparseCsrGemm(4);
+    }
+
+
+    private void testCusparseCsrGemm(int ID) {
+
+        TestConfiguration config = getTestConfiguration(TEST_NAME);
+        loadTestConfiguration(config);
+
+        String HOME = SCRIPT_DIR + TEST_DIR;
+        fullDMLScriptName = HOME + TEST_NAME + ".dml";
+        programArgs = new String[]{"-stats", "-gpu", "-args", input("A"), input("B"), String.valueOf(ID), output("R")};
+        fullRScriptName = HOME + TEST_NAME + ".R";
+        rCmd = getRCmd(inputDir(), String.valueOf(ID), expectedDir());
+
+        // both matrices have to be sparse
+        double[][] A = getRandomMatrix(rows, cols, -1, 1, 0.30d, 5);
+        double[][] B = getRandomMatrix(rows, cols, -1, 1, 0.20d, 3);
+        writeInputMatrixWithMTD("A", A, true);
+        writeInputMatrixWithMTD("B", B, true);
+
+        runTest(true, false, null, -1);
+        runRScript(true);
+
+        //compare matrices
+        HashMap<MatrixValue.CellIndex, Double> dmlfile = readDMLMatrixFromOutputDir("R");
+        HashMap<MatrixValue.CellIndex, Double> rfile = readRMatrixFromExpectedDir("R");
+        TestUtils.compareMatrices(dmlfile, rfile, eps, "Stat-DML", "Stat-R");
+
+        Assert.assertTrue(heavyHittersContainsString("gpu_ba+*"));
+    }
+}
diff --git a/src/test/scripts/gpu/cudaSupportFunctions/CudaCublasGeam.R b/src/test/scripts/gpu/cudaSupportFunctions/CudaCublasGeam.R
new file mode 100644
index 00000000000..39abbe4fe88
--- /dev/null
+++ b/src/test/scripts/gpu/cudaSupportFunctions/CudaCublasGeam.R
@@ -0,0 +1,50 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+
+args <- commandArgs(TRUE)
+
+# Set options for numeric precision
+options(digits=22)
+
+# Load required libraries
+library("Matrix")
+library("matrixStats")
+
+# Read matrices and operation type
+A = as.matrix(readMM(paste(args[1], "A.mtx", sep="")))
+B = as.matrix(readMM(paste(args[1], "B.mtx", sep="")))
+type = as.integer(args[2])
+
+
+# Perform operations
+if(type==1){
+    R = A + B
+} else if(type==2) {
+    R = t(A) + B
+} else if(type==3) {
+    R = A + t(B)
+} else if(type==4){
+    R = t(A) + t(B)
+}
+
+#Write result matrix R
+writeMM(as(R, "CsparseMatrix"), paste(args[3], "R", sep=""));
diff --git a/src/test/scripts/gpu/cudaSupportFunctions/CudaCublasGeam.dml b/src/test/scripts/gpu/cudaSupportFunctions/CudaCublasGeam.dml
new file mode 100644
index 00000000000..25fe0fb3651
--- /dev/null
+++ b/src/test/scripts/gpu/cudaSupportFunctions/CudaCublasGeam.dml
@@ -0,0 +1,41 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+# Read matrices A, B, and operation type
+A = read($1)
+B = read($2)
+type = $3
+
+# Perform operations
+if(type==1){
+    R = A + B
+}
+else if(type==2) {
+    R = t(A) + B
+}
+else if(type==3) {
+    R = A + t(B)
+}
+else if(type==4) {
+    R = t(A) + t(B)
+}
+# Write the result matrix R
+write(R, $4)
diff --git a/src/test/scripts/gpu/cudaSupportFunctions/CudaCusparseCsrGemm.R b/src/test/scripts/gpu/cudaSupportFunctions/CudaCusparseCsrGemm.R
new file mode 100644
index 00000000000..eb3780297c1
--- /dev/null
+++ b/src/test/scripts/gpu/cudaSupportFunctions/CudaCusparseCsrGemm.R
@@ -0,0 +1,50 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+
+args <- commandArgs(TRUE)
+
+# Set options for numeric precision
+options(digits=22)
+
+# Load required libraries
+library("Matrix")
+library("matrixStats")
+
+# Read matrices and operation type
+A = as.matrix(readMM(paste(args[1], "A.mtx", sep="")))
+B = as.matrix(readMM(paste(args[1], "B.mtx", sep="")))
+type = as.integer(args[2])
+
+
+# Perform operations
+if(type==1){
+    R = A %*% B
+} else if(type==2) {
+    R = t(A) %*% B
+} else if(type==3) {
+    R = A %*% t(B)
+} else if(type==4){
+    R = t(A) %*% t(B)
+}
+
+#Write result matrix R
+writeMM(as(R, "CsparseMatrix"), paste(args[3], "R", sep=""));
diff --git a/src/test/scripts/gpu/cudaSupportFunctions/CudaCusparseCsrGemm.dml b/src/test/scripts/gpu/cudaSupportFunctions/CudaCusparseCsrGemm.dml
new file mode 100644
index 00000000000..4eb0c99b7ff
--- /dev/null
+++ b/src/test/scripts/gpu/cudaSupportFunctions/CudaCusparseCsrGemm.dml
@@ -0,0 +1,42 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+
+# Read matrices A, B, and operation type
+A = read($1)
+B = read($2)
+type = $3
+
+# Perform operations
+if(type==1){
+    R = A %*% B
+}
+else if(type==2) {
+    R = t(A) %*% B
+}
+else if(type==3) {
+    R = A %*% t(B)
+}
+else if(type==4) {
+    R = t(A) %*% t(B)
+}
+# Write the result matrix R
+write(R, $4)

From 271f008aaf29f44062f109079117dfcb011c07a8 Mon Sep 17 00:00:00 2001
From: ReneEnjilian <enjilianrene@gmail.com>
Date: Wed, 16 Jul 2025 23:13:03 +0200
Subject: [PATCH 18/26] add checkGPU() to TestUtils to run before class

---
 .../java/org/apache/sysds/test/TestUtils.java | 32 +++++++++++++++++--
 .../CudaCublasGeamTest.java                   | 26 ++-------------
 .../CudaCusparseCsrGemmTest.java              | 25 ++-------------
 3 files changed, 33 insertions(+), 50 deletions(-)

diff --git a/src/test/java/org/apache/sysds/test/TestUtils.java b/src/test/java/org/apache/sysds/test/TestUtils.java
index 195e36d6065..64403ff2e03 100644
--- a/src/test/java/org/apache/sysds/test/TestUtils.java
+++ b/src/test/java/org/apache/sysds/test/TestUtils.java
@@ -52,6 +52,9 @@
 import java.util.StringTokenizer;
 import java.util.concurrent.TimeUnit;
 
+import jcuda.CudaException;
+import jcuda.runtime.JCuda;
+import jcuda.runtime.cudaError;
 import org.apache.commons.io.FileUtils;
 import org.apache.commons.io.IOUtils;
 import org.apache.commons.lang3.NotImplementedException;
@@ -95,6 +98,7 @@
 import org.apache.sysds.runtime.util.DataConverter;
 import org.apache.sysds.runtime.util.UtilFunctions;
 import org.junit.Assert;
+import org.junit.Assume;
 
 /**
  * <p>
@@ -3917,7 +3921,7 @@ public static boolean containsNan(double[][] data, int col) {
 				return true;
 		return false;
 	}
-	
+
 	public static int isGPUAvailable() {
 		// returns cudaSuccess if at least one gpu is available
 		//final int[] deviceCount = new int[1];
@@ -3926,10 +3930,32 @@ public static int isGPUAvailable() {
 		return 1; //return false for now
 	}
 
-	public static MatrixBlock mockNonContiguousMatrix(MatrixBlock db){
+	public static void checkGPU() {
+		boolean gpuAvailable = false;
+		try {
+			// Ask JCuda to throw Java exceptions (much nicer than error codes)
+			JCuda.setExceptionsEnabled(true);
+
+			// How many devices does the runtime see?
+			int[] devCount = {0};
+			int status = JCuda.cudaGetDeviceCount(devCount);
+
+			gpuAvailable = (status == cudaError.cudaSuccess) && (devCount[0] > 0);
+		}
+		catch(UnsatisfiedLinkError | CudaException ex) {
+			// - native JCuda libs not on the class-path
+			// - or they were built for the wrong CUDA version
+			gpuAvailable = false;
+		}
+
+		Assume.assumeTrue("Skipping GPU test: no compatible CUDA device " + "or JCuda native libraries not available.",
+			gpuAvailable);
+	}
+
+	public static MatrixBlock mockNonContiguousMatrix(MatrixBlock db) {
 		db.sparseToDense();
 		double[] vals = db.getDenseBlockValues();
-		int[] dims = new int[]{db.getNumRows(), db.getNumColumns()};
+		int[] dims = new int[] {db.getNumRows(), db.getNumColumns()};
 		MatrixBlock m = new MatrixBlock(db.getNumRows(), db.getNumColumns(), new DenseBlockFP64Mock(dims, vals));
 		m.setNonZeros(db.getNonZeros());
 		return m;
diff --git a/src/test/java/org/apache/sysds/test/gpu/cudaSupportFunctions/CudaCublasGeamTest.java b/src/test/java/org/apache/sysds/test/gpu/cudaSupportFunctions/CudaCublasGeamTest.java
index 49b6f0e8883..8bc54e27ece 100644
--- a/src/test/java/org/apache/sysds/test/gpu/cudaSupportFunctions/CudaCublasGeamTest.java
+++ b/src/test/java/org/apache/sysds/test/gpu/cudaSupportFunctions/CudaCublasGeamTest.java
@@ -19,15 +19,11 @@
 
 package org.apache.sysds.test.gpu.cudaSupportFunctions;
 
-import jcuda.CudaException;
-import jcuda.runtime.JCuda;
-import jcuda.runtime.cudaError;
 import org.apache.sysds.runtime.matrix.data.MatrixValue;
 import org.apache.sysds.test.AutomatedTestBase;
 import org.apache.sysds.test.TestConfiguration;
 import org.apache.sysds.test.TestUtils;
 import org.junit.Assert;
-import org.junit.Assume;
 import org.junit.BeforeClass;
 import org.junit.Test;
 
@@ -45,26 +41,8 @@ public class CudaCublasGeamTest extends AutomatedTestBase {
     private static final double eps = Math.pow(10, -10);
 
     @BeforeClass
-    public static void checkGPU() {
-        boolean gpuAvailable = false;
-        try {
-            // Ask JCuda to throw Java exceptions (much nicer than error codes)
-            JCuda.setExceptionsEnabled(true);
-
-            // How many devices does the runtime see?
-            int[] devCount = {0};
-            int status = JCuda.cudaGetDeviceCount(devCount);
-
-            gpuAvailable = (status == cudaError.cudaSuccess) && (devCount[0] > 0);
-        }
-        catch(UnsatisfiedLinkError | CudaException ex) {
-            // - native JCuda libs not on the class-path
-            // - or they were built for the wrong CUDA version
-            gpuAvailable = false;
-        }
-
-        Assume.assumeTrue("Skipping GPU test: no compatible CUDA device " + "or JCuda native libraries not available.",
-            gpuAvailable);
+    public static void ensureGPU() {
+        TestUtils.checkGPU();
     }
 
     @Override
diff --git a/src/test/java/org/apache/sysds/test/gpu/cudaSupportFunctions/CudaCusparseCsrGemmTest.java b/src/test/java/org/apache/sysds/test/gpu/cudaSupportFunctions/CudaCusparseCsrGemmTest.java
index ff942b1ef08..30833995334 100644
--- a/src/test/java/org/apache/sysds/test/gpu/cudaSupportFunctions/CudaCusparseCsrGemmTest.java
+++ b/src/test/java/org/apache/sysds/test/gpu/cudaSupportFunctions/CudaCusparseCsrGemmTest.java
@@ -19,15 +19,11 @@
 
 package org.apache.sysds.test.gpu.cudaSupportFunctions;
 
-import jcuda.CudaException;
-import jcuda.runtime.JCuda;
-import jcuda.runtime.cudaError;
 import org.apache.sysds.runtime.matrix.data.MatrixValue;
 import org.apache.sysds.test.AutomatedTestBase;
 import org.apache.sysds.test.TestConfiguration;
 import org.apache.sysds.test.TestUtils;
 import org.junit.Assert;
-import org.junit.Assume;
 import org.junit.BeforeClass;
 import org.junit.Test;
 
@@ -45,25 +41,8 @@ public class CudaCusparseCsrGemmTest extends AutomatedTestBase {
     private static final double eps = Math.pow(10, -10);
 
     @BeforeClass
-    public static void checkGPU() {
-        boolean gpuAvailable = false;
-        try {
-            // Ask JCuda to throw Java exceptions (much nicer than error codes)
-            JCuda.setExceptionsEnabled(true);
-
-            // How many devices does the runtime see?
-            int[] devCount = {0};
-            int status = JCuda.cudaGetDeviceCount(devCount);
-
-            gpuAvailable = (status == cudaError.cudaSuccess) && (devCount[0] > 0);
-        } catch (UnsatisfiedLinkError | CudaException ex) {
-            // - native JCuda libs not on the class-path
-            // - or they were built for the wrong CUDA version
-            gpuAvailable = false;
-        }
-
-        Assume.assumeTrue("Skipping GPU test: no compatible CUDA device " + "or JCuda native libraries not available.",
-                gpuAvailable);
+    public static void ensureGPU() {
+        TestUtils.checkGPU();
     }
 
     @Override

From f793c7437627878155a36e33c936e4c8d9c52877 Mon Sep 17 00:00:00 2001
From: ReneEnjilian <enjilianrene@gmail.com>
Date: Thu, 17 Jul 2025 00:09:10 +0200
Subject: [PATCH 19/26] add test for matrix-vector multiply for GPU

---
 .../CudaCusparseCsrMVTest.java                | 91 +++++++++++++++++++
 .../cudaSupportFunctions/CudaCusparseCsrMV.R  | 46 ++++++++++
 .../CudaCusparseCsrMV.dml                     | 36 ++++++++
 3 files changed, 173 insertions(+)
 create mode 100644 src/test/java/org/apache/sysds/test/gpu/cudaSupportFunctions/CudaCusparseCsrMVTest.java
 create mode 100644 src/test/scripts/gpu/cudaSupportFunctions/CudaCusparseCsrMV.R
 create mode 100644 src/test/scripts/gpu/cudaSupportFunctions/CudaCusparseCsrMV.dml

diff --git a/src/test/java/org/apache/sysds/test/gpu/cudaSupportFunctions/CudaCusparseCsrMVTest.java b/src/test/java/org/apache/sysds/test/gpu/cudaSupportFunctions/CudaCusparseCsrMVTest.java
new file mode 100644
index 00000000000..718a1090e45
--- /dev/null
+++ b/src/test/java/org/apache/sysds/test/gpu/cudaSupportFunctions/CudaCusparseCsrMVTest.java
@@ -0,0 +1,91 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.sysds.test.gpu.cudaSupportFunctions;
+
+import org.apache.sysds.runtime.matrix.data.MatrixValue;
+import org.apache.sysds.test.AutomatedTestBase;
+import org.apache.sysds.test.TestConfiguration;
+import org.apache.sysds.test.TestUtils;
+import org.junit.Assert;
+import org.junit.BeforeClass;
+import org.junit.Test;
+
+import java.util.HashMap;
+
+public class CudaCusparseCsrMVTest extends AutomatedTestBase {
+
+	private static final String TEST_NAME = "CudaCusparseCsrMV";
+	private static final String TEST_DIR = "gpu/cudaSupportFunctions/";
+	private static final String TEST_CLASS_DIR = TEST_DIR + CudaCusparseCsrMVTest.class.getSimpleName() + "/";
+
+	private static final int rows = 200;
+	private static final int cols = 200;
+	private static final double eps = Math.pow(10, -10);
+
+	@BeforeClass
+	public static void ensureGPU() {
+		TestUtils.checkGPU();
+	}
+
+	@Override
+	public void setUp() {
+		TestUtils.clearAssertionInformation();
+		addTestConfiguration(TEST_NAME, new TestConfiguration(TEST_CLASS_DIR, TEST_NAME, new String[] {"R"}));
+	}
+
+	@Test
+	public void testCusparseCsrMVNoTranspose() {
+		testCusparseCsrMV(1);
+	}
+
+	@Test
+	public void testCusparseCsrMVLeftTranspose() {
+		testCusparseCsrMV(2);
+	}
+
+	private void testCusparseCsrMV(int ID) {
+
+		TestConfiguration config = getTestConfiguration(TEST_NAME);
+		loadTestConfiguration(config);
+
+		String HOME = SCRIPT_DIR + TEST_DIR;
+		fullDMLScriptName = HOME + TEST_NAME + ".dml";
+		programArgs = new String[] {"-stats", "-gpu", "-args", input("A"), input("X"), String.valueOf(ID), output("R")};
+		fullRScriptName = HOME + TEST_NAME + ".R";
+		rCmd = getRCmd(inputDir(), String.valueOf(ID), expectedDir());
+
+		// A is sparse matrix, X dense vector
+		double[][] A = getRandomMatrix(rows, cols, -1, 1, 0.20d, 5);
+		double[][] X = getRandomMatrix(rows, 1, -1, 1, 0.80d, 3);
+		writeInputMatrixWithMTD("A", A, true);
+		writeInputMatrixWithMTD("X", X, true);
+
+		runTest(true, false, null, -1);
+		runRScript(true);
+
+		//compare matrices
+		HashMap<MatrixValue.CellIndex, Double> dmlfile = readDMLMatrixFromOutputDir("R");
+		HashMap<MatrixValue.CellIndex, Double> rfile = readRMatrixFromExpectedDir("R");
+		TestUtils.compareMatrices(dmlfile, rfile, eps, "Stat-DML", "Stat-R");
+
+		Assert.assertTrue(heavyHittersContainsString("gpu_ba+*"));
+
+	}
+}
diff --git a/src/test/scripts/gpu/cudaSupportFunctions/CudaCusparseCsrMV.R b/src/test/scripts/gpu/cudaSupportFunctions/CudaCusparseCsrMV.R
new file mode 100644
index 00000000000..321da586172
--- /dev/null
+++ b/src/test/scripts/gpu/cudaSupportFunctions/CudaCusparseCsrMV.R
@@ -0,0 +1,46 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+
+args <- commandArgs(TRUE)
+
+# Set options for numeric precision
+options(digits=22)
+
+# Load required libraries
+library("Matrix")
+library("matrixStats")
+
+# Read matrices and operation type
+A = as.matrix(readMM(paste(args[1], "A.mtx", sep="")))
+X = as.matrix(readMM(paste(args[1], "X.mtx", sep="")))
+type = as.integer(args[2])
+
+
+# Perform operations
+if(type==1){
+    R = A %*% X
+} else if(type==2) {
+    R = t(A) %*% X
+}
+
+#Write result matrix R
+writeMM(as(R, "CsparseMatrix"), paste(args[3], "R", sep=""));
diff --git a/src/test/scripts/gpu/cudaSupportFunctions/CudaCusparseCsrMV.dml b/src/test/scripts/gpu/cudaSupportFunctions/CudaCusparseCsrMV.dml
new file mode 100644
index 00000000000..be444fb7e7b
--- /dev/null
+++ b/src/test/scripts/gpu/cudaSupportFunctions/CudaCusparseCsrMV.dml
@@ -0,0 +1,36 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+# Read matrix A, vector X, and operation type
+A = read($1)
+X = read($2)
+type = $3
+
+# Perform operations
+if(type==1){
+    R = A %*% X
+}
+else if(type==2) {
+    R = t(A) %*% X
+}
+
+# Write the result matrix R
+write(R, $4)

From a9d07eb995dbe40e164cff3f75367b1949249396 Mon Sep 17 00:00:00 2001
From: ReneEnjilian <enjilianrene@gmail.com>
Date: Thu, 17 Jul 2025 00:47:32 +0200
Subject: [PATCH 20/26] add test for cusparsecsrmm2

---
 .../CudaCusparseCsrMMTest.java                | 101 ++++++++++++++++++
 .../cudaSupportFunctions/CudaCusparseCsrMM.R  |  50 +++++++++
 .../CudaCusparseCsrMM.dml                     |  41 +++++++
 3 files changed, 192 insertions(+)
 create mode 100644 src/test/java/org/apache/sysds/test/gpu/cudaSupportFunctions/CudaCusparseCsrMMTest.java
 create mode 100644 src/test/scripts/gpu/cudaSupportFunctions/CudaCusparseCsrMM.R
 create mode 100644 src/test/scripts/gpu/cudaSupportFunctions/CudaCusparseCsrMM.dml

diff --git a/src/test/java/org/apache/sysds/test/gpu/cudaSupportFunctions/CudaCusparseCsrMMTest.java b/src/test/java/org/apache/sysds/test/gpu/cudaSupportFunctions/CudaCusparseCsrMMTest.java
new file mode 100644
index 00000000000..f6c21ee2522
--- /dev/null
+++ b/src/test/java/org/apache/sysds/test/gpu/cudaSupportFunctions/CudaCusparseCsrMMTest.java
@@ -0,0 +1,101 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.sysds.test.gpu.cudaSupportFunctions;
+
+import org.apache.sysds.runtime.matrix.data.MatrixValue;
+import org.apache.sysds.test.AutomatedTestBase;
+import org.apache.sysds.test.TestConfiguration;
+import org.apache.sysds.test.TestUtils;
+import org.junit.Assert;
+import org.junit.BeforeClass;
+import org.junit.Test;
+
+import java.util.HashMap;
+
+public class CudaCusparseCsrMMTest extends AutomatedTestBase {
+
+	private static final String TEST_NAME = "CudaCusparseCsrMM";
+	private static final String TEST_DIR = "gpu/cudaSupportFunctions/";
+	private static final String TEST_CLASS_DIR = TEST_DIR + CudaCusparseCsrMMTest.class.getSimpleName() + "/";
+
+	private static final int rows = 200;
+	private static final int cols = 200;
+	private static final double eps = Math.pow(10, -10);
+
+	@BeforeClass
+	public static void ensureGPU() {
+		TestUtils.checkGPU();
+	}
+
+	@Override
+	public void setUp() {
+		TestUtils.clearAssertionInformation();
+		addTestConfiguration(TEST_NAME, new TestConfiguration(TEST_CLASS_DIR, TEST_NAME, new String[] {"R"}));
+	}
+
+	@Test
+	public void testCusparseCsrMMNoTranspose() {
+		testCusparseCsrMM(1);
+	}
+
+	@Test
+	public void testCusparseCsrMMLeftTranspose() {
+		testCusparseCsrMM(2);
+	}
+
+	@Test
+	public void testCusparseCsrMMRightTranspose() {
+		testCusparseCsrMM(3);
+	}
+
+	@Test
+	public void testCusparseCsrMMBothTranspose() {
+		testCusparseCsrMM(4);
+	}
+
+	private void testCusparseCsrMM(int ID) {
+
+		TestConfiguration config = getTestConfiguration(TEST_NAME);
+		loadTestConfiguration(config);
+
+		String HOME = SCRIPT_DIR + TEST_DIR;
+		fullDMLScriptName = HOME + TEST_NAME + ".dml";
+		programArgs = new String[] {"-stats", "-gpu", "-args", input("A"), input("B"), String.valueOf(ID), output("R")};
+		fullRScriptName = HOME + TEST_NAME + ".R";
+		rCmd = getRCmd(inputDir(), String.valueOf(ID), expectedDir());
+
+		// A is sparse matrix, B a dense matrix
+		double[][] A = getRandomMatrix(rows, cols, -1, 1, 0.20d, 5);
+		double[][] B = getRandomMatrix(rows, cols, -1, 1, 0.80d, 3);
+		writeInputMatrixWithMTD("A", A, true);
+		writeInputMatrixWithMTD("B", B, true);
+
+		runTest(true, false, null, -1);
+		runRScript(true);
+
+		//compare matrices
+		HashMap<MatrixValue.CellIndex, Double> dmlfile = readDMLMatrixFromOutputDir("R");
+		HashMap<MatrixValue.CellIndex, Double> rfile = readRMatrixFromExpectedDir("R");
+		TestUtils.compareMatrices(dmlfile, rfile, eps, "Stat-DML", "Stat-R");
+
+		Assert.assertTrue(heavyHittersContainsString("gpu_ba+*"));
+
+	}
+}
diff --git a/src/test/scripts/gpu/cudaSupportFunctions/CudaCusparseCsrMM.R b/src/test/scripts/gpu/cudaSupportFunctions/CudaCusparseCsrMM.R
new file mode 100644
index 00000000000..329ebd59f9c
--- /dev/null
+++ b/src/test/scripts/gpu/cudaSupportFunctions/CudaCusparseCsrMM.R
@@ -0,0 +1,50 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+
+args <- commandArgs(TRUE)
+
+# Set options for numeric precision
+options(digits=22)
+
+# Load required libraries
+library("Matrix")
+library("matrixStats")
+
+# Read matrices and operation type
+A = as.matrix(readMM(paste(args[1], "A.mtx", sep="")))
+B = as.matrix(readMM(paste(args[1], "B.mtx", sep="")))
+type = as.integer(args[2])
+
+
+# Perform operations
+if(type==1){
+    R = A %*% B
+} else if(type==2) {
+    R = t(A) %*% B
+} else if(type==3) {
+     R = A %*% t(B)
+} else if(type==4) {
+     R = t(A) %*% t(B)
+}
+
+#Write result matrix R
+writeMM(as(R, "CsparseMatrix"), paste(args[3], "R", sep=""));
diff --git a/src/test/scripts/gpu/cudaSupportFunctions/CudaCusparseCsrMM.dml b/src/test/scripts/gpu/cudaSupportFunctions/CudaCusparseCsrMM.dml
new file mode 100644
index 00000000000..12e144f48ca
--- /dev/null
+++ b/src/test/scripts/gpu/cudaSupportFunctions/CudaCusparseCsrMM.dml
@@ -0,0 +1,41 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+# Read matrices, and operation type
+A = read($1)
+B = read($2)
+type = $3
+
+# Perform operations
+if(type==1){
+    R = A %*% B
+}
+else if(type==2) {
+    R = t(A) %*% B
+}
+else if(type==3) {
+    R = A %*% t(B)
+}
+else if(type==4) {
+    R = t(A) %*% t(B)
+}
+# Write the result matrix R
+write(R, $4)

From 735cb7507dbd077d5c15760dc9a956f0073ee907 Mon Sep 17 00:00:00 2001
From: ReneEnjilian <enjilianrene@gmail.com>
Date: Thu, 17 Jul 2025 19:41:54 +0200
Subject: [PATCH 21/26] add testing for cublasDot

---
 .../CudaCublasDotTest.java                    | 85 +++++++++++++++++++
 .../gpu/cudaSupportFunctions/CudaCublasDot.R  | 41 +++++++++
 .../cudaSupportFunctions/CudaCublasDot.dml    | 30 +++++++
 3 files changed, 156 insertions(+)
 create mode 100644 src/test/java/org/apache/sysds/test/gpu/cudaSupportFunctions/CudaCublasDotTest.java
 create mode 100644 src/test/scripts/gpu/cudaSupportFunctions/CudaCublasDot.R
 create mode 100644 src/test/scripts/gpu/cudaSupportFunctions/CudaCublasDot.dml

diff --git a/src/test/java/org/apache/sysds/test/gpu/cudaSupportFunctions/CudaCublasDotTest.java b/src/test/java/org/apache/sysds/test/gpu/cudaSupportFunctions/CudaCublasDotTest.java
new file mode 100644
index 00000000000..fbd4361c51f
--- /dev/null
+++ b/src/test/java/org/apache/sysds/test/gpu/cudaSupportFunctions/CudaCublasDotTest.java
@@ -0,0 +1,85 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.sysds.test.gpu.cudaSupportFunctions;
+
+import org.apache.sysds.runtime.matrix.data.MatrixValue;
+import org.apache.sysds.test.AutomatedTestBase;
+import org.apache.sysds.test.TestConfiguration;
+import org.apache.sysds.test.TestUtils;
+import org.junit.Assert;
+import org.junit.BeforeClass;
+import org.junit.Test;
+
+import java.util.HashMap;
+
+public class CudaCublasDotTest extends AutomatedTestBase {
+
+	private static final String TEST_NAME = "CudaCublasDot";
+	private static final String TEST_DIR = "gpu/cudaSupportFunctions/";
+	private static final String TEST_CLASS_DIR = TEST_DIR + CudaCublasDotTest.class.getSimpleName() + "/";
+
+	private static final int rows = 200;
+	private static final int cols = 200;
+	private static final double eps = Math.pow(10, -10);
+
+	@BeforeClass
+	public static void ensureGPU() {
+		TestUtils.checkGPU();
+	}
+
+	@Override
+	public void setUp() {
+		TestUtils.clearAssertionInformation();
+		addTestConfiguration(TEST_NAME, new TestConfiguration(TEST_CLASS_DIR, TEST_NAME, new String[] {"R"}));
+	}
+
+	@Test
+	public void testCublasDot() {
+		testCudaCublasDot();
+	}
+
+	private void testCudaCublasDot() {
+		TestConfiguration config = getTestConfiguration(TEST_NAME);
+		loadTestConfiguration(config);
+
+		String HOME = SCRIPT_DIR + TEST_DIR;
+		fullDMLScriptName = HOME + TEST_NAME + ".dml";
+		programArgs = new String[] {"-stats", "-gpu", "-args", input("A"), input("B"), output("R")};
+		fullRScriptName = HOME + TEST_NAME + ".R";
+		rCmd = getRCmd(inputDir(), expectedDir());
+
+		// A is sparse matrix, B a dense matrix
+		double[][] A = getRandomMatrix(1, cols, -1, 1, 0.70d, 5);
+		double[][] B = getRandomMatrix(rows, 1, -1, 1, 0.80d, 3);
+		writeInputMatrixWithMTD("A", A, true);
+		writeInputMatrixWithMTD("B", B, true);
+
+		runTest(true, false, null, -1);
+		runRScript(true);
+
+		//compare matrices
+		HashMap<MatrixValue.CellIndex, Double> dmlfile = readDMLMatrixFromOutputDir("R");
+		HashMap<MatrixValue.CellIndex, Double> rfile = readRMatrixFromExpectedDir("R");
+		TestUtils.compareMatrices(dmlfile, rfile, eps, "Stat-DML", "Stat-R");
+
+		Assert.assertTrue(heavyHittersContainsString("gpu_ba+*"));
+
+	}
+}
diff --git a/src/test/scripts/gpu/cudaSupportFunctions/CudaCublasDot.R b/src/test/scripts/gpu/cudaSupportFunctions/CudaCublasDot.R
new file mode 100644
index 00000000000..1344ca30d09
--- /dev/null
+++ b/src/test/scripts/gpu/cudaSupportFunctions/CudaCublasDot.R
@@ -0,0 +1,41 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+
+args <- commandArgs(TRUE)
+
+# Set options for numeric precision
+options(digits=22)
+
+# Load required libraries
+library("Matrix")
+library("matrixStats")
+
+# Read matrices and operation type
+A = as.matrix(readMM(paste(args[1], "A.mtx", sep="")))
+B = as.matrix(readMM(paste(args[1], "B.mtx", sep="")))
+
+
+
+# Perform operations
+R=A%*%B
+#Write result matrix R
+writeMM(as(R, "CsparseMatrix"), paste(args[2], "R", sep=""));
diff --git a/src/test/scripts/gpu/cudaSupportFunctions/CudaCublasDot.dml b/src/test/scripts/gpu/cudaSupportFunctions/CudaCublasDot.dml
new file mode 100644
index 00000000000..a6d6a886969
--- /dev/null
+++ b/src/test/scripts/gpu/cudaSupportFunctions/CudaCublasDot.dml
@@ -0,0 +1,30 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+# Read matrices, and operation type
+A = read($1)
+B = read($2)
+
+
+# Perform operations
+R = A%*%B
+# Write the result matrix R
+write(R, $3)

From 1a48926dfdd7ee085211f145de3d05e445f62ad4 Mon Sep 17 00:00:00 2001
From: ReneEnjilian <enjilianrene@gmail.com>
Date: Thu, 17 Jul 2025 19:55:55 +0200
Subject: [PATCH 22/26] add test for cublasgemv

---
 .../CudaCublasDotTest.java                    |  2 +-
 .../CudaCublasGeMVTest.java                   | 86 +++++++++++++++++++
 .../gpu/cudaSupportFunctions/CudaCublasGeMV.R | 40 +++++++++
 .../cudaSupportFunctions/CudaCublasGeMV.dml   | 30 +++++++
 4 files changed, 157 insertions(+), 1 deletion(-)
 create mode 100644 src/test/java/org/apache/sysds/test/gpu/cudaSupportFunctions/CudaCublasGeMVTest.java
 create mode 100644 src/test/scripts/gpu/cudaSupportFunctions/CudaCublasGeMV.R
 create mode 100644 src/test/scripts/gpu/cudaSupportFunctions/CudaCublasGeMV.dml

diff --git a/src/test/java/org/apache/sysds/test/gpu/cudaSupportFunctions/CudaCublasDotTest.java b/src/test/java/org/apache/sysds/test/gpu/cudaSupportFunctions/CudaCublasDotTest.java
index fbd4361c51f..fd6815ad615 100644
--- a/src/test/java/org/apache/sysds/test/gpu/cudaSupportFunctions/CudaCublasDotTest.java
+++ b/src/test/java/org/apache/sysds/test/gpu/cudaSupportFunctions/CudaCublasDotTest.java
@@ -65,7 +65,7 @@ private void testCudaCublasDot() {
 		fullRScriptName = HOME + TEST_NAME + ".R";
 		rCmd = getRCmd(inputDir(), expectedDir());
 
-		// A is sparse matrix, B a dense matrix
+		// A is dense vector, B is a dense vector
 		double[][] A = getRandomMatrix(1, cols, -1, 1, 0.70d, 5);
 		double[][] B = getRandomMatrix(rows, 1, -1, 1, 0.80d, 3);
 		writeInputMatrixWithMTD("A", A, true);
diff --git a/src/test/java/org/apache/sysds/test/gpu/cudaSupportFunctions/CudaCublasGeMVTest.java b/src/test/java/org/apache/sysds/test/gpu/cudaSupportFunctions/CudaCublasGeMVTest.java
new file mode 100644
index 00000000000..f8d861f5543
--- /dev/null
+++ b/src/test/java/org/apache/sysds/test/gpu/cudaSupportFunctions/CudaCublasGeMVTest.java
@@ -0,0 +1,86 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.sysds.test.gpu.cudaSupportFunctions;
+
+import org.apache.sysds.runtime.matrix.data.MatrixValue;
+import org.apache.sysds.test.AutomatedTestBase;
+import org.apache.sysds.test.TestConfiguration;
+import org.apache.sysds.test.TestUtils;
+import org.junit.Assert;
+import org.junit.BeforeClass;
+import org.junit.Test;
+
+import java.util.HashMap;
+
+public class CudaCublasGeMVTest extends AutomatedTestBase {
+
+	private static final String TEST_NAME = "CudaCublasGeMV";
+	private static final String TEST_DIR = "gpu/cudaSupportFunctions/";
+	private static final String TEST_CLASS_DIR = TEST_DIR + CudaCublasGeMVTest.class.getSimpleName() + "/";
+
+	private static final int rows = 200;
+	private static final int cols = 200;
+	private static final double eps = Math.pow(10, -10);
+
+	@BeforeClass
+	public static void ensureGPU() {
+		TestUtils.checkGPU();
+	}
+
+	@Override
+	public void setUp() {
+		TestUtils.clearAssertionInformation();
+		addTestConfiguration(TEST_NAME, new TestConfiguration(TEST_CLASS_DIR, TEST_NAME, new String[] {"R"}));
+	}
+
+	@Test
+	public void testCudaCublasGeMV() {
+		testCudaCublasGeMVTest();
+	}
+
+	private void testCudaCublasGeMVTest() {
+
+		TestConfiguration config = getTestConfiguration(TEST_NAME);
+		loadTestConfiguration(config);
+
+		String HOME = SCRIPT_DIR + TEST_DIR;
+		fullDMLScriptName = HOME + TEST_NAME + ".dml";
+		programArgs = new String[] {"-stats", "-gpu", "-args", input("A"), input("B"), output("R")};
+		fullRScriptName = HOME + TEST_NAME + ".R";
+		rCmd = getRCmd(inputDir(), expectedDir());
+
+		// A is dense matrix, B is a dense vector
+		double[][] A = getRandomMatrix(rows, cols, -1, 1, 0.70d, 5);
+		double[][] B = getRandomMatrix(rows, 1, -1, 1, 0.80d, 3);
+		writeInputMatrixWithMTD("A", A, true);
+		writeInputMatrixWithMTD("B", B, true);
+
+		runTest(true, false, null, -1);
+		runRScript(true);
+
+		//compare matrices
+		HashMap<MatrixValue.CellIndex, Double> dmlfile = readDMLMatrixFromOutputDir("R");
+		HashMap<MatrixValue.CellIndex, Double> rfile = readRMatrixFromExpectedDir("R");
+		TestUtils.compareMatrices(dmlfile, rfile, eps, "Stat-DML", "Stat-R");
+
+		Assert.assertTrue(heavyHittersContainsString("gpu_ba+*"));
+
+	}
+}
diff --git a/src/test/scripts/gpu/cudaSupportFunctions/CudaCublasGeMV.R b/src/test/scripts/gpu/cudaSupportFunctions/CudaCublasGeMV.R
new file mode 100644
index 00000000000..503dd313ebc
--- /dev/null
+++ b/src/test/scripts/gpu/cudaSupportFunctions/CudaCublasGeMV.R
@@ -0,0 +1,40 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+
+args <- commandArgs(TRUE)
+
+# Set options for numeric precision
+options(digits=22)
+
+# Load required libraries
+library("Matrix")
+library("matrixStats")
+
+# Read matrices and operation type
+A = as.matrix(readMM(paste(args[1], "A.mtx", sep="")))
+B = as.matrix(readMM(paste(args[1], "B.mtx", sep="")))
+
+
+# Perform operations
+R=A%*%B
+#Write result matrix R
+writeMM(as(R, "CsparseMatrix"), paste(args[2], "R", sep=""));
diff --git a/src/test/scripts/gpu/cudaSupportFunctions/CudaCublasGeMV.dml b/src/test/scripts/gpu/cudaSupportFunctions/CudaCublasGeMV.dml
new file mode 100644
index 00000000000..a6d6a886969
--- /dev/null
+++ b/src/test/scripts/gpu/cudaSupportFunctions/CudaCublasGeMV.dml
@@ -0,0 +1,30 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+# Read matrices, and operation type
+A = read($1)
+B = read($2)
+
+
+# Perform operations
+R = A%*%B
+# Write the result matrix R
+write(R, $3)

From fbdb865dc04ecbfd7287a4b3b76a43dfcd04d522 Mon Sep 17 00:00:00 2001
From: ReneEnjilian <enjilianrene@gmail.com>
Date: Fri, 18 Jul 2025 01:24:14 +0200
Subject: [PATCH 23/26] restructure scripts

---
 .../CudaCublasDotTest.java                    |  2 +-
 .../CudaCublasGeMVTest.java                   | 22 +++--
 .../CudaCublasGeamTest.java                   |  2 +-
 .../CudaCublasGemmTest.java                   | 99 +++++++++++++++++++
 .../CudaCusparseCsrGemmTest.java              |  2 +-
 .../CudaCusparseCsrMMTest.java                |  2 +-
 .../CudaCusparseCsrMVTest.java                |  2 +-
 .../gpu/cudaSupportFunctions/CudaCublasDot.R  | 41 --------
 .../cudaSupportFunctions/CudaCublasGeMV.dml   | 30 ------
 .../cudaSupportFunctions/CudaCusparseCsrMM.R  | 50 ----------
 .../CudaCusparseCsrMM.dml                     | 41 --------
 ...CublasGeMV.R => CudaSupportFunctionsDot.R} |  3 +-
 ...lasDot.dml => CudaSupportFunctionsDot.dml} |  4 +-
 ...ublasGeam.R => CudaSupportFunctionsGeam.R} |  0
 ...sGeam.dml => CudaSupportFunctionsGeam.dml} |  2 +
 ...arseCsrGemm.R => CudaSupportFunctionsMM.R} |  0
 ...CsrGemm.dml => CudaSupportFunctionsMM.dml} |  1 +
 ...sparseCsrMV.R => CudaSupportFunctionsMV.R} |  0
 ...seCsrMV.dml => CudaSupportFunctionsMV.dml} |  0
 19 files changed, 124 insertions(+), 179 deletions(-)
 create mode 100644 src/test/java/org/apache/sysds/test/gpu/cudaSupportFunctions/CudaCublasGemmTest.java
 delete mode 100644 src/test/scripts/gpu/cudaSupportFunctions/CudaCublasDot.R
 delete mode 100644 src/test/scripts/gpu/cudaSupportFunctions/CudaCublasGeMV.dml
 delete mode 100644 src/test/scripts/gpu/cudaSupportFunctions/CudaCusparseCsrMM.R
 delete mode 100644 src/test/scripts/gpu/cudaSupportFunctions/CudaCusparseCsrMM.dml
 rename src/test/scripts/gpu/cudaSupportFunctions/{CudaCublasGeMV.R => CudaSupportFunctionsDot.R} (97%)
 rename src/test/scripts/gpu/cudaSupportFunctions/{CudaCublasDot.dml => CudaSupportFunctionsDot.dml} (96%)
 rename src/test/scripts/gpu/cudaSupportFunctions/{CudaCublasGeam.R => CudaSupportFunctionsGeam.R} (100%)
 rename src/test/scripts/gpu/cudaSupportFunctions/{CudaCublasGeam.dml => CudaSupportFunctionsGeam.dml} (99%)
 rename src/test/scripts/gpu/cudaSupportFunctions/{CudaCusparseCsrGemm.R => CudaSupportFunctionsMM.R} (100%)
 rename src/test/scripts/gpu/cudaSupportFunctions/{CudaCusparseCsrGemm.dml => CudaSupportFunctionsMM.dml} (99%)
 rename src/test/scripts/gpu/cudaSupportFunctions/{CudaCusparseCsrMV.R => CudaSupportFunctionsMV.R} (100%)
 rename src/test/scripts/gpu/cudaSupportFunctions/{CudaCusparseCsrMV.dml => CudaSupportFunctionsMV.dml} (100%)

diff --git a/src/test/java/org/apache/sysds/test/gpu/cudaSupportFunctions/CudaCublasDotTest.java b/src/test/java/org/apache/sysds/test/gpu/cudaSupportFunctions/CudaCublasDotTest.java
index fd6815ad615..b202382adbf 100644
--- a/src/test/java/org/apache/sysds/test/gpu/cudaSupportFunctions/CudaCublasDotTest.java
+++ b/src/test/java/org/apache/sysds/test/gpu/cudaSupportFunctions/CudaCublasDotTest.java
@@ -31,7 +31,7 @@
 
 public class CudaCublasDotTest extends AutomatedTestBase {
 
-	private static final String TEST_NAME = "CudaCublasDot";
+	private static final String TEST_NAME = "CudaSupportFunctionsDot";
 	private static final String TEST_DIR = "gpu/cudaSupportFunctions/";
 	private static final String TEST_CLASS_DIR = TEST_DIR + CudaCublasDotTest.class.getSimpleName() + "/";
 
diff --git a/src/test/java/org/apache/sysds/test/gpu/cudaSupportFunctions/CudaCublasGeMVTest.java b/src/test/java/org/apache/sysds/test/gpu/cudaSupportFunctions/CudaCublasGeMVTest.java
index f8d861f5543..5b3c6ab0ce2 100644
--- a/src/test/java/org/apache/sysds/test/gpu/cudaSupportFunctions/CudaCublasGeMVTest.java
+++ b/src/test/java/org/apache/sysds/test/gpu/cudaSupportFunctions/CudaCublasGeMVTest.java
@@ -31,7 +31,7 @@
 
 public class CudaCublasGeMVTest extends AutomatedTestBase {
 
-	private static final String TEST_NAME = "CudaCublasGeMV";
+	private static final String TEST_NAME = "CudaSupportFunctionsMV";
 	private static final String TEST_DIR = "gpu/cudaSupportFunctions/";
 	private static final String TEST_CLASS_DIR = TEST_DIR + CudaCublasGeMVTest.class.getSimpleName() + "/";
 
@@ -51,26 +51,32 @@ public void setUp() {
 	}
 
 	@Test
-	public void testCudaCublasGeMV() {
-		testCudaCublasGeMVTest();
+	public void testCublasGeMV() {
+		testCudaCublasGeMVTest(1);
 	}
 
-	private void testCudaCublasGeMVTest() {
+	@Test
+	public void testCublasGeMVLeftTranspose() {
+		testCudaCublasGeMVTest(2);
+	}
+
+
+	private void testCudaCublasGeMVTest(int ID) {
 
 		TestConfiguration config = getTestConfiguration(TEST_NAME);
 		loadTestConfiguration(config);
 
 		String HOME = SCRIPT_DIR + TEST_DIR;
 		fullDMLScriptName = HOME + TEST_NAME + ".dml";
-		programArgs = new String[] {"-stats", "-gpu", "-args", input("A"), input("B"), output("R")};
+		programArgs = new String[] {"-stats", "-gpu", "-args", input("A"), input("X"), String.valueOf(ID), output("R")};
 		fullRScriptName = HOME + TEST_NAME + ".R";
-		rCmd = getRCmd(inputDir(), expectedDir());
+		rCmd = getRCmd(inputDir(),  String.valueOf(ID), expectedDir());
 
 		// A is dense matrix, B is a dense vector
 		double[][] A = getRandomMatrix(rows, cols, -1, 1, 0.70d, 5);
-		double[][] B = getRandomMatrix(rows, 1, -1, 1, 0.80d, 3);
+		double[][] X = getRandomMatrix(rows, 1, -1, 1, 0.80d, 3);
 		writeInputMatrixWithMTD("A", A, true);
-		writeInputMatrixWithMTD("B", B, true);
+		writeInputMatrixWithMTD("X", X, true);
 
 		runTest(true, false, null, -1);
 		runRScript(true);
diff --git a/src/test/java/org/apache/sysds/test/gpu/cudaSupportFunctions/CudaCublasGeamTest.java b/src/test/java/org/apache/sysds/test/gpu/cudaSupportFunctions/CudaCublasGeamTest.java
index 8bc54e27ece..1809c5d243d 100644
--- a/src/test/java/org/apache/sysds/test/gpu/cudaSupportFunctions/CudaCublasGeamTest.java
+++ b/src/test/java/org/apache/sysds/test/gpu/cudaSupportFunctions/CudaCublasGeamTest.java
@@ -31,7 +31,7 @@
 
 public class CudaCublasGeamTest extends AutomatedTestBase {
 
-    private static final String TEST_NAME = "CudaCublasGeam";
+    private static final String TEST_NAME = "CudaSupportFunctionsGeam";
     private static final String TEST_DIR = "gpu/cudaSupportFunctions/";
     private static final String TEST_CLASS_DIR = TEST_DIR + CudaCublasGeamTest.class.getSimpleName() + "/";
 
diff --git a/src/test/java/org/apache/sysds/test/gpu/cudaSupportFunctions/CudaCublasGemmTest.java b/src/test/java/org/apache/sysds/test/gpu/cudaSupportFunctions/CudaCublasGemmTest.java
new file mode 100644
index 00000000000..e23c11f6173
--- /dev/null
+++ b/src/test/java/org/apache/sysds/test/gpu/cudaSupportFunctions/CudaCublasGemmTest.java
@@ -0,0 +1,99 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.sysds.test.gpu.cudaSupportFunctions;
+
+import org.apache.sysds.runtime.matrix.data.MatrixValue;
+import org.apache.sysds.test.AutomatedTestBase;
+import org.apache.sysds.test.TestConfiguration;
+import org.apache.sysds.test.TestUtils;
+import org.junit.Assert;
+import org.junit.BeforeClass;
+import org.junit.Test;
+
+import java.util.HashMap;
+
+public class CudaCublasGemmTest extends AutomatedTestBase {
+
+	private static final String TEST_NAME = "CudaSupportFunctionsMM";
+	private static final String TEST_DIR = "gpu/cudaSupportFunctions/";
+	private static final String TEST_CLASS_DIR = TEST_DIR + CudaCublasGemmTest.class.getSimpleName() + "/";
+
+	private static final int rows = 200;
+	private static final int cols = 200;
+	private static final double eps = Math.pow(10, -10);
+
+	@BeforeClass
+	public static void ensureGPU() {
+		TestUtils.checkGPU();
+	}
+
+	@Override
+	public void setUp() {
+		TestUtils.clearAssertionInformation();
+		addTestConfiguration(TEST_NAME, new TestConfiguration(TEST_CLASS_DIR, TEST_NAME, new String[] {"R"}));
+	}
+
+	@Test
+	public void testCudaCublasGemmNoTranspose(){
+		testCublasGemm(1);
+	}
+
+	@Test
+	public void testCudaCublasGemmLeftTranspose(){
+		testCublasGemm(2);
+	}
+
+	@Test
+	public void testCudaCublasGemmRightTranspose(){
+		testCublasGemm(3);
+	}
+
+	@Test
+	public void testCudaCublasGemmBothTranspose(){
+		testCublasGemm(4);
+	}
+
+	private void testCublasGemm(int ID){
+		TestConfiguration config = getTestConfiguration(TEST_NAME);
+		loadTestConfiguration(config);
+
+		String HOME = SCRIPT_DIR + TEST_DIR;
+		fullDMLScriptName = HOME + TEST_NAME + ".dml";
+		programArgs = new String[] {"-stats", "-gpu", "-args", input("A"), input("B"), String.valueOf(ID), output("R")};
+		fullRScriptName = HOME + TEST_NAME + ".R";
+		rCmd = getRCmd(inputDir(), String.valueOf(ID), expectedDir());
+
+		// A is dense matrix, B is a dense vector
+		double[][] A = getRandomMatrix(rows, cols, -1, 1, 0.70d, 5);
+		double[][] B = getRandomMatrix(rows, cols, -1, 1, 0.80d, 3);
+		writeInputMatrixWithMTD("A", A, true);
+		writeInputMatrixWithMTD("B", B, true);
+
+		runTest(true, false, null, -1);
+		runRScript(true);
+
+		//compare matrices
+		HashMap<MatrixValue.CellIndex, Double> dmlfile = readDMLMatrixFromOutputDir("R");
+		HashMap<MatrixValue.CellIndex, Double> rfile = readRMatrixFromExpectedDir("R");
+		TestUtils.compareMatrices(dmlfile, rfile, eps, "Stat-DML", "Stat-R");
+
+		Assert.assertTrue(heavyHittersContainsString("gpu_ba+*"));
+	}
+}
diff --git a/src/test/java/org/apache/sysds/test/gpu/cudaSupportFunctions/CudaCusparseCsrGemmTest.java b/src/test/java/org/apache/sysds/test/gpu/cudaSupportFunctions/CudaCusparseCsrGemmTest.java
index 30833995334..4ed06fba3af 100644
--- a/src/test/java/org/apache/sysds/test/gpu/cudaSupportFunctions/CudaCusparseCsrGemmTest.java
+++ b/src/test/java/org/apache/sysds/test/gpu/cudaSupportFunctions/CudaCusparseCsrGemmTest.java
@@ -31,7 +31,7 @@
 
 public class CudaCusparseCsrGemmTest extends AutomatedTestBase {
 
-    private static final String TEST_NAME = "CudaCusparseCsrGemm";
+    private static final String TEST_NAME = "CudaSupportFunctionsMM";
     private static final String TEST_DIR = "gpu/cudaSupportFunctions/";
     private static final String TEST_CLASS_DIR = TEST_DIR + CudaCusparseCsrGemmTest.class.getSimpleName() + "/";
 
diff --git a/src/test/java/org/apache/sysds/test/gpu/cudaSupportFunctions/CudaCusparseCsrMMTest.java b/src/test/java/org/apache/sysds/test/gpu/cudaSupportFunctions/CudaCusparseCsrMMTest.java
index f6c21ee2522..bdd816583ea 100644
--- a/src/test/java/org/apache/sysds/test/gpu/cudaSupportFunctions/CudaCusparseCsrMMTest.java
+++ b/src/test/java/org/apache/sysds/test/gpu/cudaSupportFunctions/CudaCusparseCsrMMTest.java
@@ -31,7 +31,7 @@
 
 public class CudaCusparseCsrMMTest extends AutomatedTestBase {
 
-	private static final String TEST_NAME = "CudaCusparseCsrMM";
+	private static final String TEST_NAME = "CudaSupportFunctionsMM";
 	private static final String TEST_DIR = "gpu/cudaSupportFunctions/";
 	private static final String TEST_CLASS_DIR = TEST_DIR + CudaCusparseCsrMMTest.class.getSimpleName() + "/";
 
diff --git a/src/test/java/org/apache/sysds/test/gpu/cudaSupportFunctions/CudaCusparseCsrMVTest.java b/src/test/java/org/apache/sysds/test/gpu/cudaSupportFunctions/CudaCusparseCsrMVTest.java
index 718a1090e45..05663754211 100644
--- a/src/test/java/org/apache/sysds/test/gpu/cudaSupportFunctions/CudaCusparseCsrMVTest.java
+++ b/src/test/java/org/apache/sysds/test/gpu/cudaSupportFunctions/CudaCusparseCsrMVTest.java
@@ -31,7 +31,7 @@
 
 public class CudaCusparseCsrMVTest extends AutomatedTestBase {
 
-	private static final String TEST_NAME = "CudaCusparseCsrMV";
+	private static final String TEST_NAME = "CudaSupportFunctionsMV";
 	private static final String TEST_DIR = "gpu/cudaSupportFunctions/";
 	private static final String TEST_CLASS_DIR = TEST_DIR + CudaCusparseCsrMVTest.class.getSimpleName() + "/";
 
diff --git a/src/test/scripts/gpu/cudaSupportFunctions/CudaCublasDot.R b/src/test/scripts/gpu/cudaSupportFunctions/CudaCublasDot.R
deleted file mode 100644
index 1344ca30d09..00000000000
--- a/src/test/scripts/gpu/cudaSupportFunctions/CudaCublasDot.R
+++ /dev/null
@@ -1,41 +0,0 @@
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-
-
-args <- commandArgs(TRUE)
-
-# Set options for numeric precision
-options(digits=22)
-
-# Load required libraries
-library("Matrix")
-library("matrixStats")
-
-# Read matrices and operation type
-A = as.matrix(readMM(paste(args[1], "A.mtx", sep="")))
-B = as.matrix(readMM(paste(args[1], "B.mtx", sep="")))
-
-
-
-# Perform operations
-R=A%*%B
-#Write result matrix R
-writeMM(as(R, "CsparseMatrix"), paste(args[2], "R", sep=""));
diff --git a/src/test/scripts/gpu/cudaSupportFunctions/CudaCublasGeMV.dml b/src/test/scripts/gpu/cudaSupportFunctions/CudaCublasGeMV.dml
deleted file mode 100644
index a6d6a886969..00000000000
--- a/src/test/scripts/gpu/cudaSupportFunctions/CudaCublasGeMV.dml
+++ /dev/null
@@ -1,30 +0,0 @@
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-
-# Read matrices, and operation type
-A = read($1)
-B = read($2)
-
-
-# Perform operations
-R = A%*%B
-# Write the result matrix R
-write(R, $3)
diff --git a/src/test/scripts/gpu/cudaSupportFunctions/CudaCusparseCsrMM.R b/src/test/scripts/gpu/cudaSupportFunctions/CudaCusparseCsrMM.R
deleted file mode 100644
index 329ebd59f9c..00000000000
--- a/src/test/scripts/gpu/cudaSupportFunctions/CudaCusparseCsrMM.R
+++ /dev/null
@@ -1,50 +0,0 @@
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-
-
-args <- commandArgs(TRUE)
-
-# Set options for numeric precision
-options(digits=22)
-
-# Load required libraries
-library("Matrix")
-library("matrixStats")
-
-# Read matrices and operation type
-A = as.matrix(readMM(paste(args[1], "A.mtx", sep="")))
-B = as.matrix(readMM(paste(args[1], "B.mtx", sep="")))
-type = as.integer(args[2])
-
-
-# Perform operations
-if(type==1){
-    R = A %*% B
-} else if(type==2) {
-    R = t(A) %*% B
-} else if(type==3) {
-     R = A %*% t(B)
-} else if(type==4) {
-     R = t(A) %*% t(B)
-}
-
-#Write result matrix R
-writeMM(as(R, "CsparseMatrix"), paste(args[3], "R", sep=""));
diff --git a/src/test/scripts/gpu/cudaSupportFunctions/CudaCusparseCsrMM.dml b/src/test/scripts/gpu/cudaSupportFunctions/CudaCusparseCsrMM.dml
deleted file mode 100644
index 12e144f48ca..00000000000
--- a/src/test/scripts/gpu/cudaSupportFunctions/CudaCusparseCsrMM.dml
+++ /dev/null
@@ -1,41 +0,0 @@
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-
-# Read matrices, and operation type
-A = read($1)
-B = read($2)
-type = $3
-
-# Perform operations
-if(type==1){
-    R = A %*% B
-}
-else if(type==2) {
-    R = t(A) %*% B
-}
-else if(type==3) {
-    R = A %*% t(B)
-}
-else if(type==4) {
-    R = t(A) %*% t(B)
-}
-# Write the result matrix R
-write(R, $4)
diff --git a/src/test/scripts/gpu/cudaSupportFunctions/CudaCublasGeMV.R b/src/test/scripts/gpu/cudaSupportFunctions/CudaSupportFunctionsDot.R
similarity index 97%
rename from src/test/scripts/gpu/cudaSupportFunctions/CudaCublasGeMV.R
rename to src/test/scripts/gpu/cudaSupportFunctions/CudaSupportFunctionsDot.R
index 503dd313ebc..872ec031392 100644
--- a/src/test/scripts/gpu/cudaSupportFunctions/CudaCublasGeMV.R
+++ b/src/test/scripts/gpu/cudaSupportFunctions/CudaSupportFunctionsDot.R
@@ -29,11 +29,10 @@ options(digits=22)
 library("Matrix")
 library("matrixStats")
 
-# Read matrices and operation type
+# Read matrices
 A = as.matrix(readMM(paste(args[1], "A.mtx", sep="")))
 B = as.matrix(readMM(paste(args[1], "B.mtx", sep="")))
 
-
 # Perform operations
 R=A%*%B
 #Write result matrix R
diff --git a/src/test/scripts/gpu/cudaSupportFunctions/CudaCublasDot.dml b/src/test/scripts/gpu/cudaSupportFunctions/CudaSupportFunctionsDot.dml
similarity index 96%
rename from src/test/scripts/gpu/cudaSupportFunctions/CudaCublasDot.dml
rename to src/test/scripts/gpu/cudaSupportFunctions/CudaSupportFunctionsDot.dml
index a6d6a886969..528d3e2035c 100644
--- a/src/test/scripts/gpu/cudaSupportFunctions/CudaCublasDot.dml
+++ b/src/test/scripts/gpu/cudaSupportFunctions/CudaSupportFunctionsDot.dml
@@ -19,12 +19,12 @@
 #
 #-------------------------------------------------------------
 
-# Read matrices, and operation type
+# Read matrices
 A = read($1)
 B = read($2)
 
-
 # Perform operations
 R = A%*%B
+
 # Write the result matrix R
 write(R, $3)
diff --git a/src/test/scripts/gpu/cudaSupportFunctions/CudaCublasGeam.R b/src/test/scripts/gpu/cudaSupportFunctions/CudaSupportFunctionsGeam.R
similarity index 100%
rename from src/test/scripts/gpu/cudaSupportFunctions/CudaCublasGeam.R
rename to src/test/scripts/gpu/cudaSupportFunctions/CudaSupportFunctionsGeam.R
diff --git a/src/test/scripts/gpu/cudaSupportFunctions/CudaCublasGeam.dml b/src/test/scripts/gpu/cudaSupportFunctions/CudaSupportFunctionsGeam.dml
similarity index 99%
rename from src/test/scripts/gpu/cudaSupportFunctions/CudaCublasGeam.dml
rename to src/test/scripts/gpu/cudaSupportFunctions/CudaSupportFunctionsGeam.dml
index 25fe0fb3651..bfed8c61f92 100644
--- a/src/test/scripts/gpu/cudaSupportFunctions/CudaCublasGeam.dml
+++ b/src/test/scripts/gpu/cudaSupportFunctions/CudaSupportFunctionsGeam.dml
@@ -19,6 +19,7 @@
 #
 #-------------------------------------------------------------
 
+
 # Read matrices A, B, and operation type
 A = read($1)
 B = read($2)
@@ -37,5 +38,6 @@ else if(type==3) {
 else if(type==4) {
     R = t(A) + t(B)
 }
+
 # Write the result matrix R
 write(R, $4)
diff --git a/src/test/scripts/gpu/cudaSupportFunctions/CudaCusparseCsrGemm.R b/src/test/scripts/gpu/cudaSupportFunctions/CudaSupportFunctionsMM.R
similarity index 100%
rename from src/test/scripts/gpu/cudaSupportFunctions/CudaCusparseCsrGemm.R
rename to src/test/scripts/gpu/cudaSupportFunctions/CudaSupportFunctionsMM.R
diff --git a/src/test/scripts/gpu/cudaSupportFunctions/CudaCusparseCsrGemm.dml b/src/test/scripts/gpu/cudaSupportFunctions/CudaSupportFunctionsMM.dml
similarity index 99%
rename from src/test/scripts/gpu/cudaSupportFunctions/CudaCusparseCsrGemm.dml
rename to src/test/scripts/gpu/cudaSupportFunctions/CudaSupportFunctionsMM.dml
index 4eb0c99b7ff..fbe89129121 100644
--- a/src/test/scripts/gpu/cudaSupportFunctions/CudaCusparseCsrGemm.dml
+++ b/src/test/scripts/gpu/cudaSupportFunctions/CudaSupportFunctionsMM.dml
@@ -38,5 +38,6 @@ else if(type==3) {
 else if(type==4) {
     R = t(A) %*% t(B)
 }
+
 # Write the result matrix R
 write(R, $4)
diff --git a/src/test/scripts/gpu/cudaSupportFunctions/CudaCusparseCsrMV.R b/src/test/scripts/gpu/cudaSupportFunctions/CudaSupportFunctionsMV.R
similarity index 100%
rename from src/test/scripts/gpu/cudaSupportFunctions/CudaCusparseCsrMV.R
rename to src/test/scripts/gpu/cudaSupportFunctions/CudaSupportFunctionsMV.R
diff --git a/src/test/scripts/gpu/cudaSupportFunctions/CudaCusparseCsrMV.dml b/src/test/scripts/gpu/cudaSupportFunctions/CudaSupportFunctionsMV.dml
similarity index 100%
rename from src/test/scripts/gpu/cudaSupportFunctions/CudaCusparseCsrMV.dml
rename to src/test/scripts/gpu/cudaSupportFunctions/CudaSupportFunctionsMV.dml

From c0f7841a3540e3772216e31d853fdc8187ae55cf Mon Sep 17 00:00:00 2001
From: ReneEnjilian <enjilianrene@gmail.com>
Date: Fri, 18 Jul 2025 01:26:07 +0200
Subject: [PATCH 24/26] refactor nn

---
 .../apache/sysds/test/gpu/{ => nn}/DNNOperationsGPUTest.java    | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)
 rename src/test/java/org/apache/sysds/test/gpu/{ => nn}/DNNOperationsGPUTest.java (99%)

diff --git a/src/test/java/org/apache/sysds/test/gpu/DNNOperationsGPUTest.java b/src/test/java/org/apache/sysds/test/gpu/nn/DNNOperationsGPUTest.java
similarity index 99%
rename from src/test/java/org/apache/sysds/test/gpu/DNNOperationsGPUTest.java
rename to src/test/java/org/apache/sysds/test/gpu/nn/DNNOperationsGPUTest.java
index ea7a564f8e3..2bd80c938d0 100644
--- a/src/test/java/org/apache/sysds/test/gpu/DNNOperationsGPUTest.java
+++ b/src/test/java/org/apache/sysds/test/gpu/nn/DNNOperationsGPUTest.java
@@ -17,7 +17,7 @@
  * under the License.
  */
 
-package org.apache.sysds.test.gpu;
+package org.apache.sysds.test.gpu.nn;
 
 import org.apache.sysds.test.AutomatedTestBase;
 import org.apache.sysds.test.functions.dnn.Conv1DTest;

From 481b8ddf567fac1ef597d86ba0a166aee559ffd2 Mon Sep 17 00:00:00 2001
From: ReneEnjilian <enjilianrene@gmail.com>
Date: Fri, 18 Jul 2025 20:18:13 +0200
Subject: [PATCH 25/26] add test for cupsrasecsr2csc via transpose operation

---
 .../CudaCusparseCsr2CscTest.java              | 82 +++++++++++++++++++
 .../CudaSupportFunctionsTranspose.R           | 38 +++++++++
 .../CudaSupportFunctionsTranspose.dml         | 29 +++++++
 3 files changed, 149 insertions(+)
 create mode 100644 src/test/java/org/apache/sysds/test/gpu/cudaSupportFunctions/CudaCusparseCsr2CscTest.java
 create mode 100644 src/test/scripts/gpu/cudaSupportFunctions/CudaSupportFunctionsTranspose.R
 create mode 100644 src/test/scripts/gpu/cudaSupportFunctions/CudaSupportFunctionsTranspose.dml

diff --git a/src/test/java/org/apache/sysds/test/gpu/cudaSupportFunctions/CudaCusparseCsr2CscTest.java b/src/test/java/org/apache/sysds/test/gpu/cudaSupportFunctions/CudaCusparseCsr2CscTest.java
new file mode 100644
index 00000000000..63acbc7b8ca
--- /dev/null
+++ b/src/test/java/org/apache/sysds/test/gpu/cudaSupportFunctions/CudaCusparseCsr2CscTest.java
@@ -0,0 +1,82 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.sysds.test.gpu.cudaSupportFunctions;
+
+import org.apache.sysds.runtime.matrix.data.MatrixValue;
+import org.apache.sysds.test.AutomatedTestBase;
+import org.apache.sysds.test.TestConfiguration;
+import org.apache.sysds.test.TestUtils;
+import org.junit.Assert;
+import org.junit.BeforeClass;
+import org.junit.Test;
+
+import java.util.HashMap;
+
+public class CudaCusparseCsr2CscTest extends AutomatedTestBase {
+
+	private static final String TEST_NAME = "CudaSupportFunctionsTranspose";
+	private static final String TEST_DIR = "gpu/cudaSupportFunctions/";
+	private static final String TEST_CLASS_DIR = TEST_DIR + CudaCusparseCsr2CscTest.class.getSimpleName() + "/";
+
+	private static final int rows = 200;
+	private static final int cols = 200;
+	private static final double eps = Math.pow(10, -10);
+
+	@BeforeClass
+	public static void ensureGPU() {
+		TestUtils.checkGPU();
+	}
+
+	@Override
+	public void setUp() {
+		TestUtils.clearAssertionInformation();
+		addTestConfiguration(TEST_NAME, new TestConfiguration(TEST_CLASS_DIR, TEST_NAME, new String[] {"R"}));
+	}
+
+	@Test
+	public void testCusparseCsr2CscTest() {
+		testCudaCusparseCsr2Csc();
+	}
+
+	private void testCudaCusparseCsr2Csc() {
+		TestConfiguration config = getTestConfiguration(TEST_NAME);
+		loadTestConfiguration(config);
+
+		String HOME = SCRIPT_DIR + TEST_DIR;
+		fullDMLScriptName = HOME + TEST_NAME + ".dml";
+		programArgs = new String[] {"-stats", "-gpu", "-args", input("A"), output("R")};
+		fullRScriptName = HOME + TEST_NAME + ".R";
+		rCmd = getRCmd(inputDir(), expectedDir());
+
+		// A is a sparse matrix
+		double[][] A = getRandomMatrix(rows, cols, -1, 1, 0.30d, 5);
+		writeInputMatrixWithMTD("A", A, true);
+
+		runTest(true, false, null, -1);
+		runRScript(true);
+
+		//compare matrices
+		HashMap<MatrixValue.CellIndex, Double> dmlfile = readDMLMatrixFromOutputDir("R");
+		HashMap<MatrixValue.CellIndex, Double> rfile = readRMatrixFromExpectedDir("R");
+		TestUtils.compareMatrices(dmlfile, rfile, eps, "Stat-DML", "Stat-R");
+
+		Assert.assertTrue(heavyHittersContainsString("gpu_r'"));
+	}
+}
diff --git a/src/test/scripts/gpu/cudaSupportFunctions/CudaSupportFunctionsTranspose.R b/src/test/scripts/gpu/cudaSupportFunctions/CudaSupportFunctionsTranspose.R
new file mode 100644
index 00000000000..4dc5894e590
--- /dev/null
+++ b/src/test/scripts/gpu/cudaSupportFunctions/CudaSupportFunctionsTranspose.R
@@ -0,0 +1,38 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+
+args <- commandArgs(TRUE)
+
+# Set options for numeric precision
+options(digits=22)
+
+# Load required libraries
+library("Matrix")
+library("matrixStats")
+
+# Read matrices
+A = as.matrix(readMM(paste(args[1], "A.mtx", sep="")))
+
+# Perform operation
+R = t(A)
+#Write result matrix R
+writeMM(as(R, "CsparseMatrix"), paste(args[2], "R", sep=""));
diff --git a/src/test/scripts/gpu/cudaSupportFunctions/CudaSupportFunctionsTranspose.dml b/src/test/scripts/gpu/cudaSupportFunctions/CudaSupportFunctionsTranspose.dml
new file mode 100644
index 00000000000..27f7a0b39f2
--- /dev/null
+++ b/src/test/scripts/gpu/cudaSupportFunctions/CudaSupportFunctionsTranspose.dml
@@ -0,0 +1,29 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+# Read matrices
+A = read($1)
+
+# Perform operation
+R = t(A)
+
+# Write the result matrix R
+write(R, $2)

From 6107852a118befff85234b312f63d7794322ac72 Mon Sep 17 00:00:00 2001
From: ReneEnjilian <enjilianrene@gmail.com>
Date: Fri, 18 Jul 2025 21:02:40 +0200
Subject: [PATCH 26/26] add test for cusparsecsrgeam

---
 .../CudaCusparseCsrGeamTest.java              | 100 ++++++++++++++++++
 1 file changed, 100 insertions(+)
 create mode 100644 src/test/java/org/apache/sysds/test/gpu/cudaSupportFunctions/CudaCusparseCsrGeamTest.java

diff --git a/src/test/java/org/apache/sysds/test/gpu/cudaSupportFunctions/CudaCusparseCsrGeamTest.java b/src/test/java/org/apache/sysds/test/gpu/cudaSupportFunctions/CudaCusparseCsrGeamTest.java
new file mode 100644
index 00000000000..7e9f708084e
--- /dev/null
+++ b/src/test/java/org/apache/sysds/test/gpu/cudaSupportFunctions/CudaCusparseCsrGeamTest.java
@@ -0,0 +1,100 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.sysds.test.gpu.cudaSupportFunctions;
+
+import org.apache.sysds.runtime.matrix.data.MatrixValue;
+import org.apache.sysds.test.AutomatedTestBase;
+import org.apache.sysds.test.TestConfiguration;
+import org.apache.sysds.test.TestUtils;
+import org.junit.Assert;
+import org.junit.BeforeClass;
+import org.junit.Test;
+
+import java.util.HashMap;
+
+public class CudaCusparseCsrGeamTest extends AutomatedTestBase {
+
+	private static final String TEST_NAME = "CudaSupportFunctionsGeam";
+	private static final String TEST_DIR = "gpu/cudaSupportFunctions/";
+	private static final String TEST_CLASS_DIR = TEST_DIR + CudaCusparseCsrGeamTest.class.getSimpleName() + "/";
+
+	private static final int rows = 200;
+	private static final int cols = 200;
+
+	private static final double eps = Math.pow(10, -10);
+
+	@BeforeClass
+	public static void ensureGPU() {
+		TestUtils.checkGPU();
+	}
+
+	@Override
+	public void setUp() {
+		TestUtils.clearAssertionInformation();
+		addTestConfiguration(TEST_NAME, new TestConfiguration(TEST_CLASS_DIR, TEST_NAME, new String[] {"R"}));
+	}
+
+	@Test
+	public void testCusparseCsrGeamNoTranspose() {
+		testCusparseCsrGeam(1);
+	}
+
+	@Test
+	public void testCusparseCsrGeamLeftTranspose() {
+		testCusparseCsrGeam(2);
+	}
+
+	@Test
+	public void testCusparseCsrGeamRightTranspose() {
+		testCusparseCsrGeam(3);
+	}
+
+	@Test
+	public void testCusparseCsrGeamBothTranspose() {
+		testCusparseCsrGeam(4);
+	}
+
+	private void testCusparseCsrGeam(int ID) {
+		TestConfiguration config = getTestConfiguration(TEST_NAME);
+		loadTestConfiguration(config);
+
+		String HOME = SCRIPT_DIR + TEST_DIR;
+		fullDMLScriptName = HOME + TEST_NAME + ".dml";
+		programArgs = new String[] {"-stats", "-gpu", "-args", input("A"), input("B"), String.valueOf(ID), output("R")};
+		fullRScriptName = HOME + TEST_NAME + ".R";
+		rCmd = getRCmd(inputDir(), String.valueOf(ID), expectedDir());
+
+		// both matrices have to be dense
+		double[][] A = getRandomMatrix(rows, cols, -1, 1, 0.20d, 5);
+		double[][] B = getRandomMatrix(rows, cols, -1, 1, 0.20d, 3);
+		writeInputMatrixWithMTD("A", A, true);
+		writeInputMatrixWithMTD("B", B, true);
+
+		runTest(true, false, null, -1);
+		runRScript(true);
+
+		//compare matrices
+		HashMap<MatrixValue.CellIndex, Double> dmlfile = readDMLMatrixFromOutputDir("R");
+		HashMap<MatrixValue.CellIndex, Double> rfile = readRMatrixFromExpectedDir("R");
+		TestUtils.compareMatrices(dmlfile, rfile, eps, "Stat-DML", "Stat-R");
+
+		Assert.assertTrue(heavyHittersContainsString("gpu_+"));
+	}
+}