apache · corepointer · Apr 19, 2022 · Jun 14, 2021
diff --git a/src/main/java/org/apache/sysds/runtime/codegen/SpoofCUDACellwise.java b/src/main/java/org/apache/sysds/runtime/codegen/SpoofCUDACellwise.java
@@ -93,8 +93,8 @@ else if(_type == SpoofCellwise.CellType.ROW_AGG)
 		long nnz = in_obj.getNnz("spoofCUDA" + getSpoofType(), false);
 		MatrixObject out_obj = sparseOut ?
 				(ec.getSparseMatrixOutputForGPUInstruction(outputName, out_rows, out_cols, (isSparseSafe() && nnz > 0) ?
-						nnz : out_rows * out_cols).getKey()) :
-				(ec.getDenseMatrixOutputForGPUInstruction(outputName, out_rows, out_cols).getKey());
+						nnz : out_rows * out_cols, false).getKey()) :
+				(ec.getDenseMatrixOutputForGPUInstruction(outputName, out_rows, out_cols, false).getKey());
 
 		packDataForTransfer(ec, inputs, scalarObjects, out_obj, 1, ID, 0,false, null);
 		if(NotEmpty(in_obj) || !sparseSafe) {

diff --git a/src/main/java/org/apache/sysds/runtime/codegen/SpoofCUDAOperator.java b/src/main/java/org/apache/sysds/runtime/codegen/SpoofCUDAOperator.java
@@ -72,7 +72,7 @@ default void prepareMatrixPointers(ByteBuffer buf, ExecutionContext ec, MatrixOb
 				int rows = (int)mo.getNumRows();
 				int cols = (int)mo.getNumColumns();
 				Pointer b1 = mo.getGPUObject(ec.getGPUContext(0)).getDensePointer();
-				Pointer ptr = ec.getGPUContext(0).allocate(getName(), (long) rows * cols * sizeOfDataType);
+				Pointer ptr = ec.getGPUContext(0).allocate(getName(), (long) rows * cols * sizeOfDataType, false);
 				LibMatrixCUDA.denseTranspose(ec, ec.getGPUContext(0), getName(), b1, ptr, rows, cols);
 				writeMatrixDescriptorToBuffer(buf, rows, cols, 0, 0, GPUObject.getPointerAddress(ptr), mo.getNnz());
 			} else {
@@ -115,8 +115,10 @@ default void packDataForTransfer(ExecutionContext ec, ArrayList<MatrixObject> in
 				int NT = 256;
 				long N = inputs.get(0).getNumRows() * inputs.get(0).getNumColumns();
 				num_blocks = ((N + NT * 2 - 1) / (NT * 2));
+				ptr[0] = ec.getGPUContext(0).allocate(getName(), LibMatrixCUDA.sizeOfDataType * num_blocks, false);
 			}
-			ptr[0] = ec.getGPUContext(0).allocate(getName(), LibMatrixCUDA.sizeOfDataType * num_blocks);
+			else
+				ptr[0] = ec.getGPUContext(0).allocate(getName(), LibMatrixCUDA.sizeOfDataType * num_blocks, true);
 			writeMatrixDescriptorToBuffer(buf, 1, 1, 0, 0, GPUObject.getPointerAddress(ptr[0]), 1);
 		}
  		else {

diff --git a/src/main/java/org/apache/sysds/runtime/codegen/SpoofCUDARowwise.java b/src/main/java/org/apache/sysds/runtime/codegen/SpoofCUDARowwise.java
@@ -77,7 +77,7 @@ public MatrixObject execute(ExecutionContext ec, ArrayList<MatrixObject> inputs,
 				hasMatrixObjectSideInput(inputs) ? getMinColsMatrixObjectSideInputs(inputs) : -1;
 		OutputDimensions out_dims = new OutputDimensions(m, n, n2);
 		ec.setMetaData(outputName, out_dims.rows, out_dims.cols);
-		MatrixObject out_obj = ec.getDenseMatrixOutputForGPUInstruction(outputName, out_dims.rows, out_dims.cols).getKey();
+		MatrixObject out_obj = ec.getDenseMatrixOutputForGPUInstruction(outputName, out_dims.rows, out_dims.cols, false).getKey();
 
 		packDataForTransfer(ec, inputs, scalarObjects, out_obj, 1, ID, 0,_tB1, null);
 

diff --git a/src/main/java/org/apache/sysds/runtime/controlprogram/context/ExecutionContext.java b/src/main/java/org/apache/sysds/runtime/controlprogram/context/ExecutionContext.java
@@ -373,8 +373,14 @@ private static long validateDimensions(long d1, long d2) {
 	 * @return a pair containing the wrapping {@link MatrixObject} and a boolean indicating whether a cuda memory allocation took place (as opposed to the space already being allocated)
 	 */
 	public Pair<MatrixObject, Boolean> getDenseMatrixOutputForGPUInstruction(String varName, long numRows, long numCols) {
+		return 	getDenseMatrixOutputForGPUInstruction(varName, numRows, numCols, true);
+	}
+
+	public Pair<MatrixObject, Boolean> getDenseMatrixOutputForGPUInstruction(String varName, long numRows, long numCols,
+		boolean initialize)
+	{
 		MatrixObject mo = allocateGPUMatrixObject(varName, numRows, numCols);
-		boolean allocated = mo.getGPUObject(getGPUContext(0)).acquireDeviceModifyDense();
+		boolean allocated = mo.getGPUObject(getGPUContext(0)).acquireDeviceModifyDense(initialize);
 		mo.getDataCharacteristics().setNonZeros(-1);
 		return new Pair<>(mo, allocated);
 	}
@@ -390,9 +396,15 @@ public Pair<MatrixObject, Boolean> getDenseMatrixOutputForGPUInstruction(String
 	 * @return matrix object
 	 */
 	public Pair<MatrixObject, Boolean> getSparseMatrixOutputForGPUInstruction(String varName, long numRows, long numCols, long nnz) {
+		return getSparseMatrixOutputForGPUInstruction(varName, numRows, numCols, nnz, true);
+	}
+
+	public Pair<MatrixObject, Boolean> getSparseMatrixOutputForGPUInstruction(String varName, long numRows, long numCols,
+		long nnz, boolean initialize)
+	{
 		MatrixObject mo = allocateGPUMatrixObject(varName, numRows, numCols);
 		mo.getDataCharacteristics().setNonZeros(nnz);
-		boolean allocated = mo.getGPUObject(getGPUContext(0)).acquireDeviceModifySparse();
+		boolean allocated = mo.getGPUObject(getGPUContext(0)).acquireDeviceModifySparse(initialize);
 		return new Pair<>(mo, allocated);
 	}
 

diff --git a/src/main/java/org/apache/sysds/runtime/instructions/gpu/DnnGPUInstruction.java b/src/main/java/org/apache/sysds/runtime/instructions/gpu/DnnGPUInstruction.java
@@ -637,7 +637,8 @@ private void processLstmBackwardInstruction(ExecutionContext ec) throws DMLRunti
 		int D = toInt(numRowsW) - M; // since W:(D+M, 4M) ... numFeatures 
 		Pointer sysdsWPointer = LibMatrixCuDNN.getDensePointerForCuDNN(gCtx, W, instructionName, D+M, 4*M);
 		Pointer sysdsBiasPointer = LibMatrixCuDNN.getDensePointerForCuDNN(gCtx, bias, instructionName, 1, 4*M);
-		Pointer cudnnWPointer = gCtx.allocate(instructionName, (D+M+2)*(4*M)*LibMatrixCUDA.sizeOfDataType);
+		// TODO: find out if memset is necessary
+		Pointer cudnnWPointer = gCtx.allocate(instructionName, (D+M+2)*(4*M)*LibMatrixCUDA.sizeOfDataType, true);
 		LibMatrixCUDA.getCudaKernels(gCtx).launchKernel("prepare_lstm_weight",
 				ExecutionConfig.getConfigForSimpleVectorOperations((D+M+2)*(4*M)),
 				sysdsWPointer, sysdsBiasPointer, cudnnWPointer, D, M);
@@ -650,7 +651,7 @@ private void processLstmBackwardInstruction(ExecutionContext ec) throws DMLRunti
 		int N = toInt(X.getNumRows()); // batchSize .. since X:(N, T*D)
 		long numColsX = X.getNumColumns();
 		int T = toInt(numColsX/ D); // since X:(N, T*D) ... seqLength
-		Pointer cudnnInput = gCtx.allocate(instructionName, (N*T*D)*LibMatrixCUDA.sizeOfDataType);
+		Pointer cudnnInput = gCtx.allocate(instructionName, (N*T*D)*LibMatrixCUDA.sizeOfDataType, false);
 		LibMatrixCUDA.getCudaKernels(gCtx).launchKernel("prepare_lstm_input",
 				ExecutionConfig.getConfigForSimpleVectorOperations(N*T*D),
 				xPointer, cudnnInput, N, D, T*D, N*T*D);
@@ -702,7 +703,7 @@ private void processLstmInstruction(ExecutionContext ec) throws DMLRuntimeExcept
 		int D = toInt(numRowsW) - M; // since W:(D+M, 4M) ... numFeatures 
 		Pointer sysdsWPointer = LibMatrixCuDNN.getDensePointerForCuDNN(gCtx, W, instructionName, D+M, 4*M);
 		Pointer sysdsBiasPointer = LibMatrixCuDNN.getDensePointerForCuDNN(gCtx, bias, instructionName, 1, 4*M);
-		Pointer cudnnWPointer = gCtx.allocate(instructionName, (D+M+2)*(4*M)*LibMatrixCUDA.sizeOfDataType);
+		Pointer cudnnWPointer = gCtx.allocate(instructionName, (D+M+2)*(4*M)*LibMatrixCUDA.sizeOfDataType, false);
 		LibMatrixCUDA.getCudaKernels(gCtx).launchKernel("prepare_lstm_weight",
 				ExecutionConfig.getConfigForSimpleVectorOperations((D+M+2)*(4*M)),
 				sysdsWPointer, sysdsBiasPointer, cudnnWPointer, D, M);
@@ -717,7 +718,7 @@ private void processLstmInstruction(ExecutionContext ec) throws DMLRuntimeExcept
 		int N = toInt(X.getNumRows()); // batchSize .. since X:(N, T*D)
 		long numColsX = X.getNumColumns();
 		int T = toInt(numColsX/ D); // since X:(N, T*D) ... seqLength
-		Pointer cudnnInput = gCtx.allocate(instructionName, (N*T*D)*LibMatrixCUDA.sizeOfDataType);
+		Pointer cudnnInput = gCtx.allocate(instructionName, (N*T*D)*LibMatrixCUDA.sizeOfDataType, false);
 		LibMatrixCUDA.getCudaKernels(gCtx).launchKernel("prepare_lstm_input",
 				ExecutionConfig.getConfigForSimpleVectorOperations(N*T*D),
 				xPointer, cudnnInput, N, D, T*D, N*T*D);

diff --git a/src/main/java/org/apache/sysds/runtime/instructions/gpu/GPUInstruction.java b/src/main/java/org/apache/sysds/runtime/instructions/gpu/GPUInstruction.java
@@ -252,7 +252,13 @@ protected MatrixObject getMatrixInputForGPUInstruction(ExecutionContext ec, Stri
 	 * @return	the matrix object
 	 */
 	protected MatrixObject getDenseMatrixOutputForGPUInstruction(ExecutionContext ec, String name, long numRows, long numCols) {
-		return ec.getDenseMatrixOutputForGPUInstruction(name, numRows, numCols).getKey();
+		return getDenseMatrixOutputForGPUInstruction(ec, name, numRows, numCols, true);
+	}
+
+	protected MatrixObject getDenseMatrixOutputForGPUInstruction(ExecutionContext ec, String name, long numRows, long numCols,
+		boolean initialize)
+	{
+		return ec.getDenseMatrixOutputForGPUInstruction(name, numRows, numCols, initialize).getKey();
 	}
 
 	@Override

diff --git a/src/main/java/org/apache/sysds/runtime/instructions/gpu/MatrixReshapeGPUInstruction.java b/src/main/java/org/apache/sysds/runtime/instructions/gpu/MatrixReshapeGPUInstruction.java
@@ -85,7 +85,8 @@ public void processInstruction(ExecutionContext ec) {
 		}
 		// We currently support only dense rshape
 		Pointer inPtr = LibMatrixCUDA.getDensePointer(gCtx, mat, instName);
-		MatrixObject out = LibMatrixCUDA.getDenseMatrixOutputForGPUInstruction(ec, instName, _output.getName(), rows, cols);
+		MatrixObject out = LibMatrixCUDA.getDenseMatrixOutputForGPUInstruction(ec, instName, _output.getName(), rows,
+			cols, false);
 		Pointer outPtr = LibMatrixCUDA.getDensePointer(gCtx, out, instName);
 		if(byRow.getBooleanValue()) {
 			// byrow = TRUE is simple memcpy and metadata update

diff --git a/src/main/java/org/apache/sysds/runtime/instructions/gpu/context/CSRPointer.java b/src/main/java/org/apache/sysds/runtime/instructions/gpu/context/CSRPointer.java
@@ -265,12 +265,15 @@ public static CSRPointer allocateForMatrixMultiply(GPUContext gCtx, cusparseHand
 	 * @param gCtx a valid {@link GPUContext}
 	 * @param nnz2 number of non-zeroes
 	 * @param rows number of rows
+	 * @param initialize memset to zero?
 	 * @return a {@link CSRPointer} instance that encapsulates the CSR matrix on GPU
 	 */
-	public static CSRPointer allocateEmpty(GPUContext gCtx, long nnz2, long rows) {
+	public static CSRPointer allocateEmpty(GPUContext gCtx, long nnz2, long rows, boolean initialize) {
 		LOG.trace("GPU : allocateEmpty from CSRPointer with nnz=" + nnz2 + " and rows=" + rows + ", GPUContext=" + gCtx);
-		if(nnz2 < 0) throw new DMLRuntimeException("Incorrect usage of internal API, number of non zeroes is less than 0 when trying to allocate sparse data on GPU");
-		if(rows <= 0) throw new DMLRuntimeException("Incorrect usage of internal API, number of rows is less than or equal to 0 when trying to allocate sparse data on GPU");
+		if(nnz2 < 0) throw new DMLRuntimeException("Incorrect usage of internal API, number of non zeroes is less " +
+			"than 0 when trying to allocate sparse data on GPU");
+		if(rows <= 0) throw new DMLRuntimeException("Incorrect usage of internal API, number of rows is less than or " +
+			"equal to 0 when trying to allocate sparse data on GPU");
 		CSRPointer r = new CSRPointer(gCtx);
 		r.nnz = nnz2;
 		if (nnz2 == 0) {
@@ -279,12 +282,16 @@ public static CSRPointer allocateEmpty(GPUContext gCtx, long nnz2, long rows) {
 			return r;
 		}
 		// increment the cudaCount by 1 for the allocation of all 3 arrays
-		r.val = gCtx.allocate(null, getDataTypeSizeOf(nnz2));
-		r.rowPtr = gCtx.allocate(null, getIntSizeOf(rows + 1));
-		r.colInd = gCtx.allocate(null, getIntSizeOf(nnz2));
+		r.val = gCtx.allocate(null, getDataTypeSizeOf(nnz2), initialize);
+		r.rowPtr = gCtx.allocate(null, getIntSizeOf(rows + 1), initialize);
+		r.colInd = gCtx.allocate(null, getIntSizeOf(nnz2), initialize);
 		return r;
 	}
 
+	public static CSRPointer allocateEmpty(GPUContext gCtx, long nnz2, long rows) {
+		return allocateEmpty(gCtx, nnz2, rows, true);
+	}
+
 	/**
 	 * Allocate row pointers of m+1 elements
 	 *
@@ -296,9 +303,9 @@ public static CSRPointer allocateEmpty(GPUContext gCtx, long nnz2, long rows) {
 	private static void step1AllocateRowPointers(GPUContext gCtx, cusparseHandle handle, CSRPointer C, int rowsC) {
 		LOG.trace("GPU : step1AllocateRowPointers" + ", GPUContext=" + gCtx);
 		cusparseSetPointerMode(handle, cusparsePointerMode.CUSPARSE_POINTER_MODE_HOST);
-		//cudaDeviceSynchronize;
+
 		// Do not increment the cudaCount of allocations on GPU
-		C.rowPtr = gCtx.allocate(null, getIntSizeOf((long) rowsC + 1));
+		C.rowPtr = gCtx.allocate(null, getIntSizeOf((long) rowsC + 1), true);
 	}
 
 	/**
@@ -375,8 +382,9 @@ private static void step2GatherNNZGemm(GPUContext gCtx, cusparseHandle handle, C
 	private static void step3AllocateValNInd(GPUContext gCtx, cusparseHandle handle, CSRPointer C) {
 		LOG.trace("GPU : step3AllocateValNInd" + ", GPUContext=" + gCtx);
 		// Increment cudaCount by one when all three arrays of CSR sparse array are allocated
-		C.val = gCtx.allocate(null, getDataTypeSizeOf(C.nnz));
-		C.colInd = gCtx.allocate(null, getIntSizeOf(C.nnz));
+
+		C.val = gCtx.allocate(null, getDataTypeSizeOf(C.nnz), false);
+		C.colInd = gCtx.allocate(null, getIntSizeOf(C.nnz), false);
 	}
 
 	// ==============================================================================================
@@ -402,17 +410,17 @@ public CSRPointer clone(int rows) {
 		CSRPointer that = new CSRPointer(me.getGPUContext());
 		that.allocateMatDescrPointer();
 		that.nnz = me.nnz;
-		that.val = allocate(that.nnz * LibMatrixCUDA.sizeOfDataType);
-		that.rowPtr = allocate(rows * Sizeof.INT);
-		that.colInd = allocate(that.nnz * Sizeof.INT);
+		that.val = allocate(that.nnz * LibMatrixCUDA.sizeOfDataType, false);
+		that.rowPtr = allocate(rows * Sizeof.INT, false);
+		that.colInd = allocate(that.nnz * Sizeof.INT, false);
 		cudaMemcpy(that.val, me.val, that.nnz * LibMatrixCUDA.sizeOfDataType, cudaMemcpyDeviceToDevice);
 		cudaMemcpy(that.rowPtr, me.rowPtr, rows * Sizeof.INT, cudaMemcpyDeviceToDevice);
 		cudaMemcpy(that.colInd, me.colInd, that.nnz * Sizeof.INT, cudaMemcpyDeviceToDevice);
 		return that;
 	}
 
-	private Pointer allocate(long size) {
-		return getGPUContext().allocate(null, size);
+	private Pointer allocate(long size, boolean initialize) {
+		return getGPUContext().allocate(null, size, initialize);
 	}
 
 	private GPUContext getGPUContext() {
@@ -460,7 +468,7 @@ public Pointer toColumnMajorDenseMatrix(cusparseHandle cusparseHandle, cublasHan
 		LOG.trace("GPU : sparse -> column major dense (inside CSRPointer) on " + this + ", GPUContext="
 				+ getGPUContext());
 		long size = rows * getDataTypeSizeOf(cols);
-		Pointer A = allocate(size);
+		Pointer A = allocate(size, false);
 		// If this sparse block is empty, the allocated dense matrix, initialized to zeroes, will be returned.
 		if (val != null && rowPtr != null && colInd != null && nnz > 0) {
 			// Note: cusparseDcsr2dense method cannot handle empty blocks

diff --git a/src/main/java/org/apache/sysds/runtime/instructions/gpu/context/GPUContext.java b/src/main/java/org/apache/sysds/runtime/instructions/gpu/context/GPUContext.java
@@ -82,20 +82,21 @@ public class GPUContext {
 	/**
 	 * cusolverDnHandle for invoking solve() function on dense matrices on the GPU
 	 */
-	private cusolverDnHandle cusolverDnHandle;
+	private volatile cusolverDnHandle cusolverDnHandle;
 	/**
 	 * to launch custom CUDA kernel, specific to the active GPU for this GPUContext
 	 */
 	private JCudaKernels kernels;
 
-	private GPUMemoryManager memoryManager;
+	private final GPUMemoryManager memoryManager;
 
 	public GPUMemoryManager getMemoryManager() {
 		return memoryManager;
 	}
 
 	protected GPUContext(int deviceNum) {
 		this.deviceNum = deviceNum;
+
 		cudaSetDevice(deviceNum);
 
 		cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync);
@@ -175,7 +176,7 @@ public int getDeviceNum() {
 	 * Sets the device for the calling thread.
 	 * This method must be called after
 	 * {@link org.apache.sysds.runtime.controlprogram.context.ExecutionContext#getGPUContext(int)}
-	 * If in a multi-threaded environment like parfor, this method must be called when in the
+	 * If in a multithreaded environment like parfor, this method must be called when in the
 	 * appropriate thread.
 	 *
 	 */
@@ -187,18 +188,32 @@ public void initializeThread() {
 	/**
 	 * Invokes memory manager's malloc method
 	 *
-	 * @param instructionName name of instruction for which to record per instruction performance statistics, null if don't want to record
+	 * @param instructionName name of instruction for which to record per instruction performance statistics, null if
+	 *                        you don't want to record
 	 * @param size            size of data (in bytes) to allocate
+	 * @param initialize 	  if cudaMemset() should be called
+	 * @return jcuda pointer
+	 */
+	public Pointer allocate(String instructionName, long size, boolean initialize) {
+		return memoryManager.malloc(instructionName, size, initialize);
+	}
+
+	/**
+	 * Default behavior for gpu memory allocation (init to zero)
+	 *
+	 * @param instructionName Name of the instruction calling allocate
+	 * @param size size in bytes
 	 * @return jcuda pointer
 	 */
 	public Pointer allocate(String instructionName, long size) {
-		return memoryManager.malloc(instructionName, size);
+		return memoryManager.malloc(instructionName, size, true);
 	}
 
 	/**
 	 * Does cudaFree calls, lazily.
 	 *
-	 * @param instructionName name of the instruction for which to record per instruction free time, null if do not want to record
+	 * @param instructionName name of the instruction for which to record per instruction free time, null if you do not
+	 *                        want to record
 	 * @param toFree          {@link Pointer} instance to be freed
 	 * @param eager           true if to be done eagerly
 	 */
@@ -389,7 +404,7 @@ public JCudaKernels getKernels() {
 	 */
 	public void destroy() {
 		if (LOG.isTraceEnabled()) {
-			LOG.trace("GPU : this context was destroyed, this = " + this.toString());
+			LOG.trace("GPU : this context was destroyed, this = " + this);
 		}
 		clearMemory();
 

diff --git a/src/main/java/org/apache/sysds/runtime/instructions/gpu/context/GPUMemoryManager.java b/src/main/java/org/apache/sysds/runtime/instructions/gpu/context/GPUMemoryManager.java
@@ -245,9 +245,10 @@ private static String byteCountToDisplaySize(long numBytes) {
 	 * 
 	 * @param opcode instruction name
 	 * @param size size in bytes
+	 * @param initialize if cudaMemset() should be called
 	 * @return allocated pointer
 	 */
-	public Pointer malloc(String opcode, long size) {
+	public Pointer malloc(String opcode, long size, boolean initialize) {
 		if(size < 0) {
 			throw new DMLRuntimeException("Cannot allocate memory of size " + byteCountToDisplaySize(size));
 		}
@@ -432,7 +433,6 @@ public Pointer malloc(String opcode, long size) {
 			}
 		}
 
-
 		// Step 8: Handle defragmentation
 		if(A == null) {
 			LOG.warn("Potential fragmentation of the GPU memory. Forcibly evicting all ...");
@@ -448,11 +448,12 @@ public Pointer malloc(String opcode, long size) {
 		}
 
 		long t0 = DMLScript.STATISTICS ? System.nanoTime() : 0;
-		cudaMemset(A, 0, size);
+		if(initialize)
+			cudaMemset(A, 0, size);
 		addMiscTime(opcode, GPUStatistics.cudaMemSet0Time, GPUStatistics.cudaMemSet0Count, GPUInstruction.MISC_TIMER_SET_ZERO, t0);
 		return A;
 	}
-	
+
 	private int worstCaseContiguousMemorySizeCompare(GPUObject o1, GPUObject o2) {
 		long ret = matrixMemoryManager.getWorstCaseContiguousMemorySize(o1) - matrixMemoryManager.getWorstCaseContiguousMemorySize(o2);
 		return ret < 0 ? -1 : (ret == 0 ? 0 : 1);