From 06b2090105355cb2b2c2050f5a05a10493373ff1 Mon Sep 17 00:00:00 2001
From: Mark Dokter <mark@dokter.cc>
Date: Mon, 14 Jun 2021 14:08:55 +0200
Subject: [PATCH] [SYSTEMDS-3022] Avoid cudaMemset() where possible

This improvement tries to avoid calling cudaMemset() after allocating memory if there is a subsequent write to the whole buffer happening.

Closes #1572
---
 .../runtime/codegen/SpoofCUDACellwise.java    |  4 +-
 .../runtime/codegen/SpoofCUDAOperator.java    |  6 +-
 .../runtime/codegen/SpoofCUDARowwise.java     |  2 +-
 .../context/ExecutionContext.java             | 16 ++++-
 .../instructions/gpu/DnnGPUInstruction.java   |  9 +--
 .../instructions/gpu/GPUInstruction.java      |  8 ++-
 .../gpu/MatrixReshapeGPUInstruction.java      |  3 +-
 .../instructions/gpu/context/CSRPointer.java  | 40 +++++++-----
 .../instructions/gpu/context/GPUContext.java  | 29 +++++++--
 .../gpu/context/GPUMemoryManager.java         |  9 +--
 .../instructions/gpu/context/GPUObject.java   | 64 ++++++++++---------
 .../gpu/context/ShadowBuffer.java             |  2 +-
 .../runtime/matrix/data/LibMatrixCUDA.java    | 54 ++++++++++------
 .../runtime/matrix/data/LibMatrixCuDNN.java   | 27 ++++----
 .../LibMatrixCuDNNConvolutionAlgorithm.java   | 10 +--
 .../data/LibMatrixCuDNNInputRowFetcher.java   |  4 +-
 .../data/LibMatrixCuDNNRnnAlgorithm.java      |  6 +-
 .../matrix/data/LibMatrixCuMatMult.java       |  2 +-
 .../SinglePrecisionCudaSupportFunctions.java  |  4 +-
 19 files changed, 183 insertions(+), 116 deletions(-)
diff --git a/src/main/java/org/apache/sysds/runtime/codegen/SpoofCUDACellwise.java b/src/main/java/org/apache/sysds/runtime/codegen/SpoofCUDACellwise.java
index 5a3211435a7..03c35da5405 100644
--- a/src/main/java/org/apache/sysds/runtime/codegen/SpoofCUDACellwise.java
+++ b/src/main/java/org/apache/sysds/runtime/codegen/SpoofCUDACellwise.java
@@ -93,8 +93,8 @@ else if(_type == SpoofCellwise.CellType.ROW_AGG)
 		long nnz = in_obj.getNnz("spoofCUDA" + getSpoofType(), false);
 		MatrixObject out_obj = sparseOut ?
 				(ec.getSparseMatrixOutputForGPUInstruction(outputName, out_rows, out_cols, (isSparseSafe() && nnz > 0) ?
-						nnz : out_rows * out_cols).getKey()) :
-				(ec.getDenseMatrixOutputForGPUInstruction(outputName, out_rows, out_cols).getKey());
+						nnz : out_rows * out_cols, false).getKey()) :
+				(ec.getDenseMatrixOutputForGPUInstruction(outputName, out_rows, out_cols, false).getKey());
 
 		packDataForTransfer(ec, inputs, scalarObjects, out_obj, 1, ID, 0,false, null);
 		if(NotEmpty(in_obj) || !sparseSafe) {
diff --git a/src/main/java/org/apache/sysds/runtime/codegen/SpoofCUDAOperator.java b/src/main/java/org/apache/sysds/runtime/codegen/SpoofCUDAOperator.java
index 02fbf961aad..8b0aa6432c1 100644
--- a/src/main/java/org/apache/sysds/runtime/codegen/SpoofCUDAOperator.java
+++ b/src/main/java/org/apache/sysds/runtime/codegen/SpoofCUDAOperator.java
@@ -72,7 +72,7 @@ default void prepareMatrixPointers(ByteBuffer buf, ExecutionContext ec, MatrixOb
 				int rows = (int)mo.getNumRows();
 				int cols = (int)mo.getNumColumns();
 				Pointer b1 = mo.getGPUObject(ec.getGPUContext(0)).getDensePointer();
-				Pointer ptr = ec.getGPUContext(0).allocate(getName(), (long) rows * cols * sizeOfDataType);
+				Pointer ptr = ec.getGPUContext(0).allocate(getName(), (long) rows * cols * sizeOfDataType, false);
 				LibMatrixCUDA.denseTranspose(ec, ec.getGPUContext(0), getName(), b1, ptr, rows, cols);
 				writeMatrixDescriptorToBuffer(buf, rows, cols, 0, 0, GPUObject.getPointerAddress(ptr), mo.getNnz());
 			} else {
@@ -115,8 +115,10 @@ default void packDataForTransfer(ExecutionContext ec, ArrayList<MatrixObject> in
 				int NT = 256;
 				long N = inputs.get(0).getNumRows() * inputs.get(0).getNumColumns();
 				num_blocks = ((N + NT * 2 - 1) / (NT * 2));
+				ptr[0] = ec.getGPUContext(0).allocate(getName(), LibMatrixCUDA.sizeOfDataType * num_blocks, false);
 			}
-			ptr[0] = ec.getGPUContext(0).allocate(getName(), LibMatrixCUDA.sizeOfDataType * num_blocks);
+			else
+				ptr[0] = ec.getGPUContext(0).allocate(getName(), LibMatrixCUDA.sizeOfDataType * num_blocks, true);
 			writeMatrixDescriptorToBuffer(buf, 1, 1, 0, 0, GPUObject.getPointerAddress(ptr[0]), 1);
 		}
  		else {
diff --git a/src/main/java/org/apache/sysds/runtime/codegen/SpoofCUDARowwise.java b/src/main/java/org/apache/sysds/runtime/codegen/SpoofCUDARowwise.java
index 2632faeb84f..47826a94618 100644
--- a/src/main/java/org/apache/sysds/runtime/codegen/SpoofCUDARowwise.java
+++ b/src/main/java/org/apache/sysds/runtime/codegen/SpoofCUDARowwise.java
@@ -77,7 +77,7 @@ public MatrixObject execute(ExecutionContext ec, ArrayList<MatrixObject> inputs,
 				hasMatrixObjectSideInput(inputs) ? getMinColsMatrixObjectSideInputs(inputs) : -1;
 		OutputDimensions out_dims = new OutputDimensions(m, n, n2);
 		ec.setMetaData(outputName, out_dims.rows, out_dims.cols);
-		MatrixObject out_obj = ec.getDenseMatrixOutputForGPUInstruction(outputName, out_dims.rows, out_dims.cols).getKey();
+		MatrixObject out_obj = ec.getDenseMatrixOutputForGPUInstruction(outputName, out_dims.rows, out_dims.cols, false).getKey();
 
 		packDataForTransfer(ec, inputs, scalarObjects, out_obj, 1, ID, 0,_tB1, null);
 
diff --git a/src/main/java/org/apache/sysds/runtime/controlprogram/context/ExecutionContext.java b/src/main/java/org/apache/sysds/runtime/controlprogram/context/ExecutionContext.java
index 925b34b46d7..e398abd32b0 100644
--- a/src/main/java/org/apache/sysds/runtime/controlprogram/context/ExecutionContext.java
+++ b/src/main/java/org/apache/sysds/runtime/controlprogram/context/ExecutionContext.java
@@ -373,8 +373,14 @@ private static long validateDimensions(long d1, long d2) {
 	 * @return a pair containing the wrapping {@link MatrixObject} and a boolean indicating whether a cuda memory allocation took place (as opposed to the space already being allocated)
 	 */
 	public Pair<MatrixObject, Boolean> getDenseMatrixOutputForGPUInstruction(String varName, long numRows, long numCols) {
+		return 	getDenseMatrixOutputForGPUInstruction(varName, numRows, numCols, true);
+	}
+
+	public Pair<MatrixObject, Boolean> getDenseMatrixOutputForGPUInstruction(String varName, long numRows, long numCols,
+		boolean initialize)
+	{
 		MatrixObject mo = allocateGPUMatrixObject(varName, numRows, numCols);
-		boolean allocated = mo.getGPUObject(getGPUContext(0)).acquireDeviceModifyDense();
+		boolean allocated = mo.getGPUObject(getGPUContext(0)).acquireDeviceModifyDense(initialize);
 		mo.getDataCharacteristics().setNonZeros(-1);
 		return new Pair<>(mo, allocated);
 	}
@@ -390,9 +396,15 @@ public Pair<MatrixObject, Boolean> getDenseMatrixOutputForGPUInstruction(String
 	 * @return matrix object
 	 */
 	public Pair<MatrixObject, Boolean> getSparseMatrixOutputForGPUInstruction(String varName, long numRows, long numCols, long nnz) {
+		return getSparseMatrixOutputForGPUInstruction(varName, numRows, numCols, nnz, true);
+	}
+
+	public Pair<MatrixObject, Boolean> getSparseMatrixOutputForGPUInstruction(String varName, long numRows, long numCols,
+		long nnz, boolean initialize)
+	{
 		MatrixObject mo = allocateGPUMatrixObject(varName, numRows, numCols);
 		mo.getDataCharacteristics().setNonZeros(nnz);
-		boolean allocated = mo.getGPUObject(getGPUContext(0)).acquireDeviceModifySparse();
+		boolean allocated = mo.getGPUObject(getGPUContext(0)).acquireDeviceModifySparse(initialize);
 		return new Pair<>(mo, allocated);
 	}
 
diff --git a/src/main/java/org/apache/sysds/runtime/instructions/gpu/DnnGPUInstruction.java b/src/main/java/org/apache/sysds/runtime/instructions/gpu/DnnGPUInstruction.java
index 3ac1b5ad07a..285906a5241 100644
--- a/src/main/java/org/apache/sysds/runtime/instructions/gpu/DnnGPUInstruction.java
+++ b/src/main/java/org/apache/sysds/runtime/instructions/gpu/DnnGPUInstruction.java
@@ -637,7 +637,8 @@ private void processLstmBackwardInstruction(ExecutionContext ec) throws DMLRunti
 		int D = toInt(numRowsW) - M; // since W:(D+M, 4M) ... numFeatures 
 		Pointer sysdsWPointer = LibMatrixCuDNN.getDensePointerForCuDNN(gCtx, W, instructionName, D+M, 4*M);
 		Pointer sysdsBiasPointer = LibMatrixCuDNN.getDensePointerForCuDNN(gCtx, bias, instructionName, 1, 4*M);
-		Pointer cudnnWPointer = gCtx.allocate(instructionName, (D+M+2)*(4*M)*LibMatrixCUDA.sizeOfDataType);
+		// TODO: find out if memset is necessary
+		Pointer cudnnWPointer = gCtx.allocate(instructionName, (D+M+2)*(4*M)*LibMatrixCUDA.sizeOfDataType, true);
 		LibMatrixCUDA.getCudaKernels(gCtx).launchKernel("prepare_lstm_weight",
 				ExecutionConfig.getConfigForSimpleVectorOperations((D+M+2)*(4*M)),
 				sysdsWPointer, sysdsBiasPointer, cudnnWPointer, D, M);
@@ -650,7 +651,7 @@ private void processLstmBackwardInstruction(ExecutionContext ec) throws DMLRunti
 		int N = toInt(X.getNumRows()); // batchSize .. since X:(N, T*D)
 		long numColsX = X.getNumColumns();
 		int T = toInt(numColsX/ D); // since X:(N, T*D) ... seqLength
-		Pointer cudnnInput = gCtx.allocate(instructionName, (N*T*D)*LibMatrixCUDA.sizeOfDataType);
+		Pointer cudnnInput = gCtx.allocate(instructionName, (N*T*D)*LibMatrixCUDA.sizeOfDataType, false);
 		LibMatrixCUDA.getCudaKernels(gCtx).launchKernel("prepare_lstm_input",
 				ExecutionConfig.getConfigForSimpleVectorOperations(N*T*D),
 				xPointer, cudnnInput, N, D, T*D, N*T*D);
@@ -702,7 +703,7 @@ private void processLstmInstruction(ExecutionContext ec) throws DMLRuntimeExcept
 		int D = toInt(numRowsW) - M; // since W:(D+M, 4M) ... numFeatures 
 		Pointer sysdsWPointer = LibMatrixCuDNN.getDensePointerForCuDNN(gCtx, W, instructionName, D+M, 4*M);
 		Pointer sysdsBiasPointer = LibMatrixCuDNN.getDensePointerForCuDNN(gCtx, bias, instructionName, 1, 4*M);
-		Pointer cudnnWPointer = gCtx.allocate(instructionName, (D+M+2)*(4*M)*LibMatrixCUDA.sizeOfDataType);
+		Pointer cudnnWPointer = gCtx.allocate(instructionName, (D+M+2)*(4*M)*LibMatrixCUDA.sizeOfDataType, false);
 		LibMatrixCUDA.getCudaKernels(gCtx).launchKernel("prepare_lstm_weight",
 				ExecutionConfig.getConfigForSimpleVectorOperations((D+M+2)*(4*M)),
 				sysdsWPointer, sysdsBiasPointer, cudnnWPointer, D, M);
@@ -717,7 +718,7 @@ private void processLstmInstruction(ExecutionContext ec) throws DMLRuntimeExcept
 		int N = toInt(X.getNumRows()); // batchSize .. since X:(N, T*D)
 		long numColsX = X.getNumColumns();
 		int T = toInt(numColsX/ D); // since X:(N, T*D) ... seqLength
-		Pointer cudnnInput = gCtx.allocate(instructionName, (N*T*D)*LibMatrixCUDA.sizeOfDataType);
+		Pointer cudnnInput = gCtx.allocate(instructionName, (N*T*D)*LibMatrixCUDA.sizeOfDataType, false);
 		LibMatrixCUDA.getCudaKernels(gCtx).launchKernel("prepare_lstm_input",
 				ExecutionConfig.getConfigForSimpleVectorOperations(N*T*D),
 				xPointer, cudnnInput, N, D, T*D, N*T*D);
diff --git a/src/main/java/org/apache/sysds/runtime/instructions/gpu/GPUInstruction.java b/src/main/java/org/apache/sysds/runtime/instructions/gpu/GPUInstruction.java
index 700145b55c1..50afdcfca8f 100644
--- a/src/main/java/org/apache/sysds/runtime/instructions/gpu/GPUInstruction.java
+++ b/src/main/java/org/apache/sysds/runtime/instructions/gpu/GPUInstruction.java
@@ -252,7 +252,13 @@ protected MatrixObject getMatrixInputForGPUInstruction(ExecutionContext ec, Stri
 	 * @return	the matrix object
 	 */
 	protected MatrixObject getDenseMatrixOutputForGPUInstruction(ExecutionContext ec, String name, long numRows, long numCols) {
-		return ec.getDenseMatrixOutputForGPUInstruction(name, numRows, numCols).getKey();
+		return getDenseMatrixOutputForGPUInstruction(ec, name, numRows, numCols, true);
+	}
+
+	protected MatrixObject getDenseMatrixOutputForGPUInstruction(ExecutionContext ec, String name, long numRows, long numCols,
+		boolean initialize)
+	{
+		return ec.getDenseMatrixOutputForGPUInstruction(name, numRows, numCols, initialize).getKey();
 	}
 
 	@Override
diff --git a/src/main/java/org/apache/sysds/runtime/instructions/gpu/MatrixReshapeGPUInstruction.java b/src/main/java/org/apache/sysds/runtime/instructions/gpu/MatrixReshapeGPUInstruction.java
index 97d39cc7ba7..f82d712b45f 100644
--- a/src/main/java/org/apache/sysds/runtime/instructions/gpu/MatrixReshapeGPUInstruction.java
+++ b/src/main/java/org/apache/sysds/runtime/instructions/gpu/MatrixReshapeGPUInstruction.java
@@ -85,7 +85,8 @@ public void processInstruction(ExecutionContext ec) {
 		}
 		// We currently support only dense rshape
 		Pointer inPtr = LibMatrixCUDA.getDensePointer(gCtx, mat, instName);
-		MatrixObject out = LibMatrixCUDA.getDenseMatrixOutputForGPUInstruction(ec, instName, _output.getName(), rows, cols);
+		MatrixObject out = LibMatrixCUDA.getDenseMatrixOutputForGPUInstruction(ec, instName, _output.getName(), rows,
+			cols, false);
 		Pointer outPtr = LibMatrixCUDA.getDensePointer(gCtx, out, instName);
 		if(byRow.getBooleanValue()) {
 			// byrow = TRUE is simple memcpy and metadata update
diff --git a/src/main/java/org/apache/sysds/runtime/instructions/gpu/context/CSRPointer.java b/src/main/java/org/apache/sysds/runtime/instructions/gpu/context/CSRPointer.java
index 55d31302242..23ae4d1648e 100644
--- a/src/main/java/org/apache/sysds/runtime/instructions/gpu/context/CSRPointer.java
+++ b/src/main/java/org/apache/sysds/runtime/instructions/gpu/context/CSRPointer.java
@@ -265,12 +265,15 @@ public static CSRPointer allocateForMatrixMultiply(GPUContext gCtx, cusparseHand
 	 * @param gCtx a valid {@link GPUContext}
 	 * @param nnz2 number of non-zeroes
 	 * @param rows number of rows
+	 * @param initialize memset to zero?
 	 * @return a {@link CSRPointer} instance that encapsulates the CSR matrix on GPU
 	 */
-	public static CSRPointer allocateEmpty(GPUContext gCtx, long nnz2, long rows) {
+	public static CSRPointer allocateEmpty(GPUContext gCtx, long nnz2, long rows, boolean initialize) {
 		LOG.trace("GPU : allocateEmpty from CSRPointer with nnz=" + nnz2 + " and rows=" + rows + ", GPUContext=" + gCtx);
-		if(nnz2 < 0) throw new DMLRuntimeException("Incorrect usage of internal API, number of non zeroes is less than 0 when trying to allocate sparse data on GPU");
-		if(rows <= 0) throw new DMLRuntimeException("Incorrect usage of internal API, number of rows is less than or equal to 0 when trying to allocate sparse data on GPU");
+		if(nnz2 < 0) throw new DMLRuntimeException("Incorrect usage of internal API, number of non zeroes is less " +
+			"than 0 when trying to allocate sparse data on GPU");
+		if(rows <= 0) throw new DMLRuntimeException("Incorrect usage of internal API, number of rows is less than or " +
+			"equal to 0 when trying to allocate sparse data on GPU");
 		CSRPointer r = new CSRPointer(gCtx);
 		r.nnz = nnz2;
 		if (nnz2 == 0) {
@@ -279,12 +282,16 @@ public static CSRPointer allocateEmpty(GPUContext gCtx, long nnz2, long rows) {
 			return r;
 		}
 		// increment the cudaCount by 1 for the allocation of all 3 arrays
-		r.val = gCtx.allocate(null, getDataTypeSizeOf(nnz2));
-		r.rowPtr = gCtx.allocate(null, getIntSizeOf(rows + 1));
-		r.colInd = gCtx.allocate(null, getIntSizeOf(nnz2));
+		r.val = gCtx.allocate(null, getDataTypeSizeOf(nnz2), initialize);
+		r.rowPtr = gCtx.allocate(null, getIntSizeOf(rows + 1), initialize);
+		r.colInd = gCtx.allocate(null, getIntSizeOf(nnz2), initialize);
 		return r;
 	}
 
+	public static CSRPointer allocateEmpty(GPUContext gCtx, long nnz2, long rows) {
+		return allocateEmpty(gCtx, nnz2, rows, true);
+	}
+
 	/**
 	 * Allocate row pointers of m+1 elements
 	 *
@@ -296,9 +303,9 @@ public static CSRPointer allocateEmpty(GPUContext gCtx, long nnz2, long rows) {
 	private static void step1AllocateRowPointers(GPUContext gCtx, cusparseHandle handle, CSRPointer C, int rowsC) {
 		LOG.trace("GPU : step1AllocateRowPointers" + ", GPUContext=" + gCtx);
 		cusparseSetPointerMode(handle, cusparsePointerMode.CUSPARSE_POINTER_MODE_HOST);
-		//cudaDeviceSynchronize;
+
 		// Do not increment the cudaCount of allocations on GPU
-		C.rowPtr = gCtx.allocate(null, getIntSizeOf((long) rowsC + 1));
+		C.rowPtr = gCtx.allocate(null, getIntSizeOf((long) rowsC + 1), true);
 	}
 
 	/**
@@ -375,8 +382,9 @@ private static void step2GatherNNZGemm(GPUContext gCtx, cusparseHandle handle, C
 	private static void step3AllocateValNInd(GPUContext gCtx, cusparseHandle handle, CSRPointer C) {
 		LOG.trace("GPU : step3AllocateValNInd" + ", GPUContext=" + gCtx);
 		// Increment cudaCount by one when all three arrays of CSR sparse array are allocated
-		C.val = gCtx.allocate(null, getDataTypeSizeOf(C.nnz));
-		C.colInd = gCtx.allocate(null, getIntSizeOf(C.nnz));
+
+		C.val = gCtx.allocate(null, getDataTypeSizeOf(C.nnz), false);
+		C.colInd = gCtx.allocate(null, getIntSizeOf(C.nnz), false);
 	}
 
 	// ==============================================================================================
@@ -402,17 +410,17 @@ public CSRPointer clone(int rows) {
 		CSRPointer that = new CSRPointer(me.getGPUContext());
 		that.allocateMatDescrPointer();
 		that.nnz = me.nnz;
-		that.val = allocate(that.nnz * LibMatrixCUDA.sizeOfDataType);
-		that.rowPtr = allocate(rows * Sizeof.INT);
-		that.colInd = allocate(that.nnz * Sizeof.INT);
+		that.val = allocate(that.nnz * LibMatrixCUDA.sizeOfDataType, false);
+		that.rowPtr = allocate(rows * Sizeof.INT, false);
+		that.colInd = allocate(that.nnz * Sizeof.INT, false);
 		cudaMemcpy(that.val, me.val, that.nnz * LibMatrixCUDA.sizeOfDataType, cudaMemcpyDeviceToDevice);
 		cudaMemcpy(that.rowPtr, me.rowPtr, rows * Sizeof.INT, cudaMemcpyDeviceToDevice);
 		cudaMemcpy(that.colInd, me.colInd, that.nnz * Sizeof.INT, cudaMemcpyDeviceToDevice);
 		return that;
 	}
 
-	private Pointer allocate(long size) {
-		return getGPUContext().allocate(null, size);
+	private Pointer allocate(long size, boolean initialize) {
+		return getGPUContext().allocate(null, size, initialize);
 	}
 
 	private GPUContext getGPUContext() {
@@ -460,7 +468,7 @@ public Pointer toColumnMajorDenseMatrix(cusparseHandle cusparseHandle, cublasHan
 		LOG.trace("GPU : sparse -> column major dense (inside CSRPointer) on " + this + ", GPUContext="
 				+ getGPUContext());
 		long size = rows * getDataTypeSizeOf(cols);
-		Pointer A = allocate(size);
+		Pointer A = allocate(size, false);
 		// If this sparse block is empty, the allocated dense matrix, initialized to zeroes, will be returned.
 		if (val != null && rowPtr != null && colInd != null && nnz > 0) {
 			// Note: cusparseDcsr2dense method cannot handle empty blocks
diff --git a/src/main/java/org/apache/sysds/runtime/instructions/gpu/context/GPUContext.java b/src/main/java/org/apache/sysds/runtime/instructions/gpu/context/GPUContext.java
index 7ad030bb9a7..5ad9bc52a33 100644
--- a/src/main/java/org/apache/sysds/runtime/instructions/gpu/context/GPUContext.java
+++ b/src/main/java/org/apache/sysds/runtime/instructions/gpu/context/GPUContext.java
@@ -82,13 +82,13 @@ public class GPUContext {
 	/**
 	 * cusolverDnHandle for invoking solve() function on dense matrices on the GPU
 	 */
-	private cusolverDnHandle cusolverDnHandle;
+	private volatile cusolverDnHandle cusolverDnHandle;
 	/**
 	 * to launch custom CUDA kernel, specific to the active GPU for this GPUContext
 	 */
 	private JCudaKernels kernels;
 	
-	private GPUMemoryManager memoryManager;
+	private final GPUMemoryManager memoryManager;
 	
 	public GPUMemoryManager getMemoryManager() {
 		return memoryManager;
@@ -96,6 +96,7 @@ public GPUMemoryManager getMemoryManager() {
 
 	protected GPUContext(int deviceNum) {
 		this.deviceNum = deviceNum;
+
 		cudaSetDevice(deviceNum);
 
 		cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync);
@@ -175,7 +176,7 @@ public int getDeviceNum() {
 	 * Sets the device for the calling thread.
 	 * This method must be called after
 	 * {@link org.apache.sysds.runtime.controlprogram.context.ExecutionContext#getGPUContext(int)}
-	 * If in a multi-threaded environment like parfor, this method must be called when in the
+	 * If in a multithreaded environment like parfor, this method must be called when in the
 	 * appropriate thread.
 	 *
 	 */
@@ -187,18 +188,32 @@ public void initializeThread() {
 	/**
 	 * Invokes memory manager's malloc method
 	 *
-	 * @param instructionName name of instruction for which to record per instruction performance statistics, null if don't want to record
+	 * @param instructionName name of instruction for which to record per instruction performance statistics, null if
+	 *                        you don't want to record
 	 * @param size            size of data (in bytes) to allocate
+	 * @param initialize 	  if cudaMemset() should be called
+	 * @return jcuda pointer
+	 */
+	public Pointer allocate(String instructionName, long size, boolean initialize) {
+		return memoryManager.malloc(instructionName, size, initialize);
+	}
+
+	/**
+	 * Default behavior for gpu memory allocation (init to zero)
+	 *
+	 * @param instructionName Name of the instruction calling allocate
+	 * @param size size in bytes
 	 * @return jcuda pointer
 	 */
 	public Pointer allocate(String instructionName, long size) {
-		return memoryManager.malloc(instructionName, size);
+		return memoryManager.malloc(instructionName, size, true);
 	}
 
 	/**
 	 * Does cudaFree calls, lazily.
 	 *
-	 * @param instructionName name of the instruction for which to record per instruction free time, null if do not want to record
+	 * @param instructionName name of the instruction for which to record per instruction free time, null if you do not
+	 *                        want to record
 	 * @param toFree          {@link Pointer} instance to be freed
 	 * @param eager           true if to be done eagerly
 	 */
@@ -389,7 +404,7 @@ public JCudaKernels getKernels() {
 	 */
 	public void destroy() {
 		if (LOG.isTraceEnabled()) {
-			LOG.trace("GPU : this context was destroyed, this = " + this.toString());
+			LOG.trace("GPU : this context was destroyed, this = " + this);
 		}
 		clearMemory();
 
diff --git a/src/main/java/org/apache/sysds/runtime/instructions/gpu/context/GPUMemoryManager.java b/src/main/java/org/apache/sysds/runtime/instructions/gpu/context/GPUMemoryManager.java
index 7df6214e5b7..d6b62ad8907 100644
--- a/src/main/java/org/apache/sysds/runtime/instructions/gpu/context/GPUMemoryManager.java
+++ b/src/main/java/org/apache/sysds/runtime/instructions/gpu/context/GPUMemoryManager.java
@@ -245,9 +245,10 @@ private static String byteCountToDisplaySize(long numBytes) {
 	 * 
 	 * @param opcode instruction name
 	 * @param size size in bytes
+	 * @param initialize if cudaMemset() should be called
 	 * @return allocated pointer
 	 */
-	public Pointer malloc(String opcode, long size) {
+	public Pointer malloc(String opcode, long size, boolean initialize) {
 		if(size < 0) {
 			throw new DMLRuntimeException("Cannot allocate memory of size " + byteCountToDisplaySize(size));
 		}
@@ -432,7 +433,6 @@ public Pointer malloc(String opcode, long size) {
 			}
 		}
 		
-		
 		// Step 8: Handle defragmentation
 		if(A == null) {
 			LOG.warn("Potential fragmentation of the GPU memory. Forcibly evicting all ...");
@@ -448,11 +448,12 @@ public Pointer malloc(String opcode, long size) {
 		}
 		
 		long t0 = DMLScript.STATISTICS ? System.nanoTime() : 0;
-		cudaMemset(A, 0, size);
+		if(initialize)
+			cudaMemset(A, 0, size);
 		addMiscTime(opcode, GPUStatistics.cudaMemSet0Time, GPUStatistics.cudaMemSet0Count, GPUInstruction.MISC_TIMER_SET_ZERO, t0);
 		return A;
 	}
-	
+
 	private int worstCaseContiguousMemorySizeCompare(GPUObject o1, GPUObject o2) {
 		long ret = matrixMemoryManager.getWorstCaseContiguousMemorySize(o1) - matrixMemoryManager.getWorstCaseContiguousMemorySize(o2);
 		return ret < 0 ? -1 : (ret == 0 ? 0 : 1);
diff --git a/src/main/java/org/apache/sysds/runtime/instructions/gpu/context/GPUObject.java b/src/main/java/org/apache/sysds/runtime/instructions/gpu/context/GPUObject.java
index 633e3fc7d26..5ea922b99f4 100644
--- a/src/main/java/org/apache/sysds/runtime/instructions/gpu/context/GPUObject.java
+++ b/src/main/java/org/apache/sysds/runtime/instructions/gpu/context/GPUObject.java
@@ -202,7 +202,7 @@ public Object clone() {
 				long rows = me.mat.getNumRows();
 				long cols = me.mat.getNumColumns();
 				long size = rows * cols * LibMatrixCUDA.sizeOfDataType;
-				that.setDensePointer(allocate(size));
+				that.setDensePointer(allocate(size, false));
 				cudaMemcpy(that.getDensePointer(), me.getDensePointer(), size, cudaMemcpyDeviceToDevice);
 			}
 
@@ -218,17 +218,15 @@ public Object clone() {
 		return that;
 	}
 
-	private Pointer allocate(long size) {
-		return getGPUContext().allocate(null, size);
+	private Pointer allocate(long size, boolean initialize) {
+		return getGPUContext().allocate(null, size, initialize);
 	}
 
 	private void cudaFreeHelper(Pointer toFree) throws DMLRuntimeException {
 		getGPUContext().cudaFreeHelper(null, toFree, DMLScript.EAGER_CUDA_FREE);
 	}
 
-	GPUContext getGPUContext() {
-		return gpuContext;
-	}
+	GPUContext getGPUContext() { return gpuContext; }
 
 	/**
 	 * Transposes a dense matrix on the GPU by calling the cublasDgeam operation
@@ -248,7 +246,7 @@ public static Pointer transpose(GPUContext gCtx, Pointer densePtr, int m, int n,
 		Pointer alpha = LibMatrixCUDA.one();
 		Pointer beta = LibMatrixCUDA.zero();
 		Pointer A = densePtr;
-		Pointer C = gCtx.allocate(null, m * getDatatypeSizeOf(n));
+		Pointer C = gCtx.allocate(null, m * getDatatypeSizeOf(n), false);
 
 		// Transpose the matrix to get a dense matrix
 		LibMatrixCUDA.cudaSupportFunctions.cublasgeam(gCtx.getCublasHandle(), CUBLAS_OP_T, CUBLAS_OP_T, m, n, alpha, A, lda, beta, new Pointer(),
@@ -276,8 +274,8 @@ public static CSRPointer columnMajorDenseToRowMajorSparse(GPUContext gCtx, cuspa
 		Pointer nnzPerRowPtr = null;
 		Pointer nnzTotalDevHostPtr = null;
 
-		nnzPerRowPtr = gCtx.allocate(null, getIntSizeOf(rows));
-		nnzTotalDevHostPtr = gCtx.allocate(null, getIntSizeOf(1));
+		nnzPerRowPtr = gCtx.allocate(null, getIntSizeOf(rows), false);
+		nnzTotalDevHostPtr = gCtx.allocate(null, getIntSizeOf(1), false);
 
 		// Output is in dense vector format, convert it to CSR
 		LibMatrixCUDA.cudaSupportFunctions.cusparsennz(cusparseHandle, cusparseDirection.CUSPARSE_DIRECTION_ROW, rows, cols, matDescr, densePtr, rows,
@@ -526,13 +524,9 @@ public void allocateAndFillDense(double v) {
 		long cols = mat.getNumColumns();
 		int numElems = toIntExact(rows * cols);
 		long size = getDatatypeSizeOf(numElems);
-		setDensePointer(allocate(size));
-		// The "fill" kernel is called which treats the matrix "jcudaDensePtr" like a vector and fills it with value "v"
-		// If the fill value is 0, no need to call the special kernel, the allocate memsets the allocated region to 0
-		if (v != 0)
-			getGPUContext().getKernels()
-			.launchKernel("fill", ExecutionConfig.getConfigForSimpleVectorOperations(numElems),
-					getDensePointer(), v, numElems);
+		setDensePointer(allocate(size,false));
+		getGPUContext().getKernels().launchKernel("fill", ExecutionConfig.getConfigForSimpleVectorOperations
+				(numElems), getDensePointer(), v, numElems);
 	}
 
 	/**
@@ -574,8 +568,8 @@ public long getNnz(String instName, boolean recomputeDenseNNZ) {
 				int cols = toIntExact(mat.getNumColumns());
 				Pointer nnzPerRowPtr = null;
 				Pointer nnzTotalDevHostPtr = null;
-				nnzPerRowPtr = gCtx.allocate(instName, getIntSizeOf(rows));
-				nnzTotalDevHostPtr = gCtx.allocate(instName, getIntSizeOf(1));
+				nnzPerRowPtr = gCtx.allocate(instName, getIntSizeOf(rows), false);
+				nnzTotalDevHostPtr = gCtx.allocate(instName, getIntSizeOf(1), false);
 				LibMatrixCUDA.cudaSupportFunctions.cusparsennz(cusparseHandle, cusparseDirection.CUSPARSE_DIRECTION_ROW, rows, cols, matDescr, getDensePointer(), rows,
 						nnzPerRowPtr, nnzTotalDevHostPtr);
 				int[] nnzC = { -1 };
@@ -613,6 +607,10 @@ public boolean acquireDeviceRead(String opcode) {
 	}
 
 	public boolean acquireDeviceModifyDense() {
+		return acquireDeviceModifyDense(true);
+	}
+
+	public boolean acquireDeviceModifyDense(boolean initialize) {
 		if(LOG.isTraceEnabled()) {
 			LOG.trace("GPU : acquireDeviceModifyDense on " + this + ", GPUContext=" + getGPUContext());
 		}
@@ -623,7 +621,7 @@ public boolean acquireDeviceModifyDense() {
 				LOG.trace("GPU : data is not allocated, allocating a dense block, on " + this);
 			}
 			// Dense block, size = numRows * numCols
-			allocateDenseMatrixOnDevice();
+			allocateDenseMatrixOnDevice(initialize);
 			allocated = true;
 		}
 		dirty = true;
@@ -633,6 +631,10 @@ public boolean acquireDeviceModifyDense() {
 	}
 
 	public boolean acquireDeviceModifySparse() {
+ 		return acquireDeviceModifySparse(true);
+	}
+
+	public boolean acquireDeviceModifySparse(boolean initialize) {
 		if(LOG.isTraceEnabled()) {
 			LOG.trace("GPU : acquireDeviceModifySparse on " + this + ", GPUContext=" + getGPUContext());
 		}
@@ -643,7 +645,7 @@ public boolean acquireDeviceModifySparse() {
 				LOG.trace("GPU : data is not allocated, allocating a sparse block, on " + this);
 			}
 			mat.setDirty(true);
-			allocateSparseMatrixOnDevice();
+			allocateSparseMatrixOnDevice(initialize);
 			allocated = true;
 		}
 		dirty = true;
@@ -753,7 +755,7 @@ public void releaseOutput() {
 			throw new DMLRuntimeException("Attempting to release an output before allocating it");
 	}
 
-	void allocateDenseMatrixOnDevice() {
+	void allocateDenseMatrixOnDevice(boolean initialize) {
 		if(LOG.isTraceEnabled()) {
 			LOG.trace("GPU : allocateDenseMatrixOnDevice, on " + this + ", GPUContext=" + getGPUContext());
 		}
@@ -766,11 +768,11 @@ void allocateDenseMatrixOnDevice() {
 		if(cols <= 0)
 			throw new DMLRuntimeException("Internal error - invalid number of columns when allocating dense matrix:" + cols);
 		long size = getDatatypeSizeOf(rows * cols);
-		Pointer tmp = allocate(size);
+		Pointer tmp = allocate(size, initialize);
 		setDensePointer(tmp);
 	}
 
-	void allocateSparseMatrixOnDevice() {
+	void allocateSparseMatrixOnDevice(boolean initialize) {
 		if(LOG.isTraceEnabled()) {
 			LOG.trace("GPU : allocateSparseMatrixOnDevice, on " + this + ", GPUContext=" + getGPUContext());
 		}
@@ -782,16 +784,16 @@ void allocateSparseMatrixOnDevice() {
 			throw new DMLRuntimeException("Internal error - invalid number of rows when allocating sparse matrix");
 		if(nnz < 0)
 			throw new DMLRuntimeException("Internal error - invalid number of non zeroes when allocating a sparse matrix");
-		CSRPointer tmp = CSRPointer.allocateEmpty(getGPUContext(), nnz, rows);
+		CSRPointer tmp = CSRPointer.allocateEmpty(getGPUContext(), nnz, rows, initialize);
 		setSparseMatrixCudaPointer(tmp);
 	}
 
-	void allocateSparseMatrixOnDevice(long numVals) {
+	void allocateSparseMatrixOnDevice(long numVals, boolean initialize) {
 		// This method is called when #values > nnz
 		if(LOG.isTraceEnabled()) {
 			LOG.trace("GPU : allocateSparseMatrixOnDevice, on " + this + ", GPUContext=" + getGPUContext());
 		}
-		if(isAllocated()) 
+		if(isAllocated())
 			throw new DMLRuntimeException("Internal error - trying to allocated sparse matrix to a GPUObject that is already allocated");
 		long rows = mat.getNumRows();
 		long nnz = mat.getNnz();
@@ -799,7 +801,7 @@ void allocateSparseMatrixOnDevice(long numVals) {
 			throw new DMLRuntimeException("Internal error - invalid number of rows when allocating sparse matrix");
 		if(nnz < 0)
 			throw new DMLRuntimeException("Internal error - invalid number of non zeroes when allocating a sparse matrix");
-		CSRPointer tmp = CSRPointer.allocateEmpty(getGPUContext(), numVals, rows);
+		CSRPointer tmp = CSRPointer.allocateEmpty(getGPUContext(), numVals, rows, initialize);
 		setSparseMatrixCudaPointer(tmp);
 	}
 
@@ -883,11 +885,11 @@ void copyFromHostToDevice(String opcode) {
 
 			if (values != null)
 				if(values.length > tmp.getNonZeros())
-					allocateSparseMatrixOnDevice(values.length);
+					allocateSparseMatrixOnDevice(values.length, false);
 				else
-					allocateSparseMatrixOnDevice();
+					allocateSparseMatrixOnDevice(false);
 			else
-				allocateSparseMatrixOnDevice();
+				allocateSparseMatrixOnDevice(false);
 
 			if (copyToDevice) {
 				CSRPointer.copyToDevice(getGPUContext(), getJcudaSparseMatrixPtr(),
@@ -901,7 +903,7 @@ void copyFromHostToDevice(String opcode) {
 			else if (data == null && tmp.getNonZeros() != 0)
 				throw new DMLRuntimeException("MatrixBlock is not allocated");
 			
-			allocateDenseMatrixOnDevice();
+			allocateDenseMatrixOnDevice(false);
 			
 			if (tmp.getNonZeros() == 0) {
 				// Minor optimization: No need to allocate empty error for CPU 
diff --git a/src/main/java/org/apache/sysds/runtime/instructions/gpu/context/ShadowBuffer.java b/src/main/java/org/apache/sysds/runtime/instructions/gpu/context/ShadowBuffer.java
index 13279a55df6..0eba39134e2 100644
--- a/src/main/java/org/apache/sysds/runtime/instructions/gpu/context/ShadowBuffer.java
+++ b/src/main/java/org/apache/sysds/runtime/instructions/gpu/context/ShadowBuffer.java
@@ -103,7 +103,7 @@ public void moveToHost() {
 	public void moveToDevice() {
 		long start = DMLScript.STATISTICS ? System.nanoTime() : 0;
 		long numBytes = shadowPointer.length*LibMatrixCUDA.sizeOfDataType;
-		gpuObj.jcudaDenseMatrixPtr = gpuObj.getGPUContext().allocate(null, numBytes);
+		gpuObj.jcudaDenseMatrixPtr = gpuObj.getGPUContext().allocate(null, numBytes, false);
 		cudaMemcpy(gpuObj.jcudaDenseMatrixPtr, Pointer.to(shadowPointer), numBytes, jcuda.runtime.cudaMemcpyKind.cudaMemcpyHostToDevice);
 		clearShadowPointer();
 		if (DMLScript.STATISTICS) {
diff --git a/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixCUDA.java b/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixCUDA.java
index f6e950ab810..7a5db2141c7 100644
--- a/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixCUDA.java
+++ b/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixCUDA.java
@@ -377,7 +377,7 @@ public static void channelSums(GPUContext gCtx, String instName, MatrixObject in
 		Pointer outputPointer = getDensePointer(gCtx, outputBlock, instName);
 		
 		// We can replace this with CuDNN tensor reduce
-		Pointer tmp = gCtx.allocate(instName, cols*sizeOfDataType);
+		Pointer tmp = gCtx.allocate(instName, (long) cols * sizeOfDataType, false);
 		reduceCol(gCtx, instName, "reduce_col_sum", imagePointer, tmp, N, cols);
 		reduceRow(gCtx, instName, "reduce_row_sum", tmp, outputPointer, toInt(C), toInt(HW));
 		gCtx.cudaFreeHelper(instName, tmp, DMLScript.EAGER_CUDA_FREE);
@@ -718,7 +718,7 @@ else if (indexFn instanceof ReduceCol) { // ROW{SUM, MAX,...}
 		}
 		case OP_PLUS_SQ : {
 			// Calculate the squares in a temporary object tmp
-			Pointer tmp = gCtx.allocate(instName, size * sizeOfDataType);
+			Pointer tmp = gCtx.allocate(instName, (long) size * sizeOfDataType, false);
 
 			squareMatrix(gCtx, instName, in, tmp, rlen, clen);
 			// Then do the sum on the temporary object and free it
@@ -817,8 +817,8 @@ else if (indexFn instanceof ReduceCol) { // ROW{SUM, MAX,...}
 		}
 		case OP_VARIANCE : {
 			// Temporary GPU array for
-			Pointer tmp = gCtx.allocate(instName, size * sizeOfDataType);
-			Pointer tmp2 = gCtx.allocate(instName, size * sizeOfDataType);
+			Pointer tmp = gCtx.allocate(instName, (long) size * sizeOfDataType, false);
+			Pointer tmp2 = gCtx.allocate(instName, (long) size * sizeOfDataType, false);
 
 			switch(reductionDirection) {
 
@@ -846,7 +846,7 @@ else if (indexFn instanceof ReduceCol) { // ROW{SUM, MAX,...}
 
 				squareMatrix(gCtx, instName, tmp, tmp2, rlen, clen);
 
-				Pointer tmpRow = gCtx.allocate(instName, rlen * sizeOfDataType);
+				Pointer tmpRow = gCtx.allocate(instName, (long) rlen * sizeOfDataType, false);
 				reduceRow(gCtx, instName, "reduce_row_sum", tmp2, tmpRow, rlen, clen);
 
 				ScalarOperator divideOp = new RightScalarOperator(Divide.getDivideFnObject(), clen - 1);
@@ -864,7 +864,7 @@ else if (indexFn instanceof ReduceCol) { // ROW{SUM, MAX,...}
 
 				squareMatrix(gCtx, instName, tmp, tmp2, rlen, clen);
 
-				Pointer tmpCol = gCtx.allocate(instName, clen * sizeOfDataType);
+				Pointer tmpCol = gCtx.allocate(instName, (long) clen * sizeOfDataType, false);
 				reduceCol(gCtx, instName, "reduce_col_sum", tmp2, tmpCol, rlen, clen);
 
 				ScalarOperator divideOp = new RightScalarOperator(Divide.getDivideFnObject(), rlen - 1);
@@ -933,7 +933,7 @@ private static double reduceAll(GPUContext gCtx, String instName, String kernelF
 		int[] tmp = getKernelParamsForReduceAll(gCtx, n);
 		int blocks = tmp[0], threads = tmp[1], sharedMem = tmp[2];
 
-		Pointer tempOut = gCtx.allocate(instName, (long) blocks * sizeOfDataType); 
+		Pointer tempOut = gCtx.allocate(instName, (long) blocks * sizeOfDataType, false);
 
 		getCudaKernels(gCtx).launchKernel(kernelFunction, new ExecutionConfig(blocks, threads, sharedMem), in, tempOut, n);
 		
@@ -2298,7 +2298,7 @@ public static void cumulativeScan(ExecutionContext ec, GPUContext gCtx, String i
 			Pointer input = getDensePointer(gCtx, in, instName);
 			Pointer output = getDensePointer(gCtx, out, instName);
 			// storage for last value of each block
-			Pointer blk_res = gCtx.allocate(instName, cols * blocks_y * sizeOfDataType);
+			Pointer blk_res = gCtx.allocate(instName, (long) cols * blocks_y * sizeOfDataType, false);
 
 			alloc_duration = printKernelTiming(time, "allocation of temporary buffer (" +
 				cols * blocks_y * sizeOfDataType + " bytes)", alloc_duration, 0);
@@ -2388,9 +2388,9 @@ public static void cumulativeSumProduct(ExecutionContext ec, GPUContext gCtx,
 
 			long total_mem_size = 0;
 			while( cascade_blocks > 0) {
-				long buf_size = 2 * block_height * cascade_blocks * sizeOfDataType;
+				long buf_size = 2L * block_height * cascade_blocks * sizeOfDataType;
 				total_mem_size += buf_size;
-				intermediate_buffers.add(gCtx.allocate(instName, buf_size));
+				intermediate_buffers.add(gCtx.allocate(instName, buf_size, false));
 				cascade_blocks = (cascade_blocks + block_height - 2) / block_height;
 				if(cascade_blocks > 0)
 					cb_list.add(cascade_blocks);
@@ -2652,9 +2652,9 @@ public static void solve(ExecutionContext ec, GPUContext gCtx, String instName,
 		cudaSupportFunctions.cusolverDngeqrf_bufferSize(gCtx.getCusolverDnHandle(), m, n, A, m, lwork);
 		
 		// step 4: compute QR factorization
-		Pointer work = gCtx.allocate(instName, lwork[0] * sizeOfDataType);
-		Pointer tau = gCtx.allocate(instName, m * sizeOfDataType);
-		Pointer devInfo = gCtx.allocate(instName, Sizeof.INT);
+		Pointer work = gCtx.allocate(instName, (long) lwork[0] * sizeOfDataType, false);
+		Pointer tau = gCtx.allocate(instName, (long) m * sizeOfDataType, false);
+		Pointer devInfo = gCtx.allocate(instName, Sizeof.INT, false);
 		cudaSupportFunctions.cusolverDngeqrf(gCtx.getCusolverDnHandle(), m, n, A, m, tau, work, lwork[0], devInfo);
 		
 		int[] qrError = {-1};
@@ -2704,8 +2704,16 @@ public static void solve(ExecutionContext ec, GPUContext gCtx, String instName,
 	 * @param numCols number of columns of output matrix object
 	 * @return	the matrix object
 	 */
-	public static MatrixObject getDenseMatrixOutputForGPUInstruction(ExecutionContext ec, String instName, String name, long numRows, long numCols) {
-		return ec.getDenseMatrixOutputForGPUInstruction(name, numRows, numCols).getKey();
+	public static MatrixObject getDenseMatrixOutputForGPUInstruction(ExecutionContext ec, String instName, String name,
+		long numRows, long numCols)
+	{
+		return getDenseMatrixOutputForGPUInstruction(ec, instName, name, numRows, numCols, true);
+	}
+
+	public static MatrixObject getDenseMatrixOutputForGPUInstruction(ExecutionContext ec, String instName, String name,
+		long numRows, long numCols, boolean initialize)
+	{
+		return ec.getDenseMatrixOutputForGPUInstruction(name, numRows, numCols, initialize).getKey();
 	}
 
 	/**
@@ -2717,12 +2725,22 @@ public static MatrixObject getDenseMatrixOutputForGPUInstruction(ExecutionContex
 	 * @param nnz number of non zeroes in output matrix
 	 * @param instName the invoking instruction's name for record {@link Statistics}.
 	 * @param name	name of input matrix (that the {@link ExecutionContext} is aware of)
+	 * @param initialize memset to zero?
+	 *
 	 * @return	the matrix object
 	 */
-	private static MatrixObject getSparseMatrixOutputForGPUInstruction(ExecutionContext ec, long numRows, long numCols, long nnz, String instName, String name) {
-		return ec.getSparseMatrixOutputForGPUInstruction(name, numRows, numCols, nnz).getKey();
+	private static MatrixObject getSparseMatrixOutputForGPUInstruction(ExecutionContext ec, long numRows, long numCols,
+		long nnz, String instName, String name, boolean initialize)
+	{
+		return ec.getSparseMatrixOutputForGPUInstruction(name, numRows, numCols, nnz, initialize).getKey();
 	}
-	
+
+	private static MatrixObject getSparseMatrixOutputForGPUInstruction(ExecutionContext ec, long numRows, long numCols,
+		long nnz, String instName, String name)
+	{
+		return getSparseMatrixOutputForGPUInstruction(ec, numRows, numCols, nnz, instName, name, true);
+	}
+
 	/**
 	 * Utility to compute number of non-zeroes on the GPU
 	 * 
diff --git a/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixCuDNN.java b/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixCuDNN.java
index f85b9d0f298..d95e1bf8235 100644
--- a/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixCuDNN.java
+++ b/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixCuDNN.java
@@ -141,7 +141,7 @@ private static Pointer denseIm2col(GPUContext gCtx, String instName, MatrixObjec
 				throw new DMLRuntimeException("Unknown number of nonzeroes in denseIm2col");
 			}
 			else if(inPointer.nnz > 0) {
-				im2colPointer = gCtx.allocate(instName, C*R*S*N*P*Q*sizeOfDataType);
+				im2colPointer = gCtx.allocate(instName, C*R*S*N*P*Q*sizeOfDataType, false);
 				getCudaKernels(gCtx).launchKernel("sparse_dense_im2col", ExecutionConfig.getConfigForSimpleVectorOperations(toInt(inPointer.nnz)), 
 						inPointer.val, inPointer.rowPtr, inPointer.colInd, im2colPointer, inPointer.nnz, N, 
 						C*H*W, H*W, W, R, S, P, Q, P*Q, R*S, N*P*Q, stride_h, stride_w, pad_h, pad_w);
@@ -150,7 +150,7 @@ else if(inPointer.nnz > 0) {
 				return null;
 		}
 		else {
-			im2colPointer = gCtx.allocate(instName, C*R*S*N*P*Q*sizeOfDataType);
+			im2colPointer = gCtx.allocate(instName, C*R*S*N*P*Q*sizeOfDataType, false);
 			Pointer imagePointer = getDensePointerForCuDNN(gCtx, image, instName);
 			getCudaKernels(gCtx).launchKernel("dense_dense_im2col", ExecutionConfig.getConfigForSimpleVectorOperations(toInt(N*C*H*W)), 
 					imagePointer, im2colPointer, N*C*H*W, 
@@ -212,7 +212,7 @@ public static void conv2d(GPUContext gCtx, String instName, MatrixObject image,
 				
 				// Perform matrix multiplication
 				CSRPointer filterPointer = filter.getGPUObject(gCtx).getJcudaSparseMatrixPtr();
-				Pointer matmultOutputPointer = gCtx.allocate(instName, NKPQ*sizeOfDataType);
+				Pointer matmultOutputPointer = gCtx.allocate(instName, NKPQ*sizeOfDataType, false);
 				LibMatrixCuMatMult.sparseDenseMatMult(gCtx, instName, matmultOutputPointer, filterPointer, im2colPointer, K, CRS, CRS, NPQ, K, NPQ, false, false);
 				gCtx.cudaFreeHelper(instName, im2colPointer, DMLScript.EAGER_CUDA_FREE);
 				
@@ -423,7 +423,7 @@ public static void conv2dBackwardFilter(GPUContext gCtx, String instName, Matrix
 					try(LibMatrixCuDNNInputRowFetcher imgFetcher = new LibMatrixCuDNNInputRowFetcher(gCtx, instName, image);
 						LibMatrixCuDNNInputRowFetcher doutFetcher = new LibMatrixCuDNNInputRowFetcher(gCtx, instName, dout)) {
 						// Perform one-input conv2dBackwardFilter
-						Pointer tempdwPointer = gCtx.allocate(instName, KCRS*sizeOfDataType);
+						Pointer tempdwPointer = gCtx.allocate(instName, KCRS*sizeOfDataType, false);
 						for(int n = 0; n < N; n++) {
 							cudaMemset(tempdwPointer, 0, KCRS*sizeOfDataType);
 							// Perform one-input conv2dBackwardFilter
@@ -726,8 +726,8 @@ private static void cudnnPoolingBackwardHelper(GPUContext gCtx, String instName,
 						pad_h, pad_w, stride_h, stride_w, P, Q, poolingType)) {
 			int status;
 			if(!isMaxPoolOutputProvided) {
-				long numBytes = N*C*P*Q*sizeOfDataType;
-				y = gCtx.allocate(instName, numBytes);
+				long numBytes = (long) N *C*P*Q*sizeOfDataType;
+				y = gCtx.allocate(instName, numBytes, false);
 				status = cudnnPoolingForward(getCudnnHandle(gCtx), desc.poolingDesc, one(), desc.xDesc, x, zero(), desc.yDesc, y);
 				if(status != jcuda.jcudnn.cudnnStatus.CUDNN_STATUS_SUCCESS) {
 					throw new DMLRuntimeException("Could not executed cudnnPoolingForward before cudnnPoolingBackward: " + jcuda.jcudnn.cudnnStatus.stringFor(status));
@@ -845,8 +845,9 @@ private static void singleLayerUnidirectionalRNNForward(ExecutionContext ec, GPU
 			String rnnMode, boolean return_sequences, int N, int M, int D, int T) throws DMLRuntimeException {
 		boolean hasCarry = rnnMode.equalsIgnoreCase("lstm");
 		// Get output pointers
-		Pointer cudnnYPointer = gCtx.allocate(instName, N*T*M*sizeOfDataType);
-		Pointer hyPointer = !return_sequences ? getDenseOutputPointer(ec, gCtx, instName, outputName, N, M) : gCtx.allocate(instName, N*M*sizeOfDataType);
+		Pointer cudnnYPointer = gCtx.allocate(instName, (long) N *T*M*sizeOfDataType, false);
+		Pointer hyPointer = !return_sequences ? getDenseOutputPointer(ec, gCtx, instName, outputName, N, M) : gCtx.allocate(instName,
+			(long) N*M*sizeOfDataType, false);
 		Pointer cyPointer = hasCarry ? getDenseOutputPointer(ec, gCtx, instName, cyName, N, M) : new Pointer();
 		// Pointer wPointer = getDensePointerForCuDNN(gCtx, w, instName, D+M+2, 4*M);
 		
@@ -878,16 +879,16 @@ public static void lstmBackward(ExecutionContext ec, GPUContext gCtx, String ins
 			String dxName, String dwName, String dbName, String dhxName, String dcxName,  	// output
 			boolean return_sequences, int N, int M, int D, int T) throws DMLRuntimeException {
 		// Transform the input dout and prepare them for cudnnRNNBackwardData
-		Pointer dy = gCtx.allocate(instName, N*T*M*sizeOfDataType);
+		Pointer dy = gCtx.allocate(instName, (long) N *T*M*sizeOfDataType, false);
 		int size = return_sequences ? N*T*M : N*M;
 		LibMatrixCUDA.getCudaKernels(gCtx).launchKernel("prepare_lstm_backward_gradients",
 				ExecutionConfig.getConfigForSimpleVectorOperations(size),
-				getDenseInputPointer(ec, gCtx, instName, doutName, N, return_sequences ? T*M : M),
+				getDenseInputPointer(ec, gCtx, instName, doutName, N, return_sequences ? (long) T*M : M),
 				dy, N, T, M, size, return_sequences ? 1 : 0);
 		ec.releaseMatrixInputForGPUInstruction(doutName);
 				
 		// Allocate intermediate pointers computed by forward
-		Pointer yPointer = gCtx.allocate(instName, N*T*M*sizeOfDataType);
+		Pointer yPointer = gCtx.allocate(instName, (long) N *T*M*sizeOfDataType, false);
 		try(LibMatrixCuDNNRnnAlgorithm algo = new LibMatrixCuDNNRnnAlgorithm(ec, gCtx, instName, "lstm", N, T, M, D, true, wPointer)) {
 			JCudnn.cudnnRNNForwardTraining(gCtx.getCudnnHandle(), algo.rnnDesc, T, 
 					algo.xDesc, x, 
@@ -900,7 +901,7 @@ algo.cyDesc, new Pointer(),
 					algo.workSpace, algo.sizeInBytes, 
 					algo.reserveSpace, algo.reserveSpaceSizeInBytes);
 			
-			Pointer cudnnDx = gCtx.allocate(instName, N*T*D*LibMatrixCUDA.sizeOfDataType);
+			Pointer cudnnDx = gCtx.allocate(instName, (long) N *T*D*LibMatrixCUDA.sizeOfDataType, false);
 			JCudnn.cudnnRNNBackwardData(gCtx.getCudnnHandle(), algo.rnnDesc, T, 
 					algo.yDesc, yPointer,
 					// ----------------------
@@ -933,7 +934,7 @@ algo.dcxDesc, getDenseOutputPointer(ec, gCtx, instName, dcxName, N, M),
 			gCtx.cudaFreeHelper(instName, cudnnDx, DMLScript.EAGER_CUDA_FREE);
 			
 			// -------------------------------------------------------------------------------------------
-			Pointer cudnnDwPointer = gCtx.allocate(instName, (D+M+2)*(4*M)*LibMatrixCUDA.sizeOfDataType);
+			Pointer cudnnDwPointer = gCtx.allocate(instName, (D+M+2)*(4L *M)*LibMatrixCUDA.sizeOfDataType, false);
 			JCudnn.cudnnRNNBackwardWeights(gCtx.getCudnnHandle(), algo.rnnDesc, T, 
 					algo.xDesc, x, 
 					algo.hxDesc, hx, 
diff --git a/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixCuDNNConvolutionAlgorithm.java b/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixCuDNNConvolutionAlgorithm.java
index 67373e45a42..2284c9da6ca 100644
--- a/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixCuDNNConvolutionAlgorithm.java
+++ b/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixCuDNNConvolutionAlgorithm.java
@@ -134,7 +134,7 @@ public static LibMatrixCuDNNConvolutionAlgorithm cudnnGetConvolutionForwardAlgor
 		jcuda.jcudnn.JCudnn.cudnnGetConvolutionForwardWorkspaceSize(LibMatrixCuDNN.getCudnnHandle(gCtx), 
 				ret.nchwTensorDesc, ret.filterDesc, ret.convDesc, ret.nkpqTensorDesc, algos[0], sizeInBytesArray);
 		if (sizeInBytesArray[0] != 0)
-			ret.workSpace = gCtx.allocate(instName, sizeInBytesArray[0]);
+			ret.workSpace = gCtx.allocate(instName, sizeInBytesArray[0], false);
 		ret.sizeInBytes = sizeInBytesArray[0];
 		ret.algo = algos[0];
 		return ret;
@@ -168,7 +168,7 @@ public static LibMatrixCuDNNConvolutionAlgorithm cudnnGetConvolutionBackwardFilt
 				pad_h, pad_w, stride_h, stride_w, P, Q);
 		
 		int[] algos = {-1};
-		long sizeInBytesArray[] = {Math.min(workspaceLimit, MAX_WORKSPACE_LIMIT_BYTES)};
+		long[] sizeInBytesArray = {Math.min(workspaceLimit, MAX_WORKSPACE_LIMIT_BYTES)};
 		jcuda.jcudnn.JCudnn.cudnnGetConvolutionBackwardFilterAlgorithm(
 				LibMatrixCuDNN.getCudnnHandle(gCtx), 
 				ret.nchwTensorDesc, ret.nkpqTensorDesc, ret.convDesc, ret.filterDesc, 
@@ -176,7 +176,7 @@ public static LibMatrixCuDNNConvolutionAlgorithm cudnnGetConvolutionBackwardFilt
 		jcuda.jcudnn.JCudnn.cudnnGetConvolutionBackwardFilterWorkspaceSize(LibMatrixCuDNN.getCudnnHandle(gCtx), 
 				ret.nchwTensorDesc, ret.nkpqTensorDesc, ret.convDesc, ret.filterDesc, algos[0], sizeInBytesArray);
 		if (sizeInBytesArray[0] != 0)
-			ret.workSpace = gCtx.allocate(instName, sizeInBytesArray[0]);
+			ret.workSpace = gCtx.allocate(instName, sizeInBytesArray[0], false);
 		ret.sizeInBytes = sizeInBytesArray[0];
 		ret.algo = algos[0];
 		
@@ -218,7 +218,7 @@ public static LibMatrixCuDNNConvolutionAlgorithm cudnnGetConvolutionBackwardData
 		}
 		else {
 			int[] algos = {-1};
-			long sizeInBytesArray[] = {Math.min(workspaceLimit, MAX_WORKSPACE_LIMIT_BYTES)};
+			long[] sizeInBytesArray = {Math.min(workspaceLimit, MAX_WORKSPACE_LIMIT_BYTES)};
 			jcuda.jcudnn.JCudnn.cudnnGetConvolutionBackwardDataAlgorithm(
 					LibMatrixCuDNN.getCudnnHandle(gCtx), 
 					ret.filterDesc, ret.nkpqTensorDesc, ret.convDesc, ret.nchwTensorDesc,
@@ -226,7 +226,7 @@ public static LibMatrixCuDNNConvolutionAlgorithm cudnnGetConvolutionBackwardData
 			jcuda.jcudnn.JCudnn.cudnnGetConvolutionBackwardDataWorkspaceSize(LibMatrixCuDNN.getCudnnHandle(gCtx), 
 					ret.filterDesc, ret.nkpqTensorDesc, ret.convDesc, ret.nchwTensorDesc, algos[0], sizeInBytesArray);
 			if (sizeInBytesArray[0] != 0)
-				ret.workSpace = gCtx.allocate(instName, sizeInBytesArray[0]);
+				ret.workSpace = gCtx.allocate(instName, sizeInBytesArray[0], false);
 			ret.sizeInBytes = sizeInBytesArray[0];
 			ret.algo = algos[0];
 		}
diff --git a/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixCuDNNInputRowFetcher.java b/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixCuDNNInputRowFetcher.java
index 0cc27b4cd11..1a29dee2cf6 100644
--- a/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixCuDNNInputRowFetcher.java
+++ b/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixCuDNNInputRowFetcher.java
@@ -47,7 +47,7 @@ public LibMatrixCuDNNInputRowFetcher(GPUContext gCtx, String instName, MatrixObj
 		numColumns = LibMatrixCUDA.toInt(image.getNumColumns());
 		isInputInSparseFormat = LibMatrixCUDA.isInSparseFormat(gCtx, image);
 		inPointer = isInputInSparseFormat ? LibMatrixCUDA.getSparsePointer(gCtx, image, instName) : LibMatrixCuDNN.getDensePointerForCuDNN(gCtx, image, instName);
-		outPointer = gCtx.allocate(instName, numColumns*sizeOfDataType);
+		outPointer = gCtx.allocate(instName, (long) numColumns *sizeOfDataType, false);
 	}
 	/**
 	 * Copy the nth row and return the dense pointer
@@ -57,7 +57,7 @@ public LibMatrixCuDNNInputRowFetcher(GPUContext gCtx, String instName, MatrixObj
 	public Pointer getNthRow(int n) {
 		if(isInputInSparseFormat) {
 			jcuda.runtime.JCuda.cudaDeviceSynchronize();
-			cudaMemset(outPointer, 0, numColumns*sizeOfDataType);
+			cudaMemset(outPointer, 0, (long) numColumns *sizeOfDataType);
 			jcuda.runtime.JCuda.cudaDeviceSynchronize();
 			LibMatrixCUDA.sliceSparseDense(gCtx, instName, (CSRPointer)inPointer, outPointer, n, n, 0, LibMatrixCUDA.toInt(numColumns-1), numColumns);
 		}
diff --git a/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixCuDNNRnnAlgorithm.java b/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixCuDNNRnnAlgorithm.java
index 663267b431c..75f9fec0de6 100644
--- a/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixCuDNNRnnAlgorithm.java
+++ b/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixCuDNNRnnAlgorithm.java
@@ -90,7 +90,7 @@ public LibMatrixCuDNNRnnAlgorithm(ExecutionContext ec, GPUContext gCtx, String i
 		dropOutSizeInBytes = _dropOutSizeInBytes[0];
 		dropOutStateSpace = new Pointer();
 		if (dropOutSizeInBytes != 0)
-			dropOutStateSpace = gCtx.allocate(instName, dropOutSizeInBytes);
+			dropOutStateSpace = gCtx.allocate(instName, dropOutSizeInBytes, false);
 		JCudnn.cudnnSetDropoutDescriptor(dropoutDesc, gCtx.getCudnnHandle(), 0, dropOutStateSpace, dropOutSizeInBytes, 12345);
 		
 		// Initialize RNN descriptor
@@ -112,12 +112,12 @@ public LibMatrixCuDNNRnnAlgorithm(ExecutionContext ec, GPUContext gCtx, String i
 		workSpace = new Pointer(); reserveSpace = new Pointer();
 		sizeInBytes = getWorkspaceSize(T);
 		if(sizeInBytes != 0)
-			workSpace = gCtx.allocate(instName, sizeInBytes);
+			workSpace = gCtx.allocate(instName, sizeInBytes, false);
 		reserveSpaceSizeInBytes = 0;
 		if(isTraining) {
 			reserveSpaceSizeInBytes = getReservespaceSize(T);
 			if (reserveSpaceSizeInBytes != 0) {
-				reserveSpace = gCtx.allocate(instName, reserveSpaceSizeInBytes);
+				reserveSpace = gCtx.allocate(instName, reserveSpaceSizeInBytes, false);
 			}
 		}
 	}
diff --git a/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixCuMatMult.java b/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixCuMatMult.java
index a62e152b44b..5753041a622 100644
--- a/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixCuMatMult.java
+++ b/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixCuMatMult.java
@@ -259,7 +259,7 @@ static void sparseDenseMatMult(GPUContext gCtx, String instName, Pointer C, CSRP
 		// t(C) = t(B) %*% t(A)
 		Pointer output = null;
 		if (outRLen != 1 && outCLen != 1) {
-			output = gCtx.allocate(instName, outRLen * outCLen * sizeOfDataType);
+			output = gCtx.allocate(instName, outRLen * outCLen * sizeOfDataType, false);
 		} else {
 			// no transpose required for vector output
 			output = C;
diff --git a/src/main/java/org/apache/sysds/runtime/matrix/data/SinglePrecisionCudaSupportFunctions.java b/src/main/java/org/apache/sysds/runtime/matrix/data/SinglePrecisionCudaSupportFunctions.java
index fd29f21d655..e25a68aef35 100644
--- a/src/main/java/org/apache/sysds/runtime/matrix/data/SinglePrecisionCudaSupportFunctions.java
+++ b/src/main/java/org/apache/sysds/runtime/matrix/data/SinglePrecisionCudaSupportFunctions.java
@@ -178,7 +178,7 @@ public void deviceToHost(GPUContext gCtx, Pointer src, double[] dest, String ins
 		// during eviction: `evict -> devictToHost -> float2double -> allocate -> ensureFreeSpace -> evict`. 
 		// To avoid this recursion, it is necessary to perform this conversion in host.
 		if(PERFORM_CONVERSION_ON_DEVICE && !isEviction) {
-			Pointer deviceDoubleData = gCtx.allocate(instName, ((long)dest.length)*Sizeof.DOUBLE);
+			Pointer deviceDoubleData = gCtx.allocate(instName, ((long)dest.length)*Sizeof.DOUBLE, false);
 			LibMatrixCUDA.float2double(gCtx, src, deviceDoubleData, dest.length);
 			cudaMemcpy(Pointer.to(dest), deviceDoubleData, ((long)dest.length)*Sizeof.DOUBLE, cudaMemcpyDeviceToHost);
 			gCtx.cudaFreeHelper(instName, deviceDoubleData, DMLScript.EAGER_CUDA_FREE);
@@ -202,7 +202,7 @@ public void hostToDevice(GPUContext gCtx, double[] src, Pointer dest, String ins
 		// TODO: Perform conversion on GPU using double2float and float2double kernels
 		long t0 = DMLScript.STATISTICS ? System.nanoTime() : 0;
 		if(PERFORM_CONVERSION_ON_DEVICE) {
-			Pointer deviceDoubleData = gCtx.allocate(instName, ((long)src.length)*Sizeof.DOUBLE);
+			Pointer deviceDoubleData = gCtx.allocate(instName, ((long)src.length)*Sizeof.DOUBLE, false);
 			cudaMemcpy(deviceDoubleData, Pointer.to(src), ((long)src.length)*Sizeof.DOUBLE, cudaMemcpyHostToDevice);
 			LibMatrixCUDA.double2float(gCtx, deviceDoubleData, dest, src.length);
 			gCtx.cudaFreeHelper(instName, deviceDoubleData, DMLScript.EAGER_CUDA_FREE);