From 06b2090105355cb2b2c2050f5a05a10493373ff1 Mon Sep 17 00:00:00 2001 From: Mark Dokter Date: Mon, 14 Jun 2021 14:08:55 +0200 Subject: [PATCH] [SYSTEMDS-3022] Avoid cudaMemset() where possible This improvement tries to avoid calling cudaMemset() after allocating memory if there is a subsequent write to the whole buffer happening. Closes #1572 --- .../runtime/codegen/SpoofCUDACellwise.java | 4 +- .../runtime/codegen/SpoofCUDAOperator.java | 6 +- .../runtime/codegen/SpoofCUDARowwise.java | 2 +- .../context/ExecutionContext.java | 16 ++++- .../instructions/gpu/DnnGPUInstruction.java | 9 +-- .../instructions/gpu/GPUInstruction.java | 8 ++- .../gpu/MatrixReshapeGPUInstruction.java | 3 +- .../instructions/gpu/context/CSRPointer.java | 40 +++++++----- .../instructions/gpu/context/GPUContext.java | 29 +++++++-- .../gpu/context/GPUMemoryManager.java | 9 +-- .../instructions/gpu/context/GPUObject.java | 64 ++++++++++--------- .../gpu/context/ShadowBuffer.java | 2 +- .../runtime/matrix/data/LibMatrixCUDA.java | 54 ++++++++++------ .../runtime/matrix/data/LibMatrixCuDNN.java | 27 ++++---- .../LibMatrixCuDNNConvolutionAlgorithm.java | 10 +-- .../data/LibMatrixCuDNNInputRowFetcher.java | 4 +- .../data/LibMatrixCuDNNRnnAlgorithm.java | 6 +- .../matrix/data/LibMatrixCuMatMult.java | 2 +- .../SinglePrecisionCudaSupportFunctions.java | 4 +- 19 files changed, 183 insertions(+), 116 deletions(-) diff --git a/src/main/java/org/apache/sysds/runtime/codegen/SpoofCUDACellwise.java b/src/main/java/org/apache/sysds/runtime/codegen/SpoofCUDACellwise.java index 5a3211435a7..03c35da5405 100644 --- a/src/main/java/org/apache/sysds/runtime/codegen/SpoofCUDACellwise.java +++ b/src/main/java/org/apache/sysds/runtime/codegen/SpoofCUDACellwise.java @@ -93,8 +93,8 @@ else if(_type == SpoofCellwise.CellType.ROW_AGG) long nnz = in_obj.getNnz("spoofCUDA" + getSpoofType(), false); MatrixObject out_obj = sparseOut ? (ec.getSparseMatrixOutputForGPUInstruction(outputName, out_rows, out_cols, (isSparseSafe() && nnz > 0) ? - nnz : out_rows * out_cols).getKey()) : - (ec.getDenseMatrixOutputForGPUInstruction(outputName, out_rows, out_cols).getKey()); + nnz : out_rows * out_cols, false).getKey()) : + (ec.getDenseMatrixOutputForGPUInstruction(outputName, out_rows, out_cols, false).getKey()); packDataForTransfer(ec, inputs, scalarObjects, out_obj, 1, ID, 0,false, null); if(NotEmpty(in_obj) || !sparseSafe) { diff --git a/src/main/java/org/apache/sysds/runtime/codegen/SpoofCUDAOperator.java b/src/main/java/org/apache/sysds/runtime/codegen/SpoofCUDAOperator.java index 02fbf961aad..8b0aa6432c1 100644 --- a/src/main/java/org/apache/sysds/runtime/codegen/SpoofCUDAOperator.java +++ b/src/main/java/org/apache/sysds/runtime/codegen/SpoofCUDAOperator.java @@ -72,7 +72,7 @@ default void prepareMatrixPointers(ByteBuffer buf, ExecutionContext ec, MatrixOb int rows = (int)mo.getNumRows(); int cols = (int)mo.getNumColumns(); Pointer b1 = mo.getGPUObject(ec.getGPUContext(0)).getDensePointer(); - Pointer ptr = ec.getGPUContext(0).allocate(getName(), (long) rows * cols * sizeOfDataType); + Pointer ptr = ec.getGPUContext(0).allocate(getName(), (long) rows * cols * sizeOfDataType, false); LibMatrixCUDA.denseTranspose(ec, ec.getGPUContext(0), getName(), b1, ptr, rows, cols); writeMatrixDescriptorToBuffer(buf, rows, cols, 0, 0, GPUObject.getPointerAddress(ptr), mo.getNnz()); } else { @@ -115,8 +115,10 @@ default void packDataForTransfer(ExecutionContext ec, ArrayList in int NT = 256; long N = inputs.get(0).getNumRows() * inputs.get(0).getNumColumns(); num_blocks = ((N + NT * 2 - 1) / (NT * 2)); + ptr[0] = ec.getGPUContext(0).allocate(getName(), LibMatrixCUDA.sizeOfDataType * num_blocks, false); } - ptr[0] = ec.getGPUContext(0).allocate(getName(), LibMatrixCUDA.sizeOfDataType * num_blocks); + else + ptr[0] = ec.getGPUContext(0).allocate(getName(), LibMatrixCUDA.sizeOfDataType * num_blocks, true); writeMatrixDescriptorToBuffer(buf, 1, 1, 0, 0, GPUObject.getPointerAddress(ptr[0]), 1); } else { diff --git a/src/main/java/org/apache/sysds/runtime/codegen/SpoofCUDARowwise.java b/src/main/java/org/apache/sysds/runtime/codegen/SpoofCUDARowwise.java index 2632faeb84f..47826a94618 100644 --- a/src/main/java/org/apache/sysds/runtime/codegen/SpoofCUDARowwise.java +++ b/src/main/java/org/apache/sysds/runtime/codegen/SpoofCUDARowwise.java @@ -77,7 +77,7 @@ public MatrixObject execute(ExecutionContext ec, ArrayList inputs, hasMatrixObjectSideInput(inputs) ? getMinColsMatrixObjectSideInputs(inputs) : -1; OutputDimensions out_dims = new OutputDimensions(m, n, n2); ec.setMetaData(outputName, out_dims.rows, out_dims.cols); - MatrixObject out_obj = ec.getDenseMatrixOutputForGPUInstruction(outputName, out_dims.rows, out_dims.cols).getKey(); + MatrixObject out_obj = ec.getDenseMatrixOutputForGPUInstruction(outputName, out_dims.rows, out_dims.cols, false).getKey(); packDataForTransfer(ec, inputs, scalarObjects, out_obj, 1, ID, 0,_tB1, null); diff --git a/src/main/java/org/apache/sysds/runtime/controlprogram/context/ExecutionContext.java b/src/main/java/org/apache/sysds/runtime/controlprogram/context/ExecutionContext.java index 925b34b46d7..e398abd32b0 100644 --- a/src/main/java/org/apache/sysds/runtime/controlprogram/context/ExecutionContext.java +++ b/src/main/java/org/apache/sysds/runtime/controlprogram/context/ExecutionContext.java @@ -373,8 +373,14 @@ private static long validateDimensions(long d1, long d2) { * @return a pair containing the wrapping {@link MatrixObject} and a boolean indicating whether a cuda memory allocation took place (as opposed to the space already being allocated) */ public Pair getDenseMatrixOutputForGPUInstruction(String varName, long numRows, long numCols) { + return getDenseMatrixOutputForGPUInstruction(varName, numRows, numCols, true); + } + + public Pair getDenseMatrixOutputForGPUInstruction(String varName, long numRows, long numCols, + boolean initialize) + { MatrixObject mo = allocateGPUMatrixObject(varName, numRows, numCols); - boolean allocated = mo.getGPUObject(getGPUContext(0)).acquireDeviceModifyDense(); + boolean allocated = mo.getGPUObject(getGPUContext(0)).acquireDeviceModifyDense(initialize); mo.getDataCharacteristics().setNonZeros(-1); return new Pair<>(mo, allocated); } @@ -390,9 +396,15 @@ public Pair getDenseMatrixOutputForGPUInstruction(String * @return matrix object */ public Pair getSparseMatrixOutputForGPUInstruction(String varName, long numRows, long numCols, long nnz) { + return getSparseMatrixOutputForGPUInstruction(varName, numRows, numCols, nnz, true); + } + + public Pair getSparseMatrixOutputForGPUInstruction(String varName, long numRows, long numCols, + long nnz, boolean initialize) + { MatrixObject mo = allocateGPUMatrixObject(varName, numRows, numCols); mo.getDataCharacteristics().setNonZeros(nnz); - boolean allocated = mo.getGPUObject(getGPUContext(0)).acquireDeviceModifySparse(); + boolean allocated = mo.getGPUObject(getGPUContext(0)).acquireDeviceModifySparse(initialize); return new Pair<>(mo, allocated); } diff --git a/src/main/java/org/apache/sysds/runtime/instructions/gpu/DnnGPUInstruction.java b/src/main/java/org/apache/sysds/runtime/instructions/gpu/DnnGPUInstruction.java index 3ac1b5ad07a..285906a5241 100644 --- a/src/main/java/org/apache/sysds/runtime/instructions/gpu/DnnGPUInstruction.java +++ b/src/main/java/org/apache/sysds/runtime/instructions/gpu/DnnGPUInstruction.java @@ -637,7 +637,8 @@ private void processLstmBackwardInstruction(ExecutionContext ec) throws DMLRunti int D = toInt(numRowsW) - M; // since W:(D+M, 4M) ... numFeatures Pointer sysdsWPointer = LibMatrixCuDNN.getDensePointerForCuDNN(gCtx, W, instructionName, D+M, 4*M); Pointer sysdsBiasPointer = LibMatrixCuDNN.getDensePointerForCuDNN(gCtx, bias, instructionName, 1, 4*M); - Pointer cudnnWPointer = gCtx.allocate(instructionName, (D+M+2)*(4*M)*LibMatrixCUDA.sizeOfDataType); + // TODO: find out if memset is necessary + Pointer cudnnWPointer = gCtx.allocate(instructionName, (D+M+2)*(4*M)*LibMatrixCUDA.sizeOfDataType, true); LibMatrixCUDA.getCudaKernels(gCtx).launchKernel("prepare_lstm_weight", ExecutionConfig.getConfigForSimpleVectorOperations((D+M+2)*(4*M)), sysdsWPointer, sysdsBiasPointer, cudnnWPointer, D, M); @@ -650,7 +651,7 @@ private void processLstmBackwardInstruction(ExecutionContext ec) throws DMLRunti int N = toInt(X.getNumRows()); // batchSize .. since X:(N, T*D) long numColsX = X.getNumColumns(); int T = toInt(numColsX/ D); // since X:(N, T*D) ... seqLength - Pointer cudnnInput = gCtx.allocate(instructionName, (N*T*D)*LibMatrixCUDA.sizeOfDataType); + Pointer cudnnInput = gCtx.allocate(instructionName, (N*T*D)*LibMatrixCUDA.sizeOfDataType, false); LibMatrixCUDA.getCudaKernels(gCtx).launchKernel("prepare_lstm_input", ExecutionConfig.getConfigForSimpleVectorOperations(N*T*D), xPointer, cudnnInput, N, D, T*D, N*T*D); @@ -702,7 +703,7 @@ private void processLstmInstruction(ExecutionContext ec) throws DMLRuntimeExcept int D = toInt(numRowsW) - M; // since W:(D+M, 4M) ... numFeatures Pointer sysdsWPointer = LibMatrixCuDNN.getDensePointerForCuDNN(gCtx, W, instructionName, D+M, 4*M); Pointer sysdsBiasPointer = LibMatrixCuDNN.getDensePointerForCuDNN(gCtx, bias, instructionName, 1, 4*M); - Pointer cudnnWPointer = gCtx.allocate(instructionName, (D+M+2)*(4*M)*LibMatrixCUDA.sizeOfDataType); + Pointer cudnnWPointer = gCtx.allocate(instructionName, (D+M+2)*(4*M)*LibMatrixCUDA.sizeOfDataType, false); LibMatrixCUDA.getCudaKernels(gCtx).launchKernel("prepare_lstm_weight", ExecutionConfig.getConfigForSimpleVectorOperations((D+M+2)*(4*M)), sysdsWPointer, sysdsBiasPointer, cudnnWPointer, D, M); @@ -717,7 +718,7 @@ private void processLstmInstruction(ExecutionContext ec) throws DMLRuntimeExcept int N = toInt(X.getNumRows()); // batchSize .. since X:(N, T*D) long numColsX = X.getNumColumns(); int T = toInt(numColsX/ D); // since X:(N, T*D) ... seqLength - Pointer cudnnInput = gCtx.allocate(instructionName, (N*T*D)*LibMatrixCUDA.sizeOfDataType); + Pointer cudnnInput = gCtx.allocate(instructionName, (N*T*D)*LibMatrixCUDA.sizeOfDataType, false); LibMatrixCUDA.getCudaKernels(gCtx).launchKernel("prepare_lstm_input", ExecutionConfig.getConfigForSimpleVectorOperations(N*T*D), xPointer, cudnnInput, N, D, T*D, N*T*D); diff --git a/src/main/java/org/apache/sysds/runtime/instructions/gpu/GPUInstruction.java b/src/main/java/org/apache/sysds/runtime/instructions/gpu/GPUInstruction.java index 700145b55c1..50afdcfca8f 100644 --- a/src/main/java/org/apache/sysds/runtime/instructions/gpu/GPUInstruction.java +++ b/src/main/java/org/apache/sysds/runtime/instructions/gpu/GPUInstruction.java @@ -252,7 +252,13 @@ protected MatrixObject getMatrixInputForGPUInstruction(ExecutionContext ec, Stri * @return the matrix object */ protected MatrixObject getDenseMatrixOutputForGPUInstruction(ExecutionContext ec, String name, long numRows, long numCols) { - return ec.getDenseMatrixOutputForGPUInstruction(name, numRows, numCols).getKey(); + return getDenseMatrixOutputForGPUInstruction(ec, name, numRows, numCols, true); + } + + protected MatrixObject getDenseMatrixOutputForGPUInstruction(ExecutionContext ec, String name, long numRows, long numCols, + boolean initialize) + { + return ec.getDenseMatrixOutputForGPUInstruction(name, numRows, numCols, initialize).getKey(); } @Override diff --git a/src/main/java/org/apache/sysds/runtime/instructions/gpu/MatrixReshapeGPUInstruction.java b/src/main/java/org/apache/sysds/runtime/instructions/gpu/MatrixReshapeGPUInstruction.java index 97d39cc7ba7..f82d712b45f 100644 --- a/src/main/java/org/apache/sysds/runtime/instructions/gpu/MatrixReshapeGPUInstruction.java +++ b/src/main/java/org/apache/sysds/runtime/instructions/gpu/MatrixReshapeGPUInstruction.java @@ -85,7 +85,8 @@ public void processInstruction(ExecutionContext ec) { } // We currently support only dense rshape Pointer inPtr = LibMatrixCUDA.getDensePointer(gCtx, mat, instName); - MatrixObject out = LibMatrixCUDA.getDenseMatrixOutputForGPUInstruction(ec, instName, _output.getName(), rows, cols); + MatrixObject out = LibMatrixCUDA.getDenseMatrixOutputForGPUInstruction(ec, instName, _output.getName(), rows, + cols, false); Pointer outPtr = LibMatrixCUDA.getDensePointer(gCtx, out, instName); if(byRow.getBooleanValue()) { // byrow = TRUE is simple memcpy and metadata update diff --git a/src/main/java/org/apache/sysds/runtime/instructions/gpu/context/CSRPointer.java b/src/main/java/org/apache/sysds/runtime/instructions/gpu/context/CSRPointer.java index 55d31302242..23ae4d1648e 100644 --- a/src/main/java/org/apache/sysds/runtime/instructions/gpu/context/CSRPointer.java +++ b/src/main/java/org/apache/sysds/runtime/instructions/gpu/context/CSRPointer.java @@ -265,12 +265,15 @@ public static CSRPointer allocateForMatrixMultiply(GPUContext gCtx, cusparseHand * @param gCtx a valid {@link GPUContext} * @param nnz2 number of non-zeroes * @param rows number of rows + * @param initialize memset to zero? * @return a {@link CSRPointer} instance that encapsulates the CSR matrix on GPU */ - public static CSRPointer allocateEmpty(GPUContext gCtx, long nnz2, long rows) { + public static CSRPointer allocateEmpty(GPUContext gCtx, long nnz2, long rows, boolean initialize) { LOG.trace("GPU : allocateEmpty from CSRPointer with nnz=" + nnz2 + " and rows=" + rows + ", GPUContext=" + gCtx); - if(nnz2 < 0) throw new DMLRuntimeException("Incorrect usage of internal API, number of non zeroes is less than 0 when trying to allocate sparse data on GPU"); - if(rows <= 0) throw new DMLRuntimeException("Incorrect usage of internal API, number of rows is less than or equal to 0 when trying to allocate sparse data on GPU"); + if(nnz2 < 0) throw new DMLRuntimeException("Incorrect usage of internal API, number of non zeroes is less " + + "than 0 when trying to allocate sparse data on GPU"); + if(rows <= 0) throw new DMLRuntimeException("Incorrect usage of internal API, number of rows is less than or " + + "equal to 0 when trying to allocate sparse data on GPU"); CSRPointer r = new CSRPointer(gCtx); r.nnz = nnz2; if (nnz2 == 0) { @@ -279,12 +282,16 @@ public static CSRPointer allocateEmpty(GPUContext gCtx, long nnz2, long rows) { return r; } // increment the cudaCount by 1 for the allocation of all 3 arrays - r.val = gCtx.allocate(null, getDataTypeSizeOf(nnz2)); - r.rowPtr = gCtx.allocate(null, getIntSizeOf(rows + 1)); - r.colInd = gCtx.allocate(null, getIntSizeOf(nnz2)); + r.val = gCtx.allocate(null, getDataTypeSizeOf(nnz2), initialize); + r.rowPtr = gCtx.allocate(null, getIntSizeOf(rows + 1), initialize); + r.colInd = gCtx.allocate(null, getIntSizeOf(nnz2), initialize); return r; } + public static CSRPointer allocateEmpty(GPUContext gCtx, long nnz2, long rows) { + return allocateEmpty(gCtx, nnz2, rows, true); + } + /** * Allocate row pointers of m+1 elements * @@ -296,9 +303,9 @@ public static CSRPointer allocateEmpty(GPUContext gCtx, long nnz2, long rows) { private static void step1AllocateRowPointers(GPUContext gCtx, cusparseHandle handle, CSRPointer C, int rowsC) { LOG.trace("GPU : step1AllocateRowPointers" + ", GPUContext=" + gCtx); cusparseSetPointerMode(handle, cusparsePointerMode.CUSPARSE_POINTER_MODE_HOST); - //cudaDeviceSynchronize; + // Do not increment the cudaCount of allocations on GPU - C.rowPtr = gCtx.allocate(null, getIntSizeOf((long) rowsC + 1)); + C.rowPtr = gCtx.allocate(null, getIntSizeOf((long) rowsC + 1), true); } /** @@ -375,8 +382,9 @@ private static void step2GatherNNZGemm(GPUContext gCtx, cusparseHandle handle, C private static void step3AllocateValNInd(GPUContext gCtx, cusparseHandle handle, CSRPointer C) { LOG.trace("GPU : step3AllocateValNInd" + ", GPUContext=" + gCtx); // Increment cudaCount by one when all three arrays of CSR sparse array are allocated - C.val = gCtx.allocate(null, getDataTypeSizeOf(C.nnz)); - C.colInd = gCtx.allocate(null, getIntSizeOf(C.nnz)); + + C.val = gCtx.allocate(null, getDataTypeSizeOf(C.nnz), false); + C.colInd = gCtx.allocate(null, getIntSizeOf(C.nnz), false); } // ============================================================================================== @@ -402,17 +410,17 @@ public CSRPointer clone(int rows) { CSRPointer that = new CSRPointer(me.getGPUContext()); that.allocateMatDescrPointer(); that.nnz = me.nnz; - that.val = allocate(that.nnz * LibMatrixCUDA.sizeOfDataType); - that.rowPtr = allocate(rows * Sizeof.INT); - that.colInd = allocate(that.nnz * Sizeof.INT); + that.val = allocate(that.nnz * LibMatrixCUDA.sizeOfDataType, false); + that.rowPtr = allocate(rows * Sizeof.INT, false); + that.colInd = allocate(that.nnz * Sizeof.INT, false); cudaMemcpy(that.val, me.val, that.nnz * LibMatrixCUDA.sizeOfDataType, cudaMemcpyDeviceToDevice); cudaMemcpy(that.rowPtr, me.rowPtr, rows * Sizeof.INT, cudaMemcpyDeviceToDevice); cudaMemcpy(that.colInd, me.colInd, that.nnz * Sizeof.INT, cudaMemcpyDeviceToDevice); return that; } - private Pointer allocate(long size) { - return getGPUContext().allocate(null, size); + private Pointer allocate(long size, boolean initialize) { + return getGPUContext().allocate(null, size, initialize); } private GPUContext getGPUContext() { @@ -460,7 +468,7 @@ public Pointer toColumnMajorDenseMatrix(cusparseHandle cusparseHandle, cublasHan LOG.trace("GPU : sparse -> column major dense (inside CSRPointer) on " + this + ", GPUContext=" + getGPUContext()); long size = rows * getDataTypeSizeOf(cols); - Pointer A = allocate(size); + Pointer A = allocate(size, false); // If this sparse block is empty, the allocated dense matrix, initialized to zeroes, will be returned. if (val != null && rowPtr != null && colInd != null && nnz > 0) { // Note: cusparseDcsr2dense method cannot handle empty blocks diff --git a/src/main/java/org/apache/sysds/runtime/instructions/gpu/context/GPUContext.java b/src/main/java/org/apache/sysds/runtime/instructions/gpu/context/GPUContext.java index 7ad030bb9a7..5ad9bc52a33 100644 --- a/src/main/java/org/apache/sysds/runtime/instructions/gpu/context/GPUContext.java +++ b/src/main/java/org/apache/sysds/runtime/instructions/gpu/context/GPUContext.java @@ -82,13 +82,13 @@ public class GPUContext { /** * cusolverDnHandle for invoking solve() function on dense matrices on the GPU */ - private cusolverDnHandle cusolverDnHandle; + private volatile cusolverDnHandle cusolverDnHandle; /** * to launch custom CUDA kernel, specific to the active GPU for this GPUContext */ private JCudaKernels kernels; - private GPUMemoryManager memoryManager; + private final GPUMemoryManager memoryManager; public GPUMemoryManager getMemoryManager() { return memoryManager; @@ -96,6 +96,7 @@ public GPUMemoryManager getMemoryManager() { protected GPUContext(int deviceNum) { this.deviceNum = deviceNum; + cudaSetDevice(deviceNum); cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync); @@ -175,7 +176,7 @@ public int getDeviceNum() { * Sets the device for the calling thread. * This method must be called after * {@link org.apache.sysds.runtime.controlprogram.context.ExecutionContext#getGPUContext(int)} - * If in a multi-threaded environment like parfor, this method must be called when in the + * If in a multithreaded environment like parfor, this method must be called when in the * appropriate thread. * */ @@ -187,18 +188,32 @@ public void initializeThread() { /** * Invokes memory manager's malloc method * - * @param instructionName name of instruction for which to record per instruction performance statistics, null if don't want to record + * @param instructionName name of instruction for which to record per instruction performance statistics, null if + * you don't want to record * @param size size of data (in bytes) to allocate + * @param initialize if cudaMemset() should be called + * @return jcuda pointer + */ + public Pointer allocate(String instructionName, long size, boolean initialize) { + return memoryManager.malloc(instructionName, size, initialize); + } + + /** + * Default behavior for gpu memory allocation (init to zero) + * + * @param instructionName Name of the instruction calling allocate + * @param size size in bytes * @return jcuda pointer */ public Pointer allocate(String instructionName, long size) { - return memoryManager.malloc(instructionName, size); + return memoryManager.malloc(instructionName, size, true); } /** * Does cudaFree calls, lazily. * - * @param instructionName name of the instruction for which to record per instruction free time, null if do not want to record + * @param instructionName name of the instruction for which to record per instruction free time, null if you do not + * want to record * @param toFree {@link Pointer} instance to be freed * @param eager true if to be done eagerly */ @@ -389,7 +404,7 @@ public JCudaKernels getKernels() { */ public void destroy() { if (LOG.isTraceEnabled()) { - LOG.trace("GPU : this context was destroyed, this = " + this.toString()); + LOG.trace("GPU : this context was destroyed, this = " + this); } clearMemory(); diff --git a/src/main/java/org/apache/sysds/runtime/instructions/gpu/context/GPUMemoryManager.java b/src/main/java/org/apache/sysds/runtime/instructions/gpu/context/GPUMemoryManager.java index 7df6214e5b7..d6b62ad8907 100644 --- a/src/main/java/org/apache/sysds/runtime/instructions/gpu/context/GPUMemoryManager.java +++ b/src/main/java/org/apache/sysds/runtime/instructions/gpu/context/GPUMemoryManager.java @@ -245,9 +245,10 @@ private static String byteCountToDisplaySize(long numBytes) { * * @param opcode instruction name * @param size size in bytes + * @param initialize if cudaMemset() should be called * @return allocated pointer */ - public Pointer malloc(String opcode, long size) { + public Pointer malloc(String opcode, long size, boolean initialize) { if(size < 0) { throw new DMLRuntimeException("Cannot allocate memory of size " + byteCountToDisplaySize(size)); } @@ -432,7 +433,6 @@ public Pointer malloc(String opcode, long size) { } } - // Step 8: Handle defragmentation if(A == null) { LOG.warn("Potential fragmentation of the GPU memory. Forcibly evicting all ..."); @@ -448,11 +448,12 @@ public Pointer malloc(String opcode, long size) { } long t0 = DMLScript.STATISTICS ? System.nanoTime() : 0; - cudaMemset(A, 0, size); + if(initialize) + cudaMemset(A, 0, size); addMiscTime(opcode, GPUStatistics.cudaMemSet0Time, GPUStatistics.cudaMemSet0Count, GPUInstruction.MISC_TIMER_SET_ZERO, t0); return A; } - + private int worstCaseContiguousMemorySizeCompare(GPUObject o1, GPUObject o2) { long ret = matrixMemoryManager.getWorstCaseContiguousMemorySize(o1) - matrixMemoryManager.getWorstCaseContiguousMemorySize(o2); return ret < 0 ? -1 : (ret == 0 ? 0 : 1); diff --git a/src/main/java/org/apache/sysds/runtime/instructions/gpu/context/GPUObject.java b/src/main/java/org/apache/sysds/runtime/instructions/gpu/context/GPUObject.java index 633e3fc7d26..5ea922b99f4 100644 --- a/src/main/java/org/apache/sysds/runtime/instructions/gpu/context/GPUObject.java +++ b/src/main/java/org/apache/sysds/runtime/instructions/gpu/context/GPUObject.java @@ -202,7 +202,7 @@ public Object clone() { long rows = me.mat.getNumRows(); long cols = me.mat.getNumColumns(); long size = rows * cols * LibMatrixCUDA.sizeOfDataType; - that.setDensePointer(allocate(size)); + that.setDensePointer(allocate(size, false)); cudaMemcpy(that.getDensePointer(), me.getDensePointer(), size, cudaMemcpyDeviceToDevice); } @@ -218,17 +218,15 @@ public Object clone() { return that; } - private Pointer allocate(long size) { - return getGPUContext().allocate(null, size); + private Pointer allocate(long size, boolean initialize) { + return getGPUContext().allocate(null, size, initialize); } private void cudaFreeHelper(Pointer toFree) throws DMLRuntimeException { getGPUContext().cudaFreeHelper(null, toFree, DMLScript.EAGER_CUDA_FREE); } - GPUContext getGPUContext() { - return gpuContext; - } + GPUContext getGPUContext() { return gpuContext; } /** * Transposes a dense matrix on the GPU by calling the cublasDgeam operation @@ -248,7 +246,7 @@ public static Pointer transpose(GPUContext gCtx, Pointer densePtr, int m, int n, Pointer alpha = LibMatrixCUDA.one(); Pointer beta = LibMatrixCUDA.zero(); Pointer A = densePtr; - Pointer C = gCtx.allocate(null, m * getDatatypeSizeOf(n)); + Pointer C = gCtx.allocate(null, m * getDatatypeSizeOf(n), false); // Transpose the matrix to get a dense matrix LibMatrixCUDA.cudaSupportFunctions.cublasgeam(gCtx.getCublasHandle(), CUBLAS_OP_T, CUBLAS_OP_T, m, n, alpha, A, lda, beta, new Pointer(), @@ -276,8 +274,8 @@ public static CSRPointer columnMajorDenseToRowMajorSparse(GPUContext gCtx, cuspa Pointer nnzPerRowPtr = null; Pointer nnzTotalDevHostPtr = null; - nnzPerRowPtr = gCtx.allocate(null, getIntSizeOf(rows)); - nnzTotalDevHostPtr = gCtx.allocate(null, getIntSizeOf(1)); + nnzPerRowPtr = gCtx.allocate(null, getIntSizeOf(rows), false); + nnzTotalDevHostPtr = gCtx.allocate(null, getIntSizeOf(1), false); // Output is in dense vector format, convert it to CSR LibMatrixCUDA.cudaSupportFunctions.cusparsennz(cusparseHandle, cusparseDirection.CUSPARSE_DIRECTION_ROW, rows, cols, matDescr, densePtr, rows, @@ -526,13 +524,9 @@ public void allocateAndFillDense(double v) { long cols = mat.getNumColumns(); int numElems = toIntExact(rows * cols); long size = getDatatypeSizeOf(numElems); - setDensePointer(allocate(size)); - // The "fill" kernel is called which treats the matrix "jcudaDensePtr" like a vector and fills it with value "v" - // If the fill value is 0, no need to call the special kernel, the allocate memsets the allocated region to 0 - if (v != 0) - getGPUContext().getKernels() - .launchKernel("fill", ExecutionConfig.getConfigForSimpleVectorOperations(numElems), - getDensePointer(), v, numElems); + setDensePointer(allocate(size,false)); + getGPUContext().getKernels().launchKernel("fill", ExecutionConfig.getConfigForSimpleVectorOperations + (numElems), getDensePointer(), v, numElems); } /** @@ -574,8 +568,8 @@ public long getNnz(String instName, boolean recomputeDenseNNZ) { int cols = toIntExact(mat.getNumColumns()); Pointer nnzPerRowPtr = null; Pointer nnzTotalDevHostPtr = null; - nnzPerRowPtr = gCtx.allocate(instName, getIntSizeOf(rows)); - nnzTotalDevHostPtr = gCtx.allocate(instName, getIntSizeOf(1)); + nnzPerRowPtr = gCtx.allocate(instName, getIntSizeOf(rows), false); + nnzTotalDevHostPtr = gCtx.allocate(instName, getIntSizeOf(1), false); LibMatrixCUDA.cudaSupportFunctions.cusparsennz(cusparseHandle, cusparseDirection.CUSPARSE_DIRECTION_ROW, rows, cols, matDescr, getDensePointer(), rows, nnzPerRowPtr, nnzTotalDevHostPtr); int[] nnzC = { -1 }; @@ -613,6 +607,10 @@ public boolean acquireDeviceRead(String opcode) { } public boolean acquireDeviceModifyDense() { + return acquireDeviceModifyDense(true); + } + + public boolean acquireDeviceModifyDense(boolean initialize) { if(LOG.isTraceEnabled()) { LOG.trace("GPU : acquireDeviceModifyDense on " + this + ", GPUContext=" + getGPUContext()); } @@ -623,7 +621,7 @@ public boolean acquireDeviceModifyDense() { LOG.trace("GPU : data is not allocated, allocating a dense block, on " + this); } // Dense block, size = numRows * numCols - allocateDenseMatrixOnDevice(); + allocateDenseMatrixOnDevice(initialize); allocated = true; } dirty = true; @@ -633,6 +631,10 @@ public boolean acquireDeviceModifyDense() { } public boolean acquireDeviceModifySparse() { + return acquireDeviceModifySparse(true); + } + + public boolean acquireDeviceModifySparse(boolean initialize) { if(LOG.isTraceEnabled()) { LOG.trace("GPU : acquireDeviceModifySparse on " + this + ", GPUContext=" + getGPUContext()); } @@ -643,7 +645,7 @@ public boolean acquireDeviceModifySparse() { LOG.trace("GPU : data is not allocated, allocating a sparse block, on " + this); } mat.setDirty(true); - allocateSparseMatrixOnDevice(); + allocateSparseMatrixOnDevice(initialize); allocated = true; } dirty = true; @@ -753,7 +755,7 @@ public void releaseOutput() { throw new DMLRuntimeException("Attempting to release an output before allocating it"); } - void allocateDenseMatrixOnDevice() { + void allocateDenseMatrixOnDevice(boolean initialize) { if(LOG.isTraceEnabled()) { LOG.trace("GPU : allocateDenseMatrixOnDevice, on " + this + ", GPUContext=" + getGPUContext()); } @@ -766,11 +768,11 @@ void allocateDenseMatrixOnDevice() { if(cols <= 0) throw new DMLRuntimeException("Internal error - invalid number of columns when allocating dense matrix:" + cols); long size = getDatatypeSizeOf(rows * cols); - Pointer tmp = allocate(size); + Pointer tmp = allocate(size, initialize); setDensePointer(tmp); } - void allocateSparseMatrixOnDevice() { + void allocateSparseMatrixOnDevice(boolean initialize) { if(LOG.isTraceEnabled()) { LOG.trace("GPU : allocateSparseMatrixOnDevice, on " + this + ", GPUContext=" + getGPUContext()); } @@ -782,16 +784,16 @@ void allocateSparseMatrixOnDevice() { throw new DMLRuntimeException("Internal error - invalid number of rows when allocating sparse matrix"); if(nnz < 0) throw new DMLRuntimeException("Internal error - invalid number of non zeroes when allocating a sparse matrix"); - CSRPointer tmp = CSRPointer.allocateEmpty(getGPUContext(), nnz, rows); + CSRPointer tmp = CSRPointer.allocateEmpty(getGPUContext(), nnz, rows, initialize); setSparseMatrixCudaPointer(tmp); } - void allocateSparseMatrixOnDevice(long numVals) { + void allocateSparseMatrixOnDevice(long numVals, boolean initialize) { // This method is called when #values > nnz if(LOG.isTraceEnabled()) { LOG.trace("GPU : allocateSparseMatrixOnDevice, on " + this + ", GPUContext=" + getGPUContext()); } - if(isAllocated()) + if(isAllocated()) throw new DMLRuntimeException("Internal error - trying to allocated sparse matrix to a GPUObject that is already allocated"); long rows = mat.getNumRows(); long nnz = mat.getNnz(); @@ -799,7 +801,7 @@ void allocateSparseMatrixOnDevice(long numVals) { throw new DMLRuntimeException("Internal error - invalid number of rows when allocating sparse matrix"); if(nnz < 0) throw new DMLRuntimeException("Internal error - invalid number of non zeroes when allocating a sparse matrix"); - CSRPointer tmp = CSRPointer.allocateEmpty(getGPUContext(), numVals, rows); + CSRPointer tmp = CSRPointer.allocateEmpty(getGPUContext(), numVals, rows, initialize); setSparseMatrixCudaPointer(tmp); } @@ -883,11 +885,11 @@ void copyFromHostToDevice(String opcode) { if (values != null) if(values.length > tmp.getNonZeros()) - allocateSparseMatrixOnDevice(values.length); + allocateSparseMatrixOnDevice(values.length, false); else - allocateSparseMatrixOnDevice(); + allocateSparseMatrixOnDevice(false); else - allocateSparseMatrixOnDevice(); + allocateSparseMatrixOnDevice(false); if (copyToDevice) { CSRPointer.copyToDevice(getGPUContext(), getJcudaSparseMatrixPtr(), @@ -901,7 +903,7 @@ void copyFromHostToDevice(String opcode) { else if (data == null && tmp.getNonZeros() != 0) throw new DMLRuntimeException("MatrixBlock is not allocated"); - allocateDenseMatrixOnDevice(); + allocateDenseMatrixOnDevice(false); if (tmp.getNonZeros() == 0) { // Minor optimization: No need to allocate empty error for CPU diff --git a/src/main/java/org/apache/sysds/runtime/instructions/gpu/context/ShadowBuffer.java b/src/main/java/org/apache/sysds/runtime/instructions/gpu/context/ShadowBuffer.java index 13279a55df6..0eba39134e2 100644 --- a/src/main/java/org/apache/sysds/runtime/instructions/gpu/context/ShadowBuffer.java +++ b/src/main/java/org/apache/sysds/runtime/instructions/gpu/context/ShadowBuffer.java @@ -103,7 +103,7 @@ public void moveToHost() { public void moveToDevice() { long start = DMLScript.STATISTICS ? System.nanoTime() : 0; long numBytes = shadowPointer.length*LibMatrixCUDA.sizeOfDataType; - gpuObj.jcudaDenseMatrixPtr = gpuObj.getGPUContext().allocate(null, numBytes); + gpuObj.jcudaDenseMatrixPtr = gpuObj.getGPUContext().allocate(null, numBytes, false); cudaMemcpy(gpuObj.jcudaDenseMatrixPtr, Pointer.to(shadowPointer), numBytes, jcuda.runtime.cudaMemcpyKind.cudaMemcpyHostToDevice); clearShadowPointer(); if (DMLScript.STATISTICS) { diff --git a/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixCUDA.java b/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixCUDA.java index f6e950ab810..7a5db2141c7 100644 --- a/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixCUDA.java +++ b/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixCUDA.java @@ -377,7 +377,7 @@ public static void channelSums(GPUContext gCtx, String instName, MatrixObject in Pointer outputPointer = getDensePointer(gCtx, outputBlock, instName); // We can replace this with CuDNN tensor reduce - Pointer tmp = gCtx.allocate(instName, cols*sizeOfDataType); + Pointer tmp = gCtx.allocate(instName, (long) cols * sizeOfDataType, false); reduceCol(gCtx, instName, "reduce_col_sum", imagePointer, tmp, N, cols); reduceRow(gCtx, instName, "reduce_row_sum", tmp, outputPointer, toInt(C), toInt(HW)); gCtx.cudaFreeHelper(instName, tmp, DMLScript.EAGER_CUDA_FREE); @@ -718,7 +718,7 @@ else if (indexFn instanceof ReduceCol) { // ROW{SUM, MAX,...} } case OP_PLUS_SQ : { // Calculate the squares in a temporary object tmp - Pointer tmp = gCtx.allocate(instName, size * sizeOfDataType); + Pointer tmp = gCtx.allocate(instName, (long) size * sizeOfDataType, false); squareMatrix(gCtx, instName, in, tmp, rlen, clen); // Then do the sum on the temporary object and free it @@ -817,8 +817,8 @@ else if (indexFn instanceof ReduceCol) { // ROW{SUM, MAX,...} } case OP_VARIANCE : { // Temporary GPU array for - Pointer tmp = gCtx.allocate(instName, size * sizeOfDataType); - Pointer tmp2 = gCtx.allocate(instName, size * sizeOfDataType); + Pointer tmp = gCtx.allocate(instName, (long) size * sizeOfDataType, false); + Pointer tmp2 = gCtx.allocate(instName, (long) size * sizeOfDataType, false); switch(reductionDirection) { @@ -846,7 +846,7 @@ else if (indexFn instanceof ReduceCol) { // ROW{SUM, MAX,...} squareMatrix(gCtx, instName, tmp, tmp2, rlen, clen); - Pointer tmpRow = gCtx.allocate(instName, rlen * sizeOfDataType); + Pointer tmpRow = gCtx.allocate(instName, (long) rlen * sizeOfDataType, false); reduceRow(gCtx, instName, "reduce_row_sum", tmp2, tmpRow, rlen, clen); ScalarOperator divideOp = new RightScalarOperator(Divide.getDivideFnObject(), clen - 1); @@ -864,7 +864,7 @@ else if (indexFn instanceof ReduceCol) { // ROW{SUM, MAX,...} squareMatrix(gCtx, instName, tmp, tmp2, rlen, clen); - Pointer tmpCol = gCtx.allocate(instName, clen * sizeOfDataType); + Pointer tmpCol = gCtx.allocate(instName, (long) clen * sizeOfDataType, false); reduceCol(gCtx, instName, "reduce_col_sum", tmp2, tmpCol, rlen, clen); ScalarOperator divideOp = new RightScalarOperator(Divide.getDivideFnObject(), rlen - 1); @@ -933,7 +933,7 @@ private static double reduceAll(GPUContext gCtx, String instName, String kernelF int[] tmp = getKernelParamsForReduceAll(gCtx, n); int blocks = tmp[0], threads = tmp[1], sharedMem = tmp[2]; - Pointer tempOut = gCtx.allocate(instName, (long) blocks * sizeOfDataType); + Pointer tempOut = gCtx.allocate(instName, (long) blocks * sizeOfDataType, false); getCudaKernels(gCtx).launchKernel(kernelFunction, new ExecutionConfig(blocks, threads, sharedMem), in, tempOut, n); @@ -2298,7 +2298,7 @@ public static void cumulativeScan(ExecutionContext ec, GPUContext gCtx, String i Pointer input = getDensePointer(gCtx, in, instName); Pointer output = getDensePointer(gCtx, out, instName); // storage for last value of each block - Pointer blk_res = gCtx.allocate(instName, cols * blocks_y * sizeOfDataType); + Pointer blk_res = gCtx.allocate(instName, (long) cols * blocks_y * sizeOfDataType, false); alloc_duration = printKernelTiming(time, "allocation of temporary buffer (" + cols * blocks_y * sizeOfDataType + " bytes)", alloc_duration, 0); @@ -2388,9 +2388,9 @@ public static void cumulativeSumProduct(ExecutionContext ec, GPUContext gCtx, long total_mem_size = 0; while( cascade_blocks > 0) { - long buf_size = 2 * block_height * cascade_blocks * sizeOfDataType; + long buf_size = 2L * block_height * cascade_blocks * sizeOfDataType; total_mem_size += buf_size; - intermediate_buffers.add(gCtx.allocate(instName, buf_size)); + intermediate_buffers.add(gCtx.allocate(instName, buf_size, false)); cascade_blocks = (cascade_blocks + block_height - 2) / block_height; if(cascade_blocks > 0) cb_list.add(cascade_blocks); @@ -2652,9 +2652,9 @@ public static void solve(ExecutionContext ec, GPUContext gCtx, String instName, cudaSupportFunctions.cusolverDngeqrf_bufferSize(gCtx.getCusolverDnHandle(), m, n, A, m, lwork); // step 4: compute QR factorization - Pointer work = gCtx.allocate(instName, lwork[0] * sizeOfDataType); - Pointer tau = gCtx.allocate(instName, m * sizeOfDataType); - Pointer devInfo = gCtx.allocate(instName, Sizeof.INT); + Pointer work = gCtx.allocate(instName, (long) lwork[0] * sizeOfDataType, false); + Pointer tau = gCtx.allocate(instName, (long) m * sizeOfDataType, false); + Pointer devInfo = gCtx.allocate(instName, Sizeof.INT, false); cudaSupportFunctions.cusolverDngeqrf(gCtx.getCusolverDnHandle(), m, n, A, m, tau, work, lwork[0], devInfo); int[] qrError = {-1}; @@ -2704,8 +2704,16 @@ public static void solve(ExecutionContext ec, GPUContext gCtx, String instName, * @param numCols number of columns of output matrix object * @return the matrix object */ - public static MatrixObject getDenseMatrixOutputForGPUInstruction(ExecutionContext ec, String instName, String name, long numRows, long numCols) { - return ec.getDenseMatrixOutputForGPUInstruction(name, numRows, numCols).getKey(); + public static MatrixObject getDenseMatrixOutputForGPUInstruction(ExecutionContext ec, String instName, String name, + long numRows, long numCols) + { + return getDenseMatrixOutputForGPUInstruction(ec, instName, name, numRows, numCols, true); + } + + public static MatrixObject getDenseMatrixOutputForGPUInstruction(ExecutionContext ec, String instName, String name, + long numRows, long numCols, boolean initialize) + { + return ec.getDenseMatrixOutputForGPUInstruction(name, numRows, numCols, initialize).getKey(); } /** @@ -2717,12 +2725,22 @@ public static MatrixObject getDenseMatrixOutputForGPUInstruction(ExecutionContex * @param nnz number of non zeroes in output matrix * @param instName the invoking instruction's name for record {@link Statistics}. * @param name name of input matrix (that the {@link ExecutionContext} is aware of) + * @param initialize memset to zero? + * * @return the matrix object */ - private static MatrixObject getSparseMatrixOutputForGPUInstruction(ExecutionContext ec, long numRows, long numCols, long nnz, String instName, String name) { - return ec.getSparseMatrixOutputForGPUInstruction(name, numRows, numCols, nnz).getKey(); + private static MatrixObject getSparseMatrixOutputForGPUInstruction(ExecutionContext ec, long numRows, long numCols, + long nnz, String instName, String name, boolean initialize) + { + return ec.getSparseMatrixOutputForGPUInstruction(name, numRows, numCols, nnz, initialize).getKey(); } - + + private static MatrixObject getSparseMatrixOutputForGPUInstruction(ExecutionContext ec, long numRows, long numCols, + long nnz, String instName, String name) + { + return getSparseMatrixOutputForGPUInstruction(ec, numRows, numCols, nnz, instName, name, true); + } + /** * Utility to compute number of non-zeroes on the GPU * diff --git a/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixCuDNN.java b/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixCuDNN.java index f85b9d0f298..d95e1bf8235 100644 --- a/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixCuDNN.java +++ b/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixCuDNN.java @@ -141,7 +141,7 @@ private static Pointer denseIm2col(GPUContext gCtx, String instName, MatrixObjec throw new DMLRuntimeException("Unknown number of nonzeroes in denseIm2col"); } else if(inPointer.nnz > 0) { - im2colPointer = gCtx.allocate(instName, C*R*S*N*P*Q*sizeOfDataType); + im2colPointer = gCtx.allocate(instName, C*R*S*N*P*Q*sizeOfDataType, false); getCudaKernels(gCtx).launchKernel("sparse_dense_im2col", ExecutionConfig.getConfigForSimpleVectorOperations(toInt(inPointer.nnz)), inPointer.val, inPointer.rowPtr, inPointer.colInd, im2colPointer, inPointer.nnz, N, C*H*W, H*W, W, R, S, P, Q, P*Q, R*S, N*P*Q, stride_h, stride_w, pad_h, pad_w); @@ -150,7 +150,7 @@ else if(inPointer.nnz > 0) { return null; } else { - im2colPointer = gCtx.allocate(instName, C*R*S*N*P*Q*sizeOfDataType); + im2colPointer = gCtx.allocate(instName, C*R*S*N*P*Q*sizeOfDataType, false); Pointer imagePointer = getDensePointerForCuDNN(gCtx, image, instName); getCudaKernels(gCtx).launchKernel("dense_dense_im2col", ExecutionConfig.getConfigForSimpleVectorOperations(toInt(N*C*H*W)), imagePointer, im2colPointer, N*C*H*W, @@ -212,7 +212,7 @@ public static void conv2d(GPUContext gCtx, String instName, MatrixObject image, // Perform matrix multiplication CSRPointer filterPointer = filter.getGPUObject(gCtx).getJcudaSparseMatrixPtr(); - Pointer matmultOutputPointer = gCtx.allocate(instName, NKPQ*sizeOfDataType); + Pointer matmultOutputPointer = gCtx.allocate(instName, NKPQ*sizeOfDataType, false); LibMatrixCuMatMult.sparseDenseMatMult(gCtx, instName, matmultOutputPointer, filterPointer, im2colPointer, K, CRS, CRS, NPQ, K, NPQ, false, false); gCtx.cudaFreeHelper(instName, im2colPointer, DMLScript.EAGER_CUDA_FREE); @@ -423,7 +423,7 @@ public static void conv2dBackwardFilter(GPUContext gCtx, String instName, Matrix try(LibMatrixCuDNNInputRowFetcher imgFetcher = new LibMatrixCuDNNInputRowFetcher(gCtx, instName, image); LibMatrixCuDNNInputRowFetcher doutFetcher = new LibMatrixCuDNNInputRowFetcher(gCtx, instName, dout)) { // Perform one-input conv2dBackwardFilter - Pointer tempdwPointer = gCtx.allocate(instName, KCRS*sizeOfDataType); + Pointer tempdwPointer = gCtx.allocate(instName, KCRS*sizeOfDataType, false); for(int n = 0; n < N; n++) { cudaMemset(tempdwPointer, 0, KCRS*sizeOfDataType); // Perform one-input conv2dBackwardFilter @@ -726,8 +726,8 @@ private static void cudnnPoolingBackwardHelper(GPUContext gCtx, String instName, pad_h, pad_w, stride_h, stride_w, P, Q, poolingType)) { int status; if(!isMaxPoolOutputProvided) { - long numBytes = N*C*P*Q*sizeOfDataType; - y = gCtx.allocate(instName, numBytes); + long numBytes = (long) N *C*P*Q*sizeOfDataType; + y = gCtx.allocate(instName, numBytes, false); status = cudnnPoolingForward(getCudnnHandle(gCtx), desc.poolingDesc, one(), desc.xDesc, x, zero(), desc.yDesc, y); if(status != jcuda.jcudnn.cudnnStatus.CUDNN_STATUS_SUCCESS) { throw new DMLRuntimeException("Could not executed cudnnPoolingForward before cudnnPoolingBackward: " + jcuda.jcudnn.cudnnStatus.stringFor(status)); @@ -845,8 +845,9 @@ private static void singleLayerUnidirectionalRNNForward(ExecutionContext ec, GPU String rnnMode, boolean return_sequences, int N, int M, int D, int T) throws DMLRuntimeException { boolean hasCarry = rnnMode.equalsIgnoreCase("lstm"); // Get output pointers - Pointer cudnnYPointer = gCtx.allocate(instName, N*T*M*sizeOfDataType); - Pointer hyPointer = !return_sequences ? getDenseOutputPointer(ec, gCtx, instName, outputName, N, M) : gCtx.allocate(instName, N*M*sizeOfDataType); + Pointer cudnnYPointer = gCtx.allocate(instName, (long) N *T*M*sizeOfDataType, false); + Pointer hyPointer = !return_sequences ? getDenseOutputPointer(ec, gCtx, instName, outputName, N, M) : gCtx.allocate(instName, + (long) N*M*sizeOfDataType, false); Pointer cyPointer = hasCarry ? getDenseOutputPointer(ec, gCtx, instName, cyName, N, M) : new Pointer(); // Pointer wPointer = getDensePointerForCuDNN(gCtx, w, instName, D+M+2, 4*M); @@ -878,16 +879,16 @@ public static void lstmBackward(ExecutionContext ec, GPUContext gCtx, String ins String dxName, String dwName, String dbName, String dhxName, String dcxName, // output boolean return_sequences, int N, int M, int D, int T) throws DMLRuntimeException { // Transform the input dout and prepare them for cudnnRNNBackwardData - Pointer dy = gCtx.allocate(instName, N*T*M*sizeOfDataType); + Pointer dy = gCtx.allocate(instName, (long) N *T*M*sizeOfDataType, false); int size = return_sequences ? N*T*M : N*M; LibMatrixCUDA.getCudaKernels(gCtx).launchKernel("prepare_lstm_backward_gradients", ExecutionConfig.getConfigForSimpleVectorOperations(size), - getDenseInputPointer(ec, gCtx, instName, doutName, N, return_sequences ? T*M : M), + getDenseInputPointer(ec, gCtx, instName, doutName, N, return_sequences ? (long) T*M : M), dy, N, T, M, size, return_sequences ? 1 : 0); ec.releaseMatrixInputForGPUInstruction(doutName); // Allocate intermediate pointers computed by forward - Pointer yPointer = gCtx.allocate(instName, N*T*M*sizeOfDataType); + Pointer yPointer = gCtx.allocate(instName, (long) N *T*M*sizeOfDataType, false); try(LibMatrixCuDNNRnnAlgorithm algo = new LibMatrixCuDNNRnnAlgorithm(ec, gCtx, instName, "lstm", N, T, M, D, true, wPointer)) { JCudnn.cudnnRNNForwardTraining(gCtx.getCudnnHandle(), algo.rnnDesc, T, algo.xDesc, x, @@ -900,7 +901,7 @@ algo.cyDesc, new Pointer(), algo.workSpace, algo.sizeInBytes, algo.reserveSpace, algo.reserveSpaceSizeInBytes); - Pointer cudnnDx = gCtx.allocate(instName, N*T*D*LibMatrixCUDA.sizeOfDataType); + Pointer cudnnDx = gCtx.allocate(instName, (long) N *T*D*LibMatrixCUDA.sizeOfDataType, false); JCudnn.cudnnRNNBackwardData(gCtx.getCudnnHandle(), algo.rnnDesc, T, algo.yDesc, yPointer, // ---------------------- @@ -933,7 +934,7 @@ algo.dcxDesc, getDenseOutputPointer(ec, gCtx, instName, dcxName, N, M), gCtx.cudaFreeHelper(instName, cudnnDx, DMLScript.EAGER_CUDA_FREE); // ------------------------------------------------------------------------------------------- - Pointer cudnnDwPointer = gCtx.allocate(instName, (D+M+2)*(4*M)*LibMatrixCUDA.sizeOfDataType); + Pointer cudnnDwPointer = gCtx.allocate(instName, (D+M+2)*(4L *M)*LibMatrixCUDA.sizeOfDataType, false); JCudnn.cudnnRNNBackwardWeights(gCtx.getCudnnHandle(), algo.rnnDesc, T, algo.xDesc, x, algo.hxDesc, hx, diff --git a/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixCuDNNConvolutionAlgorithm.java b/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixCuDNNConvolutionAlgorithm.java index 67373e45a42..2284c9da6ca 100644 --- a/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixCuDNNConvolutionAlgorithm.java +++ b/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixCuDNNConvolutionAlgorithm.java @@ -134,7 +134,7 @@ public static LibMatrixCuDNNConvolutionAlgorithm cudnnGetConvolutionForwardAlgor jcuda.jcudnn.JCudnn.cudnnGetConvolutionForwardWorkspaceSize(LibMatrixCuDNN.getCudnnHandle(gCtx), ret.nchwTensorDesc, ret.filterDesc, ret.convDesc, ret.nkpqTensorDesc, algos[0], sizeInBytesArray); if (sizeInBytesArray[0] != 0) - ret.workSpace = gCtx.allocate(instName, sizeInBytesArray[0]); + ret.workSpace = gCtx.allocate(instName, sizeInBytesArray[0], false); ret.sizeInBytes = sizeInBytesArray[0]; ret.algo = algos[0]; return ret; @@ -168,7 +168,7 @@ public static LibMatrixCuDNNConvolutionAlgorithm cudnnGetConvolutionBackwardFilt pad_h, pad_w, stride_h, stride_w, P, Q); int[] algos = {-1}; - long sizeInBytesArray[] = {Math.min(workspaceLimit, MAX_WORKSPACE_LIMIT_BYTES)}; + long[] sizeInBytesArray = {Math.min(workspaceLimit, MAX_WORKSPACE_LIMIT_BYTES)}; jcuda.jcudnn.JCudnn.cudnnGetConvolutionBackwardFilterAlgorithm( LibMatrixCuDNN.getCudnnHandle(gCtx), ret.nchwTensorDesc, ret.nkpqTensorDesc, ret.convDesc, ret.filterDesc, @@ -176,7 +176,7 @@ public static LibMatrixCuDNNConvolutionAlgorithm cudnnGetConvolutionBackwardFilt jcuda.jcudnn.JCudnn.cudnnGetConvolutionBackwardFilterWorkspaceSize(LibMatrixCuDNN.getCudnnHandle(gCtx), ret.nchwTensorDesc, ret.nkpqTensorDesc, ret.convDesc, ret.filterDesc, algos[0], sizeInBytesArray); if (sizeInBytesArray[0] != 0) - ret.workSpace = gCtx.allocate(instName, sizeInBytesArray[0]); + ret.workSpace = gCtx.allocate(instName, sizeInBytesArray[0], false); ret.sizeInBytes = sizeInBytesArray[0]; ret.algo = algos[0]; @@ -218,7 +218,7 @@ public static LibMatrixCuDNNConvolutionAlgorithm cudnnGetConvolutionBackwardData } else { int[] algos = {-1}; - long sizeInBytesArray[] = {Math.min(workspaceLimit, MAX_WORKSPACE_LIMIT_BYTES)}; + long[] sizeInBytesArray = {Math.min(workspaceLimit, MAX_WORKSPACE_LIMIT_BYTES)}; jcuda.jcudnn.JCudnn.cudnnGetConvolutionBackwardDataAlgorithm( LibMatrixCuDNN.getCudnnHandle(gCtx), ret.filterDesc, ret.nkpqTensorDesc, ret.convDesc, ret.nchwTensorDesc, @@ -226,7 +226,7 @@ public static LibMatrixCuDNNConvolutionAlgorithm cudnnGetConvolutionBackwardData jcuda.jcudnn.JCudnn.cudnnGetConvolutionBackwardDataWorkspaceSize(LibMatrixCuDNN.getCudnnHandle(gCtx), ret.filterDesc, ret.nkpqTensorDesc, ret.convDesc, ret.nchwTensorDesc, algos[0], sizeInBytesArray); if (sizeInBytesArray[0] != 0) - ret.workSpace = gCtx.allocate(instName, sizeInBytesArray[0]); + ret.workSpace = gCtx.allocate(instName, sizeInBytesArray[0], false); ret.sizeInBytes = sizeInBytesArray[0]; ret.algo = algos[0]; } diff --git a/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixCuDNNInputRowFetcher.java b/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixCuDNNInputRowFetcher.java index 0cc27b4cd11..1a29dee2cf6 100644 --- a/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixCuDNNInputRowFetcher.java +++ b/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixCuDNNInputRowFetcher.java @@ -47,7 +47,7 @@ public LibMatrixCuDNNInputRowFetcher(GPUContext gCtx, String instName, MatrixObj numColumns = LibMatrixCUDA.toInt(image.getNumColumns()); isInputInSparseFormat = LibMatrixCUDA.isInSparseFormat(gCtx, image); inPointer = isInputInSparseFormat ? LibMatrixCUDA.getSparsePointer(gCtx, image, instName) : LibMatrixCuDNN.getDensePointerForCuDNN(gCtx, image, instName); - outPointer = gCtx.allocate(instName, numColumns*sizeOfDataType); + outPointer = gCtx.allocate(instName, (long) numColumns *sizeOfDataType, false); } /** * Copy the nth row and return the dense pointer @@ -57,7 +57,7 @@ public LibMatrixCuDNNInputRowFetcher(GPUContext gCtx, String instName, MatrixObj public Pointer getNthRow(int n) { if(isInputInSparseFormat) { jcuda.runtime.JCuda.cudaDeviceSynchronize(); - cudaMemset(outPointer, 0, numColumns*sizeOfDataType); + cudaMemset(outPointer, 0, (long) numColumns *sizeOfDataType); jcuda.runtime.JCuda.cudaDeviceSynchronize(); LibMatrixCUDA.sliceSparseDense(gCtx, instName, (CSRPointer)inPointer, outPointer, n, n, 0, LibMatrixCUDA.toInt(numColumns-1), numColumns); } diff --git a/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixCuDNNRnnAlgorithm.java b/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixCuDNNRnnAlgorithm.java index 663267b431c..75f9fec0de6 100644 --- a/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixCuDNNRnnAlgorithm.java +++ b/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixCuDNNRnnAlgorithm.java @@ -90,7 +90,7 @@ public LibMatrixCuDNNRnnAlgorithm(ExecutionContext ec, GPUContext gCtx, String i dropOutSizeInBytes = _dropOutSizeInBytes[0]; dropOutStateSpace = new Pointer(); if (dropOutSizeInBytes != 0) - dropOutStateSpace = gCtx.allocate(instName, dropOutSizeInBytes); + dropOutStateSpace = gCtx.allocate(instName, dropOutSizeInBytes, false); JCudnn.cudnnSetDropoutDescriptor(dropoutDesc, gCtx.getCudnnHandle(), 0, dropOutStateSpace, dropOutSizeInBytes, 12345); // Initialize RNN descriptor @@ -112,12 +112,12 @@ public LibMatrixCuDNNRnnAlgorithm(ExecutionContext ec, GPUContext gCtx, String i workSpace = new Pointer(); reserveSpace = new Pointer(); sizeInBytes = getWorkspaceSize(T); if(sizeInBytes != 0) - workSpace = gCtx.allocate(instName, sizeInBytes); + workSpace = gCtx.allocate(instName, sizeInBytes, false); reserveSpaceSizeInBytes = 0; if(isTraining) { reserveSpaceSizeInBytes = getReservespaceSize(T); if (reserveSpaceSizeInBytes != 0) { - reserveSpace = gCtx.allocate(instName, reserveSpaceSizeInBytes); + reserveSpace = gCtx.allocate(instName, reserveSpaceSizeInBytes, false); } } } diff --git a/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixCuMatMult.java b/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixCuMatMult.java index a62e152b44b..5753041a622 100644 --- a/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixCuMatMult.java +++ b/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixCuMatMult.java @@ -259,7 +259,7 @@ static void sparseDenseMatMult(GPUContext gCtx, String instName, Pointer C, CSRP // t(C) = t(B) %*% t(A) Pointer output = null; if (outRLen != 1 && outCLen != 1) { - output = gCtx.allocate(instName, outRLen * outCLen * sizeOfDataType); + output = gCtx.allocate(instName, outRLen * outCLen * sizeOfDataType, false); } else { // no transpose required for vector output output = C; diff --git a/src/main/java/org/apache/sysds/runtime/matrix/data/SinglePrecisionCudaSupportFunctions.java b/src/main/java/org/apache/sysds/runtime/matrix/data/SinglePrecisionCudaSupportFunctions.java index fd29f21d655..e25a68aef35 100644 --- a/src/main/java/org/apache/sysds/runtime/matrix/data/SinglePrecisionCudaSupportFunctions.java +++ b/src/main/java/org/apache/sysds/runtime/matrix/data/SinglePrecisionCudaSupportFunctions.java @@ -178,7 +178,7 @@ public void deviceToHost(GPUContext gCtx, Pointer src, double[] dest, String ins // during eviction: `evict -> devictToHost -> float2double -> allocate -> ensureFreeSpace -> evict`. // To avoid this recursion, it is necessary to perform this conversion in host. if(PERFORM_CONVERSION_ON_DEVICE && !isEviction) { - Pointer deviceDoubleData = gCtx.allocate(instName, ((long)dest.length)*Sizeof.DOUBLE); + Pointer deviceDoubleData = gCtx.allocate(instName, ((long)dest.length)*Sizeof.DOUBLE, false); LibMatrixCUDA.float2double(gCtx, src, deviceDoubleData, dest.length); cudaMemcpy(Pointer.to(dest), deviceDoubleData, ((long)dest.length)*Sizeof.DOUBLE, cudaMemcpyDeviceToHost); gCtx.cudaFreeHelper(instName, deviceDoubleData, DMLScript.EAGER_CUDA_FREE); @@ -202,7 +202,7 @@ public void hostToDevice(GPUContext gCtx, double[] src, Pointer dest, String ins // TODO: Perform conversion on GPU using double2float and float2double kernels long t0 = DMLScript.STATISTICS ? System.nanoTime() : 0; if(PERFORM_CONVERSION_ON_DEVICE) { - Pointer deviceDoubleData = gCtx.allocate(instName, ((long)src.length)*Sizeof.DOUBLE); + Pointer deviceDoubleData = gCtx.allocate(instName, ((long)src.length)*Sizeof.DOUBLE, false); cudaMemcpy(deviceDoubleData, Pointer.to(src), ((long)src.length)*Sizeof.DOUBLE, cudaMemcpyHostToDevice); LibMatrixCUDA.double2float(gCtx, deviceDoubleData, dest, src.length); gCtx.cudaFreeHelper(instName, deviceDoubleData, DMLScript.EAGER_CUDA_FREE);