Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -93,8 +93,8 @@ else if(_type == SpoofCellwise.CellType.ROW_AGG)
long nnz = in_obj.getNnz("spoofCUDA" + getSpoofType(), false);
MatrixObject out_obj = sparseOut ?
(ec.getSparseMatrixOutputForGPUInstruction(outputName, out_rows, out_cols, (isSparseSafe() && nnz > 0) ?
nnz : out_rows * out_cols).getKey()) :
(ec.getDenseMatrixOutputForGPUInstruction(outputName, out_rows, out_cols).getKey());
nnz : out_rows * out_cols, false).getKey()) :
(ec.getDenseMatrixOutputForGPUInstruction(outputName, out_rows, out_cols, false).getKey());

packDataForTransfer(ec, inputs, scalarObjects, out_obj, 1, ID, 0,false, null);
if(NotEmpty(in_obj) || !sparseSafe) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,7 @@ default void prepareMatrixPointers(ByteBuffer buf, ExecutionContext ec, MatrixOb
int rows = (int)mo.getNumRows();
int cols = (int)mo.getNumColumns();
Pointer b1 = mo.getGPUObject(ec.getGPUContext(0)).getDensePointer();
Pointer ptr = ec.getGPUContext(0).allocate(getName(), (long) rows * cols * sizeOfDataType);
Pointer ptr = ec.getGPUContext(0).allocate(getName(), (long) rows * cols * sizeOfDataType, false);
LibMatrixCUDA.denseTranspose(ec, ec.getGPUContext(0), getName(), b1, ptr, rows, cols);
writeMatrixDescriptorToBuffer(buf, rows, cols, 0, 0, GPUObject.getPointerAddress(ptr), mo.getNnz());
} else {
Expand Down Expand Up @@ -115,8 +115,10 @@ default void packDataForTransfer(ExecutionContext ec, ArrayList<MatrixObject> in
int NT = 256;
long N = inputs.get(0).getNumRows() * inputs.get(0).getNumColumns();
num_blocks = ((N + NT * 2 - 1) / (NT * 2));
ptr[0] = ec.getGPUContext(0).allocate(getName(), LibMatrixCUDA.sizeOfDataType * num_blocks, false);
}
ptr[0] = ec.getGPUContext(0).allocate(getName(), LibMatrixCUDA.sizeOfDataType * num_blocks);
else
ptr[0] = ec.getGPUContext(0).allocate(getName(), LibMatrixCUDA.sizeOfDataType * num_blocks, true);
writeMatrixDescriptorToBuffer(buf, 1, 1, 0, 0, GPUObject.getPointerAddress(ptr[0]), 1);
}
else {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -77,7 +77,7 @@ public MatrixObject execute(ExecutionContext ec, ArrayList<MatrixObject> inputs,
hasMatrixObjectSideInput(inputs) ? getMinColsMatrixObjectSideInputs(inputs) : -1;
OutputDimensions out_dims = new OutputDimensions(m, n, n2);
ec.setMetaData(outputName, out_dims.rows, out_dims.cols);
MatrixObject out_obj = ec.getDenseMatrixOutputForGPUInstruction(outputName, out_dims.rows, out_dims.cols).getKey();
MatrixObject out_obj = ec.getDenseMatrixOutputForGPUInstruction(outputName, out_dims.rows, out_dims.cols, false).getKey();

packDataForTransfer(ec, inputs, scalarObjects, out_obj, 1, ID, 0,_tB1, null);

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -373,8 +373,14 @@ private static long validateDimensions(long d1, long d2) {
* @return a pair containing the wrapping {@link MatrixObject} and a boolean indicating whether a cuda memory allocation took place (as opposed to the space already being allocated)
*/
public Pair<MatrixObject, Boolean> getDenseMatrixOutputForGPUInstruction(String varName, long numRows, long numCols) {
return getDenseMatrixOutputForGPUInstruction(varName, numRows, numCols, true);
}

public Pair<MatrixObject, Boolean> getDenseMatrixOutputForGPUInstruction(String varName, long numRows, long numCols,
boolean initialize)
{
MatrixObject mo = allocateGPUMatrixObject(varName, numRows, numCols);
boolean allocated = mo.getGPUObject(getGPUContext(0)).acquireDeviceModifyDense();
boolean allocated = mo.getGPUObject(getGPUContext(0)).acquireDeviceModifyDense(initialize);
mo.getDataCharacteristics().setNonZeros(-1);
return new Pair<>(mo, allocated);
}
Expand All @@ -390,9 +396,15 @@ public Pair<MatrixObject, Boolean> getDenseMatrixOutputForGPUInstruction(String
* @return matrix object
*/
public Pair<MatrixObject, Boolean> getSparseMatrixOutputForGPUInstruction(String varName, long numRows, long numCols, long nnz) {
return getSparseMatrixOutputForGPUInstruction(varName, numRows, numCols, nnz, true);
}

public Pair<MatrixObject, Boolean> getSparseMatrixOutputForGPUInstruction(String varName, long numRows, long numCols,
long nnz, boolean initialize)
{
MatrixObject mo = allocateGPUMatrixObject(varName, numRows, numCols);
mo.getDataCharacteristics().setNonZeros(nnz);
boolean allocated = mo.getGPUObject(getGPUContext(0)).acquireDeviceModifySparse();
boolean allocated = mo.getGPUObject(getGPUContext(0)).acquireDeviceModifySparse(initialize);
return new Pair<>(mo, allocated);
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -637,7 +637,8 @@ private void processLstmBackwardInstruction(ExecutionContext ec) throws DMLRunti
int D = toInt(numRowsW) - M; // since W:(D+M, 4M) ... numFeatures
Pointer sysdsWPointer = LibMatrixCuDNN.getDensePointerForCuDNN(gCtx, W, instructionName, D+M, 4*M);
Pointer sysdsBiasPointer = LibMatrixCuDNN.getDensePointerForCuDNN(gCtx, bias, instructionName, 1, 4*M);
Pointer cudnnWPointer = gCtx.allocate(instructionName, (D+M+2)*(4*M)*LibMatrixCUDA.sizeOfDataType);
// TODO: find out if memset is necessary
Pointer cudnnWPointer = gCtx.allocate(instructionName, (D+M+2)*(4*M)*LibMatrixCUDA.sizeOfDataType, true);
LibMatrixCUDA.getCudaKernels(gCtx).launchKernel("prepare_lstm_weight",
ExecutionConfig.getConfigForSimpleVectorOperations((D+M+2)*(4*M)),
sysdsWPointer, sysdsBiasPointer, cudnnWPointer, D, M);
Expand All @@ -650,7 +651,7 @@ private void processLstmBackwardInstruction(ExecutionContext ec) throws DMLRunti
int N = toInt(X.getNumRows()); // batchSize .. since X:(N, T*D)
long numColsX = X.getNumColumns();
int T = toInt(numColsX/ D); // since X:(N, T*D) ... seqLength
Pointer cudnnInput = gCtx.allocate(instructionName, (N*T*D)*LibMatrixCUDA.sizeOfDataType);
Pointer cudnnInput = gCtx.allocate(instructionName, (N*T*D)*LibMatrixCUDA.sizeOfDataType, false);
LibMatrixCUDA.getCudaKernels(gCtx).launchKernel("prepare_lstm_input",
ExecutionConfig.getConfigForSimpleVectorOperations(N*T*D),
xPointer, cudnnInput, N, D, T*D, N*T*D);
Expand Down Expand Up @@ -702,7 +703,7 @@ private void processLstmInstruction(ExecutionContext ec) throws DMLRuntimeExcept
int D = toInt(numRowsW) - M; // since W:(D+M, 4M) ... numFeatures
Pointer sysdsWPointer = LibMatrixCuDNN.getDensePointerForCuDNN(gCtx, W, instructionName, D+M, 4*M);
Pointer sysdsBiasPointer = LibMatrixCuDNN.getDensePointerForCuDNN(gCtx, bias, instructionName, 1, 4*M);
Pointer cudnnWPointer = gCtx.allocate(instructionName, (D+M+2)*(4*M)*LibMatrixCUDA.sizeOfDataType);
Pointer cudnnWPointer = gCtx.allocate(instructionName, (D+M+2)*(4*M)*LibMatrixCUDA.sizeOfDataType, false);
LibMatrixCUDA.getCudaKernels(gCtx).launchKernel("prepare_lstm_weight",
ExecutionConfig.getConfigForSimpleVectorOperations((D+M+2)*(4*M)),
sysdsWPointer, sysdsBiasPointer, cudnnWPointer, D, M);
Expand All @@ -717,7 +718,7 @@ private void processLstmInstruction(ExecutionContext ec) throws DMLRuntimeExcept
int N = toInt(X.getNumRows()); // batchSize .. since X:(N, T*D)
long numColsX = X.getNumColumns();
int T = toInt(numColsX/ D); // since X:(N, T*D) ... seqLength
Pointer cudnnInput = gCtx.allocate(instructionName, (N*T*D)*LibMatrixCUDA.sizeOfDataType);
Pointer cudnnInput = gCtx.allocate(instructionName, (N*T*D)*LibMatrixCUDA.sizeOfDataType, false);
LibMatrixCUDA.getCudaKernels(gCtx).launchKernel("prepare_lstm_input",
ExecutionConfig.getConfigForSimpleVectorOperations(N*T*D),
xPointer, cudnnInput, N, D, T*D, N*T*D);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -252,7 +252,13 @@ protected MatrixObject getMatrixInputForGPUInstruction(ExecutionContext ec, Stri
* @return the matrix object
*/
protected MatrixObject getDenseMatrixOutputForGPUInstruction(ExecutionContext ec, String name, long numRows, long numCols) {
return ec.getDenseMatrixOutputForGPUInstruction(name, numRows, numCols).getKey();
return getDenseMatrixOutputForGPUInstruction(ec, name, numRows, numCols, true);
}

protected MatrixObject getDenseMatrixOutputForGPUInstruction(ExecutionContext ec, String name, long numRows, long numCols,
boolean initialize)
{
return ec.getDenseMatrixOutputForGPUInstruction(name, numRows, numCols, initialize).getKey();
}

@Override
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -85,7 +85,8 @@ public void processInstruction(ExecutionContext ec) {
}
// We currently support only dense rshape
Pointer inPtr = LibMatrixCUDA.getDensePointer(gCtx, mat, instName);
MatrixObject out = LibMatrixCUDA.getDenseMatrixOutputForGPUInstruction(ec, instName, _output.getName(), rows, cols);
MatrixObject out = LibMatrixCUDA.getDenseMatrixOutputForGPUInstruction(ec, instName, _output.getName(), rows,
cols, false);
Pointer outPtr = LibMatrixCUDA.getDensePointer(gCtx, out, instName);
if(byRow.getBooleanValue()) {
// byrow = TRUE is simple memcpy and metadata update
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -265,12 +265,15 @@ public static CSRPointer allocateForMatrixMultiply(GPUContext gCtx, cusparseHand
* @param gCtx a valid {@link GPUContext}
* @param nnz2 number of non-zeroes
* @param rows number of rows
* @param initialize memset to zero?
* @return a {@link CSRPointer} instance that encapsulates the CSR matrix on GPU
*/
public static CSRPointer allocateEmpty(GPUContext gCtx, long nnz2, long rows) {
public static CSRPointer allocateEmpty(GPUContext gCtx, long nnz2, long rows, boolean initialize) {
LOG.trace("GPU : allocateEmpty from CSRPointer with nnz=" + nnz2 + " and rows=" + rows + ", GPUContext=" + gCtx);
if(nnz2 < 0) throw new DMLRuntimeException("Incorrect usage of internal API, number of non zeroes is less than 0 when trying to allocate sparse data on GPU");
if(rows <= 0) throw new DMLRuntimeException("Incorrect usage of internal API, number of rows is less than or equal to 0 when trying to allocate sparse data on GPU");
if(nnz2 < 0) throw new DMLRuntimeException("Incorrect usage of internal API, number of non zeroes is less " +
"than 0 when trying to allocate sparse data on GPU");
if(rows <= 0) throw new DMLRuntimeException("Incorrect usage of internal API, number of rows is less than or " +
"equal to 0 when trying to allocate sparse data on GPU");
CSRPointer r = new CSRPointer(gCtx);
r.nnz = nnz2;
if (nnz2 == 0) {
Expand All @@ -279,12 +282,16 @@ public static CSRPointer allocateEmpty(GPUContext gCtx, long nnz2, long rows) {
return r;
}
// increment the cudaCount by 1 for the allocation of all 3 arrays
r.val = gCtx.allocate(null, getDataTypeSizeOf(nnz2));
r.rowPtr = gCtx.allocate(null, getIntSizeOf(rows + 1));
r.colInd = gCtx.allocate(null, getIntSizeOf(nnz2));
r.val = gCtx.allocate(null, getDataTypeSizeOf(nnz2), initialize);
r.rowPtr = gCtx.allocate(null, getIntSizeOf(rows + 1), initialize);
r.colInd = gCtx.allocate(null, getIntSizeOf(nnz2), initialize);
return r;
}

public static CSRPointer allocateEmpty(GPUContext gCtx, long nnz2, long rows) {
return allocateEmpty(gCtx, nnz2, rows, true);
}

/**
* Allocate row pointers of m+1 elements
*
Expand All @@ -296,9 +303,9 @@ public static CSRPointer allocateEmpty(GPUContext gCtx, long nnz2, long rows) {
private static void step1AllocateRowPointers(GPUContext gCtx, cusparseHandle handle, CSRPointer C, int rowsC) {
LOG.trace("GPU : step1AllocateRowPointers" + ", GPUContext=" + gCtx);
cusparseSetPointerMode(handle, cusparsePointerMode.CUSPARSE_POINTER_MODE_HOST);
//cudaDeviceSynchronize;

// Do not increment the cudaCount of allocations on GPU
C.rowPtr = gCtx.allocate(null, getIntSizeOf((long) rowsC + 1));
C.rowPtr = gCtx.allocate(null, getIntSizeOf((long) rowsC + 1), true);
}

/**
Expand Down Expand Up @@ -375,8 +382,9 @@ private static void step2GatherNNZGemm(GPUContext gCtx, cusparseHandle handle, C
private static void step3AllocateValNInd(GPUContext gCtx, cusparseHandle handle, CSRPointer C) {
LOG.trace("GPU : step3AllocateValNInd" + ", GPUContext=" + gCtx);
// Increment cudaCount by one when all three arrays of CSR sparse array are allocated
C.val = gCtx.allocate(null, getDataTypeSizeOf(C.nnz));
C.colInd = gCtx.allocate(null, getIntSizeOf(C.nnz));

C.val = gCtx.allocate(null, getDataTypeSizeOf(C.nnz), false);
C.colInd = gCtx.allocate(null, getIntSizeOf(C.nnz), false);
}

// ==============================================================================================
Expand All @@ -402,17 +410,17 @@ public CSRPointer clone(int rows) {
CSRPointer that = new CSRPointer(me.getGPUContext());
that.allocateMatDescrPointer();
that.nnz = me.nnz;
that.val = allocate(that.nnz * LibMatrixCUDA.sizeOfDataType);
that.rowPtr = allocate(rows * Sizeof.INT);
that.colInd = allocate(that.nnz * Sizeof.INT);
that.val = allocate(that.nnz * LibMatrixCUDA.sizeOfDataType, false);
that.rowPtr = allocate(rows * Sizeof.INT, false);
that.colInd = allocate(that.nnz * Sizeof.INT, false);
cudaMemcpy(that.val, me.val, that.nnz * LibMatrixCUDA.sizeOfDataType, cudaMemcpyDeviceToDevice);
cudaMemcpy(that.rowPtr, me.rowPtr, rows * Sizeof.INT, cudaMemcpyDeviceToDevice);
cudaMemcpy(that.colInd, me.colInd, that.nnz * Sizeof.INT, cudaMemcpyDeviceToDevice);
return that;
}

private Pointer allocate(long size) {
return getGPUContext().allocate(null, size);
private Pointer allocate(long size, boolean initialize) {
return getGPUContext().allocate(null, size, initialize);
}

private GPUContext getGPUContext() {
Expand Down Expand Up @@ -460,7 +468,7 @@ public Pointer toColumnMajorDenseMatrix(cusparseHandle cusparseHandle, cublasHan
LOG.trace("GPU : sparse -> column major dense (inside CSRPointer) on " + this + ", GPUContext="
+ getGPUContext());
long size = rows * getDataTypeSizeOf(cols);
Pointer A = allocate(size);
Pointer A = allocate(size, false);
// If this sparse block is empty, the allocated dense matrix, initialized to zeroes, will be returned.
if (val != null && rowPtr != null && colInd != null && nnz > 0) {
// Note: cusparseDcsr2dense method cannot handle empty blocks
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -82,20 +82,21 @@ public class GPUContext {
/**
* cusolverDnHandle for invoking solve() function on dense matrices on the GPU
*/
private cusolverDnHandle cusolverDnHandle;
private volatile cusolverDnHandle cusolverDnHandle;
/**
* to launch custom CUDA kernel, specific to the active GPU for this GPUContext
*/
private JCudaKernels kernels;

private GPUMemoryManager memoryManager;
private final GPUMemoryManager memoryManager;

public GPUMemoryManager getMemoryManager() {
return memoryManager;
}

protected GPUContext(int deviceNum) {
this.deviceNum = deviceNum;

cudaSetDevice(deviceNum);

cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync);
Expand Down Expand Up @@ -175,7 +176,7 @@ public int getDeviceNum() {
* Sets the device for the calling thread.
* This method must be called after
* {@link org.apache.sysds.runtime.controlprogram.context.ExecutionContext#getGPUContext(int)}
* If in a multi-threaded environment like parfor, this method must be called when in the
* If in a multithreaded environment like parfor, this method must be called when in the
* appropriate thread.
*
*/
Expand All @@ -187,18 +188,32 @@ public void initializeThread() {
/**
* Invokes memory manager's malloc method
*
* @param instructionName name of instruction for which to record per instruction performance statistics, null if don't want to record
* @param instructionName name of instruction for which to record per instruction performance statistics, null if
* you don't want to record
* @param size size of data (in bytes) to allocate
* @param initialize if cudaMemset() should be called
* @return jcuda pointer
*/
public Pointer allocate(String instructionName, long size, boolean initialize) {
return memoryManager.malloc(instructionName, size, initialize);
}

/**
* Default behavior for gpu memory allocation (init to zero)
*
* @param instructionName Name of the instruction calling allocate
* @param size size in bytes
* @return jcuda pointer
*/
public Pointer allocate(String instructionName, long size) {
return memoryManager.malloc(instructionName, size);
return memoryManager.malloc(instructionName, size, true);
}

/**
* Does cudaFree calls, lazily.
*
* @param instructionName name of the instruction for which to record per instruction free time, null if do not want to record
* @param instructionName name of the instruction for which to record per instruction free time, null if you do not
* want to record
* @param toFree {@link Pointer} instance to be freed
* @param eager true if to be done eagerly
*/
Expand Down Expand Up @@ -389,7 +404,7 @@ public JCudaKernels getKernels() {
*/
public void destroy() {
if (LOG.isTraceEnabled()) {
LOG.trace("GPU : this context was destroyed, this = " + this.toString());
LOG.trace("GPU : this context was destroyed, this = " + this);
}
clearMemory();

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -245,9 +245,10 @@ private static String byteCountToDisplaySize(long numBytes) {
*
* @param opcode instruction name
* @param size size in bytes
* @param initialize if cudaMemset() should be called
* @return allocated pointer
*/
public Pointer malloc(String opcode, long size) {
public Pointer malloc(String opcode, long size, boolean initialize) {
if(size < 0) {
throw new DMLRuntimeException("Cannot allocate memory of size " + byteCountToDisplaySize(size));
}
Expand Down Expand Up @@ -432,7 +433,6 @@ public Pointer malloc(String opcode, long size) {
}
}


// Step 8: Handle defragmentation
if(A == null) {
LOG.warn("Potential fragmentation of the GPU memory. Forcibly evicting all ...");
Expand All @@ -448,11 +448,12 @@ public Pointer malloc(String opcode, long size) {
}

long t0 = DMLScript.STATISTICS ? System.nanoTime() : 0;
cudaMemset(A, 0, size);
if(initialize)
cudaMemset(A, 0, size);
addMiscTime(opcode, GPUStatistics.cudaMemSet0Time, GPUStatistics.cudaMemSet0Count, GPUInstruction.MISC_TIMER_SET_ZERO, t0);
return A;
}

private int worstCaseContiguousMemorySizeCompare(GPUObject o1, GPUObject o2) {
long ret = matrixMemoryManager.getWorstCaseContiguousMemorySize(o1) - matrixMemoryManager.getWorstCaseContiguousMemorySize(o2);
return ret < 0 ? -1 : (ret == 0 ? 0 : 1);
Expand Down
Loading