From 59b858c8460e0cfaca73d2388ba5188563cb5d9e Mon Sep 17 00:00:00 2001 From: Sebastian Baunsgaard Date: Tue, 4 Feb 2025 15:33:15 +0100 Subject: [PATCH 01/11] [SYSTEMDS-3874] Java17 Vectorized LibMM This commit adds vectorized kernels for matrix multiplication. fix mm error Perf mm bigger scale remove compile log --- bin/systemds | 4 + pom.xml | 5 + .../compress/colgroup/ColGroupDDC.java | 40 +-- .../dictionary/MatrixBlockDictionary.java | 70 ++++- .../runtime/matrix/data/LibMatrixMult.java | 264 ++++++++---------- src/main/python/systemds/utils/converters.py | 1 + .../org/apache/sysds/performance/Main.java | 12 +- .../org/apache/sysds/performance/README.md | 23 +- .../performance/generators/IGeneratePair.java | 57 ++++ ...rmance.java => MMSparsityPerformance.java} | 12 +- .../matrix/MatrixMultiplicationPerf.java | 88 ++++++ .../apache/sysds/test/AutomatedTestBase.java | 3 +- .../multitenant/MultiTenantTestBase.java | 1 + src/test/scripts/performance/append.sh | 36 +-- .../performance/matrixMultiplication.sh | 47 ++++ 15 files changed, 448 insertions(+), 215 deletions(-) create mode 100644 src/test/java/org/apache/sysds/performance/generators/IGeneratePair.java rename src/test/java/org/apache/sysds/performance/matrix/{MatrixMulPerformance.java => MMSparsityPerformance.java} (95%) create mode 100644 src/test/java/org/apache/sysds/performance/matrix/MatrixMultiplicationPerf.java create mode 100755 src/test/scripts/performance/matrixMultiplication.sh diff --git a/bin/systemds b/bin/systemds index 2e8e629495b..f0cb0b729b0 100755 --- a/bin/systemds +++ b/bin/systemds @@ -413,6 +413,7 @@ if [ $WORKER == 1 ]; then print_out "# starting Federated worker on port $PORT" CMD=" \ java $SYSTEMDS_STANDALONE_OPTS \ + --add-modules=jdk.incubator.vector \ $LOG4JPROPFULL \ -jar $SYSTEMDS_JAR_FILE \ -w $PORT \ @@ -422,6 +423,7 @@ elif [ "$FEDMONITORING" == 1 ]; then print_out "# starting Federated backend monitoring on port $PORT" CMD=" \ java $SYSTEMDS_STANDALONE_OPTS \ + 
--add-modules=jdk.incubator.vector \ $LOG4JPROPFULL \ -jar $SYSTEMDS_JAR_FILE \ -fedMonitoring $PORT \ @@ -433,6 +435,7 @@ elif [ $SYSDS_DISTRIBUTED == 0 ]; then CMD=" \ java $SYSTEMDS_STANDALONE_OPTS \ $LOG4JPROPFULL \ + --add-modules=jdk.incubator.vector \ -jar $SYSTEMDS_JAR_FILE \ -f $SCRIPT_FILE \ -exec $SYSDS_EXEC_MODE \ @@ -442,6 +445,7 @@ else print_out "# Running script $SCRIPT_FILE distributed with opts: $*" CMD=" \ spark-submit $SYSTEMDS_DISTRIBUTED_OPTS \ + --add-modules=jdk.incubator.vector \ $SYSTEMDS_JAR_FILE \ -f $SCRIPT_FILE \ -exec $SYSDS_EXEC_MODE \ diff --git a/pom.xml b/pom.xml index b25d94cc7db..5d2485897fb 100644 --- a/pom.xml +++ b/pom.xml @@ -92,6 +92,7 @@ --add-opens=java.base/java.lang.ref=ALL-UNNAMED --add-opens=java.base/java.util.concurrent=ALL-UNNAMED --add-opens=java.base/sun.nio.ch=ALL-UNNAMED + --add-modules=jdk.incubator.vector @@ -357,6 +358,9 @@ ${java.level} ${java.level} ${java.level} + + --add-modules=jdk.incubator.vector + @@ -904,6 +908,7 @@ true false true + --add-modules=jdk.incubator.vector ${doc.skip} public ${java.level} diff --git a/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupDDC.java b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupDDC.java index e55a24e56f5..fc82c58e16b 100644 --- a/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupDDC.java +++ b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupDDC.java @@ -26,8 +26,8 @@ import java.util.List; import java.util.concurrent.ExecutorService; -// import jdk.incubator.vector.DoubleVector; -// import jdk.incubator.vector.VectorSpecies; +import jdk.incubator.vector.DoubleVector; +import jdk.incubator.vector.VectorSpecies; import org.apache.commons.lang3.NotImplementedException; import org.apache.sysds.runtime.DMLRuntimeException; import org.apache.sysds.runtime.compress.CompressedMatrixBlock; @@ -75,7 +75,7 @@ public class ColGroupDDC extends APreAgg implements IMapToDataGroup { protected final AMapToData 
_data; - // static final VectorSpecies SPECIES = DoubleVector.SPECIES_PREFERRED; + static final VectorSpecies SPECIES = DoubleVector.SPECIES_PREFERRED; private ColGroupDDC(IColIndex colIndexes, IDictionary dict, AMapToData data, int[] cachedCounts) { super(colIndexes, dict, cachedCounts); @@ -625,7 +625,8 @@ private void identityRightDecompressingMult(MatrixBlock right, MatrixBlock ret, final double[] b = right.getDenseBlockValues(); final double[] c = ret.getDenseBlockValues(); final int jd = right.getNumColumns(); - final int vLen = 8; + final DoubleVector vVec = DoubleVector.zero(SPECIES); + final int vLen = SPECIES.length(); final int lenJ = cru - crl; final int end = cru - (lenJ % vLen); for(int i = rl; i < ru; i++) { @@ -633,8 +634,7 @@ private void identityRightDecompressingMult(MatrixBlock right, MatrixBlock ret, final int offOut = i * jd + crl; final double aa = 1; final int k_right = _colIndexes.get(k); - vectMM(aa, b, c, end, jd, crl, cru, offOut, k_right, vLen); - + vectMM(aa, b, c, end, jd, crl, cru, offOut, k_right, vLen, vVec); } } @@ -644,8 +644,8 @@ private void defaultRightDecompressingMult(MatrixBlock right, MatrixBlock ret, i final double[] c = ret.getDenseBlockValues(); final int kd = _colIndexes.size(); final int jd = right.getNumColumns(); - // final DoubleVector vVec = DoubleVector.zero(SPECIES); - final int vLen = 8; + final DoubleVector vVec = DoubleVector.zero(SPECIES); + final int vLen = SPECIES.length(); final int blkzI = 32; final int blkzK = 24; @@ -661,32 +661,22 @@ private void defaultRightDecompressingMult(MatrixBlock right, MatrixBlock ret, i for(int k = bk; k < bke; k++) { final double aa = a[offi + k]; final int k_right = _colIndexes.get(k); - vectMM(aa, b, c, end, jd, crl, cru, offOut, k_right, vLen); + vectMM(aa, b, c, end, jd, crl, cru, offOut, k_right, vLen, vVec); } } } } } - final void vectMM(double aa, double[] b, double[] c, int endT, int jd, int crl, int cru, int offOut, int k, - int vLen) { - // vVec = 
vVec.broadcast(aa); + final void vectMM(double aa, double[] b, double[] c, int endT, int jd, int crl, int cru, int offOut, int k, int vLen, DoubleVector vVec) { + vVec = vVec.broadcast(aa); final int offj = k * jd; final int end = endT + offj; for(int j = offj + crl; j < end; j += vLen, offOut += vLen) { - // DoubleVector res = DoubleVector.fromArray(SPECIES, c, offOut); - // DoubleVector bVec = DoubleVector.fromArray(SPECIES, b, j); - // res = vVec.fma(bVec, res); - // res.intoArray(c, offOut); - - c[offOut] += aa * b[j]; - c[offOut + 1] += aa * b[j + 1]; - c[offOut + 2] += aa * b[j + 2]; - c[offOut + 3] += aa * b[j + 3]; - c[offOut + 4] += aa * b[j + 4]; - c[offOut + 5] += aa * b[j + 5]; - c[offOut + 6] += aa * b[j + 6]; - c[offOut + 7] += aa * b[j + 7]; + DoubleVector res = DoubleVector.fromArray(SPECIES, c, offOut); + DoubleVector bVec = DoubleVector.fromArray(SPECIES, b, j); + res = vVec.fma(bVec, res); + res.intoArray(c, offOut); } for(int j = end; j < cru + offj; j++, offOut++) { double bb = b[j]; diff --git a/src/main/java/org/apache/sysds/runtime/compress/colgroup/dictionary/MatrixBlockDictionary.java b/src/main/java/org/apache/sysds/runtime/compress/colgroup/dictionary/MatrixBlockDictionary.java index 1d6949cbcd7..54cdf6920ac 100644 --- a/src/main/java/org/apache/sysds/runtime/compress/colgroup/dictionary/MatrixBlockDictionary.java +++ b/src/main/java/org/apache/sysds/runtime/compress/colgroup/dictionary/MatrixBlockDictionary.java @@ -27,6 +27,8 @@ import java.util.Arrays; import java.util.Set; +import jdk.incubator.vector.DoubleVector; +import jdk.incubator.vector.VectorSpecies; import org.apache.commons.lang3.NotImplementedException; import org.apache.sysds.runtime.compress.DMLCompressionException; import org.apache.sysds.runtime.compress.colgroup.indexes.ArrayIndex; @@ -65,6 +67,8 @@ public class MatrixBlockDictionary extends ADictionary { final private MatrixBlock _data; + static final VectorSpecies SPECIES = DoubleVector.SPECIES_PREFERRED; + /** * 
Unsafe private constructor that does not check the data validity. USE WITH CAUTION. * @@ -2088,7 +2092,71 @@ private void preaggValuesFromDenseDictDenseAggArray(final int numVals, final ICo private void preaggValuesFromDenseDictDenseAggRange(final int numVals, final IColIndex colIndexes, final int s, final int e, final double[] b, final int cut, final double[] ret) { - preaggValuesFromDenseDictDenseAggRangeGeneric(numVals, colIndexes, s, e, b, cut, ret); + if(colIndexes instanceof RangeIndex) { + RangeIndex ri = (RangeIndex) colIndexes; + preaggValuesFromDenseDictDenseAggRangeRange(numVals, ri.get(0), ri.get(0) + ri.size(), s, e, b, cut, ret); + } + else + preaggValuesFromDenseDictDenseAggRangeGeneric(numVals, colIndexes, s, e, b, cut, ret); + } + + private void preaggValuesFromDenseDictDenseAggRangeRange(final int numVals, final int ls, final int le, final int rs, + final int re, final double[] b, final int cut, final double[] ret) { + final int cz = le - ls; + final int az = re - rs; + // final int nCells = numVals * cz; + final double[] values = _data.getDenseBlockValues(); + // Correctly named ikj matrix multiplication . 
+ + final int blkzI = 32; + final int blkzK = 24; + final int blkzJ = 1024; + for(int bi = 0; bi < numVals; bi += blkzI) { + final int bie = Math.min(numVals, bi + blkzI); + for(int bk = 0; bk < cz; bk += blkzK) { + final int bke = Math.min(cz, bk + blkzK); + for(int bj = 0; bj < az; bj += blkzJ) { + final int bje = Math.min(az, bj + blkzJ); + final int sOffT = rs + bj; + final int eOffT = rs + bje; + preaggValuesFromDenseDictBlockedIKJ(values, b, ret, bi, bk, bj, bie, bke, cz, az, ls, cut, sOffT, eOffT); + } + } + } + } + + private static void preaggValuesFromDenseDictBlockedIKJ(double[] a, double[] b, double[] ret, int bi, int bk, int bj, + int bie, int bke, int cz, int az, int ls, int cut, int sOffT, int eOffT) { + final int vLen = SPECIES.length(); + final DoubleVector vVec = DoubleVector.zero(SPECIES); + final int leftover = (eOffT - sOffT) % vLen; // tail of the [sOffT, eOffT) range that is not vectorized + for(int i = bi; i < bie; i++) { + final int offI = i * cz; + final int offOutT = i * az + bj; + for(int k = bk; k < bke; k++) { + final int idb = (k + ls) * cut; + final int sOff = sOffT + idb; + final int eOff = eOffT + idb; + final double v = a[offI + k]; + vecInnerLoop(v, b, ret, offOutT, eOff, sOff, leftover, vLen, vVec); + } + } + } + + private static void vecInnerLoop(final double v, final double[] b, final double[] ret, final int offOutT, + final int eOff, final int sOff, final int leftover, final int vLen, DoubleVector vVec) { + int offOut = offOutT; + vVec = vVec.broadcast(v); + final int end = eOff - leftover; + for(int j = sOff; j < end; j += vLen, offOut += vLen) { + DoubleVector res = DoubleVector.fromArray(SPECIES, ret, offOut); + DoubleVector bVec = DoubleVector.fromArray(SPECIES, b, j); + vVec.fma(bVec, res).intoArray(ret, offOut); + } + for(int j = end; j < eOff; j++, offOut++) { + ret[offOut] += v * b[j]; + } + } private void preaggValuesFromDenseDictDenseAggRangeGeneric(final int numVals, final IColIndex colIndexes, diff --git
a/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixMult.java b/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixMult.java index 66f7c3c9445..adb26dce107 100644 --- a/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixMult.java +++ b/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixMult.java @@ -29,6 +29,9 @@ import java.util.concurrent.Future; import java.util.stream.IntStream; +import jdk.incubator.vector.DoubleVector; +import jdk.incubator.vector.VectorOperators; +import jdk.incubator.vector.VectorSpecies; import org.apache.commons.lang3.NotImplementedException; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; @@ -78,6 +81,8 @@ public class LibMatrixMult public static final int L2_CACHESIZE = 256 * 1024; //256KB (common size) public static final int L3_CACHESIZE = 16 * 1024 * 1024; //16MB (common size) private static final Log LOG = LogFactory.getLog(LibMatrixMult.class.getName()); + private static final VectorSpecies SPECIES = DoubleVector.SPECIES_PREFERRED; + private static final int vLen = SPECIES.length(); private LibMatrixMult() { //prevent instantiation via private constructor @@ -3668,25 +3673,18 @@ private static void matrixMultWuMMGeneric (MatrixBlock mW, MatrixBlock mU, Matri private static double dotProduct( double[] a, double[] b, final int len ) { double val = 0; - final int bn = len%8; + + final int bn = len%vLen; //compute rest for( int i = 0; i < bn; i++ ) val += a[ i ] * b[ i ]; - //unrolled 8-block (for better instruction-level parallelism) - for( int i = bn; i < len; i+=8 ) - { - //read 64B cachelines of a and b - //compute cval' = sum(a * b) + cval - val += a[ i+0 ] * b[ i+0 ] - + a[ i+1 ] * b[ i+1 ] - + a[ i+2 ] * b[ i+2 ] - + a[ i+3 ] * b[ i+3 ] - + a[ i+4 ] * b[ i+4 ] - + a[ i+5 ] * b[ i+5 ] - + a[ i+6 ] * b[ i+6 ] - + a[ i+7 ] * b[ i+7 ]; + //unrolled vLen-block (for better instruction-level parallelism) + for( int i = bn; i < len; i+=vLen ){ + DoubleVector bVec 
= DoubleVector.fromArray(SPECIES, b, i); + DoubleVector aVec = DoubleVector.fromArray(SPECIES, a, i); + val += aVec.mul(bVec).reduceLanes(VectorOperators.ADD); } //scalar result @@ -3697,25 +3695,18 @@ private static double dotProduct( double[] a, double[] b, final int len ) public static double dotProduct( double[] a, double[] b, int ai, int bi, final int len ) { double val = 0; - final int bn = len%8; + final int bn = len%vLen; //compute rest for( int i = 0; i < bn; i++, ai++, bi++ ) val += a[ ai ] * b[ bi ]; - //unrolled 8-block (for better instruction-level parallelism) - for( int i = bn; i < len; i+=8, ai+=8, bi+=8 ) + //unrolled vLen-block (for better instruction-level parallelism) + for( int i = bn; i < len; i+=vLen, ai+=vLen, bi+=vLen ) { - //read 64B cachelines of a and b - //compute cval' = sum(a * b) + cval - val += a[ ai+0 ] * b[ bi+0 ] - + a[ ai+1 ] * b[ bi+1 ] - + a[ ai+2 ] * b[ bi+2 ] - + a[ ai+3 ] * b[ bi+3 ] - + a[ ai+4 ] * b[ bi+4 ] - + a[ ai+5 ] * b[ bi+5 ] - + a[ ai+6 ] * b[ bi+6 ] - + a[ ai+7 ] * b[ bi+7 ]; + DoubleVector bVec = DoubleVector.fromArray(SPECIES, b, bi); + DoubleVector aVec = DoubleVector.fromArray(SPECIES, a, ai); + val += aVec.mul(bVec).reduceLanes(VectorOperators.ADD); } //scalar result @@ -3784,102 +3775,93 @@ else if( aixk > bixk ) //note: public for use by codegen for consistency public static void vectMultiplyAdd( final double aval, double[] b, double[] c, int bi, int ci, final int len ) - { - final int bn = len%8; + { + final int bn = len%vLen; - //rest, not aligned to 8-blocks + //rest, not aligned to vLen-blocks for( int j = 0; j < bn; j++, bi++, ci++) c[ ci ] += aval * b[ bi ]; - //unrolled 8-block (for better instruction-level parallelism) - for( int j = bn; j < len; j+=8, bi+=8, ci+=8) + DoubleVector aVec = DoubleVector.broadcast(SPECIES, aval); + //unrolled vLen-block (for better instruction-level parallelism) + for( int j = bn; j < len; j+=vLen, bi+=vLen, ci+=vLen) { - //read 64B cachelines of b and c - //compute c' 
= aval * b + c - //write back 64B cacheline of c = c' - c[ ci+0 ] += aval * b[ bi+0 ]; - c[ ci+1 ] += aval * b[ bi+1 ]; - c[ ci+2 ] += aval * b[ bi+2 ]; - c[ ci+3 ] += aval * b[ bi+3 ]; - c[ ci+4 ] += aval * b[ bi+4 ]; - c[ ci+5 ] += aval * b[ bi+5 ]; - c[ ci+6 ] += aval * b[ bi+6 ]; - c[ ci+7 ] += aval * b[ bi+7 ]; + DoubleVector bVec = DoubleVector.fromArray(SPECIES, b, bi); + DoubleVector cVec = DoubleVector.fromArray(SPECIES, c, ci); + cVec = aVec.fma(bVec, cVec); + cVec.intoArray(c, ci); } } private static void vectMultiplyAdd2( final double aval1, final double aval2, double[] b, double[] c, int bi1, int bi2, int ci, final int len ) { - final int bn = len%8; + final int bn = len%vLen; - //rest, not aligned to 8-blocks + //rest, not aligned to vLen-blocks for( int j = 0; j < bn; j++, bi1++, bi2++, ci++ ) c[ ci ] += aval1 * b[ bi1 ] + aval2 * b[ bi2 ]; - //unrolled 8-block (for better instruction-level parallelism) - for( int j = bn; j < len; j+=8, bi1+=8, bi2+=8, ci+=8 ) - { - //read 64B cachelines of b (2x) and c - //compute c' = aval_1 * b_1 + aval_2 * b_2 + c - //write back 64B cacheline of c = c' - c[ ci+0 ] += aval1 * b[ bi1+0 ] + aval2 * b[ bi2+0 ]; - c[ ci+1 ] += aval1 * b[ bi1+1 ] + aval2 * b[ bi2+1 ]; - c[ ci+2 ] += aval1 * b[ bi1+2 ] + aval2 * b[ bi2+2 ]; - c[ ci+3 ] += aval1 * b[ bi1+3 ] + aval2 * b[ bi2+3 ]; - c[ ci+4 ] += aval1 * b[ bi1+4 ] + aval2 * b[ bi2+4 ]; - c[ ci+5 ] += aval1 * b[ bi1+5 ] + aval2 * b[ bi2+5 ]; - c[ ci+6 ] += aval1 * b[ bi1+6 ] + aval2 * b[ bi2+6 ]; - c[ ci+7 ] += aval1 * b[ bi1+7 ] + aval2 * b[ bi2+7 ]; + DoubleVector aVec1 = DoubleVector.broadcast(SPECIES, aval1); + DoubleVector aVec2 = DoubleVector.broadcast(SPECIES, aval2); + //unrolled vLen-block (for better instruction-level parallelism) + for( int j = bn; j < len; j+=vLen, bi1+=vLen, bi2+=vLen, ci+=vLen ) { + DoubleVector bVec1 = DoubleVector.fromArray(SPECIES, b, bi1); + DoubleVector bVec2 = DoubleVector.fromArray(SPECIES, b, bi2); + DoubleVector cVec = 
DoubleVector.fromArray(SPECIES, c, ci); + cVec = aVec1.fma(bVec1, cVec); + cVec = aVec2.fma(bVec2, cVec); + cVec.intoArray(c, ci); } } private static void vectMultiplyAdd3( final double aval1, final double aval2, final double aval3, double[] b, double[] c, int bi1, int bi2, int bi3, int ci, final int len ) { - final int bn = len%8; - - //rest, not aligned to 8-blocks + final int bn = len%vLen; + //rest, not aligned to vLen-blocks for( int j = 0; j < bn; j++, bi1++, bi2++, bi3++, ci++ ) c[ ci ] += aval1 * b[ bi1 ] + aval2 * b[ bi2 ] + aval3 * b[ bi3 ]; - //unrolled 8-block (for better instruction-level parallelism) - for( int j = bn; j < len; j+=8, bi1+=8, bi2+=8, bi3+=8, ci+=8 ) - { - //read 64B cachelines of b (3x) and c - //compute c' = aval_1 * b_1 + aval_2 * b_2 + c - //write back 64B cacheline of c = c' - c[ ci+0 ] += aval1 * b[ bi1+0 ] + aval2 * b[ bi2+0 ] + aval3 * b[ bi3+0 ]; - c[ ci+1 ] += aval1 * b[ bi1+1 ] + aval2 * b[ bi2+1 ] + aval3 * b[ bi3+1 ]; - c[ ci+2 ] += aval1 * b[ bi1+2 ] + aval2 * b[ bi2+2 ] + aval3 * b[ bi3+2 ]; - c[ ci+3 ] += aval1 * b[ bi1+3 ] + aval2 * b[ bi2+3 ] + aval3 * b[ bi3+3 ]; - c[ ci+4 ] += aval1 * b[ bi1+4 ] + aval2 * b[ bi2+4 ] + aval3 * b[ bi3+4 ]; - c[ ci+5 ] += aval1 * b[ bi1+5 ] + aval2 * b[ bi2+5 ] + aval3 * b[ bi3+5 ]; - c[ ci+6 ] += aval1 * b[ bi1+6 ] + aval2 * b[ bi2+6 ] + aval3 * b[ bi3+6 ]; - c[ ci+7 ] += aval1 * b[ bi1+7 ] + aval2 * b[ bi2+7 ] + aval3 * b[ bi3+7 ]; + DoubleVector aVec1 = DoubleVector.broadcast(SPECIES, aval1); + DoubleVector aVec2 = DoubleVector.broadcast(SPECIES, aval2); + DoubleVector aVec3 = DoubleVector.broadcast(SPECIES, aval3); + //unrolled vLen-block (for better instruction-level parallelism) + for( int j = bn; j < len; j+=vLen, bi1+=vLen, bi2+=vLen, bi3+=vLen, ci+=vLen ) + { + DoubleVector bVec1 = DoubleVector.fromArray(SPECIES, b, bi1); + DoubleVector bVec2 = DoubleVector.fromArray(SPECIES, b, bi2); + DoubleVector bVec3 = DoubleVector.fromArray(SPECIES, b, bi3); + DoubleVector cVec = 
DoubleVector.fromArray(SPECIES, c, ci); + cVec = aVec1.fma(bVec1, cVec); + cVec = aVec2.fma(bVec2, cVec); + cVec = aVec3.fma(bVec3, cVec); + cVec.intoArray(c, ci); } } private static void vectMultiplyAdd4( final double aval1, final double aval2, final double aval3, final double aval4, double[] b, double[] c, int bi1, int bi2, int bi3, int bi4, int ci, final int len ) { - final int bn = len%8; - - //rest, not aligned to 8-blocks + final int bn = len%vLen; + //rest, not aligned to vLen-blocks for( int j = 0; j < bn; j++, bi1++, bi2++, bi3++, bi4++, ci++ ) c[ ci ] += aval1 * b[ bi1 ] + aval2 * b[ bi2 ] + aval3 * b[ bi3 ] + aval4 * b[ bi4 ]; - //unrolled 8-block (for better instruction-level parallelism) - for( int j = bn; j < len; j+=8, bi1+=8, bi2+=8, bi3+=8, bi4+=8, ci+=8) + DoubleVector aVec1 = DoubleVector.broadcast(SPECIES, aval1); + DoubleVector aVec2 = DoubleVector.broadcast(SPECIES, aval2); + DoubleVector aVec3 = DoubleVector.broadcast(SPECIES, aval3); + DoubleVector aVec4 = DoubleVector.broadcast(SPECIES, aval4); + //unrolled vLen-block (for better instruction-level parallelism) + for( int j = bn; j < len; j+=vLen, bi1+=vLen, bi2+=vLen, bi3+=vLen, bi4+=vLen, ci+=vLen) { - //read 64B cachelines of b (4x) and c - //compute c' = aval_1 * b_1 + aval_2 * b_2 + c - //write back 64B cacheline of c = c' - c[ ci+0 ] += aval1 * b[ bi1+0 ] + aval2 * b[ bi2+0 ] + aval3 * b[ bi3+0 ] + aval4 * b[ bi4+0 ]; - c[ ci+1 ] += aval1 * b[ bi1+1 ] + aval2 * b[ bi2+1 ] + aval3 * b[ bi3+1 ] + aval4 * b[ bi4+1 ]; - c[ ci+2 ] += aval1 * b[ bi1+2 ] + aval2 * b[ bi2+2 ] + aval3 * b[ bi3+2 ] + aval4 * b[ bi4+2 ]; - c[ ci+3 ] += aval1 * b[ bi1+3 ] + aval2 * b[ bi2+3 ] + aval3 * b[ bi3+3 ] + aval4 * b[ bi4+3 ]; - c[ ci+4 ] += aval1 * b[ bi1+4 ] + aval2 * b[ bi2+4 ] + aval3 * b[ bi3+4 ] + aval4 * b[ bi4+4 ]; - c[ ci+5 ] += aval1 * b[ bi1+5 ] + aval2 * b[ bi2+5 ] + aval3 * b[ bi3+5 ] + aval4 * b[ bi4+5 ]; - c[ ci+6 ] += aval1 * b[ bi1+6 ] + aval2 * b[ bi2+6 ] + aval3 * b[ bi3+6 ] + aval4 * b[ 
bi4+6 ]; - c[ ci+7 ] += aval1 * b[ bi1+7 ] + aval2 * b[ bi2+7 ] + aval3 * b[ bi3+7 ] + aval4 * b[ bi4+7 ]; + DoubleVector bVec1 = DoubleVector.fromArray(SPECIES, b, bi1); + DoubleVector bVec2 = DoubleVector.fromArray(SPECIES, b, bi2); + DoubleVector bVec3 = DoubleVector.fromArray(SPECIES, b, bi3); + DoubleVector bVec4 = DoubleVector.fromArray(SPECIES, b, bi4); + DoubleVector cVec = DoubleVector.fromArray(SPECIES, c, ci); + cVec = aVec1.fma(bVec1, cVec); + cVec = aVec2.fma(bVec2, cVec); + cVec = aVec3.fma(bVec3, cVec); + cVec = aVec4.fma(bVec4, cVec); + cVec.intoArray(c, ci); } } @@ -3940,26 +3922,19 @@ public static void vectMultiplyAdd( final double aval, double[] b, double[] c, i //note: public for use by codegen for consistency public static void vectMultiplyWrite( final double aval, double[] b, double[] c, int bi, int ci, final int len ) { - final int bn = len%8; + final int bn = len%vLen; - //rest, not aligned to 8-blocks + //rest, not aligned to vLen-blocks for( int j = 0; j < bn; j++, bi++, ci++) c[ ci ] = aval * b[ bi ]; - //unrolled 8-block (for better instruction-level parallelism) - for( int j = bn; j < len; j+=8, bi+=8, ci+=8) + //unrolled vLen-block (for better instruction-level parallelism) + DoubleVector aVec = DoubleVector.broadcast(SPECIES, aval); + for( int j = bn; j < len; j+=vLen, bi+=vLen, ci+=vLen) { - //read 64B cachelines of b and c - //compute c' = aval * b + c - //write back 64B cacheline of c = c' - c[ ci+0 ] = aval * b[ bi+0 ]; - c[ ci+1 ] = aval * b[ bi+1 ]; - c[ ci+2 ] = aval * b[ bi+2 ]; - c[ ci+3 ] = aval * b[ bi+3 ]; - c[ ci+4 ] = aval * b[ bi+4 ]; - c[ ci+5 ] = aval * b[ bi+5 ]; - c[ ci+6 ] = aval * b[ bi+6 ]; - c[ ci+7 ] = aval * b[ bi+7 ]; + DoubleVector bVec = DoubleVector.fromArray(SPECIES, b, bi); + DoubleVector cVec = aVec.mul(bVec); // aVec must stay the loop-invariant broadcast of aval + cVec.intoArray(c, ci); } } @@ -3996,28 +3971,21 @@ public static void vectMultiplyInPlace(final double[] a, double[] c, int[] cix, } //note: public for use by codegen for consistency - public static void
vectMultiplyWrite( double[] a, double[] b, double[] c, int ai, int bi, int ci, final int len ) - { - final int bn = len%8; + public static void vectMultiplyWrite( double[] a, double[] b, double[] c, int ai, int bi, int ci, final int len ){ + + final int bn = len%vLen; - //rest, not aligned to 8-blocks + //rest, not aligned to vLen-blocks for( int j = 0; j < bn; j++, ai++, bi++, ci++) c[ ci ] = a[ ai ] * b[ bi ]; - //unrolled 8-block (for better instruction-level parallelism) - for( int j = bn; j < len; j+=8, ai+=8, bi+=8, ci+=8) + //unrolled vLen-block (for better instruction-level parallelism) + for( int j = bn; j < len; j+=vLen, ai+=vLen, bi+=vLen, ci+=vLen) { - //read 64B cachelines of a and b - //compute c' = a * b - //write back 64B cacheline of c = c' - c[ ci+0 ] = a[ ai+0 ] * b[ bi+0 ]; - c[ ci+1 ] = a[ ai+1 ] * b[ bi+1 ]; - c[ ci+2 ] = a[ ai+2 ] * b[ bi+2 ]; - c[ ci+3 ] = a[ ai+3 ] * b[ bi+3 ]; - c[ ci+4 ] = a[ ai+4 ] * b[ bi+4 ]; - c[ ci+5 ] = a[ ai+5 ] * b[ bi+5 ]; - c[ ci+6 ] = a[ ai+6 ] * b[ bi+6 ]; - c[ ci+7 ] = a[ ai+7 ] * b[ bi+7 ]; + DoubleVector aVec = DoubleVector.fromArray(SPECIES, a, ai); + DoubleVector bVec = DoubleVector.fromArray(SPECIES, b, bi); + aVec = aVec.mul(bVec); + aVec.intoArray(c, ci); } } @@ -4039,47 +4007,37 @@ public static void vectMultiplyWrite( final double[] a, double[] b, double[] c, } } - public static void vectMultiply(double[] a, double[] c, int ai, int ci, final int len) - { - final int bn = len%8; + public static void vectMultiply(double[] a, double[] c, int ai, int ci, final int len){ + + final int bn = len%vLen; - //rest, not aligned to 8-blocks + //rest, not aligned to vLen-blocks for( int j = 0; j < bn; j++, ai++, ci++) c[ ci ] *= a[ ai ]; - //unrolled 8-block (for better instruction-level parallelism) - for( int j = bn; j < len; j+=8, ai+=8, ci+=8) + //unrolled vLen-block (for better instruction-level parallelism) + for( int j = bn; j < len; j+=vLen, ai+=vLen, ci+=vLen) { - //read 64B cachelines of a and c - 
//compute c' = c * a - //write back 64B cacheline of c = c' - c[ ci+0 ] *= a[ ai+0 ]; - c[ ci+1 ] *= a[ ai+1 ]; - c[ ci+2 ] *= a[ ai+2 ]; - c[ ci+3 ] *= a[ ai+3 ]; - c[ ci+4 ] *= a[ ai+4 ]; - c[ ci+5 ] *= a[ ai+5 ]; - c[ ci+6 ] *= a[ ai+6 ]; - c[ ci+7 ] *= a[ ai+7 ]; + DoubleVector res = DoubleVector.fromArray(SPECIES, c, ci); + DoubleVector aVec = DoubleVector.fromArray(SPECIES, a, ai); + + res = aVec.mul(res); + res.intoArray(c, ci); } } //note: public for use by codegen for consistency public static void vectAdd( double[] a, double bval, double[] c, int ai, int ci, final int len ) { - final int bn = len%8; - //rest, not aligned to 8-blocks + final int bn = len%vLen; + //rest, not aligned to vLen-blocks for( int j = 0; j < bn; j++, ai++, ci++) c[ ci ] += a[ ai ]; - //unrolled 8-block (for better ILP) - for( int j = bn; j < len; j+=8, ai+=8, ci+=8) { - c[ ci+0 ] += a[ ai+0 ] + bval; - c[ ci+1 ] += a[ ai+1 ] + bval; - c[ ci+2 ] += a[ ai+2 ] + bval; - c[ ci+3 ] += a[ ai+3 ] + bval; - c[ ci+4 ] += a[ ai+4 ] + bval; - c[ ci+5 ] += a[ ai+5 ] + bval; - c[ ci+6 ] += a[ ai+6 ] + bval; - c[ ci+7 ] += a[ ai+7 ] + bval; + //unrolled vLen-block (for better ILP) + for( int j = bn; j < len; j+=vLen, ai+=vLen, ci+=vLen) { + DoubleVector res = DoubleVector.fromArray(SPECIES, c, ci); + DoubleVector aVec = DoubleVector.fromArray(SPECIES, a, ai); + res = aVec.add(bval).add(res); // c += a + bval, same as the replaced unrolled kernel + res.intoArray(c, ci); } } diff --git a/src/main/python/systemds/utils/converters.py b/src/main/python/systemds/utils/converters.py index 61a4769e806..650d6380605 100644 --- a/src/main/python/systemds/utils/converters.py +++ b/src/main/python/systemds/utils/converters.py @@ -72,6 +72,7 @@ def matrix_block_to_numpy(jvm: JVMView, mb: JavaObject): :param jvm: The current JVM instance running systemds. :param mb: A pointer to the JVM's MatrixBlock object.
""" + num_ros = mb.getNumRows() num_cols = mb.getNumColumns() buf = jvm.org.apache.sysds.runtime.util.Py4jConverterUtils.convertMBtoPy4JDenseArr( diff --git a/src/test/java/org/apache/sysds/performance/Main.java b/src/test/java/org/apache/sysds/performance/Main.java index fc749b56df5..f8d0bbea852 100644 --- a/src/test/java/org/apache/sysds/performance/Main.java +++ b/src/test/java/org/apache/sysds/performance/Main.java @@ -33,7 +33,8 @@ import org.apache.sysds.performance.generators.MatrixFile; import org.apache.sysds.performance.matrix.MatrixAppend; import org.apache.sysds.performance.matrix.MatrixBinaryCellPerf; -import org.apache.sysds.performance.matrix.MatrixMulPerformance; +import org.apache.sysds.performance.matrix.MatrixMultiplicationPerf; +import org.apache.sysds.performance.matrix.MMSparsityPerformance; import org.apache.sysds.performance.matrix.MatrixReplacePerf; import org.apache.sysds.performance.matrix.MatrixStorage; import org.apache.sysds.performance.matrix.ReshapePerf; @@ -139,6 +140,9 @@ private static void exec(int prog, String[] args) throws Exception { case 1008: MatrixAppend.main(args); break; + case 1009: + MatrixMultiplicationPerf.main(args); + break; default: break; } @@ -235,9 +239,9 @@ private static void run17(String[] args) throws Exception { } private static void run1000(String[] args) { - MatrixMulPerformance perf; + MMSparsityPerformance perf; if (args.length < 3) { - perf = new MatrixMulPerformance(); + perf = new MMSparsityPerformance(); } else { // ... 
[resolution] [maxSparsity] [resolution] [warmupRuns] [repetitions] int rl = Integer.parseInt(args[1]); @@ -256,7 +260,7 @@ private static void run1000(String[] args) { if (args.length > 6) repetitions = Integer.parseInt(args[6]); - perf = new MatrixMulPerformance(rl, cl, warmupRuns, repetitions, resolution, maxSparsity, 2f); + perf = new MMSparsityPerformance(rl, cl, warmupRuns, repetitions, resolution, maxSparsity, 2f); } perf.testSparseFormat(null, null); diff --git a/src/test/java/org/apache/sysds/performance/README.md b/src/test/java/org/apache/sysds/performance/README.md index 4945afd9ab5..79bc8fa8f36 100644 --- a/src/test/java/org/apache/sysds/performance/README.md +++ b/src/test/java/org/apache/sysds/performance/README.md @@ -28,7 +28,7 @@ mvn package Example of running it: ```bash -java -jar target/systemds-3.3.0-SNAPSHOT-perf.jar 1 +java -jar target/systemds-3.4.0-SNAPSHOT-perf.jar 1 ``` example result of the above job: @@ -49,45 +49,45 @@ Running Steam Compression Test With profiler: ```bash -java -jar -agentpath:$HOME/Programs/profiler/lib/libasyncProfiler.so=start,event=cpu,file=temp/log.html target/systemds-3.3.0-SNAPSHOT-perf.jar 12 10000 100 4 1.0 16 1000 -1 +java -jar -agentpath:$HOME/Programs/profiler/lib/libasyncProfiler.so=start,event=cpu,file=temp/log.html target/systemds-3.4.0-SNAPSHOT-perf.jar 12 10000 100 4 1.0 16 1000 -1 ``` Take a Matrix and perform serialization ```bash -java -jar -agentpath:$HOME/Programs/profiler/lib/libasyncProfiler.so=start,event=cpu,file=temp/log.html target/systemds-3.3.0-SNAPSHOT-perf.jar 13 16 100 "temp/test.csv" -1 +java -jar -agentpath:$HOME/Programs/profiler/lib/libasyncProfiler.so=start,event=cpu,file=temp/log.html target/systemds-3.4.0-SNAPSHOT-perf.jar 13 16 100 "temp/test.csv" -1 ``` Take a Frame and transform into a Matrix and perform serialization. 
```bash -java -jar -agentpath:$HOME/Programs/profiler/lib/libasyncProfiler.so=start,event=cpu,file=temp/log.html target/systemds-3.3.0-SNAPSHOT-perf.jar 14 16 1000 "src/test/resources/datasets/titanic/titanic.csv" "src/test/resources/datasets/titanic/tfspec.json" -1 +java -jar -agentpath:$HOME/Programs/profiler/lib/libasyncProfiler.so=start,event=cpu,file=temp/log.html target/systemds-3.4.0-SNAPSHOT-perf.jar 14 16 1000 "src/test/resources/datasets/titanic/titanic.csv" "src/test/resources/datasets/titanic/tfspec.json" -1 ``` Frame Operation timings ```bash -java -jar -agentpath:$HOME/Programs/profiler/lib/libasyncProfiler.so=start,event=cpu,file=temp/log.html target/systemds-3.3.0-SNAPSHOT-perf.jar 15 16 10 "src/test/resources/datasets/titanic/titanic.csv" "src/test/resources/datasets/titanic/tfspec.json" +java -jar -agentpath:$HOME/Programs/profiler/lib/libasyncProfiler.so=start,event=cpu,file=temp/log.html target/systemds-3.4.0-SNAPSHOT-perf.jar 15 16 10 "src/test/resources/datasets/titanic/titanic.csv" "src/test/resources/datasets/titanic/tfspec.json" ``` Reshape Sparse ```bash -java -cp "target/systemds-3.3.0-SNAPSHOT-perf.jar:target/lib/*" -agentpath:$HOME/Programs/profiler/lib/libasyncProfiler.so=start,event=cpu,file=temp/log.html org.apache.sysds.performance.Main 1005 +java -cp "target/systemds-3.4.0-SNAPSHOT-perf.jar:target/lib/*" -agentpath:$HOME/Programs/profiler/lib/libasyncProfiler.so=start,event=cpu,file=temp/log.html org.apache.sysds.performance.Main 1005 ``` Binary Operations ```bash -java -jar -agentpath:$HOME/Programs/profiler/lib/libasyncProfiler.so=start,event=cpu,file=temp/log.html -XX:+UseNUMA target/systemds-3.3.0-SNAPSHOT-perf.jar 1006 500 +java -jar -agentpath:$HOME/Programs/profiler/lib/libasyncProfiler.so=start,event=cpu,file=temp/log.html -XX:+UseNUMA target/systemds-3.4.0-SNAPSHOT-perf.jar 1006 500 ``` transform encode ```bash -java -jar -agentpath:$HOME/Programs/profiler/lib/libasyncProfiler.so=start,event=cpu,file=temp/log.html 
-XX:+UseNUMA target/systemds-3.3.0-SNAPSHOT-perf.jar 1007 +java -jar -agentpath:$HOME/Programs/profiler/lib/libasyncProfiler.so=start,event=cpu,file=temp/log.html -XX:+UseNUMA target/systemds-3.4.0-SNAPSHOT-perf.jar 1007 ``` @@ -96,3 +96,10 @@ append matrix sequence ```bash ./src/test/scripts/performance/append.sh ``` + + +matrix multiplication + +```bash +./src/test/scripts/performance/matrixMultiplication.sh +``` \ No newline at end of file diff --git a/src/test/java/org/apache/sysds/performance/generators/IGeneratePair.java b/src/test/java/org/apache/sysds/performance/generators/IGeneratePair.java new file mode 100644 index 00000000000..06a4d9065a8 --- /dev/null +++ b/src/test/java/org/apache/sysds/performance/generators/IGeneratePair.java @@ -0,0 +1,57 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.sysds.performance.generators; + +import org.apache.sysds.runtime.matrix.data.Pair; + +public class IGeneratePair<A, B> implements IGenerate<Pair<A, B>> { + + private final IGenerate<A> a; + private final IGenerate<B> b; + + public IGeneratePair(IGenerate<A> a, IGenerate<B> b) { + this.a = a; + this.b = b; + } + + @Override + public boolean isEmpty() { + return a.isEmpty() && b.isEmpty(); + } + + @Override + public int defaultWaitTime() { + return Math.max(a.defaultWaitTime(), b.defaultWaitTime()); + } + + @Override + public Pair<A, B> take() { + A av = a.take(); + B bv = b.take(); + return new Pair<>(av, bv); + } + + @Override + public void generate(int N) throws InterruptedException { + a.generate(N); + b.generate(N); + } + +} diff --git a/src/test/java/org/apache/sysds/performance/matrix/MatrixMulPerformance.java b/src/test/java/org/apache/sysds/performance/matrix/MMSparsityPerformance.java similarity index 95% rename from src/test/java/org/apache/sysds/performance/matrix/MatrixMulPerformance.java rename to src/test/java/org/apache/sysds/performance/matrix/MMSparsityPerformance.java index f201c8fd7a5..0c77b07dfcd 100644 --- a/src/test/java/org/apache/sysds/performance/matrix/MatrixMulPerformance.java +++ b/src/test/java/org/apache/sysds/performance/matrix/MMSparsityPerformance.java @@ -31,7 +31,7 @@ import org.apache.sysds.runtime.util.DataConverter; import org.apache.sysds.test.TestUtils; -public class MatrixMulPerformance { +public class MMSparsityPerformance { private final int _rl; private final int _cl; @@ -42,11 +42,11 @@ public class MatrixMulPerformance { private final float resolutionDivisor; private final float maxSparsity; - public MatrixMulPerformance() { + public MMSparsityPerformance() { this(1024, 1024, 15, 50, 18, .4f, 2f); } - public MatrixMulPerformance(int rl, int cl, int warmupRuns, int repetitions, + public MMSparsityPerformance(int rl, int cl, int warmupRuns, int repetitions, int resolution, float maxSparsity, float stepDivisor) { _rl = rl; @@
-89,10 +89,12 @@ private static String printAsPythonList(double[] list) { sb.append("["); for (double el : list) - sb.append(el + ","); + sb.append(el + ", "); - if (list.length > 0) + if (list.length > 0){ sb.deleteCharAt(sb.length() - 1); + sb.deleteCharAt(sb.length() - 1); + } sb.append("]"); return sb.toString(); diff --git a/src/test/java/org/apache/sysds/performance/matrix/MatrixMultiplicationPerf.java b/src/test/java/org/apache/sysds/performance/matrix/MatrixMultiplicationPerf.java new file mode 100644 index 00000000000..757bbcd0854 --- /dev/null +++ b/src/test/java/org/apache/sysds/performance/matrix/MatrixMultiplicationPerf.java @@ -0,0 +1,88 @@ +package org.apache.sysds.performance.matrix; + +import java.util.Arrays; + +import org.apache.sysds.performance.compression.APerfTest; +import org.apache.sysds.performance.generators.ConstMatrix; +import org.apache.sysds.performance.generators.IGenerate; +import org.apache.sysds.performance.generators.IGeneratePair; +import org.apache.sysds.runtime.instructions.InstructionUtils; +import org.apache.sysds.runtime.matrix.data.MatrixBlock; +import org.apache.sysds.runtime.matrix.data.Pair; +import org.apache.sysds.utils.stats.InfrastructureAnalyzer; + +public class MatrixMultiplicationPerf extends APerfTest<Object, Pair<MatrixBlock, MatrixBlock>> { + + // parallelization degree + private final int k; + + public MatrixMultiplicationPerf(int N, IGenerate<Pair<MatrixBlock, MatrixBlock>> gen, int k) { + super(N, gen); + this.k = k; + } + + public void run() throws Exception { + warmup(() -> mm(k), 10); + execute(() -> mm(1), "mm SingleThread", N/10); + if(k != 1) { + execute(() -> mm(k), "mm MultiThread: " + k); + } + } + + private void mm(int k) { + Pair<MatrixBlock, MatrixBlock> in = gen.take(); + MatrixBlock left = in.getKey(); + MatrixBlock right = in.getValue(); + left.aggregateBinaryOperations(left, right, InstructionUtils.getMatMultOperator(k)); + ret.add(null); + } + + @Override + protected String makeResString() { + return ""; + } + + public static void main(String[] args) throws Exception { + + IGenerate<MatrixBlock>
left; + IGenerate<MatrixBlock> right; + final int i; + final int j; + final int k; + final double sp1; + final double sp2; + if(args.length == 0) { + i = Integer.parseInt(args[1]); + j = Integer.parseInt(args[2]); + k = Integer.parseInt(args[3]); + + sp1 = Double.parseDouble(args[4]); + sp2 = Double.parseDouble(args[5]); + + } + else { + + i = Integer.parseInt(args[1]); + j = Integer.parseInt(args[2]); + k = Integer.parseInt(args[3]); + + sp1 = Double.parseDouble(args[4]); + sp2 = Double.parseDouble(args[5]); + + } + + left = new ConstMatrix(i, j, 10, sp1); + right = new ConstMatrix(j, k, 10, sp2); + IGenerate<Pair<MatrixBlock, MatrixBlock>> gen = new IGeneratePair<>(left, right); + + // set number of repeats based on expected number of instructions. + + long inst = (long) i * k * j; + + int N = Math.min(100000, (int) Math.max(100L, 50000000000L / inst)); + + System.out.println("MM Perf : rep " +N+ " -- " + Arrays.toString(args)); + + new MatrixMultiplicationPerf(N, gen, InfrastructureAnalyzer.getLocalParallelism()).run(); + } +} diff --git a/src/test/java/org/apache/sysds/test/AutomatedTestBase.java b/src/test/java/org/apache/sysds/test/AutomatedTestBase.java index 2c3dd11c6d0..a7f5714bf9a 100644 --- a/src/test/java/org/apache/sysds/test/AutomatedTestBase.java +++ b/src/test/java/org/apache/sysds/test/AutomatedTestBase.java @@ -1665,7 +1665,8 @@ protected static Process startLocalFedWorker(int port, String[] addArgs, int sle "--add-opens=java.base/java.lang=ALL-UNNAMED" , "--add-opens=java.base/java.lang.ref=ALL-UNNAMED" , "--add-opens=java.base/java.util.concurrent=ALL-UNNAMED" , - "--add-opens=java.base/sun.nio.ch=ALL-UNNAMED",}; + "--add-opens=java.base/sun.nio.ch=ALL-UNNAMED", + "--add-modules=jdk.incubator.vector",}; RuntimeMXBean runtimeMxBean = ManagementFactory.getRuntimeMXBean(); List<String> jvmArgs = runtimeMxBean.getInputArguments(); diff --git a/src/test/java/org/apache/sysds/test/functions/federated/multitenant/MultiTenantTestBase.java
b/src/test/java/org/apache/sysds/test/functions/federated/multitenant/MultiTenantTestBase.java index 1a716febf97..c3a4756a2d5 100644 --- a/src/test/java/org/apache/sysds/test/functions/federated/multitenant/MultiTenantTestBase.java +++ b/src/test/java/org/apache/sysds/test/functions/federated/multitenant/MultiTenantTestBase.java @@ -124,6 +124,7 @@ protected void startCoordinator(ExecMode execMode, String scriptPath, String[] a "--add-opens=java.base/java.lang.ref=ALL-UNNAMED" , "--add-opens=java.base/java.util.concurrent=ALL-UNNAMED" , "--add-opens=java.base/sun.nio.ch=ALL-UNNAMED", + "--add-modules=jdk.incubator.vector", "-cp", classpath, DMLScript.class.getName()}, argsList.toArray(new String[0]))); Process process = null; diff --git a/src/test/scripts/performance/append.sh b/src/test/scripts/performance/append.sh index d2184dd472f..822de94ba1f 100755 --- a/src/test/scripts/performance/append.sh +++ b/src/test/scripts/performance/append.sh @@ -21,23 +21,23 @@ #------------------------------------------------------------- mvn package > /dev/null -java -jar -XX:+UseNUMA target/systemds-3.3.0-SNAPSHOT-perf.jar 1008 100 100 1.0 1 30000 -java -jar -XX:+UseNUMA target/systemds-3.3.0-SNAPSHOT-perf.jar 1008 1000 100 1.0 1 3000 -java -jar -XX:+UseNUMA target/systemds-3.3.0-SNAPSHOT-perf.jar 1008 1000 1000 1.0 1 3000 -java -jar -XX:+UseNUMA target/systemds-3.3.0-SNAPSHOT-perf.jar 1008 100 100 0.3 1 30000 -java -jar -XX:+UseNUMA target/systemds-3.3.0-SNAPSHOT-perf.jar 1008 1000 100 0.3 1 3000 -java -jar -XX:+UseNUMA target/systemds-3.3.0-SNAPSHOT-perf.jar 1008 1000 1000 0.3 1 3000 +java -jar -XX:+UseNUMA target/systemds-3.4.0-SNAPSHOT-perf.jar 1008 100 100 1.0 1 30000 +java -jar -XX:+UseNUMA target/systemds-3.4.0-SNAPSHOT-perf.jar 1008 1000 100 1.0 1 3000 +java -jar -XX:+UseNUMA target/systemds-3.4.0-SNAPSHOT-perf.jar 1008 1000 1000 1.0 1 3000 +java -jar -XX:+UseNUMA target/systemds-3.4.0-SNAPSHOT-perf.jar 1008 100 100 0.3 1 30000 +java -jar -XX:+UseNUMA 
target/systemds-3.4.0-SNAPSHOT-perf.jar 1008 1000 100 0.3 1 3000 +java -jar -XX:+UseNUMA target/systemds-3.4.0-SNAPSHOT-perf.jar 1008 1000 1000 0.3 1 3000 -# java -jar -XX:+UseNUMA target/systemds-3.3.0-SNAPSHOT-perf.jar 1008 100 100 1.0 10 30000 -# java -jar -XX:+UseNUMA target/systemds-3.3.0-SNAPSHOT-perf.jar 1008 1000 100 1.0 10 3000 -# java -jar -XX:+UseNUMA target/systemds-3.3.0-SNAPSHOT-perf.jar 1008 1000 1000 1.0 10 1000 -# java -jar -XX:+UseNUMA target/systemds-3.3.0-SNAPSHOT-perf.jar 1008 100 100 0.3 10 30000 -# java -jar -XX:+UseNUMA target/systemds-3.3.0-SNAPSHOT-perf.jar 1008 1000 100 0.3 10 3000 -# java -jar -XX:+UseNUMA target/systemds-3.3.0-SNAPSHOT-perf.jar 1008 1000 1000 0.3 10 1000 +# java -jar -XX:+UseNUMA target/systemds-3.4.0-SNAPSHOT-perf.jar 1008 100 100 1.0 10 30000 +# java -jar -XX:+UseNUMA target/systemds-3.4.0-SNAPSHOT-perf.jar 1008 1000 100 1.0 10 3000 +# java -jar -XX:+UseNUMA target/systemds-3.4.0-SNAPSHOT-perf.jar 1008 1000 1000 1.0 10 1000 +# java -jar -XX:+UseNUMA target/systemds-3.4.0-SNAPSHOT-perf.jar 1008 100 100 0.3 10 30000 +# java -jar -XX:+UseNUMA target/systemds-3.4.0-SNAPSHOT-perf.jar 1008 1000 100 0.3 10 3000 +# java -jar -XX:+UseNUMA target/systemds-3.4.0-SNAPSHOT-perf.jar 1008 1000 1000 0.3 10 1000 -# java -jar -XX:+UseNUMA target/systemds-3.3.0-SNAPSHOT-perf.jar 1008 100 100 1.0 100 3000 -# java -jar -XX:+UseNUMA target/systemds-3.3.0-SNAPSHOT-perf.jar 1008 1000 100 1.0 100 300 -# java -jar -XX:+UseNUMA target/systemds-3.3.0-SNAPSHOT-perf.jar 1008 1000 1000 1.0 100 200 -# java -jar -XX:+UseNUMA target/systemds-3.3.0-SNAPSHOT-perf.jar 1008 100 100 0.3 100 3000 -# java -jar -XX:+UseNUMA target/systemds-3.3.0-SNAPSHOT-perf.jar 1008 1000 100 0.3 100 2000 -# java -jar -XX:+UseNUMA target/systemds-3.3.0-SNAPSHOT-perf.jar 1008 1000 1000 0.3 100 1000 \ No newline at end of file +# java -jar -XX:+UseNUMA target/systemds-3.4.0-SNAPSHOT-perf.jar 1008 100 100 1.0 100 3000 +# java -jar -XX:+UseNUMA 
target/systemds-3.4.0-SNAPSHOT-perf.jar 1008 1000 100 1.0 100 300 +# java -jar -XX:+UseNUMA target/systemds-3.4.0-SNAPSHOT-perf.jar 1008 1000 1000 1.0 100 200 +# java -jar -XX:+UseNUMA target/systemds-3.4.0-SNAPSHOT-perf.jar 1008 100 100 0.3 100 3000 +# java -jar -XX:+UseNUMA target/systemds-3.4.0-SNAPSHOT-perf.jar 1008 1000 100 0.3 100 2000 +# java -jar -XX:+UseNUMA target/systemds-3.4.0-SNAPSHOT-perf.jar 1008 1000 1000 0.3 100 1000 \ No newline at end of file diff --git a/src/test/scripts/performance/matrixMultiplication.sh b/src/test/scripts/performance/matrixMultiplication.sh new file mode 100755 index 00000000000..66a2694c5b8 --- /dev/null +++ b/src/test/scripts/performance/matrixMultiplication.sh @@ -0,0 +1,47 @@ +#!/usr/bin/env bash +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+# +#------------------------------------------------------------- + + +mvn package > /dev/null + +cm="java --add-modules=jdk.incubator.vector -jar -XX:+UseNUMA target/systemds-3.4.0-SNAPSHOT-perf.jar 1009" + +$cm 5 5 5 1 1 +$cm 500 5 5 1 1 +$cm 5 500 5 1 1 +$cm 5 5 500 1 1 + + +$cm 100 100 100 1 1 +$cm 1000 100 100 1 1 +$cm 100 1000 100 1 1 +$cm 100 100 1000 1 1 + + +$cm 1000 1000 1000 1 1 + +$cm 10000 1000 1000 1 1 +$cm 1000 10000 1000 1 1 +$cm 1000 1000 10000 1 1 + + +$cm 10000 10000 10000 1 1 \ No newline at end of file From 9cc9f0259371563c15ead64afe3bb3d840be5fd9 Mon Sep 17 00:00:00 2001 From: Sebastian Baunsgaard Date: Thu, 15 May 2025 16:49:37 +0200 Subject: [PATCH 02/11] add boolean to specify single or multithreaded --- .../matrix/MatrixMultiplicationPerf.java | 28 +++++++++-------- .../performance/matrixMultiplication.sh | 31 +++++++++---------- 2 files changed, 29 insertions(+), 30 deletions(-) diff --git a/src/test/java/org/apache/sysds/performance/matrix/MatrixMultiplicationPerf.java b/src/test/java/org/apache/sysds/performance/matrix/MatrixMultiplicationPerf.java index 757bbcd0854..8f4ead21f31 100644 --- a/src/test/java/org/apache/sysds/performance/matrix/MatrixMultiplicationPerf.java +++ b/src/test/java/org/apache/sysds/performance/matrix/MatrixMultiplicationPerf.java @@ -15,18 +15,20 @@ public class MatrixMultiplicationPerf extends APerfTest> gen, int k) { + public MatrixMultiplicationPerf(int N, IGenerate> gen, int k, boolean single) { super(N, gen); this.k = k; + this.single = single; } public void run() throws Exception { warmup(() -> mm(k), 10); - execute(() -> mm(1), "mm SingleThread", N/10); - if(k != 1) { + if(single) + execute(() -> mm(1), "mm SingleThread", N/10); + if(k != 1) execute(() -> mm(k), "mm MultiThread: " + k); - } } private void mm(int k) { @@ -51,23 +53,23 @@ public static void main(String[] args) throws Exception { final int k; final double sp1; final double sp2; + final boolean single; if(args.length == 0) { - i = 
Integer.parseInt(args[1]); - j = Integer.parseInt(args[2]); - k = Integer.parseInt(args[3]); - - sp1 = Double.parseDouble(args[4]); - sp2 = Double.parseDouble(args[5]); - + i = 10; + j = 10; + k = 10; + sp1 = 1.0; + sp2 = 1.0; + single= true; } else { i = Integer.parseInt(args[1]); j = Integer.parseInt(args[2]); k = Integer.parseInt(args[3]); - sp1 = Double.parseDouble(args[4]); sp2 = Double.parseDouble(args[5]); + single = Boolean.parseBoolean(args[6]); } @@ -83,6 +85,6 @@ public static void main(String[] args) throws Exception { System.out.println("MM Perf : rep " +N+ " -- " + Arrays.toString(args)); - new MatrixMultiplicationPerf(N, gen, InfrastructureAnalyzer.getLocalParallelism()).run(); + new MatrixMultiplicationPerf(N, gen, InfrastructureAnalyzer.getLocalParallelism(), single).run(); } } diff --git a/src/test/scripts/performance/matrixMultiplication.sh b/src/test/scripts/performance/matrixMultiplication.sh index 66a2694c5b8..fd43a90b331 100755 --- a/src/test/scripts/performance/matrixMultiplication.sh +++ b/src/test/scripts/performance/matrixMultiplication.sh @@ -21,27 +21,24 @@ #------------------------------------------------------------- -mvn package > /dev/null +mvn package 2>&1 > /dev/null cm="java --add-modules=jdk.incubator.vector -jar -XX:+UseNUMA target/systemds-3.4.0-SNAPSHOT-perf.jar 1009" -$cm 5 5 5 1 1 -$cm 500 5 5 1 1 -$cm 5 500 5 1 1 -$cm 5 5 500 1 1 +$cm 5 5 5 1 1 true +$cm 500 5 5 1 1 true +$cm 5 500 5 1 1 true +$cm 5 5 500 1 1 true +$cm 100 100 100 1 1 true +$cm 1000 100 100 1 1 true +$cm 100 1000 100 1 1 true +$cm 100 100 1000 1 1 true -$cm 100 100 100 1 1 -$cm 1000 100 100 1 1 -$cm 100 1000 100 1 1 -$cm 100 100 1000 1 1 +$cm 1000 1000 1000 1 1 true +$cm 10000 1000 1000 1 1 true +$cm 1000 10000 1000 1 1 true +$cm 1000 1000 10000 1 1 true -$cm 1000 1000 1000 1 1 - -$cm 10000 1000 1000 1 1 -$cm 1000 10000 1000 1 1 -$cm 1000 1000 10000 1 1 - - -$cm 10000 10000 10000 1 1 \ No newline at end of file +$cm 10000 10000 10000 1 1 false \ No newline 
at end of file From a9c0c908b42187cecaa038b916ebc24d0dbb11d0 Mon Sep 17 00:00:00 2001 From: Sebastian Baunsgaard Date: Thu, 15 May 2025 16:50:43 +0200 Subject: [PATCH 03/11] tmp only do the big one --- .../performance/matrixMultiplication.sh | 24 +++++++++---------- 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/src/test/scripts/performance/matrixMultiplication.sh b/src/test/scripts/performance/matrixMultiplication.sh index fd43a90b331..a63763c9019 100755 --- a/src/test/scripts/performance/matrixMultiplication.sh +++ b/src/test/scripts/performance/matrixMultiplication.sh @@ -25,20 +25,20 @@ mvn package 2>&1 > /dev/null cm="java --add-modules=jdk.incubator.vector -jar -XX:+UseNUMA target/systemds-3.4.0-SNAPSHOT-perf.jar 1009" -$cm 5 5 5 1 1 true -$cm 500 5 5 1 1 true -$cm 5 500 5 1 1 true -$cm 5 5 500 1 1 true +# $cm 5 5 5 1 1 true +# $cm 500 5 5 1 1 true +# $cm 5 500 5 1 1 true +# $cm 5 5 500 1 1 true -$cm 100 100 100 1 1 true -$cm 1000 100 100 1 1 true -$cm 100 1000 100 1 1 true -$cm 100 100 1000 1 1 true +# $cm 100 100 100 1 1 true +# $cm 1000 100 100 1 1 true +# $cm 100 1000 100 1 1 true +# $cm 100 100 1000 1 1 true -$cm 1000 1000 1000 1 1 true +# $cm 1000 1000 1000 1 1 true -$cm 10000 1000 1000 1 1 true -$cm 1000 10000 1000 1 1 true -$cm 1000 1000 10000 1 1 true +# $cm 10000 1000 1000 1 1 true +# $cm 1000 10000 1000 1 1 true +# $cm 1000 1000 10000 1 1 true $cm 10000 10000 10000 1 1 false \ No newline at end of file From 1cc76f6269a752bed7217fe1b532ab828273f98c Mon Sep 17 00:00:00 2001 From: Sebastian Baunsgaard Date: Thu, 15 May 2025 19:29:53 +0200 Subject: [PATCH 04/11] fix error --- .../org/apache/sysds/runtime/matrix/data/LibMatrixMult.java | 6 ++---- .../sysds/test/component/matrix/MatrixMultiplyTest.java | 3 ++- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixMult.java b/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixMult.java index 
adb26dce107..2910b37938e 100644 --- a/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixMult.java +++ b/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixMult.java @@ -3933,8 +3933,7 @@ public static void vectMultiplyWrite( final double aval, double[] b, double[] c, for( int j = bn; j < len; j+=vLen, bi+=vLen, ci+=vLen) { DoubleVector bVec = DoubleVector.fromArray(SPECIES, b, bi); - aVec = aVec.mul(bVec); - aVec.intoArray(c, ci); + aVec.mul(bVec).intoArray(c, ci); } } @@ -3984,8 +3983,7 @@ public static void vectMultiplyWrite( double[] a, double[] b, double[] c, int ai { DoubleVector aVec = DoubleVector.fromArray(SPECIES, a, ai); DoubleVector bVec = DoubleVector.fromArray(SPECIES, b, bi); - aVec = aVec.mul(bVec); - aVec.intoArray(c, ci); + aVec.mul(bVec).intoArray(c, ci); } } diff --git a/src/test/java/org/apache/sysds/test/component/matrix/MatrixMultiplyTest.java b/src/test/java/org/apache/sysds/test/component/matrix/MatrixMultiplyTest.java index 0934898bcc2..b862a8a6314 100644 --- a/src/test/java/org/apache/sysds/test/component/matrix/MatrixMultiplyTest.java +++ b/src/test/java/org/apache/sysds/test/component/matrix/MatrixMultiplyTest.java @@ -263,7 +263,8 @@ private void test(MatrixBlock a, MatrixBlock b) { totalMessage += "\n\nExp" + exp; totalMessage += "\n\nAct" + ret; } - + LOG.error(exp.slice(0, 10,0, 10)); + LOG.error(ret.slice(0, 10,0, 10)); assertEquals(totalMessage, exp.getNonZeros(), ret.getNonZeros()); TestUtils.compareMatricesPercentageDistance(exp, ret, 0.999, 0.99999, totalMessage, false); } From c33374f18c50c00fe16b6fb1b07e3e037709e5a0 Mon Sep 17 00:00:00 2001 From: Sebastian Baunsgaard Date: Thu, 15 May 2025 19:40:39 +0200 Subject: [PATCH 05/11] remove print --- .../org/apache/sysds/runtime/matrix/data/LibMatrixMult.java | 3 +-- .../apache/sysds/test/component/matrix/MatrixMultiplyTest.java | 3 +-- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git 
a/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixMult.java b/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixMult.java index 2910b37938e..3982759312a 100644 --- a/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixMult.java +++ b/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixMult.java @@ -3774,8 +3774,7 @@ else if( aixk > bixk ) } //note: public for use by codegen for consistency - public static void vectMultiplyAdd( final double aval, double[] b, double[] c, int bi, int ci, final int len ) - { + public static void vectMultiplyAdd(final double aval, double[] b, double[] c, int bi, int ci, final int len) { final int bn = len%vLen; //rest, not aligned to vLen-blocks diff --git a/src/test/java/org/apache/sysds/test/component/matrix/MatrixMultiplyTest.java b/src/test/java/org/apache/sysds/test/component/matrix/MatrixMultiplyTest.java index b862a8a6314..0934898bcc2 100644 --- a/src/test/java/org/apache/sysds/test/component/matrix/MatrixMultiplyTest.java +++ b/src/test/java/org/apache/sysds/test/component/matrix/MatrixMultiplyTest.java @@ -263,8 +263,7 @@ private void test(MatrixBlock a, MatrixBlock b) { totalMessage += "\n\nExp" + exp; totalMessage += "\n\nAct" + ret; } - LOG.error(exp.slice(0, 10,0, 10)); - LOG.error(ret.slice(0, 10,0, 10)); + assertEquals(totalMessage, exp.getNonZeros(), ret.getNonZeros()); TestUtils.compareMatricesPercentageDistance(exp, ret, 0.999, 0.99999, totalMessage, false); } From 9b933d00d93c5bb47cbc9edfc030d4b2c7cb84fc Mon Sep 17 00:00:00 2001 From: Sebastian Baunsgaard Date: Thu, 15 May 2025 19:43:24 +0200 Subject: [PATCH 06/11] fix --- .../matrix/MatrixMultiplicationPerf.java | 25 ++++++++++++++++--- 1 file changed, 22 insertions(+), 3 deletions(-) diff --git a/src/test/java/org/apache/sysds/performance/matrix/MatrixMultiplicationPerf.java b/src/test/java/org/apache/sysds/performance/matrix/MatrixMultiplicationPerf.java index 8f4ead21f31..df799e79c3b 100644 --- 
a/src/test/java/org/apache/sysds/performance/matrix/MatrixMultiplicationPerf.java +++ b/src/test/java/org/apache/sysds/performance/matrix/MatrixMultiplicationPerf.java @@ -1,3 +1,22 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + package org.apache.sysds.performance.matrix; import java.util.Arrays; @@ -26,7 +45,7 @@ public MatrixMultiplicationPerf(int N, IGenerate> public void run() throws Exception { warmup(() -> mm(k), 10); if(single) - execute(() -> mm(1), "mm SingleThread", N/10); + execute(() -> mm(1), "mm SingleThread", N / 10); if(k != 1) execute(() -> mm(k), "mm MultiThread: " + k); } @@ -60,7 +79,7 @@ public static void main(String[] args) throws Exception { k = 10; sp1 = 1.0; sp2 = 1.0; - single= true; + single = true; } else { @@ -83,7 +102,7 @@ public static void main(String[] args) throws Exception { int N = Math.min(100000, (int) Math.max(100L, 50000000000L / inst)); - System.out.println("MM Perf : rep " +N+ " -- " + Arrays.toString(args)); + System.out.println("MM Perf : rep " + N + " -- " + Arrays.toString(args)); new MatrixMultiplicationPerf(N, gen, InfrastructureAnalyzer.getLocalParallelism(), single).run(); } From 5bd8d1a057043e7a76afc3a40c1bd5a705ec30de Mon Sep 17 00:00:00 2001 From: 
Sebastian Baunsgaard Date: Thu, 15 May 2025 20:00:43 +0200 Subject: [PATCH 07/11] fix MMDictionary vectorized --- .../compress/colgroup/dictionary/MatrixBlockDictionary.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main/java/org/apache/sysds/runtime/compress/colgroup/dictionary/MatrixBlockDictionary.java b/src/main/java/org/apache/sysds/runtime/compress/colgroup/dictionary/MatrixBlockDictionary.java index 54cdf6920ac..24776f3adc4 100644 --- a/src/main/java/org/apache/sysds/runtime/compress/colgroup/dictionary/MatrixBlockDictionary.java +++ b/src/main/java/org/apache/sysds/runtime/compress/colgroup/dictionary/MatrixBlockDictionary.java @@ -2129,7 +2129,7 @@ private static void preaggValuesFromDenseDictBlockedIKJ(double[] a, double[] b, int bie, int bke, int cz, int az, int ls, int cut, int sOffT, int eOffT) { final int vLen = SPECIES.length(); final DoubleVector vVec = DoubleVector.zero(SPECIES); - final int leftover = sOffT - eOffT % vLen; // leftover not vectorized + final int leftover = (eOffT - sOffT) % vLen; // leftover not vectorized for(int i = bi; i < bie; i++) { final int offI = i * cz; final int offOutT = i * az + bj; From 9cfd129a962daad26a3245777353d2d5bad28bc4 Mon Sep 17 00:00:00 2001 From: Sebastian Baunsgaard Date: Thu, 15 May 2025 20:24:03 +0200 Subject: [PATCH 08/11] Increase eps tolerance in RewriteDetTest --- .../org/apache/sysds/test/functions/rewrite/RewriteDetTest.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/test/java/org/apache/sysds/test/functions/rewrite/RewriteDetTest.java b/src/test/java/org/apache/sysds/test/functions/rewrite/RewriteDetTest.java index 288ff0e44e8..245d6b235bd 100644 --- a/src/test/java/org/apache/sysds/test/functions/rewrite/RewriteDetTest.java +++ b/src/test/java/org/apache/sysds/test/functions/rewrite/RewriteDetTest.java @@ -46,7 +46,7 @@ public class RewriteDetTest extends AutomatedTestBase private final static int rows = 23; private final static double 
_sparsityDense = 0.7; private final static double _sparsitySparse = 0.2; - private final static double eps = 1e-8; + private final static double eps = 1e-7; @Override public void setUp() { From 840eb992aa67661e09e1be97db7ab124b7687937 Mon Sep 17 00:00:00 2001 From: Sebastian Baunsgaard Date: Fri, 16 May 2025 15:40:02 +0200 Subject: [PATCH 09/11] add vector api to SystemDS Context in PythonAPI --- src/main/python/systemds/context/systemds_context.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main/python/systemds/context/systemds_context.py b/src/main/python/systemds/context/systemds_context.py index 9385cb991b9..25824bc6631 100644 --- a/src/main/python/systemds/context/systemds_context.py +++ b/src/main/python/systemds/context/systemds_context.py @@ -173,7 +173,7 @@ def __build_startup_command(self, port: int): :param port: The port address to use if -1 chose random port.""" # Base command - command = ["java", "-cp"] + command = ["java", "--add-modules=jdk.incubator.vector", "-cp"] # Find the operating system specifc separator, nt means its Windows cp_separator = ";" if os.name == "nt" else ":" From 8a53047838eb1c18b230644fdb5744c9c05b2132 Mon Sep 17 00:00:00 2001 From: Sebastian Baunsgaard Date: Thu, 22 May 2025 15:10:00 +0200 Subject: [PATCH 10/11] black formatting --- src/main/python/systemds/utils/converters.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main/python/systemds/utils/converters.py b/src/main/python/systemds/utils/converters.py index 650d6380605..551a2332579 100644 --- a/src/main/python/systemds/utils/converters.py +++ b/src/main/python/systemds/utils/converters.py @@ -72,7 +72,7 @@ def matrix_block_to_numpy(jvm: JVMView, mb: JavaObject): :param jvm: The current JVM instance running systemds. :param mb: A pointer to the JVM's MatrixBlock object. 
""" - + num_ros = mb.getNumRows() num_cols = mb.getNumColumns() buf = jvm.org.apache.sysds.runtime.util.Py4jConverterUtils.convertMBtoPy4JDenseArr( From 1a819a309cf98b5fa2068d2f8c16b6241db4d333 Mon Sep 17 00:00:00 2001 From: Sebastian Baunsgaard Date: Thu, 22 May 2025 22:01:54 +0200 Subject: [PATCH 11/11] more error allowed. --- docker/build.sh | 2 +- docker/entrypoint.sh | 4 +++- .../primitives/part3/FederatedWeightedDivMatrixMultTest.java | 2 +- 3 files changed, 5 insertions(+), 3 deletions(-) diff --git a/docker/build.sh b/docker/build.sh index 02e45675f10..2898effdc22 100755 --- a/docker/build.sh +++ b/docker/build.sh @@ -23,7 +23,7 @@ # Build the docker containers # The first build is for running systemds through docker. -docker image build -f docker/sysds.Dockerfile -t apache/systemds:latest . +# docker image build -f docker/sysds.Dockerfile -t apache/systemds:latest . # The second build is for testing systemds. This image installs the R dependencies needed to run the tests. docker image build -f docker/testsysds.Dockerfile -t apache/systemds:testing-latest . diff --git a/docker/entrypoint.sh b/docker/entrypoint.sh index 84bd5e53783..c276a707c35 100755 --- a/docker/entrypoint.sh +++ b/docker/entrypoint.sh @@ -30,7 +30,9 @@ export MAVEN_OPTS="-Xmx512m" log="/tmp/sysdstest.log" mvn -ntp -B test-compile 2>&1 | grep -E "BUILD|Total time:|---|Building SystemDS" -mvn -ntp -B test -D maven.test.skip=false -D automatedtestbase.outputbuffering=true -D test=$1 2>&1 | grep -v "already exists in destination." | tee $log +mvn -ntp -B test -D maven.test.skip=false -D automatedtestbase.outputbuffering=true -D test=$1 2>&1 \ + | grep -v "already exists in destination." \ + | grep -v 'WARNING: Using incubator modules' | tee $log # Merge Federated test runs. 
[ -f target/jacoco.exec ] && mv target/jacoco.exec target/jacoco_main.exec diff --git a/src/test/java/org/apache/sysds/test/functions/federated/primitives/part3/FederatedWeightedDivMatrixMultTest.java b/src/test/java/org/apache/sysds/test/functions/federated/primitives/part3/FederatedWeightedDivMatrixMultTest.java index 17768f237fc..6753774f653 100644 --- a/src/test/java/org/apache/sysds/test/functions/federated/primitives/part3/FederatedWeightedDivMatrixMultTest.java +++ b/src/test/java/org/apache/sysds/test/functions/federated/primitives/part3/FederatedWeightedDivMatrixMultTest.java @@ -61,7 +61,7 @@ public class FederatedWeightedDivMatrixMultTest extends AutomatedTestBase { private final static String OUTPUT_NAME = "Z"; - private final static double TOLERANCE = 1e-9; + private final static double TOLERANCE = 1e-8; private final static int BLOCKSIZE = 1024;