diff --git a/bin/systemds b/bin/systemds
index 2e8e629495b..f0cb0b729b0 100755
--- a/bin/systemds
+++ b/bin/systemds
@@ -413,6 +413,7 @@ if [ $WORKER == 1 ]; then
print_out "# starting Federated worker on port $PORT"
CMD=" \
java $SYSTEMDS_STANDALONE_OPTS \
+ --add-modules=jdk.incubator.vector \
$LOG4JPROPFULL \
-jar $SYSTEMDS_JAR_FILE \
-w $PORT \
@@ -422,6 +423,7 @@ elif [ "$FEDMONITORING" == 1 ]; then
print_out "# starting Federated backend monitoring on port $PORT"
CMD=" \
java $SYSTEMDS_STANDALONE_OPTS \
+ --add-modules=jdk.incubator.vector \
$LOG4JPROPFULL \
-jar $SYSTEMDS_JAR_FILE \
-fedMonitoring $PORT \
@@ -433,6 +435,7 @@ elif [ $SYSDS_DISTRIBUTED == 0 ]; then
CMD=" \
java $SYSTEMDS_STANDALONE_OPTS \
$LOG4JPROPFULL \
+ --add-modules=jdk.incubator.vector \
-jar $SYSTEMDS_JAR_FILE \
-f $SCRIPT_FILE \
-exec $SYSDS_EXEC_MODE \
@@ -442,6 +445,7 @@ else
print_out "# Running script $SCRIPT_FILE distributed with opts: $*"
CMD=" \
spark-submit $SYSTEMDS_DISTRIBUTED_OPTS \
+ --add-modules=jdk.incubator.vector \
$SYSTEMDS_JAR_FILE \
-f $SCRIPT_FILE \
-exec $SYSDS_EXEC_MODE \
diff --git a/docker/build.sh b/docker/build.sh
index 02e45675f10..2898effdc22 100755
--- a/docker/build.sh
+++ b/docker/build.sh
@@ -23,7 +23,7 @@
# Build the docker containers
# The first build is for running systemds through docker.
-docker image build -f docker/sysds.Dockerfile -t apache/systemds:latest .
+# docker image build -f docker/sysds.Dockerfile -t apache/systemds:latest .
# The second build is for testing systemds. This image installs the R dependencies needed to run the tests.
docker image build -f docker/testsysds.Dockerfile -t apache/systemds:testing-latest .
diff --git a/docker/entrypoint.sh b/docker/entrypoint.sh
index 84bd5e53783..c276a707c35 100755
--- a/docker/entrypoint.sh
+++ b/docker/entrypoint.sh
@@ -30,7 +30,9 @@ export MAVEN_OPTS="-Xmx512m"
log="/tmp/sysdstest.log"
mvn -ntp -B test-compile 2>&1 | grep -E "BUILD|Total time:|---|Building SystemDS"
-mvn -ntp -B test -D maven.test.skip=false -D automatedtestbase.outputbuffering=true -D test=$1 2>&1 | grep -v "already exists in destination." | tee $log
+mvn -ntp -B test -D maven.test.skip=false -D automatedtestbase.outputbuffering=true -D test=$1 2>&1 \
+ | grep -v "already exists in destination." \
+ | grep -v 'WARNING: Using incubator modules' | tee $log
# Merge Federated test runs.
[ -f target/jacoco.exec ] && mv target/jacoco.exec target/jacoco_main.exec
diff --git a/pom.xml b/pom.xml
index b25d94cc7db..5d2485897fb 100644
--- a/pom.xml
+++ b/pom.xml
@@ -92,6 +92,7 @@
--add-opens=java.base/java.lang.ref=ALL-UNNAMED
--add-opens=java.base/java.util.concurrent=ALL-UNNAMED
--add-opens=java.base/sun.nio.ch=ALL-UNNAMED
+ --add-modules=jdk.incubator.vector
@@ -357,6 +358,9 @@
${java.level}
${java.level}
${java.level}
+
+ --add-modules=jdk.incubator.vector
+
@@ -904,6 +908,7 @@
true
false
true
+ --add-modules=jdk.incubator.vector
${doc.skip}
public
${java.level}
diff --git a/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupDDC.java b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupDDC.java
index e55a24e56f5..fc82c58e16b 100644
--- a/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupDDC.java
+++ b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupDDC.java
@@ -26,8 +26,8 @@
import java.util.List;
import java.util.concurrent.ExecutorService;
-// import jdk.incubator.vector.DoubleVector;
-// import jdk.incubator.vector.VectorSpecies;
+import jdk.incubator.vector.DoubleVector;
+import jdk.incubator.vector.VectorSpecies;
import org.apache.commons.lang3.NotImplementedException;
import org.apache.sysds.runtime.DMLRuntimeException;
import org.apache.sysds.runtime.compress.CompressedMatrixBlock;
@@ -75,7 +75,7 @@ public class ColGroupDDC extends APreAgg implements IMapToDataGroup {
protected final AMapToData _data;
- // static final VectorSpecies SPECIES = DoubleVector.SPECIES_PREFERRED;
+ static final VectorSpecies SPECIES = DoubleVector.SPECIES_PREFERRED;
private ColGroupDDC(IColIndex colIndexes, IDictionary dict, AMapToData data, int[] cachedCounts) {
super(colIndexes, dict, cachedCounts);
@@ -625,7 +625,8 @@ private void identityRightDecompressingMult(MatrixBlock right, MatrixBlock ret,
final double[] b = right.getDenseBlockValues();
final double[] c = ret.getDenseBlockValues();
final int jd = right.getNumColumns();
- final int vLen = 8;
+ final DoubleVector vVec = DoubleVector.zero(SPECIES);
+ final int vLen = SPECIES.length();
final int lenJ = cru - crl;
final int end = cru - (lenJ % vLen);
for(int i = rl; i < ru; i++) {
@@ -633,8 +634,7 @@ private void identityRightDecompressingMult(MatrixBlock right, MatrixBlock ret,
final int offOut = i * jd + crl;
final double aa = 1;
final int k_right = _colIndexes.get(k);
- vectMM(aa, b, c, end, jd, crl, cru, offOut, k_right, vLen);
-
+ vectMM(aa, b, c, end, jd, crl, cru, offOut, k_right, vLen, vVec);
}
}
@@ -644,8 +644,8 @@ private void defaultRightDecompressingMult(MatrixBlock right, MatrixBlock ret, i
final double[] c = ret.getDenseBlockValues();
final int kd = _colIndexes.size();
final int jd = right.getNumColumns();
- // final DoubleVector vVec = DoubleVector.zero(SPECIES);
- final int vLen = 8;
+ final DoubleVector vVec = DoubleVector.zero(SPECIES);
+ final int vLen = SPECIES.length();
final int blkzI = 32;
final int blkzK = 24;
@@ -661,32 +661,22 @@ private void defaultRightDecompressingMult(MatrixBlock right, MatrixBlock ret, i
for(int k = bk; k < bke; k++) {
final double aa = a[offi + k];
final int k_right = _colIndexes.get(k);
- vectMM(aa, b, c, end, jd, crl, cru, offOut, k_right, vLen);
+ vectMM(aa, b, c, end, jd, crl, cru, offOut, k_right, vLen, vVec);
}
}
}
}
}
- final void vectMM(double aa, double[] b, double[] c, int endT, int jd, int crl, int cru, int offOut, int k,
- int vLen) {
- // vVec = vVec.broadcast(aa);
+ final void vectMM(double aa, double[] b, double[] c, int endT, int jd, int crl, int cru, int offOut, int k, int vLen, DoubleVector vVec) {
+ vVec = vVec.broadcast(aa);
final int offj = k * jd;
final int end = endT + offj;
for(int j = offj + crl; j < end; j += vLen, offOut += vLen) {
- // DoubleVector res = DoubleVector.fromArray(SPECIES, c, offOut);
- // DoubleVector bVec = DoubleVector.fromArray(SPECIES, b, j);
- // res = vVec.fma(bVec, res);
- // res.intoArray(c, offOut);
-
- c[offOut] += aa * b[j];
- c[offOut + 1] += aa * b[j + 1];
- c[offOut + 2] += aa * b[j + 2];
- c[offOut + 3] += aa * b[j + 3];
- c[offOut + 4] += aa * b[j + 4];
- c[offOut + 5] += aa * b[j + 5];
- c[offOut + 6] += aa * b[j + 6];
- c[offOut + 7] += aa * b[j + 7];
+ DoubleVector res = DoubleVector.fromArray(SPECIES, c, offOut);
+ DoubleVector bVec = DoubleVector.fromArray(SPECIES, b, j);
+ res = vVec.fma(bVec, res);
+ res.intoArray(c, offOut);
}
for(int j = end; j < cru + offj; j++, offOut++) {
double bb = b[j];
diff --git a/src/main/java/org/apache/sysds/runtime/compress/colgroup/dictionary/MatrixBlockDictionary.java b/src/main/java/org/apache/sysds/runtime/compress/colgroup/dictionary/MatrixBlockDictionary.java
index 1d6949cbcd7..24776f3adc4 100644
--- a/src/main/java/org/apache/sysds/runtime/compress/colgroup/dictionary/MatrixBlockDictionary.java
+++ b/src/main/java/org/apache/sysds/runtime/compress/colgroup/dictionary/MatrixBlockDictionary.java
@@ -27,6 +27,8 @@
import java.util.Arrays;
import java.util.Set;
+import jdk.incubator.vector.DoubleVector;
+import jdk.incubator.vector.VectorSpecies;
import org.apache.commons.lang3.NotImplementedException;
import org.apache.sysds.runtime.compress.DMLCompressionException;
import org.apache.sysds.runtime.compress.colgroup.indexes.ArrayIndex;
@@ -65,6 +67,8 @@ public class MatrixBlockDictionary extends ADictionary {
final private MatrixBlock _data;
+ static final VectorSpecies SPECIES = DoubleVector.SPECIES_PREFERRED;
+
/**
* Unsafe private constructor that does not check the data validity. USE WITH CAUTION.
*
@@ -2088,7 +2092,71 @@ private void preaggValuesFromDenseDictDenseAggArray(final int numVals, final ICo
private void preaggValuesFromDenseDictDenseAggRange(final int numVals, final IColIndex colIndexes, final int s,
final int e, final double[] b, final int cut, final double[] ret) {
- preaggValuesFromDenseDictDenseAggRangeGeneric(numVals, colIndexes, s, e, b, cut, ret);
+ // Anything that is not a RangeIndex keeps using the generic kernel.
+ if(!(colIndexes instanceof RangeIndex)) {
+ preaggValuesFromDenseDictDenseAggRangeGeneric(numVals, colIndexes, s, e, b, cut, ret);
+ return;
+ }
+ // A RangeIndex is handled as the span [get(0), get(0) + size()) by the blocked, vectorized kernel.
+ final RangeIndex range = (RangeIndex) colIndexes;
+ final int firstCol = range.get(0);
+ final int endCol = firstCol + range.size();
+ preaggValuesFromDenseDictDenseAggRangeRange(numVals, firstCol, endCol, s, e, b, cut, ret);
+ }
+
+ /**
+ * Blocked i-k-j multiplication of the dense dictionary values with a slice of b, accumulated into ret.
+ *
+ * For every i in [0, numVals), k in [0, le - ls) and j in [0, re - rs) this adds
+ * values[i * (le - ls) + k] * b[(k + ls) * cut + rs + j] to ret[i * (re - rs) + j].
+ *
+ * @param numVals number of dictionary rows to process
+ * @param ls      first row of b to use (inclusive)
+ * @param le      last row of b to use (exclusive)
+ * @param rs      first offset inside each row of b (inclusive)
+ * @param re      last offset inside each row of b (exclusive)
+ * @param b       right hand side values, indexed with cut as the row stride
+ * @param cut     row stride used when indexing b
+ * @param ret     output array with re - rs cells per dictionary row, accumulated in place
+ */
+ private void preaggValuesFromDenseDictDenseAggRangeRange(final int numVals, final int ls, final int le, final int rs,
+ final int re, final double[] b, final int cut, final double[] ret) {
+ final int cz = le - ls;
+ final int az = re - rs;
+ // cz = cells per dictionary row, az = cells per output row
+ final double[] values = _data.getDenseBlockValues();
+ // i-k-j loop order: i over dictionary rows, k over the shared dimension, j over output columns (innermost).
+
+ final int blkzI = 32;
+ final int blkzK = 24;
+ final int blkzJ = 1024;
+ for(int bi = 0; bi < numVals; bi += blkzI) {
+ final int bie = Math.min(numVals, bi + blkzI);
+ for(int bk = 0; bk < cz; bk += blkzK) {
+ final int bke = Math.min(cz, bk + blkzK);
+ for(int bj = 0; bj < az; bj += blkzJ) {
+ final int bje = Math.min(az, bj + blkzJ);
+ final int sOffT = rs + bj;
+ final int eOffT = rs + bje;
+ preaggValuesFromDenseDictBlockedIKJ(values, b, ret, bi, bk, bj, bie, bke, cz, az, ls, cut, sOffT, eOffT);
+ }
+ }
+ }
+ }
+
+ /**
+ * Processes one (i, k, j) block of the blocked multiplication: rows [bi, bie) and columns [bk, bke) of a
+ * against the offsets [sOffT, eOffT) inside each used row of b, accumulating into ret.
+ *
+ * @param a     dictionary values, cz cells per row
+ * @param b     right hand side values, cut cells per row
+ * @param ret   output values, az cells per row
+ * @param bi    first dictionary row of the block (inclusive)
+ * @param bk    first dictionary column of the block (inclusive)
+ * @param bj    first output column of the block
+ * @param bie   last dictionary row of the block (exclusive)
+ * @param bke   last dictionary column of the block (exclusive)
+ * @param cz    row length of a
+ * @param az    row length of ret
+ * @param ls    row offset added to k when addressing b
+ * @param cut   row stride of b
+ * @param sOffT start offset inside a row of b (inclusive)
+ * @param eOffT end offset inside a row of b (exclusive)
+ */
+ private static void preaggValuesFromDenseDictBlockedIKJ(double[] a, double[] b, double[] ret, int bi, int bk, int bj,
+ int bie, int bke, int cz, int az, int ls, int cut, int sOffT, int eOffT) {
+ final int vLen = SPECIES.length();
+ final DoubleVector vVec = DoubleVector.zero(SPECIES);
+ final int leftover = (eOffT - sOffT) % vLen; // tail cells that do not fill a whole vector; handled by the scalar loop
+ for(int i = bi; i < bie; i++) {
+ final int offI = i * cz;
+ final int offOutT = i * az + bj;
+ for(int k = bk; k < bke; k++) {
+ // start of row (k + ls) in b
+ final int idb = (k + ls) * cut;
+ final int sOff = sOffT + idb;
+ final int eOff = eOffT + idb;
+ final double v = a[offI + k];
+ vecInnerLoop(v, b, ret, offOutT, eOff, sOff, leftover, vLen, vVec);
+ }
+ }
+ }
+
+ /**
+ * Adds v * b[sOff + t] to ret[offOutT + t] for every t in [0, eOff - sOff).
+ *
+ * @param v        scalar multiplied with every cell of the b slice
+ * @param b        source values
+ * @param ret      output values, accumulated in place
+ * @param offOutT  first output offset in ret
+ * @param eOff     end offset in b (exclusive)
+ * @param sOff     start offset in b (inclusive)
+ * @param leftover number of trailing cells processed by the scalar loop
+ * @param vLen     number of doubles processed per vector iteration
+ * @param vVec     vector used only as the receiver of broadcast(v)
+ */
+ private static void vecInnerLoop(final double v, final double[] b, final double[] ret, final int offOutT,
+ final int eOff, final int sOff, final int leftover, final int vLen, DoubleVector vVec) {
+ int offOut = offOutT;
+ vVec = vVec.broadcast(v);
+ // vector part stops where fewer than vLen cells remain
+ final int end = eOff - leftover;
+ for(int j = sOff; j < end; j += vLen, offOut += vLen) {
+ DoubleVector res = DoubleVector.fromArray(SPECIES, ret, offOut);
+ DoubleVector bVec = DoubleVector.fromArray(SPECIES, b, j);
+ // ret' = v * b + ret
+ vVec.fma(bVec, res).intoArray(ret, offOut);
+ }
+ // scalar tail for the leftover cells
+ for(int j = end; j < eOff; j++, offOut++) {
+ ret[offOut] += v * b[j];
+ }
+
}
private void preaggValuesFromDenseDictDenseAggRangeGeneric(final int numVals, final IColIndex colIndexes,
diff --git a/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixMult.java b/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixMult.java
index 66f7c3c9445..3982759312a 100644
--- a/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixMult.java
+++ b/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixMult.java
@@ -29,6 +29,9 @@
import java.util.concurrent.Future;
import java.util.stream.IntStream;
+import jdk.incubator.vector.DoubleVector;
+import jdk.incubator.vector.VectorOperators;
+import jdk.incubator.vector.VectorSpecies;
import org.apache.commons.lang3.NotImplementedException;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
@@ -78,6 +81,8 @@ public class LibMatrixMult
public static final int L2_CACHESIZE = 256 * 1024; //256KB (common size)
public static final int L3_CACHESIZE = 16 * 1024 * 1024; //16MB (common size)
private static final Log LOG = LogFactory.getLog(LibMatrixMult.class.getName());
+ private static final VectorSpecies SPECIES = DoubleVector.SPECIES_PREFERRED;
+ private static final int vLen = SPECIES.length();
private LibMatrixMult() {
//prevent instantiation via private constructor
@@ -3668,25 +3673,18 @@ private static void matrixMultWuMMGeneric (MatrixBlock mW, MatrixBlock mU, Matri
private static double dotProduct( double[] a, double[] b, final int len )
{
double val = 0;
- final int bn = len%8;
+
+ final int bn = len%vLen;
//compute rest
for( int i = 0; i < bn; i++ )
val += a[ i ] * b[ i ];
- //unrolled 8-block (for better instruction-level parallelism)
- for( int i = bn; i < len; i+=8 )
- {
- //read 64B cachelines of a and b
- //compute cval' = sum(a * b) + cval
- val += a[ i+0 ] * b[ i+0 ]
- + a[ i+1 ] * b[ i+1 ]
- + a[ i+2 ] * b[ i+2 ]
- + a[ i+3 ] * b[ i+3 ]
- + a[ i+4 ] * b[ i+4 ]
- + a[ i+5 ] * b[ i+5 ]
- + a[ i+6 ] * b[ i+6 ]
- + a[ i+7 ] * b[ i+7 ];
+ //unrolled vLen-block (for better instruction-level parallelism)
+ for( int i = bn; i < len; i+=vLen ){
+ DoubleVector bVec = DoubleVector.fromArray(SPECIES, b, i);
+ DoubleVector aVec = DoubleVector.fromArray(SPECIES, a, i);
+ val += aVec.mul(bVec).reduceLanes(VectorOperators.ADD);
}
//scalar result
@@ -3697,25 +3695,18 @@ private static double dotProduct( double[] a, double[] b, final int len )
public static double dotProduct( double[] a, double[] b, int ai, int bi, final int len )
{
double val = 0;
- final int bn = len%8;
+ final int bn = len%vLen;
//compute rest
for( int i = 0; i < bn; i++, ai++, bi++ )
val += a[ ai ] * b[ bi ];
- //unrolled 8-block (for better instruction-level parallelism)
- for( int i = bn; i < len; i+=8, ai+=8, bi+=8 )
+ //unrolled vLen-block (for better instruction-level parallelism)
+ for( int i = bn; i < len; i+=vLen, ai+=vLen, bi+=vLen )
{
- //read 64B cachelines of a and b
- //compute cval' = sum(a * b) + cval
- val += a[ ai+0 ] * b[ bi+0 ]
- + a[ ai+1 ] * b[ bi+1 ]
- + a[ ai+2 ] * b[ bi+2 ]
- + a[ ai+3 ] * b[ bi+3 ]
- + a[ ai+4 ] * b[ bi+4 ]
- + a[ ai+5 ] * b[ bi+5 ]
- + a[ ai+6 ] * b[ bi+6 ]
- + a[ ai+7 ] * b[ bi+7 ];
+ DoubleVector bVec = DoubleVector.fromArray(SPECIES, b, bi);
+ DoubleVector aVec = DoubleVector.fromArray(SPECIES, a, ai);
+ val += aVec.mul(bVec).reduceLanes(VectorOperators.ADD);
}
//scalar result
@@ -3783,103 +3774,93 @@ else if( aixk > bixk )
}
//note: public for use by codegen for consistency
- public static void vectMultiplyAdd( final double aval, double[] b, double[] c, int bi, int ci, final int len )
- {
- final int bn = len%8;
+ public static void vectMultiplyAdd(final double aval, double[] b, double[] c, int bi, int ci, final int len) {
+ //computes c[ci..ci+len) += aval * b[bi..bi+len)
+ final int bn = len%vLen;
- //rest, not aligned to 8-blocks
+ //scalar head: the first len%vLen elements, so the vector loop only sees whole vLen-blocks
for( int j = 0; j < bn; j++, bi++, ci++)
c[ ci ] += aval * b[ bi ];
- //unrolled 8-block (for better instruction-level parallelism)
- for( int j = bn; j < len; j+=8, bi+=8, ci+=8)
+ DoubleVector aVec = DoubleVector.broadcast(SPECIES, aval);
+ //vectorized vLen-block: load b and c, compute c' = aval * b + c with one fma, store c'
+ for( int j = bn; j < len; j+=vLen, bi+=vLen, ci+=vLen)
{
- //read 64B cachelines of b and c
- //compute c' = aval * b + c
- //write back 64B cacheline of c = c'
- c[ ci+0 ] += aval * b[ bi+0 ];
- c[ ci+1 ] += aval * b[ bi+1 ];
- c[ ci+2 ] += aval * b[ bi+2 ];
- c[ ci+3 ] += aval * b[ bi+3 ];
- c[ ci+4 ] += aval * b[ bi+4 ];
- c[ ci+5 ] += aval * b[ bi+5 ];
- c[ ci+6 ] += aval * b[ bi+6 ];
- c[ ci+7 ] += aval * b[ bi+7 ];
+ DoubleVector bVec = DoubleVector.fromArray(SPECIES, b, bi);
+ DoubleVector cVec = DoubleVector.fromArray(SPECIES, c, ci);
+ cVec = aVec.fma(bVec, cVec);
+ cVec.intoArray(c, ci);
}
}
private static void vectMultiplyAdd2( final double aval1, final double aval2, double[] b, double[] c, int bi1, int bi2, int ci, final int len )
{
- final int bn = len%8;
+ //computes c[ci..ci+len) += aval1 * b[bi1..bi1+len) + aval2 * b[bi2..bi2+len)
+ final int bn = len%vLen;
- //rest, not aligned to 8-blocks
+ //scalar head: the first len%vLen elements, so the vector loop only sees whole vLen-blocks
for( int j = 0; j < bn; j++, bi1++, bi2++, ci++ )
c[ ci ] += aval1 * b[ bi1 ] + aval2 * b[ bi2 ];
- //unrolled 8-block (for better instruction-level parallelism)
- for( int j = bn; j < len; j+=8, bi1+=8, bi2+=8, ci+=8 )
- {
- //read 64B cachelines of b (2x) and c
- //compute c' = aval_1 * b_1 + aval_2 * b_2 + c
- //write back 64B cacheline of c = c'
- c[ ci+0 ] += aval1 * b[ bi1+0 ] + aval2 * b[ bi2+0 ];
- c[ ci+1 ] += aval1 * b[ bi1+1 ] + aval2 * b[ bi2+1 ];
- c[ ci+2 ] += aval1 * b[ bi1+2 ] + aval2 * b[ bi2+2 ];
- c[ ci+3 ] += aval1 * b[ bi1+3 ] + aval2 * b[ bi2+3 ];
- c[ ci+4 ] += aval1 * b[ bi1+4 ] + aval2 * b[ bi2+4 ];
- c[ ci+5 ] += aval1 * b[ bi1+5 ] + aval2 * b[ bi2+5 ];
- c[ ci+6 ] += aval1 * b[ bi1+6 ] + aval2 * b[ bi2+6 ];
- c[ ci+7 ] += aval1 * b[ bi1+7 ] + aval2 * b[ bi2+7 ];
+ DoubleVector aVec1 = DoubleVector.broadcast(SPECIES, aval1);
+ DoubleVector aVec2 = DoubleVector.broadcast(SPECIES, aval2);
+ //vectorized vLen-block: two chained fma calls, c' = aval1 * b1 + c, then c' = aval2 * b2 + c'
+ for( int j = bn; j < len; j+=vLen, bi1+=vLen, bi2+=vLen, ci+=vLen ) {
+ DoubleVector bVec1 = DoubleVector.fromArray(SPECIES, b, bi1);
+ DoubleVector bVec2 = DoubleVector.fromArray(SPECIES, b, bi2);
+ DoubleVector cVec = DoubleVector.fromArray(SPECIES, c, ci);
+ cVec = aVec1.fma(bVec1, cVec);
+ cVec = aVec2.fma(bVec2, cVec);
+ cVec.intoArray(c, ci);
}
}
private static void vectMultiplyAdd3( final double aval1, final double aval2, final double aval3, double[] b, double[] c, int bi1, int bi2, int bi3, int ci, final int len )
{
- final int bn = len%8;
-
- //rest, not aligned to 8-blocks
+ //computes c[ci..ci+len) += aval1 * b[bi1..] + aval2 * b[bi2..] + aval3 * b[bi3..]
+ final int bn = len%vLen;
+ //scalar head: the first len%vLen elements, so the vector loop only sees whole vLen-blocks
for( int j = 0; j < bn; j++, bi1++, bi2++, bi3++, ci++ )
c[ ci ] += aval1 * b[ bi1 ] + aval2 * b[ bi2 ] + aval3 * b[ bi3 ];
- //unrolled 8-block (for better instruction-level parallelism)
- for( int j = bn; j < len; j+=8, bi1+=8, bi2+=8, bi3+=8, ci+=8 )
- {
- //read 64B cachelines of b (3x) and c
- //compute c' = aval_1 * b_1 + aval_2 * b_2 + c
- //write back 64B cacheline of c = c'
- c[ ci+0 ] += aval1 * b[ bi1+0 ] + aval2 * b[ bi2+0 ] + aval3 * b[ bi3+0 ];
- c[ ci+1 ] += aval1 * b[ bi1+1 ] + aval2 * b[ bi2+1 ] + aval3 * b[ bi3+1 ];
- c[ ci+2 ] += aval1 * b[ bi1+2 ] + aval2 * b[ bi2+2 ] + aval3 * b[ bi3+2 ];
- c[ ci+3 ] += aval1 * b[ bi1+3 ] + aval2 * b[ bi2+3 ] + aval3 * b[ bi3+3 ];
- c[ ci+4 ] += aval1 * b[ bi1+4 ] + aval2 * b[ bi2+4 ] + aval3 * b[ bi3+4 ];
- c[ ci+5 ] += aval1 * b[ bi1+5 ] + aval2 * b[ bi2+5 ] + aval3 * b[ bi3+5 ];
- c[ ci+6 ] += aval1 * b[ bi1+6 ] + aval2 * b[ bi2+6 ] + aval3 * b[ bi3+6 ];
- c[ ci+7 ] += aval1 * b[ bi1+7 ] + aval2 * b[ bi2+7 ] + aval3 * b[ bi3+7 ];
+ DoubleVector aVec1 = DoubleVector.broadcast(SPECIES, aval1);
+ DoubleVector aVec2 = DoubleVector.broadcast(SPECIES, aval2);
+ DoubleVector aVec3 = DoubleVector.broadcast(SPECIES, aval3);
+ //vectorized vLen-block: three chained fma calls accumulate aval_i * b_i into c
+ for( int j = bn; j < len; j+=vLen, bi1+=vLen, bi2+=vLen, bi3+=vLen, ci+=vLen )
+ {
+ DoubleVector bVec1 = DoubleVector.fromArray(SPECIES, b, bi1);
+ DoubleVector bVec2 = DoubleVector.fromArray(SPECIES, b, bi2);
+ DoubleVector bVec3 = DoubleVector.fromArray(SPECIES, b, bi3);
+ DoubleVector cVec = DoubleVector.fromArray(SPECIES, c, ci);
+ cVec = aVec1.fma(bVec1, cVec);
+ cVec = aVec2.fma(bVec2, cVec);
+ cVec = aVec3.fma(bVec3, cVec);
+ cVec.intoArray(c, ci);
}
}
private static void vectMultiplyAdd4( final double aval1, final double aval2, final double aval3, final double aval4, double[] b, double[] c, int bi1, int bi2, int bi3, int bi4, int ci, final int len )
{
- final int bn = len%8;
-
- //rest, not aligned to 8-blocks
+ //computes c[ci..ci+len) += aval1 * b[bi1..] + aval2 * b[bi2..] + aval3 * b[bi3..] + aval4 * b[bi4..]
+ final int bn = len%vLen;
+ //scalar head: the first len%vLen elements, so the vector loop only sees whole vLen-blocks
for( int j = 0; j < bn; j++, bi1++, bi2++, bi3++, bi4++, ci++ )
c[ ci ] += aval1 * b[ bi1 ] + aval2 * b[ bi2 ] + aval3 * b[ bi3 ] + aval4 * b[ bi4 ];
- //unrolled 8-block (for better instruction-level parallelism)
- for( int j = bn; j < len; j+=8, bi1+=8, bi2+=8, bi3+=8, bi4+=8, ci+=8)
+ DoubleVector aVec1 = DoubleVector.broadcast(SPECIES, aval1);
+ DoubleVector aVec2 = DoubleVector.broadcast(SPECIES, aval2);
+ DoubleVector aVec3 = DoubleVector.broadcast(SPECIES, aval3);
+ DoubleVector aVec4 = DoubleVector.broadcast(SPECIES, aval4);
+ //vectorized vLen-block: four chained fma calls accumulate aval_i * b_i into c
+ for( int j = bn; j < len; j+=vLen, bi1+=vLen, bi2+=vLen, bi3+=vLen, bi4+=vLen, ci+=vLen)
{
- //read 64B cachelines of b (4x) and c
- //compute c' = aval_1 * b_1 + aval_2 * b_2 + c
- //write back 64B cacheline of c = c'
- c[ ci+0 ] += aval1 * b[ bi1+0 ] + aval2 * b[ bi2+0 ] + aval3 * b[ bi3+0 ] + aval4 * b[ bi4+0 ];
- c[ ci+1 ] += aval1 * b[ bi1+1 ] + aval2 * b[ bi2+1 ] + aval3 * b[ bi3+1 ] + aval4 * b[ bi4+1 ];
- c[ ci+2 ] += aval1 * b[ bi1+2 ] + aval2 * b[ bi2+2 ] + aval3 * b[ bi3+2 ] + aval4 * b[ bi4+2 ];
- c[ ci+3 ] += aval1 * b[ bi1+3 ] + aval2 * b[ bi2+3 ] + aval3 * b[ bi3+3 ] + aval4 * b[ bi4+3 ];
- c[ ci+4 ] += aval1 * b[ bi1+4 ] + aval2 * b[ bi2+4 ] + aval3 * b[ bi3+4 ] + aval4 * b[ bi4+4 ];
- c[ ci+5 ] += aval1 * b[ bi1+5 ] + aval2 * b[ bi2+5 ] + aval3 * b[ bi3+5 ] + aval4 * b[ bi4+5 ];
- c[ ci+6 ] += aval1 * b[ bi1+6 ] + aval2 * b[ bi2+6 ] + aval3 * b[ bi3+6 ] + aval4 * b[ bi4+6 ];
- c[ ci+7 ] += aval1 * b[ bi1+7 ] + aval2 * b[ bi2+7 ] + aval3 * b[ bi3+7 ] + aval4 * b[ bi4+7 ];
+ DoubleVector bVec1 = DoubleVector.fromArray(SPECIES, b, bi1);
+ DoubleVector bVec2 = DoubleVector.fromArray(SPECIES, b, bi2);
+ DoubleVector bVec3 = DoubleVector.fromArray(SPECIES, b, bi3);
+ DoubleVector bVec4 = DoubleVector.fromArray(SPECIES, b, bi4);
+ DoubleVector cVec = DoubleVector.fromArray(SPECIES, c, ci);
+ cVec = aVec1.fma(bVec1, cVec);
+ cVec = aVec2.fma(bVec2, cVec);
+ cVec = aVec3.fma(bVec3, cVec);
+ cVec = aVec4.fma(bVec4, cVec);
+ cVec.intoArray(c, ci);
}
}
@@ -3940,26 +3921,18 @@ public static void vectMultiplyAdd( final double aval, double[] b, double[] c, i
//note: public for use by codegen for consistency
public static void vectMultiplyWrite( final double aval, double[] b, double[] c, int bi, int ci, final int len )
{
- final int bn = len%8;
+ //computes c[ci..ci+len) = aval * b[bi..bi+len), overwriting c
+ final int bn = len%vLen;
- //rest, not aligned to 8-blocks
+ //scalar head: the first len%vLen elements, so the vector loop only sees whole vLen-blocks
for( int j = 0; j < bn; j++, bi++, ci++)
c[ ci ] = aval * b[ bi ];
- //unrolled 8-block (for better instruction-level parallelism)
- for( int j = bn; j < len; j+=8, bi+=8, ci+=8)
+ //vectorized vLen-block: load b, multiply by the broadcast aval, store into c
+ DoubleVector aVec = DoubleVector.broadcast(SPECIES, aval);
+ for( int j = bn; j < len; j+=vLen, bi+=vLen, ci+=vLen)
{
- //read 64B cachelines of b and c
- //compute c' = aval * b + c
- //write back 64B cacheline of c = c'
- c[ ci+0 ] = aval * b[ bi+0 ];
- c[ ci+1 ] = aval * b[ bi+1 ];
- c[ ci+2 ] = aval * b[ bi+2 ];
- c[ ci+3 ] = aval * b[ bi+3 ];
- c[ ci+4 ] = aval * b[ bi+4 ];
- c[ ci+5 ] = aval * b[ bi+5 ];
- c[ ci+6 ] = aval * b[ bi+6 ];
- c[ ci+7 ] = aval * b[ bi+7 ];
+ DoubleVector bVec = DoubleVector.fromArray(SPECIES, b, bi);
+ aVec.mul(bVec).intoArray(c, ci);
}
}
@@ -3996,28 +3969,20 @@ public static void vectMultiplyInPlace(final double[] a, double[] c, int[] cix,
}
//note: public for use by codegen for consistency
- public static void vectMultiplyWrite( double[] a, double[] b, double[] c, int ai, int bi, int ci, final int len )
- {
- final int bn = len%8;
+ public static void vectMultiplyWrite( double[] a, double[] b, double[] c, int ai, int bi, int ci, final int len ){
+ //computes c[ci..ci+len) = a[ai..ai+len) * b[bi..bi+len) element-wise, overwriting c
+ final int bn = len%vLen;
- //rest, not aligned to 8-blocks
+ //scalar head: the first len%vLen elements, so the vector loop only sees whole vLen-blocks
for( int j = 0; j < bn; j++, ai++, bi++, ci++)
c[ ci ] = a[ ai ] * b[ bi ];
- //unrolled 8-block (for better instruction-level parallelism)
- for( int j = bn; j < len; j+=8, ai+=8, bi+=8, ci+=8)
+ //vectorized vLen-block: load a and b, multiply lane-wise, store into c
+ for( int j = bn; j < len; j+=vLen, ai+=vLen, bi+=vLen, ci+=vLen)
{
- //read 64B cachelines of a and b
- //compute c' = a * b
- //write back 64B cacheline of c = c'
- c[ ci+0 ] = a[ ai+0 ] * b[ bi+0 ];
- c[ ci+1 ] = a[ ai+1 ] * b[ bi+1 ];
- c[ ci+2 ] = a[ ai+2 ] * b[ bi+2 ];
- c[ ci+3 ] = a[ ai+3 ] * b[ bi+3 ];
- c[ ci+4 ] = a[ ai+4 ] * b[ bi+4 ];
- c[ ci+5 ] = a[ ai+5 ] * b[ bi+5 ];
- c[ ci+6 ] = a[ ai+6 ] * b[ bi+6 ];
- c[ ci+7 ] = a[ ai+7 ] * b[ bi+7 ];
+ DoubleVector aVec = DoubleVector.fromArray(SPECIES, a, ai);
+ DoubleVector bVec = DoubleVector.fromArray(SPECIES, b, bi);
+ aVec.mul(bVec).intoArray(c, ci);
}
}
@@ -4039,47 +4004,37 @@ public static void vectMultiplyWrite( final double[] a, double[] b, double[] c,
}
}
- public static void vectMultiply(double[] a, double[] c, int ai, int ci, final int len)
- {
- final int bn = len%8;
+ public static void vectMultiply(double[] a, double[] c, int ai, int ci, final int len){
+ //computes c[ci..ci+len) *= a[ai..ai+len) element-wise, in place
+ final int bn = len%vLen;
- //rest, not aligned to 8-blocks
+ //scalar head: the first len%vLen elements, so the vector loop only sees whole vLen-blocks
for( int j = 0; j < bn; j++, ai++, ci++)
c[ ci ] *= a[ ai ];
- //unrolled 8-block (for better instruction-level parallelism)
- for( int j = bn; j < len; j+=8, ai+=8, ci+=8)
+ //vectorized vLen-block: load c and a, multiply lane-wise, store back into c
+ for( int j = bn; j < len; j+=vLen, ai+=vLen, ci+=vLen)
{
- //read 64B cachelines of a and c
- //compute c' = c * a
- //write back 64B cacheline of c = c'
- c[ ci+0 ] *= a[ ai+0 ];
- c[ ci+1 ] *= a[ ai+1 ];
- c[ ci+2 ] *= a[ ai+2 ];
- c[ ci+3 ] *= a[ ai+3 ];
- c[ ci+4 ] *= a[ ai+4 ];
- c[ ci+5 ] *= a[ ai+5 ];
- c[ ci+6 ] *= a[ ai+6 ];
- c[ ci+7 ] *= a[ ai+7 ];
+ DoubleVector res = DoubleVector.fromArray(SPECIES, c, ci);
+ DoubleVector aVec = DoubleVector.fromArray(SPECIES, a, ai);
+
+ res = aVec.mul(res);
+ res.intoArray(c, ci);
}
}
//note: public for use by codegen for consistency
public static void vectAdd( double[] a, double bval, double[] c, int ai, int ci, final int len ) {
- final int bn = len%8;
- //rest, not aligned to 8-blocks
+ //computes c[ci..ci+len) += a[ai..ai+len) + bval
+ final int bn = len%vLen;
+ //scalar head: the first len%vLen elements; must add bval exactly like the vector loop below
for( int j = 0; j < bn; j++, ai++, ci++)
- c[ ci ] += a[ ai ];
+ c[ ci ] += a[ ai ] + bval;
- //unrolled 8-block (for better ILP)
- for( int j = bn; j < len; j+=8, ai+=8, ci+=8) {
- c[ ci+0 ] += a[ ai+0 ] + bval;
- c[ ci+1 ] += a[ ai+1 ] + bval;
- c[ ci+2 ] += a[ ai+2 ] + bval;
- c[ ci+3 ] += a[ ai+3 ] + bval;
- c[ ci+4 ] += a[ ai+4 ] + bval;
- c[ ci+5 ] += a[ ai+5 ] + bval;
- c[ ci+6 ] += a[ ai+6 ] + bval;
- c[ ci+7 ] += a[ ai+7 ] + bval;
+ //vectorized vLen-block: c' = (a + bval) + c
+ for( int j = bn; j < len; j+=vLen, ai+=vLen, ci+=vLen) {
+ DoubleVector res = DoubleVector.fromArray(SPECIES, c, ci);
+ DoubleVector aVec = DoubleVector.fromArray(SPECIES, a, ai);
+ //add bval to a first, then accumulate into c: same evaluation order as the removed scalar code
+ res = aVec.add(bval).add(res);
+ res.intoArray(c, ci);
}
}
diff --git a/src/main/python/systemds/context/systemds_context.py b/src/main/python/systemds/context/systemds_context.py
index 9385cb991b9..25824bc6631 100644
--- a/src/main/python/systemds/context/systemds_context.py
+++ b/src/main/python/systemds/context/systemds_context.py
@@ -173,7 +173,7 @@ def __build_startup_command(self, port: int):
:param port: The port address to use if -1 chose random port."""
# Base command
- command = ["java", "-cp"]
+ command = ["java", "--add-modules=jdk.incubator.vector", "-cp"]
# Find the operating system specifc separator, nt means its Windows
cp_separator = ";" if os.name == "nt" else ":"
diff --git a/src/main/python/systemds/utils/converters.py b/src/main/python/systemds/utils/converters.py
index 61a4769e806..551a2332579 100644
--- a/src/main/python/systemds/utils/converters.py
+++ b/src/main/python/systemds/utils/converters.py
@@ -72,6 +72,7 @@ def matrix_block_to_numpy(jvm: JVMView, mb: JavaObject):
:param jvm: The current JVM instance running systemds.
:param mb: A pointer to the JVM's MatrixBlock object.
"""
+
num_ros = mb.getNumRows()
num_cols = mb.getNumColumns()
buf = jvm.org.apache.sysds.runtime.util.Py4jConverterUtils.convertMBtoPy4JDenseArr(
diff --git a/src/test/java/org/apache/sysds/performance/Main.java b/src/test/java/org/apache/sysds/performance/Main.java
index fc749b56df5..f8d0bbea852 100644
--- a/src/test/java/org/apache/sysds/performance/Main.java
+++ b/src/test/java/org/apache/sysds/performance/Main.java
@@ -33,7 +33,8 @@
import org.apache.sysds.performance.generators.MatrixFile;
import org.apache.sysds.performance.matrix.MatrixAppend;
import org.apache.sysds.performance.matrix.MatrixBinaryCellPerf;
-import org.apache.sysds.performance.matrix.MatrixMulPerformance;
+import org.apache.sysds.performance.matrix.MatrixMultiplicationPerf;
+import org.apache.sysds.performance.matrix.MMSparsityPerformance;
import org.apache.sysds.performance.matrix.MatrixReplacePerf;
import org.apache.sysds.performance.matrix.MatrixStorage;
import org.apache.sysds.performance.matrix.ReshapePerf;
@@ -139,6 +140,9 @@ private static void exec(int prog, String[] args) throws Exception {
case 1008:
MatrixAppend.main(args);
break;
+ case 1009:
+ MatrixMultiplicationPerf.main(args);
+ break;
default:
break;
}
@@ -235,9 +239,9 @@ private static void run17(String[] args) throws Exception {
}
private static void run1000(String[] args) {
- MatrixMulPerformance perf;
+ MMSparsityPerformance perf;
if (args.length < 3) {
- perf = new MatrixMulPerformance();
+ perf = new MMSparsityPerformance();
} else {
// ... [resolution] [maxSparsity] [resolution] [warmupRuns] [repetitions]
int rl = Integer.parseInt(args[1]);
@@ -256,7 +260,7 @@ private static void run1000(String[] args) {
if (args.length > 6)
repetitions = Integer.parseInt(args[6]);
- perf = new MatrixMulPerformance(rl, cl, warmupRuns, repetitions, resolution, maxSparsity, 2f);
+ perf = new MMSparsityPerformance(rl, cl, warmupRuns, repetitions, resolution, maxSparsity, 2f);
}
perf.testSparseFormat(null, null);
diff --git a/src/test/java/org/apache/sysds/performance/README.md b/src/test/java/org/apache/sysds/performance/README.md
index 4945afd9ab5..79bc8fa8f36 100644
--- a/src/test/java/org/apache/sysds/performance/README.md
+++ b/src/test/java/org/apache/sysds/performance/README.md
@@ -28,7 +28,7 @@ mvn package
Example of running it:
```bash
-java -jar target/systemds-3.3.0-SNAPSHOT-perf.jar 1
+java -jar target/systemds-3.4.0-SNAPSHOT-perf.jar 1
```
example result of the above job:
@@ -49,45 +49,45 @@ Running Steam Compression Test
With profiler:
```bash
-java -jar -agentpath:$HOME/Programs/profiler/lib/libasyncProfiler.so=start,event=cpu,file=temp/log.html target/systemds-3.3.0-SNAPSHOT-perf.jar 12 10000 100 4 1.0 16 1000 -1
+java -jar -agentpath:$HOME/Programs/profiler/lib/libasyncProfiler.so=start,event=cpu,file=temp/log.html target/systemds-3.4.0-SNAPSHOT-perf.jar 12 10000 100 4 1.0 16 1000 -1
```
Take a Matrix and perform serialization
```bash
-java -jar -agentpath:$HOME/Programs/profiler/lib/libasyncProfiler.so=start,event=cpu,file=temp/log.html target/systemds-3.3.0-SNAPSHOT-perf.jar 13 16 100 "temp/test.csv" -1
+java -jar -agentpath:$HOME/Programs/profiler/lib/libasyncProfiler.so=start,event=cpu,file=temp/log.html target/systemds-3.4.0-SNAPSHOT-perf.jar 13 16 100 "temp/test.csv" -1
```
Take a Frame and transform into a Matrix and perform serialization.
```bash
-java -jar -agentpath:$HOME/Programs/profiler/lib/libasyncProfiler.so=start,event=cpu,file=temp/log.html target/systemds-3.3.0-SNAPSHOT-perf.jar 14 16 1000 "src/test/resources/datasets/titanic/titanic.csv" "src/test/resources/datasets/titanic/tfspec.json" -1
+java -jar -agentpath:$HOME/Programs/profiler/lib/libasyncProfiler.so=start,event=cpu,file=temp/log.html target/systemds-3.4.0-SNAPSHOT-perf.jar 14 16 1000 "src/test/resources/datasets/titanic/titanic.csv" "src/test/resources/datasets/titanic/tfspec.json" -1
```
Frame Operation timings
```bash
-java -jar -agentpath:$HOME/Programs/profiler/lib/libasyncProfiler.so=start,event=cpu,file=temp/log.html target/systemds-3.3.0-SNAPSHOT-perf.jar 15 16 10 "src/test/resources/datasets/titanic/titanic.csv" "src/test/resources/datasets/titanic/tfspec.json"
+java -jar -agentpath:$HOME/Programs/profiler/lib/libasyncProfiler.so=start,event=cpu,file=temp/log.html target/systemds-3.4.0-SNAPSHOT-perf.jar 15 16 10 "src/test/resources/datasets/titanic/titanic.csv" "src/test/resources/datasets/titanic/tfspec.json"
```
Reshape Sparse
```bash
-java -cp "target/systemds-3.3.0-SNAPSHOT-perf.jar:target/lib/*" -agentpath:$HOME/Programs/profiler/lib/libasyncProfiler.so=start,event=cpu,file=temp/log.html org.apache.sysds.performance.Main 1005
+java -cp "target/systemds-3.4.0-SNAPSHOT-perf.jar:target/lib/*" -agentpath:$HOME/Programs/profiler/lib/libasyncProfiler.so=start,event=cpu,file=temp/log.html org.apache.sysds.performance.Main 1005
```
Binary Operations
```bash
-java -jar -agentpath:$HOME/Programs/profiler/lib/libasyncProfiler.so=start,event=cpu,file=temp/log.html -XX:+UseNUMA target/systemds-3.3.0-SNAPSHOT-perf.jar 1006 500
+java -jar -agentpath:$HOME/Programs/profiler/lib/libasyncProfiler.so=start,event=cpu,file=temp/log.html -XX:+UseNUMA target/systemds-3.4.0-SNAPSHOT-perf.jar 1006 500
```
transform encode
```bash
-java -jar -agentpath:$HOME/Programs/profiler/lib/libasyncProfiler.so=start,event=cpu,file=temp/log.html -XX:+UseNUMA target/systemds-3.3.0-SNAPSHOT-perf.jar 1007
+java -jar -agentpath:$HOME/Programs/profiler/lib/libasyncProfiler.so=start,event=cpu,file=temp/log.html -XX:+UseNUMA target/systemds-3.4.0-SNAPSHOT-perf.jar 1007
```
@@ -96,3 +96,10 @@ append matrix sequence
```bash
./src/test/scripts/performance/append.sh
```
+
+
+matrix multiplication
+
+```bash
+./src/test/scripts/performance/matrixMultiplication.sh
+```
\ No newline at end of file
diff --git a/src/test/java/org/apache/sysds/performance/generators/IGeneratePair.java b/src/test/java/org/apache/sysds/performance/generators/IGeneratePair.java
new file mode 100644
index 00000000000..06a4d9065a8
--- /dev/null
+++ b/src/test/java/org/apache/sysds/performance/generators/IGeneratePair.java
@@ -0,0 +1,57 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.sysds.performance.generators;
+
+import org.apache.sysds.runtime.matrix.data.Pair;
+
+public class IGeneratePair<A, B> implements IGenerate<Pair<A, B>> {
+	// Couples two generators: every take() draws one value from each and returns both as a Pair.
+	private final IGenerate<A> a;
+	private final IGenerate<B> b;
+
+	public IGeneratePair(IGenerate<A> a, IGenerate<B> b) {
+		this.a = a;
+		this.b = b;
+	}
+	/** Empty only when BOTH generators report empty. NOTE(review): confirm '&&' rather than '||' is intended. */
+	@Override
+	public boolean isEmpty() {
+		return a.isEmpty() && b.isEmpty();
+	}
+	/** Uses the larger of the two generators' default wait times. */
+	@Override
+	public int defaultWaitTime() {
+		return Math.max(a.defaultWaitTime(), b.defaultWaitTime());
+	}
+	/** Takes from {@code a} first, then from {@code b}, and bundles both values into one Pair. */
+	@Override
+	public Pair<A, B> take() {
+		A av = a.take();
+		B bv = b.take();
+		return new Pair<>(av, bv);
+	}
+	/** Calls {@code generate(N)} on {@code a} and then on {@code b}, passing the same {@code N} to both. */
+	@Override
+	public void generate(int N) throws InterruptedException {
+		a.generate(N);
+		b.generate(N);
+	}
+
+}
diff --git a/src/test/java/org/apache/sysds/performance/matrix/MatrixMulPerformance.java b/src/test/java/org/apache/sysds/performance/matrix/MMSparsityPerformance.java
similarity index 95%
rename from src/test/java/org/apache/sysds/performance/matrix/MatrixMulPerformance.java
rename to src/test/java/org/apache/sysds/performance/matrix/MMSparsityPerformance.java
index f201c8fd7a5..0c77b07dfcd 100644
--- a/src/test/java/org/apache/sysds/performance/matrix/MatrixMulPerformance.java
+++ b/src/test/java/org/apache/sysds/performance/matrix/MMSparsityPerformance.java
@@ -31,7 +31,7 @@
import org.apache.sysds.runtime.util.DataConverter;
import org.apache.sysds.test.TestUtils;
-public class MatrixMulPerformance {
+public class MMSparsityPerformance {
private final int _rl;
private final int _cl;
@@ -42,11 +42,11 @@ public class MatrixMulPerformance {
private final float resolutionDivisor;
private final float maxSparsity;
- public MatrixMulPerformance() {
+ public MMSparsityPerformance() {
this(1024, 1024, 15, 50, 18, .4f, 2f);
}
- public MatrixMulPerformance(int rl, int cl, int warmupRuns, int repetitions,
+ public MMSparsityPerformance(int rl, int cl, int warmupRuns, int repetitions,
int resolution, float maxSparsity, float stepDivisor)
{
_rl = rl;
@@ -89,10 +89,12 @@ private static String printAsPythonList(double[] list) {
sb.append("[");
for (double el : list)
- sb.append(el + ",");
+ sb.append(el + ", ");
- if (list.length > 0)
+ if (list.length > 0){
sb.deleteCharAt(sb.length() - 1);
+ sb.deleteCharAt(sb.length() - 1);
+ }
sb.append("]");
return sb.toString();
diff --git a/src/test/java/org/apache/sysds/performance/matrix/MatrixMultiplicationPerf.java b/src/test/java/org/apache/sysds/performance/matrix/MatrixMultiplicationPerf.java
new file mode 100644
index 00000000000..df799e79c3b
--- /dev/null
+++ b/src/test/java/org/apache/sysds/performance/matrix/MatrixMultiplicationPerf.java
@@ -0,0 +1,109 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.sysds.performance.matrix;
+
+import java.util.Arrays;
+
+import org.apache.sysds.performance.compression.APerfTest;
+import org.apache.sysds.performance.generators.ConstMatrix;
+import org.apache.sysds.performance.generators.IGenerate;
+import org.apache.sysds.performance.generators.IGeneratePair;
+import org.apache.sysds.runtime.instructions.InstructionUtils;
+import org.apache.sysds.runtime.matrix.data.MatrixBlock;
+import org.apache.sysds.runtime.matrix.data.Pair;
+import org.apache.sysds.utils.stats.InfrastructureAnalyzer;
+
+public class MatrixMultiplicationPerf extends APerfTest