From 59b858c8460e0cfaca73d2388ba5188563cb5d9e Mon Sep 17 00:00:00 2001
From: Sebastian Baunsgaard <baunsgaard@apache.org>
Date: Tue, 4 Feb 2025 15:33:15 +0100
Subject: [PATCH 01/11] [SYSTEMDS-3874] Java17 Vectorized LibMM

This commit adds vectorized kernels for matrix multiplication.

fix mm error

Perf mm

bigger scale

remove compile log
---
 bin/systemds                                  |   4 +
 pom.xml                                       |   5 +
 .../compress/colgroup/ColGroupDDC.java        |  40 +--
 .../dictionary/MatrixBlockDictionary.java     |  70 ++++-
 .../runtime/matrix/data/LibMatrixMult.java    | 264 ++++++++----------
 src/main/python/systemds/utils/converters.py  |   1 +
 .../org/apache/sysds/performance/Main.java    |  12 +-
 .../org/apache/sysds/performance/README.md    |  23 +-
 .../performance/generators/IGeneratePair.java |  57 ++++
 ...rmance.java => MMSparsityPerformance.java} |  12 +-
 .../matrix/MatrixMultiplicationPerf.java      |  88 ++++++
 .../apache/sysds/test/AutomatedTestBase.java  |   3 +-
 .../multitenant/MultiTenantTestBase.java      |   1 +
 src/test/scripts/performance/append.sh        |  36 +--
 .../performance/matrixMultiplication.sh       |  47 ++++
 15 files changed, 448 insertions(+), 215 deletions(-)
 create mode 100644 src/test/java/org/apache/sysds/performance/generators/IGeneratePair.java
 rename src/test/java/org/apache/sysds/performance/matrix/{MatrixMulPerformance.java => MMSparsityPerformance.java} (95%)
 create mode 100644 src/test/java/org/apache/sysds/performance/matrix/MatrixMultiplicationPerf.java
 create mode 100755 src/test/scripts/performance/matrixMultiplication.sh
diff --git a/bin/systemds b/bin/systemds
index 2e8e629495b..f0cb0b729b0 100755
--- a/bin/systemds
+++ b/bin/systemds
@@ -413,6 +413,7 @@ if [ $WORKER == 1 ]; then
   print_out "#  starting Federated worker on port $PORT"
   CMD=" \
   java $SYSTEMDS_STANDALONE_OPTS \
+  --add-modules=jdk.incubator.vector \
   $LOG4JPROPFULL \
   -jar $SYSTEMDS_JAR_FILE \
   -w $PORT \
@@ -422,6 +423,7 @@ elif [ "$FEDMONITORING" == 1 ]; then
   print_out "#  starting Federated backend monitoring on port $PORT"
   CMD=" \
   java $SYSTEMDS_STANDALONE_OPTS \
+  --add-modules=jdk.incubator.vector \
   $LOG4JPROPFULL \
   -jar $SYSTEMDS_JAR_FILE \
   -fedMonitoring $PORT \
@@ -433,6 +435,7 @@ elif [ $SYSDS_DISTRIBUTED == 0 ]; then
   CMD=" \
   java $SYSTEMDS_STANDALONE_OPTS \
   $LOG4JPROPFULL \
+  --add-modules=jdk.incubator.vector \
   -jar $SYSTEMDS_JAR_FILE \
   -f $SCRIPT_FILE \
   -exec $SYSDS_EXEC_MODE \
@@ -442,6 +445,7 @@ else
   print_out "#  Running script $SCRIPT_FILE distributed with opts: $*"
   CMD=" \
   spark-submit $SYSTEMDS_DISTRIBUTED_OPTS \
+  --add-modules=jdk.incubator.vector \
   $SYSTEMDS_JAR_FILE \
   -f $SCRIPT_FILE \
   -exec $SYSDS_EXEC_MODE \
diff --git a/pom.xml b/pom.xml
index b25d94cc7db..5d2485897fb 100644
--- a/pom.xml
+++ b/pom.xml
@@ -92,6 +92,7 @@
 			--add-opens=java.base/java.lang.ref=ALL-UNNAMED
 			--add-opens=java.base/java.util.concurrent=ALL-UNNAMED
 			--add-opens=java.base/sun.nio.ch=ALL-UNNAMED
+			--add-modules=jdk.incubator.vector
 		</jvm.addopens>
 	</properties>
 
@@ -357,6 +358,9 @@
 					<source>${java.level}</source>
 					<target>${java.level}</target>
 					<release>${java.level}</release>
+					<compilerArgs>
+						<arg>--add-modules=jdk.incubator.vector</arg>
+					</compilerArgs>
 				</configuration>
 			</plugin>
 
@@ -904,6 +908,7 @@
 							<notimestamp>true</notimestamp>
 							<failOnWarnings>false</failOnWarnings>
 							<quiet>true</quiet>
+							<additionalJOption>--add-modules=jdk.incubator.vector</additionalJOption>
 							<skip>${doc.skip}</skip>
 							<show>public</show>
 							<source>${java.level}</source>
diff --git a/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupDDC.java b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupDDC.java
index e55a24e56f5..fc82c58e16b 100644
--- a/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupDDC.java
+++ b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupDDC.java
@@ -26,8 +26,8 @@
 import java.util.List;
 import java.util.concurrent.ExecutorService;
 
-// import jdk.incubator.vector.DoubleVector;
-// import jdk.incubator.vector.VectorSpecies;
+import jdk.incubator.vector.DoubleVector;
+import jdk.incubator.vector.VectorSpecies;
 import org.apache.commons.lang3.NotImplementedException;
 import org.apache.sysds.runtime.DMLRuntimeException;
 import org.apache.sysds.runtime.compress.CompressedMatrixBlock;
@@ -75,7 +75,7 @@ public class ColGroupDDC extends APreAgg implements IMapToDataGroup {
 
 	protected final AMapToData _data;
 
-	// static final VectorSpecies<Double> SPECIES = DoubleVector.SPECIES_PREFERRED;
+	static final VectorSpecies<Double> SPECIES = DoubleVector.SPECIES_PREFERRED;
 
 	private ColGroupDDC(IColIndex colIndexes, IDictionary dict, AMapToData data, int[] cachedCounts) {
 		super(colIndexes, dict, cachedCounts);
@@ -625,7 +625,8 @@ private void identityRightDecompressingMult(MatrixBlock right, MatrixBlock ret,
 		final double[] b = right.getDenseBlockValues();
 		final double[] c = ret.getDenseBlockValues();
 		final int jd = right.getNumColumns();
-		final int vLen = 8;
+		final DoubleVector vVec = DoubleVector.zero(SPECIES);
+		final int vLen = SPECIES.length();
 		final int lenJ = cru - crl;
 		final int end = cru - (lenJ % vLen);
 		for(int i = rl; i < ru; i++) {
@@ -633,8 +634,7 @@ private void identityRightDecompressingMult(MatrixBlock right, MatrixBlock ret,
 			final int offOut = i * jd + crl;
 			final double aa = 1;
 			final int k_right = _colIndexes.get(k);
-			vectMM(aa, b, c, end, jd, crl, cru, offOut, k_right, vLen);
-
+			vectMM(aa, b, c, end, jd, crl, cru, offOut, k_right, vLen, vVec);
 		}
 	}
 
@@ -644,8 +644,8 @@ private void defaultRightDecompressingMult(MatrixBlock right, MatrixBlock ret, i
 		final double[] c = ret.getDenseBlockValues();
 		final int kd = _colIndexes.size();
 		final int jd = right.getNumColumns();
-		// final DoubleVector vVec = DoubleVector.zero(SPECIES);
-		final int vLen = 8;
+		final DoubleVector vVec = DoubleVector.zero(SPECIES);
+		final int vLen = SPECIES.length();
 
 		final int blkzI = 32;
 		final int blkzK = 24;
@@ -661,32 +661,22 @@ private void defaultRightDecompressingMult(MatrixBlock right, MatrixBlock ret, i
 					for(int k = bk; k < bke; k++) {
 						final double aa = a[offi + k];
 						final int k_right = _colIndexes.get(k);
-						vectMM(aa, b, c, end, jd, crl, cru, offOut, k_right, vLen);
+						vectMM(aa, b, c, end, jd, crl, cru, offOut, k_right, vLen, vVec);
 					}
 				}
 			}
 		}
 	}
 
-	final void vectMM(double aa, double[] b, double[] c, int endT, int jd, int crl, int cru, int offOut, int k,
-		int vLen) {
-		// vVec = vVec.broadcast(aa);
+	final void vectMM(double aa, double[] b, double[] c, int endT, int jd, int crl, int cru, int offOut, int k, int vLen, DoubleVector vVec) {
+		vVec = vVec.broadcast(aa);
 		final int offj = k * jd;
 		final int end = endT + offj;
 		for(int j = offj + crl; j < end; j += vLen, offOut += vLen) {
-			// DoubleVector res = DoubleVector.fromArray(SPECIES, c, offOut);
-			// DoubleVector bVec = DoubleVector.fromArray(SPECIES, b, j);
-			// res = vVec.fma(bVec, res);
-			// res.intoArray(c, offOut);
-
-			c[offOut] += aa * b[j];
-			c[offOut + 1] += aa * b[j + 1];
-			c[offOut + 2] += aa * b[j + 2];
-			c[offOut + 3] += aa * b[j + 3];
-			c[offOut + 4] += aa * b[j + 4];
-			c[offOut + 5] += aa * b[j + 5];
-			c[offOut + 6] += aa * b[j + 6];
-			c[offOut + 7] += aa * b[j + 7];
+			DoubleVector res = DoubleVector.fromArray(SPECIES, c, offOut);
+			DoubleVector bVec = DoubleVector.fromArray(SPECIES, b, j);
+			res = vVec.fma(bVec, res);
+			res.intoArray(c, offOut);
 		}
 		for(int j = end; j < cru + offj; j++, offOut++) {
 			double bb = b[j];
diff --git a/src/main/java/org/apache/sysds/runtime/compress/colgroup/dictionary/MatrixBlockDictionary.java b/src/main/java/org/apache/sysds/runtime/compress/colgroup/dictionary/MatrixBlockDictionary.java
index 1d6949cbcd7..54cdf6920ac 100644
--- a/src/main/java/org/apache/sysds/runtime/compress/colgroup/dictionary/MatrixBlockDictionary.java
+++ b/src/main/java/org/apache/sysds/runtime/compress/colgroup/dictionary/MatrixBlockDictionary.java
@@ -27,6 +27,8 @@
 import java.util.Arrays;
 import java.util.Set;
 
+import jdk.incubator.vector.DoubleVector;
+import jdk.incubator.vector.VectorSpecies;
 import org.apache.commons.lang3.NotImplementedException;
 import org.apache.sysds.runtime.compress.DMLCompressionException;
 import org.apache.sysds.runtime.compress.colgroup.indexes.ArrayIndex;
@@ -65,6 +67,8 @@ public class MatrixBlockDictionary extends ADictionary {
 
 	final private MatrixBlock _data;
 
+	static final VectorSpecies<Double> SPECIES = DoubleVector.SPECIES_PREFERRED;
+
 	/**
 	 * Unsafe private constructor that does not check the data validity. USE WITH CAUTION.
 	 * 
@@ -2088,7 +2092,71 @@ private void preaggValuesFromDenseDictDenseAggArray(final int numVals, final ICo
 
 	private void preaggValuesFromDenseDictDenseAggRange(final int numVals, final IColIndex colIndexes, final int s,
 		final int e, final double[] b, final int cut, final double[] ret) {
-		preaggValuesFromDenseDictDenseAggRangeGeneric(numVals, colIndexes, s, e, b, cut, ret);
+		if(colIndexes instanceof RangeIndex) {
+			RangeIndex ri = (RangeIndex) colIndexes;
+			preaggValuesFromDenseDictDenseAggRangeRange(numVals, ri.get(0), ri.get(0) + ri.size(), s, e, b, cut, ret);
+		}
+		else
+			preaggValuesFromDenseDictDenseAggRangeGeneric(numVals, colIndexes, s, e, b, cut, ret);
+	}
+
+	private void preaggValuesFromDenseDictDenseAggRangeRange(final int numVals, final int ls, final int le, final int rs,
+		final int re, final double[] b, final int cut, final double[] ret) {
+		final int cz = le - ls; // length of the shared dimension: rows [ls, le) of b (assuming cut is b's row stride)
+		final int az = re - rs; // number of output columns per row of ret: offsets [rs, re) within a row of b
+		// final int nCells = numVals * cz;
+		final double[] values = _data.getDenseBlockValues();
+		// Cache-blocked matrix multiplication in i-k-j loop order, accumulating into ret.
+
+		final int blkzI = 32; // block size over the numVals dictionary rows (i)
+		final int blkzK = 24; // block size over the shared dimension (k)
+		final int blkzJ = 1024; // block size over the output columns (j)
+		for(int bi = 0; bi < numVals; bi += blkzI) {
+			final int bie = Math.min(numVals, bi + blkzI);
+			for(int bk = 0; bk < cz; bk += blkzK) {
+				final int bke = Math.min(cz, bk + blkzK);
+				for(int bj = 0; bj < az; bj += blkzJ) {
+					final int bje = Math.min(az, bj + blkzJ);
+					final int sOffT = rs + bj; // first in-row offset of b covered by this j-block
+					final int eOffT = rs + bje; // one past the last in-row offset of b covered by this j-block
+					preaggValuesFromDenseDictBlockedIKJ(values, b, ret, bi, bk, bj, bie, bke, cz, az, ls, cut, sOffT, eOffT);
+				}
+			}
+		}
+	}
+
+	private static void preaggValuesFromDenseDictBlockedIKJ(double[] a, double[] b, double[] ret, int bi, int bk, int bj,
+		int bie, int bke, int cz, int az, int ls, int cut, int sOffT, int eOffT) { // one (i, k, j) block of ret += a * b
+		final int vLen = SPECIES.length();
+		final DoubleVector vVec = DoubleVector.zero(SPECIES);
+		final int leftover = (eOffT - sOffT) % vLen; // tail of the [sOffT, eOffT) range that is handled by the scalar loop
+		for(int i = bi; i < bie; i++) {
+			final int offI = i * cz; // start of row i in a (cz values per row)
+			final int offOutT = i * az + bj; // start of row i in ret (az values per row), shifted to this j-block
+			for(int k = bk; k < bke; k++) {
+				final int idb = (k + ls) * cut; // start of row (k + ls) in b, with cut used as the row stride
+				final int sOff = sOffT + idb;
+				final int eOff = eOffT + idb;
+				final double v = a[offI + k];
+				vecInnerLoop(v, b, ret, offOutT, eOff, sOff, leftover, vLen, vVec); // ret[offOutT...] += v * b[sOff, eOff)
+			}
+		}
+	}
+
+	private static void vecInnerLoop(final double v, final double[] b, final double[] ret, final int offOutT,
+		final int eOff, final int sOff, final int leftover, final int vLen, DoubleVector vVec) { // accumulates v * b[j] into ret, starting at ret[offOutT] for j = sOff
+		int offOut = offOutT;
+		vVec = vVec.broadcast(v); // every lane holds v
+		final int end = eOff - leftover; // vector loop runs while j < end; the scalar loop below handles [end, eOff)
+		for(int j = sOff; j < end; j += vLen, offOut += vLen) {
+			DoubleVector res = DoubleVector.fromArray(SPECIES, ret, offOut);
+			DoubleVector bVec = DoubleVector.fromArray(SPECIES, b, j);
+			vVec.fma(bVec, res).intoArray(ret, offOut); // ret = v * b + ret, one full vector per iteration
+		}
+		for(int j = end; j < eOff; j++, offOut++) {
+			ret[offOut] += v * b[j];
+		}
+
+	}
 
 	private void preaggValuesFromDenseDictDenseAggRangeGeneric(final int numVals, final IColIndex colIndexes,
diff --git a/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixMult.java b/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixMult.java
index 66f7c3c9445..adb26dce107 100644
--- a/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixMult.java
+++ b/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixMult.java
@@ -29,6 +29,9 @@
 import java.util.concurrent.Future;
 import java.util.stream.IntStream;
 
+import jdk.incubator.vector.DoubleVector;
+import jdk.incubator.vector.VectorOperators;
+import jdk.incubator.vector.VectorSpecies;
 import org.apache.commons.lang3.NotImplementedException;
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
@@ -78,6 +81,8 @@ public class LibMatrixMult
 	public static final int L2_CACHESIZE = 256 * 1024; //256KB (common size)
 	public static final int L3_CACHESIZE = 16 * 1024 * 1024; //16MB (common size)
 	private static final Log LOG = LogFactory.getLog(LibMatrixMult.class.getName());
+	private static final VectorSpecies<Double> SPECIES = DoubleVector.SPECIES_PREFERRED;
+	private static final int vLen = SPECIES.length();
 
 	private LibMatrixMult() {
 		//prevent instantiation via private constructor
@@ -3668,25 +3673,18 @@ private static void matrixMultWuMMGeneric (MatrixBlock mW, MatrixBlock mU, Matri
 	private static double dotProduct( double[] a, double[] b, final int len )
 	{
 		double val = 0;
-		final int bn = len%8;
+
+		final int bn = len%vLen;
 				
 		//compute rest
 		for( int i = 0; i < bn; i++ )
 			val += a[ i ] * b[ i ];
 		
-		//unrolled 8-block  (for better instruction-level parallelism)
-		for( int i = bn; i < len; i+=8 )
-		{
-			//read 64B cachelines of a and b
-			//compute cval' = sum(a * b) + cval
-			val += a[ i+0 ] * b[ i+0 ]
-			     + a[ i+1 ] * b[ i+1 ]
-			     + a[ i+2 ] * b[ i+2 ]
-			     + a[ i+3 ] * b[ i+3 ]
-			     + a[ i+4 ] * b[ i+4 ]
-			     + a[ i+5 ] * b[ i+5 ]
-			     + a[ i+6 ] * b[ i+6 ]
-			     + a[ i+7 ] * b[ i+7 ];
+		//unrolled vLen-block (for better instruction-level parallelism)
+		for( int i = bn; i < len; i+=vLen ){
+			DoubleVector bVec = DoubleVector.fromArray(SPECIES, b, i);
+			DoubleVector aVec = DoubleVector.fromArray(SPECIES, a, i);
+			val += aVec.mul(bVec).reduceLanes(VectorOperators.ADD);
 		}
 		
 		//scalar result
@@ -3697,25 +3695,18 @@ private static double dotProduct( double[] a, double[] b, final int len )
 	public static double dotProduct( double[] a, double[] b, int ai, int bi, final int len )
 	{
 		double val = 0;
-		final int bn = len%8;
+		final int bn = len%vLen;
 		
 		//compute rest
 		for( int i = 0; i < bn; i++, ai++, bi++ )
 			val += a[ ai ] * b[ bi ];
 		
-		//unrolled 8-block (for better instruction-level parallelism)
-		for( int i = bn; i < len; i+=8, ai+=8, bi+=8 )
+		//unrolled vLen-block (for better instruction-level parallelism)
+		for( int i = bn; i < len; i+=vLen, ai+=vLen, bi+=vLen )
 		{
-			//read 64B cachelines of a and b
-			//compute cval' = sum(a * b) + cval
-			val += a[ ai+0 ] * b[ bi+0 ]
-			     + a[ ai+1 ] * b[ bi+1 ]
-			     + a[ ai+2 ] * b[ bi+2 ]
-			     + a[ ai+3 ] * b[ bi+3 ]
-			     + a[ ai+4 ] * b[ bi+4 ]
-			     + a[ ai+5 ] * b[ bi+5 ]
-			     + a[ ai+6 ] * b[ bi+6 ]
-			     + a[ ai+7 ] * b[ bi+7 ];
+			DoubleVector bVec = DoubleVector.fromArray(SPECIES, b, bi);
+			DoubleVector aVec = DoubleVector.fromArray(SPECIES, a, ai);
+			val += aVec.mul(bVec).reduceLanes(VectorOperators.ADD);
 		}
 		
 		//scalar result
@@ -3784,102 +3775,93 @@ else if( aixk > bixk )
 
 	//note: public for use by codegen for consistency
 	public static void vectMultiplyAdd( final double aval, double[] b, double[] c, int bi, int ci, final int len )
-	{
-		final int bn = len%8;
+	{		
+		final int bn = len%vLen;
 		
-		//rest, not aligned to 8-blocks
+		//rest, not aligned to vLen-blocks
 		for( int j = 0; j < bn; j++, bi++, ci++)
 			c[ ci ] += aval * b[ bi ];
 		
-		//unrolled 8-block  (for better instruction-level parallelism)
-		for( int j = bn; j < len; j+=8, bi+=8, ci+=8) 
+		DoubleVector aVec = DoubleVector.broadcast(SPECIES, aval);
+		//unrolled vLen-block  (for better instruction-level parallelism)
+		for( int j = bn; j < len; j+=vLen, bi+=vLen, ci+=vLen) 
 		{
-			//read 64B cachelines of b and c
-			//compute c' = aval * b + c
-			//write back 64B cacheline of c = c'
-			c[ ci+0 ] += aval * b[ bi+0 ];
-			c[ ci+1 ] += aval * b[ bi+1 ];
-			c[ ci+2 ] += aval * b[ bi+2 ];
-			c[ ci+3 ] += aval * b[ bi+3 ];
-			c[ ci+4 ] += aval * b[ bi+4 ];
-			c[ ci+5 ] += aval * b[ bi+5 ];
-			c[ ci+6 ] += aval * b[ bi+6 ];
-			c[ ci+7 ] += aval * b[ bi+7 ];
+			DoubleVector bVec = DoubleVector.fromArray(SPECIES, b, bi);
+			DoubleVector cVec = DoubleVector.fromArray(SPECIES, c, ci);
+			cVec = aVec.fma(bVec, cVec);
+			cVec.intoArray(c, ci);
 		}
 	}
 
 	private static void vectMultiplyAdd2( final double aval1, final double aval2, double[] b, double[] c, int bi1, int bi2, int ci, final int len )
 	{
-		final int bn = len%8;	
+		final int bn = len%vLen;
 		
-		//rest, not aligned to 8-blocks
+		//rest, not aligned to vLen-blocks
 		for( int j = 0; j < bn; j++, bi1++, bi2++, ci++ )
 			c[ ci ] += aval1 * b[ bi1 ] + aval2 * b[ bi2 ];
 		
-		//unrolled 8-block  (for better instruction-level parallelism)
-		for( int j = bn; j < len; j+=8, bi1+=8, bi2+=8, ci+=8 ) 
-		{
-			//read 64B cachelines of b (2x) and c
-			//compute c' = aval_1 * b_1 + aval_2 * b_2 + c
-			//write back 64B cacheline of c = c'
-			c[ ci+0 ] += aval1 * b[ bi1+0 ] + aval2 * b[ bi2+0 ];
-			c[ ci+1 ] += aval1 * b[ bi1+1 ] + aval2 * b[ bi2+1 ];
-			c[ ci+2 ] += aval1 * b[ bi1+2 ] + aval2 * b[ bi2+2 ];
-			c[ ci+3 ] += aval1 * b[ bi1+3 ] + aval2 * b[ bi2+3 ];
-			c[ ci+4 ] += aval1 * b[ bi1+4 ] + aval2 * b[ bi2+4 ];
-			c[ ci+5 ] += aval1 * b[ bi1+5 ] + aval2 * b[ bi2+5 ];
-			c[ ci+6 ] += aval1 * b[ bi1+6 ] + aval2 * b[ bi2+6 ];
-			c[ ci+7 ] += aval1 * b[ bi1+7 ] + aval2 * b[ bi2+7 ];	
+		DoubleVector aVec1 = DoubleVector.broadcast(SPECIES, aval1);
+		DoubleVector aVec2 = DoubleVector.broadcast(SPECIES, aval2);
+		//unrolled vLen-block (for better instruction-level parallelism)
+		for( int j = bn; j < len; j+=vLen, bi1+=vLen, bi2+=vLen, ci+=vLen ) 		{
+			DoubleVector bVec1 = DoubleVector.fromArray(SPECIES, b, bi1);
+			DoubleVector bVec2 = DoubleVector.fromArray(SPECIES, b, bi2);
+			DoubleVector cVec = DoubleVector.fromArray(SPECIES, c, ci);
+			cVec = aVec1.fma(bVec1, cVec);
+			cVec = aVec2.fma(bVec2, cVec);
+			cVec.intoArray(c, ci);
 		}
 	}
 
 	private static void vectMultiplyAdd3( final double aval1, final double aval2, final double aval3, double[] b, double[] c, int bi1, int bi2, int bi3, int ci, final int len )
 	{
-		final int bn = len%8;	
-		
-		//rest, not aligned to 8-blocks
+		final int bn = len%vLen;
+		//rest, not aligned to vLen-blocks
 		for( int j = 0; j < bn; j++, bi1++, bi2++, bi3++, ci++ )
 			c[ ci ] += aval1 * b[ bi1 ] + aval2 * b[ bi2 ] + aval3 * b[ bi3 ];
 		
-		//unrolled 8-block (for better instruction-level parallelism)
-		for( int j = bn; j < len; j+=8, bi1+=8, bi2+=8, bi3+=8, ci+=8 ) 
-		{
-			//read 64B cachelines of b (3x) and c
-			//compute c' = aval_1 * b_1 + aval_2 * b_2 + c
-			//write back 64B cacheline of c = c'
-			c[ ci+0 ] += aval1 * b[ bi1+0 ] + aval2 * b[ bi2+0 ] + aval3 * b[ bi3+0 ];
-			c[ ci+1 ] += aval1 * b[ bi1+1 ] + aval2 * b[ bi2+1 ] + aval3 * b[ bi3+1 ];
-			c[ ci+2 ] += aval1 * b[ bi1+2 ] + aval2 * b[ bi2+2 ] + aval3 * b[ bi3+2 ];
-			c[ ci+3 ] += aval1 * b[ bi1+3 ] + aval2 * b[ bi2+3 ] + aval3 * b[ bi3+3 ];
-			c[ ci+4 ] += aval1 * b[ bi1+4 ] + aval2 * b[ bi2+4 ] + aval3 * b[ bi3+4 ];
-			c[ ci+5 ] += aval1 * b[ bi1+5 ] + aval2 * b[ bi2+5 ] + aval3 * b[ bi3+5 ];
-			c[ ci+6 ] += aval1 * b[ bi1+6 ] + aval2 * b[ bi2+6 ] + aval3 * b[ bi3+6 ];
-			c[ ci+7 ] += aval1 * b[ bi1+7 ] + aval2 * b[ bi2+7 ] + aval3 * b[ bi3+7 ];	
+		DoubleVector aVec1 = DoubleVector.broadcast(SPECIES, aval1);
+		DoubleVector aVec2 = DoubleVector.broadcast(SPECIES, aval2);
+		DoubleVector aVec3 = DoubleVector.broadcast(SPECIES, aval3);
+		//unrolled vLen-block (for better instruction-level parallelism)
+		for( int j = bn; j < len; j+=vLen, bi1+=vLen, bi2+=vLen, bi3+=vLen, ci+=vLen ) 
+		{	
+			DoubleVector bVec1 = DoubleVector.fromArray(SPECIES, b, bi1);
+			DoubleVector bVec2 = DoubleVector.fromArray(SPECIES, b, bi2);
+			DoubleVector bVec3 = DoubleVector.fromArray(SPECIES, b, bi3);
+			DoubleVector cVec = DoubleVector.fromArray(SPECIES, c, ci);
+			cVec = aVec1.fma(bVec1, cVec);
+			cVec = aVec2.fma(bVec2, cVec);
+			cVec = aVec3.fma(bVec3, cVec);
+			cVec.intoArray(c, ci);
 		}
 	}
 
 	private static void vectMultiplyAdd4( final double aval1, final double aval2, final double aval3, final double aval4, double[] b, double[] c, int bi1, int bi2, int bi3, int bi4, int ci, final int len )
 	{
-		final int bn = len%8;	
-		
-		//rest, not aligned to 8-blocks
+		final int bn = len%vLen;
+		//rest, not aligned to vLen-blocks
 		for( int j = 0; j < bn; j++, bi1++, bi2++, bi3++, bi4++, ci++ )
 			c[ ci ] += aval1 * b[ bi1 ] + aval2 * b[ bi2 ] + aval3 * b[ bi3 ] + aval4 * b[ bi4 ];
 		
-		//unrolled 8-block  (for better instruction-level parallelism)
-		for( int j = bn; j < len; j+=8, bi1+=8, bi2+=8, bi3+=8, bi4+=8, ci+=8) 
+		DoubleVector aVec1 = DoubleVector.broadcast(SPECIES, aval1);
+		DoubleVector aVec2 = DoubleVector.broadcast(SPECIES, aval2);
+		DoubleVector aVec3 = DoubleVector.broadcast(SPECIES, aval3);
+		DoubleVector aVec4 = DoubleVector.broadcast(SPECIES, aval4);
+		//unrolled vLen-block  (for better instruction-level parallelism)
+		for( int j = bn; j < len; j+=vLen, bi1+=vLen, bi2+=vLen, bi3+=vLen, bi4+=vLen, ci+=vLen) 
 		{
-			//read 64B cachelines of b (4x) and c 
-			//compute c' = aval_1 * b_1 + aval_2 * b_2 + c
-			//write back 64B cacheline of c = c'
-			c[ ci+0 ] += aval1 * b[ bi1+0 ] + aval2 * b[ bi2+0 ] + aval3 * b[ bi3+0 ] + aval4 * b[ bi4+0 ];
-			c[ ci+1 ] += aval1 * b[ bi1+1 ] + aval2 * b[ bi2+1 ] + aval3 * b[ bi3+1 ] + aval4 * b[ bi4+1 ];
-			c[ ci+2 ] += aval1 * b[ bi1+2 ] + aval2 * b[ bi2+2 ] + aval3 * b[ bi3+2 ] + aval4 * b[ bi4+2 ];
-			c[ ci+3 ] += aval1 * b[ bi1+3 ] + aval2 * b[ bi2+3 ] + aval3 * b[ bi3+3 ] + aval4 * b[ bi4+3 ];
-			c[ ci+4 ] += aval1 * b[ bi1+4 ] + aval2 * b[ bi2+4 ] + aval3 * b[ bi3+4 ] + aval4 * b[ bi4+4 ];
-			c[ ci+5 ] += aval1 * b[ bi1+5 ] + aval2 * b[ bi2+5 ] + aval3 * b[ bi3+5 ] + aval4 * b[ bi4+5 ];
-			c[ ci+6 ] += aval1 * b[ bi1+6 ] + aval2 * b[ bi2+6 ] + aval3 * b[ bi3+6 ] + aval4 * b[ bi4+6 ];
-			c[ ci+7 ] += aval1 * b[ bi1+7 ] + aval2 * b[ bi2+7 ] + aval3 * b[ bi3+7 ] + aval4 * b[ bi4+7 ];	
+			DoubleVector bVec1 = DoubleVector.fromArray(SPECIES, b, bi1);
+			DoubleVector bVec2 = DoubleVector.fromArray(SPECIES, b, bi2);
+			DoubleVector bVec3 = DoubleVector.fromArray(SPECIES, b, bi3);
+			DoubleVector bVec4 = DoubleVector.fromArray(SPECIES, b, bi4);
+			DoubleVector cVec = DoubleVector.fromArray(SPECIES, c, ci);
+			cVec = aVec1.fma(bVec1, cVec);
+			cVec = aVec2.fma(bVec2, cVec);
+			cVec = aVec3.fma(bVec3, cVec);
+			cVec = aVec4.fma(bVec4, cVec);
+			cVec.intoArray(c, ci);
 		}
 	}
 	
@@ -3940,26 +3922,19 @@ public static void vectMultiplyAdd( final double aval, double[] b, double[] c, i
 	//note: public for use by codegen for consistency
 	public static void vectMultiplyWrite( final double aval, double[] b, double[] c, int bi, int ci, final int len )
 	{
-		final int bn = len%8;
+		final int bn = len%vLen;
 		
-		//rest, not aligned to 8-blocks
+		//rest, not aligned to vLen-blocks
 		for( int j = 0; j < bn; j++, bi++, ci++)
 			c[ ci ] = aval * b[ bi ];
 		
-		//unrolled 8-block  (for better instruction-level parallelism)
-		for( int j = bn; j < len; j+=8, bi+=8, ci+=8) 
+		//vectorized vLen-blocks: c = aval * b (aVec is loop-invariant and must never be overwritten)
+		DoubleVector aVec = DoubleVector.broadcast(SPECIES, aval);
+		for( int j = bn; j < len; j+=vLen, bi+=vLen, ci+=vLen) 
 		{
-			//read 64B cachelines of b and c
-			//compute c' = aval * b + c
-			//write back 64B cacheline of c = c'
-			c[ ci+0 ] = aval * b[ bi+0 ];
-			c[ ci+1 ] = aval * b[ bi+1 ];
-			c[ ci+2 ] = aval * b[ bi+2 ];
-			c[ ci+3 ] = aval * b[ bi+3 ];
-			c[ ci+4 ] = aval * b[ bi+4 ];
-			c[ ci+5 ] = aval * b[ bi+5 ];
-			c[ ci+6 ] = aval * b[ bi+6 ];
-			c[ ci+7 ] = aval * b[ bi+7 ];
+			DoubleVector bVec = DoubleVector.fromArray(SPECIES, b, bi);
+			DoubleVector cVec = aVec.mul(bVec);
+			cVec.intoArray(c, ci);
 		}
 	}
 	
@@ -3996,28 +3971,21 @@ public static void vectMultiplyInPlace(final double[] a, double[] c, int[] cix,
 	}
 
 	//note: public for use by codegen for consistency
-	public static void vectMultiplyWrite( double[] a, double[] b, double[] c, int ai, int bi, int ci, final int len )
-	{
-		final int bn = len%8;
+	public static void vectMultiplyWrite( double[] a, double[] b, double[] c, int ai, int bi, int ci, final int len ){
+
+		final int bn = len%vLen;
 		
-		//rest, not aligned to 8-blocks
+		//rest, not aligned to vLen-blocks
 		for( int j = 0; j < bn; j++, ai++, bi++, ci++)
 			c[ ci ] = a[ ai ] * b[ bi ];
 		
-		//unrolled 8-block  (for better instruction-level parallelism)
-		for( int j = bn; j < len; j+=8, ai+=8, bi+=8, ci+=8) 
+		//unrolled vLen-block  (for better instruction-level parallelism)
+		for( int j = bn; j < len; j+=vLen, ai+=vLen, bi+=vLen, ci+=vLen) 
 		{
-			//read 64B cachelines of a and b
-			//compute c' = a * b
-			//write back 64B cacheline of c = c'
-			c[ ci+0 ] = a[ ai+0 ] * b[ bi+0 ];
-			c[ ci+1 ] = a[ ai+1 ] * b[ bi+1 ];
-			c[ ci+2 ] = a[ ai+2 ] * b[ bi+2 ];
-			c[ ci+3 ] = a[ ai+3 ] * b[ bi+3 ];
-			c[ ci+4 ] = a[ ai+4 ] * b[ bi+4 ];
-			c[ ci+5 ] = a[ ai+5 ] * b[ bi+5 ];
-			c[ ci+6 ] = a[ ai+6 ] * b[ bi+6 ];
-			c[ ci+7 ] = a[ ai+7 ] * b[ bi+7 ];
+			DoubleVector aVec = DoubleVector.fromArray(SPECIES, a, ai);
+			DoubleVector bVec = DoubleVector.fromArray(SPECIES, b, bi);
+			aVec = aVec.mul(bVec);
+			aVec.intoArray(c, ci);
 		}
 	}
 	
@@ -4039,47 +4007,37 @@ public static void vectMultiplyWrite( final double[] a, double[] b, double[] c,
 		}
 	}
 
-	public static void vectMultiply(double[] a, double[] c, int ai, int ci, final int len)
-	{
-		final int bn = len%8;
+	public static void vectMultiply(double[] a, double[] c, int ai, int ci, final int len){
+
+		final int bn = len%vLen;
 		
-		//rest, not aligned to 8-blocks
+		//rest, not aligned to vLen-blocks
 		for( int j = 0; j < bn; j++, ai++, ci++)
 			c[ ci ] *= a[ ai ];
 		
-		//unrolled 8-block  (for better instruction-level parallelism)
-		for( int j = bn; j < len; j+=8, ai+=8, ci+=8) 
+		//unrolled vLen-block  (for better instruction-level parallelism)
+		for( int j = bn; j < len; j+=vLen, ai+=vLen, ci+=vLen) 
 		{
-			//read 64B cachelines of a and c
-			//compute c' = c * a
-			//write back 64B cacheline of c = c'
-			c[ ci+0 ] *= a[ ai+0 ];
-			c[ ci+1 ] *= a[ ai+1 ];
-			c[ ci+2 ] *= a[ ai+2 ];
-			c[ ci+3 ] *= a[ ai+3 ];
-			c[ ci+4 ] *= a[ ai+4 ];
-			c[ ci+5 ] *= a[ ai+5 ];
-			c[ ci+6 ] *= a[ ai+6 ];
-			c[ ci+7 ] *= a[ ai+7 ];
+			DoubleVector res = DoubleVector.fromArray(SPECIES, c, ci);
+			DoubleVector aVec = DoubleVector.fromArray(SPECIES, a, ai);
+			
+			res = aVec.mul(res);
+			res.intoArray(c, ci);
 		}
 	}
 
 	//note: public for use by codegen for consistency
 	public static void vectAdd( double[] a, double bval, double[] c, int ai, int ci, final int len ) {
-		final int bn = len%8;
-		//rest, not aligned to 8-blocks
+		final int bn = len%vLen;
+		//rest, not aligned to vLen-blocks
 		for( int j = 0; j < bn; j++, ai++, ci++)
-			c[ ci ] += a[ ai ];
+			c[ ci ] += a[ ai ] + bval;
-		//unrolled 8-block  (for better ILP)
-		for( int j = bn; j < len; j+=8, ai+=8, ci+=8) {
-			c[ ci+0 ] += a[ ai+0 ] + bval;
-			c[ ci+1 ] += a[ ai+1 ] + bval;
-			c[ ci+2 ] += a[ ai+2 ] + bval;
-			c[ ci+3 ] += a[ ai+3 ] + bval;
-			c[ ci+4 ] += a[ ai+4 ] + bval;
-			c[ ci+5 ] += a[ ai+5 ] + bval;
-			c[ ci+6 ] += a[ ai+6 ] + bval;
-			c[ ci+7 ] += a[ ai+7 ] + bval;
+		//vectorized vLen-blocks: c += a + bval, consistent with the scalar rest loop above
+		for( int j = bn; j < len; j+=vLen, ai+=vLen, ci+=vLen) {
+			DoubleVector res = DoubleVector.fromArray(SPECIES, c, ci);
+			DoubleVector aVec = DoubleVector.fromArray(SPECIES, a, ai);
+			res = aVec.add(bval).add(res);
+			res.intoArray(c, ci);
 		}
 	}
 	
diff --git a/src/main/python/systemds/utils/converters.py b/src/main/python/systemds/utils/converters.py
index 61a4769e806..650d6380605 100644
--- a/src/main/python/systemds/utils/converters.py
+++ b/src/main/python/systemds/utils/converters.py
@@ -72,6 +72,7 @@ def matrix_block_to_numpy(jvm: JVMView, mb: JavaObject):
     :param jvm: The current JVM instance running systemds.
     :param mb: A pointer to the JVM's MatrixBlock object.
     """
+    
     num_ros = mb.getNumRows()
     num_cols = mb.getNumColumns()
     buf = jvm.org.apache.sysds.runtime.util.Py4jConverterUtils.convertMBtoPy4JDenseArr(
diff --git a/src/test/java/org/apache/sysds/performance/Main.java b/src/test/java/org/apache/sysds/performance/Main.java
index fc749b56df5..f8d0bbea852 100644
--- a/src/test/java/org/apache/sysds/performance/Main.java
+++ b/src/test/java/org/apache/sysds/performance/Main.java
@@ -33,7 +33,8 @@
 import org.apache.sysds.performance.generators.MatrixFile;
 import org.apache.sysds.performance.matrix.MatrixAppend;
 import org.apache.sysds.performance.matrix.MatrixBinaryCellPerf;
-import org.apache.sysds.performance.matrix.MatrixMulPerformance;
+import org.apache.sysds.performance.matrix.MatrixMultiplicationPerf;
+import org.apache.sysds.performance.matrix.MMSparsityPerformance;
 import org.apache.sysds.performance.matrix.MatrixReplacePerf;
 import org.apache.sysds.performance.matrix.MatrixStorage;
 import org.apache.sysds.performance.matrix.ReshapePerf;
@@ -139,6 +140,9 @@ private static void exec(int prog, String[] args) throws Exception {
 			case 1008:
 				MatrixAppend.main(args);
 				break;
+			case 1009:
+				MatrixMultiplicationPerf.main(args);
+				break;
 			default:
 				break;
 		}
@@ -235,9 +239,9 @@ private static void run17(String[] args) throws Exception {
 	}
 
 	private static void run1000(String[] args) {
-		MatrixMulPerformance perf;
+		MMSparsityPerformance perf;
 		if (args.length < 3) {
-			perf = new MatrixMulPerformance();
+			perf = new MMSparsityPerformance();
 		} else {
 			// ... <rl> <cl> [resolution] [maxSparsity] [resolution] [warmupRuns] [repetitions]
 			int rl = Integer.parseInt(args[1]);
@@ -256,7 +260,7 @@ private static void run1000(String[] args) {
 			if (args.length > 6)
 				repetitions = Integer.parseInt(args[6]);
 
-			perf = new MatrixMulPerformance(rl, cl, warmupRuns, repetitions, resolution, maxSparsity, 2f);
+			perf = new MMSparsityPerformance(rl, cl, warmupRuns, repetitions, resolution, maxSparsity, 2f);
 		}
 
 		perf.testSparseFormat(null, null);
diff --git a/src/test/java/org/apache/sysds/performance/README.md b/src/test/java/org/apache/sysds/performance/README.md
index 4945afd9ab5..79bc8fa8f36 100644
--- a/src/test/java/org/apache/sysds/performance/README.md
+++ b/src/test/java/org/apache/sysds/performance/README.md
@@ -28,7 +28,7 @@ mvn package
 Example of running it:
 
 ```bash
-java -jar target/systemds-3.3.0-SNAPSHOT-perf.jar 1
+java -jar target/systemds-3.4.0-SNAPSHOT-perf.jar 1
 ```
 
 example result of the above job:
@@ -49,45 +49,45 @@ Running Steam Compression Test
 With profiler:
 
 ```bash
-java -jar -agentpath:$HOME/Programs/profiler/lib/libasyncProfiler.so=start,event=cpu,file=temp/log.html target/systemds-3.3.0-SNAPSHOT-perf.jar 12 10000 100 4 1.0 16 1000 -1
+java -jar -agentpath:$HOME/Programs/profiler/lib/libasyncProfiler.so=start,event=cpu,file=temp/log.html target/systemds-3.4.0-SNAPSHOT-perf.jar 12 10000 100 4 1.0 16 1000 -1
 ```
 
 Take a Matrix and perform serialization
 
 ```bash 
-java -jar -agentpath:$HOME/Programs/profiler/lib/libasyncProfiler.so=start,event=cpu,file=temp/log.html target/systemds-3.3.0-SNAPSHOT-perf.jar 13 16 100 "temp/test.csv" -1
+java -jar -agentpath:$HOME/Programs/profiler/lib/libasyncProfiler.so=start,event=cpu,file=temp/log.html target/systemds-3.4.0-SNAPSHOT-perf.jar 13 16 100 "temp/test.csv" -1
 ```
 
 Take a Frame and transform into a Matrix and perform serialization.
 
 ```bash 
-java -jar -agentpath:$HOME/Programs/profiler/lib/libasyncProfiler.so=start,event=cpu,file=temp/log.html target/systemds-3.3.0-SNAPSHOT-perf.jar 14 16 1000 "src/test/resources/datasets/titanic/titanic.csv" "src/test/resources/datasets/titanic/tfspec.json" -1
+java -jar -agentpath:$HOME/Programs/profiler/lib/libasyncProfiler.so=start,event=cpu,file=temp/log.html target/systemds-3.4.0-SNAPSHOT-perf.jar 14 16 1000 "src/test/resources/datasets/titanic/titanic.csv" "src/test/resources/datasets/titanic/tfspec.json" -1
 ```
 
 Frame Operation timings
 
 ```bash
-java -jar -agentpath:$HOME/Programs/profiler/lib/libasyncProfiler.so=start,event=cpu,file=temp/log.html target/systemds-3.3.0-SNAPSHOT-perf.jar 15 16 10 "src/test/resources/datasets/titanic/titanic.csv" "src/test/resources/datasets/titanic/tfspec.json"
+java -jar -agentpath:$HOME/Programs/profiler/lib/libasyncProfiler.so=start,event=cpu,file=temp/log.html target/systemds-3.4.0-SNAPSHOT-perf.jar 15 16 10 "src/test/resources/datasets/titanic/titanic.csv" "src/test/resources/datasets/titanic/tfspec.json"
 ```
 
 Reshape Sparse
 
 ```bash
-java -cp "target/systemds-3.3.0-SNAPSHOT-perf.jar:target/lib/*" -agentpath:$HOME/Programs/profiler/lib/libasyncProfiler.so=start,event=cpu,file=temp/log.html  org.apache.sysds.performance.Main 1005
+java -cp "target/systemds-3.4.0-SNAPSHOT-perf.jar:target/lib/*" -agentpath:$HOME/Programs/profiler/lib/libasyncProfiler.so=start,event=cpu,file=temp/log.html  org.apache.sysds.performance.Main 1005
 ```
 
 
 Binary Operations
 
 ```bash
-java -jar -agentpath:$HOME/Programs/profiler/lib/libasyncProfiler.so=start,event=cpu,file=temp/log.html -XX:+UseNUMA target/systemds-3.3.0-SNAPSHOT-perf.jar 1006 500
+java -jar -agentpath:$HOME/Programs/profiler/lib/libasyncProfiler.so=start,event=cpu,file=temp/log.html -XX:+UseNUMA target/systemds-3.4.0-SNAPSHOT-perf.jar 1006 500
 ```
 
 
 transform encode 
 
 ```bash
-java -jar -agentpath:$HOME/Programs/profiler/lib/libasyncProfiler.so=start,event=cpu,file=temp/log.html -XX:+UseNUMA target/systemds-3.3.0-SNAPSHOT-perf.jar 1007
+java -jar -agentpath:$HOME/Programs/profiler/lib/libasyncProfiler.so=start,event=cpu,file=temp/log.html -XX:+UseNUMA target/systemds-3.4.0-SNAPSHOT-perf.jar 1007
 ```
 
 
@@ -96,3 +96,10 @@ append matrix sequence
 ```bash
 ./src/test/scripts/performance/append.sh
 ```
+
+
+matrix multiplication
+
+```bash
+./src/test/scripts/performance/matrixMultiplication.sh
+```
\ No newline at end of file
diff --git a/src/test/java/org/apache/sysds/performance/generators/IGeneratePair.java b/src/test/java/org/apache/sysds/performance/generators/IGeneratePair.java
new file mode 100644
index 00000000000..06a4d9065a8
--- /dev/null
+++ b/src/test/java/org/apache/sysds/performance/generators/IGeneratePair.java
@@ -0,0 +1,74 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.sysds.performance.generators;
+
+import org.apache.sysds.runtime.matrix.data.Pair;
+
+/**
+ * Generator that combines two underlying generators and hands out their elements as pairs.
+ *
+ * @param <A> element type produced by the first generator
+ * @param <B> element type produced by the second generator
+ */
+public class IGeneratePair<A, B> implements IGenerate<Pair<A, B>> {
+
+	private final IGenerate<A> a;
+	private final IGenerate<B> b;
+
+	/**
+	 * @param a generator supplying the first component of each pair
+	 * @param b generator supplying the second component of each pair
+	 */
+	public IGeneratePair(IGenerate<A> a, IGenerate<B> b) {
+		this.a = a;
+		this.b = b;
+	}
+
+	/**
+	 * Reports empty as soon as either side is empty: take() draws one element from each underlying
+	 * generator, so a pair is only available when both of them have one to hand out.
+	 */
+	@Override
+	public boolean isEmpty() {
+		return a.isEmpty() || b.isEmpty();
+	}
+
+	/** @return the larger of the two underlying default wait times */
+	@Override
+	public int defaultWaitTime() {
+		return Math.max(a.defaultWaitTime(), b.defaultWaitTime());
+	}
+
+	/** Takes one element from each underlying generator, first from a and then from b. */
+	@Override
+	public Pair<A, B> take() {
+		A av = a.take();
+		B bv = b.take();
+		return new Pair<>(av, bv);
+	}
+
+	/** Forwards the request to generate N elements to both underlying generators, a first. */
+	@Override
+	public void generate(int N) throws InterruptedException {
+		a.generate(N);
+		b.generate(N);
+	}
+
+}
diff --git a/src/test/java/org/apache/sysds/performance/matrix/MatrixMulPerformance.java b/src/test/java/org/apache/sysds/performance/matrix/MMSparsityPerformance.java
similarity index 95%
rename from src/test/java/org/apache/sysds/performance/matrix/MatrixMulPerformance.java
rename to src/test/java/org/apache/sysds/performance/matrix/MMSparsityPerformance.java
index f201c8fd7a5..0c77b07dfcd 100644
--- a/src/test/java/org/apache/sysds/performance/matrix/MatrixMulPerformance.java
+++ b/src/test/java/org/apache/sysds/performance/matrix/MMSparsityPerformance.java
@@ -31,7 +31,7 @@
 import org.apache.sysds.runtime.util.DataConverter;
 import org.apache.sysds.test.TestUtils;
 
-public class MatrixMulPerformance {
+public class MMSparsityPerformance {
 
 	private final int _rl;
 	private final int _cl;
@@ -42,11 +42,11 @@ public class MatrixMulPerformance {
 	private final float resolutionDivisor;
 	private final float maxSparsity;
 
-	public MatrixMulPerformance() {
+	public MMSparsityPerformance() {
 		this(1024, 1024, 15, 50, 18, .4f, 2f);
 	}
 
-	public MatrixMulPerformance(int rl, int cl, int warmupRuns, int repetitions,
+	public MMSparsityPerformance(int rl, int cl, int warmupRuns, int repetitions,
 		int resolution, float maxSparsity, float stepDivisor)
 	{
 		_rl = rl;
@@ -89,10 +89,12 @@ private static String printAsPythonList(double[] list) {
 		sb.append("[");
 
 		for (double el : list)
-			sb.append(el + ",");
+			sb.append(el + ", ");
 
-		if (list.length > 0)
+		if (list.length > 0){
 			sb.deleteCharAt(sb.length() - 1);
+			sb.deleteCharAt(sb.length() - 1);
+		}
 
 		sb.append("]");
 		return sb.toString();
diff --git a/src/test/java/org/apache/sysds/performance/matrix/MatrixMultiplicationPerf.java b/src/test/java/org/apache/sysds/performance/matrix/MatrixMultiplicationPerf.java
new file mode 100644
index 00000000000..757bbcd0854
--- /dev/null
+++ b/src/test/java/org/apache/sysds/performance/matrix/MatrixMultiplicationPerf.java
@@ -0,0 +1,88 @@
+package org.apache.sysds.performance.matrix;
+
+import java.util.Arrays;
+
+import org.apache.sysds.performance.compression.APerfTest;
+import org.apache.sysds.performance.generators.ConstMatrix;
+import org.apache.sysds.performance.generators.IGenerate;
+import org.apache.sysds.performance.generators.IGeneratePair;
+import org.apache.sysds.runtime.instructions.InstructionUtils;
+import org.apache.sysds.runtime.matrix.data.MatrixBlock;
+import org.apache.sysds.runtime.matrix.data.Pair;
+import org.apache.sysds.utils.stats.InfrastructureAnalyzer;
+
+public class MatrixMultiplicationPerf extends APerfTest<Object, Pair<MatrixBlock, MatrixBlock>> {
+
+	// parallelization degree
+	private final int k;
+
+	public MatrixMultiplicationPerf(int N, IGenerate<Pair<MatrixBlock, MatrixBlock>> gen, int k) {
+		super(N, gen);
+		this.k = k;
+	}
+
+	public void run() throws Exception {
+		warmup(() -> mm(k), 10);
+		execute(() -> mm(1), "mm SingleThread", N/10);
+		if(k != 1) {
+			execute(() -> mm(k), "mm MultiThread: " + k);
+		}
+	}
+
+	private void mm(int k) {
+		Pair<MatrixBlock, MatrixBlock> in = gen.take();
+		MatrixBlock left = in.getKey();
+		MatrixBlock right = in.getValue();
+		left.aggregateBinaryOperations(left, right, InstructionUtils.getMatMultOperator(k));
+		ret.add(null);
+	}
+
+	@Override
+	protected String makeResString() {
+		return "";
+	}
+
+	public static void main(String[] args) throws Exception {
+
+		IGenerate<MatrixBlock> left;
+		IGenerate<MatrixBlock> right;
+		final int i;
+		final int j;
+		final int k;
+		final double sp1;
+		final double sp2;
+		if(args.length == 0) {
+			i = Integer.parseInt(args[1]);
+			j = Integer.parseInt(args[2]);
+			k = Integer.parseInt(args[3]);
+
+			sp1 = Double.parseDouble(args[4]);
+			sp2 = Double.parseDouble(args[5]);
+
+		}
+		else {
+
+			i = Integer.parseInt(args[1]);
+			j = Integer.parseInt(args[2]);
+			k = Integer.parseInt(args[3]);
+
+			sp1 = Double.parseDouble(args[4]);
+			sp2 = Double.parseDouble(args[5]);
+
+		}
+
+		left = new ConstMatrix(i, j, 10, sp1);
+		right = new ConstMatrix(j, k, 10, sp2);
+		IGenerate<Pair<MatrixBlock, MatrixBlock>> gen = new IGeneratePair<>(left, right);
+
+		// set number of repeats based on expected number of instructions.
+
+		long inst = (long) i * k * j;
+
+		int N = Math.min(100000, (int) Math.max(100L, 50000000000L / inst));
+
+		System.out.println("MM Perf : rep " +N+ " -- " + Arrays.toString(args));
+
+		new MatrixMultiplicationPerf(N, gen, InfrastructureAnalyzer.getLocalParallelism()).run();
+	}
+}
diff --git a/src/test/java/org/apache/sysds/test/AutomatedTestBase.java b/src/test/java/org/apache/sysds/test/AutomatedTestBase.java
index 2c3dd11c6d0..a7f5714bf9a 100644
--- a/src/test/java/org/apache/sysds/test/AutomatedTestBase.java
+++ b/src/test/java/org/apache/sysds/test/AutomatedTestBase.java
@@ -1665,7 +1665,8 @@ protected static Process startLocalFedWorker(int port, String[] addArgs, int sle
 			"--add-opens=java.base/java.lang=ALL-UNNAMED" ,
 			"--add-opens=java.base/java.lang.ref=ALL-UNNAMED" ,
 			"--add-opens=java.base/java.util.concurrent=ALL-UNNAMED" ,
-			"--add-opens=java.base/sun.nio.ch=ALL-UNNAMED",};
+			"--add-opens=java.base/sun.nio.ch=ALL-UNNAMED",
+			"--add-modules=jdk.incubator.vector",};
 
 		RuntimeMXBean runtimeMxBean = ManagementFactory.getRuntimeMXBean();
 		List<String> jvmArgs = runtimeMxBean.getInputArguments();
diff --git a/src/test/java/org/apache/sysds/test/functions/federated/multitenant/MultiTenantTestBase.java b/src/test/java/org/apache/sysds/test/functions/federated/multitenant/MultiTenantTestBase.java
index 1a716febf97..c3a4756a2d5 100644
--- a/src/test/java/org/apache/sysds/test/functions/federated/multitenant/MultiTenantTestBase.java
+++ b/src/test/java/org/apache/sysds/test/functions/federated/multitenant/MultiTenantTestBase.java
@@ -124,6 +124,7 @@ protected void startCoordinator(ExecMode execMode, String scriptPath, String[] a
 				"--add-opens=java.base/java.lang.ref=ALL-UNNAMED" ,
 				"--add-opens=java.base/java.util.concurrent=ALL-UNNAMED" ,
 				"--add-opens=java.base/sun.nio.ch=ALL-UNNAMED",
+				"--add-modules=jdk.incubator.vector",
 				"-cp", classpath, DMLScript.class.getName()}, argsList.toArray(new String[0])));
 
 		Process process = null;
diff --git a/src/test/scripts/performance/append.sh b/src/test/scripts/performance/append.sh
index d2184dd472f..822de94ba1f 100755
--- a/src/test/scripts/performance/append.sh
+++ b/src/test/scripts/performance/append.sh
@@ -21,23 +21,23 @@
 #-------------------------------------------------------------
 
 mvn package > /dev/null
-java -jar -XX:+UseNUMA target/systemds-3.3.0-SNAPSHOT-perf.jar 1008  100  100 1.0 1 30000
-java -jar -XX:+UseNUMA target/systemds-3.3.0-SNAPSHOT-perf.jar 1008 1000  100 1.0 1 3000
-java -jar -XX:+UseNUMA target/systemds-3.3.0-SNAPSHOT-perf.jar 1008 1000 1000 1.0 1 3000
-java -jar -XX:+UseNUMA target/systemds-3.3.0-SNAPSHOT-perf.jar 1008  100  100 0.3 1 30000
-java -jar -XX:+UseNUMA target/systemds-3.3.0-SNAPSHOT-perf.jar 1008 1000  100 0.3 1 3000
-java -jar -XX:+UseNUMA target/systemds-3.3.0-SNAPSHOT-perf.jar 1008 1000 1000 0.3 1 3000
+java -jar -XX:+UseNUMA target/systemds-3.4.0-SNAPSHOT-perf.jar 1008  100  100 1.0 1 30000
+java -jar -XX:+UseNUMA target/systemds-3.4.0-SNAPSHOT-perf.jar 1008 1000  100 1.0 1 3000
+java -jar -XX:+UseNUMA target/systemds-3.4.0-SNAPSHOT-perf.jar 1008 1000 1000 1.0 1 3000
+java -jar -XX:+UseNUMA target/systemds-3.4.0-SNAPSHOT-perf.jar 1008  100  100 0.3 1 30000
+java -jar -XX:+UseNUMA target/systemds-3.4.0-SNAPSHOT-perf.jar 1008 1000  100 0.3 1 3000
+java -jar -XX:+UseNUMA target/systemds-3.4.0-SNAPSHOT-perf.jar 1008 1000 1000 0.3 1 3000
 
-# java -jar -XX:+UseNUMA target/systemds-3.3.0-SNAPSHOT-perf.jar 1008  100  100 1.0 10 30000
-# java -jar -XX:+UseNUMA target/systemds-3.3.0-SNAPSHOT-perf.jar 1008 1000  100 1.0 10 3000
-# java -jar -XX:+UseNUMA target/systemds-3.3.0-SNAPSHOT-perf.jar 1008 1000 1000 1.0 10 1000
-# java -jar -XX:+UseNUMA target/systemds-3.3.0-SNAPSHOT-perf.jar 1008  100  100 0.3 10 30000
-# java -jar -XX:+UseNUMA target/systemds-3.3.0-SNAPSHOT-perf.jar 1008 1000  100 0.3 10 3000
-# java -jar -XX:+UseNUMA target/systemds-3.3.0-SNAPSHOT-perf.jar 1008 1000 1000 0.3 10 1000
+# java -jar -XX:+UseNUMA target/systemds-3.4.0-SNAPSHOT-perf.jar 1008  100  100 1.0 10 30000
+# java -jar -XX:+UseNUMA target/systemds-3.4.0-SNAPSHOT-perf.jar 1008 1000  100 1.0 10 3000
+# java -jar -XX:+UseNUMA target/systemds-3.4.0-SNAPSHOT-perf.jar 1008 1000 1000 1.0 10 1000
+# java -jar -XX:+UseNUMA target/systemds-3.4.0-SNAPSHOT-perf.jar 1008  100  100 0.3 10 30000
+# java -jar -XX:+UseNUMA target/systemds-3.4.0-SNAPSHOT-perf.jar 1008 1000  100 0.3 10 3000
+# java -jar -XX:+UseNUMA target/systemds-3.4.0-SNAPSHOT-perf.jar 1008 1000 1000 0.3 10 1000
 
-# java -jar -XX:+UseNUMA target/systemds-3.3.0-SNAPSHOT-perf.jar 1008  100  100 1.0 100 3000
-# java -jar -XX:+UseNUMA target/systemds-3.3.0-SNAPSHOT-perf.jar 1008 1000  100 1.0 100 300
-# java -jar -XX:+UseNUMA target/systemds-3.3.0-SNAPSHOT-perf.jar 1008 1000 1000 1.0 100 200
-# java -jar -XX:+UseNUMA target/systemds-3.3.0-SNAPSHOT-perf.jar 1008  100  100 0.3 100 3000
-# java -jar -XX:+UseNUMA target/systemds-3.3.0-SNAPSHOT-perf.jar 1008 1000  100 0.3 100 2000
-# java -jar -XX:+UseNUMA target/systemds-3.3.0-SNAPSHOT-perf.jar 1008 1000 1000 0.3 100 1000
\ No newline at end of file
+# java -jar -XX:+UseNUMA target/systemds-3.4.0-SNAPSHOT-perf.jar 1008  100  100 1.0 100 3000
+# java -jar -XX:+UseNUMA target/systemds-3.4.0-SNAPSHOT-perf.jar 1008 1000  100 1.0 100 300
+# java -jar -XX:+UseNUMA target/systemds-3.4.0-SNAPSHOT-perf.jar 1008 1000 1000 1.0 100 200
+# java -jar -XX:+UseNUMA target/systemds-3.4.0-SNAPSHOT-perf.jar 1008  100  100 0.3 100 3000
+# java -jar -XX:+UseNUMA target/systemds-3.4.0-SNAPSHOT-perf.jar 1008 1000  100 0.3 100 2000
+# java -jar -XX:+UseNUMA target/systemds-3.4.0-SNAPSHOT-perf.jar 1008 1000 1000 0.3 100 1000
\ No newline at end of file
diff --git a/src/test/scripts/performance/matrixMultiplication.sh b/src/test/scripts/performance/matrixMultiplication.sh
new file mode 100755
index 00000000000..66a2694c5b8
--- /dev/null
+++ b/src/test/scripts/performance/matrixMultiplication.sh
@@ -0,0 +1,47 @@
+#!/usr/bin/env bash
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+# 
+#   http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+
+mvn package > /dev/null
+
+cm="java --add-modules=jdk.incubator.vector -jar -XX:+UseNUMA target/systemds-3.4.0-SNAPSHOT-perf.jar 1009"
+
+$cm 5 5 5 1 1
+$cm 500 5 5 1 1
+$cm 5 500 5 1 1
+$cm 5 5 500 1 1
+
+
+$cm 100 100 100 1 1
+$cm 1000 100 100 1 1
+$cm 100 1000 100 1 1
+$cm 100 100 1000 1 1
+
+
+$cm 1000 1000 1000 1 1
+
+$cm 10000 1000 1000 1 1
+$cm 1000 10000 1000 1 1
+$cm 1000 1000 10000 1 1
+
+
+$cm 10000 10000 10000 1 1
\ No newline at end of file

From 9cc9f0259371563c15ead64afe3bb3d840be5fd9 Mon Sep 17 00:00:00 2001
From: Sebastian Baunsgaard <baunsgaard@apache.org>
Date: Thu, 15 May 2025 16:49:37 +0200
Subject: [PATCH 02/11] add boolean to specify single or multithreaded

---
 .../matrix/MatrixMultiplicationPerf.java      | 28 +++++++++--------
 .../performance/matrixMultiplication.sh       | 31 +++++++++----------
 2 files changed, 29 insertions(+), 30 deletions(-)

diff --git a/src/test/java/org/apache/sysds/performance/matrix/MatrixMultiplicationPerf.java b/src/test/java/org/apache/sysds/performance/matrix/MatrixMultiplicationPerf.java
index 757bbcd0854..8f4ead21f31 100644
--- a/src/test/java/org/apache/sysds/performance/matrix/MatrixMultiplicationPerf.java
+++ b/src/test/java/org/apache/sysds/performance/matrix/MatrixMultiplicationPerf.java
@@ -15,18 +15,20 @@ public class MatrixMultiplicationPerf extends APerfTest<Object, Pair<MatrixBlock
 
 	// parallelization degree
 	private final int k;
+	private final boolean single;
 
-	public MatrixMultiplicationPerf(int N, IGenerate<Pair<MatrixBlock, MatrixBlock>> gen, int k) {
+	public MatrixMultiplicationPerf(int N, IGenerate<Pair<MatrixBlock, MatrixBlock>> gen, int k, boolean single) {
 		super(N, gen);
 		this.k = k;
+		this.single = single;
 	}
 
 	public void run() throws Exception {
 		warmup(() -> mm(k), 10);
-		execute(() -> mm(1), "mm SingleThread", N/10);
-		if(k != 1) {
+		if(single)
+			execute(() -> mm(1), "mm SingleThread", N/10);
+		if(k != 1)
 			execute(() -> mm(k), "mm MultiThread: " + k);
-		}
 	}
 
 	private void mm(int k) {
@@ -51,23 +53,23 @@ public static void main(String[] args) throws Exception {
 		final int k;
 		final double sp1;
 		final double sp2;
+		final boolean single;
 		if(args.length == 0) {
-			i = Integer.parseInt(args[1]);
-			j = Integer.parseInt(args[2]);
-			k = Integer.parseInt(args[3]);
-
-			sp1 = Double.parseDouble(args[4]);
-			sp2 = Double.parseDouble(args[5]);
-
+			i = 10;
+			j = 10;
+			k = 10;
+			sp1 = 1.0;
+			sp2 = 1.0;
+			single= true;
 		}
 		else {
 
 			i = Integer.parseInt(args[1]);
 			j = Integer.parseInt(args[2]);
 			k = Integer.parseInt(args[3]);
-
 			sp1 = Double.parseDouble(args[4]);
 			sp2 = Double.parseDouble(args[5]);
+			single = Boolean.parseBoolean(args[6]);
 
 		}
 
@@ -83,6 +85,6 @@ public static void main(String[] args) throws Exception {
 
 		System.out.println("MM Perf : rep " +N+ " -- " + Arrays.toString(args));
 
-		new MatrixMultiplicationPerf(N, gen, InfrastructureAnalyzer.getLocalParallelism()).run();
+		new MatrixMultiplicationPerf(N, gen, InfrastructureAnalyzer.getLocalParallelism(), single).run();
 	}
 }
diff --git a/src/test/scripts/performance/matrixMultiplication.sh b/src/test/scripts/performance/matrixMultiplication.sh
index 66a2694c5b8..fd43a90b331 100755
--- a/src/test/scripts/performance/matrixMultiplication.sh
+++ b/src/test/scripts/performance/matrixMultiplication.sh
@@ -21,27 +21,24 @@
 #-------------------------------------------------------------
 
 
-mvn package > /dev/null
+mvn package > /dev/null 2>&1 # order matters: send stdout to /dev/null first, then stderr onto stdout
 
 cm="java --add-modules=jdk.incubator.vector -jar -XX:+UseNUMA target/systemds-3.4.0-SNAPSHOT-perf.jar 1009"
 
-$cm 5 5 5 1 1
-$cm 500 5 5 1 1
-$cm 5 500 5 1 1
-$cm 5 5 500 1 1
+$cm 5 5 5 1 1 true
+$cm 500 5 5 1 1 true
+$cm 5 500 5 1 1 true
+$cm 5 5 500 1 1 true
 
+$cm 100 100 100 1 1 true
+$cm 1000 100 100 1 1 true
+$cm 100 1000 100 1 1 true
+$cm 100 100 1000 1 1 true
 
-$cm 100 100 100 1 1
-$cm 1000 100 100 1 1
-$cm 100 1000 100 1 1
-$cm 100 100 1000 1 1
+$cm 1000 1000 1000 1 1 true
 
+$cm 10000 1000 1000 1 1 true
+$cm 1000 10000 1000 1 1 true
+$cm 1000 1000 10000 1 1 true
 
-$cm 1000 1000 1000 1 1
-
-$cm 10000 1000 1000 1 1
-$cm 1000 10000 1000 1 1
-$cm 1000 1000 10000 1 1
-
-
-$cm 10000 10000 10000 1 1
\ No newline at end of file
+$cm 10000 10000 10000 1 1 false
\ No newline at end of file

From a9c0c908b42187cecaa038b916ebc24d0dbb11d0 Mon Sep 17 00:00:00 2001
From: Sebastian Baunsgaard <baunsgaard@apache.org>
Date: Thu, 15 May 2025 16:50:43 +0200
Subject: [PATCH 03/11] tmp only do the big one

---
 .../performance/matrixMultiplication.sh       | 24 +++++++++----------
 1 file changed, 12 insertions(+), 12 deletions(-)

diff --git a/src/test/scripts/performance/matrixMultiplication.sh b/src/test/scripts/performance/matrixMultiplication.sh
index fd43a90b331..a63763c9019 100755
--- a/src/test/scripts/performance/matrixMultiplication.sh
+++ b/src/test/scripts/performance/matrixMultiplication.sh
@@ -25,20 +25,20 @@ mvn package 2>&1 > /dev/null
 
 cm="java --add-modules=jdk.incubator.vector -jar -XX:+UseNUMA target/systemds-3.4.0-SNAPSHOT-perf.jar 1009"
 
-$cm 5 5 5 1 1 true
-$cm 500 5 5 1 1 true
-$cm 5 500 5 1 1 true
-$cm 5 5 500 1 1 true
+# $cm 5 5 5 1 1 true
+# $cm 500 5 5 1 1 true
+# $cm 5 500 5 1 1 true
+# $cm 5 5 500 1 1 true
 
-$cm 100 100 100 1 1 true
-$cm 1000 100 100 1 1 true
-$cm 100 1000 100 1 1 true
-$cm 100 100 1000 1 1 true
+# $cm 100 100 100 1 1 true
+# $cm 1000 100 100 1 1 true
+# $cm 100 1000 100 1 1 true
+# $cm 100 100 1000 1 1 true
 
-$cm 1000 1000 1000 1 1 true
+# $cm 1000 1000 1000 1 1 true
 
-$cm 10000 1000 1000 1 1 true
-$cm 1000 10000 1000 1 1 true
-$cm 1000 1000 10000 1 1 true
+# $cm 10000 1000 1000 1 1 true
+# $cm 1000 10000 1000 1 1 true
+# $cm 1000 1000 10000 1 1 true
 
 $cm 10000 10000 10000 1 1 false
\ No newline at end of file

From 1cc76f6269a752bed7217fe1b532ab828273f98c Mon Sep 17 00:00:00 2001
From: Sebastian Baunsgaard <baunsgaard@apache.org>
Date: Thu, 15 May 2025 19:29:53 +0200
Subject: [PATCH 04/11] fix error

---
 .../org/apache/sysds/runtime/matrix/data/LibMatrixMult.java | 6 ++----
 .../sysds/test/component/matrix/MatrixMultiplyTest.java     | 3 ++-
 2 files changed, 4 insertions(+), 5 deletions(-)

diff --git a/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixMult.java b/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixMult.java
index adb26dce107..2910b37938e 100644
--- a/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixMult.java
+++ b/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixMult.java
@@ -3933,8 +3933,7 @@ public static void vectMultiplyWrite( final double aval, double[] b, double[] c,
 		for( int j = bn; j < len; j+=vLen, bi+=vLen, ci+=vLen) 
 		{
 			DoubleVector bVec = DoubleVector.fromArray(SPECIES, b, bi);
-			aVec = aVec.mul(bVec);
-			aVec.intoArray(c, ci);
+			aVec.mul(bVec).intoArray(c, ci);
 		}
 	}
 	
@@ -3984,8 +3983,7 @@ public static void vectMultiplyWrite( double[] a, double[] b, double[] c, int ai
 		{
 			DoubleVector aVec = DoubleVector.fromArray(SPECIES, a, ai);
 			DoubleVector bVec = DoubleVector.fromArray(SPECIES, b, bi);
-			aVec = aVec.mul(bVec);
-			aVec.intoArray(c, ci);
+			aVec.mul(bVec).intoArray(c, ci);
 		}
 	}
 	
diff --git a/src/test/java/org/apache/sysds/test/component/matrix/MatrixMultiplyTest.java b/src/test/java/org/apache/sysds/test/component/matrix/MatrixMultiplyTest.java
index 0934898bcc2..b862a8a6314 100644
--- a/src/test/java/org/apache/sysds/test/component/matrix/MatrixMultiplyTest.java
+++ b/src/test/java/org/apache/sysds/test/component/matrix/MatrixMultiplyTest.java
@@ -263,7 +263,8 @@ private void test(MatrixBlock a, MatrixBlock b) {
 				totalMessage += "\n\nExp" + exp;
 				totalMessage += "\n\nAct" + ret;
 			}
-
+			LOG.error(exp.slice(0, 10,0, 10));
+			LOG.error(ret.slice(0, 10,0, 10));
 			assertEquals(totalMessage, exp.getNonZeros(), ret.getNonZeros());
 			TestUtils.compareMatricesPercentageDistance(exp, ret, 0.999, 0.99999, totalMessage, false);
 		}

From c33374f18c50c00fe16b6fb1b07e3e037709e5a0 Mon Sep 17 00:00:00 2001
From: Sebastian Baunsgaard <baunsgaard@apache.org>
Date: Thu, 15 May 2025 19:40:39 +0200
Subject: [PATCH 05/11] remove print

---
 .../org/apache/sysds/runtime/matrix/data/LibMatrixMult.java    | 3 +--
 .../apache/sysds/test/component/matrix/MatrixMultiplyTest.java | 3 +--
 2 files changed, 2 insertions(+), 4 deletions(-)

diff --git a/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixMult.java b/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixMult.java
index 2910b37938e..3982759312a 100644
--- a/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixMult.java
+++ b/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixMult.java
@@ -3774,8 +3774,7 @@ else if( aixk > bixk )
 	}
 
 	//note: public for use by codegen for consistency
-	public static void vectMultiplyAdd( final double aval, double[] b, double[] c, int bi, int ci, final int len )
-	{		
+	public static void vectMultiplyAdd(final double aval, double[] b, double[] c, int bi, int ci, final int len) {
 		final int bn = len%vLen;
 		
 		//rest, not aligned to vLen-blocks
diff --git a/src/test/java/org/apache/sysds/test/component/matrix/MatrixMultiplyTest.java b/src/test/java/org/apache/sysds/test/component/matrix/MatrixMultiplyTest.java
index b862a8a6314..0934898bcc2 100644
--- a/src/test/java/org/apache/sysds/test/component/matrix/MatrixMultiplyTest.java
+++ b/src/test/java/org/apache/sysds/test/component/matrix/MatrixMultiplyTest.java
@@ -263,8 +263,7 @@ private void test(MatrixBlock a, MatrixBlock b) {
 				totalMessage += "\n\nExp" + exp;
 				totalMessage += "\n\nAct" + ret;
 			}
-			LOG.error(exp.slice(0, 10,0, 10));
-			LOG.error(ret.slice(0, 10,0, 10));
+
 			assertEquals(totalMessage, exp.getNonZeros(), ret.getNonZeros());
 			TestUtils.compareMatricesPercentageDistance(exp, ret, 0.999, 0.99999, totalMessage, false);
 		}

From 9b933d00d93c5bb47cbc9edfc030d4b2c7cb84fc Mon Sep 17 00:00:00 2001
From: Sebastian Baunsgaard <baunsgaard@apache.org>
Date: Thu, 15 May 2025 19:43:24 +0200
Subject: [PATCH 06/11] fix

---
 .../matrix/MatrixMultiplicationPerf.java      | 25 ++++++++++++++++---
 1 file changed, 22 insertions(+), 3 deletions(-)

diff --git a/src/test/java/org/apache/sysds/performance/matrix/MatrixMultiplicationPerf.java b/src/test/java/org/apache/sysds/performance/matrix/MatrixMultiplicationPerf.java
index 8f4ead21f31..df799e79c3b 100644
--- a/src/test/java/org/apache/sysds/performance/matrix/MatrixMultiplicationPerf.java
+++ b/src/test/java/org/apache/sysds/performance/matrix/MatrixMultiplicationPerf.java
@@ -1,3 +1,22 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
 package org.apache.sysds.performance.matrix;
 
 import java.util.Arrays;
@@ -26,7 +45,7 @@ public MatrixMultiplicationPerf(int N, IGenerate<Pair<MatrixBlock, MatrixBlock>>
 	public void run() throws Exception {
 		warmup(() -> mm(k), 10);
 		if(single)
-			execute(() -> mm(1), "mm SingleThread", N/10);
+			execute(() -> mm(1), "mm SingleThread", N / 10);
 		if(k != 1)
 			execute(() -> mm(k), "mm MultiThread: " + k);
 	}
@@ -60,7 +79,7 @@ public static void main(String[] args) throws Exception {
 			k = 10;
 			sp1 = 1.0;
 			sp2 = 1.0;
-			single= true;
+			single = true;
 		}
 		else {
 
@@ -83,7 +102,9 @@ public static void main(String[] args) throws Exception {
 
-		int N = Math.min(100000, (int) Math.max(100L, 50000000000L / inst));
+		// Clamp while still in long arithmetic: for inst <= 23 the quotient exceeds Integer.MAX_VALUE
+		// and narrowing it first could wrap to a negative repetition count.
+		int N = (int) Math.min(100000L, Math.max(100L, 50000000000L / inst));
 
-		System.out.println("MM Perf : rep " +N+ " -- " + Arrays.toString(args));
+		System.out.println("MM Perf : rep " + N + " -- " + Arrays.toString(args));
 
 		new MatrixMultiplicationPerf(N, gen, InfrastructureAnalyzer.getLocalParallelism(), single).run();
 	}

From 5bd8d1a057043e7a76afc3a40c1bd5a705ec30de Mon Sep 17 00:00:00 2001
From: Sebastian Baunsgaard <baunsgaard@apache.org>
Date: Thu, 15 May 2025 20:00:43 +0200
Subject: [PATCH 07/11] fix MMDictionary vectorized

---
 .../compress/colgroup/dictionary/MatrixBlockDictionary.java     | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/main/java/org/apache/sysds/runtime/compress/colgroup/dictionary/MatrixBlockDictionary.java b/src/main/java/org/apache/sysds/runtime/compress/colgroup/dictionary/MatrixBlockDictionary.java
index 54cdf6920ac..24776f3adc4 100644
--- a/src/main/java/org/apache/sysds/runtime/compress/colgroup/dictionary/MatrixBlockDictionary.java
+++ b/src/main/java/org/apache/sysds/runtime/compress/colgroup/dictionary/MatrixBlockDictionary.java
@@ -2129,7 +2129,7 @@ private static void preaggValuesFromDenseDictBlockedIKJ(double[] a, double[] b,
 		int bie, int bke, int cz, int az, int ls, int cut, int sOffT, int eOffT) {
 		final int vLen = SPECIES.length();
 		final DoubleVector vVec = DoubleVector.zero(SPECIES);
-		final int leftover = sOffT - eOffT % vLen; // leftover not vectorized
+		final int leftover = (eOffT - sOffT) % vLen; // leftover not vectorized
 		for(int i = bi; i < bie; i++) {
 			final int offI = i * cz;
 			final int offOutT = i * az + bj;

From 9cfd129a962daad26a3245777353d2d5bad28bc4 Mon Sep 17 00:00:00 2001
From: Sebastian Baunsgaard <baunsgaard@apache.org>
Date: Thu, 15 May 2025 20:24:03 +0200
Subject: [PATCH 08/11] Increase eps tolerance in RewriteDetTest

---
 .../org/apache/sysds/test/functions/rewrite/RewriteDetTest.java | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/test/java/org/apache/sysds/test/functions/rewrite/RewriteDetTest.java b/src/test/java/org/apache/sysds/test/functions/rewrite/RewriteDetTest.java
index 288ff0e44e8..245d6b235bd 100644
--- a/src/test/java/org/apache/sysds/test/functions/rewrite/RewriteDetTest.java
+++ b/src/test/java/org/apache/sysds/test/functions/rewrite/RewriteDetTest.java
@@ -46,7 +46,7 @@ public class RewriteDetTest extends AutomatedTestBase
 	private final static int rows = 23;
 	private final static double _sparsityDense = 0.7;
 	private final static double _sparsitySparse = 0.2;
-	private final static double eps = 1e-8;
+	private final static double eps = 1e-7;
 	
 	@Override
 	public void setUp() {

From 840eb992aa67661e09e1be97db7ab124b7687937 Mon Sep 17 00:00:00 2001
From: Sebastian Baunsgaard <baunsgaard@apache.org>
Date: Fri, 16 May 2025 15:40:02 +0200
Subject: [PATCH 09/11] add vector api to SystemDS Context in PythonAPI

---
 src/main/python/systemds/context/systemds_context.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/main/python/systemds/context/systemds_context.py b/src/main/python/systemds/context/systemds_context.py
index 9385cb991b9..25824bc6631 100644
--- a/src/main/python/systemds/context/systemds_context.py
+++ b/src/main/python/systemds/context/systemds_context.py
@@ -173,7 +173,7 @@ def __build_startup_command(self, port: int):
         :param port: The port address to use if -1 chose random port."""
 
         # Base command
-        command = ["java", "-cp"]
+        command = ["java", "--add-modules=jdk.incubator.vector", "-cp"]
 
         # Find the operating system specifc separator, nt means its Windows
         cp_separator = ";" if os.name == "nt" else ":"

From 8a53047838eb1c18b230644fdb5744c9c05b2132 Mon Sep 17 00:00:00 2001
From: Sebastian Baunsgaard <baunsgaard@apache.org>
Date: Thu, 22 May 2025 15:10:00 +0200
Subject: [PATCH 10/11] black formatting

---
 src/main/python/systemds/utils/converters.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/main/python/systemds/utils/converters.py b/src/main/python/systemds/utils/converters.py
index 650d6380605..551a2332579 100644
--- a/src/main/python/systemds/utils/converters.py
+++ b/src/main/python/systemds/utils/converters.py
@@ -72,7 +72,7 @@ def matrix_block_to_numpy(jvm: JVMView, mb: JavaObject):
     :param jvm: The current JVM instance running systemds.
     :param mb: A pointer to the JVM's MatrixBlock object.
     """
-    
+
     num_ros = mb.getNumRows()
     num_cols = mb.getNumColumns()
     buf = jvm.org.apache.sysds.runtime.util.Py4jConverterUtils.convertMBtoPy4JDenseArr(

From 1a819a309cf98b5fa2068d2f8c16b6241db4d333 Mon Sep 17 00:00:00 2001
From: Sebastian Baunsgaard <baunsgaard@apache.org>
Date: Thu, 22 May 2025 22:01:54 +0200
Subject: [PATCH 11/11] Allow more error in FederatedWeightedDivMatrixMultTest

---
 docker/build.sh                                               | 2 +-
 docker/entrypoint.sh                                          | 4 +++-
 .../primitives/part3/FederatedWeightedDivMatrixMultTest.java  | 2 +-
 3 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/docker/build.sh b/docker/build.sh
index 02e45675f10..2898effdc22 100755
--- a/docker/build.sh
+++ b/docker/build.sh
@@ -23,7 +23,7 @@
 # Build the docker containers
 
 # The first build is for running systemds through docker.
-docker image build -f docker/sysds.Dockerfile -t apache/systemds:latest .
+# docker image build -f docker/sysds.Dockerfile -t apache/systemds:latest .
 
 # The second build is for testing systemds. This image installs the R dependencies needed to run the tests.
 docker image build -f docker/testsysds.Dockerfile -t apache/systemds:testing-latest .
diff --git a/docker/entrypoint.sh b/docker/entrypoint.sh
index 84bd5e53783..c276a707c35 100755
--- a/docker/entrypoint.sh
+++ b/docker/entrypoint.sh
@@ -30,7 +30,9 @@ export MAVEN_OPTS="-Xmx512m"
 
 log="/tmp/sysdstest.log"
 mvn -ntp -B test-compile 2>&1 | grep -E "BUILD|Total time:|---|Building SystemDS"
-mvn -ntp -B test -D maven.test.skip=false -D automatedtestbase.outputbuffering=true -D test=$1 2>&1 | grep -v "already exists in destination." | tee $log
+mvn -ntp -B test -D maven.test.skip=false -D automatedtestbase.outputbuffering=true -D test=$1 2>&1 \
+	| grep -v "already exists in destination." \
+	| grep -v 'WARNING: Using incubator modules' | tee $log
 
 # Merge Federated test runs.
 [ -f target/jacoco.exec ] && mv target/jacoco.exec target/jacoco_main.exec
diff --git a/src/test/java/org/apache/sysds/test/functions/federated/primitives/part3/FederatedWeightedDivMatrixMultTest.java b/src/test/java/org/apache/sysds/test/functions/federated/primitives/part3/FederatedWeightedDivMatrixMultTest.java
index 17768f237fc..6753774f653 100644
--- a/src/test/java/org/apache/sysds/test/functions/federated/primitives/part3/FederatedWeightedDivMatrixMultTest.java
+++ b/src/test/java/org/apache/sysds/test/functions/federated/primitives/part3/FederatedWeightedDivMatrixMultTest.java
@@ -61,7 +61,7 @@ public class FederatedWeightedDivMatrixMultTest extends AutomatedTestBase {
 
 	private final static String OUTPUT_NAME = "Z";
 
-	private final static double TOLERANCE = 1e-9;
+	private final static double TOLERANCE = 1e-8;
 
 	private final static int BLOCKSIZE = 1024;