From 1a850f961345b642d5b12f2c7cc66c5e0c7b73f6 Mon Sep 17 00:00:00 2001
From: Sebastian Baunsgaard <baunsgaard@apache.org>
Date: Thu, 22 May 2025 23:14:36 +0200
Subject: [PATCH] [SYSTEMDS-3874] Initial vector gather and remove vector
 staging

Co-authored-by: Kevin Innerebner <kevin.innerebner@gmail.com>
---
 .../SIMD-double-vectors/LibMatrixMult.java    | 4445 -----------------
 scripts/staging/SIMD-double-vectors/README.md |   45 -
 .../staging/SIMD-double-vectors/pom.xml.tmp   | 1336 -----
 scripts/staging/SIMD-double-vectors/systemds  |  487 --
 .../runtime/matrix/data/LibMatrixMult.java    |   18 +-
 5 files changed, 7 insertions(+), 6324 deletions(-)
 delete mode 100644 scripts/staging/SIMD-double-vectors/LibMatrixMult.java
 delete mode 100644 scripts/staging/SIMD-double-vectors/README.md
 delete mode 100644 scripts/staging/SIMD-double-vectors/pom.xml.tmp
 delete mode 100755 scripts/staging/SIMD-double-vectors/systemds

diff --git a/scripts/staging/SIMD-double-vectors/LibMatrixMult.java b/scripts/staging/SIMD-double-vectors/LibMatrixMult.java
deleted file mode 100644
index e1e3a640e43..00000000000
--- a/scripts/staging/SIMD-double-vectors/LibMatrixMult.java
+++ /dev/null
@@ -1,4445 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- * 
- *   http://www.apache.org/licenses/LICENSE-2.0
- * 
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package org.apache.sysds.runtime.matrix.data;
-
-import java.util.ArrayList;
-import java.util.Arrays;
-import java.util.List;
-import java.util.concurrent.Callable;
-import java.util.concurrent.ExecutionException;
-import java.util.concurrent.ExecutorService;
-import java.util.concurrent.Future;
-import java.util.stream.IntStream;
-
-import jdk.incubator.vector.DoubleVector;
-import jdk.incubator.vector.VectorOperators;
-import jdk.incubator.vector.VectorSpecies;
-import org.apache.commons.logging.Log;
-import org.apache.commons.logging.LogFactory;
-import org.apache.commons.math3.util.FastMath;
-import org.apache.sysds.hops.OptimizerUtils;
-import org.apache.sysds.lops.MapMultChain.ChainType;
-import org.apache.sysds.lops.WeightedCrossEntropy.WCeMMType;
-import org.apache.sysds.lops.WeightedDivMM.WDivMMType;
-import org.apache.sysds.lops.WeightedSigmoid.WSigmoidType;
-import org.apache.sysds.lops.WeightedSquaredLoss.WeightsType;
-import org.apache.sysds.lops.WeightedUnaryMM.WUMMType;
-import org.apache.sysds.runtime.DMLRuntimeException;
-import org.apache.sysds.runtime.controlprogram.parfor.stat.InfrastructureAnalyzer;
-import org.apache.sysds.runtime.data.DenseBlock;
-import org.apache.sysds.runtime.data.DenseBlockFactory;
-import org.apache.sysds.runtime.data.SparseBlock;
-import org.apache.sysds.runtime.data.SparseBlock.Type;
-import org.apache.sysds.runtime.data.SparseBlockCSR;
-import org.apache.sysds.runtime.data.SparseBlockFactory;
-import org.apache.sysds.runtime.data.SparseBlockMCSR;
-import org.apache.sysds.runtime.data.SparseRowScalar;
-import org.apache.sysds.runtime.functionobjects.SwapIndex;
-import org.apache.sysds.runtime.functionobjects.ValueFunction;
-import org.apache.sysds.runtime.matrix.operators.ReorgOperator;
-import org.apache.sysds.runtime.util.CommonThreadPool;
-import org.apache.sysds.runtime.util.UtilFunctions;
-import org.apache.sysds.utils.NativeHelper;
-
-/**
- * MB: Library for matrix multiplications including MM, MV, VV for all
- * combinations of dense, sparse, ultrasparse representations and special
- * operations such as transpose-self matrix multiplication.
- * <p>
- * In general all implementations use internally dense outputs
- * for direct access, but change the final result to sparse if necessary.
- * The only exceptions are ultra-sparse matrix mult, wsloss and wsigmoid.
- */
-public class LibMatrixMult 
-{
-	//internal configuration
-	private static final boolean LOW_LEVEL_OPTIMIZATION = true;
-	private static final long MEM_OVERHEAD_THRESHOLD = 2L*1024*1024; //MAX 2 MB
-	private static final long PAR_MINFLOP_THRESHOLD1 = 2L*1024*1024; //MIN 2 MFLOP
-	private static final long PAR_MINFLOP_THRESHOLD2 = 128L*1024; //MIN 2 MFLOP
-	public static final int L2_CACHESIZE = 256 * 1024; //256KB (common size)
-	public static final int L3_CACHESIZE = 16 * 1024 * 1024; //16MB (common size)
-	private static final Log LOG = LogFactory.getLog(LibMatrixMult.class.getName());
-
-	static final VectorSpecies<Double> SPECIES = DoubleVector.SPECIES_PREFERRED;
-
-	private LibMatrixMult() {
-		//prevent instantiation via private constructor
-	}
-	
-	////////////////////////////////
-	// public matrix mult interface
-	////////////////////////////////
-	
-	/**
-	 * Performs a matrix multiplication
-	 * 
-	 * All variants use a IKJ access pattern, and internally use dense output. After the
-	 * actual computation, we recompute nnz and check for sparse/dense representation.
-	 * 
-	 * @param m1 first matrix
-	 * @param m2 second matrix
-	 * @return ret Matrix Block
-	 */
-	public static MatrixBlock matrixMult(MatrixBlock m1, MatrixBlock m2) {
-		return matrixMult(m1, m2, null, false, 1);
-	}
-
-	/**
-	 * Performs a matrix multiplication
-	 * 
-	 * All variants use a IKJ access pattern, and internally use dense output. After the
-	 * actual computation, we recompute nnz and check for sparse/dense representation.
-	 * 
-	 * @param m1 first matrix
-	 * @param m2 second matrix
-	 * @param k maximum parallelism
-	 * @return ret Matrix Block
-	 */
-	public static MatrixBlock matrixMult(MatrixBlock m1, MatrixBlock m2, int k) {
-		return matrixMult(m1, m2, null, false, k);
-	}
-
-	/**
-	 * Performs a matrix multiplication and stores the result in the output matrix.
-	 * 
-	 * All variants use a IKJ access pattern, and internally use dense output. After the
-	 * actual computation, we recompute nnz and check for sparse/dense representation.
-	 * 
-	 * @param m1 first matrix
-	 * @param m2 second matrix
-	 * @param ret result matrix
-	 * @return ret Matrix Block
-	 */
-	public static MatrixBlock matrixMult(MatrixBlock m1, MatrixBlock m2, MatrixBlock ret) {
-		return matrixMult(m1, m2, ret, false, 1);
-	}
-	
-	/**
-	 * This method allows one to disabling exam sparsity. This feature is useful if matrixMult is used as an intermediate
-	 * operation (for example: LibMatrixDNN). It makes sense for LibMatrixDNN because the output is internally
-	 * consumed by another dense instruction, which makes repeated conversion to sparse wasteful.
-	 * This should be used in rare cases and if you are unsure,
-	 * use the method 'matrixMult(MatrixBlock m1, MatrixBlock m2, MatrixBlock ret)' instead.
-	 * 
-	 * @param m1 first matrix
-	 * @param m2 second matrix
-	 * @param ret result matrix
-	 * @param fixedRet if true, output representation is fixed and nnzs not recomputed
-	 * @return ret Matrix Block
-	 */
-	public static MatrixBlock matrixMult(MatrixBlock m1, MatrixBlock m2, MatrixBlock ret, boolean fixedRet) {
-		return matrixMult(m1, m2, ret, fixedRet, 1);
-	}
-	
-	/**
-	 * Performs a multi-threaded matrix multiplication and stores the result in the output matrix.
-	 * The parameter k (k&gt;=1) determines the max parallelism k' with k'=min(k, vcores, m1.rlen).
-	 * 
-	 * @param m1 first matrix
-	 * @param m2 second matrix
-	 * @param ret result matrix
-	 * @param k maximum parallelism
-	 * @return ret Matrix Block
-	 */
-	public static MatrixBlock matrixMult(MatrixBlock m1, MatrixBlock m2, MatrixBlock ret, int k) {
-		return matrixMult(m1, m2, ret, false, k);
-	}
-	
-	/**
-	 * Performs a matrix multiplication and stores the result in the output matrix.
-	 * 
-	 * All variants use a IKJ access pattern, and internally use dense output. After the
-	 * actual computation, we recompute nnz and check for sparse/dense representation.
-	 * 
-	 * This method allows one to disabling exam sparsity. This feature is useful if matrixMult is used as an intermediate
-	 * operation (for example: LibMatrixDNN). It makes sense for LibMatrixDNN because the output is internally
-	 * consumed by another dense instruction, which makes repeated conversion to sparse wasteful.
-	 * This should be used in rare cases and if you are unsure,
-	 * use the method 'matrixMult(MatrixBlock m1, MatrixBlock m2, MatrixBlock ret)' instead.
-	 * 
-	 * The parameter k (k&gt;=1) determines the max parallelism k' with k'=min(k, vcores, m1.rlen).
-	 * 
-	 * @param m1 first matrix
-	 * @param m2 second matrix
-	 * @param ret result matrix
-	 * @param fixedRet if true, output representation is fixed and nnzs not recomputed
-	 * @param k maximum parallelism
-	 * @return ret Matrix Block
-	 */
-	public static MatrixBlock matrixMult(MatrixBlock m1, MatrixBlock m2, MatrixBlock ret, boolean fixedRet, int k) {
-		if(m1.isEmptyBlock(false) || m2.isEmptyBlock(false)) 
-			return emptyMatrixMult(m1, m2, ret);
-		
-		// Timing time = new Timing(true);
-		
-		// pre analysis
-		boolean m1Perm = m1.isSparsePermutationMatrix();
-		boolean ultraSparse = (fixedRet && ret.sparse) ||
-			(!fixedRet && isUltraSparseMatrixMult(m1, m2, m1Perm));
-		boolean sparse = !fixedRet && !ultraSparse && !m1Perm
-			&& isSparseOutputMatrixMult(m1, m2);
-		
-		// allocate output
-		if(ret == null)
-			ret = new MatrixBlock(m1.rlen, m2.clen, ultraSparse | sparse);
-		else 
-			ret.reset(m1.rlen, m2.clen, ultraSparse | sparse);
-		ret.allocateBlock();
-		
-		// Detect if we should transpose skinny right side.
-		boolean tm2 = !fixedRet && checkPrepMatrixMultRightInput(m1,m2);
-		m2 = prepMatrixMultRightInput(m1, m2, tm2);
-
-		// check for multi-threading
-		if (!ret.isThreadSafe() 
-				|| !satisfiesMultiThreadingConstraints(m1, m2, m1.rlen==1, true, 2, k)
-				|| fixedRet) // Fixed ret not supported in multithreaded execution yet
-			k = 1;
-
-		if(k <= 1)
-			singleThreadedMatrixMult(m1, m2, ret, ultraSparse, sparse, tm2, m1Perm, fixedRet);
-		else
-			parallelMatrixMult(m1, m2, ret, k, ultraSparse, sparse, tm2, m1Perm);
-
-		//System.out.println("MM "+k+" ("+m1.isInSparseFormat()+","+m1.getNumRows()+","+m1.getNumColumns()+","+m1.getNonZeros()+")x" +
-		//		"("+m2.isInSparseFormat()+","+m2.getNumRows()+","+m2.getNumColumns()+","+m2.getNonZeros()+") in "+time.stop());
-	
-		return ret;
-	}
-
-	private static void singleThreadedMatrixMult(MatrixBlock m1, MatrixBlock m2, MatrixBlock ret,  
-		boolean ultraSparse, boolean sparse, boolean tm2, boolean m1Perm, boolean fixedRet){
-		// prepare row-upper for special cases of vector-matrix
-		final boolean pm2 = !ultraSparse && checkParMatrixMultRightInputRows(m1, m2, Integer.MAX_VALUE);
-		final int ru2 = (pm2) ? m2.rlen : m1.rlen;
-
-		// core matrix mult computation
-		if(ultraSparse && !fixedRet)
-			matrixMultUltraSparse(m1, m2, ret, m1Perm, 0, ru2);
-		else if(!m1.sparse && !m2.sparse)
-			matrixMultDenseDense(m1, m2, ret, tm2, pm2, 0, ru2, 0, m2.clen);
-		else if(m1.sparse && m2.sparse)
-			matrixMultSparseSparse(m1, m2, ret, pm2, sparse, 0, ru2);
-		else if(m1.sparse)
-			matrixMultSparseDense(m1, m2, ret, pm2, 0, ru2);
-		else
-			matrixMultDenseSparse(m1, m2, ret, pm2, 0, ru2);
-
-		// post-processing: nnz/representation
-		if(!fixedRet) {
-			if(!ret.sparse)
-				ret.recomputeNonZeros();
-			ret.examSparsity();
-		}
-	}
-
-	private static void parallelMatrixMult(MatrixBlock m1, MatrixBlock m2, MatrixBlock ret, int k, 
-		boolean ultraSparse, boolean sparse, boolean tm2, boolean m1Perm){
-		// prepare row-upper for special cases of vector-matrix / matrix-matrix
-		boolean pm2r = !ultraSparse && !sparse && checkParMatrixMultRightInputRows(m1, m2, k);
-		boolean pm2c = !ultraSparse && checkParMatrixMultRightInputCols(m1, m2, k, pm2r);
-		int num = pm2r ? m2.rlen : pm2c ? m2.clen : m1.rlen;
-
-		// core multi-threaded matrix mult computation
-		// (currently: always parallelization over number of rows)
-		try {
-			ExecutorService pool = CommonThreadPool.get(k);
-			ArrayList<MatrixMultTask> tasks = new ArrayList<>();
-			ArrayList<Integer> blklens = UtilFunctions.getBalancedBlockSizesDefault(num, k, (pm2r || pm2c));
-			for(int i = 0, lb = 0; i < blklens.size(); lb += blklens.get(i), i++)
-				tasks.add(new MatrixMultTask(m1, m2, ret, tm2, pm2r, pm2c, m1Perm, sparse, lb, lb + blklens.get(i)));
-			// execute tasks
-			List<Future<Object>> taskret = pool.invokeAll(tasks);
-			pool.shutdown();
-			// aggregate partial results (nnz, ret for vector/matrix)
-			ret.nonZeros = 0; // reset after execute
-			for(Future<Object> task : taskret) {
-				if(pm2r) // guaranteed single block
-					vectAdd((double[]) task.get(), ret.getDenseBlockValues(), 0, 0, ret.rlen * ret.clen);
-				else
-					ret.nonZeros += (Long) task.get();
-			}
-			if(pm2r)
-				ret.recomputeNonZeros();
-		}
-		catch(Exception ex) {
-			throw new DMLRuntimeException(ex);
-		}
-
-		// post-processing (nnz maintained in parallel)
-		ret.examSparsity();
-	}
-
-	public static MatrixBlock emptyMatrixMult(MatrixBlock m1, MatrixBlock m2, MatrixBlock ret){
-		final int rl = m1.rlen;
-		final int cl = m2.clen;
-
-		if(ret == null)
-			return new MatrixBlock(rl, cl, true);
-		else {
-			ret.reset(rl, cl, true);
-			ret.setNonZeros(0);
-			ret.cleanupBlock(true, true);
-			return ret;
-		}
-	}
-
-	/**
-	 * Performs a matrix multiplication chain operation of type t(X)%*%(X%*%v) or t(X)%*%(w*(X%*%v)).
-	 * 
-	 * All variants use a IKJ access pattern, and internally use dense output. After the
-	 * actual computation, we recompute nnz and check for sparse/dense representation.
-	 * 
-	 * @param mX X matrix
-	 * @param mV v matrix
-	 * @param mW w matrix
-	 * @param ret result matrix
-	 * @param ct chain type
-	 */
-	public static void matrixMultChain(MatrixBlock mX, MatrixBlock mV, MatrixBlock mW, MatrixBlock ret, ChainType ct) {
-		//check inputs / outputs (after that mV and mW guaranteed to be dense)
-		if( mX.isEmptyBlock(false) || (mV.isEmptyBlock(false) && ct!=ChainType.XtXvy)
-			|| (mW !=null && mW.isEmptyBlock(false)) ) {
-			ret.examSparsity(); //turn empty dense into sparse
-			return;
-		}
-
-		//Timing time = new Timing(true);
-		
-		//pre-processing: output allocation
-		ret.sparse = false;
-		ret.allocateDenseBlock();
-		
-		//core matrix mult chain computation
-		if( mX.sparse )
-			matrixMultChainSparse(mX, mV, mW, ret, ct, 0, mX.rlen);
-		else
-			matrixMultChainDense(mX, mV, mW, ret, ct, 0, mX.rlen);
-		
-		//post-processing
-		ret.recomputeNonZeros();
-		ret.examSparsity();
-		
-		//System.out.println("MMChain "+ct.toString()+" ("+mX.isInSparseFormat()+","+mX.getNumRows()+","+mX.getNumColumns()+","+mX.getNonZeros()+")x" +
-		// "("+mV.isInSparseFormat()+","+mV.getNumRows()+","+mV.getNumColumns()+","+mV.getNonZeros()+") in "+time.stop());
-	}
-
-	/**
-	 * Performs a parallel matrix multiplication chain operation of type t(X)%*%(X%*%v) or t(X)%*%(w*(X%*%v)).
-	 * The parameter k (k&gt;=1) determines the max parallelism k' with k'=min(k, vcores, m1.rlen).
-	 * 
-	 * NOTE: This multi-threaded mmchain operation has additional memory requirements of k*ncol(X)*8bytes 
-	 * for partial aggregation. Current max memory: 256KB; otherwise redirectly to sequential execution.
-	 * 
-	 * @param mX X matrix
-	 * @param mV v matrix
-	 * @param mW w matrix
-	 * @param ret result matrix
-	 * @param ct chain type
-	 * @param k maximum parallelism
-	 */
-	public static void matrixMultChain(MatrixBlock mX, MatrixBlock mV, MatrixBlock mW, MatrixBlock ret, ChainType ct, int k) {
-		//check inputs / outputs (after that mV and mW guaranteed to be dense)
-		if( mX.isEmptyBlock(false) || (mV.isEmptyBlock(false) && ct!=ChainType.XtXvy)
-			|| (mW !=null && mW.isEmptyBlock(false)) ) {
-			ret.examSparsity(); //turn empty dense into sparse
-			return;
-		}
-		
-		//check temporary memory and too small workload for multi-threading
-		if( !satisfiesMultiThreadingConstraints(mX, true, true, mX.sparse?2:4, k) ) { 
-			matrixMultChain(mX, mV, mW, ret, ct);
-			return;
-		}
-		
-		//Timing time = new Timing(true);
-		
-		//pre-processing (no need to check isThreadSafe)
-		ret.sparse = false;
-		ret.allocateDenseBlock();
-		
-		//core matrix mult chain computation
-		//(currently: always parallelization over number of rows)
-		try {
-			ExecutorService pool = CommonThreadPool.get(k);
-			ArrayList<Integer> blklens = UtilFunctions.getBalancedBlockSizesDefault(mX.rlen, k, true);
-			ArrayList<MatrixMultChainTask> tasks = new ArrayList<>();
-			for( int i=0, lb=0; i<blklens.size(); lb+=blklens.get(i), i++ )
-				tasks.add(new MatrixMultChainTask(mX, mV, mW, ct, lb, lb+blklens.get(i)));
-			List<Future<double[]>> taskret = pool.invokeAll(tasks);
-			pool.shutdown();
-			//aggregate partial results and error handling
-			double[][] a = new double[taskret.size()][];
-			for(int i=0; i<taskret.size(); i++)
-				a[i] = taskret.get(i).get();
-			vectAddAll(a, ret.getDenseBlockValues(), 0, 0, mX.clen);
-		}
-		catch(Exception ex) {
-			throw new DMLRuntimeException(ex);
-		}
-		
-		//post-processing
-		ret.recomputeNonZeros();
-		ret.examSparsity();
-		
-		//System.out.println("MMChain "+ct.toString()+" k="+k+" ("+mX.isInSparseFormat()+","+mX.getNumRows()+","+mX.getNumColumns()+","+mX.getNonZeros()+")x" +
-		// "("+mV.isInSparseFormat()+","+mV.getNumRows()+","+mV.getNumColumns()+","+mV.getNonZeros()+") in "+time.stop());
-	}
-
-	public static void matrixMultTransposeSelf( MatrixBlock m1, MatrixBlock ret, boolean leftTranspose ) {
-		matrixMultTransposeSelf(m1, ret, leftTranspose, true);
-	}
-
-	public static void matrixMultTransposeSelf(MatrixBlock m1, MatrixBlock ret, boolean leftTranspose, boolean copyToLowerTriangle){
-		//check inputs / outputs
-		if( m1.isEmptyBlock(false) ) {
-			ret.examSparsity(); //turn empty dense into sparse
-			return;
-		}
-		
-		//Timing time = new Timing(true);
-		
-		//pre-processing
-		ret.sparse = false;
-		ret.allocateDenseBlock();
-
-		if( m1.sparse )
-			matrixMultTransposeSelfSparse(m1, ret, leftTranspose, 0, ret.rlen);
-		else 
-			matrixMultTransposeSelfDense(m1, ret, leftTranspose, 0, ret.rlen );
-
-		//post-processing
-		if(copyToLowerTriangle){
-			long nnz = copyUpperToLowerTriangle(ret);
-			ret.setNonZeros(nnz);
-			ret.examSparsity();
-		}
-		
-		//System.out.println("TSMM ("+m1.isInSparseFormat()+","+m1.getNumRows()+","+m1.getNumColumns()+","+m1.getNonZeros()+","+leftTranspose+") in "+time.stop());
-	}
-
-	/**
-	 * TSMM with optional transposed left side or not (Transposed self matrix multiplication)
-	 * 
-	 * @param m1            The matrix to do tsmm
-	 * @param ret           The output matrix to allocate the result to
-	 * @param leftTranspose If the left side should be considered transposed
-	 * @param k             the number of threads to use
-	 */
-	public static void matrixMultTransposeSelf(MatrixBlock m1, MatrixBlock ret, boolean leftTranspose, int k) {
-		//check inputs / outputs
-		if( m1.isEmptyBlock(false) ) {
-			ret.examSparsity(); //turn empty dense into sparse
-			return;
-		}
-		
-		//check too small workload and fallback to sequential if necessary
-		if( !satisfiesMultiThreadingConstraintsTSMM(m1, leftTranspose, 1, k) ) {
-			matrixMultTransposeSelf(m1, ret, leftTranspose);
-			return;
-		}
-		
-		//Timing time = new Timing(true);
-		
-		//pre-processing (no need to check isThreadSafe)
-		ret.sparse = false;
-		ret.allocateDenseBlock();
-	
-		//core multi-threaded matrix mult computation
-		ExecutorService pool = CommonThreadPool.get(k);
-		try {
-			ArrayList<MatrixMultTransposeTask> tasks = new ArrayList<>();
-			//load balance via #tasks=2k due to triangular shape 
-			int blklen = (int)(Math.ceil((double)ret.rlen / (2 * k)));
-			for(int i = 0; i < ret.rlen; i += blklen)
-				tasks.add(new MatrixMultTransposeTask(m1, ret, leftTranspose, i, Math.min(i+blklen, ret.rlen)));
-			for( Future<Object> rtask :  pool.invokeAll(tasks) )
-				rtask.get();
-		}
-		catch(Exception ex) {
-			throw new DMLRuntimeException(ex);
-		}
-		finally{
-			pool.shutdown();
-		}
-		
-		//post-processing
-		long nnz = copyUpperToLowerTriangle(ret);
-		ret.setNonZeros(nnz);
-		ret.examSparsity();	
-		
-		//System.out.println("TSMM k="+k+" ("+m1.isInSparseFormat()+","+m1.getNumRows()+","+m1.getNumColumns()+","+m1.getNonZeros()+","+leftTranspose+") in "+time.stop());
-	}
-
-	public static void matrixMultPermute( MatrixBlock pm1, MatrixBlock m2, MatrixBlock ret1, MatrixBlock ret2 ) {
-		//check inputs / outputs
-		if( pm1.isEmptyBlock(false) || m2.isEmptyBlock(false) )
-			return;
-
-		//Timing time = new Timing(true);
-
-		//pre-processing
-		ret1.sparse = (m2.sparse || ret1.sparse);
-		if( ret1.sparse )
-			ret1.allocateSparseRowsBlock();
-		else
-			ret1.allocateDenseBlock();
-		
-		//core permutation mm computation
-		if( m2.sparse )
-			matrixMultPermuteSparse(pm1, m2, ret1, ret2, 0, pm1.rlen);
-		else if( ret1.sparse )
-			matrixMultPermuteDenseSparse(pm1, m2, ret1, ret2, 0, pm1.rlen);
-		else
-			matrixMultPermuteDense(pm1, m2, ret1, ret2, 0, pm1.rlen);
-
-		//post-processing
-		ret1.recomputeNonZeros();
-		ret1.examSparsity();
-		if( ret2 != null ) { //optional second output
-			ret2.recomputeNonZeros();
-			ret2.examSparsity();
-		}
-
-		//System.out.println("PMM Seq ("+pm1.isInSparseFormat()+","+pm1.getNumRows()+","+pm1.getNumColumns()+","+pm1.getNonZeros()+")x" +
-		// "("+m2.isInSparseFormat()+","+m2.getNumRows()+","+m2.getNumColumns()+","+m2.getNonZeros()+") in "+time.stop());
-	}	
-
-	public static void matrixMultPermute( MatrixBlock pm1, MatrixBlock m2, MatrixBlock ret1, MatrixBlock ret2, int k) {
-		//check inputs / outputs
-		if( pm1.isEmptyBlock(false) || m2.isEmptyBlock(false) )
-			return;
-
-		//check no parallelization benefit (fallback to sequential)
-		if (pm1.rlen == 1) {
-			matrixMultPermute(pm1, m2, ret1, ret2);
-			return;
-		}
-	
-		//Timing time = new Timing(true);
-		
-		//allocate first output block (second allocated if needed)
-		ret1.sparse = false;	  // no need to check isThreadSafe
-		ret1.allocateDenseBlock();
-		
-		try
-		{
-			ExecutorService pool = CommonThreadPool.get(k);
-			ArrayList<MatrixMultPermuteTask> tasks = new ArrayList<>();
-			int blklen = (int)(Math.ceil((double)pm1.rlen/k));
-			for( int i=0; i<k & i*blklen<pm1.rlen; i++ )
-				tasks.add(new MatrixMultPermuteTask(pm1, m2, ret1, ret2, i*blklen, Math.min((i+1)*blklen, pm1.rlen)));
-			pool.invokeAll(tasks);
-			pool.shutdown();
-		} 
-		catch (InterruptedException e) {
-			throw new DMLRuntimeException(e);
-		}
-		
-		//post-processing
-		ret1.recomputeNonZeros();
-		ret1.examSparsity();
-		if( ret2 != null ) { //optional second output
-			ret2.recomputeNonZeros();
-			ret2.examSparsity();
-		}
-		
-		// System.out.println("PMM Par ("+pm1.isInSparseFormat()+","+pm1.getNumRows()+","+pm1.getNumColumns()+","+pm1.getNonZeros()+")x" +
-		// "("+m2.isInSparseFormat()+","+m2.getNumRows()+","+m2.getNumColumns()+","+m2.getNonZeros()+") in "+time.stop());
-	}	
-
-	public static void matrixMultWSLoss(MatrixBlock mX, MatrixBlock mU, MatrixBlock mV, MatrixBlock mW, MatrixBlock ret, WeightsType wt) {
-		//check for empty result
-		if( wt==WeightsType.POST && mW.isEmptyBlock(false) 
-			|| wt==WeightsType.POST_NZ && mX.isEmptyBlock(false) ) {
-			ret.examSparsity(); //turn empty dense into sparse
-			return; 
-		}
-
-		//Timing time = new Timing(true);
-
-		//core weighted square sum mm computation
-		if( !mX.sparse && !mU.sparse && !mV.sparse && (mW==null || !mW.sparse) 
-			&& !mX.isEmptyBlock() && !mU.isEmptyBlock() && !mV.isEmptyBlock() && (mW==null || !mW.isEmptyBlock()))
-			matrixMultWSLossDense(mX, mU, mV, mW, ret, wt, 0, mX.rlen);
-		else if( mX.sparse && !mU.sparse && !mV.sparse && (mW==null || mW.sparse)
-				&& !mX.isEmptyBlock() && !mU.isEmptyBlock() && !mV.isEmptyBlock() && (mW==null || !mW.isEmptyBlock()))
-			matrixMultWSLossSparseDense(mX, mU, mV, mW, ret, wt, 0, mX.rlen);
-		else
-			matrixMultWSLossGeneric(mX, mU, mV, mW, ret, wt, 0, mX.rlen);
-		
-		//add correction for sparse wsloss w/o weight
-		if( mX.sparse && wt==WeightsType.NONE )
-			addMatrixMultWSLossNoWeightCorrection(mU, mV, ret, 1);
-		
-		//System.out.println("MMWSLoss " +wt.toString()+ " ("+mX.isInSparseFormat()+","+mX.getNumRows()+","+mX.getNumColumns()+","+mX.getNonZeros()+")x" +
-		// "("+mV.isInSparseFormat()+","+mV.getNumRows()+","+mV.getNumColumns()+","+mV.getNonZeros()+") in "+time.stop());
-	}
-
-	public static void matrixMultWSLoss(MatrixBlock mX, MatrixBlock mU, MatrixBlock mV, MatrixBlock mW, MatrixBlock ret, WeightsType wt, int k) {
-		//check for empty result
-		if( wt==WeightsType.POST && mW.isEmptyBlock(false)
-			|| wt==WeightsType.POST_NZ && mX.isEmptyBlock(false) ) {
-			ret.examSparsity(); //turn empty dense into sparse
-			return;
-		}
-		
-		//check no parallelization benefit (fallback to sequential)
-		//no need to check isThreadSafe (scalar output)
-		if( mX.rlen == 1 ) {
-			matrixMultWSLoss(mX, mU, mV, mW, ret, wt);
-			return;
-		}
-		
-		//Timing time = new Timing(true);
-		
-		try {
-			ExecutorService pool = CommonThreadPool.get(k);
-			ArrayList<MatrixMultWSLossTask> tasks = new ArrayList<>();
-			int blklen = (int)(Math.ceil((double)mX.rlen/k));
-			for( int i=0; i<k & i*blklen<mX.rlen; i++ )
-				tasks.add(new MatrixMultWSLossTask(mX, mU, mV, mW, wt, i*blklen, Math.min((i+1)*blklen, mX.rlen)));
-			List<Future<Double>> taskret = pool.invokeAll(tasks);
-			pool.shutdown();
-			//aggregate partial results
-			sumScalarResults(taskret, ret);
-		} 
-		catch( Exception e ) {
-			throw new DMLRuntimeException(e);
-		}
-
-		//add correction for sparse wsloss w/o weight
-		if( mX.sparse && wt==WeightsType.NONE )
-			addMatrixMultWSLossNoWeightCorrection(mU, mV, ret, k);
-		
-		//System.out.println("MMWSLoss "+wt.toString()+" k="+k+" ("+mX.isInSparseFormat()+","+mX.getNumRows()+","+mX.getNumColumns()+","+mX.getNonZeros()+")x" +
-		//                   "("+mV.isInSparseFormat()+","+mV.getNumRows()+","+mV.getNumColumns()+","+mV.getNonZeros()+") in "+time.stop());
-	}
-
-	public static void matrixMultWSigmoid(MatrixBlock mW, MatrixBlock mU, MatrixBlock mV, MatrixBlock ret, WSigmoidType wt) {
-		//check for empty result
-		if( mW.isEmptyBlock(false) ) {
-			ret.examSparsity(); //turn empty dense into sparse
-			return; 
-		}
-
-		//Timing time = new Timing(true);
-
-		//pre-processing
-		ret.sparse = mW.sparse;
-		ret.allocateBlock();
-		
-		//core weighted square sum mm computation
-		boolean allDense = !mW.sparse && !mU.sparse && !mV.sparse
-			&& !mU.isEmptyBlock() && !mV.isEmptyBlock();
-		if( NativeHelper.isNativeLibraryLoaded() && allDense && (mW.rlen == 1 || mW.clen == 1) 
-			&& !LibMatrixNative.isMatMultMemoryBound(mU.rlen, mU.clen, mV.rlen)
-			&& mW.getDenseBlock().isContiguous() && mU.getDenseBlock().isContiguous() && mV.getDenseBlock().isContiguous() )
-			matrixMultWSigmoidDenseNative(mW, mU, mV, ret, wt);
-		else if( allDense )
-			matrixMultWSigmoidDense(mW, mU, mV, ret, wt, 0, mW.rlen);
-		else if( mW.sparse && !mU.sparse && !mV.sparse && !mU.isEmptyBlock() && !mV.isEmptyBlock())
-			matrixMultWSigmoidSparseDense(mW, mU, mV, ret, wt, 0, mW.rlen);
-		else
-			matrixMultWSigmoidGeneric(mW, mU, mV, ret, wt, 0, mW.rlen);
-		
-		//post-processing
-		ret.recomputeNonZeros();
-		ret.examSparsity();
-		
-		//System.out.println("MMWSig "+wt.toString()+" ("+mW.isInSparseFormat()+","+mW.getNumRows()+","+mW.getNumColumns()+","+mW.getNonZeros()+")x" +
-		//                 "("+mV.isInSparseFormat()+","+mV.getNumRows()+","+mV.getNumColumns()+","+mV.getNonZeros()+") in "+time.stop());
-	}
-
-	public static void matrixMultWSigmoid(MatrixBlock mW, MatrixBlock mU, MatrixBlock mV, MatrixBlock ret, WSigmoidType wt, int k) {
-		//check for empty result
-		if( mW.isEmptyBlock(false) ) {
-			ret.examSparsity(); //turn empty dense into sparse
-			return; 
-		}
-
-		//check no parallelization benefit (fallback to sequential)
-		if (mW.rlen == 1 || !MatrixBlock.isThreadSafe(mW.sparse)) {
-			matrixMultWSigmoid(mW, mU, mV, ret, wt);
-			return;
-		}
-		
-		//Timing time = new Timing(true);
-
-		//pre-processing
-		ret.sparse = mW.sparse;
-		ret.allocateBlock();
-		
-		try 
-		{
-			ExecutorService pool = CommonThreadPool.get(k);
-			ArrayList<MatrixMultWSigmoidTask> tasks = new ArrayList<>();
-			int blklen = (int)(Math.ceil((double)mW.rlen/k));
-			for( int i=0; i<k & i*blklen<mW.rlen; i++ )
-				tasks.add(new MatrixMultWSigmoidTask(mW, mU, mV, ret, wt, i*blklen, Math.min((i+1)*blklen, mW.rlen)));
-			//execute tasks
-			List<Future<Long>> taskret = pool.invokeAll(tasks);
-			pool.shutdown();
-			//aggregate partial nnz and check for errors
-			ret.nonZeros = 0; //reset after execute
-			for( Future<Long> task : taskret )
-				ret.nonZeros += task.get();
-		} 
-		catch (Exception e) {
-			throw new DMLRuntimeException(e);
-		}
-
-		//post-processing (nnz maintained in parallel)
-		ret.examSparsity();
-
-		//System.out.println("MMWSig "+wt.toString()+" k="+k+" ("+mW.isInSparseFormat()+","+mW.getNumRows()+","+mW.getNumColumns()+","+mW.getNonZeros()+")x" +
-		//                   "("+mV.isInSparseFormat()+","+mV.getNumRows()+","+mV.getNumColumns()+","+mV.getNonZeros()+") in "+time.stop() + ".");
-	}
-	
-	/**
-	 * NOTE: This operation has limited NaN support, which is acceptable because all our sparse-safe operations
-	 * have only limited NaN support. If this is not intended behavior, please disable the rewrite. In detail, 
-	 * this operator will produce for W/(U%*%t(V)) a zero intermediate for each zero in W (even if UVij is zero 
-	 * which would give 0/0=NaN) but INF/-INF for non-zero entries in V where the corresponding cell in (Y%*%X) 
-	 * is zero.
-	 * 
-	 * @param mW matrix W
-	 * @param mU matrix U
-	 * @param mV matrix V
-	 * @param mX matrix X
-	 * @param ret result type
-	 * @param wt weighted divide matrix multiplication type
-	 */
-	public static void matrixMultWDivMM(MatrixBlock mW, MatrixBlock mU, MatrixBlock mV, MatrixBlock mX, MatrixBlock ret, WDivMMType wt) {
-		//check for empty result 
-		if(   mW.isEmptyBlock(false) 
-		   || (wt.isLeft() && mU.isEmptyBlock(false))
-		   || (wt.isRight() && mV.isEmptyBlock(false))
-		   || (wt.isBasic() && mW.isEmptyBlock(false)))  {
-			ret.examSparsity(); //turn empty dense into sparse
-			return; 
-		}
-
-		//Timing time = new Timing(true);
-
-		//pre-processing
-		ret.sparse = wt.isBasic()?mW.sparse:false;
-		ret.allocateBlock();
-		
-		//core weighted div mm computation
-		boolean scalarX = wt.hasScalar();
-		if( !mW.sparse && !mU.sparse && !mV.sparse && (mX==null || !mX.sparse || scalarX) && !mU.isEmptyBlock() && !mV.isEmptyBlock() )
-			matrixMultWDivMMDense(mW, mU, mV, mX, ret, wt, 0, mW.rlen, 0, mW.clen);
-		else if( mW.sparse && !mU.sparse && !mV.sparse && (mX==null || mX.sparse || scalarX) && !mU.isEmptyBlock() && !mV.isEmptyBlock())
-			matrixMultWDivMMSparseDense(mW, mU, mV, mX, ret, wt, 0, mW.rlen, 0, mW.clen);
-		else
-			matrixMultWDivMMGeneric(mW, mU, mV, mX, ret, wt, 0, mW.rlen, 0, mW.clen);
-		
-		//post-processing
-		ret.recomputeNonZeros();
-		ret.examSparsity();
-		
-		//System.out.println("MMWDiv "+wt.toString()+" ("+mW.isInSparseFormat()+","+mW.getNumRows()+","+mW.getNumColumns()+","+mW.getNonZeros()+")x" +
-		//                 "("+mV.isInSparseFormat()+","+mV.getNumRows()+","+mV.getNumColumns()+","+mV.getNonZeros()+") in "+time.stop());
-	}
-	
-	/**
-	 * NOTE: This operation has limited NaN support, which is acceptable because all our sparse-safe operations
-	 * have only limited NaN support. If this is not intended behavior, please disable the rewrite. In detail, 
-	 * this operator will produce for W/(U%*%t(V)) a zero intermediate for each zero in W (even if UVij is zero 
-	 * which would give 0/0=NaN) but INF/-INF for non-zero entries in V where the corresponding cell in (Y%*%X) 
-	 * is zero.
-	 * 
-	 * @param mW matrix W
-	 * @param mU matrix U
-	 * @param mV matrix V
-	 * @param mX matrix X
-	 * @param ret result matrix
-	 * @param wt weighted divide matrix multiplication type
-	 * @param k maximum parallelism
-	 */
-	public static void matrixMultWDivMM(MatrixBlock mW, MatrixBlock mU, MatrixBlock mV, MatrixBlock mX, MatrixBlock ret, WDivMMType wt, int k) {
-		//check for empty result 
-		if(   mW.isEmptyBlock(false) 
-		   || (wt.isLeft() && mU.isEmptyBlock(false))
-		   || (wt.isRight() && mV.isEmptyBlock(false)) 
-		   || (wt.isBasic() && mW.isEmptyBlock(false)))  {
-			ret.examSparsity(); //turn empty dense into sparse
-			return; 
-		}
-		
-		//Timing time = new Timing(true);
-
-		//pre-processing
-		ret.sparse = wt.isBasic()?mW.sparse:false;
-		ret.allocateBlock();
-
-		if (!ret.isThreadSafe()){
-			matrixMultWDivMM(mW, mU, mV, mX, ret, wt);
-			return;
-		}
-		
-		try 
-		{
-			ExecutorService pool = CommonThreadPool.get(k);
-			ArrayList<MatrixMultWDivTask> tasks = new ArrayList<>();
-			//create tasks (for wdivmm-left, parallelization over columns;
-			//for wdivmm-right, parallelization over rows; both ensure disjoint results)
-			if( wt.isLeft() ) {
-				int blklen = (int)(Math.ceil((double)mW.clen/k));
-				for( int j=0; j<k & j*blklen<mW.clen; j++ )
-					tasks.add(new MatrixMultWDivTask(mW, mU, mV, mX, ret, wt, 0, mW.rlen, j*blklen, Math.min((j+1)*blklen, mW.clen)));
-			}
-			else { //basic/right
-				int blklen = (int)(Math.ceil((double)mW.rlen/k));
-				for( int i=0; i<k & i*blklen<mW.rlen; i++ )
-					tasks.add(new MatrixMultWDivTask(mW, mU, mV, mX, ret, wt, i*blklen, Math.min((i+1)*blklen, mW.rlen), 0, mW.clen));
-			}
-			//execute tasks
-			List<Future<Long>> taskret = pool.invokeAll(tasks);
-			pool.shutdown();
-			//aggregate partial nnz and check for errors
-			ret.nonZeros = 0;  //reset after execute
-			for( Future<Long> task : taskret )
-				ret.nonZeros += task.get();
-		} 
-		catch (Exception e) {
-			throw new DMLRuntimeException(e);
-		} 
-
-		//post-processing
-		ret.examSparsity();
-		
-		//System.out.println("MMWDiv "+wt.toString()+" k="+k+" ("+mW.isInSparseFormat()+","+mW.getNumRows()+","+mW.getNumColumns()+","+mW.getNonZeros()+")x" +
-		//                "("+mV.isInSparseFormat()+","+mV.getNumRows()+","+mV.getNumColumns()+","+mV.getNonZeros()+") in "+time.stop());
-	}	
-
-	public static void matrixMultWCeMM(MatrixBlock mW, MatrixBlock mU, MatrixBlock mV, double eps, MatrixBlock ret, WCeMMType wt) {
-		//check for empty result 
-		if( mW.isEmptyBlock(false) )  {
-			ret.examSparsity(); //turn empty dense into sparse
-			return; 
-		}
-
-		//Timing time = new Timing(true);
-
-		//pre-processing
-		ret.sparse = false;
-		ret.allocateDenseBlock();
-		
-		//core weighted cross entropy mm computation
-		if( !mW.sparse && !mU.sparse && !mV.sparse && !mU.isEmptyBlock() && !mV.isEmptyBlock() )
-			matrixMultWCeMMDense(mW, mU, mV, eps, ret, wt, 0, mW.rlen);
-		else if( mW.sparse && !mU.sparse && !mV.sparse && !mU.isEmptyBlock() && !mV.isEmptyBlock())
-			matrixMultWCeMMSparseDense(mW, mU, mV, eps, ret, wt, 0, mW.rlen);
-		else
-			matrixMultWCeMMGeneric(mW, mU, mV, eps, ret, wt, 0, mW.rlen);
-		
-		//System.out.println("MMWCe "+wt.toString()+" ("+mW.isInSparseFormat()+","+mW.getNumRows()+","+mW.getNumColumns()+","+mW.getNonZeros()+")x" +
-		//                 "("+mV.isInSparseFormat()+","+mV.getNumRows()+","+mV.getNumColumns()+","+mV.getNonZeros()+") in "+time.stop());
-	}
-
-	public static void matrixMultWCeMM(MatrixBlock mW, MatrixBlock mU, MatrixBlock mV, double eps, MatrixBlock ret, WCeMMType wt, int k) {
-		//check for empty result 
-		if( mW.isEmptyBlock(false) )  {
-			ret.examSparsity(); //turn empty dense into sparse
-			return; 
-		}
-
-		//Timing time = new Timing(true);
-
-		//pre-processing (no need to check isThreadSafe)
-		ret.sparse = false;
-		ret.allocateDenseBlock();
-		
-		try 
-		{
-			ExecutorService pool = CommonThreadPool.get(k);
-			ArrayList<MatrixMultWCeTask> tasks = new ArrayList<>();
-			int blklen = (int)(Math.ceil((double)mW.rlen/k));
-			for( int i=0; i<k & i*blklen<mW.rlen; i++ )
-				tasks.add(new MatrixMultWCeTask(mW, mU, mV, eps, wt, i*blklen, Math.min((i+1)*blklen, mW.rlen)));
-			List<Future<Double>> taskret = pool.invokeAll(tasks);
-			pool.shutdown();
-			//aggregate partial results
-			sumScalarResults(taskret, ret);
-		} 
-		catch( Exception e ) {
-			throw new DMLRuntimeException(e);
-		}
-		
-		//System.out.println("MMWCe "+wt.toString()+" k="+k+" ("+mW.isInSparseFormat()+","+mW.getNumRows()+","+mW.getNumColumns()+","+mW.getNonZeros()+")x" +
-		//                 "("+mV.isInSparseFormat()+","+mV.getNumRows()+","+mV.getNumColumns()+","+mV.getNonZeros()+") in "+time.stop());
-	}
-
-	public static void matrixMultWuMM(MatrixBlock mW, MatrixBlock mU, MatrixBlock mV, MatrixBlock ret, WUMMType wt, ValueFunction fn) {
-		//check for empty result
-		if( mW.isEmptyBlock(false) ) {
-			ret.examSparsity(); //turn empty dense into sparse
-			return; 
-		}
-
-		//Timing time = new Timing(true);
-
-		//pre-processing
-		ret.sparse = mW.sparse;
-		ret.allocateBlock();
-		
-		//core weighted square sum mm computation
-		if( !mW.sparse && !mU.sparse && !mV.sparse && !mU.isEmptyBlock() && !mV.isEmptyBlock() )
-			matrixMultWuMMDense(mW, mU, mV, ret, wt, fn, 0, mW.rlen);
-		else if( mW.sparse && !mU.sparse && !mV.sparse && !mU.isEmptyBlock() && !mV.isEmptyBlock())
-			matrixMultWuMMSparseDense(mW, mU, mV, ret, wt, fn, 0, mW.rlen);
-		else
-			matrixMultWuMMGeneric(mW, mU, mV, ret, wt, fn, 0, mW.rlen);
-		
-		//post-processing
-		ret.recomputeNonZeros();
-		ret.examSparsity();
-		
-		//System.out.println("MMWu "+wt.toString()+" ("+mW.isInSparseFormat()+","+mW.getNumRows()+","+mW.getNumColumns()+","+mW.getNonZeros()+")x" +
-		//                 "("+mV.isInSparseFormat()+","+mV.getNumRows()+","+mV.getNumColumns()+","+mV.getNonZeros()+") in "+time.stop());
-	}
-
-	public static void matrixMultWuMM(MatrixBlock mW, MatrixBlock mU, MatrixBlock mV, MatrixBlock ret, WUMMType wt, ValueFunction fn, int k) {
-		//check for empty result
-		if( mW.isEmptyBlock(false) ) {
-			ret.examSparsity(); //turn empty dense into sparse
-			return; 
-		}
-		
-		//check no parallelization benefit (fallback to sequential)
-		if (mW.rlen == 1 || !MatrixBlock.isThreadSafe(mW.sparse)) {
-			matrixMultWuMM(mW, mU, mV, ret, wt, fn);
-			return;
-		}
-		
-		//Timing time = new Timing(true);
-
-		//pre-processing
-		ret.sparse = mW.sparse;
-		ret.allocateBlock();
-		
-		try 
-		{
-			ExecutorService pool = CommonThreadPool.get(k);
-			ArrayList<MatrixMultWuTask> tasks = new ArrayList<>();
-			int blklen = (int)(Math.ceil((double)mW.rlen/k));
-			for( int i=0; i<k & i*blklen<mW.rlen; i++ )
-				tasks.add(new MatrixMultWuTask(mW, mU, mV, ret, wt, fn, i*blklen, Math.min((i+1)*blklen, mW.rlen)));
-			//execute tasks
-			List<Future<Long>> taskret = pool.invokeAll(tasks);
-			pool.shutdown();
-			//aggregate partial nnz and check for errors
-			ret.nonZeros = 0; //reset after execute
-			for( Future<Long> task : taskret )
-				ret.nonZeros += task.get();
-		} 
-		catch (Exception e) {
-			throw new DMLRuntimeException(e);
-		}
-
-		//post-processing (nnz maintained in parallel)
-		ret.examSparsity();
-
-		//System.out.println("MMWu "+wt.toString()+" k="+k+" ("+mW.isInSparseFormat()+","+mW.getNumRows()+","+mW.getNumColumns()+","+mW.getNonZeros()+")x" +
-		//                   "("+mV.isInSparseFormat()+","+mV.getNumRows()+","+mV.getNumColumns()+","+mV.getNonZeros()+") in "+time.stop() + ".");
-	}
-	
-	//////////////////////////////////////////
-	// optimized matrix mult implementation //
-	//////////////////////////////////////////
-
-	private static void matrixMultDenseDense(MatrixBlock m1, MatrixBlock m2, MatrixBlock ret, boolean tm2, boolean pm2, int rl, int ru, int cl, int cu) {
-		DenseBlock a = m1.getDenseBlock();
-		DenseBlock b = m2.getDenseBlock();
-		DenseBlock c = ret.getDenseBlock();
-		final int m = m1.rlen;
-		final int n = m2.clen;
-		final int cd = m1.clen;
-		
-		if( LOW_LEVEL_OPTIMIZATION ) {
-			if( m==1 && n==1 ) {            //DOT PRODUCT
-				double[] avals = a.valuesAt(0);
-				double[] bvals = b.valuesAt(0);
-				c.set(0, 0, dotProduct(avals, bvals, cd));
-			}
-			else if( n>1 && cd == 1 ) {     //OUTER PRODUCT
-				double[] avals = a.valuesAt(0);
-				double[] bvals = b.valuesAt(0);
-				for( int i=rl; i < ru; i++) {
-					double[] cvals = c.values(i);
-					int cix = c.pos(i);
-					if( avals[i] == 1 )
-						System.arraycopy(bvals, 0, cvals, cix, n);
-					else if( avals[i] != 0 )
-						vectMultiplyWrite(avals[i], bvals, cvals, 0, cix, n);
-					else
-						Arrays.fill(cvals, cix, cix+n, 0);
-				}
-			}
-			else if( n==1 && cd == 1 ) {    //VECTOR-SCALAR
-				double[] avals = a.valuesAt(0);
-				double[] cvals = c.valuesAt(0);
-				vectMultiplyWrite(b.get(0,0), avals, cvals, rl, rl, ru-rl);
-			}
-			else if( n==1 && cd<=2*1024 ) { //MATRIX-VECTOR (short rhs)
-				matrixMultDenseDenseMVShortRHS(a, b, c, cd, rl, ru);
-			}
-			else if( n==1 ) {               //MATRIX-VECTOR (tall rhs)
-				matrixMultDenseDenseMVTallRHS(a, b, c, cd, rl, ru);
-			}
-			else if( pm2 && m==1 ) {        //VECTOR-MATRIX
-				matrixMultDenseDenseVM(a, b, c, n, cd, rl, ru);
-			}
-			else if( pm2 && m<=16 ) {       //MATRIX-MATRIX (short lhs) 
-				matrixMultDenseDenseMMShortLHS(a, b, c, m, n, cd, rl, ru);
-			}
-			else if( tm2 ) {                //MATRIX-MATRIX (skinny rhs)
-				matrixMultDenseDenseMMSkinnyRHS(a, b, c, m2.rlen, cd, rl, ru);
-			}
-			else {                          //MATRIX-MATRIX
-				matrixMultDenseDenseMM(a, b, c, n, cd, rl, ru, cl, cu);
-			}
-		}
-		else {
-			for( int i = rl; i < ru; i++) {
-				double[] avals = a.values(i);
-				double[] cvals = c.values(i);
-				int aix = a.pos(i), cix = c.pos(i);
-				for( int k = 0; k < cd; k++) {
-					double val = avals[aix + k];
-					if( val != 0 ) {
-						double[] bvals = b.values(k);
-						int bix = b.pos(k);
-						for( int j = 0; j < n; j++) 
-							cvals[cix+j] += val * bvals[bix+j];
-					}
-				}
-			}
-		}
-	}
-	
-	private static void matrixMultDenseDenseMVShortRHS(DenseBlock a, DenseBlock b, DenseBlock c, int cd, int rl, int ru) {
-		double[] bvals = b.valuesAt(0);
-		double[] cvals = c.valuesAt(0);
-		for( int i=rl; i < ru; i++ )
-			cvals[i] = dotProduct(a.values(i), bvals, a.pos(i), 0, cd);
-	}
-	
-	private static void matrixMultDenseDenseMVTallRHS(DenseBlock a, DenseBlock b, DenseBlock c, int cd, int rl, int ru) {
-		final int blocksizeI = 32;
-		final int blocksizeK = 2*1024; //16KB vector blocks (L1)
-		double[] bvals = b.valuesAt(0);
-		double[] cvals = c.valuesAt(0);
-		for( int bi=rl; bi<ru; bi+=blocksizeI ) {
-			int bimin = Math.min(bi+blocksizeI, ru);
-			for( int bk=0; bk<cd; bk+=blocksizeK ) {
-				int bkmin = Math.min(bk+blocksizeK, cd);
-				for( int i=bi; i<bimin; i++) 
-					cvals[i] += dotProduct(a.values(i), bvals, a.pos(i,bk), bk, bkmin-bk);
-			}
-		}
-	}
-	
-	private static void matrixMultDenseDenseVM(DenseBlock a, DenseBlock b, DenseBlock c, int n, int cd, int rl, int ru) {
-		double[] avals = a.valuesAt(0); //vector
-		double[] cvals = c.valuesAt(0); //vector
-		
-		//parallelization over rows in rhs matrix
-		//rest not aligned to blocks of 2 rows
-		final int kn = b.isContiguous() ? rl+(ru-rl)%2 : ru;
-		for( int k = rl; k < kn; k++ )
-			if( avals[k] != 0 )
-				vectMultiplyAdd(avals[k], b.values(k), cvals, b.pos(k), 0, n);
-		
-		//compute blocks of 2 rows (2 instead of 4 for small n<64)
-		double[] bvals = b.valuesAt(0); //only for special case
-		for( int k=kn, bix=kn*n; k<ru; k+=2, bix+=2*n ){
-			if( avals[k] != 0 && avals[k+1] != 0  )
-				vectMultiplyAdd2(avals[k], avals[k+1], bvals, cvals, bix, bix+n, 0, n);
-			else if( avals[k] != 0 )
-				vectMultiplyAdd(avals[k], bvals, cvals, bix, 0, n);
-			else if( avals[k+1] != 0 )
-				vectMultiplyAdd(avals[k+1], bvals, cvals, bix+n, 0, n);
-		}
-	}
-	
-	private static void matrixMultDenseDenseMMShortLHS(DenseBlock a, DenseBlock b, DenseBlock c, int m, int n, int cd, int rl, int ru) {
-		//cache-conscious parallelization over rows in rhs matrix
-		final int kn = (ru-rl)%4;
-		
-		//rest not aligned to blocks of 2 rows
-		for( int i=0; i<m; i++ ) {
-			double[] avals = a.values(i), cvals = c.values(i);
-			int aix = a.pos(i), cix = c.pos(i);
-			for( int k=rl; k<rl+kn; k++ )
-				if( avals[aix+k] != 0 )
-					vectMultiplyAdd(avals[aix+k], b.values(k), cvals, b.pos(k), cix, n);
-		}
-		
-		final int blocksizeK = 48;
-		final int blocksizeJ = 1024;
-		
-		//blocked execution
-		for( int bk = rl+kn; bk < ru; bk+=blocksizeK ) {
-			int bkmin = Math.min(ru, bk+blocksizeK);
-			for( int bj = 0; bj < n; bj+=blocksizeJ ) {
-				//compute blocks of 4 rows in rhs w/ IKJ
-				int bjlen = Math.min(n, bj+blocksizeJ)-bj;
-				for( int i=0; i<m; i++ ) {
-					double[] avals = a.values(i), cvals = c.values(i);
-					int aix = a.pos(i), cix = c.pos(i, bj);
-					if( b.isContiguous(bk, bkmin-1) ) {
-						double[] bvals = b.values(bk);
-						for( int k=bk, bix=b.pos(bk, bj); k<bkmin; k+=4, bix+=4*n )
-							vectMultiplyAdd4(avals[aix+k], avals[aix+k+1], avals[aix+k+2], avals[aix+k+3],
-								bvals, cvals, bix, bix+n, bix+2*n, bix+3*n, cix, bjlen);
-					}
-					else {
-						for( int k=rl; k<rl+kn; k++ )
-							if( avals[aix+k] != 0 )
-								vectMultiplyAdd(avals[aix+k], b.values(k), cvals, b.pos(k), cix, n);
-					}
-				}
-			}
-		}
-	}
-	
-	private static void matrixMultDenseDenseMMSkinnyRHS(DenseBlock a, DenseBlock b, DenseBlock c, int n2, int cd, int rl, int ru) {
-		//note: prepared rhs input via transpose for: m > n && cd > 64 && n < 64
-		//however, explicit flag required since dimension change m2
-		for( int i=rl; i < ru; i++ ) {
-			double[] avals = a.values(i), cvals = c.values(i);
-			int aix = a.pos(i), cix = c.pos(i);
-			for( int j=0; j<n2; j++ )
-				cvals[cix+j] = dotProduct(avals, b.values(j), aix, b.pos(j), cd);
-		}
-	}
-	
-	//note: public for use by codegen for consistency
-	public static void matrixMultDenseDenseMM(DenseBlock a, DenseBlock b, DenseBlock c, int n, int cd, int rl, int ru, int cl, int cu) {
-		//1) Unrolled inner loop (for better instruction-level parallelism)
-		//2) Blocked execution (for less cache trashing in parallel exec) 
-		//3) Asymmetric block sizes (for less misses in inner loop, yet blocks in L1/L2)
-		
-		final int blocksizeI = 32; //64//256KB c block (typical L2 size per core), 32KB a block 
-		final int blocksizeK = 24; //64//256KB b block (typical L2 size per core), used while read 512B of a / read/write 4KB of c 
-		final int blocksizeJ = 1024; //512//4KB (typical main-memory page size), for scan 
-
-		//temporary arrays (nnz a, b index)
-		double[] ta = new double[ blocksizeK ];
-		int[]  tbi  = new int[ blocksizeK ];
-		
-		//blocked execution
-		for( int bi = rl; bi < ru; bi+=blocksizeI )
-			for( int bk = 0, bimin = Math.min(ru, bi+blocksizeI); bk < cd; bk+=blocksizeK ) 
-				for( int bj = cl, bkmin = Math.min(cd, bk+blocksizeK); bj < cu; bj+=blocksizeJ ) 
-				{
-					int bklen = bkmin-bk;
-					int bjlen = Math.min(cu, bj+blocksizeJ)-bj;
-					
-					//core sub block matrix multiplication
-					for( int i = bi; i < bimin; i++) {
-						double[] avals = a.values(i), cvals = c.values(i);
-						int aixi = a.pos(i, bk), cixj = c.pos(i, bj);
-						
-						if( b.isContiguous(bk, bkmin-1) ) {
-							double[] bvals = b.values(bk);
-							int bkpos = b.pos(bk, bj);
-							
-							//determine nnz of a (for sparsity-aware skipping of rows)
-							int knnz = copyNonZeroElements(avals, aixi, bkpos, n, ta, tbi, bklen);
-							
-							//rest not aligned to blocks of 4 rows
-							final int bn = knnz % 4;
-							switch( bn ){
-								case 1: vectMultiplyAdd(ta[0], bvals, cvals, tbi[0], cixj, bjlen); break;
-								case 2: vectMultiplyAdd2(ta[0],ta[1], bvals, cvals, tbi[0], tbi[1], cixj, bjlen); break;
-								case 3: vectMultiplyAdd3(ta[0],ta[1],ta[2], bvals, cvals, tbi[0], tbi[1],tbi[2], cixj, bjlen); break;
-							}
-							
-							//compute blocks of 4 rows (core inner loop)
-							for( int k = bn; k<knnz; k+=4 ){
-								vectMultiplyAdd4( ta[k], ta[k+1], ta[k+2], ta[k+3], bvals, cvals, 
-									tbi[k], tbi[k+1], tbi[k+2], tbi[k+3], cixj, bjlen );
-							}
-						}
-						else {
-							for( int k = bk; k<bkmin; k++ ) {
-								if( avals[k] != 0 )
-									vectMultiplyAdd( avals[k], b.values(k),
-										cvals, b.pos(k, bj), cixj, bjlen );
-							}
-						}
-					}
-				}
-	}
-
-	private static void matrixMultDenseSparse(MatrixBlock m1, MatrixBlock m2, MatrixBlock ret, boolean pm2, int rl, int ru) {
-		DenseBlock a = m1.getDenseBlock();
-		DenseBlock c = ret.getDenseBlock();
-		int m = m1.rlen;
-		int cd = m1.clen;
-		
-		// MATRIX-MATRIX (VV, MV not applicable here because V always dense)
-		if( LOW_LEVEL_OPTIMIZATION )
-		{
-			SparseBlock b = m2.sparseBlock;
-			
-			if( pm2 && m==1 ) {        //VECTOR-MATRIX
-				//parallelization over rows in rhs matrix
-				double[] avals = a.valuesAt(0); //vector
-				double[] cvals = c.valuesAt(0); //vector
-				for( int k=rl; k<ru; k++ )
-					if( avals[k] != 0 && !b.isEmpty(k) ) {
-						vectMultiplyAdd(avals[k], b.values(k), cvals,
-							b.indexes(k), b.pos(k), 0, b.size(k));
-					}
-			}
-			else {                     //MATRIX-MATRIX
-				//best effort blocking, without blocking over J because it is 
-				//counter-productive, even with front of current indexes
-				final int blocksizeK = 32;
-				final int blocksizeI = 32;
-				
-				int rl1 = pm2 ? 0 : rl;
-				int ru1 = pm2 ? m : ru;
-				int rl2 = pm2 ? rl : 0;
-				int ru2 = pm2 ? ru : cd;
-				
-				//blocked execution
-				for( int bi = rl1; bi < ru1; bi+=blocksizeI )
-					for( int bk = rl2, bimin = Math.min(ru1, bi+blocksizeI); bk < ru2; bk+=blocksizeK ) {
-						int bkmin = Math.min(ru2, bk+blocksizeK);
-						//core sub block matrix multiplication
-						for(int i = bi; i < bimin; i++) {
-							double[] avals = a.values(i), cvals = c.values(i);
-							int aix = a.pos(i), cix = c.pos(i);
-							for( int k = bk; k < bkmin; k++ ) {
-								double aval = avals[aix+k];
-								if( aval == 0 || b.isEmpty(k) )
-									continue;
-								vectMultiplyAdd(aval, b.values(k), cvals, 
-									b.indexes(k), b.pos(k), cix, b.size(k));
-							}
-						}
-					}
-			}
-		}
-		else {
-			SparseBlock b = m2.sparseBlock;
-			for( int i=rl; i < ru; i++ ) {
-				double[] avals = a.values(i), cvals = c.values(i);
-				int aix = a.pos(i), cix = c.pos(i);
-				for(int k = 0; k < cd; k++ ) {
-					double val = avals [aix];
-					if( val == 0 || b.isEmpty(k) ) continue;
-					int bpos = b.pos(k);
-					int blen = b.size(k);
-					int[] bix = b.indexes(k);
-					double[] bvals = b.values(k);
-					for(int j = bpos; j < bpos+blen; j++)
-						cvals[cix+bix[j]] += val * bvals[j];
-				}
-			}
-		}
-	}
-
-	private static void matrixMultSparseDense(MatrixBlock m1, MatrixBlock m2, MatrixBlock ret, boolean pm2, int rl, int ru) {
-		SparseBlock a = m1.sparseBlock;
-		DenseBlock b = m2.getDenseBlock();
-		DenseBlock c = ret.getDenseBlock();
-		final int m = m1.rlen;
-		final int n = m2.clen;
-		final int cd = m2.rlen;
-		final long xsp = (long)m*cd/m1.nonZeros;
-
-		if( LOW_LEVEL_OPTIMIZATION ) {
-			if( m==1 && n==1 ) {            //DOT PRODUCT
-				if( !a.isEmpty(0) )
-					c.set(0, 0, dotProduct(a.values(0), b.values(0), a.indexes(0), a.pos(0), 0, a.size(0)));
-			}
-			else if( n==1 && cd<=2*1024 ) { //MATRIX-VECTOR (short rhs)
-				matrixMultSparseDenseMVShortRHS(a, b, c, cd, rl, ru);
-			}
-			else if( n==1 ) {               //MATRIX-VECTOR (tall rhs)
-				matrixMultSparseDenseMVTallRHS(a, b, c, cd, xsp, rl, ru);
-			}
-			else if( pm2 && m==1 ) {        //VECTOR-MATRIX
-				matrixMultSparseDenseVM(a, b, c, n, rl, ru);
-			}
-			else if( pm2 && m<=16 ) {       //MATRIX-MATRIX (short lhs) 
-				matrixMultSparseDenseMMShortLHS(a, b, c, n, cd, rl, ru);
-			}
-			else if( n<=64 ) {              //MATRIX-MATRIX (skinny rhs)
-				matrixMultSparseDenseMMSkinnyRHS(a, b, c, n, rl, ru);
-			}
-			else {                          //MATRIX-MATRIX
-				matrixMultSparseDenseMM(a, b, c, n, cd, xsp, rl, ru);
-			}
-		}
-		else {
-			for( int i=rl; i<ru; i++ ) {
-				if( a.isEmpty(i) ) continue; 
-				int apos = a.pos(i);
-				int alen = a.size(i);
-				// int[] aix = a.indexes(i);
-				double[] avals = a.values(i);
-				double[] cvals = c.values(i);
-				int cix = c.pos(i);
-				for(int k = apos; k < apos+alen; k++) {
-					double val = avals[k];
-					double[] bvals = b.values(k);
-					int bix = b.pos(k);
-					for(int j = 0; j < n; j++)
-						cvals[cix+j] += val * bvals[bix+j];
-				}
-			}
-		}
-	}
-	
-	private static void matrixMultSparseDenseMVShortRHS(SparseBlock a, DenseBlock b, DenseBlock c, int cd, int rl, int ru) {
-		double[] bvals = b.valuesAt(0);
-		double[] cvals = c.valuesAt(0);
-		for( int i=rl; i<ru; i++ ) {
-			if( a.isEmpty(i) ) continue;
-			int alen = a.size(i);
-			int apos = a.pos(i);
-			double[] avals = a.values(i);
-			cvals[i] = (alen==cd) ? dotProduct(avals, bvals, apos, 0, cd) :
-				dotProduct(avals, bvals, a.indexes(i), apos, 0, alen);
-		}
-	}
-	
-	private static void matrixMultSparseDenseMVTallRHS(SparseBlock a, DenseBlock b, DenseBlock c, int cd, long xsp, int rl, int ru) {
-		double[] bvals = b.valuesAt(0);
-		double[] cvals = c.valuesAt(0);
-		
-		final int blocksizeI = 512; //8KB curk+cvals in L1
-		final int blocksizeK = (int)Math.max(2048,2048*xsp/32); //~256KB bvals in L2
-		int[] curk = new int[blocksizeI];
-		
-		for( int bi = rl; bi < ru; bi+=blocksizeI ) {
-			Arrays.fill(curk, 0); //reset positions
-			for( int bk=0, bimin = Math.min(ru, bi+blocksizeI); bk<cd; bk+=blocksizeK ) {
-				for( int i=bi, bkmin = bk+blocksizeK; i<bimin; i++) {
-					if( a.isEmpty(i) ) continue;
-					int apos = a.pos(i);
-					int alen = a.size(i);
-					int[] aix = a.indexes(i);
-					double[] avals = a.values(i);
-					int k = curk[i-bi] + apos;
-					for( ; k<apos+alen && aix[k]<bkmin; k++ )
-						cvals[i] += avals[k] * bvals[aix[k]];
-					curk[i-bi] = k - apos;
-				}
-			}
-		}
-	}
-	
-	private static void matrixMultSparseDenseVM(SparseBlock a, DenseBlock b, DenseBlock c, int n, int rl, int ru) {
-		if( a.isEmpty(0) )
-			return;
-		
-		//parallelization over rows in rhs matrix
-		int alen = a.size(0);
-		int[] aix = a.indexes(0);
-		double[] avals = a.values(0);
-		double[] cvals = c.valuesAt(0);
-		int rlix = (rl==0) ? 0 : a.posFIndexGTE(0,rl);
-		rlix = (rlix>=0) ? rlix : alen;
-		
-		if( b.isContiguous() ) {
-			double[] bvals = b.valuesAt(0);
-			for( int k=rlix; k<alen && aix[k]<ru; k++ )
-				if( k+1<alen && aix[k+1]<ru )
-					vectMultiplyAdd2(avals[k], avals[k+1], bvals, cvals, aix[k]*n, aix[++k]*n, 0, n);
-				else
-					vectMultiplyAdd(avals[k], bvals, cvals, aix[k]*n, 0, n);
-		}
-		else {
-			for( int k=rlix; k<alen && aix[k]<ru; k++ )
-				vectMultiplyAdd(avals[k], b.values(aix[k]), cvals, b.pos(aix[k]), 0, n);
-		}
-	}
-	
-	private static void matrixMultSparseDenseMMShortLHS(SparseBlock a, DenseBlock b, DenseBlock c, int n, int cd, int rl, int ru) {
-		int arlen = a.numRows();
-		for( int i=0; i<arlen; i++ ) {
-			if( a.isEmpty(i) ) continue;
-			int apos = a.pos(i);
-			int alen = a.size(i);
-			int[] aix = a.indexes(i);
-			double[] avals = a.values(i);
-			double[] cvals = c.values(i);
-			int cix = c.pos(i);
-			
-			int k1 = (rl==0) ? 0 : a.posFIndexGTE(i, rl);
-			k1 = (k1>=0) ? apos+k1 : apos+alen;
-			int k2 = (ru==cd) ? alen : a.posFIndexGTE(i, ru);
-			k2 = (k2>=0) ? apos+k2 : apos+alen;
-			
-			//note: guard k1 (and thus also k2) against overrun nnz, and guard
-			//contiguous check for k2-1 against underrun of start pos for k1==k2.
-			if( k1<apos+alen && (k1==k2 || b.isContiguous(aix[k1], aix[k2-1])) ) {
-				double[] bvals = b.values(aix[k1]);
-				int base = aix[k1]*n - b.pos(aix[k1]);
-				//rest not aligned to blocks of 4 rows
-				final int bn = (k2-k1) % 4;
-				switch( bn ){
-					case 1: vectMultiplyAdd(avals[k1], bvals, cvals, aix[k1]*n-base, cix, n); break;
-					case 2: vectMultiplyAdd2(avals[k1],avals[k1+1], bvals, cvals, aix[k1]*n-base, aix[k1+1]*n-base, cix, n); break;
-					case 3: vectMultiplyAdd3(avals[k1],avals[k1+1],avals[k1+2], bvals, cvals, aix[k1]*n-base, aix[k1+1]*n-base, aix[k1+2]*n-base, cix, n); break;
-				}
-				
-				//compute blocks of 4 rows (core inner loop)
-				for( int k = k1+bn; k<k2; k+=4 ) {
-					vectMultiplyAdd4( avals[k], avals[k+1], avals[k+2], avals[k+3], bvals, cvals, 
-						aix[k]*n-base, aix[k+1]*n-base, aix[k+2]*n-base, aix[k+3]*n-base, cix, n );
-				}
-			}
-			else {
-				for( int k = k1; k<k2; k++ )
-					vectMultiplyAdd( avals[k], b.values(aix[k]), cvals, b.pos(aix[k]), cix, n );
-			}
-		}
-	}
-	
-	private static void matrixMultSparseDenseMMSkinnyRHS(SparseBlock a, DenseBlock b, DenseBlock c, int n, int rl, int ru) {
-		//no blocking since b and c fit into cache anyway
-		for( int i=rl, cix=rl*n; i<ru; i++, cix+=n ) {
-			if( a.isEmpty(i) ) continue;
-			int apos = a.pos(i);
-			int alen = a.size(i);
-			int[] aix = a.indexes(i);
-			double[] avals = a.values(i);
-			double[] cvals = c.values(i);
-			//rest not aligned to blocks of 4 rows
-			int bn = b.isContiguous() ? alen%4 : alen;
-			for( int k=apos; k<apos+bn; k++ )
-				vectMultiplyAdd(avals[k], b.values(aix[k]), cvals, b.pos(aix[k]), cix, n);
-			//compute blocks of 4 rows (core inner loop)
-			double[] bvals = b.valuesAt(0); //only for contiguous
-			for( int k=apos+bn; k<apos+alen; k+=4 )
-				vectMultiplyAdd4( avals[k], avals[k+1], avals[k+2], avals[k+3], bvals, cvals,
-					aix[k]*n, aix[k+1]*n, aix[k+2]*n, aix[k+3]*n, cix, n );
-		}
-	}
-	
-	private static void matrixMultSparseDenseMM(SparseBlock a, DenseBlock b, DenseBlock c, int n, int cd, long xsp, int rl, int ru) {
-		//blocksizes to fit blocks of B (dense) and several rows of A/C in common L2 cache size, 
-		//while blocking A/C for L1/L2 yet allowing long scans (2 pages) in the inner loop over j
-		//in case of almost ultra-sparse matrices, we cannot ensure the blocking for the rhs and
-		//output - however, in this case it's unlikely that we consume every cache line in the rhs
-		final int blocksizeI = (int) (8L*xsp);
-		final int blocksizeK = (int) (8L*xsp);
-		final int blocksizeJ = 1024; 
-		
-		//temporary array of current sparse positions
-		int[] curk = new int[Math.min(blocksizeI, ru-rl)];
-		
-		//blocked execution over IKJ 
-		for( int bi = rl; bi < ru; bi+=blocksizeI ) {
-			Arrays.fill(curk, 0); //reset positions
-			for( int bk = 0, bimin = Math.min(ru, bi+blocksizeI); bk < cd; bk+=blocksizeK ) {
-				for( int bj = 0, bkmin = Math.min(cd, bk+blocksizeK); bj < n; bj+=blocksizeJ ) {
-					int bjlen = Math.min(n, bj+blocksizeJ)-bj;
-					
-					//core sub block matrix multiplication
-					for( int i=bi; i<bimin; i++ ) {
-						if( a.isEmpty(i) ) continue;
-						int apos = a.pos(i);
-						int alen = a.size(i);
-						int[] aix = a.indexes(i);
-						double[] avals = a.values(i);
-						double[] cvals = c.values(i);
-						int cix = c.pos(i, bj);
-						
-						int k = curk[i-bi] + apos;
-						//rest not aligned to blocks of 4 rows
-						int bn = b.isContiguous() ? alen%4 : alen;
-						for( ; k<apos+bn && aix[k]<bkmin; k++ )
-							vectMultiplyAdd(avals[k], b.values(aix[k]), cvals, b.pos(aix[k],bj), cix, bjlen); 
-						//compute blocks of 4 rows (core inner loop), allowed to exceed bkmin
-						double[] bvals = b.valuesAt(0); //only for contiguous
-						for( ; k<apos+alen && aix[k]<bkmin; k+=4 )
-							vectMultiplyAdd4( avals[k], avals[k+1], avals[k+2], avals[k+3], bvals, cvals, 
-								aix[k]*n+bj, aix[k+1]*n+bj, aix[k+2]*n+bj, aix[k+3]*n+bj, cix, bjlen );
-						//update positions on last bj block
-						if( bj+bjlen==n )
-							curk[i-bi] = k - apos;
-					}
-				}
-			}
-		}
-	}
-
-	private static void matrixMultSparseSparse(MatrixBlock m1, MatrixBlock m2, MatrixBlock ret, boolean pm2, boolean sparse, int rl, int ru) {
-		SparseBlock a = m1.sparseBlock;
-		SparseBlock b = m2.sparseBlock;
-		int m = m1.rlen;
-		int cd = m1.clen;
-		int n = m2.clen;
-		
-		// MATRIX-MATRIX (VV, MV not applicable here because V always dense)
-		if(LOW_LEVEL_OPTIMIZATION) {
-			if( pm2 && m==1 )               //VECTOR-MATRIX
-				matrixMultSparseSparseVM(a, b, ret.getDenseBlock(), rl, ru);
-			else if( sparse )               //SPARSE OUPUT
-				ret.setNonZeros(matrixMultSparseSparseSparseMM(a, b, ret.getSparseBlock(), n, rl, ru));
-			else if( m2.nonZeros < 2048 )   //MATRIX-SMALL MATRIX
-				matrixMultSparseSparseMMSmallRHS(a, b, ret.getDenseBlock(), rl, ru);
-			else                            //MATRIX-MATRIX
-				matrixMultSparseSparseMM(a, b, ret.getDenseBlock(), m, cd, m1.nonZeros, rl, ru);
-		}
-		else {
-			matrixMultSparseSparseMMGeneric(a, b, ret.getDenseBlock(), rl, ru);
-		}
-	}
-	
-	private static void matrixMultSparseSparseVM(SparseBlock a, SparseBlock b, DenseBlock c, int rl, int ru) {
-		//parallelization over rows in rhs matrix
-		if( a.isEmpty(0) )
-			return;
-		
-		int alen = a.size(0);
-		int[] aix = a.indexes(0);
-		double[] avals = a.values(0);
-		double[] cvals = c.valuesAt(0);
-		int rlix = (rl==0) ? 0 : a.posFIndexGTE(0,rl);
-		rlix = (rlix>=0) ? rlix : alen;
-		
-		for( int k=rlix; k<alen && aix[k]<ru; k++ )
-			if( !b.isEmpty(aix[k]) ) {
-				int bpos = b.pos(aix[k]);
-				int blen = b.size(aix[k]);
-				int[] bix = b.indexes(aix[k]);
-				double[] bvals = b.values(aix[k]);
-				vectMultiplyAdd(avals[k], bvals, cvals, bix, bpos, 0, blen);
-			}
-	}
-	
-	private static long matrixMultSparseSparseSparseMM(SparseBlock a, SparseBlock b, SparseBlock c, int n, int rl, int ru) {
-		double[] tmp = new double[n];
-		long nnz = 0;
-		for( int i=rl; i<Math.min(ru, a.numRows()); i++ ) {
-			if( a.isEmpty(i) ) continue;
-			final int apos = a.pos(i);
-			final int alen = a.size(i);
-			int[] aix = a.indexes(i);
-			double[] avals = a.values(i);
-			//compute row output in dense buffer
-			boolean hitNonEmpty = false;
-			for(int k = apos; k < apos+alen; k++) {
-				int aixk = aix[k];
-				if( b.isEmpty(aixk) ) continue;
-				vectMultiplyAdd(avals[k], b.values(aixk), tmp,
-					b.indexes(aixk), b.pos(aixk), 0, b.size(aixk));
-				hitNonEmpty = true;
-			}
-			//copy dense buffer into sparse output (CSR or MCSR)
-			if( hitNonEmpty ) {
-				int rnnz = UtilFunctions.computeNnz(tmp, 0, n);
-				nnz += rnnz;
-				c.allocate(i, rnnz);
-				for(int j=0; j<n; j++)
-					if( tmp[j] != 0 ) {
-						c.append(i, j, tmp[j]);
-						tmp[j] = 0;
-					}
-			}
-		}
-		return nnz;
-	}
-	
-	private static void matrixMultSparseSparseMMSmallRHS(SparseBlock a, SparseBlock b, DenseBlock c, int rl, int ru) {
-		for( int i=rl; i<Math.min(ru, a.numRows()); i++ ) {
-			if( a.isEmpty(i) ) continue;
-			final int apos = a.pos(i);
-			final int alen = a.size(i);
-			int[] aix = a.indexes(i);
-			double[] avals = a.values(i);
-			double[] cvals = c.values(i);
-			int cix = c.pos(i);
-			for(int k = apos; k < apos+alen; k++) {
-				int aixk = aix[k];
-				if( b.isEmpty(aixk) ) continue;
-				vectMultiplyAdd(avals[k], b.values(aixk), cvals,
-					b.indexes(aixk), b.pos(aixk), cix, b.size(aixk));
-			}
-		}
-	}
-	
-	private static void matrixMultSparseSparseMM(SparseBlock a, SparseBlock b, DenseBlock c, int m, int cd, long nnz1, int rl, int ru) {
-		//block sizes for best-effort blocking w/ sufficient row reuse in B yet small overhead
-		final int blocksizeI = 32;
-		final int blocksizeK = Math.max(32,
-			UtilFunctions.nextIntPow2((int)Math.pow((double)m*cd/nnz1, 2)));
-		
-		//temporary array of current sparse positions
-		int[] curk = new int[Math.min(blocksizeI, ru-rl)];
-		
-		//blocked execution over IK 
-		for( int bi = rl; bi < ru; bi+=blocksizeI ) {
-			Arrays.fill(curk, 0); //reset positions
-			for( int bk = 0, bimin = Math.min(ru, bi+blocksizeI); bk < cd; bk+=blocksizeK ) {
-				final int bkmin = Math.min(cd, bk+blocksizeK); 
-				//core sub block matrix multiplication
-				for( int i=bi; i<bimin; i++ ) {
-					if( a.isEmpty(i) ) continue;
-					final int apos = a.pos(i);
-					final int alen = a.size(i);
-					int[] aix = a.indexes(i);
-					double[] avals = a.values(i);
-					double[] cvals = c.values(i);
-					int cix = c.pos(i);
-					int k = curk[i-bi] + apos;
-					for(; k < apos+alen && aix[k]<bkmin; k++) {
-						if( b.isEmpty(aix[k]) ) continue;
-						vectMultiplyAdd(avals[k], b.values(aix[k]), cvals,
-							b.indexes(aix[k]), b.pos(aix[k]), cix, b.size(aix[k]));
-					}
-					curk[i-bi] = k - apos;
-				}
-			}
-		}
-	}
-	
-	private static void matrixMultSparseSparseMMGeneric(SparseBlock a, SparseBlock b, DenseBlock c, int rl, int ru) {
-		for( int i=rl; i<Math.min(ru, a.numRows()); i++ ) {
-			if( a.isEmpty(i) ) continue;
-			int apos = a.pos(i);
-			int alen = a.size(i);
-			int[] aix = a.indexes(i);
-			double[] avals = a.values(i);
-			double[] cvals = c.values(i);
-			int cix = c.pos(i);
-			for(int k = apos; k < apos+alen; k++) {
-				if( b.isEmpty(aix[k]) ) continue;
-				double val = avals[k];
-				int bpos = b.pos(aix[k]);
-				int blen = b.size(aix[k]);
-				int[] bix = b.indexes(aix[k]);
-				double[] bvals = b.values(aix[k]);
-				for(int j = bpos; j < bpos+blen; j++)
-					cvals[cix+bix[j]] += val * bvals[j];
-			}
-		}
-	}
-	
-	/**
-	 * This implementation applies to any combination of dense/sparse if at least one
-	 * input is ultrasparse (sparse and very few nnz). In that case, most importantly,
-	 * we want to create a sparse output and only iterate over the few nnz as the major
-	 * dimension. Low-level optimization have less importance in that case and having
-	 * this generic implementation helps to reduce the implementations from (2+1)^2
-	 * to 2^2+1.
-	 * 
-	 * @param m1 first matrix
-	 * @param m2 second matrix
-	 * @param ret result matrix
-	 * @param rl row lower bound
-	 * @param ru row upper bound
-	 */
-	private static void matrixMultUltraSparse(MatrixBlock m1, MatrixBlock m2, MatrixBlock ret, boolean m1Perm, int rl, int ru) {
-		final boolean leftUS = m1.isUltraSparse()
-			|| (m1.isUltraSparse(false) && !m2.isUltraSparse())
-			|| (m1.sparse && !m2.sparse);
-		
-		if( m1 == m2 ) //self-product
-			matrixMultUltraSparseSelf(m1, ret, rl, ru);
-		else if( leftUS || m1Perm )
-			matrixMultUltraSparseLeft(m1, m2, ret, rl, ru);
-		else
-			matrixMultUltraSparseRight(m1, m2, ret, rl, ru);
-		//no need to recompute nonzeros because maintained internally
-	}
-	
-	private static void matrixMultUltraSparseSelf(MatrixBlock m1, MatrixBlock ret, int rl, int ru) {
-		//common use case: self product G %*% G of graph resulting in dense but still sparse output
-		int n = m1.clen; //m2.clen
-		SparseBlock a = m1.sparseBlock;
-		SparseBlock c = ret.sparseBlock;
-		double[] tmp = null;
-		
-		//IKJ with dense working row for lhs nnz/row > threshold
-		for( int i=rl; i<ru; i++ ) {
-			if( a.isEmpty(i) ) continue;
-			int alen = a.size(i);
-			int apos = a.pos(i);
-			int[] aix = a.indexes(i);
-			double[] avals = a.values(i);
-			
-			//compute number of aggregated non-zeros for input row
-			int nnz1 = (int) Math.min(UtilFunctions.computeNnz(a, aix, apos, alen), n);
-			boolean ldense = nnz1 > n / 128;
-			
-			//perform vector-matrix multiply w/ dense or sparse output
-			if( ldense ) { //init dense tmp row
-				tmp = (tmp == null) ? new double[n] : tmp;
-				Arrays.fill(tmp, 0);
-			}
-			for( int k=apos; k<apos+alen; k++ ) {
-				if( a.isEmpty(aix[k]) ) continue;
-				int blen = a.size(aix[k]);
-				int bpos = a.pos(aix[k]);
-				int[] bix = a.indexes(aix[k]);
-				double aval = avals[k];
-				double[] bvals = a.values(aix[k]);
-				if( ldense ) { //dense aggregation
-					for( int j=bpos; j<bpos+blen; j++ )
-						tmp[bix[j]] += aval * bvals[j];
-				}
-				else { //sparse aggregation
-					c.allocate(i, nnz1);
-					for( int j=bpos; j<bpos+blen; j++ )
-						c.add(i, bix[j], aval * bvals[j]);
-					c.compact(i); //conditional compaction
-				}
-			}
-			if( ldense ) { //copy dense tmp row
-				int nnz2 = UtilFunctions.computeNnz(tmp, 0, n);
-				c.allocate(i, nnz2); //avoid reallocation
-				for( int j=0; j<n; j++ )
-					c.append(i, j, tmp[j]);
-			}
-		}
-		//recompute non-zero for single-threaded
-		if( rl == 0 && ru == m1.rlen )
-			ret.recomputeNonZeros();
-	}
-	
-	private static void matrixMultUltraSparseLeft(MatrixBlock m1, MatrixBlock m2, MatrixBlock ret, int rl, int ru) {
-		final int m  = m1.rlen;
-		final int n  = m2.clen;
-		
-		//left is ultra-sparse (IKJ)
-		SparseBlock a = m1.sparseBlock;
-		SparseBlock c = ret.sparseBlock;
-		boolean rightSparse = m2.sparse;
-		
-		for( int i=rl; i<ru; i++ ) {
-			if( a.isEmpty(i) ) continue; 
-			int apos = a.pos(i);
-			int alen = a.size(i);
-			int[] aixs = a.indexes(i);
-			double[] avals = a.values(i);
-			if( alen==1 ) { 
-				//row selection (now aggregation) with potential scaling
-				int aix = aixs[apos];
-				int lnnz = 0;
-				if( rightSparse ) { //sparse right matrix (full row copy)
-					if( !m2.sparseBlock.isEmpty(aix) ) {
-						ret.rlen=m;
-						ret.allocateSparseRowsBlock(false); //allocation on demand
-						boolean ldeep = (m2.sparseBlock instanceof SparseBlockMCSR);
-						ret.sparseBlock.set(i, m2.sparseBlock.get(aix), ldeep);
-						ret.nonZeros += (lnnz = ret.sparseBlock.size(i));
-					}
-				}
-				else { //dense right matrix (append all values)
-					lnnz = (int)m2.recomputeNonZeros(aix, aix, 0, n-1);
-					if( lnnz > 0 ) {
-						c.allocate(i, lnnz); //allocate once
-						double[] bvals = m2.getDenseBlock().values(aix);
-						for( int j=0, bix=m2.getDenseBlock().pos(aix); j<n; j++ )
-							c.append(i, j, bvals[bix+j]);
-						ret.nonZeros += lnnz;
-					}
-				}
-
-				//optional scaling if not pure selection
-				if( avals[apos] != 1 && lnnz > 0 )
-					if(c.get(i) instanceof SparseRowScalar){
-						SparseRowScalar sv = (SparseRowScalar) c.get(i);
-						c.set(i, new SparseRowScalar(sv.getIndex(), sv.getValue() * avals[apos]), false);
-					}
-					else
-						vectMultiplyInPlace(avals[apos], c.values(i), c.pos(i), c.size(i));
-					
-			}
-			else { //GENERAL CASE
-				for( int k=apos; k<apos+alen; k++ ) {
-					double aval = avals[k];
-					int aix = aixs[k];
-					for( int j=0; j<n; j++ ) {
-						double cval = ret.quickGetValue(i, j);
-						double cvald = aval*m2.quickGetValue(aix, j);
-						if( cvald != 0 )
-							ret.quickSetValue(i, j, cval+cvald);
-					}
-				}
-			}
-		}
-	}
-	
-	private static void matrixMultUltraSparseRight(MatrixBlock m1, MatrixBlock m2, MatrixBlock ret, int rl, int ru) {
-		final int cd = m1.clen;
-		
-		//right is ultra-sparse (KJI)
-		SparseBlock b = m2.sparseBlock;
-		for(int k = 0; k < cd; k++ ) {
-			if( b.isEmpty(k) ) continue; 
-			int bpos = b.pos(k);
-			int blen = b.size(k);
-			int[] bixs = b.indexes(k);
-			double[] bvals = b.values(k);
-			for( int j=bpos; j<bpos+blen; j++ ) {
-				double bval = bvals[j];
-				int bix = bixs[j];
-				for( int i=rl; i<ru; i++ ) {
-					double cvald = bval*m1.quickGetValue(i, k);
-					if( cvald != 0 ){
-						double cval = ret.quickGetValue(i, bix);
-						ret.quickSetValue(i, bix, cval+cvald);
-					}
-				}
-			}
-		}
-	}
-
-	private static void matrixMultChainDense(MatrixBlock mX, MatrixBlock mV, MatrixBlock mW, MatrixBlock ret, ChainType ct, int rl, int ru) 
-	{
-		DenseBlock a = mX.getDenseBlock();
-		double[] b = mV.getDenseBlockValues();
-		double[] w = (mW!=null) ? mW.getDenseBlockValues() : null;
-		double[] c = ret.getDenseBlockValues();
-		final int cd = mX.clen; //features in X
-		boolean weights = (ct == ChainType.XtwXv);
-		boolean weights2 = (ct == ChainType.XtXvy);
-		
-		//temporary array for cache blocking
-		//(blocksize chosen to fit b+v in L2 (256KB) for default 1k blocks)
-		final int blocksizeI = 24; // constraint: factor of 4
-		final int blocksizeJ = 1024;
-		double[] tmp = new double[blocksizeI];
-		final int bn = (ru-rl) % blocksizeI;
-		
-		//compute rest (not aligned to blocksize)
-		for( int i=rl; i < rl+bn; i++ ) {
-			double[] avals = a.values(i);
-			int aix = a.pos(i);
-			double val = (b == null) ? 0 :
-				dotProduct(avals, b, aix, 0, cd);
-			val *= (weights) ? w[i] : 1;
-			val -= (weights2) ? w[i] : 0;
-			vectMultiplyAdd(val, avals, c, aix, 0, cd);
-		}
-		
-		//blockwise mmchain computation
-		for( int bi=rl+bn; bi < ru; bi+=blocksizeI ) 
-		{
-			//compute 1st matrix-vector for row block
-			Arrays.fill(tmp, 0);
-			if( b != null ) {
-				for( int bj=0; bj<cd; bj+=blocksizeJ ) {
-					int bjmin = Math.min(cd-bj, blocksizeJ);
-					for( int i=0; i < blocksizeI; i++ )
-						tmp[i] += dotProduct(a.values(bi+i), b, a.pos(bi+i,bj), bj, bjmin);
-				}
-			}
-			
-			//multiply/subtract weights (in-place), if required
-			if( weights ) 
-				vectMultiply(w, tmp, bi, 0, blocksizeI);
-			else if( weights2 )
-				vectSubtract(w, tmp, bi, 0, blocksizeI);
-			
-			//compute 2nd matrix vector for row block and aggregate
-			for( int bj = 0; bj<cd; bj+=blocksizeJ ) {
-				int bjmin = Math.min(cd-bj, blocksizeJ);
-				if( a.isContiguous() ) {
-					double[] avals = a.values(0);
-					for( int i=0, aix=bi*cd+bj; i<blocksizeI; i+=4, aix+=4*cd )
-						vectMultiplyAdd4(tmp[i], tmp[i+1], tmp[i+2], tmp[i+3],
-							avals, c, aix, aix+cd, aix+2*cd, aix+3*cd, bj, bjmin);
-				}
-				else {
-					for( int i=0; i<blocksizeI; i++ )
-						vectMultiplyAdd(tmp[i], a.values(bi+i), c, a.pos(bi+i,bj), bj, bjmin);
-				}
-			}
-		}
-	}
-
-	private static void matrixMultChainSparse(MatrixBlock mX, MatrixBlock mV, MatrixBlock mW, MatrixBlock ret, ChainType ct, int rl, int ru) 
-	{
-		SparseBlock a = mX.sparseBlock;
-		double[] b = mV.getDenseBlockValues();
-		double[] w = (mW!=null) ? mW.getDenseBlockValues() : null;
-		double[] c = ret.getDenseBlockValues();
-		boolean weights = (ct == ChainType.XtwXv);
-		boolean weights2 = (ct == ChainType.XtXvy);
-		
-		//row-wise mmchain computation
-		for( int i=rl; i < ru; i++ ) {
-			if( a.isEmpty(i) || (weights && w[i]==0) )
-				continue;
-			int apos = a.pos(i);
-			int alen = a.size(i);
-			int[] aix = a.indexes(i);
-			double[] avals = a.values(i);
-			
-			//compute 1st matrix-vector dot product
-			double val = (b == null) ? 0 :
-				dotProduct(avals, b, aix, apos, 0, alen);
-			
-			//multiply/subtract weights, if required
-			val *= (weights) ? w[i] : 1;
-			val -= (weights2) ? w[i] : 0;
-			
-			//compute 2nd matrix vector and aggregate
-			if( val != 0 )
-				vectMultiplyAdd(val, avals, c, aix, apos, 0, alen);
-		}
-	}
-
-	private static void matrixMultTransposeSelfDense( MatrixBlock m1, MatrixBlock ret, boolean leftTranspose, int rl, int ru ) {
-		//2) transpose self matrix multiply dense
-		// (compute only upper-triangular matrix due to symmetry)
-		DenseBlock a = m1.getDenseBlock();
-		DenseBlock c = ret.getDenseBlock();
-		int m = m1.rlen;
-		int n = m1.clen;
-		
-		if( leftTranspose ) // t(X)%*%X
-		{
-			if( LOW_LEVEL_OPTIMIZATION )
-			{
-				if( n==1 ) //VECTOR (col)
-				{
-					double[] avals = a.valuesAt(0);
-					c.set(0, 0, dotProduct(avals, avals, m));
-				}
-				else //MATRIX
-				{
-					//1) Unrolled inner loop (for better instruction-level parallelism)
-					//2) Blocked execution (for less cache trashing in parallel exec)
-					//3) Asymmetric block sizes (for less misses in inner loop, yet blocks in L1/L2)
-					
-					final int blocksizeI = 32; //64//256KB c block (typical L2 size per core), 32KB a block
-					final int blocksizeK = 24; //64//256KB b block (typical L2 size per core), used while read 512B of a / read/write 4KB of c 
-					final int blocksizeJ = 1024; //512//4KB (typical main-memory page size), for scan
-
-					//temporary arrays (nnz a, b index)
-					double[] ta = new double[ blocksizeK ];
-					int[]  tbi  = new int[ blocksizeK ];
-					
-					final int mx = ru;
-					final int cdx = m;
-					final int nx = n;
-					
-					//blocked execution
-					for( int bi = rl; bi < mx; bi+=blocksizeI ) //from bi due to symmetry
-						for( int bk = 0, bimin = Math.min(mx, bi+blocksizeI); bk < cdx; bk+=blocksizeK )
-							for( int bj = bi, bkmin = Math.min(cdx, bk+blocksizeK); bj < nx; bj+=blocksizeJ )
-							{
-								int bklen = bkmin-bk;
-								int bjlen = Math.min(nx, bj+blocksizeJ)-bj;
-								
-								//core sub block matrix multiplication
-								for( int i = bi; i < bimin; i++) 
-								{
-									double[] cvals = c.values(i);
-									int cixj = c.pos(i, bj);
-									
-									if( a.isContiguous(bk, bkmin-1) ) {
-										double[] avals = a.values(bk);
-										int aixi = a.pos(bk, i);
-										int bkpos = a.pos(bk, bj);
-										
-										//determine nnz of a (for sparsity-aware skipping of rows)
-										int knnz = copyNonZeroElements(avals, aixi, bkpos, n, nx, ta, tbi, bklen);
-										
-										//rest not aligned to blocks of 4 rows
-										final int bn = knnz % 4;
-										switch( bn ){
-											case 1: vectMultiplyAdd(ta[0], avals, cvals, tbi[0], cixj, bjlen); break;
-											case 2: vectMultiplyAdd2(ta[0],ta[1], avals, cvals, tbi[0], tbi[1], cixj, bjlen); break;
-											case 3: vectMultiplyAdd3(ta[0],ta[1],ta[2], avals, cvals, tbi[0], tbi[1],tbi[2], cixj, bjlen); break;
-										}
-										
-										//compute blocks of 4 rows (core inner loop)
-										for( int k = bn; k<knnz; k+=4 ){
-											vectMultiplyAdd4( ta[k], ta[k+1], ta[k+2], ta[k+3], avals, cvals,
-												tbi[k], tbi[k+1], tbi[k+2], tbi[k+3], cixj, bjlen );
-										}
-									}
-									else {
-										for( int k = bk; k<bkmin; k++ ) {
-											double[] avals = a.values(bk);
-											int aix = a.pos(bk, i);
-											if( avals[aix] != 0 )
-												vectMultiplyAdd( avals[aix], a.values(k),
-													cvals, a.pos(k, bj), cixj, bjlen );
-										}
-									}
-								}
-							}
-				}
-			}
-			else {
-				for( int k = 0; k < m; k++ ) {
-					double[] avals = a.values(k);
-					int aix = a.pos(k);
-					for( int i = rl; i < ru; i++ ) {
-						double[] cvals = c.values(i);
-						int cix = c.pos(i);
-						double val = avals[ aix+i ];
-						if( val != 0 ) {
-							for(int j = i; j < n; j++) //from i due to symmetry
-								cvals[ cix+j ] += val * avals[ aix+j ];
-						}
-					}
-				}
-			}
-		}
-		else // X%*%t(X)
-		{
-			if(LOW_LEVEL_OPTIMIZATION)
-			{
-				if( m==1 ) //VECTOR
-				{
-					double[] avals = a.valuesAt(0);
-					c.set(0, 0, dotProduct(avals, avals, n));
-				}
-				else //MATRIX
-				{
-					//algorithm: scan c, foreach ci,j: scan row of a and t(a) (IJK)
-				
-					//1) Unrolled inner loop, for better ILP
-					//2) Blocked execution, for less cache trashing in parallel exec 
-					//   (we block such that lhs, rhs, and output roughly fit into L2, output in L1)
-					//3) Asymmetric block sizes and exploitation of result symmetry
-					int blocksizeK = 1024; //two memory pages for sufficiently long scans
-					int blocksizeIJ = L2_CACHESIZE / 8 / blocksizeK / 2 - 1; //15
-				
-					//blocked execution over IKJ (lhs/rhs in L2, output in L1)
-					for( int bi = rl; bi<ru; bi+=blocksizeIJ ) 
-						for( int bk = 0, bimin = Math.min(ru, bi+blocksizeIJ); bk<n; bk+=blocksizeK )
-							for( int bj = bi, bklen = Math.min(blocksizeK, n-bk); bj<m; bj+=blocksizeIJ ) {
-								//core tsmm block operation (15x15 vectors of length 1K elements)
-								int bjmin = Math.min(m, bj+blocksizeIJ);
-								for( int i=bi; i<bimin; i++ ) {
-									final int bjmax = Math.max(i,bj); //from i due to symmetry
-									double[] avals = a.values(i), cvals = c.values(i);
-									int aix = a.pos(i, bk), cix = c.pos(i);
-									for(int j=bjmax; j <bjmin; j++) 
-										cvals[ cix+j ] += dotProduct(avals, a.values(j), aix, a.pos(j, bk), bklen);
-								}
-							}
-				}
-			}
-			else
-			{
-				for( int i = rl; i < ru; i++ ) {
-					double[] avals1 = a.values(i), cvals = c.values(i);
-					int aix1 = a.pos(i), cix = c.pos(i);
-					for(int j = i; j < m; j++) { //from i due to symmetry
-						double[] avals2 = a.values(j);
-						int aix2 = a.pos(j);
-						double val = 0;
-						for(int k = 0; k < n; k++)
-							val += avals1[aix1+k] * avals2[aix2+k];
-						cvals[cix+j] = val;
-					}
-				}
-			}
-		}
-	}
-
-	private static void matrixMultTransposeSelfSparse( MatrixBlock m1, MatrixBlock ret, boolean leftTranspose, int rl, int ru ) {
-		//2) transpose self matrix multiply sparse
-		// (compute only upper-triangular matrix due to symmetry)
-		SparseBlock a = m1.sparseBlock;
-		DenseBlock c = ret.getDenseBlock();
-		int m = m1.rlen;
-		
-		if( leftTranspose ) // t(X)%*%X 
-		{
-			//only general case (because vectors always dense)
-			//algorithm: scan rows, foreach row self join (KIJ)
-			if( LOW_LEVEL_OPTIMIZATION ) {
-				final int n = m1.clen;
-				final int arlen = a.numRows();
-				for( int r=0; r<arlen; r++ ) {
-					if( a.isEmpty(r) ) continue;
-					final int alen = a.size(r);
-					final double[] avals = a.values(r);
-					final int apos = a.pos(r);
-					if( alen == n ) { //dense row
-						for (int i = rl; i < ru; i++){
-							double[] cvals = c.values(i);
-							int cix = c.pos(i);
-							double val = avals[i + apos];
-							for(int j = i; j < m1.clen; j++)
-								cvals[cix + j] +=val * avals[j + apos];
-						}
-					}
-					else { //non-full sparse row
-						int[] aix = a.indexes(r);
-						int rlix = (rl==0) ? 0 : a.posFIndexGTE(r, rl);
-						rlix = (rlix>=0) ? apos+rlix : apos+alen;
-						int len = apos + alen;
-						for(int i = rlix; i < len && aix[i] < ru; i++)
-							vectMultiplyAdd(avals[i], avals, c.values(aix[i]), aix, i, c.pos(aix[i]), len - i);
-					}
-				}
-			}
-			else
-			{
-				for( int r=0; r<a.numRows(); r++ ) {
-					if( a.isEmpty(r) ) continue;
-					int apos = a.pos(r);
-					int alen = a.size(r);
-					int[] aix = a.indexes(r);
-					double[] avals = a.values(r);
-					int rlix = (rl==0) ? 0 : a.posFIndexGTE(r, rl);
-					rlix = (rlix>=0) ? apos+rlix : apos+alen;
-					for(int i = rlix; i < apos+alen && aix[i]<ru; i++) {
-						double val = avals[i];
-						if( val != 0 ) {
-							double[] cvals = c.values(aix[i]);
-							int cix = c.pos(aix[i]);
-							for(int j = i; j < apos+alen; j++)
-								cvals[cix+aix[j]] += val * avals[j];
-						}
-					}
-				}
-			}
-		}
-		else // X%*%t(X)
-		{
-			if( m==1 ) //VECTOR 
-			{
-				if( !m1.sparseBlock.isEmpty(0) ) {
-					int alen = m1.sparseBlock.size(0); //pos always 0
-					double[] avals = a.values(0);
-					c.set(0, 0, dotProduct(avals, avals, alen));
-				}
-			}
-			else //MATRIX
-			{
-				//note: reorg to similar layout as t(X)%*%X because faster than 
-				//direct computation with IJK (no dependencies/branches in inner loop)
-				//see preprocessMatrixMultTransposeSelf m1<-tmpBlock
-				m = m1.clen;
-				
-				//algorithm: scan rows, foreach row self join (KIJ)
-				if( LOW_LEVEL_OPTIMIZATION )
-				{
-					int arlen = a.numRows();
-					for( int r=0; r<arlen; r++ ) {
-						if( a.isEmpty(r) ) continue;
-						int apos = a.pos(r);
-						int alen = a.size(r);
-						int[] aix = a.indexes(r);
-						double[] avals = a.values(r);
-						int rlix = (rl==0) ? 0 : a.posFIndexGTE(r, rl);
-						rlix = (rlix>=0) ? apos+rlix : apos+alen;
-						for(int i = rlix; i < apos+alen && aix[i]<ru; i++) {
-							double val = avals[i];
-							if( val != 0 )
-								vectMultiplyAdd(val, avals, c.values(aix[i]),
-									aix, i, c.pos(aix[i]), alen-i);
-						}
-					}
-				}
-				else
-				{
-					for( int r=0; r<a.numRows(); r++ ) {
-						if( a.isEmpty(r) ) continue;
-						int apos = a.pos(r);
-						int alen = a.size(r);
-						int[] aix = a.indexes(r);
-						double[] avals = a.values(r);
-						int rlix = (rl==0) ? 0 : a.posFIndexGTE(r,rl);
-						rlix = (rlix>=0) ? apos+rlix : apos+alen;
-						for(int i = rlix; i < apos+alen && aix[i]<ru; i++) {
-							double val = avals[i];
-							if( val != 0 ) {
-								double[] cvals = c.values(aix[i]);
-								int cix = c.pos(aix[i]);
-								for( int j = i; j < alen; j++ )
-									cvals[cix+aix[j]] += val * avals[j];
-							}
-						}
-					}
-				}
-			}
-		}
-	}
-
-	private static void matrixMultPermuteDense(MatrixBlock pm1, MatrixBlock m2, MatrixBlock ret1, MatrixBlock ret2, int rl, int ru) {
-		double[] a = pm1.getDenseBlockValues();
-		DenseBlock b = m2.getDenseBlock();
-		DenseBlock c = ret1.getDenseBlock();
-
-		final int n = m2.clen;
-		final int blen = ret1.getNumRows();
-		int lastblk = -1;
-		
-		for( int i=rl; i<ru; i++ ) {
-			//compute block index and in-block indexes
-			int pos = UtilFunctions.toInt( a[ i ]); //safe cast
-			if( pos > 0 ) { //selected row
-				int bpos = (pos-1) % blen;
-				int blk = (pos-1) / blen;
-				//allocate and switch to second output block
-				//(never happens in cp, correct for multi-threaded usage)
-				if( lastblk!=-1 && lastblk<blk ) {
-					ret2.sparse = false;
-					ret2.allocateDenseBlock();
-					c = ret2.getDenseBlock();
-				}
-				//memcopy entire dense row into target position
-				System.arraycopy(b.values(i), b.pos(i), c.values(bpos), c.pos(bpos), n);
-				lastblk = blk;
-			}
-		}
-	}
-
-	private static void matrixMultPermuteDenseSparse( MatrixBlock pm1, MatrixBlock m2, MatrixBlock ret1, MatrixBlock ret2, int rl, int ru)
-	{
-		double[] a = pm1.getDenseBlockValues(); //vector
-		DenseBlock b = m2.getDenseBlock();
-		SparseBlock c = ret1.sparseBlock;
-
-		final int n = m2.clen;
-		final int blen = ret1.getNumRows();
-		
-		int lastblk = -1;
-		for( int i=rl; i<ru; i++ ) {
-			//compute block index and in-block indexes
-			int pos = UtilFunctions.toInt( a[ i ]); //safe cast
-			if( pos > 0 ) { //selected row
-				double[] bvals = b.values(i);
-				int bix = b.pos(i);
-				int bpos = (pos-1) % blen;
-				int blk = (pos-1) / blen;
-				//allocate and switch to second output block
-				//(never happens in cp, correct for multi-threaded usage)
-				if( lastblk!=-1 && lastblk<blk ){ 
-					ret2.sparse = true;
-					ret2.rlen=ret1.rlen;
-					ret2.allocateSparseRowsBlock();
-					c = ret2.sparseBlock;
-				}
-				//append entire dense row into sparse target position
-				for( int j=0; j<n; j++ )
-					c.append(bpos, j, bvals[bix+j]);
-				lastblk = blk;
-			}
-		}
-	}
-
-	private static void matrixMultPermuteSparse( MatrixBlock pm1, MatrixBlock m2, MatrixBlock ret1, MatrixBlock ret2, int rl, int ru)
-	{
-		double[] a = pm1.getDenseBlockValues(); //vector
-		SparseBlock b = m2.sparseBlock;
-		SparseBlock c = ret1.sparseBlock;
-
-		final int blen = ret1.getNumRows();
-		
-		int lastblk = -1;
-		for( int i=rl; i<ru; i++ )  {
-			//compute block index and in-block indexes
-			int pos = UtilFunctions.toInt( a[ i ]); //safe cast
-			if( pos > 0 ) { //selected row
-				int bpos = (pos-1) % blen;
-				int blk = (pos-1) / blen;
-				//allocate and switch to second output block
-				//(never happens in cp, correct for multi-threaded usage)
-				if( lastblk!=-1 && lastblk<blk ){ 
-					ret2.sparse = true;
-					ret2.allocateSparseRowsBlock();
-					c = ret2.sparseBlock;
-				}
-				//memcopy entire sparse row into target position
-				c.set(bpos, b.get(i), true);
-				lastblk = blk;
-			}
-		}
-
-	}
-
-	private static void matrixMultWSLossDense(MatrixBlock mX, MatrixBlock mU, MatrixBlock mV, MatrixBlock mW, MatrixBlock ret, WeightsType wt, int rl, int ru)
-	{
-		DenseBlock x = mX.getDenseBlock();
-		DenseBlock u = mU.getDenseBlock();
-		DenseBlock v = mV.getDenseBlock();
-		DenseBlock w = (mW!=null)? mW.getDenseBlock() : null;
-		final int n = mX.clen;
-		final int cd = mU.clen;
-		double wsloss = 0;
-		
-		// approach: iterate over all cells of X 
-		//cache-conscious blocking: due to blocksize constraint (default 1000),
-		//a blocksize of 16 allows to fit blocks of UV into L2 cache (256KB) 
-					
-		final int blocksizeIJ = 16; //u/v block (max at typical L2 size) 
-		
-
-		//blocked execution
-		for( int bi = rl; bi < ru; bi+=blocksizeIJ ) {
-			int bimin = Math.min(ru, bi+blocksizeIJ);
-			for( int bj = 0; bj < n; bj+=blocksizeIJ ){
-				int bjmin = Math.min(n, bj+blocksizeIJ);
-				
-				// Pattern 1) sum (W * (X - U %*% t(V)) ^ 2) (post weighting)
-				if( wt==WeightsType.POST ) {
-					for( int i=bi; i<bimin; i++ ) {
-						double[] wvals = w.values(i), xvals = x.values(i), uvals = u.values(i);
-						int xix = x.pos(i), uix = u.pos(i);
-						for( int j=bj; j<bjmin; j++ ) {
-							double wij = wvals[xix+j];
-							if( wij != 0 ) {
-								double uvij = dotProduct(uvals, v.values(j), uix, v.pos(j), cd);
-								wsloss += wij*(xvals[xix+j]-uvij)*(xvals[xix+j]-uvij); //^2
-							}
-						}
-					}
-				}
-				// Pattern 1b) sum ((X!=0) * (X - U %*% t(V)) ^ 2) (post_nz weighting)
-				else if( wt==WeightsType.POST_NZ ) {
-					for( int i=bi; i<bimin; i++ ) {
-						double[] xvals = x.values(i), uvals = u.values(i);
-						int xix = x.pos(i), uix = u.pos(i);
-						for( int j=bj; j<bjmin; j++ ) {
-							double xij = xvals[xix+j];
-							if( xij != 0 ) {
-								double uvij = dotProduct(uvals, v.values(j), uix, v.pos(j), cd);
-								wsloss += (xij-uvij)*(xij-uvij); //^2
-							}
-						}
-					}
-				}
-				// Pattern 2) sum ((X - W * (U %*% t(V))) ^ 2) (pre weighting)
-				else if( wt==WeightsType.PRE ) {
-					for( int i=bi; i<bimin; i++ ) {
-						double[] wvals = w.values(i), xvals = x.values(i), uvals = u.values(i);
-						int xix = x.pos(i), uix = u.pos(i);
-						for( int j=bj; j<bjmin; j++ ) {
-							double wij = wvals[xix+j];
-							double uvij = 0;
-							if( wij != 0 )
-								uvij = dotProduct(uvals, v.values(j), uix, v.pos(j), cd);
-							wsloss += (xvals[xix+j]-wij*uvij)*(xvals[xix+j]-wij*uvij); //^2
-						}
-					}
-				}
-				// Pattern 3) sum ((X - (U %*% t(V))) ^ 2) (no weighting)
-				else if( wt==WeightsType.NONE ) {
-					for( int i=bi; i<bimin; i++ ) {
-						double[] xvals = x.values(i), uvals = u.values(i);
-						int xix = x.pos(i), uix = u.pos(i);
-						for( int j=bj; j<bjmin; j++) {
-							double uvij = dotProduct(uvals, v.values(j), uix, v.pos(j), cd);
-							wsloss += (xvals[xix+j]-uvij)*(xvals[xix+j]-uvij); //^2
-						}
-					}
-				}
-			}
-		}
-		ret.quickSetValue(0, 0, wsloss);
-	}
-
-	private static void matrixMultWSLossSparseDense(MatrixBlock mX, MatrixBlock mU, MatrixBlock mV, MatrixBlock mW, MatrixBlock ret, WeightsType wt, int rl, int ru)
-	{
-		SparseBlock x = mX.sparseBlock;
-		SparseBlock w = (mW!=null)? mW.sparseBlock : null;
-		DenseBlock u = mU.getDenseBlock();
-		DenseBlock v = mV.getDenseBlock();
-		final int n = mX.clen; 
-		final int cd = mU.clen;
-		double wsloss = 0; 
-		
-		// Pattern 1) sum (W * (X - U %*% t(V)) ^ 2) (post weighting)
-		if( wt==WeightsType.POST ) {
-			// approach: iterate over W, point-wise in order to exploit sparsity
-			for( int i=rl; i<ru; i++ ) {
-				if( w.isEmpty(i) ) continue;
-				int wpos = w.pos(i);
-				int wlen = w.size(i);
-				int[] wix = w.indexes(i);
-				double[] wval = w.values(i);
-				double[] uvals = u.values(i);
-				int uix = u.pos(i);
-				if( w.isAligned(i, x) ) {
-					//O(n) where n is nnz in w/x 
-					double[] xval = x.values(i);
-					for( int k=wpos; k<wpos+wlen; k++ ) {
-						double uvij = dotProduct(uvals, v.values(wix[k]), uix, v.pos(wix[k]), cd);
-						wsloss += wval[k]*(xval[k]-uvij)*(xval[k]-uvij);
-					}
-				}
-				else {
-					//O(n log m) where n/m is nnz in w/x 
-					for( int k=wpos; k<wpos+wlen; k++ ) {
-						double xi = mX.quickGetValue(i, wix[k]);
-						double uvij = dotProduct(uvals, v.values(wix[k]), uix, v.pos(wix[k]), cd);
-						wsloss += wval[k]*(xi-uvij)*(xi-uvij);
-					}
-				}
-			}
-		}
-		// Pattern 1b) sum ((X!=0) * (X - U %*% t(V)) ^ 2) (post weighting)
-		else if( wt==WeightsType.POST_NZ ) {
-			// approach: iterate over W, point-wise in order to exploit sparsity
-			// blocked over ij, while maintaining front of column indexes, where the
-			// blocksize is chosen such that we reuse each vector on average 8 times.
-			final int blocksizeIJ = (int) (8L*mX.rlen*mX.clen/mX.nonZeros); 
-			int[] curk = new int[blocksizeIJ];
-			
-			for( int bi=rl; bi<ru; bi+=blocksizeIJ ) {
-				int bimin = Math.min(ru, bi+blocksizeIJ);
-				//prepare starting indexes for block row
-				Arrays.fill(curk, 0); 
-				//blocked execution over column blocks
-				for( int bj=0; bj<n; bj+=blocksizeIJ ) {
-					int bjmin = Math.min(n, bj+blocksizeIJ);
-					for( int i=bi; i<bimin; i++ ) {
-						if( x.isEmpty(i) ) continue;
-						int xpos = x.pos(i);
-						int xlen = x.size(i);
-						int[] xix = x.indexes(i);
-						double[] xval = x.values(i), uvals = u.values(i);
-						int uix = u.pos(i);
-						int k = xpos + curk[i-bi];
-						for( ; k<xpos+xlen && xix[k]<bjmin; k++ ) {
-							double uvij = dotProduct(uvals, v.values(xix[k]), uix, v.pos(xix[k]), cd);
-							wsloss += (xval[k]-uvij)*(xval[k]-uvij);
-						}
-						curk[i-bi] = k - xpos;
-					}
-				}
-			}
-		}
-		// Pattern 2) sum ((X - W * (U %*% t(V))) ^ 2) (pre weighting)
-		else if( wt==WeightsType.PRE ) {
-			// approach: iterate over all cells of X maybe sparse and dense
-			// (note: tuning similar to pattern 3 possible but more complex)
-			for( int i=rl; i<ru; i++ ) {
-				double[] uvals = u.values(i);
-				int uix = u.pos(i);
-				for( int j=0; j<n; j++ ) {
-					double xij = mX.quickGetValue(i, j);
-					double wij = mW.quickGetValue(i, j);
-					double uvij = 0;
-					if( wij != 0 )
-						uvij = dotProduct(uvals, v.values(j), uix, v.pos(j), cd);
-					wsloss += (xij-wij*uvij)*(xij-wij*uvij);
-				}
-			}
-		}
-		// Pattern 3) sum ((X - (U %*% t(V))) ^ 2) (no weighting)
-		else if( wt==WeightsType.NONE ) {
-			//approach: use sparsity-exploiting pattern rewrite sum((X-(U%*%t(V)))^2) 
-			//-> sum(X^2)-sum(2*X*(U%*%t(V))))+sum((t(U)%*%U)*(t(V)%*%V)), where each
-			//parallel task computes sum(X^2)-sum(2*X*(U%*%t(V)))) and the last term
-			//sum((t(U)%*%U)*(t(V)%*%V)) is computed once via two tsmm operations.
-			
-			final int blocksizeIJ = (int) (8L*mX.rlen*mX.clen/mX.nonZeros); 
-			int[] curk = new int[blocksizeIJ];
-			
-			for( int bi=rl; bi<ru; bi+=blocksizeIJ ) {
-				int bimin = Math.min(ru, bi+blocksizeIJ);
-				//prepare starting indexes for block row
-				Arrays.fill(curk, 0); 
-				//blocked execution over column blocks
-				for( int bj=0; bj<n; bj+=blocksizeIJ ) {
-					int bjmin = Math.min(n, bj+blocksizeIJ);
-					for( int i=bi; i<bimin; i++ ) {
-						if( x.isEmpty(i) ) continue; 
-						int xpos = x.pos(i);
-						int xlen = x.size(i);
-						int[] xix = x.indexes(i);
-						double[] xval = x.values(i);
-						double[] uvals = u.values(i);
-						int uix = u.pos(i);
-						int k = xpos + curk[i-bi];
-						for( ; k<xpos+xlen && xix[k]<bjmin; k++ ) {
-							double xij = xval[k];
-							double uvij = dotProduct(uvals, v.values(xix[k]), uix, v.pos(xix[k]), cd);
-							wsloss += xij * xij - 2 * xij * uvij;
-						}
-						curk[i-bi] = k - xpos;
-					}
-				}
-			}
-		}
-		
-		ret.quickSetValue(0, 0, wsloss);
-	}
-
-	private static void matrixMultWSLossGeneric (MatrixBlock mX, MatrixBlock mU, MatrixBlock mV, MatrixBlock mW, MatrixBlock ret, WeightsType wt, int rl, int ru)
-	{
-		final int n = mX.clen; 
-		final int cd = mU.clen;
-		double wsloss = 0;
-		
-		// Pattern 1) sum (W * (X - U %*% t(V)) ^ 2) (post weighting)
-		if( wt==WeightsType.POST )
-		{
-			// approach: iterate over W, point-wise in order to exploit sparsity
-			if( mW.sparse ) //SPARSE
-			{
-				SparseBlock w = mW.sparseBlock;
-				for( int i=rl; i<ru; i++ ) {
-					if( w.isEmpty(i) ) continue;
-					int wpos = w.pos(i);
-					int wlen = w.size(i);
-					int[] wix = w.indexes(i);
-					double[] wval = w.values(i);
-					for( int k=wpos; k<wpos+wlen; k++ ) {
-						double uvij = dotProductGeneric(mU, mV, i, wix[k], cd);
-						double xi = mX.quickGetValue(i, wix[k]);
-						wsloss += wval[k]*(xi-uvij)*(xi-uvij);
-					}
-				}
-			}
-			else //DENSE
-			{
-				DenseBlock w = mW.getDenseBlock();
-				for( int i=rl; i<ru; i++ ) {
-					double[] wvals = w.values(i);
-					int wix = w.pos(i);
-					for( int j=0; j<n; j++)
-						if( wvals[wix+j] != 0 ) {
-							double uvij = dotProductGeneric(mU, mV, i, j, cd);
-							double xij = mX.quickGetValue(i, j);
-							wsloss += wvals[wix+j]*(xij-uvij)*(xij-uvij);
-						}
-				}
-			}
-		}
-		// Pattern 1b) sum ((X!=0) * (X - U %*% t(V)) ^ 2) (post weighting)
-		else if( wt==WeightsType.POST_NZ )
-		{
-			// approach: iterate over W, point-wise in order to exploit sparsity
-			if( mX.sparse ) //SPARSE
-			{
-				SparseBlock x = mX.sparseBlock;
-				for( int i=rl; i<ru; i++ ) {
-					if( x.isEmpty(i) ) continue;
-					int xpos = x.pos(i);
-					int xlen = x.size(i);
-					int[] xix = x.indexes(i);
-					double[] xval = x.values(i);
-					for( int k=xpos; k<xpos+xlen; k++ ) {
-						double uvij = dotProductGeneric(mU, mV, i, xix[k], cd);
-						wsloss += (xval[k]-uvij)*(xval[k]-uvij);
-					}
-				}
-			}
-			else //DENSE
-			{
-				DenseBlock x = mX.getDenseBlock();
-				for( int i=rl; i<ru; i++ ) {
-					double[] xvals = x.values(i);
-					int xix = x.pos(i);
-					for( int j=0; j<n; j++) {
-						double xij = xvals[xix+j];
-						if( xij != 0 ) {
-							double uvij = dotProductGeneric(mU, mV, i, j, cd);
-							wsloss += (xij-uvij)*(xij-uvij);
-						}
-					}
-				}
-			}
-		}
-		// Pattern 2) sum ((X - W * (U %*% t(V))) ^ 2) (pre weighting)
-		else if( wt==WeightsType.PRE )
-		{
-			// approach: iterate over all cells of X maybe sparse and dense
-			for( int i=rl; i<ru; i++ )
-				for( int j=0; j<n; j++) {
-					double xij = mX.quickGetValue(i, j);
-					double wij = mW.quickGetValue(i, j);
-					double uvij = 0;
-					if( wij != 0 )
-						uvij = dotProductGeneric(mU, mV, i, j, cd);
-					wsloss += (xij-wij*uvij)*(xij-wij*uvij);
-				}
-		}
-		// Pattern 3) sum ((X - (U %*% t(V))) ^ 2) (no weighting)
-		else if( wt==WeightsType.NONE )
-		{
-			//approach: use sparsity-exploiting pattern rewrite sum((X-(U%*%t(V)))^2) 
-			//-> sum(X^2)-sum(2*X*(U%*%t(V))))+sum((t(U)%*%U)*(t(V)%*%V)), where each
-			//parallel task computes sum(X^2)-sum(2*X*(U%*%t(V)))) and the last term
-			//sum((t(U)%*%U)*(t(V)%*%V)) is computed once via two tsmm operations.
-			
-			if( mX.sparse ) { //SPARSE
-				SparseBlock x = mX.sparseBlock;
-				for( int i=rl; i<ru; i++ ) {
-					if( x.isEmpty(i) ) continue;
-					int xpos = x.pos(i);
-					int xlen = x.size(i);
-					int[] xix = x.indexes(i);
-					double[] xval = x.values(i);
-					for( int k=xpos; k<xpos+xlen; k++ ) {
-						double xij = xval[k];
-						double uvij = dotProductGeneric(mU, mV, i, xix[k], cd);
-						wsloss += xij * xij - 2 * xij * uvij;
-					}
-				}
-			}
-			else { //DENSE
-				DenseBlock x = mX.getDenseBlock();
-				for( int i=rl; i<ru; i++ ) {
-					double[] xvals = x.values(i);
-					int xix = x.pos(i);
-					for( int j=0; j<n; j++)
-						if( xvals[xix+j] != 0 ) {
-							double xij = xvals[xix+j];
-							double uvij = dotProductGeneric(mU, mV, i, j, cd);
-							wsloss += xij * xij - 2 * xij * uvij;
-						}
-				}
-			}
-		}
-
-		ret.quickSetValue(0, 0, wsloss);
-	}
-	
-	private static void addMatrixMultWSLossNoWeightCorrection(MatrixBlock mU, MatrixBlock mV, MatrixBlock ret, int k) {
-		MatrixBlock tmp1 = new MatrixBlock(mU.clen, mU.clen, false);
-		MatrixBlock tmp2 = new MatrixBlock(mU.clen, mU.clen, false);
-		matrixMultTransposeSelf(mU, tmp1, true, k);
-		matrixMultTransposeSelf(mV, tmp2, true, k);
-		ret.quickSetValue(0, 0, ret.quickGetValue(0, 0) + 
-			((tmp1.sparse || tmp2.sparse) ? dotProductGeneric(tmp1, tmp2) :
-			dotProduct(tmp1.getDenseBlockValues(), tmp2.getDenseBlockValues(), mU.clen*mU.clen)));
-	}
-
-	private static void matrixMultWSigmoidDenseNative(MatrixBlock mW, MatrixBlock mU, MatrixBlock mV, MatrixBlock ret, WSigmoidType wt) {
-		double[] w = mW.getDenseBlockValues();
-		double[] c = ret.getDenseBlockValues();
-		final int m = mW.rlen, n = mW.clen;
-		final int cd = mU.clen;
-		boolean flagminus = (wt==WSigmoidType.MINUS || wt==WSigmoidType.LOG_MINUS); 
-		boolean flaglog = (wt==WSigmoidType.LOG || wt==WSigmoidType.LOG_MINUS);
-		
-		//note: experiments with a fully native implementation of this method (even with #pragma omp simd)
-		//showed performance regressions compared to this version because we benefit from FastMath.exp 
-		
-		//call native matrix multiplication (only called for single-threaded and matrix-vector
-		//because this ensures that we can deal with the transpose mV without additional transpose)
-		long nnz =NativeHelper.dmmdd(((m==1)?mV:mU).getDenseBlockValues(),
-			((m==1)?mU:mV).getDenseBlockValues(), c, (m==1)?n:m, cd, 1, 1);
-		if(nnz < 0) {
-			//fallback to default java implementation
-			LOG.warn("matrixMultWSigmoidDenseNative: Native mat mult failed. Falling back to java version.");
-			matrixMult(((m==1)?mV:mU), ((m==1)?mU:mV), ret, false);
-		}
-		
-		//compute remaining wsigmoid for all relevant outputs
-		for( int i=0; i<m*n; i++ ) {
-			//compute core sigmoid function
-			double cval = flagminus ?
-				1 / (1 + FastMath.exp(c[i])) :
-				1 / (1 + FastMath.exp(-c[i]));
-			//compute weighted output
-			c[i] = w[i] * ((flaglog) ? Math.log(cval) : cval);
-		}
-	}
-	
-	private static void matrixMultWSigmoidDense(MatrixBlock mW, MatrixBlock mU, MatrixBlock mV, MatrixBlock ret, WSigmoidType wt, int rl, int ru) {
-		DenseBlock w = mW.getDenseBlock();
-		DenseBlock c = ret.getDenseBlock();
-		DenseBlock u = mU.getDenseBlock();
-		DenseBlock v = mV.getDenseBlock();
-		final int n = mW.clen;
-		final int cd = mU.clen;
-		
-		//note: cannot compute U %*% t(V) in-place of result w/ regular mm because
-		//t(V) comes in transformed form and hence would require additional memory
-	
-		boolean flagminus = (wt==WSigmoidType.MINUS || wt==WSigmoidType.LOG_MINUS); 
-		boolean flaglog = (wt==WSigmoidType.LOG || wt==WSigmoidType.LOG_MINUS);
-		
-		//approach: iterate over non-zeros of w, selective mm computation
-		//cache-conscious blocking: due to blocksize constraint (default 1000),
-		//a blocksize of 16 allows to fit blocks of UV into L2 cache (256KB) 
-		
-		final int blocksizeIJ = 16; //u/v block (max at typical L2 size) 
-		
-		//blocked execution
-		for( int bi = rl; bi < ru; bi+=blocksizeIJ ) {
-			int bimin = Math.min(ru, bi+blocksizeIJ);
-			for( int bj = 0; bj < n; bj+=blocksizeIJ ) {
-				int bjmin = Math.min(n, bj+blocksizeIJ);
-				//core wsigmoid computation
-				for( int i=bi; i<bimin; i++ ) {
-					double[] wvals = w.values(i), uvals = u.values(i), cvals = c.values(i);
-					int wix = w.pos(i), uix = u.pos(i);
-					for( int j=bj; j<bjmin; j++) {
-						double wij = wvals[wix+j];
-						if( wij != 0 )
-							cvals[wix+j] = wsigmoid(wij, uvals, v.values(j),
-								uix, v.pos(j), flagminus, flaglog, cd);
-					}
-				}
-			}
-		}
-	}
-
-	private static void matrixMultWSigmoidSparseDense(MatrixBlock mW, MatrixBlock mU, MatrixBlock mV, MatrixBlock ret, WSigmoidType wt, int rl, int ru) {
-		SparseBlock w = mW.sparseBlock;
-		SparseBlock c = ret.sparseBlock;
-		DenseBlock u = mU.getDenseBlock();
-		DenseBlock v = mV.getDenseBlock();
-		final int cd = mU.clen;
-		
-		boolean flagminus = (wt==WSigmoidType.MINUS || wt==WSigmoidType.LOG_MINUS);
-		boolean flaglog = (wt==WSigmoidType.LOG || wt==WSigmoidType.LOG_MINUS);
-	
-		//approach: iterate over non-zeros of w, selective mm computation
-		for( int i=rl; i<ru; i++ ) {
-			if( w.isEmpty(i) ) continue;
-			int wpos = w.pos(i);
-			int wlen = w.size(i);
-			int[] wix = w.indexes(i);
-			double[] wval = w.values(i);
-			double[] uvals = u.values(i);
-			int uix = u.pos(i);
-			c.allocate(i, wlen);
-			for( int k=wpos; k<wpos+wlen; k++ ) {
-				double cval = wsigmoid(wval[k], uvals, v.values(wix[k]),
-					uix, v.pos(wix[k]), flagminus, flaglog, cd);
-				c.append(i, wix[k], cval);
-			}
-		}
-	}
-
-	private static void matrixMultWSigmoidGeneric (MatrixBlock mW, MatrixBlock mU, MatrixBlock mV, MatrixBlock ret, WSigmoidType wt, int rl, int ru) {
-		final int n = mW.clen; 
-		final int cd = mU.clen;
-	
-		boolean flagminus = (wt==WSigmoidType.MINUS || wt==WSigmoidType.LOG_MINUS); 
-		boolean flaglog = (wt==WSigmoidType.LOG || wt==WSigmoidType.LOG_MINUS);
-	
-		//approach: iterate over non-zeros of w, selective mm computation
-		if( mW.sparse ) //SPARSE
-		{
-			//w and c always in same representation
-			SparseBlock w = mW.sparseBlock;
-			SparseBlock c = ret.sparseBlock;
-			for( int i=rl; i<ru; i++ ) {
-				if( w.isEmpty(i) ) continue;
-				int wpos = w.pos(i);
-				int wlen = w.size(i);
-				int[] wix = w.indexes(i);
-				double[] wval = w.values(i);
-				c.allocate(i, wlen);
-				for( int k=wpos; k<wpos+wlen; k++ ) {
-					double cval = wsigmoid(wval[k], mU, mV, i, wix[k], flagminus, flaglog, cd);
-					c.append(i, wix[k], cval);
-				}
-			}
-		}
-		else //DENSE
-		{
-			//w and c always in same representation
-			DenseBlock w = mW.getDenseBlock();
-			DenseBlock c = ret.getDenseBlock();
-			for( int i=rl; i<ru; i++ ) {
-				double[] wvals = w.values(i), cvals = c.values(i);
-				int ix = w.pos(i);
-				for( int j=0; j<n; j++ ) {
-					double wij = wvals[ix+j];
-					if( wij != 0 )
-						cvals[ix+j] = wsigmoid(wij, mU, mV, i, j, flagminus, flaglog, cd);
-				}
-			}
-		}
-	}
-
-	private static void matrixMultWDivMMDense(MatrixBlock mW, MatrixBlock mU, MatrixBlock mV, MatrixBlock mX, MatrixBlock ret, WDivMMType wt, int rl, int ru, int cl, int cu) {
-		final boolean basic = wt.isBasic();
-		final boolean left = wt.isLeft();
-		final boolean mult = wt.isMult();
-		final boolean minus = wt.isMinus();
-		final boolean four = wt.hasFourInputs();
-		final boolean scalar = wt.hasScalar();
-		final double eps = scalar ? mX.quickGetValue(0, 0) : 0;
-		final int cd = mU.clen;
-		
-		DenseBlock w = mW.getDenseBlock();
-		DenseBlock u = mU.getDenseBlock();
-		DenseBlock v = mV.getDenseBlock();
-		DenseBlock x = (mX==null) ? null : mX.getDenseBlock();
-		DenseBlock c = ret.getDenseBlock();
-		
-		//approach: iterate over non-zeros of w, selective mm computation
-		//cache-conscious blocking: due to blocksize constraint (default 1000),
-		//a blocksize of 16 allows to fit blocks of UV into L2 cache (256KB) 
-		
-		final int blocksizeIJ = 16; //u/v block (max at typical L2 size) 
-		
-		//blocked execution
-		for( int bi = rl; bi < ru; bi+=blocksizeIJ ) {
-			int bimin = Math.min(ru, bi+blocksizeIJ);
-			for( int bj = cl; bj < cu; bj+=blocksizeIJ ) {
-				int bjmin = Math.min(cu, bj+blocksizeIJ);
-				//core wsigmoid computation
-				for( int i=bi; i<bimin; i++ ) {
-					double[] wvals = w.values(i), uvals = u.values(i);
-					double[] xvals = four ? x.values(i) : null; 
-					int wix = w.pos(i), uix = u.pos(i);
-					for( int j=bj; j<bjmin; j++ )
-						if( wvals[wix+j] != 0 ) {
-							double[] cvals = c.values((basic||!left) ? i : j);
-							if( basic ) 
-								cvals[wix+j] = wvals[wix+j] * dotProduct(uvals, v.values(j), uix, v.pos(j), cd);
-							else if( four ) { //left/right 
-								if (scalar)
-									wdivmm(wvals[wix+j], eps, uvals, v.values(j), cvals, uix, v.pos(j), left, scalar, cd);
-								else
-									wdivmm(wvals[wix+j], xvals[wix+j], uvals, v.values(j), cvals, uix, v.pos(j), left, scalar, cd);
-							}
-							else //left/right minus/default
-								wdivmm(wvals[wix+j], uvals, v.values(j), cvals, uix, v.pos(j), left, mult, minus, cd);
-						}
-				}
-			}
-		}
-	}
-
-	private static void matrixMultWDivMMSparseDense(MatrixBlock mW, MatrixBlock mU, MatrixBlock mV, MatrixBlock mX, MatrixBlock ret, WDivMMType wt, int rl, int ru, int cl, int cu) {
-		final boolean basic = wt.isBasic();
-		final boolean left = wt.isLeft();
-		final boolean mult = wt.isMult();
-		final boolean minus = wt.isMinus();
-		final boolean four = wt.hasFourInputs();
-		final boolean scalar = wt.hasScalar();
-		final double eps = scalar ? mX.quickGetValue(0, 0) : 0;
-		final int cd = mU.clen;
-		
-		SparseBlock w = mW.sparseBlock;
-		DenseBlock u = mU.getDenseBlock();
-		DenseBlock v = mV.getDenseBlock();
-		DenseBlock c = ret.getDenseBlock();
-		SparseBlock x = (mX==null) ? null : mX.sparseBlock;
-		
-		//approach: iterate over non-zeros of w, selective mm computation
-		//blocked over ij, while maintaining front of column indexes, where the
-		//blocksize is chosen such that we reuse each  Ui/Vj vector on average 8 times,
-		//with custom blocksizeJ for wdivmm_left to avoid LLC misses on output.
-		final int blocksizeI = (int) (8L*mW.rlen*mW.clen/mW.nonZeros);
-		final int blocksizeJ = left ? Math.max(8,Math.min(L2_CACHESIZE/(mU.clen*8), blocksizeI)) : blocksizeI;
-		
-		int[] curk = new int[blocksizeI];
-		boolean[] aligned = (four&&!scalar) ? new boolean[blocksizeI] : null;
-		
-		//blocked execution over row blocks
-		for( int bi=rl; bi<ru; bi+=blocksizeI ) 
-		{
-			int bimin = Math.min(ru, bi+blocksizeI);
-			//prepare starting indexes for block row
-			for( int i=bi; i<bimin; i++ ) {
-				int k = (cl==0||w.isEmpty(i)) ? 0 : w.posFIndexGTE(i,cl);
-				curk[i-bi] = (k>=0) ? k : mW.clen;
-			}
-			//prepare alignment info if necessary
-			if( four && !scalar )
-				for( int i=bi; i<bimin; i++ )
-					aligned[i-bi] = w.isAligned(i-bi, x);
-			
-			//blocked execution over column blocks
-			for( int bj=cl; bj<cu; bj+=blocksizeJ )  
-			{
-				int bjmin = Math.min(cu, bj+blocksizeJ);
-				//core wdivmm block matrix mult
-				for( int i=bi; i<bimin; i++ ) {
-					if( w.isEmpty(i) ) continue;
-					
-					int wpos = w.pos(i);
-					int wlen = w.size(i);
-					int[] wix = w.indexes(i);
-					double[] wval = w.values(i);
-					double[] uvals = u.values(i);
-					int uix = u.pos(i);
-					
-					int k = wpos + curk[i-bi];
-					if( basic ) {
-						for( ; k<wpos+wlen && wix[k]<bjmin; k++ )
-							ret.appendValue( i, wix[k], wval[k] *dotProduct(
-								uvals, v.values(wix[k]), uix, v.pos(wix[k]), cd));
-					}
-					else if( four ) { //left/right
-						//checking alignment per row is ok because early abort if false, 
-						//row nnz likely fit in L1/L2 cache, and asymptotically better if aligned
-						if( !scalar && w.isAligned(i, x) ) {
-							//O(n) where n is nnz in w/x 
-							double[] xvals = x.values(i);
-							for( ; k<wpos+wlen && wix[k]<bjmin; k++ ) {
-								double[] cvals = c.values(left ? wix[k] : i);
-								wdivmm(wval[k], xvals[k], uvals, v.values(wix[k]),
-									cvals, uix, v.pos(wix[k]), left, scalar, cd);
-							}
-						}
-						else {
-							//scalar or O(n log m) where n/m are nnz in w/x
-							for( ; k<wpos+wlen && wix[k]<bjmin; k++ ) {
-								double[] cvals = c.values(left ? wix[k] : i);
-								if (scalar)
-									wdivmm(wval[k], eps, uvals, v.values(wix[k]),
-										cvals, uix, v.pos(wix[k]), left, scalar, cd);
-								else
-									wdivmm(wval[k], x.get(i, wix[k]), uvals,
-										v.values(wix[k]), cvals, uix, v.pos(wix[k]), left, scalar, cd);
-							}
-						}
-					}
-					else { //left/right minus default
-						for( ; k<wpos+wlen && wix[k]<bjmin; k++ ) {
-							double[] cvals = c.values(left ? wix[k] : i);
-							wdivmm(wval[k], uvals, v.values(wix[k]), cvals,
-								uix, v.pos(wix[k]), left, mult, minus, cd);
-						}
-					}
-					curk[i-bi] = k - wpos;
-				}
-			}
-		}
-	}
-
-	private static void matrixMultWDivMMGeneric(MatrixBlock mW, MatrixBlock mU, MatrixBlock mV, MatrixBlock mX, MatrixBlock ret, WDivMMType wt, int rl, int ru, int cl, int cu) {
-		final boolean basic = wt.isBasic();
-		final boolean left = wt.isLeft(); 
-		final boolean mult = wt.isMult();
-		final boolean minus = wt.isMinus();
-		final boolean four = wt.hasFourInputs();
-		final boolean scalar = wt.hasScalar();
-		final double eps = scalar ? mX.quickGetValue(0, 0) : 0;
-		final int cd = mU.clen;
-
-		//output always in dense representation
-		DenseBlock c = ret.getDenseBlock();
-		
-		//approach: iterate over non-zeros of w, selective mm computation
-		if( mW.sparse ) //SPARSE
-		{
-			SparseBlock w = mW.sparseBlock;
-			
-			for( int i=rl; i<ru; i++ ) {
-				if( w.isEmpty(i) ) continue;
-				int wpos = w.pos(i);
-				int wlen = w.size(i);
-				int[] wix = w.indexes(i);
-				double[] wval = w.values(i);
-				int k = (cl==0) ? 0 : w.posFIndexGTE(i,cl);
-				k = (k>=0) ? wpos+k : wpos+wlen;
-				for( ; k<wpos+wlen && wix[k]<cu; k++ ) {
-					double[] cvals = c.values((basic||!left) ? i : wix[k]);
-					if( basic ) {
-						double uvij = dotProductGeneric(mU,mV, i, wix[k], cd);
-						ret.appendValue(i, wix[k], uvij);
-					}
-					else if( four ) { //left/right
-						double xij = scalar ? eps : mX.quickGetValue(i, wix[k]);
-						wdivmm(wval[k], xij, mU, mV, cvals, i, wix[k], left, scalar, cd);
-					}
-					else { //left/right minus/default
-						wdivmm(wval[k], mU, mV, cvals, i, wix[k], left, mult, minus, cd);
-					}
-				}
-			}
-		}
-		else //DENSE
-		{
-			DenseBlock w = mW.getDenseBlock();
-			for( int i=rl; i<ru; i++ ) {
-				double[] wvals = w.values(i);
-				int ix = w.pos(i);
-				for( int j=cl; j<cu; j++)
-					if( wvals[ix+j] != 0 ) {
-						double[] cvals = c.values((basic||!left) ? i : j);
-						if( basic ) {
-							cvals[ix+j] = dotProductGeneric(mU,mV, i, j, cd);
-						}
-						else if( four ) { //left/right
-							double xij = scalar ? eps : mX.quickGetValue(i, j);
-							wdivmm(wvals[ix+j], xij, mU, mV, cvals, i, j, left, scalar, cd);
-						}
-						else { //left/right minus/default
-							wdivmm(wvals[ix+j], mU, mV, cvals, i, j, left, mult, minus, cd);
-						}
-					}
-			}
-		}
-	}
-
-	private static void matrixMultWCeMMDense(MatrixBlock mW, MatrixBlock mU, MatrixBlock mV, double eps, MatrixBlock ret, WCeMMType wt, int rl, int ru)
-	{
-		DenseBlock w = mW.getDenseBlock();
-		DenseBlock u = mU.getDenseBlock();
-		DenseBlock v = mV.getDenseBlock();
-		final int n = mW.clen;
-		final int cd = mU.clen;
-		double wceval = 0;
-		
-		// approach: iterate over all cells of X 
-		//cache-conscious blocking: due to blocksize constraint (default 1000),
-		//a blocksize of 16 allows to fit blocks of UV into L2 cache (256KB) 
-		final int blocksizeIJ = 16; //u/v block (max at typical L2 size) 
-
-		//blocked execution
-		for( int bi = rl; bi < ru; bi+=blocksizeIJ ) {
-			int bimin = Math.min(ru, bi+blocksizeIJ);
-			for( int bj = 0; bj < n; bj+=blocksizeIJ ) {
-				int bjmin = Math.min(n, bj+blocksizeIJ);
-				for( int i=bi; i<bimin; i++ ) {
-					double[] wvals = w.values(i), uvals = u.values(i);
-					int wix = w.pos(i), uix = u.pos(i);
-					for( int j=bj; j<bjmin; j++ ) {
-						double wij = wvals[wix+j];
-						if( wij != 0 ) {
-							double uvij = dotProduct(uvals, v.values(j), uix, v.pos(j), cd);
-							wceval += wij * Math.log(uvij + eps);
-						}
-					}
-				}
-			}
-		}
-		ret.quickSetValue(0, 0, wceval);
-	}
-
-	private static void matrixMultWCeMMSparseDense(MatrixBlock mW, MatrixBlock mU, MatrixBlock mV, double eps, MatrixBlock ret, WCeMMType wt, int rl, int ru)
-	{
-		SparseBlock w = mW.sparseBlock;
-		DenseBlock u = mU.getDenseBlock();
-		DenseBlock v = mV.getDenseBlock();
-		final int n = mW.clen;
-		final int cd = mU.clen;
-		double wceval = 0;
-		
-		// approach: iterate over W, point-wise in order to exploit sparsity
-		// blocked over ij, while maintaining front of column indexes, where the
-		// blocksize is chosen such that we reuse each vector on average 8 times.
-		final int blocksizeIJ = (int) (8L*mW.rlen*mW.clen/mW.nonZeros); 
-		int[] curk = new int[blocksizeIJ];
-		
-		for( int bi=rl; bi<ru; bi+=blocksizeIJ ) {
-			int bimin = Math.min(ru, bi+blocksizeIJ);
-			//prepare starting indexes for block row
-			Arrays.fill(curk, 0); 
-			//blocked execution over column blocks
-			for( int bj=0; bj<n; bj+=blocksizeIJ ) {
-				int bjmin = Math.min(n, bj+blocksizeIJ);
-				for( int i=bi; i<bimin; i++ ) {
-					if( w.isEmpty(i) ) continue;
-					int wpos = w.pos(i);
-					int wlen = w.size(i);
-					int[] wix = w.indexes(i);
-					double[] wvals = w.values(i);
-					double[] uvals = u.values(i);
-					int uix = u.pos(i);
-					int k = wpos + curk[i-bi];
-					for( ; k<wpos+wlen && wix[k]<bjmin; k++ ) {
-						double uvij = dotProduct(uvals, v.values(wix[k]), uix, v.pos(wix[k]), cd);
-						wceval += wvals[k] * Math.log(uvij + eps);
-					}
-					curk[i-bi] = k - wpos;
-				}
-			}
-		}
-		ret.quickSetValue(0, 0, wceval);
-	}
-
-	private static void matrixMultWCeMMGeneric(MatrixBlock mW, MatrixBlock mU, MatrixBlock mV, double eps, MatrixBlock ret, WCeMMType wt, int rl, int ru)
-	{
-		final int n = mW.clen; 
-		final int cd = mU.clen;
-		double wceval = 0; 
-
-		//approach: iterate over non-zeros of w, selective mm computation
-		if( mW.sparse ) //SPARSE
-		{
-			SparseBlock w = mW.sparseBlock;
-			for( int i=rl; i<ru; i++ ) {
-				if( w.isEmpty(i) ) continue;
-				int wpos = w.pos(i);
-				int wlen = w.size(i);
-				int[] wix = w.indexes(i);
-				double[] wval = w.values(i);
-				for( int k=wpos; k<wpos+wlen; k++ ) {
-					double uvij = dotProductGeneric(mU, mV, i, wix[k], cd);
-					wceval += wval[k] * Math.log(uvij + eps);
-				}
-			}
-		}
-		else //DENSE
-		{
-			DenseBlock w = mW.getDenseBlock();
-			for( int i=rl; i<ru; i++ ) {
-				double[] wvals = w.values(i);
-				int wix = w.pos(i);
-				for( int j=0; j<n; j++ ) {
-					double wij = wvals[wix+j];
-					if( wij != 0 ) {
-						double uvij = dotProductGeneric(mU, mV, i, j, cd);
-						wceval += wij * Math.log(uvij + eps);
-					}
-				}
-			}
-		}
-
-		ret.quickSetValue(0, 0, wceval);
-	}
-
-	private static void matrixMultWuMMDense(MatrixBlock mW, MatrixBlock mU, MatrixBlock mV, MatrixBlock ret, WUMMType wt, ValueFunction fn, int rl, int ru) {
-		DenseBlock w = mW.getDenseBlock();
-		DenseBlock c = ret.getDenseBlock();
-		DenseBlock u = mU.getDenseBlock();
-		DenseBlock v = mV.getDenseBlock();
-		final int n = mW.clen;
-		final int cd = mU.clen;
-		
-		//note: cannot compute U %*% t(V) in-place of result w/ regular mm because
-		//t(V) comes in transformed form and hence would require additional memory
-	
-		boolean flagmult = (wt==WUMMType.MULT);
-		
-		//approach: iterate over non-zeros of w, selective mm computation
-		//cache-conscious blocking: due to blocksize constraint (default 1000),
-		//a blocksize of 16 allows to fit blocks of UV into L2 cache (256KB)
-		
-		final int blocksizeIJ = 16; //u/v block (max at typical L2 size)
-		
-		//blocked execution
-		for( int bi = rl; bi < ru; bi+=blocksizeIJ ) {
-			int bimin = Math.min(ru, bi+blocksizeIJ);
-			for( int bj = 0; bj < n; bj+=blocksizeIJ ) {
-				int bjmin = Math.min(n, bj+blocksizeIJ);
-				//core wsigmoid computation
-				for( int i=bi; i<bimin; i++ ) {
-					double[] wvals = w.values(i), uvals = u.values(i), cvals = c.values(i);
-					int wix = w.pos(i), uix = u.pos(i);
-					for( int j=bj; j<bjmin; j++ ) {
-						double wij = wvals[wix+j];
-						if( wij != 0 )
-							cvals[wix+j] = wumm(wij, uvals, v.values(j),
-								uix, v.pos(j), flagmult, fn, cd);
-					}
-				}
-			}
-		}
-	}
-
-	private static void matrixMultWuMMSparseDense(MatrixBlock mW, MatrixBlock mU, MatrixBlock mV, MatrixBlock ret, WUMMType wt, ValueFunction fn, int rl, int ru) {
-		SparseBlock w = mW.sparseBlock;
-		SparseBlock c = ret.sparseBlock;
-		DenseBlock u = mU.getDenseBlock();
-		DenseBlock v = mV.getDenseBlock();
-		final int cd = mU.clen;
-		boolean flagmult = (wt==WUMMType.MULT);
-		
-		//approach: iterate over non-zeros of w, selective mm computation
-		for( int i=rl; i<ru; i++ ) {
-			if( w.isEmpty(i) ) continue;
-			int wpos = w.pos(i);
-			int wlen = w.size(i);
-			int[] wix = w.indexes(i);
-			double[] wvals = w.values(i);
-			double[] uvals = u.values(i);
-			int uix = u.pos(i);
-			c.allocate(i, wlen);
-			for( int k=wpos; k<wpos+wlen; k++ ) {
-				double cval = wumm(wvals[k], uvals, v.values(wix[k]),
-					uix, v.pos(wix[k]), flagmult, fn, cd);
-				c.append(i, wix[k], cval);
-			}
-		}
-	}
-
-	private static void matrixMultWuMMGeneric (MatrixBlock mW, MatrixBlock mU, MatrixBlock mV, MatrixBlock ret, WUMMType wt, ValueFunction fn, int rl, int ru) {
-		final int n = mW.clen;
-		final int cd = mU.clen;
-		boolean flagmult = (wt==WUMMType.MULT);
-		
-		//approach: iterate over non-zeros of w, selective mm computation
-		if( mW.sparse ) { //SPARSE
-			//w and c always in same representation
-			SparseBlock w = mW.sparseBlock;
-			SparseBlock c = ret.sparseBlock;
-			for( int i=rl; i<ru; i++ ) {
-				if( w.isEmpty(i) ) continue;
-				int wpos = w.pos(i);
-				int wlen = w.size(i);
-				int[] wix = w.indexes(i);
-				double[] wval = w.values(i);
-				c.allocate(i, wlen);
-				for( int k=wpos; k<wpos+wlen; k++ ) {
-					double cval = wumm(wval[k], mU, mV, i, wix[k], flagmult, fn, cd);
-					c.append(i, wix[k], cval);
-				}
-			}
-		}
-		else { //DENSE
-			//w and c always in same representation
-			DenseBlock w = mW.getDenseBlock();
-			DenseBlock c = ret.getDenseBlock();
-			for( int i=rl; i<ru; i++ ) {
-				double[] wvals = w.values(i), cvals = c.values(i);
-				int ix = w.pos(i);
-				for( int j=0; j<n; j++) {
-					double wij = wvals[ix+j];
-					if( wij != 0 )
-						cvals[ix+j] = wumm(wij, mU, mV, i, j, flagmult, fn, cd);
-				}
-			}
-		}
-	}
-	
-	////////////////////////////////////////////
-	// performance-relevant utility functions //
-	////////////////////////////////////////////
-
-	/**
-	 * @param a   first vector
-	 * @param b   second vector
-	 * @param len length
-	 * @return dot product of the two input vectors
-	 */
-	private static double dotProduct(double[] a, double[] b, final int len) {
-		double val = 0;
-
-		int i;
-		for (i = 0; i < SPECIES.loopBound(len); i += SPECIES.length()) {
-			//read 64B cachelines of a and b
-			//compute cval' = sum(a * b) + cval
-			var aVec = DoubleVector.fromArray(SPECIES, a, i);
-			var bVec = DoubleVector.fromArray(SPECIES, b, i);
-			val += aVec.mul(bVec).reduceLanes(VectorOperators.ADD);
-		}
-
-		//compute rest
-		for (; i < len; i++)
-			val += a[i] * b[i];
-
-		//scalar result
-		return val;
-	}
-
-	//note: public for use by codegen for consistency
-	public static double dotProduct(double[] a, double[] b, int ai, int bi, final int len) {
-		double val = 0;
-
-		int i;
-		for (i = 0; i < SPECIES.loopBound(len); ai += SPECIES.length(), bi += SPECIES.length(), i += SPECIES.length()) {
-			//read 64B cachelines of a and b
-			//compute cval' = sum(a * b) + cval
-			var aVec = DoubleVector.fromArray(SPECIES, a, ai);
-			var bVec = DoubleVector.fromArray(SPECIES, b, bi);
-			val += aVec.mul(bVec).reduceLanes(VectorOperators.ADD);
-		}
-
-		//compute rest
-		for (; i < len; ai++, bi++, i++)
-			val += a[ai] * b[bi];
-
-		//scalar result
-		return val;
-	}
-
-	//note: public for use by codegen for consistency
-	public static double dotProduct(double[] a, double[] b, int[] aix, int ai, final int bi, final int len) {
-		double val = 0;
-		final int bn = len % 8;
-
-		//compute rest
-		for (int i = ai; i < ai + bn; i++)
-			val += a[i] * b[bi + aix[i]];
-
-		int i;
-		for (i = ai; i < SPECIES.loopBound(len); i += SPECIES.length()) {
-			//read 64B cacheline of a
-			//read 64B of b via 'gather'
-			//compute cval' = sum(a * b) + cval
-			var aVec = DoubleVector.fromArray(SPECIES, a, i);
-			var bVec = DoubleVector.fromArray(SPECIES, b, bi, aix, i);
-			val += aVec.mul(bVec).reduceLanes(VectorOperators.ADD);
-		}
-
-		//compute rest
-		for (; i < len; i++)
-			val += a[i] * b[bi + aix[i]];
-
-		//scalar result
-		return val; 
-	}
-
-	//note: public for use by codegen for consistency
-	public static void vectMultiplyAdd( final double aval, double[] b, double[] c, int bi, int ci, final int len )
-	{
-		int j = 0;
-		var avalVec = DoubleVector.broadcast(SPECIES, aval);
-		for(; j < SPECIES.loopBound(len); j += SPECIES.length()) {
-			var res = DoubleVector.fromArray(SPECIES, c, ci + j);
-			var bVec = SPECIES.fromArray(b, bi + j);
-			res = avalVec.fma(bVec, res);
-			res.intoArray(c, ci + j);
-		}
-		for(; j < len; ++j) {
-			c[ci + j] += aval * b[bi + j];
-		}
-	}
-
-	private static void vectMultiplyAdd2(final double aval1, final double aval2, double[] b, double[] c, int bi1,
-		int bi2, int ci, final int len) {
-		int j = 0;
-		var aval1Vec = DoubleVector.broadcast(SPECIES, aval1);
-		var aval2Vec = DoubleVector.broadcast(SPECIES, aval2);
-		for(; j < SPECIES.loopBound(len); j += SPECIES.length()) {
-			var res = DoubleVector.fromArray(SPECIES, c, ci + j);
-			var b1Vec = SPECIES.fromArray(b, bi1 + j);
-			res = aval1Vec.fma(b1Vec, res);
-			var b2Vec = SPECIES.fromArray(b, bi2 + j);
-			res = aval2Vec.fma(b2Vec, res);
-			res.intoArray(c, ci + j);
-		}
-		for(; j < len; ++j) {
-			c[ci + j] += aval1 * b[bi1 + j] + aval2 * b[bi2 + j];
-		}
-	}
-
-	private static void vectMultiplyAdd3( final double aval1, final double aval2, final double aval3, double[] b, double[] c, int bi1, int bi2, int bi3, int ci, final int len )
-	{
-		int j = 0;
-		var aval1Vec = DoubleVector.broadcast(SPECIES, aval1);
-		var aval2Vec = DoubleVector.broadcast(SPECIES, aval2);
-		var aval3Vec = DoubleVector.broadcast(SPECIES, aval3);
-		for(; j < SPECIES.loopBound(len); j += SPECIES.length()) {
-			var res = DoubleVector.fromArray(SPECIES, c, ci + j);
-			var b1Vec = SPECIES.fromArray(b, bi1 + j);
-			res = aval1Vec.fma(b1Vec, res);
-			var b2Vec = SPECIES.fromArray(b, bi2 + j);
-			res = aval2Vec.fma(b2Vec, res);
-			var b3Vec = SPECIES.fromArray(b, bi3 + j);
-			res = aval3Vec.fma(b3Vec, res);
-			res.intoArray(c, ci + j);
-		}
-		for(; j < len; ++j) {
-			c[ci + j] += aval1 * b[bi1 + j] + aval2 * b[bi2 + j] + aval3 * b[bi3 + j];
-		}
-	}
-
-	private static void vectMultiplyAdd4( final double aval1, final double aval2, final double aval3, final double aval4, double[] b, double[] c, int bi1, int bi2, int bi3, int bi4, int ci, final int len )
-	{
-		int j = 0;
-		var aval1Vec = DoubleVector.broadcast(SPECIES, aval1);
-		var aval2Vec = DoubleVector.broadcast(SPECIES, aval2);
-		var aval3Vec = DoubleVector.broadcast(SPECIES, aval3);
-		var aval4Vec = DoubleVector.broadcast(SPECIES, aval4);
-		for(; j < SPECIES.loopBound(len); j += SPECIES.length()) {
-			var res = DoubleVector.fromArray(SPECIES, c, ci + j);
-			var b1Vec = SPECIES.fromArray(b, bi1 + j);
-			res = aval1Vec.fma(b1Vec, res);
-			var b2Vec = SPECIES.fromArray(b, bi2 + j);
-			res = aval2Vec.fma(b2Vec, res);
-			var b3Vec = SPECIES.fromArray(b, bi3 + j);
-			res = aval3Vec.fma(b3Vec, res);
-			var b4Vec = SPECIES.fromArray(b, bi4 + j);
-			res = aval4Vec.fma(b4Vec, res);
-			res.intoArray(c, ci + j);
-		}
-		for(; j < len; ++j) {
-			c[ci + j] += aval1 * b[bi1 + j] + aval2 * b[bi2 + j] + aval3 * b[bi3 + j] + aval4 * b[bi4 + j];
-		}
-	}
-	
-	@SuppressWarnings("unused")
-	private static void vectMultiplyAdd( final double aval, double[] b, double[] c, int[] bix, final int ci, final int len )
-	{
-		final int bn = len%8;
-		
-		//rest, not aligned to 8-blocks
-		for( int j = 0; j < bn; j++ )
-			c[ ci + bix[j] ] += aval * b[ j ];
-		
-		//unrolled 8-block (for better instruction-level parallelism)
-		for( int j = bn; j < len; j+=8 )
-		{
-			//read 64B cacheline of b
-			//read 64B of c via 'gather'
-			//compute c' = aval * b + c
-			//write back 64B of c = c' via 'scatter'
-			c[ ci+bix[j+0] ] += aval * b[ j+0 ];
-			c[ ci+bix[j+1] ] += aval * b[ j+1 ];
-			c[ ci+bix[j+2] ] += aval * b[ j+2 ];
-			c[ ci+bix[j+3] ] += aval * b[ j+3 ];
-			c[ ci+bix[j+4] ] += aval * b[ j+4 ];
-			c[ ci+bix[j+5] ] += aval * b[ j+5 ];
-			c[ ci+bix[j+6] ] += aval * b[ j+6 ];
-			c[ ci+bix[j+7] ] += aval * b[ j+7 ];
-		}
-	}
-
-	//note: public for use by codegen for consistency
-	public static void vectMultiplyAdd( final double aval, double[] b, double[] c, int[] bix, final int bi, final int ci, final int len )
-	{
-		final int bn = len%8;
-		
-		//rest, not aligned to 8-blocks
-		for( int j = bi; j < bi+bn; j++ )
-			c[ ci + bix[j] ] += aval * b[ j ];
-		
-		//unrolled 8-block (for better instruction-level parallelism)
-		for( int j = bi+bn; j < bi+len; j+=8 )
-		{
-			//read 64B cacheline of b
-			//read 64B of c via 'gather'
-			//compute c' = aval * b + c
-			//write back 64B of c = c' via 'scatter'
-			c[ ci+bix[j+0] ] += aval * b[ j+0 ];
-			c[ ci+bix[j+1] ] += aval * b[ j+1 ];
-			c[ ci+bix[j+2] ] += aval * b[ j+2 ];
-			c[ ci+bix[j+3] ] += aval * b[ j+3 ];
-			c[ ci+bix[j+4] ] += aval * b[ j+4 ];
-			c[ ci+bix[j+5] ] += aval * b[ j+5 ];
-			c[ ci+bix[j+6] ] += aval * b[ j+6 ];
-			c[ ci+bix[j+7] ] += aval * b[ j+7 ];
-		}
-	}
-
-	//note: public for use by codegen for consistency
-	public static void vectMultiplyWrite( final double aval, double[] b, double[] c, int bi, int ci, final int len )
-	{
-		final int bn = len%8;
-		
-		//rest, not aligned to 8-blocks
-		for( int j = 0; j < bn; j++, bi++, ci++)
-			c[ ci ] = aval * b[ bi ];
-		
-		//unrolled 8-block  (for better instruction-level parallelism)
-		for( int j = bn; j < len; j+=8, bi+=8, ci+=8) 
-		{
-			//read 64B cachelines of b and c
-			//compute c' = aval * b + c
-			//write back 64B cacheline of c = c'
-			c[ ci+0 ] = aval * b[ bi+0 ];
-			c[ ci+1 ] = aval * b[ bi+1 ];
-			c[ ci+2 ] = aval * b[ bi+2 ];
-			c[ ci+3 ] = aval * b[ bi+3 ];
-			c[ ci+4 ] = aval * b[ bi+4 ];
-			c[ ci+5 ] = aval * b[ bi+5 ];
-			c[ ci+6 ] = aval * b[ bi+6 ];
-			c[ ci+7 ] = aval * b[ bi+7 ];
-		}
-	}
-	
-	public static void vectMultiplyInPlace( final double aval, double[] c, int ci, final int len ) {
-		final int bn = len%8;
-		//rest, not aligned to 8-blocks
-		for( int j = 0; j < bn; j++, ci++)
-			c[ ci ] *= aval;
-		//unrolled 8-block  (for better instruction-level parallelism)
-		for( int j = bn; j < len; j+=8, ci+=8) {
-			c[ ci+0 ] *= aval; c[ ci+1 ] *= aval;
-			c[ ci+2 ] *= aval; c[ ci+3 ] *= aval;
-			c[ ci+4 ] *= aval; c[ ci+5 ] *= aval;
-			c[ ci+6 ] *= aval; c[ ci+7 ] *= aval;
-		}
-	}
-
-	//note: public for use by codegen for consistency
-	public static void vectMultiplyWrite( double[] a, double[] b, double[] c, int ai, int bi, int ci, final int len )
-	{
-		final int bn = len%8;
-		
-		//rest, not aligned to 8-blocks
-		for( int j = 0; j < bn; j++, ai++, bi++, ci++)
-			c[ ci ] = a[ ai ] * b[ bi ];
-		
-		//unrolled 8-block  (for better instruction-level parallelism)
-		for( int j = bn; j < len; j+=8, ai+=8, bi+=8, ci+=8) 
-		{
-			//read 64B cachelines of a and b
-			//compute c' = a * b
-			//write back 64B cacheline of c = c'
-			c[ ci+0 ] = a[ ai+0 ] * b[ bi+0 ];
-			c[ ci+1 ] = a[ ai+1 ] * b[ bi+1 ];
-			c[ ci+2 ] = a[ ai+2 ] * b[ bi+2 ];
-			c[ ci+3 ] = a[ ai+3 ] * b[ bi+3 ];
-			c[ ci+4 ] = a[ ai+4 ] * b[ bi+4 ];
-			c[ ci+5 ] = a[ ai+5 ] * b[ bi+5 ];
-			c[ ci+6 ] = a[ ai+6 ] * b[ bi+6 ];
-			c[ ci+7 ] = a[ ai+7 ] * b[ bi+7 ];
-		}
-	}
-	
-	public static void vectMultiplyWrite( final double[] a, double[] b, double[] c, int[] bix, final int ai, final int bi, final int ci, final int len ) {
-		final int bn = len%8;
-		//rest, not aligned to 8-blocks
-		for( int j = bi; j < bi+bn; j++ )
-			c[ ci+bix[j] ] = a[ ai+bix[j] ] * b[ j ];
-		//unrolled 8-block (for better instruction-level parallelism)
-		for( int j = bi+bn; j < bi+len; j+=8 ) {
-			c[ ci+bix[j+0] ] = a[ ai+bix[j+0] ] * b[ j+0 ];
-			c[ ci+bix[j+1] ] = a[ ai+bix[j+1] ] * b[ j+1 ];
-			c[ ci+bix[j+2] ] = a[ ai+bix[j+2] ] * b[ j+2 ];
-			c[ ci+bix[j+3] ] = a[ ai+bix[j+3] ] * b[ j+3 ];
-			c[ ci+bix[j+4] ] = a[ ai+bix[j+4] ] * b[ j+4 ];
-			c[ ci+bix[j+5] ] = a[ ai+bix[j+5] ] * b[ j+5 ];
-			c[ ci+bix[j+6] ] = a[ ai+bix[j+6] ] * b[ j+6 ];
-			c[ ci+bix[j+7] ] = a[ ai+bix[j+7] ] * b[ j+7 ];
-		}
-	}
-
-	private static void vectMultiply( double[] a, double[] c, int ai, int ci, final int len )
-	{
-		final int bn = len%8;
-		
-		//rest, not aligned to 8-blocks
-		for( int j = 0; j < bn; j++, ai++, ci++)
-			c[ ci ] *= a[ ai ];
-		
-		//unrolled 8-block  (for better instruction-level parallelism)
-		for( int j = bn; j < len; j+=8, ai+=8, ci+=8) 
-		{
-			//read 64B cachelines of a and c
-			//compute c' = c * a
-			//write back 64B cacheline of c = c'
-			c[ ci+0 ] *= a[ ai+0 ];
-			c[ ci+1 ] *= a[ ai+1 ];
-			c[ ci+2 ] *= a[ ai+2 ];
-			c[ ci+3 ] *= a[ ai+3 ];
-			c[ ci+4 ] *= a[ ai+4 ];
-			c[ ci+5 ] *= a[ ai+5 ];
-			c[ ci+6 ] *= a[ ai+6 ];
-			c[ ci+7 ] *= a[ ai+7 ];
-		}
-	}
-
-	//note: public for use by codegen for consistency
-	public static void vectAdd( double[] a, double bval, double[] c, int ai, int ci, final int len ) {
-		final int bn = len%8;
-		//rest, not aligned to 8-blocks
-		for( int j = 0; j < bn; j++, ai++, ci++)
-			c[ ci ] += a[ ai ];
-		//unrolled 8-block  (for better ILP)
-		for( int j = bn; j < len; j+=8, ai+=8, ci+=8) {
-			c[ ci+0 ] += a[ ai+0 ] + bval;
-			c[ ci+1 ] += a[ ai+1 ] + bval;
-			c[ ci+2 ] += a[ ai+2 ] + bval;
-			c[ ci+3 ] += a[ ai+3 ] + bval;
-			c[ ci+4 ] += a[ ai+4 ] + bval;
-			c[ ci+5 ] += a[ ai+5 ] + bval;
-			c[ ci+6 ] += a[ ai+6 ] + bval;
-			c[ ci+7 ] += a[ ai+7 ] + bval;
-		}
-	}
-	
-	//note: public for use by codegen for consistency
-	public static void vectAdd( double[] a, double[] c, int ai, int ci, final int len )
-	{
-		final int bn = len%8;
-		
-		//rest, not aligned to 8-blocks
-		for( int j = 0; j < bn; j++, ai++, ci++)
-			c[ ci ] += a[ ai ];
-		
-		//unrolled 8-block  (for better instruction-level parallelism)
-		for( int j = bn; j < len; j+=8, ai+=8, ci+=8) 
-		{
-			//read 64B cachelines of a and c
-			//compute c' = c * a
-			//write back 64B cacheline of c = c'
-			c[ ci+0 ] += a[ ai+0 ];
-			c[ ci+1 ] += a[ ai+1 ];
-			c[ ci+2 ] += a[ ai+2 ];
-			c[ ci+3 ] += a[ ai+3 ];
-			c[ ci+4 ] += a[ ai+4 ];
-			c[ ci+5 ] += a[ ai+5 ];
-			c[ ci+6 ] += a[ ai+6 ];
-			c[ ci+7 ] += a[ ai+7 ];
-		}
-	}
-
-	public static void vectAdd( double[] a, double[] c, int[] aix, int ai, int ci, final int alen ) {
-		final int bn = alen%8;
-		//rest, not aligned to 8-blocks
-		for( int j = ai; j < ai+bn; j++ )
-			c[ ci+aix[j] ] += a[ j ];
-		//unrolled 8-block  (for better instruction-level parallelism)
-		for( int j = ai+bn; j < ai+alen; j+=8 ) {
-			c[ ci+aix[j+0] ] += a[ j+0 ];
-			c[ ci+aix[j+1] ] += a[ j+1 ];
-			c[ ci+aix[j+2] ] += a[ j+2 ];
-			c[ ci+aix[j+3] ] += a[ j+3 ];
-			c[ ci+aix[j+4] ] += a[ j+4 ];
-			c[ ci+aix[j+5] ] += a[ j+5 ];
-			c[ ci+aix[j+6] ] += a[ j+6 ];
-			c[ ci+aix[j+7] ] += a[ j+7 ];
-		}
-	}
-	
-	private static void vectAdd4( double[] a1, double[] a2, double[] a3, double[] a4, double[] c, int ai, int ci, final int len )
-	{
-		final int bn = len%8;
-		
-		//rest, not aligned to 8-blocks
-		for( int j = 0; j < bn; j++, ai++, ci++)
-			c[ ci ] += a1[ ai ] + a2[ ai ] + a3[ ai ] + a4[ ai ];
-		
-		//unrolled 8-block  (for better instruction-level parallelism)
-		for( int j = bn; j < len; j+=8, ai+=8, ci+=8) 
-		{
-			//read 64B cachelines of a (4x) and c
-			//compute c' = c + a1 + a2 + a3 + a4
-			//write back 64B cacheline of c = c'
-			c[ ci+0 ] += a1[ ai+0 ] + a2[ ai+0 ] + a3[ ai+0 ] + a4[ ai+0 ];
-			c[ ci+1 ] += a1[ ai+1 ] + a2[ ai+1 ] + a3[ ai+1 ] + a4[ ai+1 ];
-			c[ ci+2 ] += a1[ ai+2 ] + a2[ ai+2 ] + a3[ ai+2 ] + a4[ ai+2 ];
-			c[ ci+3 ] += a1[ ai+3 ] + a2[ ai+3 ] + a3[ ai+3 ] + a4[ ai+3 ];
-			c[ ci+4 ] += a1[ ai+4 ] + a2[ ai+4 ] + a3[ ai+4 ] + a4[ ai+4 ];
-			c[ ci+5 ] += a1[ ai+5 ] + a2[ ai+5 ] + a3[ ai+5 ] + a4[ ai+5 ];
-			c[ ci+6 ] += a1[ ai+6 ] + a2[ ai+6 ] + a3[ ai+6 ] + a4[ ai+6 ];
-			c[ ci+7 ] += a1[ ai+7 ] + a2[ ai+7 ] + a3[ ai+7 ] + a4[ ai+7 ];
-		}
-	}
-	
-	private static void vectAddAll(double[][] a, double[] c, int ai, int ci, final int len) {
-		int bi = a.length % 4;
-		//process stride for remaining blocks
-		for(int i=0; i<bi; i++)
-			vectAdd(a[i], c, ai, ci, len);
-		//process stride in 4 blocks at a time
-		for(int i=bi; i<a.length; i+=4)
-			vectAdd4(a[i], a[i+1], a[i+2], a[i+3], c, ai, ci, len);
-	}
-	
-	public static void vectAddInPlace(double aval, double[] c, final int ci, final int len) {
-		final int bn = len%8;
-		//rest, not aligned to 8-blocks
-		for( int j = ci; j < ci+bn; j++)
-			c[ j ] += aval;
-		//unrolled 8-block  (for better instruction-level parallelism)
-		for( int j = ci+bn; j < ci+len; j+=8) {
-			c[ j+0 ] += aval; c[ j+1 ] += aval; 
-			c[ j+2 ] += aval; c[ j+3 ] += aval;
-			c[ j+4 ] += aval; c[ j+5 ] += aval;
-			c[ j+6 ] += aval; c[ j+7 ] += aval;
-		}
-	}
-
-	private static void vectSubtract( double[] a, double[] c, int ai, int ci, final int len )
-	{
-		final int bn = len%8;
-		
-		//rest, not aligned to 8-blocks
-		for( int j = 0; j < bn; j++, ai++, ci++)
-			c[ ci ] -= a[ ai ];
-		
-		//unrolled 8-block  (for better instruction-level parallelism)
-		for( int j = bn; j < len; j+=8, ai+=8, ci+=8) 
-		{
-			//read 64B cachelines of a and c
-			//compute c' = c * a
-			//write back 64B cacheline of c = c'
-			c[ ci+0 ] -= a[ ai+0 ];
-			c[ ci+1 ] -= a[ ai+1 ];
-			c[ ci+2 ] -= a[ ai+2 ];
-			c[ ci+3 ] -= a[ ai+3 ];
-			c[ ci+4 ] -= a[ ai+4 ];
-			c[ ci+5 ] -= a[ ai+5 ];
-			c[ ci+6 ] -= a[ ai+6 ];
-			c[ ci+7 ] -= a[ ai+7 ];
-		}
-	}
-
-	private static double wsigmoid( final double wij, double[] u, double[] v, final int uix, final int vix, final boolean flagminus, final boolean flaglog, final int len )
-	{
-		//compute dot product over ui vj 
-		double uvij = dotProduct(u, v, uix, vix, len);
-		
-		//compute core sigmoid function  
-		double cval = flagminus ?
-				1 / (1 + FastMath.exp(uvij)) :
-				1 / (1 + FastMath.exp(-uvij));
-				
-		//compute weighted output
-		return wij * ((flaglog) ? Math.log(cval) : cval);
-	}
-
-	private static double wsigmoid( final double wij, MatrixBlock u, MatrixBlock v, final int uix, final int vix, final boolean flagminus, final boolean flaglog, final int len )
-	{
-		//compute dot product over ui vj 
-		double uvij = dotProductGeneric(u, v, uix, vix, len);
-		
-		//compute core sigmoid function  
-		double cval = flagminus ?
-				1 / (1 + FastMath.exp(uvij)) :
-				1 / (1 + FastMath.exp(-uvij));
-				
-		//compute weighted output
-		return wij * ((flaglog) ? Math.log(cval) : cval);
-	}
-	
-	private static void wdivmm( final double wij, double[] u, double[] v, double[] c, final int uix, final int vix, final boolean left, final boolean mult, final boolean minus, final int len )
-	{
-		//compute dot product over ui vj
-		double uvij = dotProduct(u, v, uix, vix, len);
-		
-		//compute core wdivmm  
-		double tmpval = minus ? uvij - wij :
-			mult ? wij * uvij : wij / uvij;
-		
-		//prepare inputs for final mm
-		int bix = left ? uix : vix;
-		int cix = left ? vix : uix;
-		double[] b = left ? u : v;
-		
-		//compute final mm output
-		vectMultiplyAdd(tmpval, b, c, bix, cix, len);
-	}
-
-	private static void wdivmm( final double wij, final double xij, double[] u, double[] v, double[] c, final int uix, final int vix, final boolean left, final boolean scalar, final int len )
-	{
-		//compute dot product over ui vj 
-		double uvij = dotProduct(u, v, uix, vix, len);
-		
-		//compute core wdivmm  
-		double tmpval = scalar ? wij / (uvij + xij) : wij * (uvij - xij);
-		
-		//prepare inputs for final mm
-		int bix = left ? uix : vix;
-		int cix = left ? vix : uix;
-		double[] b = left ? u : v;
-		
-		//compute final mm output
-		vectMultiplyAdd(tmpval, b, c, bix, cix, len);
-	}
-
-	private static void wdivmm( final double wij, MatrixBlock u, MatrixBlock v, double[] c, final int uix, final int vix, final boolean left, boolean mult, final boolean minus, final int len )
-	{
-		//compute dot product over ui vj 
-		double uvij = dotProductGeneric(u, v, uix, vix, len);
-		
-		//compute core wdivmm
-		double wtmp = minus ? uvij - wij :
-			mult ? wij * uvij : wij / uvij;
-		
-		//prepare inputs for final mm
-		int bix = left ? uix : vix;
-		int cix = left ? vix*len : uix*len;
-		MatrixBlock b = left ? u : v;
-		
-		//compute final mm
-		for( int k2=0; k2<len; k2++ )
-			c[cix+k2] += b.quickGetValue(bix, k2) * wtmp;
-	}
-
-	private static void wdivmm( final double wij, final double xij, MatrixBlock u, MatrixBlock v, double[] c, final int uix, final int vix, final boolean left, final boolean scalar, final int len )
-	{
-		//compute dot product over ui vj 
-		double uvij = dotProductGeneric(u, v, uix, vix, len);
-		
-		//compute core wdivmm
-		double wtmp = scalar ? wij / (uvij + xij) : wij * (uvij - xij);
-		
-		//prepare inputs for final mm
-		int bix = left ? uix : vix;
-		int cix = left ? vix*len : uix*len;
-		MatrixBlock b = left ? u : v;
-		
-		//compute final mm
-		for( int k2=0; k2<len; k2++ )
-			c[cix+k2] += b.quickGetValue(bix, k2) * wtmp;
-	}
-
-	private static double wumm( final double wij, double[] u, double[] v, final int uix, final int vix, final boolean flagmult, ValueFunction fn, final int len ) {
-		//compute dot product over ui vj 
-		double uvij = dotProduct(u, v, uix, vix, len);
-		
-		//compute unary operations
-		double cval = fn.execute(uvij);
-		
-		//compute weighted output
-		return flagmult ? wij * cval : wij / cval;
-	}
-
-	private static double wumm( final double wij, MatrixBlock u, MatrixBlock v, final int uix, final int vix, final boolean flagmult, ValueFunction fn, final int len ) {
-		//compute dot product over ui vj 
-		double uvij = dotProductGeneric(u, v, uix, vix, len);
-
-		//compute unary operations
-		double cval = fn.execute(uvij);
-		
-		//compute weighted output
-		return flagmult ? wij * cval : wij / cval;
-	}
-
-	private static double dotProductGeneric(MatrixBlock a, MatrixBlock b, final int ai, final int bi, int len)
-	{
-		double val = 0;
-		for( int k2=0; k2<len; k2++ )
-			val += a.quickGetValue(ai, k2) * b.quickGetValue(bi, k2);
-		
-		return val;
-	}
-	
-	private static double dotProductGeneric(MatrixBlock a, MatrixBlock b)
-	{
-		double val = 0;
-		for( int i=0; i<a.getNumRows(); i++ )
-			for( int j=0; j<a.getNumColumns(); j++ )
-				val += a.quickGetValue(i, j) * b.quickGetValue(i, j);
-		
-		return val;
-	}
-	
-	/**
-	 * Used for all version of TSMM where the result is known to be symmetric.
-	 * Hence, we compute only the upper triangular matrix and copy this partial
-	 * result down to lower triangular matrix once.
-	 * 
-	 * @param ret matrix
-	 * @return number of non zeros
-	 */
-	public static long copyUpperToLowerTriangle( MatrixBlock ret )
-	{
-		//ret is guaranteed to be a squared, symmetric matrix
-		if( ret.rlen != ret.clen )
-			throw new RuntimeException("Invalid non-squared input matrix.");
-		
-		final double[] c = ret.getDenseBlockValues();
-		final int n = ret.rlen;
-		long nnz = 0;
-		
-		//blocked execution (2x128KB for L2 blocking)
-		final int blocksizeIJ = 128; 
-		
-		//handle blocks on diagonal
-		for( int bi = 0; bi<n; bi+=blocksizeIJ ) {
-			int bimin = Math.min(bi+blocksizeIJ, n);
-			for( int i=bi, rix=bi*n; i<bimin; i++, rix+=n ) {
-				LibMatrixReorg.transposeRow(c, c, rix+bi, bi*n+i, n, bimin-bi);
-				nnz += (c[rix+i] != 0) ? 1 : 0; //for diagonal element
-				for( int j=rix+i+1; j<rix+bimin; j++ )
-					nnz += (c[j] != 0) ? 2 : 0;
-			}
-		}
-		
-		//handle non-diagonal blocks (full block copies)
-		for( int bi = 0; bi<n; bi+=blocksizeIJ ) {
-			int bimin = Math.min(bi+blocksizeIJ, n);
-			for( int bj = bi; bj<n; bj+=blocksizeIJ ) 
-				if( bi != bj ) { //not on diagonal
-					int bjmin = Math.min(bj+blocksizeIJ, n);
-					for( int i=bi, rix=bi*n; i<bimin; i++, rix+=n ) {
-						LibMatrixReorg.transposeRow(c, c, rix+bj, bj*n+i, n, bjmin-bj);
-						for( int j=rix+bj; j<rix+bjmin; j++ )
-							nnz += (c[j] != 0) ? 2 : 0;
-					}
-				}
-		}
-		
-		return nnz;
-	}
-
-	public static MatrixBlock prepMatrixMultTransposeSelfInput( MatrixBlock m1, boolean leftTranspose, boolean par ) {
-		MatrixBlock ret = m1;
-		final int rlen = m1.rlen;
-		final int clen = m1.clen;
-		
-		if( !leftTranspose && m1.sparse && rlen > 1) { //X%*%t(X) SPARSE MATRIX
-			//directly via LibMatrixReorg in order to prevent sparsity change
-			MatrixBlock tmpBlock = new MatrixBlock(clen, rlen, m1.sparse);
-			LibMatrixReorg.reorg(m1, tmpBlock, new ReorgOperator(SwapIndex.getSwapIndexFnObject()));
-			ret = tmpBlock;
-		}
-		else if( leftTranspose && m1.sparse && m1.sparseBlock instanceof SparseBlockCSR ) {
-			//for a special case of CSR inputs where all non-empty rows are dense, we can
-			//create a shallow copy of the values arrays to a "dense" block and perform
-			//tsmm with the existing dense block operations w/o unnecessary gather/scatter
-			SparseBlockCSR sblock = (SparseBlockCSR)m1.sparseBlock;
-			boolean convertDense = (par ?
-				IntStream.range(0, rlen).parallel() : IntStream.range(0, rlen))
-				.allMatch(i -> sblock.isEmpty(i) || sblock.size(i)==clen );
-			if( convertDense ) {
-				int rows = (int) sblock.size() / clen;
-				MatrixBlock tmpBlock = new MatrixBlock(rows, clen, false);
-				tmpBlock.denseBlock = DenseBlockFactory
-					.createDenseBlock(sblock.values(), rows, clen);
-				tmpBlock.setNonZeros(m1.nonZeros);
-				ret = tmpBlock;
-			}
-		}
-		
-		return ret;
-	}
-
-	private static boolean checkPrepMatrixMultRightInput( MatrixBlock m1, MatrixBlock m2 ) {
-		//transpose if dense-dense, skinny rhs matrix (not vector), and memory guarded by output 
-		return (LOW_LEVEL_OPTIMIZATION && !m1.sparse && !m2.sparse 
-			&& isSkinnyRightHandSide(m1.rlen, m1.clen, m2.rlen, m2.clen, true));
-	}
-	
-	//note: public for use by codegen for consistency
-	public static boolean isSkinnyRightHandSide(long m1rlen, long m1clen, long m2rlen, long m2clen, boolean inclCacheSize) {
-		return m1rlen > m2clen && m2rlen > m2clen && m2clen > 1 
-			&& m2clen < 64 && (!inclCacheSize || 8*m2rlen*m2clen < L2_CACHESIZE);
-	}
-	
-	private static boolean checkParMatrixMultRightInputRows( MatrixBlock m1, MatrixBlock m2, int k ) {
-		//parallelize over rows in rhs matrix if number of rows in lhs/output is very small
-		double jvmMem = InfrastructureAnalyzer.getLocalMaxMemory();
-		return (m1.rlen==1 && LOW_LEVEL_OPTIMIZATION && m2.clen>1 && !(m1.isUltraSparse()||m2.isUltraSparse()))
-			|| (m1.rlen<=16 && LOW_LEVEL_OPTIMIZATION && m2.clen>1 && m2.rlen > m1.rlen 
-			   && ( !m1.isUltraSparse() && !(m1.sparse & m2.sparse) ) //dense-dense / sparse-dense / dense-sparse
-			   && (long)k * 8 * m1.rlen * m2.clen < Math.max(MEM_OVERHEAD_THRESHOLD,0.01*jvmMem) );
-	}
-
-	private static boolean checkParMatrixMultRightInputCols( MatrixBlock m1, MatrixBlock m2, int k, boolean pm2r ) {
-		//parallelize over cols in rhs matrix if dense, number of cols in rhs is large, and lhs fits in l2
-		return (LOW_LEVEL_OPTIMIZATION && !m1.sparse && !m2.sparse 
-				&& m2.clen > k * 1024 && m1.rlen < k * 32 && !pm2r
-				&& 8*m1.rlen*m1.clen < 256*1024 ); //lhs fits in L2 cache
-	}
-	
-	public static boolean satisfiesMultiThreadingConstraints(MatrixBlock m1, int k) {
-		return satisfiesMultiThreadingConstraints(m1, true, false, -1, k);
-	}
-	
-	public static boolean satisfiesMultiThreadingConstraints(MatrixBlock m1, boolean checkMem, boolean checkFLOPs, long FPfactor, int k) {
-		boolean sharedTP = (InfrastructureAnalyzer.getLocalParallelism() == k);
-		double jvmMem = InfrastructureAnalyzer.getLocalMaxMemory();
-		return k > 1 && LOW_LEVEL_OPTIMIZATION
-			&& (!checkMem || 8L * m1.clen * k < Math.max(MEM_OVERHEAD_THRESHOLD,0.01*jvmMem))
-			&& (!checkFLOPs || FPfactor * m1.rlen * m1.clen >
-			(sharedTP ? PAR_MINFLOP_THRESHOLD2 : PAR_MINFLOP_THRESHOLD1));
-	}
-	
-	public static boolean satisfiesMultiThreadingConstraints(MatrixBlock m1, MatrixBlock m2, boolean checkMem, boolean checkFLOPs, long FPfactor, int k) {
-		boolean sharedTP = (InfrastructureAnalyzer.getLocalParallelism() == k);
-		double jvmMem = InfrastructureAnalyzer.getLocalMaxMemory();
-		return k > 1 && LOW_LEVEL_OPTIMIZATION
-			&& (!checkMem || 8L * m2.clen * k < Math.max(MEM_OVERHEAD_THRESHOLD,0.01*jvmMem))
-			//note: cast to double to avoid long overflows on ultra-sparse matrices
-			//due to FLOP computation based on number of cells not non-zeros
-			&& (!checkFLOPs || (double)FPfactor * m1.rlen * m1.clen * m2.clen >
-			(sharedTP ? PAR_MINFLOP_THRESHOLD2 : PAR_MINFLOP_THRESHOLD1));
-	}
-	
-	private static boolean satisfiesMultiThreadingConstraintsTSMM(MatrixBlock m1, boolean leftTranspose, long FPfactor, int k) {
-		boolean sharedTP = (InfrastructureAnalyzer.getLocalParallelism() == k);
-		double threshold = sharedTP ? PAR_MINFLOP_THRESHOLD2 : PAR_MINFLOP_THRESHOLD1;
-		return k > 1 && LOW_LEVEL_OPTIMIZATION && (leftTranspose?m1.clen:m1.rlen)!=1
-			&& ((leftTranspose && FPfactor * m1.rlen * m1.clen * m1.clen > threshold)
-			||(!leftTranspose && FPfactor * m1.clen * m1.rlen * m1.rlen > threshold));
-	}
-	
-	public static boolean isUltraSparseMatrixMult(MatrixBlock m1, MatrixBlock m2, boolean m1Perm) {
-		if( m2.clen == 1 ) //mv always dense
-			return false;
-		//note: ultra-sparse matrix mult implies also sparse outputs, hence we need
-		//to be conservative an cannot use this for all ultra-sparse matrices.
-		double outSp = OptimizerUtils.getMatMultSparsity(
-			m1.getSparsity(), m2.getSparsity(), m1.rlen, m1.clen, m2.clen, true);
-		return (m1.isUltraSparse() || m2.isUltraSparse()) //base case
-			|| (m1.isUltraSparse(false) && m1 == m2) //ultra-sparse self product
-			|| (m1Perm && OptimizerUtils.getSparsity(m2.rlen, m2.clen, m2.nonZeros)<1.0)
-			|| ((m1.isUltraSparse(false) || m2.isUltraSparse(false)) 
-				&& outSp < MatrixBlock.ULTRA_SPARSITY_TURN_POINT2)
-			|| (m1.isInSparseFormat() // otherwise no matching branch
-				&& m1.getSparsity() < MatrixBlock.ULTRA_SPARSITY_TURN_POINT2
-				&& m1.getNonZeros() < MatrixBlock.ULTRA_SPARSE_BLOCK_NNZ
-				&& m1.getLength()+m2.getLength() < (long)m1.rlen*m2.clen
-				&& outSp < MatrixBlock.SPARSITY_TURN_POINT);
-	}
-	
-	public static boolean isSparseOutputMatrixMult(MatrixBlock m1, MatrixBlock m2) {
-		//output is a matrix (not vector), very likely sparse, and output rows fit into L1 cache
-		if( !(m1.sparse && m2.sparse && m1.rlen > 1 && m2.clen > 1) )
-			return false;
-		double estSp = OptimizerUtils.getMatMultSparsity(
-			m1.getSparsity(), m2.getSparsity(), m1.rlen, m1.clen, m2.clen, false);
-		long estNnz = (long)(estSp * m1.rlen * m2.clen);
-		boolean sparseOut = MatrixBlock.evalSparseFormatInMemory(m1.rlen, m2.clen, estNnz);
-		return m2.clen < 4*1024 && sparseOut;
-	}
-
-	public static boolean isOuterProductTSMM(int rlen, int clen, boolean left) {
-		return left ? rlen == 1 & clen > 1 : rlen > 1 & clen == 1;
-	}
-
-	private static MatrixBlock prepMatrixMultRightInput( MatrixBlock m1, MatrixBlock m2, boolean tm2 ) {
-		MatrixBlock ret = m2;
-		
-		//transpose if dense-dense, skinny rhs matrix (not vector), and memory guarded by output 
-		if( tm2 ) {
-			MatrixBlock tmpBlock = new MatrixBlock(m2.clen, m2.rlen, m2.sparse);
-			ret = LibMatrixReorg.reorg(m2, tmpBlock, new ReorgOperator(SwapIndex.getSwapIndexFnObject()));
-		}
-		
-		return ret;
-	}
-
-	//cp non-zeros for dense-dense mm
-	private static int copyNonZeroElements( double[] a, final int aixi, final int bixk, final int n, double[] tmpa, int[] tmpbi, final int bklen ) {
-		int knnz = 0;
-		for( int k = 0; k < bklen; k++ )
-			if( a[ aixi+k ] != 0 ) {
-				tmpa[ knnz ] = a[ aixi+k ];
-				tmpbi[ knnz ] = bixk + k*n;
-				knnz ++;
-			}
-		return knnz;
-	}
-
-	//cp non-zeros for dense tsmm
-	private static int copyNonZeroElements( double[] a, int aixi, int bixk, final int n, final int nx, double[] tmpa, int[] tmpbi, final int bklen ) {
-		int knnz = 0;
-		for( int k = 0; k < bklen; k++, aixi+=n, bixk+=nx )
-			if( a[ aixi ] != 0 ) {
-				tmpa[ knnz ] = a[ aixi ];
-				tmpbi[ knnz ] = bixk;
-				knnz ++;
-			}
-		return knnz;
-	}
-	
-	@SuppressWarnings("unused")
-	private static void compactSparseOutput(MatrixBlock ret) {
-		if( !ret.sparse || ret.nonZeros > ret.rlen || ret.isEmpty() 
-			|| ret.getSparseBlock() instanceof SparseBlockCSR )
-			return; //early abort
-		ret.sparseBlock = SparseBlockFactory
-			.copySparseBlock(Type.CSR, ret.sparseBlock, false);
-	}
-	
-	@SuppressWarnings("unused")
-	private static void resetPosVect(int[] curk, SparseBlock sblock, int rl, int ru) {
-		if( sblock instanceof SparseBlockMCSR ) {
-			//all rows start at position 0 (individual arrays)
-			Arrays.fill(curk, 0, ru-rl, 0);
-		}
-		else if( sblock instanceof SparseBlockCSR ) {
-			//row start positions given in row ptr array
-			SparseBlockCSR csr = (SparseBlockCSR) sblock;
-			System.arraycopy(csr.rowPointers(), rl, curk, 0, ru-rl);
-		}
-		else { //general case
-			for(int i=rl; i<ru; i++)
-				curk[i-rl] = sblock.pos(i);
-		}
-	}
-
-	private static void sumScalarResults(List<Future<Double>> tasks, MatrixBlock ret) 
-		throws InterruptedException, ExecutionException
-	{
-		//aggregate partial results and check for errors
-		double val = 0;
-		for(Future<Double> task : tasks)
-			val += task.get();
-		ret.quickSetValue(0, 0, val);
-	}
-
-	@SuppressWarnings("unused")
-	private static void sumDenseResults( double[][] partret, double[] ret )
-	{
-		final int len = ret.length;
-		final int k = partret.length;
-		final int bk = k % 4;
-		final int blocksize = 2 * 1024; //16KB (half of common L1 data)
-		
-		//cache-conscious aggregation to prevent repreated scans/writes of ret
-		for( int bi=0; bi<len; bi+=blocksize ) {
-			int llen = Math.min(len-bi, blocksize);
-			
-			//aggregate next block from all partial results
-			for( int j=0; j<bk; j++ ) //rest (not aligned to 4)
-				vectAdd(partret[j], ret, bi, bi, llen);
-			for( int j=bk; j<k; j+=4 ) //4 partial results at a time
-				vectAdd4(partret[j], partret[j+1], partret[j+2], partret[j+3], ret, bi, bi, llen);
-		}
-		
-	}
-	
-	/////////////////////////////////////////////////////////
-	// Task Implementations for Multi-Threaded Operations  //
-	/////////////////////////////////////////////////////////
-
-	private static class MatrixMultTask implements Callable<Object> 
-	{
-		private final MatrixBlock _m1;
-		private final MatrixBlock _m2;
-		private MatrixBlock _ret = null;
-		private final boolean _tm2; //transposed m2
-		private final boolean _pm2r; //par over m2 rows
-		private final boolean _pm2c; //par over m2 rows
-		private final boolean _m1Perm; //sparse permutation
-		private final boolean _sparse; //sparse output
-		private final int _rl;
-		private final int _ru;
-
-		protected MatrixMultTask( MatrixBlock m1, MatrixBlock m2, MatrixBlock ret,
-			boolean tm2, boolean pm2r, boolean pm2c, boolean m1Perm, boolean sparse, int rl, int ru )
-		{
-			_m1 = m1;
-			_m2 = m2;
-			_tm2 = tm2;
-			_pm2r = pm2r;
-			_pm2c = pm2c;
-			_m1Perm = m1Perm;
-			_sparse = sparse;
-			_rl = rl;
-			_ru = ru;
-			
-			if( pm2r ) { //vector-matrix / matrix-matrix
-				//allocate local result for partial aggregation
-				_ret = new MatrixBlock(ret.rlen, ret.clen, false);
-			}
-			else { //default case
-				_ret = ret;
-			}
-		}
-		
-		@Override
-		public Object call() {
-			//setup target index ranges
-			int rl = _pm2c ? 0 : _rl;
-			int ru = _pm2c ? _m1.rlen : _ru;
-			int cl = _pm2c ? _rl : 0;
-			int cu = _pm2c ? _ru : _ret.clen;
-			
-			//thread-local allocation
-			if( _pm2r )
-				_ret.allocateDenseBlock();
-			
-			//compute block matrix multiplication
-			if( _ret.sparse ) //ultra-sparse
-				matrixMultUltraSparse(_m1, _m2, _ret, _m1Perm, rl, ru);
-			else if(!_m1.sparse && !_m2.sparse)
-				matrixMultDenseDense(_m1, _m2, _ret, _tm2, _pm2r, rl, ru, cl, cu);
-			else if(_m1.sparse && _m2.sparse)
-				matrixMultSparseSparse(_m1, _m2, _ret, _pm2r, _sparse, rl, ru);
-			else if(_m1.sparse)
-				matrixMultSparseDense(_m1, _m2, _ret, _pm2r, rl, ru);
-			else
-				matrixMultDenseSparse(_m1, _m2, _ret, _pm2r, rl, ru);
-			
-			//maintain block nnz (upper bounds inclusive)
-			if( !_pm2r )
-				return _ret.recomputeNonZeros(rl, ru-1, cl, cu-1);
-			else
-				return _ret.getDenseBlockValues();
-		}
-	}
-
-	private static class MatrixMultChainTask implements Callable<double[]> 
-	{
-		private MatrixBlock _m1  = null;
-		private MatrixBlock _m2  = null;
-		private MatrixBlock _m3  = null;
-		private ChainType _ct = null;
-		private int _rl = -1;
-		private int _ru = -1;
-
-		protected MatrixMultChainTask( MatrixBlock mX, MatrixBlock mV, MatrixBlock mW, ChainType ct, int rl, int ru ) {
-			_m1 = mX;
-			_m2 = mV;
-			_m3 = mW;
-			_ct = ct;
-			_rl = rl;
-			_ru = ru;
-		}
-		
-		@Override
-		public double[] call() {
-			//thread-local allocation for partial aggregation
-			MatrixBlock ret = new MatrixBlock(1, _m1.clen, false);
-			ret.allocateDenseBlock();
-			
-			if( _m1.sparse )
-				matrixMultChainSparse(_m1, _m2, _m3, ret, _ct, _rl, _ru);
-			else
-				matrixMultChainDense(_m1, _m2, _m3, ret, _ct, _rl, _ru);
-			
-			//NOTE: we dont do global aggregation from concurrent tasks in order
-			//to prevent synchronization (sequential aggregation led to better 
-			//performance after JIT)
-			return ret.getDenseBlockValues();
-		}
-	}
-
-	private static class MatrixMultTransposeTask implements Callable<Object> 
-	{
-		private final MatrixBlock _m1;
-		private final MatrixBlock _ret;
-		private final boolean _left;
-		private final int _rl;
-		private final int _ru;
-
-		protected MatrixMultTransposeTask( MatrixBlock m1, MatrixBlock ret, boolean left, int rl, int ru )
-		{
-			_m1 = m1;
-			_ret = ret;
-			_left = left;
-			_rl = rl;
-			_ru = ru;
-		}
-		
-		@Override
-		public Object call() {
-			if( _m1.sparse )
-				matrixMultTransposeSelfSparse(_m1, _ret, _left, _rl, _ru);
-			else
-				matrixMultTransposeSelfDense(_m1, _ret, _left, _rl, _ru);
-			return null;
-		}
-	}
-
-	private static class MatrixMultPermuteTask implements Callable<Object> 
-	{
-		private MatrixBlock _pm1  = null;
-		private MatrixBlock _m2 = null;
-		private MatrixBlock _ret1 = null;
-		private MatrixBlock _ret2 = null;
-		private int _rl = -1;
-		private int _ru = -1;
-
-		protected MatrixMultPermuteTask( MatrixBlock pm1, MatrixBlock m2, MatrixBlock ret1, MatrixBlock ret2, int rl, int ru)
-		{
-			_pm1 = pm1;
-			_m2 = m2;
-			_ret1 = ret1;
-			_ret2 = ret2;
-			_rl = rl;
-			_ru = ru;
-		}
-		
-		@Override
-		public Object call() {
-			if( _m2.sparse )
-				matrixMultPermuteSparse(_pm1, _m2, _ret1, _ret2, _rl, _ru);
-			else if( _ret1.sparse )
-				matrixMultPermuteDenseSparse(_pm1, _m2, _ret1, _ret2, _rl, _ru);
-			else 
-				matrixMultPermuteDense(_pm1, _m2, _ret1, _ret2, _rl, _ru);
-
-			return null;
-		}
-	}
-
-	private static class MatrixMultWSLossTask implements Callable<Double>
-	{
-		private MatrixBlock _mX = null;
-		private MatrixBlock _mU = null;
-		private MatrixBlock _mV = null;
-		private MatrixBlock _mW = null;
-		private MatrixBlock _ret = null;
-		private WeightsType _wt = null;
-		private int _rl = -1;
-		private int _ru = -1;
-
-		protected MatrixMultWSLossTask(MatrixBlock mX, MatrixBlock mU, MatrixBlock mV, MatrixBlock mW, WeightsType wt, int rl, int ru) {
-			_mX = mX;
-			_mU = mU;
-			_mV = mV;
-			_mW = mW;
-			_wt = wt;
-			_rl = rl;
-			_ru = ru;
-			
-			//allocate local result for partial aggregation
-			_ret = new MatrixBlock(1, 1, false);
-			_ret.allocateDenseBlock();
-		}
-		
-		@Override
-		public Double call() {
-			if( !_mX.sparse && !_mU.sparse && !_mV.sparse && (_mW==null || !_mW.sparse) 
-				&& !_mX.isEmptyBlock() && !_mU.isEmptyBlock() && !_mV.isEmptyBlock() 
-				&& (_mW==null || !_mW.isEmptyBlock()))
-				matrixMultWSLossDense(_mX, _mU, _mV, _mW, _ret, _wt, _rl, _ru);
-			else if( _mX.sparse && !_mU.sparse && !_mV.sparse && (_mW==null || _mW.sparse)
-				    && !_mX.isEmptyBlock() && !_mU.isEmptyBlock() && !_mV.isEmptyBlock() 
-				    && (_mW==null || !_mW.isEmptyBlock()))
-				matrixMultWSLossSparseDense(_mX, _mU, _mV, _mW, _ret, _wt, _rl, _ru);
-			else
-				matrixMultWSLossGeneric(_mX, _mU, _mV, _mW, _ret, _wt, _rl, _ru);
-
-			return _ret.quickGetValue(0, 0);
-		}
-	}
-
-	private static class MatrixMultWSigmoidTask implements Callable<Long> 
-	{
-		private MatrixBlock _mW = null;
-		private MatrixBlock _mU = null;
-		private MatrixBlock _mV = null;
-		private MatrixBlock _ret = null;
-		private WSigmoidType _wt = null;
-		private int _rl = -1;
-		private int _ru = -1;
-		
-		protected MatrixMultWSigmoidTask(MatrixBlock mW, MatrixBlock mU, MatrixBlock mV, MatrixBlock ret, WSigmoidType wt, int rl, int ru) {
-			_mW = mW;
-			_mU = mU;
-			_mV = mV;
-			_ret = ret;
-			_wt = wt;
-			_rl = rl;
-			_ru = ru;
-		}
-		
-		@Override
-		public Long call() {
-			//core weighted square sum mm computation
-			if( !_mW.sparse && !_mU.sparse && !_mV.sparse && !_mU.isEmptyBlock() && !_mV.isEmptyBlock() )
-				matrixMultWSigmoidDense(_mW, _mU, _mV, _ret, _wt, _rl, _ru);
-			else if( _mW.sparse && !_mU.sparse && !_mV.sparse && !_mU.isEmptyBlock() && !_mV.isEmptyBlock())
-				matrixMultWSigmoidSparseDense(_mW, _mU, _mV, _ret, _wt, _rl, _ru);
-			else
-				matrixMultWSigmoidGeneric(_mW, _mU, _mV, _ret, _wt, _rl, _ru);
-			
-			//maintain block nnz (upper bounds inclusive)
-			return _ret.recomputeNonZeros(_rl, _ru-1, 0, _ret.getNumColumns()-1);
-		}
-	}
-
-	private static class MatrixMultWDivTask implements Callable<Long> 
-	{
-		private MatrixBlock _mW = null;
-		private MatrixBlock _mU = null;
-		private MatrixBlock _mV = null;
-		private MatrixBlock _mX = null;
-		private MatrixBlock _ret = null;
-		private WDivMMType _wt = null;
-		private int _rl = -1;
-		private int _ru = -1;
-		private int _cl = -1;
-		private int _cu = -1;
-		
-		protected MatrixMultWDivTask(MatrixBlock mW, MatrixBlock mU, MatrixBlock mV, MatrixBlock mX, MatrixBlock ret, WDivMMType wt, int rl, int ru, int cl, int cu) {
-			_mW = mW;
-			_mU = mU;
-			_mV = mV;
-			_mX = mX;
-			_wt = wt;
-			_rl = rl;
-			_ru = ru;
-			_cl = cl;
-			_cu = cu;
-			_ret = ret;	
-		}
-		
-		@Override
-		public Long call() {
-			//core weighted div mm computation
-			boolean scalarX = _wt.hasScalar();
-			if( !_mW.sparse && !_mU.sparse && !_mV.sparse && (_mX==null || !_mX.sparse || scalarX) && !_mU.isEmptyBlock() && !_mV.isEmptyBlock() )
-				matrixMultWDivMMDense(_mW, _mU, _mV, _mX, _ret, _wt, _rl, _ru, _cl, _cu);
-			else if( _mW.sparse && !_mU.sparse && !_mV.sparse && (_mX==null || _mX.sparse || scalarX) && !_mU.isEmptyBlock() && !_mV.isEmptyBlock())
-				matrixMultWDivMMSparseDense(_mW, _mU, _mV, _mX, _ret, _wt, _rl, _ru, _cl, _cu);
-			else
-				matrixMultWDivMMGeneric(_mW, _mU, _mV, _mX, _ret, _wt, _rl, _ru, _cl, _cu);
-		
-			//maintain partial nnz for right (upper bounds inclusive)
-			int rl = _wt.isLeft() ? _cl : _rl;
-			int ru = _wt.isLeft() ? _cu : _ru;
-			return _ret.recomputeNonZeros(rl, ru-1, 0, _ret.getNumColumns()-1);
-		}
-	}
-	
-	private static class MatrixMultWCeTask implements Callable<Double>
-	{
-		private MatrixBlock _mW = null;
-		private MatrixBlock _mU = null;
-		private MatrixBlock _mV = null;
-		private double _eps = 0.0;
-		private MatrixBlock _ret = null;
-		private WCeMMType _wt = null;
-		private int _rl = -1;
-		private int _ru = -1;
-
-		protected MatrixMultWCeTask(MatrixBlock mW, MatrixBlock mU, MatrixBlock mV, double eps, WCeMMType wt, int rl, int ru) {
-			_mW = mW;
-			_mU = mU;
-			_mV = mV;
-			_eps = eps;
-			_wt = wt;
-			_rl = rl;
-			_ru = ru;
-			
-			//allocate local result for partial aggregation
-			_ret = new MatrixBlock(1, 1, false);
-			_ret.allocateDenseBlock();
-		}
-		
-		@Override
-		public Double call() {
-			//core weighted cross entropy mm computation
-			if( !_mW.sparse && !_mU.sparse && !_mV.sparse && !_mU.isEmptyBlock() && !_mV.isEmptyBlock() )
-				matrixMultWCeMMDense(_mW, _mU, _mV, _eps, _ret, _wt, _rl, _ru);
-			else if( _mW.sparse && !_mU.sparse && !_mV.sparse && !_mU.isEmptyBlock() && !_mV.isEmptyBlock())
-				matrixMultWCeMMSparseDense(_mW, _mU, _mV, _eps, _ret, _wt, _rl, _ru);
-			else
-				matrixMultWCeMMGeneric(_mW, _mU, _mV, _eps, _ret, _wt, _rl, _ru);
-			
-			
-			return _ret.quickGetValue(0, 0);
-		}
-	}
-
-	private static class MatrixMultWuTask implements Callable<Long> 
-	{
-		private MatrixBlock _mW = null;
-		private MatrixBlock _mU = null;
-		private MatrixBlock _mV = null;
-		private MatrixBlock _ret = null;
-		private WUMMType _wt = null;
-		private ValueFunction _fn = null;
-		private int _rl = -1;
-		private int _ru = -1;
-		
-		protected MatrixMultWuTask(MatrixBlock mW, MatrixBlock mU, MatrixBlock mV, MatrixBlock ret, WUMMType wt, ValueFunction fn, int rl, int ru) {
-			_mW = mW;
-			_mU = mU;
-			_mV = mV;
-			_ret = ret;
-			_wt = wt;
-			_fn = fn;
-			_rl = rl;
-			_ru = ru;
-		}
-		
-		@Override
-		public Long call() {
-			//core weighted square sum mm computation
-			if( !_mW.sparse && !_mU.sparse && !_mV.sparse && !_mU.isEmptyBlock() && !_mV.isEmptyBlock() )
-				matrixMultWuMMDense(_mW, _mU, _mV, _ret, _wt, _fn, _rl, _ru);
-			else if( _mW.sparse && !_mU.sparse && !_mV.sparse && !_mU.isEmptyBlock() && !_mV.isEmptyBlock())
-				matrixMultWuMMSparseDense(_mW, _mU, _mV, _ret, _wt, _fn, _rl, _ru);
-			else
-				matrixMultWuMMGeneric(_mW, _mU, _mV, _ret, _wt, _fn, _rl, _ru);
-			
-			//maintain block nnz (upper bounds inclusive)
-			return _ret.recomputeNonZeros(_rl, _ru-1, 0, _ret.getNumColumns()-1);
-		}
-	}
-}
diff --git a/scripts/staging/SIMD-double-vectors/README.md b/scripts/staging/SIMD-double-vectors/README.md
deleted file mode 100644
index 55812002d0c..00000000000
--- a/scripts/staging/SIMD-double-vectors/README.md
+++ /dev/null
@@ -1,45 +0,0 @@
-
-<!--
-{% comment %}
-Licensed to the Apache Software Foundation (ASF) under one or more
-contributor license agreements.  See the NOTICE file distributed with
-this work for additional information regarding copyright ownership.
-The ASF licenses this file to you under the Apache License, Version 2.0
-(the "License"); you may not use this file except in compliance with
-the License.  You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-{% end comment %}
--->
-
-# SIMD DoubleVectors for matrix multiplication
-
-`DoubleVector` is still in incubator stage, but promises performance improvements for many SystemDS components.
-This patch explored potential speedup for matrix multiplication of two dense matrices. Additionally, dot product
-is also implemented with `DoubleVector` for the case where common dimension is `1`.
-
-Initial experiments showed varying results, usually the vectorized implementation performs somewhere between
-`MKL` and our reference. There are also cases where we are slower than the reference, or faster than `MKL`.
-For detailed discussion (and plots) see PR #1643.
-
-## Further Work
-
-This patch focused only on dense matrix multiplication, increasing sparsity would complicate things.
-The sparsity aware copying (see `LibMatrixMult.java:1170`) and general loop structure is kept as it is, as a lot of 
-experimentation went into a very efficient implementation. Note that the usage of `DoubleVector` might change
-a lot of things about this and revisiting this (and using SIMD for sparsity aware copying) will be a necessary step.
-
-## Changes
-
-Due to the dependency of at least JDK17, there are changes to `pom.xml`, run script `systemds` and, of course, `LibMatrixMult.java`.
-
-## Note
-
-The pom file repeatedly gets flagged for old version of various libraries, therefore we renamed the file to avoid this inconvenience.
-and we introduced a string in the beginning of the file making it not compile as a pom file.
diff --git a/scripts/staging/SIMD-double-vectors/pom.xml.tmp b/scripts/staging/SIMD-double-vectors/pom.xml.tmp
deleted file mode 100644
index fbb940668b7..00000000000
--- a/scripts/staging/SIMD-double-vectors/pom.xml.tmp
+++ /dev/null
@@ -1,1336 +0,0 @@
-THIS IS NO LONGER A POM.XML FILE, GITHUB PLEASE DO NOT GIVE US UPDATE REMINDERS OF THIS FILE.
-
-<?xml version="1.0" encoding="UTF-8"?>
-<!--
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
--->
-<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
-	<modelVersion>4.0.0</modelVersion>
-	<parent>
-		<groupId>org.apache</groupId>
-		<artifactId>apache</artifactId>
-		<version>24</version>
-	</parent>
-	<groupId>org.apache.systemds</groupId>
-	<version>3.1.0-SNAPSHOT</version>
-	<artifactId>systemds</artifactId>
-	<packaging>jar</packaging>
-	<name>Apache SystemDS</name>
-	<url>https://github.com/apache/systemds</url>
-	<description>An open source ML system for the end-to-end data science lifecycle</description>
-	<licenses>
-		<license>
-			<name>Apache 2.0 License</name>
-			<url>http://www.apache.org/licenses/LICENSE-2.0.html</url>
-		</license>
-	</licenses>
-
-	<properties>
-		<hadoop.version>3.3.3</hadoop.version>
-		<antlr.version>4.8</antlr.version>
-		<protobuf.version>3.20.3</protobuf.version>
-		<spark.version>3.2.0</spark.version>
-		<scala.version>2.12.0</scala.version>
-		<scala.binary.version>2.12</scala.binary.version>
-		<maven.build.timestamp.format>yyyy-MM-dd HH:mm:ss z</maven.build.timestamp.format>
-		<project.build.outputTimestamp>1</project.build.outputTimestamp>
-		<enableGPU>false</enableGPU>
-		<jcuda.scope>provided</jcuda.scope>
-		<jcuda.version>10.2.0</jcuda.version>
-		<slf4j.version>1.7.36</slf4j.version>
-		<log4j.version>2.17.2</log4j.version>
-		<!-- Set java compile level via argument, ex: 1.8 1.9 10 11-->
-		<java.level>17</java.level>
-		<!-->Testing settings<!-->
-		<maven.test.skip>true</maven.test.skip>
-		<test-parallel>classes</test-parallel>
-		<test-threadCount>2</test-threadCount>
-		<test-forkCount>1C</test-forkCount>
-		<rerun.failing.tests.count>2</rerun.failing.tests.count>
-		<jacoco.skip>true</jacoco.skip>
-		<jacoco.include>**</jacoco.include>
-		<automatedtestbase.outputbuffering>false</automatedtestbase.outputbuffering>
-		<argLine>-Xms3000m -Xmx3000m -Xmn300m</argLine>
-		<enableStats>false</enableStats>
-	</properties>
-
-	<repositories>
-		<repository>
-			<id>central</id>
-			<url>https://repo1.maven.org/maven2</url>
-			<releases>
-				<enabled>true</enabled>
-			</releases>
-		</repository>
-	</repositories>
-
-	<scm>
-		<developerConnection>scm:git:https://github.com/apache/systemds.git</developerConnection>
-		<tag>HEAD</tag>
-	</scm>
-
-	<build>
-		<!-- Adds scripts to main jar, in-memory jar, sources jar, and standalone jar -->
-		<resources>
-			<resource>
-				<directory>scripts</directory>
-				<excludes>
-					<exclude>algorithms/obsolete/*</exclude>
-					<exclude>datagen/obsolete/*</exclude>
-					<exclude>perftest/**/*</exclude>
-					<exclude>perftest</exclude>
-					<exclude>perftestDeprecated/*</exclude>
-					<exclude>perftestDeprecated</exclude>
-					<exclude>staging/**/*</exclude>
-					<exclude>nn/test/compare_backends/*</exclude>
-					<exclude>nn/test/compare_backends/*</exclude>
-				</excludes>
-				<targetPath>scripts</targetPath>
-			</resource>
-			<resource>
-				<directory>src/main/cuda/kernels</directory>
-				<includes>
-					<include>SystemDS.ptx</include>
-					<include>reduction.ptx</include>
-				</includes>
-				<targetPath>cuda/kernels</targetPath>
-			</resource>
-			<resource>
-				<directory>src/main/cpp/lib</directory>
-				<targetPath>lib</targetPath>
-			</resource>
-			<resource>
-				<directory>src/main/cuda/spoof</directory>
-				<targetPath>cuda/spoof</targetPath>
-			</resource>
-			<resource>
-				<directory>src/main/cuda/headers</directory>
-				<includes>
-					<include>agg_ops.cuh</include>
-					<include>operators.cuh</include>
-					<include>reduction.cuh</include>
-					<include>spoof_utils.cuh</include>
-					<include>TempStorage.cuh</include>
-					<include>utils.cuh</include>
-					<include>vector_write.cuh</include>
-					<include>vector_add.cuh</include>
-					<include>Matrix.h</include>
-				</includes>
-				<targetPath>cuda/headers</targetPath>
-			</resource>
-			<resource>
-				<directory>src/main/java/org/apache/sysds/hops/codegen/cplan/java</directory>
-				<includes>
-					<include>Cellwise.java.template</include>
-					<include>Rowwise.java.template</include>
-				</includes>
-				<targetPath>java/spoof</targetPath>
-			</resource>
-		</resources>
-
-		<plugins>
-			<plugin>
-				<groupId>org.apache.maven.plugins</groupId>
-				<artifactId>maven-dependency-plugin</artifactId>
-				<executions>
-					<execution>
-						<id>unpack</id>
-						<phase>package</phase>
-						<goals>
-							<goal>unpack</goal>
-						</goals>
-						<configuration>
-							<artifactItems>
-								<artifactItem>
-									<groupId>org.apache.hadoop</groupId>
-									<artifactId>hadoop-test</artifactId>
-									<version>1.2.1</version>
-									<type>jar</type>
-									<overWrite>true</overWrite>
-									<outputDirectory>${project.build.directory}/hadoop-test</outputDirectory>
-									<includes>**/*</includes>
-								</artifactItem>
-							</artifactItems>
-							<overWriteReleases>false</overWriteReleases>
-							<overWriteSnapshots>true</overWriteSnapshots>
-						</configuration>
-					</execution>
-					<execution>
-						<phase>compile</phase>
-						<goals>
-							<goal>copy-dependencies</goal>
-						</goals>
-						<configuration>
-							<silent>true</silent>
-							<outputDirectory>${project.build.directory}/lib</outputDirectory>
-						</configuration>
-					</execution>
-				</executions>
-			</plugin>
-
-			<!-- PLEASE DO NOT REMOVE! NEEDED to "PACKAGE" ANTLR RUNTIME INTO SYSTEMDS.JAR -->
-			<plugin>
-				<groupId>org.apache.maven.plugins</groupId>
-				<artifactId>maven-shade-plugin</artifactId>
-				<version>2.3</version>
-				<executions>
-					<execution>
-						<phase>package</phase>
-						<goals>
-							<goal>shade</goal>
-						</goals>
-						<configuration>
-							<artifactSet>
-								<includes>
-									<include>org.apache.wink:wink-json4j:*</include>
-									<include>org.antlr:antlr4-runtime:*</include>
-								</includes>
-							</artifactSet>
-							<transformers>
-								<transformer implementation="org.apache.maven.plugins.shade.resource.ManifestResourceTransformer">
-									<mainClass>org.apache.sysds.api.DMLScript</mainClass>
-								</transformer>
-								<transformer implementation="org.apache.maven.plugins.shade.resource.ApacheLicenseResourceTransformer"></transformer>
-								<transformer implementation="org.apache.maven.plugins.shade.resource.IncludeResourceTransformer">
-									<resource>META-INF/LICENSE</resource>
-									<file>src/assembly/bin/LICENSE</file>
-								</transformer>
-								<transformer implementation="org.apache.maven.plugins.shade.resource.IncludeResourceTransformer">
-									<resource>META-INF/NOTICE</resource>
-									<file>NOTICE</file>
-								</transformer>
-							</transformers>
-							<createDependencyReducedPom>false</createDependencyReducedPom>
-						</configuration>
-					</execution>
-				</executions>
-				<configuration>
-					<!-- Include signature files so that recent versions of Java will run
-						the resulting jar without complaining about "Invalid signature file digest
-						for Manifest main attributes".
-						Furthermore, the excluded notice and license files will be explicitly
-						added by the resource transformers above -->
-					<filters>
-						<filter>
-							<artifact>*:*</artifact>
-							<excludes>
-								<exclude>META-INF/*.SF</exclude>
-								<exclude>META-INF/*.DSA</exclude>
-								<exclude>META-INF/*.RSA</exclude>
-								<exclude>META-INF/LICENSE</exclude>
-								<exclude>META-INF/NOTICE</exclude>
-							</excludes>
-						</filter>
-					</filters>
-				</configuration>
-			</plugin>
-
-			<plugin>
-				<groupId>org.apache.maven.plugins</groupId>
-				<artifactId>maven-compiler-plugin</artifactId>
-				<version>3.8.1</version> <!--$NO-MVN-MAN-VER$-->
-				<configuration>
-					<source>${java.level}</source>
-					<target>${java.level}</target>
-					<compilerArgs>
-						<arg>--add-modules=jdk.incubator.vector</arg>
-					</compilerArgs>
-				</configuration>
-			</plugin>
-
-			<plugin>
-				<groupId>org.apache.maven.plugins</groupId>
-				<artifactId>maven-resources-plugin</artifactId>
-				<executions>
-					<execution>
-						<id>copy-resources</id>
-						<phase>compile</phase>
-						<goals>
-							<goal>copy-resources</goal>
-						</goals>
-						<configuration>
-							<resources>
-								<resource>
-									<directory>${basedir}/src/test/config/hadoop_bin_windows/bin</directory>
-									<filtering>false</filtering>
-									<includes>
-										<include>*.*</include>
-									</includes>
-								</resource>
-							</resources>
-							<outputDirectory>${basedir}/target/lib/hadoop/bin</outputDirectory>
-						</configuration>
-					</execution>
-				</executions>
-			</plugin>
-
-			<plugin>
-				<groupId>org.antlr</groupId>
-				<artifactId>antlr4-maven-plugin</artifactId>
-				<configuration>
-					<sourceDirectory>${basedir}/src/main/java</sourceDirectory>
-					<outputDirectory>${basedir}/src/main/java</outputDirectory>
-				</configuration>
-				<version>${antlr.version}</version>
-				<executions>
-					<execution>
-						<id>antlr</id>
-						<goals>
-							<goal>antlr4</goal>
-						</goals>
-					</execution>
-				</executions>
-			</plugin>
-
-			<plugin>
-				<!-- unit tests -->
-				<groupId>org.apache.maven.plugins</groupId>
-				<artifactId>maven-surefire-plugin</artifactId>
-				<version>3.0.0-M5</version>
-				<configuration>
-					<skipTests>${maven.test.skip}</skipTests>
-					<parallel>${test-parallel}</parallel>
-					<threadCount>${test-threadCount}</threadCount>
-					<!-- 1C means the number of threads times 1 possible maximum forks for testing-->
-					<forkCount>${test-forkCount}</forkCount>
-					<reuseForks>false</reuseForks>
-					<reportFormat>brief</reportFormat>
-					<trimStackTrace>true</trimStackTrace>
-					<rerunFailingTestsCount>${rerun.failing.tests.count}</rerunFailingTestsCount>
-					<argLine>--add-modules=jdk.incubator.vector</argLine>
-				</configuration>
-			</plugin>
-
-			<plugin>
-				<artifactId>maven-clean-plugin</artifactId>
-				<executions>
-					<execution>
-						<id>clean-original-jar</id>
-						<phase>package</phase>
-						<goals>
-							<goal>clean</goal>
-						</goals>
-						<configuration>
-							<excludeDefaultDirectories>true</excludeDefaultDirectories>
-							<filesets>
-								<fileset>
-									<directory>${project.build.directory}</directory>
-									<includes>
-										<include>original-*.jar</include>
-									</includes>
-								</fileset>
-							</filesets>
-						</configuration>
-					</execution>
-					<execution>
-						<!-- remove antlr tokens files during initialize phase so antlr4 -->
-						<!-- plugin can regenerate them during generate-sources phase -->
-						<!-- <id>remove-antlr-tokens-files</id>
-						<phase>initialize</phase>
-						<goals>
-							<goal>clean</goal>
-						</goals>
-						<configuration>
-							<filesets>
-								<fileset>
-									<directory>src/main/java</directory>
-									<includes>
-										<include>*.tokens</include>
-									</includes>
-								</fileset>
-							</filesets>
-						</configuration> -->
-					</execution>
-				</executions>
-			</plugin>
-
-			<plugin>
-				<groupId>org.apache.maven.plugins</groupId>
-				<artifactId>maven-antrun-plugin</artifactId>
-				<executions>
-					<execution>
-						<id>copy</id>
-						<phase>package</phase>
-						<configuration>
-							<target name="copy and rename JAR">
-								<copy file="${project.build.directory}/${project.artifactId}-${project.version}.jar" tofile="${project.build.directory}/SystemDS.jar" />
-							</target>
-						</configuration>
-						<goals>
-							<goal>run</goal>
-						</goals>
-					</execution>
-				</executions>
-			</plugin>
-
-			<plugin>
-				<groupId>org.jacoco</groupId>
-				<artifactId>jacoco-maven-plugin</artifactId>
-				<version>0.8.7</version>
-				<configuration>
-					<includes>
-						<include>${jacoco.include}</include>
-					</includes>
-				</configuration>
-				<executions>
-					<execution>
-						<id>default-prepare-agent</id>
-						<goals>
-							<goal>prepare-agent</goal>
-						</goals>
-					</execution>
-					<execution>
-						<id>generate-code-coverage-report</id>
-						<phase>test</phase>
-						<goals>
-							<goal>report</goal>
-						</goals>
-					</execution>
-				</executions>
-			</plugin>
-
-			<plugin>
-				<groupId>org.eluder.coveralls</groupId>
-				<artifactId>coveralls-maven-plugin</artifactId>
-				<version>4.3.0</version>
-			</plugin>
-
-			<plugin>
-				<groupId>org.apache.maven.plugins</groupId>
-				<artifactId>maven-javadoc-plugin</artifactId>
-				<version>3.2.0</version>
-				<configuration>
-					<quiet>true</quiet>
-					<!-- Skip java docs creation if not explicitly asked, to create use -P distribution-->
-					<skip>true</skip>
-				</configuration>
-			</plugin>
-
-			<plugin>
-				<groupId>org.codehaus.mojo</groupId>
-				<artifactId>properties-maven-plugin</artifactId>
-				<version>1.0.0</version>
-				<executions>
-					<execution>
-						<phase>generate-resources</phase>
-						<goals>
-							<goal>write-project-properties</goal>
-						</goals>
-						<configuration>
-							<outputFile>${project.build.testOutputDirectory}/my.properties</outputFile>
-						</configuration>
-					</execution>
-				</executions>
-			</plugin>
-		</plugins>
-	</build>
-
-	<profiles>
-		<profile>
-			<id>windows-x86_64</id>
-			<activation>
-				<os>
-					<family>windows</family>
-					<arch>amd64</arch>
-				</os>
-			</activation>
-			<properties>
-				<jcuda.os>windows</jcuda.os>
-				<jcuda.arch>x86_64</jcuda.arch>
-			</properties>
-		</profile>
-		<profile>
-			<id>linux-x86_64</id>
-			<activation>
-				<os>
-					<family>unix</family>
-					<arch>amd64</arch>
-				</os>
-			</activation>
-			<properties>
-				<jcuda.os>linux</jcuda.os>
-				<jcuda.arch>x86_64</jcuda.arch>
-			</properties>
-		</profile>
-		<profile>
-			<id>apple-x86_64</id>
-			<activation>
-				<os>
-					<family>mac</family>
-					<arch>x86_64</arch>
-				</os>
-			</activation>
-			<properties>
-				<jcuda.os>apple</jcuda.os>
-				<jcuda.arch>x86_64</jcuda.arch>
-			</properties>
-		</profile>
-		<profile>
-			<id>linux-ppc_64</id>
-			<activation>
-				<os>
-					<family>unix</family>
-					<arch>ppc64le</arch>
-				</os>
-			</activation>
-			<properties>
-				<jcuda.os>linux</jcuda.os>
-				<jcuda.arch>ppc_64</jcuda.arch>
-			</properties>
-		</profile>
-
-		<profile>
-			<id>eclipse-only</id>
-			<activation>
-				<property>
-					<name>m2e.version</name>
-				</property>
-			</activation>
-			<build>
-				<pluginManagement>
-					<plugins>
-						<!-- Prevent m2e warnings in Eclipse. -->
-						<plugin>
-							<groupId>org.eclipse.m2e</groupId>
-							<artifactId>lifecycle-mapping</artifactId>
-							<version>1.0.0</version>
-							<configuration>
-								<lifecycleMappingMetadata>
-									<pluginExecutions>
-										<pluginExecution>
-											<pluginExecutionFilter>
-												<groupId>org.apache.maven.plugins</groupId>
-												<artifactId>maven-remote-resources-plugin</artifactId>
-												<versionRange>[1.4,)</versionRange>
-												<goals>
-													<goal>process</goal>
-												</goals>
-											</pluginExecutionFilter>
-											<action>
-												<ignore></ignore>
-											</action>
-										</pluginExecution>
-										<pluginExecution>
-											<pluginExecutionFilter>
-												<groupId>org.apache.maven.plugins</groupId>
-												<artifactId>maven-clean-plugin</artifactId>
-												<versionRange>[3.0.0,)</versionRange>
-												<goals>
-													<goal>clean</goal>
-												</goals>
-											</pluginExecutionFilter>
-											<action>
-												<ignore></ignore>
-											</action>
-										</pluginExecution>
-										<pluginExecution>
-											<pluginExecutionFilter>
-												<groupId>org.apache.maven.plugins</groupId>
-												<artifactId>maven-dependency-plugin</artifactId>
-												<versionRange>[2.10,)</versionRange>
-												<goals>
-													<goal>copy-dependencies</goal>
-												</goals>
-											</pluginExecutionFilter>
-											<action>
-												<ignore></ignore>
-											</action>
-										</pluginExecution>
-									</pluginExecutions>
-								</lifecycleMappingMetadata>
-							</configuration>
-						</plugin>
-					</plugins>
-				</pluginManagement>
-			</build>
-		</profile>
-
-		<profile>
-			<id>rat</id>
-			<build>
-				<defaultGoal>clean org.apache.rat:apache-rat-plugin:check</defaultGoal>
-				<plugins>
-					<plugin>
-						<groupId>org.apache.rat</groupId>
-						<artifactId>apache-rat-plugin</artifactId>
-						<version>0.12</version>
-						<executions>
-							<execution>
-								<phase>package</phase>
-								<goals>
-									<goal>check</goal>
-								</goals>
-							</execution>
-						</executions>
-						<configuration>
-							<excludes>
-								<exclude>scripts/perftest/results/**</exclude>
-								<exclude>scripts/perftest/temp/**</exclude>
-								<exclude>.gitignore</exclude>
-								<exclude>src/main/python/.gitignore</exclude>
-								<exclude>.gitmodules</exclude>
-								<exclude>.repository/</exclude>
-								<exclude>.idea/</exclude>
-								<exclude>.git</exclude>
-								<exclude>.settings</exclude>
-								<exclude>.classpath</exclude>
-								<exclude>.project</exclude>
-								<exclude>CITATION</exclude>
-								<exclude>src/main/python/docs/build/**/*</exclude>
-								<exclude>src/main/python/docs/source/_build/**</exclude>
-								<exclude>src/main/python/generator/resources/**</exclude>
-								<exclude>docs/api/**/*</exclude>
-								<exclude>docs/_site/**/*</exclude>
-								<exclude>docs/site/run_issues.md</exclude>
-								<exclude>docs/.jekyll-cache/**/*</exclude>
-								<exclude>docs/css/bootstrap.min.css</exclude>
-								<exclude>docs/css/pygments-default.css</exclude>
-								<exclude>docs/js/vendor/**/*</exclude>
-								<exclude>**/*.lock</exclude>
-								<exclude>**/*.csv</exclude>
-								<exclude>**/*.ijv</exclude>
-								<exclude>**/*.json</exclude>
-								<exclude>**/*.libsvm</exclude>
-								<exclude>**/*.mtx</exclude>
-								<exclude>**/*.mtd</exclude>
-								<exclude>**/*.out</exclude>
-								<exclude>**/__pycache__/**</exclude>
-								<exclude>**/part-*</exclude>
-								<exclude>**/*.keep</exclude>
-								<exclude>**/target/**</exclude>
-								<exclude>**/README.md</exclude>
-								<exclude>**/*.svg</exclude>
-								<!-- Jupyter Notebooks -->
-								<exclude>**/*.ipynb</exclude>
-								<!-- Generated antlr files -->
-								<exclude>src/main/java/*.tokens</exclude>
-								<exclude>**/*.interp</exclude>
-								<!-- Generated antlr files -->
-								<exclude>src/main/java/org/apache/sysds/protobuf/*.java</exclude>
-								<!-- Compiled ptx file from nvcc -->
-								<exclude>src/main/cuda/kernels/SystemDS.ptx</exclude>
-								<exclude>src/main/cuda/kernels/reduction.ptx</exclude>
-								<!-- Test Validation files -->
-								<exclude>src/test/scripts/functions/jmlc/**/*.impute</exclude>
-								<exclude>src/test/scripts/functions/jmlc/**/*.map</exclude>
-								<exclude>src/test/scripts/functions/jmlc/**/*.mode</exclude>
-								<exclude>src/test/scripts/functions/jmlc/**/*.ndistinct</exclude>
-								<exclude>src/test/scripts/functions/jmlc/**/*.node</exclude>
-								<exclude>src/test/scripts/functions/jmlc/tfmtd_example/Bin/saleprice.bin</exclude>
-								<exclude>src/test/scripts/functions/jmlc/tfmtd_example/Bin/sqft.bin</exclude>
-								<exclude>src/test/scripts/functions/jmlc/tfmtd_example/column.names</exclude>
-								<exclude>src/test/scripts/functions/jmlc/tfmtd_example/dummycoded.column.names</exclude>
-								<exclude>src/test/scripts/functions/jmlc/tfmtd_example2/column.names</exclude>
-								<exclude>src/test/scripts/functions/jmlc/tfmtd_frame_example/tfmtd_frame</exclude>
-								<!-- csv test input not captured by *.csv -->
-								<exclude>src/test/scripts/functions/io/csv/in/*/*</exclude>
-								<!-- Python bindings lineage test result comparison -->
-								<exclude>src/main/python/tests/lt*.txt</exclude>
-								<!-- Perftest requirement file -->
-								<exclude>scripts/perftest/python/requirements.txt</exclude>
-								<!-- external sources -->
-								<exclude>src/main/cuda/ext/**</exclude>
-								<exclude>src/main/cuda/.idea/</exclude>
-							</excludes>
-						</configuration>
-					</plugin>
-				</plugins>
-			</build>
-		</profile>
-
-		<profile>
-			<id>proton</id>
-			<build>
-				<plugins>
-					<plugin>
-						<!-- generate .java files from .proto -->
-						<groupId>com.github.os72</groupId>
-						<artifactId>protoc-jar-maven-plugin</artifactId>
-						<version>3.11.4</version>
-						<executions>
-							<execution>
-								<phase>generate-sources</phase>
-								<goals>
-									<goal>run</goal>
-								</goals>
-								<configuration>
-									<!-- protoc binaries to be picked up from
-									  https://repo.maven.apache.org/maven2/com/google/protobuf/protoc/ -->
-									<protocVersion>${protobuf.version}</protocVersion>
-									<inputDirectories>
-										<include>src/main/resources/protobuf</include>
-									</inputDirectories>
-									<outputDirectory>src/main/java</outputDirectory>
-								</configuration>
-							</execution>
-						</executions>
-					</plugin>
-				</plugins>
-			</build>
-		</profile>
-
-		<profile>
-			<!-- Profile to create binary distributions. Execute with `mvn clean package
-				-P distribution` -->
-			<id>distribution</id>
-			<build>
-				<plugins>
-					<plugin>
-						<artifactId>maven-assembly-plugin</artifactId>
-						<configuration>
-							<tarLongFileMode>posix</tarLongFileMode>
-						</configuration>
-						<executions>
-							<execution>
-								<id>create-source-distribution</id>
-								<phase>package</phase>
-								<goals>
-									<goal>single</goal>
-								</goals>
-								<configuration>
-									<descriptors>
-										<descriptor>src/assembly/source.xml</descriptor>
-									</descriptors>
-								</configuration>
-							</execution>
-							<execution>
-								<id>create-extra-jar</id>
-								<phase>package</phase>
-								<goals>
-									<goal>single</goal>
-								</goals>
-								<configuration>
-									<descriptors>
-										<descriptor>src/assembly/extra.xml</descriptor>
-									</descriptors>
-									<archive>
-										<manifestEntries>
-											<Build-Time>${maven.build.timestamp}</Build-Time>
-											<Artifact-Id>${project.artifactId}-extra</Artifact-Id>
-											<Version>${project.version}</Version>
-										</manifestEntries>
-									</archive>
-								</configuration>
-							</execution>
-							<execution>
-								<id>create-binary-distribution</id>
-								<phase>package</phase>
-								<goals>
-									<goal>single</goal>
-								</goals>
-								<configuration>
-									<descriptors>
-										<descriptor>src/assembly/bin.xml</descriptor>
-									</descriptors>
-								</configuration>
-							</execution>
-						</executions>
-					</plugin>
-
-					<plugin>
-						<artifactId>maven-gpg-plugin</artifactId>
-						<version>3.0.1</version>
-						<executions>
-							<execution>
-								<phase>verify</phase>
-								<goals>
-									<goal>sign</goal>
-								</goals>
-								<configuration>
-									<!-- This is necessary for gpg to not try to use the pinentry programs -->
-									<gpgArguments>
-										<arg>--pinentry-mode</arg>
-										<arg>loopback</arg>
-									</gpgArguments>
-								</configuration>
-							</execution>
-						</executions>
-					</plugin>
-
-					<plugin>
-						<groupId>org.apache.maven.plugins</groupId>
-						<artifactId>maven-remote-resources-plugin</artifactId>
-						<version>1.4</version>
-						<executions>
-							<execution>
-								<goals>
-									<goal>process</goal>
-								</goals>
-								<configuration>
-									<resourceBundles>
-										<!-- Will generate META-INF/DEPENDENCIES META-INF/LICENSE META-INF/NOTICE -->
-										<resourceBundle>org.apache:apache-jar-resource-bundle:1.4</resourceBundle>
-									</resourceBundles>
-								</configuration>
-							</execution>
-						</executions>
-					</plugin>
-
-					<plugin>
-						<groupId>org.apache.maven.plugins</groupId>
-						<artifactId>maven-javadoc-plugin</artifactId>
-						<version>3.2.0</version>
-						<configuration>
-							<!-- https://maven.apache.org/plugins/maven-javadoc-plugin/javadoc-mojo.html -->
-							<excludePackageNames>*.protobuf</excludePackageNames>
-							<notimestamp>true</notimestamp>
-							<failOnWarnings>true</failOnWarnings>
-							<quiet>true</quiet>
-							<skip>false</skip>
-							<show>public</show>
-							<source>${java.level}</source>
-						</configuration>
-						<executions>
-							<execution>
-								<id>attach-javadocs</id>
-								<goals>
-									<goal>jar</goal>
-								</goals>
-							</execution>
-						</executions>
-					</plugin>
-				</plugins>
-			</build>
-		</profile>
-
-		<profile>
-			<id>skip-sign</id>
-			<build>
-				<plugins>
-					<plugin>
-						<groupId>org.apache.maven.plugins</groupId>
-						<artifactId>maven-gpg-plugin</artifactId>
-						<configuration>
-							<skip>true</skip>
-						</configuration>
-					</plugin>
-				</plugins>
-			</build>
-		</profile>
-	</profiles>
-
-	<dependencies>
-		<dependency>
-			<groupId>org.jcuda</groupId>
-			<artifactId>jcuda</artifactId>
-			<version>${jcuda.version}</version>
-			<scope>${jcuda.scope}</scope>
-			<exclusions>
-				<exclusion>
-					<!-- always exclude recursive fetching of native libraries -->
-					<groupId>org.jcuda</groupId>
-					<artifactId>jcuda-natives</artifactId>
-				</exclusion>
-			</exclusions>
-		</dependency>
-
-		<dependency>
-			<groupId>org.jcuda</groupId>
-			<artifactId>jcublas</artifactId>
-			<version>${jcuda.version}</version>
-			<scope>${jcuda.scope}</scope>
-			<exclusions>
-				<exclusion>
-					<!-- always exclude recursive fetching of native libraries -->
-					<groupId>org.jcuda</groupId>
-					<artifactId>jcublas-natives</artifactId>
-				</exclusion>
-			</exclusions>
-		</dependency>
-
-		<dependency>
-			<groupId>org.jcuda</groupId>
-			<artifactId>jcusparse</artifactId>
-			<version>${jcuda.version}</version>
-			<scope>${jcuda.scope}</scope>
-			<exclusions>
-				<exclusion>
-					<!-- always exclude recursive fetching of native libraries -->
-					<groupId>org.jcuda</groupId>
-					<artifactId>jcusparse-natives</artifactId>
-				</exclusion>
-			</exclusions>
-		</dependency>
-
-		<dependency>
-			<groupId>org.jcuda</groupId>
-			<artifactId>jcusolver</artifactId>
-			<version>${jcuda.version}</version>
-			<scope>${jcuda.scope}</scope>
-			<exclusions>
-				<exclusion>
-					<!-- always exclude recursive fetching of native libraries -->
-					<groupId>org.jcuda</groupId>
-					<artifactId>jcusolver-natives</artifactId>
-				</exclusion>
-			</exclusions>
-		</dependency>
-
-		<dependency>
-			<groupId>org.jcuda</groupId>
-			<artifactId>jcudnn</artifactId>
-			<version>${jcuda.version}</version>
-			<scope>${jcuda.scope}</scope>
-			<exclusions>
-				<exclusion>
-					<!-- always exclude recursive fetching of native libraries -->
-					<groupId>org.jcuda</groupId>
-					<artifactId>jcudnn-natives</artifactId>
-				</exclusion>
-			</exclusions>
-		</dependency>
-
-		<!-- for all platforms, to be included in the extra jar -->
-		<dependency>
-			<groupId>org.jcuda</groupId>
-			<artifactId>jcuda-natives</artifactId>
-			<classifier>windows-x86_64</classifier>
-			<version>${jcuda.version}</version>
-			<scope>${jcuda.scope}</scope>
-		</dependency>
-
-		<dependency>
-			<groupId>org.jcuda</groupId>
-			<artifactId>jcublas-natives</artifactId>
-			<classifier>windows-x86_64</classifier>
-			<version>${jcuda.version}</version>
-			<scope>${jcuda.scope}</scope>
-		</dependency>
-
-		<dependency>
-			<groupId>org.jcuda</groupId>
-			<artifactId>jcusparse-natives</artifactId>
-			<classifier>windows-x86_64</classifier>
-			<version>${jcuda.version}</version>
-			<scope>${jcuda.scope}</scope>
-		</dependency>
-
-		<dependency>
-			<groupId>org.jcuda</groupId>
-			<artifactId>jcusolver-natives</artifactId>
-			<classifier>windows-x86_64</classifier>
-			<version>${jcuda.version}</version>
-			<scope>${jcuda.scope}</scope>
-		</dependency>
-
-		<dependency>
-			<groupId>org.jcuda</groupId>
-			<artifactId>jcudnn-natives</artifactId>
-			<classifier>windows-x86_64</classifier>
-			<version>${jcuda.version}</version>
-			<scope>${jcuda.scope}</scope>
-		</dependency>
-
-		<dependency>
-			<groupId>org.jcuda</groupId>
-			<artifactId>jcuda-natives</artifactId>
-			<classifier>linux-x86_64</classifier>
-			<version>${jcuda.version}</version>
-			<scope>${jcuda.scope}</scope>
-		</dependency>
-
-		<dependency>
-			<groupId>org.jcuda</groupId>
-			<artifactId>jcublas-natives</artifactId>
-			<classifier>linux-x86_64</classifier>
-			<version>${jcuda.version}</version>
-			<scope>${jcuda.scope}</scope>
-		</dependency>
-
-		<dependency>
-			<groupId>org.jcuda</groupId>
-			<artifactId>jcusparse-natives</artifactId>
-			<classifier>linux-x86_64</classifier>
-			<version>${jcuda.version}</version>
-			<scope>${jcuda.scope}</scope>
-		</dependency>
-
-		<dependency>
-			<groupId>org.jcuda</groupId>
-			<artifactId>jcusolver-natives</artifactId>
-			<classifier>linux-x86_64</classifier>
-			<version>${jcuda.version}</version>
-			<scope>${jcuda.scope}</scope>
-		</dependency>
-
-		<dependency>
-			<groupId>org.jcuda</groupId>
-			<artifactId>jcudnn-natives</artifactId>
-			<classifier>linux-x86_64</classifier>
-			<version>${jcuda.version}</version>
-			<scope>${jcuda.scope}</scope>
-		</dependency>
-
-		<dependency>
-			<groupId>org.jcuda</groupId>
-			<artifactId>jcuda-natives</artifactId>
-			<classifier>apple-x86_64</classifier>
-			<version>${jcuda.version}</version>
-			<scope>${jcuda.scope}</scope>
-		</dependency>
-
-		<dependency>
-			<groupId>org.jcuda</groupId>
-			<artifactId>jcublas-natives</artifactId>
-			<classifier>apple-x86_64</classifier>
-			<version>${jcuda.version}</version>
-			<scope>${jcuda.scope}</scope>
-		</dependency>
-
-		<dependency>
-			<groupId>org.jcuda</groupId>
-			<artifactId>jcusparse-natives</artifactId>
-			<classifier>apple-x86_64</classifier>
-			<version>${jcuda.version}</version>
-			<scope>${jcuda.scope}</scope>
-		</dependency>
-
-		<dependency>
-			<groupId>org.jcuda</groupId>
-			<artifactId>jcusolver-natives</artifactId>
-			<classifier>apple-x86_64</classifier>
-			<version>${jcuda.version}</version>
-			<scope>${jcuda.scope}</scope>
-		</dependency>
-
-		<dependency>
-			<groupId>org.jcuda</groupId>
-			<artifactId>jcudnn-natives</artifactId>
-			<classifier>apple-x86_64</classifier>
-			<version>${jcuda.version}</version>
-			<scope>${jcuda.scope}</scope>
-		</dependency>
-
-		<dependency>
-			<groupId>org.apache.spark</groupId>
-			<artifactId>spark-core_${scala.binary.version}</artifactId>
-			<version>${spark.version}</version>
-			<exclusions>
-				<exclusion>
-					<groupId>log4j</groupId>
-					<artifactId>log4j</artifactId>
-				</exclusion>
-				<exclusion>
-					<groupId>org.slf4j</groupId>
-					<artifactId>slf4j-log4j12</artifactId>
-				</exclusion>
-				<exclusion>
-					<groupId>org.slf4j</groupId>
-					<artifactId>slf4j-reload4j</artifactId>
-				</exclusion>
-				<exclusion>
-					<groupId>org.slf4j</groupId>
-					<artifactId>slf4j-api</artifactId>
-				</exclusion>
-				<exclusion>
-					<groupId>org.slf4j</groupId>
-					<artifactId>jul-to-slf4j</artifactId>
-				</exclusion>
-				<exclusion>
-					<groupId>org.slf4j</groupId>
-					<artifactId>jcl-over-slf4j</artifactId>
-				</exclusion>
-				<exclusion>
-					<groupId>org.apache.hadoop</groupId>
-					<artifactId>hadoop-client-api</artifactId>
-				</exclusion>
-				<exclusion>
-					<groupId>org.apache.hadoop</groupId>
-					<artifactId>hadoop-client-runtime</artifactId>
-				</exclusion>
-				<exclusion>
-					<groupId>org.apache.hadoop</groupId>
-					<artifactId>hadoop-client-runtime</artifactId>
-				</exclusion>
-			</exclusions>
-		</dependency>
-
-		<dependency>
-			<groupId>org.apache.spark</groupId>
-			<artifactId>spark-sql_${scala.binary.version}</artifactId>
-			<version>${spark.version}</version>
-			<exclusions>
-				<exclusion>
-					<groupId>log4j</groupId>
-					<artifactId>log4j</artifactId>
-				</exclusion>
-				<exclusion>
-					<groupId>org.slf4j</groupId>
-					<artifactId>slf4j-log4j12</artifactId>
-				</exclusion>
-				<exclusion>
-					<groupId>org.slf4j</groupId>
-					<artifactId>slf4j-reload4j</artifactId>
-				</exclusion>
-			</exclusions>
-		</dependency>
-
-		<dependency>
-			<groupId>org.apache.spark</groupId>
-			<artifactId>spark-mllib_${scala.binary.version}</artifactId>
-			<version>${spark.version}</version>
-			<exclusions>
-				<exclusion>
-					<groupId>log4j</groupId>
-					<artifactId>log4j</artifactId>
-				</exclusion>
-				<exclusion>
-					<groupId>org.slf4j</groupId>
-					<artifactId>slf4j-log4j12</artifactId>
-				</exclusion>
-				<exclusion>
-					<groupId>org.slf4j</groupId>
-					<artifactId>slf4j-reload4j</artifactId>
-				</exclusion>
-			</exclusions>
-		</dependency>
-
-		<dependency>
-			<groupId>org.apache.hadoop</groupId>
-			<artifactId>hadoop-common</artifactId>
-			<version>${hadoop.version}</version>
-			<exclusions>
-				<exclusion>
-					<groupId>javax.servlet</groupId>
-					<artifactId>servlet-api</artifactId>
-				</exclusion>
-				<exclusion>
-					<groupId>org.slf4j</groupId>
-					<artifactId>slf4j-api</artifactId>
-				</exclusion>
-				<exclusion>
-					<groupId>org.slf4j</groupId>
-					<artifactId>slf4j-reload4j</artifactId>
-				</exclusion>
-			</exclusions>
-		</dependency>
-
-		<dependency>
-			<groupId>org.apache.hadoop</groupId>
-			<artifactId>hadoop-hdfs</artifactId>
-			<version>${hadoop.version}</version>
-			<exclusions>
-				<exclusion>
-					<groupId>javax.servlet</groupId>
-					<artifactId>servlet-api</artifactId>
-				</exclusion>
-				<exclusion>
-					<groupId>org.slf4j</groupId>
-					<artifactId>slf4j-log4j12</artifactId>
-				</exclusion>
-				<exclusion>
-					<groupId>org.slf4j</groupId>
-					<artifactId>slf4j-reload4j</artifactId>
-				</exclusion>
-			</exclusions>
-		</dependency>
-
-		<dependency>
-			<groupId>org.apache.hadoop</groupId>
-			<artifactId>hadoop-client</artifactId>
-			<version>${hadoop.version}</version>
-			<exclusions>
-				<exclusion>
-					<groupId>log4j</groupId>
-					<artifactId>log4j</artifactId>
-				</exclusion>
-				<exclusion>
-					<groupId>org.slf4j</groupId>
-					<artifactId>slf4j-log4j12</artifactId>
-				</exclusion>
-				<exclusion>
-					<groupId>org.slf4j</groupId>
-					<artifactId>slf4j-reload4j</artifactId>
-				</exclusion>
-			</exclusions>
-		</dependency>
-
-		<dependency>
-			<groupId>commons-logging</groupId>
-			<artifactId>commons-logging</artifactId>
-			<version>1.1.3</version>
-			<exclusions>
-				<exclusion>
-					<groupId>org.slf4j</groupId>
-					<artifactId>slf4j-log4j12</artifactId>
-				</exclusion>
-				<exclusion>
-					<groupId>org.slf4j</groupId>
-					<artifactId>slf4j-reload4j</artifactId>
-				</exclusion>
-			</exclusions>
-		</dependency>
-
-		<dependency>
-			<groupId>org.apache.commons</groupId>
-			<artifactId>commons-math3</artifactId>
-			<version>3.4.1</version>
-		</dependency>
-
-		<dependency>
-			<groupId>org.apache.wink</groupId>
-			<artifactId>wink-json4j</artifactId>
-			<version>1.4</version>
-		</dependency>
-
-		<dependency>
-			<groupId>com.fasterxml.jackson.core</groupId>
-			<artifactId>jackson-databind</artifactId>
-			<version>2.12.6.1</version>
-		</dependency>
-
-		<dependency>
-			<groupId>junit</groupId>
-			<artifactId>junit</artifactId>
-			<version>4.13.1</version>
-			<scope>provided</scope>
-		</dependency>
-
-		<dependency>
-			<groupId>org.openjdk.jol</groupId>
-			<artifactId>jol-core</artifactId>
-			<version>0.10</version>
-			<scope>test</scope>
-		</dependency>
-
-		<dependency>
-			<!--Used for annotations in tests to execute tests in thread safe manner-->
-			<groupId>com.github.stephenc.jcip</groupId>
-			<artifactId>jcip-annotations</artifactId>
-			<version>1.0-1</version>
-			<scope>test</scope>
-		</dependency>
-
-		<!-- fast java compiler for codegen, consistent version w/ spark -->
-		<dependency>
-			<groupId>org.codehaus.janino</groupId>
-			<artifactId>janino</artifactId>
-			<version>3.0.16</version>
-			<scope>provided</scope>
-		</dependency>
-
-		<dependency>
-			<groupId>org.antlr</groupId>
-			<artifactId>antlr4</artifactId>
-			<version>${antlr.version}</version>
-			<scope>provided</scope>
-			<exclusions>
-				<exclusion>
-					<artifactId>antlr-runtime</artifactId>
-					<groupId>org.antlr</groupId>
-				</exclusion>
-			</exclusions>
-		</dependency>
-
-		<dependency>
-			<groupId>org.antlr</groupId>
-			<artifactId>antlr4-runtime</artifactId>
-			<version>${antlr.version}</version>
-		</dependency>
-
-		<dependency>
-			<groupId>org.apache.derby</groupId>
-			<artifactId>derby</artifactId>
-			<version>10.14.2.0</version>
-			<scope>provided</scope>
-		</dependency>
-
-		<dependency>
-			<groupId>io.netty</groupId>
-			<artifactId>netty-all</artifactId>
-			<version>4.1.68.Final</version>
-			<scope>provided</scope>
-			<exclusions>
-				<exclusion>
-					<groupId>org.apache.logging.log4j</groupId>
-					<artifactId>log4j-api</artifactId>
-				</exclusion>
-				<exclusion>
-					<groupId>org.apache.logging.log4j</groupId>
-					<artifactId>log4j-1.2-api</artifactId>
-				</exclusion>
-			</exclusions>
-		</dependency>
-
-		<dependency>
-			<groupId>net.sf.py4j</groupId>
-			<artifactId>py4j</artifactId>
-			<version>0.10.9</version>
-		</dependency>
-
-		<!-- https://mvnrepository.com/artifact/org.apache.maven.plugins/maven-javadoc-plugin -->
-		<dependency>
-			<groupId>org.apache.maven.plugins</groupId>
-			<artifactId>maven-javadoc-plugin</artifactId>
-			<version>3.2.0</version>
-		</dependency>
-
-		<!-- https://mvnrepository.com/artifact/org.apache.maven.plugins/maven-gpg-plugin -->
-		<dependency>
-			<groupId>org.apache.maven.plugins</groupId>
-			<artifactId>maven-gpg-plugin</artifactId>
-			<version>1.6</version>
-		</dependency>
-
-		<!-- https://developers.google.com/protocol-buffers/docs/reference/java -->
-		<dependency>
-			<groupId>com.google.protobuf</groupId>
-			<artifactId>protobuf-java</artifactId>
-			<version>${protobuf.version}</version>
-		</dependency>
-
-		<dependency>
-			<groupId>com.google.protobuf</groupId>
-			<artifactId>protobuf-java-util</artifactId>
-			<version>${protobuf.version}</version>
-		</dependency>
-
-		<dependency>
-			<groupId>org.apache.maven.plugins</groupId>
-			<artifactId>maven-assembly-plugin</artifactId>
-			<version>3.3.0</version>
-		</dependency>
-
-		<dependency>
-			<groupId>org.slf4j</groupId>
-			<artifactId>slf4j-api</artifactId>
-			<version>${slf4j.version}</version>
-		</dependency>
-		<dependency>
-			<groupId>org.slf4j</groupId>
-			<artifactId>jul-to-slf4j</artifactId>
-			<version>${slf4j.version}</version>
-		</dependency>
-		<dependency>
-			<groupId>org.slf4j</groupId>
-			<artifactId>jcl-over-slf4j</artifactId>
-			<version>${slf4j.version}</version>
-		</dependency>
-		<dependency>
-			<groupId>org.slf4j</groupId>
-			<artifactId>slf4j-reload4j</artifactId>
-			<version>${slf4j.version}</version>
-		</dependency>
-
-		<dependency>
-			<groupId>org.apache.logging.log4j</groupId>
-			<artifactId>log4j-api</artifactId>
-			<version>${log4j.version}</version>
-		</dependency>
-	</dependencies>
-</project>
\ No newline at end of file
diff --git a/scripts/staging/SIMD-double-vectors/systemds b/scripts/staging/SIMD-double-vectors/systemds
deleted file mode 100755
index 6c17a8e0faf..00000000000
--- a/scripts/staging/SIMD-double-vectors/systemds
+++ /dev/null
@@ -1,487 +0,0 @@
-#!/usr/bin/env bash
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-
-##############################################################
-# This script is part of the SystemDS binary release. It is
-# meant to work out of the box when unzipping the
-# systemds-<version>.zip (or tbz) file.
-#
-# Make configuration changes here:
-##############################################################
-
-#  If not set by env,  set to 1 to run spark-submit instead of local java
-#  This should be used to run with spark-submit instead of java
-if [[ -z "$SYSDS_DISTRIBUTED" ]]; then
-  SYSDS_DISTRIBUTED=0
-fi
-
-# if not set by env, set to 1 to disable setup output of this script
-if [ -z "$SYSDS_QUIET" ]; then
-  SYSDS_QUIET=0
-fi
-
-# if not set by env, set to default exec modes
-if [[ -z "$SYSDS_EXEC_MODE" ]]; then
-  case "$SYSDS_DISTRIBUTED" in
-    0) SYSDS_EXEC_MODE=singlenode ;;
-    *) SYSDS_EXEC_MODE=hybrid ;;
-  esac
-fi
-
-# an echo toggle
-print_out()
-{
-  if [ $SYSDS_QUIET == 0 ]; then
-    echo "$1"
-  fi
-}
-
-if [[ -z $SYSTEMDS_ROOT ]] ; then
-  SYSTEMDS_ROOT=.
-  print_out "SYSTEMDS_ROOT not set defaulting to current dir $(pwd)"
-else
-  # construct a relative path
-  SYSTEMDS_ROOT=$(realpath --relative-to=. ${SYSTEMDS_ROOT})
-fi;
-
-# when using find, look in the directories in this order
-DIR_SEARCH_ORDER=". $SYSTEMDS_ROOT $SYSTEMDS_ROOT/conf  $SYSTEMDS_ROOT/lib $SYSTEMDS_ROOT/src $SYSTEMDS_ROOT/target"
-ordered_find() {
-  result=""
-  for dir in $(echo "$DIR_SEARCH_ORDER" | tr ' ' '\n') ; do
-    if [[ $dir == "$SYSTEMDS_ROOT" ]] || [[ $dir == "." ]]; then
-      result=$(find "$dir" -maxdepth 1 -iname "$1" -print -quit)
-      if [[ $result != "" ]]; then break; fi
-    else
-      result=$(find "$dir" -iname "$1" -print -quit 2> /dev/null)
-      if [[ $result != "" ]]; then break; fi
-    fi
-  done
-  echo "$result"
-}
-
-if [ -n "$SYSTEMDS_STANDALONE_OPTS" ]; then
-	print_out "Overriding SYSTEMDS_STANDALONE_OPTS with env var: $SYSTEMDS_STANDALONE_OPTS"
-else
-  # specify parameters  to java when running locally here
-  SYSTEMDS_STANDALONE_OPTS="-Xmx4g -Xms4g -Xmn400m "
-fi
-
-if [ -n "$SYSTEMDS_REMOTE_DEBUGGING" ]; then
-	print_out "Overriding SYSTEMDS_REMOTE_DEBUGGING with env var: $SYSTEMDS_REMOTE_DEBUGGING"
-else
-  SYSTEMDS_REMOTE_DEBUGGING=" -agentlib:jdwp=transport=dt_socket,suspend=y,address=8787,server=y "
-fi
-
-# check if log4j config file exists, otherwise unset
-# to run with a non fatal complaint by SystemDS
-if [ -z "$LOG4JPROP" ] ; then
-  LOG4JPROP=$(ordered_find "log4j*properties")
-
-  if [ -z "${LOG4JPROP}" ]; then
-    LOG4JPROP=""
-  else
-    LOG4JPROP="-Dlog4j.configuration=file:$LOG4JPROP"
-  fi
-else
-  # L4J was set by env var. Unset if that setting is wrong
-  LOG4JPROP2=$(find "$LOG4JPROP")
-  if [ -z "${LOG4JPROP2}" ]; then
-    LOG4JPROP=""
-  else
-    LOG4JPROP="-Dlog4j.configuration=file:$LOG4JPROP2"
-  fi
-fi
-
-if [ -n "${SYSTEMDS_DISTRIBUTED_OPTS}" ]; then
-  print_out "Overriding SYSTEMDS_DISTRIBUTED_OPTS with env var $SYSTEMDS_DISTRIBUTED_OPTS"
-else
-  # specify parameters to pass to spark-submit when running on spark here
-  SYSTEMDS_DISTRIBUTED_OPTS="\
-      --master yarn \
-      --deploy-mode client \
-      --driver-memory 100g \
-      --conf spark.driver.extraJavaOptions=\"-Xms100g -Xmn10g -Dlog4j.configuration=file:$LOG4JPROP\" \
-      --conf spark.executor.extraJavaOptions=\"-Dlog4j.configuration=file:$LOG4JPROP\" \
-      --conf spark.executor.heartbeatInterval=100s \
-      --files $LOG4JPROP \
-      --conf spark.network.timeout=512s \
-      --num-executors 4 \
-      --executor-memory 64 \
-      --executor-cores 16 "
-fi
-
-
-##############################################################
-# No need to touch the content below. These commands launch
-# SystemDS based on the settings above.
-##############################################################
-
-
-#-------------------------------------------------------------
-# some helper functions
-
-# error help print
-PRINT_SYSDS_HELP=0
-function printUsage {
-cat << EOF
-
-Usage: $0 [-r] [SystemDS.jar] [-f] <dml-filename> [arguments] [-help]
-
-    SystemDS.jar : Specify a custom SystemDS.jar file (this will be prepended
-                   to the classpath
-                   or fed to spark-submit
-    -r           : Spawn a debug server for remote debugging (standalone and
-                   spark driver only atm). Default port is 8787 - change within
-                   this script if necessary. See SystemDS documentation on how
-                   to attach a remote debugger.
-    -f           : Optional prefix to the dml-filename for consistency with
-                   previous behavior dml-filename : The script file to run.
-                   This is mandatory unless running as a federated worker
-                   (see below).
-    arguments    : The arguments specified after the DML script are passed to
-                   SystemDS. Specify parameters that need to go to
-                   java/spark-submit by editing this run script.
-    -help        : Print this usage message and SystemDS parameter info
-
-Worker Usage: $0 [-r] WORKER [SystemDS.jar] <portnumber> [arguments] [-help]
-
-    port         : The port to open for the federated worker.
-
-Set custom launch configuration by setting/editing SYSTEMDS_STANDALONE_OPTS
-and/or SYSTEMDS_DISTRIBUTED_OPTS.
-
-Set the environment variable SYSDS_DISTRIBUTED=1 to run spark-submit instead of
-local java Set SYSDS_QUIET=1 to omit extra information printed by this run
-script.
-
-EOF
-if [ ${PRINT_SYSDS_HELP} -eq 0 ]; then
-  exit 0
-fi
-}
-
-# print an error if no argument is supplied.
-if [ -z "$1" ] ; then
-    echo "Wrong Usage. Add -help for additional parameters.";
-    echo ""
-    printUsage;
-fi
-
-#This loop handles the parameters to the run-script, not the ones passed to SystemDS.
-#To not confuse getopts with SystemDS parameters, only the first two params are considered
-#here. If more run-script params are needed, adjust the next line accordingly
-while getopts ":hr:f:" options "$1$2"; do
-  case $options in
-    h ) echo "Help requested. Will exit after extended usage message!"
-        PRINT_SYSDS_HELP=1
-        printUsage
-        break
-        ;;
-    \? ) echo "Unknown parameter -$OPTARG"
-        printUsage
-        exit
-        ;;
-    f )
-        # silently remove -f (this variant is triggered if there's no
-        # jar file or WORKER as first parameter)
-        if  echo "$OPTARG" | grep -qi "dml"; then
-          break
-        else
-          print_out "No DML Script found after -f option."
-        fi
-        ;;
-    r )
-        print_out "Spawning server for remote debugging"
-        if [ $SYSDS_DISTRIBUTED == 0 ]; then
-          SYSTEMDS_STANDALONE_OPTS=${SYSTEMDS_STANDALONE_OPTS}${SYSTEMDS_REMOTE_DEBUGGING}
-        else
-          SYSTEMDS_DISTRIBUTED_OPTS=${SYSTEMDS_DISTRIBUTED_OPTS}${SYSTEMDS_REMOTE_DEBUGGING}
-        fi
-        shift # remove -r from positional arguments
-        ;;
-    * )
-      print_out "Error: Unexpected error while processing options;"
-      printUsage
-      exit
-  esac
-done
-
-# Peel off first and/or second argument so that $@ contains arguments to DML script
-if  echo "$1" | grep -q "jar"; then
-  SYSTEMDS_JAR_FILE=$1
-  shift
-  # handle optional '-f' before DML file (for consistency)
-  if  echo "$1" | grep -q "\-f"; then
-    shift
-    SCRIPT_FILE=$1
-    shift
-  else
-    SCRIPT_FILE=$1
-    shift
-  fi
-elif echo "$1" | grep -q "WORKER"; then
-  WORKER=1
-  shift
-  if echo "$1" | grep -q "jar"; then
-    SYSTEMDS_JAR_FILE=$1
-    shift
-  fi
-  PORT=$1
-  re='^[0-9]+$'
-  if ! [[ $PORT =~ $re ]] ; then
-    echo "error: Port is not a number"
-    printUsage
-  fi
-  shift
-elif echo "$1" | grep -q "FEDMONITORING"; then
-  FEDMONITORING=1
-  shift
-  if echo "$1" | grep -q "jar"; then
-    SYSTEMDS_JAR_FILE=$1
-    shift
-  fi
-  PORT=$1
-  re='^[0-9]+$'
-  if ! [[ $PORT =~ $re ]] ; then
-    echo "error: Port is not a number"
-    printUsage
-  fi
-  shift
-else
-  # handle optional '-f' before DML file (for consistency)
-  if  echo "$1" | grep -q "\-f"; then
-    shift
-    SCRIPT_FILE=$1
-    shift
-  else
-    SCRIPT_FILE=$1
-    shift
-  fi
-fi
-
-if [ -z "$WORKER" ] ; then
-  WORKER=0
-fi
-
-if [ -z "$FEDMONITORING" ] ; then
-  FEDMONITORING=0
-fi
-
-# find me a SystemDS jar file to run
-if [ -z "$SYSTEMDS_JAR_FILE" ];then
-  SYSTEMDS_JAR_FILE=$(ordered_find "systemds.jar")
-  if [ -z "$SYSTEMDS_JAR_FILE" ];then
-    SYSTEMDS_JAR_FILE=$(ordered_find "systemds-?.?.?.jar")
-    if [ -z "$SYSTEMDS_JAR_FILE" ];then
-      SYSTEMDS_JAR_FILE=$(ordered_find "systemds-?.?.?-SNAPSHOT.jar")
-    fi
-  fi
-else
-  print_out "Using user supplied systemds jar file $SYSTEMDS_JAR_FILE"
-fi
-
-if [[ "$*" == *-config* ]]; then
-# override config file from env var if given as parameter to SystemDS
-  read -r -d '' -a myArray < <( echo "$@" )
-  INDEX=0
-  for i in "${myArray[@]}"; do
-    if [[ ${myArray[INDEX]} == *-config* ]]; then
-      if [ -f "${myArray[((INDEX+1))]}" ]; then
-        CONFIG_FILE="${myArray[((INDEX+1))]}"
-      else
-        echo Warning! Passed config file "${myArray[((INDEX+1))]}" does not exist.
-      fi
-      # remove -config
-      unset 'myArray[INDEX]'
-      
-      # remove -config param if not starting with -
-      if [[ "${myArray[((INDEX+1))]:0:1}" != "-" ]]; then
-        unset 'myArray[((INDEX+1))]'
-      fi
-      # setting the script arguments without the passed -config for further processing  
-      set -- "${myArray[@]}"
-      break;
-    fi
-    # debug print array item
-    #echo "${myArray[INDEX]}" 
-    (( INDEX=INDEX+1 ))
-  done
-
-  if [ -f "$CONFIG_FILE" ] ; then
-    CONFIG_FILE="-config $CONFIG_FILE"
-  else
-    CONFIG_FILE=""
-  fi
-elif [ -z "$CONFIG_FILE" ] ; then
-  # same as above: set config file param if the file exists
-  CONFIG_FILE=$(ordered_find "SystemDS-config-defaults.xml")
-  if [ -z "$CONFIG_FILE" ]; then
-    CONFIG_FILE=$(ordered_find "SystemDS-config.xml")
-  fi
-  if [ -z "$CONFIG_FILE" ]; then
-    CONFIG_FILE=""
-  else
-    CONFIG_FILE="-config $CONFIG_FILE"
-  fi
-else
-  # CONFIG_FILE was set by env var. Unset if that setting is wrong
-  if [ -f "${CONFIG_FILE}" ]; then
-    CONFIG_FILE="-config $CONFIG_FILE"
-  else
-    CONFIG_FILE=""
-  fi
-fi
-
-# override exec mode if given as parameter to SystemDS (e.g. -exec singlenode)
-read -r -d '' -a myArray < <( echo "$@" )
-INDEX=0
-for i in "${myArray[@]}"; do
-  if [[ "$i" == *-exec* ]]; then
-    SYSDS_EXEC_MODE="${myArray[((INDEX+1))]}"
-    break;
-  fi
-  (( INDEX=INDEX+1 ))
-done
-
-if [ $SYSDS_DISTRIBUTED -ne 0 ] && [[ $SYSDS_EXEC_MODE == "singlenode" ]]; then
-  echo "Error: Can not run on Spark with execution mode singlenode"
-  exit 1
-fi
-
-# find absolute path to hadoop home in SYSTEMDS_ROOT
-if [ -z "$HADOOP_HOME" ]; then
-  HADOOP_HOME=$(realpath "$(find "$SYSTEMDS_ROOT" -iname hadoop | tail -n 1 )")
-  export HADOOP_HOME
-fi
-# add hadoop home to path and lib path for loading hadoop jni
-HADOOP_REL=$(realpath --relative-to=. "$HADOOP_HOME")
-
-# default directory separator unix style
-DIR_SEP=/
-# detect operating system to set correct path separator
-if [ "$OSTYPE" == "win32" ] ||  [ "$OSTYPE" == "msys" ] ||  [ "$OSTYPE" == "cygwin" ]; then
-  PATH_SEP=\;
-  DIR_SEP=\\
-  HADOOP_REL="${HADOOP_REL////\\}"
-else
-  PATH_SEP=:
-fi
-
-# make the jar path relative to skip issues with Windows paths
-JARNAME=$(basename "$SYSTEMDS_JAR_FILE")
-
-# relative path to jar file
-SYSTEMDS_JAR_FILE=$(realpath --relative-to=. "$(dirname "$SYSTEMDS_JAR_FILE")")${DIR_SEP}${JARNAME}
-
-NATIVE_LIBS="$SYSTEMDS_ROOT${DIR_SEP}target${DIR_SEP}classes${DIR_SEP}lib"
-export PATH=${HADOOP_REL}${DIR_SEP}bin${PATH_SEP}${PATH}${PATH_SEP}$NATIVE_LIBS
-export LD_LIBRARY_PATH=${HADOOP_REL}${DIR_SEP}bin${PATH_SEP}${LD_LIBRARY_PATH}
-
-# set java class path
-CLASSPATH="${SYSTEMDS_JAR_FILE}${PATH_SEP} \
-          ${SYSTEMDS_ROOT}${DIR_SEP}lib${DIR_SEP}*${PATH_SEP} \
-          ${SYSTEMDS_ROOT}${DIR_SEP}target${DIR_SEP}lib${DIR_SEP}*"
-# trim whitespace (introduced by the line breaks above)
-CLASSPATH=$(echo "${CLASSPATH}" | tr -d '[:space:]')
-
-if [ $PRINT_SYSDS_HELP == 1 ]; then
-  echo "----------------------------------------------------------------------"
-  echo "Further help on SystemDS arguments:"
-  java -cp "$CLASSPATH" org.apache.sysds.api.DMLScript -help
-  exit 1
-fi
-
-print_out "###############################################################################"
-print_out "#  SYSTEMDS_ROOT= $SYSTEMDS_ROOT"
-print_out "#  SYSTEMDS_JAR_FILE= $SYSTEMDS_JAR_FILE"
-print_out "#  SYSDS_EXEC_MODE= $SYSDS_EXEC_MODE"
-print_out "#  CONFIG_FILE= $CONFIG_FILE"
-print_out "#  LOG4JPROP= $LOG4JPROP"
-print_out "#  CLASSPATH= $CLASSPATH"
-print_out "#  HADOOP_HOME= $HADOOP_HOME"
-
-#build the command to run
-if [ $WORKER == 1 ]; then
-  print_out "#"
-  print_out "#  starting Federated worker on port $PORT"
-  print_out "###############################################################################"
-  CMD=" \
-  java $SYSTEMDS_STANDALONE_OPTS \
-  -cp $CLASSPATH \
-  $LOG4JPROP \
-  org.apache.sysds.api.DMLScript \
-  -w $PORT \
-  $CONFIG_FILE \
-  $*"
-  print_out "Executing command: $CMD"
-  print_out  ""
-
-elif [ "$FEDMONITORING" == 1 ]; then
-  print_out "#"
-  print_out "#  starting Federated backend monitoring on port $PORT"
-  print_out "###############################################################################"
-  CMD=" \
-  java $SYSTEMDS_STANDALONE_OPTS \
-  -cp $CLASSPATH \
-  $LOG4JPROP \
-  org.apache.sysds.api.DMLScript \
-  -fedMonitoring $PORT \
-  $CONFIG_FILE \
-  $*"
-  print_out "Executing command: $CMD"
-  print_out  ""
-
-elif [ $SYSDS_DISTRIBUTED == 0 ]; then
-  print_out "#"
-  print_out "#  Running script $SCRIPT_FILE locally with opts: $*"
-  print_out "###############################################################################"
-  CMD=" \
-  java $SYSTEMDS_STANDALONE_OPTS \
-  -cp $CLASSPATH \
-  $LOG4JPROP \
-  --add-modules=jdk.incubator.vector \
-  org.apache.sysds.api.DMLScript \
-  -f $SCRIPT_FILE \
-  -exec $SYSDS_EXEC_MODE \
-  $CONFIG_FILE \
-  $*"
-  print_out "Executing command:  $CMD"
-  print_out ""
-else
-  print_out "#"
-  print_out "#  Running script $SCRIPT_FILE distributed with opts: $*"
-  print_out "###############################################################################"
-  export SPARK_MAJOR_VERSION=2
-  CMD=" \
-  spark-submit $SYSTEMDS_DISTRIBUTED_OPTS \
-  $SYSTEMDS_JAR_FILE \
-  -f $SCRIPT_FILE \
-  -exec $SYSDS_EXEC_MODE \
-  $CONFIG_FILE \
-  $*"
-  print_out "Executing command: $CMD"
-  print_out  ""
-fi
-
-# run
-eval "$CMD"
diff --git a/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixMult.java b/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixMult.java
index a9f4beaed29..af702cb7fad 100644
--- a/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixMult.java
+++ b/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixMult.java
@@ -3717,26 +3717,22 @@ public static double dotProduct( double[] a, double[] b, int ai, int bi, final i
 	public static double dotProduct( double[] a, double[] b, int[] aix, int ai, final int bi, final int len )
 	{
 		double val = 0;
-		final int bn = len%8;
+		final int bn = len%vLen;
 				
 		//compute rest
 		for( int i = ai; i < ai+bn; i++ )
 			val += a[ i ] * b[ bi+aix[i] ];
 		
-		//unrolled 8-block (for better instruction-level parallelism)
-		for( int i = ai+bn; i < ai+len; i+=8 )
+		//unrolled vLen-block (for better instruction-level parallelism)
+		for( int i = ai+bn; i < ai+len; i+=vLen)
 		{
 			//read 64B cacheline of a
 			//read 64B of b via 'gather'
 			//compute cval' = sum(a * b) + cval
-			val += a[ i+0 ] * b[ bi+aix[i+0] ]
-			     + a[ i+1 ] * b[ bi+aix[i+1] ]
-			     + a[ i+2 ] * b[ bi+aix[i+2] ]
-			     + a[ i+3 ] * b[ bi+aix[i+3] ]
-			     + a[ i+4 ] * b[ bi+aix[i+4] ]
-			     + a[ i+5 ] * b[ bi+aix[i+5] ]
-			     + a[ i+6 ] * b[ bi+aix[i+6] ]
-			     + a[ i+7 ] * b[ bi+aix[i+7] ];
+			var aVec = DoubleVector.fromArray(SPECIES, a, i);
+			var bVec = DoubleVector.fromArray(SPECIES, b, bi, aix, i);
+			val += aVec.mul(bVec).reduceLanes(VectorOperators.ADD);
+
 		}
 		
 		//scalar result