diff --git a/scripts/staging/SIMD-double-vectors/LibMatrixMult.java b/scripts/staging/SIMD-double-vectors/LibMatrixMult.java
deleted file mode 100644
index e1e3a640e43..00000000000
--- a/scripts/staging/SIMD-double-vectors/LibMatrixMult.java
+++ /dev/null
@@ -1,4445 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package org.apache.sysds.runtime.matrix.data;
-
-import java.util.ArrayList;
-import java.util.Arrays;
-import java.util.List;
-import java.util.concurrent.Callable;
-import java.util.concurrent.ExecutionException;
-import java.util.concurrent.ExecutorService;
-import java.util.concurrent.Future;
-import java.util.stream.IntStream;
-
-import jdk.incubator.vector.DoubleVector;
-import jdk.incubator.vector.VectorOperators;
-import jdk.incubator.vector.VectorSpecies;
-import org.apache.commons.logging.Log;
-import org.apache.commons.logging.LogFactory;
-import org.apache.commons.math3.util.FastMath;
-import org.apache.sysds.hops.OptimizerUtils;
-import org.apache.sysds.lops.MapMultChain.ChainType;
-import org.apache.sysds.lops.WeightedCrossEntropy.WCeMMType;
-import org.apache.sysds.lops.WeightedDivMM.WDivMMType;
-import org.apache.sysds.lops.WeightedSigmoid.WSigmoidType;
-import org.apache.sysds.lops.WeightedSquaredLoss.WeightsType;
-import org.apache.sysds.lops.WeightedUnaryMM.WUMMType;
-import org.apache.sysds.runtime.DMLRuntimeException;
-import org.apache.sysds.runtime.controlprogram.parfor.stat.InfrastructureAnalyzer;
-import org.apache.sysds.runtime.data.DenseBlock;
-import org.apache.sysds.runtime.data.DenseBlockFactory;
-import org.apache.sysds.runtime.data.SparseBlock;
-import org.apache.sysds.runtime.data.SparseBlock.Type;
-import org.apache.sysds.runtime.data.SparseBlockCSR;
-import org.apache.sysds.runtime.data.SparseBlockFactory;
-import org.apache.sysds.runtime.data.SparseBlockMCSR;
-import org.apache.sysds.runtime.data.SparseRowScalar;
-import org.apache.sysds.runtime.functionobjects.SwapIndex;
-import org.apache.sysds.runtime.functionobjects.ValueFunction;
-import org.apache.sysds.runtime.matrix.operators.ReorgOperator;
-import org.apache.sysds.runtime.util.CommonThreadPool;
-import org.apache.sysds.runtime.util.UtilFunctions;
-import org.apache.sysds.utils.NativeHelper;
-
-/**
- * MB: Library for matrix multiplications including MM, MV, VV for all
- * combinations of dense, sparse, ultrasparse representations and special
- * operations such as transpose-self matrix multiplication.
- *
- * In general all implementations use internally dense outputs
- * for direct access, but change the final result to sparse if necessary.
- * The only exceptions are ultra-sparse matrix mult, wsloss and wsigmoid.
- */
-public class LibMatrixMult
-{
- //internal configuration
- private static final boolean LOW_LEVEL_OPTIMIZATION = true; //if true, e.g., matrixMultDenseDense dispatches to shape-specialized kernels
- private static final long MEM_OVERHEAD_THRESHOLD = 2L*1024*1024; //MAX 2 MB
- private static final long PAR_MINFLOP_THRESHOLD1 = 2L*1024*1024; //MIN 2 MFLOP
- private static final long PAR_MINFLOP_THRESHOLD2 = 128L*1024; //MIN 128 KFLOP
- public static final int L2_CACHESIZE = 256 * 1024; //256KB (common size)
- public static final int L3_CACHESIZE = 16 * 1024 * 1024; //16MB (common size)
- private static final Log LOG = LogFactory.getLog(LibMatrixMult.class.getName());
-
- static final VectorSpecies SPECIES = DoubleVector.SPECIES_PREFERRED; //preferred double vector species of the running platform
-
- private LibMatrixMult() {
- //library of static methods only: the private constructor prevents instantiation
- }
-
- ////////////////////////////////
- // public matrix mult interface
- ////////////////////////////////
-
- /**
- * Performs a single-threaded matrix multiplication m1 %*% m2 into a newly allocated output block.
- *
- * All variants use an IKJ access pattern, and internally use dense output. After the
- * actual computation, we recompute nnz and check for sparse/dense representation.
- *
- * @param m1 first matrix (left-hand side)
- * @param m2 second matrix (right-hand side)
- * @return the result matrix block
- */
- public static MatrixBlock matrixMult(MatrixBlock m1, MatrixBlock m2) {
- return matrixMult(m1, m2, null, false, 1);
- }
-
- /**
- * Performs a matrix multiplication m1 %*% m2 with parallelism of at most k into a newly allocated output block.
- *
- * All variants use an IKJ access pattern, and internally use dense output. After the
- * actual computation, we recompute nnz and check for sparse/dense representation.
- *
- * @param m1 first matrix (left-hand side)
- * @param m2 second matrix (right-hand side)
- * @param k maximum parallelism
- * @return the result matrix block
- */
- public static MatrixBlock matrixMult(MatrixBlock m1, MatrixBlock m2, int k) {
- return matrixMult(m1, m2, null, false, k);
- }
-
- /**
- * Performs a single-threaded matrix multiplication and stores the result in the output matrix.
- *
- * All variants use an IKJ access pattern, and internally use dense output. After the
- * actual computation, we recompute nnz and check for sparse/dense representation.
- *
- * @param m1 first matrix (left-hand side)
- * @param m2 second matrix (right-hand side)
- * @param ret result matrix (a new block is allocated if null)
- * @return the result matrix block
- */
- public static MatrixBlock matrixMult(MatrixBlock m1, MatrixBlock m2, MatrixBlock ret) {
- return matrixMult(m1, m2, ret, false, 1);
- }
-
- /**
- * Single-threaded matrix multiplication that allows disabling the final sparsity examination. This feature is useful if matrixMult is used as an intermediate
- * operation (for example: LibMatrixDNN). It makes sense for LibMatrixDNN because the output is internally
- * consumed by another dense instruction, which makes repeated conversion to sparse wasteful.
- * This should be used in rare cases and if you are unsure,
- * use the method 'matrixMult(MatrixBlock m1, MatrixBlock m2, MatrixBlock ret)' instead.
- *
- * @param m1 first matrix (left-hand side)
- * @param m2 second matrix (right-hand side)
- * @param ret result matrix
- * @param fixedRet if true, output representation is fixed and nnzs not recomputed
- * @return the result matrix block
- */
- public static MatrixBlock matrixMult(MatrixBlock m1, MatrixBlock m2, MatrixBlock ret, boolean fixedRet) {
- return matrixMult(m1, m2, ret, fixedRet, 1);
- }
-
- /**
- * Performs a multi-threaded matrix multiplication and stores the result in the output matrix.
- * The parameter k (k>=1) determines the max parallelism k' with k'=min(k, vcores, m1.rlen).
- *
- * @param m1 first matrix (left-hand side)
- * @param m2 second matrix (right-hand side)
- * @param ret result matrix (a new block is allocated if null)
- * @param k maximum parallelism
- * @return the result matrix block
- */
- public static MatrixBlock matrixMult(MatrixBlock m1, MatrixBlock m2, MatrixBlock ret, int k) {
- return matrixMult(m1, m2, ret, false, k);
- }
-
- /**
- * Performs a matrix multiplication and stores the result in the output matrix.
- *
- * All variants use an IKJ access pattern, and internally use dense output. After the
- * actual computation, we recompute nnz and check for sparse/dense representation.
- *
- * This method allows disabling the final sparsity examination. This feature is useful if matrixMult is used as an intermediate
- * operation (for example: LibMatrixDNN). It makes sense for LibMatrixDNN because the output is internally
- * consumed by another dense instruction, which makes repeated conversion to sparse wasteful.
- * This should be used in rare cases and if you are unsure,
- * use the method 'matrixMult(MatrixBlock m1, MatrixBlock m2, MatrixBlock ret)' instead.
- *
- * The parameter k (k>=1) determines the max parallelism k' with k'=min(k, vcores, m1.rlen).
- *
- * @param m1 first matrix (left-hand side)
- * @param m2 second matrix (right-hand side)
- * @param ret result matrix (a new block is allocated if null; it is dereferenced if fixedRet is true)
- * @param fixedRet if true, output representation is fixed and nnzs not recomputed (forces single-threaded execution)
- * @param k maximum parallelism
- * @return the result matrix block
- */
- public static MatrixBlock matrixMult(MatrixBlock m1, MatrixBlock m2, MatrixBlock ret, boolean fixedRet, int k) {
- if(m1.isEmptyBlock(false) || m2.isEmptyBlock(false)) // an empty input yields an empty result
- return emptyMatrixMult(m1, m2, ret);
-
- // Timing time = new Timing(true);
-
- // pre analysis (with fixedRet the output format follows ret.sparse, so ret must not be null then)
- boolean m1Perm = m1.isSparsePermutationMatrix();
- boolean ultraSparse = (fixedRet && ret.sparse) ||
- (!fixedRet && isUltraSparseMatrixMult(m1, m2, m1Perm));
- boolean sparse = !fixedRet && !ultraSparse && !m1Perm
- && isSparseOutputMatrixMult(m1, m2);
-
- // allocate output (new block or reset of the given block to m1.rlen x m2.clen)
- if(ret == null)
- ret = new MatrixBlock(m1.rlen, m2.clen, ultraSparse | sparse);
- else
- ret.reset(m1.rlen, m2.clen, ultraSparse | sparse);
- ret.allocateBlock();
-
- // Detect if we should transpose skinny right side (never done for fixedRet).
- boolean tm2 = !fixedRet && checkPrepMatrixMultRightInput(m1,m2);
- m2 = prepMatrixMultRightInput(m1, m2, tm2);
-
- // check for multi-threading (otherwise fall back to k=1)
- if (!ret.isThreadSafe()
- || !satisfiesMultiThreadingConstraints(m1, m2, m1.rlen==1, true, 2, k)
- || fixedRet) // Fixed ret not supported in multithreaded execution yet
- k = 1;
-
- if(k <= 1) // sequential execution
- singleThreadedMatrixMult(m1, m2, ret, ultraSparse, sparse, tm2, m1Perm, fixedRet);
- else // parallel execution with k threads
- parallelMatrixMult(m1, m2, ret, k, ultraSparse, sparse, tm2, m1Perm);
-
- //System.out.println("MM "+k+" ("+m1.isInSparseFormat()+","+m1.getNumRows()+","+m1.getNumColumns()+","+m1.getNonZeros()+")x" +
- // "("+m2.isInSparseFormat()+","+m2.getNumRows()+","+m2.getNumColumns()+","+m2.getNonZeros()+") in "+time.stop());
-
- return ret;
- }
-
- private static void singleThreadedMatrixMult(MatrixBlock m1, MatrixBlock m2, MatrixBlock ret, // sequential mm: selects the kernel by input formats
- boolean ultraSparse, boolean sparse, boolean tm2, boolean m1Perm, boolean fixedRet){
- // prepare row-upper for special cases of vector-matrix (iterate over rows of m2 instead of rows of m1)
- final boolean pm2 = !ultraSparse && checkParMatrixMultRightInputRows(m1, m2, Integer.MAX_VALUE);
- final int ru2 = (pm2) ? m2.rlen : m1.rlen;
-
- // core matrix mult computation (the ultra-sparse kernel is only used if the output format is not fixed)
- if(ultraSparse && !fixedRet)
- matrixMultUltraSparse(m1, m2, ret, m1Perm, 0, ru2);
- else if(!m1.sparse && !m2.sparse)
- matrixMultDenseDense(m1, m2, ret, tm2, pm2, 0, ru2, 0, m2.clen);
- else if(m1.sparse && m2.sparse)
- matrixMultSparseSparse(m1, m2, ret, pm2, sparse, 0, ru2);
- else if(m1.sparse)
- matrixMultSparseDense(m1, m2, ret, pm2, 0, ru2);
- else
- matrixMultDenseSparse(m1, m2, ret, pm2, 0, ru2);
-
- // post-processing: nnz/representation (skipped entirely for a fixed output)
- if(!fixedRet) {
- if(!ret.sparse)
- ret.recomputeNonZeros();
- ret.examSparsity();
- }
- }
-
- private static void parallelMatrixMult(MatrixBlock m1, MatrixBlock m2, MatrixBlock ret, int k, // multi-threaded mm via MatrixMultTask instances
- boolean ultraSparse, boolean sparse, boolean tm2, boolean m1Perm){
- // prepare row-upper for special cases of vector-matrix / matrix-matrix
- boolean pm2r = !ultraSparse && !sparse && checkParMatrixMultRightInputRows(m1, m2, k);
- boolean pm2c = !ultraSparse && checkParMatrixMultRightInputCols(m1, m2, k, pm2r);
- int num = pm2r ? m2.rlen : pm2c ? m2.clen : m1.rlen;
-
- // core multi-threaded matrix mult computation
- // (partitioned over rows of m2 if pm2r, columns of m2 if pm2c, otherwise rows of m1)
- try {
- ExecutorService pool = CommonThreadPool.get(k);
- ArrayList tasks = new ArrayList<>();
- ArrayList blklens = UtilFunctions.getBalancedBlockSizesDefault(num, k, (pm2r || pm2c));
- for(int i = 0, lb = 0; i < blklens.size(); lb += blklens.get(i), i++)
- tasks.add(new MatrixMultTask(m1, m2, ret, tm2, pm2r, pm2c, m1Perm, sparse, lb, lb + blklens.get(i)));
- // execute tasks (NOTE(review): shutdown is skipped if invokeAll throws; consider a finally block)
- List> taskret = pool.invokeAll(tasks);
- pool.shutdown();
- // aggregate partial results (nnz, ret for vector/matrix)
- ret.nonZeros = 0; // reset after execute
- for(Future task : taskret) {
- if(pm2r) // guaranteed single block
- vectAdd((double[]) task.get(), ret.getDenseBlockValues(), 0, 0, ret.rlen * ret.clen);
- else
- ret.nonZeros += (Long) task.get();
- }
- if(pm2r) // partial results were added as values, so nnz has to be counted
- ret.recomputeNonZeros();
- }
- catch(Exception ex) {
- throw new DMLRuntimeException(ex);
- }
-
- // post-processing (nnz maintained in parallel)
- ret.examSparsity();
- }
-
- public static MatrixBlock emptyMatrixMult(MatrixBlock m1, MatrixBlock m2, MatrixBlock ret){ // result of a mm with an empty input
- final int rl = m1.rlen;
- final int cl = m2.clen;
-
- if(ret == null) // no output given: create a new sparse-format block of m1.rlen x m2.clen
- return new MatrixBlock(rl, cl, true);
- else { // reuse the given block: reset to sparse format with zero non-zeros
- ret.reset(rl, cl, true);
- ret.setNonZeros(0);
- ret.cleanupBlock(true, true);
- return ret;
- }
- }
-
- /**
- * Performs a single-threaded matrix multiplication chain operation of type t(X)%*%(X%*%v) or t(X)%*%(w*(X%*%v)).
- *
- * All variants use an IKJ access pattern, and internally use dense output. After the
- * actual computation, we recompute nnz and check for sparse/dense representation.
- *
- * @param mX X matrix
- * @param mV v matrix
- * @param mW w matrix (may be null)
- * @param ret result matrix (modified in place)
- * @param ct chain type
- */
- public static void matrixMultChain(MatrixBlock mX, MatrixBlock mV, MatrixBlock mW, MatrixBlock ret, ChainType ct) {
- //check inputs / outputs (after that mV and mW guaranteed to be dense)
- if( mX.isEmptyBlock(false) || (mV.isEmptyBlock(false) && ct!=ChainType.XtXvy)
- || (mW !=null && mW.isEmptyBlock(false)) ) {
- ret.examSparsity(); //turn empty dense into sparse
- return;
- }
-
- //Timing time = new Timing(true);
-
- //pre-processing: output allocation (always dense)
- ret.sparse = false;
- ret.allocateDenseBlock();
-
- //core matrix mult chain computation over all rows of X
- if( mX.sparse )
- matrixMultChainSparse(mX, mV, mW, ret, ct, 0, mX.rlen);
- else
- matrixMultChainDense(mX, mV, mW, ret, ct, 0, mX.rlen);
-
- //post-processing
- ret.recomputeNonZeros();
- ret.examSparsity();
-
- //System.out.println("MMChain "+ct.toString()+" ("+mX.isInSparseFormat()+","+mX.getNumRows()+","+mX.getNumColumns()+","+mX.getNonZeros()+")x" +
- // "("+mV.isInSparseFormat()+","+mV.getNumRows()+","+mV.getNumColumns()+","+mV.getNonZeros()+") in "+time.stop());
- }
-
- /**
- * Performs a parallel matrix multiplication chain operation of type t(X)%*%(X%*%v) or t(X)%*%(w*(X%*%v)).
- * The parameter k (k>=1) determines the max parallelism k' with k'=min(k, vcores, m1.rlen).
- *
- * NOTE: This multi-threaded mmchain operation has additional memory requirements of k*ncol(X)*8bytes
- * for partial aggregation. Current max memory: 256KB; otherwise the operation falls back to sequential execution.
- *
- * @param mX X matrix
- * @param mV v matrix
- * @param mW w matrix
- * @param ret result matrix
- * @param ct chain type
- * @param k maximum parallelism
- */
- public static void matrixMultChain(MatrixBlock mX, MatrixBlock mV, MatrixBlock mW, MatrixBlock ret, ChainType ct, int k) {
- //check inputs / outputs (after that mV and mW guaranteed to be dense)
- if( mX.isEmptyBlock(false) || (mV.isEmptyBlock(false) && ct!=ChainType.XtXvy)
- || (mW !=null && mW.isEmptyBlock(false)) ) {
- ret.examSparsity(); //turn empty dense into sparse
- return;
- }
-
- //check temporary memory and too small workload for multi-threading
- if( !satisfiesMultiThreadingConstraints(mX, true, true, mX.sparse?2:4, k) ) {
- matrixMultChain(mX, mV, mW, ret, ct);
- return;
- }
-
- //Timing time = new Timing(true);
-
- //pre-processing (no need to check isThreadSafe)
- ret.sparse = false;
- ret.allocateDenseBlock();
-
- //core matrix mult chain computation
- //(currently: always parallelization over number of rows)
- try {
- ExecutorService pool = CommonThreadPool.get(k);
- ArrayList blklens = UtilFunctions.getBalancedBlockSizesDefault(mX.rlen, k, true);
- ArrayList tasks = new ArrayList<>();
- for( int i=0, lb=0; i> taskret = pool.invokeAll(tasks);
- pool.shutdown();
- //aggregate partial results and error handling
- double[][] a = new double[taskret.size()][];
- for(int i=0; i tasks = new ArrayList<>();
- //load balance via #tasks=2k due to triangular shape
- int blklen = (int)(Math.ceil((double)ret.rlen / (2 * k)));
- for(int i = 0; i < ret.rlen; i += blklen)
- tasks.add(new MatrixMultTransposeTask(m1, ret, leftTranspose, i, Math.min(i+blklen, ret.rlen)));
- for( Future rtask : pool.invokeAll(tasks) )
- rtask.get();
- }
- catch(Exception ex) {
- throw new DMLRuntimeException(ex);
- }
- finally{
- pool.shutdown();
- }
-
- //post-processing
- long nnz = copyUpperToLowerTriangle(ret);
- ret.setNonZeros(nnz);
- ret.examSparsity();
-
- //System.out.println("TSMM k="+k+" ("+m1.isInSparseFormat()+","+m1.getNumRows()+","+m1.getNumColumns()+","+m1.getNonZeros()+","+leftTranspose+") in "+time.stop());
- }
-
- public static void matrixMultPermute( MatrixBlock pm1, MatrixBlock m2, MatrixBlock ret1, MatrixBlock ret2 ) { //sequential permutation mm of pm1 and m2
- //check inputs / outputs (outputs stay untouched for empty inputs)
- if( pm1.isEmptyBlock(false) || m2.isEmptyBlock(false) )
- return;
-
- //Timing time = new Timing(true);
-
- //pre-processing (first output is sparse if m2 is sparse or ret1 was already marked sparse)
- ret1.sparse = (m2.sparse || ret1.sparse);
- if( ret1.sparse )
- ret1.allocateSparseRowsBlock();
- else
- ret1.allocateDenseBlock();
-
- //core permutation mm computation over all rows of pm1
- if( m2.sparse )
- matrixMultPermuteSparse(pm1, m2, ret1, ret2, 0, pm1.rlen);
- else if( ret1.sparse )
- matrixMultPermuteDenseSparse(pm1, m2, ret1, ret2, 0, pm1.rlen);
- else
- matrixMultPermuteDense(pm1, m2, ret1, ret2, 0, pm1.rlen);
-
- //post-processing
- ret1.recomputeNonZeros();
- ret1.examSparsity();
- if( ret2 != null ) { //optional second output
- ret2.recomputeNonZeros();
- ret2.examSparsity();
- }
-
- //System.out.println("PMM Seq ("+pm1.isInSparseFormat()+","+pm1.getNumRows()+","+pm1.getNumColumns()+","+pm1.getNonZeros()+")x" +
- // "("+m2.isInSparseFormat()+","+m2.getNumRows()+","+m2.getNumColumns()+","+m2.getNonZeros()+") in "+time.stop());
- }
-
- public static void matrixMultPermute( MatrixBlock pm1, MatrixBlock m2, MatrixBlock ret1, MatrixBlock ret2, int k) {
- //check inputs / outputs
- if( pm1.isEmptyBlock(false) || m2.isEmptyBlock(false) )
- return;
-
- //check no parallelization benefit (fallback to sequential)
- if (pm1.rlen == 1) {
- matrixMultPermute(pm1, m2, ret1, ret2);
- return;
- }
-
- //Timing time = new Timing(true);
-
- //allocate first output block (second allocated if needed)
- ret1.sparse = false; // no need to check isThreadSafe
- ret1.allocateDenseBlock();
-
- try
- {
- ExecutorService pool = CommonThreadPool.get(k);
- ArrayList tasks = new ArrayList<>();
- int blklen = (int)(Math.ceil((double)pm1.rlen/k));
- for( int i=0; i tasks = new ArrayList<>();
- int blklen = (int)(Math.ceil((double)mX.rlen/k));
- for( int i=0; i> taskret = pool.invokeAll(tasks);
- pool.shutdown();
- //aggregate partial results
- sumScalarResults(taskret, ret);
- }
- catch( Exception e ) {
- throw new DMLRuntimeException(e);
- }
-
- //add correction for sparse wsloss w/o weight
- if( mX.sparse && wt==WeightsType.NONE )
- addMatrixMultWSLossNoWeightCorrection(mU, mV, ret, k);
-
- //System.out.println("MMWSLoss "+wt.toString()+" k="+k+" ("+mX.isInSparseFormat()+","+mX.getNumRows()+","+mX.getNumColumns()+","+mX.getNonZeros()+")x" +
- // "("+mV.isInSparseFormat()+","+mV.getNumRows()+","+mV.getNumColumns()+","+mV.getNonZeros()+") in "+time.stop());
- }
-
- public static void matrixMultWSigmoid(MatrixBlock mW, MatrixBlock mU, MatrixBlock mV, MatrixBlock ret, WSigmoidType wt) { //sequential weighted sigmoid mm
- //check for empty result
- if( mW.isEmptyBlock(false) ) {
- ret.examSparsity(); //turn empty dense into sparse
- return;
- }
-
- //Timing time = new Timing(true);
-
- //pre-processing (output uses the same format as W)
- ret.sparse = mW.sparse;
- ret.allocateBlock();
-
- //core weighted sigmoid mm computation (native kernel only for all-dense, contiguous inputs with vector-shaped W)
- boolean allDense = !mW.sparse && !mU.sparse && !mV.sparse
- && !mU.isEmptyBlock() && !mV.isEmptyBlock();
- if( NativeHelper.isNativeLibraryLoaded() && allDense && (mW.rlen == 1 || mW.clen == 1)
- && !LibMatrixNative.isMatMultMemoryBound(mU.rlen, mU.clen, mV.rlen)
- && mW.getDenseBlock().isContiguous() && mU.getDenseBlock().isContiguous() && mV.getDenseBlock().isContiguous() )
- matrixMultWSigmoidDenseNative(mW, mU, mV, ret, wt);
- else if( allDense )
- matrixMultWSigmoidDense(mW, mU, mV, ret, wt, 0, mW.rlen);
- else if( mW.sparse && !mU.sparse && !mV.sparse && !mU.isEmptyBlock() && !mV.isEmptyBlock())
- matrixMultWSigmoidSparseDense(mW, mU, mV, ret, wt, 0, mW.rlen);
- else
- matrixMultWSigmoidGeneric(mW, mU, mV, ret, wt, 0, mW.rlen);
-
- //post-processing
- ret.recomputeNonZeros();
- ret.examSparsity();
-
- //System.out.println("MMWSig "+wt.toString()+" ("+mW.isInSparseFormat()+","+mW.getNumRows()+","+mW.getNumColumns()+","+mW.getNonZeros()+")x" +
- // "("+mV.isInSparseFormat()+","+mV.getNumRows()+","+mV.getNumColumns()+","+mV.getNonZeros()+") in "+time.stop());
- }
-
- public static void matrixMultWSigmoid(MatrixBlock mW, MatrixBlock mU, MatrixBlock mV, MatrixBlock ret, WSigmoidType wt, int k) { //multi-threaded weighted sigmoid mm
- //check for empty result
- if( mW.isEmptyBlock(false) ) {
- ret.examSparsity(); //turn empty dense into sparse
- return;
- }
-
- //check no parallelization benefit (single-row W or non-thread-safe output format: fallback to sequential)
- if (mW.rlen == 1 || !MatrixBlock.isThreadSafe(mW.sparse)) {
- matrixMultWSigmoid(mW, mU, mV, ret, wt);
- return;
- }
-
- //Timing time = new Timing(true);
-
- //pre-processing (output uses the same format as W)
- ret.sparse = mW.sparse;
- ret.allocateBlock();
-
- try
- {
- ExecutorService pool = CommonThreadPool.get(k);
- ArrayList tasks = new ArrayList<>();
- int blklen = (int)(Math.ceil((double)mW.rlen/k)); //NOTE(review): the loop header on the next line looks truncated in this patch text; verify against the repository file
- for( int i=0; i> taskret = pool.invokeAll(tasks);
- pool.shutdown();
- //aggregate partial nnz and check for errors
- ret.nonZeros = 0; //reset after execute
- for( Future task : taskret )
- ret.nonZeros += task.get();
- }
- catch (Exception e) {
- throw new DMLRuntimeException(e);
- }
-
- //post-processing (nnz maintained in parallel)
- ret.examSparsity();
-
- //System.out.println("MMWSig "+wt.toString()+" k="+k+" ("+mW.isInSparseFormat()+","+mW.getNumRows()+","+mW.getNumColumns()+","+mW.getNonZeros()+")x" +
- // "("+mV.isInSparseFormat()+","+mV.getNumRows()+","+mV.getNumColumns()+","+mV.getNonZeros()+") in "+time.stop() + ".");
- }
-
- /**
- * NOTE: This operation has limited NaN support, which is acceptable because all our sparse-safe operations
- * have only limited NaN support. If this is not intended behavior, please disable the rewrite. In detail,
- * this operator will produce for W/(U%*%t(V)) a zero intermediate for each zero in W (even if UVij is zero
- * which would give 0/0=NaN) but INF/-INF for non-zero entries in V where the corresponding cell in (Y%*%X)
- * is zero.
- *
- * @param mW matrix W
- * @param mU matrix U
- * @param mV matrix V
- * @param mX matrix X (may be null)
- * @param ret result matrix
- * @param wt weighted divide matrix multiplication type
- */
- public static void matrixMultWDivMM(MatrixBlock mW, MatrixBlock mU, MatrixBlock mV, MatrixBlock mX, MatrixBlock ret, WDivMMType wt) {
- //check for empty result
- if( mW.isEmptyBlock(false)
- || (wt.isLeft() && mU.isEmptyBlock(false))
- || (wt.isRight() && mV.isEmptyBlock(false))
- || (wt.isBasic() && mW.isEmptyBlock(false))) { //NOTE(review): this last clause is already covered by the first one
- ret.examSparsity(); //turn empty dense into sparse
- return;
- }
-
- //Timing time = new Timing(true);
-
- //pre-processing (output follows the format of W for basic types, dense otherwise)
- ret.sparse = wt.isBasic()?mW.sparse:false;
- ret.allocateBlock();
-
- //core weighted div mm computation over the full row and column range of W
- boolean scalarX = wt.hasScalar();
- if( !mW.sparse && !mU.sparse && !mV.sparse && (mX==null || !mX.sparse || scalarX) && !mU.isEmptyBlock() && !mV.isEmptyBlock() )
- matrixMultWDivMMDense(mW, mU, mV, mX, ret, wt, 0, mW.rlen, 0, mW.clen);
- else if( mW.sparse && !mU.sparse && !mV.sparse && (mX==null || mX.sparse || scalarX) && !mU.isEmptyBlock() && !mV.isEmptyBlock())
- matrixMultWDivMMSparseDense(mW, mU, mV, mX, ret, wt, 0, mW.rlen, 0, mW.clen);
- else
- matrixMultWDivMMGeneric(mW, mU, mV, mX, ret, wt, 0, mW.rlen, 0, mW.clen);
-
- //post-processing
- ret.recomputeNonZeros();
- ret.examSparsity();
-
- //System.out.println("MMWDiv "+wt.toString()+" ("+mW.isInSparseFormat()+","+mW.getNumRows()+","+mW.getNumColumns()+","+mW.getNonZeros()+")x" +
- // "("+mV.isInSparseFormat()+","+mV.getNumRows()+","+mV.getNumColumns()+","+mV.getNonZeros()+") in "+time.stop());
- }
-
- /**
- * NOTE: This operation has limited NaN support, which is acceptable because all our sparse-safe operations
- * have only limited NaN support. If this is not intended behavior, please disable the rewrite. In detail,
- * this operator will produce for W/(U%*%t(V)) a zero intermediate for each zero in W (even if UVij is zero
- * which would give 0/0=NaN) but INF/-INF for non-zero entries in V where the corresponding cell in (Y%*%X)
- * is zero.
- *
- * @param mW matrix W
- * @param mU matrix U
- * @param mV matrix V
- * @param mX matrix X
- * @param ret result matrix
- * @param wt weighted divide matrix multiplication type
- * @param k maximum parallelism
- */
- public static void matrixMultWDivMM(MatrixBlock mW, MatrixBlock mU, MatrixBlock mV, MatrixBlock mX, MatrixBlock ret, WDivMMType wt, int k) {
- //check for empty result
- if( mW.isEmptyBlock(false)
- || (wt.isLeft() && mU.isEmptyBlock(false))
- || (wt.isRight() && mV.isEmptyBlock(false))
- || (wt.isBasic() && mW.isEmptyBlock(false))) { //NOTE(review): this last clause is already covered by the first one
- ret.examSparsity(); //turn empty dense into sparse
- return;
- }
-
- //Timing time = new Timing(true);
-
- //pre-processing (output follows the format of W for basic types, dense otherwise)
- ret.sparse = wt.isBasic()?mW.sparse:false;
- ret.allocateBlock();
-
- if (!ret.isThreadSafe()){ //non-thread-safe output: fallback to sequential
- matrixMultWDivMM(mW, mU, mV, mX, ret, wt);
- return;
- }
-
- try
- {
- ExecutorService pool = CommonThreadPool.get(k);
- ArrayList tasks = new ArrayList<>();
- //create tasks (for wdivmm-left, parallelization over columns;
- //for wdivmm-right, parallelization over rows; both ensure disjoint results)
- if( wt.isLeft() ) {
- int blklen = (int)(Math.ceil((double)mW.clen/k)); //NOTE(review): the loop header on the next line looks truncated in this patch text; verify against the repository file
- for( int j=0; j> taskret = pool.invokeAll(tasks);
- pool.shutdown();
- //aggregate partial nnz and check for errors
- ret.nonZeros = 0; //reset after execute
- for( Future task : taskret )
- ret.nonZeros += task.get();
- }
- catch (Exception e) {
- throw new DMLRuntimeException(e);
- }
-
- //post-processing
- ret.examSparsity();
-
- //System.out.println("MMWDiv "+wt.toString()+" k="+k+" ("+mW.isInSparseFormat()+","+mW.getNumRows()+","+mW.getNumColumns()+","+mW.getNonZeros()+")x" +
- // "("+mV.isInSparseFormat()+","+mV.getNumRows()+","+mV.getNumColumns()+","+mV.getNonZeros()+") in "+time.stop());
- }
-
- public static void matrixMultWCeMM(MatrixBlock mW, MatrixBlock mU, MatrixBlock mV, double eps, MatrixBlock ret, WCeMMType wt) { //sequential weighted cross entropy mm
- //check for empty result
- if( mW.isEmptyBlock(false) ) {
- ret.examSparsity(); //turn empty dense into sparse
- return;
- }
-
- //Timing time = new Timing(true);
-
- //pre-processing (output is always dense)
- ret.sparse = false;
- ret.allocateDenseBlock();
-
- //core weighted cross entropy mm computation (kernel chosen by the formats of W, U, and V)
- if( !mW.sparse && !mU.sparse && !mV.sparse && !mU.isEmptyBlock() && !mV.isEmptyBlock() )
- matrixMultWCeMMDense(mW, mU, mV, eps, ret, wt, 0, mW.rlen);
- else if( mW.sparse && !mU.sparse && !mV.sparse && !mU.isEmptyBlock() && !mV.isEmptyBlock())
- matrixMultWCeMMSparseDense(mW, mU, mV, eps, ret, wt, 0, mW.rlen);
- else
- matrixMultWCeMMGeneric(mW, mU, mV, eps, ret, wt, 0, mW.rlen);
-
- //System.out.println("MMWCe "+wt.toString()+" ("+mW.isInSparseFormat()+","+mW.getNumRows()+","+mW.getNumColumns()+","+mW.getNonZeros()+")x" +
- // "("+mV.isInSparseFormat()+","+mV.getNumRows()+","+mV.getNumColumns()+","+mV.getNonZeros()+") in "+time.stop());
- }
-
- public static void matrixMultWCeMM(MatrixBlock mW, MatrixBlock mU, MatrixBlock mV, double eps, MatrixBlock ret, WCeMMType wt, int k) { //multi-threaded weighted cross entropy mm
- //check for empty result
- if( mW.isEmptyBlock(false) ) {
- ret.examSparsity(); //turn empty dense into sparse
- return;
- }
-
- //Timing time = new Timing(true);
-
- //pre-processing (no need to check isThreadSafe)
- ret.sparse = false;
- ret.allocateDenseBlock();
-
- try
- {
- ExecutorService pool = CommonThreadPool.get(k);
- ArrayList tasks = new ArrayList<>();
- int blklen = (int)(Math.ceil((double)mW.rlen/k)); //NOTE(review): the loop header on the next line looks truncated in this patch text; verify against the repository file
- for( int i=0; i> taskret = pool.invokeAll(tasks);
- pool.shutdown();
- //aggregate partial results (via sumScalarResults into ret)
- sumScalarResults(taskret, ret);
- }
- catch( Exception e ) {
- throw new DMLRuntimeException(e);
- }
-
- //System.out.println("MMWCe "+wt.toString()+" k="+k+" ("+mW.isInSparseFormat()+","+mW.getNumRows()+","+mW.getNumColumns()+","+mW.getNonZeros()+")x" +
- // "("+mV.isInSparseFormat()+","+mV.getNumRows()+","+mV.getNumColumns()+","+mV.getNonZeros()+") in "+time.stop());
- }
-
- public static void matrixMultWuMM(MatrixBlock mW, MatrixBlock mU, MatrixBlock mV, MatrixBlock ret, WUMMType wt, ValueFunction fn) { //sequential weighted unary mm
- //check for empty result
- if( mW.isEmptyBlock(false) ) {
- ret.examSparsity(); //turn empty dense into sparse
- return;
- }
-
- //Timing time = new Timing(true);
-
- //pre-processing (output uses the same format as W)
- ret.sparse = mW.sparse;
- ret.allocateBlock();
-
- //core weighted unary mm computation (kernel chosen by the formats of W, U, and V)
- if( !mW.sparse && !mU.sparse && !mV.sparse && !mU.isEmptyBlock() && !mV.isEmptyBlock() )
- matrixMultWuMMDense(mW, mU, mV, ret, wt, fn, 0, mW.rlen);
- else if( mW.sparse && !mU.sparse && !mV.sparse && !mU.isEmptyBlock() && !mV.isEmptyBlock())
- matrixMultWuMMSparseDense(mW, mU, mV, ret, wt, fn, 0, mW.rlen);
- else
- matrixMultWuMMGeneric(mW, mU, mV, ret, wt, fn, 0, mW.rlen);
-
- //post-processing
- ret.recomputeNonZeros();
- ret.examSparsity();
-
- //System.out.println("MMWu "+wt.toString()+" ("+mW.isInSparseFormat()+","+mW.getNumRows()+","+mW.getNumColumns()+","+mW.getNonZeros()+")x" +
- // "("+mV.isInSparseFormat()+","+mV.getNumRows()+","+mV.getNumColumns()+","+mV.getNonZeros()+") in "+time.stop());
- }
-
- public static void matrixMultWuMM(MatrixBlock mW, MatrixBlock mU, MatrixBlock mV, MatrixBlock ret, WUMMType wt, ValueFunction fn, int k) { //multi-threaded weighted unary mm
- //check for empty result
- if( mW.isEmptyBlock(false) ) {
- ret.examSparsity(); //turn empty dense into sparse
- return;
- }
-
- //check no parallelization benefit (single-row W or non-thread-safe output format: fallback to sequential)
- if (mW.rlen == 1 || !MatrixBlock.isThreadSafe(mW.sparse)) {
- matrixMultWuMM(mW, mU, mV, ret, wt, fn);
- return;
- }
-
- //Timing time = new Timing(true);
-
- //pre-processing (output uses the same format as W)
- ret.sparse = mW.sparse;
- ret.allocateBlock();
-
- try
- {
- ExecutorService pool = CommonThreadPool.get(k);
- ArrayList tasks = new ArrayList<>();
- int blklen = (int)(Math.ceil((double)mW.rlen/k)); //NOTE(review): the loop header on the next line looks truncated in this patch text; verify against the repository file
- for( int i=0; i> taskret = pool.invokeAll(tasks);
- pool.shutdown();
- //aggregate partial nnz and check for errors
- ret.nonZeros = 0; //reset after execute
- for( Future task : taskret )
- ret.nonZeros += task.get();
- }
- catch (Exception e) {
- throw new DMLRuntimeException(e);
- }
-
- //post-processing (nnz maintained in parallel)
- ret.examSparsity();
-
- //System.out.println("MMWu "+wt.toString()+" k="+k+" ("+mW.isInSparseFormat()+","+mW.getNumRows()+","+mW.getNumColumns()+","+mW.getNonZeros()+")x" +
- // "("+mV.isInSparseFormat()+","+mV.getNumRows()+","+mV.getNumColumns()+","+mV.getNonZeros()+") in "+time.stop() + ".");
- }
-
- //////////////////////////////////////////
- // optimized matrix mult implementation //
- //////////////////////////////////////////
-
- private static void matrixMultDenseDense(MatrixBlock m1, MatrixBlock m2, MatrixBlock ret, boolean tm2, boolean pm2, int rl, int ru, int cl, int cu) { //dense x dense: kernel chosen by shape (m, n, cd) and the tm2/pm2 flags
- DenseBlock a = m1.getDenseBlock();
- DenseBlock b = m2.getDenseBlock();
- DenseBlock c = ret.getDenseBlock();
- final int m = m1.rlen; //rows of m1
- final int n = m2.clen; //columns of m2
- final int cd = m1.clen; //common dimension
-
- if( LOW_LEVEL_OPTIMIZATION ) {
- if( m==1 && n==1 ) { //DOT PRODUCT
- double[] avals = a.valuesAt(0);
- double[] bvals = b.valuesAt(0);
- c.set(0, 0, dotProduct(avals, bvals, cd));
- }
- else if( n>1 && cd == 1 ) { //OUTER PRODUCT
- double[] avals = a.valuesAt(0);
- double[] bvals = b.valuesAt(0);
- for( int i=rl; i < ru; i++) {
- double[] cvals = c.values(i);
- int cix = c.pos(i);
- if( avals[i] == 1 ) //multiplier 1: plain copy of b
- System.arraycopy(bvals, 0, cvals, cix, n);
- else if( avals[i] != 0 ) //general multiplier: scaled copy of b
- vectMultiplyWrite(avals[i], bvals, cvals, 0, cix, n);
- else //multiplier 0: zero out the output row
- Arrays.fill(cvals, cix, cix+n, 0);
- }
- }
- else if( n==1 && cd == 1 ) { //VECTOR-SCALAR
- double[] avals = a.valuesAt(0);
- double[] cvals = c.valuesAt(0);
- vectMultiplyWrite(b.get(0,0), avals, cvals, rl, rl, ru-rl);
- }
- else if( n==1 && cd<=2*1024 ) { //MATRIX-VECTOR (short rhs)
- matrixMultDenseDenseMVShortRHS(a, b, c, cd, rl, ru);
- }
- else if( n==1 ) { //MATRIX-VECTOR (tall rhs)
- matrixMultDenseDenseMVTallRHS(a, b, c, cd, rl, ru);
- }
- else if( pm2 && m==1 ) { //VECTOR-MATRIX
- matrixMultDenseDenseVM(a, b, c, n, cd, rl, ru);
- }
- else if( pm2 && m<=16 ) { //MATRIX-MATRIX (short lhs)
- matrixMultDenseDenseMMShortLHS(a, b, c, m, n, cd, rl, ru);
- }
- else if( tm2 ) { //MATRIX-MATRIX (skinny rhs)
- matrixMultDenseDenseMMSkinnyRHS(a, b, c, m2.rlen, cd, rl, ru);
- }
- else { //MATRIX-MATRIX (only this kernel receives the column range cl/cu)
- matrixMultDenseDenseMM(a, b, c, n, cd, rl, ru, cl, cu);
- }
- }
- else { //plain i-k-j loop nest over rows [rl,ru) and all n columns, skipping zeros of a
- for( int i = rl; i < ru; i++) {
- double[] avals = a.values(i);
- double[] cvals = c.values(i);
- int aix = a.pos(i), cix = c.pos(i);
- for( int k = 0; k < cd; k++) {
- double val = avals[aix + k];
- if( val != 0 ) {
- double[] bvals = b.values(k);
- int bix = b.pos(k);
- for( int j = 0; j < n; j++)
- cvals[cix+j] += val * bvals[bix+j];
- }
- }
- }
- }
- }
-
- private static void matrixMultDenseDenseMVShortRHS(DenseBlock a, DenseBlock b, DenseBlock c, int cd, int rl, int ru) {
- double[] bvals = b.valuesAt(0);
- double[] cvals = c.valuesAt(0);
- for( int i=rl; i < ru; i++ )
- cvals[i] = dotProduct(a.values(i), bvals, a.pos(i), 0, cd);
- }
-
- private static void matrixMultDenseDenseMVTallRHS(DenseBlock a, DenseBlock b, DenseBlock c, int cd, int rl, int ru) {
- final int blocksizeI = 32;
- final int blocksizeK = 2*1024; //16KB vector blocks (L1)
- double[] bvals = b.valuesAt(0);
- double[] cvals = c.valuesAt(0);
- for( int bi=rl; bi n && cd > 64 && n < 64
- //however, explicit flag required since dimension change m2
- for( int i=rl; i < ru; i++ ) {
- double[] avals = a.values(i), cvals = c.values(i);
- int aix = a.pos(i), cix = c.pos(i);
- for( int j=0; j=0) ? rlix : alen;
-
- if( b.isContiguous() ) {
- double[] bvals = b.valuesAt(0);
- for( int k=rlix; k=0) ? apos+k1 : apos+alen;
- int k2 = (ru==cd) ? alen : a.posFIndexGTE(i, ru);
- k2 = (k2>=0) ? apos+k2 : apos+alen;
-
- //note: guard k1 (and thus also k2) against overrun nnz, and guard
- //contiguous check for k2-1 against underrun of start pos for k1==k2.
- if( k1=0) ? rlix : alen;
-
- for( int k=rlix; k threshold
- for( int i=rl; i n / 128;
-
- //perform vector-matrix multiply w/ dense or sparse output
- if( ldense ) { //init dense tmp row
- tmp = (tmp == null) ? new double[n] : tmp;
- Arrays.fill(tmp, 0);
- }
- for( int k=apos; k 0 ) {
- c.allocate(i, lnnz); //allocate once
- double[] bvals = m2.getDenseBlock().values(aix);
- for( int j=0, bix=m2.getDenseBlock().pos(aix); j 0 )
- if(c.get(i) instanceof SparseRowScalar){
- SparseRowScalar sv = (SparseRowScalar) c.get(i);
- c.set(i, new SparseRowScalar(sv.getIndex(), sv.getValue() * avals[apos]), false);
- }
- else
- vectMultiplyInPlace(avals[apos], c.values(i), c.pos(i), c.size(i));
-
- }
- else { //GENERAL CASE
- for( int k=apos; k=0) ? apos+rlix : apos+alen;
- int len = apos + alen;
- for(int i = rlix; i < len && aix[i] < ru; i++)
- vectMultiplyAdd(avals[i], avals, c.values(aix[i]), aix, i, c.pos(aix[i]), len - i);
- }
- }
- }
- else
- {
- for( int r=0; r=0) ? apos+rlix : apos+alen;
- for(int i = rlix; i < apos+alen && aix[i]=0) ? apos+rlix : apos+alen;
- for(int i = rlix; i < apos+alen && aix[i]=0) ? apos+rlix : apos+alen;
- for(int i = rlix; i < apos+alen && aix[i] 0 ) { //selected row
- int bpos = (pos-1) % blen;
- int blk = (pos-1) / blen;
- //allocate and switch to second output block
- //(never happens in cp, correct for multi-threaded usage)
- if( lastblk!=-1 && lastblk 0 ) { //selected row
- double[] bvals = b.values(i);
- int bix = b.pos(i);
- int bpos = (pos-1) % blen;
- int blk = (pos-1) / blen;
- //allocate and switch to second output block
- //(never happens in cp, correct for multi-threaded usage)
- if( lastblk!=-1 && lastblk 0 ) { //selected row
- int bpos = (pos-1) % blen;
- int blk = (pos-1) / blen;
- //allocate and switch to second output block
- //(never happens in cp, correct for multi-threaded usage)
- if( lastblk!=-1 && lastblk sum(X^2)-sum(2*X*(U%*%t(V))))+sum((t(U)%*%U)*(t(V)%*%V)), where each
- //parallel task computes sum(X^2)-sum(2*X*(U%*%t(V)))) and the last term
- //sum((t(U)%*%U)*(t(V)%*%V)) is computed once via two tsmm operations.
-
- final int blocksizeIJ = (int) (8L*mX.rlen*mX.clen/mX.nonZeros);
- int[] curk = new int[blocksizeIJ];
-
- for( int bi=rl; bi sum(X^2)-sum(2*X*(U%*%t(V))))+sum((t(U)%*%U)*(t(V)%*%V)), where each
- //parallel task computes sum(X^2)-sum(2*X*(U%*%t(V)))) and the last term
- //sum((t(U)%*%U)*(t(V)%*%V)) is computed once via two tsmm operations.
-
- if( mX.sparse ) { //SPARSE
- SparseBlock x = mX.sparseBlock;
- for( int i=rl; i=0) ? k : mW.clen;
- }
- //prepare alignment info if necessary
- if( four && !scalar )
- for( int i=bi; i=0) ? wpos+k : wpos+wlen;
- for( ; k 1) { //X%*%t(X) SPARSE MATRIX
- //directly via LibMatrixReorg in order to prevent sparsity change
- MatrixBlock tmpBlock = new MatrixBlock(clen, rlen, m1.sparse);
- LibMatrixReorg.reorg(m1, tmpBlock, new ReorgOperator(SwapIndex.getSwapIndexFnObject()));
- ret = tmpBlock;
- }
- else if( leftTranspose && m1.sparse && m1.sparseBlock instanceof SparseBlockCSR ) {
- //for a special case of CSR inputs where all non-empty rows are dense, we can
- //create a shallow copy of the values arrays to a "dense" block and perform
- //tsmm with the existing dense block operations w/o unnecessary gather/scatter
- SparseBlockCSR sblock = (SparseBlockCSR)m1.sparseBlock;
- boolean convertDense = (par ?
- IntStream.range(0, rlen).parallel() : IntStream.range(0, rlen))
- .allMatch(i -> sblock.isEmpty(i) || sblock.size(i)==clen );
- if( convertDense ) {
- int rows = (int) sblock.size() / clen;
- MatrixBlock tmpBlock = new MatrixBlock(rows, clen, false);
- tmpBlock.denseBlock = DenseBlockFactory
- .createDenseBlock(sblock.values(), rows, clen);
- tmpBlock.setNonZeros(m1.nonZeros);
- ret = tmpBlock;
- }
- }
-
- return ret;
- }
-
- /**
-  * Decides whether the right-hand side should be transposed up front: only
-  * for dense-dense inputs with low-level optimizations enabled and a skinny
-  * right-hand side (cache-size check included).
-  *
-  * @param m1 left-hand-side matrix
-  * @param m2 right-hand-side matrix
-  * @return true if the right-hand side qualifies for the transposed kernel
-  */
- private static boolean checkPrepMatrixMultRightInput( MatrixBlock m1, MatrixBlock m2 ) {
-     if( !LOW_LEVEL_OPTIMIZATION || m1.sparse || m2.sparse )
-         return false;
-     return isSkinnyRightHandSide(m1.rlen, m1.clen, m2.rlen, m2.clen, true);
- }
-
- /**
-  * Tests whether the right-hand side counts as "skinny": it has more than
-  * one but fewer than 64 columns, and both the left-hand side and the
-  * right-hand side have more rows than the right-hand side has columns.
-  * Optionally also requires 8 * m2rlen * m2clen to be below L2_CACHESIZE.
-  * Public so that codegen can apply the same rule.
-  *
-  * @param m1rlen number of rows of the left-hand side
-  * @param m1clen number of columns of the left-hand side (not used by the check)
-  * @param m2rlen number of rows of the right-hand side
-  * @param m2clen number of columns of the right-hand side
-  * @param inclCacheSize if true, additionally apply the cache-size bound
-  * @return true if the right-hand side is skinny
-  */
- public static boolean isSkinnyRightHandSide(long m1rlen, long m1clen, long m2rlen, long m2clen, boolean inclCacheSize) {
-     final boolean skinnyShape = m1rlen > m2clen && m2rlen > m2clen
-         && m2clen > 1 && m2clen < 64;
-     if( !skinnyShape )
-         return false;
-     if( !inclCacheSize )
-         return true;
-     return 8*m2rlen*m2clen < L2_CACHESIZE;
- }
-
- private static boolean checkParMatrixMultRightInputRows( MatrixBlock m1, MatrixBlock m2, int k ) {
- //parallelize over rows in rhs matrix if number of rows in lhs/output is very small
- double jvmMem = InfrastructureAnalyzer.getLocalMaxMemory();
- return (m1.rlen==1 && LOW_LEVEL_OPTIMIZATION && m2.clen>1 && !(m1.isUltraSparse()||m2.isUltraSparse()))
- || (m1.rlen<=16 && LOW_LEVEL_OPTIMIZATION && m2.clen>1 && m2.rlen > m1.rlen
- && ( !m1.isUltraSparse() && !(m1.sparse & m2.sparse) ) //dense-dense / sparse-dense / dense-sparse
- && (long)k * 8 * m1.rlen * m2.clen < Math.max(MEM_OVERHEAD_THRESHOLD,0.01*jvmMem) );
- }
-
- /**
-  * Decides whether to parallelize over the columns of the right-hand side:
-  * only for dense-dense inputs with low-level optimizations enabled, a
-  * right-hand side with more than k*1024 columns, a left-hand side with
-  * fewer than k*32 rows, no row-wise partitioning of the right-hand side,
-  * and a left-hand side of less than 256KB.
-  *
-  * @param m1 left-hand-side matrix
-  * @param m2 right-hand-side matrix
-  * @param k degree of parallelism
-  * @param pm2r true if the right-hand side is already partitioned over rows
-  * @return true if the right-hand-side columns should be partitioned across tasks
-  */
- private static boolean checkParMatrixMultRightInputCols( MatrixBlock m1, MatrixBlock m2, int k, boolean pm2r ) {
-     //note: the lhs size is computed in long arithmetic because rlen and clen
-     //are ints and 8*rlen*clen can wrap around for wide left-hand sides, which
-     //would let a large lhs wrongly pass the cache-size test
-     return (LOW_LEVEL_OPTIMIZATION && !m1.sparse && !m2.sparse
-         && m2.clen > k * 1024 && m1.rlen < k * 32 && !pm2r
-         && 8L*m1.rlen*m1.clen < 256*1024 ); //lhs fits in L2 cache
- }
-
- /**
-  * Convenience overload that applies only the memory check (no FLOP check).
-  *
-  * @param m1 input matrix
-  * @param k degree of parallelism
-  * @return true if a multi-threaded operation should be used
-  */
- public static boolean satisfiesMultiThreadingConstraints(MatrixBlock m1, int k) {
-     final boolean checkMem = true;
-     final boolean checkFLOPs = false;
-     final long unusedFPfactor = -1;
-     return satisfiesMultiThreadingConstraints(m1, checkMem, checkFLOPs, unusedFPfactor, k);
- }
-
- /**
-  * Checks whether a single-input operation should run multi-threaded.
-  * Requires k > 1 and enabled low-level optimizations, and optionally
-  * (a) that k partial results of m1.clen doubles stay below the memory
-  * budget and (b) that the estimated number of floating point operations,
-  * FPfactor * rows * columns, exceeds the minimum-FLOP threshold (a different
-  * threshold applies when k equals the local parallelism).
-  *
-  * @param m1 input matrix
-  * @param checkMem if true, apply the memory constraint
-  * @param checkFLOPs if true, apply the minimum-FLOP constraint
-  * @param FPfactor number of floating point operations per cell
-  * @param k degree of parallelism
-  * @return true if all requested constraints are satisfied
-  */
- public static boolean satisfiesMultiThreadingConstraints(MatrixBlock m1, boolean checkMem, boolean checkFLOPs, long FPfactor, int k) {
-     boolean sharedTP = (InfrastructureAnalyzer.getLocalParallelism() == k);
-     double jvmMem = InfrastructureAnalyzer.getLocalMaxMemory();
-     return k > 1 && LOW_LEVEL_OPTIMIZATION
-         && (!checkMem || 8L * m1.clen * k < Math.max(MEM_OVERHEAD_THRESHOLD,0.01*jvmMem))
-         //note: cast to double to avoid long overflows on very large (e.g.,
-         //ultra-sparse) matrices, consistent with the two-matrix overload
-         && (!checkFLOPs || (double)FPfactor * m1.rlen * m1.clen >
-             (sharedTP ? PAR_MINFLOP_THRESHOLD2 : PAR_MINFLOP_THRESHOLD1));
- }
-
- /**
-  * Checks whether a two-input operation should run multi-threaded.
-  * Requires k > 1 and enabled low-level optimizations, and optionally a
-  * memory bound on k partial results of m2.clen doubles as well as a
-  * minimum number of floating point operations.
-  *
-  * @param m1 left-hand-side matrix
-  * @param m2 right-hand-side matrix
-  * @param checkMem if true, apply the memory constraint
-  * @param checkFLOPs if true, apply the minimum-FLOP constraint
-  * @param FPfactor number of floating point operations per cell
-  * @param k degree of parallelism
-  * @return true if all requested constraints are satisfied
-  */
- public static boolean satisfiesMultiThreadingConstraints(MatrixBlock m1, MatrixBlock m2, boolean checkMem, boolean checkFLOPs, long FPfactor, int k) {
-     final boolean sharedTP = (InfrastructureAnalyzer.getLocalParallelism() == k);
-     final double jvmMem = InfrastructureAnalyzer.getLocalMaxMemory();
-
-     if( k <= 1 || !LOW_LEVEL_OPTIMIZATION )
-         return false;
-     if( checkMem && !(8L * m2.clen * k < Math.max(MEM_OVERHEAD_THRESHOLD, 0.01*jvmMem)) )
-         return false;
-     if( !checkFLOPs )
-         return true;
-     //note: computed in double to avoid long overflows on ultra-sparse matrices,
-     //because the FLOP estimate is based on the number of cells, not non-zeros
-     final double flops = (double)FPfactor * m1.rlen * m1.clen * m2.clen;
-     return flops > (sharedTP ? PAR_MINFLOP_THRESHOLD2 : PAR_MINFLOP_THRESHOLD1);
- }
-
- private static boolean satisfiesMultiThreadingConstraintsTSMM(MatrixBlock m1, boolean leftTranspose, long FPfactor, int k) {
- boolean sharedTP = (InfrastructureAnalyzer.getLocalParallelism() == k);
- double threshold = sharedTP ? PAR_MINFLOP_THRESHOLD2 : PAR_MINFLOP_THRESHOLD1;
- return k > 1 && LOW_LEVEL_OPTIMIZATION && (leftTranspose?m1.clen:m1.rlen)!=1
- && ((leftTranspose && FPfactor * m1.rlen * m1.clen * m1.clen > threshold)
- ||(!leftTranspose && FPfactor * m1.clen * m1.rlen * m1.rlen > threshold));
- }
-
- /**
-  * Decides whether the ultra-sparse matrix multiplication should be used.
-  * Ultra-sparse matrix mult also implies sparse outputs, hence the decision
-  * is conservative and does not apply to all ultra-sparse matrices.
-  *
-  * @param m1 left-hand-side matrix
-  * @param m2 right-hand-side matrix
-  * @param m1Perm true if m1 is a permutation (selection) matrix
-  * @return true if the ultra-sparse implementation should be used
-  */
- public static boolean isUltraSparseMatrixMult(MatrixBlock m1, MatrixBlock m2, boolean m1Perm) {
-     //matrix-vector is always dense
-     if( m2.clen == 1 )
-         return false;
-
-     final double outSp = OptimizerUtils.getMatMultSparsity(
-         m1.getSparsity(), m2.getSparsity(), m1.rlen, m1.clen, m2.clen, true);
-
-     //base case
-     if( m1.isUltraSparse() || m2.isUltraSparse() )
-         return true;
-     //ultra-sparse self product
-     if( m1.isUltraSparse(false) && m1 == m2 )
-         return true;
-     //permutation lhs with a not fully dense rhs
-     if( m1Perm && OptimizerUtils.getSparsity(m2.rlen, m2.clen, m2.nonZeros)<1.0 )
-         return true;
-     //relaxed ultra-sparse input with very sparse output
-     if( (m1.isUltraSparse(false) || m2.isUltraSparse(false))
-         && outSp < MatrixBlock.ULTRA_SPARSITY_TURN_POINT2 )
-         return true;
-     //sparse lhs (otherwise no matching branch) with few non-zeros and sparse output
-     return m1.isInSparseFormat()
-         && m1.getSparsity() < MatrixBlock.ULTRA_SPARSITY_TURN_POINT2
-         && m1.getNonZeros() < MatrixBlock.ULTRA_SPARSE_BLOCK_NNZ
-         && m1.getLength()+m2.getLength() < (long)m1.rlen*m2.clen
-         && outSp < MatrixBlock.SPARSITY_TURN_POINT;
- }
-
- /**
-  * Decides whether a sparse-sparse matrix multiplication should produce a
-  * sparse output: the output must be a matrix (not a vector), be estimated
-  * as sparse in memory, and have fewer than 4*1024 columns.
-  *
-  * @param m1 left-hand-side matrix
-  * @param m2 right-hand-side matrix
-  * @return true if the output should be allocated in sparse format
-  */
- public static boolean isSparseOutputMatrixMult(MatrixBlock m1, MatrixBlock m2) {
-     //only sparse-sparse with a true matrix output qualifies
-     if( !m1.sparse || !m2.sparse || m1.rlen <= 1 || m2.clen <= 1 )
-         return false;
-
-     final double sparsity = OptimizerUtils.getMatMultSparsity(
-         m1.getSparsity(), m2.getSparsity(), m1.rlen, m1.clen, m2.clen, false);
-     final long nnz = (long)(sparsity * m1.rlen * m2.clen);
-     final boolean sparseOut = MatrixBlock.evalSparseFormatInMemory(m1.rlen, m2.clen, nnz);
-     if( m2.clen >= 4*1024 )
-         return false;
-     return sparseOut;
- }
-
- /**
-  * Tests whether a transpose-self matrix multiplication degenerates to an
-  * outer product of a vector with itself.
-  *
-  * @param rlen number of rows of the input
-  * @param clen number of columns of the input
-  * @param left true for the left-transpose variant, which expects a row vector with more than one column; false expects a column vector with more than one row
-  * @return true if the input is a vector of the matching orientation
-  */
- public static boolean isOuterProductTSMM(int rlen, int clen, boolean left) {
-     if( left )
-         return rlen == 1 && clen > 1;
-     return rlen > 1 && clen == 1;
- }
-
- private static MatrixBlock prepMatrixMultRightInput( MatrixBlock m1, MatrixBlock m2, boolean tm2 ) {
- MatrixBlock ret = m2;
-
- //transpose if dense-dense, skinny rhs matrix (not vector), and memory guarded by output
- if( tm2 ) {
- MatrixBlock tmpBlock = new MatrixBlock(m2.clen, m2.rlen, m2.sparse);
- ret = LibMatrixReorg.reorg(m2, tmpBlock, new ReorgOperator(SwapIndex.getSwapIndexFnObject()));
- }
-
- return ret;
- }
-
- //cp non-zeros for dense-dense mm
- private static int copyNonZeroElements( double[] a, final int aixi, final int bixk, final int n, double[] tmpa, int[] tmpbi, final int bklen ) {
- int knnz = 0;
- for( int k = 0; k < bklen; k++ )
- if( a[ aixi+k ] != 0 ) {
- tmpa[ knnz ] = a[ aixi+k ];
- tmpbi[ knnz ] = bixk + k*n;
- knnz ++;
- }
- return knnz;
- }
-
- //cp non-zeros for dense tsmm
- private static int copyNonZeroElements( double[] a, int aixi, int bixk, final int n, final int nx, double[] tmpa, int[] tmpbi, final int bklen ) {
- int knnz = 0;
- for( int k = 0; k < bklen; k++, aixi+=n, bixk+=nx )
- if( a[ aixi ] != 0 ) {
- tmpa[ knnz ] = a[ aixi ];
- tmpbi[ knnz ] = bixk;
- knnz ++;
- }
- return knnz;
- }
-
- @SuppressWarnings("unused")
- private static void compactSparseOutput(MatrixBlock ret) {
- if( !ret.sparse || ret.nonZeros > ret.rlen || ret.isEmpty()
- || ret.getSparseBlock() instanceof SparseBlockCSR )
- return; //early abort
- ret.sparseBlock = SparseBlockFactory
- .copySparseBlock(Type.CSR, ret.sparseBlock, false);
- }
-
- @SuppressWarnings("unused")
- private static void resetPosVect(int[] curk, SparseBlock sblock, int rl, int ru) {
- if( sblock instanceof SparseBlockMCSR ) {
- //all rows start at position 0 (individual arrays)
- Arrays.fill(curk, 0, ru-rl, 0);
- }
- else if( sblock instanceof SparseBlockCSR ) {
- //row start positions given in row ptr array
- SparseBlockCSR csr = (SparseBlockCSR) sblock;
- System.arraycopy(csr.rowPointers(), rl, curk, 0, ru-rl);
- }
- else { //general case
- for(int i=rl; i> tasks, MatrixBlock ret)
- throws InterruptedException, ExecutionException
- {
- //aggregate partial results and check for errors
- double val = 0;
- for(Future task : tasks)
- val += task.get();
- ret.quickSetValue(0, 0, val);
- }
-
- @SuppressWarnings("unused")
- private static void sumDenseResults( double[][] partret, double[] ret )
- {
- final int len = ret.length;
- final int k = partret.length;
- final int bk = k % 4;
- final int blocksize = 2 * 1024; //16KB (half of common L1 data)
-
- //cache-conscious aggregation to prevent repreated scans/writes of ret
- for( int bi=0; bi
- {
- private final MatrixBlock _m1;
- private final MatrixBlock _m2;
- private MatrixBlock _ret = null;
- private final boolean _tm2; //transposed m2
- private final boolean _pm2r; //par over m2 rows
- private final boolean _pm2c; //par over m2 rows
- private final boolean _m1Perm; //sparse permutation
- private final boolean _sparse; //sparse output
- private final int _rl;
- private final int _ru;
-
- protected MatrixMultTask( MatrixBlock m1, MatrixBlock m2, MatrixBlock ret,
- boolean tm2, boolean pm2r, boolean pm2c, boolean m1Perm, boolean sparse, int rl, int ru )
- {
- _m1 = m1;
- _m2 = m2;
- _tm2 = tm2;
- _pm2r = pm2r;
- _pm2c = pm2c;
- _m1Perm = m1Perm;
- _sparse = sparse;
- _rl = rl;
- _ru = ru;
-
- if( pm2r ) { //vector-matrix / matrix-matrix
- //allocate local result for partial aggregation
- _ret = new MatrixBlock(ret.rlen, ret.clen, false);
- }
- else { //default case
- _ret = ret;
- }
- }
-
- @Override
- public Object call() {
- //setup target index ranges
- int rl = _pm2c ? 0 : _rl;
- int ru = _pm2c ? _m1.rlen : _ru;
- int cl = _pm2c ? _rl : 0;
- int cu = _pm2c ? _ru : _ret.clen;
-
- //thread-local allocation
- if( _pm2r )
- _ret.allocateDenseBlock();
-
- //compute block matrix multiplication
- if( _ret.sparse ) //ultra-sparse
- matrixMultUltraSparse(_m1, _m2, _ret, _m1Perm, rl, ru);
- else if(!_m1.sparse && !_m2.sparse)
- matrixMultDenseDense(_m1, _m2, _ret, _tm2, _pm2r, rl, ru, cl, cu);
- else if(_m1.sparse && _m2.sparse)
- matrixMultSparseSparse(_m1, _m2, _ret, _pm2r, _sparse, rl, ru);
- else if(_m1.sparse)
- matrixMultSparseDense(_m1, _m2, _ret, _pm2r, rl, ru);
- else
- matrixMultDenseSparse(_m1, _m2, _ret, _pm2r, rl, ru);
-
- //maintain block nnz (upper bounds inclusive)
- if( !_pm2r )
- return _ret.recomputeNonZeros(rl, ru-1, cl, cu-1);
- else
- return _ret.getDenseBlockValues();
- }
- }
-
- /**
-  * Task that computes a matrix multiplication chain for the row range
-  * [rl, ru) into a task-local 1 x clen dense result and returns its values.
-  */
- private static class MatrixMultChainTask implements Callable
- {
-     private final MatrixBlock _m1;
-     private final MatrixBlock _m2;
-     private final MatrixBlock _m3;
-     private final ChainType _ct;
-     private final int _rl;
-     private final int _ru;
-
-     protected MatrixMultChainTask( MatrixBlock mX, MatrixBlock mV, MatrixBlock mW, ChainType ct, int rl, int ru ) {
-         _m1 = mX;
-         _m2 = mV;
-         _m3 = mW;
-         _ct = ct;
-         _rl = rl;
-         _ru = ru;
-     }
-
-     @Override
-     public double[] call() {
-         //task-local result for partial aggregation
-         final MatrixBlock partial = new MatrixBlock(1, _m1.clen, false);
-         partial.allocateDenseBlock();
-
-         if( !_m1.sparse )
-             matrixMultChainDense(_m1, _m2, _m3, partial, _ct, _rl, _ru);
-         else
-             matrixMultChainSparse(_m1, _m2, _m3, partial, _ct, _rl, _ru);
-
-         //NOTE: no global aggregation from concurrent tasks in order to
-         //prevent synchronization (sequential aggregation led to better
-         //performance after JIT)
-         return partial.getDenseBlockValues();
-     }
- }
-
- /**
-  * Task that computes a transpose-self matrix multiplication for the range
-  * [rl, ru) directly into the shared output block.
-  */
- private static class MatrixMultTransposeTask implements Callable
- {
-     private final MatrixBlock _m1;
-     private final MatrixBlock _ret;
-     private final boolean _left;
-     private final int _rl;
-     private final int _ru;
-
-     protected MatrixMultTransposeTask( MatrixBlock m1, MatrixBlock ret, boolean left, int rl, int ru )
-     {
-         _rl = rl;
-         _ru = ru;
-         _left = left;
-         _m1 = m1;
-         _ret = ret;
-     }
-
-     @Override
-     public Object call() {
-         //pick the kernel by input format; no partial result is returned
-         if( !_m1.sparse )
-             matrixMultTransposeSelfDense(_m1, _ret, _left, _rl, _ru);
-         else
-             matrixMultTransposeSelfSparse(_m1, _ret, _left, _rl, _ru);
-         return null;
-     }
- }
-
- /**
-  * Task that applies a permutation matrix multiplication for the range
-  * [rl, ru), writing into the two given output blocks.
-  */
- private static class MatrixMultPermuteTask implements Callable
- {
-     private final MatrixBlock _pm1;
-     private final MatrixBlock _m2;
-     private final MatrixBlock _ret1;
-     private final MatrixBlock _ret2;
-     private final int _rl;
-     private final int _ru;
-
-     protected MatrixMultPermuteTask( MatrixBlock pm1, MatrixBlock m2, MatrixBlock ret1, MatrixBlock ret2, int rl, int ru)
-     {
-         _pm1 = pm1;
-         _m2 = m2;
-         _ret1 = ret1;
-         _ret2 = ret2;
-         _rl = rl;
-         _ru = ru;
-     }
-
-     @Override
-     public Object call() {
-         //kernel choice: sparse input first, then output format
-         if( !_m2.sparse ) {
-             if( _ret1.sparse )
-                 matrixMultPermuteDenseSparse(_pm1, _m2, _ret1, _ret2, _rl, _ru);
-             else
-                 matrixMultPermuteDense(_pm1, _m2, _ret1, _ret2, _rl, _ru);
-         }
-         else {
-             matrixMultPermuteSparse(_pm1, _m2, _ret1, _ret2, _rl, _ru);
-         }
-         return null;
-     }
- }
-
- /**
-  * Task that computes the weighted squared loss for the row range [rl, ru)
-  * into a task-local 1 x 1 result and returns that partial value.
-  */
- private static class MatrixMultWSLossTask implements Callable
- {
-     private final MatrixBlock _mX;
-     private final MatrixBlock _mU;
-     private final MatrixBlock _mV;
-     private final MatrixBlock _mW;
-     private final MatrixBlock _ret;
-     private final WeightsType _wt;
-     private final int _rl;
-     private final int _ru;
-
-     protected MatrixMultWSLossTask(MatrixBlock mX, MatrixBlock mU, MatrixBlock mV, MatrixBlock mW, WeightsType wt, int rl, int ru) {
-         _mX = mX;
-         _mU = mU;
-         _mV = mV;
-         _mW = mW;
-         _wt = wt;
-         _rl = rl;
-         _ru = ru;
-
-         //task-local result for partial aggregation
-         _ret = new MatrixBlock(1, 1, false);
-         _ret.allocateDenseBlock();
-     }
-
-     @Override
-     public Double call() {
-         //specialized kernels need dense, non-empty factors U and V, a
-         //non-empty X, and weights W that are absent or share X's format
-         final boolean specialized = !_mU.sparse && !_mV.sparse
-             && (_mW==null || _mW.sparse == _mX.sparse)
-             && !_mX.isEmptyBlock() && !_mU.isEmptyBlock() && !_mV.isEmptyBlock()
-             && (_mW==null || !_mW.isEmptyBlock());
-
-         if( !specialized )
-             matrixMultWSLossGeneric(_mX, _mU, _mV, _mW, _ret, _wt, _rl, _ru);
-         else if( _mX.sparse )
-             matrixMultWSLossSparseDense(_mX, _mU, _mV, _mW, _ret, _wt, _rl, _ru);
-         else
-             matrixMultWSLossDense(_mX, _mU, _mV, _mW, _ret, _wt, _rl, _ru);
-
-         return _ret.quickGetValue(0, 0);
-     }
- }
-
- /**
-  * Task that computes the weighted sigmoid for the row range [rl, ru) into
-  * the shared output and returns the number of non-zeros of these rows.
-  */
- private static class MatrixMultWSigmoidTask implements Callable
- {
-     private final MatrixBlock _mW;
-     private final MatrixBlock _mU;
-     private final MatrixBlock _mV;
-     private final MatrixBlock _ret;
-     private final WSigmoidType _wt;
-     private final int _rl;
-     private final int _ru;
-
-     protected MatrixMultWSigmoidTask(MatrixBlock mW, MatrixBlock mU, MatrixBlock mV, MatrixBlock ret, WSigmoidType wt, int rl, int ru) {
-         _mW = mW;
-         _mU = mU;
-         _mV = mV;
-         _ret = ret;
-         _wt = wt;
-         _rl = rl;
-         _ru = ru;
-     }
-
-     @Override
-     public Long call() {
-         //specialized kernels require dense, non-empty factors U and V
-         final boolean denseFactors = !_mU.sparse && !_mV.sparse
-             && !_mU.isEmptyBlock() && !_mV.isEmptyBlock();
-
-         if( !denseFactors )
-             matrixMultWSigmoidGeneric(_mW, _mU, _mV, _ret, _wt, _rl, _ru);
-         else if( _mW.sparse )
-             matrixMultWSigmoidSparseDense(_mW, _mU, _mV, _ret, _wt, _rl, _ru);
-         else
-             matrixMultWSigmoidDense(_mW, _mU, _mV, _ret, _wt, _rl, _ru);
-
-         //maintain block nnz (upper bounds inclusive)
-         final int lastCol = _ret.getNumColumns()-1;
-         return _ret.recomputeNonZeros(_rl, _ru-1, 0, lastCol);
-     }
- }
-
- /**
-  * Task that computes the weighted divide matrix multiplication for the
-  * row range [rl, ru) and column range [cl, cu) into the shared output and
-  * returns the number of non-zeros of the output rows it maintained.
-  */
- private static class MatrixMultWDivTask implements Callable
- {
-     private final MatrixBlock _mW;
-     private final MatrixBlock _mU;
-     private final MatrixBlock _mV;
-     private final MatrixBlock _mX;
-     private final MatrixBlock _ret;
-     private final WDivMMType _wt;
-     private final int _rl;
-     private final int _ru;
-     private final int _cl;
-     private final int _cu;
-
-     protected MatrixMultWDivTask(MatrixBlock mW, MatrixBlock mU, MatrixBlock mV, MatrixBlock mX, MatrixBlock ret, WDivMMType wt, int rl, int ru, int cl, int cu) {
-         _mW = mW;
-         _mU = mU;
-         _mV = mV;
-         _mX = mX;
-         _ret = ret;
-         _wt = wt;
-         _rl = rl;
-         _ru = ru;
-         _cl = cl;
-         _cu = cu;
-     }
-
-     @Override
-     public Long call() {
-         final boolean scalarX = _wt.hasScalar();
-         //specialized kernels require dense, non-empty factors U and V, and
-         //an X that is absent, a scalar, or in the same format as W
-         final boolean specialized = !_mU.sparse && !_mV.sparse
-             && (_mX==null || _mX.sparse == _mW.sparse || scalarX)
-             && !_mU.isEmptyBlock() && !_mV.isEmptyBlock();
-
-         if( !specialized )
-             matrixMultWDivMMGeneric(_mW, _mU, _mV, _mX, _ret, _wt, _rl, _ru, _cl, _cu);
-         else if( _mW.sparse )
-             matrixMultWDivMMSparseDense(_mW, _mU, _mV, _mX, _ret, _wt, _rl, _ru, _cl, _cu);
-         else
-             matrixMultWDivMMDense(_mW, _mU, _mV, _mX, _ret, _wt, _rl, _ru, _cl, _cu);
-
-         //maintain partial nnz (upper bounds inclusive); left variants
-         //use the column range as the output row range
-         final int lower = _wt.isLeft() ? _cl : _rl;
-         final int upper = _wt.isLeft() ? _cu : _ru;
-         return _ret.recomputeNonZeros(lower, upper-1, 0, _ret.getNumColumns()-1);
-     }
- }
-
- /**
-  * Task that computes the weighted cross entropy for the row range
-  * [rl, ru) into a task-local 1 x 1 result and returns that partial value.
-  */
- private static class MatrixMultWCeTask implements Callable
- {
-     private final MatrixBlock _mW;
-     private final MatrixBlock _mU;
-     private final MatrixBlock _mV;
-     private final double _eps;
-     private final MatrixBlock _ret;
-     private final WCeMMType _wt;
-     private final int _rl;
-     private final int _ru;
-
-     protected MatrixMultWCeTask(MatrixBlock mW, MatrixBlock mU, MatrixBlock mV, double eps, WCeMMType wt, int rl, int ru) {
-         _mW = mW;
-         _mU = mU;
-         _mV = mV;
-         _eps = eps;
-         _wt = wt;
-         _rl = rl;
-         _ru = ru;
-
-         //task-local result for partial aggregation
-         _ret = new MatrixBlock(1, 1, false);
-         _ret.allocateDenseBlock();
-     }
-
-     @Override
-     public Double call() {
-         //specialized kernels require dense, non-empty factors U and V
-         final boolean denseFactors = !_mU.sparse && !_mV.sparse
-             && !_mU.isEmptyBlock() && !_mV.isEmptyBlock();
-
-         if( !denseFactors )
-             matrixMultWCeMMGeneric(_mW, _mU, _mV, _eps, _ret, _wt, _rl, _ru);
-         else if( _mW.sparse )
-             matrixMultWCeMMSparseDense(_mW, _mU, _mV, _eps, _ret, _wt, _rl, _ru);
-         else
-             matrixMultWCeMMDense(_mW, _mU, _mV, _eps, _ret, _wt, _rl, _ru);
-
-         return _ret.quickGetValue(0, 0);
-     }
- }
-
- /**
-  * Task that computes the weighted unary matrix multiplication for the row
-  * range [rl, ru) into the shared output and returns the number of
-  * non-zeros of these rows.
-  */
- private static class MatrixMultWuTask implements Callable
- {
-     private final MatrixBlock _mW;
-     private final MatrixBlock _mU;
-     private final MatrixBlock _mV;
-     private final MatrixBlock _ret;
-     private final WUMMType _wt;
-     private final ValueFunction _fn;
-     private final int _rl;
-     private final int _ru;
-
-     protected MatrixMultWuTask(MatrixBlock mW, MatrixBlock mU, MatrixBlock mV, MatrixBlock ret, WUMMType wt, ValueFunction fn, int rl, int ru) {
-         _mW = mW;
-         _mU = mU;
-         _mV = mV;
-         _ret = ret;
-         _wt = wt;
-         _fn = fn;
-         _rl = rl;
-         _ru = ru;
-     }
-
-     @Override
-     public Long call() {
-         //specialized kernels require dense, non-empty factors U and V
-         final boolean denseFactors = !_mU.sparse && !_mV.sparse
-             && !_mU.isEmptyBlock() && !_mV.isEmptyBlock();
-
-         if( !denseFactors )
-             matrixMultWuMMGeneric(_mW, _mU, _mV, _ret, _wt, _fn, _rl, _ru);
-         else if( _mW.sparse )
-             matrixMultWuMMSparseDense(_mW, _mU, _mV, _ret, _wt, _fn, _rl, _ru);
-         else
-             matrixMultWuMMDense(_mW, _mU, _mV, _ret, _wt, _fn, _rl, _ru);
-
-         //maintain block nnz (upper bounds inclusive)
-         final int lastCol = _ret.getNumColumns()-1;
-         return _ret.recomputeNonZeros(_rl, _ru-1, 0, lastCol);
-     }
- }
-}
diff --git a/scripts/staging/SIMD-double-vectors/README.md b/scripts/staging/SIMD-double-vectors/README.md
deleted file mode 100644
index 55812002d0c..00000000000
--- a/scripts/staging/SIMD-double-vectors/README.md
+++ /dev/null
@@ -1,45 +0,0 @@
-
-
-
-# SIMD DoubleVectors for matrix multiplication
-
-`DoubleVector` is still in incubator stage, but promises performance improvements for many SystemDS components.
-This patch explored potential speedup for matrix multiplication of two dense matrices. Additionally, dot product
-is also implemented with `DoubleVector` for the case where common dimension is `1`.
-
- Initial experiments showed varying results: usually, the vectorized implementation performs somewhere between
- `MKL` and our reference. There are also cases where we are slower than the reference, or faster than `MKL`.
-For detailed discussion (and plots) see PR #1643.
-
-## Further Work
-
- This patch focused only on dense matrix multiplication; handling increasing sparsity would complicate things.
- The sparsity-aware copying (see `LibMatrixMult.java:1170`) and the general loop structure are kept as they are, because a lot of
- experimentation went into this very efficient implementation. Note that the use of `DoubleVector` might change
- many of the underlying trade-offs, so revisiting this design (and using SIMD for the sparsity-aware copying) will be a necessary step.
-
-## Changes
-
- Because at least JDK 17 is required, there are changes to `pom.xml`, the run script `systemds`, and, of course, `LibMatrixMult.java`.
-
-## Note
-
- The pom file repeatedly gets flagged for old versions of various libraries. Therefore, we renamed the file to avoid this inconvenience,
- and we introduced a string at the beginning of the file so that it no longer parses as a valid pom file.
diff --git a/scripts/staging/SIMD-double-vectors/pom.xml.tmp b/scripts/staging/SIMD-double-vectors/pom.xml.tmp
deleted file mode 100644
index fbb940668b7..00000000000
--- a/scripts/staging/SIMD-double-vectors/pom.xml.tmp
+++ /dev/null
@@ -1,1336 +0,0 @@
-THIS IS NO LONGER A POM.XML FILE, GITHUB PLEASE DO NOT GIVE US UPDATE REMINDERS OF THIS FILE.
-
-
-
-
- 4.0.0
-
- org.apache
- apache
- 24
-
- org.apache.systemds
- 3.1.0-SNAPSHOT
- systemds
- jar
- Apache SystemDS
- https://github.com/apache/systemds
- An open source ML system for the end-to-end data science lifecycle
-
-
- Apache 2.0 License
- http://www.apache.org/licenses/LICENSE-2.0.html
-
-
-
-
- 3.3.3
- 4.8
- 3.20.3
- 3.2.0
- 2.12.0
- 2.12
- yyyy-MM-dd HH:mm:ss z
- 1
- false
- provided
- 10.2.0
- 1.7.36
- 2.17.2
-
- 17
- Testing settings
- true
- classes
- 2
- 1C
- 2
- true
- **
- false
- -Xms3000m -Xmx3000m -Xmn300m
- false
-
-
-
-
- central
- https://repo1.maven.org/maven2
-
- true
-
-
-
-
-
- scm:git:https://github.com/apache/systemds.git
- HEAD
-
-
-
-
-
-
- scripts
-
- algorithms/obsolete/*
- datagen/obsolete/*
- perftest/**/*
- perftest
- perftestDeprecated/*
- perftestDeprecated
- staging/**/*
- nn/test/compare_backends/*
- nn/test/compare_backends/*
-
- scripts
-
-
- src/main/cuda/kernels
-
- SystemDS.ptx
- reduction.ptx
-
- cuda/kernels
-
-
- src/main/cpp/lib
- lib
-
-
- src/main/cuda/spoof
- cuda/spoof
-
-
- src/main/cuda/headers
-
- agg_ops.cuh
- operators.cuh
- reduction.cuh
- spoof_utils.cuh
- TempStorage.cuh
- utils.cuh
- vector_write.cuh
- vector_add.cuh
- Matrix.h
-
- cuda/headers
-
-
- src/main/java/org/apache/sysds/hops/codegen/cplan/java
-
- Cellwise.java.template
- Rowwise.java.template
-
- java/spoof
-
-
-
-
-
- org.apache.maven.plugins
- maven-dependency-plugin
-
-
- unpack
- package
-
- unpack
-
-
-
-
- org.apache.hadoop
- hadoop-test
- 1.2.1
- jar
- true
- ${project.build.directory}/hadoop-test
- **/*
-
-
- false
- true
-
-
-
- compile
-
- copy-dependencies
-
-
- true
- ${project.build.directory}/lib
-
-
-
-
-
-
-
- org.apache.maven.plugins
- maven-shade-plugin
- 2.3
-
-
- package
-
- shade
-
-
-
-
- org.apache.wink:wink-json4j:*
- org.antlr:antlr4-runtime:*
-
-
-
-
- org.apache.sysds.api.DMLScript
-
-
-
- META-INF/LICENSE
- src/assembly/bin/LICENSE
-
-
- META-INF/NOTICE
- NOTICE
-
-
- false
-
-
-
-
-
-
-
- *:*
-
- META-INF/*.SF
- META-INF/*.DSA
- META-INF/*.RSA
- META-INF/LICENSE
- META-INF/NOTICE
-
-
-
-
-
-
-
- org.apache.maven.plugins
- maven-compiler-plugin
- 3.8.1
-
- ${java.level}
- ${java.level}
-
- --add-modules=jdk.incubator.vector
-
-
-
-
-
- org.apache.maven.plugins
- maven-resources-plugin
-
-
- copy-resources
- compile
-
- copy-resources
-
-
-
-
- ${basedir}/src/test/config/hadoop_bin_windows/bin
- false
-
- *.*
-
-
-
- ${basedir}/target/lib/hadoop/bin
-
-
-
-
-
-
- org.antlr
- antlr4-maven-plugin
-
- ${basedir}/src/main/java
- ${basedir}/src/main/java
-
- ${antlr.version}
-
-
- antlr
-
- antlr4
-
-
-
-
-
-
-
- org.apache.maven.plugins
- maven-surefire-plugin
- 3.0.0-M5
-
- ${maven.test.skip}
- ${test-parallel}
- ${test-threadCount}
-
- ${test-forkCount}
- false
- brief
- true
- ${rerun.failing.tests.count}
- --add-modules=jdk.incubator.vector
-
-
-
-
- maven-clean-plugin
-
-
- clean-original-jar
- package
-
- clean
-
-
- true
-
-
- ${project.build.directory}
-
- original-*.jar
-
-
-
-
-
-
-
-
-
-
-
-
-
-
- org.apache.maven.plugins
- maven-antrun-plugin
-
-
- copy
- package
-
-
-
-
-
-
- run
-
-
-
-
-
-
- org.jacoco
- jacoco-maven-plugin
- 0.8.7
-
-
- ${jacoco.include}
-
-
-
-
- default-prepare-agent
-
- prepare-agent
-
-
-
- generate-code-coverage-report
- test
-
- report
-
-
-
-
-
-
- org.eluder.coveralls
- coveralls-maven-plugin
- 4.3.0
-
-
-
- org.apache.maven.plugins
- maven-javadoc-plugin
- 3.2.0
-
- true
-
- true
-
-
-
-
- org.codehaus.mojo
- properties-maven-plugin
- 1.0.0
-
-
- generate-resources
-
- write-project-properties
-
-
- ${project.build.testOutputDirectory}/my.properties
-
-
-
-
-
-
-
-
-
- windows-x86_64
-
-
- windows
- amd64
-
-
-
- windows
- x86_64
-
-
-
- linux-x86_64
-
-
- unix
- amd64
-
-
-
- linux
- x86_64
-
-
-
- apple-x86_64
-
-
- mac
- x86_64
-
-
-
- apple
- x86_64
-
-
-
- linux-ppc_64
-
-
- unix
- ppc64le
-
-
-
- linux
- ppc_64
-
-
-
-
- eclipse-only
-
-
- m2e.version
-
-
-
-
-
-
-
- org.eclipse.m2e
- lifecycle-mapping
- 1.0.0
-
-
-
-
-
- org.apache.maven.plugins
- maven-remote-resources-plugin
- [1.4,)
-
- process
-
-
-
-
-
-
-
-
- org.apache.maven.plugins
- maven-clean-plugin
- [3.0.0,)
-
- clean
-
-
-
-
-
-
-
-
- org.apache.maven.plugins
- maven-dependency-plugin
- [2.10,)
-
- copy-dependencies
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
- rat
-
- clean org.apache.rat:apache-rat-plugin:check
-
-
- org.apache.rat
- apache-rat-plugin
- 0.12
-
-
- package
-
- check
-
-
-
-
-
- scripts/perftest/results/**
- scripts/perftest/temp/**
- .gitignore
- src/main/python/.gitignore
- .gitmodules
- .repository/
- .idea/
- .git
- .settings
- .classpath
- .project
- CITATION
- src/main/python/docs/build/**/*
- src/main/python/docs/source/_build/**
- src/main/python/generator/resources/**
- docs/api/**/*
- docs/_site/**/*
- docs/site/run_issues.md
- docs/.jekyll-cache/**/*
- docs/css/bootstrap.min.css
- docs/css/pygments-default.css
- docs/js/vendor/**/*
- **/*.lock
- **/*.csv
- **/*.ijv
- **/*.json
- **/*.libsvm
- **/*.mtx
- **/*.mtd
- **/*.out
- **/__pycache__/**
- **/part-*
- **/*.keep
- **/target/**
- **/README.md
- **/*.svg
-
- **/*.ipynb
-
- src/main/java/*.tokens
- **/*.interp
-
- src/main/java/org/apache/sysds/protobuf/*.java
-
- src/main/cuda/kernels/SystemDS.ptx
- src/main/cuda/kernels/reduction.ptx
-
- src/test/scripts/functions/jmlc/**/*.impute
- src/test/scripts/functions/jmlc/**/*.map
- src/test/scripts/functions/jmlc/**/*.mode
- src/test/scripts/functions/jmlc/**/*.ndistinct
- src/test/scripts/functions/jmlc/**/*.node
- src/test/scripts/functions/jmlc/tfmtd_example/Bin/saleprice.bin
- src/test/scripts/functions/jmlc/tfmtd_example/Bin/sqft.bin
- src/test/scripts/functions/jmlc/tfmtd_example/column.names
- src/test/scripts/functions/jmlc/tfmtd_example/dummycoded.column.names
- src/test/scripts/functions/jmlc/tfmtd_example2/column.names
- src/test/scripts/functions/jmlc/tfmtd_frame_example/tfmtd_frame
-
- src/test/scripts/functions/io/csv/in/*/*
-
- src/main/python/tests/lt*.txt
-
- scripts/perftest/python/requirements.txt
-
- src/main/cuda/ext/**
- src/main/cuda/.idea/
-
-
-
-
-
-
-
-
- proton
-
-
-
-
- com.github.os72
- protoc-jar-maven-plugin
- 3.11.4
-
-
- generate-sources
-
- run
-
-
-
- ${protobuf.version}
-
- src/main/resources/protobuf
-
- src/main/java
-
-
-
-
-
-
-
-
-
-
- distribution
-
-
-
- maven-assembly-plugin
-
- posix
-
-
-
- create-source-distribution
- package
-
- single
-
-
-
- src/assembly/source.xml
-
-
-
-
- create-extra-jar
- package
-
- single
-
-
-
- src/assembly/extra.xml
-
-
-
- ${maven.build.timestamp}
- ${project.artifactId}-extra
- ${project.version}
-
-
-
-
-
- create-binary-distribution
- package
-
- single
-
-
-
- src/assembly/bin.xml
-
-
-
-
-
-
-
- maven-gpg-plugin
- 3.0.1
-
-
- verify
-
- sign
-
-
-
-
- --pinentry-mode
- loopback
-
-
-
-
-
-
-
- org.apache.maven.plugins
- maven-remote-resources-plugin
- 1.4
-
-
-
- process
-
-
-
-
- org.apache:apache-jar-resource-bundle:1.4
-
-
-
-
-
-
-
- org.apache.maven.plugins
- maven-javadoc-plugin
- 3.2.0
-
-
- *.protobuf
- true
- true
- true
- false
- public
- ${java.level}
-
-
-
- attach-javadocs
-
- jar
-
-
-
-
-
-
-
-
-
- skip-sign
-
-
-
- org.apache.maven.plugins
- maven-gpg-plugin
-
- true
-
-
-
-
-
-
-
-
-
- org.jcuda
- jcuda
- ${jcuda.version}
- ${jcuda.scope}
-
-
-
- org.jcuda
- jcuda-natives
-
-
-
-
-
- org.jcuda
- jcublas
- ${jcuda.version}
- ${jcuda.scope}
-
-
-
- org.jcuda
- jcublas-natives
-
-
-
-
-
- org.jcuda
- jcusparse
- ${jcuda.version}
- ${jcuda.scope}
-
-
-
- org.jcuda
- jcusparse-natives
-
-
-
-
-
- org.jcuda
- jcusolver
- ${jcuda.version}
- ${jcuda.scope}
-
-
-
- org.jcuda
- jcusolver-natives
-
-
-
-
-
- org.jcuda
- jcudnn
- ${jcuda.version}
- ${jcuda.scope}
-
-
-
- org.jcuda
- jcudnn-natives
-
-
-
-
-
-
- org.jcuda
- jcuda-natives
- windows-x86_64
- ${jcuda.version}
- ${jcuda.scope}
-
-
-
- org.jcuda
- jcublas-natives
- windows-x86_64
- ${jcuda.version}
- ${jcuda.scope}
-
-
-
- org.jcuda
- jcusparse-natives
- windows-x86_64
- ${jcuda.version}
- ${jcuda.scope}
-
-
-
- org.jcuda
- jcusolver-natives
- windows-x86_64
- ${jcuda.version}
- ${jcuda.scope}
-
-
-
- org.jcuda
- jcudnn-natives
- windows-x86_64
- ${jcuda.version}
- ${jcuda.scope}
-
-
-
- org.jcuda
- jcuda-natives
- linux-x86_64
- ${jcuda.version}
- ${jcuda.scope}
-
-
-
- org.jcuda
- jcublas-natives
- linux-x86_64
- ${jcuda.version}
- ${jcuda.scope}
-
-
-
- org.jcuda
- jcusparse-natives
- linux-x86_64
- ${jcuda.version}
- ${jcuda.scope}
-
-
-
- org.jcuda
- jcusolver-natives
- linux-x86_64
- ${jcuda.version}
- ${jcuda.scope}
-
-
-
- org.jcuda
- jcudnn-natives
- linux-x86_64
- ${jcuda.version}
- ${jcuda.scope}
-
-
-
- org.jcuda
- jcuda-natives
- apple-x86_64
- ${jcuda.version}
- ${jcuda.scope}
-
-
-
- org.jcuda
- jcublas-natives
- apple-x86_64
- ${jcuda.version}
- ${jcuda.scope}
-
-
-
- org.jcuda
- jcusparse-natives
- apple-x86_64
- ${jcuda.version}
- ${jcuda.scope}
-
-
-
- org.jcuda
- jcusolver-natives
- apple-x86_64
- ${jcuda.version}
- ${jcuda.scope}
-
-
-
- org.jcuda
- jcudnn-natives
- apple-x86_64
- ${jcuda.version}
- ${jcuda.scope}
-
-
-
- org.apache.spark
- spark-core_${scala.binary.version}
- ${spark.version}
-
-
- log4j
- log4j
-
-
- org.slf4j
- slf4j-log4j12
-
-
- org.slf4j
- slf4j-reload4j
-
-
- org.slf4j
- slf4j-api
-
-
- org.slf4j
- jul-to-slf4j
-
-
- org.slf4j
- jcl-over-slf4j
-
-
- org.apache.hadoop
- hadoop-client-api
-
-
- org.apache.hadoop
- hadoop-client-runtime
-
-
- org.apache.hadoop
- hadoop-client-runtime
-
-
-
-
-
- org.apache.spark
- spark-sql_${scala.binary.version}
- ${spark.version}
-
-
- log4j
- log4j
-
-
- org.slf4j
- slf4j-log4j12
-
-
- org.slf4j
- slf4j-reload4j
-
-
-
-
-
- org.apache.spark
- spark-mllib_${scala.binary.version}
- ${spark.version}
-
-
- log4j
- log4j
-
-
- org.slf4j
- slf4j-log4j12
-
-
- org.slf4j
- slf4j-reload4j
-
-
-
-
-
- org.apache.hadoop
- hadoop-common
- ${hadoop.version}
-
-
- javax.servlet
- servlet-api
-
-
- org.slf4j
- slf4j-api
-
-
- org.slf4j
- slf4j-reload4j
-
-
-
-
-
- org.apache.hadoop
- hadoop-hdfs
- ${hadoop.version}
-
-
- javax.servlet
- servlet-api
-
-
- org.slf4j
- slf4j-log4j12
-
-
- org.slf4j
- slf4j-reload4j
-
-
-
-
-
- org.apache.hadoop
- hadoop-client
- ${hadoop.version}
-
-
- log4j
- log4j
-
-
- org.slf4j
- slf4j-log4j12
-
-
- org.slf4j
- slf4j-reload4j
-
-
-
-
-
- commons-logging
- commons-logging
- 1.1.3
-
-
- org.slf4j
- slf4j-log4j12
-
-
- org.slf4j
- slf4j-reload4j
-
-
-
-
-
- org.apache.commons
- commons-math3
- 3.4.1
-
-
-
- org.apache.wink
- wink-json4j
- 1.4
-
-
-
- com.fasterxml.jackson.core
- jackson-databind
- 2.12.6.1
-
-
-
- junit
- junit
- 4.13.1
- provided
-
-
-
- org.openjdk.jol
- jol-core
- 0.10
- test
-
-
-
-
- com.github.stephenc.jcip
- jcip-annotations
- 1.0-1
- test
-
-
-
-
- org.codehaus.janino
- janino
- 3.0.16
- provided
-
-
-
- org.antlr
- antlr4
- ${antlr.version}
- provided
-
-
- antlr-runtime
- org.antlr
-
-
-
-
-
- org.antlr
- antlr4-runtime
- ${antlr.version}
-
-
-
- org.apache.derby
- derby
- 10.14.2.0
- provided
-
-
-
- io.netty
- netty-all
- 4.1.68.Final
- provided
-
-
- org.apache.logging.log4j
- log4j-api
-
-
- org.apache.logging.log4j
- log4j-1.2-api
-
-
-
-
-
- net.sf.py4j
- py4j
- 0.10.9
-
-
-
-
- org.apache.maven.plugins
- maven-javadoc-plugin
- 3.2.0
-
-
-
-
- org.apache.maven.plugins
- maven-gpg-plugin
- 1.6
-
-
-
-
- com.google.protobuf
- protobuf-java
- ${protobuf.version}
-
-
-
- com.google.protobuf
- protobuf-java-util
- ${protobuf.version}
-
-
-
- org.apache.maven.plugins
- maven-assembly-plugin
- 3.3.0
-
-
-
- org.slf4j
- slf4j-api
- ${slf4j.version}
-
-
- org.slf4j
- jul-to-slf4j
- ${slf4j.version}
-
-
- org.slf4j
- jcl-over-slf4j
- ${slf4j.version}
-
-
- org.slf4j
- slf4j-reload4j
- ${slf4j.version}
-
-
-
- org.apache.logging.log4j
- log4j-api
- ${log4j.version}
-
-
-
\ No newline at end of file
diff --git a/scripts/staging/SIMD-double-vectors/systemds b/scripts/staging/SIMD-double-vectors/systemds
deleted file mode 100755
index 6c17a8e0faf..00000000000
--- a/scripts/staging/SIMD-double-vectors/systemds
+++ /dev/null
@@ -1,487 +0,0 @@
-#!/usr/bin/env bash
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-
-##############################################################
-# This script is part of the SystemDS binary release. It is
-# meant to work out of the box when unzipping the
-# systemds-.zip (or tbz) file.
-#
-# Make configuration changes here:
-##############################################################
-
-# If not set by env, set to 1 to run spark-submit instead of local java
-# This should be used to run with spark-submit instead of java
-if [[ -z "$SYSDS_DISTRIBUTED" ]]; then
- SYSDS_DISTRIBUTED=0
-fi
-
-# if not set by env, set to 1 to disable setup output of this script
-if [ -z "$SYSDS_QUIET" ]; then
- SYSDS_QUIET=0
-fi
-
-# if not set by env, set to default exec modes
-if [[ -z "$SYSDS_EXEC_MODE" ]]; then
- case "$SYSDS_DISTRIBUTED" in
- 0) SYSDS_EXEC_MODE=singlenode ;;
- *) SYSDS_EXEC_MODE=hybrid ;;
- esac
-fi
-
-# an echo toggle
-print_out()
-{
- if [ $SYSDS_QUIET == 0 ]; then
- echo "$1"
- fi
-}
-
-if [[ -z $SYSTEMDS_ROOT ]] ; then
- SYSTEMDS_ROOT=.
- print_out "SYSTEMDS_ROOT not set defaulting to current dir $(pwd)"
-else
- # construct a relative path
- SYSTEMDS_ROOT=$(realpath --relative-to=. ${SYSTEMDS_ROOT})
-fi;
-
-# when using find, look in the directories in this order
-DIR_SEARCH_ORDER=". $SYSTEMDS_ROOT $SYSTEMDS_ROOT/conf $SYSTEMDS_ROOT/lib $SYSTEMDS_ROOT/src $SYSTEMDS_ROOT/target"
-ordered_find() {
- result=""
- for dir in $(echo "$DIR_SEARCH_ORDER" | tr ' ' '\n') ; do
- if [[ $dir == "$SYSTEMDS_ROOT" ]] || [[ $dir == "." ]]; then
- result=$(find "$dir" -maxdepth 1 -iname "$1" -print -quit)
- if [[ $result != "" ]]; then break; fi
- else
- result=$(find "$dir" -iname "$1" -print -quit 2> /dev/null)
- if [[ $result != "" ]]; then break; fi
- fi
- done
- echo "$result"
-}
-
-if [ -n "$SYSTEMDS_STANDALONE_OPTS" ]; then
- print_out "Overriding SYSTEMDS_STANDALONE_OPTS with env var: $SYSTEMDS_STANDALONE_OPTS"
-else
- # specify parameters to java when running locally here
- SYSTEMDS_STANDALONE_OPTS="-Xmx4g -Xms4g -Xmn400m "
-fi
-
-if [ -n "$SYSTEMDS_REMOTE_DEBUGGING" ]; then
- print_out "Overriding SYSTEMDS_REMOTE_DEBUGGING with env var: $SYSTEMDS_REMOTE_DEBUGGING"
-else
- SYSTEMDS_REMOTE_DEBUGGING=" -agentlib:jdwp=transport=dt_socket,suspend=y,address=8787,server=y "
-fi
-
-# check if log4j config file exists, otherwise unset
-# to run with a non fatal complaint by SystemDS
-if [ -z "$LOG4JPROP" ] ; then
- LOG4JPROP=$(ordered_find "log4j*properties")
-
- if [ -z "${LOG4JPROP}" ]; then
- LOG4JPROP=""
- else
- LOG4JPROP="-Dlog4j.configuration=file:$LOG4JPROP"
- fi
-else
- # L4J was set by env var. Unset if that setting is wrong
- LOG4JPROP2=$(find "$LOG4JPROP")
- if [ -z "${LOG4JPROP2}" ]; then
- LOG4JPROP=""
- else
- LOG4JPROP="-Dlog4j.configuration=file:$LOG4JPROP2"
- fi
-fi
-
-if [ -n "${SYSTEMDS_DISTRIBUTED_OPTS}" ]; then
- print_out "Overriding SYSTEMDS_DISTRIBUTED_OPTS with env var $SYSTEMDS_DISTRIBUTED_OPTS"
-else
- # specify parameters to pass to spark-submit when running on spark here
- SYSTEMDS_DISTRIBUTED_OPTS="\
- --master yarn \
- --deploy-mode client \
- --driver-memory 100g \
- --conf spark.driver.extraJavaOptions=\"-Xms100g -Xmn10g -Dlog4j.configuration=file:$LOG4JPROP\" \
- --conf spark.executor.extraJavaOptions=\"-Dlog4j.configuration=file:$LOG4JPROP\" \
- --conf spark.executor.heartbeatInterval=100s \
- --files $LOG4JPROP \
- --conf spark.network.timeout=512s \
- --num-executors 4 \
- --executor-memory 64 \
- --executor-cores 16 "
-fi
-
-
-##############################################################
-# No need to touch the content below. These commands launch
-# SystemDS based on the settings above.
-##############################################################
-
-
-#-------------------------------------------------------------
-# some helper functions
-
-# error help print
-PRINT_SYSDS_HELP=0
-function printUsage {
-cat << EOF
-
-Usage: $0 [-r] [SystemDS.jar] [-f] [arguments] [-help]
-
- SystemDS.jar : Specify a custom SystemDS.jar file (this will be prepended
- to the classpath
- or fed to spark-submit
- -r : Spawn a debug server for remote debugging (standalone and
- spark driver only atm). Default port is 8787 - change within
- this script if necessary. See SystemDS documentation on how
- to attach a remote debugger.
- -f : Optional prefix to the dml-filename for consistency with
- previous behavior dml-filename : The script file to run.
- This is mandatory unless running as a federated worker
- (see below).
- arguments : The arguments specified after the DML script are passed to
- SystemDS. Specify parameters that need to go to
- java/spark-submit by editing this run script.
- -help : Print this usage message and SystemDS parameter info
-
-Worker Usage: $0 [-r] WORKER [SystemDS.jar] [arguments] [-help]
-
- port : The port to open for the federated worker.
-
-Set custom launch configuration by setting/editing SYSTEMDS_STANDALONE_OPTS
-and/or SYSTEMDS_DISTRIBUTED_OPTS.
-
-Set the environment variable SYSDS_DISTRIBUTED=1 to run spark-submit instead of
-local java Set SYSDS_QUIET=1 to omit extra information printed by this run
-script.
-
-EOF
-if [ ${PRINT_SYSDS_HELP} -eq 0 ]; then
- exit 0
-fi
-}
-
-# print an error if no argument is supplied.
-if [ -z "$1" ] ; then
- echo "Wrong Usage. Add -help for additional parameters.";
- echo ""
- printUsage;
-fi
-
-#This loop handles the parameters to the run-script, not the ones passed to SystemDS.
-#To not confuse getopts with SystemDS parameters, only the first two params are considered
-#here. If more run-script params are needed, adjust the next line accordingly
-while getopts ":hr:f:" options "$1$2"; do
- case $options in
- h ) echo "Help requested. Will exit after extended usage message!"
- PRINT_SYSDS_HELP=1
- printUsage
- break
- ;;
- \? ) echo "Unknown parameter -$OPTARG"
- printUsage
- exit
- ;;
- f )
- # silently remove -f (this variant is triggered if there's no
- # jar file or WORKER as first parameter)
- if echo "$OPTARG" | grep -qi "dml"; then
- break
- else
- print_out "No DML Script found after -f option."
- fi
- ;;
- r )
- print_out "Spawning server for remote debugging"
- if [ $SYSDS_DISTRIBUTED == 0 ]; then
- SYSTEMDS_STANDALONE_OPTS=${SYSTEMDS_STANDALONE_OPTS}${SYSTEMDS_REMOTE_DEBUGGING}
- else
- SYSTEMDS_DISTRIBUTED_OPTS=${SYSTEMDS_DISTRIBUTED_OPTS}${SYSTEMDS_REMOTE_DEBUGGING}
- fi
- shift # remove -r from positional arguments
- ;;
- * )
- print_out "Error: Unexpected error while processing options;"
- printUsage
- exit
- esac
-done
-
-# Peel off first and/or second argument so that $@ contains arguments to DML script
-if echo "$1" | grep -q "jar"; then
- SYSTEMDS_JAR_FILE=$1
- shift
- # handle optional '-f' before DML file (for consistency)
- if echo "$1" | grep -q "\-f"; then
- shift
- SCRIPT_FILE=$1
- shift
- else
- SCRIPT_FILE=$1
- shift
- fi
-elif echo "$1" | grep -q "WORKER"; then
- WORKER=1
- shift
- if echo "$1" | grep -q "jar"; then
- SYSTEMDS_JAR_FILE=$1
- shift
- fi
- PORT=$1
- re='^[0-9]+$'
- if ! [[ $PORT =~ $re ]] ; then
- echo "error: Port is not a number"
- printUsage
- fi
- shift
-elif echo "$1" | grep -q "FEDMONITORING"; then
- FEDMONITORING=1
- shift
- if echo "$1" | grep -q "jar"; then
- SYSTEMDS_JAR_FILE=$1
- shift
- fi
- PORT=$1
- re='^[0-9]+$'
- if ! [[ $PORT =~ $re ]] ; then
- echo "error: Port is not a number"
- printUsage
- fi
- shift
-else
- # handle optional '-f' before DML file (for consistency)
- if echo "$1" | grep -q "\-f"; then
- shift
- SCRIPT_FILE=$1
- shift
- else
- SCRIPT_FILE=$1
- shift
- fi
-fi
-
-if [ -z "$WORKER" ] ; then
- WORKER=0
-fi
-
-if [ -z "$FEDMONITORING" ] ; then
- FEDMONITORING=0
-fi
-
-# find me a SystemDS jar file to run
-if [ -z "$SYSTEMDS_JAR_FILE" ];then
- SYSTEMDS_JAR_FILE=$(ordered_find "systemds.jar")
- if [ -z "$SYSTEMDS_JAR_FILE" ];then
- SYSTEMDS_JAR_FILE=$(ordered_find "systemds-?.?.?.jar")
- if [ -z "$SYSTEMDS_JAR_FILE" ];then
- SYSTEMDS_JAR_FILE=$(ordered_find "systemds-?.?.?-SNAPSHOT.jar")
- fi
- fi
-else
- print_out "Using user supplied systemds jar file $SYSTEMDS_JAR_FILE"
-fi
-
-if [[ "$*" == *-config* ]]; then
-# override config file from env var if given as parameter to SystemDS
- read -r -d '' -a myArray < <( echo "$@" )
- INDEX=0
- for i in "${myArray[@]}"; do
- if [[ ${myArray[INDEX]} == *-config* ]]; then
- if [ -f "${myArray[((INDEX+1))]}" ]; then
- CONFIG_FILE="${myArray[((INDEX+1))]}"
- else
- echo Warning! Passed config file "${myArray[((INDEX+1))]}" does not exist.
- fi
- # remove -config
- unset 'myArray[INDEX]'
-
- # remove -config param if not starting with -
- if [[ "${myArray[((INDEX+1))]:0:1}" != "-" ]]; then
- unset 'myArray[((INDEX+1))]'
- fi
- # setting the script arguments without the passed -config for further processing
- set -- "${myArray[@]}"
- break;
- fi
- # debug print array item
- #echo "${myArray[INDEX]}"
- (( INDEX=INDEX+1 ))
- done
-
- if [ -f "$CONFIG_FILE" ] ; then
- CONFIG_FILE="-config $CONFIG_FILE"
- else
- CONFIG_FILE=""
- fi
-elif [ -z "$CONFIG_FILE" ] ; then
- # same as above: set config file param if the file exists
- CONFIG_FILE=$(ordered_find "SystemDS-config-defaults.xml")
- if [ -z "$CONFIG_FILE" ]; then
- CONFIG_FILE=$(ordered_find "SystemDS-config.xml")
- fi
- if [ -z "$CONFIG_FILE" ]; then
- CONFIG_FILE=""
- else
- CONFIG_FILE="-config $CONFIG_FILE"
- fi
-else
- # CONFIG_FILE was set by env var. Unset if that setting is wrong
- if [ -f "${CONFIG_FILE}" ]; then
- CONFIG_FILE="-config $CONFIG_FILE"
- else
- CONFIG_FILE=""
- fi
-fi
-
-# override exec mode if given as parameter to SystemDS (e.g. -exec singlenode)
-read -r -d '' -a myArray < <( echo "$@" )
-INDEX=0
-for i in "${myArray[@]}"; do
- if [[ "$i" == *-exec* ]]; then
- SYSDS_EXEC_MODE="${myArray[((INDEX+1))]}"
- break;
- fi
- (( INDEX=INDEX+1 ))
-done
-
-if [ $SYSDS_DISTRIBUTED -ne 0 ] && [[ $SYSDS_EXEC_MODE == "singlenode" ]]; then
- echo "Error: Can not run on Spark with execution mode singlenode"
- exit 1
-fi
-
-# find absolute path to hadoop home in SYSTEMDS_ROOT
-if [ -z "$HADOOP_HOME" ]; then
- HADOOP_HOME=$(realpath "$(find "$SYSTEMDS_ROOT" -iname hadoop | tail -n 1 )")
- export HADOOP_HOME
-fi
-# add hadoop home to path and lib path for loading hadoop jni
-HADOOP_REL=$(realpath --relative-to=. "$HADOOP_HOME")
-
-# default directory separator unix style
-DIR_SEP=/
-# detect operating system to set correct path separator
-if [ "$OSTYPE" == "win32" ] || [ "$OSTYPE" == "msys" ] || [ "$OSTYPE" == "cygwin" ]; then
- PATH_SEP=\;
- DIR_SEP=\\
- HADOOP_REL="${HADOOP_REL////\\}"
-else
- PATH_SEP=:
-fi
-
-# make the jar path relative to skip issues with Windows paths
-JARNAME=$(basename "$SYSTEMDS_JAR_FILE")
-
-# relative path to jar file
-SYSTEMDS_JAR_FILE=$(realpath --relative-to=. "$(dirname "$SYSTEMDS_JAR_FILE")")${DIR_SEP}${JARNAME}
-
-NATIVE_LIBS="$SYSTEMDS_ROOT${DIR_SEP}target${DIR_SEP}classes${DIR_SEP}lib"
-export PATH=${HADOOP_REL}${DIR_SEP}bin${PATH_SEP}${PATH}${PATH_SEP}$NATIVE_LIBS
-export LD_LIBRARY_PATH=${HADOOP_REL}${DIR_SEP}bin${PATH_SEP}${LD_LIBRARY_PATH}
-
-# set java class path
-CLASSPATH="${SYSTEMDS_JAR_FILE}${PATH_SEP} \
- ${SYSTEMDS_ROOT}${DIR_SEP}lib${DIR_SEP}*${PATH_SEP} \
- ${SYSTEMDS_ROOT}${DIR_SEP}target${DIR_SEP}lib${DIR_SEP}*"
-# trim whitespace (introduced by the line breaks above)
-CLASSPATH=$(echo "${CLASSPATH}" | tr -d '[:space:]')
-
-if [ $PRINT_SYSDS_HELP == 1 ]; then
- echo "----------------------------------------------------------------------"
- echo "Further help on SystemDS arguments:"
- java -cp "$CLASSPATH" org.apache.sysds.api.DMLScript -help
- exit 1
-fi
-
-print_out "###############################################################################"
-print_out "# SYSTEMDS_ROOT= $SYSTEMDS_ROOT"
-print_out "# SYSTEMDS_JAR_FILE= $SYSTEMDS_JAR_FILE"
-print_out "# SYSDS_EXEC_MODE= $SYSDS_EXEC_MODE"
-print_out "# CONFIG_FILE= $CONFIG_FILE"
-print_out "# LOG4JPROP= $LOG4JPROP"
-print_out "# CLASSPATH= $CLASSPATH"
-print_out "# HADOOP_HOME= $HADOOP_HOME"
-
-#build the command to run
-if [ $WORKER == 1 ]; then
- print_out "#"
- print_out "# starting Federated worker on port $PORT"
- print_out "###############################################################################"
- CMD=" \
- java $SYSTEMDS_STANDALONE_OPTS \
- -cp $CLASSPATH \
- $LOG4JPROP \
- org.apache.sysds.api.DMLScript \
- -w $PORT \
- $CONFIG_FILE \
- $*"
- print_out "Executing command: $CMD"
- print_out ""
-
-elif [ "$FEDMONITORING" == 1 ]; then
- print_out "#"
- print_out "# starting Federated backend monitoring on port $PORT"
- print_out "###############################################################################"
- CMD=" \
- java $SYSTEMDS_STANDALONE_OPTS \
- -cp $CLASSPATH \
- $LOG4JPROP \
- org.apache.sysds.api.DMLScript \
- -fedMonitoring $PORT \
- $CONFIG_FILE \
- $*"
- print_out "Executing command: $CMD"
- print_out ""
-
-elif [ $SYSDS_DISTRIBUTED == 0 ]; then
- print_out "#"
- print_out "# Running script $SCRIPT_FILE locally with opts: $*"
- print_out "###############################################################################"
- CMD=" \
- java $SYSTEMDS_STANDALONE_OPTS \
- -cp $CLASSPATH \
- $LOG4JPROP \
- --add-modules=jdk.incubator.vector \
- org.apache.sysds.api.DMLScript \
- -f $SCRIPT_FILE \
- -exec $SYSDS_EXEC_MODE \
- $CONFIG_FILE \
- $*"
- print_out "Executing command: $CMD"
- print_out ""
-else
- print_out "#"
- print_out "# Running script $SCRIPT_FILE distributed with opts: $*"
- print_out "###############################################################################"
- export SPARK_MAJOR_VERSION=2
- CMD=" \
- spark-submit $SYSTEMDS_DISTRIBUTED_OPTS \
- $SYSTEMDS_JAR_FILE \
- -f $SCRIPT_FILE \
- -exec $SYSDS_EXEC_MODE \
- $CONFIG_FILE \
- $*"
- print_out "Executing command: $CMD"
- print_out ""
-fi
-
-# run
-eval "$CMD"
diff --git a/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixMult.java b/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixMult.java
index a9f4beaed29..af702cb7fad 100644
--- a/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixMult.java
+++ b/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixMult.java
@@ -3717,26 +3717,22 @@ public static double dotProduct( double[] a, double[] b, int ai, int bi, final i
public static double dotProduct( double[] a, double[] b, int[] aix, int ai, final int bi, final int len )
{
double val = 0;
- final int bn = len%8;
+ final int bn = len%vLen;
//compute rest
for( int i = ai; i < ai+bn; i++ )
val += a[ i ] * b[ bi+aix[i] ];
- //unrolled 8-block (for better instruction-level parallelism)
- for( int i = ai+bn; i < ai+len; i+=8 )
+ //unrolled vLen-block (for better instruction-level parallelism)
+ for( int i = ai+bn; i < ai+len; i+=vLen)
{
//read 64B cacheline of a
//read 64B of b via 'gather'
//compute cval' = sum(a * b) + cval
- val += a[ i+0 ] * b[ bi+aix[i+0] ]
- + a[ i+1 ] * b[ bi+aix[i+1] ]
- + a[ i+2 ] * b[ bi+aix[i+2] ]
- + a[ i+3 ] * b[ bi+aix[i+3] ]
- + a[ i+4 ] * b[ bi+aix[i+4] ]
- + a[ i+5 ] * b[ bi+aix[i+5] ]
- + a[ i+6 ] * b[ bi+aix[i+6] ]
- + a[ i+7 ] * b[ bi+aix[i+7] ];
+ var aVec = DoubleVector.fromArray(SPECIES, a, i);
+ var bVec = DoubleVector.fromArray(SPECIES, b, bi, aix, i);
+ val += aVec.mul(bVec).reduceLanes(VectorOperators.ADD);
+
}
//scalar result