From 1a850f961345b642d5b12f2c7cc66c5e0c7b73f6 Mon Sep 17 00:00:00 2001 From: Sebastian Baunsgaard Date: Thu, 22 May 2025 23:14:36 +0200 Subject: [PATCH] [SYSTEMDS-3874] Initial vector gather and remove vector staging Co-authored-by: Kevin Innerebner --- .../SIMD-double-vectors/LibMatrixMult.java | 4445 ----------------- scripts/staging/SIMD-double-vectors/README.md | 45 - .../staging/SIMD-double-vectors/pom.xml.tmp | 1336 ----- scripts/staging/SIMD-double-vectors/systemds | 487 -- .../runtime/matrix/data/LibMatrixMult.java | 18 +- 5 files changed, 7 insertions(+), 6324 deletions(-) delete mode 100644 scripts/staging/SIMD-double-vectors/LibMatrixMult.java delete mode 100644 scripts/staging/SIMD-double-vectors/README.md delete mode 100644 scripts/staging/SIMD-double-vectors/pom.xml.tmp delete mode 100755 scripts/staging/SIMD-double-vectors/systemds diff --git a/scripts/staging/SIMD-double-vectors/LibMatrixMult.java b/scripts/staging/SIMD-double-vectors/LibMatrixMult.java deleted file mode 100644 index e1e3a640e43..00000000000 --- a/scripts/staging/SIMD-double-vectors/LibMatrixMult.java +++ /dev/null @@ -1,4445 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -package org.apache.sysds.runtime.matrix.data; - -import java.util.ArrayList; -import java.util.Arrays; -import java.util.List; -import java.util.concurrent.Callable; -import java.util.concurrent.ExecutionException; -import java.util.concurrent.ExecutorService; -import java.util.concurrent.Future; -import java.util.stream.IntStream; - -import jdk.incubator.vector.DoubleVector; -import jdk.incubator.vector.VectorOperators; -import jdk.incubator.vector.VectorSpecies; -import org.apache.commons.logging.Log; -import org.apache.commons.logging.LogFactory; -import org.apache.commons.math3.util.FastMath; -import org.apache.sysds.hops.OptimizerUtils; -import org.apache.sysds.lops.MapMultChain.ChainType; -import org.apache.sysds.lops.WeightedCrossEntropy.WCeMMType; -import org.apache.sysds.lops.WeightedDivMM.WDivMMType; -import org.apache.sysds.lops.WeightedSigmoid.WSigmoidType; -import org.apache.sysds.lops.WeightedSquaredLoss.WeightsType; -import org.apache.sysds.lops.WeightedUnaryMM.WUMMType; -import org.apache.sysds.runtime.DMLRuntimeException; -import org.apache.sysds.runtime.controlprogram.parfor.stat.InfrastructureAnalyzer; -import org.apache.sysds.runtime.data.DenseBlock; -import org.apache.sysds.runtime.data.DenseBlockFactory; -import org.apache.sysds.runtime.data.SparseBlock; -import org.apache.sysds.runtime.data.SparseBlock.Type; -import org.apache.sysds.runtime.data.SparseBlockCSR; -import org.apache.sysds.runtime.data.SparseBlockFactory; -import org.apache.sysds.runtime.data.SparseBlockMCSR; -import org.apache.sysds.runtime.data.SparseRowScalar; -import org.apache.sysds.runtime.functionobjects.SwapIndex; -import org.apache.sysds.runtime.functionobjects.ValueFunction; -import org.apache.sysds.runtime.matrix.operators.ReorgOperator; -import org.apache.sysds.runtime.util.CommonThreadPool; -import org.apache.sysds.runtime.util.UtilFunctions; -import org.apache.sysds.utils.NativeHelper; - -/** - * MB: Library for matrix multiplications including MM, MV, VV for all - * combinations of dense, sparse, ultrasparse representations and special - * operations such as transpose-self matrix multiplication. - *

- * In general all implementations use internally dense outputs - * for direct access, but change the final result to sparse if necessary. - * The only exceptions are ultra-sparse matrix mult, wsloss and wsigmoid. - */ -public class LibMatrixMult -{ - //internal configuration - private static final boolean LOW_LEVEL_OPTIMIZATION = true; - private static final long MEM_OVERHEAD_THRESHOLD = 2L*1024*1024; //MAX 2 MB - private static final long PAR_MINFLOP_THRESHOLD1 = 2L*1024*1024; //MIN 2 MFLOP - private static final long PAR_MINFLOP_THRESHOLD2 = 128L*1024; //MIN 2 MFLOP - public static final int L2_CACHESIZE = 256 * 1024; //256KB (common size) - public static final int L3_CACHESIZE = 16 * 1024 * 1024; //16MB (common size) - private static final Log LOG = LogFactory.getLog(LibMatrixMult.class.getName()); - - static final VectorSpecies SPECIES = DoubleVector.SPECIES_PREFERRED; - - private LibMatrixMult() { - //prevent instantiation via private constructor - } - - //////////////////////////////// - // public matrix mult interface - //////////////////////////////// - - /** - * Performs a matrix multiplication - * - * All variants use a IKJ access pattern, and internally use dense output. After the - * actual computation, we recompute nnz and check for sparse/dense representation. - * - * @param m1 first matrix - * @param m2 second matrix - * @return ret Matrix Block - */ - public static MatrixBlock matrixMult(MatrixBlock m1, MatrixBlock m2) { - return matrixMult(m1, m2, null, false, 1); - } - - /** - * Performs a matrix multiplication - * - * All variants use a IKJ access pattern, and internally use dense output. After the - * actual computation, we recompute nnz and check for sparse/dense representation. - * - * @param m1 first matrix - * @param m2 second matrix - * @param k maximum parallelism - * @return ret Matrix Block - */ - public static MatrixBlock matrixMult(MatrixBlock m1, MatrixBlock m2, int k) { - return matrixMult(m1, m2, null, false, k); - } - - /** - * Performs a matrix multiplication and stores the result in the output matrix. - * - * All variants use a IKJ access pattern, and internally use dense output. After the - * actual computation, we recompute nnz and check for sparse/dense representation. - * - * @param m1 first matrix - * @param m2 second matrix - * @param ret result matrix - * @return ret Matrix Block - */ - public static MatrixBlock matrixMult(MatrixBlock m1, MatrixBlock m2, MatrixBlock ret) { - return matrixMult(m1, m2, ret, false, 1); - } - - /** - * This method allows one to disabling exam sparsity. This feature is useful if matrixMult is used as an intermediate - * operation (for example: LibMatrixDNN). It makes sense for LibMatrixDNN because the output is internally - * consumed by another dense instruction, which makes repeated conversion to sparse wasteful. - * This should be used in rare cases and if you are unsure, - * use the method 'matrixMult(MatrixBlock m1, MatrixBlock m2, MatrixBlock ret)' instead. - * - * @param m1 first matrix - * @param m2 second matrix - * @param ret result matrix - * @param fixedRet if true, output representation is fixed and nnzs not recomputed - * @return ret Matrix Block - */ - public static MatrixBlock matrixMult(MatrixBlock m1, MatrixBlock m2, MatrixBlock ret, boolean fixedRet) { - return matrixMult(m1, m2, ret, fixedRet, 1); - } - - /** - * Performs a multi-threaded matrix multiplication and stores the result in the output matrix. - * The parameter k (k>=1) determines the max parallelism k' with k'=min(k, vcores, m1.rlen). - * - * @param m1 first matrix - * @param m2 second matrix - * @param ret result matrix - * @param k maximum parallelism - * @return ret Matrix Block - */ - public static MatrixBlock matrixMult(MatrixBlock m1, MatrixBlock m2, MatrixBlock ret, int k) { - return matrixMult(m1, m2, ret, false, k); - } - - /** - * Performs a matrix multiplication and stores the result in the output matrix. - * - * All variants use a IKJ access pattern, and internally use dense output. After the - * actual computation, we recompute nnz and check for sparse/dense representation. - * - * This method allows one to disabling exam sparsity. This feature is useful if matrixMult is used as an intermediate - * operation (for example: LibMatrixDNN). It makes sense for LibMatrixDNN because the output is internally - * consumed by another dense instruction, which makes repeated conversion to sparse wasteful. - * This should be used in rare cases and if you are unsure, - * use the method 'matrixMult(MatrixBlock m1, MatrixBlock m2, MatrixBlock ret)' instead. - * - * The parameter k (k>=1) determines the max parallelism k' with k'=min(k, vcores, m1.rlen). - * - * @param m1 first matrix - * @param m2 second matrix - * @param ret result matrix - * @param fixedRet if true, output representation is fixed and nnzs not recomputed - * @param k maximum parallelism - * @return ret Matrix Block - */ - public static MatrixBlock matrixMult(MatrixBlock m1, MatrixBlock m2, MatrixBlock ret, boolean fixedRet, int k) { - if(m1.isEmptyBlock(false) || m2.isEmptyBlock(false)) - return emptyMatrixMult(m1, m2, ret); - - // Timing time = new Timing(true); - - // pre analysis - boolean m1Perm = m1.isSparsePermutationMatrix(); - boolean ultraSparse = (fixedRet && ret.sparse) || - (!fixedRet && isUltraSparseMatrixMult(m1, m2, m1Perm)); - boolean sparse = !fixedRet && !ultraSparse && !m1Perm - && isSparseOutputMatrixMult(m1, m2); - - // allocate output - if(ret == null) - ret = new MatrixBlock(m1.rlen, m2.clen, ultraSparse | sparse); - else - ret.reset(m1.rlen, m2.clen, ultraSparse | sparse); - ret.allocateBlock(); - - // Detect if we should transpose skinny right side. - boolean tm2 = !fixedRet && checkPrepMatrixMultRightInput(m1,m2); - m2 = prepMatrixMultRightInput(m1, m2, tm2); - - // check for multi-threading - if (!ret.isThreadSafe() - || !satisfiesMultiThreadingConstraints(m1, m2, m1.rlen==1, true, 2, k) - || fixedRet) // Fixed ret not supported in multithreaded execution yet - k = 1; - - if(k <= 1) - singleThreadedMatrixMult(m1, m2, ret, ultraSparse, sparse, tm2, m1Perm, fixedRet); - else - parallelMatrixMult(m1, m2, ret, k, ultraSparse, sparse, tm2, m1Perm); - - //System.out.println("MM "+k+" ("+m1.isInSparseFormat()+","+m1.getNumRows()+","+m1.getNumColumns()+","+m1.getNonZeros()+")x" + - // "("+m2.isInSparseFormat()+","+m2.getNumRows()+","+m2.getNumColumns()+","+m2.getNonZeros()+") in "+time.stop()); - - return ret; - } - - private static void singleThreadedMatrixMult(MatrixBlock m1, MatrixBlock m2, MatrixBlock ret, - boolean ultraSparse, boolean sparse, boolean tm2, boolean m1Perm, boolean fixedRet){ - // prepare row-upper for special cases of vector-matrix - final boolean pm2 = !ultraSparse && checkParMatrixMultRightInputRows(m1, m2, Integer.MAX_VALUE); - final int ru2 = (pm2) ? m2.rlen : m1.rlen; - - // core matrix mult computation - if(ultraSparse && !fixedRet) - matrixMultUltraSparse(m1, m2, ret, m1Perm, 0, ru2); - else if(!m1.sparse && !m2.sparse) - matrixMultDenseDense(m1, m2, ret, tm2, pm2, 0, ru2, 0, m2.clen); - else if(m1.sparse && m2.sparse) - matrixMultSparseSparse(m1, m2, ret, pm2, sparse, 0, ru2); - else if(m1.sparse) - matrixMultSparseDense(m1, m2, ret, pm2, 0, ru2); - else - matrixMultDenseSparse(m1, m2, ret, pm2, 0, ru2); - - // post-processing: nnz/representation - if(!fixedRet) { - if(!ret.sparse) - ret.recomputeNonZeros(); - ret.examSparsity(); - } - } - - private static void parallelMatrixMult(MatrixBlock m1, MatrixBlock m2, MatrixBlock ret, int k, - boolean ultraSparse, boolean sparse, boolean tm2, boolean m1Perm){ - // prepare row-upper for special cases of vector-matrix / matrix-matrix - boolean pm2r = !ultraSparse && !sparse && checkParMatrixMultRightInputRows(m1, m2, k); - boolean pm2c = !ultraSparse && checkParMatrixMultRightInputCols(m1, m2, k, pm2r); - int num = pm2r ? m2.rlen : pm2c ? m2.clen : m1.rlen; - - // core multi-threaded matrix mult computation - // (currently: always parallelization over number of rows) - try { - ExecutorService pool = CommonThreadPool.get(k); - ArrayList tasks = new ArrayList<>(); - ArrayList blklens = UtilFunctions.getBalancedBlockSizesDefault(num, k, (pm2r || pm2c)); - for(int i = 0, lb = 0; i < blklens.size(); lb += blklens.get(i), i++) - tasks.add(new MatrixMultTask(m1, m2, ret, tm2, pm2r, pm2c, m1Perm, sparse, lb, lb + blklens.get(i))); - // execute tasks - List> taskret = pool.invokeAll(tasks); - pool.shutdown(); - // aggregate partial results (nnz, ret for vector/matrix) - ret.nonZeros = 0; // reset after execute - for(Future task : taskret) { - if(pm2r) // guaranteed single block - vectAdd((double[]) task.get(), ret.getDenseBlockValues(), 0, 0, ret.rlen * ret.clen); - else - ret.nonZeros += (Long) task.get(); - } - if(pm2r) - ret.recomputeNonZeros(); - } - catch(Exception ex) { - throw new DMLRuntimeException(ex); - } - - // post-processing (nnz maintained in parallel) - ret.examSparsity(); - } - - public static MatrixBlock emptyMatrixMult(MatrixBlock m1, MatrixBlock m2, MatrixBlock ret){ - final int rl = m1.rlen; - final int cl = m2.clen; - - if(ret == null) - return new MatrixBlock(rl, cl, true); - else { - ret.reset(rl, cl, true); - ret.setNonZeros(0); - ret.cleanupBlock(true, true); - return ret; - } - } - - /** - * Performs a matrix multiplication chain operation of type t(X)%*%(X%*%v) or t(X)%*%(w*(X%*%v)). - * - * All variants use a IKJ access pattern, and internally use dense output. After the - * actual computation, we recompute nnz and check for sparse/dense representation. - * - * @param mX X matrix - * @param mV v matrix - * @param mW w matrix - * @param ret result matrix - * @param ct chain type - */ - public static void matrixMultChain(MatrixBlock mX, MatrixBlock mV, MatrixBlock mW, MatrixBlock ret, ChainType ct) { - //check inputs / outputs (after that mV and mW guaranteed to be dense) - if( mX.isEmptyBlock(false) || (mV.isEmptyBlock(false) && ct!=ChainType.XtXvy) - || (mW !=null && mW.isEmptyBlock(false)) ) { - ret.examSparsity(); //turn empty dense into sparse - return; - } - - //Timing time = new Timing(true); - - //pre-processing: output allocation - ret.sparse = false; - ret.allocateDenseBlock(); - - //core matrix mult chain computation - if( mX.sparse ) - matrixMultChainSparse(mX, mV, mW, ret, ct, 0, mX.rlen); - else - matrixMultChainDense(mX, mV, mW, ret, ct, 0, mX.rlen); - - //post-processing - ret.recomputeNonZeros(); - ret.examSparsity(); - - //System.out.println("MMChain "+ct.toString()+" ("+mX.isInSparseFormat()+","+mX.getNumRows()+","+mX.getNumColumns()+","+mX.getNonZeros()+")x" + - // "("+mV.isInSparseFormat()+","+mV.getNumRows()+","+mV.getNumColumns()+","+mV.getNonZeros()+") in "+time.stop()); - } - - /** - * Performs a parallel matrix multiplication chain operation of type t(X)%*%(X%*%v) or t(X)%*%(w*(X%*%v)). - * The parameter k (k>=1) determines the max parallelism k' with k'=min(k, vcores, m1.rlen). - * - * NOTE: This multi-threaded mmchain operation has additional memory requirements of k*ncol(X)*8bytes - * for partial aggregation. Current max memory: 256KB; otherwise redirectly to sequential execution. - * - * @param mX X matrix - * @param mV v matrix - * @param mW w matrix - * @param ret result matrix - * @param ct chain type - * @param k maximum parallelism - */ - public static void matrixMultChain(MatrixBlock mX, MatrixBlock mV, MatrixBlock mW, MatrixBlock ret, ChainType ct, int k) { - //check inputs / outputs (after that mV and mW guaranteed to be dense) - if( mX.isEmptyBlock(false) || (mV.isEmptyBlock(false) && ct!=ChainType.XtXvy) - || (mW !=null && mW.isEmptyBlock(false)) ) { - ret.examSparsity(); //turn empty dense into sparse - return; - } - - //check temporary memory and too small workload for multi-threading - if( !satisfiesMultiThreadingConstraints(mX, true, true, mX.sparse?2:4, k) ) { - matrixMultChain(mX, mV, mW, ret, ct); - return; - } - - //Timing time = new Timing(true); - - //pre-processing (no need to check isThreadSafe) - ret.sparse = false; - ret.allocateDenseBlock(); - - //core matrix mult chain computation - //(currently: always parallelization over number of rows) - try { - ExecutorService pool = CommonThreadPool.get(k); - ArrayList blklens = UtilFunctions.getBalancedBlockSizesDefault(mX.rlen, k, true); - ArrayList tasks = new ArrayList<>(); - for( int i=0, lb=0; i> taskret = pool.invokeAll(tasks); - pool.shutdown(); - //aggregate partial results and error handling - double[][] a = new double[taskret.size()][]; - for(int i=0; i tasks = new ArrayList<>(); - //load balance via #tasks=2k due to triangular shape - int blklen = (int)(Math.ceil((double)ret.rlen / (2 * k))); - for(int i = 0; i < ret.rlen; i += blklen) - tasks.add(new MatrixMultTransposeTask(m1, ret, leftTranspose, i, Math.min(i+blklen, ret.rlen))); - for( Future rtask : pool.invokeAll(tasks) ) - rtask.get(); - } - catch(Exception ex) { - throw new DMLRuntimeException(ex); - } - finally{ - pool.shutdown(); - } - - //post-processing - long nnz = copyUpperToLowerTriangle(ret); - ret.setNonZeros(nnz); - ret.examSparsity(); - - //System.out.println("TSMM k="+k+" ("+m1.isInSparseFormat()+","+m1.getNumRows()+","+m1.getNumColumns()+","+m1.getNonZeros()+","+leftTranspose+") in "+time.stop()); - } - - public static void matrixMultPermute( MatrixBlock pm1, MatrixBlock m2, MatrixBlock ret1, MatrixBlock ret2 ) { - //check inputs / outputs - if( pm1.isEmptyBlock(false) || m2.isEmptyBlock(false) ) - return; - - //Timing time = new Timing(true); - - //pre-processing - ret1.sparse = (m2.sparse || ret1.sparse); - if( ret1.sparse ) - ret1.allocateSparseRowsBlock(); - else - ret1.allocateDenseBlock(); - - //core permutation mm computation - if( m2.sparse ) - matrixMultPermuteSparse(pm1, m2, ret1, ret2, 0, pm1.rlen); - else if( ret1.sparse ) - matrixMultPermuteDenseSparse(pm1, m2, ret1, ret2, 0, pm1.rlen); - else - matrixMultPermuteDense(pm1, m2, ret1, ret2, 0, pm1.rlen); - - //post-processing - ret1.recomputeNonZeros(); - ret1.examSparsity(); - if( ret2 != null ) { //optional second output - ret2.recomputeNonZeros(); - ret2.examSparsity(); - } - - //System.out.println("PMM Seq ("+pm1.isInSparseFormat()+","+pm1.getNumRows()+","+pm1.getNumColumns()+","+pm1.getNonZeros()+")x" + - // "("+m2.isInSparseFormat()+","+m2.getNumRows()+","+m2.getNumColumns()+","+m2.getNonZeros()+") in "+time.stop()); - } - - public static void matrixMultPermute( MatrixBlock pm1, MatrixBlock m2, MatrixBlock ret1, MatrixBlock ret2, int k) { - //check inputs / outputs - if( pm1.isEmptyBlock(false) || m2.isEmptyBlock(false) ) - return; - - //check no parallelization benefit (fallback to sequential) - if (pm1.rlen == 1) { - matrixMultPermute(pm1, m2, ret1, ret2); - return; - } - - //Timing time = new Timing(true); - - //allocate first output block (second allocated if needed) - ret1.sparse = false; // no need to check isThreadSafe - ret1.allocateDenseBlock(); - - try - { - ExecutorService pool = CommonThreadPool.get(k); - ArrayList tasks = new ArrayList<>(); - int blklen = (int)(Math.ceil((double)pm1.rlen/k)); - for( int i=0; i tasks = new ArrayList<>(); - int blklen = (int)(Math.ceil((double)mX.rlen/k)); - for( int i=0; i> taskret = pool.invokeAll(tasks); - pool.shutdown(); - //aggregate partial results - sumScalarResults(taskret, ret); - } - catch( Exception e ) { - throw new DMLRuntimeException(e); - } - - //add correction for sparse wsloss w/o weight - if( mX.sparse && wt==WeightsType.NONE ) - addMatrixMultWSLossNoWeightCorrection(mU, mV, ret, k); - - //System.out.println("MMWSLoss "+wt.toString()+" k="+k+" ("+mX.isInSparseFormat()+","+mX.getNumRows()+","+mX.getNumColumns()+","+mX.getNonZeros()+")x" + - // "("+mV.isInSparseFormat()+","+mV.getNumRows()+","+mV.getNumColumns()+","+mV.getNonZeros()+") in "+time.stop()); - } - - public static void matrixMultWSigmoid(MatrixBlock mW, MatrixBlock mU, MatrixBlock mV, MatrixBlock ret, WSigmoidType wt) { - //check for empty result - if( mW.isEmptyBlock(false) ) { - ret.examSparsity(); //turn empty dense into sparse - return; - } - - //Timing time = new Timing(true); - - //pre-processing - ret.sparse = mW.sparse; - ret.allocateBlock(); - - //core weighted square sum mm computation - boolean allDense = !mW.sparse && !mU.sparse && !mV.sparse - && !mU.isEmptyBlock() && !mV.isEmptyBlock(); - if( NativeHelper.isNativeLibraryLoaded() && allDense && (mW.rlen == 1 || mW.clen == 1) - && !LibMatrixNative.isMatMultMemoryBound(mU.rlen, mU.clen, mV.rlen) - && mW.getDenseBlock().isContiguous() && mU.getDenseBlock().isContiguous() && mV.getDenseBlock().isContiguous() ) - matrixMultWSigmoidDenseNative(mW, mU, mV, ret, wt); - else if( allDense ) - matrixMultWSigmoidDense(mW, mU, mV, ret, wt, 0, mW.rlen); - else if( mW.sparse && !mU.sparse && !mV.sparse && !mU.isEmptyBlock() && !mV.isEmptyBlock()) - matrixMultWSigmoidSparseDense(mW, mU, mV, ret, wt, 0, mW.rlen); - else - matrixMultWSigmoidGeneric(mW, mU, mV, ret, wt, 0, mW.rlen); - - //post-processing - ret.recomputeNonZeros(); - ret.examSparsity(); - - //System.out.println("MMWSig "+wt.toString()+" ("+mW.isInSparseFormat()+","+mW.getNumRows()+","+mW.getNumColumns()+","+mW.getNonZeros()+")x" + - // "("+mV.isInSparseFormat()+","+mV.getNumRows()+","+mV.getNumColumns()+","+mV.getNonZeros()+") in "+time.stop()); - } - - public static void matrixMultWSigmoid(MatrixBlock mW, MatrixBlock mU, MatrixBlock mV, MatrixBlock ret, WSigmoidType wt, int k) { - //check for empty result - if( mW.isEmptyBlock(false) ) { - ret.examSparsity(); //turn empty dense into sparse - return; - } - - //check no parallelization benefit (fallback to sequential) - if (mW.rlen == 1 || !MatrixBlock.isThreadSafe(mW.sparse)) { - matrixMultWSigmoid(mW, mU, mV, ret, wt); - return; - } - - //Timing time = new Timing(true); - - //pre-processing - ret.sparse = mW.sparse; - ret.allocateBlock(); - - try - { - ExecutorService pool = CommonThreadPool.get(k); - ArrayList tasks = new ArrayList<>(); - int blklen = (int)(Math.ceil((double)mW.rlen/k)); - for( int i=0; i> taskret = pool.invokeAll(tasks); - pool.shutdown(); - //aggregate partial nnz and check for errors - ret.nonZeros = 0; //reset after execute - for( Future task : taskret ) - ret.nonZeros += task.get(); - } - catch (Exception e) { - throw new DMLRuntimeException(e); - } - - //post-processing (nnz maintained in parallel) - ret.examSparsity(); - - //System.out.println("MMWSig "+wt.toString()+" k="+k+" ("+mW.isInSparseFormat()+","+mW.getNumRows()+","+mW.getNumColumns()+","+mW.getNonZeros()+")x" + - // "("+mV.isInSparseFormat()+","+mV.getNumRows()+","+mV.getNumColumns()+","+mV.getNonZeros()+") in "+time.stop() + "."); - } - - /** - * NOTE: This operation has limited NaN support, which is acceptable because all our sparse-safe operations - * have only limited NaN support. If this is not intended behavior, please disable the rewrite. In detail, - * this operator will produce for W/(U%*%t(V)) a zero intermediate for each zero in W (even if UVij is zero - * which would give 0/0=NaN) but INF/-INF for non-zero entries in V where the corresponding cell in (Y%*%X) - * is zero. - * - * @param mW matrix W - * @param mU matrix U - * @param mV matrix V - * @param mX matrix X - * @param ret result type - * @param wt weighted divide matrix multiplication type - */ - public static void matrixMultWDivMM(MatrixBlock mW, MatrixBlock mU, MatrixBlock mV, MatrixBlock mX, MatrixBlock ret, WDivMMType wt) { - //check for empty result - if( mW.isEmptyBlock(false) - || (wt.isLeft() && mU.isEmptyBlock(false)) - || (wt.isRight() && mV.isEmptyBlock(false)) - || (wt.isBasic() && mW.isEmptyBlock(false))) { - ret.examSparsity(); //turn empty dense into sparse - return; - } - - //Timing time = new Timing(true); - - //pre-processing - ret.sparse = wt.isBasic()?mW.sparse:false; - ret.allocateBlock(); - - //core weighted div mm computation - boolean scalarX = wt.hasScalar(); - if( !mW.sparse && !mU.sparse && !mV.sparse && (mX==null || !mX.sparse || scalarX) && !mU.isEmptyBlock() && !mV.isEmptyBlock() ) - matrixMultWDivMMDense(mW, mU, mV, mX, ret, wt, 0, mW.rlen, 0, mW.clen); - else if( mW.sparse && !mU.sparse && !mV.sparse && (mX==null || mX.sparse || scalarX) && !mU.isEmptyBlock() && !mV.isEmptyBlock()) - matrixMultWDivMMSparseDense(mW, mU, mV, mX, ret, wt, 0, mW.rlen, 0, mW.clen); - else - matrixMultWDivMMGeneric(mW, mU, mV, mX, ret, wt, 0, mW.rlen, 0, mW.clen); - - //post-processing - ret.recomputeNonZeros(); - ret.examSparsity(); - - //System.out.println("MMWDiv "+wt.toString()+" ("+mW.isInSparseFormat()+","+mW.getNumRows()+","+mW.getNumColumns()+","+mW.getNonZeros()+")x" + - // "("+mV.isInSparseFormat()+","+mV.getNumRows()+","+mV.getNumColumns()+","+mV.getNonZeros()+") in "+time.stop()); - } - - /** - * NOTE: This operation has limited NaN support, which is acceptable because all our sparse-safe operations - * have only limited NaN support. If this is not intended behavior, please disable the rewrite. In detail, - * this operator will produce for W/(U%*%t(V)) a zero intermediate for each zero in W (even if UVij is zero - * which would give 0/0=NaN) but INF/-INF for non-zero entries in V where the corresponding cell in (Y%*%X) - * is zero. - * - * @param mW matrix W - * @param mU matrix U - * @param mV matrix V - * @param mX matrix X - * @param ret result matrix - * @param wt weighted divide matrix multiplication type - * @param k maximum parallelism - */ - public static void matrixMultWDivMM(MatrixBlock mW, MatrixBlock mU, MatrixBlock mV, MatrixBlock mX, MatrixBlock ret, WDivMMType wt, int k) { - //check for empty result - if( mW.isEmptyBlock(false) - || (wt.isLeft() && mU.isEmptyBlock(false)) - || (wt.isRight() && mV.isEmptyBlock(false)) - || (wt.isBasic() && mW.isEmptyBlock(false))) { - ret.examSparsity(); //turn empty dense into sparse - return; - } - - //Timing time = new Timing(true); - - //pre-processing - ret.sparse = wt.isBasic()?mW.sparse:false; - ret.allocateBlock(); - - if (!ret.isThreadSafe()){ - matrixMultWDivMM(mW, mU, mV, mX, ret, wt); - return; - } - - try - { - ExecutorService pool = CommonThreadPool.get(k); - ArrayList tasks = new ArrayList<>(); - //create tasks (for wdivmm-left, parallelization over columns; - //for wdivmm-right, parallelization over rows; both ensure disjoint results) - if( wt.isLeft() ) { - int blklen = (int)(Math.ceil((double)mW.clen/k)); - for( int j=0; j> taskret = pool.invokeAll(tasks); - pool.shutdown(); - //aggregate partial nnz and check for errors - ret.nonZeros = 0; //reset after execute - for( Future task : taskret ) - ret.nonZeros += task.get(); - } - catch (Exception e) { - throw new DMLRuntimeException(e); - } - - //post-processing - ret.examSparsity(); - - //System.out.println("MMWDiv "+wt.toString()+" k="+k+" ("+mW.isInSparseFormat()+","+mW.getNumRows()+","+mW.getNumColumns()+","+mW.getNonZeros()+")x" + - // "("+mV.isInSparseFormat()+","+mV.getNumRows()+","+mV.getNumColumns()+","+mV.getNonZeros()+") in "+time.stop()); - } - - public static void matrixMultWCeMM(MatrixBlock mW, MatrixBlock mU, MatrixBlock mV, double eps, MatrixBlock ret, WCeMMType wt) { - //check for empty result - if( mW.isEmptyBlock(false) ) { - ret.examSparsity(); //turn empty dense into sparse - return; - } - - //Timing time = new Timing(true); - - //pre-processing - ret.sparse = false; - ret.allocateDenseBlock(); - - //core weighted cross entropy mm computation - if( !mW.sparse && !mU.sparse && !mV.sparse && !mU.isEmptyBlock() && !mV.isEmptyBlock() ) - matrixMultWCeMMDense(mW, mU, mV, eps, ret, wt, 0, mW.rlen); - else if( mW.sparse && !mU.sparse && !mV.sparse && !mU.isEmptyBlock() && !mV.isEmptyBlock()) - matrixMultWCeMMSparseDense(mW, mU, mV, eps, ret, wt, 0, mW.rlen); - else - matrixMultWCeMMGeneric(mW, mU, mV, eps, ret, wt, 0, mW.rlen); - - //System.out.println("MMWCe "+wt.toString()+" ("+mW.isInSparseFormat()+","+mW.getNumRows()+","+mW.getNumColumns()+","+mW.getNonZeros()+")x" + - // "("+mV.isInSparseFormat()+","+mV.getNumRows()+","+mV.getNumColumns()+","+mV.getNonZeros()+") in "+time.stop()); - } - - public static void matrixMultWCeMM(MatrixBlock mW, MatrixBlock mU, MatrixBlock mV, double eps, MatrixBlock ret, WCeMMType wt, int k) { - //check for empty result - if( mW.isEmptyBlock(false) ) { - ret.examSparsity(); //turn empty dense into sparse - return; - } - - //Timing time = new Timing(true); - - //pre-processing (no need to check isThreadSafe) - ret.sparse = false; - ret.allocateDenseBlock(); - - try - { - ExecutorService pool = CommonThreadPool.get(k); - ArrayList tasks = new ArrayList<>(); - int blklen = (int)(Math.ceil((double)mW.rlen/k)); - for( int i=0; i> taskret = pool.invokeAll(tasks); - pool.shutdown(); - //aggregate partial results - sumScalarResults(taskret, ret); - } - catch( Exception e ) { - throw new DMLRuntimeException(e); - } - - //System.out.println("MMWCe "+wt.toString()+" k="+k+" ("+mW.isInSparseFormat()+","+mW.getNumRows()+","+mW.getNumColumns()+","+mW.getNonZeros()+")x" + - // "("+mV.isInSparseFormat()+","+mV.getNumRows()+","+mV.getNumColumns()+","+mV.getNonZeros()+") in "+time.stop()); - } - - public static void matrixMultWuMM(MatrixBlock mW, MatrixBlock mU, MatrixBlock mV, MatrixBlock ret, WUMMType wt, ValueFunction fn) { - //check for empty result - if( mW.isEmptyBlock(false) ) { - ret.examSparsity(); //turn empty dense into sparse - return; - } - - //Timing time = new Timing(true); - - //pre-processing - ret.sparse = mW.sparse; - ret.allocateBlock(); - - //core weighted square sum mm computation - if( !mW.sparse && !mU.sparse && !mV.sparse && !mU.isEmptyBlock() && !mV.isEmptyBlock() ) - matrixMultWuMMDense(mW, mU, mV, ret, wt, fn, 0, mW.rlen); - else if( mW.sparse && !mU.sparse && !mV.sparse && !mU.isEmptyBlock() && !mV.isEmptyBlock()) - matrixMultWuMMSparseDense(mW, mU, mV, ret, wt, fn, 0, mW.rlen); - else - matrixMultWuMMGeneric(mW, mU, mV, ret, wt, fn, 0, mW.rlen); - - //post-processing - ret.recomputeNonZeros(); - ret.examSparsity(); - - //System.out.println("MMWu "+wt.toString()+" ("+mW.isInSparseFormat()+","+mW.getNumRows()+","+mW.getNumColumns()+","+mW.getNonZeros()+")x" + - // "("+mV.isInSparseFormat()+","+mV.getNumRows()+","+mV.getNumColumns()+","+mV.getNonZeros()+") in "+time.stop()); - } - - public static void matrixMultWuMM(MatrixBlock mW, MatrixBlock mU, MatrixBlock mV, MatrixBlock ret, WUMMType wt, ValueFunction fn, int k) { - //check for empty result - if( mW.isEmptyBlock(false) ) { - ret.examSparsity(); //turn empty dense into sparse - return; - } - - //check no parallelization benefit (fallback to sequential) - if (mW.rlen == 1 || !MatrixBlock.isThreadSafe(mW.sparse)) { - matrixMultWuMM(mW, mU, mV, ret, wt, fn); - return; - } - - //Timing time = new Timing(true); - - //pre-processing - ret.sparse = mW.sparse; - ret.allocateBlock(); - - try - { - ExecutorService pool = CommonThreadPool.get(k); - ArrayList tasks = new ArrayList<>(); - int blklen = (int)(Math.ceil((double)mW.rlen/k)); - for( int i=0; i> taskret = pool.invokeAll(tasks); - pool.shutdown(); - //aggregate partial nnz and check for errors - ret.nonZeros = 0; //reset after execute - for( Future task : taskret ) - ret.nonZeros += task.get(); - } - catch (Exception e) { - throw new DMLRuntimeException(e); - } - - //post-processing (nnz maintained in parallel) - ret.examSparsity(); - - //System.out.println("MMWu "+wt.toString()+" k="+k+" ("+mW.isInSparseFormat()+","+mW.getNumRows()+","+mW.getNumColumns()+","+mW.getNonZeros()+")x" + - // "("+mV.isInSparseFormat()+","+mV.getNumRows()+","+mV.getNumColumns()+","+mV.getNonZeros()+") in "+time.stop() + "."); - } - - ////////////////////////////////////////// - // optimized matrix mult implementation // - ////////////////////////////////////////// - - private static void matrixMultDenseDense(MatrixBlock m1, MatrixBlock m2, MatrixBlock ret, boolean tm2, boolean pm2, int rl, int ru, int cl, int cu) { - DenseBlock a = m1.getDenseBlock(); - DenseBlock b = m2.getDenseBlock(); - DenseBlock c = ret.getDenseBlock(); - final int m = m1.rlen; - final int n = m2.clen; - final int cd = m1.clen; - - if( LOW_LEVEL_OPTIMIZATION ) { - if( m==1 && n==1 ) { //DOT PRODUCT - double[] avals = a.valuesAt(0); - double[] bvals = b.valuesAt(0); - c.set(0, 0, dotProduct(avals, bvals, cd)); - } - else if( n>1 && cd == 1 ) { //OUTER PRODUCT - double[] avals = a.valuesAt(0); - double[] bvals = b.valuesAt(0); - for( int i=rl; i < ru; i++) { - double[] cvals = c.values(i); - int cix = c.pos(i); - if( avals[i] == 1 ) - System.arraycopy(bvals, 0, cvals, cix, n); - else if( avals[i] != 0 ) - vectMultiplyWrite(avals[i], bvals, cvals, 0, cix, n); - else - Arrays.fill(cvals, cix, cix+n, 0); - } - } - else if( n==1 && cd == 1 ) { //VECTOR-SCALAR - double[] avals = a.valuesAt(0); - double[] cvals = c.valuesAt(0); - vectMultiplyWrite(b.get(0,0), avals, cvals, rl, rl, ru-rl); - } - else if( n==1 && cd<=2*1024 ) { //MATRIX-VECTOR (short rhs) - matrixMultDenseDenseMVShortRHS(a, b, c, cd, rl, ru); - } - else if( n==1 ) { //MATRIX-VECTOR (tall rhs) - matrixMultDenseDenseMVTallRHS(a, b, c, cd, rl, ru); - } - else if( pm2 && m==1 ) { //VECTOR-MATRIX - matrixMultDenseDenseVM(a, b, c, n, cd, rl, ru); - } - else if( pm2 && m<=16 ) { //MATRIX-MATRIX (short lhs) - matrixMultDenseDenseMMShortLHS(a, b, c, m, n, cd, rl, ru); - } - else if( tm2 ) { //MATRIX-MATRIX (skinny rhs) - matrixMultDenseDenseMMSkinnyRHS(a, b, c, m2.rlen, cd, rl, ru); - } - else { //MATRIX-MATRIX - matrixMultDenseDenseMM(a, b, c, n, cd, rl, ru, cl, cu); - } - } - else { - for( int i = rl; i < ru; i++) { - double[] avals = a.values(i); - double[] cvals = c.values(i); - int aix = a.pos(i), cix = c.pos(i); - for( int k = 0; k < cd; k++) { - double val = avals[aix + k]; - if( val != 0 ) { - double[] bvals = b.values(k); - int bix = b.pos(k); - for( int j = 0; j < n; j++) - cvals[cix+j] += val * bvals[bix+j]; - } - } - } - } - } - - private static void matrixMultDenseDenseMVShortRHS(DenseBlock a, DenseBlock b, DenseBlock c, int cd, int rl, int ru) { - double[] bvals = b.valuesAt(0); - double[] cvals = c.valuesAt(0); - for( int i=rl; i < ru; i++ ) - cvals[i] = dotProduct(a.values(i), bvals, a.pos(i), 0, cd); - } - - private static void matrixMultDenseDenseMVTallRHS(DenseBlock a, DenseBlock b, DenseBlock c, int cd, int rl, int ru) { - final int blocksizeI = 32; - final int blocksizeK = 2*1024; //16KB vector blocks (L1) - double[] bvals = b.valuesAt(0); - double[] cvals = c.valuesAt(0); - for( int bi=rl; bi n && cd > 64 && n < 64 - //however, explicit flag required since dimension change m2 - for( int i=rl; i < ru; i++ ) { - double[] avals = a.values(i), cvals = c.values(i); - int aix = a.pos(i), cix = c.pos(i); - for( int j=0; j=0) ? rlix : alen; - - if( b.isContiguous() ) { - double[] bvals = b.valuesAt(0); - for( int k=rlix; k=0) ? apos+k1 : apos+alen; - int k2 = (ru==cd) ? alen : a.posFIndexGTE(i, ru); - k2 = (k2>=0) ? apos+k2 : apos+alen; - - //note: guard k1 (and thus also k2) against overrun nnz, and guard - //contiguous check for k2-1 against underrun of start pos for k1==k2. - if( k1=0) ? rlix : alen; - - for( int k=rlix; k threshold - for( int i=rl; i n / 128; - - //perform vector-matrix multiply w/ dense or sparse output - if( ldense ) { //init dense tmp row - tmp = (tmp == null) ? new double[n] : tmp; - Arrays.fill(tmp, 0); - } - for( int k=apos; k 0 ) { - c.allocate(i, lnnz); //allocate once - double[] bvals = m2.getDenseBlock().values(aix); - for( int j=0, bix=m2.getDenseBlock().pos(aix); j 0 ) - if(c.get(i) instanceof SparseRowScalar){ - SparseRowScalar sv = (SparseRowScalar) c.get(i); - c.set(i, new SparseRowScalar(sv.getIndex(), sv.getValue() * avals[apos]), false); - } - else - vectMultiplyInPlace(avals[apos], c.values(i), c.pos(i), c.size(i)); - - } - else { //GENERAL CASE - for( int k=apos; k=0) ? apos+rlix : apos+alen; - int len = apos + alen; - for(int i = rlix; i < len && aix[i] < ru; i++) - vectMultiplyAdd(avals[i], avals, c.values(aix[i]), aix, i, c.pos(aix[i]), len - i); - } - } - } - else - { - for( int r=0; r=0) ? apos+rlix : apos+alen; - for(int i = rlix; i < apos+alen && aix[i]=0) ? apos+rlix : apos+alen; - for(int i = rlix; i < apos+alen && aix[i]=0) ? apos+rlix : apos+alen; - for(int i = rlix; i < apos+alen && aix[i] 0 ) { //selected row - int bpos = (pos-1) % blen; - int blk = (pos-1) / blen; - //allocate and switch to second output block - //(never happens in cp, correct for multi-threaded usage) - if( lastblk!=-1 && lastblk 0 ) { //selected row - double[] bvals = b.values(i); - int bix = b.pos(i); - int bpos = (pos-1) % blen; - int blk = (pos-1) / blen; - //allocate and switch to second output block - //(never happens in cp, correct for multi-threaded usage) - if( lastblk!=-1 && lastblk 0 ) { //selected row - int bpos = (pos-1) % blen; - int blk = (pos-1) / blen; - //allocate and switch to second output block - //(never happens in cp, correct for multi-threaded usage) - if( lastblk!=-1 && lastblk sum(X^2)-sum(2*X*(U%*%t(V))))+sum((t(U)%*%U)*(t(V)%*%V)), where each - //parallel task computes sum(X^2)-sum(2*X*(U%*%t(V)))) and the last term - //sum((t(U)%*%U)*(t(V)%*%V)) is computed once via two tsmm operations. - - final int blocksizeIJ = (int) (8L*mX.rlen*mX.clen/mX.nonZeros); - int[] curk = new int[blocksizeIJ]; - - for( int bi=rl; bi sum(X^2)-sum(2*X*(U%*%t(V))))+sum((t(U)%*%U)*(t(V)%*%V)), where each - //parallel task computes sum(X^2)-sum(2*X*(U%*%t(V)))) and the last term - //sum((t(U)%*%U)*(t(V)%*%V)) is computed once via two tsmm operations. - - if( mX.sparse ) { //SPARSE - SparseBlock x = mX.sparseBlock; - for( int i=rl; i=0) ? k : mW.clen; - } - //prepare alignment info if necessary - if( four && !scalar ) - for( int i=bi; i=0) ? wpos+k : wpos+wlen; - for( ; k 1) { //X%*%t(X) SPARSE MATRIX - //directly via LibMatrixReorg in order to prevent sparsity change - MatrixBlock tmpBlock = new MatrixBlock(clen, rlen, m1.sparse); - LibMatrixReorg.reorg(m1, tmpBlock, new ReorgOperator(SwapIndex.getSwapIndexFnObject())); - ret = tmpBlock; - } - else if( leftTranspose && m1.sparse && m1.sparseBlock instanceof SparseBlockCSR ) { - //for a special case of CSR inputs where all non-empty rows are dense, we can - //create a shallow copy of the values arrays to a "dense" block and perform - //tsmm with the existing dense block operations w/o unnecessary gather/scatter - SparseBlockCSR sblock = (SparseBlockCSR)m1.sparseBlock; - boolean convertDense = (par ? - IntStream.range(0, rlen).parallel() : IntStream.range(0, rlen)) - .allMatch(i -> sblock.isEmpty(i) || sblock.size(i)==clen ); - if( convertDense ) { - int rows = (int) sblock.size() / clen; - MatrixBlock tmpBlock = new MatrixBlock(rows, clen, false); - tmpBlock.denseBlock = DenseBlockFactory - .createDenseBlock(sblock.values(), rows, clen); - tmpBlock.setNonZeros(m1.nonZeros); - ret = tmpBlock; - } - } - - return ret; - } - - private static boolean checkPrepMatrixMultRightInput( MatrixBlock m1, MatrixBlock m2 ) { - //transpose if dense-dense, skinny rhs matrix (not vector), and memory guarded by output - return (LOW_LEVEL_OPTIMIZATION && !m1.sparse && !m2.sparse - && isSkinnyRightHandSide(m1.rlen, m1.clen, m2.rlen, m2.clen, true)); - } - - //note: public for use by codegen for consistency - public static boolean isSkinnyRightHandSide(long m1rlen, long m1clen, long m2rlen, long m2clen, boolean inclCacheSize) { - return m1rlen > m2clen && m2rlen > m2clen && m2clen > 1 - && m2clen < 64 && (!inclCacheSize || 8*m2rlen*m2clen < L2_CACHESIZE); - } - - private static boolean checkParMatrixMultRightInputRows( MatrixBlock m1, MatrixBlock m2, int k ) { - //parallelize over rows in rhs matrix if number of rows in lhs/output is very small - double jvmMem = InfrastructureAnalyzer.getLocalMaxMemory(); - return (m1.rlen==1 && LOW_LEVEL_OPTIMIZATION && m2.clen>1 && !(m1.isUltraSparse()||m2.isUltraSparse())) - || (m1.rlen<=16 && LOW_LEVEL_OPTIMIZATION && m2.clen>1 && m2.rlen > m1.rlen - && ( !m1.isUltraSparse() && !(m1.sparse & m2.sparse) ) //dense-dense / sparse-dense / dense-sparse - && (long)k * 8 * m1.rlen * m2.clen < Math.max(MEM_OVERHEAD_THRESHOLD,0.01*jvmMem) ); - } - - private static boolean checkParMatrixMultRightInputCols( MatrixBlock m1, MatrixBlock m2, int k, boolean pm2r ) { - //parallelize over cols in rhs matrix if dense, number of cols in rhs is large, and lhs fits in l2 - return (LOW_LEVEL_OPTIMIZATION && !m1.sparse && !m2.sparse - && m2.clen > k * 1024 && m1.rlen < k * 32 && !pm2r - && 8*m1.rlen*m1.clen < 256*1024 ); //lhs fits in L2 cache - } - - public static boolean satisfiesMultiThreadingConstraints(MatrixBlock m1, int k) { - return satisfiesMultiThreadingConstraints(m1, true, false, -1, k); - } - - public static boolean satisfiesMultiThreadingConstraints(MatrixBlock m1, boolean checkMem, boolean checkFLOPs, long FPfactor, int k) { - boolean sharedTP = (InfrastructureAnalyzer.getLocalParallelism() == k); - double jvmMem = InfrastructureAnalyzer.getLocalMaxMemory(); - return k > 1 && LOW_LEVEL_OPTIMIZATION - && (!checkMem || 8L * m1.clen * k < Math.max(MEM_OVERHEAD_THRESHOLD,0.01*jvmMem)) - && (!checkFLOPs || FPfactor * m1.rlen * m1.clen > - (sharedTP ? PAR_MINFLOP_THRESHOLD2 : PAR_MINFLOP_THRESHOLD1)); - } - - public static boolean satisfiesMultiThreadingConstraints(MatrixBlock m1, MatrixBlock m2, boolean checkMem, boolean checkFLOPs, long FPfactor, int k) { - boolean sharedTP = (InfrastructureAnalyzer.getLocalParallelism() == k); - double jvmMem = InfrastructureAnalyzer.getLocalMaxMemory(); - return k > 1 && LOW_LEVEL_OPTIMIZATION - && (!checkMem || 8L * m2.clen * k < Math.max(MEM_OVERHEAD_THRESHOLD,0.01*jvmMem)) - //note: cast to double to avoid long overflows on ultra-sparse matrices - //due to FLOP computation based on number of cells not non-zeros - && (!checkFLOPs || (double)FPfactor * m1.rlen * m1.clen * m2.clen > - (sharedTP ? PAR_MINFLOP_THRESHOLD2 : PAR_MINFLOP_THRESHOLD1)); - } - - private static boolean satisfiesMultiThreadingConstraintsTSMM(MatrixBlock m1, boolean leftTranspose, long FPfactor, int k) { - boolean sharedTP = (InfrastructureAnalyzer.getLocalParallelism() == k); - double threshold = sharedTP ? PAR_MINFLOP_THRESHOLD2 : PAR_MINFLOP_THRESHOLD1; - return k > 1 && LOW_LEVEL_OPTIMIZATION && (leftTranspose?m1.clen:m1.rlen)!=1 - && ((leftTranspose && FPfactor * m1.rlen * m1.clen * m1.clen > threshold) - ||(!leftTranspose && FPfactor * m1.clen * m1.rlen * m1.rlen > threshold)); - } - - public static boolean isUltraSparseMatrixMult(MatrixBlock m1, MatrixBlock m2, boolean m1Perm) { - if( m2.clen == 1 ) //mv always dense - return false; - //note: ultra-sparse matrix mult implies also sparse outputs, hence we need - //to be conservative an cannot use this for all ultra-sparse matrices. - double outSp = OptimizerUtils.getMatMultSparsity( - m1.getSparsity(), m2.getSparsity(), m1.rlen, m1.clen, m2.clen, true); - return (m1.isUltraSparse() || m2.isUltraSparse()) //base case - || (m1.isUltraSparse(false) && m1 == m2) //ultra-sparse self product - || (m1Perm && OptimizerUtils.getSparsity(m2.rlen, m2.clen, m2.nonZeros)<1.0) - || ((m1.isUltraSparse(false) || m2.isUltraSparse(false)) - && outSp < MatrixBlock.ULTRA_SPARSITY_TURN_POINT2) - || (m1.isInSparseFormat() // otherwise no matching branch - && m1.getSparsity() < MatrixBlock.ULTRA_SPARSITY_TURN_POINT2 - && m1.getNonZeros() < MatrixBlock.ULTRA_SPARSE_BLOCK_NNZ - && m1.getLength()+m2.getLength() < (long)m1.rlen*m2.clen - && outSp < MatrixBlock.SPARSITY_TURN_POINT); - } - - public static boolean isSparseOutputMatrixMult(MatrixBlock m1, MatrixBlock m2) { - //output is a matrix (not vector), very likely sparse, and output rows fit into L1 cache - if( !(m1.sparse && m2.sparse && m1.rlen > 1 && m2.clen > 1) ) - return false; - double estSp = OptimizerUtils.getMatMultSparsity( - m1.getSparsity(), m2.getSparsity(), m1.rlen, m1.clen, m2.clen, false); - long estNnz = (long)(estSp * m1.rlen * m2.clen); - boolean sparseOut = MatrixBlock.evalSparseFormatInMemory(m1.rlen, m2.clen, estNnz); - return m2.clen < 4*1024 && sparseOut; - } - - public static boolean isOuterProductTSMM(int rlen, int clen, boolean left) { - return left ? rlen == 1 & clen > 1 : rlen > 1 & clen == 1; - } - - private static MatrixBlock prepMatrixMultRightInput( MatrixBlock m1, MatrixBlock m2, boolean tm2 ) { - MatrixBlock ret = m2; - - //transpose if dense-dense, skinny rhs matrix (not vector), and memory guarded by output - if( tm2 ) { - MatrixBlock tmpBlock = new MatrixBlock(m2.clen, m2.rlen, m2.sparse); - ret = LibMatrixReorg.reorg(m2, tmpBlock, new ReorgOperator(SwapIndex.getSwapIndexFnObject())); - } - - return ret; - } - - //cp non-zeros for dense-dense mm - private static int copyNonZeroElements( double[] a, final int aixi, final int bixk, final int n, double[] tmpa, int[] tmpbi, final int bklen ) { - int knnz = 0; - for( int k = 0; k < bklen; k++ ) - if( a[ aixi+k ] != 0 ) { - tmpa[ knnz ] = a[ aixi+k ]; - tmpbi[ knnz ] = bixk + k*n; - knnz ++; - } - return knnz; - } - - //cp non-zeros for dense tsmm - private static int copyNonZeroElements( double[] a, int aixi, int bixk, final int n, final int nx, double[] tmpa, int[] tmpbi, final int bklen ) { - int knnz = 0; - for( int k = 0; k < bklen; k++, aixi+=n, bixk+=nx ) - if( a[ aixi ] != 0 ) { - tmpa[ knnz ] = a[ aixi ]; - tmpbi[ knnz ] = bixk; - knnz ++; - } - return knnz; - } - - @SuppressWarnings("unused") - private static void compactSparseOutput(MatrixBlock ret) { - if( !ret.sparse || ret.nonZeros > ret.rlen || ret.isEmpty() - || ret.getSparseBlock() instanceof SparseBlockCSR ) - return; //early abort - ret.sparseBlock = SparseBlockFactory - .copySparseBlock(Type.CSR, ret.sparseBlock, false); - } - - @SuppressWarnings("unused") - private static void resetPosVect(int[] curk, SparseBlock sblock, int rl, int ru) { - if( sblock instanceof SparseBlockMCSR ) { - //all rows start at position 0 (individual arrays) - Arrays.fill(curk, 0, ru-rl, 0); - } - else if( sblock instanceof SparseBlockCSR ) { - //row start positions given in row ptr array - SparseBlockCSR csr = (SparseBlockCSR) sblock; - System.arraycopy(csr.rowPointers(), rl, curk, 0, ru-rl); - } - else { //general case - for(int i=rl; i> tasks, MatrixBlock ret) - throws InterruptedException, ExecutionException - { - //aggregate partial results and check for errors - double val = 0; - for(Future task : tasks) - val += task.get(); - ret.quickSetValue(0, 0, val); - } - - @SuppressWarnings("unused") - private static void sumDenseResults( double[][] partret, double[] ret ) - { - final int len = ret.length; - final int k = partret.length; - final int bk = k % 4; - final int blocksize = 2 * 1024; //16KB (half of common L1 data) - - //cache-conscious aggregation to prevent repreated scans/writes of ret - for( int bi=0; bi - { - private final MatrixBlock _m1; - private final MatrixBlock _m2; - private MatrixBlock _ret = null; - private final boolean _tm2; //transposed m2 - private final boolean _pm2r; //par over m2 rows - private final boolean _pm2c; //par over m2 rows - private final boolean _m1Perm; //sparse permutation - private final boolean _sparse; //sparse output - private final int _rl; - private final int _ru; - - protected MatrixMultTask( MatrixBlock m1, MatrixBlock m2, MatrixBlock ret, - boolean tm2, boolean pm2r, boolean pm2c, boolean m1Perm, boolean sparse, int rl, int ru ) - { - _m1 = m1; - _m2 = m2; - _tm2 = tm2; - _pm2r = pm2r; - _pm2c = pm2c; - _m1Perm = m1Perm; - _sparse = sparse; - _rl = rl; - _ru = ru; - - if( pm2r ) { //vector-matrix / matrix-matrix - //allocate local result for partial aggregation - _ret = new MatrixBlock(ret.rlen, ret.clen, false); - } - else { //default case - _ret = ret; - } - } - - @Override - public Object call() { - //setup target index ranges - int rl = _pm2c ? 0 : _rl; - int ru = _pm2c ? _m1.rlen : _ru; - int cl = _pm2c ? _rl : 0; - int cu = _pm2c ? _ru : _ret.clen; - - //thread-local allocation - if( _pm2r ) - _ret.allocateDenseBlock(); - - //compute block matrix multiplication - if( _ret.sparse ) //ultra-sparse - matrixMultUltraSparse(_m1, _m2, _ret, _m1Perm, rl, ru); - else if(!_m1.sparse && !_m2.sparse) - matrixMultDenseDense(_m1, _m2, _ret, _tm2, _pm2r, rl, ru, cl, cu); - else if(_m1.sparse && _m2.sparse) - matrixMultSparseSparse(_m1, _m2, _ret, _pm2r, _sparse, rl, ru); - else if(_m1.sparse) - matrixMultSparseDense(_m1, _m2, _ret, _pm2r, rl, ru); - else - matrixMultDenseSparse(_m1, _m2, _ret, _pm2r, rl, ru); - - //maintain block nnz (upper bounds inclusive) - if( !_pm2r ) - return _ret.recomputeNonZeros(rl, ru-1, cl, cu-1); - else - return _ret.getDenseBlockValues(); - } - } - - private static class MatrixMultChainTask implements Callable - { - private MatrixBlock _m1 = null; - private MatrixBlock _m2 = null; - private MatrixBlock _m3 = null; - private ChainType _ct = null; - private int _rl = -1; - private int _ru = -1; - - protected MatrixMultChainTask( MatrixBlock mX, MatrixBlock mV, MatrixBlock mW, ChainType ct, int rl, int ru ) { - _m1 = mX; - _m2 = mV; - _m3 = mW; - _ct = ct; - _rl = rl; - _ru = ru; - } - - @Override - public double[] call() { - //thread-local allocation for partial aggregation - MatrixBlock ret = new MatrixBlock(1, _m1.clen, false); - ret.allocateDenseBlock(); - - if( _m1.sparse ) - matrixMultChainSparse(_m1, _m2, _m3, ret, _ct, _rl, _ru); - else - matrixMultChainDense(_m1, _m2, _m3, ret, _ct, _rl, _ru); - - //NOTE: we dont do global aggregation from concurrent tasks in order - //to prevent synchronization (sequential aggregation led to better - //performance after JIT) - return ret.getDenseBlockValues(); - } - } - - private static class MatrixMultTransposeTask implements Callable - { - private final MatrixBlock _m1; - private final MatrixBlock _ret; - private final boolean _left; - private final int _rl; - private final int _ru; - - protected MatrixMultTransposeTask( MatrixBlock m1, MatrixBlock ret, boolean left, int rl, int ru ) - { - _m1 = m1; - _ret = ret; - _left = left; - _rl = rl; - _ru = ru; - } - - @Override - public Object call() { - if( _m1.sparse ) - matrixMultTransposeSelfSparse(_m1, _ret, _left, _rl, _ru); - else - matrixMultTransposeSelfDense(_m1, _ret, _left, _rl, _ru); - return null; - } - } - - private static class MatrixMultPermuteTask implements Callable - { - private MatrixBlock _pm1 = null; - private MatrixBlock _m2 = null; - private MatrixBlock _ret1 = null; - private MatrixBlock _ret2 = null; - private int _rl = -1; - private int _ru = -1; - - protected MatrixMultPermuteTask( MatrixBlock pm1, MatrixBlock m2, MatrixBlock ret1, MatrixBlock ret2, int rl, int ru) - { - _pm1 = pm1; - _m2 = m2; - _ret1 = ret1; - _ret2 = ret2; - _rl = rl; - _ru = ru; - } - - @Override - public Object call() { - if( _m2.sparse ) - matrixMultPermuteSparse(_pm1, _m2, _ret1, _ret2, _rl, _ru); - else if( _ret1.sparse ) - matrixMultPermuteDenseSparse(_pm1, _m2, _ret1, _ret2, _rl, _ru); - else - matrixMultPermuteDense(_pm1, _m2, _ret1, _ret2, _rl, _ru); - - return null; - } - } - - private static class MatrixMultWSLossTask implements Callable - { - private MatrixBlock _mX = null; - private MatrixBlock _mU = null; - private MatrixBlock _mV = null; - private MatrixBlock _mW = null; - private MatrixBlock _ret = null; - private WeightsType _wt = null; - private int _rl = -1; - private int _ru = -1; - - protected MatrixMultWSLossTask(MatrixBlock mX, MatrixBlock mU, MatrixBlock mV, MatrixBlock mW, WeightsType wt, int rl, int ru) { - _mX = mX; - _mU = mU; - _mV = mV; - _mW = mW; - _wt = wt; - _rl = rl; - _ru = ru; - - //allocate local result for partial aggregation - _ret = new MatrixBlock(1, 1, false); - _ret.allocateDenseBlock(); - } - - @Override - public Double call() { - if( !_mX.sparse && !_mU.sparse && !_mV.sparse && (_mW==null || !_mW.sparse) - && !_mX.isEmptyBlock() && !_mU.isEmptyBlock() && !_mV.isEmptyBlock() - && (_mW==null || !_mW.isEmptyBlock())) - matrixMultWSLossDense(_mX, _mU, _mV, _mW, _ret, _wt, _rl, _ru); - else if( _mX.sparse && !_mU.sparse && !_mV.sparse && (_mW==null || _mW.sparse) - && !_mX.isEmptyBlock() && !_mU.isEmptyBlock() && !_mV.isEmptyBlock() - && (_mW==null || !_mW.isEmptyBlock())) - matrixMultWSLossSparseDense(_mX, _mU, _mV, _mW, _ret, _wt, _rl, _ru); - else - matrixMultWSLossGeneric(_mX, _mU, _mV, _mW, _ret, _wt, _rl, _ru); - - return _ret.quickGetValue(0, 0); - } - } - - private static class MatrixMultWSigmoidTask implements Callable - { - private MatrixBlock _mW = null; - private MatrixBlock _mU = null; - private MatrixBlock _mV = null; - private MatrixBlock _ret = null; - private WSigmoidType _wt = null; - private int _rl = -1; - private int _ru = -1; - - protected MatrixMultWSigmoidTask(MatrixBlock mW, MatrixBlock mU, MatrixBlock mV, MatrixBlock ret, WSigmoidType wt, int rl, int ru) { - _mW = mW; - _mU = mU; - _mV = mV; - _ret = ret; - _wt = wt; - _rl = rl; - _ru = ru; - } - - @Override - public Long call() { - //core weighted square sum mm computation - if( !_mW.sparse && !_mU.sparse && !_mV.sparse && !_mU.isEmptyBlock() && !_mV.isEmptyBlock() ) - matrixMultWSigmoidDense(_mW, _mU, _mV, _ret, _wt, _rl, _ru); - else if( _mW.sparse && !_mU.sparse && !_mV.sparse && !_mU.isEmptyBlock() && !_mV.isEmptyBlock()) - matrixMultWSigmoidSparseDense(_mW, _mU, _mV, _ret, _wt, _rl, _ru); - else - matrixMultWSigmoidGeneric(_mW, _mU, _mV, _ret, _wt, _rl, _ru); - - //maintain block nnz (upper bounds inclusive) - return _ret.recomputeNonZeros(_rl, _ru-1, 0, _ret.getNumColumns()-1); - } - } - - private static class MatrixMultWDivTask implements Callable - { - private MatrixBlock _mW = null; - private MatrixBlock _mU = null; - private MatrixBlock _mV = null; - private MatrixBlock _mX = null; - private MatrixBlock _ret = null; - private WDivMMType _wt = null; - private int _rl = -1; - private int _ru = -1; - private int _cl = -1; - private int _cu = -1; - - protected MatrixMultWDivTask(MatrixBlock mW, MatrixBlock mU, MatrixBlock mV, MatrixBlock mX, MatrixBlock ret, WDivMMType wt, int rl, int ru, int cl, int cu) { - _mW = mW; - _mU = mU; - _mV = mV; - _mX = mX; - _wt = wt; - _rl = rl; - _ru = ru; - _cl = cl; - _cu = cu; - _ret = ret; - } - - @Override - public Long call() { - //core weighted div mm computation - boolean scalarX = _wt.hasScalar(); - if( !_mW.sparse && !_mU.sparse && !_mV.sparse && (_mX==null || !_mX.sparse || scalarX) && !_mU.isEmptyBlock() && !_mV.isEmptyBlock() ) - matrixMultWDivMMDense(_mW, _mU, _mV, _mX, _ret, _wt, _rl, _ru, _cl, _cu); - else if( _mW.sparse && !_mU.sparse && !_mV.sparse && (_mX==null || _mX.sparse || scalarX) && !_mU.isEmptyBlock() && !_mV.isEmptyBlock()) - matrixMultWDivMMSparseDense(_mW, _mU, _mV, _mX, _ret, _wt, _rl, _ru, _cl, _cu); - else - matrixMultWDivMMGeneric(_mW, _mU, _mV, _mX, _ret, _wt, _rl, _ru, _cl, _cu); - - //maintain partial nnz for right (upper bounds inclusive) - int rl = _wt.isLeft() ? _cl : _rl; - int ru = _wt.isLeft() ? _cu : _ru; - return _ret.recomputeNonZeros(rl, ru-1, 0, _ret.getNumColumns()-1); - } - } - - private static class MatrixMultWCeTask implements Callable - { - private MatrixBlock _mW = null; - private MatrixBlock _mU = null; - private MatrixBlock _mV = null; - private double _eps = 0.0; - private MatrixBlock _ret = null; - private WCeMMType _wt = null; - private int _rl = -1; - private int _ru = -1; - - protected MatrixMultWCeTask(MatrixBlock mW, MatrixBlock mU, MatrixBlock mV, double eps, WCeMMType wt, int rl, int ru) { - _mW = mW; - _mU = mU; - _mV = mV; - _eps = eps; - _wt = wt; - _rl = rl; - _ru = ru; - - //allocate local result for partial aggregation - _ret = new MatrixBlock(1, 1, false); - _ret.allocateDenseBlock(); - } - - @Override - public Double call() { - //core weighted cross entropy mm computation - if( !_mW.sparse && !_mU.sparse && !_mV.sparse && !_mU.isEmptyBlock() && !_mV.isEmptyBlock() ) - matrixMultWCeMMDense(_mW, _mU, _mV, _eps, _ret, _wt, _rl, _ru); - else if( _mW.sparse && !_mU.sparse && !_mV.sparse && !_mU.isEmptyBlock() && !_mV.isEmptyBlock()) - matrixMultWCeMMSparseDense(_mW, _mU, _mV, _eps, _ret, _wt, _rl, _ru); - else - matrixMultWCeMMGeneric(_mW, _mU, _mV, _eps, _ret, _wt, _rl, _ru); - - - return _ret.quickGetValue(0, 0); - } - } - - private static class MatrixMultWuTask implements Callable - { - private MatrixBlock _mW = null; - private MatrixBlock _mU = null; - private MatrixBlock _mV = null; - private MatrixBlock _ret = null; - private WUMMType _wt = null; - private ValueFunction _fn = null; - private int _rl = -1; - private int _ru = -1; - - protected MatrixMultWuTask(MatrixBlock mW, MatrixBlock mU, MatrixBlock mV, MatrixBlock ret, WUMMType wt, ValueFunction fn, int rl, int ru) { - _mW = mW; - _mU = mU; - _mV = mV; - _ret = ret; - _wt = wt; - _fn = fn; - _rl = rl; - _ru = ru; - } - - @Override - public Long call() { - //core weighted square sum mm computation - if( !_mW.sparse && !_mU.sparse && !_mV.sparse && !_mU.isEmptyBlock() && !_mV.isEmptyBlock() ) - matrixMultWuMMDense(_mW, _mU, _mV, _ret, _wt, _fn, _rl, _ru); - else if( _mW.sparse && !_mU.sparse && !_mV.sparse && !_mU.isEmptyBlock() && !_mV.isEmptyBlock()) - matrixMultWuMMSparseDense(_mW, _mU, _mV, _ret, _wt, _fn, _rl, _ru); - else - matrixMultWuMMGeneric(_mW, _mU, _mV, _ret, _wt, _fn, _rl, _ru); - - //maintain block nnz (upper bounds inclusive) - return _ret.recomputeNonZeros(_rl, _ru-1, 0, _ret.getNumColumns()-1); - } - } -} diff --git a/scripts/staging/SIMD-double-vectors/README.md b/scripts/staging/SIMD-double-vectors/README.md deleted file mode 100644 index 55812002d0c..00000000000 --- a/scripts/staging/SIMD-double-vectors/README.md +++ /dev/null @@ -1,45 +0,0 @@ - - - -# SIMD DoubleVectors for matrix multiplication - -`DoubleVector` is still in incubator stage, but promises performance improvements for many SystemDS components. -This patch explored potential speedup for matrix multiplication of two dense matrices. Additionally, dot product -is also implemented with `DoubleVector` for the case where common dimension is `1`. - -Initial experiments showed varying results, usually the vectorized implementation performs somewhere between -`MKL` and our reference. There are also cases where we are slower than the reference, or faster than `MKL`. -For detailed discussion (and plots) see PR #1643. - -## Further Work - -This patch focused only on dense matrix multiplication, increasing sparsity would complicate things. -The sparsity aware copying (see `LibMatrixMult.java:1170`) and general loop structure is kept as it is, as a lot of -experimentation went into a very efficient implementation. Note that the usage of `DoubleVector` might change -a lot of things about this and revisiting this (and using SIMD for sparsity aware copying) will be a necessary step. - -## Changes - -Due to the dependency of at least JDK17, there are changes to `pom.xml`, run script `systemds` and, of course, `LibMatrixMult.java`. - -## Note - -The pom file repeatedly gets flagged for old version of various libraries, therefore we renamed the file to avoid this inconvenience. -and we introduced a string in the beginning of the file making it not compile as a pom file. diff --git a/scripts/staging/SIMD-double-vectors/pom.xml.tmp b/scripts/staging/SIMD-double-vectors/pom.xml.tmp deleted file mode 100644 index fbb940668b7..00000000000 --- a/scripts/staging/SIMD-double-vectors/pom.xml.tmp +++ /dev/null @@ -1,1336 +0,0 @@ -THIS IS NO LONGER A POM.XML FILE, GITHUB PLEASE DO NOT GIVE US UPDATE REMINDERS OF THIS FILE. - - - - - 4.0.0 - - org.apache - apache - 24 - - org.apache.systemds - 3.1.0-SNAPSHOT - systemds - jar - Apache SystemDS - https://github.com/apache/systemds - An open source ML system for the end-to-end data science lifecycle - - - Apache 2.0 License - http://www.apache.org/licenses/LICENSE-2.0.html - - - - - 3.3.3 - 4.8 - 3.20.3 - 3.2.0 - 2.12.0 - 2.12 - yyyy-MM-dd HH:mm:ss z - 1 - false - provided - 10.2.0 - 1.7.36 - 2.17.2 - - 17 - Testing settings - true - classes - 2 - 1C - 2 - true - ** - false - -Xms3000m -Xmx3000m -Xmn300m - false - - - - - central - https://repo1.maven.org/maven2 - - true - - - - - - scm:git:https://github.com/apache/systemds.git - HEAD - - - - - - - scripts - - algorithms/obsolete/* - datagen/obsolete/* - perftest/**/* - perftest - perftestDeprecated/* - perftestDeprecated - staging/**/* - nn/test/compare_backends/* - nn/test/compare_backends/* - - scripts - - - src/main/cuda/kernels - - SystemDS.ptx - reduction.ptx - - cuda/kernels - - - src/main/cpp/lib - lib - - - src/main/cuda/spoof - cuda/spoof - - - src/main/cuda/headers - - agg_ops.cuh - operators.cuh - reduction.cuh - spoof_utils.cuh - TempStorage.cuh - utils.cuh - vector_write.cuh - vector_add.cuh - Matrix.h - - cuda/headers - - - src/main/java/org/apache/sysds/hops/codegen/cplan/java - - Cellwise.java.template - Rowwise.java.template - - java/spoof - - - - - - org.apache.maven.plugins - maven-dependency-plugin - - - unpack - package - - unpack - - - - - org.apache.hadoop - hadoop-test - 1.2.1 - jar - true - ${project.build.directory}/hadoop-test - **/* - - - false - true - - - - compile - - copy-dependencies - - - true - ${project.build.directory}/lib - - - - - - - - org.apache.maven.plugins - maven-shade-plugin - 2.3 - - - package - - shade - - - - - org.apache.wink:wink-json4j:* - org.antlr:antlr4-runtime:* - - - - - org.apache.sysds.api.DMLScript - - - - META-INF/LICENSE - src/assembly/bin/LICENSE - - - META-INF/NOTICE - NOTICE - - - false - - - - - - - - *:* - - META-INF/*.SF - META-INF/*.DSA - META-INF/*.RSA - META-INF/LICENSE - META-INF/NOTICE - - - - - - - - org.apache.maven.plugins - maven-compiler-plugin - 3.8.1 - - ${java.level} - ${java.level} - - --add-modules=jdk.incubator.vector - - - - - - org.apache.maven.plugins - maven-resources-plugin - - - copy-resources - compile - - copy-resources - - - - - ${basedir}/src/test/config/hadoop_bin_windows/bin - false - - *.* - - - - ${basedir}/target/lib/hadoop/bin - - - - - - - org.antlr - antlr4-maven-plugin - - ${basedir}/src/main/java - ${basedir}/src/main/java - - ${antlr.version} - - - antlr - - antlr4 - - - - - - - - org.apache.maven.plugins - maven-surefire-plugin - 3.0.0-M5 - - ${maven.test.skip} - ${test-parallel} - ${test-threadCount} - - ${test-forkCount} - false - brief - true - ${rerun.failing.tests.count} - --add-modules=jdk.incubator.vector - - - - - maven-clean-plugin - - - clean-original-jar - package - - clean - - - true - - - ${project.build.directory} - - original-*.jar - - - - - - - - - - - - - - - org.apache.maven.plugins - maven-antrun-plugin - - - copy - package - - - - - - - run - - - - - - - org.jacoco - jacoco-maven-plugin - 0.8.7 - - - ${jacoco.include} - - - - - default-prepare-agent - - prepare-agent - - - - generate-code-coverage-report - test - - report - - - - - - - org.eluder.coveralls - coveralls-maven-plugin - 4.3.0 - - - - org.apache.maven.plugins - maven-javadoc-plugin - 3.2.0 - - true - - true - - - - - org.codehaus.mojo - properties-maven-plugin - 1.0.0 - - - generate-resources - - write-project-properties - - - ${project.build.testOutputDirectory}/my.properties - - - - - - - - - - windows-x86_64 - - - windows - amd64 - - - - windows - x86_64 - - - - linux-x86_64 - - - unix - amd64 - - - - linux - x86_64 - - - - apple-x86_64 - - - mac - x86_64 - - - - apple - x86_64 - - - - linux-ppc_64 - - - unix - ppc64le - - - - linux - ppc_64 - - - - - eclipse-only - - - m2e.version - - - - - - - - org.eclipse.m2e - lifecycle-mapping - 1.0.0 - - - - - - org.apache.maven.plugins - maven-remote-resources-plugin - [1.4,) - - process - - - - - - - - - org.apache.maven.plugins - maven-clean-plugin - [3.0.0,) - - clean - - - - - - - - - org.apache.maven.plugins - maven-dependency-plugin - [2.10,) - - copy-dependencies - - - - - - - - - - - - - - - - - rat - - clean org.apache.rat:apache-rat-plugin:check - - - org.apache.rat - apache-rat-plugin - 0.12 - - - package - - check - - - - - - scripts/perftest/results/** - scripts/perftest/temp/** - .gitignore - src/main/python/.gitignore - .gitmodules - .repository/ - .idea/ - .git - .settings - .classpath - .project - CITATION - src/main/python/docs/build/**/* - src/main/python/docs/source/_build/** - src/main/python/generator/resources/** - docs/api/**/* - docs/_site/**/* - docs/site/run_issues.md - docs/.jekyll-cache/**/* - docs/css/bootstrap.min.css - docs/css/pygments-default.css - docs/js/vendor/**/* - **/*.lock - **/*.csv - **/*.ijv - **/*.json - **/*.libsvm - **/*.mtx - **/*.mtd - **/*.out - **/__pycache__/** - **/part-* - **/*.keep - **/target/** - **/README.md - **/*.svg - - **/*.ipynb - - src/main/java/*.tokens - **/*.interp - - src/main/java/org/apache/sysds/protobuf/*.java - - src/main/cuda/kernels/SystemDS.ptx - src/main/cuda/kernels/reduction.ptx - - src/test/scripts/functions/jmlc/**/*.impute - src/test/scripts/functions/jmlc/**/*.map - src/test/scripts/functions/jmlc/**/*.mode - src/test/scripts/functions/jmlc/**/*.ndistinct - src/test/scripts/functions/jmlc/**/*.node - src/test/scripts/functions/jmlc/tfmtd_example/Bin/saleprice.bin - src/test/scripts/functions/jmlc/tfmtd_example/Bin/sqft.bin - src/test/scripts/functions/jmlc/tfmtd_example/column.names - src/test/scripts/functions/jmlc/tfmtd_example/dummycoded.column.names - src/test/scripts/functions/jmlc/tfmtd_example2/column.names - src/test/scripts/functions/jmlc/tfmtd_frame_example/tfmtd_frame - - src/test/scripts/functions/io/csv/in/*/* - - src/main/python/tests/lt*.txt - - scripts/perftest/python/requirements.txt - - src/main/cuda/ext/** - src/main/cuda/.idea/ - - - - - - - - - proton - - - - - com.github.os72 - protoc-jar-maven-plugin - 3.11.4 - - - generate-sources - - run - - - - ${protobuf.version} - - src/main/resources/protobuf - - src/main/java - - - - - - - - - - - distribution - - - - maven-assembly-plugin - - posix - - - - create-source-distribution - package - - single - - - - src/assembly/source.xml - - - - - create-extra-jar - package - - single - - - - src/assembly/extra.xml - - - - ${maven.build.timestamp} - ${project.artifactId}-extra - ${project.version} - - - - - - create-binary-distribution - package - - single - - - - src/assembly/bin.xml - - - - - - - - maven-gpg-plugin - 3.0.1 - - - verify - - sign - - - - - --pinentry-mode - loopback - - - - - - - - org.apache.maven.plugins - maven-remote-resources-plugin - 1.4 - - - - process - - - - - org.apache:apache-jar-resource-bundle:1.4 - - - - - - - - org.apache.maven.plugins - maven-javadoc-plugin - 3.2.0 - - - *.protobuf - true - true - true - false - public - ${java.level} - - - - attach-javadocs - - jar - - - - - - - - - - skip-sign - - - - org.apache.maven.plugins - maven-gpg-plugin - - true - - - - - - - - - - org.jcuda - jcuda - ${jcuda.version} - ${jcuda.scope} - - - - org.jcuda - jcuda-natives - - - - - - org.jcuda - jcublas - ${jcuda.version} - ${jcuda.scope} - - - - org.jcuda - jcublas-natives - - - - - - org.jcuda - jcusparse - ${jcuda.version} - ${jcuda.scope} - - - - org.jcuda - jcusparse-natives - - - - - - org.jcuda - jcusolver - ${jcuda.version} - ${jcuda.scope} - - - - org.jcuda - jcusolver-natives - - - - - - org.jcuda - jcudnn - ${jcuda.version} - ${jcuda.scope} - - - - org.jcuda - jcudnn-natives - - - - - - - org.jcuda - jcuda-natives - windows-x86_64 - ${jcuda.version} - ${jcuda.scope} - - - - org.jcuda - jcublas-natives - windows-x86_64 - ${jcuda.version} - ${jcuda.scope} - - - - org.jcuda - jcusparse-natives - windows-x86_64 - ${jcuda.version} - ${jcuda.scope} - - - - org.jcuda - jcusolver-natives - windows-x86_64 - ${jcuda.version} - ${jcuda.scope} - - - - org.jcuda - jcudnn-natives - windows-x86_64 - ${jcuda.version} - ${jcuda.scope} - - - - org.jcuda - jcuda-natives - linux-x86_64 - ${jcuda.version} - ${jcuda.scope} - - - - org.jcuda - jcublas-natives - linux-x86_64 - ${jcuda.version} - ${jcuda.scope} - - - - org.jcuda - jcusparse-natives - linux-x86_64 - ${jcuda.version} - ${jcuda.scope} - - - - org.jcuda - jcusolver-natives - linux-x86_64 - ${jcuda.version} - ${jcuda.scope} - - - - org.jcuda - jcudnn-natives - linux-x86_64 - ${jcuda.version} - ${jcuda.scope} - - - - org.jcuda - jcuda-natives - apple-x86_64 - ${jcuda.version} - ${jcuda.scope} - - - - org.jcuda - jcublas-natives - apple-x86_64 - ${jcuda.version} - ${jcuda.scope} - - - - org.jcuda - jcusparse-natives - apple-x86_64 - ${jcuda.version} - ${jcuda.scope} - - - - org.jcuda - jcusolver-natives - apple-x86_64 - ${jcuda.version} - ${jcuda.scope} - - - - org.jcuda - jcudnn-natives - apple-x86_64 - ${jcuda.version} - ${jcuda.scope} - - - - org.apache.spark - spark-core_${scala.binary.version} - ${spark.version} - - - log4j - log4j - - - org.slf4j - slf4j-log4j12 - - - org.slf4j - slf4j-reload4j - - - org.slf4j - slf4j-api - - - org.slf4j - jul-to-slf4j - - - org.slf4j - jcl-over-slf4j - - - org.apache.hadoop - hadoop-client-api - - - org.apache.hadoop - hadoop-client-runtime - - - org.apache.hadoop - hadoop-client-runtime - - - - - - org.apache.spark - spark-sql_${scala.binary.version} - ${spark.version} - - - log4j - log4j - - - org.slf4j - slf4j-log4j12 - - - org.slf4j - slf4j-reload4j - - - - - - org.apache.spark - spark-mllib_${scala.binary.version} - ${spark.version} - - - log4j - log4j - - - org.slf4j - slf4j-log4j12 - - - org.slf4j - slf4j-reload4j - - - - - - org.apache.hadoop - hadoop-common - ${hadoop.version} - - - javax.servlet - servlet-api - - - org.slf4j - slf4j-api - - - org.slf4j - slf4j-reload4j - - - - - - org.apache.hadoop - hadoop-hdfs - ${hadoop.version} - - - javax.servlet - servlet-api - - - org.slf4j - slf4j-log4j12 - - - org.slf4j - slf4j-reload4j - - - - - - org.apache.hadoop - hadoop-client - ${hadoop.version} - - - log4j - log4j - - - org.slf4j - slf4j-log4j12 - - - org.slf4j - slf4j-reload4j - - - - - - commons-logging - commons-logging - 1.1.3 - - - org.slf4j - slf4j-log4j12 - - - org.slf4j - slf4j-reload4j - - - - - - org.apache.commons - commons-math3 - 3.4.1 - - - - org.apache.wink - wink-json4j - 1.4 - - - - com.fasterxml.jackson.core - jackson-databind - 2.12.6.1 - - - - junit - junit - 4.13.1 - provided - - - - org.openjdk.jol - jol-core - 0.10 - test - - - - - com.github.stephenc.jcip - jcip-annotations - 1.0-1 - test - - - - - org.codehaus.janino - janino - 3.0.16 - provided - - - - org.antlr - antlr4 - ${antlr.version} - provided - - - antlr-runtime - org.antlr - - - - - - org.antlr - antlr4-runtime - ${antlr.version} - - - - org.apache.derby - derby - 10.14.2.0 - provided - - - - io.netty - netty-all - 4.1.68.Final - provided - - - org.apache.logging.log4j - log4j-api - - - org.apache.logging.log4j - log4j-1.2-api - - - - - - net.sf.py4j - py4j - 0.10.9 - - - - - org.apache.maven.plugins - maven-javadoc-plugin - 3.2.0 - - - - - org.apache.maven.plugins - maven-gpg-plugin - 1.6 - - - - - com.google.protobuf - protobuf-java - ${protobuf.version} - - - - com.google.protobuf - protobuf-java-util - ${protobuf.version} - - - - org.apache.maven.plugins - maven-assembly-plugin - 3.3.0 - - - - org.slf4j - slf4j-api - ${slf4j.version} - - - org.slf4j - jul-to-slf4j - ${slf4j.version} - - - org.slf4j - jcl-over-slf4j - ${slf4j.version} - - - org.slf4j - slf4j-reload4j - ${slf4j.version} - - - - org.apache.logging.log4j - log4j-api - ${log4j.version} - - - \ No newline at end of file diff --git a/scripts/staging/SIMD-double-vectors/systemds b/scripts/staging/SIMD-double-vectors/systemds deleted file mode 100755 index 6c17a8e0faf..00000000000 --- a/scripts/staging/SIMD-double-vectors/systemds +++ /dev/null @@ -1,487 +0,0 @@ -#!/usr/bin/env bash -#------------------------------------------------------------- -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -# -#------------------------------------------------------------- - -############################################################## -# This script is part of the SystemDS binary release. It is -# meant to work out of the box when unzipping the -# systemds-.zip (or tbz) file. -# -# Make configuration changes here: -############################################################## - -# If not set by env, set to 1 to run spark-submit instead of local java -# This should be used to run with spark-submit instead of java -if [[ -z "$SYSDS_DISTRIBUTED" ]]; then - SYSDS_DISTRIBUTED=0 -fi - -# if not set by env, set to 1 to disable setup output of this script -if [ -z "$SYSDS_QUIET" ]; then - SYSDS_QUIET=0 -fi - -# if not set by env, set to default exec modes -if [[ -z "$SYSDS_EXEC_MODE" ]]; then - case "$SYSDS_DISTRIBUTED" in - 0) SYSDS_EXEC_MODE=singlenode ;; - *) SYSDS_EXEC_MODE=hybrid ;; - esac -fi - -# an echo toggle -print_out() -{ - if [ $SYSDS_QUIET == 0 ]; then - echo "$1" - fi -} - -if [[ -z $SYSTEMDS_ROOT ]] ; then - SYSTEMDS_ROOT=. - print_out "SYSTEMDS_ROOT not set defaulting to current dir $(pwd)" -else - # construct a relative path - SYSTEMDS_ROOT=$(realpath --relative-to=. ${SYSTEMDS_ROOT}) -fi; - -# when using find, look in the directories in this order -DIR_SEARCH_ORDER=". $SYSTEMDS_ROOT $SYSTEMDS_ROOT/conf $SYSTEMDS_ROOT/lib $SYSTEMDS_ROOT/src $SYSTEMDS_ROOT/target" -ordered_find() { - result="" - for dir in $(echo "$DIR_SEARCH_ORDER" | tr ' ' '\n') ; do - if [[ $dir == "$SYSTEMDS_ROOT" ]] || [[ $dir == "." ]]; then - result=$(find "$dir" -maxdepth 1 -iname "$1" -print -quit) - if [[ $result != "" ]]; then break; fi - else - result=$(find "$dir" -iname "$1" -print -quit 2> /dev/null) - if [[ $result != "" ]]; then break; fi - fi - done - echo "$result" -} - -if [ -n "$SYSTEMDS_STANDALONE_OPTS" ]; then - print_out "Overriding SYSTEMDS_STANDALONE_OPTS with env var: $SYSTEMDS_STANDALONE_OPTS" -else - # specify parameters to java when running locally here - SYSTEMDS_STANDALONE_OPTS="-Xmx4g -Xms4g -Xmn400m " -fi - -if [ -n "$SYSTEMDS_REMOTE_DEBUGGING" ]; then - print_out "Overriding SYSTEMDS_REMOTE_DEBUGGING with env var: $SYSTEMDS_REMOTE_DEBUGGING" -else - SYSTEMDS_REMOTE_DEBUGGING=" -agentlib:jdwp=transport=dt_socket,suspend=y,address=8787,server=y " -fi - -# check if log4j config file exists, otherwise unset -# to run with a non fatal complaint by SystemDS -if [ -z "$LOG4JPROP" ] ; then - LOG4JPROP=$(ordered_find "log4j*properties") - - if [ -z "${LOG4JPROP}" ]; then - LOG4JPROP="" - else - LOG4JPROP="-Dlog4j.configuration=file:$LOG4JPROP" - fi -else - # L4J was set by env var. Unset if that setting is wrong - LOG4JPROP2=$(find "$LOG4JPROP") - if [ -z "${LOG4JPROP2}" ]; then - LOG4JPROP="" - else - LOG4JPROP="-Dlog4j.configuration=file:$LOG4JPROP2" - fi -fi - -if [ -n "${SYSTEMDS_DISTRIBUTED_OPTS}" ]; then - print_out "Overriding SYSTEMDS_DISTRIBUTED_OPTS with env var $SYSTEMDS_DISTRIBUTED_OPTS" -else - # specify parameters to pass to spark-submit when running on spark here - SYSTEMDS_DISTRIBUTED_OPTS="\ - --master yarn \ - --deploy-mode client \ - --driver-memory 100g \ - --conf spark.driver.extraJavaOptions=\"-Xms100g -Xmn10g -Dlog4j.configuration=file:$LOG4JPROP\" \ - --conf spark.executor.extraJavaOptions=\"-Dlog4j.configuration=file:$LOG4JPROP\" \ - --conf spark.executor.heartbeatInterval=100s \ - --files $LOG4JPROP \ - --conf spark.network.timeout=512s \ - --num-executors 4 \ - --executor-memory 64 \ - --executor-cores 16 " -fi - - -############################################################## -# No need to touch the content below. These commands launch -# SystemDS based on the settings above. -############################################################## - - -#------------------------------------------------------------- -# some helper functions - -# error help print -PRINT_SYSDS_HELP=0 -function printUsage { -cat << EOF - -Usage: $0 [-r] [SystemDS.jar] [-f] [arguments] [-help] - - SystemDS.jar : Specify a custom SystemDS.jar file (this will be prepended - to the classpath - or fed to spark-submit - -r : Spawn a debug server for remote debugging (standalone and - spark driver only atm). Default port is 8787 - change within - this script if necessary. See SystemDS documentation on how - to attach a remote debugger. - -f : Optional prefix to the dml-filename for consistency with - previous behavior dml-filename : The script file to run. - This is mandatory unless running as a federated worker - (see below). - arguments : The arguments specified after the DML script are passed to - SystemDS. Specify parameters that need to go to - java/spark-submit by editing this run script. - -help : Print this usage message and SystemDS parameter info - -Worker Usage: $0 [-r] WORKER [SystemDS.jar] [arguments] [-help] - - port : The port to open for the federated worker. - -Set custom launch configuration by setting/editing SYSTEMDS_STANDALONE_OPTS -and/or SYSTEMDS_DISTRIBUTED_OPTS. - -Set the environment variable SYSDS_DISTRIBUTED=1 to run spark-submit instead of -local java Set SYSDS_QUIET=1 to omit extra information printed by this run -script. - -EOF -if [ ${PRINT_SYSDS_HELP} -eq 0 ]; then - exit 0 -fi -} - -# print an error if no argument is supplied. -if [ -z "$1" ] ; then - echo "Wrong Usage. Add -help for additional parameters."; - echo "" - printUsage; -fi - -#This loop handles the parameters to the run-script, not the ones passed to SystemDS. -#To not confuse getopts with SystemDS parameters, only the first two params are considered -#here. If more run-script params are needed, adjust the next line accordingly -while getopts ":hr:f:" options "$1$2"; do - case $options in - h ) echo "Help requested. Will exit after extended usage message!" - PRINT_SYSDS_HELP=1 - printUsage - break - ;; - \? ) echo "Unknown parameter -$OPTARG" - printUsage - exit - ;; - f ) - # silently remove -f (this variant is triggered if there's no - # jar file or WORKER as first parameter) - if echo "$OPTARG" | grep -qi "dml"; then - break - else - print_out "No DML Script found after -f option." - fi - ;; - r ) - print_out "Spawning server for remote debugging" - if [ $SYSDS_DISTRIBUTED == 0 ]; then - SYSTEMDS_STANDALONE_OPTS=${SYSTEMDS_STANDALONE_OPTS}${SYSTEMDS_REMOTE_DEBUGGING} - else - SYSTEMDS_DISTRIBUTED_OPTS=${SYSTEMDS_DISTRIBUTED_OPTS}${SYSTEMDS_REMOTE_DEBUGGING} - fi - shift # remove -r from positional arguments - ;; - * ) - print_out "Error: Unexpected error while processing options;" - printUsage - exit - esac -done - -# Peel off first and/or second argument so that $@ contains arguments to DML script -if echo "$1" | grep -q "jar"; then - SYSTEMDS_JAR_FILE=$1 - shift - # handle optional '-f' before DML file (for consistency) - if echo "$1" | grep -q "\-f"; then - shift - SCRIPT_FILE=$1 - shift - else - SCRIPT_FILE=$1 - shift - fi -elif echo "$1" | grep -q "WORKER"; then - WORKER=1 - shift - if echo "$1" | grep -q "jar"; then - SYSTEMDS_JAR_FILE=$1 - shift - fi - PORT=$1 - re='^[0-9]+$' - if ! [[ $PORT =~ $re ]] ; then - echo "error: Port is not a number" - printUsage - fi - shift -elif echo "$1" | grep -q "FEDMONITORING"; then - FEDMONITORING=1 - shift - if echo "$1" | grep -q "jar"; then - SYSTEMDS_JAR_FILE=$1 - shift - fi - PORT=$1 - re='^[0-9]+$' - if ! [[ $PORT =~ $re ]] ; then - echo "error: Port is not a number" - printUsage - fi - shift -else - # handle optional '-f' before DML file (for consistency) - if echo "$1" | grep -q "\-f"; then - shift - SCRIPT_FILE=$1 - shift - else - SCRIPT_FILE=$1 - shift - fi -fi - -if [ -z "$WORKER" ] ; then - WORKER=0 -fi - -if [ -z "$FEDMONITORING" ] ; then - FEDMONITORING=0 -fi - -# find me a SystemDS jar file to run -if [ -z "$SYSTEMDS_JAR_FILE" ];then - SYSTEMDS_JAR_FILE=$(ordered_find "systemds.jar") - if [ -z "$SYSTEMDS_JAR_FILE" ];then - SYSTEMDS_JAR_FILE=$(ordered_find "systemds-?.?.?.jar") - if [ -z "$SYSTEMDS_JAR_FILE" ];then - SYSTEMDS_JAR_FILE=$(ordered_find "systemds-?.?.?-SNAPSHOT.jar") - fi - fi -else - print_out "Using user supplied systemds jar file $SYSTEMDS_JAR_FILE" -fi - -if [[ "$*" == *-config* ]]; then -# override config file from env var if given as parameter to SystemDS - read -r -d '' -a myArray < <( echo "$@" ) - INDEX=0 - for i in "${myArray[@]}"; do - if [[ ${myArray[INDEX]} == *-config* ]]; then - if [ -f "${myArray[((INDEX+1))]}" ]; then - CONFIG_FILE="${myArray[((INDEX+1))]}" - else - echo Warning! Passed config file "${myArray[((INDEX+1))]}" does not exist. - fi - # remove -config - unset 'myArray[INDEX]' - - # remove -config param if not starting with - - if [[ "${myArray[((INDEX+1))]:0:1}" != "-" ]]; then - unset 'myArray[((INDEX+1))]' - fi - # setting the script arguments without the passed -config for further processing - set -- "${myArray[@]}" - break; - fi - # debug print array item - #echo "${myArray[INDEX]}" - (( INDEX=INDEX+1 )) - done - - if [ -f "$CONFIG_FILE" ] ; then - CONFIG_FILE="-config $CONFIG_FILE" - else - CONFIG_FILE="" - fi -elif [ -z "$CONFIG_FILE" ] ; then - # same as above: set config file param if the file exists - CONFIG_FILE=$(ordered_find "SystemDS-config-defaults.xml") - if [ -z "$CONFIG_FILE" ]; then - CONFIG_FILE=$(ordered_find "SystemDS-config.xml") - fi - if [ -z "$CONFIG_FILE" ]; then - CONFIG_FILE="" - else - CONFIG_FILE="-config $CONFIG_FILE" - fi -else - # CONFIG_FILE was set by env var. Unset if that setting is wrong - if [ -f "${CONFIG_FILE}" ]; then - CONFIG_FILE="-config $CONFIG_FILE" - else - CONFIG_FILE="" - fi -fi - -# override exec mode if given as parameter to SystemDS (e.g. -exec singlenode) -read -r -d '' -a myArray < <( echo "$@" ) -INDEX=0 -for i in "${myArray[@]}"; do - if [[ "$i" == *-exec* ]]; then - SYSDS_EXEC_MODE="${myArray[((INDEX+1))]}" - break; - fi - (( INDEX=INDEX+1 )) -done - -if [ $SYSDS_DISTRIBUTED -ne 0 ] && [[ $SYSDS_EXEC_MODE == "singlenode" ]]; then - echo "Error: Can not run on Spark with execution mode singlenode" - exit 1 -fi - -# find absolute path to hadoop home in SYSTEMDS_ROOT -if [ -z "$HADOOP_HOME" ]; then - HADOOP_HOME=$(realpath "$(find "$SYSTEMDS_ROOT" -iname hadoop | tail -n 1 )") - export HADOOP_HOME -fi -# add hadoop home to path and lib path for loading hadoop jni -HADOOP_REL=$(realpath --relative-to=. "$HADOOP_HOME") - -# default directory separator unix style -DIR_SEP=/ -# detect operating system to set correct path separator -if [ "$OSTYPE" == "win32" ] || [ "$OSTYPE" == "msys" ] || [ "$OSTYPE" == "cygwin" ]; then - PATH_SEP=\; - DIR_SEP=\\ - HADOOP_REL="${HADOOP_REL////\\}" -else - PATH_SEP=: -fi - -# make the jar path relative to skip issues with Windows paths -JARNAME=$(basename "$SYSTEMDS_JAR_FILE") - -# relative path to jar file -SYSTEMDS_JAR_FILE=$(realpath --relative-to=. "$(dirname "$SYSTEMDS_JAR_FILE")")${DIR_SEP}${JARNAME} - -NATIVE_LIBS="$SYSTEMDS_ROOT${DIR_SEP}target${DIR_SEP}classes${DIR_SEP}lib" -export PATH=${HADOOP_REL}${DIR_SEP}bin${PATH_SEP}${PATH}${PATH_SEP}$NATIVE_LIBS -export LD_LIBRARY_PATH=${HADOOP_REL}${DIR_SEP}bin${PATH_SEP}${LD_LIBRARY_PATH} - -# set java class path -CLASSPATH="${SYSTEMDS_JAR_FILE}${PATH_SEP} \ - ${SYSTEMDS_ROOT}${DIR_SEP}lib${DIR_SEP}*${PATH_SEP} \ - ${SYSTEMDS_ROOT}${DIR_SEP}target${DIR_SEP}lib${DIR_SEP}*" -# trim whitespace (introduced by the line breaks above) -CLASSPATH=$(echo "${CLASSPATH}" | tr -d '[:space:]') - -if [ $PRINT_SYSDS_HELP == 1 ]; then - echo "----------------------------------------------------------------------" - echo "Further help on SystemDS arguments:" - java -cp "$CLASSPATH" org.apache.sysds.api.DMLScript -help - exit 1 -fi - -print_out "###############################################################################" -print_out "# SYSTEMDS_ROOT= $SYSTEMDS_ROOT" -print_out "# SYSTEMDS_JAR_FILE= $SYSTEMDS_JAR_FILE" -print_out "# SYSDS_EXEC_MODE= $SYSDS_EXEC_MODE" -print_out "# CONFIG_FILE= $CONFIG_FILE" -print_out "# LOG4JPROP= $LOG4JPROP" -print_out "# CLASSPATH= $CLASSPATH" -print_out "# HADOOP_HOME= $HADOOP_HOME" - -#build the command to run -if [ $WORKER == 1 ]; then - print_out "#" - print_out "# starting Federated worker on port $PORT" - print_out "###############################################################################" - CMD=" \ - java $SYSTEMDS_STANDALONE_OPTS \ - -cp $CLASSPATH \ - $LOG4JPROP \ - org.apache.sysds.api.DMLScript \ - -w $PORT \ - $CONFIG_FILE \ - $*" - print_out "Executing command: $CMD" - print_out "" - -elif [ "$FEDMONITORING" == 1 ]; then - print_out "#" - print_out "# starting Federated backend monitoring on port $PORT" - print_out "###############################################################################" - CMD=" \ - java $SYSTEMDS_STANDALONE_OPTS \ - -cp $CLASSPATH \ - $LOG4JPROP \ - org.apache.sysds.api.DMLScript \ - -fedMonitoring $PORT \ - $CONFIG_FILE \ - $*" - print_out "Executing command: $CMD" - print_out "" - -elif [ $SYSDS_DISTRIBUTED == 0 ]; then - print_out "#" - print_out "# Running script $SCRIPT_FILE locally with opts: $*" - print_out "###############################################################################" - CMD=" \ - java $SYSTEMDS_STANDALONE_OPTS \ - -cp $CLASSPATH \ - $LOG4JPROP \ - --add-modules=jdk.incubator.vector \ - org.apache.sysds.api.DMLScript \ - -f $SCRIPT_FILE \ - -exec $SYSDS_EXEC_MODE \ - $CONFIG_FILE \ - $*" - print_out "Executing command: $CMD" - print_out "" -else - print_out "#" - print_out "# Running script $SCRIPT_FILE distributed with opts: $*" - print_out "###############################################################################" - export SPARK_MAJOR_VERSION=2 - CMD=" \ - spark-submit $SYSTEMDS_DISTRIBUTED_OPTS \ - $SYSTEMDS_JAR_FILE \ - -f $SCRIPT_FILE \ - -exec $SYSDS_EXEC_MODE \ - $CONFIG_FILE \ - $*" - print_out "Executing command: $CMD" - print_out "" -fi - -# run -eval "$CMD" diff --git a/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixMult.java b/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixMult.java index a9f4beaed29..af702cb7fad 100644 --- a/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixMult.java +++ b/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixMult.java @@ -3717,26 +3717,22 @@ public static double dotProduct( double[] a, double[] b, int ai, int bi, final i public static double dotProduct( double[] a, double[] b, int[] aix, int ai, final int bi, final int len ) { double val = 0; - final int bn = len%8; + final int bn = len%vLen; //compute rest for( int i = ai; i < ai+bn; i++ ) val += a[ i ] * b[ bi+aix[i] ]; - //unrolled 8-block (for better instruction-level parallelism) - for( int i = ai+bn; i < ai+len; i+=8 ) + //unrolled vLen-block (for better instruction-level parallelism) + for( int i = ai+bn; i < ai+len; i+=vLen) { //read 64B cacheline of a //read 64B of b via 'gather' //compute cval' = sum(a * b) + cval - val += a[ i+0 ] * b[ bi+aix[i+0] ] - + a[ i+1 ] * b[ bi+aix[i+1] ] - + a[ i+2 ] * b[ bi+aix[i+2] ] - + a[ i+3 ] * b[ bi+aix[i+3] ] - + a[ i+4 ] * b[ bi+aix[i+4] ] - + a[ i+5 ] * b[ bi+aix[i+5] ] - + a[ i+6 ] * b[ bi+aix[i+6] ] - + a[ i+7 ] * b[ bi+aix[i+7] ]; + var aVec = DoubleVector.fromArray(SPECIES, a, i); + var bVec = DoubleVector.fromArray(SPECIES, b, bi, aix, i); + val += aVec.mul(bVec).reduceLanes(VectorOperators.ADD); + } //scalar result