From 62d78a241c95d09896b731776e29a8cb883dfc49 Mon Sep 17 00:00:00 2001 From: Ignacio Bermudez Date: Tue, 9 May 2017 21:31:03 -0700 Subject: [PATCH 1/6] Reproducing SPARK-20687 --- .../spark/mllib/linalg/MatricesSuite.scala | 21 +++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/mllib/src/test/scala/org/apache/spark/mllib/linalg/MatricesSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/linalg/MatricesSuite.scala index 563756907d201..b7f8cdba0f0db 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/linalg/MatricesSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/linalg/MatricesSuite.scala @@ -46,6 +46,27 @@ class MatricesSuite extends SparkFunSuite { } } + test("breeze conversion bug") { + // (2, 0, 0) + // (2, 0, 0) + val mat1Brz = Matrices.sparse(2, 3, Array(0, 2, 2, 2), Array(0, 1), Array(2, 2)).asBreeze + // (2, 1E-15, 1E-15) + // (2, 1E-15, 1E-15 + val mat2Brz = Matrices.sparse(2, 3, Array(0, 2, 4, 6), Array(0, 0, 0, 1, 1, 1), Array(2, 1E-15, 1E-15, 2, 1E-15, 1E-15)).asBreeze + // The following shouldn't break + val t01 = mat1Brz - mat1Brz + val t02 = mat2Brz - mat2Brz + val t02Brz = Matrices.fromBreeze(t02) + val t01Brz = Matrices.fromBreeze(t01) + + val t1Brz = mat1Brz - mat2Brz + val t2Brz = mat2Brz - mat1Brz + // The following ones should break + val t1 = Matrices.fromBreeze(t1Brz) + val t2 = Matrices.fromBreeze(t2Brz) + + } + test("sparse matrix construction") { val m = 3 val n = 4 From dbbd39121f3210f6edd7a74bb21853fbda20c0cb Mon Sep 17 00:00:00 2001 From: Ignacio Bermudez Date: Wed, 10 May 2017 11:03:14 -0700 Subject: [PATCH 2/6] [SPARK-20687] mllib.Matrices.fromBreeze may cause crash when converting breeze CSCMatrix In an operation of two A, B CSCMatrices the resulting C matrix may have some extra 0s in rowIndices and data which are created for performance improvement by Breeze. This causes problems on converting back to mllib.Matrix because it relies on rowIndices and data being coherent with colPtrs. Therefore it is necessary to truncate rowIndices and data to the active number of elements hold by the C matrix, before creating a Spark's SparseMatrix. --- .../org/apache/spark/mllib/linalg/Matrices.scala | 15 ++++++++++++++- .../spark/mllib/linalg/MatricesSuite.scala | 16 ++++++---------- 2 files changed, 20 insertions(+), 11 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/linalg/Matrices.scala b/mllib/src/main/scala/org/apache/spark/mllib/linalg/Matrices.scala index 6c39fe5d84865..b8a0fc6a38287 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/linalg/Matrices.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/linalg/Matrices.scala @@ -992,7 +992,20 @@ object Matrices { new DenseMatrix(dm.rows, dm.cols, dm.data, dm.isTranspose) case sm: BSM[Double] => // There is no isTranspose flag for sparse matrices in Breeze - new SparseMatrix(sm.rows, sm.cols, sm.colPtrs, sm.rowIndices, sm.data) + + // Some Breeze CSCMatrices may have extra trailing zeros in + // .rowIndices and .data, which are added after some matrix + // operations for efficiency. + // + // Therefore the last element of sm.colPtrs would no longer be + // coherent with the size of sm.rowIndices and sm.data + // despite sm being a valid CSCMatrix. + // We need to truncate both arrays (rowIndices, data) + // to the real size of the vector sm.activeSize to allow valid conversion + + val truncRowIndices = sm.rowIndices.slice(0, sm.activeSize) + val truncData = sm.data.slice(0, sm.activeSize) + new SparseMatrix(sm.rows, sm.cols, sm.colPtrs, truncRowIndices, truncData) case _ => throw new UnsupportedOperationException( s"Do not support conversion from type ${breeze.getClass.getName}.") diff --git a/mllib/src/test/scala/org/apache/spark/mllib/linalg/MatricesSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/linalg/MatricesSuite.scala index b7f8cdba0f0db..e8542edd7fe42 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/linalg/MatricesSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/linalg/MatricesSuite.scala @@ -46,25 +46,21 @@ class MatricesSuite extends SparkFunSuite { } } - test("breeze conversion bug") { + test("Test Breeze Conversion Bug - SPARK-20687") { // (2, 0, 0) // (2, 0, 0) val mat1Brz = Matrices.sparse(2, 3, Array(0, 2, 2, 2), Array(0, 1), Array(2, 2)).asBreeze // (2, 1E-15, 1E-15) - // (2, 1E-15, 1E-15 + // (2, 1E-15, 1E-15) val mat2Brz = Matrices.sparse(2, 3, Array(0, 2, 4, 6), Array(0, 0, 0, 1, 1, 1), Array(2, 1E-15, 1E-15, 2, 1E-15, 1E-15)).asBreeze - // The following shouldn't break - val t01 = mat1Brz - mat1Brz - val t02 = mat2Brz - mat2Brz - val t02Brz = Matrices.fromBreeze(t02) - val t01Brz = Matrices.fromBreeze(t01) - val t1Brz = mat1Brz - mat2Brz val t2Brz = mat2Brz - mat1Brz - // The following ones should break + // The following operations raise exceptions on un-patch Matrices.fromBreeze val t1 = Matrices.fromBreeze(t1Brz) val t2 = Matrices.fromBreeze(t2Brz) - + // t1 == t1Brz && t2 == t2Brz + assert((t1.asBreeze - t1Brz).iterator.map((x) => math.abs(x._2)).sum < 1E-15) + assert((t2.asBreeze - t2Brz).iterator.map((x) => math.abs(x._2)).sum < 1E-15) } test("sparse matrix construction") { From df84eb909098b8b5876e8944801dc9a222b44a54 Mon Sep 17 00:00:00 2001 From: Ignacio Bermudez Date: Thu, 11 May 2017 13:08:37 -0700 Subject: [PATCH 3/6] Line too long in test --- .../scala/org/apache/spark/mllib/linalg/MatricesSuite.scala | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/mllib/src/test/scala/org/apache/spark/mllib/linalg/MatricesSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/linalg/MatricesSuite.scala index e8542edd7fe42..ed8684556bc4d 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/linalg/MatricesSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/linalg/MatricesSuite.scala @@ -52,7 +52,10 @@ class MatricesSuite extends SparkFunSuite { val mat1Brz = Matrices.sparse(2, 3, Array(0, 2, 2, 2), Array(0, 1), Array(2, 2)).asBreeze // (2, 1E-15, 1E-15) // (2, 1E-15, 1E-15) - val mat2Brz = Matrices.sparse(2, 3, Array(0, 2, 4, 6), Array(0, 0, 0, 1, 1, 1), Array(2, 1E-15, 1E-15, 2, 1E-15, 1E-15)).asBreeze + val mat2Brz = Matrices.sparse(2, 3, + Array(0, 2, 4, 6), + Array(0, 0, 0, 1, 1, 1), + Array(2, 1E-15, 1E-15, 2, 1E-15, 1E-15)).asBreeze val t1Brz = mat1Brz - mat2Brz val t2Brz = mat2Brz - mat1Brz // The following operations raise exceptions on un-patch Matrices.fromBreeze From b40a706e4fa5733d8205c31efd48ff67b9c75575 Mon Sep 17 00:00:00 2001 From: Ignacio Bermudez Date: Thu, 11 May 2017 22:37:55 -0700 Subject: [PATCH 4/6] Addressing suggestions of @hhbyyh --- .../apache/spark/mllib/linalg/Matrices.scala | 10 +++-- .../spark/mllib/linalg/MatricesSuite.scala | 40 +++++++++---------- 2 files changed, 27 insertions(+), 23 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/linalg/Matrices.scala b/mllib/src/main/scala/org/apache/spark/mllib/linalg/Matrices.scala index b8a0fc6a38287..da5f39dd81fdc 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/linalg/Matrices.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/linalg/Matrices.scala @@ -1003,9 +1003,13 @@ object Matrices { // We need to truncate both arrays (rowIndices, data) // to the real size of the vector sm.activeSize to allow valid conversion - val truncRowIndices = sm.rowIndices.slice(0, sm.activeSize) - val truncData = sm.data.slice(0, sm.activeSize) - new SparseMatrix(sm.rows, sm.cols, sm.colPtrs, truncRowIndices, truncData) + if (sm.rowIndices.length > sm.activeSize) { + val smC = sm.copy + smC.compact() + new SparseMatrix(smC.rows, smC.cols, smC.colPtrs, smC.rowIndices, smC.data) + } else { + new SparseMatrix(sm.rows, sm.cols, sm.colPtrs, sm.rowIndices, sm.data) + } case _ => throw new UnsupportedOperationException( s"Do not support conversion from type ${breeze.getClass.getName}.") diff --git a/mllib/src/test/scala/org/apache/spark/mllib/linalg/MatricesSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/linalg/MatricesSuite.scala index ed8684556bc4d..93c00d80974c3 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/linalg/MatricesSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/linalg/MatricesSuite.scala @@ -46,26 +46,6 @@ class MatricesSuite extends SparkFunSuite { } } - test("Test Breeze Conversion Bug - SPARK-20687") { - // (2, 0, 0) - // (2, 0, 0) - val mat1Brz = Matrices.sparse(2, 3, Array(0, 2, 2, 2), Array(0, 1), Array(2, 2)).asBreeze - // (2, 1E-15, 1E-15) - // (2, 1E-15, 1E-15) - val mat2Brz = Matrices.sparse(2, 3, - Array(0, 2, 4, 6), - Array(0, 0, 0, 1, 1, 1), - Array(2, 1E-15, 1E-15, 2, 1E-15, 1E-15)).asBreeze - val t1Brz = mat1Brz - mat2Brz - val t2Brz = mat2Brz - mat1Brz - // The following operations raise exceptions on un-patch Matrices.fromBreeze - val t1 = Matrices.fromBreeze(t1Brz) - val t2 = Matrices.fromBreeze(t2Brz) - // t1 == t1Brz && t2 == t2Brz - assert((t1.asBreeze - t1Brz).iterator.map((x) => math.abs(x._2)).sum < 1E-15) - assert((t2.asBreeze - t2Brz).iterator.map((x) => math.abs(x._2)).sum < 1E-15) - } - test("sparse matrix construction") { val m = 3 val n = 4 @@ -533,6 +513,26 @@ class MatricesSuite extends SparkFunSuite { Matrices.fromBreeze(sum) } + test("Test FromBreeze when Breeze.CSCMatrix.rowIndices has trailing zeros. - SPARK-20687") { + // (2, 0, 0) + // (2, 0, 0) + val mat1Brz = Matrices.sparse(2, 3, Array(0, 2, 2, 2), Array(0, 1), Array(2, 2)).asBreeze + // (2, 1E-15, 1E-15) + // (2, 1E-15, 1E-15) + val mat2Brz = Matrices.sparse(2, 3, + Array(0, 2, 4, 6), + Array(0, 0, 0, 1, 1, 1), + Array(2, 1E-15, 1E-15, 2, 1E-15, 1E-15)).asBreeze + val t1Brz = mat1Brz - mat2Brz + val t2Brz = mat2Brz - mat1Brz + // The following operations raise exceptions on un-patch Matrices.fromBreeze + val t1 = Matrices.fromBreeze(t1Brz) + val t2 = Matrices.fromBreeze(t2Brz) + // t1 == t1Brz && t2 == t2Brz + assert((t1.asBreeze - t1Brz).iterator.map((x) => math.abs(x._2)).sum < 1E-15) + assert((t2.asBreeze - t2Brz).iterator.map((x) => math.abs(x._2)).sum < 1E-15) + } + test("row/col iterator") { val dm = new DenseMatrix(3, 2, Array(0, 1, 2, 3, 4, 0)) val sm = dm.toSparse From bc8e14cfff5d6537a4d31db49769cda3f660bf27 Mon Sep 17 00:00:00 2001 From: Ignacio Bermudez Corrales Date: Fri, 19 May 2017 10:41:08 -0700 Subject: [PATCH 5/6] Shortening description+cosmetic --- .../apache/spark/mllib/linalg/Matrices.scala | 24 +++++++------------ 1 file changed, 8 insertions(+), 16 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/linalg/Matrices.scala b/mllib/src/main/scala/org/apache/spark/mllib/linalg/Matrices.scala index da5f39dd81fdc..903f575af0b91 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/linalg/Matrices.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/linalg/Matrices.scala @@ -992,24 +992,16 @@ object Matrices { new DenseMatrix(dm.rows, dm.cols, dm.data, dm.isTranspose) case sm: BSM[Double] => // There is no isTranspose flag for sparse matrices in Breeze - - // Some Breeze CSCMatrices may have extra trailing zeros in - // .rowIndices and .data, which are added after some matrix - // operations for efficiency. - // - // Therefore the last element of sm.colPtrs would no longer be - // coherent with the size of sm.rowIndices and sm.data - // despite sm being a valid CSCMatrix. - // We need to truncate both arrays (rowIndices, data) - // to the real size of the vector sm.activeSize to allow valid conversion - - if (sm.rowIndices.length > sm.activeSize) { - val smC = sm.copy - smC.compact() - new SparseMatrix(smC.rows, smC.cols, smC.colPtrs, smC.rowIndices, smC.data) + val nsm = if (sm.rowIndices.length > sm.activeSize) { + // This sparse matrix has trainling zeros. + // Remove them by compacting the matrix. + val csm = sm.copy + csm.compact() + csm } else { - new SparseMatrix(sm.rows, sm.cols, sm.colPtrs, sm.rowIndices, sm.data) + sm } + new SparseMatrix(nsm.rows, nsm.cols, nsm.colPtrs, nsm.rowIndices, nsm.data) case _ => throw new UnsupportedOperationException( s"Do not support conversion from type ${breeze.getClass.getName}.") From 18ce388d81c54df8d17bb9690f489999a0d19858 Mon Sep 17 00:00:00 2001 From: Ignacio Bermudez Corrales Date: Sat, 20 May 2017 15:28:09 -0700 Subject: [PATCH 6/6] Comment typo --- .../src/main/scala/org/apache/spark/mllib/linalg/Matrices.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/linalg/Matrices.scala b/mllib/src/main/scala/org/apache/spark/mllib/linalg/Matrices.scala index 903f575af0b91..2b2b5fe49ea32 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/linalg/Matrices.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/linalg/Matrices.scala @@ -993,7 +993,7 @@ object Matrices { case sm: BSM[Double] => // There is no isTranspose flag for sparse matrices in Breeze val nsm = if (sm.rowIndices.length > sm.activeSize) { - // This sparse matrix has trainling zeros. + // This sparse matrix has trailing zeros. // Remove them by compacting the matrix. val csm = sm.copy csm.compact()