org.jcuda
jcuda
diff --git a/src/test/java/org/apache/sysds/test/component/io/DeltaMatrixCoverageTest.java b/src/test/java/org/apache/sysds/test/component/io/DeltaMatrixCoverageTest.java
new file mode 100644
index 00000000000..8d7ad14539a
--- /dev/null
+++ b/src/test/java/org/apache/sysds/test/component/io/DeltaMatrixCoverageTest.java
@@ -0,0 +1,301 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.sysds.test.component.io;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertTrue;
+import static org.junit.Assert.fail;
+import static org.mockito.Mockito.mock;
+import static org.mockito.Mockito.when;
+
+import java.io.File;
+import java.lang.reflect.InvocationTargetException;
+import java.lang.reflect.Method;
+import java.nio.file.Files;
+import java.nio.file.Path;
+
+import org.apache.commons.io.FileUtils;
+import org.apache.sysds.conf.ConfigurationManager;
+import org.apache.sysds.conf.DMLConfig;
+import org.apache.sysds.runtime.DMLRuntimeException;
+import org.apache.sysds.runtime.data.DenseBlock;
+import org.apache.sysds.runtime.data.DenseBlockLDRB;
+import org.apache.sysds.runtime.data.DenseBlockLFP64;
+import org.apache.sysds.runtime.io.DeltaKernelUtils;
+import org.apache.sysds.runtime.io.ReaderDelta;
+import org.apache.sysds.runtime.io.ReaderDeltaParallel;
+import org.apache.sysds.runtime.io.WriterDelta;
+import org.apache.sysds.runtime.matrix.data.MatrixBlock;
+import org.apache.sysds.test.TestUtils;
+import org.junit.Test;
+
+import io.delta.kernel.data.ColumnVector;
+import io.delta.kernel.data.Row;
+import io.delta.kernel.internal.InternalScanFileUtils;
+import io.delta.kernel.types.BinaryType;
+import io.delta.kernel.types.DataType;
+import io.delta.kernel.types.DateType;
+import io.delta.kernel.types.StringType;
+import io.delta.kernel.types.StructType;
+import io.delta.kernel.types.TimestampType;
+
+/**
+ * Targeted tests for the error/defensive branches of the native Delta matrix
+ * read/write code that the round-trip and interop tests do not reach: malformed
+ * per-file statistics, unsupported column types, unsupported stream operations,
+ * bad table paths, and the non-dense writer input path.
+ * <p>
+ * A few of these branches guard against inputs that the SystemDS writer and
+ * the Delta Kernel scan API never produce in a normal round trip (e.g. a
+ * statistics JSON without {@code numRecords}, or a column type code outside the
+ * supported set). They are exercised here by mocking the Delta Kernel data
+ * objects and invoking the (package-private) helpers reflectively, rather than
+ * widening their production visibility purely for testing.
+ */
+public class DeltaMatrixCoverageTest {
+
+ // ---------------------------------------------------------------------
+ // public defensive paths
+ // ---------------------------------------------------------------------
+
+	/** qualify must reject an unresolvable scheme with a DMLRuntimeException whose message contains "Delta table path". */
+	@Test
+	public void qualifyRejectsUnknownFilesystemScheme() {
+		String msg = null;
+		try {
+			DeltaKernelUtils.qualify("nosuchfs://host/path/to/table");
+			fail("expected a DMLRuntimeException for an unresolvable table path");
+		}
+		catch(DMLRuntimeException ex) { msg = ex.getMessage(); }
+		assertTrue("message should reference the bad path, got: " + msg, msg != null && msg.contains("Delta table path"));
+	}
+
+ @Test
+ public void typeCodeReturnsNegativeForUnsupportedTypes() {
+ //non-numeric / unsupported Delta types must map to the sentinel -1 so the
+ //reader can reject them with a clear message rather than mis-decoding.
+ assertEquals(-1, DeltaKernelUtils.typeCode(DateType.DATE));
+ assertEquals(-1, DeltaKernelUtils.typeCode(TimestampType.TIMESTAMP));
+ assertEquals(-1, DeltaKernelUtils.typeCode(BinaryType.BINARY));
+ }
+
+ @Test(expected = UnsupportedOperationException.class)
+ public void readerRejectsInputStream() throws Exception {
+ new ReaderDelta().readMatrixFromInputStream(null, 1, 1, -1, -1); //stream input must be rejected with the expected exception above
+ }
+
+ @Test(expected = UnsupportedOperationException.class)
+ public void writerRejectsStreamWrite() throws Exception {
+ new WriterDelta().writeMatrixFromStream("dummy", null, 1, 1, -1); //stream-based writing must be rejected with the expected exception above
+ }
+
+ // ---------------------------------------------------------------------
+ // mocked / reflective coverage of internal defensive branches
+ // ---------------------------------------------------------------------
+
+ @Test
+ public void numRecordsHandlesAbsentNullAndMalformedStats() throws Exception {
+ //no "stats" field at all -> -1
+ assertEquals(-1, numRecords(addFileRow(new StructType().add("path", StringType.STRING), false, null)));
+ //stats column present but null-at -> -1
+ assertEquals(-1, numRecords(addFileRow(statsSchema(), true, null)));
+ //stats string explicitly null -> -1
+ assertEquals(-1, numRecords(addFileRow(statsSchema(), false, null)));
+ //malformed JSON -> JsonProcessingException -> -1
+ assertEquals(-1, numRecords(addFileRow(statsSchema(), false, "{not valid json")));
+ //valid JSON but no numRecords field -> -1
+ assertEquals(-1, numRecords(addFileRow(statsSchema(), false, "{\"minValues\":{}}")));
+ //well-formed stats -> the parsed count
+ assertEquals(1234L, numRecords(addFileRow(statsSchema(), false, "{\"numRecords\":1234}")));
+ }
+
+	/** getDoubleValue must throw a DMLRuntimeException for a type code outside the supported set (here 999). */
+	@Test
+	public void getDoubleValueRejectsUnknownTypeCode() throws Exception {
+		Method getDoubleValue = ReaderDelta.class.getDeclaredMethod("getDoubleValue", ColumnVector.class, int.class, int.class);
+		getDoubleValue.setAccessible(true);
+		Throwable cause = null;
+		try {
+			//the vector is deliberately null: the type code is expected to be rejected before the vector is touched
+			getDoubleValue.invoke(null, (ColumnVector) null, 0, 999);
+			fail("expected a DMLRuntimeException for an unsupported type code");
+		}
+		catch(InvocationTargetException ite) { cause = ite.getCause(); }
+		assertTrue(cause instanceof DMLRuntimeException);
+	}
+
+	/** numericTypeCode must throw a DMLRuntimeException when handed the non-numeric DateType. */
+	@Test
+	public void numericTypeCodeRejectsNonNumericType() throws Exception {
+		Method numericTypeCode = ReaderDelta.class.getDeclaredMethod("numericTypeCode", DataType.class, String.class);
+		numericTypeCode.setAccessible(true);
+		try {
+			//reflection wraps whatever the helper throws; the actual failure is inspected as the cause
+			numericTypeCode.invoke(null, DateType.DATE, "d");
+			fail("expected a DMLRuntimeException for a non-numeric column type");
+		}
+		catch(InvocationTargetException wrapped) { assertTrue(wrapped.getCause() instanceof DMLRuntimeException); }
+	}
+
+	/**
+	 * A failing per-file decode in the parallel reader must surface as a single IOException whose message
+	 * contains "parallel read of Delta table"; it is provoked by deleting one .parquet file of a written table.
+	 */
+	@Test
+	public void parallelReadWrapsFileFailure() throws Exception {
+		MatrixBlock in = TestUtils.generateTestMatrixBlock(100_000, 8, -10, 10, 1.0, 13);
+		in.recomputeNonZeros();
+		DMLConfig conf = new DMLConfig();
+		conf.setTextValue(DMLConfig.DELTA_WRITER_TARGET_FILE_SIZE, String.valueOf(256L * 1024));
+		Path dir = Files.createTempDirectory("sysds_delta_fail_");
+		String tablePath = new File(dir.toFile(), "table").getAbsolutePath();
+		//install the config override last: if creating the temp dir fails, nothing is left to clean up
+		ConfigurationManager.setLocalConfig(conf);
+		try {
+			new WriterDelta().writeMatrixToHDFS(in, tablePath, in.getNumRows(), in.getNumColumns(), -1, in.getNonZeros());
+
+			//pick one data file to delete; Stream<Path> (not a raw Stream) is required for Path::toFile
+			File victim;
+			try( java.util.stream.Stream<Path> files = Files.walk(new File(tablePath).toPath()) ) {
+				victim = files.filter(p -> p.toString().endsWith(".parquet"))
+					.findFirst().map(Path::toFile).orElse(null);
+			}
+			assertTrue("expected at least one data file to delete", victim != null && victim.delete());
+
+			try {
+				new ReaderDeltaParallel().readMatrixFromHDFS(tablePath, -1, -1, -1, -1);
+				fail("expected an IOException when a Delta data file is missing");
+			}
+			catch(java.io.IOException ex) {
+				assertTrue("message should describe the failed parallel read, got: " + ex.getMessage(),
+					ex.getMessage() != null && ex.getMessage().contains("parallel read of Delta table"));
+			}
+		}
+		finally {
+			ConfigurationManager.clearLocalConfigs();
+			FileUtils.deleteQuietly(dir.toFile());
+		}
+	}
+
+ // ---------------------------------------------------------------------
+ // non-dense writer input path
+ // ---------------------------------------------------------------------
+
+	/** Round-trips a MatrixBlock held in sparse format through WriterDelta and the serial ReaderDelta. */
+	@Test
+	public void sparseFormatMatrixRoundTrips() throws Exception {
+		MatrixBlock sparseIn = TestUtils.generateTestMatrixBlock(2000, 7, -5, 5, 0.05, 13);
+		sparseIn.recomputeNonZeros();
+		sparseIn.examSparsity();
+		assertTrue("input should be in sparse format to exercise the non-dense path", sparseIn.isInSparseFormat());
+
+		Path tmpDir = Files.createTempDirectory("sysds_delta_sparse_");
+		String tablePath = new File(tmpDir.toFile(), "table").getAbsolutePath();
+		try {
+			//dimensions are read once and reused for the write call and the checks
+			int nrow = sparseIn.getNumRows(), ncol = sparseIn.getNumColumns();
+			new WriterDelta().writeMatrixToHDFS(sparseIn, tablePath, nrow, ncol, -1, sparseIn.getNonZeros());
+			MatrixBlock out = new ReaderDelta().readMatrixFromHDFS(tablePath, -1, -1, -1, -1);
+			assertEquals("rows", nrow, out.getNumRows());
+			assertEquals("cols", ncol, out.getNumColumns());
+			TestUtils.compareMatrices(sparseIn, out, 1e-12, "sparse-format-roundtrip");
+		}
+		finally {
+			FileUtils.deleteQuietly(tmpDir.toFile());
+		}
+	}
+
+	/**
+	 * Calls ReaderDelta.fillDense on a dense block for which isContiguous() is false and verifies every cell;
+	 * the block is made multi-block by lowering DenseBlockLDRB.MAX_ALLOC only while it is allocated.
+	 */
+	@Test
+	public void fillDenseHandlesNonContiguousBlock() throws Exception {
+		int rows = 5, cols = 4;
+		int savedMaxAlloc = DenseBlockLDRB.MAX_ALLOC;
+		DenseBlock db;
+		try {
+			DenseBlockLDRB.MAX_ALLOC = 2 * cols; //~2 rows per block -> multiple blocks
+			db = new DenseBlockLFP64(new int[] {rows, cols});
+		}
+		finally {
+			DenseBlockLDRB.MAX_ALLOC = savedMaxAlloc; //always restore the static cap
+		}
+		assertTrue("expected a non-contiguous (multi-block) dense block", !db.isContiguous());
+
+		//two row-major batches (3 rows + 2 rows) covering all 5 rows, collected in a typed (non-raw) list
+		double[] b0 = new double[3 * cols];
+		double[] b1 = new double[2 * cols];
+		for( int r = 0; r < 3; r++ )
+			for( int c = 0; c < cols; c++ )
+				b0[r * cols + c] = cell(r, c);
+		for( int r = 0; r < 2; r++ )
+			for( int c = 0; c < cols; c++ )
+				b1[r * cols + c] = cell(3 + r, c);
+		java.util.ArrayList<double[]> batches = new java.util.ArrayList<>();
+		batches.add(b0);
+		batches.add(b1);
+
+		MatrixBlock ret = new MatrixBlock(rows, cols, db);
+		Method m = ReaderDelta.class.getDeclaredMethod("fillDense", MatrixBlock.class, java.util.ArrayList.class);
+		m.setAccessible(true);
+		m.invoke(null, ret, batches);
+
+		for( int r = 0; r < rows; r++ )
+			for( int c = 0; c < cols; c++ )
+				assertEquals("r" + r + " c" + c, cell(r, c), ret.getDenseBlock().get(r, c), 0.0);
+	}
+
+	private static double cell(int r, int c) {
+		return 10 * r + c; //encodes the coordinates: tens = row, units = column (for c < 10)
+	}
+
+ // ---------------------------------------------------------------------
+ // helpers
+ // ---------------------------------------------------------------------
+
+ private static StructType statsSchema() {
+ return new StructType().add("stats", StringType.STRING); //schema with a single STRING field named "stats"
+ }
+
+	/**
+	 * Mocks the scan-file row read by {@code numRecords}: its struct at ADD_FILE_ORDINAL is a mocked AddFile row
+	 * with schema {@code addSchema} whose "stats" field (if present) is stubbed with the given null flag and value.
+	 */
+	private static Row addFileRow(StructType addSchema, boolean statsNull, String statsValue) {
+		Row addFile = mock(Row.class);
+		when(addFile.getSchema()).thenReturn(addSchema);
+		int ord = addSchema.fieldNames().indexOf("stats");
+		if( ord >= 0 ) {
+			when(addFile.isNullAt(ord)).thenReturn(statsNull);
+			if( !statsNull ) //getString is only stubbed for a non-null stats field
+				when(addFile.getString(ord)).thenReturn(statsValue);
+		}
+		Row scanFile = mock(Row.class);
+		when(scanFile.getStruct(InternalScanFileUtils.ADD_FILE_ORDINAL)).thenReturn(addFile);
+		return scanFile;
+	}
+
+	private static long numRecords(Row scanFileRow) throws Exception { //reflective call of DeltaKernelUtils.numRecords(Row)
+		Method target = DeltaKernelUtils.class.getDeclaredMethod("numRecords", Row.class);
+		target.setAccessible(true);
+		return ((Long) target.invoke(null, scanFileRow)).longValue();
+	}
+}
diff --git a/src/test/java/org/apache/sysds/test/component/io/DeltaMatrixSparkInteropTest.java b/src/test/java/org/apache/sysds/test/component/io/DeltaMatrixSparkInteropTest.java
new file mode 100644
index 00000000000..2d79b79f2dd
--- /dev/null
+++ b/src/test/java/org/apache/sysds/test/component/io/DeltaMatrixSparkInteropTest.java
@@ -0,0 +1,271 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.sysds.test.component.io;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertTrue;
+
+import java.io.File;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+import org.apache.commons.io.FileUtils;
+import org.apache.sysds.conf.ConfigurationManager;
+import org.apache.sysds.conf.DMLConfig;
+import org.apache.sysds.runtime.io.ReaderDelta;
+import org.apache.sysds.runtime.io.ReaderDeltaParallel;
+import org.apache.sysds.runtime.io.WriterDelta;
+import org.apache.sysds.runtime.matrix.data.MatrixBlock;
+import org.apache.spark.sql.Dataset;
+import org.apache.spark.sql.Row;
+import org.apache.spark.sql.RowFactory;
+import org.apache.spark.sql.SparkSession;
+import org.apache.spark.sql.types.DataTypes;
+import org.apache.spark.sql.types.StructField;
+import org.apache.spark.sql.types.StructType;
+import org.junit.AfterClass;
+import org.junit.BeforeClass;
+import org.junit.Test;
+
+/**
+ * Cross-engine interoperability tests for the native (Delta Kernel based) matrix
+ * reader/writer against the reference Delta implementation (Delta's Spark
+ * connector, {@code delta-spark}, pulled in test-only).
+ * <p>
+ * The other Delta matrix tests round-trip exclusively through SystemDS' own
+ * Kernel-based read/write paths, so they cannot catch a table that SystemDS
+ * writes in a way other Delta engines reject (or vice versa). These tests close
+ * that gap by routing data through two independent engines:
+ * <ul>
+ * <li>SystemDS writes -&gt; Spark/Delta reads (our output is spec-compliant), and</li>
+ * <li>Spark/Delta writes -&gt; SystemDS reads, including a multi-file layout and a
+ * table with deletion vectors / a second commit that the SystemDS writer
+ * never produces itself.</li>
+ * </ul>
+ * <p>
+ * Row order is never assumed: every table carries a unique id in column 0 and
+ * comparisons are keyed by that id, since neither engine guarantees row order
+ * across files.
+ */
+@net.jcip.annotations.NotThreadSafe
+public class DeltaMatrixSparkInteropTest {
+
+ private static SparkSession spark; //shared by all tests: assigned in startSpark(), stopped and nulled in stopSpark()
+
+ @BeforeClass
+ public static void startSpark() {
+ //each test class runs in its own fork (surefire reuseForks=false), so this
+ //is the only SparkSession in the JVM and gets the Delta extensions injected.
+ SparkSession.clearActiveSession();
+ SparkSession.clearDefaultSession();
+ spark = SparkSession.builder()
+ .appName("sysds-delta-interop")
+ .master("local[2]")
+ .config("spark.ui.enabled", "false")
+ .config("spark.sql.shuffle.partitions", "2")
+ .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
+ .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
+ .getOrCreate();
+ }
+
+	/** Stops the shared SparkSession if one was started and clears Spark's active and default session. */
+	@AfterClass
+	public static void stopSpark() {
+		if( spark != null ) { spark.stop(); }
+		SparkSession.clearActiveSession();
+		SparkSession.clearDefaultSession();
+		spark = null;
+	}
+
+	/**
+	 * SystemDS writes a multi-file Delta table (forced by a small DELTA_WRITER_TARGET_FILE_SIZE) and Spark's
+	 * Delta reader must return every row with matching values; rows are matched via the id in column 0.
+	 */
+	@Test
+	public void systemdsWriteSparkReadMultiFile() throws Exception {
+		int rows = 500, cols = 5;
+		MatrixBlock in = indexedMatrix(rows, cols);
+		DMLConfig conf = new DMLConfig();
+		conf.setTextValue(DMLConfig.DELTA_WRITER_TARGET_FILE_SIZE, String.valueOf(16L * 1024));
+		Path dir = Files.createTempDirectory("sysds_delta_s2s_");
+		String tablePath = new File(dir.toFile(), "table").getAbsolutePath();
+		//install the config override last: if creating the temp dir fails, nothing is left to clean up
+		ConfigurationManager.setLocalConfig(conf);
+		try {
+			new WriterDelta().writeMatrixToHDFS(in, tablePath, rows, cols, -1, in.getNonZeros());
+			assertTrue("writer should have produced a multi-file table", countParquet(tablePath) > 1);
+
+			//Dataset<Row>/List<Row> instead of raw types, which the for-each over Row needs to compile
+			Dataset<Row> df = spark.read().format("delta").load(tablePath);
+			assertEquals("rows", rows, df.count());
+			assertEquals("cols", cols, df.schema().fields().length);
+			List<Row> read = df.collectAsList();
+			assertEquals(rows, read.size());
+			for( Row r : read ) {
+				int id = (int) Math.round(r.getDouble(0));
+				assertTrue("id in range: " + id, id >= 0 && id < rows);
+				for( int c = 0; c < cols; c++ )
+					assertEquals("r" + id + " c" + c, in.get(id, c), r.getDouble(c), 1e-9);
+			}
+		}
+		finally {
+			ConfigurationManager.clearLocalConfigs();
+			FileUtils.deleteQuietly(dir.toFile());
+		}
+	}
+
+	/**
+	 * Spark writes a multi-file Delta table (3 partitions, every column a double) and both the serial and
+	 * the parallel SystemDS reader must reconstruct it; rows are matched via the id in column 0.
+	 */
+	@Test
+	public void sparkWriteSystemdsReadMultiFile() throws Exception {
+		int rows = 600, cols = 4;
+		Dataset<Row> df = indexedDataFrame(rows, cols).repartition(3); //-> multiple data files
+		Path dir = Files.createTempDirectory("sysds_delta_p2s_");
+		String tablePath = new File(dir.toFile(), "table").getAbsolutePath();
+		try {
+			df.write().format("delta").save(tablePath);
+			assertTrue("spark should have written a multi-file table", countParquet(tablePath) > 1);
+
+			Map<Integer, double[]> expected = expectedById(rows, cols);
+			assertMatchesById(new ReaderDelta().readMatrixFromHDFS(tablePath, -1, -1, -1, -1), expected, cols, "serial");
+			assertMatchesById(new ReaderDeltaParallel().readMatrixFromHDFS(tablePath, -1, -1, -1, -1), expected, cols, "parallel");
+		}
+		finally {
+			FileUtils.deleteQuietly(dir.toFile());
+		}
+	}
+
+	/**
+	 * Spark writes a table with the DV_DEFAULT session setting enabled and then deletes all rows with
+	 * c0 below deleteBelow in a second commit; both SystemDS readers must return only the surviving rows.
+	 */
+	@Test
+	public void sparkDeletionVectorsSystemdsRead() throws Exception {
+		int rows = 400, cols = 3, deleteBelow = 50;
+		Path dir = Files.createTempDirectory("sysds_delta_dv_");
+		String tablePath = new File(dir.toFile(), "table").getAbsolutePath();
+		try {
+			//enable deletion vectors for tables created from here on; the DELETE is then expected to be
+			//recorded as a deletion vector rather than as rewritten data files
+			spark.conf().set(DV_DEFAULT, "true");
+			indexedDataFrame(rows, cols).write().format("delta").save(tablePath);
+			spark.sql("DELETE FROM delta.`" + tablePath + "` WHERE c0 < " + deleteBelow);
+
+			//typed map (was raw): the removeIf lambda compares Integer ids and needs the key type
+			Map<Integer, double[]> expected = expectedById(rows, cols);
+			expected.keySet().removeIf(id -> id < deleteBelow);
+
+			MatrixBlock serial = new ReaderDelta().readMatrixFromHDFS(tablePath, -1, -1, -1, -1);
+			assertEquals("surviving rows (serial)", rows - deleteBelow, serial.getNumRows());
+			assertMatchesById(serial, expected, cols, "serial-dv");
+			MatrixBlock parallel = new ReaderDeltaParallel().readMatrixFromHDFS(tablePath, -1, -1, -1, -1);
+			assertEquals("surviving rows (parallel)", rows - deleteBelow, parallel.getNumRows());
+			assertMatchesById(parallel, expected, cols, "parallel-dv");
+		}
+		finally {
+			spark.conf().unset(DV_DEFAULT);
+			FileUtils.deleteQuietly(dir.toFile());
+		}
+	}
+
+ private static final String DV_DEFAULT = //Spark session conf key that sparkDeletionVectorsSystemdsRead sets to "true" and unsets afterwards
+ "spark.databricks.delta.properties.defaults.enableDeletionVectors";
+
+	/** Builds a rows x cols MatrixBlock whose column 0 holds the row index and whose other cells hold value(row, col). */
+	private static MatrixBlock indexedMatrix(int rows, int cols) {
+		MatrixBlock result = new MatrixBlock(rows, cols, false);
+		for( int row = 0; row < rows; row++ ) {
+			result.set(row, 0, row); //column 0 carries the row id
+			for( int col = 1; col < cols; col++ )
+				result.set(row, col, value(row, col));
+		}
+		result.recomputeNonZeros();
+		return result;
+	}
+
+	/** Spark DataFrame mirroring {@link #indexedMatrix}: non-nullable DoubleType columns c0..c(cols-1), c0 = row id. */
+	private static Dataset<Row> indexedDataFrame(int rows, int cols) {
+		StructField[] fields = new StructField[cols];
+		for( int c = 0; c < cols; c++ )
+			fields[c] = DataTypes.createStructField("c" + c, DataTypes.DoubleType, false);
+		StructType schema = DataTypes.createStructType(fields);
+
+		List<Row> data = new ArrayList<>(rows); //typed (was raw) so createDataFrame receives a List<Row>
+		for( int r = 0; r < rows; r++ ) {
+			Object[] vals = new Object[cols];
+			vals[0] = (double) r;
+			for( int c = 1; c < cols; c++ )
+				vals[c] = value(r, c);
+			data.add(RowFactory.create(vals));
+		}
+		return spark.createDataFrame(data, schema);
+	}
+
+	/** Deterministic cell value for (row,col), col>=1: half the row index minus the column index (exact in double). */
+	private static double value(int row, int col) {
+		return 0.5 * row - col;
+	}
+
+	private static Map<Integer, double[]> expectedById(int rows, int cols) {
+		Map<Integer, double[]> exp = new HashMap<>(rows); //row id -> {id, value(id, 1), ..., value(id, cols-1)}
+		for( int r = 0; r < rows; r++ ) {
+			double[] row = new double[cols];
+			row[0] = r;
+			for( int c = 1; c < cols; c++ )
+				row[c] = value(r, c);
+			exp.put(r, row);
+		}
+		return exp;
+	}
+
+	/** Asserts that every row of {@code out}, looked up by its column-0 id, occurs once in {@code expected} and matches within 1e-9. */
+	private static void assertMatchesById(MatrixBlock out, Map<Integer, double[]> expected, int cols, String tag) {
+		assertEquals(tag + " rows", expected.size(), out.getNumRows());
+		assertEquals(tag + " cols", cols, out.getNumColumns());
+		boolean[] seen = new boolean[expected.isEmpty() ? 0 : maxId(expected) + 1];
+		for( int r = 0; r < out.getNumRows(); r++ ) {
+			int id = (int) Math.round(out.get(r, 0));
+			double[] exp = expected.get(id); //typed map: no cast needed (a raw Map returns Object here)
+			assertTrue(tag + ": unexpected/duplicate id " + id, exp != null && id < seen.length && !seen[id]);
+			seen[id] = true;
+			for( int c = 0; c < cols; c++ )
+				assertEquals(tag + " id" + id + " c" + c, exp[c], out.get(r, c), 1e-9);
+		}
+	}
+
+	private static int maxId(Map<Integer, ?> expected) { //largest key of expected, but never below 0
+		int m = 0;
+		for( int id : expected.keySet() ) //Integer keys unbox here; a raw Map would yield Object
+			m = Math.max(m, id);
+		return m;
+	}
+
+	private static long countParquet(String tablePath) throws Exception { //number of paths under tablePath ending in ".parquet"
+		try( java.util.stream.Stream<Path> paths = Files.walk(new File(tablePath).toPath()) ) {
+			return paths.filter(p -> p.toString().endsWith(".parquet")).count();
+		}
+	}
+}