From f9ddaab781e4c98af9981cb02171a64288eae58a Mon Sep 17 00:00:00 2001 From: Xianjin Date: Mon, 20 May 2024 11:50:05 +0800 Subject: [PATCH 1/5] feat: Add random row generator in data gen --- .../org/apache/comet/DataGenerator.scala | 58 +++++++++++++++++++ .../org/apache/comet/DataGeneratorSuite.scala | 49 ++++++++++++++++ 2 files changed, 107 insertions(+) create mode 100644 spark/src/test/scala/org/apache/comet/DataGeneratorSuite.scala diff --git a/spark/src/test/scala/org/apache/comet/DataGenerator.scala b/spark/src/test/scala/org/apache/comet/DataGenerator.scala index 691a371b59..a1319501c8 100644 --- a/spark/src/test/scala/org/apache/comet/DataGenerator.scala +++ b/spark/src/test/scala/org/apache/comet/DataGenerator.scala @@ -19,16 +19,23 @@ package org.apache.comet +import scala.collection.mutable import scala.util.Random +import org.apache.spark.sql.{RandomDataGenerator, Row} +import org.apache.spark.sql.types.{ArrayType, StringType, StructType} + object DataGenerator { // note that we use `def` rather than `val` intentionally here so that // each test suite starts with a fresh data generator to help ensure // that tests are deterministic def DEFAULT = new DataGenerator(new Random(42)) + // matches the probability of nulls in Spark's RandomDataGenerator + private val PROBABILITY_OF_NULL: Float = 0.1f } class DataGenerator(r: Random) { + import DataGenerator._ /** Generate a random string using the specified characters */ def generateString(chars: String, maxLen: Int): String = { @@ -95,4 +102,55 @@ class DataGenerator(r: Random) { Range(0, n).map(_ => r.nextLong()) } + // Generate a random row according to the schema, the string filed in the struct could be + // configured to generate strings by passing a stringGen function. Other types are delegated + // to Spark's RandomDataGenerator. + def generateRow(schema: StructType, stringGen: Option[() => String] = None): Row = { + val fields = mutable.ArrayBuffer.empty[Any] + schema.fields.foreach { f => + f.dataType match { + case ArrayType(childType, nullable) => + val data = if (f.nullable && r.nextFloat() <= PROBABILITY_OF_NULL) { + null + } else { + val arr = mutable.ArrayBuffer.empty[Any] + val n = 1 // rand.nextInt(10) + var i = 0 + val generator = RandomDataGenerator.forType(childType, nullable, r) + assert(generator.isDefined, "Unsupported type") + val gen = generator.get + while (i < n) { + arr += gen() + i += 1 + } + arr.toSeq + } + fields += data + case StructType(children) => + fields += generateRow(StructType(children)) + case StringType if stringGen.isDefined => + val gen = stringGen.get + val data = if (f.nullable && r.nextFloat() <= PROBABILITY_OF_NULL) { + null + } else { + gen() + } + fields += data + case _ => + val generator = RandomDataGenerator.forType(f.dataType, f.nullable, r) + assert(generator.isDefined, "Unsupported type") + val gen = generator.get + fields += gen() + } + } + Row.fromSeq(fields) + } + + def generateRows( + num: Int, + schema: StructType, + stringGen: Option[() => String] = None): Seq[Row] = { + Range(0, num).map(_ => generateRow(schema, stringGen)) + } + } diff --git a/spark/src/test/scala/org/apache/comet/DataGeneratorSuite.scala b/spark/src/test/scala/org/apache/comet/DataGeneratorSuite.scala new file mode 100644 index 0000000000..ed7328fc0b --- /dev/null +++ b/spark/src/test/scala/org/apache/comet/DataGeneratorSuite.scala @@ -0,0 +1,49 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.comet + +import org.apache.spark.sql.CometTestBase +import org.apache.spark.sql.types.StructType + +class DataGeneratorSuite extends CometTestBase { + + test("test configurable stringGen in row generator") { + val gen = DataGenerator.DEFAULT + val chars = "abcde" + val maxLen = 10 + val stringGen = () => gen.generateString(chars, maxLen) + val numRows = 100 + val schema = new StructType().add("a", "string") + var numNulls = 0 + gen + .generateRows(numRows, schema, Some(stringGen)) + .foreach(row => { + if (row.getString(0) != null) { + assert(row.getString(0).forall(chars.toSeq.contains)) + assert(row.getString(0).length <= maxLen) + } else { + numNulls += 1 + } + }) + // 0.1 null probability + assert(numNulls >= 5 && numNulls <= 15) + } + +} From 1e9e9eef4fa360f2f390cc59a7dc38d14728c72b Mon Sep 17 00:00:00 2001 From: Xianjin Date: Mon, 20 May 2024 14:30:51 +0800 Subject: [PATCH 2/5] fix --- spark/src/test/scala/org/apache/comet/DataGenerator.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spark/src/test/scala/org/apache/comet/DataGenerator.scala b/spark/src/test/scala/org/apache/comet/DataGenerator.scala index a1319501c8..7ebd890a91 100644 --- a/spark/src/test/scala/org/apache/comet/DataGenerator.scala +++ b/spark/src/test/scala/org/apache/comet/DataGenerator.scala @@ -143,7 +143,7 @@ class DataGenerator(r: Random) { fields += gen() } } - Row.fromSeq(fields) + Row.fromSeq(fields.toSeq) } def generateRows( From 03fc030a55650c49360b8daca6713185ab45ff78 Mon Sep 17 00:00:00 2001 From: Xianjin YE Date: Tue, 21 May 2024 21:36:48 +0800 Subject: [PATCH 3/5] remove array type match case, which should already been handled in RandomDataGenerator.forType --- .../org/apache/comet/DataGenerator.scala | 19 +------------------ 1 file changed, 1 insertion(+), 18 deletions(-) diff --git a/spark/src/test/scala/org/apache/comet/DataGenerator.scala b/spark/src/test/scala/org/apache/comet/DataGenerator.scala index 7ebd890a91..06deff5554 100644 --- a/spark/src/test/scala/org/apache/comet/DataGenerator.scala +++ b/spark/src/test/scala/org/apache/comet/DataGenerator.scala @@ -109,25 +109,8 @@ class DataGenerator(r: Random) { val fields = mutable.ArrayBuffer.empty[Any] schema.fields.foreach { f => f.dataType match { - case ArrayType(childType, nullable) => - val data = if (f.nullable && r.nextFloat() <= PROBABILITY_OF_NULL) { - null - } else { - val arr = mutable.ArrayBuffer.empty[Any] - val n = 1 // rand.nextInt(10) - var i = 0 - val generator = RandomDataGenerator.forType(childType, nullable, r) - assert(generator.isDefined, "Unsupported type") - val gen = generator.get - while (i < n) { - arr += gen() - i += 1 - } - arr.toSeq - } - fields += data case StructType(children) => - fields += generateRow(StructType(children)) + fields += generateRow(StructType(children), stringGen) case StringType if stringGen.isDefined => val gen = stringGen.get val data = if (f.nullable && r.nextFloat() <= PROBABILITY_OF_NULL) { From 34acfd21536c7652c1c3285e9102af8b182c5155 Mon Sep 17 00:00:00 2001 From: Xianjin Date: Wed, 22 May 2024 10:29:06 +0800 Subject: [PATCH 4/5] fix style issue --- spark/src/test/scala/org/apache/comet/DataGenerator.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spark/src/test/scala/org/apache/comet/DataGenerator.scala b/spark/src/test/scala/org/apache/comet/DataGenerator.scala index 06deff5554..91eacbfc35 100644 --- a/spark/src/test/scala/org/apache/comet/DataGenerator.scala +++ b/spark/src/test/scala/org/apache/comet/DataGenerator.scala @@ -23,7 +23,7 @@ import scala.collection.mutable import scala.util.Random import org.apache.spark.sql.{RandomDataGenerator, Row} -import org.apache.spark.sql.types.{ArrayType, StringType, StructType} +import org.apache.spark.sql.types.{StringType, StructType} object DataGenerator { // note that we use `def` rather than `val` intentionally here so that From 14ab3cee80a5864f82fa6de368e05e03b110752e Mon Sep 17 00:00:00 2001 From: Xianjin Date: Thu, 23 May 2024 10:17:25 +0800 Subject: [PATCH 5/5] address comments --- .../org/apache/comet/DataGenerator.scala | 22 +++++++++---------- .../org/apache/comet/DataGeneratorSuite.scala | 2 +- 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/spark/src/test/scala/org/apache/comet/DataGenerator.scala b/spark/src/test/scala/org/apache/comet/DataGenerator.scala index 91eacbfc35..80e7c22881 100644 --- a/spark/src/test/scala/org/apache/comet/DataGenerator.scala +++ b/spark/src/test/scala/org/apache/comet/DataGenerator.scala @@ -19,7 +19,6 @@ package org.apache.comet -import scala.collection.mutable import scala.util.Random import org.apache.spark.sql.{RandomDataGenerator, Row} @@ -106,11 +105,10 @@ class DataGenerator(r: Random) { // configured to generate strings by passing a stringGen function. Other types are delegated // to Spark's RandomDataGenerator. def generateRow(schema: StructType, stringGen: Option[() => String] = None): Row = { - val fields = mutable.ArrayBuffer.empty[Any] - schema.fields.foreach { f => + val fields = schema.fields.map { f => f.dataType match { case StructType(children) => - fields += generateRow(StructType(children), stringGen) + generateRow(StructType(children), stringGen) case StringType if stringGen.isDefined => val gen = stringGen.get val data = if (f.nullable && r.nextFloat() <= PROBABILITY_OF_NULL) { @@ -118,15 +116,17 @@ class DataGenerator(r: Random) { } else { gen() } - fields += data + data case _ => - val generator = RandomDataGenerator.forType(f.dataType, f.nullable, r) - assert(generator.isDefined, "Unsupported type") - val gen = generator.get - fields += gen() + val gen = RandomDataGenerator.forType(f.dataType, f.nullable, r) match { + case Some(g) => g + case None => + throw new IllegalStateException(s"No RandomDataGenerator for type ${f.dataType}") + } + gen() } - } - Row.fromSeq(fields.toSeq) + }.toSeq + Row.fromSeq(fields) } def generateRows( diff --git a/spark/src/test/scala/org/apache/comet/DataGeneratorSuite.scala b/spark/src/test/scala/org/apache/comet/DataGeneratorSuite.scala index ed7328fc0b..02dfb9d7bf 100644 --- a/spark/src/test/scala/org/apache/comet/DataGeneratorSuite.scala +++ b/spark/src/test/scala/org/apache/comet/DataGeneratorSuite.scala @@ -43,7 +43,7 @@ class DataGeneratorSuite extends CometTestBase { } }) // 0.1 null probability - assert(numNulls >= 5 && numNulls <= 15) + assert(numNulls >= 0.05 * numRows && numNulls <= 0.15 * numRows) } }