From a90535952d42f4061caaaa1d48f59d7f59ef118f Mon Sep 17 00:00:00 2001 From: zgxme Date: Fri, 26 Jun 2026 14:44:58 +0800 Subject: [PATCH 01/10] [improvement](regression) Use Spark thrift JDBC for external SQL helpers ### What problem does this PR solve? Issue Number: close #xxx Related PR: #63719 Problem Summary: The regression Spark Iceberg and Paimon helpers executed SQL through docker exec and spark-sql, which required local Docker access and repeatedly started Spark SQL clients. This change follows the Spark Iceberg JDBC helper approach from PR #63719 and routes Spark Iceberg/Paimon helper execution through Spark ThriftServer with Hive JDBC. Multi-statement execution now reuses one JDBC connection. ### Release note None ### Check List (For Author) - Test: Manual test - mvn -q -DskipTests compile under regression-test/framework - git diff --check -- framework/src/main/groovy/org/apache/doris/regression/suite/Suite.groovy - Behavior changed: Yes. spark_iceberg, spark_iceberg_multi, and spark_paimon now execute through Spark ThriftServer JDBC instead of docker exec spark-sql. - Does this need documentation: No --- .../docker-compose/iceberg/entrypoint.sh.tpl | 6 +- .../docker-compose/iceberg/iceberg.env | 1 + .../docker-compose/iceberg/iceberg.yaml.tpl | 2 + .../doris/regression/suite/Suite.groovy | 123 +++++------------- 4 files changed, 41 insertions(+), 91 deletions(-) diff --git a/docker/thirdparties/docker-compose/iceberg/entrypoint.sh.tpl b/docker/thirdparties/docker-compose/iceberg/entrypoint.sh.tpl index 4232b4f3cc1321..227831098caba6 100644 --- a/docker/thirdparties/docker-compose/iceberg/entrypoint.sh.tpl +++ b/docker/thirdparties/docker-compose/iceberg/entrypoint.sh.tpl @@ -48,21 +48,21 @@ start-thriftserver.sh --driver-java-options "-Dderby.system.home=/tmp/derby" # This approach can reduce the time from 150s to 40s. 
START_TIME1=$(date +%s) -find /mnt/scripts/create_preinstalled_scripts/iceberg -name '*.sql' | sed 's|^|source |' | sed 's|$|;|'> iceberg_total.sql +find /mnt/scripts/create_preinstalled_scripts/iceberg -name '*.sql' | sort | sed 's|^|source |' | sed 's|$|;|'> iceberg_total.sql spark-sql --master spark://doris--spark-iceberg:7077 --conf spark.sql.extensions=org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions -f iceberg_total.sql END_TIME1=$(date +%s) EXECUTION_TIME1=$((END_TIME1 - START_TIME1)) echo "Script iceberg total: {} executed in $EXECUTION_TIME1 seconds" START_TIME2=$(date +%s) -find /mnt/scripts/create_preinstalled_scripts/paimon -name '*.sql' | sed 's|^|source |' | sed 's|$|;|'> paimon_total.sql +find /mnt/scripts/create_preinstalled_scripts/paimon -name '*.sql' | sort | sed 's|^|source |' | sed 's|$|;|'> paimon_total.sql spark-sql --master spark://doris--spark-iceberg:7077 --conf spark.sql.extensions=org.apache.paimon.spark.extensions.PaimonSparkSessionExtensions -f paimon_total.sql END_TIME2=$(date +%s) EXECUTION_TIME2=$((END_TIME2 - START_TIME2)) echo "Script paimon total: {} executed in $EXECUTION_TIME2 seconds" START_TIME3=$(date +%s) -find /mnt/scripts/create_preinstalled_scripts/iceberg_load -name '*.sql' | sed 's|^|source |' | sed 's|$|;|'> iceberg_load_total.sql +find /mnt/scripts/create_preinstalled_scripts/iceberg_load -name '*.sql' | sort | sed 's|^|source |' | sed 's|$|;|'> iceberg_load_total.sql spark-sql --master spark://doris--spark-iceberg:7077 --conf spark.sql.extensions=org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions -f iceberg_load_total.sql END_TIME3=$(date +%s) EXECUTION_TIME3=$((END_TIME3 - START_TIME3)) diff --git a/docker/thirdparties/docker-compose/iceberg/iceberg.env b/docker/thirdparties/docker-compose/iceberg/iceberg.env index 6bebd49f437d80..0950783075cf21 100644 --- a/docker/thirdparties/docker-compose/iceberg/iceberg.env +++ b/docker/thirdparties/docker-compose/iceberg/iceberg.env @@ -19,6 
+19,7 @@ NOTEBOOK_SERVER_PORT=8888 SPARK_DRIVER_UI_PORT=8080 SPARK_HISTORY_UI_PORT=10000 +SPARK_THRIFT_PORT=11000 REST_CATALOG_PORT=18181 MINIO_UI_PORT=9000 MINIO_API_PORT=19001 diff --git a/docker/thirdparties/docker-compose/iceberg/iceberg.yaml.tpl b/docker/thirdparties/docker-compose/iceberg/iceberg.yaml.tpl index 0c4e8e1cc65027..9b1704a7891028 100644 --- a/docker/thirdparties/docker-compose/iceberg/iceberg.yaml.tpl +++ b/docker/thirdparties/docker-compose/iceberg/iceberg.yaml.tpl @@ -41,6 +41,8 @@ services: - AWS_ACCESS_KEY_ID=admin - AWS_SECRET_ACCESS_KEY=password - AWS_REGION=us-east-1 + ports: + - ${SPARK_THRIFT_PORT}:10000 entrypoint: /bin/sh /mnt/scripts/entrypoint.sh user: root networks: diff --git a/regression-test/framework/src/main/groovy/org/apache/doris/regression/suite/Suite.groovy b/regression-test/framework/src/main/groovy/org/apache/doris/regression/suite/Suite.groovy index 0dc8f9d17371a5..414450f2954dc8 100644 --- a/regression-test/framework/src/main/groovy/org/apache/doris/regression/suite/Suite.groovy +++ b/regression-test/framework/src/main/groovy/org/apache/doris/regression/suite/Suite.groovy @@ -111,7 +111,6 @@ class Suite implements GroovyInterceptable { private AmazonS3 s3Client = null private FileSystem fs = null - private String sparkIcebergContainerNameCache = null Suite(String name, String group, SuiteContext context, SuiteCluster cluster) { this.name = name @@ -1618,80 +1617,54 @@ class Suite implements GroovyInterceptable { return result } - /** - * Get the spark-iceberg container name by querying docker. - * Uses 'docker ps --filter name=spark-iceberg' to find the container. 
- */ - private String getSparkIcebergContainerName() { - if (!Strings.isNullOrEmpty(sparkIcebergContainerNameCache)) { - return sparkIcebergContainerNameCache + private String getSparkIcebergJdbcUrl() { + String sparkHost = context.config.otherConfigs.get("externalEnvIp") + String sparkPort = context.config.otherConfigs.get("iceberg_spark_thrift_port") ?: "11000" + return "jdbc:hive2://${sparkHost}:${sparkPort}/;auth=noSasl" + } + + private List> spark_sql(String sqlStr, boolean isOrder = false) { + Class.forName("org.apache.hive.jdbc.HiveDriver") + String sparkJdbcUrl = getSparkIcebergJdbcUrl() + String cleanedSqlStr = sqlStr.replaceAll("\\s*;\\s*\$", "") + logger.info("Execute Spark JDBC SQL: ${cleanedSqlStr}".toString()) + logger.info("Spark JDBC URL: ${sparkJdbcUrl}".toString()) + return connect("hadoop", "hadoop", sparkJdbcUrl) { + return sql(cleanedSqlStr, isOrder) } + } - try { - // Use docker ps with filter to find containers with 'spark-iceberg' in the name - String command = "docker ps --filter name=spark-iceberg --format {{.Names}}" - def process = command.execute() - process.waitFor() - String output = process.in.text.trim() + private List spark_sql_multi(String sqlStatements, boolean isOrder = false) { + def statements = sqlStatements.split(';').collect { it.trim() }.findAll { it } - if (output) { - // Get the first matching container - String containerName = output.split('\n')[0].trim() - if (containerName) { - sparkIcebergContainerNameCache = containerName - logger.info("Found spark-iceberg container: ${containerName}".toString()) - return containerName - } - } + if (statements.isEmpty()) { + return [] + } - logger.warn("No spark-iceberg container found via docker ps") - return null - } catch (Exception e) { - logger.warn("Failed to get spark-iceberg container via docker ps: ${e.message}".toString()) - return null + Class.forName("org.apache.hive.jdbc.HiveDriver") + String sparkJdbcUrl = getSparkIcebergJdbcUrl() + logger.info("Execute Spark JDBC SQL 
statements via ${sparkJdbcUrl}: ${statements}".toString()) + return connect("hadoop", "hadoop", sparkJdbcUrl) { + return statements.collect { statement -> sql(statement, isOrder) } } } /** - * Execute Spark SQL on the spark-iceberg container via docker exec. + * Execute Spark SQL on the Spark ThriftServer via Hive JDBC. * * Usage in test suite: * spark_iceberg "CREATE TABLE demo.test_db.t1 (id INT) USING iceberg" * spark_iceberg "INSERT INTO demo.test_db.t1 VALUES (1)" * def result = spark_iceberg "SELECT * FROM demo.test_db.t1" - * - * The container name is found by querying 'docker ps --filter name=spark-iceberg' */ - String spark_iceberg(String sqlStr, int timeoutSeconds = 120) { - String containerName = getSparkIcebergContainerName() - if (containerName == null) { - throw new RuntimeException("spark-iceberg container not found. Please ensure the container is running.") - } - String masterUrl = "spark://${containerName}:7077" - - // Escape double quotes in SQL string for shell command - String escapedSql = sqlStr.replaceAll('"', '\\\\"') - - // Build docker exec command - String command = """docker exec ${containerName} spark-sql --master ${masterUrl} --conf spark.sql.extensions=org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions -e "${escapedSql}" """ - - logger.info("Executing Spark Iceberg SQL: ${sqlStr}".toString()) - logger.info("Container: ${containerName}".toString()) - - try { - String result = cmd(command, timeoutSeconds) - logger.info("Spark Iceberg SQL result: ${result}".toString()) - return result - } catch (Exception e) { - logger.error("Spark Iceberg SQL failed: ${e.message}".toString()) - throw e - } + List> spark_iceberg(String sqlStr, boolean isOrder = false) { + return spark_sql(sqlStr, isOrder) } /** - * Execute multiple Spark SQL statements on the spark-iceberg container. + * Execute multiple Spark SQL statements on the Spark ThriftServer via Hive JDBC. * Statements are separated by semicolons. 
- * All statements are executed in one spark-sql process to reduce startup overhead. + * All statements are executed on one JDBC connection to reduce startup overhead. * * Usage: * spark_iceberg_multi ''' @@ -1700,46 +1673,20 @@ class Suite implements GroovyInterceptable { * INSERT INTO demo.test_db.t1 VALUES (1); * ''' */ - List spark_iceberg_multi(String sqlStatements, int timeoutSeconds = 300) { - def statements = sqlStatements.split(';').collect { it.trim() }.findAll { it } - - if (statements.isEmpty()) { - return [] - } - - String combinedSql = statements.collect { "${it};" }.join(" ") - return [spark_iceberg(combinedSql, timeoutSeconds)] + List spark_iceberg_multi(String sqlStatements, boolean isOrder = false) { + return spark_sql_multi(sqlStatements, isOrder) } /** - * Execute Spark SQL on the spark-iceberg container with Paimon extensions enabled. + * Execute Spark SQL with the Paimon catalog on the Spark ThriftServer via Hive JDBC. * * Usage in test suite: * spark_paimon "CREATE TABLE paimon.test_db.t1 (id INT) USING paimon" * spark_paimon "INSERT INTO paimon.test_db.t1 VALUES (1)" * def result = spark_paimon "SELECT * FROM paimon.test_db.t1" */ - String spark_paimon(String sqlStr, int timeoutSeconds = 120) { - String containerName = getSparkIcebergContainerName() - if (containerName == null) { - throw new RuntimeException("spark-iceberg container not found. 
Please ensure the container is running.") - } - String masterUrl = "spark://${containerName}:7077" - - String escapedSql = sqlStr.replaceAll('"', '\\\\"') - String command = """docker exec ${containerName} spark-sql --master ${masterUrl} --conf spark.sql.extensions=org.apache.paimon.spark.extensions.PaimonSparkSessionExtensions -e "${escapedSql}" """ - - logger.info("Executing Spark Paimon SQL: ${sqlStr}".toString()) - logger.info("Container: ${containerName}".toString()) - - try { - String result = cmd(command, timeoutSeconds) - logger.info("Spark Paimon SQL result: ${result}".toString()) - return result - } catch (Exception e) { - logger.error("Spark Paimon SQL failed: ${e.message}".toString()) - throw e - } + List> spark_paimon(String sqlStr, boolean isOrder = false) { + return spark_sql(sqlStr, isOrder) } List> db2_docker(String sqlStr, boolean isOrder = false) { From 130a245820f0f37cb3306c7dcd464f8651810723 Mon Sep 17 00:00:00 2001 From: zgxme Date: Fri, 26 Jun 2026 15:53:28 +0800 Subject: [PATCH 02/10] [improvement](regression) Reuse Spark Iceberg JDBC connection ### What problem does this PR solve? Issue Number: None Related PR: None Problem Summary: Spark Iceberg helpers opened a new Hive JDBC connection for every spark_iceberg/spark_paimon call. This added repeated Spark ThriftServer session setup overhead in suites that issue many Spark SQL statements. The framework now keeps a Spark Iceberg JDBC connection in SuiteContext thread-local state, creates it on first use, reuses it for later calls in the same suite context thread, and closes it with other context thread-local resources. ### Release note None ### Check List (For Author) - Test: Manual test - Manual test: mvn package -B -DskipTests=true -Dmaven.javadoc.skip=true in regression-test/framework; git diff --check - Behavior changed: Yes. Spark Iceberg/Paimon helper SQL reuses a SuiteContext-local Spark JDBC connection instead of opening one per call. 
- Does this need documentation: No --- .../doris/regression/suite/Suite.groovy | 23 +++--------- .../regression/suite/SuiteContext.groovy | 35 +++++++++++++++++++ 2 files changed, 40 insertions(+), 18 deletions(-) diff --git a/regression-test/framework/src/main/groovy/org/apache/doris/regression/suite/Suite.groovy b/regression-test/framework/src/main/groovy/org/apache/doris/regression/suite/Suite.groovy index 414450f2954dc8..83f749a8839ad9 100644 --- a/regression-test/framework/src/main/groovy/org/apache/doris/regression/suite/Suite.groovy +++ b/regression-test/framework/src/main/groovy/org/apache/doris/regression/suite/Suite.groovy @@ -1617,21 +1617,11 @@ class Suite implements GroovyInterceptable { return result } - private String getSparkIcebergJdbcUrl() { - String sparkHost = context.config.otherConfigs.get("externalEnvIp") - String sparkPort = context.config.otherConfigs.get("iceberg_spark_thrift_port") ?: "11000" - return "jdbc:hive2://${sparkHost}:${sparkPort}/;auth=noSasl" - } - private List> spark_sql(String sqlStr, boolean isOrder = false) { - Class.forName("org.apache.hive.jdbc.HiveDriver") - String sparkJdbcUrl = getSparkIcebergJdbcUrl() String cleanedSqlStr = sqlStr.replaceAll("\\s*;\\s*\$", "") logger.info("Execute Spark JDBC SQL: ${cleanedSqlStr}".toString()) - logger.info("Spark JDBC URL: ${sparkJdbcUrl}".toString()) - return connect("hadoop", "hadoop", sparkJdbcUrl) { - return sql(cleanedSqlStr, isOrder) - } + logger.info("Spark JDBC URL: ${context.getSparkIcebergJdbcUrl()}".toString()) + return sql_impl(context.getSparkIcebergConnection(), cleanedSqlStr, isOrder) } private List spark_sql_multi(String sqlStatements, boolean isOrder = false) { @@ -1641,12 +1631,9 @@ class Suite implements GroovyInterceptable { return [] } - Class.forName("org.apache.hive.jdbc.HiveDriver") - String sparkJdbcUrl = getSparkIcebergJdbcUrl() - logger.info("Execute Spark JDBC SQL statements via ${sparkJdbcUrl}: ${statements}".toString()) - return connect("hadoop", 
"hadoop", sparkJdbcUrl) { - return statements.collect { statement -> sql(statement, isOrder) } - } + logger.info("Execute Spark JDBC SQL statements via ${context.getSparkIcebergJdbcUrl()}: ${statements}".toString()) + Connection sparkConn = context.getSparkIcebergConnection() + return statements.collect { statement -> sql_impl(sparkConn, statement, isOrder) } } /** diff --git a/regression-test/framework/src/main/groovy/org/apache/doris/regression/suite/SuiteContext.groovy b/regression-test/framework/src/main/groovy/org/apache/doris/regression/suite/SuiteContext.groovy index 0d599aed817ae6..08aa740974be3e 100644 --- a/regression-test/framework/src/main/groovy/org/apache/doris/regression/suite/SuiteContext.groovy +++ b/regression-test/framework/src/main/groovy/org/apache/doris/regression/suite/SuiteContext.groovy @@ -53,6 +53,7 @@ class SuiteContext implements Closeable { public final ThreadLocal threadHive2DockerConn = new ThreadLocal<>() public final ThreadLocal threadHive3DockerConn = new ThreadLocal<>() public final ThreadLocal threadHiveRemoteConn = new ThreadLocal<>() + public final ThreadLocal threadSparkIcebergConn = new ThreadLocal<>() public final ThreadLocal threadDB2DockerConn = new ThreadLocal<>() private final ThreadLocal syncer = new ThreadLocal<>() public final Config config @@ -239,6 +240,15 @@ class SuiteContext implements Closeable { return threadConn } + Connection getSparkIcebergConnection() { + def threadConn = threadSparkIcebergConn.get() + if (threadConn == null) { + threadConn = getConnectionBySparkIcebergConfig() + threadSparkIcebergConn.set(threadConn) + } + return threadConn + } + Connection getDB2DockerConnection() { def threadConn = threadDB2DockerConn.get() if (threadConn == null) { @@ -314,6 +324,21 @@ class SuiteContext implements Closeable { return DriverManager.getConnection(hiveJdbcUrl, hiveJdbcUser, hiveJdbcPassword) } + Connection getConnectionBySparkIcebergConfig() { + Class.forName("org.apache.hive.jdbc.HiveDriver"); + String 
sparkJdbcUser = "hadoop" + String sparkJdbcPassword = "hadoop" + String sparkJdbcUrl = getSparkIcebergJdbcUrl() + log.info("Create Spark Iceberg JDBC connection to ${sparkJdbcUrl}".toString()) + return DriverManager.getConnection(sparkJdbcUrl, sparkJdbcUser, sparkJdbcPassword) + } + + String getSparkIcebergJdbcUrl() { + String sparkHost = config.otherConfigs.get("externalEnvIp") + String sparkPort = config.otherConfigs.get("iceberg_spark_thrift_port") ?: "11000" + return "jdbc:hive2://${sparkHost}:${sparkPort}/;auth=noSasl" + } + Connection getConnectionByDB2DockerConfig() { Class.forName("com.ibm.db2.jcc.DB2Driver"); String db2Host = config.otherConfigs.get("externalEnvIp") @@ -616,6 +641,16 @@ class SuiteContext implements Closeable { log.warn("Close connection failed", t) } } + + Connection spark_iceberg_conn = threadSparkIcebergConn.get() + if (spark_iceberg_conn != null) { + threadSparkIcebergConn.remove() + try { + spark_iceberg_conn.close() + } catch (Throwable t) { + log.warn("Close connection failed", t) + } + } } From f05c2db6a477992c09e22825ff6333e4056c13f1 Mon Sep 17 00:00:00 2001 From: zgxme Date: Fri, 26 Jun 2026 16:03:09 +0800 Subject: [PATCH 03/10] [improvement](regression) Run Iceberg thriftserver on Spark master ### What problem does this PR solve? Issue Number: None Related PR: None Problem Summary: The Iceberg docker entrypoint started Spark master and worker before Spark ThriftServer, but the thriftserver command did not specify a Spark master. Without an explicit master, Spark can fall back to local execution, so the standalone master and worker may not be used by Hive JDBC queries. This change starts Spark ThriftServer with --master spark://doris--spark-iceberg:7077 while keeping the Derby system home JVM option unchanged. ### Release note None ### Check List (For Author) - Test: Manual test - Manual test: bash -n docker/thirdparties/docker-compose/iceberg/entrypoint.sh.tpl - Behavior changed: Yes. 
Iceberg Spark ThriftServer now explicitly runs against the standalone Spark master in the docker environment. - Does this need documentation: No --- docker/thirdparties/docker-compose/iceberg/entrypoint.sh.tpl | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/docker/thirdparties/docker-compose/iceberg/entrypoint.sh.tpl b/docker/thirdparties/docker-compose/iceberg/entrypoint.sh.tpl index 227831098caba6..c45e2e0bc126b9 100644 --- a/docker/thirdparties/docker-compose/iceberg/entrypoint.sh.tpl +++ b/docker/thirdparties/docker-compose/iceberg/entrypoint.sh.tpl @@ -39,7 +39,9 @@ done start-master.sh -p 7077 start-worker.sh spark://doris--spark-iceberg:7077 start-history-server.sh -start-thriftserver.sh --driver-java-options "-Dderby.system.home=/tmp/derby" +start-thriftserver.sh \ + --master spark://doris--spark-iceberg:7077 \ + --driver-java-options "-Dderby.system.home=/tmp/derby" # The creation of a Spark SQL client is time-consuming, # and reopening a new client for each SQL file execution leads to significant overhead. From 845c291321df746129169fbf1c78c32d0d633341 Mon Sep 17 00:00:00 2001 From: zgxme Date: Fri, 26 Jun 2026 16:15:40 +0800 Subject: [PATCH 04/10] [improvement](regression) Tune Iceberg Spark docker resources ### What problem does this PR solve? Issue Number: None Related PR: None Problem Summary: The Iceberg Spark docker environment relied on Spark defaults for ThriftServer and spark-sql resource sizing. Those defaults can use too many CPU cores while leaving executor and driver heap at small defaults, and the default shuffle partition count is high for local regression data. This change caps the Spark app at 8 cores, uses 4-core executors with 8g heap, gives the driver 4g heap, disables dynamic allocation explicitly, and reduces default shuffle/parallelism settings for local regression stability. 
### Release note None ### Check List (For Author) - Test: Manual test - Manual test: git diff --check -- docker/thirdparties/docker-compose/iceberg/spark-defaults.conf - Behavior changed: Yes. Iceberg Spark docker jobs now use explicit resource and parallelism defaults. - Does this need documentation: No --- .../docker-compose/iceberg/spark-defaults.conf | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/docker/thirdparties/docker-compose/iceberg/spark-defaults.conf b/docker/thirdparties/docker-compose/iceberg/spark-defaults.conf index 8336a2afcf8aba..ea39e4c051b60c 100644 --- a/docker/thirdparties/docker-compose/iceberg/spark-defaults.conf +++ b/docker/thirdparties/docker-compose/iceberg/spark-defaults.conf @@ -20,6 +20,14 @@ # Example: spark.sql.session.timeZone Asia/Shanghai +spark.dynamicAllocation.enabled false +spark.cores.max 8 +spark.executor.cores 4 +spark.executor.memory 8g +spark.driver.memory 4g +spark.sql.shuffle.partitions 16 +spark.default.parallelism 16 + spark.sql.catalog.demo org.apache.iceberg.spark.SparkCatalog spark.sql.catalog.demo.type rest spark.sql.catalog.demo.uri http://rest:8181 @@ -42,4 +50,4 @@ spark.sql.catalog.paimon.warehouse s3://warehouse/wh spark.sql.catalog.paimon.s3.endpoint http://minio:9000 spark.sql.catalog.paimon.s3.access-key admin spark.sql.catalog.paimon.s3.secret-key password -spark.sql.catalog.paimon.s3.region us-east-1 \ No newline at end of file +spark.sql.catalog.paimon.s3.region us-east-1 From 9fdd629a5a3342f653e7cece3ba804fd54d8b511 Mon Sep 17 00:00:00 2001 From: zgxme Date: Fri, 26 Jun 2026 16:26:22 +0800 Subject: [PATCH 05/10] [improvement](regression) Start Iceberg thriftserver after data setup ### What problem does this PR solve? Issue Number: None Related PR: None Problem Summary: The Iceberg docker entrypoint started Spark ThriftServer before running the preinstalled Spark SQL setup scripts. 
After moving ThriftServer onto the standalone master, that idle ThriftServer app can reserve executor resources while setup scripts are still running. The ThriftServer also did not receive Iceberg/Paimon SQL extensions, while regression helpers execute Spark SQL through Hive JDBC. This change runs the setup scripts first, then starts ThriftServer with Iceberg and Paimon extensions, and waits for Hive JDBC readiness before marking the container healthy. ### Release note None ### Check List (For Author) - Test: Manual test - Manual test: bash -n docker/thirdparties/docker-compose/iceberg/entrypoint.sh.tpl; /bin/sh -n docker/thirdparties/docker-compose/iceberg/entrypoint.sh.tpl; git diff --check -- docker/thirdparties/docker-compose/iceberg/entrypoint.sh.tpl - Behavior changed: Yes. Iceberg Spark ThriftServer starts after preinstalled data setup and waits for JDBC readiness before /mnt/SUCCESS. - Does this need documentation: No --- .../docker-compose/iceberg/entrypoint.sh.tpl | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/docker/thirdparties/docker-compose/iceberg/entrypoint.sh.tpl b/docker/thirdparties/docker-compose/iceberg/entrypoint.sh.tpl index c45e2e0bc126b9..b1915bbf442cab 100644 --- a/docker/thirdparties/docker-compose/iceberg/entrypoint.sh.tpl +++ b/docker/thirdparties/docker-compose/iceberg/entrypoint.sh.tpl @@ -26,6 +26,7 @@ done set -ex mkdir -p /opt/spark/events +SPARK_THRIFT_EXTENSIONS="org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions,org.apache.paimon.spark.extensions.PaimonSparkSessionExtensions" for f in /opt/spark/sbin/*; do ln -s $f /usr/local/bin/$(basename $f) @@ -39,9 +40,6 @@ done start-master.sh -p 7077 start-worker.sh spark://doris--spark-iceberg:7077 start-history-server.sh -start-thriftserver.sh \ - --master spark://doris--spark-iceberg:7077 \ - --driver-java-options "-Dderby.system.home=/tmp/derby" # The creation of a Spark SQL client is time-consuming, # and reopening a new client for 
each SQL file execution leads to significant overhead. @@ -70,6 +68,19 @@ END_TIME3=$(date +%s) EXECUTION_TIME3=$((END_TIME3 - START_TIME3)) echo "Script iceberg load total: {} executed in $EXECUTION_TIME3 seconds" +start-thriftserver.sh \ + --master spark://doris--spark-iceberg:7077 \ + --conf "spark.sql.extensions=${SPARK_THRIFT_EXTENSIONS}" \ + --driver-java-options "-Dderby.system.home=/tmp/derby" + +while ! beeline \ + -u "jdbc:hive2://localhost:10000/;auth=noSasl" \ + -n hadoop \ + -p hadoop \ + -e "SELECT 1" >/tmp/spark-thriftserver-ready.log 2>&1; do + sleep 1 +done + touch /mnt/SUCCESS; tail -f /dev/null From 0601f94bf08c49b339d15493f4154c5b4208f573 Mon Sep 17 00:00:00 2001 From: zgxme Date: Fri, 26 Jun 2026 17:10:18 +0800 Subject: [PATCH 06/10] [fix](regression) Fix Spark Iceberg thrift JDBC URL ### What problem does this PR solve? Issue Number: close #xxx Related PR: #xxx Problem Summary: Spark 4 thriftserver rejects the previous noSasl JDBC URL and then fails to open sessions against the default Iceberg namespace because demo.default is not created. This makes the Iceberg docker startup loop on the thriftserver readiness check and prevents regression Spark Iceberg JDBC helpers from connecting. Create the default Iceberg namespace before starting thriftserver, use the normal HiveServer2 JDBC URL without auth=noSasl, and fail readiness with useful logs instead of looping forever. 
### Release note None ### Check List (For Author) - Test: Manual test - Ran bash -n and /bin/sh -n for docker/thirdparties/docker-compose/iceberg/entrypoint.sh.tpl - Ran git diff --check for modified files - Ran mvn package -B -DskipTests=true -Dmaven.javadoc.skip=true in regression-test/framework - Behavior changed: No - Does this need documentation: No --- .../docker-compose/iceberg/entrypoint.sh.tpl | 17 +++++++++++++++-- .../doris/regression/suite/SuiteContext.groovy | 2 +- 2 files changed, 16 insertions(+), 3 deletions(-) diff --git a/docker/thirdparties/docker-compose/iceberg/entrypoint.sh.tpl b/docker/thirdparties/docker-compose/iceberg/entrypoint.sh.tpl index b1915bbf442cab..dfdcbd441e82d7 100644 --- a/docker/thirdparties/docker-compose/iceberg/entrypoint.sh.tpl +++ b/docker/thirdparties/docker-compose/iceberg/entrypoint.sh.tpl @@ -19,7 +19,7 @@ export SPARK_MASTER_HOST=doris--spark-iceberg # wait iceberg-rest start -while [[ ! $(curl -s --fail http://rest:8181/v1/config) ]]; do +while ! curl -s --fail http://rest:8181/v1/config >/dev/null; do sleep 1 done @@ -68,16 +68,29 @@ END_TIME3=$(date +%s) EXECUTION_TIME3=$((END_TIME3 - START_TIME3)) echo "Script iceberg load total: {} executed in $EXECUTION_TIME3 seconds" +spark-sql \ + --master spark://doris--spark-iceberg:7077 \ + --conf spark.sql.extensions=org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions \ + -e "CREATE DATABASE IF NOT EXISTS demo.default" + start-thriftserver.sh \ --master spark://doris--spark-iceberg:7077 \ --conf "spark.sql.extensions=${SPARK_THRIFT_EXTENSIONS}" \ --driver-java-options "-Dderby.system.home=/tmp/derby" +SPARK_THRIFT_READY_ATTEMPTS=0 while ! 
beeline \ - -u "jdbc:hive2://localhost:10000/;auth=noSasl" \ + -u "jdbc:hive2://localhost:10000/default" \ -n hadoop \ -p hadoop \ -e "SELECT 1" >/tmp/spark-thriftserver-ready.log 2>&1; do + SPARK_THRIFT_READY_ATTEMPTS=$((SPARK_THRIFT_READY_ATTEMPTS + 1)) + if [ "${SPARK_THRIFT_READY_ATTEMPTS}" -ge 120 ]; then + echo "ERROR: Spark thriftserver did not become ready after ${SPARK_THRIFT_READY_ATTEMPTS} attempts" >&2 + cat /tmp/spark-thriftserver-ready.log >&2 || true + tail -n 200 /opt/spark/logs/*HiveThriftServer2*.out >&2 || true + exit 1 + fi sleep 1 done diff --git a/regression-test/framework/src/main/groovy/org/apache/doris/regression/suite/SuiteContext.groovy b/regression-test/framework/src/main/groovy/org/apache/doris/regression/suite/SuiteContext.groovy index 08aa740974be3e..fcd59cb3a7337e 100644 --- a/regression-test/framework/src/main/groovy/org/apache/doris/regression/suite/SuiteContext.groovy +++ b/regression-test/framework/src/main/groovy/org/apache/doris/regression/suite/SuiteContext.groovy @@ -336,7 +336,7 @@ class SuiteContext implements Closeable { String getSparkIcebergJdbcUrl() { String sparkHost = config.otherConfigs.get("externalEnvIp") String sparkPort = config.otherConfigs.get("iceberg_spark_thrift_port") ?: "11000" - return "jdbc:hive2://${sparkHost}:${sparkPort}/;auth=noSasl" + return "jdbc:hive2://${sparkHost}:${sparkPort}/default" } Connection getConnectionByDB2DockerConfig() { From a983fa3b1c1a453099905e21ad5984f31fcdd57a Mon Sep 17 00:00:00 2001 From: zgxme Date: Fri, 26 Jun 2026 18:39:14 +0800 Subject: [PATCH 07/10] [test](regression) Add Spark Doris consistency demos Issue Number: None Related PR: None Problem Summary: Add P2 demo regression cases for Iceberg and Paimon. The cases write data through Spark SQL first, then query the same external table through both Doris and Spark, normalizing JDBC result values before comparison to avoid false failures caused by different Java number classes returned by the two JDBC drivers. 
### Release note None ### Check List (For Author) - Test: Regression test - ./run-regression-test.sh --run -d external_table_p2/iceberg -s test_iceberg_spark_doris_consistency_demo - ./run-regression-test.sh --run -d external_table_p2/paimon -s test_paimon_spark_doris_consistency_demo - Behavior changed: No - Does this need documentation: No --- .../doris/regression/suite/Suite.groovy | 22 +++- ...ceberg_spark_doris_consistency_demo.groovy | 113 ++++++++++++++++++ ...paimon_spark_doris_consistency_demo.groovy | 111 +++++++++++++++++ 3 files changed, 243 insertions(+), 3 deletions(-) create mode 100644 regression-test/suites/external_table_p2/iceberg/test_iceberg_spark_doris_consistency_demo.groovy create mode 100644 regression-test/suites/external_table_p2/paimon/test_paimon_spark_doris_consistency_demo.groovy diff --git a/regression-test/framework/src/main/groovy/org/apache/doris/regression/suite/Suite.groovy b/regression-test/framework/src/main/groovy/org/apache/doris/regression/suite/Suite.groovy index 83f749a8839ad9..9ac21f91e9c413 100644 --- a/regression-test/framework/src/main/groovy/org/apache/doris/regression/suite/Suite.groovy +++ b/regression-test/framework/src/main/groovy/org/apache/doris/regression/suite/Suite.groovy @@ -1624,8 +1624,8 @@ class Suite implements GroovyInterceptable { return sql_impl(context.getSparkIcebergConnection(), cleanedSqlStr, isOrder) } - private List spark_sql_multi(String sqlStatements, boolean isOrder = false) { - def statements = sqlStatements.split(';').collect { it.trim() }.findAll { it } + private List spark_sql_multi(Object sqlStatements, boolean isOrder = false) { + def statements = sqlStatements.toString().split(';').collect { it.trim() }.findAll { it } if (statements.isEmpty()) { return [] @@ -1660,7 +1660,7 @@ class Suite implements GroovyInterceptable { * INSERT INTO demo.test_db.t1 VALUES (1); * ''' */ - List spark_iceberg_multi(String sqlStatements, boolean isOrder = false) { + List spark_iceberg_multi(Object sqlStatements, boolean isOrder = false) { return 
spark_sql_multi(sqlStatements, isOrder) } @@ -1676,6 +1676,22 @@ class Suite implements GroovyInterceptable { return spark_sql(sqlStr, isOrder) } + /** + * Execute multiple Spark SQL statements with the Paimon catalog on the Spark ThriftServer via Hive JDBC. + * Statements are separated by semicolons. + * All statements are executed on one JDBC connection to reduce startup overhead. + * + * Usage: + * spark_paimon_multi ''' + * CREATE DATABASE IF NOT EXISTS paimon.test_db; + * CREATE TABLE paimon.test_db.t1 (id INT) USING paimon; + * INSERT INTO paimon.test_db.t1 VALUES (1); + * ''' + */ + List spark_paimon_multi(Object sqlStatements, boolean isOrder = false) { + return spark_sql_multi(sqlStatements, isOrder) + } + List> db2_docker(String sqlStr, boolean isOrder = false) { String cleanedSqlStr = sqlStr.replaceAll("\\s*;\\s*\$", "") def (result, meta) = JdbcUtils.executeToList(context.getDB2DockerConnection(), cleanedSqlStr) diff --git a/regression-test/suites/external_table_p2/iceberg/test_iceberg_spark_doris_consistency_demo.groovy b/regression-test/suites/external_table_p2/iceberg/test_iceberg_spark_doris_consistency_demo.groovy new file mode 100644 index 00000000000000..052b7be513aaca --- /dev/null +++ b/regression-test/suites/external_table_p2/iceberg/test_iceberg_spark_doris_consistency_demo.groovy @@ -0,0 +1,113 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +suite("test_iceberg_spark_doris_consistency_demo", "p2,external,iceberg,external_docker,external_docker_iceberg") { + String enabled = context.config.otherConfigs.get("enableIcebergTest") + if (enabled == null || !enabled.equalsIgnoreCase("true")) { + logger.info("disable iceberg test.") + return + } + + String catalogName = "test_iceberg_spark_doris_consistency_demo" + String dbName = "iceberg_spark_doris_consistency_demo_db" + String restPort = context.config.otherConfigs.get("iceberg_rest_uri_port") + String minioPort = context.config.otherConfigs.get("iceberg_minio_port") + String externalEnvIp = context.config.otherConfigs.get("externalEnvIp") + + def normalizeRows = { rows -> + rows.collect { row -> + row.collect { value -> value == null ? null : value.toString() } + } + } + def expectedRows = [ + [1, "alice", 10], + [2, "bob", 20], + [3, "cindy", null], + [4, "doris", 40] + ] + def expectedAggRows = [[4L, 70L]] + + // Example: execute multiple Spark Iceberg statements in one JDBC connection. + spark_iceberg_multi """ + CREATE DATABASE IF NOT EXISTS demo.${dbName}; + DROP TABLE IF EXISTS demo.${dbName}.spark_written_iceberg_demo; + CREATE TABLE demo.${dbName}.spark_written_iceberg_demo ( + id INT, + name STRING, + score INT + ) USING iceberg; + INSERT INTO demo.${dbName}.spark_written_iceberg_demo VALUES + (1, 'alice', 10), + (2, 'bob', 20), + (3, 'cindy', NULL); + """ + + // Example: write one more Iceberg row through Spark SQL. 
+ spark_iceberg """ + INSERT INTO demo.${dbName}.spark_written_iceberg_demo VALUES + (4, 'doris', 40); + """ + + sql """drop catalog if exists ${catalogName}""" + sql """ + CREATE CATALOG ${catalogName} PROPERTIES ( + 'type'='iceberg', + 'iceberg.catalog.type'='rest', + 'uri' = 'http://${externalEnvIp}:${restPort}', + 's3.access_key' = 'admin', + 's3.secret_key' = 'password', + 's3.endpoint' = 'http://${externalEnvIp}:${minioPort}', + 's3.region' = 'us-east-1' + ); + """ + + sql """switch ${catalogName}""" + + def sparkRows = spark_iceberg """ + SELECT id, name, score + FROM demo.${dbName}.spark_written_iceberg_demo + ORDER BY id + """ + // Example 1: compare Spark Iceberg query result with explicit expected values. + assertEquals(expectedRows, sparkRows) + + def dorisRows = sql """ + SELECT id, name, score + FROM ${dbName}.spark_written_iceberg_demo + ORDER BY id + """ + // Example 1: compare Doris Iceberg query result with explicit expected values. + assertEquals(expectedRows, dorisRows) + + // Example 2: compare Doris and Spark query results. + assertEquals(normalizeRows(sparkRows), normalizeRows(dorisRows)) + + def sparkAggRows = spark_iceberg """ + SELECT count(*), sum(score) + FROM demo.${dbName}.spark_written_iceberg_demo + """ + // Compare Spark Iceberg aggregate result with explicit expected values. + assertEquals(expectedAggRows, sparkAggRows) + + def dorisAggRows = sql """ + SELECT count(*), sum(score) + FROM ${dbName}.spark_written_iceberg_demo + """ + // Doris and Spark JDBC may return the same aggregate value with different + // Java number classes, for example Long vs BigInteger, so normalize before comparing. 
+ assertEquals(normalizeRows(sparkAggRows), normalizeRows(dorisAggRows)) +} diff --git a/regression-test/suites/external_table_p2/paimon/test_paimon_spark_doris_consistency_demo.groovy b/regression-test/suites/external_table_p2/paimon/test_paimon_spark_doris_consistency_demo.groovy new file mode 100644 index 00000000000000..f9023ffd73f32b --- /dev/null +++ b/regression-test/suites/external_table_p2/paimon/test_paimon_spark_doris_consistency_demo.groovy @@ -0,0 +1,111 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +suite("test_paimon_spark_doris_consistency_demo", "p2,external,paimon") { + String enabled = context.config.otherConfigs.get("enablePaimonTest") + if (enabled == null || !enabled.equalsIgnoreCase("true")) { + logger.info("disable paimon test.") + return + } + + String catalogName = "test_paimon_spark_doris_consistency_demo" + String dbName = "paimon_spark_doris_consistency_demo_db" + String minioPort = context.config.otherConfigs.get("iceberg_minio_port") + String externalEnvIp = context.config.otherConfigs.get("externalEnvIp") + + def normalizeRows = { rows -> + rows.collect { row -> + row.collect { value -> value == null ? 
null : value.toString() } + } + } + def expectedRows = [ + [1, "alice", 10], + [2, "bob", 20], + [3, "cindy", null], + [4, "doris", 40] + ] + def expectedAggRows = [[4L, 70L]] + + // Example: execute multiple Spark Paimon statements in one JDBC connection. + spark_paimon_multi """ + CREATE DATABASE IF NOT EXISTS paimon.${dbName}; + DROP TABLE IF EXISTS paimon.${dbName}.spark_written_paimon_demo; + CREATE TABLE paimon.${dbName}.spark_written_paimon_demo ( + id INT, + name STRING, + score INT + ) USING paimon; + INSERT INTO paimon.${dbName}.spark_written_paimon_demo VALUES + (1, 'alice', 10), + (2, 'bob', 20), + (3, 'cindy', NULL); + """ + + // Example: write one more Paimon row through Spark SQL. + spark_paimon """ + INSERT INTO paimon.${dbName}.spark_written_paimon_demo VALUES + (4, 'doris', 40); + """ + + sql """drop catalog if exists ${catalogName}""" + sql """ + CREATE CATALOG ${catalogName} PROPERTIES ( + 'type' = 'paimon', + 'warehouse' = 's3://warehouse/wh', + 's3.endpoint' = 'http://${externalEnvIp}:${minioPort}', + 's3.access_key' = 'admin', + 's3.secret_key' = 'password', + 's3.path.style.access' = 'true' + ); + """ + + sql """switch ${catalogName}""" + + def sparkRows = spark_paimon """ + SELECT id, name, score + FROM paimon.${dbName}.spark_written_paimon_demo + ORDER BY id + """ + // Example 1: compare Spark Paimon query result with explicit expected values. + assertEquals(expectedRows, sparkRows) + + def dorisRows = sql """ + SELECT id, name, score + FROM ${dbName}.spark_written_paimon_demo + ORDER BY id + """ + // Example 1: compare Doris Paimon query result with explicit expected values. + assertEquals(expectedRows, dorisRows) + + // Example 2: compare Doris and Spark query results. + assertEquals(normalizeRows(sparkRows), normalizeRows(dorisRows)) + + def sparkAggRows = spark_paimon """ + SELECT count(*), sum(score) + FROM paimon.${dbName}.spark_written_paimon_demo + """ + // Compare Spark Paimon aggregate result with explicit expected values. 
+ assertEquals(expectedAggRows, sparkAggRows) + + def dorisAggRows = sql """ + SELECT count(*), sum(score) + FROM ${dbName}.spark_written_paimon_demo + """ + // Doris and Spark JDBC may return the same aggregate value with different + // Java number classes, for example Long vs BigInteger, so normalize before comparing. + assertEquals(normalizeRows(sparkAggRows), normalizeRows(dorisAggRows)) +} From 7ec599318c7a57f67df522d710e9d0fb60f76cd4 Mon Sep 17 00:00:00 2001 From: zgxme Date: Sat, 27 Jun 2026 22:44:24 +0800 Subject: [PATCH 08/10] [fix](docker) Restore Paimon setup time zone ### What problem does this PR solve? Issue Number: None Related PR: None Problem Summary: Paimon preinstalled SQL scripts are executed in a shared Spark SQL session. run06.sql changes the session time zone to +08:00 for timestamp partition coverage, but does not restore it before subsequent scripts run. This can make later Paimon bootstrap data depend on session state and change physical file metadata such as partition file size. Restore the session time zone to UTC at the end of run06.sql so later scripts start from the default time zone. 
### Release note None ### Check List (For Author) - Test: Manual test - git diff --check -- docker/thirdparties/docker-compose/iceberg/scripts/create_preinstalled_scripts/paimon/run06.sql - Behavior changed: No - Does this need documentation: No --- .../scripts/create_preinstalled_scripts/paimon/run06.sql | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/docker/thirdparties/docker-compose/iceberg/scripts/create_preinstalled_scripts/paimon/run06.sql b/docker/thirdparties/docker-compose/iceberg/scripts/create_preinstalled_scripts/paimon/run06.sql index eb60255a08e965..a0f52f649fa9a8 100644 --- a/docker/thirdparties/docker-compose/iceberg/scripts/create_preinstalled_scripts/paimon/run06.sql +++ b/docker/thirdparties/docker-compose/iceberg/scripts/create_preinstalled_scripts/paimon/run06.sql @@ -228,4 +228,7 @@ VALUES (1, NULL, 100.0), (2, 'NULL', 200.0), (3, '\\N', 300.0), (4, 'null', 400.0), - (5, 'A', 500.0); \ No newline at end of file + (5, 'A', 500.0); + +-- Restore the session time zone for subsequent preinstalled Paimon scripts. +SET TIME ZONE 'UTC'; From 7bb5698895576254718bf5b84a90c93d45a7268f Mon Sep 17 00:00:00 2001 From: zgxme Date: Sun, 28 Jun 2026 01:06:55 +0800 Subject: [PATCH 09/10] [fix](docker) Revert Paimon setup ordering changes ### What problem does this PR solve? Issue Number: None Related PR: None Problem Summary: The Iceberg docker bootstrap was changed to sort preinstalled SQL script paths before generating the Spark SQL source files, and run06.sql restored the session time zone after its timestamp partition setup. Revert those changes so the bootstrap ordering and Paimon setup SQL match the previous behavior while investigating Paimon partition file size differences. 
### Release note None ### Check List (For Author) - Test: Manual test - git diff --check -- docker/thirdparties/docker-compose/iceberg/entrypoint.sh.tpl docker/thirdparties/docker-compose/iceberg/scripts/create_preinstalled_scripts/paimon/run06.sql - Behavior changed: Yes. Iceberg docker preinstalled SQL path handling returns to the prior unsorted find output behavior, and run06.sql no longer restores session time zone. - Does this need documentation: No --- .../thirdparties/docker-compose/iceberg/entrypoint.sh.tpl | 6 +++--- .../scripts/create_preinstalled_scripts/paimon/run06.sql | 3 --- 2 files changed, 3 insertions(+), 6 deletions(-) diff --git a/docker/thirdparties/docker-compose/iceberg/entrypoint.sh.tpl b/docker/thirdparties/docker-compose/iceberg/entrypoint.sh.tpl index dfdcbd441e82d7..cbb79e7e5f6d1f 100644 --- a/docker/thirdparties/docker-compose/iceberg/entrypoint.sh.tpl +++ b/docker/thirdparties/docker-compose/iceberg/entrypoint.sh.tpl @@ -48,21 +48,21 @@ start-history-server.sh # This approach can reduce the time from 150s to 40s. 
START_TIME1=$(date +%s) -find /mnt/scripts/create_preinstalled_scripts/iceberg -name '*.sql' | sort | sed 's|^|source |' | sed 's|$|;|'> iceberg_total.sql +find /mnt/scripts/create_preinstalled_scripts/iceberg -name '*.sql' | sed 's|^|source |' | sed 's|$|;|'> iceberg_total.sql spark-sql --master spark://doris--spark-iceberg:7077 --conf spark.sql.extensions=org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions -f iceberg_total.sql END_TIME1=$(date +%s) EXECUTION_TIME1=$((END_TIME1 - START_TIME1)) echo "Script iceberg total: {} executed in $EXECUTION_TIME1 seconds" START_TIME2=$(date +%s) -find /mnt/scripts/create_preinstalled_scripts/paimon -name '*.sql' | sort | sed 's|^|source |' | sed 's|$|;|'> paimon_total.sql +find /mnt/scripts/create_preinstalled_scripts/paimon -name '*.sql' | sed 's|^|source |' | sed 's|$|;|'> paimon_total.sql spark-sql --master spark://doris--spark-iceberg:7077 --conf spark.sql.extensions=org.apache.paimon.spark.extensions.PaimonSparkSessionExtensions -f paimon_total.sql END_TIME2=$(date +%s) EXECUTION_TIME2=$((END_TIME2 - START_TIME2)) echo "Script paimon total: {} executed in $EXECUTION_TIME2 seconds" START_TIME3=$(date +%s) -find /mnt/scripts/create_preinstalled_scripts/iceberg_load -name '*.sql' | sort | sed 's|^|source |' | sed 's|$|;|'> iceberg_load_total.sql +find /mnt/scripts/create_preinstalled_scripts/iceberg_load -name '*.sql' | sed 's|^|source |' | sed 's|$|;|'> iceberg_load_total.sql spark-sql --master spark://doris--spark-iceberg:7077 --conf spark.sql.extensions=org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions -f iceberg_load_total.sql END_TIME3=$(date +%s) EXECUTION_TIME3=$((END_TIME3 - START_TIME3)) diff --git a/docker/thirdparties/docker-compose/iceberg/scripts/create_preinstalled_scripts/paimon/run06.sql b/docker/thirdparties/docker-compose/iceberg/scripts/create_preinstalled_scripts/paimon/run06.sql index a0f52f649fa9a8..026bd8aab72f2d 100644 --- 
a/docker/thirdparties/docker-compose/iceberg/scripts/create_preinstalled_scripts/paimon/run06.sql +++ b/docker/thirdparties/docker-compose/iceberg/scripts/create_preinstalled_scripts/paimon/run06.sql @@ -229,6 +229,3 @@ VALUES (1, NULL, 100.0), (3, '\\N', 300.0), (4, 'null', 400.0), (5, 'A', 500.0); - --- Restore the session time zone for subsequent preinstalled Paimon scripts. -SET TIME ZONE 'UTC'; From 84f251b7edc1cf76140037e840da6c62d529c2ba Mon Sep 17 00:00:00 2001 From: zgxme Date: Sun, 28 Jun 2026 08:46:37 +0800 Subject: [PATCH 10/10] fix --- .../thirdparties/docker-compose/iceberg/entrypoint.sh.tpl | 7 +++++++ .../docker-compose/iceberg/spark-defaults.conf | 7 ------- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/docker/thirdparties/docker-compose/iceberg/entrypoint.sh.tpl b/docker/thirdparties/docker-compose/iceberg/entrypoint.sh.tpl index cbb79e7e5f6d1f..e52d2513d98d64 100644 --- a/docker/thirdparties/docker-compose/iceberg/entrypoint.sh.tpl +++ b/docker/thirdparties/docker-compose/iceberg/entrypoint.sh.tpl @@ -76,6 +76,13 @@ spark-sql \ start-thriftserver.sh \ --master spark://doris--spark-iceberg:7077 \ --conf "spark.sql.extensions=${SPARK_THRIFT_EXTENSIONS}" \ + --conf spark.dynamicAllocation.enabled=false \ + --conf spark.cores.max=8 \ + --conf spark.executor.cores=4 \ + --conf spark.executor.memory=8g \ + --conf spark.driver.memory=4g \ + --conf spark.sql.shuffle.partitions=16 \ + --conf spark.default.parallelism=16 \ --driver-java-options "-Dderby.system.home=/tmp/derby" SPARK_THRIFT_READY_ATTEMPTS=0 diff --git a/docker/thirdparties/docker-compose/iceberg/spark-defaults.conf b/docker/thirdparties/docker-compose/iceberg/spark-defaults.conf index ea39e4c051b60c..f05bf40726f877 100644 --- a/docker/thirdparties/docker-compose/iceberg/spark-defaults.conf +++ b/docker/thirdparties/docker-compose/iceberg/spark-defaults.conf @@ -20,13 +20,6 @@ # Example: spark.sql.session.timeZone Asia/Shanghai -spark.dynamicAllocation.enabled false 
-spark.cores.max 8 -spark.executor.cores 4 -spark.executor.memory 8g -spark.driver.memory 4g -spark.sql.shuffle.partitions 16 -spark.default.parallelism 16 spark.sql.catalog.demo org.apache.iceberg.spark.SparkCatalog spark.sql.catalog.demo.type rest