From a954b95a9668cbfda20905048fc8fec4f47d9820 Mon Sep 17 00:00:00 2001 From: daidai Date: Mon, 2 Feb 2026 16:54:28 +0800 Subject: [PATCH] [Enhancement](explain)Display deleteFileNum for FileScanNode when explain verbose (#60308) ### What problem does this PR solve? Problem Summary: This PR enhances the output of EXPLAIN VERBOSE for File Scan nodes by adding the following metrics: `dataFileNum=xxx, deleteFileNum=xxx, deleteSplitNum=xxx`. This is especially useful for Iceberg, Paimon, and Hive ACID tables. These metrics provide more visibility into the underlying file and split layout, helping users better tune parameters and control query performance. Details: `dataFileNum`: The number of distinct data files that need to be read. This is not equivalent to the number of splits, since a single data file can be divided into multiple splits. `deleteFileNum`: The number of distinct delete files that need to be read. `deleteSplitNum`: The total number of delete-file references summed over all data splits (a delete file is counted once for every data split it applies to). It was added because the relationship between data files and delete files is many-to-many: one data file may be associated with multiple delete files, and one delete file may apply to multiple data files. Using deleteSplitNum / dataSplitNum, users can estimate the average number of delete splits that need to be read per data split. 
Example: ``` mysql> explain verbose select * from iceberg.format_v3.dv_test_1w; +-----------------------------------------------------------------------------------------------------------------------------------------------+ | Explain String(Nereids Planner) | +-----------------------------------------------------------------------------------------------------------------------------------------------+ | PLAN FRAGMENT 0 | | OUTPUT EXPRS: | | id[#0] | | grp[#1] | | value[#2] | | ts[#3] | | PARTITION: RANDOM | | | | HAS_COLO_PLAN_NODE: false | | | | VRESULT SINK | | MYSQL_PROTOCOL | | | | 0:VICEBERG_SCAN_NODE(32) | | table: iceberg.format_v3.dv_test_1w | | inputSplitNum=220, totalFileSize=720774, scanRanges=220 | | partition=0/0 | | backends: | | 1769590309070 | | s3://warehouse/wh/format_v3/dv_test_1w/data/00004-51-fc462f9a-d42a-404d-adfc-c8d2781c8d04-0-00001.parquet start: 4 length: 2672 | | s3://warehouse/wh/format_v3/dv_test_1w/data/00003-50-fc462f9a-d42a-404d-adfc-c8d2781c8d04-0-00001.parquet start: 4 length: 2852 | | s3://warehouse/wh/format_v3/dv_test_1w/data/00000-47-fc462f9a-d42a-404d-adfc-c8d2781c8d04-0-00001.parquet start: 4 length: 2894 | | ... other 216 files ... 
| | s3://warehouse/wh/format_v3/dv_test_1w/data/00001-48-fc462f9a-d42a-404d-adfc-c8d2781c8d04-0-00001.parquet start: 58397 length: 13894 | | dataFileNum=10, deleteFileNum=1, deleteSplitNum=220 | | cardinality=33334, numNodes=1 | | pushdown agg=NONE | | tuple ids: 0 | | | | Tuples: | | TupleDescriptor{id=0, tbl=dv_test_1w} | | SlotDescriptor{id=0, col=id, colUniqueId=1, type=bigint, nullable=true, isAutoIncrement=false, subColPath=null, virtualColumn=null} | | SlotDescriptor{id=1, col=grp, colUniqueId=2, type=int, nullable=true, isAutoIncrement=false, subColPath=null, virtualColumn=null} | | SlotDescriptor{id=2, col=value, colUniqueId=3, type=int, nullable=true, isAutoIncrement=false, subColPath=null, virtualColumn=null} | | SlotDescriptor{id=3, col=ts, colUniqueId=4, type=datetimev2(6), nullable=true, isAutoIncrement=false, subColPath=null, virtualColumn=null} | | | | | | | | | | ========== STATISTICS ========== | +-----------------------------------------------------------------------------------------------------------------------------------------------+ ``` --- .../apache/doris/datasource/FileScanNode.java | 32 +++++++++++++++ .../datasource/hive/source/HiveScanNode.java | 31 ++++++++++++++ .../iceberg/source/IcebergScanNode.java | 40 +++++++++++++++++++ .../paimon/source/PaimonScanNode.java | 22 ++++++++++ .../hive/test_transactional_hive.groovy | 8 ++++ .../test_iceberg_position_delete.groovy | 10 +++++ .../test_paimon_deletion_vector_oss.groovy | 9 +++++ 7 files changed, 152 insertions(+) diff --git a/fe/fe-core/src/main/java/org/apache/doris/datasource/FileScanNode.java b/fe/fe-core/src/main/java/org/apache/doris/datasource/FileScanNode.java index 46c69247bf1bb8..8af0096e007b5a 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/datasource/FileScanNode.java +++ b/fe/fe-core/src/main/java/org/apache/doris/datasource/FileScanNode.java @@ -55,8 +55,10 @@ import java.util.Collections; import java.util.Comparator; +import java.util.HashSet; import 
java.util.List; import java.util.Map; +import java.util.Set; import java.util.stream.Collectors; /** @@ -104,6 +106,17 @@ public long getTotalFileSize() { return totalFileSize; } + /** + * Get all delete files for the given file range. + * @param rangeDesc the file range descriptor + * @return list of delete file paths (formatted strings) + */ + protected List getDeleteFiles(TFileRangeDesc rangeDesc) { + // Default implementation: return empty list + // Subclasses should override this method + return Collections.emptyList(); + } + @Override public String getNodeExplainString(String prefix, TExplainLevel detailLevel) { StringBuilder output = new StringBuilder(); @@ -149,6 +162,21 @@ public int compare(TFileRangeDesc o1, TFileRangeDesc o2) { return Long.compare(o1.getStartOffset(), o2.getStartOffset()); } }); + + // A Data file may be divided into different splits, so a set is used to remove duplicates. + Set dataFilesSet = new HashSet<>(); + // A delete file might be used by multiple data files, so use set to remove duplicates. + Set deleteFilesSet = new HashSet<>(); + // You can estimate how many delete splits need to be read for a data split + // using deleteSplitNum / dataSplitNum(fileRangeDescs.size()) split. + long deleteSplitNum = 0; + for (TFileRangeDesc fileRangeDesc : fileRangeDescs) { + dataFilesSet.add(fileRangeDesc.getPath()); + List deletefiles = getDeleteFiles(fileRangeDesc); + deleteFilesSet.addAll(deletefiles); + deleteSplitNum += deletefiles.size(); + } + // 3. if size <= 4, print all. 
if size > 4, print first 3 and last 1 int size = fileRangeDescs.size(); if (size <= 4) { @@ -174,6 +202,10 @@ public int compare(TFileRangeDesc o1, TFileRangeDesc o2) { .append(" length: ").append(file.getSize()) .append("\n"); } + output.append(prefix).append(" ").append("dataFileNum=").append(dataFilesSet.size()) + .append(", deleteFileNum=").append(deleteFilesSet.size()) + .append(", deleteSplitNum=").append(deleteSplitNum) + .append("\n"); } } diff --git a/fe/fe-core/src/main/java/org/apache/doris/datasource/hive/source/HiveScanNode.java b/fe/fe-core/src/main/java/org/apache/doris/datasource/hive/source/HiveScanNode.java index 2a524a4138d33c..c9317b459823fa 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/datasource/hive/source/HiveScanNode.java +++ b/fe/fe-core/src/main/java/org/apache/doris/datasource/hive/source/HiveScanNode.java @@ -443,6 +443,37 @@ protected void setScanParams(TFileRangeDesc rangeDesc, Split split) { } } + @Override + protected List getDeleteFiles(TFileRangeDesc rangeDesc) { + List deleteFiles = new ArrayList<>(); + if (rangeDesc == null || !rangeDesc.isSetTableFormatParams()) { + return deleteFiles; + } + TTableFormatFileDesc tableFormatParams = rangeDesc.getTableFormatParams(); + if (tableFormatParams == null || !tableFormatParams.isSetTransactionalHiveParams()) { + return deleteFiles; + } + TTransactionalHiveDesc hiveParams = tableFormatParams.getTransactionalHiveParams(); + if (hiveParams == null || !hiveParams.isSetDeleteDeltas()) { + return deleteFiles; + } + List deleteDeltas = hiveParams.getDeleteDeltas(); + if (deleteDeltas == null) { + return deleteFiles; + } + // Format: {directory_location}/{file_name} + for (TTransactionalHiveDeleteDeltaDesc deleteDelta : deleteDeltas) { + if (deleteDelta != null && deleteDelta.isSetDirectoryLocation() + && deleteDelta.isSetFileNames() && deleteDelta.getFileNames() != null) { + String directoryLocation = deleteDelta.getDirectoryLocation(); + for (String fileName : 
deleteDelta.getFileNames()) { + deleteFiles.add(directoryLocation + "/" + fileName); + } + } + } + return deleteFiles; + } + @Override protected Map getLocationProperties() { return hmsTable.getBackendStorageProperties(); diff --git a/fe/fe-core/src/main/java/org/apache/doris/datasource/iceberg/source/IcebergScanNode.java b/fe/fe-core/src/main/java/org/apache/doris/datasource/iceberg/source/IcebergScanNode.java index 3101d9f22f3727..debfa513e62e48 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/datasource/iceberg/source/IcebergScanNode.java +++ b/fe/fe-core/src/main/java/org/apache/doris/datasource/iceberg/source/IcebergScanNode.java @@ -269,6 +269,46 @@ private void setIcebergParams(TFileRangeDesc rangeDesc, IcebergSplit icebergSpli rangeDesc.setTableFormatParams(tableFormatFileDesc); } + @Override + protected List getDeleteFiles(TFileRangeDesc rangeDesc) { + List deleteFiles = new ArrayList<>(); + if (rangeDesc == null || !rangeDesc.isSetTableFormatParams()) { + return deleteFiles; + } + TTableFormatFileDesc tableFormatParams = rangeDesc.getTableFormatParams(); + if (tableFormatParams == null || !tableFormatParams.isSetIcebergParams()) { + return deleteFiles; + } + TIcebergFileDesc icebergParams = tableFormatParams.getIcebergParams(); + if (icebergParams == null || !icebergParams.isSetDeleteFiles()) { + return deleteFiles; + } + List icebergDeleteFiles = icebergParams.getDeleteFiles(); + if (icebergDeleteFiles == null) { + return deleteFiles; + } + for (TIcebergDeleteFileDesc deleteFile : icebergDeleteFiles) { + if (deleteFile != null && deleteFile.isSetPath()) { + deleteFiles.add(deleteFile.getPath()); + } + } + return deleteFiles; + } + + private String getDeleteFileContentType(int content) { + // Iceberg file type: 0: data, 1: position delete, 2: equality delete, 3: deletion vector + switch (content) { + case 1: + return "position_delete"; + case 2: + return "equality_delete"; + case 3: + return "deletion_vector"; + default: + return "unknown"; + } + } 
+ @Override public List getSplits(int numBackends) throws UserException { try { diff --git a/fe/fe-core/src/main/java/org/apache/doris/datasource/paimon/source/PaimonScanNode.java b/fe/fe-core/src/main/java/org/apache/doris/datasource/paimon/source/PaimonScanNode.java index e1f746a70e809f..fc35282066e7e8 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/datasource/paimon/source/PaimonScanNode.java +++ b/fe/fe-core/src/main/java/org/apache/doris/datasource/paimon/source/PaimonScanNode.java @@ -276,6 +276,28 @@ private void setPaimonParams(TFileRangeDesc rangeDesc, PaimonSplit paimonSplit) rangeDesc.setTableFormatParams(tableFormatFileDesc); } + @Override + protected List getDeleteFiles(TFileRangeDesc rangeDesc) { + List deleteFiles = new ArrayList<>(); + if (rangeDesc == null || !rangeDesc.isSetTableFormatParams()) { + return deleteFiles; + } + TTableFormatFileDesc tableFormatParams = rangeDesc.getTableFormatParams(); + if (tableFormatParams == null || !tableFormatParams.isSetPaimonParams()) { + return deleteFiles; + } + TPaimonFileDesc paimonParams = tableFormatParams.getPaimonParams(); + if (paimonParams == null || !paimonParams.isSetDeletionFile()) { + return deleteFiles; + } + TPaimonDeletionFileDesc deletionFile = paimonParams.getDeletionFile(); + if (deletionFile != null && deletionFile.isSetPath()) { + // Format: path [offset: offset, length: length] + deleteFiles.add(deletionFile.getPath()); + } + return deleteFiles; + } + @Override public List getSplits(int numBackends) throws UserException { boolean forceJniScanner = sessionVariable.isForceJniScanner(); diff --git a/regression-test/suites/external_table_p0/hive/test_transactional_hive.groovy b/regression-test/suites/external_table_p0/hive/test_transactional_hive.groovy index 568bb632decbe9..adc97540665c37 100644 --- a/regression-test/suites/external_table_p0/hive/test_transactional_hive.groovy +++ b/regression-test/suites/external_table_p0/hive/test_transactional_hive.groovy @@ -142,6 +142,13 @@ 
suite("test_transactional_hive", "p0,external,hive,external_docker,external_dock qt_count_5 """ select count(*) from orc_acid_major; """ //3 } + def test_explain_verbose = { + explain { + sql ("select count(*) from orc_full_acid") + verbose (true) + contains "deleteFileNum" + } + } String enabled = context.config.otherConfigs.get("enableHiveTest") if (enabled == null || !enabled.equalsIgnoreCase("true")) { @@ -177,6 +184,7 @@ suite("test_transactional_hive", "p0,external,hive,external_docker,external_dock test_acid_count() + test_explain_verbose() q01_par_limit() diff --git a/regression-test/suites/external_table_p0/iceberg/test_iceberg_position_delete.groovy b/regression-test/suites/external_table_p0/iceberg/test_iceberg_position_delete.groovy index d793cef3568ddb..91bc48a6758025 100644 --- a/regression-test/suites/external_table_p0/iceberg/test_iceberg_position_delete.groovy +++ b/regression-test/suites/external_table_p0/iceberg/test_iceberg_position_delete.groovy @@ -165,6 +165,16 @@ suite("test_iceberg_position_delete", "p0,external,doris,external_docker,externa assertTrue(iceberg_position_gen_7.size() == 5632) // sql """drop catalog ${catalog_name}""" + + def test_explain_verbose = { + explain { + sql ("select name from iceberg_position_gen_data where id != 5;") + verbose (true) + contains "deleteFileNum" + } + } + test_explain_verbose() + } /* diff --git a/regression-test/suites/external_table_p0/paimon/test_paimon_deletion_vector_oss.groovy b/regression-test/suites/external_table_p0/paimon/test_paimon_deletion_vector_oss.groovy index 71a4d971169ea8..76574aae528474 100644 --- a/regression-test/suites/external_table_p0/paimon/test_paimon_deletion_vector_oss.groovy +++ b/regression-test/suites/external_table_p0/paimon/test_paimon_deletion_vector_oss.groovy @@ -53,8 +53,17 @@ suite("test_paimon_deletion_vector_oss", "p0,external,doris,external_docker,exte qt_6 """select * from deletion_vector_parquet where id > 2 order by id;""" } + def test_explain_verbose = { 
+ explain { + sql ("select * from deletion_vector_orc;") + verbose (true) + contains "deleteFileNum" + } + } + test_cases("false") test_cases("true") + test_explain_verbose() } finally { sql """set force_jni_scanner=false"""