From e0a3dc35be2a7d244737fb9b72644071b19f2b73 Mon Sep 17 00:00:00 2001 From: Gabriel Date: Mon, 22 Jun 2026 14:22:40 +0800 Subject: [PATCH 1/4] [fix](hive) Preserve empty text records --- be/src/format/csv/csv_reader.cpp | 18 ++++++++++++++---- be/src/format/csv/csv_reader.h | 1 + be/src/format/text/text_reader.cpp | 6 ++++++ be/src/format/text/text_reader.h | 1 + .../scripts/data/regression/serde_prop/run.sh | 13 +++++++++++++ .../regression/serde_prop/some_serde_table.hql | 14 +++++++++++++- .../hive/test_hive_serde_prop.out | 12 ++++++++++++ .../hive/test_hive_serde_prop.groovy | 9 +++++++++ 8 files changed, 69 insertions(+), 5 deletions(-) diff --git a/be/src/format/csv/csv_reader.cpp b/be/src/format/csv/csv_reader.cpp index c4837d65fb3288..3d1e978ffe911f 100644 --- a/be/src/format/csv/csv_reader.cpp +++ b/be/src/format/csv/csv_reader.cpp @@ -436,8 +436,10 @@ Status CsvReader::_do_get_next_block(Block* block, size_t* read_rows, bool* eof) continue; } if (size == 0) { - if (!_line_reader_eof && _state->is_read_csv_empty_line_as_null()) { - ++rows; + if (!_line_reader_eof) { + if (_empty_line_as_record() || _state->is_read_csv_empty_line_as_null()) { + ++rows; + } } // Read empty line, continue continue; @@ -475,8 +477,16 @@ Status CsvReader::_do_get_next_block(Block* block, size_t* read_rows, bool* eof) continue; } if (size == 0) { - if (!_line_reader_eof && _state->is_read_csv_empty_line_as_null()) { - RETURN_IF_ERROR(_fill_empty_line(columns, &rows)); + if (!_line_reader_eof) { + if (_empty_line_as_record()) { + Slice empty_line("", 0); + RETURN_IF_ERROR(_validate_line(empty_line, &success)); + if (success) { + RETURN_IF_ERROR(_fill_dest_columns(empty_line, columns, &rows)); + } + } else if (_state->is_read_csv_empty_line_as_null()) { + RETURN_IF_ERROR(_fill_empty_line(columns, &rows)); + } } // Read empty line, continue continue; diff --git a/be/src/format/csv/csv_reader.h b/be/src/format/csv/csv_reader.h index 80938abd271231..46b8ffd6718989 100644 --- 
a/be/src/format/csv/csv_reader.h +++ b/be/src/format/csv/csv_reader.h @@ -207,6 +207,7 @@ class CsvReader : public TableFormatReader { virtual Status _create_line_reader(); virtual Status _deserialize_one_cell(DataTypeSerDeSPtr serde, IColumn& column, Slice& slice); virtual Status _deserialize_nullable_string(IColumn& column, Slice& slice); + virtual bool _empty_line_as_record() const { return false; } // check the utf8 encoding of a line. // return error status to stop processing. // If return Status::OK but "success" is false, which means this is load request diff --git a/be/src/format/text/text_reader.cpp b/be/src/format/text/text_reader.cpp index c118c21adda9d9..946c4c3d4fc1d7 100644 --- a/be/src/format/text/text_reader.cpp +++ b/be/src/format/text/text_reader.cpp @@ -168,6 +168,12 @@ Status TextReader::_validate_line(const Slice& line, bool* success) { return Status::OK(); } +bool TextReader::_empty_line_as_record() const { + return _params.__isset.num_of_columns_from_file && _params.num_of_columns_from_file == 1 && + !(_options.null_len == 2 && _options.null_format[0] == '\\' && + _options.null_format[1] == 'N'); +} + Status TextReader::_deserialize_nullable_string(IColumn& column, Slice& slice) { // Hot path of hive text load, see CsvReader::_deserialize_nullable_string. 
The // column type was verified by the checked assert_cast in diff --git a/be/src/format/text/text_reader.h b/be/src/format/text/text_reader.h index c0cebf5da77ffd..dff4159208d1ed 100644 --- a/be/src/format/text/text_reader.h +++ b/be/src/format/text/text_reader.h @@ -67,6 +67,7 @@ class TextReader : public CsvReader { Status _deserialize_one_cell(DataTypeSerDeSPtr serde, IColumn& column, Slice& slice) override; Status _validate_line(const Slice& line, bool* success) override; Status _deserialize_nullable_string(IColumn& column, Slice& slice) override; + bool _empty_line_as_record() const override; }; } // namespace doris diff --git a/docker/thirdparties/docker-compose/hive/scripts/data/regression/serde_prop/run.sh b/docker/thirdparties/docker-compose/hive/scripts/data/regression/serde_prop/run.sh index ef6538563d5b58..866f5f026fad63 100755 --- a/docker/thirdparties/docker-compose/hive/scripts/data/regression/serde_prop/run.sh +++ b/docker/thirdparties/docker-compose/hive/scripts/data/regression/serde_prop/run.sh @@ -3,6 +3,19 @@ set -x CUR_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" &>/dev/null && pwd)" +DATA_FILE="$(mktemp /tmp/test_single_col_null_format_text.XXXXXX)" +trap 'rm -f "${DATA_FILE}"' EXIT +cat > "${DATA_FILE}" <<'EOF' +null_value + +non-null +\N +EOF + +hadoop fs -rm -r -f /user/doris/suites/regression/serde_prop/test_single_col_null_format_text || true +hadoop fs -mkdir -p /user/doris/suites/regression/serde_prop/test_single_col_null_format_text +hadoop fs -put "${DATA_FILE}" /user/doris/suites/regression/serde_prop/test_single_col_null_format_text/part-00000 + # create table hive -f "${CUR_DIR}"/some_serde_table.hql diff --git a/docker/thirdparties/docker-compose/hive/scripts/data/regression/serde_prop/some_serde_table.hql b/docker/thirdparties/docker-compose/hive/scripts/data/regression/serde_prop/some_serde_table.hql index df03f36a8dae22..e542cf4d914e7c 100644 --- 
a/docker/thirdparties/docker-compose/hive/scripts/data/regression/serde_prop/some_serde_table.hql +++ b/docker/thirdparties/docker-compose/hive/scripts/data/regression/serde_prop/some_serde_table.hql @@ -226,4 +226,16 @@ STORED AS TEXTFILE; INSERT INTO TABLE test_empty_null_defined_text VALUES (1, 'Alice'), (2, NULL), - (3, ''); \ No newline at end of file + (3, ''); + +drop table if exists test_single_col_null_format_text; + +create external table test_single_col_null_format_text ( + name STRING +) +ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe' +WITH SERDEPROPERTIES ( + "serialization.null.format"="null_value" +) +STORED AS TEXTFILE +LOCATION '/user/doris/suites/regression/serde_prop/test_single_col_null_format_text'; diff --git a/regression-test/data/external_table_p0/hive/test_hive_serde_prop.out b/regression-test/data/external_table_p0/hive/test_hive_serde_prop.out index cda92c0519ad51..9c256963d1c86e 100644 --- a/regression-test/data/external_table_p0/hive/test_hive_serde_prop.out +++ b/regression-test/data/external_table_p0/hive/test_hive_serde_prop.out @@ -73,6 +73,12 @@ b 2.2 -- !test_empty_null_defined_text3 -- +-- !test_single_col_null_format_text_count -- +4 + +-- !test_single_col_null_format_text_values -- +4 3 1 1 1 1 + -- !1 -- a 1.1 b 2.2 @@ -147,3 +153,9 @@ b 2.2 -- !test_empty_null_defined_text3 -- +-- !test_single_col_null_format_text_count -- +4 + +-- !test_single_col_null_format_text_values -- +4 3 1 1 1 1 + diff --git a/regression-test/suites/external_table_p0/hive/test_hive_serde_prop.groovy b/regression-test/suites/external_table_p0/hive/test_hive_serde_prop.groovy index 70306968852bb1..67eef27d22773b 100644 --- a/regression-test/suites/external_table_p0/hive/test_hive_serde_prop.groovy +++ b/regression-test/suites/external_table_p0/hive/test_hive_serde_prop.groovy @@ -63,6 +63,15 @@ suite("test_hive_serde_prop", "p0,external") { qt_test_empty_null_defined_text """select * from 
${catalog_name}.regression.test_empty_null_defined_text order by id;""" qt_test_empty_null_defined_text2 """select * from ${catalog_name}.regression.test_empty_null_defined_text where name is null order by id;""" qt_test_empty_null_defined_text3 """select * from ${catalog_name}.regression.test_empty_null_defined_text where name = '' order by id;""" + + qt_test_single_col_null_format_text_count """select count(*) from ${catalog_name}.regression.test_single_col_null_format_text;""" + qt_test_single_col_null_format_text_values """ + select count(*), count(name), count(case when name is null then 1 end), + count(case when name = '' then 1 end), + count(case when name = 'non-null' then 1 end), + count(case when name is not null and name not in ('', 'non-null') then 1 end) + from ${catalog_name}.regression.test_single_col_null_format_text; + """ } } From f4c91951c34cb81ff2d0bd00addb4e01ae88e636 Mon Sep 17 00:00:00 2001 From: Gabriel Date: Thu, 25 Jun 2026 09:32:36 +0800 Subject: [PATCH 2/4] [fix](hive) Treat every empty Hive text line as a record --- be/src/format/text/text_reader.cpp | 6 ++--- .../scripts/data/regression/serde_prop/run.sh | 22 ++++++++++++++----- .../serde_prop/some_serde_table.hql | 10 +++++++++ .../hive/test_hive_serde_prop.out | 20 +++++++++++++---- .../hive/test_hive_serde_prop.groovy | 11 ++++++++++ 5 files changed, 57 insertions(+), 12 deletions(-) diff --git a/be/src/format/text/text_reader.cpp b/be/src/format/text/text_reader.cpp index 946c4c3d4fc1d7..23501f94cd6122 100644 --- a/be/src/format/text/text_reader.cpp +++ b/be/src/format/text/text_reader.cpp @@ -169,9 +169,9 @@ Status TextReader::_validate_line(const Slice& line, bool* success) { } bool TextReader::_empty_line_as_record() const { - return _params.__isset.num_of_columns_from_file && _params.num_of_columns_from_file == 1 && - !(_options.null_len == 2 && _options.null_format[0] == '\\' && - _options.null_format[1] == 'N'); + // Hive TEXTFILE treats an empty physical line as a record. 
The splitter maps it + // to one empty field and missing trailing fields are filled with null_format. + return true; } Status TextReader::_deserialize_nullable_string(IColumn& column, Slice& slice) { diff --git a/docker/thirdparties/docker-compose/hive/scripts/data/regression/serde_prop/run.sh b/docker/thirdparties/docker-compose/hive/scripts/data/regression/serde_prop/run.sh index 866f5f026fad63..c4f8e7c5d961e2 100755 --- a/docker/thirdparties/docker-compose/hive/scripts/data/regression/serde_prop/run.sh +++ b/docker/thirdparties/docker-compose/hive/scripts/data/regression/serde_prop/run.sh @@ -3,18 +3,30 @@ set -x CUR_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" &>/dev/null && pwd)" -DATA_FILE="$(mktemp /tmp/test_single_col_null_format_text.XXXXXX)" -trap 'rm -f "${DATA_FILE}"' EXIT -cat > "${DATA_FILE}" <<'EOF' +SINGLE_COL_DATA_FILE="$(mktemp /tmp/test_single_col_null_format_text.XXXXXX)" +DEFAULT_MULTI_COL_DATA_FILE="$(mktemp /tmp/test_default_null_format_multi_col_text.XXXXXX)" +trap 'rm -f "${SINGLE_COL_DATA_FILE}" "${DEFAULT_MULTI_COL_DATA_FILE}"' EXIT +cat > "${SINGLE_COL_DATA_FILE}" <<'EOF' +null_value null_value - non-null + \N EOF +{ + printf 'a\tb\n' + printf '\n' + printf '\\N\t\\N\n' +} > "${DEFAULT_MULTI_COL_DATA_FILE}" + hadoop fs -rm -r -f /user/doris/suites/regression/serde_prop/test_single_col_null_format_text || true hadoop fs -mkdir -p /user/doris/suites/regression/serde_prop/test_single_col_null_format_text -hadoop fs -put "${DATA_FILE}" /user/doris/suites/regression/serde_prop/test_single_col_null_format_text/part-00000 +hadoop fs -put "${SINGLE_COL_DATA_FILE}" /user/doris/suites/regression/serde_prop/test_single_col_null_format_text/part-00000 + +hadoop fs -rm -r -f /user/doris/suites/regression/serde_prop/test_default_null_format_multi_col_text || true +hadoop fs -mkdir -p /user/doris/suites/regression/serde_prop/test_default_null_format_multi_col_text +hadoop fs -put "${DEFAULT_MULTI_COL_DATA_FILE}" 
/user/doris/suites/regression/serde_prop/test_default_null_format_multi_col_text/part-00000 # create table hive -f "${CUR_DIR}"/some_serde_table.hql diff --git a/docker/thirdparties/docker-compose/hive/scripts/data/regression/serde_prop/some_serde_table.hql b/docker/thirdparties/docker-compose/hive/scripts/data/regression/serde_prop/some_serde_table.hql index e542cf4d914e7c..bea1e461bf6f5c 100644 --- a/docker/thirdparties/docker-compose/hive/scripts/data/regression/serde_prop/some_serde_table.hql +++ b/docker/thirdparties/docker-compose/hive/scripts/data/regression/serde_prop/some_serde_table.hql @@ -239,3 +239,13 @@ WITH SERDEPROPERTIES ( ) STORED AS TEXTFILE LOCATION '/user/doris/suites/regression/serde_prop/test_single_col_null_format_text'; + +drop table if exists test_default_null_format_multi_col_text; + +create external table test_default_null_format_multi_col_text ( + c1 STRING, + c2 STRING +) +ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe' +STORED AS TEXTFILE +LOCATION '/user/doris/suites/regression/serde_prop/test_default_null_format_multi_col_text'; diff --git a/regression-test/data/external_table_p0/hive/test_hive_serde_prop.out b/regression-test/data/external_table_p0/hive/test_hive_serde_prop.out index 9c256963d1c86e..36866613260b8a 100644 --- a/regression-test/data/external_table_p0/hive/test_hive_serde_prop.out +++ b/regression-test/data/external_table_p0/hive/test_hive_serde_prop.out @@ -74,10 +74,16 @@ b 2.2 -- !test_empty_null_defined_text3 -- -- !test_single_col_null_format_text_count -- -4 +5 -- !test_single_col_null_format_text_values -- -4 3 1 1 1 1 +5 3 2 1 1 1 + +-- !test_default_null_format_multi_col_text_count -- +3 + +-- !test_default_null_format_multi_col_text_values -- +3 2 1 1 1 2 0 1 -- !1 -- a 1.1 @@ -154,8 +160,14 @@ b 2.2 -- !test_empty_null_defined_text3 -- -- !test_single_col_null_format_text_count -- -4 +5 -- !test_single_col_null_format_text_values -- -4 3 1 1 1 1 +5 3 2 1 1 1 + +-- 
!test_default_null_format_multi_col_text_count -- +3 + +-- !test_default_null_format_multi_col_text_values -- +3 2 1 1 1 2 0 1 diff --git a/regression-test/suites/external_table_p0/hive/test_hive_serde_prop.groovy b/regression-test/suites/external_table_p0/hive/test_hive_serde_prop.groovy index 67eef27d22773b..24efc34f4485cf 100644 --- a/regression-test/suites/external_table_p0/hive/test_hive_serde_prop.groovy +++ b/regression-test/suites/external_table_p0/hive/test_hive_serde_prop.groovy @@ -72,6 +72,17 @@ suite("test_hive_serde_prop", "p0,external") { count(case when name is not null and name not in ('', 'non-null') then 1 end) from ${catalog_name}.regression.test_single_col_null_format_text; """ + + qt_test_default_null_format_multi_col_text_count """select count(*) from ${catalog_name}.regression.test_default_null_format_multi_col_text;""" + qt_test_default_null_format_multi_col_text_values """ + select count(*), count(c1), count(c2), + count(case when c1 is null then 1 end), + count(case when c1 = '' then 1 end), + count(case when c2 is null then 1 end), + count(case when c2 = '' then 1 end), + count(case when c1 = 'a' and c2 = 'b' then 1 end) + from ${catalog_name}.regression.test_default_null_format_multi_col_text; + """ } } From 684b4229475882f03a86c01a6e5d0b076d2e060f Mon Sep 17 00:00:00 2001 From: Gabriel Date: Thu, 25 Jun 2026 11:40:03 +0800 Subject: [PATCH 3/4] [fix](regression) Fix Hive text serde delimiter fixture ### What problem does this PR solve? Issue Number: close #xxx Related PR: #xxx Problem Summary: The Hive serde regression fixture added a multi-column text table to verify the default Hive null marker, but the table DDL did not specify a tab field delimiter while the prepared data file used tab-separated rows. Hive LazySimpleSerDe defaults to Ctrl-A as the field delimiter, so Doris did not split rows such as a\tb into two columns and the expected aggregation result mismatched. 
This change declares the table with ROW FORMAT DELIMITED and FIELDS TERMINATED BY '\t' while leaving serialization.null.format unset, so the case continues to validate the default \N null format. ### Release note None ### Check List (For Author) - Test: Regression test - Ran ./run-regression-test.sh --run -d external_table_p0/hive -s test_hive_serde_prop; the suite succeeded but the Hive test body was skipped because enableHiveTest=false in the local config. - Ran git diff --check. - Behavior changed: No - Does this need documentation: No --- .../scripts/data/regression/serde_prop/some_serde_table.hql | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/docker/thirdparties/docker-compose/hive/scripts/data/regression/serde_prop/some_serde_table.hql b/docker/thirdparties/docker-compose/hive/scripts/data/regression/serde_prop/some_serde_table.hql index bea1e461bf6f5c..4625f0cbb35a73 100644 --- a/docker/thirdparties/docker-compose/hive/scripts/data/regression/serde_prop/some_serde_table.hql +++ b/docker/thirdparties/docker-compose/hive/scripts/data/regression/serde_prop/some_serde_table.hql @@ -246,6 +246,7 @@ create external table test_default_null_format_multi_col_text ( c1 STRING, c2 STRING ) -ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe' +ROW FORMAT DELIMITED +FIELDS TERMINATED BY '\t' STORED AS TEXTFILE LOCATION '/user/doris/suites/regression/serde_prop/test_default_null_format_multi_col_text'; From c8eed8fa65c99f85cb7319d0e8784e6ce80c0835 Mon Sep 17 00:00:00 2001 From: Gabriel Date: Thu, 25 Jun 2026 11:53:03 +0800 Subject: [PATCH 4/4] [fix](regression) Reupload credit Hive data after table refresh ### What problem does this PR solve? Issue Number: close #xxx Related PR: #64671 Problem Summary: Refreshing the Hive regression module reruns all data/regression run.sh scripts. The credit-data fixture uploaded files before executing its Hive DDL, but the DDL drops and recreates the managed Hive table. 
On refresh, DROP TABLE can remove the table location after the files have just been uploaded, leaving crdmm_data empty and causing test_external_credit_data to return no rows. This change runs the DDL first, then recreates the target HDFS directory and uploads the credit-data files into the table location. ### Release note None ### Check List (For Author) - Test: Regression test - Ran bash -n docker/thirdparties/docker-compose/hive/scripts/data/regression/crdmm_data/run.sh. - Ran git diff --check. - Ran ./run-regression-test.sh --run -d external_table_p0/hive -s test_external_credit_data; the suite succeeded but the Hive test body was skipped because enableHiveTest=false in the local config. - Behavior changed: No - Does this need documentation: No --- .../hive/scripts/data/regression/crdmm_data/run.sh | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/docker/thirdparties/docker-compose/hive/scripts/data/regression/crdmm_data/run.sh b/docker/thirdparties/docker-compose/hive/scripts/data/regression/crdmm_data/run.sh index f650ead89d7a2f..5197e8b92762bd 100755 --- a/docker/thirdparties/docker-compose/hive/scripts/data/regression/crdmm_data/run.sh +++ b/docker/thirdparties/docker-compose/hive/scripts/data/regression/crdmm_data/run.sh @@ -4,10 +4,9 @@ set -x CUR_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" &>/dev/null && pwd)" -hadoop fs -mkdir -p /user/doris/suites/regression/ -hadoop fs -put "${CUR_DIR}"/data/* /user/doris/suites/regression/ - # create table hive -f "${CUR_DIR}"/create_table.hql - +hadoop fs -rm -r -f /user/doris/suites/regression/crdmm_data || true +hadoop fs -mkdir -p /user/doris/suites/regression/crdmm_data +hadoop fs -put "${CUR_DIR}"/data/crdmm_data/* /user/doris/suites/regression/crdmm_data/