From f4ce7413ab535b0a04cadb6a38916ac3c5889b0c Mon Sep 17 00:00:00 2001 From: Gabriel Date: Thu, 25 Jun 2026 15:22:51 +0800 Subject: [PATCH] [fix](hive) Preserve empty text records (#64671) ### What problem does this PR solve? Issue Number: close #xxx Problem Summary: When scanning Hive TEXTFILE tables, Doris previously skipped empty physical lines unless `read_csv_empty_line_as_null` was enabled. This is inconsistent with Hive TEXTFILE semantics: an empty physical line is still a record. For a single-column text table it represents one empty field, and for multi-column text tables missing trailing fields should be filled using the table's null format. This can cause Doris to return fewer rows than Hive for text files containing empty lines, especially when the table uses `LazySimpleSerDe` and custom or default `serialization.null.format`. This PR fixes the behavior by adding a format-level hook for empty-line handling: - CSV keeps the existing default behavior and does not treat empty lines as records. - Hive TEXT overrides the hook and treats empty physical lines as records. - Empty Hive text lines are passed through normal field deserialization so string/null handling stays consistent with `null_format`. The PR also adds Hive regression coverage for: - a single-column text table with custom `serialization.null.format`; - a multi-column text table using the default Hive null marker `\N`; - preservation of empty records and correct NULL/empty-string classification. In addition, the credit-data Hive fixture upload order is made refresh-safe. The Hive regression module refresh may rerun all `data/regression` setup scripts; `crdmm_data` now recreates the Hive table before re-uploading its HDFS data so `DROP TABLE` cannot remove freshly uploaded files. ### Release note Fix Hive TEXTFILE scans to preserve empty physical lines as records, matching Hive behavior. ### Check List (For Author) - Test: Regression test - Added/updated `external_table_p0/hive/test_hive_serde_prop`. - Ran `./run-regression-test.sh --run -d external_table_p0/hive -s test_hive_serde_prop`; local config had `enableHiveTest=false`, so the Hive test body was skipped. - Ran `./run-regression-test.sh --run -d external_table_p0/hive -s test_external_credit_data`; local config had `enableHiveTest=false`, so the Hive test body was skipped. - Ran `bash -n docker/thirdparties/docker-compose/hive/scripts/data/regression/crdmm_data/run.sh`. - Ran `git diff --check`. - Behavior changed: Yes. Hive TEXTFILE scans now preserve empty physical lines as records instead of skipping them. - Does this need documentation: No --- be/src/format/csv/csv_reader.cpp | 18 ++++++++++--- be/src/format/csv/csv_reader.h | 1 + be/src/format/text/text_reader.cpp | 6 +++++ be/src/format/text/text_reader.h | 1 + .../scripts/data/regression/crdmm_data/run.sh | 7 +++--- .../scripts/data/regression/serde_prop/run.sh | 25 +++++++++++++++++++ .../serde_prop/some_serde_table.hql | 25 ++++++++++++++++++- .../hive/test_hive_serde_prop.out | 24 ++++++++++++++++++ .../hive/test_hive_serde_prop.groovy | 20 +++++++++++++++ 9 files changed, 118 insertions(+), 9 deletions(-) diff --git a/be/src/format/csv/csv_reader.cpp b/be/src/format/csv/csv_reader.cpp index e0738efe954985..7bc340e7f21774 100644 --- a/be/src/format/csv/csv_reader.cpp +++ b/be/src/format/csv/csv_reader.cpp @@ -362,8 +362,10 @@ Status CsvReader::get_next_block(Block* block, size_t* read_rows, bool* eof) { continue; } if (size == 0) { - if (!_line_reader_eof && _state->is_read_csv_empty_line_as_null()) { - ++rows; + if (!_line_reader_eof) { + if (_empty_line_as_record() || _state->is_read_csv_empty_line_as_null()) { + ++rows; + } } // Read empty line, continue continue; @@ -400,8 +402,16 @@ Status CsvReader::get_next_block(Block* block, size_t* read_rows, bool* eof) { continue; } if (size == 0) { - if (!_line_reader_eof && _state->is_read_csv_empty_line_as_null()) { - RETURN_IF_ERROR(_fill_empty_line(columns, &rows)); + if (!_line_reader_eof) { + if (_empty_line_as_record()) { + Slice empty_line("", 0); + RETURN_IF_ERROR(_validate_line(empty_line, &success)); + if (success) { + RETURN_IF_ERROR(_fill_dest_columns(empty_line, columns, &rows)); + } + } else if (_state->is_read_csv_empty_line_as_null()) { + RETURN_IF_ERROR(_fill_empty_line(columns, &rows)); + } } // Read empty line, continue continue; diff --git a/be/src/format/csv/csv_reader.h b/be/src/format/csv/csv_reader.h index 25cbaba31a1c31..3e5579d71de96f 100644 --- a/be/src/format/csv/csv_reader.h +++ b/be/src/format/csv/csv_reader.h @@ -201,6 +201,7 @@ class CsvReader : public GenericReader { virtual Status _create_line_reader(); virtual Status _deserialize_one_cell(DataTypeSerDeSPtr serde, IColumn& column, Slice& slice); virtual Status _deserialize_nullable_string(IColumn& column, Slice& slice); + virtual bool _empty_line_as_record() const { return false; } // check the utf8 encoding of a line. // return error status to stop processing. // If return Status::OK but "success" is false, which means this is load request diff --git a/be/src/format/text/text_reader.cpp b/be/src/format/text/text_reader.cpp index e52da7f3249036..388cd56a0ddb90 100644 --- a/be/src/format/text/text_reader.cpp +++ b/be/src/format/text/text_reader.cpp @@ -164,6 +164,12 @@ Status TextReader::_validate_line(const Slice& line, bool* success) { return Status::OK(); } +bool TextReader::_empty_line_as_record() const { + // Hive TEXTFILE treats an empty physical line as a record. The splitter maps it + // to one empty field and missing trailing fields are filled with null_format. + return true; +} + Status TextReader::_deserialize_nullable_string(IColumn& column, Slice& slice) { auto& null_column = assert_cast(column); if (slice.compare(Slice(_options.null_format, _options.null_len)) == 0) { diff --git a/be/src/format/text/text_reader.h b/be/src/format/text/text_reader.h index 22073c130a8486..60b0fb2f8b544c 100644 --- a/be/src/format/text/text_reader.h +++ b/be/src/format/text/text_reader.h @@ -67,6 +67,7 @@ class TextReader : public CsvReader { Status _deserialize_one_cell(DataTypeSerDeSPtr serde, IColumn& column, Slice& slice) override; Status _validate_line(const Slice& line, bool* success) override; Status _deserialize_nullable_string(IColumn& column, Slice& slice) override; + bool _empty_line_as_record() const override; }; #include "common/compile_check_end.h" diff --git a/docker/thirdparties/docker-compose/hive/scripts/data/regression/crdmm_data/run.sh b/docker/thirdparties/docker-compose/hive/scripts/data/regression/crdmm_data/run.sh index f650ead89d7a2f..5197e8b92762bd 100755 --- a/docker/thirdparties/docker-compose/hive/scripts/data/regression/crdmm_data/run.sh +++ b/docker/thirdparties/docker-compose/hive/scripts/data/regression/crdmm_data/run.sh @@ -4,10 +4,9 @@ set -x CUR_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" &>/dev/null && pwd)" -hadoop fs -mkdir -p /user/doris/suites/regression/ -hadoop fs -put "${CUR_DIR}"/data/* /user/doris/suites/regression/ - # create table hive -f "${CUR_DIR}"/create_table.hql - +hadoop fs -rm -r -f /user/doris/suites/regression/crdmm_data || true +hadoop fs -mkdir -p /user/doris/suites/regression/crdmm_data +hadoop fs -put "${CUR_DIR}"/data/crdmm_data/* /user/doris/suites/regression/crdmm_data/ diff --git a/docker/thirdparties/docker-compose/hive/scripts/data/regression/serde_prop/run.sh b/docker/thirdparties/docker-compose/hive/scripts/data/regression/serde_prop/run.sh index ef6538563d5b58..c4f8e7c5d961e2 100755 --- a/docker/thirdparties/docker-compose/hive/scripts/data/regression/serde_prop/run.sh +++ b/docker/thirdparties/docker-compose/hive/scripts/data/regression/serde_prop/run.sh @@ -3,6 +3,31 @@ set -x CUR_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" &>/dev/null && pwd)" +SINGLE_COL_DATA_FILE="$(mktemp /tmp/test_single_col_null_format_text.XXXXXX)" +DEFAULT_MULTI_COL_DATA_FILE="$(mktemp /tmp/test_default_null_format_multi_col_text.XXXXXX)" +trap 'rm -f "${SINGLE_COL_DATA_FILE}" "${DEFAULT_MULTI_COL_DATA_FILE}"' EXIT +cat > "${SINGLE_COL_DATA_FILE}" <<'EOF' +null_value +null_value +non-null + +\N +EOF + +{ + printf 'a\tb\n' + printf '\n' + printf '\\N\t\\N\n' +} > "${DEFAULT_MULTI_COL_DATA_FILE}" + +hadoop fs -rm -r -f /user/doris/suites/regression/serde_prop/test_single_col_null_format_text || true +hadoop fs -mkdir -p /user/doris/suites/regression/serde_prop/test_single_col_null_format_text +hadoop fs -put "${SINGLE_COL_DATA_FILE}" /user/doris/suites/regression/serde_prop/test_single_col_null_format_text/part-00000 + +hadoop fs -rm -r -f /user/doris/suites/regression/serde_prop/test_default_null_format_multi_col_text || true +hadoop fs -mkdir -p /user/doris/suites/regression/serde_prop/test_default_null_format_multi_col_text +hadoop fs -put "${DEFAULT_MULTI_COL_DATA_FILE}" /user/doris/suites/regression/serde_prop/test_default_null_format_multi_col_text/part-00000 + # create table hive -f "${CUR_DIR}"/some_serde_table.hql diff --git a/docker/thirdparties/docker-compose/hive/scripts/data/regression/serde_prop/some_serde_table.hql b/docker/thirdparties/docker-compose/hive/scripts/data/regression/serde_prop/some_serde_table.hql index 81bdf03da8e6c4..02393bd5d3b877 100644 --- a/docker/thirdparties/docker-compose/hive/scripts/data/regression/serde_prop/some_serde_table.hql +++ b/docker/thirdparties/docker-compose/hive/scripts/data/regression/serde_prop/some_serde_table.hql @@ -199,4 +199,27 @@ STORED AS TEXTFILE; INSERT INTO TABLE test_empty_null_defined_text VALUES (1, 'Alice'), (2, NULL), - (3, ''); \ No newline at end of file + (3, ''); + +drop table if exists test_single_col_null_format_text; + +create external table test_single_col_null_format_text ( + name STRING +) +ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe' +WITH SERDEPROPERTIES ( + "serialization.null.format"="null_value" +) +STORED AS TEXTFILE +LOCATION '/user/doris/suites/regression/serde_prop/test_single_col_null_format_text'; + +drop table if exists test_default_null_format_multi_col_text; + +create external table test_default_null_format_multi_col_text ( + c1 STRING, + c2 STRING +) +ROW FORMAT DELIMITED +FIELDS TERMINATED BY '\t' +STORED AS TEXTFILE +LOCATION '/user/doris/suites/regression/serde_prop/test_default_null_format_multi_col_text'; diff --git a/regression-test/data/external_table_p0/hive/test_hive_serde_prop.out b/regression-test/data/external_table_p0/hive/test_hive_serde_prop.out index cda92c0519ad51..36866613260b8a 100644 --- a/regression-test/data/external_table_p0/hive/test_hive_serde_prop.out +++ b/regression-test/data/external_table_p0/hive/test_hive_serde_prop.out @@ -73,6 +73,18 @@ b 2.2 -- !test_empty_null_defined_text3 -- +-- !test_single_col_null_format_text_count -- +5 + +-- !test_single_col_null_format_text_values -- +5 3 2 1 1 1 + +-- !test_default_null_format_multi_col_text_count -- +3 + +-- !test_default_null_format_multi_col_text_values -- +3 2 1 1 1 2 0 1 + -- !1 -- a 1.1 b 2.2 @@ -147,3 +159,15 @@ b 2.2 -- !test_empty_null_defined_text3 -- +-- !test_single_col_null_format_text_count -- +5 + +-- !test_single_col_null_format_text_values -- +5 3 2 1 1 1 + +-- !test_default_null_format_multi_col_text_count -- +3 + +-- !test_default_null_format_multi_col_text_values -- +3 2 1 1 1 2 0 1 + diff --git a/regression-test/suites/external_table_p0/hive/test_hive_serde_prop.groovy b/regression-test/suites/external_table_p0/hive/test_hive_serde_prop.groovy index d4bb051214d724..9ca9ad3b6b45ee 100644 --- a/regression-test/suites/external_table_p0/hive/test_hive_serde_prop.groovy +++ b/regression-test/suites/external_table_p0/hive/test_hive_serde_prop.groovy @@ -63,6 +63,26 @@ suite("test_hive_serde_prop", "external_docker,hive,external_docker_hive,p0,exte qt_test_empty_null_defined_text """select * from ${catalog_name}.regression.test_empty_null_defined_text order by id;""" qt_test_empty_null_defined_text2 """select * from ${catalog_name}.regression.test_empty_null_defined_text where name is null order by id;""" qt_test_empty_null_defined_text3 """select * from ${catalog_name}.regression.test_empty_null_defined_text where name = '' order by id;""" + + qt_test_single_col_null_format_text_count """select count(*) from ${catalog_name}.regression.test_single_col_null_format_text;""" + qt_test_single_col_null_format_text_values """ + select count(*), count(name), count(case when name is null then 1 end), + count(case when name = '' then 1 end), + count(case when name = 'non-null' then 1 end), + count(case when name is not null and name not in ('', 'non-null') then 1 end) + from ${catalog_name}.regression.test_single_col_null_format_text; + """ + + qt_test_default_null_format_multi_col_text_count """select count(*) from ${catalog_name}.regression.test_default_null_format_multi_col_text;""" + qt_test_default_null_format_multi_col_text_values """ + select count(*), count(c1), count(c2), + count(case when c1 is null then 1 end), + count(case when c1 = '' then 1 end), + count(case when c2 is null then 1 end), + count(case when c2 = '' then 1 end), + count(case when c1 = 'a' and c2 = 'b' then 1 end) + from ${catalog_name}.regression.test_default_null_format_multi_col_text; + """ } }