diff --git a/be/src/format/csv/csv_reader.cpp b/be/src/format/csv/csv_reader.cpp index c4837d65fb3288..3d1e978ffe911f 100644 --- a/be/src/format/csv/csv_reader.cpp +++ b/be/src/format/csv/csv_reader.cpp @@ -436,8 +436,10 @@ Status CsvReader::_do_get_next_block(Block* block, size_t* read_rows, bool* eof) continue; } if (size == 0) { - if (!_line_reader_eof && _state->is_read_csv_empty_line_as_null()) { - ++rows; + if (!_line_reader_eof) { + if (_empty_line_as_record() || _state->is_read_csv_empty_line_as_null()) { + ++rows; + } } // Read empty line, continue continue; @@ -475,8 +477,16 @@ Status CsvReader::_do_get_next_block(Block* block, size_t* read_rows, bool* eof) continue; } if (size == 0) { - if (!_line_reader_eof && _state->is_read_csv_empty_line_as_null()) { - RETURN_IF_ERROR(_fill_empty_line(columns, &rows)); + if (!_line_reader_eof) { + if (_empty_line_as_record()) { + Slice empty_line("", 0); + RETURN_IF_ERROR(_validate_line(empty_line, &success)); + if (success) { + RETURN_IF_ERROR(_fill_dest_columns(empty_line, columns, &rows)); + } + } else if (_state->is_read_csv_empty_line_as_null()) { + RETURN_IF_ERROR(_fill_empty_line(columns, &rows)); + } } // Read empty line, continue continue; diff --git a/be/src/format/csv/csv_reader.h b/be/src/format/csv/csv_reader.h index 80938abd271231..46b8ffd6718989 100644 --- a/be/src/format/csv/csv_reader.h +++ b/be/src/format/csv/csv_reader.h @@ -207,6 +207,7 @@ class CsvReader : public TableFormatReader { virtual Status _create_line_reader(); virtual Status _deserialize_one_cell(DataTypeSerDeSPtr serde, IColumn& column, Slice& slice); virtual Status _deserialize_nullable_string(IColumn& column, Slice& slice); + virtual bool _empty_line_as_record() const { return false; } // check the utf8 encoding of a line. // return error status to stop processing. // If return Status::OK but "success" is false, which means this is load request diff --git a/be/src/format/text/text_reader.cpp b/be/src/format/text/text_reader.cpp index c118c21adda9d9..23501f94cd6122 100644 --- a/be/src/format/text/text_reader.cpp +++ b/be/src/format/text/text_reader.cpp @@ -168,6 +168,12 @@ Status TextReader::_validate_line(const Slice& line, bool* success) { return Status::OK(); } +bool TextReader::_empty_line_as_record() const { + // Hive TEXTFILE treats an empty physical line as a record. The splitter maps it + // to one empty field and missing trailing fields are filled with null_format. + return true; +} + Status TextReader::_deserialize_nullable_string(IColumn& column, Slice& slice) { // Hot path of hive text load, see CsvReader::_deserialize_nullable_string. The // column type was verified by the checked assert_cast in diff --git a/be/src/format/text/text_reader.h b/be/src/format/text/text_reader.h index c0cebf5da77ffd..dff4159208d1ed 100644 --- a/be/src/format/text/text_reader.h +++ b/be/src/format/text/text_reader.h @@ -67,6 +67,7 @@ class TextReader : public CsvReader { Status _deserialize_one_cell(DataTypeSerDeSPtr serde, IColumn& column, Slice& slice) override; Status _validate_line(const Slice& line, bool* success) override; Status _deserialize_nullable_string(IColumn& column, Slice& slice) override; + bool _empty_line_as_record() const override; }; } // namespace doris diff --git a/docker/thirdparties/docker-compose/hive/scripts/data/regression/crdmm_data/run.sh b/docker/thirdparties/docker-compose/hive/scripts/data/regression/crdmm_data/run.sh index f650ead89d7a2f..5197e8b92762bd 100755 --- a/docker/thirdparties/docker-compose/hive/scripts/data/regression/crdmm_data/run.sh +++ b/docker/thirdparties/docker-compose/hive/scripts/data/regression/crdmm_data/run.sh @@ -4,10 +4,9 @@ set -x CUR_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" &>/dev/null && pwd)" -hadoop fs -mkdir -p /user/doris/suites/regression/ -hadoop fs -put "${CUR_DIR}"/data/* /user/doris/suites/regression/ - # create table hive -f "${CUR_DIR}"/create_table.hql - +hadoop fs -rm -r -f /user/doris/suites/regression/crdmm_data || true +hadoop fs -mkdir -p /user/doris/suites/regression/crdmm_data +hadoop fs -put "${CUR_DIR}"/data/crdmm_data/* /user/doris/suites/regression/crdmm_data/ diff --git a/docker/thirdparties/docker-compose/hive/scripts/data/regression/serde_prop/run.sh b/docker/thirdparties/docker-compose/hive/scripts/data/regression/serde_prop/run.sh index ef6538563d5b58..c4f8e7c5d961e2 100755 --- a/docker/thirdparties/docker-compose/hive/scripts/data/regression/serde_prop/run.sh +++ b/docker/thirdparties/docker-compose/hive/scripts/data/regression/serde_prop/run.sh @@ -3,6 +3,31 @@ set -x CUR_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" &>/dev/null && pwd)" +SINGLE_COL_DATA_FILE="$(mktemp /tmp/test_single_col_null_format_text.XXXXXX)" +DEFAULT_MULTI_COL_DATA_FILE="$(mktemp /tmp/test_default_null_format_multi_col_text.XXXXXX)" +trap 'rm -f "${SINGLE_COL_DATA_FILE}" "${DEFAULT_MULTI_COL_DATA_FILE}"' EXIT +cat > "${SINGLE_COL_DATA_FILE}" <<'EOF' +null_value +null_value +non-null + +\N +EOF + +{ + printf 'a\tb\n' + printf '\n' + printf '\\N\t\\N\n' +} > "${DEFAULT_MULTI_COL_DATA_FILE}" + +hadoop fs -rm -r -f /user/doris/suites/regression/serde_prop/test_single_col_null_format_text || true +hadoop fs -mkdir -p /user/doris/suites/regression/serde_prop/test_single_col_null_format_text +hadoop fs -put "${SINGLE_COL_DATA_FILE}" /user/doris/suites/regression/serde_prop/test_single_col_null_format_text/part-00000 + +hadoop fs -rm -r -f /user/doris/suites/regression/serde_prop/test_default_null_format_multi_col_text || true +hadoop fs -mkdir -p /user/doris/suites/regression/serde_prop/test_default_null_format_multi_col_text +hadoop fs -put "${DEFAULT_MULTI_COL_DATA_FILE}" /user/doris/suites/regression/serde_prop/test_default_null_format_multi_col_text/part-00000 + # create table hive -f "${CUR_DIR}"/some_serde_table.hql diff --git a/docker/thirdparties/docker-compose/hive/scripts/data/regression/serde_prop/some_serde_table.hql b/docker/thirdparties/docker-compose/hive/scripts/data/regression/serde_prop/some_serde_table.hql index df03f36a8dae22..4625f0cbb35a73 100644 --- a/docker/thirdparties/docker-compose/hive/scripts/data/regression/serde_prop/some_serde_table.hql +++ b/docker/thirdparties/docker-compose/hive/scripts/data/regression/serde_prop/some_serde_table.hql @@ -226,4 +226,27 @@ STORED AS TEXTFILE; INSERT INTO TABLE test_empty_null_defined_text VALUES (1, 'Alice'), (2, NULL), - (3, ''); \ No newline at end of file + (3, ''); + +drop table if exists test_single_col_null_format_text; + +create external table test_single_col_null_format_text ( + name STRING +) +ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe' +WITH SERDEPROPERTIES ( + "serialization.null.format"="null_value" +) +STORED AS TEXTFILE +LOCATION '/user/doris/suites/regression/serde_prop/test_single_col_null_format_text'; + +drop table if exists test_default_null_format_multi_col_text; + +create external table test_default_null_format_multi_col_text ( + c1 STRING, + c2 STRING +) +ROW FORMAT DELIMITED +FIELDS TERMINATED BY '\t' +STORED AS TEXTFILE +LOCATION '/user/doris/suites/regression/serde_prop/test_default_null_format_multi_col_text'; diff --git a/regression-test/data/external_table_p0/hive/test_hive_serde_prop.out b/regression-test/data/external_table_p0/hive/test_hive_serde_prop.out index cda92c0519ad51..36866613260b8a 100644 --- a/regression-test/data/external_table_p0/hive/test_hive_serde_prop.out +++ b/regression-test/data/external_table_p0/hive/test_hive_serde_prop.out @@ -73,6 +73,18 @@ b 2.2 -- !test_empty_null_defined_text3 -- +-- !test_single_col_null_format_text_count -- +5 + +-- !test_single_col_null_format_text_values -- +5 3 2 1 1 1 + +-- !test_default_null_format_multi_col_text_count -- +3 + +-- !test_default_null_format_multi_col_text_values -- +3 2 1 1 1 2 0 1 + -- !1 -- a 1.1 b 2.2 @@ -147,3 +159,15 @@ b 2.2 -- !test_empty_null_defined_text3 -- +-- !test_single_col_null_format_text_count -- +5 + +-- !test_single_col_null_format_text_values -- +5 3 2 1 1 1 + +-- !test_default_null_format_multi_col_text_count -- +3 + +-- !test_default_null_format_multi_col_text_values -- +3 2 1 1 1 2 0 1 + diff --git a/regression-test/suites/external_table_p0/hive/test_hive_serde_prop.groovy b/regression-test/suites/external_table_p0/hive/test_hive_serde_prop.groovy index 70306968852bb1..24efc34f4485cf 100644 --- a/regression-test/suites/external_table_p0/hive/test_hive_serde_prop.groovy +++ b/regression-test/suites/external_table_p0/hive/test_hive_serde_prop.groovy @@ -63,6 +63,26 @@ suite("test_hive_serde_prop", "p0,external") { qt_test_empty_null_defined_text """select * from ${catalog_name}.regression.test_empty_null_defined_text order by id;""" qt_test_empty_null_defined_text2 """select * from ${catalog_name}.regression.test_empty_null_defined_text where name is null order by id;""" qt_test_empty_null_defined_text3 """select * from ${catalog_name}.regression.test_empty_null_defined_text where name = '' order by id;""" + + qt_test_single_col_null_format_text_count """select count(*) from ${catalog_name}.regression.test_single_col_null_format_text;""" + qt_test_single_col_null_format_text_values """ + select count(*), count(name), count(case when name is null then 1 end), + count(case when name = '' then 1 end), + count(case when name = 'non-null' then 1 end), + count(case when name is not null and name not in ('', 'non-null') then 1 end) + from ${catalog_name}.regression.test_single_col_null_format_text; + """ + + qt_test_default_null_format_multi_col_text_count """select count(*) from ${catalog_name}.regression.test_default_null_format_multi_col_text;""" + qt_test_default_null_format_multi_col_text_values """ + select count(*), count(c1), count(c2), + count(case when c1 is null then 1 end), + count(case when c1 = '' then 1 end), + count(case when c2 is null then 1 end), + count(case when c2 = '' then 1 end), + count(case when c1 = 'a' and c2 = 'b' then 1 end) + from ${catalog_name}.regression.test_default_null_format_multi_col_text; + """ } }