diff --git a/be/src/format/csv/csv_reader.cpp b/be/src/format/csv/csv_reader.cpp index e0738efe954985..7bc340e7f21774 100644 --- a/be/src/format/csv/csv_reader.cpp +++ b/be/src/format/csv/csv_reader.cpp @@ -362,8 +362,10 @@ Status CsvReader::get_next_block(Block* block, size_t* read_rows, bool* eof) { continue; } if (size == 0) { - if (!_line_reader_eof && _state->is_read_csv_empty_line_as_null()) { - ++rows; + if (!_line_reader_eof) { + if (_empty_line_as_record() || _state->is_read_csv_empty_line_as_null()) { + ++rows; + } } // Read empty line, continue continue; @@ -400,8 +402,16 @@ Status CsvReader::get_next_block(Block* block, size_t* read_rows, bool* eof) { continue; } if (size == 0) { - if (!_line_reader_eof && _state->is_read_csv_empty_line_as_null()) { - RETURN_IF_ERROR(_fill_empty_line(columns, &rows)); + if (!_line_reader_eof) { + if (_empty_line_as_record()) { + Slice empty_line("", 0); + RETURN_IF_ERROR(_validate_line(empty_line, &success)); + if (success) { + RETURN_IF_ERROR(_fill_dest_columns(empty_line, columns, &rows)); + } + } else if (_state->is_read_csv_empty_line_as_null()) { + RETURN_IF_ERROR(_fill_empty_line(columns, &rows)); + } } // Read empty line, continue continue; diff --git a/be/src/format/csv/csv_reader.h b/be/src/format/csv/csv_reader.h index 25cbaba31a1c31..3e5579d71de96f 100644 --- a/be/src/format/csv/csv_reader.h +++ b/be/src/format/csv/csv_reader.h @@ -201,6 +201,7 @@ class CsvReader : public GenericReader { virtual Status _create_line_reader(); virtual Status _deserialize_one_cell(DataTypeSerDeSPtr serde, IColumn& column, Slice& slice); virtual Status _deserialize_nullable_string(IColumn& column, Slice& slice); + virtual bool _empty_line_as_record() const { return false; } // check the utf8 encoding of a line. // return error status to stop processing. // If return Status::OK but "success" is false, which means this is load request diff --git a/be/src/format/text/text_reader.cpp b/be/src/format/text/text_reader.cpp index e52da7f3249036..388cd56a0ddb90 100644 --- a/be/src/format/text/text_reader.cpp +++ b/be/src/format/text/text_reader.cpp @@ -164,6 +164,12 @@ Status TextReader::_validate_line(const Slice& line, bool* success) { return Status::OK(); } +bool TextReader::_empty_line_as_record() const { + // Hive TEXTFILE treats an empty physical line as a record. The splitter maps it + // to one empty field and missing trailing fields are filled with null_format. + return true; +} + Status TextReader::_deserialize_nullable_string(IColumn& column, Slice& slice) { auto& null_column = assert_cast(column); if (slice.compare(Slice(_options.null_format, _options.null_len)) == 0) { diff --git a/be/src/format/text/text_reader.h b/be/src/format/text/text_reader.h index 22073c130a8486..60b0fb2f8b544c 100644 --- a/be/src/format/text/text_reader.h +++ b/be/src/format/text/text_reader.h @@ -67,6 +67,7 @@ class TextReader : public CsvReader { Status _deserialize_one_cell(DataTypeSerDeSPtr serde, IColumn& column, Slice& slice) override; Status _validate_line(const Slice& line, bool* success) override; Status _deserialize_nullable_string(IColumn& column, Slice& slice) override; + bool _empty_line_as_record() const override; }; #include "common/compile_check_end.h" diff --git a/docker/thirdparties/docker-compose/hive/scripts/data/regression/crdmm_data/run.sh b/docker/thirdparties/docker-compose/hive/scripts/data/regression/crdmm_data/run.sh index f650ead89d7a2f..5197e8b92762bd 100755 --- a/docker/thirdparties/docker-compose/hive/scripts/data/regression/crdmm_data/run.sh +++ b/docker/thirdparties/docker-compose/hive/scripts/data/regression/crdmm_data/run.sh @@ -4,10 +4,9 @@ set -x CUR_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" &>/dev/null && pwd)" -hadoop fs -mkdir -p /user/doris/suites/regression/ -hadoop fs -put "${CUR_DIR}"/data/* /user/doris/suites/regression/ - # create table hive -f "${CUR_DIR}"/create_table.hql - +hadoop fs -rm -r -f /user/doris/suites/regression/crdmm_data || true +hadoop fs -mkdir -p /user/doris/suites/regression/crdmm_data +hadoop fs -put "${CUR_DIR}"/data/crdmm_data/* /user/doris/suites/regression/crdmm_data/ diff --git a/docker/thirdparties/docker-compose/hive/scripts/data/regression/serde_prop/run.sh b/docker/thirdparties/docker-compose/hive/scripts/data/regression/serde_prop/run.sh index ef6538563d5b58..c4f8e7c5d961e2 100755 --- a/docker/thirdparties/docker-compose/hive/scripts/data/regression/serde_prop/run.sh +++ b/docker/thirdparties/docker-compose/hive/scripts/data/regression/serde_prop/run.sh @@ -3,6 +3,31 @@ set -x CUR_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" &>/dev/null && pwd)" +SINGLE_COL_DATA_FILE="$(mktemp /tmp/test_single_col_null_format_text.XXXXXX)" +DEFAULT_MULTI_COL_DATA_FILE="$(mktemp /tmp/test_default_null_format_multi_col_text.XXXXXX)" +trap 'rm -f "${SINGLE_COL_DATA_FILE}" "${DEFAULT_MULTI_COL_DATA_FILE}"' EXIT +cat > "${SINGLE_COL_DATA_FILE}" <<'EOF' +null_value +null_value +non-null + +\N +EOF + +{ + printf 'a\tb\n' + printf '\n' + printf '\\N\t\\N\n' +} > "${DEFAULT_MULTI_COL_DATA_FILE}" + +hadoop fs -rm -r -f /user/doris/suites/regression/serde_prop/test_single_col_null_format_text || true +hadoop fs -mkdir -p /user/doris/suites/regression/serde_prop/test_single_col_null_format_text +hadoop fs -put "${SINGLE_COL_DATA_FILE}" /user/doris/suites/regression/serde_prop/test_single_col_null_format_text/part-00000 + +hadoop fs -rm -r -f /user/doris/suites/regression/serde_prop/test_default_null_format_multi_col_text || true +hadoop fs -mkdir -p /user/doris/suites/regression/serde_prop/test_default_null_format_multi_col_text +hadoop fs -put "${DEFAULT_MULTI_COL_DATA_FILE}" /user/doris/suites/regression/serde_prop/test_default_null_format_multi_col_text/part-00000 + # create table hive -f "${CUR_DIR}"/some_serde_table.hql diff --git a/docker/thirdparties/docker-compose/hive/scripts/data/regression/serde_prop/some_serde_table.hql b/docker/thirdparties/docker-compose/hive/scripts/data/regression/serde_prop/some_serde_table.hql index 81bdf03da8e6c4..02393bd5d3b877 100644 --- a/docker/thirdparties/docker-compose/hive/scripts/data/regression/serde_prop/some_serde_table.hql +++ b/docker/thirdparties/docker-compose/hive/scripts/data/regression/serde_prop/some_serde_table.hql @@ -199,4 +199,27 @@ STORED AS TEXTFILE; INSERT INTO TABLE test_empty_null_defined_text VALUES (1, 'Alice'), (2, NULL), - (3, ''); \ No newline at end of file + (3, ''); + +drop table if exists test_single_col_null_format_text; + +create external table test_single_col_null_format_text ( + name STRING +) +ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe' +WITH SERDEPROPERTIES ( + "serialization.null.format"="null_value" +) +STORED AS TEXTFILE +LOCATION '/user/doris/suites/regression/serde_prop/test_single_col_null_format_text'; + +drop table if exists test_default_null_format_multi_col_text; + +create external table test_default_null_format_multi_col_text ( + c1 STRING, + c2 STRING +) +ROW FORMAT DELIMITED +FIELDS TERMINATED BY '\t' +STORED AS TEXTFILE +LOCATION '/user/doris/suites/regression/serde_prop/test_default_null_format_multi_col_text'; diff --git a/regression-test/data/external_table_p0/hive/test_hive_serde_prop.out b/regression-test/data/external_table_p0/hive/test_hive_serde_prop.out index cda92c0519ad51..36866613260b8a 100644 --- a/regression-test/data/external_table_p0/hive/test_hive_serde_prop.out +++ b/regression-test/data/external_table_p0/hive/test_hive_serde_prop.out @@ -73,6 +73,18 @@ b 2.2 -- !test_empty_null_defined_text3 -- +-- !test_single_col_null_format_text_count -- +5 + +-- !test_single_col_null_format_text_values -- +5 3 2 1 1 1 + +-- !test_default_null_format_multi_col_text_count -- +3 + +-- !test_default_null_format_multi_col_text_values -- +3 2 1 1 1 2 0 1 + -- !1 -- a 1.1 b 2.2 @@ -147,3 +159,15 @@ b 2.2 -- !test_empty_null_defined_text3 -- +-- !test_single_col_null_format_text_count -- +5 + +-- !test_single_col_null_format_text_values -- +5 3 2 1 1 1 + +-- !test_default_null_format_multi_col_text_count -- +3 + +-- !test_default_null_format_multi_col_text_values -- +3 2 1 1 1 2 0 1 + diff --git a/regression-test/suites/external_table_p0/hive/test_hive_serde_prop.groovy b/regression-test/suites/external_table_p0/hive/test_hive_serde_prop.groovy index d4bb051214d724..9ca9ad3b6b45ee 100644 --- a/regression-test/suites/external_table_p0/hive/test_hive_serde_prop.groovy +++ b/regression-test/suites/external_table_p0/hive/test_hive_serde_prop.groovy @@ -63,6 +63,26 @@ suite("test_hive_serde_prop", "external_docker,hive,external_docker_hive,p0,exte qt_test_empty_null_defined_text """select * from ${catalog_name}.regression.test_empty_null_defined_text order by id;""" qt_test_empty_null_defined_text2 """select * from ${catalog_name}.regression.test_empty_null_defined_text where name is null order by id;""" qt_test_empty_null_defined_text3 """select * from ${catalog_name}.regression.test_empty_null_defined_text where name = '' order by id;""" + + qt_test_single_col_null_format_text_count """select count(*) from ${catalog_name}.regression.test_single_col_null_format_text;""" + qt_test_single_col_null_format_text_values """ + select count(*), count(name), count(case when name is null then 1 end), + count(case when name = '' then 1 end), + count(case when name = 'non-null' then 1 end), + count(case when name is not null and name not in ('', 'non-null') then 1 end) + from ${catalog_name}.regression.test_single_col_null_format_text; + """ + + qt_test_default_null_format_multi_col_text_count """select count(*) from ${catalog_name}.regression.test_default_null_format_multi_col_text;""" + qt_test_default_null_format_multi_col_text_values """ + select count(*), count(c1), count(c2), + count(case when c1 is null then 1 end), + count(case when c1 = '' then 1 end), + count(case when c2 is null then 1 end), + count(case when c2 = '' then 1 end), + count(case when c1 = 'a' and c2 = 'b' then 1 end) + from ${catalog_name}.regression.test_default_null_format_multi_col_text; + """ } }