Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 14 additions & 4 deletions be/src/format/csv/csv_reader.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -436,8 +436,10 @@ Status CsvReader::_do_get_next_block(Block* block, size_t* read_rows, bool* eof)
continue;
}
if (size == 0) {
if (!_line_reader_eof && _state->is_read_csv_empty_line_as_null()) {
++rows;
if (!_line_reader_eof) {
if (_empty_line_as_record() || _state->is_read_csv_empty_line_as_null()) {
++rows;
}
}
// Read empty line, continue
continue;
Expand Down Expand Up @@ -475,8 +477,16 @@ Status CsvReader::_do_get_next_block(Block* block, size_t* read_rows, bool* eof)
continue;
}
if (size == 0) {
if (!_line_reader_eof && _state->is_read_csv_empty_line_as_null()) {
RETURN_IF_ERROR(_fill_empty_line(columns, &rows));
if (!_line_reader_eof) {
if (_empty_line_as_record()) {
Slice empty_line("", 0);
RETURN_IF_ERROR(_validate_line(empty_line, &success));
if (success) {
RETURN_IF_ERROR(_fill_dest_columns(empty_line, columns, &rows));
}
} else if (_state->is_read_csv_empty_line_as_null()) {
RETURN_IF_ERROR(_fill_empty_line(columns, &rows));
}
}
// Read empty line, continue
continue;
Expand Down
1 change: 1 addition & 0 deletions be/src/format/csv/csv_reader.h
Original file line number Diff line number Diff line change
Expand Up @@ -207,6 +207,7 @@ class CsvReader : public TableFormatReader {
virtual Status _create_line_reader();
virtual Status _deserialize_one_cell(DataTypeSerDeSPtr serde, IColumn& column, Slice& slice);
virtual Status _deserialize_nullable_string(IColumn& column, Slice& slice);
// Whether an empty physical line (size == 0 before the line reader reaches EOF)
// becomes a record of its own in _do_get_next_block. The base CSV reader answers
// false, so an empty line only produces a row when
// _state->is_read_csv_empty_line_as_null() is set. TextReader overrides this to
// return true.
virtual bool _empty_line_as_record() const { return false; }
// check the utf8 encoding of a line.
// return error status to stop processing.
// If return Status::OK but "success" is false, which means this is load request
Expand Down
6 changes: 6 additions & 0 deletions be/src/format/text/text_reader.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -168,6 +168,12 @@ Status TextReader::_validate_line(const Slice& line, bool* success) {
return Status::OK();
}

// Opts the text reader into treating an empty line as a record. With this set,
// CsvReader::_do_get_next_block counts the empty line as a row and, when columns
// are materialized, validates an empty slice and fills the destination columns
// from it instead of consulting the empty-line-as-null setting.
bool TextReader::_empty_line_as_record() const {
// Hive TEXTFILE treats an empty physical line as a record. The splitter maps it
// to one empty field and missing trailing fields are filled with null_format.
return true;
}

Status TextReader::_deserialize_nullable_string(IColumn& column, Slice& slice) {
// Hot path of hive text load, see CsvReader::_deserialize_nullable_string. The
// column type was verified by the checked assert_cast in
Expand Down
1 change: 1 addition & 0 deletions be/src/format/text/text_reader.h
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,7 @@ class TextReader : public CsvReader {
Status _deserialize_one_cell(DataTypeSerDeSPtr serde, IColumn& column, Slice& slice) override;
Status _validate_line(const Slice& line, bool* success) override;
Status _deserialize_nullable_string(IColumn& column, Slice& slice) override;
bool _empty_line_as_record() const override;
};

} // namespace doris
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,9 @@ set -x
CUR_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" &>/dev/null && pwd)"


hadoop fs -mkdir -p /user/doris/suites/regression/
hadoop fs -put "${CUR_DIR}"/data/* /user/doris/suites/regression/

# create table
hive -f "${CUR_DIR}"/create_table.hql


hadoop fs -rm -r -f /user/doris/suites/regression/crdmm_data || true
hadoop fs -mkdir -p /user/doris/suites/regression/crdmm_data
hadoop fs -put "${CUR_DIR}"/data/crdmm_data/* /user/doris/suites/regression/crdmm_data/
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,31 @@ set -x

CUR_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" &>/dev/null && pwd)"

# Local scratch files that hold the generated fixtures; the EXIT trap removes
# them once the script finishes, whether or not the uploads succeeded.
SINGLE_COL_DATA_FILE="$(mktemp /tmp/test_single_col_null_format_text.XXXXXX)"
DEFAULT_MULTI_COL_DATA_FILE="$(mktemp /tmp/test_default_null_format_multi_col_text.XXXXXX)"
trap 'rm -f "${SINGLE_COL_DATA_FILE}" "${DEFAULT_MULTI_COL_DATA_FILE}"' EXIT
# Single-column fixture for test_single_col_null_format_text, whose table declares
# serialization.null.format as null_value. Five lines: two null markers, one
# ordinary value, one empty line, and a literal \N. The heredoc delimiter is quoted
# so the backslash sequence is written verbatim.
cat > "${SINGLE_COL_DATA_FILE}" <<'EOF'
null_value
null_value
non-null

\N
EOF

# Two-column, tab-separated fixture for test_default_null_format_multi_col_text.
# Three lines: a regular row (a, b), an empty line, and a row holding \N in both
# columns. printf is used so the tab and the bare newline are emitted exactly.
{
printf 'a\tb\n'
printf '\n'
printf '\\N\t\\N\n'
} > "${DEFAULT_MULTI_COL_DATA_FILE}"

# Recreate each table location from scratch before uploading, so a rerun does not
# pick up files left behind by an earlier run. A failed removal is tolerated.
hadoop fs -rm -r -f /user/doris/suites/regression/serde_prop/test_single_col_null_format_text || true
hadoop fs -mkdir -p /user/doris/suites/regression/serde_prop/test_single_col_null_format_text
hadoop fs -put "${SINGLE_COL_DATA_FILE}" /user/doris/suites/regression/serde_prop/test_single_col_null_format_text/part-00000

hadoop fs -rm -r -f /user/doris/suites/regression/serde_prop/test_default_null_format_multi_col_text || true
hadoop fs -mkdir -p /user/doris/suites/regression/serde_prop/test_default_null_format_multi_col_text
hadoop fs -put "${DEFAULT_MULTI_COL_DATA_FILE}" /user/doris/suites/regression/serde_prop/test_default_null_format_multi_col_text/part-00000

# create table
hive -f "${CUR_DIR}"/some_serde_table.hql

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -226,4 +226,27 @@ STORED AS TEXTFILE;
INSERT INTO TABLE test_empty_null_defined_text VALUES
(1, 'Alice'),
(2, NULL),
(3, '');
(3, '');

-- External single-column text table read through LazySimpleSerDe with a custom
-- null marker: serialization.null.format is set to null_value. The data file is
-- uploaded to the LOCATION below by the accompanying setup script.
drop table if exists test_single_col_null_format_text;

create external table test_single_col_null_format_text (
name STRING
)
ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe'
WITH SERDEPROPERTIES (
"serialization.null.format"="null_value"
)
STORED AS TEXTFILE
LOCATION '/user/doris/suites/regression/serde_prop/test_single_col_null_format_text';

-- External two-column, tab-delimited text table that does not override
-- serialization.null.format. The data file is uploaded to the LOCATION below by
-- the accompanying setup script.
drop table if exists test_default_null_format_multi_col_text;

create external table test_default_null_format_multi_col_text (
c1 STRING,
c2 STRING
)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY '\t'
STORED AS TEXTFILE
LOCATION '/user/doris/suites/regression/serde_prop/test_default_null_format_multi_col_text';
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,18 @@ b 2.2

-- !test_empty_null_defined_text3 --

-- !test_single_col_null_format_text_count --
5

-- !test_single_col_null_format_text_values --
5 3 2 1 1 1

-- !test_default_null_format_multi_col_text_count --
3

-- !test_default_null_format_multi_col_text_values --
3 2 1 1 1 2 0 1

-- !1 --
a 1.1
b 2.2
Expand Down Expand Up @@ -147,3 +159,15 @@ b 2.2

-- !test_empty_null_defined_text3 --

-- !test_single_col_null_format_text_count --
5

-- !test_single_col_null_format_text_values --
5 3 2 1 1 1

-- !test_default_null_format_multi_col_text_count --
3

-- !test_default_null_format_multi_col_text_values --
3 2 1 1 1 2 0 1

Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,26 @@ suite("test_hive_serde_prop", "p0,external") {
qt_test_empty_null_defined_text """select * from ${catalog_name}.regression.test_empty_null_defined_text order by id;"""
qt_test_empty_null_defined_text2 """select * from ${catalog_name}.regression.test_empty_null_defined_text where name is null order by id;"""
qt_test_empty_null_defined_text3 """select * from ${catalog_name}.regression.test_empty_null_defined_text where name = '' order by id;"""

// Single-column table whose null format is null_value. The first query checks the
// row count on its own. The second returns, in order: total rows, non-null names,
// NULL names, empty-string names, names equal to 'non-null', and any remaining
// non-null value.
qt_test_single_col_null_format_text_count """select count(*) from ${catalog_name}.regression.test_single_col_null_format_text;"""
qt_test_single_col_null_format_text_values """
select count(*), count(name), count(case when name is null then 1 end),
count(case when name = '' then 1 end),
count(case when name = 'non-null' then 1 end),
count(case when name is not null and name not in ('', 'non-null') then 1 end)
from ${catalog_name}.regression.test_single_col_null_format_text;
"""

// Two-column tab-delimited table without a null format override. The first query
// checks the row count on its own. The second returns, in order: total rows,
// non-null c1, non-null c2, NULL c1, empty-string c1, NULL c2, empty-string c2,
// and rows where (c1, c2) is ('a', 'b').
qt_test_default_null_format_multi_col_text_count """select count(*) from ${catalog_name}.regression.test_default_null_format_multi_col_text;"""
qt_test_default_null_format_multi_col_text_values """
select count(*), count(c1), count(c2),
count(case when c1 is null then 1 end),
count(case when c1 = '' then 1 end),
count(case when c2 is null then 1 end),
count(case when c2 = '' then 1 end),
count(case when c1 = 'a' and c2 = 'b' then 1 end)
from ${catalog_name}.regression.test_default_null_format_multi_col_text;
"""
}
}

Loading