From 295d45da4e8700b230cac6e0aa49592dc4b6ab89 Mon Sep 17 00:00:00 2001 From: Tiewei Fang Date: Mon, 7 Apr 2025 15:12:34 +0800 Subject: [PATCH] [Fix](Serde) Support hive compatible output format (#49036) Problem Summary: The output format of complex data types are different between Hive and Doris, such as array, map and struct. When user migrate from Hive to Doris, they expect the same format so that they don't need to modify their business code. This PR mainly changes: Add a new option to session variable `serde_dialect`: If set to hive, the output format returned to MySQL client of some datatypes will be changed: Array Doris: ["abc", "def", "", null, 1] Hive: ["abc","def","",null,true] Map Doris: {"k1":null, "k2":"v3"} Hive: {"k1":null,"k2":"v3"} Struct Doris: {"s_id":100, "s_name":"abc , "", "s_address":null} Hive: {"s_id":100,"s_name":"abc ,"","s_address":null} Related #37039 --- .../serde/data_type_array_serde.cpp | 5 +- .../data_types/serde/data_type_map_serde.cpp | 7 +- .../serde/data_type_number_serde.cpp | 9 +- be/src/vec/data_types/serde/data_type_serde.h | 20 ++++ .../serde/data_type_struct_serde.cpp | 5 +- be/src/vec/sink/vmysql_result_writer.cpp | 16 +++ .../apache/doris/nereids/NereidsPlanner.java | 1 + .../org/apache/doris/qe/SessionVariable.java | 10 +- gensrc/thrift/PaloInternalService.thrift | 3 +- .../serde/test_serde_dialect_hive.out | 7 ++ .../serde/test_serde_dialect_hive.groovy | 107 ++++++++++++++++++ 11 files changed, 182 insertions(+), 8 deletions(-) create mode 100644 regression-test/data/datatype_p0/serde/test_serde_dialect_hive.out create mode 100644 regression-test/suites/datatype_p0/serde/test_serde_dialect_hive.groovy diff --git a/be/src/vec/data_types/serde/data_type_array_serde.cpp b/be/src/vec/data_types/serde/data_type_array_serde.cpp index 872dd84d8c7355..e5fc7461e45648 100644 --- a/be/src/vec/data_types/serde/data_type_array_serde.cpp +++ b/be/src/vec/data_types/serde/data_type_array_serde.cpp @@ -336,7 +336,8 @@ Status DataTypeArraySerDe::_write_column_to_mysql(const IColumn& column, const auto end_arr_element = offsets[row_idx_of_col_arr]; for (int j = begin_arr_element; j < end_arr_element; ++j) { if (j != begin_arr_element) { - if (0 != result.push_string(", ", 2)) { + if (0 != result.push_string(options.mysql_collection_delim.c_str(), + options.mysql_collection_delim.size())) { return Status::InternalError("pack mysql buffer failed."); } } @@ -345,6 +346,7 @@ Status DataTypeArraySerDe::_write_column_to_mysql(const IColumn& column, return Status::InternalError("pack mysql buffer failed."); } } else { + ++options.level; if (is_nested_string && options.wrapper_len > 0) { if (0 != result.push_string(options.nested_string_wrapper, options.wrapper_len)) { return Status::InternalError("pack mysql buffer failed."); @@ -358,6 +360,7 @@ Status DataTypeArraySerDe::_write_column_to_mysql(const IColumn& column, RETURN_IF_ERROR( nested_serde->write_column_to_mysql(data, result, j, false, options)); } + --options.level; } } if (0 != result.push_string("]", 1)) { diff --git a/be/src/vec/data_types/serde/data_type_map_serde.cpp b/be/src/vec/data_types/serde/data_type_map_serde.cpp index 2140885942d1d9..bf018ce3a80fcc 100644 --- a/be/src/vec/data_types/serde/data_type_map_serde.cpp +++ b/be/src/vec/data_types/serde/data_type_map_serde.cpp @@ -418,7 +418,8 @@ Status DataTypeMapSerDe::_write_column_to_mysql(const IColumn& column, auto& offsets = map_column.get_offsets(); for (auto j = offsets[col_index - 1]; j < offsets[col_index]; ++j) { if (j != offsets[col_index - 1]) { - if (0 != result.push_string(", ", 2)) { + if (0 != result.push_string(options.mysql_collection_delim.c_str(), + options.mysql_collection_delim.size())) { return Status::InternalError("pack mysql buffer failed."); } } @@ -427,6 +428,7 @@ Status DataTypeMapSerDe::_write_column_to_mysql(const IColumn& column, return Status::InternalError("pack mysql buffer failed."); } } else { + ++options.level; if (is_key_string && options.wrapper_len > 0) { if (0 != result.push_string(options.nested_string_wrapper, options.wrapper_len)) { return Status::InternalError("pack mysql buffer failed."); @@ -440,6 +442,7 @@ Status DataTypeMapSerDe::_write_column_to_mysql(const IColumn& column, RETURN_IF_ERROR(key_serde->write_column_to_mysql(nested_keys_column, result, j, false, options)); } + --options.level; } if (0 != result.push_string(&options.map_key_delim, 1)) { return Status::InternalError("pack mysql buffer failed."); @@ -449,6 +452,7 @@ Status DataTypeMapSerDe::_write_column_to_mysql(const IColumn& column, return Status::InternalError("pack mysql buffer failed."); } } else { + ++options.level; if (is_val_string && options.wrapper_len > 0) { if (0 != result.push_string(options.nested_string_wrapper, options.wrapper_len)) { return Status::InternalError("pack mysql buffer failed."); @@ -462,6 +466,7 @@ Status DataTypeMapSerDe::_write_column_to_mysql(const IColumn& column, RETURN_IF_ERROR(value_serde->write_column_to_mysql(nested_values_column, result, j, false, options)); } + --options.level; } } if (0 != result.push_string("}", 1)) { diff --git a/be/src/vec/data_types/serde/data_type_number_serde.cpp b/be/src/vec/data_types/serde/data_type_number_serde.cpp index 522cf02c75fbb5..cd8b3d567e9993 100644 --- a/be/src/vec/data_types/serde/data_type_number_serde.cpp +++ b/be/src/vec/data_types/serde/data_type_number_serde.cpp @@ -277,8 +277,15 @@ Status DataTypeNumberSerDe::_write_column_to_mysql(const IColumn& column, int buf_ret = 0; auto& data = assert_cast(column).get_data(); const auto col_index = index_check_const(row_idx, col_const); - if constexpr (std::is_same_v || std::is_same_v) { + if constexpr (std::is_same_v) { buf_ret = result.push_tinyint(data[col_index]); + } else if constexpr (std::is_same_v) { + if (options.level > 0 && !options.is_bool_value_num) { + std::string bool_value = data[col_index] ? "true" : "false"; + result.push_string(bool_value.c_str(), bool_value.size()); + } else { + buf_ret = result.push_tinyint(data[col_index]); + } } else if constexpr (std::is_same_v || std::is_same_v) { buf_ret = result.push_smallint(data[col_index]); } else if constexpr (std::is_same_v || std::is_same_v) { diff --git a/be/src/vec/data_types/serde/data_type_serde.h b/be/src/vec/data_types/serde/data_type_serde.h index 7dedf30ac32aa6..b23a6a21501b93 100644 --- a/be/src/vec/data_types/serde/data_type_serde.h +++ b/be/src/vec/data_types/serde/data_type_serde.h @@ -165,6 +165,26 @@ class DataTypeSerDe { const char* nested_string_wrapper; int wrapper_len; + /** + * mysql_collection_delim is used to separate elements in collection, such as array, map, struct + * It is used to write to mysql. + */ + std::string mysql_collection_delim = ", "; + + /** + * is_bool_value_num is used to display bool value in collection, such as array, map, struct + * eg, if set to true, the array will be: + * [1] + * if set to false, the array will be: + * [true] + */ + bool is_bool_value_num = true; + + /** + * Indicate the nested level of column. It is used to control some behavior of serde + */ + mutable int level = 0; + [[nodiscard]] char get_collection_delimiter( int hive_text_complex_type_delimiter_level) const { CHECK(0 <= hive_text_complex_type_delimiter_level && diff --git a/be/src/vec/data_types/serde/data_type_struct_serde.cpp b/be/src/vec/data_types/serde/data_type_struct_serde.cpp index d48f42e2227b2a..d95682e604c59f 100644 --- a/be/src/vec/data_types/serde/data_type_struct_serde.cpp +++ b/be/src/vec/data_types/serde/data_type_struct_serde.cpp @@ -348,7 +348,8 @@ Status DataTypeStructSerDe::_write_column_to_mysql(const IColumn& column, bool begin = true; for (size_t j = 0; j < elem_serdes_ptrs.size(); ++j) { if (!begin) { - if (0 != result.push_string(", ", 2)) { + if (0 != result.push_string(options.mysql_collection_delim.c_str(), + options.mysql_collection_delim.size())) { return Status::InternalError("pack mysql buffer failed."); } } @@ -372,6 +373,7 @@ Status DataTypeStructSerDe::_write_column_to_mysql(const IColumn& column, return Status::InternalError("pack mysql buffer failed."); } } else { + ++options.level; if (remove_nullable(col.get_column_ptr(j))->is_column_string() && options.wrapper_len > 0) { if (0 != result.push_string(options.nested_string_wrapper, options.wrapper_len)) { @@ -386,6 +388,7 @@ Status DataTypeStructSerDe::_write_column_to_mysql(const IColumn& column, RETURN_IF_ERROR(elem_serdes_ptrs[j]->write_column_to_mysql( col.get_column(j), result, col_index, false, options)); } + --options.level; } begin = false; } diff --git a/be/src/vec/sink/vmysql_result_writer.cpp b/be/src/vec/sink/vmysql_result_writer.cpp index 0cdf1b34034630..8ad1c27602527a 100644 --- a/be/src/vec/sink/vmysql_result_writer.cpp +++ b/be/src/vec/sink/vmysql_result_writer.cpp @@ -123,6 +123,8 @@ Status VMysqlResultWriter::_set_options( _options.map_key_delim = ':'; _options.null_format = "null"; _options.null_len = 4; + _options.mysql_collection_delim = ", "; + _options.is_bool_value_num = true; break; case TSerdeDialect::PRESTO: // eg: @@ -133,6 +135,20 @@ Status VMysqlResultWriter::_set_options( _options.map_key_delim = '='; _options.null_format = "NULL"; _options.null_len = 4; + _options.mysql_collection_delim = ", "; + _options.is_bool_value_num = true; + break; + case TSerdeDialect::HIVE: + // eg: + // array: ["abc","def","",null] + // map: {"k1":null,"k2":"v3"} + _options.nested_string_wrapper = "\""; + _options.wrapper_len = 1; + _options.map_key_delim = ':'; + _options.null_format = "null"; + _options.null_len = 4; + _options.mysql_collection_delim = ","; + _options.is_bool_value_num = false; break; default: return Status::InternalError("unknown serde dialect: {}", serde_dialect); diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/NereidsPlanner.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/NereidsPlanner.java index 89a9d220be34e9..57d88de7a4ec6d 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/nereids/NereidsPlanner.java +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/NereidsPlanner.java @@ -677,6 +677,7 @@ private void setFormatOptions() { statementContext.setFormatOptions(FormatOptions.getForPresto()); break; case "doris": + case "hive": statementContext.setFormatOptions(FormatOptions.getDefault()); break; default: diff --git a/fe/fe-core/src/main/java/org/apache/doris/qe/SessionVariable.java b/fe/fe-core/src/main/java/org/apache/doris/qe/SessionVariable.java index 3f874077f516a8..7d9b841661881c 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/qe/SessionVariable.java +++ b/fe/fe-core/src/main/java/org/apache/doris/qe/SessionVariable.java @@ -4348,9 +4348,11 @@ public void checkSerdeDialect(String serdeDialect) { throw new UnsupportedOperationException("serdeDialect value is empty"); } - if (!serdeDialect.equalsIgnoreCase("doris") && !serdeDialect.equalsIgnoreCase("presto") - && !serdeDialect.equalsIgnoreCase("trino")) { - LOG.warn("serdeDialect value is invalid, the invalid value is {}", serdeDialect); + if (!serdeDialect.equalsIgnoreCase("doris") + && !serdeDialect.equalsIgnoreCase("presto") + && !serdeDialect.equalsIgnoreCase("trino") + && !serdeDialect.equalsIgnoreCase("hive")) { + LOG.warn("serde dialect value is invalid, the invalid value is {}", serdeDialect); throw new UnsupportedOperationException( "sqlDialect value is invalid, the invalid value is " + serdeDialect); } @@ -4512,6 +4514,8 @@ public TSerdeDialect getSerdeDialect() { case "presto": case "trino": return TSerdeDialect.PRESTO; + case "hive": + return TSerdeDialect.HIVE; default: throw new IllegalArgumentException("Unknown serde dialect: " + serdeDialect); } diff --git a/gensrc/thrift/PaloInternalService.thrift b/gensrc/thrift/PaloInternalService.thrift index c612826836ef93..ac43d3a3dee727 100644 --- a/gensrc/thrift/PaloInternalService.thrift +++ b/gensrc/thrift/PaloInternalService.thrift @@ -83,7 +83,8 @@ struct TResourceLimit { enum TSerdeDialect { DORIS = 0, - PRESTO = 1 + PRESTO = 1, + HIVE = 2 } // Query options that correspond to PaloService.PaloQueryOptions, diff --git a/regression-test/data/datatype_p0/serde/test_serde_dialect_hive.out b/regression-test/data/datatype_p0/serde/test_serde_dialect_hive.out new file mode 100644 index 00000000000000..3ea1043cdf6f9c --- /dev/null +++ b/regression-test/data/datatype_p0/serde/test_serde_dialect_hive.out @@ -0,0 +1,7 @@ +-- This file is automatically generated. You should know what you did if you want to edit this +-- !sql01 -- +1 2 3 4 5 1.1 2.0 123456.123456789 2024-06-30 2024-06-30T10:10:11 2024-06-30T10:10:11.123456 59.50.185.152 ffff:ffff:ffff:ffff:ffff:ffff:ffff:ffff this is a string with , and " abc ef 123ndedwdw true [1,2,3,4,5] [1, 2, 3, null, 5] [1.1, 2.1, 3.1, null, 5] [1.10000, 2.10000, 3.00000, null, 5.12345] ["abc", "de, f"", null, ""] [{"k1":"v1", "k2":null, "k3":"", "k4":"a , "a"}, {"k1":"v1", "k2":null, "k3 , "abc":"", "k4":"a , "a"}] [["abc", "de, f"", null, ""], [], null] \N \N {"k1":"v1", "k2":null, "k3":"", "k4":"a , "a"} {"k1":[["abc", "de, f"", null, ""], [], null], "k2":null} {10:{"k1":[["abc", "de, f"", null, ""], [], null]}, 11:null} \N {"s_id":100, "s_name":"abc , "", "s_address":null} {"s_id":null, "s_name":["abc", "de, f"", null, ""], "s_address":""} ["2024-06-01", null, "2024-06-03"] ["2024-06-01 10:10:10.000", null, "2024-06-03 01:11:23.123"] [1, 1, 0, 0, 1, 0, 0] {"s_id":100, "s_name":"abc , "", "s_gender":1} {"k1":0, "k2":1, "k3":1, "k4":0} + +-- !sql01 -- +1 2 3 4 5 1.1 2.0 123456.123456789 2024-06-30 2024-06-30T10:10:11 2024-06-30T10:10:11.123456 59.50.185.152 ffff:ffff:ffff:ffff:ffff:ffff:ffff:ffff this is a string with , and " abc ef 123ndedwdw true [1,2,3,4,5] [1,2,3,null,5] [1.1,2.1,3.1,null,5] [1.10000,2.10000,3.00000,null,5.12345] ["abc","de, f"",null,""] [{"k1":"v1","k2":null,"k3":"","k4":"a , "a"},{"k1":"v1","k2":null,"k3 , "abc":"","k4":"a , "a"}] [["abc","de, f"",null,""],[],null] \N \N {"k1":"v1","k2":null,"k3":"","k4":"a , "a"} {"k1":[["abc","de, f"",null,""],[],null],"k2":null} {10:{"k1":[["abc","de, f"",null,""],[],null]},11:null} \N {"s_id":100,"s_name":"abc , "","s_address":null} {"s_id":null,"s_name":["abc","de, f"",null,""],"s_address":""} ["2024-06-01",null,"2024-06-03"] ["2024-06-01 10:10:10.000",null,"2024-06-03 01:11:23.123"] [true,true,false,false,true,false,false] {"s_id":100,"s_name":"abc , "","s_gender":true} {"k1":false,"k2":true,"k3":true,"k4":false} + diff --git a/regression-test/suites/datatype_p0/serde/test_serde_dialect_hive.groovy b/regression-test/suites/datatype_p0/serde/test_serde_dialect_hive.groovy new file mode 100644 index 00000000000000..b8e3037d770f7d --- /dev/null +++ b/regression-test/suites/datatype_p0/serde/test_serde_dialect_hive.groovy @@ -0,0 +1,107 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +suite("test_serde_dialect_hive", "p0") { + + sql """create database if not exists test_serde_dialect_hive;""" + sql """use test_serde_dialect_hive;""" + sql """drop table if exists test_serde_dialect_hive_tbl""" + sql """ + create table if not exists test_serde_dialect_hive_tbl ( + c1 tinyint, + c2 smallint, + c3 int, + c4 bigint, + c5 largeint, + c6 float, + c7 double, + c8 decimal(27, 9), + c9 date, + c10 datetime, + c11 datetime(6), + c12 ipv4, + c13 ipv6, + c14 string, + c15 char(6), + c16 varchar(1024), + c17 boolean, + c18 json, + c19 array, + c20 array, + c21 array, + c22 array, + c23 array>, + c24 array>, + c25 array>, + c26 array, s_name:array, s_address:map>>, + c27 map, + c28 map>>, + c29 map>>>, + c30 map, s_name:array, s_address:map>>>, + c31 struct, + c32 struct, s_address:string>, + c33 array, + c34 array, + c35 array, + c36 struct, + c37 map + ) + distributed by random buckets 1 + properties("replication_num" = "1"); + """ + + sql """ + insert into test_serde_dialect_hive_tbl + (c1, c2,c3, c4,c5,c6,c7,c8,c9,c10,c11,c12,c13,c14,c15,c16,c17,c18,c19,c20,c21,c22,c23,c24,c27,c28,c29,c31,c32,c33,c34,c35,c36,c37) + values( + 1,2,3,4,5,1.1,2.0000,123456.123456789,"2024-06-30", "2024-06-30 10:10:11", "2024-06-30 10:10:11.123456", + '59.50.185.152', + 'ffff:ffff:ffff:ffff:ffff:ffff:ffff:ffff', + 'this is a string with , and "', + 'abc ef', + ' 123ndedwdw', + true, + '[1, 2, 3, 4, 5]', + [1,2,3,null,5], + [1.1,2.1,3.1,null,5.00], + [1.1,2.1,3.00000,null,5.12345], + ['abc', 'de, f"', null, ''], + [{'k1': 'v1', 'k2': null, 'k3':'', 'k4':'a , "a'}, {'k1': 'v1', 'k2': null, 'k3 , "abc':'', 'k4':'a , "a'}], + [['abc', 'de, f"', null, ''],[],null], + {'k1': 'v1', 'k2': null, 'k3':'', 'k4':'a , "a'}, + {'k1': [['abc', 'de, f"', null, ''],[],null], 'k2': null}, + {10: {'k1': [['abc', 'de, f"', null, ''],[],null]}, 11: null}, + named_struct('s_id', 100, 's_name', 'abc , "', 's_address', null), + named_struct('s_id', null, 's_name', ['abc', 'de, f"', null, ''], 's_address', ''), + ['2024-06-01',null,'2024-06-03'], + ['2024-06-01 10:10:10',null,'2024-06-03 01:11:23.123'], + [true, true, false, false, true, false, false], + named_struct('s_id', 100, 's_name', 'abc , "', 's_gender', true), + {'k1': false, 'k2': true, 'k3':true, 'k4': false} + ); + """ + + sql """set serde_dialect="doris";""" + qt_sql01 """select * from test_serde_dialect_hive_tbl""" + sql """set serde_dialect="hive";""" + qt_sql01 """select * from test_serde_dialect_hive_tbl""" + + test { + sql """set serde_dialect="invalid"""" + exception "sqlDialect value is invalid" + } +}