From aa2204d018237911e959d931fe6775fc1364f02e Mon Sep 17 00:00:00 2001 From: TengJianPing Date: Tue, 12 May 2026 20:26:44 +0800 Subject: [PATCH] [fix](decimal) Fix incorrect decimal cast results for scientific-notation strings (#63119) Related PR: Bug introduced by #60004 Fix incorrect string-to-decimal parsing for scientific notation. Previously, decimal parsing could count exponent characters as part of the significand, causing values like `"1.4E+2"` to be cast incorrectly as `14` instead of `140`. It could also round very small scientific-notation values incorrectly when the significant digit appeared after implicit fractional zeros, such as `"5e-17"` for scale 15. This change makes the parser track the end of the significand separately from the exponent, applies exponent-based decimal-point shifting correctly, and only rounds when the next real significand digit is the first discarded fractional digit. Also add comments to help understand the code. --- be/src/util/string_parser.cpp | 49 ++++++++++++++++--- .../column_array_update_crc32c_batch_37.out | 4 +- ...ay_update_crc32c_batch_37.out_with_nullmap | 4 +- .../column_array_update_crc32c_single_37.out | 2 +- ...y_update_crc32c_single_37.out_with_nullmap | 2 +- .../exprs/function/cast/cast_to_decimal.cpp | 33 ++++++++++++- 6 files changed, 81 insertions(+), 13 deletions(-) diff --git a/be/src/util/string_parser.cpp b/be/src/util/string_parser.cpp index 5dcb65ae0721b4..0b057467e12260 100644 --- a/be/src/util/string_parser.cpp +++ b/be/src/util/string_parser.cpp @@ -41,6 +41,20 @@ namespace doris { // ::= ? // // ::= "e" | "E" +// +// Parsing algorithm: +// 1. Trim spaces and the sign, then normalize the significand by skipping leading zeros and an +// optional leading dot. During this scan, count digits that belong to the original integral +// part (`int_part_count`) and remember where the significand ends (`end_digit_index`). +// 2. Parse the optional exponent. Scientific notation is handled by moving the decimal point: +// `result_int_part_digit_count = int_part_count + exponent`. For example, "12.34e-1" has +// int_part_count=2 and exponent=-1, so the result has one integral digit: "1.234". +// 3. Build the result in scaled-integer form: first collect the integral digits up to the shifted +// decimal point, then collect up to `type_scale` fractional digits, padding with zeros when the +// input has fewer fractional digits than the target scale. +// 4. If there are extra fractional digits, round half up using the first discarded digit. Finally, +// check the integral digit count against `type_precision - type_scale` and return the signed +// scaled integer value. template typename PrimitiveTypeTraits

::CppType::NativeType StringParser::string_to_decimal( const char* __restrict s, size_t len, int type_precision, int type_scale, @@ -50,6 +64,16 @@ typename PrimitiveTypeTraits

::CppType::NativeType StringParser::string_to_dec std::is_same_v || std::is_same_v, "Cast string to decimal only support target type int32_t, int64_t, __int128 or " "wide::Int256."); + + // Parse in two logical coordinate systems: + // 1. `s[0, end_digit_index)` is the normalized significand after trimming spaces, sign and + // leading zeros. If the original value starts with '.', the dot is also skipped so + // ".14E+3" is parsed as significand "14" with exponent 3. + // 2. `result_int_part_digit_count = int_part_count + exponent` is the decimal point position + // after applying scientific notation. For example, "1.4E+2" has int_part_count=1, + // exponent=2, result_int_part_digit_count=3, so "14" becomes integer 140. + // `digit_index` always indexes the normalized significand string, which may still contain a + // dot for inputs like "1.4E+2"; loops that build numbers skip that dot explicitly. // Ignore leading and trailing spaces. s = skip_ascii_whitespaces(s, len); @@ -102,7 +126,9 @@ typename PrimitiveTypeTraits

::CppType::NativeType StringParser::string_to_dec *result = StringParser::PARSE_FAILURE; return 0; } - // parse exponent if any + // Parse exponent if any. Keep `end_digit_index` before consuming 'e/E' so later digit counts + // ignore exponent syntax. For "1.4E+2", end_digit_index points just after "1.4", not after + // "E+2". int64_t exponent = 0; auto end_digit_index = i; if (i != len) { @@ -149,8 +175,6 @@ typename PrimitiveTypeTraits

::CppType::NativeType StringParser::string_to_dec return 0; } } - T int_part_number = 0; - T frac_part_number = 0; // TODO: check limit values of exponent and add UT // max string len is config::string_type_length_soft_limit_bytes, // whose max value is std::numeric_limits::max() - 4, @@ -163,9 +187,15 @@ typename PrimitiveTypeTraits

::CppType::NativeType StringParser::string_to_dec return 0; } int result_int_part_digit_count = tmp_result_int_part_digit_count; + T int_part_number = 0; + T frac_part_number = 0; int actual_frac_part_count = 0; int digit_index = 0; if (result_int_part_digit_count >= 0) { + // `max_index` is the raw significand index where integer-part digits stop. Add one extra + // raw character only when crossing an in-buffer dot, e.g. "1.4E+2" must scan "1.4" to + // collect three integer digits after the exponent shift. It is capped by end_digit_index + // because missing digits are appended later by multiplying with powers of 10. int max_index = std::min(found_dot ? (result_int_part_digit_count + ((int_part_count > 0 && exponent > 0) ? 1 : 0)) : result_int_part_digit_count, @@ -188,7 +218,11 @@ typename PrimitiveTypeTraits

::CppType::NativeType StringParser::string_to_dec } int_part_number = int_part_number * 10 + (s[digit_index] - '0'); } - auto total_significant_digit_count = i - ((found_dot && int_part_count > 0) ? 1 : 0); + // Count only significand digits, not exponent syntax. If the exponent moves the decimal + // point past all available significant digits, append zeros by scaling the integer part: + // "1.4E+2" scans integer 14, total_significant_digit_count=2, then multiplies by 10. + auto total_significant_digit_count = + end_digit_index - ((found_dot && int_part_count > 0) ? 1 : 0); if (result_int_part_digit_count > total_significant_digit_count) { int_part_number *= get_scale_multiplier(result_int_part_digit_count - total_significant_digit_count); @@ -206,8 +240,11 @@ typename PrimitiveTypeTraits

::CppType::NativeType StringParser::string_to_dec ++actual_frac_part_count; } auto type_scale_multiplier = get_scale_multiplier(type_scale); - // there are still extra fraction digits left, check rounding - if (digit_index != end_digit_index) { + // Round only when the next parsed significand digit is exactly the first discarded fractional + // digit. If `actual_frac_part_count` is already greater than type_scale, the missing positions + // are implicit zeros from a negative exponent, so "5e-17" to scale 15 must stay 0 instead of + // rounding up. + if (actual_frac_part_count == type_scale && digit_index != end_digit_index) { if (UNLIKELY(s[digit_index] == '.')) { ++digit_index; } diff --git a/be/test/expected_result/vec/columns/column_array_update_crc32c_batch_37.out b/be/test/expected_result/vec/columns/column_array_update_crc32c_batch_37.out index d699fb6177a7ab..c419670b58fc13 100644 --- a/be/test/expected_result/vec/columns/column_array_update_crc32c_batch_37.out +++ b/be/test/expected_result/vec/columns/column_array_update_crc32c_batch_37.out @@ -2,5 +2,5 @@ 0 0 4230634956 -166888020 -1932016285 \ No newline at end of file +572890395 +2601481115 \ No newline at end of file diff --git a/be/test/expected_result/vec/columns/column_array_update_crc32c_batch_37.out_with_nullmap b/be/test/expected_result/vec/columns/column_array_update_crc32c_batch_37.out_with_nullmap index d699fb6177a7ab..c419670b58fc13 100644 --- a/be/test/expected_result/vec/columns/column_array_update_crc32c_batch_37.out_with_nullmap +++ b/be/test/expected_result/vec/columns/column_array_update_crc32c_batch_37.out_with_nullmap @@ -2,5 +2,5 @@ 0 0 4230634956 -166888020 -1932016285 \ No newline at end of file +572890395 +2601481115 \ No newline at end of file diff --git a/be/test/expected_result/vec/columns/column_array_update_crc32c_single_37.out b/be/test/expected_result/vec/columns/column_array_update_crc32c_single_37.out index 45a1f82c6f1a28..faaab6bc634291 100644 --- a/be/test/expected_result/vec/columns/column_array_update_crc32c_single_37.out +++ b/be/test/expected_result/vec/columns/column_array_update_crc32c_single_37.out @@ -1 +1 @@ -106414486;4062799302;0 \ No newline at end of file +880726687;3657333385;0 \ No newline at end of file diff --git a/be/test/expected_result/vec/columns/column_array_update_crc32c_single_37.out_with_nullmap b/be/test/expected_result/vec/columns/column_array_update_crc32c_single_37.out_with_nullmap index 45a1f82c6f1a28..faaab6bc634291 100644 --- a/be/test/expected_result/vec/columns/column_array_update_crc32c_single_37.out_with_nullmap +++ b/be/test/expected_result/vec/columns/column_array_update_crc32c_single_37.out_with_nullmap @@ -1 +1 @@ -106414486;4062799302;0 \ No newline at end of file +880726687;3657333385;0 \ No newline at end of file diff --git a/be/test/exprs/function/cast/cast_to_decimal.cpp b/be/test/exprs/function/cast/cast_to_decimal.cpp index 677c7a3fa1674a..b8e1bef4825b26 100644 --- a/be/test/exprs/function/cast/cast_to_decimal.cpp +++ b/be/test/exprs/function/cast/cast_to_decimal.cpp @@ -88,6 +88,37 @@ TEST_F(FunctionCastToDecimalTest, test_from_string_invalid_input) { int table_index = 0; from_string_invalid_input_test_func(9, 3, table_index++); } + +TEST_F(FunctionCastToDecimalTest, test_from_string_scientific_notation) { + InputTypeSet input_types = {PrimitiveType::TYPE_VARCHAR}; + DataSet data_set = { + {{std::string("1.4E+2")}, DECIMAL128V3(140, 0, 15)}, + {{std::string(".14E+3")}, DECIMAL128V3(140, 0, 15)}, + {{std::string("0.001E+5")}, DECIMAL128V3(100, 0, 15)}, + {{std::string("1.E+2")}, DECIMAL128V3(100, 0, 15)}, + {{std::string("1.4E+0")}, DECIMAL128V3(1, 400000000000000, 15)}, + {{std::string("1.4E-2")}, DECIMAL128V3(0, 14000000000000, 15)}, + }; + check_function_for_cast>(input_types, data_set, 15, 38); +} + +TEST_F(FunctionCastToDecimalTest, string_parser_scientific_rounding) { + auto parse_decimal128 = [](std::string_view value) { + StringParser::ParseResult result = StringParser::PARSE_SUCCESS; + auto parsed = StringParser::string_to_decimal(value.data(), value.size(), + 38, 15, &result); + EXPECT_EQ(result, StringParser::PARSE_SUCCESS); + return parsed; + }; + + EXPECT_EQ(parse_decimal128("5e-16"), 1); + EXPECT_EQ(parse_decimal128("5e-17"), 0); + EXPECT_EQ(parse_decimal128("9e-17"), 0); + EXPECT_EQ(parse_decimal128("-5e-17"), 0); + EXPECT_EQ(parse_decimal128("0.0000000000000005"), 1); + EXPECT_EQ(parse_decimal128("0.00000000000000005"), 0); +} + TEST_F(FunctionCastToDecimalTest, test_from_bool) { from_bool_test_func(9, 0); from_bool_test_func(9, 1); @@ -122,4 +153,4 @@ TEST_F(FunctionCastToDecimalTest, test_from_bool_overflow) { from_bool_overflow_test_func(); from_bool_overflow_test_func(); } -} // namespace doris \ No newline at end of file +} // namespace doris