From aa2204d018237911e959d931fe6775fc1364f02e Mon Sep 17 00:00:00 2001
From: TengJianPing <tengjianping@selectdb.com>
Date: Tue, 12 May 2026 20:26:44 +0800
Subject: [PATCH] [fix](decimal) Fix incorrect decimal cast results for
 scientific-notation strings (#63119)

Related PR: Bug introduced by #60004

Fix incorrect string-to-decimal parsing for scientific notation.

Previously, decimal parsing could count exponent characters as part of
the significand, causing
values like `"1.4E+2"` to be cast incorrectly as `14` instead of `140`.
It could also round very
small scientific-notation values incorrectly when the significant digit
appeared after implicit
fractional zeros, such as `"5e-17"` for scale 15.

This change makes the parser track the end of the significand separately
from the exponent,
applies exponent-based decimal-point shifting correctly, and only rounds
when the next real
significand digit is the first discarded fractional digit.

Also add comments to help understand the code.
---
 be/src/util/string_parser.cpp                 | 49 ++++++++++++++++---
 .../column_array_update_crc32c_batch_37.out   |  4 +-
 ...ay_update_crc32c_batch_37.out_with_nullmap |  4 +-
 .../column_array_update_crc32c_single_37.out  |  2 +-
 ...y_update_crc32c_single_37.out_with_nullmap |  2 +-
 .../exprs/function/cast/cast_to_decimal.cpp   | 33 ++++++++++++-
 6 files changed, 81 insertions(+), 13 deletions(-)
diff --git a/be/src/util/string_parser.cpp b/be/src/util/string_parser.cpp
index 5dcb65ae0721b4..0b057467e12260 100644
--- a/be/src/util/string_parser.cpp
+++ b/be/src/util/string_parser.cpp
@@ -41,6 +41,20 @@ namespace doris {
 // <exponent> ::= <e_marker> <sign>? <digits>
 //
 // <e_marker> ::= "e" | "E"
+//
+// Parsing algorithm:
+// 1. Trim spaces and the sign, then normalize the significand by skipping leading zeros and an
+//    optional leading dot. During this scan, count digits that belong to the original integral
+//    part (`int_part_count`) and remember where the significand ends (`end_digit_index`).
+// 2. Parse the optional exponent. Scientific notation is handled by moving the decimal point:
+//    `result_int_part_digit_count = int_part_count + exponent`. For example, "12.34e-1" has
+//    int_part_count=2 and exponent=-1, so the result has one integral digit: "1.234".
+// 3. Build the result in scaled-integer form: first collect the integral digits up to the shifted
+//    decimal point, then collect up to `type_scale` fractional digits, padding with zeros when the
+//    input has fewer fractional digits than the target scale.
+// 4. If there are extra fractional digits, round half up using the first discarded digit. Finally,
+//    check the integral digit count against `type_precision - type_scale` and return the signed
+//    scaled integer value.
 template <PrimitiveType P>
 typename PrimitiveTypeTraits<P>::CppType::NativeType StringParser::string_to_decimal(
         const char* __restrict s, size_t len, int type_precision, int type_scale,
@@ -50,6 +64,16 @@ typename PrimitiveTypeTraits<P>::CppType::NativeType StringParser::string_to_dec
                           std::is_same_v<T, __int128> || std::is_same_v<T, wide::Int256>,
                   "Cast string to decimal only support target type int32_t, int64_t, __int128 or "
                   "wide::Int256.");
+
+    // Parse in two logical coordinate systems:
+    // 1. `s[0, end_digit_index)` is the normalized significand after trimming spaces, sign and
+    //    leading zeros. If the original value starts with '.', the dot is also skipped so
+    //    ".14E+3" is parsed as significand "14" with exponent 3.
+    // 2. `result_int_part_digit_count = int_part_count + exponent` is the decimal point position
+    //    after applying scientific notation. For example, "1.4E+2" has int_part_count=1,
+    //    exponent=2, result_int_part_digit_count=3, so "14" becomes integer 140.
+    // `digit_index` always indexes the normalized significand string, which may still contain a
+    // dot for inputs like "1.4E+2"; loops that build numbers skip that dot explicitly.
     // Ignore leading and trailing spaces.
     s = skip_ascii_whitespaces(s, len);
 
@@ -102,7 +126,9 @@ typename PrimitiveTypeTraits<P>::CppType::NativeType StringParser::string_to_dec
         *result = StringParser::PARSE_FAILURE;
         return 0;
     }
-    // parse exponent if any
+    // Parse exponent if any. Keep `end_digit_index` before consuming 'e/E' so later digit counts
+    // ignore exponent syntax. For "1.4E+2", end_digit_index points just after "1.4", not after
+    // "E+2".
     int64_t exponent = 0;
     auto end_digit_index = i;
     if (i != len) {
@@ -149,8 +175,6 @@ typename PrimitiveTypeTraits<P>::CppType::NativeType StringParser::string_to_dec
             return 0;
         }
     }
-    T int_part_number = 0;
-    T frac_part_number = 0;
     // TODO: check limit values of exponent and add UT
     // max string len is config::string_type_length_soft_limit_bytes,
     // whose max value is std::numeric_limits<int32_t>::max() - 4,
@@ -163,9 +187,15 @@ typename PrimitiveTypeTraits<P>::CppType::NativeType StringParser::string_to_dec
         return 0;
     }
     int result_int_part_digit_count = tmp_result_int_part_digit_count;
+    T int_part_number = 0;
+    T frac_part_number = 0;
     int actual_frac_part_count = 0;
     int digit_index = 0;
     if (result_int_part_digit_count >= 0) {
+        // `max_index` is the raw significand index where integer-part digits stop. Add one extra
+        // raw character only when crossing an in-buffer dot, e.g. "1.4E+2" must scan "1.4" to
+        // collect three integer digits after the exponent shift. It is capped by end_digit_index
+        // because missing digits are appended later by multiplying with powers of 10.
         int max_index = std::min(found_dot ? (result_int_part_digit_count +
                                               ((int_part_count > 0 && exponent > 0) ? 1 : 0))
                                            : result_int_part_digit_count,
@@ -188,7 +218,11 @@ typename PrimitiveTypeTraits<P>::CppType::NativeType StringParser::string_to_dec
             }
             int_part_number = int_part_number * 10 + (s[digit_index] - '0');
         }
-        auto total_significant_digit_count = i - ((found_dot && int_part_count > 0) ? 1 : 0);
+        // Count only significand digits, not exponent syntax. If the exponent moves the decimal
+        // point past all available significant digits, append zeros by scaling the integer part:
+        // "1.4E+2" scans integer 14, total_significant_digit_count=2, then multiplies by 10.
+        auto total_significant_digit_count =
+                end_digit_index - ((found_dot && int_part_count > 0) ? 1 : 0);
         if (result_int_part_digit_count > total_significant_digit_count) {
             int_part_number *= get_scale_multiplier<T>(result_int_part_digit_count -
                                                        total_significant_digit_count);
@@ -206,8 +240,11 @@ typename PrimitiveTypeTraits<P>::CppType::NativeType StringParser::string_to_dec
         ++actual_frac_part_count;
     }
     auto type_scale_multiplier = get_scale_multiplier<T>(type_scale);
-    // there are still extra fraction digits left, check rounding
-    if (digit_index != end_digit_index) {
+    // Round only when the next parsed significand digit is exactly the first discarded fractional
+    // digit. If `actual_frac_part_count` is already greater than type_scale, the missing positions
+    // are implicit zeros from a negative exponent, so "5e-17" to scale 15 must stay 0 instead of
+    // rounding up.
+    if (actual_frac_part_count == type_scale && digit_index != end_digit_index) {
         if (UNLIKELY(s[digit_index] == '.')) {
             ++digit_index;
         }
diff --git a/be/test/expected_result/vec/columns/column_array_update_crc32c_batch_37.out b/be/test/expected_result/vec/columns/column_array_update_crc32c_batch_37.out
index d699fb6177a7ab..c419670b58fc13 100644
--- a/be/test/expected_result/vec/columns/column_array_update_crc32c_batch_37.out
+++ b/be/test/expected_result/vec/columns/column_array_update_crc32c_batch_37.out
@@ -2,5 +2,5 @@
 0
 0
 4230634956
-166888020
-1932016285
\ No newline at end of file
+572890395
+2601481115
\ No newline at end of file
diff --git a/be/test/expected_result/vec/columns/column_array_update_crc32c_batch_37.out_with_nullmap b/be/test/expected_result/vec/columns/column_array_update_crc32c_batch_37.out_with_nullmap
index d699fb6177a7ab..c419670b58fc13 100644
--- a/be/test/expected_result/vec/columns/column_array_update_crc32c_batch_37.out_with_nullmap
+++ b/be/test/expected_result/vec/columns/column_array_update_crc32c_batch_37.out_with_nullmap
@@ -2,5 +2,5 @@
 0
 0
 4230634956
-166888020
-1932016285
\ No newline at end of file
+572890395
+2601481115
\ No newline at end of file
diff --git a/be/test/expected_result/vec/columns/column_array_update_crc32c_single_37.out b/be/test/expected_result/vec/columns/column_array_update_crc32c_single_37.out
index 45a1f82c6f1a28..faaab6bc634291 100644
--- a/be/test/expected_result/vec/columns/column_array_update_crc32c_single_37.out
+++ b/be/test/expected_result/vec/columns/column_array_update_crc32c_single_37.out
@@ -1 +1 @@
-106414486;4062799302;0
\ No newline at end of file
+880726687;3657333385;0
\ No newline at end of file
diff --git a/be/test/expected_result/vec/columns/column_array_update_crc32c_single_37.out_with_nullmap b/be/test/expected_result/vec/columns/column_array_update_crc32c_single_37.out_with_nullmap
index 45a1f82c6f1a28..faaab6bc634291 100644
--- a/be/test/expected_result/vec/columns/column_array_update_crc32c_single_37.out_with_nullmap
+++ b/be/test/expected_result/vec/columns/column_array_update_crc32c_single_37.out_with_nullmap
@@ -1 +1 @@
-106414486;4062799302;0
\ No newline at end of file
+880726687;3657333385;0
\ No newline at end of file
diff --git a/be/test/exprs/function/cast/cast_to_decimal.cpp b/be/test/exprs/function/cast/cast_to_decimal.cpp
index 677c7a3fa1674a..b8e1bef4825b26 100644
--- a/be/test/exprs/function/cast/cast_to_decimal.cpp
+++ b/be/test/exprs/function/cast/cast_to_decimal.cpp
@@ -88,6 +88,37 @@ TEST_F(FunctionCastToDecimalTest, test_from_string_invalid_input) {
     int table_index = 0;
     from_string_invalid_input_test_func<Decimal32>(9, 3, table_index++);
 }
+
+TEST_F(FunctionCastToDecimalTest, test_from_string_scientific_notation) {
+    InputTypeSet input_types = {PrimitiveType::TYPE_VARCHAR};
+    DataSet data_set = {
+            {{std::string("1.4E+2")}, DECIMAL128V3(140, 0, 15)},
+            {{std::string(".14E+3")}, DECIMAL128V3(140, 0, 15)},
+            {{std::string("0.001E+5")}, DECIMAL128V3(100, 0, 15)},
+            {{std::string("1.E+2")}, DECIMAL128V3(100, 0, 15)},
+            {{std::string("1.4E+0")}, DECIMAL128V3(1, 400000000000000, 15)},
+            {{std::string("1.4E-2")}, DECIMAL128V3(0, 14000000000000, 15)},
+    };
+    check_function_for_cast<DataTypeDecimal<Decimal128V3::PType>>(input_types, data_set, 15, 38);
+}
+
+TEST_F(FunctionCastToDecimalTest, string_parser_scientific_rounding) {
+    auto parse_decimal128 = [](std::string_view value) {
+        StringParser::ParseResult result = StringParser::PARSE_SUCCESS;
+        auto parsed = StringParser::string_to_decimal<TYPE_DECIMAL128I>(value.data(), value.size(),
+                                                                        38, 15, &result);
+        EXPECT_EQ(result, StringParser::PARSE_SUCCESS);
+        return parsed;
+    };
+
+    EXPECT_EQ(parse_decimal128("5e-16"), 1);
+    EXPECT_EQ(parse_decimal128("5e-17"), 0);
+    EXPECT_EQ(parse_decimal128("9e-17"), 0);
+    EXPECT_EQ(parse_decimal128("-5e-17"), 0);
+    EXPECT_EQ(parse_decimal128("0.0000000000000005"), 1);
+    EXPECT_EQ(parse_decimal128("0.00000000000000005"), 0);
+}
+
 TEST_F(FunctionCastToDecimalTest, test_from_bool) {
     from_bool_test_func<Decimal32>(9, 0);
     from_bool_test_func<Decimal32>(9, 1);
@@ -122,4 +153,4 @@ TEST_F(FunctionCastToDecimalTest, test_from_bool_overflow) {
     from_bool_overflow_test_func<Decimal128V3>();
     from_bool_overflow_test_func<Decimal256>();
 }
-} // namespace doris
\ No newline at end of file
+} // namespace doris