(impala) branch master updated: IMPALA-12562: Cast double and float to string with exact precision

laszlog Wed, 12 Jun 2024 14:05:19 -0700

This is an automated email from the ASF dual-hosted git repository.

laszlog pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/impala.git



The following commit(s) were added to refs/heads/master by this push:
     new 0d429462f IMPALA-12562: Cast double and float to string with exact 
precision
0d429462f is described below

commit 0d429462f7f61565119ee2e593867a22886d7209
Author: zhangyifan27 <[email protected]>
AuthorDate: Fri May 17 23:28:11 2024 +0800

    IMPALA-12562: Cast double and float to string with exact precision
    
    The builtin functions casttostring(DOUBLE) and casttostring(FLOAT)
    printed more digits than necessary when converting double and float
    values to string values. This patch fixes this by switching to the
    existing methods DoubleToBuffer and FloatToBuffer, which are simple and
    fast implementations that print only the digits needed.
    
    Testing:
      - Add end-to-end tests to verify the fixes
      - Add benchmarks for modified functions
      - Update tests in expr-test
    
    Change-Id: Icd79c55dd57dc0fa13e4ec11c2284ef2800e8b1a
    Reviewed-on: http://gerrit.cloudera.org:8080/21441
    Reviewed-by: Impala Public Jenkins <[email protected]>
    Tested-by: Impala Public Jenkins <[email protected]>
---
 be/src/benchmarks/expr-benchmark.cc                | 49 ++++++------
 be/src/exprs/cast-functions-ir.cc                  | 47 ++++++------
 be/src/exprs/expr-test.cc                          | 62 ++++++++++------
 .../functional-query/queries/QueryTest/exprs.test  | 86 +++++++++++++++++++++-
 4 files changed, 173 insertions(+), 71 deletions(-)

diff --git a/be/src/benchmarks/expr-benchmark.cc 
b/be/src/benchmarks/expr-benchmark.cc
index a006c8610..fa8d84fbd 100644
--- a/be/src/benchmarks/expr-benchmark.cc
+++ b/be/src/benchmarks/expr-benchmark.cc
@@ -288,33 +288,38 @@ Benchmark* BenchmarkLike(bool codegen) {
   return suite;
 }
 
+// Machine Info: AMD EPYC 7K62 48-Core Processor
 // Cast:                      Function  iters/ms   10%ile   50%ile   90%ile    
 10%ile     50%ile     90%ile
 //                                                                          
(relative) (relative) (relative)
 // 
---------------------------------------------------------------------------------------------------------
-//                          int_to_int                334      337      340    
     1X         1X         1X
-//                         int_to_bool                332      335      338    
 0.995X     0.994X     0.992X
-//                       int_to_double                700      707      711    
   2.1X       2.1X      2.09X
-//                       int_to_string                125      126      127    
 0.374X     0.375X     0.374X
-//                   double_to_boolean                155      156      157    
 0.464X     0.463X     0.463X
-//                    double_to_bigint                 90     90.6     90.8    
 0.269X     0.269X     0.267X
-//                    double_to_string                 68     68.8     69.3    
 0.204X     0.204X     0.204X
-//                       string_to_int                229      231      232    
 0.684X     0.685X     0.682X
-//                     string_to_float                103      104      105    
 0.309X     0.308X     0.308X
-//                 string_to_timestamp               39.9     40.1     40.5    
 0.119X     0.119X     0.119X
+//                          int_to_int                161      167      169    
     1X         1X         1X
+//                         int_to_bool                191      201      205    
  1.18X       1.2X      1.21X
+//                       int_to_double                506      522      529    
  3.14X      3.13X      3.13X
+//                       int_to_string               28.2     29.3     30.1    
 0.175X     0.176X     0.178X
+//                   double_to_boolean               67.6     69.7     71.4    
  0.42X     0.418X     0.422X
+//                    double_to_bigint               48.1       49     49.8    
 0.299X     0.294X     0.295X
+//                   decimal_to_string               22.4     22.8       23    
 0.139X     0.137X     0.136X
+//                    double_to_string               7.55     7.75     7.83    
0.0468X    0.0464X    0.0463X
+//                     float_to_string               7.97     8.15     8.24    
0.0495X    0.0489X    0.0488X
+//                       string_to_int                138      142      147    
 0.859X     0.854X      0.87X
+//                     string_to_float               57.7     59.3     60.2    
 0.358X     0.355X     0.356X
+//                 string_to_timestamp               22.4     23.2     23.5    
 0.139X     0.139X     0.139X
 //
 // CastCodegen:               Function  iters/ms   10%ile   50%ile   90%ile    
 10%ile     50%ile     90%ile
 //                                                                          
(relative) (relative) (relative)
 // 
---------------------------------------------------------------------------------------------------------
-//                          int_to_int                824      830      837    
     1X         1X         1X
-//                         int_to_bool                815      821      828    
 0.989X     0.989X     0.989X
-//                       int_to_double                778      783      789    
 0.944X     0.943X     0.943X
-//                       int_to_string                167      169      171    
 0.203X     0.203X     0.204X
-//                   double_to_boolean                819      826      833    
 0.994X     0.994X     0.995X
-//                    double_to_bigint                777      783      792    
 0.943X     0.943X     0.946X
-//                    double_to_string                139      140      141    
 0.168X     0.168X     0.168X
-//                       string_to_int                369      372      375    
 0.448X     0.448X     0.448X
-//                     string_to_float                123      124      125    
  0.15X      0.15X      0.15X
-//                 string_to_timestamp               44.8     45.1     45.4    
0.0544X    0.0543X    0.0542X
+//                          int_to_int                166      167      169    
     1X         1X         1X
+//                         int_to_bool                198      202      204    
  1.19X      1.21X      1.21X
+//                       int_to_double                521      526      531    
  3.14X      3.15X      3.14X
+//                       int_to_string               28.9     29.7     30.5    
 0.174X     0.178X      0.18X
+//                   double_to_boolean               68.7     70.1     71.4    
 0.414X     0.419X     0.422X
+//                    double_to_bigint               48.4     49.2     49.8    
 0.292X     0.294X     0.295X
+//                   decimal_to_string               22.5     22.8     23.2    
 0.136X     0.137X     0.137X
+//                    double_to_string               7.64     7.75     7.83    
 0.046X    0.0463X    0.0463X
+//                     float_to_string               8.02     8.15      8.3    
0.0483X    0.0487X    0.0491X
+//                       string_to_int                140      145      147    
 0.847X     0.869X     0.868X
+//                     string_to_float               58.6     59.4     60.6    
 0.353X     0.355X     0.358X
+//                 string_to_timestamp                 23     23.3     23.7    
 0.138X     0.139X      0.14X
 Benchmark* BenchmarkCast(bool codegen) {
   Benchmark* suite = new Benchmark(BenchmarkName("Cast", codegen));
   BENCHMARK("int_to_int", "cast(1 as INT)");
@@ -323,7 +328,9 @@ Benchmark* BenchmarkCast(bool codegen) {
   BENCHMARK("int_to_string", "cast(1 as STRING)");
   BENCHMARK("double_to_boolean", "cast(3.14 as BOOLEAN)");
   BENCHMARK("double_to_bigint", "cast(3.14 as BIGINT)");
-  BENCHMARK("double_to_string", "cast(3.14 as STRING)");
+  BENCHMARK("decimal_to_string", "cast(3.14 as STRING)");
+  BENCHMARK("double_to_string", "cast(cast(3.14 as DOUBLE) as STRING)");
+  BENCHMARK("float_to_string", "cast(cast(3.14 as FLOAT) as STRING)");
   BENCHMARK("string_to_int", "cast('1234' as INT)");
   BENCHMARK("string_to_float", "cast('1234.5678' as FLOAT)");
   BENCHMARK("string_to_timestamp", "cast('2011-10-22 09:10:11' as TIMESTAMP)");
diff --git a/be/src/exprs/cast-functions-ir.cc 
b/be/src/exprs/cast-functions-ir.cc
index 7f4bc0a32..bce7262e6 100644
--- a/be/src/exprs/cast-functions-ir.cc
+++ b/be/src/exprs/cast-functions-ir.cc
@@ -26,16 +26,16 @@
 #include <gutil/strings/numbers.h>
 #include <gutil/strings/substitute.h>
 
+#include "common/names.h"
 #include "exprs/anyval-util.h"
 #include "exprs/cast-format-expr.h"
 #include "exprs/decimal-functions.h"
+#include "gutil/strings/numbers.h"
 #include "runtime/string-value.inline.h"
 #include "runtime/timestamp-value.h"
 #include "runtime/timestamp-value.inline.h"
 #include "util/string-parser.h"
 
-#include "common/names.h"
-
 using namespace impala;
 using namespace impala_udf;
 using namespace datetime_parse_util;
@@ -338,31 +338,28 @@ CAST_EXACT_NUMERIC_TO_STRING(SmallIntVal, 
MAX_SMALLINT_CHARS, FastInt32ToBufferL
 CAST_EXACT_NUMERIC_TO_STRING(IntVal, MAX_INT_CHARS, FastInt32ToBufferLeft);
 CAST_EXACT_NUMERIC_TO_STRING(BigIntVal, MAX_BIGINT_CHARS, 
FastInt64ToBufferLeft);
 
-
-#define CAST_FLOAT_TO_STRING(float_type, format) \
-  StringVal CastFunctions::CastToStringVal(FunctionContext* ctx, const 
float_type& val) { \
-    if (val.is_null) return StringVal::null(); \
-    /* val.val could be -nan, return "nan" instead */ \
-    if (std::isnan(val.val)) return StringVal("nan"); \
-    /* Add 1 to MAX_FLOAT_CHARS since snprintf adds a trailing '\0' */ \
-    StringVal sv(ctx, MAX_FLOAT_CHARS + 1); \
-    if (UNLIKELY(sv.is_null)) { \
-      DCHECK(!ctx->impl()->state()->GetQueryStatus().ok()); \
-      return sv; \
-    } \
-    sv.len = snprintf(reinterpret_cast<char*>(sv.ptr), sv.len, format, 
val.val); \
-    DCHECK_GT(sv.len, 0); \
-    DCHECK_LE(sv.len, MAX_FLOAT_CHARS); \
-    AnyValUtil::TruncateIfNecessary(ctx->GetReturnType(), &sv); \
-    return sv; \
+#define CAST_FLOAT_TO_STRING(float_type, convert_method, buffer_size)          
\
+  StringVal CastFunctions::CastToStringVal(                                    
\
+      FunctionContext* ctx, const float_type& val) {                           
\
+    if (val.is_null) return StringVal::null();                                 
\
+    /* val.val could be -nan, return "nan" instead */                          
\
+    if (std::isnan(val.val)) return StringVal("nan");                          
\
+    StringVal sv(ctx, buffer_size);                                            
\
+    if (UNLIKELY(sv.is_null)) {                                                
\
+      DCHECK(!ctx->impl()->state()->GetQueryStatus().ok());                    
\
+      return sv;                                                               
\
+    }                                                                          
\
+    sv.len = strlen(convert_method(val.val, reinterpret_cast<char*>(sv.ptr))); 
\
+    DCHECK_GT(sv.len, 0);                                                      
\
+    DCHECK_LE(sv.len, MAX_FLOAT_CHARS);                                        
\
+    AnyValUtil::TruncateIfNecessary(ctx->GetReturnType(), &sv);                
\
+    return sv;                                                                 
\
   }
 
-// Floats have up to 9 significant digits, doubles up to 17
-// (see http://en.wikipedia.org/wiki/Single-precision_floating-point_format
-// and http://en.wikipedia.org/wiki/Double-precision_floating-point_format)
-CAST_FLOAT_TO_STRING(FloatVal, "%.9g");
-CAST_FLOAT_TO_STRING(DoubleVal, "%.17g");
-
+// Convert a double or float to a string and produce the exact same original 
precision.
+// See gutil/strings/numbers.h and gutil/strings/numbers.cc for more details.
+CAST_FLOAT_TO_STRING(FloatVal, FloatToBuffer, kFloatToBufferSize);
+CAST_FLOAT_TO_STRING(DoubleVal, DoubleToBuffer, kDoubleToBufferSize);
 
 StringVal CastFunctions::CastToStringVal(FunctionContext* ctx, const 
TimestampVal& val) {
   DCHECK(ctx != nullptr);
diff --git a/be/src/exprs/expr-test.cc b/be/src/exprs/expr-test.cc
index 1cb0ee88d..61e53933f 100644
--- a/be/src/exprs/expr-test.cc
+++ b/be/src/exprs/expr-test.cc
@@ -1211,13 +1211,16 @@ class ExprTest : public 
testing::TestWithParam<std::tuple<bool, bool>> {
   // signed integer expected to be able to hold the value. 
'float_out_of_range' should be
   // set to true if the value does not fit in a single precision float. The 
expected
   // result is 'val' for the types that can hold the value and error for other 
types.
-  template<typename T>
-  void TestCast(const string& stmt, T val, int min_integer_size = 1,
-      bool float_out_of_range = false, bool timestamp_out_of_range = false) {
+  template <typename T>
+  void TestCast(const string& stmt, T val, bool convert_lose_precision = false,
+      int min_integer_size = 1, bool float_out_of_range = false,
+      bool timestamp_out_of_range = false) {
     TestValue("cast(" + stmt + " as boolean)", TYPE_BOOLEAN, 
static_cast<bool>(val));
     TestValue("cast(" + stmt + " as double)", TYPE_DOUBLE, 
static_cast<double>(val));
     TestValue("cast(" + stmt + " as real)", TYPE_DOUBLE, 
static_cast<double>(val));
-    TestStringValue("cast(" + stmt + " as string)", lexical_cast<string>(val));
+    if (!convert_lose_precision) {
+      TestStringValue("cast(" + stmt + " as string)", 
lexical_cast<string>(val));
+    }
 
     TestValueOrError("cast(" + stmt + " as tinyint)", TYPE_TINYINT,
         static_cast<int8_t>(val), min_integer_size > sizeof(int8_t),
@@ -1282,9 +1285,9 @@ TimestampValue ExprTest::CreateTestTimestamp(int64_t val) 
{
 
 // Test casting 'stmt' to each of the native types. See the general template 
definition
 // for more information.
-template<>
-void ExprTest::TestCast(const string& stmt, const char* val, int 
min_integer_size,
-    bool float_out_of_range, bool timestamp_out_of_range) {
+template <>
+void ExprTest::TestCast(const string& stmt, const char* val, bool 
convert_lose_precision,
+    int min_integer_size, bool float_out_of_range, bool 
timestamp_out_of_range) {
   try {
     int8_t val8 = static_cast<int8_t>(lexical_cast<int16_t>(val));
 #if 0
@@ -3266,23 +3269,28 @@ TEST_P(ExprTest, CastExprs) {
   TestCast("cast(0.0 as float)", 0.0f);
   TestCast("cast(5.0 as float)", 5.0f);
   TestCast("cast(-5.0 as float)", -5.0f);
-  TestCast("cast(0.1234567890123 as float)", 0.1234567890123f);
-  TestCast("cast(0.1234567890123 as float)", 0.123456791f); // same as above
-  TestCast("cast(0.00000000001234567890123 as float)", 
0.00000000001234567890123f);
-  TestCast("cast(123456 as float)", 123456.0f, 4, false, false);
+  TestCast("cast(0.1234567890123 as float)", 0.1234567890123f, true);
+  TestCast("cast(0.1234567890123 as float)", 0.123456791f, true); // same as 
above
+  TestStringValue("cast(cast(0.1234567890123 as float) as string)", 
"0.12345679");
+  TestCast("cast(0.00000000001234567890123 as float)", 
0.00000000001234567890123f, true);
+  TestStringValue(
+      "cast(cast(0.00000000001234567890123 as float) as string)", 
"1.2345679e-11");
+  TestCast("cast(123456 as float)", 123456.0f, false, 4, false, false);
 
   // From http://en.wikipedia.org/wiki/Single-precision_floating-point_format
   // Min positive normal value
-  TestCast("cast(1.1754944e-38 as float)", 1.1754944e-38f);
+  TestCast("cast(1.1754944e-38 as float)", 1.1754944e-38f, true);
+  TestStringValue("cast(cast(1.1754944e-38 as float) as string)", 
"1.1754944e-38");
   // Max representable value
-  TestCast("cast(3.4028234e38 as float)", 3.4028234e38f, 32, false, true);
+  TestCast("cast(3.4028234e38 as float)", 3.4028234e38f, true, 32, false, 
true);
+  TestStringValue("cast(cast(3.4028234e38 as float) as string)", 
"3.4028235e+38");
 
   // From Double
   TestCast("cast(0.0 as double)", 0.0);
   TestCast("cast(5.0 as double)", 5.0);
   TestCast("cast(-5.0 as double)", -5.0);
-  TestCast("cast(0.123e10 as double)", 0.123e10, 4, false, false);
-  TestCast("cast(123.123e10 as double)", 123.123e10, 8, false, true);
+  TestCast("cast(0.123e10 as double)", 0.123e10, false, 4, false, false);
+  TestCast("cast(123.123e10 as double)", 123.123e10, false, 8, false, true);
   TestCast("cast(1.01234567890123456789 as double)", 1.01234567890123456789);
   TestCast("cast(1.01234567890123456789 as double)", 1.0123456789012346); // 
same as above
   TestCast("cast(0.01234567890123456789 as double)", 0.01234567890123456789);
@@ -3291,14 +3299,18 @@ TEST_P(ExprTest, CastExprs) {
   // casting string to double
   TestCast("cast('0.43149576573887316' as double)", 0.43149576573887316);
   TestCast("cast('-0.43149576573887316' as double)", -0.43149576573887316);
-  TestCast("cast('0.123e10' as double)", 0.123e10, 4, false, false);
-  TestCast("cast('123.123e10' as double)", 123.123e10, 8, false, true);
+  TestCast("cast('0.123e10' as double)", 0.123e10, false, 4, false, false);
+  TestCast("cast('123.123e10' as double)", 123.123e10, false, 8, false, true);
   TestCast("cast('1.01234567890123456789' as double)", 1.01234567890123456789);
 
   // From http://en.wikipedia.org/wiki/Double-precision_floating-point_format
   // Min subnormal positive double
-  TestCast("cast(4.9406564584124654e-324 as double)", 4.9406564584124654e-324);
-  TestCast("cast('4.9406564584124654e-324' as double)", 
4.9406564584124654e-324);
+  TestCast("cast(4.9406564584124654e-324 as double)", 4.9406564584124654e-324, 
true);
+  TestStringValue(
+      "cast(cast(4.9406564584124654e-324 as double) as string)", 
"4.94065645841247e-324");
+  TestCast("cast('4.9406564584124654e-324' as double)", 
4.9406564584124654e-324, true);
+  TestStringValue("cast(cast('4.9406564584124654e-324' as double) as string)",
+      "4.94065645841247e-324");
   // Max subnormal double
   TestCast("cast(2.2250738585072009e-308 as double)", 2.2250738585072009e-308);
   TestCast("cast('2.2250738585072009e-308' as double)", 
2.2250738585072009e-308);
@@ -3306,9 +3318,9 @@ TEST_P(ExprTest, CastExprs) {
   TestCast("cast(2.2250738585072014e-308 as double)", 2.2250738585072014e-308);
   TestCast("cast('2.2250738585072014e-308' as double)", 
2.2250738585072014e-308);
   // Max Double
-  TestCast("cast(1.7976931348623157e+308 as double)", 1.7976931348623157e308,
-      128, true, true);
-  TestCast("cast('1.7976931348623157e+308' as double)", 1.7976931348623157e308,
+  TestCast("cast(1.7976931348623157e+308 as double)", 1.7976931348623157e308, 
false, 128,
+      true, true);
+  TestCast("cast('1.7976931348623157e+308' as double)", 
1.7976931348623157e308, false,
       128, true, true);
 
   // From String
@@ -7301,7 +7313,8 @@ TEST_P(ExprTest, TimestampFunctions) {
       "as bigint)", TYPE_BIGINT, 1293872461);
   // We have some rounding errors going backend to front, so do it as a string.
   TestStringValue("cast(cast (to_utc_timestamp(cast('2011-01-01 01:01:01' "
-      "as timestamp), 'PST') as float) as string)", "1.29387251e+09");
+                  "as timestamp), 'PST') as float) as string)",
+      "1.2938725e+09");
   TestValue("cast(to_utc_timestamp(cast('2011-01-01 01:01:01' as timestamp), 
'PST') "
       "as double)", TYPE_DOUBLE, 1.293872461E9);
   TestValue("cast(to_utc_timestamp(cast('2011-01-01 01:01:01.1' as timestamp), 
'PST') "
@@ -7334,7 +7347,8 @@ TEST_P(ExprTest, TimestampFunctions) {
         1293872461);
     // We have some rounding errors going backend to front, so do it as a 
string.
     TestStringValue("cast(cast(cast('2011-01-01 01:01:01' as timestamp) as 
float)"
-        " as string)", "1.29387251e+09");
+                    " as string)",
+        "1.2938725e+09");
     TestValue("cast(cast('2011-01-01 01:01:01' as timestamp) as double)", 
TYPE_DOUBLE,
         1.293872461E9);
     TestValue("cast(cast('2011-01-01 01:01:01.1' as timestamp) as double)", 
TYPE_DOUBLE,
diff --git a/testdata/workloads/functional-query/queries/QueryTest/exprs.test 
b/testdata/workloads/functional-query/queries/QueryTest/exprs.test
index 09a88ff6b..35330ede8 100644
--- a/testdata/workloads/functional-query/queries/QueryTest/exprs.test
+++ b/testdata/workloads/functional-query/queries/QueryTest/exprs.test
@@ -3294,4 +3294,88 @@ select pmod(0, 0), pmod(0, 0.0);
 NULL,NULL
 ---- TYPES
 BIGINT, DOUBLE
-====
\ No newline at end of file
+====
+---- QUERY: IMPALA-12562
+# Convert double to string
+select cast(round(cast(1.33 as double), 2) as string);
+---- RESULTS
+'1.33'
+---- TYPES
+string
+====
+---- QUERY: IMPALA-12562
+# Convert double to string
+select cast(round(cast(1.33 as double), 1) as string);
+---- RESULTS
+'1.3'
+---- TYPES
+string
+====
+---- QUERY: IMPALA-12562
+# Convert double to string
+select cast(round(cast(1.33 as double), 7) as string);
+---- RESULTS
+'1.33'
+---- TYPES
+string
+====
+---- QUERY: IMPALA-12562
+# Convert float to string
+select cast(round(cast(1.33 as float), 2) as string);
+---- RESULTS
+'1.33'
+---- TYPES
+string
+====
+---- QUERY: IMPALA-12562
+# Convert float to string
+select cast(round(cast(1.33 as float), 1) as string);
+---- RESULTS
+'1.3'
+---- TYPES
+string
+====
+---- QUERY: IMPALA-12562
+# Convert float to string
+select cast(round(cast(1.33 as float), 7) as string);
+---- RESULTS
+'1.33'
+---- TYPES
+string
+====
+---- QUERY: IMPALA-12562
+select cast(round(sum(double_col)/count(1),2) as string) from alltypes;
+---- RESULTS
+'45.45'
+---- TYPES
+string
+====
+---- QUERY: IMPALA-12562
+select cast(round(1/3, 0) as string);
+---- RESULTS
+'0'
+---- TYPES
+string
+====
+---- QUERY: IMPALA-12562
+select cast(round(1/3, 1) as string);
+---- RESULTS
+'0.3'
+---- TYPES
+string
+====
+---- QUERY: IMPALA-12562
+select cast(round(1/3, 17) as string);
+---- RESULTS
+'0.33333333333333331'
+---- TYPES
+string
+====
+---- QUERY: IMPALA-12562
+# Doubles have up to 17 digits of precision
+select cast(round(1/3, 20) as string);
+---- RESULTS
+'0.33333333333333331'
+---- TYPES
+string
+====

(impala) branch master updated: IMPALA-12562: Cast double and float to string with exact precision

Reply via email to