This is an automated email from the ASF dual-hosted git repository.
laszlog pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/impala.git
The following commit(s) were added to refs/heads/master by this push:
new 0d429462f IMPALA-12562: Cast double and float to string with exact
presicion
0d429462f is described below
commit 0d429462f7f61565119ee2e593867a22886d7209
Author: zhangyifan27 <[email protected]>
AuthorDate: Fri May 17 23:28:11 2024 +0800
IMPALA-12562: Cast double and float to string with exact presicion
The builtin functions casttostring(DOUBLE) and casttostring(FLOAT)
printed more digits when converting double and float values to
string values. This patch fixes this by switching to use the existing
methods DoubleToBuffer and FloatToBuffer, which are simple and fast
implementations to print necessary digits.
Testing:
- Add end-to-end tests to verify the fixes
- Add benchmarks for modified functions
- Update tests in expr-test
Change-Id: Icd79c55dd57dc0fa13e4ec11c2284ef2800e8b1a
Reviewed-on: http://gerrit.cloudera.org:8080/21441
Reviewed-by: Impala Public Jenkins <[email protected]>
Tested-by: Impala Public Jenkins <[email protected]>
---
be/src/benchmarks/expr-benchmark.cc | 49 ++++++------
be/src/exprs/cast-functions-ir.cc | 47 ++++++------
be/src/exprs/expr-test.cc | 62 ++++++++++------
.../functional-query/queries/QueryTest/exprs.test | 86 +++++++++++++++++++++-
4 files changed, 173 insertions(+), 71 deletions(-)
diff --git a/be/src/benchmarks/expr-benchmark.cc
b/be/src/benchmarks/expr-benchmark.cc
index a006c8610..fa8d84fbd 100644
--- a/be/src/benchmarks/expr-benchmark.cc
+++ b/be/src/benchmarks/expr-benchmark.cc
@@ -288,33 +288,38 @@ Benchmark* BenchmarkLike(bool codegen) {
return suite;
}
+// Machine Info: AMD EPYC 7K62 48-Core Processor
// Cast: Function iters/ms 10%ile 50%ile 90%ile
10%ile 50%ile 90%ile
//
(relative) (relative) (relative)
//
---------------------------------------------------------------------------------------------------------
-// int_to_int 334 337 340
1X 1X 1X
-// int_to_bool 332 335 338
0.995X 0.994X 0.992X
-// int_to_double 700 707 711
2.1X 2.1X 2.09X
-// int_to_string 125 126 127
0.374X 0.375X 0.374X
-// double_to_boolean 155 156 157
0.464X 0.463X 0.463X
-// double_to_bigint 90 90.6 90.8
0.269X 0.269X 0.267X
-// double_to_string 68 68.8 69.3
0.204X 0.204X 0.204X
-// string_to_int 229 231 232
0.684X 0.685X 0.682X
-// string_to_float 103 104 105
0.309X 0.308X 0.308X
-// string_to_timestamp 39.9 40.1 40.5
0.119X 0.119X 0.119X
+// int_to_int 161 167 169
1X 1X 1X
+// int_to_bool 191 201 205
1.18X 1.2X 1.21X
+// int_to_double 506 522 529
3.14X 3.13X 3.13X
+// int_to_string 28.2 29.3 30.1
0.175X 0.176X 0.178X
+// double_to_boolean 67.6 69.7 71.4
0.42X 0.418X 0.422X
+// double_to_bigint 48.1 49 49.8
0.299X 0.294X 0.295X
+// decimal_to_string 22.4 22.8 23
0.139X 0.137X 0.136X
+// double_to_string 7.55 7.75 7.83
0.0468X 0.0464X 0.0463X
+// float_to_string 7.97 8.15 8.24
0.0495X 0.0489X 0.0488X
+// string_to_int 138 142 147
0.859X 0.854X 0.87X
+// string_to_float 57.7 59.3 60.2
0.358X 0.355X 0.356X
+// string_to_timestamp 22.4 23.2 23.5
0.139X 0.139X 0.139X
//
// CastCodegen: Function iters/ms 10%ile 50%ile 90%ile
10%ile 50%ile 90%ile
//
(relative) (relative) (relative)
//
---------------------------------------------------------------------------------------------------------
-// int_to_int 824 830 837
1X 1X 1X
-// int_to_bool 815 821 828
0.989X 0.989X 0.989X
-// int_to_double 778 783 789
0.944X 0.943X 0.943X
-// int_to_string 167 169 171
0.203X 0.203X 0.204X
-// double_to_boolean 819 826 833
0.994X 0.994X 0.995X
-// double_to_bigint 777 783 792
0.943X 0.943X 0.946X
-// double_to_string 139 140 141
0.168X 0.168X 0.168X
-// string_to_int 369 372 375
0.448X 0.448X 0.448X
-// string_to_float 123 124 125
0.15X 0.15X 0.15X
-// string_to_timestamp 44.8 45.1 45.4
0.0544X 0.0543X 0.0542X
+// int_to_int 166 167 169
1X 1X 1X
+// int_to_bool 198 202 204
1.19X 1.21X 1.21X
+// int_to_double 521 526 531
3.14X 3.15X 3.14X
+// int_to_string 28.9 29.7 30.5
0.174X 0.178X 0.18X
+// double_to_boolean 68.7 70.1 71.4
0.414X 0.419X 0.422X
+// double_to_bigint 48.4 49.2 49.8
0.292X 0.294X 0.295X
+// decimal_to_string 22.5 22.8 23.2
0.136X 0.137X 0.137X
+// double_to_string 7.64 7.75 7.83
0.046X 0.0463X 0.0463X
+// float_to_string 8.02 8.15 8.3
0.0483X 0.0487X 0.0491X
+// string_to_int 140 145 147
0.847X 0.869X 0.868X
+// string_to_float 58.6 59.4 60.6
0.353X 0.355X 0.358X
+// string_to_timestamp 23 23.3 23.7
0.138X 0.139X 0.14X
Benchmark* BenchmarkCast(bool codegen) {
Benchmark* suite = new Benchmark(BenchmarkName("Cast", codegen));
BENCHMARK("int_to_int", "cast(1 as INT)");
@@ -323,7 +328,9 @@ Benchmark* BenchmarkCast(bool codegen) {
BENCHMARK("int_to_string", "cast(1 as STRING)");
BENCHMARK("double_to_boolean", "cast(3.14 as BOOLEAN)");
BENCHMARK("double_to_bigint", "cast(3.14 as BIGINT)");
- BENCHMARK("double_to_string", "cast(3.14 as STRING)");
+ BENCHMARK("decimal_to_string", "cast(3.14 as STRING)");
+ BENCHMARK("double_to_string", "cast(cast(3.14 as DOUBLE) as STRING)");
+ BENCHMARK("float_to_string", "cast(cast(3.14 as FLOAT) as STRING)");
BENCHMARK("string_to_int", "cast('1234' as INT)");
BENCHMARK("string_to_float", "cast('1234.5678' as FLOAT)");
BENCHMARK("string_to_timestamp", "cast('2011-10-22 09:10:11' as TIMESTAMP)");
diff --git a/be/src/exprs/cast-functions-ir.cc
b/be/src/exprs/cast-functions-ir.cc
index 7f4bc0a32..bce7262e6 100644
--- a/be/src/exprs/cast-functions-ir.cc
+++ b/be/src/exprs/cast-functions-ir.cc
@@ -26,16 +26,16 @@
#include <gutil/strings/numbers.h>
#include <gutil/strings/substitute.h>
+#include "common/names.h"
#include "exprs/anyval-util.h"
#include "exprs/cast-format-expr.h"
#include "exprs/decimal-functions.h"
+#include "gutil/strings/numbers.h"
#include "runtime/string-value.inline.h"
#include "runtime/timestamp-value.h"
#include "runtime/timestamp-value.inline.h"
#include "util/string-parser.h"
-#include "common/names.h"
-
using namespace impala;
using namespace impala_udf;
using namespace datetime_parse_util;
@@ -338,31 +338,28 @@ CAST_EXACT_NUMERIC_TO_STRING(SmallIntVal,
MAX_SMALLINT_CHARS, FastInt32ToBufferL
CAST_EXACT_NUMERIC_TO_STRING(IntVal, MAX_INT_CHARS, FastInt32ToBufferLeft);
CAST_EXACT_NUMERIC_TO_STRING(BigIntVal, MAX_BIGINT_CHARS,
FastInt64ToBufferLeft);
-
-#define CAST_FLOAT_TO_STRING(float_type, format) \
- StringVal CastFunctions::CastToStringVal(FunctionContext* ctx, const
float_type& val) { \
- if (val.is_null) return StringVal::null(); \
- /* val.val could be -nan, return "nan" instead */ \
- if (std::isnan(val.val)) return StringVal("nan"); \
- /* Add 1 to MAX_FLOAT_CHARS since snprintf adds a trailing '\0' */ \
- StringVal sv(ctx, MAX_FLOAT_CHARS + 1); \
- if (UNLIKELY(sv.is_null)) { \
- DCHECK(!ctx->impl()->state()->GetQueryStatus().ok()); \
- return sv; \
- } \
- sv.len = snprintf(reinterpret_cast<char*>(sv.ptr), sv.len, format,
val.val); \
- DCHECK_GT(sv.len, 0); \
- DCHECK_LE(sv.len, MAX_FLOAT_CHARS); \
- AnyValUtil::TruncateIfNecessary(ctx->GetReturnType(), &sv); \
- return sv; \
+#define CAST_FLOAT_TO_STRING(float_type, convert_method, buffer_size)
\
+ StringVal CastFunctions::CastToStringVal(
\
+ FunctionContext* ctx, const float_type& val) {
\
+ if (val.is_null) return StringVal::null();
\
+ /* val.val could be -nan, return "nan" instead */
\
+ if (std::isnan(val.val)) return StringVal("nan");
\
+ StringVal sv(ctx, buffer_size);
\
+ if (UNLIKELY(sv.is_null)) {
\
+ DCHECK(!ctx->impl()->state()->GetQueryStatus().ok());
\
+ return sv;
\
+ }
\
+ sv.len = strlen(convert_method(val.val, reinterpret_cast<char*>(sv.ptr)));
\
+ DCHECK_GT(sv.len, 0);
\
+ DCHECK_LE(sv.len, MAX_FLOAT_CHARS);
\
+ AnyValUtil::TruncateIfNecessary(ctx->GetReturnType(), &sv);
\
+ return sv;
\
}
-// Floats have up to 9 significant digits, doubles up to 17
-// (see http://en.wikipedia.org/wiki/Single-precision_floating-point_format
-// and http://en.wikipedia.org/wiki/Double-precision_floating-point_format)
-CAST_FLOAT_TO_STRING(FloatVal, "%.9g");
-CAST_FLOAT_TO_STRING(DoubleVal, "%.17g");
-
+// Convert a double or float to a string and produce the exact same original
precision.
+// See gutil/strings/numbers.h and gutil/strings/numbers.cc for more details.
+CAST_FLOAT_TO_STRING(FloatVal, FloatToBuffer, kFloatToBufferSize);
+CAST_FLOAT_TO_STRING(DoubleVal, DoubleToBuffer, kDoubleToBufferSize);
StringVal CastFunctions::CastToStringVal(FunctionContext* ctx, const
TimestampVal& val) {
DCHECK(ctx != nullptr);
diff --git a/be/src/exprs/expr-test.cc b/be/src/exprs/expr-test.cc
index 1cb0ee88d..61e53933f 100644
--- a/be/src/exprs/expr-test.cc
+++ b/be/src/exprs/expr-test.cc
@@ -1211,13 +1211,16 @@ class ExprTest : public
testing::TestWithParam<std::tuple<bool, bool>> {
// signed integer expected to be able to hold the value.
'float_out_of_range' should be
// set to true if the value does not fit in a single precision float. The
expected
// result is 'val' for the types that can hold the value and error for other
types.
- template<typename T>
- void TestCast(const string& stmt, T val, int min_integer_size = 1,
- bool float_out_of_range = false, bool timestamp_out_of_range = false) {
+ template <typename T>
+ void TestCast(const string& stmt, T val, bool convert_lose_precision = false,
+ int min_integer_size = 1, bool float_out_of_range = false,
+ bool timestamp_out_of_range = false) {
TestValue("cast(" + stmt + " as boolean)", TYPE_BOOLEAN,
static_cast<bool>(val));
TestValue("cast(" + stmt + " as double)", TYPE_DOUBLE,
static_cast<double>(val));
TestValue("cast(" + stmt + " as real)", TYPE_DOUBLE,
static_cast<double>(val));
- TestStringValue("cast(" + stmt + " as string)", lexical_cast<string>(val));
+ if (!convert_lose_precision) {
+ TestStringValue("cast(" + stmt + " as string)",
lexical_cast<string>(val));
+ }
TestValueOrError("cast(" + stmt + " as tinyint)", TYPE_TINYINT,
static_cast<int8_t>(val), min_integer_size > sizeof(int8_t),
@@ -1282,9 +1285,9 @@ TimestampValue ExprTest::CreateTestTimestamp(int64_t val)
{
// Test casting 'stmt' to each of the native types. See the general template
definition
// for more information.
-template<>
-void ExprTest::TestCast(const string& stmt, const char* val, int
min_integer_size,
- bool float_out_of_range, bool timestamp_out_of_range) {
+template <>
+void ExprTest::TestCast(const string& stmt, const char* val, bool
convert_lose_precision,
+ int min_integer_size, bool float_out_of_range, bool
timestamp_out_of_range) {
try {
int8_t val8 = static_cast<int8_t>(lexical_cast<int16_t>(val));
#if 0
@@ -3266,23 +3269,28 @@ TEST_P(ExprTest, CastExprs) {
TestCast("cast(0.0 as float)", 0.0f);
TestCast("cast(5.0 as float)", 5.0f);
TestCast("cast(-5.0 as float)", -5.0f);
- TestCast("cast(0.1234567890123 as float)", 0.1234567890123f);
- TestCast("cast(0.1234567890123 as float)", 0.123456791f); // same as above
- TestCast("cast(0.00000000001234567890123 as float)",
0.00000000001234567890123f);
- TestCast("cast(123456 as float)", 123456.0f, 4, false, false);
+ TestCast("cast(0.1234567890123 as float)", 0.1234567890123f, true);
+ TestCast("cast(0.1234567890123 as float)", 0.123456791f, true); // same as
above
+ TestStringValue("cast(cast(0.1234567890123 as float) as string)",
"0.12345679");
+ TestCast("cast(0.00000000001234567890123 as float)",
0.00000000001234567890123f, true);
+ TestStringValue(
+ "cast(cast(0.00000000001234567890123 as float) as string)",
"1.2345679e-11");
+ TestCast("cast(123456 as float)", 123456.0f, false, 4, false, false);
// From http://en.wikipedia.org/wiki/Single-precision_floating-point_format
// Min positive normal value
- TestCast("cast(1.1754944e-38 as float)", 1.1754944e-38f);
+ TestCast("cast(1.1754944e-38 as float)", 1.1754944e-38f, true);
+ TestStringValue("cast(cast(1.1754944e-38 as float) as string)",
"1.1754944e-38");
// Max representable value
- TestCast("cast(3.4028234e38 as float)", 3.4028234e38f, 32, false, true);
+ TestCast("cast(3.4028234e38 as float)", 3.4028234e38f, true, 32, false,
true);
+ TestStringValue("cast(cast(3.4028234e38 as float) as string)",
"3.4028235e+38");
// From Double
TestCast("cast(0.0 as double)", 0.0);
TestCast("cast(5.0 as double)", 5.0);
TestCast("cast(-5.0 as double)", -5.0);
- TestCast("cast(0.123e10 as double)", 0.123e10, 4, false, false);
- TestCast("cast(123.123e10 as double)", 123.123e10, 8, false, true);
+ TestCast("cast(0.123e10 as double)", 0.123e10, false, 4, false, false);
+ TestCast("cast(123.123e10 as double)", 123.123e10, false, 8, false, true);
TestCast("cast(1.01234567890123456789 as double)", 1.01234567890123456789);
TestCast("cast(1.01234567890123456789 as double)", 1.0123456789012346); //
same as above
TestCast("cast(0.01234567890123456789 as double)", 0.01234567890123456789);
@@ -3291,14 +3299,18 @@ TEST_P(ExprTest, CastExprs) {
// casting string to double
TestCast("cast('0.43149576573887316' as double)", 0.43149576573887316);
TestCast("cast('-0.43149576573887316' as double)", -0.43149576573887316);
- TestCast("cast('0.123e10' as double)", 0.123e10, 4, false, false);
- TestCast("cast('123.123e10' as double)", 123.123e10, 8, false, true);
+ TestCast("cast('0.123e10' as double)", 0.123e10, false, 4, false, false);
+ TestCast("cast('123.123e10' as double)", 123.123e10, false, 8, false, true);
TestCast("cast('1.01234567890123456789' as double)", 1.01234567890123456789);
// From http://en.wikipedia.org/wiki/Double-precision_floating-point_format
// Min subnormal positive double
- TestCast("cast(4.9406564584124654e-324 as double)", 4.9406564584124654e-324);
- TestCast("cast('4.9406564584124654e-324' as double)",
4.9406564584124654e-324);
+ TestCast("cast(4.9406564584124654e-324 as double)", 4.9406564584124654e-324,
true);
+ TestStringValue(
+ "cast(cast(4.9406564584124654e-324 as double) as string)",
"4.94065645841247e-324");
+ TestCast("cast('4.9406564584124654e-324' as double)",
4.9406564584124654e-324, true);
+ TestStringValue("cast(cast('4.9406564584124654e-324' as double) as string)",
+ "4.94065645841247e-324");
// Max subnormal double
TestCast("cast(2.2250738585072009e-308 as double)", 2.2250738585072009e-308);
TestCast("cast('2.2250738585072009e-308' as double)",
2.2250738585072009e-308);
@@ -3306,9 +3318,9 @@ TEST_P(ExprTest, CastExprs) {
TestCast("cast(2.2250738585072014e-308 as double)", 2.2250738585072014e-308);
TestCast("cast('2.2250738585072014e-308' as double)",
2.2250738585072014e-308);
// Max Double
- TestCast("cast(1.7976931348623157e+308 as double)", 1.7976931348623157e308,
- 128, true, true);
- TestCast("cast('1.7976931348623157e+308' as double)", 1.7976931348623157e308,
+ TestCast("cast(1.7976931348623157e+308 as double)", 1.7976931348623157e308,
false, 128,
+ true, true);
+ TestCast("cast('1.7976931348623157e+308' as double)",
1.7976931348623157e308, false,
128, true, true);
// From String
@@ -7301,7 +7313,8 @@ TEST_P(ExprTest, TimestampFunctions) {
"as bigint)", TYPE_BIGINT, 1293872461);
// We have some rounding errors going backend to front, so do it as a string.
TestStringValue("cast(cast (to_utc_timestamp(cast('2011-01-01 01:01:01' "
- "as timestamp), 'PST') as float) as string)", "1.29387251e+09");
+ "as timestamp), 'PST') as float) as string)",
+ "1.2938725e+09");
TestValue("cast(to_utc_timestamp(cast('2011-01-01 01:01:01' as timestamp),
'PST') "
"as double)", TYPE_DOUBLE, 1.293872461E9);
TestValue("cast(to_utc_timestamp(cast('2011-01-01 01:01:01.1' as timestamp),
'PST') "
@@ -7334,7 +7347,8 @@ TEST_P(ExprTest, TimestampFunctions) {
1293872461);
// We have some rounding errors going backend to front, so do it as a
string.
TestStringValue("cast(cast(cast('2011-01-01 01:01:01' as timestamp) as
float)"
- " as string)", "1.29387251e+09");
+ " as string)",
+ "1.2938725e+09");
TestValue("cast(cast('2011-01-01 01:01:01' as timestamp) as double)",
TYPE_DOUBLE,
1.293872461E9);
TestValue("cast(cast('2011-01-01 01:01:01.1' as timestamp) as double)",
TYPE_DOUBLE,
diff --git a/testdata/workloads/functional-query/queries/QueryTest/exprs.test
b/testdata/workloads/functional-query/queries/QueryTest/exprs.test
index 09a88ff6b..35330ede8 100644
--- a/testdata/workloads/functional-query/queries/QueryTest/exprs.test
+++ b/testdata/workloads/functional-query/queries/QueryTest/exprs.test
@@ -3294,4 +3294,88 @@ select pmod(0, 0), pmod(0, 0.0);
NULL,NULL
---- TYPES
BIGINT, DOUBLE
-====
\ No newline at end of file
+====
+---- QUERY: IMPALA-12562
+# Convert double to string
+select cast(round(cast(1.33 as double), 2) as string);
+---- RESULTS
+'1.33'
+---- TYPES
+string
+====
+---- QUERY: IMPALA-12562
+# Convert double to string
+select cast(round(cast(1.33 as double), 1) as string);
+---- RESULTS
+'1.3'
+---- TYPES
+string
+====
+---- QUERY: IMPALA-12562
+# Convert double to string
+select cast(round(cast(1.33 as double), 7) as string);
+---- RESULTS
+'1.33'
+---- TYPES
+string
+====
+---- QUERY: IMPALA-12562
+# Convert float to string
+select cast(round(cast(1.33 as float), 2) as string);
+---- RESULTS
+'1.33'
+---- TYPES
+string
+====
+---- QUERY: IMPALA-12562
+# Convert float to string
+select cast(round(cast(1.33 as float), 1) as string);
+---- RESULTS
+'1.3'
+---- TYPES
+string
+====
+---- QUERY: IMPALA-12562
+# Convert float to string
+select cast(round(cast(1.33 as float), 7) as string);
+---- RESULTS
+'1.33'
+---- TYPES
+string
+====
+---- QUERY: IMPALA-12562
+select cast(round(sum(double_col)/count(1),2) as string) from alltypes;
+---- RESULTS
+'45.45'
+---- TYPES
+string
+====
+---- QUERY: IMPALA-12562
+select cast(round(1/3, 0) as string);
+---- RESULTS
+'0'
+---- TYPES
+string
+====
+---- QUERY: IMPALA-12562
+select cast(round(1/3, 1) as string);
+---- RESULTS
+'0.3'
+---- TYPES
+string
+====
+---- QUERY: IMPALA-12562
+select cast(round(1/3, 17) as string);
+---- RESULTS
+'0.33333333333333331'
+---- TYPES
+string
+====
+---- QUERY: IMPALA-12562
+# Doubles have up to 17 digits of precision
+select cast(round(1/3, 20) as string);
+---- RESULTS
+'0.33333333333333331'
+---- TYPES
+string
+====