This is an automated email from the ASF dual-hosted git repository. joemcdonnell pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/impala.git
commit 4e8555031b4364182273954d5d3f50d31ea75ec1 Author: Eyizoha <[email protected]> AuthorDate: Wed Aug 21 13:49:31 2024 +0800 IMPALA-12957: Support reading Inf and NaN from JSON Despite the fact that special values such as Inf and NaN are not supported in standard JSON (they are considered invalid values), rapidjson does support them. However, it requires the parsing flag 'kParseNanAndInfFlag' to be enabled. This patch enables the flag to allows JsonParser to parse special numbers like Inf and NaN. Corresponding modifications have also been made to JsonSkipper to maintain consistent behavior when zero slots scans. Additionally, due to a minor issue in rapidjson v1.1.0 when parsing Inf and NaN, this patch also updates the rapidjson version to include the corresponding fix (see https://gerrit.cloudera.org/#/c/21980/). Testing: - Added and passed relevant test cases (BE, E2E). Change-Id: I05ee7c7c7fb7e78fff9570f659ce2d13c94a4e10 Reviewed-on: http://gerrit.cloudera.org:8080/21701 Reviewed-by: Impala Public Jenkins <[email protected]> Tested-by: Impala Public Jenkins <[email protected]> --- be/src/exec/json/json-parser-test.cc | 15 ++++++------ be/src/exec/json/json-parser.cc | 28 +++++++++++++++++----- be/src/exec/json/json-parser.h | 4 +++- bin/impala-config.sh | 6 ++--- testdata/data/json_test/malformed.json | 4 +++- testdata/data/json_test/overflow.json | 5 +++- .../queries/QueryTest/malformed_json.test | 4 +++- .../queries/QueryTest/overflow_json.test | 5 +++- 8 files changed, 50 insertions(+), 21 deletions(-) diff --git a/be/src/exec/json/json-parser-test.cc b/be/src/exec/json/json-parser-test.cc index b19cd15a9..59dbe9b7b 100644 --- a/be/src/exec/json/json-parser-test.cc +++ b/be/src/exec/json/json-parser-test.cc @@ -100,6 +100,7 @@ class JsonParserTest : public ::testing::TestWithParam<int> { } if (e == kParseErrorNone) { EXPECT_TRUE(res) << v; + EXPECT_TRUE(ss.Eos()); } else { EXPECT_FALSE(res) << v; EXPECT_EQ(js.GetErrorCode(), e) << v; @@ -118,10 +119,10 @@ class JsonParserTest : public ::testing::TestWithParam<int> { // Normal Json {R"({"name": "Linda", "bool": true, "score": 76.3, "address": "Chicago"})", "Linda, true, 76.3, Chicago, "}, - {R"({"name": "Mike", "bool": null, "score": 82.1, "address": "Dallas"})", - "Mike, null, 82.1, Dallas, "}, - {R"({"name": "Sara", "bool": false, "score": 94.8, "address": "Seattle"})", - "Sara, false, 94.8, Seattle, "}, + {R"({"name": "Mike", "bool": null, "score": NaN, "address": "Dallas"})", + "Mike, null, NaN, Dallas, "}, + {R"({"name": "Sara", "bool": false, "score": Inf, "address": "Seattle"})", + "Sara, false, Inf, Seattle, "}, // String with escape or special char. {R"({"name": "Joe\nJoe", "bool": null, "score": 100, "address": "{New}\t{York}"})", @@ -187,6 +188,9 @@ TEST_P(JsonParserTest, JsonSkipperTest) { TestSkip("-9.9", TYPE_NUMBER); TestSkip("2e10", TYPE_NUMBER); TestSkip("-2e-10", TYPE_NUMBER); + TestSkip("Inf", TYPE_NUMBER); + TestSkip("-Infinity", TYPE_NUMBER); + TestSkip("NaN", TYPE_NUMBER); TestSkip("{}", TYPE_OBJECT); TestSkip(R"({"a":null, "b":[1,true,false]})", TYPE_OBJECT); @@ -213,9 +217,6 @@ TEST_P(JsonParserTest, JsonSkipperTest) { TestSkip(R"("你好🙂\")", TYPE_STRING, kParseErrorStringMissQuotationMark); TestSkip(R"("\u009f\u0099\u00\")", TYPE_STRING, kParseErrorStringMissQuotationMark); - TestSkip("Inf", TYPE_NUMBER, kParseErrorValueInvalid); - TestSkip("-Infinity", TYPE_NUMBER, kParseErrorValueInvalid); - TestSkip("NaN", TYPE_NUMBER, kParseErrorValueInvalid); TestSkip("+1", TYPE_NUMBER, kParseErrorValueInvalid); TestSkip(".123", TYPE_NUMBER, kParseErrorValueInvalid); TestSkip("1.", TYPE_NUMBER, kParseErrorNumberMissFraction); diff --git a/be/src/exec/json/json-parser.cc b/be/src/exec/json/json-parser.cc index 4fbfb3d45..847a54dd7 100644 --- a/be/src/exec/json/json-parser.cc +++ b/be/src/exec/json/json-parser.cc @@ -87,8 +87,8 @@ bool JsonParser<Scanner>::MoveToNextJson() { template <class Scanner> Status JsonParser<Scanner>::Parse(int max_rows, int* num_rows) { while (*num_rows < max_rows) { - // TODO: Support Inf and NaN. - constexpr auto parse_flags = kParseNumbersAsStringsFlag | kParseStopWhenDoneFlag; + constexpr auto parse_flags = + kParseNumbersAsStringsFlag | kParseStopWhenDoneFlag | kParseNanAndInfFlag; // Reads characters from the stream, parses them and publishes events to this // handler (JsonParser). reader_.Parse<parse_flags>(stream_, *this); @@ -339,15 +339,31 @@ bool JsonSkipper<Stream>::SkipNumber() { // consistent with the behavior of rapidjson. // Despite the fact that special values such as Inf and NaN are not supported in // standard JSON (they are considered invalid values), rapidjson does support them. - // However, it requires the parsing flag kParseNanAndInfFlag to be enabled. Since this - // flag is not enabled in the current JsonParser::Parse(), this function also remains - // consistent by not supporting Inf and NaN. - // TODO: Support Inf and NaN. + // We have already enabled the parsing flag kParseNanAndInfFlag in the + // JsonParser::Parse() to support parsing Inf and NaN, so this function also supports + // them accordingly. Consume('-'); if (UNLIKELY(s_.Peek() == '0')) { s_.Take(); } else if (LIKELY(s_.Peek() >= '1' && s_.Peek() <= '9')) { while (LIKELY(s_.Peek() >= '0' && s_.Peek() <= '9')) s_.Take(); + } else if (LIKELY(s_.Peek() == 'N')) { + s_.Take(); + ERROR_IF_FALSE(Consume('a'), kParseErrorValueInvalid); + ERROR_IF_FALSE(Consume('N'), kParseErrorValueInvalid); + return true; + } else if (LIKELY(s_.Peek() == 'I')) { + s_.Take(); + ERROR_IF_FALSE(Consume('n'), kParseErrorValueInvalid); + ERROR_IF_FALSE(Consume('f'), kParseErrorValueInvalid); + if (UNLIKELY(s_.Peek() == 'i')) { + s_.Take(); + ERROR_IF_FALSE(Consume('n'), kParseErrorValueInvalid); + ERROR_IF_FALSE(Consume('i'), kParseErrorValueInvalid); + ERROR_IF_FALSE(Consume('t'), kParseErrorValueInvalid); + ERROR_IF_FALSE(Consume('y'), kParseErrorValueInvalid); + } + return true; } else ERROR_IF_FALSE(false, kParseErrorValueInvalid); if (Consume('.')) { diff --git a/be/src/exec/json/json-parser.h b/be/src/exec/json/json-parser.h index 47d72c6bc..efe26e864 100644 --- a/be/src/exec/json/json-parser.h +++ b/be/src/exec/json/json-parser.h @@ -304,9 +304,11 @@ class SimpleStream { public: SimpleStream(const char* str) : current_(str) { } + bool Eos() { return *current_ == '\0'; } + char Peek() { return *current_; } - char Take() { return *current_ == '\0' ? '\0' : *current_++; } + char Take() { return Eos() ? '\0' : *current_++; } private: const char* current_ = nullptr; diff --git a/bin/impala-config.sh b/bin/impala-config.sh index 55712ddac..1e6083d3f 100755 --- a/bin/impala-config.sh +++ b/bin/impala-config.sh @@ -81,8 +81,8 @@ export USE_AVRO_CPP=${USE_AVRO_CPP:=false} # moving to a different build of the toolchain, e.g. when a version is bumped or a # compile option is changed. The build id can be found in the output of the toolchain # build jobs, it is constructed from the build number and toolchain git hash prefix. -export IMPALA_TOOLCHAIN_BUILD_ID_AARCH64=56-810d0f4757 -export IMPALA_TOOLCHAIN_BUILD_ID_X86_64=486-810d0f4757 +export IMPALA_TOOLCHAIN_BUILD_ID_AARCH64=61-3a0ac57d41 +export IMPALA_TOOLCHAIN_BUILD_ID_X86_64=491-3a0ac57d41 export IMPALA_TOOLCHAIN_REPO=\ ${IMPALA_TOOLCHAIN_REPO:-https://github.com/cloudera/native-toolchain.git} export IMPALA_TOOLCHAIN_BRANCH=${IMPALA_TOOLCHAIN_BRANCH:-master} @@ -180,7 +180,7 @@ unset IMPALA_POSTGRES_JDBC_DRIVER_URL export IMPALA_PYTHON_VERSION=2.7.16 unset IMPALA_PYTHON_URL export IMPALA_PYTHON3_VERSION=3.8.18 -export IMPALA_RAPIDJSON_VERSION=1.1.0 +export IMPALA_RAPIDJSON_VERSION=1.1.0-p1 unset IMPALA_RAPIDJSON_URL export IMPALA_RE2_VERSION=2023-03-01 unset IMPALA_RE2_URL diff --git a/testdata/data/json_test/malformed.json b/testdata/data/json_test/malformed.json index 688387c05..72d6abffd 100644 --- a/testdata/data/json_test/malformed.json +++ b/testdata/data/json_test/malformed.json @@ -15,4 +15,6 @@ [ ] ( ) {"string_col":"abc123"} -["string_col", "abc123"] \ No newline at end of file +["string_col", "abc123"] +{"float_col":NaN.2e2, "bool_col":true} +{"float_col":Inf.2e2, "bool_col":true} \ No newline at end of file diff --git a/testdata/data/json_test/overflow.json b/testdata/data/json_test/overflow.json index 6b1dc9789..92f64f310 100644 --- a/testdata/data/json_test/overflow.json +++ b/testdata/data/json_test/overflow.json @@ -3,4 +3,7 @@ {"tinyint_col":-1000,"smallint_col":-100000,"int_col":-10000000000000000,"bigint_col":-10000000000000000000,"float_col":-1e1000000,"double_col":-1e10000,"decimal0_col":-123456789.12341,"decimal1_col":-100000000000000000000000000000000000000,"decimal2_col":-0.000000000000000000000000000000000000009} {"tinyint_col":"1","smallint_col":"2","int_col":"3","bigint_col":"4","float_col":"5.5","double_col":"6.6","decimal0_col":"123456789.1234","decimal1_col":"99999999999999999999999999999999999999","decimal2_col":"0.00000000000000000000000000000000000001"} {"tinyint_col":"1000","smallint_col":"100000","int_col":"10000000000000000","bigint_col":"10000000000000000000","float_col":"1e1000000","double_col":"1e10000","decimal0_col":"1234567890.1234","decimal1_col":"100000000000000000000000000000000000000","decimal2_col":"0.000000000000000000000000000000000000009"} -{"tinyint_col":"-1000","smallint_col":"-100000","int_col":"-10000000000000000","bigint_col":"-10000000000000000000","float_col":"-1e1000000","double_col":"-1e10000","decimal0_col":"-123456789.12341","decimal1_col":"-100000000000000000000000000000000000000","decimal2_col":"-0.000000000000000000000000000000000000009"} \ No newline at end of file +{"tinyint_col":"-1000","smallint_col":"-100000","int_col":"-10000000000000000","bigint_col":"-10000000000000000000","float_col":"-1e1000000","double_col":"-1e10000","decimal0_col":"-123456789.12341","decimal1_col":"-100000000000000000000000000000000000000","decimal2_col":"-0.000000000000000000000000000000000000009"} +{"float_col":NaN,"double_col":-NaN} +{"float_col":Inf,"double_col":-Inf} +{"float_col":Infinity,"double_col":-Infinity} \ No newline at end of file diff --git a/testdata/workloads/functional-query/queries/QueryTest/malformed_json.test b/testdata/workloads/functional-query/queries/QueryTest/malformed_json.test index ea4e8de95..11e41d69f 100644 --- a/testdata/workloads/functional-query/queries/QueryTest/malformed_json.test +++ b/testdata/workloads/functional-query/queries/QueryTest/malformed_json.test @@ -22,11 +22,13 @@ false,9,0.899999976158,'abc123' true,10,1.0,'abc123' NULL,NULL,NULL,'NULL' NULL,NULL,NULL,'abc123' +NULL,NULL,NaN,'NULL' +NULL,NULL,Infinity,'NULL' ==== ---- QUERY select count(*) from malformed_json ---- TYPES bigint ---- RESULTS -13 +15 ==== \ No newline at end of file diff --git a/testdata/workloads/functional-query/queries/QueryTest/overflow_json.test b/testdata/workloads/functional-query/queries/QueryTest/overflow_json.test index 92986adaf..150b952bc 100644 --- a/testdata/workloads/functional-query/queries/QueryTest/overflow_json.test +++ b/testdata/workloads/functional-query/queries/QueryTest/overflow_json.test @@ -17,11 +17,14 @@ tinyint, smallint, int, bigint, float, double 1,2,3,4,5.5,6.6 127,32767,2147483647,9223372036854775807,Infinity,Infinity -128,-32768,-2147483648,-9223372036854775808,-Infinity,-Infinity +NULL,NULL,NULL,NULL,NaN,NaN +NULL,NULL,NULL,NULL,Infinity,-Infinity +NULL,NULL,NULL,NULL,Infinity,-Infinity ==== ---- QUERY select count(*) from overflow_json ---- TYPES bigint ---- RESULTS -6 +9 ==== \ No newline at end of file
