This is an automated email from the ASF dual-hosted git repository. dbecker pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/impala.git
commit 607ae742e5b12df12e2e2e64c890190a02beb765 Author: Kurt Deschler <[email protected]> AuthorDate: Thu May 11 08:32:11 2023 -0500 IMPALA-12134: Optimize row materialization time This patch improves row materialization time by providing specialized formatting logic for default date and timestamp formats. For Beeswax protocol, performance is also improved by caching deserialized column metadata to avoid unnecessary per-row cost. Benchmarks: - Manually tested mixed datatype table showed ~20% reduction in row materialization time - Added cases to date-benchmark for new formatters. Date formatting improved by 3x and timestamp by 2x Machine Info: Intel(R) Core(TM) i7-8700 CPU @ 3.20GHz ToYearMonthDay: Function iters/ms ------------------------------------------------------- TestCctzToYearMonthDay 16.5 TestToYearMonthDay 61.1 TestToYear 280 TestToString 18 TestToString_stringstream 1.86 TestDefaultDateToCharBuf 25.5 TestTimestampToString 11.7 TestDefaultTimestampToCharBuf 15.7 Testing: - Ran core tests Change-Id: I1ef5e4137fa6c2d0a5f08b430e01e3fb7de86330 Reviewed-on: http://gerrit.cloudera.org:8080/19875 Reviewed-by: Impala Public Jenkins <[email protected]> Tested-by: Impala Public Jenkins <[email protected]> --- be/src/benchmarks/date-benchmark.cc | 105 +++++++++++++++++++-- be/src/runtime/date-parse-util.cc | 18 ++++ be/src/runtime/date-parse-util.h | 3 + be/src/runtime/date-value.cc | 16 +++- .../runtime/datetime-simple-date-format-parser.h | 13 +++ be/src/runtime/timestamp-parse-util.cc | 27 ++++++ be/src/runtime/timestamp-parse-util.h | 4 + be/src/runtime/timestamp-value.cc | 21 ++++- be/src/service/query-result-set.cc | 24 +++-- 9 files changed, 210 insertions(+), 21 deletions(-) diff --git a/be/src/benchmarks/date-benchmark.cc b/be/src/benchmarks/date-benchmark.cc index 2c049ce69..84ccd6324 100644 --- a/be/src/benchmarks/date-benchmark.cc +++ b/be/src/benchmarks/date-benchmark.cc @@ -24,6 +24,8 @@ #include "gutil/basictypes.h" #include "runtime/date-value.h" 
#include "runtime/datetime-simple-date-format-parser.h" +#include "runtime/date-parse-util.h" +#include "runtime/timestamp-parse-util.h" #include "util/benchmark.h" #include "util/cpu-info.h" @@ -34,21 +36,30 @@ using std::mt19937; using std::uniform_int_distribution; using namespace impala; +using datetime_parse_util::SimpleDateFormatTokenizer; // ToYearMonthDay: Function iters/ms 10%ile 50%ile 90%ile 10%ile 50%ile 90%ile // (relative) (relative) (relative) // --------------------------------------------------------------------------------------------------------- -// TestCctzToYearMonthDay 23.1 23.4 23.7 1X 1X 1X -// TestToYearMonthDay 68 69.6 70.7 2.95X 2.98X 2.98X -// TestToYear 443 446 448 19.2X 19.1X 18.9X -// TestToString 9.02 9.04 9.06 0.391X 0.386X 0.382X -// TestToString_stringstream 2.04 2.04 2.08 0.0883X 0.0871X 0.0875X +// TestCctzToYearMonthDay 16.5 16.6 16.7 1X 1X 1X +// TestToYearMonthDay 61.1 62.1 62.3 3.69X 3.75X 3.73X +// TestToYear 280 308 308 16.9X 18.6X 18.5X +// TestToString 18 19.5 19.7 1.09X 1.18X 1.18X +// TestToString_stringstream 1.86 2.08 2.12 0.113X 0.125X 0.127X +// TestDefaultDateToCharBuf 25.5 27 27.2 1.54X 1.63X 1.63X +// TestTimestampToString 11.7 12.6 12.6 0.707X 0.76X 0.757X +// TestDefaultTimestampToCharBuf 15.7 17.2 17.2 0.949X 1.04X 1.03X + + + + const cctz::civil_day EPOCH_DATE(1970, 1, 1); class TestData { public: - void AddRandomRange(const DateValue& dv_min, const DateValue& dv_max, int data_size) { + void AddRandomRange(const DateValue& dv_min, const DateValue& dv_max, + int data_size) { DCHECK(dv_min.IsValid()); DCHECK(dv_max.IsValid()); @@ -60,18 +71,29 @@ public: mt19937 gen(rd()); // Random values in a [min_dse..max_dse] days range. uniform_int_distribution<int32_t> dis_dse(min_dse, max_dse); + uniform_int_distribution<int64_t> dis_utc(-9223372036, 9223372036); // Add random DateValue values in the [dv_min, dv_max] range. 
for (int i = 0; i <= data_size; ++i) { DateValue dv(dis_dse(gen)); DCHECK(dv.IsValid()); date_.push_back(dv); + timestamp_.push_back(TimestampValue::FromUnixTime(dis_utc(gen), UTCPTR)); } cctz_to_ymd_result_.resize(date_.size()); to_ymd_result_.resize(date_.size()); to_year_result_.resize(date_.size()); to_string_result_.resize(date_.size()); to_string_old_result_.resize(date_.size()); + date_to_char_buf_result_.resize(timestamp_.size()); + timestamp_to_char_buf_result_.resize(timestamp_.size()); + timestamp_to_string_result_.resize(timestamp_.size()); + for(int i = 0; i < timestamp_.size(); ++i) { + timestamp_to_char_buf_result_[i].reserve( + SimpleDateFormatTokenizer::DEFAULT_DATE_TIME_FMT_LEN); + timestamp_to_string_result_[i].reserve( + SimpleDateFormatTokenizer::DEFAULT_DATE_TIME_FMT_LEN); + } } void CctzToYearMonthDay(const DateValue& dv, int* year, int* month, int* day) const { @@ -129,6 +151,42 @@ public: } } + void TestDefaultDateToCharBuf(int batch_size) { + for (int i = 0; i < batch_size; ++i) { + int n = date_.size(); + for (int j = 0; j < n; ++j) { + date_to_char_buf_result_[j].resize( + SimpleDateFormatTokenizer::DEFAULT_DATE_FMT_LEN); + DateParser::FormatDefault(date_[j], &(date_to_char_buf_result_[j][0])); + } + } + } + + void TestTimestampToString(int batch_size) { + for (int i = 0; i < batch_size; ++i) { + int n = timestamp_.size(); + for (int j = 0; j < n; ++j) { + timestamp_to_string_result_[j] = timestamp_[j].ToString(); + } + } + } + + void TestDefaultTimestampToCharBuf(int batch_size) { + for (int i = 0; i < batch_size; ++i) { + int n = timestamp_.size(); + for (int j = 0; j < n; ++j) { + const uint32 len = timestamp_[j].time().fractional_seconds() ? 
+ SimpleDateFormatTokenizer::DEFAULT_DATE_TIME_FMT_LEN : + SimpleDateFormatTokenizer::DEFAULT_SHORT_DATE_TIME_FMT_LEN; + + char buf[len + 1]; + TimestampParser::FormatDefault(timestamp_[j].date(), timestamp_[j].time(), buf); + buf[len] = '\0'; + timestamp_to_char_buf_result_[j] = buf; + } + } + } + void TestToString_stringstream(int batch_size) { for (int i = 0; i < batch_size; ++i) { int n = date_.size(); @@ -167,6 +225,18 @@ public: << to_string_result_[i] << " != " << to_string_old_result_[i] << endl; ok = false; } + if (date_to_char_buf_result_[i] != to_string_result_[i]) { + cerr << "Incorrect results (TestDefaultDateToCharBuf() vs ToString()): " + << date_to_char_buf_result_[i] << " != " << to_string_result_[i] << endl; + ok = false; + } + if (timestamp_to_char_buf_result_[i] != timestamp_to_string_result_[i]) { + cerr << "Incorrect results (TestDefaultTimestampToCharBuf()" + << " vs TestTimestampToString()): " + << timestamp_to_char_buf_result_[i] << " != " + << timestamp_to_string_result_[i] << endl; + ok = false; + } } return ok; } @@ -190,11 +260,15 @@ private: }; vector<DateValue> date_; + vector<TimestampValue> timestamp_; vector<YearMonthDayResult> cctz_to_ymd_result_; vector<YearMonthDayResult> to_ymd_result_; vector<int> to_year_result_; vector<string> to_string_result_; vector<string> to_string_old_result_; + vector<string> date_to_char_buf_result_; + vector<string> timestamp_to_char_buf_result_; + vector<string> timestamp_to_string_result_; }; void TestCctzToYearMonthDay(int batch_size, void* d) { @@ -217,6 +291,21 @@ void TestToString(int batch_size, void* d) { data->TestToString(batch_size); } +void TestDefaultDateToCharBuf(int batch_size, void* d) { + TestData* data = reinterpret_cast<TestData*>(d); + data->TestDefaultDateToCharBuf(batch_size); +} + +void TestTimestampToString(int batch_size, void* d) { + TestData* data = reinterpret_cast<TestData*>(d); + data->TestTimestampToString(batch_size); +} + +void TestDefaultTimestampToCharBuf(int 
batch_size, void* d) { + TestData* data = reinterpret_cast<TestData*>(d); + data->TestDefaultTimestampToCharBuf(batch_size); +} + void TestToString_stringstream(int batch_size, void* d) { TestData* data = reinterpret_cast<TestData*>(d); data->TestToString_stringstream(batch_size); @@ -239,6 +328,10 @@ int main(int argc, char* argv[]) { suite.AddBenchmark("TestToYear", TestToYear, &data); suite.AddBenchmark("TestToString", TestToString, &data); suite.AddBenchmark("TestToString_stringstream", TestToString_stringstream, &data); + suite.AddBenchmark("TestDefaultDateToCharBuf", TestDefaultDateToCharBuf, &data); + suite.AddBenchmark("TestTimestampToString", TestTimestampToString, &data); + suite.AddBenchmark("TestDefaultTimestampToCharBuf", TestDefaultTimestampToCharBuf, + &data); cout << suite.Measure(); return data.CheckResults() ? 0 : 1; diff --git a/be/src/runtime/date-parse-util.cc b/be/src/runtime/date-parse-util.cc index ea8698d82..a68532286 100644 --- a/be/src/runtime/date-parse-util.cc +++ b/be/src/runtime/date-parse-util.cc @@ -128,6 +128,24 @@ bool DateParser::ParseIsoSqlFormat(const char* str, int len, return date->IsValid(); } +// Formats date into dst using the default format +// Format: yyyy-MM-dd +// Offsets: 0123456789 +int DateParser::FormatDefault(const DateValue& date, char* dst) { + int year, month, day; + if (!date.ToYearMonthDay(&year, &month, &day)) { + *dst = '\0'; + return -1; + } + else { + ZeroPad(dst, year, 4); + ZeroPad(dst + 5, month, 2); + ZeroPad(dst + 8, day, 2); + dst[7] = dst[4] = '-'; + return SimpleDateFormatTokenizer::DEFAULT_DATE_FMT_LEN; + } +} + string DateParser::Format(const DateTimeFormatContext& dt_ctx, const DateValue& date) { DCHECK(dt_ctx.toks.size() > 0); DCHECK(dt_ctx.has_date_toks && !dt_ctx.has_time_toks); diff --git a/be/src/runtime/date-parse-util.h b/be/src/runtime/date-parse-util.h index 731e8e7be..e0efaeb74 100644 --- a/be/src/runtime/date-parse-util.h +++ b/be/src/runtime/date-parse-util.h @@ -59,6 +59,9 @@ class 
DateParser { const datetime_parse_util::DateTimeFormatContext& dt_ctx, DateValue* date) WARN_UNUSED_RESULT; + /// Optimized formatter for default date format + static int FormatDefault(const DateValue& date, char* dst); + /// Format the date values using the given format context. /// dt_ctx -- date format context /// date -- the date value diff --git a/be/src/runtime/date-value.cc b/be/src/runtime/date-value.cc index 78db95eb0..b0733c22d 100644 --- a/be/src/runtime/date-value.cc +++ b/be/src/runtime/date-value.cc @@ -429,11 +429,23 @@ bool DateValue::MonthsBetween(const DateValue& other, double* months_between) co } string DateValue::ToString() const { - return Format(*SimpleDateFormatTokenizer::GetDefaultDateFormatContext()); + string s; + s.resize(SimpleDateFormatTokenizer::DEFAULT_DATE_FMT_LEN); + const int out_len = DateParser::FormatDefault(*this, s.data()); + if (UNLIKELY(out_len != SimpleDateFormatTokenizer::DEFAULT_DATE_FMT_LEN)) { + s.clear(); + } + return s; } ostream& operator<<(ostream& os, const DateValue& date_value) { - return os << date_value.ToString(); + char dst[SimpleDateFormatTokenizer::DEFAULT_DATE_FMT_LEN + 1]; + const int out_len = DateParser::FormatDefault(date_value, dst); + if (LIKELY(out_len >= 0)) { + dst[out_len] = '\0'; + os << dst; + } + return os; } } diff --git a/be/src/runtime/datetime-simple-date-format-parser.h b/be/src/runtime/datetime-simple-date-format-parser.h index 7bd173cce..95bccbf93 100644 --- a/be/src/runtime/datetime-simple-date-format-parser.h +++ b/be/src/runtime/datetime-simple-date-format-parser.h @@ -182,6 +182,19 @@ public: const DateTimeFormatContext& dt_ctx, DateTimeParseResult* dt_result); }; +/// Helper function for formatting small numbers with leading zeros +/// This is used inline with date and timestamp formatting functions +inline void ZeroPad(char* const dst, uint32 val, const uint32 digits) { + char* p = dst + digits; + while(val) { + *--p = '0' + (val % 10); + val /= 10; + } + while(p != dst) { + *--p 
= '0'; + } +} + } } diff --git a/be/src/runtime/timestamp-parse-util.cc b/be/src/runtime/timestamp-parse-util.cc index 98339dba7..d8d369725 100644 --- a/be/src/runtime/timestamp-parse-util.cc +++ b/be/src/runtime/timestamp-parse-util.cc @@ -227,6 +227,33 @@ bool TimestampParser::ParseIsoSqlFormat(const char* str, int len, return true; } +// Formats date and time into dst using the default format +// Short: yyyy-MM-dd HH:mm:ss +// Long: yyyy-MM-dd HH:mm:ss.SSSSSSSSS +// Offsets: 01234567890123456789012345678 + +int TimestampParser::FormatDefault(const date& d, const time_duration& t, char* dst) { + if (UNLIKELY(d.is_special() || t.is_special())) return -1; + const auto ymd = d.year_month_day(); + ZeroPad(dst, ymd.year, 4); + ZeroPad(dst + 5, ymd.month, 2); + ZeroPad(dst + 8, ymd.day, 2); + const auto tot_sec = t.total_seconds(); + ZeroPad(dst + 11, tot_sec / 3600, 2); + ZeroPad(dst + 14, (tot_sec / 60) % 60, 2); + ZeroPad(dst + 17, tot_sec % 60, 2); + dst[7] = dst[4] = '-'; + dst[10] = ' '; + dst[16] = dst[13] = ':'; + + if (LIKELY(t.fractional_seconds() > 0)) { + dst[19] = '.'; + ZeroPad(dst + 20, t.fractional_seconds(), 9); + return SimpleDateFormatTokenizer::DEFAULT_DATE_TIME_FMT_LEN; + } + return SimpleDateFormatTokenizer::DEFAULT_SHORT_DATE_TIME_FMT_LEN; +} + int TimestampParser::Format(const DateTimeFormatContext& dt_ctx, const date& d, const time_duration& t, int max_length, char* dst) { DCHECK(dt_ctx.toks.size() > 0); diff --git a/be/src/runtime/timestamp-parse-util.h b/be/src/runtime/timestamp-parse-util.h index 60eb8888a..a5c359f8e 100644 --- a/be/src/runtime/timestamp-parse-util.h +++ b/be/src/runtime/timestamp-parse-util.h @@ -73,6 +73,10 @@ class TimestampParser { const datetime_parse_util::DateTimeFormatContext& dt_ctx, boost::gregorian::date* d, boost::posix_time::time_duration* t) WARN_UNUSED_RESULT; + /// Optimized formatter for default short and long formats + static int FormatDefault(const boost::gregorian::date& d, + const 
boost::posix_time::time_duration& t, char* dst); + /// Format the date/time values using the given format context. /// Caller must make sure that it has enough buffer space in 'dst' to hold the output. /// Return total output length that is written into 'dst'. Return -1 If 'd' or 't' is diff --git a/be/src/runtime/timestamp-value.cc b/be/src/runtime/timestamp-value.cc index 26d819944..cc8ef971e 100644 --- a/be/src/runtime/timestamp-value.cc +++ b/be/src/runtime/timestamp-value.cc @@ -180,7 +180,14 @@ void TimestampValue::LocalToUtc(const Timezone& local_tz) { } ostream& operator<<(ostream& os, const TimestampValue& timestamp_value) { - return os << timestamp_value.ToString(); + char dst[SimpleDateFormatTokenizer::DEFAULT_DATE_TIME_FMT_LEN + 1]; + const int out_len = TimestampParser::FormatDefault(timestamp_value.date(), + timestamp_value.time(), dst); + if (LIKELY(out_len >= 0)) { + dst[out_len] = '\0'; + os << dst; + } + return os; } TimestampValue TimestampValue::UnixTimeToLocal( @@ -207,12 +214,20 @@ TimestampValue TimestampValue::FromUnixTime(time_t unix_time, const Timezone* lo } void TimestampValue::ToString(string& dst) const { - Format(*SimpleDateFormatTokenizer::GetDefaultTimestampFormatContext(time_), dst); + dst.resize(SimpleDateFormatTokenizer::DEFAULT_DATE_TIME_FMT_LEN); + const int out_len = TimestampParser::FormatDefault(date(), time(), dst.data()); + if (UNLIKELY(out_len != SimpleDateFormatTokenizer::DEFAULT_DATE_TIME_FMT_LEN)) { + if (UNLIKELY(out_len < 0)) { + dst.clear(); + } else { + dst.resize(out_len); + } + } } string TimestampValue::ToString() const { string dst; - Format(*SimpleDateFormatTokenizer::GetDefaultTimestampFormatContext(time_), dst); + ToString(dst); return dst; } diff --git a/be/src/service/query-result-set.cc b/be/src/service/query-result-set.cc index e4a828709..f32f05fa1 100644 --- a/be/src/service/query-result-set.cc +++ b/be/src/service/query-result-set.cc @@ -60,7 +60,12 @@ class AsciiQueryResultSet : public QueryResultSet 
{ /// Rows are added into 'rowset'. AsciiQueryResultSet(const TResultSetMetadata& metadata, vector<string>* rowset, bool stringify_map_keys) - : metadata_(metadata), result_set_(rowset), stringify_map_keys_(stringify_map_keys) {} + : metadata_(metadata), result_set_(rowset), stringify_map_keys_(stringify_map_keys) { + types_.reserve(metadata.columns.size()); + for (int i = 0; i < metadata.columns.size(); ++i) { + types_.push_back(ColumnType::FromThrift(metadata_.columns[i].columnType)); + } + } virtual ~AsciiQueryResultSet() {} @@ -87,6 +92,9 @@ class AsciiQueryResultSet : public QueryResultSet { // If true, converts map keys to strings; see IMPALA-11778. const bool stringify_map_keys_; + + // De-serialized column metadata + vector<ColumnType> types_; }; /// Result set container for Hive protocol versions >= V6, where results are returned in @@ -210,16 +218,12 @@ Status AsciiQueryResultSet::AddRows(const vector<ScalarExprEvaluator*>& expr_eva // ODBC-187 - ODBC can only take "\t" as the delimiter out_stream << (i > 0 ? "\t" : ""); - if (metadata_.columns[i].columnType.types.size() == 1) { - RawValue::PrintValue(expr_evals[i]->GetValue(it.Get()), - ColumnType::FromThrift(metadata_.columns[i].columnType), scales[i], - &out_stream); - } else if (metadata_.columns[i].columnType.types.size() > 1) { - ColumnType col_type = ColumnType::FromThrift(metadata_.columns[i].columnType); - PrintComplexValue(expr_evals[i], it.Get(), &out_stream, col_type, - stringify_map_keys_); + if (!types_[i].IsComplexType()) { + RawValue::PrintValue(expr_evals[i]->GetValue(it.Get()), types_[i], + scales[i], &out_stream); } else { - DCHECK(false); + PrintComplexValue(expr_evals[i], it.Get(), &out_stream, types_[i], + stringify_map_keys_); } } result_set_->push_back(out_stream.str());
