This is an automated email from the ASF dual-hosted git repository. michaelsmith pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/impala.git
commit b858f2acdee77b4651ac84a7063fddf8fb0caae1 Author: Michael Smith <[email protected]> AuthorDate: Mon Jan 30 12:12:27 2023 -0800 IMPALA-11883: Calculate erasure-coded bytes read directly Calculate the metric erasure-coded-bytes-read directly from HDFS reads rather than through hdfsFileGetReadStatistics. This allows us to use it for other filesystem implementations (Ozone). Also renumbers is_erasure_coded in THdfsFileSplit to 8, where it was originally before it was removed by IMPALA-9485 (and never replaced). Testing: - ran updated test_io_metrics.py with Ozone, with and without EC - ran updated test_io_metrics.py with HDFS, with and without EC Change-Id: Ide0fc806590b2328df8068a9a54645d1d1fb137c Reviewed-on: http://gerrit.cloudera.org:8080/19460 Reviewed-by: Joe McDonnell <[email protected]> Tested-by: Michael Smith <[email protected]> --- be/src/runtime/io/hdfs-file-reader.cc | 7 ++++--- be/src/runtime/io/request-context.h | 2 +- common/thrift/PlanNodes.thrift | 6 +++--- tests/query_test/test_io_metrics.py | 3 +-- 4 files changed, 9 insertions(+), 9 deletions(-) diff --git a/be/src/runtime/io/hdfs-file-reader.cc b/be/src/runtime/io/hdfs-file-reader.cc index d880264c5..1d814dcc8 100644 --- a/be/src/runtime/io/hdfs-file-reader.cc +++ b/be/src/runtime/io/hdfs-file-reader.cc @@ -238,6 +238,10 @@ Status HdfsFileReader::ReadFromPos(DiskQueue* queue, int64_t file_offset, uint8_ bool is_first_read = (num_remote_bytes_ == 0); // Collect and accumulate statistics GetHdfsStatistics(hdfs_file, log_slow_read); + if (scan_range_->is_erasure_coded()) { + scan_range_->reader_->bytes_read_ec_.Add(current_bytes_read); + } + if (FLAGS_fs_trace_remote_reads && expected_local_ && num_remote_bytes_ > 0 && is_first_read) { // Only log the first unexpected remote read for scan range @@ -404,9 +408,6 @@ void HdfsFileReader::GetHdfsStatistics(hdfsFile hdfs_file, bool log_stats) { scan_range_->reader_->bytes_read_short_circuit_.Add( stats->totalShortCircuitBytesRead); scan_range_->reader_->bytes_read_dn_cache_.Add(stats->totalZeroCopyBytesRead); - if (scan_range_->is_erasure_coded()) { - scan_range_->reader_->bytes_read_ec_.Add(stats->totalBytesRead); - } if (stats->totalLocalBytesRead != stats->totalBytesRead) { num_remote_bytes_ += stats->totalBytesRead - stats->totalLocalBytesRead; } diff --git a/be/src/runtime/io/request-context.h b/be/src/runtime/io/request-context.h index 646908374..585a28f47 100644 --- a/be/src/runtime/io/request-context.h +++ b/be/src/runtime/io/request-context.h @@ -400,7 +400,7 @@ class RequestContext { /// Total number of bytes read from date node cache, updated at end of each range scan AtomicInt64 bytes_read_dn_cache_{0}; - /// Total number of erasure-coded bytes read, updated at end of each range scan + /// Total number of erasure-coded bytes read AtomicInt64 bytes_read_ec_{0}; /// Total number of bytes from remote reads that were expected to be local. diff --git a/common/thrift/PlanNodes.thrift b/common/thrift/PlanNodes.thrift index d76a25fa6..86519e4d1 100644 --- a/common/thrift/PlanNodes.thrift +++ b/common/thrift/PlanNodes.thrift @@ -218,6 +218,9 @@ struct THdfsFileSplit { // last modified time of the file 7: required i64 mtime + // Whether the HDFS file is stored with erasure coding. + 8: optional bool is_erasure_coded + // Hash of the partition's path. This must be hashed with a hash algorithm that is // consistent across different processes and machines. This is currently using // Java's String.hashCode(), which is consistent. For testing purposes, this can use @@ -227,9 +230,6 @@ struct THdfsFileSplit { // The absolute path of the file, it's used only when data files are outside of // the Iceberg table location (IMPALA-11507). 10: optional string absolute_path - - // Whether the HDFS file is stored with erasure coding. - 11: optional bool is_erasure_coded } // Key range for single THBaseScanNode. Corresponds to HBaseKeyRangePB and should be kept diff --git a/tests/query_test/test_io_metrics.py b/tests/query_test/test_io_metrics.py index f75d14997..b45a0f8e7 100644 --- a/tests/query_test/test_io_metrics.py +++ b/tests/query_test/test_io_metrics.py @@ -47,8 +47,7 @@ class TestIOMetrics(ImpalaTestSuite): def append_metric(metric, expect_nonzero): (expect_nonzero_metrics if expect_nonzero else expect_zero_metrics).append(metric) - # IMPALA-11697: these come from getReadStatistics, which is only implemented for HDFS - append_metric("impala-server.io-mgr.erasure-coded-bytes-read", IS_HDFS and IS_EC) + append_metric("impala-server.io-mgr.erasure-coded-bytes-read", IS_EC) append_metric("impala-server.io-mgr.short-circuit-bytes-read", IS_HDFS and not IS_DOCKERIZED_TEST_CLUSTER) append_metric("impala-server.io-mgr.local-bytes-read",
