This is an automated email from the ASF dual-hosted git repository. joemcdonnell pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/impala.git
commit a877cde76df7f5435a72fc7ea4a870ca7bf28fb5 Author: Yida Wu <[email protected]> AuthorDate: Mon Mar 24 22:20:41 2025 -0700 IMPALA-13894: Allow slow check in tuple cache correctness verification when file sizes differ Currently, tuple cache correctness verification does a fast check, and returns an error if file sizes are different. This patch allows a slow check when file sizes differ. Because the slow check may provide a clearer error message and help prevent false mismatches when identical rows appear in a different order, which may lead to size differences. Updated TupleTextFileUtilTest for this change. Also fixes argument order in VerifyRows() to correct misleading log output. The previous order was incorrect, causing the log to show the wrong one as the reference file. Tests: Passed core tests. Manually verified that when file sizes differ, the query proceeds to the slow check after this change. Change-Id: I02e031410dac32d9df746201b156783a8b7d9a1a Reviewed-on: http://gerrit.cloudera.org:8080/22661 Reviewed-by: Michael Smith <[email protected]> Reviewed-by: Kurt Deschler <[email protected]> Tested-by: Impala Public Jenkins <[email protected]> --- be/src/exec/tuple-cache-node.cc | 2 +- be/src/exec/tuple-text-file-util-test.cc | 3 +-- be/src/exec/tuple-text-file-util.cc | 42 +++++++++++++++++++------------- 3 files changed, 27 insertions(+), 20 deletions(-) diff --git a/be/src/exec/tuple-cache-node.cc b/be/src/exec/tuple-cache-node.cc index 1911bb845..3ddee4831 100644 --- a/be/src/exec/tuple-cache-node.cc +++ b/be/src/exec/tuple-cache-node.cc @@ -218,7 +218,7 @@ Status TupleCacheNode::VerifyAndMoveDebugCache(RuntimeState* state) { if (verify_status.ok() && !passed) { // Slow path to compare all rows in an order-insensitive way if the files are not the // same. - verify_status = TupleTextFileUtil::VerifyRows(ref_file_path, dump_file_path); + verify_status = TupleTextFileUtil::VerifyRows(dump_file_path, ref_file_path); passed = verify_status.ok(); } diff --git a/be/src/exec/tuple-text-file-util-test.cc b/be/src/exec/tuple-text-file-util-test.cc index e9a808505..a79a4ddfa 100644 --- a/be/src/exec/tuple-text-file-util-test.cc +++ b/be/src/exec/tuple-text-file-util-test.cc @@ -69,8 +69,7 @@ TEST_F(TupleTextFileUtilTest, VerifyDebugDumpCache_DifferentFiles) { bool passed = false; Status status = TupleTextFileUtil::VerifyDebugDumpCache(file1, file2, &passed); - EXPECT_FALSE(status.ok()); - EXPECT_EQ(status.code(), TErrorCode::TUPLE_CACHE_INCONSISTENCY); + EXPECT_TRUE(status.ok()); EXPECT_FALSE(passed); } diff --git a/be/src/exec/tuple-text-file-util.cc b/be/src/exec/tuple-text-file-util.cc index 0b58718e6..81825b376 100644 --- a/be/src/exec/tuple-text-file-util.cc +++ b/be/src/exec/tuple-text-file-util.cc @@ -128,6 +128,19 @@ Status TupleTextFileUtil::VerifyRows( return VerifyRowCount(cache); } +static Status ValidateAndOpenFile(const string& path, TupleTextFileReader& reader) { + if (!reader.Open().ok()) { + return Status(TErrorCode::TUPLE_CACHE_INCONSISTENCY, + Substitute("Failed to open file '$0'", path + DEBUG_TUPLE_CACHE_BAD_POSTFIX)); + } + + if (reader.GetFileSize() == TUPLE_TEXT_FILE_SIZE_ERROR) { + return Status(TErrorCode::TUPLE_CACHE_INCONSISTENCY, + Substitute("Size of file '$0' is invalid", path + DEBUG_TUPLE_CACHE_BAD_POSTFIX)); + } + return Status::OK(); +} + static Status CacheFileCmp( const std::string& path_a, const std::string& path_b, bool* passed) { DCHECK(passed != nullptr); @@ -137,25 +150,20 @@ static Status CacheFileCmp( TupleTextFileReader reader_a(path_a); TupleTextFileReader reader_b(path_b); - // Open both files. - if (!reader_a.Open().ok()) { - return Status(TErrorCode::TUPLE_CACHE_INCONSISTENCY, - Substitute("Failed to open file '$0'", path_a + DEBUG_TUPLE_CACHE_BAD_POSTFIX)); - } - if (!reader_b.Open().ok()) { - return Status(TErrorCode::TUPLE_CACHE_INCONSISTENCY, - Substitute("Failed to open file '$0'", path_b + DEBUG_TUPLE_CACHE_BAD_POSTFIX)); - } + // Validate and open files + RETURN_IF_ERROR(ValidateAndOpenFile(path_a, reader_a)); + RETURN_IF_ERROR(ValidateAndOpenFile(path_b, reader_b)); // Compare file sizes. - int file1_length = reader_a.GetFileSize(); - int file2_length = reader_b.GetFileSize(); - if (file1_length != file2_length || file1_length == TUPLE_TEXT_FILE_SIZE_ERROR) { - return Status(TErrorCode::TUPLE_CACHE_INCONSISTENCY, - Substitute("Size of file '$0' (size: $1) and '$2' (size: $3) are different", - path_a + DEBUG_TUPLE_CACHE_BAD_POSTFIX, file1_length, - path_b + DEBUG_TUPLE_CACHE_BAD_POSTFIX, file2_length)); - } + // Files are supposed to be written in tuple-text-file-writer.cc, and are plain + // text with '\n' as row delimiters and no padding, so a size mismatch likely + // indicates content differences. + // If the sizes are different, we leave "passed" as false to let the caller to proceed + // with a slower check. This may provide a clearer error message and prevent false + // mismatches when identical rows appear in a different order, which may result in + // different sizes. + if (reader_a.GetFileSize() != reader_b.GetFileSize()) return Status::OK(); + // Reset readers to the beginning of the files. reader_a.Rewind(); reader_b.Rewind();
