This is an automated email from the ASF dual-hosted git repository.

joemcdonnell pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/impala.git

commit a877cde76df7f5435a72fc7ea4a870ca7bf28fb5
Author: Yida Wu <[email protected]>
AuthorDate: Mon Mar 24 22:20:41 2025 -0700

    IMPALA-13894: Allow slow check in tuple cache correctness verification when 
file sizes differ
    
    Currently, tuple cache correctness verification does a fast check,
    and returns an error if file sizes are different.
    
    This patch allows a slow check when file sizes differ. Because the
    slow check may provide a clearer error message and help prevent
    false mismatches when identical rows appear in a different order,
    which may lead to size differences. Updated TupleTextFileUtilTest
    for this change.
    
    Also fixes argument order in VerifyRows() to correct misleading
    log output. The previous order was incorrect, causing the log to
    show the wrong one as the reference file.
    
    Tests:
    Passed core tests.
    Manually verified that when file sizes differ, the query proceeds
    to the slow check after this change.
    
    Change-Id: I02e031410dac32d9df746201b156783a8b7d9a1a
    Reviewed-on: http://gerrit.cloudera.org:8080/22661
    Reviewed-by: Michael Smith <[email protected]>
    Reviewed-by: Kurt Deschler <[email protected]>
    Tested-by: Impala Public Jenkins <[email protected]>
---
 be/src/exec/tuple-cache-node.cc          |  2 +-
 be/src/exec/tuple-text-file-util-test.cc |  3 +--
 be/src/exec/tuple-text-file-util.cc      | 42 +++++++++++++++++++-------------
 3 files changed, 27 insertions(+), 20 deletions(-)

diff --git a/be/src/exec/tuple-cache-node.cc b/be/src/exec/tuple-cache-node.cc
index 1911bb845..3ddee4831 100644
--- a/be/src/exec/tuple-cache-node.cc
+++ b/be/src/exec/tuple-cache-node.cc
@@ -218,7 +218,7 @@ Status 
TupleCacheNode::VerifyAndMoveDebugCache(RuntimeState* state) {
   if (verify_status.ok() && !passed) {
     // Slow path to compare all rows in an order-insensitive way if the files 
are not the
     // same.
-    verify_status = TupleTextFileUtil::VerifyRows(ref_file_path, 
dump_file_path);
+    verify_status = TupleTextFileUtil::VerifyRows(dump_file_path, 
ref_file_path);
     passed = verify_status.ok();
   }
 
diff --git a/be/src/exec/tuple-text-file-util-test.cc 
b/be/src/exec/tuple-text-file-util-test.cc
index e9a808505..a79a4ddfa 100644
--- a/be/src/exec/tuple-text-file-util-test.cc
+++ b/be/src/exec/tuple-text-file-util-test.cc
@@ -69,8 +69,7 @@ TEST_F(TupleTextFileUtilTest, 
VerifyDebugDumpCache_DifferentFiles) {
 
   bool passed = false;
   Status status = TupleTextFileUtil::VerifyDebugDumpCache(file1, file2, 
&passed);
-  EXPECT_FALSE(status.ok());
-  EXPECT_EQ(status.code(), TErrorCode::TUPLE_CACHE_INCONSISTENCY);
+  EXPECT_TRUE(status.ok());
   EXPECT_FALSE(passed);
 }
 
diff --git a/be/src/exec/tuple-text-file-util.cc 
b/be/src/exec/tuple-text-file-util.cc
index 0b58718e6..81825b376 100644
--- a/be/src/exec/tuple-text-file-util.cc
+++ b/be/src/exec/tuple-text-file-util.cc
@@ -128,6 +128,19 @@ Status TupleTextFileUtil::VerifyRows(
   return VerifyRowCount(cache);
 }
 
+static Status ValidateAndOpenFile(const string& path, TupleTextFileReader& 
reader) {
+  if (!reader.Open().ok()) {
+    return Status(TErrorCode::TUPLE_CACHE_INCONSISTENCY,
+        Substitute("Failed to open file '$0'", path + 
DEBUG_TUPLE_CACHE_BAD_POSTFIX));
+  }
+
+  if (reader.GetFileSize() == TUPLE_TEXT_FILE_SIZE_ERROR) {
+    return Status(TErrorCode::TUPLE_CACHE_INCONSISTENCY,
+        Substitute("Size of file '$0' is invalid", path + 
DEBUG_TUPLE_CACHE_BAD_POSTFIX));
+  }
+  return Status::OK();
+}
+
 static Status CacheFileCmp(
     const std::string& path_a, const std::string& path_b, bool* passed) {
   DCHECK(passed != nullptr);
@@ -137,25 +150,20 @@ static Status CacheFileCmp(
   TupleTextFileReader reader_a(path_a);
   TupleTextFileReader reader_b(path_b);
 
-  // Open both files.
-  if (!reader_a.Open().ok()) {
-    return Status(TErrorCode::TUPLE_CACHE_INCONSISTENCY,
-        Substitute("Failed to open file '$0'", path_a + 
DEBUG_TUPLE_CACHE_BAD_POSTFIX));
-  }
-  if (!reader_b.Open().ok()) {
-    return Status(TErrorCode::TUPLE_CACHE_INCONSISTENCY,
-        Substitute("Failed to open file '$0'", path_b + 
DEBUG_TUPLE_CACHE_BAD_POSTFIX));
-  }
+  // Validate and open files
+  RETURN_IF_ERROR(ValidateAndOpenFile(path_a, reader_a));
+  RETURN_IF_ERROR(ValidateAndOpenFile(path_b, reader_b));
 
   // Compare file sizes.
-  int file1_length = reader_a.GetFileSize();
-  int file2_length = reader_b.GetFileSize();
-  if (file1_length != file2_length || file1_length == 
TUPLE_TEXT_FILE_SIZE_ERROR) {
-    return Status(TErrorCode::TUPLE_CACHE_INCONSISTENCY,
-        Substitute("Size of file '$0' (size: $1) and '$2' (size: $3) are 
different",
-            path_a + DEBUG_TUPLE_CACHE_BAD_POSTFIX, file1_length,
-            path_b + DEBUG_TUPLE_CACHE_BAD_POSTFIX, file2_length));
-  }
+  // Files are supposed to be written in tuple-text-file-writer.cc, and are 
plain
+  // text with '\n' as row delimiters and no padding, so a size mismatch likely
+  // indicates content differences.
+  // If the sizes are different, we leave "passed" as false to let the caller 
to proceed
+  // with a slower check. This may provide a clearer error message and prevent 
false
+  // mismatches when identical rows appear in a different order, which may 
result in
+  // different sizes.
+  if (reader_a.GetFileSize() != reader_b.GetFileSize()) return Status::OK();
+
   // Reset readers to the beginning of the files.
   reader_a.Rewind();
   reader_b.Rewind();

Reply via email to