This is an automated email from the ASF dual-hosted git repository.

wgtmac pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/parquet-java.git


The following commit(s) were added to refs/heads/master by this push:
     new 53d784254 GH-3457: Validate column index for null_pages/null_counts 
contradiction (#3458)
53d784254 is described below

commit 53d78425403d1900191f6da956fd67ada729c296
Author: Chungmin Lee <[email protected]>
AuthorDate: Fri May 1 08:44:35 2026 -0700

    GH-3457: Validate column index for null_pages/null_counts contradiction 
(#3458)
    
    * Validate column index for null_pages/null_counts contradiction
    
    A null_pages entry of true indicates the page consists entirely of null 
values;
    a corresponding null_counts entry of zero contradicts this, as it indicates 
the
    page has no null values at all. This inconsistency is a sign of invalid 
metadata
    that would cause incorrect page skipping during column index filtering —
    the reader treats such pages as all-null and silently excludes them from 
query
    results for predicates requiring non-null values.
    
    Add validation in ColumnIndexBuilder.build(PrimitiveType) to detect this
    contradiction and return null, consistent with the existing pattern for 
invalid
    min/max values. The null propagates through the read path, causing the 
filter to
    fall back to reading all pages for the affected column.
    
    Also fix a pre-existing NPE in the static build() method where build(type)
    could return null but was not null-checked before accessing boundaryOrder.
    
    Co-authored-by: Copilot <[email protected]>
---
 .../column/columnindex/ColumnIndexBuilder.java     |  16 +++
 .../column/columnindex/TestBoundaryOrder.java      |   6 +-
 .../column/columnindex/TestColumnIndexBuilder.java | 152 ++++++++++++++++++++-
 3 files changed, 170 insertions(+), 4 deletions(-)

diff --git 
a/parquet-column/src/main/java/org/apache/parquet/internal/column/columnindex/ColumnIndexBuilder.java
 
b/parquet-column/src/main/java/org/apache/parquet/internal/column/columnindex/ColumnIndexBuilder.java
index e78b2ceae..bd9073e26 100644
--- 
a/parquet-column/src/main/java/org/apache/parquet/internal/column/columnindex/ColumnIndexBuilder.java
+++ 
b/parquet-column/src/main/java/org/apache/parquet/internal/column/columnindex/ColumnIndexBuilder.java
@@ -616,6 +616,9 @@ public abstract class ColumnIndexBuilder {
 
     builder.fill(nullPages, nullCounts, minValues, maxValues, 
repLevelHistogram, defLevelHistogram);
     ColumnIndexBase<?> columnIndex = builder.build(type);
+    if (columnIndex == null) {
+      return null;
+    }
     columnIndex.boundaryOrder = requireNonNull(boundaryOrder);
     return columnIndex;
   }
@@ -750,6 +753,19 @@ public abstract class ColumnIndexBuilder {
     if (!nullCounts.isEmpty()) {
       columnIndex.nullCounts = nullCounts.toLongArray();
     }
+    // A null_pages entry of true indicates the page consists entirely of null 
values.
+    // When null_counts is present, a null_counts entry of zero means the page 
has no
+    // null values at all. These two fields directly contradict each other: a 
page
+    // cannot both consist entirely of nulls and contain zero nulls. Since 
null_pages
+    // is the field that causes the reader to skip pages during column index 
filtering,
+    // the safe response is to discard the column index for this column 
entirely.
+    if (columnIndex.nullCounts != null) {
+      for (int i = 0; i < columnIndex.nullPages.length; i++) {
+        if (columnIndex.nullPages[i] && columnIndex.nullCounts[i] <= 0L) {
+          return null;
+        }
+      }
+    }
     columnIndex.pageIndexes = pageIndexes.toIntArray();
     // Repetition and definition level histograms are optional so keep them 
null if the builder has no values
     if (repLevelHistogram != null && !repLevelHistogram.isEmpty()) {
diff --git 
a/parquet-column/src/test/java/org/apache/parquet/internal/column/columnindex/TestBoundaryOrder.java
 
b/parquet-column/src/test/java/org/apache/parquet/internal/column/columnindex/TestBoundaryOrder.java
index 415a21e33..658db8593 100644
--- 
a/parquet-column/src/test/java/org/apache/parquet/internal/column/columnindex/TestBoundaryOrder.java
+++ 
b/parquet-column/src/test/java/org/apache/parquet/internal/column/columnindex/TestBoundaryOrder.java
@@ -250,9 +250,11 @@ public class TestBoundaryOrder {
     RAND_DESCENDING = (ColumnIndexBase<?>) builder.build();
 
     builder = ColumnIndexBuilder.getBuilder(TYPE, Integer.MAX_VALUE);
-    // Add a couple of empty stats so column index will contain null pages only
+    // Add a couple of all-null stats so column index will contain null pages 
only
     for (int i = 0; i < 10; ++i) {
-      builder.add(Statistics.createStats(TYPE));
+      Statistics<?> nullStats = Statistics.createStats(TYPE);
+      nullStats.incrementNumNulls(10);
+      builder.add(nullStats);
     }
     ALL_NULL_PAGES = (ColumnIndexBase<?>) builder.build();
   }
diff --git 
a/parquet-column/src/test/java/org/apache/parquet/internal/column/columnindex/TestColumnIndexBuilder.java
 
b/parquet-column/src/test/java/org/apache/parquet/internal/column/columnindex/TestColumnIndexBuilder.java
index 6274f3626..96c66ea52 100644
--- 
a/parquet-column/src/test/java/org/apache/parquet/internal/column/columnindex/TestColumnIndexBuilder.java
+++ 
b/parquet-column/src/test/java/org/apache/parquet/internal/column/columnindex/TestColumnIndexBuilder.java
@@ -904,11 +904,11 @@ public class TestColumnIndexBuilder {
         Types.required(BOOLEAN).named("test_boolean"),
         BoundaryOrder.DESCENDING,
         List.of(false, true, false, true, false, true),
-        List.of(9l, 8l, 7l, 6l, 5l, 0l),
+        List.of(9l, 8l, 7l, 6l, 5l, 4l),
         toBBList(false, null, false, null, true, null),
         toBBList(true, null, false, null, true, null));
     assertEquals(BoundaryOrder.DESCENDING, columnIndex.getBoundaryOrder());
-    assertCorrectNullCounts(columnIndex, 9, 8, 7, 6, 5, 0);
+    assertCorrectNullCounts(columnIndex, 9, 8, 7, 6, 5, 4);
     assertCorrectNullPages(columnIndex, false, true, false, true, false, true);
     assertCorrectValues(columnIndex.getMaxValues(), true, null, false, null, 
true, null);
     assertCorrectValues(columnIndex.getMinValues(), false, null, false, null, 
true, null);
@@ -1865,4 +1865,152 @@ public class TestColumnIndexBuilder {
   private static void assertCorrectFiltering(ColumnIndex ci, FilterPredicate 
predicate, int... expectedIndexes) {
     TestIndexIterator.assertEquals(predicate.accept(ci), expectedIndexes);
   }
+
+  @Test
+  public void testBuildReturnsNullForNullPageCountContradiction() {
+    // null_pages[i]=true indicates the page is entirely null, but 
null_counts[i]=0
+    // says there are zero null values. This contradiction indicates invalid 
metadata.
+    // build() should return null to prevent incorrect page skipping.
+    PrimitiveType type = Types.required(INT32).named("test_col");
+
+    // Pages 1-3 have null_pages=true with null_counts=0 — contradictory
+    assertNull(
+        "Column index with null_pages=true and null_counts=0 should be 
rejected",
+        ColumnIndexBuilder.build(
+            type,
+            BoundaryOrder.ASCENDING,
+            List.of(false, true, true, true),
+            List.of(0L, 0L, 0L, 0L),
+            toBBList(Integer.valueOf(-99), null, null, null),
+            toBBList(Integer.valueOf(5), null, null, null)));
+
+    // Contradiction on a single page (last page) should also be rejected
+    assertNull(
+        "Single contradictory page should cause rejection",
+        ColumnIndexBuilder.build(
+            type,
+            BoundaryOrder.UNORDERED,
+            List.of(false, false, true),
+            List.of(0L, 5L, 0L),
+            toBBList(Integer.valueOf(1), Integer.valueOf(50), null),
+            toBBList(Integer.valueOf(49), Integer.valueOf(99), null)));
+
+    // Contradiction on the first page
+    assertNull(
+        "Contradictory first page should cause rejection",
+        ColumnIndexBuilder.build(
+            type,
+            BoundaryOrder.UNORDERED,
+            List.of(true, false, false),
+            List.of(0L, 0L, 5L),
+            toBBList(null, Integer.valueOf(1), Integer.valueOf(50)),
+            toBBList(null, Integer.valueOf(49), Integer.valueOf(99))));
+
+    // Single page with contradiction
+    assertNull(
+        "Single-page column index with contradiction should be rejected",
+        ColumnIndexBuilder.build(
+            type,
+            BoundaryOrder.UNORDERED,
+            List.of(true),
+            List.of(0L),
+            toBBList((Integer) null),
+            toBBList((Integer) null)));
+
+    // All pages are contradictory null pages
+    assertNull(
+        "All-contradictory column index should be rejected",
+        ColumnIndexBuilder.build(
+            type,
+            BoundaryOrder.UNORDERED,
+            List.of(true, true, true),
+            List.of(0L, 0L, 0L),
+            toBBList((Integer) null, null, null),
+            toBBList((Integer) null, null, null)));
+  }
+
+  @Test
+  public void testBuildPreservesValidColumnIndex() {
+    PrimitiveType type = Types.required(INT32).named("test_col");
+
+    // Legitimate null page: null_pages=true with null_counts > 0 — valid
+    ColumnIndex ci = ColumnIndexBuilder.build(
+        type,
+        BoundaryOrder.ASCENDING,
+        List.of(false, true, false),
+        List.of(0L, 100L, 0L),
+        toBBList(Integer.valueOf(1), null, Integer.valueOf(50)),
+        toBBList(Integer.valueOf(49), null, Integer.valueOf(99)));
+    assertCorrectNullPages(ci, false, true, false);
+    assertCorrectNullCounts(ci, 0, 100, 0);
+    assertCorrectValues(ci.getMinValues(), 1, null, 50);
+    assertCorrectValues(ci.getMaxValues(), 49, null, 99);
+
+    // All non-null pages — valid
+    ColumnIndex ci2 = ColumnIndexBuilder.build(
+        type,
+        BoundaryOrder.ASCENDING,
+        List.of(false, false, false),
+        List.of(0L, 5L, 10L),
+        toBBList(Integer.valueOf(1), Integer.valueOf(50), 
Integer.valueOf(100)),
+        toBBList(Integer.valueOf(49), Integer.valueOf(99), 
Integer.valueOf(150)));
+    assertCorrectNullPages(ci2, false, false, false);
+    assertCorrectNullCounts(ci2, 0, 5, 10);
+
+    // Single non-null page
+    ColumnIndex ci3 = ColumnIndexBuilder.build(
+        type,
+        BoundaryOrder.UNORDERED,
+        List.of(false),
+        List.of(0L),
+        toBBList(Integer.valueOf(42)),
+        toBBList(Integer.valueOf(42)));
+    assertCorrectNullPages(ci3, false);
+    assertCorrectNullCounts(ci3, 0);
+
+    // Single legitimate all-null page (null_pages=true, null_counts > 0)
+    ColumnIndex ci4 = ColumnIndexBuilder.build(
+        type, BoundaryOrder.UNORDERED, List.of(true), List.of(50L), 
toBBList((Integer) null), toBBList((Integer)
+            null));
+    assertCorrectNullPages(ci4, true);
+    assertCorrectNullCounts(ci4, 50);
+
+    // All pages legitimately null
+    ColumnIndex ci5 = ColumnIndexBuilder.build(
+        type,
+        BoundaryOrder.UNORDERED,
+        List.of(true, true, true),
+        List.of(10L, 20L, 30L),
+        toBBList((Integer) null, null, null),
+        toBBList((Integer) null, null, null));
+    assertCorrectNullPages(ci5, true, true, true);
+    assertCorrectNullCounts(ci5, 10, 20, 30);
+
+    // Boundary: null_counts=1 on a null page (minimum valid value) — should 
NOT be rejected
+    ColumnIndex ci6 = ColumnIndexBuilder.build(
+        type,
+        BoundaryOrder.UNORDERED,
+        List.of(false, true),
+        List.of(0L, 1L),
+        toBBList(Integer.valueOf(1), null),
+        toBBList(Integer.valueOf(99), null));
+    assertCorrectNullPages(ci6, false, true);
+    assertCorrectNullCounts(ci6, 0, 1);
+  }
+
+  @Test
+  public void testBuildWithoutNullCountsIsNotRejected() {
+    PrimitiveType type = Types.required(INT32).named("test_col");
+
+    // null_counts absent (optional field) — cannot detect contradiction, 
should build normally
+    ColumnIndex ci = ColumnIndexBuilder.build(
+        type,
+        BoundaryOrder.UNORDERED,
+        List.of(false, true, true),
+        null,
+        toBBList(Integer.valueOf(1), null, null),
+        toBBList(Integer.valueOf(99), null, null));
+    assertCorrectNullPages(ci, false, true, true);
+    assertNull("null_counts should be null when not provided", 
ci.getNullCounts());
+  }
 }

Reply via email to