This is an automated email from the ASF dual-hosted git repository.
wgtmac pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/parquet-java.git
The following commit(s) were added to refs/heads/master by this push:
new 53d784254 GH-3457: Validate column index for null_pages/null_counts
contradiction (#3458)
53d784254 is described below
commit 53d78425403d1900191f6da956fd67ada729c296
Author: Chungmin Lee <[email protected]>
AuthorDate: Fri May 1 08:44:35 2026 -0700
GH-3457: Validate column index for null_pages/null_counts contradiction
(#3458)
* Validate column index for null_pages/null_counts contradiction
A null_pages entry of true indicates the page consists entirely of null
values;
a corresponding null_counts entry of zero contradicts this, as it indicates
the
page has no null values at all. This inconsistency is a sign of invalid
metadata
that would cause incorrect page skipping during column index filtering —
the reader treats such pages as all-null and silently excludes them from
query
results for predicates requiring non-null values.
Add validation in ColumnIndexBuilder.build(PrimitiveType) to detect this
contradiction and return null, consistent with the existing pattern for
invalid
min/max values. The null propagates through the read path, causing the
filter to
fall back to reading all pages for the affected column.
Also fix a pre-existing NPE in the static build() method where build(type)
could return null but was not null-checked before accessing boundaryOrder.
Co-authored-by: Copilot <[email protected]>
---
.../column/columnindex/ColumnIndexBuilder.java | 16 +++
.../column/columnindex/TestBoundaryOrder.java | 6 +-
.../column/columnindex/TestColumnIndexBuilder.java | 152 ++++++++++++++++++++-
3 files changed, 170 insertions(+), 4 deletions(-)
diff --git
a/parquet-column/src/main/java/org/apache/parquet/internal/column/columnindex/ColumnIndexBuilder.java
b/parquet-column/src/main/java/org/apache/parquet/internal/column/columnindex/ColumnIndexBuilder.java
index e78b2ceae..bd9073e26 100644
---
a/parquet-column/src/main/java/org/apache/parquet/internal/column/columnindex/ColumnIndexBuilder.java
+++
b/parquet-column/src/main/java/org/apache/parquet/internal/column/columnindex/ColumnIndexBuilder.java
@@ -616,6 +616,9 @@ public abstract class ColumnIndexBuilder {
builder.fill(nullPages, nullCounts, minValues, maxValues,
repLevelHistogram, defLevelHistogram);
ColumnIndexBase<?> columnIndex = builder.build(type);
+ if (columnIndex == null) {
+ return null;
+ }
columnIndex.boundaryOrder = requireNonNull(boundaryOrder);
return columnIndex;
}
@@ -750,6 +753,19 @@ public abstract class ColumnIndexBuilder {
if (!nullCounts.isEmpty()) {
columnIndex.nullCounts = nullCounts.toLongArray();
}
+ // A null_pages entry of true indicates the page consists entirely of null
values.
+ // When null_counts is present, a null_counts entry of zero means the page
has no
+ // null values at all. These two fields directly contradict each other: a
page
+ // cannot both consist entirely of nulls and contain zero nulls. Since
null_pages
+ // is the field that causes the reader to skip pages during column index
filtering,
+ // the safe response is to discard the column index for this column
entirely.
+ if (columnIndex.nullCounts != null) {
+ for (int i = 0; i < columnIndex.nullPages.length; i++) {
+ if (columnIndex.nullPages[i] && columnIndex.nullCounts[i] <= 0L) {
+ return null;
+ }
+ }
+ }
columnIndex.pageIndexes = pageIndexes.toIntArray();
// Repetition and definition level histograms are optional so keep them
null if the builder has no values
if (repLevelHistogram != null && !repLevelHistogram.isEmpty()) {
diff --git
a/parquet-column/src/test/java/org/apache/parquet/internal/column/columnindex/TestBoundaryOrder.java
b/parquet-column/src/test/java/org/apache/parquet/internal/column/columnindex/TestBoundaryOrder.java
index 415a21e33..658db8593 100644
---
a/parquet-column/src/test/java/org/apache/parquet/internal/column/columnindex/TestBoundaryOrder.java
+++
b/parquet-column/src/test/java/org/apache/parquet/internal/column/columnindex/TestBoundaryOrder.java
@@ -250,9 +250,11 @@ public class TestBoundaryOrder {
RAND_DESCENDING = (ColumnIndexBase<?>) builder.build();
builder = ColumnIndexBuilder.getBuilder(TYPE, Integer.MAX_VALUE);
- // Add a couple of empty stats so column index will contain null pages only
+ // Add a couple of all-null stats so column index will contain null pages
only
for (int i = 0; i < 10; ++i) {
- builder.add(Statistics.createStats(TYPE));
+ Statistics<?> nullStats = Statistics.createStats(TYPE);
+ nullStats.incrementNumNulls(10);
+ builder.add(nullStats);
}
ALL_NULL_PAGES = (ColumnIndexBase<?>) builder.build();
}
diff --git
a/parquet-column/src/test/java/org/apache/parquet/internal/column/columnindex/TestColumnIndexBuilder.java
b/parquet-column/src/test/java/org/apache/parquet/internal/column/columnindex/TestColumnIndexBuilder.java
index 6274f3626..96c66ea52 100644
---
a/parquet-column/src/test/java/org/apache/parquet/internal/column/columnindex/TestColumnIndexBuilder.java
+++
b/parquet-column/src/test/java/org/apache/parquet/internal/column/columnindex/TestColumnIndexBuilder.java
@@ -904,11 +904,11 @@ public class TestColumnIndexBuilder {
Types.required(BOOLEAN).named("test_boolean"),
BoundaryOrder.DESCENDING,
List.of(false, true, false, true, false, true),
- List.of(9l, 8l, 7l, 6l, 5l, 0l),
+ List.of(9l, 8l, 7l, 6l, 5l, 4l),
toBBList(false, null, false, null, true, null),
toBBList(true, null, false, null, true, null));
assertEquals(BoundaryOrder.DESCENDING, columnIndex.getBoundaryOrder());
- assertCorrectNullCounts(columnIndex, 9, 8, 7, 6, 5, 0);
+ assertCorrectNullCounts(columnIndex, 9, 8, 7, 6, 5, 4);
assertCorrectNullPages(columnIndex, false, true, false, true, false, true);
assertCorrectValues(columnIndex.getMaxValues(), true, null, false, null,
true, null);
assertCorrectValues(columnIndex.getMinValues(), false, null, false, null,
true, null);
@@ -1865,4 +1865,152 @@ public class TestColumnIndexBuilder {
private static void assertCorrectFiltering(ColumnIndex ci, FilterPredicate
predicate, int... expectedIndexes) {
TestIndexIterator.assertEquals(predicate.accept(ci), expectedIndexes);
}
+
+ @Test
+ public void testBuildReturnsNullForNullPageCountContradiction() {
+ // null_pages[i]=true indicates the page is entirely null, but
null_counts[i]=0
+ // says there are zero null values. This contradiction indicates invalid
metadata.
+ // build() should return null to prevent incorrect page skipping.
+ PrimitiveType type = Types.required(INT32).named("test_col");
+
+ // Pages 1-3 have null_pages=true with null_counts=0 — contradictory
+ assertNull(
+ "Column index with null_pages=true and null_counts=0 should be
rejected",
+ ColumnIndexBuilder.build(
+ type,
+ BoundaryOrder.ASCENDING,
+ List.of(false, true, true, true),
+ List.of(0L, 0L, 0L, 0L),
+ toBBList(Integer.valueOf(-99), null, null, null),
+ toBBList(Integer.valueOf(5), null, null, null)));
+
+ // Contradiction on a single page (last page) should also be rejected
+ assertNull(
+ "Single contradictory page should cause rejection",
+ ColumnIndexBuilder.build(
+ type,
+ BoundaryOrder.UNORDERED,
+ List.of(false, false, true),
+ List.of(0L, 5L, 0L),
+ toBBList(Integer.valueOf(1), Integer.valueOf(50), null),
+ toBBList(Integer.valueOf(49), Integer.valueOf(99), null)));
+
+ // Contradiction on the first page
+ assertNull(
+ "Contradictory first page should cause rejection",
+ ColumnIndexBuilder.build(
+ type,
+ BoundaryOrder.UNORDERED,
+ List.of(true, false, false),
+ List.of(0L, 0L, 5L),
+ toBBList(null, Integer.valueOf(1), Integer.valueOf(50)),
+ toBBList(null, Integer.valueOf(49), Integer.valueOf(99))));
+
+ // Single page with contradiction
+ assertNull(
+ "Single-page column index with contradiction should be rejected",
+ ColumnIndexBuilder.build(
+ type,
+ BoundaryOrder.UNORDERED,
+ List.of(true),
+ List.of(0L),
+ toBBList((Integer) null),
+ toBBList((Integer) null)));
+
+ // All pages are contradictory null pages
+ assertNull(
+ "All-contradictory column index should be rejected",
+ ColumnIndexBuilder.build(
+ type,
+ BoundaryOrder.UNORDERED,
+ List.of(true, true, true),
+ List.of(0L, 0L, 0L),
+ toBBList((Integer) null, null, null),
+ toBBList((Integer) null, null, null)));
+ }
+
+ @Test
+ public void testBuildPreservesValidColumnIndex() {
+ PrimitiveType type = Types.required(INT32).named("test_col");
+
+ // Legitimate null page: null_pages=true with null_counts > 0 — valid
+ ColumnIndex ci = ColumnIndexBuilder.build(
+ type,
+ BoundaryOrder.ASCENDING,
+ List.of(false, true, false),
+ List.of(0L, 100L, 0L),
+ toBBList(Integer.valueOf(1), null, Integer.valueOf(50)),
+ toBBList(Integer.valueOf(49), null, Integer.valueOf(99)));
+ assertCorrectNullPages(ci, false, true, false);
+ assertCorrectNullCounts(ci, 0, 100, 0);
+ assertCorrectValues(ci.getMinValues(), 1, null, 50);
+ assertCorrectValues(ci.getMaxValues(), 49, null, 99);
+
+ // All non-null pages — valid
+ ColumnIndex ci2 = ColumnIndexBuilder.build(
+ type,
+ BoundaryOrder.ASCENDING,
+ List.of(false, false, false),
+ List.of(0L, 5L, 10L),
+ toBBList(Integer.valueOf(1), Integer.valueOf(50),
Integer.valueOf(100)),
+ toBBList(Integer.valueOf(49), Integer.valueOf(99),
Integer.valueOf(150)));
+ assertCorrectNullPages(ci2, false, false, false);
+ assertCorrectNullCounts(ci2, 0, 5, 10);
+
+ // Single non-null page
+ ColumnIndex ci3 = ColumnIndexBuilder.build(
+ type,
+ BoundaryOrder.UNORDERED,
+ List.of(false),
+ List.of(0L),
+ toBBList(Integer.valueOf(42)),
+ toBBList(Integer.valueOf(42)));
+ assertCorrectNullPages(ci3, false);
+ assertCorrectNullCounts(ci3, 0);
+
+ // Single legitimate all-null page (null_pages=true, null_counts > 0)
+ ColumnIndex ci4 = ColumnIndexBuilder.build(
+ type, BoundaryOrder.UNORDERED, List.of(true), List.of(50L),
toBBList((Integer) null), toBBList((Integer)
+ null));
+ assertCorrectNullPages(ci4, true);
+ assertCorrectNullCounts(ci4, 50);
+
+ // All pages legitimately null
+ ColumnIndex ci5 = ColumnIndexBuilder.build(
+ type,
+ BoundaryOrder.UNORDERED,
+ List.of(true, true, true),
+ List.of(10L, 20L, 30L),
+ toBBList((Integer) null, null, null),
+ toBBList((Integer) null, null, null));
+ assertCorrectNullPages(ci5, true, true, true);
+ assertCorrectNullCounts(ci5, 10, 20, 30);
+
+ // Boundary: null_counts=1 on a null page (minimum valid value) — should
NOT be rejected
+ ColumnIndex ci6 = ColumnIndexBuilder.build(
+ type,
+ BoundaryOrder.UNORDERED,
+ List.of(false, true),
+ List.of(0L, 1L),
+ toBBList(Integer.valueOf(1), null),
+ toBBList(Integer.valueOf(99), null));
+ assertCorrectNullPages(ci6, false, true);
+ assertCorrectNullCounts(ci6, 0, 1);
+ }
+
+ @Test
+ public void testBuildWithoutNullCountsIsNotRejected() {
+ PrimitiveType type = Types.required(INT32).named("test_col");
+
+ // null_counts absent (optional field) — cannot detect contradiction,
should build normally
+ ColumnIndex ci = ColumnIndexBuilder.build(
+ type,
+ BoundaryOrder.UNORDERED,
+ List.of(false, true, true),
+ null,
+ toBBList(Integer.valueOf(1), null, null),
+ toBBList(Integer.valueOf(99), null, null));
+ assertCorrectNullPages(ci, false, true, true);
+ assertNull("null_counts should be null when not provided",
ci.getNullCounts());
+ }
}