clairemcginty commented on code in PR #3098:
URL: https://github.com/apache/parquet-java/pull/3098#discussion_r1872004619
########## parquet-hadoop/src/main/java/org/apache/parquet/filter2/statisticslevel/StatisticsFilter.java:
##########
@@ -217,6 +219,70 @@ public <T extends Comparable<T>> Boolean visit(Contains<T> contains) {
     return contains.filter(this, (l, r) -> l || r, (l, r) -> l && r, v -> BLOCK_MIGHT_MATCH);
   }
 
+  @Override
+  public Boolean visit(Size size) {
+    final ColumnChunkMetaData metadata = getColumnChunk(size.getColumn().getColumnPath());
+    if (metadata == null) {
+      // the column isn't in this file, so fail any eq/gt/gte predicate targeting size > 0
+      final boolean blockCannotMatch =
+          size.filter((eq) -> eq > 0, (lt) -> false, (lte) -> false, (gt) -> gt >= 0, (gte) -> gte > 0);
+      return blockCannotMatch ? BLOCK_CANNOT_MATCH : BLOCK_MIGHT_MATCH;
+    }
+
+    final SizeStatistics stats = metadata.getSizeStatistics();
+    final List<Long> repetitionLevelHistogram = stats.getRepetitionLevelHistogram();
+    final List<Long> definitionLevelHistogram = stats.getDefinitionLevelHistogram();
+
+    if (repetitionLevelHistogram.isEmpty() || definitionLevelHistogram.isEmpty()) {
+      return BLOCK_MIGHT_MATCH;
+    }
+
+    // If all values have repetition level 0, then no array has more than 1 element
+    if (repetitionLevelHistogram.size() == 1
+        || repetitionLevelHistogram.subList(1, repetitionLevelHistogram.size()).stream()
+            .allMatch(l -> l == 0)) {
+
+      // Null list fields are treated as having size 0
+      if (( // all lists are nulls
+          definitionLevelHistogram.subList(1, definitionLevelHistogram.size()).stream()
+              .allMatch(l -> l == 0))
+          || // all lists are size 0
+          (definitionLevelHistogram.get(0) == 0
+              && definitionLevelHistogram.subList(2, definitionLevelHistogram.size()).stream()
+                  .allMatch(l -> l == 0))) {
+
+        final boolean blockCannotMatch =
+            size.filter((eq) -> eq > 0, (lt) -> false, (lte) -> false, (gt) -> gt >= 0, (gte) -> gte > 0);
+        return blockCannotMatch ? BLOCK_CANNOT_MATCH : BLOCK_MIGHT_MATCH;
+      }
+
+      long maxDefinitionLevel = definitionLevelHistogram.get(definitionLevelHistogram.size() - 1);
+
+      // If all repetition levels are zero and all definition levels are > MAX_DEFINITION_LEVEL - 1,
+      // all lists are of size 1
+      if (definitionLevelHistogram.stream().allMatch(l -> l > maxDefinitionLevel - 1)) {
+        final boolean blockCannotMatch = size.filter(
+            (eq) -> eq != 1, (lt) -> lt <= 1, (lte) -> lte < 1, (gt) -> gt >= 1, (gte) -> gte > 1);
+
+        return blockCannotMatch ? BLOCK_CANNOT_MATCH : BLOCK_MIGHT_MATCH;
+      }
+    }
+    long nonNullElementCount =
+        repetitionLevelHistogram.stream().mapToLong(l -> l).sum() - definitionLevelHistogram.get(0);
+    long numNonNullRecords = repetitionLevelHistogram.get(0) - definitionLevelHistogram.get(0);
+
+    // Given the total number of elements and non-null fields, we can compute the max size of any array field
+    long maxArrayElementCount = 1 + (nonNullElementCount - numNonNullRecords);
+    final boolean blockCannotMatch = size.filter(
+        (eq) -> eq > maxArrayElementCount,
+        (lt) -> false,
+        (lte) -> false,
+        (gt) -> gt >= maxArrayElementCount,
+        (gte) -> gte > maxArrayElementCount);
+
+    return blockCannotMatch ? BLOCK_CANNOT_MATCH : BLOCK_MIGHT_MATCH;
+  }

Review Comment:
   hopefully this is a faithful transcription of the logic outlined here: https://github.com/apache/parquet-java/issues/1452#issuecomment-2271914678
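To make the final bound concrete, here is a hand-worked example (computed for this review; it is not part of the PR). It assumes a list column with a single repeated level and max definition level 1, matching the first test case below:

```java
// Records: [[1, 2, 3], [1], [1, 2, 3], []] -- 4 records, 7 elements, 1 empty list.
// Assumed histograms (indices are rep levels 0..1 and def levels 0..1):
long[] repetitionLevelHistogram = {4, 4}; // 4 record starts + 4 continuation values
long[] definitionLevelHistogram = {1, 7}; // 1 empty list, 7 defined elements

long totalValues = 4 + 4;                   // sum of the rep histogram = 8
long nonNullElementCount = totalValues - 1; // minus defHist[0] => 7 real elements
long numNonNullRecords = 4 - 1;             // repHist[0] - defHist[0] => 3 non-empty lists
long maxArrayElementCount = 1 + (nonNullElementCount - numNonNullRecords); // = 5

// So size(GT, 5), size(GTE, 6), and size(EQ, 6) can all drop the block,
// while size(EQ, 5) cannot.
```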
########## parquet-hadoop/src/test/java/org/apache/parquet/filter2/statisticslevel/TestStatisticsFilter.java:
##########
@@ -435,6 +468,174 @@ public void testOr() {
     assertFalse(canDrop(or(no, no), columnMetas));
   }
 
+  @Test
+  public void testSizeFilterRequiredGroupRequiredElements() throws Exception {
+    final IntStatistics minMaxStats = new IntStatistics();
+
+    // Case 1: Lists are populated
+    List<ColumnChunkMetaData> columnMeta = Collections.singletonList(getIntColumnMeta(
+        nestedListColumn.getColumnPath(),
+        minMaxStats,
+        createSizeStatisticsForRepeatedField(
+            true,
+            ImmutableList.of(
+                ImmutableList.of(1, 2, 3),
+                ImmutableList.of(1),
+                ImmutableList.of(1, 2, 3),
+                ImmutableList.of())),
+        4));
+
+    // SizeStats tells us that there are 7 total array elements spread across 3 non-empty list_fields,
+    // so the max size any single list_field could have is 5
+    assertTrue(canDrop(size(nestedListColumn, Operators.Size.Operator.GTE, 6), columnMeta));
+    assertTrue(canDrop(size(nestedListColumn, Operators.Size.Operator.GT, 5), columnMeta));
+    assertTrue(canDrop(size(nestedListColumn, Operators.Size.Operator.EQ, 6), columnMeta));
+
+    // These predicates should not be able to filter out the page
+    assertFalse(canDrop(size(nestedListColumn, Operators.Size.Operator.LT, 5), columnMeta));
+    assertFalse(canDrop(size(nestedListColumn, Operators.Size.Operator.LTE, 3), columnMeta));
+    assertFalse(canDrop(size(nestedListColumn, Operators.Size.Operator.EQ, 5), columnMeta));
+
+    // Case 2: All lists are empty
+    columnMeta = Collections.singletonList(getIntColumnMeta(
+        nestedListColumn.getColumnPath(),
+        minMaxStats,
+        createSizeStatisticsForRepeatedField(
+            true, ImmutableList.of(ImmutableList.of(), ImmutableList.of(), ImmutableList.of())),
+        3));
+
+    // These predicates should be able to filter out the page
+    assertTrue(canDrop(size(nestedListColumn, Operators.Size.Operator.GT, 0), columnMeta));
+    assertTrue(canDrop(size(nestedListColumn, Operators.Size.Operator.GTE, 1), columnMeta));
+    assertTrue(canDrop(size(nestedListColumn, Operators.Size.Operator.EQ, 5), columnMeta));
+
+    // These predicates should not be able to filter out the page
+    assertFalse(canDrop(size(nestedListColumn, Operators.Size.Operator.LTE, 1), columnMeta));
+    assertFalse(canDrop(size(nestedListColumn, Operators.Size.Operator.LT, 1), columnMeta));
+    assertFalse(canDrop(size(nestedListColumn, Operators.Size.Operator.EQ, 0), columnMeta));
+  }
+
+  @Test
+  public void testSizeFilterRequiredGroupOptionalElements() throws Exception {
+    final IntStatistics minMaxStats = new IntStatistics();
+
+    // Case 1: List is non-empty
+    List<Integer> listWithNulls = new ArrayList<>();
+    listWithNulls.add(1);
+    listWithNulls.add(null);
+    listWithNulls.add(null);
+    List<ColumnChunkMetaData> columnMeta = Collections.singletonList(getIntColumnMeta(
+        nestedListColumn.getColumnPath(),
+        minMaxStats,
+        createSizeStatisticsForRepeatedField(
+            true,
+            ImmutableList.of(
+                listWithNulls, ImmutableList.of(1), ImmutableList.of(1, 2, 3), ImmutableList.of())),
+        4));
+
+    // These predicates should be able to filter out the page
+    assertTrue(canDrop(size(nestedListColumn, Operators.Size.Operator.GTE, 6), columnMeta));
+    assertTrue(canDrop(size(nestedListColumn, Operators.Size.Operator.GT, 5), columnMeta));
+    assertTrue(canDrop(size(nestedListColumn, Operators.Size.Operator.EQ, 6), columnMeta));
+
+    // These predicates should not be able to filter out the page
+    assertFalse(canDrop(size(nestedListColumn, Operators.Size.Operator.LT, 5), columnMeta));
+    assertFalse(canDrop(size(nestedListColumn, Operators.Size.Operator.LTE, 3), columnMeta));
+    assertFalse(canDrop(size(nestedListColumn, Operators.Size.Operator.EQ, 5), columnMeta));
+  }
+
+  @Test
+  public void testSizeFilterOptionalGroup() throws Exception {
+    final IntStatistics minMaxStats = new IntStatistics();
+
+    // Case 1: List is non-null
+    List<ColumnChunkMetaData> columnMeta = Collections.singletonList(getIntColumnMeta(
+        nestedListColumn.getColumnPath(),
+        minMaxStats,
+        createSizeStatisticsForRepeatedField(
+            false,
+            ImmutableList.of(ImmutableList.of(1, 2, 3), ImmutableList.of(1), ImmutableList.of(1, 2, 3))),
+        3));
+
+    // These predicates should be able to filter out the page
+    assertTrue(canDrop(size(nestedListColumn, Operators.Size.Operator.GTE, 6), columnMeta));
+    assertTrue(canDrop(size(nestedListColumn, Operators.Size.Operator.GT, 5), columnMeta));
+    assertTrue(canDrop(size(nestedListColumn, Operators.Size.Operator.EQ, 6), columnMeta));
+
+    // These predicates should not be able to filter out the page
+    assertFalse(canDrop(size(nestedListColumn, Operators.Size.Operator.LT, 5), columnMeta));
+    assertFalse(canDrop(size(nestedListColumn, Operators.Size.Operator.LTE, 3), columnMeta));
+    assertFalse(canDrop(size(nestedListColumn, Operators.Size.Operator.EQ, 5), columnMeta));
+
+    // Case 2: List is null
+    columnMeta = Collections.singletonList(getIntColumnMeta(
+        nestedListColumn.getColumnPath(),
+        minMaxStats,
+        createSizeStatisticsForRepeatedField(true, ImmutableList.of()),
+        5));
+
+    // These predicates should be able to filter out the page
+    assertTrue(canDrop(size(nestedListColumn, Operators.Size.Operator.GT, 0), columnMeta));
+    assertTrue(canDrop(size(nestedListColumn, Operators.Size.Operator.GTE, 1), columnMeta));
+    assertTrue(canDrop(size(nestedListColumn, Operators.Size.Operator.EQ, 5), columnMeta));
+
+    // These predicates should not be able to filter out the page
+    assertFalse(canDrop(size(nestedListColumn, Operators.Size.Operator.LTE, 1), columnMeta));
+    assertFalse(canDrop(size(nestedListColumn, Operators.Size.Operator.LT, 1), columnMeta));
+    assertFalse(canDrop(size(nestedListColumn, Operators.Size.Operator.EQ, 0), columnMeta));
+  }
+
+  private static SizeStatistics createSizeStatisticsForRepeatedField(

Review Comment:
   I'm dynamically generating SizeStatistics for each test case, which does add a lot of LOC to the file--I could also just replace it with the computed `SizeStatistics` for each test case if that's simpler. I just wrote it this way originally because I wasn't that confident in my ability to translate the striping algorithm by hand for all these cases 😅
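For reference, a minimal sketch of what such a helper has to compute for the simple case of one repeated level (`maxRepetitionLevel = 1`). This was written for this review, not taken from the PR: the method name, the `elementsRequired` flag, and the level assignments (empty list at def 0, null element one level below max, present element at max) are all assumptions, and the PR's actual helper and the real `SizeStatistics` construction may differ:

```java
import java.util.List;

class LevelHistogramSketch {
  // Hypothetical helper: derives repetition/definition level histograms for a
  // list column with one repeated level. Returns {repHist, defHist}.
  static long[][] buildLevelHistograms(boolean elementsRequired, List<List<Integer>> records) {
    int maxDef = elementsRequired ? 1 : 2;
    long[] repHist = new long[2];          // counts for rep levels 0..1
    long[] defHist = new long[maxDef + 1]; // counts for def levels 0..maxDef
    for (List<Integer> rec : records) {
      if (rec.isEmpty()) {
        repHist[0]++; // an empty list still emits one value...
        defHist[0]++; // ...at definition level 0
      } else {
        for (int i = 0; i < rec.size(); i++) {
          repHist[i == 0 ? 0 : 1]++; // first element opens the record, the rest repeat
          defHist[rec.get(i) == null ? maxDef - 1 : maxDef]++; // nulls sit one level down
        }
      }
    }
    return new long[][] {repHist, defHist};
  }
}
```

Feeding the two histograms into however `SizeStatistics` is actually built would then yield the stats the filter consumes; for the populated case in the first test this sketch produces `repHist = [4, 4]` and `defHist = [1, 7]`, consistent with the worked example above.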
########## parquet-column/src/main/java/org/apache/parquet/internal/column/columnindex/ColumnIndexBuilder.java:
##########
@@ -378,6 +379,11 @@ public <T extends Comparable<T>> PrimitiveIterator.OfInt visit(Contains<T> conta
           indices -> IndexIterator.all(getPageCount()));
     }
 
+    @Override
+    public PrimitiveIterator.OfInt visit(Size size) {
+      return IndexIterator.all(getPageCount());

Review Comment:
   `repetitionLevelHistogram` and `definitionLevelHistogram` are both in scope here; should I repeat the logic from `StatisticsFilter`, or is that completely redundant?

########## parquet-column/src/main/java/org/apache/parquet/filter2/recordlevel/IncrementallyUpdatedFilterPredicate.java:
##########
@@ -223,6 +224,74 @@ public void reset() {
     }
   }
 
+  class CountingValueInspector extends ValueInspector {
+    private long observedValueCount;
+    private final ValueInspector delegate;
+    private final Function<Long, Boolean> shouldUpdateDelegate;
+
+    public CountingValueInspector(ValueInspector delegate, Function<Long, Boolean> shouldUpdateDelegate) {
+      this.observedValueCount = 0;
+      this.delegate = delegate;
+      this.shouldUpdateDelegate = shouldUpdateDelegate;

Review Comment:
   note: the `shouldUpdateDelegate` is needed since we don't want to terminate prematurely with a false positive. For example, if we're filtering on `size(eq(3))` but the input array has 4 elements, we want to prevent the delegated `Eq` from returning true after it hits the third element because it thinks the condition is satisfied.
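A standalone illustration of that failure mode (written for this review; the latching mirrors how a `ValueInspector` holds its result once set, but none of the names below are the PR's API):

```java
public class SizeEqFalsePositive {
  public static void main(String[] args) {
    long target = 3;                   // size(eq(3))
    int[] elements = {10, 20, 30, 40}; // the record's array has 4 elements

    long observedValueCount = 0;
    boolean eagerResult = false; // what an unguarded, latching delegate reports
    for (int element : elements) {
      observedValueCount++;
      // Without a guard, the Eq delegate is consulted after every element;
      // it latches true once the count hits 3 and never un-latches:
      if (!eagerResult && observedValueCount == target) {
        eagerResult = true; // false positive: a 4th element is still coming
      }
    }
    // Deferring the delegate until the count is final gives the right answer:
    boolean correctResult = (observedValueCount == target); // false: size is 4
    System.out.println(eagerResult + " vs " + correctResult); // prints: true vs false
  }
}
```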