magibney commented on a change in pull request #2: URL: https://github.com/apache/solr/pull/2#discussion_r593312504
########## File path: solr/core/src/java/org/apache/solr/search/BitDocSet.java ########## @@ -237,6 +237,150 @@ public BitDocSet clone() { return new BitDocSet(bits.clone(), size); } + @Override + public Bits getBits(LeafReaderContext context) { + if (context.isTopLevel) { + return bits; + } + + final int base = context.docBase; + final int length = context.reader().maxDoc(); + final FixedBitSet bs = bits; + + return new Bits() { + @Override + public boolean get(int index) { + return bs.get(index + base); + } + + @Override + public int length() { + return length; + } + }; + } + + private static final int NO_DOCS_THIS_SEGMENT = -1; + private int[] cachedFloorDocs; + + /** + * Because `bits.nextSetBit(int)` (called in `nextDoc()`) has no upper limit, lazily cache + * floorDocs for each segment to avoid duplicate scanning of bits (and to enable optimization + * in consumers afforded by returning <code>null</code> when there are no docs for a given + * segment). + */ + private int getFloorDoc(final LeafReaderContext ctx) { + assert !ctx.isTopLevel; + final int[] floorDocs; + final int setMax = bits.length(); + if (cachedFloorDocs != null) { + floorDocs = cachedFloorDocs; + } else { + List<LeafReaderContext> leaves = ReaderUtil.getTopLevelContext(ctx).leaves(); + floorDocs = new int[leaves.size()]; + int idx = 0; + int nextFloorDoc = -1; + for (LeafReaderContext c : leaves) { + final int base = c.docBase; + final int max = base + c.reader().maxDoc(); + final int recordFloorDoc; + if (nextFloorDoc >= max) { + recordFloorDoc = NO_DOCS_THIS_SEGMENT; + } else if (nextFloorDoc >= base) { + recordFloorDoc = nextFloorDoc; + } else if (setMax <= base || (nextFloorDoc = bits.nextSetBit(base)) >= max) { + recordFloorDoc = NO_DOCS_THIS_SEGMENT; + } else { + recordFloorDoc = nextFloorDoc; + } + floorDocs[idx++] = recordFloorDoc; + } + + cachedFloorDocs = floorDocs; + } + return floorDocs[ctx.ord]; + } + + @Override + public DocIdSetIterator iterator(LeafReaderContext context) { + if (context.isTopLevel) { + switch (size) { + case 0: + return null; + default: + // we have an explicit size; use it + return new BitSetIterator(bits, size); + case -1: + // size has not been computed; use bits.length() as an upper bound on cost + final int maxSize = bits.length(); + if (maxSize < 1) { + return null; + } else { + return new BitSetIterator(bits, maxSize); + } + } + } + + final int maxDoc = context.reader().maxDoc(); + if (maxDoc < 1) { + // entirely empty segment; verified this actually happens + return null; + } + + final int firstDocId = getFloorDoc(context); Review comment: True, but `docBase` and `reader.maxDoc()` refer to the entire segment. What we're caching here is the floor doc for this segment that's actually present in this DocSet, which helps support returning `null` in place of empty iterators for segments that contain docs, but none that are included in this DocSet. (the main goal here isn't saving the returning of the iterator _per se_, but supporting empty-iterator optimizations in consumers). ---------------------------------------------------------------- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org