dsmiley commented on a change in pull request #2:
URL: https://github.com/apache/solr/pull/2#discussion_r594859845
##########
File path: solr/core/src/java/org/apache/solr/search/SortedIntDocSet.java
##########
@@ -671,107 +673,141 @@ public DocSet union(DocSet other) {
     return new BitDocSet(newbits);
   }
 
-  @Override
-  public Filter getTopFilter() {
-    return new Filter() {
+  private int[] cachedOrdIdxMap;
+  private int[] getOrdIdxMap(LeafReaderContext ctx) {
+    final int[] ret;
+    if (ctx.isTopLevel) {
+      // don't bother caching this?
+      ret = new int[] {0, docs.length - 1};
+    } else if (cachedOrdIdxMap != null) {
+      ret = cachedOrdIdxMap;
+    } else {
+      List<LeafReaderContext> leaves = ReaderUtil.getTopLevelContext(ctx).leaves();
+      ret = new int[leaves.size() << 1];
       int lastEndIdx = 0;
-
-      @Override
-      public DocIdSet getDocIdSet(final LeafReaderContext context, final Bits acceptDocs) {
-        LeafReader reader = context.reader();
-        // all Solr DocSets that are used as filters only include live docs
-        final Bits acceptDocs2 = acceptDocs == null ? null : (reader.getLiveDocs() == acceptDocs ? null : acceptDocs);
-
-        final int base = context.docBase;
-        final int maxDoc = reader.maxDoc();
+      for (LeafReaderContext lrc : leaves) {
+        final int base = lrc.docBase;
+        final int maxDoc = lrc.reader().maxDoc();
         final int max = base + maxDoc;   // one past the max doc in this segment.
         int sidx = Math.max(0,lastEndIdx);
-        if (sidx > 0 && docs[sidx-1] >= base) {
-          // oops, the lastEndIdx isn't correct... we must have been used
-          // in a multi-threaded context, or the indexreaders are being
-          // used out-of-order.  start at 0.
-          sidx = 0;
-        }
         if (sidx < docs.length && docs[sidx] < base) {
           // if docs[sidx] is < base, we need to seek to find the real start.
           sidx = findIndex(docs, base, sidx, docs.length-1);
         }
-        final int startIdx = sidx;
-
         // Largest possible end index is limited to the start index
         // plus the number of docs contained in the segment.  Subtract 1 since
         // the end index is inclusive.
-        int eidx = Math.min(docs.length, startIdx + maxDoc) - 1;
+        int eidx = Math.min(docs.length, sidx + maxDoc) - 1;
         // find the real end
-        eidx = findIndex(docs, max, startIdx, eidx) - 1;
+        eidx = findIndex(docs, max, sidx, eidx) - 1;
+
+        final int mapOrdIdx = lrc.ord << 1;
+        ret[mapOrdIdx] = sidx;
+        ret[mapOrdIdx + 1] = eidx;
+        lastEndIdx = eidx;
+      }
+      cachedOrdIdxMap = ret; // replace atomically after building
+    }
+    return ret;
+  }
+
+  @Override
+  public DocIdSetIterator iterator(LeafReaderContext context) {
+
+    if (docs.length == 0 || context.reader().maxDoc() < 1) {
+      // empty docset or entirely empty segment (verified that the latter actually happens)
+      // NOTE: wrt the "empty docset" case, this is not just an optimization; this shortcircuits also
+      // to prevent the static DocSet.EmptyLazyHolder.INSTANCE from having cachedOrdIdxMap initiated
+      // across different IndexReaders.
+      return null;
+    }
+
+    int[] ordIdxMap = getOrdIdxMap(context);
+    final int base = context.docBase;
+
+    final int mapOrdIdx = context.ord << 1;

Review comment:
   Why the left shift (times two)? (deserves a comment)
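For context, the shift is index math: the map stores two `int` slots per leaf, so `ord << 1` equals `ord * 2` and addresses the start of that leaf's (start, end) pair. A minimal sketch of the layout, with hypothetical names and dummy values rather than the PR's code:

```java
public class OrdIdxLayoutDemo {
  public static void main(String[] args) {
    int numLeaves = 3;                          // hypothetical leaf count
    int[] ordIdxMap = new int[numLeaves << 1];  // two int slots per leaf ordinal
    for (int ord = 0; ord < numLeaves; ord++) {
      int mapOrdIdx = ord << 1;                 // same as ord * 2
      ordIdxMap[mapOrdIdx] = ord * 100;         // dummy start index for this leaf
      ordIdxMap[mapOrdIdx + 1] = ord * 100 + 9; // dummy inclusive end index
    }
    // The pair for ordinal 2 occupies slots 4 and 5:
    System.out.println(ordIdxMap[2 << 1] + ".." + ordIdxMap[(2 << 1) + 1]); // prints 200..209
  }
}
```
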
##########
File path: solr/core/src/java/org/apache/solr/search/SortedIntDocSet.java
##########
@@ -671,107 +673,141 @@ public DocSet union(DocSet other) {
+    int[] ordIdxMap = getOrdIdxMap(context);
+    final int base = context.docBase;
+
+    final int mapOrdIdx = context.ord << 1;
+    final int startIdx = ordIdxMap[mapOrdIdx];
+    final int endIdx = ordIdxMap[mapOrdIdx + 1];
+
+    if (startIdx > endIdx) {
+      return null; // verified this does happen

Review comment:
   Wha?! Oh, is endIdx not +1 beyond? For consistency with most of these algorithms, let's have endIdx be one beyond.
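The "one beyond" suggestion is the usual half-open interval convention. A small illustrative contrast between the two conventions, using hypothetical helpers rather than the PR's code:

```java
public class EndIdxConventionDemo {
  // Inclusive end (as in the patch): an empty range must be signalled by
  // startIdx > endIdx, hence the extra emptiness check above.
  static int sumInclusive(int[] docs, int startIdx, int endIdx) {
    int sum = 0;
    for (int i = startIdx; i <= endIdx; i++) sum += docs[i];
    return sum;
  }

  // Exclusive ("one beyond") end: an empty range is simply startIdx == endIdx,
  // and endIdx - startIdx is directly the number of docs in the range.
  static int sumExclusive(int[] docs, int startIdx, int endIdx) {
    int sum = 0;
    for (int i = startIdx; i < endIdx; i++) sum += docs[i];
    return sum;
  }

  public static void main(String[] args) {
    int[] docs = {5, 9, 12, 40};
    System.out.println(sumInclusive(docs, 1, 2)); // 21 (docs 9 and 12)
    System.out.println(sumExclusive(docs, 1, 3)); // 21 (the same docs)
  }
}
```
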
##########
File path: solr/core/src/java/org/apache/solr/search/SortedIntDocSet.java
##########
@@ -671,107 +679,141 @@ public DocSet union(DocSet other) {
     return new BitDocSet(newbits);
   }
 
-  @Override
-  public Filter getTopFilter() {
-    return new Filter() {
+  private int[] cachedOrdIdxMap;

Review comment:
   Achieving thread-safety without `volatile`, `synchronized`, or some higher-level things that tend to use them is tricky. I see that this array ought to always have the same content. If this were an `int` or other 4-byte primitive, or a final/immutable object, then I'd be comfortable. But arrays... I'm not sure. It'd be good to get another opinion.

   I don't think computing this up-front is always a waste of time. But I can see that asking the caller to do this is not as clean as keeping this matter internal; moreover, it's the places other than DocSetBuilder that are harder to figure out. So it's not important to me, provided the caching works.
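For reference, the two conventional ways to get safe publication of such a cache in Java are a `volatile` field or a holder object with a `final` field; without either, the patch's "replace atomically after building" covers atomicity of the reference write but not visibility of the array contents to other threads. A minimal sketch under those assumptions, with a hypothetical class name, not a drop-in for the PR:

```java
public class OrdIdxCacheSketch {
  // Option 1: volatile. The field write happens-after the array is filled, so a
  // thread that reads a non-null reference is guaranteed to see the contents.
  private volatile int[] cachedOrdIdxMap;

  int[] getOrCompute(int numLeaves) {
    int[] ret = cachedOrdIdxMap;   // one volatile read
    if (ret == null) {
      ret = new int[numLeaves << 1];
      // ... fill ret completely ...
      cachedOrdIdxMap = ret;       // publish only after it is fully built
    }
    return ret; // a racy duplicate build is benign: every build yields the same content
  }

  // Option 2: final-field semantics (JLS 17.5). Even when the Holder reference is
  // published through a plain field, a reader that sees the Holder also sees the
  // array contents that existed when the constructor finished.
  private static final class Holder {
    final int[] map;
    Holder(int[] map) { this.map = map; }
  }
  private Holder cachedHolder; // plain field is safe *because* Holder.map is final
}
```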