Thanks Alan, I indeed committed more than I intended!

On Tue, Sep 5, 2017 at 19:49, Alan Woodward <[email protected]> wrote:
> Hey Adrien, I think you included your patch for LUCENE-7958 here as well?
>
> > On 5 Sep 2017, at 17:40, [email protected] wrote:
> >
> > Repository: lucene-solr
> > Updated Branches:
> >   refs/heads/branch_7_0 d6923cb4b -> 1899b36bd
> >   refs/heads/branch_7x  df1d09de2 -> 61a48f029
> >   refs/heads/master     5a8eb5388 -> 96150badc
> >
> >
> > LUCENE-7956: Fixed potential stack overflow error in ICUNormalizer2CharFilter.
> >
> >
> > Project: http://git-wip-us.apache.org/repos/asf/lucene-solr/repo
> > Commit: http://git-wip-us.apache.org/repos/asf/lucene-solr/commit/96150bad
> > Tree: http://git-wip-us.apache.org/repos/asf/lucene-solr/tree/96150bad
> > Diff: http://git-wip-us.apache.org/repos/asf/lucene-solr/diff/96150bad
> >
> > Branch: refs/heads/master
> > Commit: 96150badce8234cac00a23c2d5da55545e0be958
> > Parents: 5a8eb53
> > Author: Adrien Grand <[email protected]>
> > Authored: Tue Sep 5 18:34:05 2017 +0200
> > Committer: Adrien Grand <[email protected]>
> > Committed: Tue Sep 5 18:34:05 2017 +0200
> >
> > ----------------------------------------------------------------------
> >  lucene/CHANGES.txt                              |   3 +
> >  .../analysis/icu/ICUNormalizer2CharFilter.java  |  35 +++--
> >  .../icu/TestICUNormalizer2CharFilter.java       |  21 +++
> >  .../org/apache/lucene/search/DisiWrapper.java   |  12 ++
> >  .../apache/lucene/search/TermInSetQuery.java    | 143 ++++---------------
> >  5 files changed, 89 insertions(+), 125 deletions(-)
> > ----------------------------------------------------------------------
> >
> >
> > http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/96150bad/lucene/CHANGES.txt
> > ----------------------------------------------------------------------
> > diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
> > index a819916..02b2231 100644
> > --- a/lucene/CHANGES.txt
> > +++ b/lucene/CHANGES.txt
> > @@ -195,6 +195,9 @@ Bug Fixes
> >  * LUCENE-7864: IndexMergeTool is not using intermediate hard links (even
> >    if possible). (Dawid Weiss)
> >
> > +* LUCENE-7956: Fixed potential stack overflow error in ICUNormalizer2CharFilter.
> > +  (Adrien Grand)
> > +
> >  Improvements
> >
> >  * LUCENE-7489: Better storage of sparse doc-values fields with the default
> >
> >
> > http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/96150bad/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/ICUNormalizer2CharFilter.java
> > ----------------------------------------------------------------------
> > diff --git a/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/ICUNormalizer2CharFilter.java b/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/ICUNormalizer2CharFilter.java
> > index 706550a..c529f74 100644
> > --- a/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/ICUNormalizer2CharFilter.java
> > +++ b/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/ICUNormalizer2CharFilter.java
> > @@ -21,6 +21,7 @@ import java.io.IOException;
> >  import java.io.Reader;
> >  import java.util.Objects;
> >
> > +import org.apache.lucene.analysis.CharacterUtils;
> >  import org.apache.lucene.analysis.charfilter.BaseCharFilter;
> >
> >  import com.ibm.icu.text.Normalizer2;
> > @@ -61,7 +62,7 @@ public final class ICUNormalizer2CharFilter extends BaseCharFilter {
> >    ICUNormalizer2CharFilter(Reader in, Normalizer2 normalizer, int bufferSize) {
> >      super(in);
> >      this.normalizer = Objects.requireNonNull(normalizer);
> > -    this.tmpBuffer = new char[bufferSize];
> > +    this.tmpBuffer = CharacterUtils.newCharacterBuffer(bufferSize);
> >    }
> >
> >    @Override
> > @@ -94,23 +95,31 @@ public final class ICUNormalizer2CharFilter extends BaseCharFilter {
> >      return -1;
> >    }
> >
> > -  private final char[] tmpBuffer;
> > +  private final CharacterUtils.CharacterBuffer tmpBuffer;
> >
> > -  private int readInputToBuffer() throws IOException {
> > -    final int len = input.read(tmpBuffer);
> > -    if (len == -1) {
> > -      inputFinished = true;
> > -      return 0;
> > +  private void readInputToBuffer() throws IOException {
> > +    while (true) {
> > +      // CharacterUtils.fill is supplementary char aware
> > +      final boolean hasRemainingChars = CharacterUtils.fill(tmpBuffer, input);
> > +
> > +      assert tmpBuffer.getOffset() == 0;
> > +      inputBuffer.append(tmpBuffer.getBuffer(), 0, tmpBuffer.getLength());
> > +
> > +      if (hasRemainingChars == false) {
> > +        inputFinished = true;
> > +        break;
> > +      }
> > +
> > +      final int lastCodePoint = Character.codePointBefore(tmpBuffer.getBuffer(), tmpBuffer.getLength());
> > +      if (normalizer.isInert(lastCodePoint)) {
> > +        // we require an inert char so that we can normalize content before and
> > +        // after this character independently
> > +        break;
> > +      }
> >      }
> > -    inputBuffer.append(tmpBuffer, 0, len);
> >
> >      // if checkedInputBoundary was at the end of a buffer, we need to check that char again
> >      checkedInputBoundary = Math.max(checkedInputBoundary - 1, 0);
> > -    // this loop depends on 'isInert' (changes under normalization) but looks only at characters.
> > -    // so we treat all surrogates as non-inert for simplicity
> > -    if (normalizer.isInert(tmpBuffer[len - 1]) && !Character.isSurrogate(tmpBuffer[len-1])) {
> > -      return len;
> > -    } else return len + readInputToBuffer();
> >    }
> >
> >    private int readAndNormalizeFromInput() {
> >
> >
> > http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/96150bad/lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/TestICUNormalizer2CharFilter.java
> > ----------------------------------------------------------------------
> > diff --git a/lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/TestICUNormalizer2CharFilter.java b/lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/TestICUNormalizer2CharFilter.java
> > index 438a931..822466f 100644
> > --- a/lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/TestICUNormalizer2CharFilter.java
> > +++ b/lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/TestICUNormalizer2CharFilter.java
> > @@ -20,12 +20,14 @@ package org.apache.lucene.analysis.icu;
> >  import java.io.IOException;
> >  import java.io.Reader;
> >  import java.io.StringReader;
> > +import java.util.Arrays;
> >
> >  import org.apache.lucene.analysis.Analyzer;
> >  import org.apache.lucene.analysis.BaseTokenStreamTestCase;
> >  import org.apache.lucene.analysis.CharFilter;
> >  import org.apache.lucene.analysis.MockTokenizer;
> >  import org.apache.lucene.analysis.Tokenizer;
> > +import org.apache.lucene.analysis.core.KeywordTokenizer;
> >  import org.apache.lucene.analysis.ngram.NGramTokenizer;
> >  import org.apache.lucene.util.TestUtil;
> >
> > @@ -418,4 +420,23 @@ public class TestICUNormalizer2CharFilter extends BaseTokenStreamTestCase {
> >      }
> >      a.close();
> >    }
> > +
> > +  // https://issues.apache.org/jira/browse/LUCENE-7956
> > +  public void testVeryLargeInputOfNonInertChars() throws Exception {
> > +    char[] text = new char[1000000];
> > +    Arrays.fill(text, 'a');
> > +    try (Analyzer a = new Analyzer() {
> > +      @Override
> > +      protected TokenStreamComponents createComponents(String fieldName) {
> > +        return new TokenStreamComponents(new KeywordTokenizer());
> > +      }
> > +
> > +      @Override
> > +      protected Reader initReader(String fieldName, Reader reader) {
> > +        return new ICUNormalizer2CharFilter(reader, Normalizer2.getInstance(null, "nfkc_cf", Normalizer2.Mode.COMPOSE));
> > +      }
> > +    }) {
> > +      checkAnalysisConsistency(random(), a, false, new String(text));
> > +    }
> > +  }
> >  }
> >
> >
> > http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/96150bad/lucene/core/src/java/org/apache/lucene/search/DisiWrapper.java
> > ----------------------------------------------------------------------
> > diff --git a/lucene/core/src/java/org/apache/lucene/search/DisiWrapper.java b/lucene/core/src/java/org/apache/lucene/search/DisiWrapper.java
> > index f254340..28ba989 100644
> > --- a/lucene/core/src/java/org/apache/lucene/search/DisiWrapper.java
> > +++ b/lucene/core/src/java/org/apache/lucene/search/DisiWrapper.java
> > @@ -60,6 +60,18 @@ public class DisiWrapper {
> >      }
> >    }
> >
> > +  // For TermInSetQuery
> > +  public DisiWrapper(DocIdSetIterator iterator) {
> > +    this.scorer = null;
> > +    this.spans = null;
> > +    this.iterator = iterator;
> > +    this.cost = iterator.cost();
> > +    this.doc = -1;
> > +    this.twoPhaseView = null;
> > +    this.approximation = iterator;
> > +    this.matchCost = 0f;
> > +  }
> > +
> >    public DisiWrapper(Spans spans) {
> >      this.scorer = null;
> >      this.spans = spans;
> >
> >
> > http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/96150bad/lucene/core/src/java/org/apache/lucene/search/TermInSetQuery.java
> > ----------------------------------------------------------------------
> > diff --git a/lucene/core/src/java/org/apache/lucene/search/TermInSetQuery.java b/lucene/core/src/java/org/apache/lucene/search/TermInSetQuery.java
> > index 9b64d37..5a6676f 100644
> > --- a/lucene/core/src/java/org/apache/lucene/search/TermInSetQuery.java
> > +++ b/lucene/core/src/java/org/apache/lucene/search/TermInSetQuery.java
> > @@ -17,12 +17,9 @@
> >  package org.apache.lucene.search;
> >
> >  import java.io.IOException;
> > -import java.util.ArrayList;
> >  import java.util.Arrays;
> >  import java.util.Collection;
> >  import java.util.Collections;
> > -import java.util.List;
> > -import java.util.Objects;
> >  import java.util.Set;
> >  import java.util.SortedSet;
> >
> > @@ -33,8 +30,6 @@ import org.apache.lucene.index.PostingsEnum;
> >  import org.apache.lucene.index.PrefixCodedTerms;
> >  import org.apache.lucene.index.PrefixCodedTerms.TermIterator;
> >  import org.apache.lucene.index.Term;
> > -import org.apache.lucene.index.TermContext;
> > -import org.apache.lucene.index.TermState;
> >  import org.apache.lucene.index.Terms;
> >  import org.apache.lucene.index.TermsEnum;
> >  import org.apache.lucene.search.BooleanClause.Occur;
> > @@ -43,6 +38,7 @@ import org.apache.lucene.util.ArrayUtil;
> >  import org.apache.lucene.util.BytesRef;
> >  import org.apache.lucene.util.BytesRefBuilder;
> >  import org.apache.lucene.util.DocIdSetBuilder;
> > +import org.apache.lucene.util.PriorityQueue;
> >  import org.apache.lucene.util.RamUsageEstimator;
> >
> >  /**
> > @@ -171,39 +167,6 @@ public class TermInSetQuery extends Query implements Accountable {
> >      return Collections.emptyList();
> >    }
> >
> > -  private static class TermAndState {
> > -    final String field;
> > -    final TermsEnum termsEnum;
> > -    final BytesRef term;
> > -    final TermState state;
> > -    final int docFreq;
> > -    final long totalTermFreq;
> > -
> > -    TermAndState(String field, TermsEnum termsEnum) throws IOException {
> > -      this.field = field;
> > -      this.termsEnum = termsEnum;
> > -      this.term = BytesRef.deepCopyOf(termsEnum.term());
> > -      this.state = termsEnum.termState();
> > -      this.docFreq = termsEnum.docFreq();
> > -      this.totalTermFreq = termsEnum.totalTermFreq();
> > -    }
> > -  }
> > -
> > -  private static class WeightOrDocIdSet {
> > -    final Weight weight;
> > -    final DocIdSet set;
> > -
> > -    WeightOrDocIdSet(Weight weight) {
> > -      this.weight = Objects.requireNonNull(weight);
> > -      this.set = null;
> > -    }
> > -
> > -    WeightOrDocIdSet(DocIdSet bitset) {
> > -      this.set = bitset;
> > -      this.weight = null;
> > -    }
> > -  }
> > -
> >    @Override
> >    public Weight createWeight(IndexSearcher searcher, boolean needsScores, float boost) throws IOException {
> >      return new ConstantScoreWeight(this, boost) {
> > @@ -216,11 +179,8 @@ public class TermInSetQuery extends Query implements Accountable {
> >          // order to protect highlighters
> >        }
> >
> > -      /**
> > -       * On the given leaf context, try to either rewrite to a disjunction if
> > -       * there are few matching terms, or build a bitset containing matching docs.
> > -       */
> > -      private WeightOrDocIdSet rewrite(LeafReaderContext context) throws IOException {
> > +      @Override
> > +      public Scorer scorer(LeafReaderContext context) throws IOException {
> >          final LeafReader reader = context.reader();
> >
> >          Terms terms = reader.terms(field);
> > @@ -231,90 +191,49 @@ public class TermInSetQuery extends Query implements Accountable {
> >          PostingsEnum docs = null;
> >          TermIterator iterator = termData.iterator();
> >
> > -        // We will first try to collect up to 'threshold' terms into 'matchingTerms'
> > -        // if there are two many terms, we will fall back to building the 'builder'
> > -        final int threshold = Math.min(BOOLEAN_REWRITE_TERM_COUNT_THRESHOLD, BooleanQuery.getMaxClauseCount());
> > -        assert termData.size() > threshold : "Query should have been rewritten";
> > -        List<TermAndState> matchingTerms = new ArrayList<>(threshold);
> > -        DocIdSetBuilder builder = null;
> > +        // Here we partition postings based on cost: longer ones will be consumed
> > +        // lazily while shorter ones are consumed eagerly into a bitset. Compared to
> > +        // putting everything into a bitset, this should help skip over unnecessary doc
> > +        // ids in the longer postings lists. This should be especially useful if
> > +        // document frequencies have a zipfian distribution.
> > +        final PriorityQueue<PostingsEnum> longestPostingsLists = new PriorityQueue<PostingsEnum>(BOOLEAN_REWRITE_TERM_COUNT_THRESHOLD) {
> > +          @Override
> > +          protected boolean lessThan(PostingsEnum a, PostingsEnum b) {
> > +            return a.cost() < b.cost();
> > +          }
> > +        };
> > +        DocIdSetBuilder shortestPostingsLists = null;
> >
> >          for (BytesRef term = iterator.next(); term != null; term = iterator.next()) {
> >            assert field.equals(iterator.field());
> >            if (termsEnum.seekExact(term)) {
> > -            if (matchingTerms == null) {
> > -              docs = termsEnum.postings(docs, PostingsEnum.NONE);
> > -              builder.add(docs);
> > -            } else if (matchingTerms.size() < threshold) {
> > -              matchingTerms.add(new TermAndState(field, termsEnum));
> > -            } else {
> > -              assert matchingTerms.size() == threshold;
> > -              builder = new DocIdSetBuilder(reader.maxDoc(), terms);
> > -              docs = termsEnum.postings(docs, PostingsEnum.NONE);
> > -              builder.add(docs);
> > -              for (TermAndState t : matchingTerms) {
> > -                t.termsEnum.seekExact(t.term, t.state);
> > -                docs = t.termsEnum.postings(docs, PostingsEnum.NONE);
> > -                builder.add(docs);
> > +            docs = termsEnum.postings(docs, PostingsEnum.NONE);
> > +            docs = longestPostingsLists.insertWithOverflow(docs);
> > +            if (docs != null) { // the pq is full
> > +              if (shortestPostingsLists == null) {
> > +                shortestPostingsLists = new DocIdSetBuilder(reader.maxDoc());
> >                }
> > -              matchingTerms = null;
> > +              shortestPostingsLists.add(docs);
> >              }
> >            }
> >          }
> > -        if (matchingTerms != null) {
> > -          assert builder == null;
> > -          BooleanQuery.Builder bq = new BooleanQuery.Builder();
> > -          for (TermAndState t : matchingTerms) {
> > -            final TermContext termContext = new TermContext(searcher.getTopReaderContext());
> > -            termContext.register(t.state, context.ord, t.docFreq, t.totalTermFreq);
> > -            bq.add(new TermQuery(new Term(t.field, t.term), termContext), Occur.SHOULD);
> > -          }
> > -          Query q = new ConstantScoreQuery(bq.build());
> > -          final Weight weight = searcher.rewrite(q).createWeight(searcher, needsScores, score());
> > -          return new WeightOrDocIdSet(weight);
> > -        } else {
> > -          assert builder != null;
> > -          return new WeightOrDocIdSet(builder.build());
> > -        }
> > -      }
> >
> > -      private Scorer scorer(DocIdSet set) throws IOException {
> > -        if (set == null) {
> > -          return null;
> > -        }
> > -        final DocIdSetIterator disi = set.iterator();
> > -        if (disi == null) {
> > +        final int numClauses = longestPostingsLists.size() + (shortestPostingsLists == null ? 0 : 1);
> > +        if (numClauses == 0) {
> >            return null;
> >          }
> > -        return new ConstantScoreScorer(this, score(), disi);
> > -      }
> >
> > -      @Override
> > -      public BulkScorer bulkScorer(LeafReaderContext context) throws IOException {
> > -        final WeightOrDocIdSet weightOrBitSet = rewrite(context);
> > -        if (weightOrBitSet == null) {
> > -          return null;
> > -        } else if (weightOrBitSet.weight != null) {
> > -          return weightOrBitSet.weight.bulkScorer(context);
> > -        } else {
> > -          final Scorer scorer = scorer(weightOrBitSet.set);
> > -          if (scorer == null) {
> > -            return null;
> > -          }
> > -          return new DefaultBulkScorer(scorer);
> > +        DisiPriorityQueue queue = new DisiPriorityQueue(numClauses);
> > +        for (PostingsEnum postings : longestPostingsLists) {
> > +          queue.add(new DisiWrapper(postings));
> >          }
> > -      }
> > -
> > -      @Override
> > -      public Scorer scorer(LeafReaderContext context) throws IOException {
> > -        final WeightOrDocIdSet weightOrBitSet = rewrite(context);
> > -        if (weightOrBitSet == null) {
> > -          return null;
> > -        } else if (weightOrBitSet.weight != null) {
> > -          return weightOrBitSet.weight.scorer(context);
> > -        } else {
> > -          return scorer(weightOrBitSet.set);
> > +        if (shortestPostingsLists != null) {
> > +          queue.add(new DisiWrapper(shortestPostingsLists.build().iterator()));
> >          }
> > +        final DocIdSetIterator disi = new DisjunctionDISIApproximation(queue);
> > +        return new ConstantScoreScorer(this, boost, disi);
> >        }
> >      };
> >    }
> > +
> >  }
> >
> >
>
> ---------------------------------------------------------------------
> To unsubscribe, e-mail: [email protected]
> For additional commands, e-mail: [email protected]
>
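A note on the LUCENE-7956 part of this commit: the old readInputToBuffer() called itself once per buffer of input until it saw a character the normalizer considers inert, so a long run of non-inert characters (a million 'a' chars under nfkc_cf, as in the new test) meant one stack frame per buffer and a possible StackOverflowError; the fix keeps the same stopping condition but expresses it as a loop. The toy sketch below illustrates only that control-flow change and is not Lucene's code: BufferFillSketch, the space-as-inert predicate, and the 128-char buffer size are made-up stand-ins.

import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import java.util.Arrays;

// Illustrative stand-ins only: a fake "inert" predicate and a plain char buffer,
// not Lucene's CharacterUtils or ICU's Normalizer2.
public class BufferFillSketch {

  // Hypothetical boundary test: pretend spaces are safe places to cut the input.
  static boolean isInert(char c) {
    return c == ' ';
  }

  // Old shape of the logic: one recursive call per buffer until an inert char or EOF.
  // A very long run of non-inert chars grows the call stack with the input size.
  static int fillRecursive(Reader in, StringBuilder out, char[] buf) throws IOException {
    int len = in.read(buf);
    if (len == -1) {
      return 0;
    }
    out.append(buf, 0, len);
    if (isInert(buf[len - 1])) {
      return len;
    }
    return len + fillRecursive(in, out, buf);
  }

  // New shape: the same stopping condition written as a loop, so the stack depth
  // stays constant regardless of how much non-inert input arrives.
  static int fillIterative(Reader in, StringBuilder out, char[] buf) throws IOException {
    int total = 0;
    while (true) {
      int len = in.read(buf);
      if (len == -1) {
        break;
      }
      out.append(buf, 0, len);
      total += len;
      if (isInert(buf[len - 1])) {
        break;
      }
    }
    return total;
  }

  public static void main(String[] args) throws IOException {
    char[] text = new char[1_000_000];
    Arrays.fill(text, 'a'); // no inert boundary anywhere, mirroring the new test
    StringBuilder sink = new StringBuilder();
    // fillRecursive(new StringReader(new String(text)), sink, new char[128]); // may overflow the stack
    System.out.println(fillIterative(new StringReader(new String(text)), sink, new char[128]));
  }
}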
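As for the TermInSetQuery change that slipped in (the LUCENE-7958 patch Alan spotted), its scorer now partitions postings lists by cost: a bounded priority queue keeps the most expensive lists as lazy iterators, and whatever overflows the queue is drained eagerly into a bitset, so the long lists can still skip over unnecessary doc ids. The sketch below shows only that partitioning idea under stand-in types, not Lucene's API: int[] arrays of doc ids play the role of PostingsEnum, java.util.BitSet stands in for DocIdSetBuilder, and CostPartitionSketch/partition are hypothetical names.

import java.util.ArrayList;
import java.util.BitSet;
import java.util.List;
import java.util.PriorityQueue;

// Illustrative stand-ins only: int[] doc-id arrays instead of PostingsEnum,
// a BitSet instead of DocIdSetBuilder, java.util.PriorityQueue instead of Lucene's.
public class CostPartitionSketch {

  // Keep the 'limit' most expensive lists lazy; fold everything else into one bitset.
  static void partition(List<int[]> postings, int limit, List<int[]> keptLazy, BitSet eager) {
    // min-heap on cost (list length here), so the cheapest list is evicted first
    PriorityQueue<int[]> longest =
        new PriorityQueue<>((a, b) -> Integer.compare(a.length, b.length));
    for (int[] list : postings) {
      longest.add(list);
      if (longest.size() > limit) {
        // overflow: the currently cheapest list is consumed eagerly into the bitset
        for (int doc : longest.poll()) {
          eager.set(doc);
        }
      }
    }
    keptLazy.addAll(longest);
  }

  public static void main(String[] args) {
    List<int[]> postings = List.of(
        new int[] {1, 5, 9, 42, 77, 100},           // long list, stays lazy
        new int[] {3, 4},                           // short lists end up in the bitset
        new int[] {8},
        new int[] {5, 77, 98, 99, 101, 200, 300});  // long list, stays lazy
    List<int[]> lazy = new ArrayList<>();
    BitSet eager = new BitSet();
    partition(postings, 2, lazy, eager);
    System.out.println("lazy iterators: " + lazy.size() + ", eager docs: " + eager);
  }
}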
