Thanks Alan, I indeed committed more than I intended!

On Tue, Sep 5, 2017 at 19:49, Alan Woodward <[email protected]> wrote:
> Hey Adrien, I think you included your patch for LUCENE-7958 here as well?
>
> > On 5 Sep 2017, at 17:40, [email protected] wrote:
> >
> > Repository: lucene-solr
> > Updated Branches:
> >   refs/heads/branch_7_0 d6923cb4b -> 1899b36bd
> >   refs/heads/branch_7x  df1d09de2 -> 61a48f029
> >   refs/heads/master     5a8eb5388 -> 96150badc
> >
> >
> > LUCENE-7956: Fixed potential stack overflow error in ICUNormalizer2CharFilter.
> >
> >
> > Project: http://git-wip-us.apache.org/repos/asf/lucene-solr/repo
> > Commit: http://git-wip-us.apache.org/repos/asf/lucene-solr/commit/96150bad
> > Tree: http://git-wip-us.apache.org/repos/asf/lucene-solr/tree/96150bad
> > Diff: http://git-wip-us.apache.org/repos/asf/lucene-solr/diff/96150bad
> >
> > Branch: refs/heads/master
> > Commit: 96150badce8234cac00a23c2d5da55545e0be958
> > Parents: 5a8eb53
> > Author: Adrien Grand <[email protected]>
> > Authored: Tue Sep 5 18:34:05 2017 +0200
> > Committer: Adrien Grand <[email protected]>
> > Committed: Tue Sep 5 18:34:05 2017 +0200
> >
> > ----------------------------------------------------------------------
> >  lucene/CHANGES.txt                              |   3 +
> >  .../analysis/icu/ICUNormalizer2CharFilter.java  |  35 +++--
> >  .../icu/TestICUNormalizer2CharFilter.java       |  21 +++
> >  .../org/apache/lucene/search/DisiWrapper.java   |  12 ++
> >  .../apache/lucene/search/TermInSetQuery.java    | 143 ++++---------------
> >  5 files changed, 89 insertions(+), 125 deletions(-)
> > ----------------------------------------------------------------------
> >
> >
> > http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/96150bad/lucene/CHANGES.txt
> > ----------------------------------------------------------------------
> > diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
> > index a819916..02b2231 100644
> > --- a/lucene/CHANGES.txt
> > +++ b/lucene/CHANGES.txt
> > @@ -195,6 +195,9 @@ Bug Fixes
> >  * LUCENE-7864: IndexMergeTool is not using intermediate hard links (even
> >    if possible). (Dawid Weiss)
> >
> > +* LUCENE-7956: Fixed potential stack overflow error in ICUNormalizer2CharFilter.
> > +  (Adrien Grand)
> > +
> >  Improvements
> >
> >  * LUCENE-7489: Better storage of sparse doc-values fields with the default
> >
> >
> > http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/96150bad/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/ICUNormalizer2CharFilter.java
> > ----------------------------------------------------------------------
> > diff --git a/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/ICUNormalizer2CharFilter.java b/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/ICUNormalizer2CharFilter.java
> > index 706550a..c529f74 100644
> > --- a/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/ICUNormalizer2CharFilter.java
> > +++ b/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/ICUNormalizer2CharFilter.java
> > @@ -21,6 +21,7 @@ import java.io.IOException;
> >  import java.io.Reader;
> >  import java.util.Objects;
> >
> > +import org.apache.lucene.analysis.CharacterUtils;
> >  import org.apache.lucene.analysis.charfilter.BaseCharFilter;
> >
> >  import com.ibm.icu.text.Normalizer2;
> > @@ -61,7 +62,7 @@ public final class ICUNormalizer2CharFilter extends BaseCharFilter {
> >    ICUNormalizer2CharFilter(Reader in, Normalizer2 normalizer, int bufferSize) {
> >      super(in);
> >      this.normalizer = Objects.requireNonNull(normalizer);
> > -    this.tmpBuffer = new char[bufferSize];
> > +    this.tmpBuffer = CharacterUtils.newCharacterBuffer(bufferSize);
> >    }
> >
> >    @Override
> > @@ -94,23 +95,31 @@ public final class ICUNormalizer2CharFilter extends BaseCharFilter {
> >      return -1;
> >    }
> >
> > -  private final char[] tmpBuffer;
> > +  private final CharacterUtils.CharacterBuffer tmpBuffer;
> >
> > -  private int readInputToBuffer() throws IOException {
> > -    final int len = input.read(tmpBuffer);
> > -    if (len == -1) {
> > -      inputFinished = true;
> > -      return 0;
> > +  private void readInputToBuffer() throws IOException {
> > +    while (true) {
> > +      // CharacterUtils.fill is supplementary char aware
> > +      final boolean hasRemainingChars = CharacterUtils.fill(tmpBuffer, input);
> > +
> > +      assert tmpBuffer.getOffset() == 0;
> > +      inputBuffer.append(tmpBuffer.getBuffer(), 0, tmpBuffer.getLength());
> > +
> > +      if (hasRemainingChars == false) {
> > +        inputFinished = true;
> > +        break;
> > +      }
> > +
> > +      final int lastCodePoint = Character.codePointBefore(tmpBuffer.getBuffer(), tmpBuffer.getLength());
> > +      if (normalizer.isInert(lastCodePoint)) {
> > +        // we require an inert char so that we can normalize content before and
> > +        // after this character independently
> > +        break;
> > +      }
> >      }
> > -    inputBuffer.append(tmpBuffer, 0, len);
> >
> >      // if checkedInputBoundary was at the end of a buffer, we need to check that char again
> >      checkedInputBoundary = Math.max(checkedInputBoundary - 1, 0);
> > -    // this loop depends on 'isInert' (changes under normalization) but looks only at characters.
> > -    // so we treat all surrogates as non-inert for simplicity
> > -    if (normalizer.isInert(tmpBuffer[len - 1]) && !Character.isSurrogate(tmpBuffer[len-1])) {
> > -      return len;
> > -    } else return len + readInputToBuffer();
> >    }
> >
> >    private int readAndNormalizeFromInput() {
> >
> >
> > http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/96150bad/lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/TestICUNormalizer2CharFilter.java
> > ----------------------------------------------------------------------
> > diff --git a/lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/TestICUNormalizer2CharFilter.java b/lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/TestICUNormalizer2CharFilter.java
> > index 438a931..822466f 100644
> > --- a/lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/TestICUNormalizer2CharFilter.java
> > +++ b/lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/TestICUNormalizer2CharFilter.java
> > @@ -20,12 +20,14 @@ package org.apache.lucene.analysis.icu;
> >  import java.io.IOException;
> >  import java.io.Reader;
> >  import java.io.StringReader;
> > +import java.util.Arrays;
> >
> >  import org.apache.lucene.analysis.Analyzer;
> >  import org.apache.lucene.analysis.BaseTokenStreamTestCase;
> >  import org.apache.lucene.analysis.CharFilter;
> >  import org.apache.lucene.analysis.MockTokenizer;
> >  import org.apache.lucene.analysis.Tokenizer;
> > +import org.apache.lucene.analysis.core.KeywordTokenizer;
> >  import org.apache.lucene.analysis.ngram.NGramTokenizer;
> >  import org.apache.lucene.util.TestUtil;
> >
> > @@ -418,4 +420,23 @@ public class TestICUNormalizer2CharFilter extends BaseTokenStreamTestCase {
> >      }
> >      a.close();
> >    }
> > +
> > +  // https://issues.apache.org/jira/browse/LUCENE-7956
> > +  public void testVeryLargeInputOfNonInertChars() throws Exception {
> > +    char[] text = new char[1000000];
> > +    Arrays.fill(text, 'a');
> > +    try (Analyzer a = new Analyzer() {
> > +      @Override
> > +      protected TokenStreamComponents createComponents(String fieldName) {
> > +        return new TokenStreamComponents(new KeywordTokenizer());
> > +      }
> > +
> > +      @Override
> > +      protected Reader initReader(String fieldName, Reader reader) {
> > +        return new ICUNormalizer2CharFilter(reader, Normalizer2.getInstance(null, "nfkc_cf", Normalizer2.Mode.COMPOSE));
> > +      }
> > +    }) {
> > +      checkAnalysisConsistency(random(), a, false, new String(text));
> > +    }
> > +  }
> >  }
> >
> >
> > http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/96150bad/lucene/core/src/java/org/apache/lucene/search/DisiWrapper.java
> > ----------------------------------------------------------------------
> > diff --git a/lucene/core/src/java/org/apache/lucene/search/DisiWrapper.java b/lucene/core/src/java/org/apache/lucene/search/DisiWrapper.java
> > index f254340..28ba989 100644
> > --- a/lucene/core/src/java/org/apache/lucene/search/DisiWrapper.java
> > +++ b/lucene/core/src/java/org/apache/lucene/search/DisiWrapper.java
> > @@ -60,6 +60,18 @@ public class DisiWrapper {
> >      }
> >    }
> >
> > +  // For TermInSetQuery
> > +  public DisiWrapper(DocIdSetIterator iterator) {
> > +    this.scorer = null;
> > +    this.spans = null;
> > +    this.iterator = iterator;
> > +    this.cost = iterator.cost();
> > +    this.doc = -1;
> > +    this.twoPhaseView = null;
> > +    this.approximation = iterator;
> > +    this.matchCost = 0f;
> > +  }
> > +
> >    public DisiWrapper(Spans spans) {
> >      this.scorer = null;
> >      this.spans = spans;
> >
> >
> > http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/96150bad/lucene/core/src/java/org/apache/lucene/search/TermInSetQuery.java
> > ----------------------------------------------------------------------
> > diff --git a/lucene/core/src/java/org/apache/lucene/search/TermInSetQuery.java b/lucene/core/src/java/org/apache/lucene/search/TermInSetQuery.java
> > index 9b64d37..5a6676f 100644
> > --- a/lucene/core/src/java/org/apache/lucene/search/TermInSetQuery.java
> > +++ b/lucene/core/src/java/org/apache/lucene/search/TermInSetQuery.java
> > @@ -17,12 +17,9 @@
> >  package org.apache.lucene.search;
> >
> >  import java.io.IOException;
> > -import java.util.ArrayList;
> >  import java.util.Arrays;
> >  import java.util.Collection;
> >  import java.util.Collections;
> > -import java.util.List;
> > -import java.util.Objects;
> >  import java.util.Set;
> >  import java.util.SortedSet;
> >
> > @@ -33,8 +30,6 @@ import org.apache.lucene.index.PostingsEnum;
> >  import org.apache.lucene.index.PrefixCodedTerms;
> >  import org.apache.lucene.index.PrefixCodedTerms.TermIterator;
> >  import org.apache.lucene.index.Term;
> > -import org.apache.lucene.index.TermContext;
> > -import org.apache.lucene.index.TermState;
> >  import org.apache.lucene.index.Terms;
> >  import org.apache.lucene.index.TermsEnum;
> >  import org.apache.lucene.search.BooleanClause.Occur;
> > @@ -43,6 +38,7 @@ import org.apache.lucene.util.ArrayUtil;
> >  import org.apache.lucene.util.BytesRef;
> >  import org.apache.lucene.util.BytesRefBuilder;
> >  import org.apache.lucene.util.DocIdSetBuilder;
> > +import org.apache.lucene.util.PriorityQueue;
> >  import org.apache.lucene.util.RamUsageEstimator;
> >
> >  /**
> > @@ -171,39 +167,6 @@ public class TermInSetQuery extends Query implements Accountable {
> >      return Collections.emptyList();
> >    }
> >
> > -  private static class TermAndState {
> > -    final String field;
> > -    final TermsEnum termsEnum;
> > -    final BytesRef term;
> > -    final TermState state;
> > -    final int docFreq;
> > -    final long totalTermFreq;
> > -
> > -    TermAndState(String field, TermsEnum termsEnum) throws IOException {
> > -      this.field = field;
> > -      this.termsEnum = termsEnum;
> > -      this.term = BytesRef.deepCopyOf(termsEnum.term());
> > -      this.state = termsEnum.termState();
> > -      this.docFreq = termsEnum.docFreq();
> > -      this.totalTermFreq = termsEnum.totalTermFreq();
> > -    }
> > -  }
> > -
> > -  private static class WeightOrDocIdSet {
> > -    final Weight weight;
> > -    final DocIdSet set;
> > -
> > -    WeightOrDocIdSet(Weight weight) {
> > -      this.weight = Objects.requireNonNull(weight);
> > -      this.set = null;
> > -    }
> > -
> > -    WeightOrDocIdSet(DocIdSet bitset) {
> > -      this.set = bitset;
> > -      this.weight = null;
> > -    }
> > -  }
> > -
> >    @Override
> >    public Weight createWeight(IndexSearcher searcher, boolean needsScores, float boost) throws IOException {
> >      return new ConstantScoreWeight(this, boost) {
> > @@ -216,11 +179,8 @@ public class TermInSetQuery extends Query implements Accountable {
> >          // order to protect highlighters
> >        }
> >
> > -      /**
> > -       * On the given leaf context, try to either rewrite to a disjunction if
> > -       * there are few matching terms, or build a bitset containing matching docs.
> > -       */
> > -      private WeightOrDocIdSet rewrite(LeafReaderContext context) throws IOException {
> > +      @Override
> > +      public Scorer scorer(LeafReaderContext context) throws IOException {
> >          final LeafReader reader = context.reader();
> >
> >          Terms terms = reader.terms(field);
> > @@ -231,90 +191,49 @@ public class TermInSetQuery extends Query implements Accountable {
> >          PostingsEnum docs = null;
> >          TermIterator iterator = termData.iterator();
> >
> > -        // We will first try to collect up to 'threshold' terms into 'matchingTerms'
> > -        // if there are two many terms, we will fall back to building the 'builder'
> > -        final int threshold = Math.min(BOOLEAN_REWRITE_TERM_COUNT_THRESHOLD, BooleanQuery.getMaxClauseCount());
> > -        assert termData.size() > threshold : "Query should have been rewritten";
> > -        List<TermAndState> matchingTerms = new ArrayList<>(threshold);
> > -        DocIdSetBuilder builder = null;
> > +        // Here we partition postings based on cost: longer ones will be consumed
> > +        // lazily while shorter ones are consumed eagerly into a bitset. Compared to
> > +        // putting everything into a bitset, this should help skip over unnecessary doc
> > +        // ids in the longer postings lists. This should be especially useful if
> > +        // document frequencies have a zipfian distribution.
> > +        final PriorityQueue<PostingsEnum> longestPostingsLists = new PriorityQueue<PostingsEnum>(BOOLEAN_REWRITE_TERM_COUNT_THRESHOLD) {
> > +          @Override
> > +          protected boolean lessThan(PostingsEnum a, PostingsEnum b) {
> > +            return a.cost() < b.cost();
> > +          }
> > +        };
> > +        DocIdSetBuilder shortestPostingsLists = null;
> >
> >          for (BytesRef term = iterator.next(); term != null; term = iterator.next()) {
> >            assert field.equals(iterator.field());
> >            if (termsEnum.seekExact(term)) {
> > -            if (matchingTerms == null) {
> > -              docs = termsEnum.postings(docs, PostingsEnum.NONE);
> > -              builder.add(docs);
> > -            } else if (matchingTerms.size() < threshold) {
> > -              matchingTerms.add(new TermAndState(field, termsEnum));
> > -            } else {
> > -              assert matchingTerms.size() == threshold;
> > -              builder = new DocIdSetBuilder(reader.maxDoc(), terms);
> > -              docs = termsEnum.postings(docs, PostingsEnum.NONE);
> > -              builder.add(docs);
> > -              for (TermAndState t : matchingTerms) {
> > -                t.termsEnum.seekExact(t.term, t.state);
> > -                docs = t.termsEnum.postings(docs, PostingsEnum.NONE);
> > -                builder.add(docs);
> > +            docs = termsEnum.postings(docs, PostingsEnum.NONE);
> > +            docs = longestPostingsLists.insertWithOverflow(docs);
> > +            if (docs != null) { // the pq is full
> > +              if (shortestPostingsLists == null) {
> > +                shortestPostingsLists = new DocIdSetBuilder(reader.maxDoc());
> >                }
> > -              matchingTerms = null;
> > +              shortestPostingsLists.add(docs);
> >              }
> >            }
> >          }
> > -        if (matchingTerms != null) {
> > -          assert builder == null;
> > -          BooleanQuery.Builder bq = new BooleanQuery.Builder();
> > -          for (TermAndState t : matchingTerms) {
> > -            final TermContext termContext = new TermContext(searcher.getTopReaderContext());
> > -            termContext.register(t.state, context.ord, t.docFreq, t.totalTermFreq);
> > -            bq.add(new TermQuery(new Term(t.field, t.term), termContext), Occur.SHOULD);
> > -          }
> > -          Query q = new ConstantScoreQuery(bq.build());
> > -          final Weight weight = searcher.rewrite(q).createWeight(searcher, needsScores, score());
> > -          return new WeightOrDocIdSet(weight);
> > -        } else {
> > -          assert builder != null;
> > -          return new WeightOrDocIdSet(builder.build());
> > -        }
> > -      }
> >
> > -      private Scorer scorer(DocIdSet set) throws IOException {
> > -        if (set == null) {
> > -          return null;
> > -        }
> > -        final DocIdSetIterator disi = set.iterator();
> > -        if (disi == null) {
> > +        final int numClauses = longestPostingsLists.size() + (shortestPostingsLists == null ? 0 : 1);
> > +        if (numClauses == 0) {
> >            return null;
> >          }
> > -        return new ConstantScoreScorer(this, score(), disi);
> > -      }
> >
> > -      @Override
> > -      public BulkScorer bulkScorer(LeafReaderContext context) throws IOException {
> > -        final WeightOrDocIdSet weightOrBitSet = rewrite(context);
> > -        if (weightOrBitSet == null) {
> > -          return null;
> > -        } else if (weightOrBitSet.weight != null) {
> > -          return weightOrBitSet.weight.bulkScorer(context);
> > -        } else {
> > -          final Scorer scorer = scorer(weightOrBitSet.set);
> > -          if (scorer == null) {
> > -            return null;
> > -          }
> > -          return new DefaultBulkScorer(scorer);
> > +        DisiPriorityQueue queue = new DisiPriorityQueue(numClauses);
> > +        for (PostingsEnum postings : longestPostingsLists) {
> > +          queue.add(new DisiWrapper(postings));
> >          }
> > -      }
> > -
> > -      @Override
> > -      public Scorer scorer(LeafReaderContext context) throws IOException {
> > -        final WeightOrDocIdSet weightOrBitSet = rewrite(context);
> > -        if (weightOrBitSet == null) {
> > -          return null;
> > -        } else if (weightOrBitSet.weight != null) {
> > -          return weightOrBitSet.weight.scorer(context);
> > -        } else {
> > -          return scorer(weightOrBitSet.set);
> > +        if (shortestPostingsLists != null) {
> > +          queue.add(new DisiWrapper(shortestPostingsLists.build().iterator()));
> >          }
> > +        final DocIdSetIterator disi = new DisjunctionDISIApproximation(queue);
> > +        return new ConstantScoreScorer(this, boost, disi);
> >        }
> >      };
> >    }
> > +
> >  }
> >
> >
>
> ---------------------------------------------------------------------
> To unsubscribe, e-mail: [email protected]
> For additional commands, e-mail: [email protected]
>
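A note on the LUCENE-7956 part of this commit: the old readInputToBuffer() called itself once per buffer of input until it saw a character the normalizer considers inert, so a long run of non-inert characters (a million 'a' chars under nfkc_cf, as in the new test) meant one stack frame per buffer and a possible StackOverflowError; the fix keeps the same stopping condition but expresses it as a loop. The toy sketch below illustrates only that control-flow change and is not Lucene's code: BufferFillSketch, the space-as-inert predicate, and the 128-char buffer size are made-up stand-ins.

import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import java.util.Arrays;

// Illustrative stand-ins only: a fake "inert" predicate and a plain char buffer,
// not Lucene's CharacterUtils or ICU's Normalizer2.
public class BufferFillSketch {

  // Hypothetical boundary test: pretend spaces are safe places to cut the input.
  static boolean isInert(char c) {
    return c == ' ';
  }

  // Old shape of the logic: one recursive call per buffer until an inert char or EOF.
  // A very long run of non-inert chars grows the call stack with the input size.
  static int fillRecursive(Reader in, StringBuilder out, char[] buf) throws IOException {
    int len = in.read(buf);
    if (len == -1) {
      return 0;
    }
    out.append(buf, 0, len);
    if (isInert(buf[len - 1])) {
      return len;
    }
    return len + fillRecursive(in, out, buf);
  }

  // New shape: the same stopping condition written as a loop, so the stack depth
  // stays constant regardless of how much non-inert input arrives.
  static int fillIterative(Reader in, StringBuilder out, char[] buf) throws IOException {
    int total = 0;
    while (true) {
      int len = in.read(buf);
      if (len == -1) {
        break;
      }
      out.append(buf, 0, len);
      total += len;
      if (isInert(buf[len - 1])) {
        break;
      }
    }
    return total;
  }

  public static void main(String[] args) throws IOException {
    char[] text = new char[1_000_000];
    Arrays.fill(text, 'a'); // no inert boundary anywhere, mirroring the new test
    StringBuilder sink = new StringBuilder();
    // fillRecursive(new StringReader(new String(text)), sink, new char[128]); // may overflow the stack
    System.out.println(fillIterative(new StringReader(new String(text)), sink, new char[128]));
  }
}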
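As for the TermInSetQuery change that slipped in (the LUCENE-7958 patch Alan spotted), its scorer now partitions postings lists by cost: a bounded priority queue keeps the most expensive lists as lazy iterators, and whatever overflows the queue is drained eagerly into a bitset, so the long lists can still skip over unnecessary doc ids. The sketch below shows only that partitioning idea under stand-in types, not Lucene's API: int[] arrays of doc ids play the role of PostingsEnum, java.util.BitSet stands in for DocIdSetBuilder, and CostPartitionSketch/partition are hypothetical names.

import java.util.ArrayList;
import java.util.BitSet;
import java.util.List;
import java.util.PriorityQueue;

// Illustrative stand-ins only: int[] doc-id arrays instead of PostingsEnum,
// a BitSet instead of DocIdSetBuilder, java.util.PriorityQueue instead of Lucene's.
public class CostPartitionSketch {

  // Keep the 'limit' most expensive lists lazy; fold everything else into one bitset.
  static void partition(List<int[]> postings, int limit, List<int[]> keptLazy, BitSet eager) {
    // min-heap on cost (list length here), so the cheapest list is evicted first
    PriorityQueue<int[]> longest =
        new PriorityQueue<>((a, b) -> Integer.compare(a.length, b.length));
    for (int[] list : postings) {
      longest.add(list);
      if (longest.size() > limit) {
        // overflow: the currently cheapest list is consumed eagerly into the bitset
        for (int doc : longest.poll()) {
          eager.set(doc);
        }
      }
    }
    keptLazy.addAll(longest);
  }

  public static void main(String[] args) {
    List<int[]> postings = List.of(
        new int[] {1, 5, 9, 42, 77, 100},           // long list, stays lazy
        new int[] {3, 4},                           // short lists end up in the bitset
        new int[] {8},
        new int[] {5, 77, 98, 99, 101, 200, 300});  // long list, stays lazy
    List<int[]> lazy = new ArrayList<>();
    BitSet eager = new BitSet();
    partition(postings, 2, lazy, eager);
    System.out.println("lazy iterators: " + lazy.size() + ", eager docs: " + eager);
  }
}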
