Re: [1/3] lucene-solr:master: LUCENE-7956: Fixed potential stack overflow error in ICUNormalizer2CharFilter.

Alan Woodward Tue, 05 Sep 2017 10:50:38 -0700

Hey Adrien, I think you included your patch for LUCENE-7958 here as well?

> On 5 Sep 2017, at 17:40, [email protected] wrote:
> 
> Repository: lucene-solr
> Updated Branches:
>  refs/heads/branch_7_0 d6923cb4b -> 1899b36bd
>  refs/heads/branch_7x df1d09de2 -> 61a48f029
>  refs/heads/master 5a8eb5388 -> 96150badc
> 
> 
> LUCENE-7956: Fixed potential stack overflow error in ICUNormalizer2CharFilter.
> 
> 
> Project: http://git-wip-us.apache.org/repos/asf/lucene-solr/repo
> Commit: http://git-wip-us.apache.org/repos/asf/lucene-solr/commit/96150bad
> Tree: http://git-wip-us.apache.org/repos/asf/lucene-solr/tree/96150bad
> Diff: http://git-wip-us.apache.org/repos/asf/lucene-solr/diff/96150bad
> 
> Branch: refs/heads/master
> Commit: 96150badce8234cac00a23c2d5da55545e0be958
> Parents: 5a8eb53
> Author: Adrien Grand <[email protected]>
> Authored: Tue Sep 5 18:34:05 2017 +0200
> Committer: Adrien Grand <[email protected]>
> Committed: Tue Sep 5 18:34:05 2017 +0200
> 
> ----------------------------------------------------------------------
> lucene/CHANGES.txt                              |   3 +
> .../analysis/icu/ICUNormalizer2CharFilter.java  |  35 +++--
> .../icu/TestICUNormalizer2CharFilter.java       |  21 +++
> .../org/apache/lucene/search/DisiWrapper.java   |  12 ++
> .../apache/lucene/search/TermInSetQuery.java    | 143 ++++---------------
> 5 files changed, 89 insertions(+), 125 deletions(-)
> ----------------------------------------------------------------------
> 
> 
> http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/96150bad/lucene/CHANGES.txt
> ----------------------------------------------------------------------
> diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
> index a819916..02b2231 100644
> --- a/lucene/CHANGES.txt
> +++ b/lucene/CHANGES.txt
> @@ -195,6 +195,9 @@ Bug Fixes
> * LUCENE-7864: IndexMergeTool is not using intermediate hard links (even
>   if possible). (Dawid Weiss)
> 
> +* LUCENE-7956: Fixed potential stack overflow error in 
> ICUNormalizer2CharFilter.
> +  (Adrien Grand)
> +
> Improvements
> 
> * LUCENE-7489: Better storage of sparse doc-values fields with the default
> 
> http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/96150bad/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/ICUNormalizer2CharFilter.java
> ----------------------------------------------------------------------
> diff --git 
> a/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/ICUNormalizer2CharFilter.java
>  
> b/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/ICUNormalizer2CharFilter.java
> index 706550a..c529f74 100644
> --- 
> a/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/ICUNormalizer2CharFilter.java
> +++ 
> b/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/ICUNormalizer2CharFilter.java
> @@ -21,6 +21,7 @@ import java.io.IOException;
> import java.io.Reader;
> import java.util.Objects;
> 
> +import org.apache.lucene.analysis.CharacterUtils;
> import org.apache.lucene.analysis.charfilter.BaseCharFilter;
> 
> import com.ibm.icu.text.Normalizer2;
> @@ -61,7 +62,7 @@ public final class ICUNormalizer2CharFilter extends 
> BaseCharFilter {
>   ICUNormalizer2CharFilter(Reader in, Normalizer2 normalizer, int bufferSize) 
> {
>     super(in);
>     this.normalizer = Objects.requireNonNull(normalizer);
> -    this.tmpBuffer = new char[bufferSize];
> +    this.tmpBuffer = CharacterUtils.newCharacterBuffer(bufferSize);
>   }
> 
>   @Override
> @@ -94,23 +95,31 @@ public final class ICUNormalizer2CharFilter extends 
> BaseCharFilter {
>     return -1;
>   }
> 
> -  private final char[] tmpBuffer;
> +  private final CharacterUtils.CharacterBuffer tmpBuffer;
> 
> -  private int readInputToBuffer() throws IOException {
> -    final int len = input.read(tmpBuffer);
> -    if (len == -1) {
> -      inputFinished = true;
> -      return 0;
> +  private void readInputToBuffer() throws IOException {
> +    while (true) {
> +      // CharacterUtils.fill is supplementary char aware
> +      final boolean hasRemainingChars = CharacterUtils.fill(tmpBuffer, 
> input);
> +
> +      assert tmpBuffer.getOffset() == 0;
> +      inputBuffer.append(tmpBuffer.getBuffer(), 0, tmpBuffer.getLength());
> +
> +      if (hasRemainingChars == false) {
> +        inputFinished = true;
> +        break;
> +      }
> +
> +      final int lastCodePoint = 
> Character.codePointBefore(tmpBuffer.getBuffer(), tmpBuffer.getLength());
> +      if (normalizer.isInert(lastCodePoint)) {
> +        // we require an inert char so that we can normalize content before 
> and
> +        // after this character independently
> +        break;
> +      }
>     }
> -    inputBuffer.append(tmpBuffer, 0, len);
> 
>     // if checkedInputBoundary was at the end of a buffer, we need to check 
> that char again
>     checkedInputBoundary = Math.max(checkedInputBoundary - 1, 0);
> -    // this loop depends on 'isInert' (changes under normalization) but 
> looks only at characters.
> -    // so we treat all surrogates as non-inert for simplicity
> -    if (normalizer.isInert(tmpBuffer[len - 1]) && 
> !Character.isSurrogate(tmpBuffer[len-1])) {
> -      return len;
> -    } else return len + readInputToBuffer();
>   }
> 
>   private int readAndNormalizeFromInput() {
> 
> http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/96150bad/lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/TestICUNormalizer2CharFilter.java
> ----------------------------------------------------------------------
> diff --git 
> a/lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/TestICUNormalizer2CharFilter.java
>  
> b/lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/TestICUNormalizer2CharFilter.java
> index 438a931..822466f 100644
> --- 
> a/lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/TestICUNormalizer2CharFilter.java
> +++ 
> b/lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/TestICUNormalizer2CharFilter.java
> @@ -20,12 +20,14 @@ package org.apache.lucene.analysis.icu;
> import java.io.IOException;
> import java.io.Reader;
> import java.io.StringReader;
> +import java.util.Arrays;
> 
> import org.apache.lucene.analysis.Analyzer;
> import org.apache.lucene.analysis.BaseTokenStreamTestCase;
> import org.apache.lucene.analysis.CharFilter;
> import org.apache.lucene.analysis.MockTokenizer;
> import org.apache.lucene.analysis.Tokenizer;
> +import org.apache.lucene.analysis.core.KeywordTokenizer;
> import org.apache.lucene.analysis.ngram.NGramTokenizer;
> import org.apache.lucene.util.TestUtil;
> 
> @@ -418,4 +420,23 @@ public class TestICUNormalizer2CharFilter extends 
> BaseTokenStreamTestCase {
>     }
>     a.close();
>   }
> +
> +  // https://issues.apache.org/jira/browse/LUCENE-7956
> +  public void testVeryLargeInputOfNonInertChars() throws Exception {
> +    char[] text = new char[1000000];
> +    Arrays.fill(text, 'a');
> +    try (Analyzer a = new Analyzer() {
> +      @Override
> +      protected TokenStreamComponents createComponents(String fieldName) {
> +        return new TokenStreamComponents(new KeywordTokenizer());
> +      }
> +
> +      @Override
> +      protected Reader initReader(String fieldName, Reader reader) {
> +        return new ICUNormalizer2CharFilter(reader, 
> Normalizer2.getInstance(null, "nfkc_cf", Normalizer2.Mode.COMPOSE));
> +      }
> +    }) {
> +      checkAnalysisConsistency(random(), a, false, new String(text));
> +    }
> +  }
> }
> 
> http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/96150bad/lucene/core/src/java/org/apache/lucene/search/DisiWrapper.java
> ----------------------------------------------------------------------
> diff --git a/lucene/core/src/java/org/apache/lucene/search/DisiWrapper.java 
> b/lucene/core/src/java/org/apache/lucene/search/DisiWrapper.java
> index f254340..28ba989 100644
> --- a/lucene/core/src/java/org/apache/lucene/search/DisiWrapper.java
> +++ b/lucene/core/src/java/org/apache/lucene/search/DisiWrapper.java
> @@ -60,6 +60,18 @@ public class DisiWrapper {
>     }
>   }
> 
> +  // For TermInSetQuery
> +  public DisiWrapper(DocIdSetIterator iterator) {
> +    this.scorer = null;
> +    this.spans = null;
> +    this.iterator = iterator;
> +    this.cost = iterator.cost();
> +    this.doc = -1;
> +    this.twoPhaseView = null;
> +    this.approximation = iterator;
> +    this.matchCost = 0f;
> +  }
> +
>   public DisiWrapper(Spans spans) {
>     this.scorer = null;
>     this.spans = spans;
> 
> http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/96150bad/lucene/core/src/java/org/apache/lucene/search/TermInSetQuery.java
> ----------------------------------------------------------------------
> diff --git 
> a/lucene/core/src/java/org/apache/lucene/search/TermInSetQuery.java 
> b/lucene/core/src/java/org/apache/lucene/search/TermInSetQuery.java
> index 9b64d37..5a6676f 100644
> --- a/lucene/core/src/java/org/apache/lucene/search/TermInSetQuery.java
> +++ b/lucene/core/src/java/org/apache/lucene/search/TermInSetQuery.java
> @@ -17,12 +17,9 @@
> package org.apache.lucene.search;
> 
> import java.io.IOException;
> -import java.util.ArrayList;
> import java.util.Arrays;
> import java.util.Collection;
> import java.util.Collections;
> -import java.util.List;
> -import java.util.Objects;
> import java.util.Set;
> import java.util.SortedSet;
> 
> @@ -33,8 +30,6 @@ import org.apache.lucene.index.PostingsEnum;
> import org.apache.lucene.index.PrefixCodedTerms;
> import org.apache.lucene.index.PrefixCodedTerms.TermIterator;
> import org.apache.lucene.index.Term;
> -import org.apache.lucene.index.TermContext;
> -import org.apache.lucene.index.TermState;
> import org.apache.lucene.index.Terms;
> import org.apache.lucene.index.TermsEnum;
> import org.apache.lucene.search.BooleanClause.Occur;
> @@ -43,6 +38,7 @@ import org.apache.lucene.util.ArrayUtil;
> import org.apache.lucene.util.BytesRef;
> import org.apache.lucene.util.BytesRefBuilder;
> import org.apache.lucene.util.DocIdSetBuilder;
> +import org.apache.lucene.util.PriorityQueue;
> import org.apache.lucene.util.RamUsageEstimator;
> 
> /**
> @@ -171,39 +167,6 @@ public class TermInSetQuery extends Query implements 
> Accountable {
>     return Collections.emptyList();
>   }
> 
> -  private static class TermAndState {
> -    final String field;
> -    final TermsEnum termsEnum;
> -    final BytesRef term;
> -    final TermState state;
> -    final int docFreq;
> -    final long totalTermFreq;
> -
> -    TermAndState(String field, TermsEnum termsEnum) throws IOException {
> -      this.field = field;
> -      this.termsEnum = termsEnum;
> -      this.term = BytesRef.deepCopyOf(termsEnum.term());
> -      this.state = termsEnum.termState();
> -      this.docFreq = termsEnum.docFreq();
> -      this.totalTermFreq = termsEnum.totalTermFreq();
> -    }
> -  }
> -
> -  private static class WeightOrDocIdSet {
> -    final Weight weight;
> -    final DocIdSet set;
> -
> -    WeightOrDocIdSet(Weight weight) {
> -      this.weight = Objects.requireNonNull(weight);
> -      this.set = null;
> -    }
> -
> -    WeightOrDocIdSet(DocIdSet bitset) {
> -      this.set = bitset;
> -      this.weight = null;
> -    }
> -  }
> -
>   @Override
>   public Weight createWeight(IndexSearcher searcher, boolean needsScores, 
> float boost) throws IOException {
>     return new ConstantScoreWeight(this, boost) {
> @@ -216,11 +179,8 @@ public class TermInSetQuery extends Query implements 
> Accountable {
>         // order to protect highlighters
>       }
> 
> -      /**
> -       * On the given leaf context, try to either rewrite to a disjunction if
> -       * there are few matching terms, or build a bitset containing matching 
> docs.
> -       */
> -      private WeightOrDocIdSet rewrite(LeafReaderContext context) throws 
> IOException {
> +      @Override
> +      public Scorer scorer(LeafReaderContext context) throws IOException {
>         final LeafReader reader = context.reader();
> 
>         Terms terms = reader.terms(field);
> @@ -231,90 +191,49 @@ public class TermInSetQuery extends Query implements 
> Accountable {
>         PostingsEnum docs = null;
>         TermIterator iterator = termData.iterator();
> 
> -        // We will first try to collect up to 'threshold' terms into 
> 'matchingTerms'
> -        // if there are two many terms, we will fall back to building the 
> 'builder'
> -        final int threshold = Math.min(BOOLEAN_REWRITE_TERM_COUNT_THRESHOLD, 
> BooleanQuery.getMaxClauseCount());
> -        assert termData.size() > threshold : "Query should have been 
> rewritten";
> -        List<TermAndState> matchingTerms = new ArrayList<>(threshold);
> -        DocIdSetBuilder builder = null;
> +        // Here we partition postings based on cost: longer ones will be 
> consumed
> +        // lazily while shorter ones are consumed eagerly into a bitset. 
> Compared to
> +        // putting everything into a bitset, this should help skip over 
> unnecessary doc
> +        // ids in the longer postings lists. This should be especially 
> useful if
> +        // document frequencies have a zipfian distribution.
> +        final PriorityQueue<PostingsEnum> longestPostingsLists = new 
> PriorityQueue<PostingsEnum>(BOOLEAN_REWRITE_TERM_COUNT_THRESHOLD) {
> +          @Override
> +          protected boolean lessThan(PostingsEnum a, PostingsEnum b) {
> +            return a.cost() < b.cost();
> +          }
> +        };
> +        DocIdSetBuilder shortestPostingsLists = null;
> 
>         for (BytesRef term = iterator.next(); term != null; term = 
> iterator.next()) {
>           assert field.equals(iterator.field());
>           if (termsEnum.seekExact(term)) {
> -            if (matchingTerms == null) {
> -              docs = termsEnum.postings(docs, PostingsEnum.NONE);
> -              builder.add(docs);
> -            } else if (matchingTerms.size() < threshold) {
> -              matchingTerms.add(new TermAndState(field, termsEnum));
> -            } else {
> -              assert matchingTerms.size() == threshold;
> -              builder = new DocIdSetBuilder(reader.maxDoc(), terms);
> -              docs = termsEnum.postings(docs, PostingsEnum.NONE);
> -              builder.add(docs);
> -              for (TermAndState t : matchingTerms) {
> -                t.termsEnum.seekExact(t.term, t.state);
> -                docs = t.termsEnum.postings(docs, PostingsEnum.NONE);
> -                builder.add(docs);
> +            docs = termsEnum.postings(docs, PostingsEnum.NONE);
> +            docs = longestPostingsLists.insertWithOverflow(docs);
> +            if (docs != null) { // the pq is full
> +              if (shortestPostingsLists == null) {
> +                shortestPostingsLists = new DocIdSetBuilder(reader.maxDoc());
>               }
> -              matchingTerms = null;
> +              shortestPostingsLists.add(docs);
>             }
>           }
>         }
> -        if (matchingTerms != null) {
> -          assert builder == null;
> -          BooleanQuery.Builder bq = new BooleanQuery.Builder();
> -          for (TermAndState t : matchingTerms) {
> -            final TermContext termContext = new 
> TermContext(searcher.getTopReaderContext());
> -            termContext.register(t.state, context.ord, t.docFreq, 
> t.totalTermFreq);
> -            bq.add(new TermQuery(new Term(t.field, t.term), termContext), 
> Occur.SHOULD);
> -          }
> -          Query q = new ConstantScoreQuery(bq.build());
> -          final Weight weight = searcher.rewrite(q).createWeight(searcher, 
> needsScores, score());
> -          return new WeightOrDocIdSet(weight);
> -        } else {
> -          assert builder != null;
> -          return new WeightOrDocIdSet(builder.build());
> -        }
> -      }
> 
> -      private Scorer scorer(DocIdSet set) throws IOException {
> -        if (set == null) {
> -          return null;
> -        }
> -        final DocIdSetIterator disi = set.iterator();
> -        if (disi == null) {
> +        final int numClauses = longestPostingsLists.size() + 
> (shortestPostingsLists == null ? 0 : 1);
> +        if (numClauses == 0) {
>           return null;
>         }
> -        return new ConstantScoreScorer(this, score(), disi);
> -      }
> 
> -      @Override
> -      public BulkScorer bulkScorer(LeafReaderContext context) throws 
> IOException {
> -        final WeightOrDocIdSet weightOrBitSet = rewrite(context);
> -        if (weightOrBitSet == null) {
> -          return null;
> -        } else if (weightOrBitSet.weight != null) {
> -          return weightOrBitSet.weight.bulkScorer(context);
> -        } else {
> -          final Scorer scorer = scorer(weightOrBitSet.set);
> -          if (scorer == null) {
> -            return null;
> -          }
> -          return new DefaultBulkScorer(scorer);
> +        DisiPriorityQueue queue = new DisiPriorityQueue(numClauses);
> +        for (PostingsEnum postings : longestPostingsLists) {
> +          queue.add(new DisiWrapper(postings));
>         }
> -      }
> -
> -      @Override
> -      public Scorer scorer(LeafReaderContext context) throws IOException {
> -        final WeightOrDocIdSet weightOrBitSet = rewrite(context);
> -        if (weightOrBitSet == null) {
> -          return null;
> -        } else if (weightOrBitSet.weight != null) {
> -          return weightOrBitSet.weight.scorer(context);
> -        } else {
> -          return scorer(weightOrBitSet.set);
> +        if (shortestPostingsLists != null) {
> +          queue.add(new 
> DisiWrapper(shortestPostingsLists.build().iterator()));
>         }
> +        final DocIdSetIterator disi = new 
> DisjunctionDISIApproximation(queue);
> +        return new ConstantScoreScorer(this, boost, disi);
>       }
>     };
>   }
> +
> }
>



---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Re: [1/3] lucene-solr:master: LUCENE-7956: Fixed potential stack overflow error in ICUNormalizer2CharFilter.

Reply via email to