mayya-sharipova opened a new pull request, #14430:
URL: https://github.com/apache/lucene/pull/14430
This reverts commit ce2a917cf2c2f40b3996656f3b294e3c01d25e5b.
### Description
In Elasticsearch (and probably other applications) we reuse the same
analyzer across fields. And this change breaks it.
A trivial fix in Lucene would be to make the default reuse strategy per-field:

But I don't know all the implications of this, so I will revert my change
for now.
For example, the test below will fail currently:
```java
private static class PhraseWrappedAnalyzer extends AnalyzerWrapper {
private final Analyzer delegate;
private final int posIncGap;
PhraseWrappedAnalyzer(Analyzer delegate, int posIncGap) {
super(delegate.getReuseStrategy());
this.delegate = delegate;
this.posIncGap = posIncGap;
}
@Override
public int getPositionIncrementGap(String fieldName) {
// Delegate or return fixed value? Original test didn't rely on
this.
// Returning the passed value is consistent with the constructor.
// Delegating might be safer generally: return
delegate.getPositionIncrementGap(fieldName);
return posIncGap;
}
@Override
public int getOffsetGap(String fieldName) {
// Delegate offset gap as well for completeness
return delegate.getOffsetGap(fieldName);
}
@Override
protected Analyzer getWrappedAnalyzer(String fieldName) {
return delegate;
}
@Override
protected TokenStreamComponents wrapComponents(String fieldName,
TokenStreamComponents components) {
// Wrap the delegate's token stream with FixedShingleFilter for
bigrams
return new TokenStreamComponents(components.getSource(),
new ShingleFilter(components.getTokenStream(), 2));
}
}
public void testIndexDiffFieldsSameAnalyzer() throws IOException {
final Analyzer textAnalyzer = new StandardAnalyzer();
final Analyzer phraseAnalyzer = new
PhraseWrappedAnalyzer(textAnalyzer, 0);
FieldType textVectorType = new FieldType(TextField.TYPE_NOT_STORED);
textVectorType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
textVectorType.freeze();
Map<String, Analyzer> analyzerMap = new HashMap<>();
analyzerMap.put("text", textAnalyzer);
analyzerMap.put("text_phrases", phraseAnalyzer); // Use this field
to store phrase tokens
Analyzer perFieldAnalyzer = new PerFieldAnalyzerWrapper(new
StandardAnalyzer(), analyzerMap);
Directory dir = newDirectory();
IndexWriterConfig iwc = newIndexWriterConfig(perFieldAnalyzer);
IndexWriter writer = new IndexWriter(dir, iwc);
int maxDocs = 4;
String content = "the quick brown fox jumped over the lazy dog";
for (int i = 0; i < maxDocs; i++) {
Document doc = new Document();
doc.add(new Field("text", content, textVectorType));
doc.add(new Field("text_phrases", content, textVectorType));
writer.addDocument(doc);
}
writer.commit();
try (IndexReader reader = DirectoryReader.open(writer)) {
assertEquals("Should have indexed maxDocs documents", maxDocs,
reader.numDocs());
// Verify term frequencies for the 'text' field
Terms textTerms = MultiTerms.getTerms(reader, "text");
assertNotNull("Terms should exist for 'text' field", textTerms);
TermsEnum textTermsEnum = textTerms.iterator();
BytesRef term;
int termCount = 0;
while ((term = textTermsEnum.next()) != null) {
assertEquals("Incorrect docFreq for term '" +
term.utf8ToString() + "' in field 'text'",
maxDocs, textTermsEnum.docFreq());
termCount++;
}
assertTrue("Should find terms in 'text' field", termCount > 0);
// Verify term frequencies for the 'text_phrases' field
(shingles)
Terms phraseTerms = MultiTerms.getTerms(reader, "text_phrases");
assertNotNull("Terms should exist for 'text_phrases' field",
phraseTerms);
TermsEnum phraseTermsEnum = phraseTerms.iterator();
BytesRef phrase;
int phraseCount = 0;
while ((phrase = phraseTermsEnum.next()) != null) {
assertEquals("Incorrect docFreq for phrase '" +
phrase.utf8ToString() + "' in field 'text_phrases'",
maxDocs, phraseTermsEnum.docFreq());
phraseCount++;
}
assertTrue("Should find phrases (shingles) in 'text_phrases'
field", phraseCount > 0);
} finally {
writer.close();
dir.close();
perFieldAnalyzer.close();
}
}
```
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]