Lucene 4.9 gives much the same result. import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.ja.JapaneseAnalyzer; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.document.TextField; import org.apache.lucene.index.AtomicReader; import org.apache.lucene.index.AtomicReaderContext; import org.apache.lucene.index.DirectoryReader; import org.apache.lucene.index.DocsAndPositionsEnum; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.IndexWriter; import org.apache.lucene.index.IndexWriterConfig; import org.apache.lucene.index.MultiFields; import org.apache.lucene.index.Terms; import org.apache.lucene.index.TermsEnum; import org.apache.lucene.queryparser.flexible.standard.StandardQueryParser; import org.apache.lucene.queryparser.flexible.standard.config.StandardQueryConfigHandler; import org.apache.lucene.search.DocIdSetIterator; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.Query; import org.apache.lucene.search.TopDocs; import org.apache.lucene.store.Directory; import org.apache.lucene.store.RAMDirectory; import org.apache.lucene.util.Bits; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.Version;
public class LuceneMissingTerms { public static void main(String[] args) throws Exception { try (Directory directory = new RAMDirectory()) { Analyzer analyser = new JapaneseAnalyzer(Version.LUCENE_4_9); try (IndexWriter writer = new IndexWriter(directory, new IndexWriterConfig(Version.LUCENE_4_9, analyser))) { Document document = new Document(); document.add(new TextField("content", "blah blah commercial blah blah \u79CB\u8449\u539F blah blah", Field.Store.NO)); writer.addDocument(document); } try (IndexReader multiReader = DirectoryReader.open(directory)) { for (AtomicReaderContext atomicReaderContext : multiReader.leaves()) { AtomicReader reader = atomicReaderContext.reader(); Terms terms = MultiFields.getFields(reader).terms("content"); TermsEnum termsEnum = terms.iterator(null); BytesRef text; //noinspection NestedAssignment while ((text = termsEnum.next()) != null) { System.out.println("term: " + text.utf8ToString()); Bits liveDocs = reader.getLiveDocs(); DocsAndPositionsEnum docsAndPositionsEnum = termsEnum.docsAndPositions(liveDocs, null); int doc; //noinspection NestedAssignment while ((doc = docsAndPositionsEnum.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) { System.out.println(" doc: " + doc); int freq = docsAndPositionsEnum.freq(); for (int i = 0; i < freq; i++) { int pos = docsAndPositionsEnum.nextPosition(); System.out.println(" pos: " + pos); } } } } StandardQueryParser queryParser = new StandardQueryParser(analyser); queryParser.setDefaultOperator(StandardQueryConfigHandler.Operator.AND); // quoted to work around strange behaviour of StandardQueryParser treating this as a boolean query. Query query = queryParser.parse("\"\u79CB\u8449\u539F\"", "content"); System.out.println(query); TopDocs topDocs = new IndexSearcher(multiReader).search(query, 10); System.out.println(topDocs.totalHits); } } } } --------------------------------------------------------------------- To unsubscribe, e-mail: java-user-unsubscr...@lucene.apache.org For additional commands, e-mail: java-user-h...@lucene.apache.org