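StringField is not tokenized: its entire value is indexed verbatim as a single term, so the analyzer configured on the IndexWriter is never invoked for that field (which is why createComponents() is not called during indexing). See http://lucene.apache.org/core/4_0_0-BETA/core/org/apache/lucene/document/StringField.html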
You must use TextField! ----- Uwe Schindler H.-H.-Meier-Allee 63, D-28213 Bremen http://www.thetaphi.de eMail: u...@thetaphi.de > -----Original Message----- > From: Yann-Erwan Perio [mailto:ye.pe...@gmail.com] > Sent: Sunday, September 09, 2012 11:55 AM > To: java-user@lucene.apache.org > Subject: Custom Analyzer Not Called When Indexing > > Hello, > > This is my first time writing to the list. I am a Java developer, writing a > personal > project using Lucene, and so far have been very happy with the library > (v4BETA). However, I have recently decided to build and use a custom analyzer, > and could not make it work with IndexWriter. I must be missing something > obvious, but all my searches on the web and my unit tests were to no avail. I > would be thankful if you could point me out to the right direction. You will > find > below some test classes demonstrating my problem. > > Basically, I have created a custom analyzer, which tokenizes its input to > every > two chars, then lower cases them. For instance, the string "Hello World" would > be tokenized as "he", "ll", and so forth. When I try to index some document > with > this analyzer, it seems that Lucene does not take it into account at all. To > make > sure of that, I have simply added a log entry inside my "createComponents" > method, to check when this method is called - and the method is never called > during the indexing process. Interestingly, when I use a query parser with my > custom analyzer, everything is fine - the createComponents method is called as > expected. To me, this means that I have somehow failed to properly attach my > analyzer to the index writer. 
> > The following classes demonstrate my issue: > - IndexUtil: has methods to create the analyzer, open a FSDirectory-based > directory writer, and the corresponding reader, > - TwoLettersAnalyzer: my custom analyzer, which puts together my custom > tokenizer with a LowerCase filter, > - TwoLettersTokenizer: my custom tokenizer, > - TwoLettersTest: the main test class, with the standard "main()" method. > > The console output generated by the program is as followed: > Calling writeDocs() <= I expected a call to createComponents() > right after this line > Calling checkDocs() > Calling TwoLettersAnalyzer.createComponents() > Calling TwoLettersTokenizer.incrementToken() > Calling TwoLettersTokenizer.incrementToken() > Have we found our doc? false > > > Thank you very much for your time. > > Kind regards, > Yep. > > > > > ================================================================ > = > IndexUtil.java > ================================================================ > = > > package experiments; > > import org.apache.lucene.analysis.Analyzer; > import org.apache.lucene.index.DirectoryReader; > import org.apache.lucene.index.IndexReader; > import org.apache.lucene.index.IndexWriter; > import org.apache.lucene.index.IndexWriterConfig; > import org.apache.lucene.store.FSDirectory; > import org.apache.lucene.util.Version; > > import java.io.File; > import java.io.IOException; > > public final class IndexUtil { > > private IndexUtil() { > > } > > private static final File indexDirectory = new > File("C:\\Users\\Elegie\\Documents\\_Programmation\\Tests\\LuceneWriterTe > st\\index"); > > public static Analyzer createAnalyzer() { > return new TwoLettersAnalyzer(); > } > > public static IndexWriter openWriter() throws IOException { > IndexWriterConfig config = new > IndexWriterConfig(Version.LUCENE_40, createAnalyzer()); > config.setOpenMode(IndexWriterConfig.OpenMode.CREATE); > return new IndexWriter(FSDirectory.open(indexDirectory), config); > } > > public static 
IndexReader openReader() throws IOException { > return DirectoryReader.open(FSDirectory.open(indexDirectory)); > } > > } > > > ================================================================ > = > TwoLettersAnalyzer.java > ================================================================ > = > > package experiments; > > import org.apache.lucene.analysis.Analyzer; > import org.apache.lucene.analysis.TokenStream; > import org.apache.lucene.analysis.Tokenizer; > import org.apache.lucene.analysis.core.LowerCaseFilter; > import org.apache.lucene.util.Version; > > import java.io.Reader; > > public class TwoLettersAnalyzer extends Analyzer { > > @Override > protected TokenStreamComponents createComponents(String s, Reader > reader) { > System.out.println("Calling TwoLettersAnalyzer.createComponents()"); > Tokenizer source = new TwoLettersTokenizer(reader); > TokenStream filter = new LowerCaseFilter(Version.LUCENE_40, source); > return new TokenStreamComponents(source, filter); > } > > } > > > ================================================================ > = > TwoLettersTokenizer.java > ================================================================ > = > > package experiments; > > import org.apache.lucene.analysis.Tokenizer; > import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; > > import java.io.IOException; > import java.io.Reader; > > public class TwoLettersTokenizer extends Tokenizer { > > private static final int TWO_LETTERS = 2; > > private CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); > > protected TwoLettersTokenizer(Reader input) { > super(input); > } > > @Override > public final boolean incrementToken() throws IOException { > System.out.println("Calling TwoLettersTokenizer.incrementToken()"); > StringBuilder builder = new StringBuilder(); > boolean success = readAhead(builder, TWO_LETTERS); > if (success) { > termAtt.setEmpty(); > termAtt.append(builder); > } > return success; > } > > private boolean 
readAhead(StringBuilder builder, int ahead) throws > IOException { > if (ahead <= 0) { > return true; > } > int data = input.read(); > if (data != -1) { > builder.append((char) data); > return readAhead(builder, ahead - 1); > } > return false; > } > > } > > ================================================================ > = > TwoLettersTest.java > ================================================================ > = > > package experiments; > > import org.apache.lucene.document.Document; > import org.apache.lucene.document.Field; import > org.apache.lucene.document.StringField; > import org.apache.lucene.index.IndexReader; > import org.apache.lucene.index.IndexWriter; > import org.apache.lucene.queryparser.classic.ParseException; > import org.apache.lucene.queryparser.classic.QueryParser; > import org.apache.lucene.search.IndexSearcher; > import org.apache.lucene.search.Query; > import org.apache.lucene.search.TopDocs; import > org.apache.lucene.util.Version; > > import java.io.IOException; > > public class TwoLettersTest { > > private static final String FIELD_NAME = "content"; > private static final String TEST_CONTENT = "Hello, World!"; > private static final String TEST_SEARCH = "he"; //llo World! 
> private static final int EXPECTED_MATCHED_DOCS = 1; > > public static void main(String[] args) throws IOException, ParseException > { > TwoLettersTest test = new TwoLettersTest(); > > System.out.println("Calling writeDocs()"); > test.writeDocs(); > > System.out.println("Calling checkDocs()"); > test.checkDocs(); > } > > public void writeDocs() throws IOException { > IndexWriter writer = IndexUtil.openWriter(); > writer.addDocument(createDoc(TEST_CONTENT)); > writer.close(); > } > > public void checkDocs() throws ParseException, IOException { > Query query = createQuery(); > IndexReader reader = IndexUtil.openReader(); > IndexSearcher searcher = new IndexSearcher(reader); > TopDocs results = searcher.search(query, EXPECTED_MATCHED_DOCS); > System.out.println("Have we found our doc? " + > (results.scoreDocs.length > == EXPECTED_MATCHED_DOCS)); > reader.close(); > } > > private Document createDoc(String content) { > Document doc = new Document(); > StringField field = new StringField(FIELD_NAME, content, > Field.Store.YES); > doc.add(field); > return doc; > } > > private Query createQuery() throws ParseException { > String search = FIELD_NAME + ":" + TEST_SEARCH; > QueryParser parser = new QueryParser(Version.LUCENE_40, > FIELD_NAME, IndexUtil.createAnalyzer()); > Query query = parser.parse(search); > return query; > } > > } > > --------------------------------------------------------------------- > To unsubscribe, e-mail: java-user-unsubscr...@lucene.apache.org > For additional commands, e-mail: java-user-h...@lucene.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: java-user-unsubscr...@lucene.apache.org For additional commands, e-mail: java-user-h...@lucene.apache.org