OK, I managed to write the Analyzer I need. I can't say that I understand all of Lucene's Analyzer-Tokenizer-Filter logic, but MyAnalyzer is attached below. I hope it helps somebody else.
import java.io.Reader; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.CharTokenizer; import org.apache.lucene.analysis.LowerCaseFilter; import org.apache.lucene.analysis.StopAnalyzer; import org.apache.lucene.analysis.StopFilter; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.standard.StandardFilter; public class MyAnalyzer extends Analyzer { public TokenStream tokenStream(String field, final Reader reader) { TokenStream result = new MyCharTokenizer(reader); result = new StandardFilter(result); result = new LowerCaseFilter(result); result = new StopFilter(true, result, StopAnalyzer.ENGLISH_STOP_WORDS_SET); return result; } static class MyCharTokenizer extends CharTokenizer { public static final char[] BAD_CHARACTERS = { '.', ',', ':', '(', ')', ' ', '[', ']', ';', '\'', '"', '|', '-', '_', '*', '<', '>', '=', '+', '%', '#', '~', '`', '^'}; public MyCharTokenizer(Reader input) { super(input); } @Override protected boolean isTokenChar(char paramChar) { if (Character.isLetterOrDigit(paramChar)) { return true; } else { return false; } //if you need to filter out specific characters and not just non-digits-or-letters as above //for (int i = 0; i < BAD_CHARACTERS.length; i++) //{ // if (BAD_CHARACTERS[i] == paramChar) // { // return false; // } //} //return true; } } } -- View this message in context: http://lucene.472066.n3.nabble.com/parsing-Java-log-file-with-Lucene-3-0-3-tp2173046p2193022.html Sent from the Lucene - Java Users mailing list archive at Nabble.com. --------------------------------------------------------------------- To unsubscribe, e-mail: java-user-unsubscr...@lucene.apache.org For additional commands, e-mail: java-user-h...@lucene.apache.org