Hi all, I currently need a TokenFilter that breaks the token "season07" into two tokens, "season" and "07".
I tried PatternReplaceCharFilter to replace "season07" with "season 07", however, the offset is not correct for Highlighting. For this reason, I want to implement a TokenFilter, but I do not know how to deal with the offset. My implementation currently follows EdgeNGramTokenFilter:

/**
 * Splits every incoming token at each letter-to-digit boundary, so that
 * {@code season07} is emitted as the two tokens {@code season} and {@code 07}.
 *
 * <p>Only a letter immediately followed by a digit is a split point; a digit
 * followed by a letter is not. Tokens without such a boundary are passed
 * through with their term text unchanged. Empty tokens are dropped.
 *
 * <p>Offsets: when the term text has the same length as the source span
 * reported by the upstream stream ({@code endOffset - startOffset}), each part
 * gets the offsets of exactly the characters it covers. When the lengths
 * differ, positions inside the term text cannot be mapped back to the source,
 * so every part keeps the start and end offset of the whole original token.
 */
public final class AlphaNumberTokenFilter extends TokenFilter {

    /** Private copy of the term text of the token currently being split; {@code null} when a new token must be pulled. */
    private char[] curTermBuffer;
    /** Number of valid characters in {@link #curTermBuffer}. */
    private int curTermLength;
    /** Index in {@link #curTermBuffer} at which the next emitted part starts. */
    private int currentOffset;
    /** Start offset of the original token. */
    private int baseOffset;
    /** End offset of the original token. */
    private int baseEndOffset;
    /** True when the term length differs from {@code baseEndOffset - baseOffset}. */
    private boolean hasIllegalOffsets;

    private final TermAttribute termAtt;
    private final OffsetAttribute offsetAtt;

    /**
     * Creates a filter that splits the tokens produced by {@code input}.
     *
     * @param input the upstream token stream
     */
    protected AlphaNumberTokenFilter(TokenStream input) {
        super(input);
        this.termAtt = addAttribute(TermAttribute.class);
        this.offsetAtt = addAttribute(OffsetAttribute.class);
    }

    /**
     * Advances to the next part of the current token, pulling a new token
     * from the upstream stream once the current one is used up.
     *
     * @return {@code true} if a token was produced, {@code false} when the
     *         upstream stream is exhausted
     * @throws IOException if the upstream stream fails
     */
    @Override
    public final boolean incrementToken() throws IOException {
        while (true) {
            if (curTermBuffer == null) {
                if (!input.incrementToken()) {
                    return false;
                }
                // Copy only the valid prefix: setTermBuffer() below overwrites the
                // attribute's own buffer while later parts are still needed.
                curTermLength = termAtt.termLength();
                curTermBuffer = new char[curTermLength];
                System.arraycopy(termAtt.termBuffer(), 0, curTermBuffer, 0, curTermLength);
                currentOffset = 0;
                baseOffset = offsetAtt.startOffset();
                baseEndOffset = offsetAtt.endOffset();
                hasIllegalOffsets = (baseEndOffset - baseOffset) != curTermLength;
            }

            if (currentOffset < curTermLength) {
                final int start = currentOffset;
                final int end = findPartEnd(start);
                if (hasIllegalOffsets) {
                    // Term text and source span differ in length, so an index into
                    // the term is not an index into the source text.
                    offsetAtt.setOffset(baseOffset, baseEndOffset);
                } else {
                    offsetAtt.setOffset(baseOffset + start, baseOffset + end);
                }
                termAtt.setTermBuffer(curTermBuffer, start, end - start);
                currentOffset = end;
                return true;
            }

            // Current token fully consumed (or empty): fetch the next one.
            curTermBuffer = null;
        }
    }

    /**
     * Finds the exclusive end index of the part that starts at {@code from}.
     *
     * @param from index of the first character of the part
     * @return the index just after the first letter that is directly followed
     *         by a digit, or {@code curTermLength} if there is no such boundary
     */
    private int findPartEnd(int from) {
        for (int i = from; i < curTermLength - 1; i++) {
            if (Character.isLetter(curTermBuffer[i]) && Character.isDigit(curTermBuffer[i + 1])) {
                return i + 1;
            }
        }
        return curTermLength;
    }

    /**
     * Resets the upstream stream and discards any partially split token.
     *
     * @throws IOException if the upstream stream fails to reset
     */
    @Override
    public void reset() throws IOException {
        super.reset();
        curTermBuffer = null;
    }
}

-- Weiwei Wang Alex Wang 王巍巍 Room 403, Mengmin Wei Building Computer Science Department Gulou Campus of Nanjing University Nanjing, P.R.China, 210093 Homepage: http://cs.nju.edu.cn/rl/weiweiwang