Hi,
I am trying to debug a boosting query.
Is there a way to see the term boost in the documents? I see them
in spans
in BoostingTermQuery, yet, from there I can't see which document I
am in.
If I want to copy some of the document in an index that saves the
boosting
- how can it be done?
The problem I am facing is that I get unexpected results - If for
word "a",
I have the worlds "1111" (boosting 3) and "2222" and for word "b" I
have the
world "1111". When I try to search for "1111" (boosting 5), word
"a" gets
better results.
When I debugged it, I saw that the boosting is always three, but
since in
the index I have a lot of documents, I tried to do the same on a
smaller
index.
I put only two words as you can see in the code below (I put all the
methods and classes needed to run this code).
The problem I saw here is the scorePayload in the Explain method - it took
a different value from the one I indexed.
You can see below the output - for TTD - 1.0 = scorePayload(...)
and for finlin 3.0 = scorePayload(...)
while the boosting I used was the opposite - for TTD, I used 3 and
for
finlin, I used 1
The scorePayload should be the factor I put when I indexed, right?
Thanks a lot,
Liat
TTD, score: 1.2611988
0.26274973 = (MATCH) weight(worlds:666666 in 0), product of:
0.99999994 = queryWeight(worlds:666666), product of:
0.5945349 = idf(worlds: 666666=2)
1.681987 = queryNorm
0.26274976 = (MATCH) fieldWeight(worlds:666666 in 0), product of:
0.70710677 = (MATCH) btq, product of:
0.70710677 = tf(phraseFreq=0.5)
1.0 = scorePayload(...)
0.5945349 = idf(worlds: 666666=2)
0.625 = fieldNorm(field=worlds, doc=0)
********************************************************
finlin, score: 0.26274976
1.2611988 = (MATCH) weight(worlds:666666 in 1), product of:
0.99999994 = queryWeight(worlds:666666), product of:
0.5945349 = idf(worlds: 666666=2)
1.681987 = queryNorm
1.2611989 = (MATCH) fieldWeight(worlds:666666 in 1), product of:
2.1213202 = (MATCH) btq, product of:
0.70710677 = tf(phraseFreq=0.5)
3.0 = scorePayload(...)
0.5945349 = idf(worlds: 666666=2)
1.0 = fieldNorm(field=worlds, doc=1)
*The code*
**
public class Test
{
// Default constructor; no state to initialize.
public Test()
{
}
/**
 * Entry point: builds the small two-word test index, then runs the
 * search/explain reproduction against it.
 */
public static void main(String[] args) throws IOException, Exception
{
    Test runner = new Test();
    runner.index();
    runner.testRealIndex();
}
/**
 * Builds a tiny index of two words ("TTD", "finlin") with known payload
 * boosts, used to reproduce the scorePayload problem on a small data set.
 */
public void index() throws IOException
{
    DoubleMap map = new DoubleMap();
    map.insert("TTD", 666666, 3);
    map.insert("finlin", 666666, 1);
    map.insert("finlin", 222222, 2);
    index(map, "wordIndexTry", "", "0");
}
/**
 * Indexes every word of the given DoubleMap into the Lucene index at
 * dirPath. For each word the per-world frequency map is first pushed into
 * the PayloadAnalyzer (so token payloads carry the boost) and then the
 * word's document is added.
 *
 * FIX: the IndexWriter is now closed in a finally block, so the index lock
 * and file handles are released even when indexing throws; the duplicated
 * constructor branches collapse into a single create-flag expression.
 */
public synchronized void index(DoubleMap doubleMap, String dirPath,
    String originalPath, String includeFreq) throws IOException
{
    File f = new File(dirPath);
    PayloadAnalyzer panalyzer = new PayloadAnalyzer();
    // Append when the directory already exists, otherwise create a new index.
    IndexWriter writer = new IndexWriter(dirPath, panalyzer, !f.exists());
    try
    {
        int count = 0;
        int size = doubleMap.getMap().size();
        for(Map.Entry<String, Map<Long, Double>> entry : doubleMap.getMap().entrySet())
        {
            count++;
            String word = entry.getKey();
            Word w = new Word();
            w.word = word;
            Date date = new Date();
            System.out.println(date.toString() + " : Updateing word " + word
                + " ( " + count + " out of " + size + ") " + " FROM " + originalPath);
            Map<Long, Double> innerMap = entry.getValue();
            // Hand the per-world scores to the analyzer, then add the document.
            Map<String, Integer> scoresMap = processMap(writer, panalyzer,
                innerMap, entry, w, dirPath, includeFreq);
            index(writer, panalyzer, innerMap, scoresMap, w, dirPath, includeFreq);
        }
        System.out.println("Optimizing " + dirPath + " ...");
        writer.optimize();
    }
    finally
    {
        // Always release the writer (and its write lock), even on failure.
        writer.close();
    }
}
/**
 * Builds the world-id -> integer-frequency score map for one word, logs each
 * (word, world, freq) triple to Output\WordWorldsFreq.txt, and hands the map
 * to the analyzer so tokens get the matching payload at index time.
 *
 * FIX: removed the no-op synchronized(worldsIter) block (synchronizing on a
 * method-local iterator guards nothing) and the dead `worlds` string that was
 * concatenated in the loop but never read.
 *
 * @return the world-id -> frequency map that was installed on the analyzer
 */
public synchronized Map<String, Integer> processMap(IndexWriter writer,
    PayloadAnalyzer panalyzer, Map<Long, Double> innerMap, Map.Entry entry,
    Word w, String dirPath, String includeFreq) throws IOException
{
    Map<String, Integer> scoresMap = new HashMap<String, Integer>();
    for(Map.Entry<Long, Double> worldsEntry : innerMap.entrySet())
    {
        String world = worldsEntry.getKey().toString();
        // Frequencies are stored as doubles; truncate to int for the payload.
        int freq = (int) worldsEntry.getValue().doubleValue();
        scoresMap.put(world, freq);
        FileUtil.writeToFile("Output\\WordWorldsFreq.txt", w.word +
            Constants.TAB_SEP + world + Constants.TAB_SEP + freq);
    }
    panalyzer.setMapScores(scoresMap);
    return scoresMap;
}
/**
 * Builds the Lucene document for one word (via WordIndex) and adds it to the
 * writer. The analyzer must already hold this word's score map (see
 * processMap) so the payloads written match this document.
 * NOTE(review): panalyzer, scoresMap and dirPath are currently unused here.
 */
public synchronized void index(IndexWriter writer, PayloadAnalyzer
panalyzer, Map<Long, Double> innerMap, Map<String, Integer>
scoresMap, Word
w, String dirPath, String includeFreq) throws IOException
{
System.out.println("indexing");
w.worldsMap = innerMap;
WordIndex wi = new WordIndex(w);
wi.createDocument(includeFreq);
writer.addDocument(wi.getDocument());
}
public void testRealIndex() throws IOException
{
String word = "TTD";
String worlds = "666666";
DoubleMap wordsWorldsFreqMap = new DoubleMap();
wordsWorldsFreqMap.insert("TTD", 666666, 1.0);
BoostingBooleanQueryParser bbqp = new BoostingBooleanQueryParser();
BooleanQuery bq = bbqp.parse(word, worlds, wordsWorldsFreqMap,
"worlds");
IndexSearcher searcher = new IndexSearcher("wordIndexTry");
//D:\\PaiDatabase\\Indexes\\WordIndex");
searcher.setSimilarity(new WordsSimilarity());
TopDocCollector collector = new TopDocCollector(30);
searcher.search(bq, collector);
ScoreDoc[] hits = collector.topDocs().scoreDocs;
for(int j = 0; j < Math.min(hits.length, 10); j++)
{
int docId = hits[j].doc;
Document curDoc = searcher.doc(docId);
System.out.println(curDoc.getField("word").stringValue() + ",
score: " +
hits[j].score);
Explanation explanation = searcher.explain(bq, j);
System.out.println(explanation.toString());
String sym = curDoc.getField("word").stringValue();
}
}
/**
 * Minimal base class that owns the Lucene Document being assembled for one
 * indexed entity; subclasses fill it in.
 */
public abstract class Index
{
    protected Document doc = new Document();

    public Index()
    {
    }

    /** @return the document assembled so far */
    public Document getDocument()
    {
        return doc;
    }

    /** Replaces the held document wholesale. */
    public void setDocument(Document d)
    {
        doc = d;
    }
}
/**
 * Builds the Lucene document for a Word: the word itself as a stored,
 * un-analyzed field, and its world ids as an analyzed field with term
 * vectors so payloads can be attached.
 */
public class WordIndex extends Index
{
    protected Word w;
    /** Stored, not-analyzed field holding the word itself. */
    public String FIELD_WORD = "word";
    /** Analyzed field holding the space-separated world ids. */
    public String FIELD_WORLDS = "worlds";

    public WordIndex(Word w)
    {
        this.w = w;
    }

    /**
     * Populates {@code doc}. When includeFreq is "1" each world id is
     * repeated by its frequency (see Word.getWorldIds).
     */
    public void createDocument(String includeFreq) throws
        java.io.FileNotFoundException
    {
        // make a new, empty document
        doc = new Document();
        doc.add(new Field(FIELD_WORD, w.word, Field.Store.YES,
            Field.Index.NOT_ANALYZED));
        doc.add(new Field(FIELD_WORLDS,
            String.valueOf(w.getWorldIds(includeFreq)), Field.Store.YES,
            Field.Index.ANALYZED, Field.TermVector.YES));
    }

    /**
     * Looks up the stored document for the given word, or null when absent.
     * FIX: the IndexSearcher is now closed in a finally block; the old code
     * leaked a searcher (and its file handles) on every call.
     */
    public Document getDoc(String word, String indexPath) throws IOException
    {
        IndexSearcher mapSearcher = new IndexSearcher(indexPath);
        try
        {
            TermQuery mapQuery = new TermQuery(new Term(FIELD_WORD, word));
            Hits mapHits = mapSearcher.search(mapQuery);
            if(mapHits.length() != 0)
            {
                return mapHits.doc(0);
            }
            return null;
        }
        finally
        {
            mapSearcher.close();
        }
    }
}
/**
 * A word together with the worlds it appears in: world id -> frequency/boost.
 */
public class Word
{
    public String word;
    public Map<Long, Double> worldsMap = new HashMap<Long, Double>();

    public Word()
    {
    }

    /**
     * Returns the world ids as a space-separated string (with a trailing
     * space, matching the original output). When includeFreq is "1", each id
     * is repeated freq times so its term frequency in the index reflects the
     * stored frequency.
     *
     * FIX: uses StringBuilder instead of repeated String concatenation,
     * which was O(n^2) in the number of appended tokens.
     */
    public String getWorldIds(String includeFreq)
    {
        StringBuilder worlds = new StringBuilder();
        for(Map.Entry<Long, Double> entry : worldsMap.entrySet())
        {
            if(includeFreq.equals("1"))
            {
                // Frequencies are stored as doubles; truncate to int.
                int freq = (int) entry.getValue().doubleValue();
                for(int i = 0; i < freq; i++)
                {
                    worlds.append(entry.getKey()).append(' ');
                }
            }
            else
            {
                worlds.append(entry.getKey()).append(' ');
            }
        }
        return worlds.toString();
    }
}
/**
 * Two-level map: word -> (world id -> beta/boost). Inserts never overwrite
 * an existing (word, world) pair; the capped insert evicts the smallest
 * world id when the per-word map is full.
 */
public class DoubleMap
{
    private Map<String, Map<Long, Double>> map;
    public Map<String, String> worldsListMap = new HashMap<String, String>();
    public List<String> entriesList = new ArrayList<String>();

    public DoubleMap()
    {
        map = new HashMap<String, Map<Long, Double>>();
    }

    /** Inserts (worldId -> beta) under word; an existing entry is kept. */
    public void insert(String word, long worldId, double beta)
    {
        Map<Long, Double> innerMap = innerMapFor(word);
        // First value wins: never overwrite an existing world id.
        if(!innerMap.containsKey(worldId))
        {
            innerMap.put(worldId, beta);
        }
    }

    /**
     * Capped insert: like {@link #insert(String, long, double)}, but when the
     * per-word map already holds {@code size} entries the smallest world id
     * is evicted to make room.
     * FIX: removed a dead counting loop (debug leftover) that only printed
     * the map size before eviction.
     */
    public void insert(String word, long worldId, double beta, int size)
    {
        Map<Long, Double> innerMap = innerMapFor(word);
        if(innerMap.containsKey(worldId))
        {
            return;
        }
        if(innerMap.size() == size)
        {
            innerMap.remove(getMinItem(innerMap));
        }
        innerMap.put(worldId, beta);
    }

    /** Returns the inner map for word, creating and registering it if absent. */
    private Map<Long, Double> innerMapFor(String word)
    {
        Map<Long, Double> innerMap = map.get(word);
        if(innerMap == null)
        {
            innerMap = new HashMap<Long, Double>();
            map.put(word, innerMap);
        }
        return innerMap;
    }

    /**
     * Returns the smallest world id in the map, or -1 when it is empty.
     * BUG FIX: the old implementation iterated the HashMap and kept the LAST
     * key seen, so it returned an arbitrary entry (whatever hashed last),
     * not the minimum — the capped insert evicted random worlds.
     */
    private long getMinItem(Map<Long, Double> innerMap)
    {
        long min = -1;
        boolean first = true;
        for(Long worldId : innerMap.keySet())
        {
            if(first || worldId < min)
            {
                min = worldId;
                first = false;
            }
        }
        return min;
    }

    public Map<String, Map<Long, Double>> getMap()
    {
        return map;
    }
}
/**
 * Builds a BooleanQuery of SHOULD-clause BoostingTermQuery terms, one per
 * world id in {@code worlds}, each boosted by the frequency stored for
 * (word, world) in the DoubleMap.
 */
public class BoostingBooleanQueryParser
{
    public BoostingBooleanQueryParser()
    {
    }

    /**
     * @param worlds space-separated world ids (each must parse as a long)
     * @throws IllegalArgumentException when the word or one of its worlds is
     *         missing from the frequency map
     * FIX: the old code autoboxed getMap().get(word).get(id) straight into a
     * double, throwing an opaque NullPointerException on any missing entry;
     * it now fails fast with a descriptive message.
     */
    public BooleanQuery parse(String word, String worlds, DoubleMap
        wordsWorldsFreqMap, String fieldName) throws IOException
    {
        BooleanQuery bq = new BooleanQuery();
        Map<Long, Double> freqMap = wordsWorldsFreqMap.getMap().get(word);
        if(freqMap == null)
        {
            throw new IllegalArgumentException("No frequency map for word: " + word);
        }
        for(String worldId : worlds.split(" "))
        {
            Double freq = freqMap.get(Long.parseLong(worldId));
            if(freq == null)
            {
                throw new IllegalArgumentException(
                    "No frequency for word " + word + ", world " + worldId);
            }
            BoostingTermQuery tq = new BoostingTermQuery(new Term(fieldName, worldId));
            tq.setBoost(freq.floatValue());
            bq.add(tq, BooleanClause.Occur.SHOULD);
        }
        return bq;
    }
}
/**
 * Analyzer that tokenizes on whitespace and attaches a one-byte payload to
 * each term, looked up by term text in scoresMap.
 * NOTE(review): the score map is shared mutable state — setMapScores must be
 * called before indexing each document, so this analyzer is effectively
 * single-threaded; verify that matches how the writer is used.
 */
public class PayloadAnalyzer extends Analyzer
{
    private int score;
    private Map<String, Integer> scoresMap = new HashMap<String, Integer>();

    /** Sets the fallback score (currently unused by PayloadTokenStream). */
    public synchronized void setScore(int s)
    {
        score = s;
    }

    /** Installs the term-text -> score map used to build payloads. */
    public synchronized void setMapScores(Map<String, Integer> scoresMap)
    {
        this.scoresMap = scoresMap;
    }

    public final TokenStream tokenStream(String field, Reader reader)
    {
        // FIX: build a fresh stream per call instead of caching it in a
        // field; the cached reference was never read again and would be
        // clobbered by concurrent or multi-field use.
        PayloadTokenStream payToken =
            new PayloadTokenStream(new WhitespaceTokenizer(reader));
        payToken.setScore(score);
        payToken.setMapScores(scoresMap);
        return payToken;
    }
}
/**
 * TokenStream that wraps a Tokenizer and attaches a one-byte payload to each
 * token, looked up by term text in scoresMap.
 */
public class PayloadTokenStream extends TokenStream
{
    private Tokenizer tok = null;
    private int score;
    private Map<String, Integer> scoresMap = new HashMap<String, Integer>();

    public PayloadTokenStream(Tokenizer tokenizer)
    {
        tok = tokenizer;
    }

    /** Fallback score (currently not consulted when a term is unmapped). */
    public void setScore(int s)
    {
        score = s;
    }

    /** Installs the term-text -> score map used to build payloads. */
    public synchronized void setMapScores(Map<String, Integer> scoresMap)
    {
        this.scoresMap = scoresMap;
    }

    public Token next(Token t) throws IOException
    {
        t = tok.next(t);
        if(t != null)
        {
            String word = String.copyValueOf(t.termBuffer(), 0, t.termLength());
            // FIX: look the score up defensively. The old code autoboxed
            // scoresMap.get(word) straight to int, throwing a
            // NullPointerException for any term missing from the map; it also
            // tested word != null AFTER calling word.equals(), and round-
            // tripped the score through Byte.parseByte(String.valueOf(...)),
            // which blows up for values below -128. Terms without a mapped
            // score now simply get no payload.
            Integer mapped = scoresMap.get(word);
            if(word.length() > 0 && mapped != null)
            {
                // Clamp to the signed byte range; the payload is one byte.
                int s = Math.max(-128, Math.min(127, mapped.intValue()));
                t.setPayload(new Payload(new byte[] { (byte) s }));
            }
        }
        return t;
    }

    public void reset(Reader input) throws IOException
    {
        tok.reset(input);
    }

    public void close() throws IOException
    {
        tok.close();
    }
}
}