I have been experimenting with payloads and BoostingTermQuery,
which I think
are excellent additions to Lucene core. Currently,
BoostingTermQuery extends
SpanQuery. I would suggest changing this class to extend TermQuery and
refactoring the current version into something like 'BoostingSpanQuery'.
The reason is rooted in performance. In my testing, I compared query
throughput using TermQuery against 2 versions of BoostingTermQuery
- the
current one that extends SpanQuery and one that extends TermQuery
(which
I've included, below). Here are the results (qps = queries per
second):
TermQuery: 200 qps
BoostingTermQuery (extends SpanQuery): 97 qps
BoostingTermQuery (extends TermQuery): 130 qps
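Here is a version of BoostingTermQuery that extends TermQuery. To make
that possible I had to modify TermQuery and TermScorer: TermWeight,
TermScorer and the TermScorer constructor are now public, and TermScorer
is no longer final (see the diffs below). A code review would be in
order, and I would appreciate your comments on this suggestion.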
Peter
-----------------------------------------
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermDocs;
import org.apache.lucene.index.TermPositions;
import org.apache.lucene.search.*;
import java.io.IOException;
/**
* Copyright 2004 The Apache Software Foundation
* <p/>
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
* <p/>
* http://www.apache.org/licenses/LICENSE-2.0
* <p/>
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
* The BoostingTermQuery is very similar to the [EMAIL PROTECTED]
org.apache.lucene.search.spans.SpanTermQuery} except
* that it factors in the value of the payload located at each of the
positions where the
* [EMAIL PROTECTED] org.apache.lucene.index.Term} occurs.
* <p>
* In order to take advantage of this, you must override [EMAIL PROTECTED]
org.apache.lucene.search.Similarity#scorePayload(byte[],int,int)}
* which returns 1 by default.
* <p>
* Payload scores are averaged across term occurrences in the
document.
*
* <p><font color="#FF0000">
* WARNING: The status of the <b>Payloads</b> feature is experimental.
* The APIs introduced here might change in the future and will not be
* supported anymore in such a case.</font>
*
* @see org.apache.lucene.search.Similarity#scorePayload(byte[],
int, int)
*/
public class BoostingTermQuery extends TermQuery{
Term term;
Similarity similarity;
public BoostingTermQuery(Term term) {
super(term);
this.term = term;
}
protected Weight createWeight(Searcher searcher) throws
IOException {
this.similarity = getSimilarity(searcher);
return new BoostingTermWeight(this, searcher);
}
protected class BoostingTermWeight extends TermWeight implements
Weight {
public BoostingTermWeight(BoostingTermQuery query, Searcher
searcher)
throws IOException {
super(searcher);
}
public Scorer scorer(IndexReader reader) throws IOException {
return new BoostingTermScorer(reader.termDocs(term),
reader.termPositions(term), this, similarity,
reader.norms(term.field()));
}
class BoostingTermScorer extends TermScorer {
//TODO: is this the best way to allocate this?
byte[] payload = new byte[256];
private TermPositions positions;
protected float payloadScore;
private int payloadsSeen;
public BoostingTermScorer(TermDocs termDocs, TermPositions
termPositions, Weight weight,
Similarity similarity, byte[]
norms) throws
IOException {
super(weight, termDocs, similarity, norms);
positions = termPositions;
}
/**
* Go to the next document
*
*/
public boolean next() throws IOException {
boolean result = super.next();
//set the payload. super.next() properly increments the term
positions
if (result) {
if (positions.skipTo(super.doc())) {
positions.nextPosition();
processPayload(similarity);
}
}
return result;
}
public boolean skipTo(int target) throws IOException {
boolean result = super.skipTo(target);
if (result) {
if (positions.skipTo(target)) {
positions.nextPosition();
processPayload(similarity);
}
}
return result;
}
// protected boolean setFreqCurrentDoc() throws IOException {
// if (!more) {
// return false;
// }
// doc = spans.doc();
// freq = 0.0f;
// payloadScore = 0;
// payloadsSeen = 0;
// Similarity similarity1 = getSimilarity();
// while (more && doc == spans.doc()) {
// int matchLength = spans.end() - spans.start();
//
// freq += similarity1.sloppyFreq(matchLength);
// processPayload(similarity1);
//
// more = spans.next();//this moves positions to the next
match in
this document
// }
// return more || (freq != 0);
// }
protected void processPayload(Similarity similarity) throws
IOException {
if (positions.isPayloadAvailable()) {
payload = positions.getPayload(payload, 0);
payloadScore += similarity.scorePayload(payload, 0,
positions.getPayloadLength());
payloadsSeen++;
} else {
//zero out the payload?
}
}
public float score() {
return super.score() * (payloadsSeen > 0 ? (payloadScore /
payloadsSeen) : 1);
}
public Explanation explain(final int doc) throws IOException {
Explanation result = new Explanation();
Explanation nonPayloadExpl = super.explain(doc);
result.addDetail(nonPayloadExpl);
//QUESTION: Is there a wau to avoid this skipTo call? We
need to
know whether to load the payload or not
Explanation payloadBoost = new Explanation();
result.addDetail(payloadBoost);
/*
if (skipTo(doc) == true) {
processPayload();
}
*/
float avgPayloadScore = (payloadsSeen > 0 ? (payloadScore /
payloadsSeen) : 1);
payloadBoost.setValue(avgPayloadScore);
//GSI: I suppose we could toString the payload, but I don't
think
that would be a good idea
payloadBoost.setDescription("scorePayload(...)");
result.setValue(nonPayloadExpl.getValue() * avgPayloadScore);
result.setDescription("btq, product of:");
return result;
}
}
}
public boolean equals(Object o) {
if (!(o instanceof BoostingTermQuery))
return false;
BoostingTermQuery other = (BoostingTermQuery) o;
return (this.getBoost() == other.getBoost())
&& this.term.equals(other.term);
}
}
Diffs for TermQuery, TermScorer:
Index: src/java/org/apache/lucene/search/TermQuery.java
===================================================================
--- src/java/org/apache/lucene/search/TermQuery.java (revision
581018)
+++ src/java/org/apache/lucene/search/TermQuery.java (working copy)
@@ -31,7 +31,7 @@
public class TermQuery extends Query {
private Term term;
- private class TermWeight implements Weight {
+ public class TermWeight implements Weight {
private Similarity similarity;
private float value;
private float idf;
Index: src/java/org/apache/lucene/search/TermScorer.java
===================================================================
--- src/java/org/apache/lucene/search/TermScorer.java (revision
581018)
+++ src/java/org/apache/lucene/search/TermScorer.java (working
copy)
@@ -23,7 +23,7 @@
/** Expert: A <code>Scorer</code> for documents matching a
<code>Term</code>.
*/
-final class TermScorer extends Scorer {
+public class TermScorer extends Scorer {
private Weight weight;
private TermDocs termDocs;
private byte[] norms;
@@ -44,7 +44,7 @@
* @param similarity The </code>Similarity</code> implementation
to be
used for score computations.
* @param norms The field norms of the document fields for the
<code>Term</code>.
*/
- TermScorer(Weight weight, TermDocs td, Similarity similarity,
+ public TermScorer(Weight weight, TermDocs td, Similarity
similarity,
byte[] norms) {
super(similarity);
this.weight = weight;
Peter