On Tue, 28 Apr 2009, Max Lynch wrote:
Here is an example that is failing. However, just doing a dir(SpanScorer)
with your console commands shows it has no extra attributes other than the
base java object ones, and it is using the spans version of the code.
Indeed, the name clash between org.apache.lucene.search.spans.SpanScorer and
org.apache.lucene.search.highlight.SpanScorer was the problem.
I added support for a new command line argument to JCC, called --rename,
that makes it possible to list one (or more, comma-separated) classes to
rename for the Python wrapper.
For example, I added this to the PyLucene build:
--rename org.apache.lucene.search.highlight.SpanScorer=HighlighterSpanScorer
to rename org.apache.lucene.search.highlight.SpanScorer to
HighlighterSpanScorer.
Your example code then ran (after modifying it to use the new class name).
Andi..
# ====================================================================
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ====================================================================
from unittest import TestCase, main
from lucene import *
class TestFormatter(PythonFormatter):
    """
    Formatter that wraps scoring terms in <b>...</b> and reports every
    wrapped term to the test case it was created with.
    """

    def __init__(self, testCase):
        super(TestFormatter, self).__init__()
        # notified through countHighlightTerm() for each highlighted term
        self.testCase = testCase

    def highlightTerm(self, originalText, group):
        """
        Return the text to emit for one token.

        When the total score of group is zero or negative the token is
        returned untouched and nothing is reported; otherwise the token is
        reported to the test case and returned wrapped in bold tags.
        """
        if group.getTotalScore() <= 0:
            highlighted = originalText
        else:
            self.testCase.countHighlightTerm(originalText)
            highlighted = "".join(["<b>", originalText, "</b>"])

        return highlighted
class HighlighterTestCase(TestCase):
    """
    Unit tests ported from Java Lucene.
    2004 by Yura Smolsky ;)

    setUp() indexes every entry of ``texts`` as its own document in a fresh
    RAMDirectory; the tests search that index and run the highlighter over
    the stored text of each hit.  Highlighted terms are reported back by
    TestFormatter through countHighlightTerm().
    """

    FIELD_NAME = "contents"

    # One string per indexed document.  The trailing comma after each entry
    # matters: adjacent string literals without a comma are concatenated
    # into a single string, i.e. a single document.
    texts = [
        "A wicked problem is one for which each attempt to create a "
        "solution changes the understanding of the problem. Wicked "
        "problems cannot be solved in a traditional linear fashion, "
        "because the problem definition evolves as new possible solutions "
        "are considered and/or implemented.",

        "Wicked problems always occur in a social context -- the "
        "wickedness of the problem reflects the diversity among the "
        "stakeholders in the problem.",

        "From http://cognexus.org/id42.htm",

        "Most projects in organizations -- and virtually all "
        "technology-related projects these days -- are about wicked "
        "problems. Indeed, it is the social complexity of these problems, "
        "not their technical complexity, that overwhelms most current "
        "problem solving and project management approaches.",

        "This text has a typo in referring to whicked problems",
    ]

    def __init__(self, *args, **kwargs):
        # forward keyword arguments too (e.g. methodName=...)
        super(HighlighterTestCase, self).__init__(*args, **kwargs)
        self.foundList = []
        self.parser = QueryParser(self.FIELD_NAME, StandardAnalyzer())

    def setUp(self):
        """Build the in-memory index from ``texts`` and open a reader."""
        self.analyzer = StandardAnalyzer()
        self.ramDir = RAMDirectory()
        self.searcher = None

        writer = IndexWriter(self.ramDir, self.analyzer, True)
        try:
            for text in self.texts:
                self.addDoc(writer, text)
            writer.optimize()
        finally:
            # always release the writer, even if indexing failed
            writer.close()

        self.reader = IndexReader.open(self.ramDir)
        self.numHighlights = 0

    def tearDown(self):
        """Close the searcher (if a search was run) and the index reader."""
        if self.searcher is not None:
            self.searcher.close()
            self.searcher = None
        self.reader.close()

    def testSimpleHighlighter(self):
        """Highlight the hits for "wicked" and print the best fragments."""
        self.doSearching("wicked")
        formatter = TestFormatter(self)

        # This name was used below without ever being defined (NameError).
        # NOTE(review): 2 is the value used by the upstream Java
        # HighlighterTest -- confirm.
        maxNumFragmentsRequired = 2

        for i in range(self.hits.length()):
            self.foundList = []
            text = self.hits.doc(i).get(self.FIELD_NAME)
            ctokenStream = CachingTokenFilter(
                self.analyzer.tokenStream(self.FIELD_NAME,
                                          StringReader(text)))

            # org.apache.lucene.search.highlight.SpanScorer clashes by name
            # with org.apache.lucene.search.spans.SpanScorer; the bare name
            # SpanScorer resolves to the spans class.  The highlighter one
            # is exposed as HighlighterSpanScorer, which requires a PyLucene
            # build made with the JCC option
            # --rename org.apache.lucene.search.highlight.SpanScorer=HighlighterSpanScorer
            scorer = HighlighterSpanScorer(self.query, self.FIELD_NAME,
                                           ctokenStream)
            highlighter = Highlighter(formatter, scorer)

            # The scorer may already have read tokens through the caching
            # filter, so rewind the filter and highlight from it rather than
            # from the underlying (possibly exhausted) stream.
            ctokenStream.reset()
            result = highlighter.getBestFragments(ctokenStream, text,
                                                  maxNumFragmentsRequired,
                                                  "...")

            # single-argument print(): same output on Python 2, and valid
            # syntax on Python 3
            print("\t%s" % (result,))
            print("Found:  %s" % (self.foundList,))

    def doSearching(self, queryString):
        """
        Parse queryString, rewrite it against the index reader and run it.

        Sets self.query (the rewritten query), self.searcher and self.hits,
        and resets self.numHighlights to 0.
        """
        query = self.parser.parse(queryString)
        # for any multi-term queries to work (prefix, wildcard, range,
        # fuzzy etc) you must use a rewritten query!
        self.query = query.rewrite(self.reader)
        print("Searching for: %s" % (self.query.toString(self.FIELD_NAME),))

        # self.hits is read after this method returns, so the searcher is
        # kept on the instance and closed in tearDown() or on the next
        # search.
        if self.searcher is not None:
            self.searcher.close()
        self.searcher = IndexSearcher(self.ramDir)
        self.hits = self.searcher.search(self.query)
        self.numHighlights = 0

    def countHighlightTerm(self, found):
        """Record one highlighted term; called by TestFormatter."""
        self.foundList.append(found)
        self.numHighlights += 1

    def addDoc(self, writer, text):
        """Add text to writer as a stored, tokenized FIELD_NAME field."""
        d = Document()
        f = Field(self.FIELD_NAME, text,
                  Field.Store.YES, Field.Index.TOKENIZED,
                  Field.TermVector.YES)
        d.add(f)
        writer.addDocument(d)
if __name__ == "__main__":
    import sys
    import lucene

    # the Java VM has to be running before any Lucene class is used
    lucene.initVM(lucene.CLASSPATH)

    if '-loop' in sys.argv:
        # Run the suite over and over until interrupted.
        sys.argv.remove('-loop')
        while True:
            try:
                main()
            except KeyboardInterrupt:
                # let Ctrl-C end the loop instead of being swallowed
                raise
            except (SystemExit, Exception):
                # unittest's main() exits through SystemExit after each
                # run; ignore it, and any test-run error, to keep looping
                pass
    else:
        main()