Here is an example that is failing.  However, just doing a
dir(SpanScorer) with your console commands shows that it has no extra
attributes other than the base Java object ones, and it is using the
spans version of the code.




# ====================================================================
#   Licensed under the Apache License, Version 2.0 (the "License");
#   you may not use this file except in compliance with the License.
#   You may obtain a copy of the License at
#
#       http://www.apache.org/licenses/LICENSE-2.0
#
#   Unless required by applicable law or agreed to in writing, software
#   distributed under the License is distributed on an "AS IS" BASIS,
#   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#   See the License for the specific language governing permissions and
#   limitations under the License.
# ====================================================================

from unittest import TestCase, main
from lucene import *


class TestFormatter(PythonFormatter):
        """
        Formatter that wraps scoring terms in <b>...</b> and reports each
        highlighted term back to the owning test case.
        """

        def __init__(self, testCase):
                """Remember the test case that receives countHighlightTerm() calls."""
                super(TestFormatter, self).__init__()
                self.testCase = testCase

        def highlightTerm(self, originalText, group):
                """
                Return originalText unchanged when the group's total score is
                not positive; otherwise notify the test case and return the
                text wrapped in bold tags.
                """
                totalScore = group.getTotalScore()
                if totalScore <= 0:
                        return originalText

                # Let the test case record the term before it is decorated.
                self.testCase.countHighlightTerm(originalText)
                highlighted = "<b>" + originalText + "</b>"
                return highlighted


class HighlighterTestCase(TestCase):
        """
        Unit tests ported from Java Lucene.
        2004 by Yura Smolsky ;)

        Each test indexes the entries of `texts` into a RAMDirectory (see
        setUp) and highlights query matches with a SpanScorer-backed
        Highlighter.
        """

        FIELD_NAME = "contents"

        # One entry per indexed document.
        # NOTE(review): in the pasted source these literals were wrapped
        # across lines with no separators (plus a stray ';'), which does not
        # parse; they are assumed to be five separate documents -- confirm.
        texts = [
                "A wicked problem is one for which each attempt to create a "
                "solution changes the understanding of the problem.  Wicked "
                "problems cannot be solved in a traditional linear fashion, "
                "because the problem definition evolves as new possible "
                "solutions are considered and/or implemented.",
                "Wicked problems always occur in a social context -- the "
                "wickedness of the problem reflects the diversity among the "
                "stakeholders in the problem.",
                "From http://cognexus.org/id42.htm",
                "Most projects in organizations -- and virtually all "
                "technology-related projects these days -- are about wicked "
                "problems.  Indeed, it is the social complexity of these "
                "problems, not their technical complexity, that overwhelms "
                "most current problem solving and project management "
                "approaches.",
                "This text has a typo in referring to whicked problems",
        ]

        def __init__(self, *args):
                """Create the per-instance highlight log and the query parser."""
                super(HighlighterTestCase, self).__init__(*args)
                self.foundList = []
                self.parser = QueryParser(self.FIELD_NAME, StandardAnalyzer())

        def testSimpleHighlighter(self):
                """Highlight every hit for "wicked" and print fragments and terms."""
                self.doSearching("wicked")
                formatter = TestFormatter(self)
                # This name was undefined in the original (NameError).
                # NOTE(review): 2 is an assumed value -- confirm.
                maxNumFragmentsRequired = 2

                for i in range(self.hits.length()):
                        self.foundList = []
                        text = self.hits.doc(i).get(self.FIELD_NAME)
                        # The scorer and the highlighter must share one cached
                        # stream; the raw analyzer stream cannot be read twice.
                        tokenStream = CachingTokenFilter(
                                self.analyzer.tokenStream(self.FIELD_NAME,
                                                          StringReader(text)))
                        scorer = SpanScorer(self.query, self.FIELD_NAME,
                                            tokenStream)
                        highlighter = Highlighter(formatter, scorer)
                        # Rewind the cache after SpanScorer has consumed it so
                        # the highlighter sees the tokens from the beginning.
                        tokenStream.reset()
                        result = highlighter.getBestFragments(
                                tokenStream, text,
                                maxNumFragmentsRequired, "...")
                        # Single-argument print() yields the same output under
                        # the Python 2 print statement and Python 3.
                        print("\t%s" % result)
                        print("Found:  %s" % self.foundList)

        def doSearching(self, queryString):
                """
                Parse and rewrite queryString, run it against the index and
                store the results in self.query / self.hits.  Also resets
                self.numHighlights.
                """
                searcher = IndexSearcher(self.ramDir)
                self.query = self.parser.parse(queryString)
                # for any multi-term queries to work (prefix, wildcard, range,
                # fuzzy etc) you must use a rewritten query!
                self.query = self.query.rewrite(self.reader)

                print("Searching for: %s" % self.query.toString(self.FIELD_NAME))
                self.hits = searcher.search(self.query)
                self.numHighlights = 0

        def countHighlightTerm(self, found):
                """Record one highlighted term (called by TestFormatter)."""
                self.foundList.append(found)

                self.numHighlights += 1 # update stats used in assertions

        def setUp(self):
                """Build a fresh in-memory index from `texts` and open a reader."""
                self.analyzer = StandardAnalyzer()
                self.ramDir = RAMDirectory()
                writer = IndexWriter(self.ramDir, self.analyzer, True)
                for text in self.texts:
                        self.addDoc(writer, text)

                writer.optimize()
                writer.close()
                self.reader = IndexReader.open(self.ramDir)
                self.numHighlights = 0

        def tearDown(self):
                """Close the reader opened in setUp so repeated runs do not leak it."""
                reader = getattr(self, "reader", None)
                if reader is not None:
                        reader.close()
                        self.reader = None

        def addDoc(self, writer, text):
                """Add one document holding `text` in FIELD_NAME to `writer`."""
                d = Document()
                f = Field(self.FIELD_NAME, text,
                                Field.Store.YES, Field.Index.TOKENIZED,
                                Field.TermVector.YES)
                d.add(f)
                writer.addDocument(d)


if __name__ == "__main__":
        import sys
        import lucene

        # The JVM must be started before any Lucene class is instantiated.
        lucene.initVM(lucene.CLASSPATH)
        if '-loop' in sys.argv:
                sys.argv.remove('-loop')
                # Re-run the suite indefinitely.  unittest's main() finishes by
                # raising SystemExit, so that (and ordinary errors) must be
                # swallowed to keep looping; KeyboardInterrupt is deliberately
                # NOT caught so the loop can still be stopped with Ctrl-C.
                while True:
                        try:
                                main()
                        except (Exception, SystemExit):
                                pass
        else:
                main()

Reply via email to