Hi Andi,

 > If you have any questions, feel free to ask this list.
and here we go! :)

I've benchmarked the StandardAnalyzer against a 3 MB text file. The test (see
below) was executed both in PyLucene and in *plain* Lucene, i.e. in Java.

The execution time in Java was 1.7 sec, whereas the PyLucene test below took
37 sec on the same machine.
20 times slower is quite a lot. Does that mean one should rather kick the
"explain" function out of the Python scope to avoid the wrapping overhead?
Or maybe I'm doing something wrong here? Hints are welcome :)
(A timing sketch follows the Python listing below, and a possible Java-side
helper follows the Java one.)

##### PyLucene version of the test
from unittest import TestCase, main
import codecs
from lucene import *

class MyFirstTest(TestCase):
    """Tokenize a 3 MB text file repeatedly with StandardAnalyzer."""

    def tokenizeContent(self, content):
        analyzer = StandardAnalyzer()
        tokenStream = analyzer.tokenStream("dummy", StringReader(content))
        self.explain(tokenStream)

    def testMy1(self):
        f = codecs.open("./3Mb-monolith.txt", 'r', "utf-8")
        content = f.read()
        f.close()
        for i in range(10):
            self.tokenizeContent(content)

    def explain(self, ts):
        # iterate over the whole stream; each token is a Python/JVM crossing
        for t in ts:
            t.termText()


if __name__ == "__main__":
    import sys, lucene
    lucene.initVM(lucene.CLASSPATH)
    if '-loop' in sys.argv:
        sys.argv.remove('-loop')
        while True:
            try:
                main()
            except:
                pass
    else:
        main()
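
To narrow down where those 37 sec go, one could time the token loop in
isolation. Here is a minimal, untested sketch (standard library only; it
assumes initVM() was already called and `content` holds the file's text):

##### timing sketch: isolate the per-token loop cost
import time
from lucene import StandardAnalyzer, StringReader

def time_tokenization(content, runs=10):
    analyzer = StandardAnalyzer()
    start = time.time()
    for _ in range(runs):
        ts = analyzer.tokenStream("dummy", StringReader(content))
        for t in ts:  # each step crosses the Python/JVM boundary
            t.termText()
    print "avg per run: %.2f sec" % ((time.time() - start) / runs)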

//////////////////////////////////////////////////////////////
// Java Lucene version of the same test

package my.test;

import java.io.IOException;
import java.io.InputStream;
import java.io.StringReader;

import org.apache.commons.io.IOUtils;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.junit.Test;

public class MyFirstTest {

    @Test
    public void readAAJob() throws IOException {
        InputStream resourceAsStream = getClass().getResourceAsStream(
                "/3Mb-monolith.txt");
        String content = IOUtils.toString(resourceAsStream);

        for (int i = 0; i < 100; i++)
            tokenizeContent(content);
    }

    private void tokenizeContent(String content) throws IOException {
        StandardAnalyzer analyzer = new StandardAnalyzer();

        TokenStream tokenStream = analyzer.tokenStream("dummy",
                new StringReader(content));

        explain(tokenStream);
    }

    public void explain(TokenStream ts) throws IOException {
        Token token = new Token();
        int i = 0;
        while ((token = ts.next(token)) != null) {
            // System.out.println("Token[ " + i + " ] = " + token.term());
            i++;
        }
    }
}
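
And here is a sketch of what "kicking explain out of the Python scope" could
look like: a small Java helper that drains the stream entirely on the JVM
side, so Python pays one crossing per document instead of one per token.
(Untested; TokenDrain is my own name for it, and the class would have to be
included when the JCC wrappers for PyLucene are generated.)

//////////////////////////////////////////////////////////////
// Hypothetical Java-side helper (sketch)

package my.test;

import java.io.IOException;

import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;

public final class TokenDrain {

    // consume all tokens in Java, return only the count to Python
    public static int drain(TokenStream ts) throws IOException {
        Token token = new Token();
        int count = 0;
        while ((token = ts.next(token)) != null)
            count++;
        return count;
    }
}

From Python, explain() would then shrink to a single TokenDrain.drain(ts)
call.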


best regards
--
Valery A.Khamenya
