Dear Andi, please find attached the Java and the Python code. Both of them create an index with two records using the Shingle analyzer and then query it, printing the query and the terms of the query.
Thanks a lot for your help Marco On Sun, Jan 29, 2017 at 3:10 AM, Andi Vajda <va...@apache.org> wrote: > > On Sat, 28 Jan 2017, marco turchi wrote: > > Dear All, >> I need to use the ShingleAnalyzerWrapper in PyLucene. >> >> I have built the analyzer similar to Lucene: >> self.analyzer = ShingleAnalyzerWrapper(WhitespaceAnalyzer(), 2, 4, " " , >> True, False, None) >> >> and I have used it inside QuertParser >> query = QueryParser("source", self.analyzer).parse("welcome world is at >> on") >> >> the output is: >> source:welcome source:world source:is source:at source:on >> >> I have run the same code in Java and the output is how I would expect it: >> source:welcome source:welcome world source:welcome world is source:welcome >> world is at source:world source:world is source:world is at source:world >> is >> at on source:is content:is at source:is at on source:at source:at on >> source:on >> >> Do you have any ideas in what I'm doing wrong in PyLucene? >> > > Please, help me help you by including two simple programs that I can run > to reproduce the problem. One in Java producing the output you expect, one > in Python producing the output you're reporting. > > Thanks ! > > Andi.. > > > >> Thanks a lot in advance for your help >> Marco >> >>
#!/usr/bin/env python INDEX_DIR = "IndexFiles.index" import sys, os, lucene, threading, time from datetime import datetime from java.nio.file import Paths from java.io import StringReader #from org.apache.lucene.analysis.miscellaneous import LimitTokenCountAnalyzer #from org.apache.lucene.analysis.standard import StandardAnalyzer from org.apache.lucene.analysis.shingle import ShingleAnalyzerWrapper from org.apache.lucene.analysis.core import WhitespaceAnalyzer from org.apache.lucene.document import Document, Field, FieldType, StringField, TextField from org.apache.lucene.index import \ FieldInfo, IndexWriter, IndexWriterConfig, IndexOptions from org.apache.lucene.store import SimpleFSDirectory from org.apache.lucene.queryparser.classic import QueryParser from org.apache.lucene.store import SimpleFSDirectory from org.apache.lucene.search import IndexSearcher from org.apache.lucene.index import DirectoryReader from org.apache.lucene.index import Term from org.apache.lucene.search import BooleanClause, BooleanQuery, TermQuery from org.apache.lucene.analysis.tokenattributes import CharTermAttribute; from org.apache.lucene.analysis import TokenStream; """ This class is loosely based on the Lucene (java implementation) demo class org.apache.lucene.demo.IndexFiles. It will take a directory as an argument and will index all of the files in that directory and downward recursively. It will index on the file path, the file name and the file contents. The resulting Lucene index will be placed in the current directory and called 'index'. 
""" class Ticker(object): def __init__(self): self.tick = True def run(self): while self.tick: sys.stdout.write('.') sys.stdout.flush() time.sleep(1.0) class Indexer(object): """Usage: python IndexFiles <doc_directory>""" def __init__(self, storeDir): lucene.initVM(vmargs=['-Djava.awt.headless=true']) print 'lucene', lucene.VERSION if not os.path.exists(storeDir): os.mkdir(storeDir) directory = SimpleFSDirectory(Paths.get(storeDir)) #analyzer = StandardAnalyzer() #analyzer = WhitespaceAnalyzer() analyzer = ShingleAnalyzerWrapper(WhitespaceAnalyzer(), 2, 6, ' ', True, False, None) #analyzer = LimitTokenCountAnalyzer(analyzer, 10000) config = IndexWriterConfig(analyzer) config.setOpenMode(IndexWriterConfig.OpenMode.CREATE) self.writer = IndexWriter(directory, config) #self.indexDocs(root, writer) #ticker = Ticker() #print 'commit index', #threading.Thread(target=ticker.run).start() #writer.commit() #writer.close() #ticker.tick = False #print 'done' def commit(self): ticker = Ticker() print 'commit index', threading.Thread(target=ticker.run).start() self.writer.commit() #self.writer.close() ticker.tick = False print 'done' def close(self): self.writer.close() def add(self, count, source, reference): try: doc = Document() #except Exception, e: # print "Failed in adding source reference sentences:", e #doc.add(Field("id", count, t1)) #doc.add(Field("source", source, t2)) #doc.add(Field("reference", reference, t3)) doc.add(TextField("id", str(count), Field.Store.YES)) doc.add(TextField("source", source, Field.Store.YES)) doc.add(TextField("reference", reference, Field.Store.YES)) self.writer.addDocument(doc) #ticker = Ticker() # print 'commit index', #threading.Thread(target=ticker.run).start() #self.writer.commit() #self.writer.close() #ticker.tick = False #print 'done' except Exception, e: print "Failed in adding source reference sentences:", e class Searcher(object): """Usage: python IndexFiles <doc_directory>""" def __init__(self, base_dir): 
#lucene.initVM(vmargs=['-Djava.awt.headless=true']) print 'lucene', lucene.VERSION print base_dir directory = SimpleFSDirectory(Paths.get(base_dir)) self.searcher = IndexSearcher(DirectoryReader.open(directory)) self.analyzer = WhitespaceAnalyzer() #self.analyzer = ShingleAnalyzerWrapper(WhitespaceAnalyzer(), 2, 6, " " , True, False, None) #self.analyzer = StandardAnalyzer() def query_source(self, queryString, maxDoc): #print #print "Searching for:", queryString #query = QueryParser("source", self.analyzer).parse(queryString) query = self.getBooleanAndQuery(queryString) print query.build() scoreDocs = self.searcher.search(query.build(), int(maxDoc)).scoreDocs print "%s total matching documents." % len(scoreDocs) c = 0 for scoreDoc in scoreDocs: doc = self.searcher.doc(scoreDoc.doc) print 'id:', doc.get("id"), 'score:', scoreDoc.score, 'source:', doc.get("source"), 'reference:', doc.get("reference") c += 1 def getBooleanAndQuery(self, sourceString): query = BooleanQuery.Builder() #query.add(TermQuery(Term("source", command.split("##")[0])), BooleanClause.Occur.MUST) #query.add(TermQuery(Term("reference", command.split("##")[1])), BooleanClause.Occur.MUST) #bq1 = query.build() # add source ts = self.analyzer.tokenStream("source", StringReader(sourceString)) termAtt = ts.addAttribute(CharTermAttribute.class_) ts.reset() while(ts.incrementToken()): termText = termAtt.toString() print "term text: "+termText query.add(TermQuery(Term("source", termText)),BooleanClause.Occur.SHOULD) ts.close() #query.build() return query def delete(self): del self.searcher if __name__ == '__main__': if len(sys.argv) < 2: print IndexFiles.__doc__ sys.exit(1) #lucene.initVM(vmargs=['-Djava.awt.headless=true']) print 'lucene', lucene.VERSION start = datetime.now() try: base_dir = os.path.dirname(os.path.abspath(sys.argv[1])) print base_dir print os.path.join(base_dir, INDEX_DIR) indexer = Indexer(os.path.join(base_dir, INDEX_DIR)) indexer.add(0, "welcome world", "benvenuto mondo") 
indexer.add(1, "world to my welcome", "mondo nel mio benvenuto") indexer.commit() indexer.close() search = Searcher(os.path.join(base_dir, INDEX_DIR)) search.query_source("welcome world is at on", 10) except Exception, e: print "Failed: ", e raise e