Dear Andi, please find attached the Java and the Python code. Both of them create an index with two records using the Shingle analyzer and then query it, printing the query and the terms of the query.
Thanks a lot for your help Marco On Sun, Jan 29, 2017 at 3:10 AM, Andi Vajda <va...@apache.org> wrote: > > On Sat, 28 Jan 2017, marco turchi wrote: > > Dear All, >> I need to use the ShingleAnalyzerWrapper in PyLucene. >> >> I have built the analyzer similar to Lucene: >> self.analyzer = ShingleAnalyzerWrapper(WhitespaceAnalyzer(), 2, 4, " " , >> True, False, None) >> >> and I have used it inside QuertParser >> query = QueryParser("source", self.analyzer).parse("welcome world is at >> on") >> >> the output is: >> source:welcome source:world source:is source:at source:on >> >> I have run the same code in Java and the output is how I would expect it: >> source:welcome source:welcome world source:welcome world is source:welcome >> world is at source:world source:world is source:world is at source:world >> is >> at on source:is content:is at source:is at on source:at source:at on >> source:on >> >> Do you have any ideas in what I'm doing wrong in PyLucene? >> > > Please, help me help you by including two simple programs that I can run > to reproduce the problem. One in Java producing the output you expect, one > in Python producing the output you're reporting. > > Thanks ! > > Andi.. > > > >> Thanks a lot in advance for your help >> Marco >> >>
#!/usr/bin/env python INDEX_DIR = "IndexFiles.index" import sys, os, lucene, threading, time from datetime import datetime from java.nio.file import Paths from java.io import StringReader #from org.apache.lucene.analysis.miscellaneous import LimitTokenCountAnalyzer #from org.apache.lucene.analysis.standard import StandardAnalyzer from org.apache.lucene.analysis.shingle import ShingleAnalyzerWrapper from org.apache.lucene.analysis.core import WhitespaceAnalyzer from org.apache.lucene.document import Document, Field, FieldType, StringField, TextField from org.apache.lucene.index import \ FieldInfo, IndexWriter, IndexWriterConfig, IndexOptions from org.apache.lucene.store import SimpleFSDirectory from org.apache.lucene.queryparser.classic import QueryParser from org.apache.lucene.store import SimpleFSDirectory from org.apache.lucene.search import IndexSearcher from org.apache.lucene.index import DirectoryReader from org.apache.lucene.index import Term from org.apache.lucene.search import BooleanClause, BooleanQuery, TermQuery from org.apache.lucene.analysis.tokenattributes import CharTermAttribute; from org.apache.lucene.analysis import TokenStream; """ This class is loosely based on the Lucene (java implementation) demo class org.apache.lucene.demo.IndexFiles. It will take a directory as an argument and will index all of the files in that directory and downward recursively. It will index on the file path, the file name and the file contents. The resulting Lucene index will be placed in the current directory and called 'index'. 
""" class Ticker(object): def __init__(self): self.tick = True def run(self): while self.tick: sys.stdout.write('.') sys.stdout.flush() time.sleep(1.0) class Indexer(object): """Usage: python IndexFiles <doc_directory>""" def __init__(self, storeDir): lucene.initVM(vmargs=['-Djava.awt.headless=true']) print 'lucene', lucene.VERSION if not os.path.exists(storeDir): os.mkdir(storeDir) directory = SimpleFSDirectory(Paths.get(storeDir)) #analyzer = StandardAnalyzer() #analyzer = WhitespaceAnalyzer() analyzer = ShingleAnalyzerWrapper(WhitespaceAnalyzer(), 2, 6, ' ', True, False, None) #analyzer = LimitTokenCountAnalyzer(analyzer, 10000) config = IndexWriterConfig(analyzer) config.setOpenMode(IndexWriterConfig.OpenMode.CREATE) self.writer = IndexWriter(directory, config) #self.indexDocs(root, writer) #ticker = Ticker() #print 'commit index', #threading.Thread(target=ticker.run).start() #writer.commit() #writer.close() #ticker.tick = False #print 'done' def commit(self): ticker = Ticker() print 'commit index', threading.Thread(target=ticker.run).start() self.writer.commit() #self.writer.close() ticker.tick = False print 'done' def close(self): self.writer.close() def add(self, count, source, reference): try: doc = Document() #except Exception, e: # print "Failed in adding source reference sentences:", e #doc.add(Field("id", count, t1)) #doc.add(Field("source", source, t2)) #doc.add(Field("reference", reference, t3)) doc.add(TextField("id", str(count), Field.Store.YES)) doc.add(TextField("source", source, Field.Store.YES)) doc.add(TextField("reference", reference, Field.Store.YES)) self.writer.addDocument(doc) #ticker = Ticker() # print 'commit index', #threading.Thread(target=ticker.run).start() #self.writer.commit() #self.writer.close() #ticker.tick = False #print 'done' except Exception, e: print "Failed in adding source reference sentences:", e class Searcher(object): """Usage: python IndexFiles <doc_directory>""" def __init__(self, base_dir): 
#lucene.initVM(vmargs=['-Djava.awt.headless=true']) print 'lucene', lucene.VERSION print base_dir directory = SimpleFSDirectory(Paths.get(base_dir)) self.searcher = IndexSearcher(DirectoryReader.open(directory)) self.analyzer = WhitespaceAnalyzer() #self.analyzer = ShingleAnalyzerWrapper(WhitespaceAnalyzer(), 2, 6, " " , True, False, None) #self.analyzer = StandardAnalyzer() def query_source(self, queryString, maxDoc): #print #print "Searching for:", queryString #query = QueryParser("source", self.analyzer).parse(queryString) query = self.getBooleanAndQuery(queryString) print query.build() scoreDocs = self.searcher.search(query.build(), int(maxDoc)).scoreDocs print "%s total matching documents." % len(scoreDocs) c = 0 for scoreDoc in scoreDocs: doc = self.searcher.doc(scoreDoc.doc) print 'id:', doc.get("id"), 'score:', scoreDoc.score, 'source:', doc.get("source"), 'reference:', doc.get("reference") c += 1 def getBooleanAndQuery(self, sourceString): query = BooleanQuery.Builder() #query.add(TermQuery(Term("source", command.split("##")[0])), BooleanClause.Occur.MUST) #query.add(TermQuery(Term("reference", command.split("##")[1])), BooleanClause.Occur.MUST) #bq1 = query.build() # add source ts = self.analyzer.tokenStream("source", StringReader(sourceString)) termAtt = ts.addAttribute(CharTermAttribute.class_) ts.reset() while(ts.incrementToken()): termText = termAtt.toString() print "term text: "+termText query.add(TermQuery(Term("source", termText)),BooleanClause.Occur.SHOULD) ts.close() #query.build() return query def delete(self): del self.searcher if __name__ == '__main__': if len(sys.argv) < 2: print IndexFiles.__doc__ sys.exit(1) #lucene.initVM(vmargs=['-Djava.awt.headless=true']) print 'lucene', lucene.VERSION start = datetime.now() try: base_dir = os.path.dirname(os.path.abspath(sys.argv[1])) print base_dir print os.path.join(base_dir, INDEX_DIR) indexer = Indexer(os.path.join(base_dir, INDEX_DIR)) indexer.add(0, "welcome world", "benvenuto mondo") 
indexer.add(1, "world to my welcome", "mondo nel mio benvenuto") indexer.commit() indexer.close() search = Searcher(os.path.join(base_dir, INDEX_DIR)) search.query_source("welcome world is at on", 10) except Exception, e: print "Failed: ", e raise e