I'm trying to index a CSV file that contains Arabic words. My code: from whoosh import fields, index import os.path import csv import codecs # This list associates a name with each position in a row columns = ["juza","chapter","verse","voc","analysis", "unvoc","root"]
schema = fields.Schema(juza=fields.NUMERIC, chapter=fields.NUMERIC, verse=fields.NUMERIC, voc=fields.TEXT, analysis=fields.KEYWORD, unvoc=fields.TEXT, root=fields.TEXT) # Create the Whoosh index indexname = "index" if not os.path.exists(indexname): os.mkdir(indexname) ix = index.create_in(indexname, schema) # Open a writer for the index with ix.writer() as writer: # Open the CSV file """fi = codecs.open('q.csv', 'rb','utf8') data = fi.read() fi.close() f= codecs.open('mynew.csv', 'wb','utf-8') f.write(data.replace('\x00', '')) f.close() with codecs.open("mynew.csv", "rb","utf8") as csvfile: # Create a csv reader object for the file csvreader = csv.reader(csvfile)""" with codecs.open("q.csv", "r","utf8") as csvfile: # Create a csv reader object for the file csvreader = csvfile.read() # Read each row in the file for row in csvreader: # Create a dictionary to hold the document values for this row doc = {} # Read the values for the row enumerated like # (0, "name"), (1, "quantity"), etc. for colnum, value in enumerate(row): # Get the field name from the "columns" list fieldname = columns[colnum] # Strip any whitespace and convert to unicode # NOTE: you need to pass the right encoding here! 
try: value = unicode(value.strip(), "utf8") except TypeError: value=value.strip() # Put the value in the dictionary doc[fieldname] = value # Pass the dictionary to the add_document method writer.add_document(**doc) And I got this error: Traceback (most recent call last): File "D:/Python27/rr.py", line 62, in <module> writer.add_document(**doc) File "D:/Python27\whoosh\filedb\filewriting.py", line 369, in add_document items = field.index(value) File "D:/Python27\whoosh\fields.py", line 466, in index return [(txt, 1, 1.0, '') for txt in self._tiers(num)] File "D:/Python27\whoosh\fields.py", line 454, in _tiers yield self.to_text(num, shift=shift) File "D:/Python27\whoosh\fields.py", line 487, in to_text return self._to_text(self.prepare_number(x), shift=shift, File "D:/Python27\whoosh\fields.py", line 476, in prepare_number x = self.type(x) UnicodeEncodeError: 'decimal' codec can't encode character u'\ufeff' in position 0: invalid decimal Unicode string My file is here: http://www.mediafire.com/view/?wy3asap4ba7dknl -- http://mail.python.org/mailman/listinfo/python-list