My code works well with an English file, but when I use a text file encoded as "utf-8" (my file contains some Arabic letters) it doesn't work. My code: # encoding: utf-8 from whoosh import fields, index import os.path import re,string import codecs from whoosh.qparser import QueryParser
# This list associates a name with each position in a row columns = ["juza","chapter","verse","voc"] schema = fields.Schema(juza=fields.NUMERIC(stored=True), chapter=fields.NUMERIC(stored=True), verse=fields.NUMERIC(stored=True), voc=fields.TEXT(stored=True)) # Create the Whoosh index indexname = "indexdir" if not os.path.exists(indexname): os.mkdir(indexname) ix = index.create_in(indexname, schema) # Open a writer for the index with ix.writer() as writer: with codecs.open("tt.txt",encoding='utf-8') as txtfile: lines=txtfile.readlines() # Read each row in the file for i in lines: # Create a dictionary to hold the document values for this row doc = {} thisline=i.split() u=0 # Read the values for the row enumerated like # (0, "juza"), (1, "chapter"), etc. for w in thisline: # Get the field name from the "columns" list fieldname = columns[u] u+=1 #if isinstance(w, basestring): #w = unicode(w) doc[fieldname] = w # Pass the dictionary to the add_document method writer.add_document(**doc) with ix.searcher() as searcher: query = QueryParser("voc", ix.schema).parse(u"كتاب") results = searcher.search(query) print(len(results)) print(results[1]) My error: Traceback (most recent call last): File "D:\Python27\yarab (4).py", line 45, in <module> writer.add_document(**doc) File "build\bdist.win32\egg\whoosh\filedb\filewriting.py", line 369, in add_document items = field.index(value) File "build\bdist.win32\egg\whoosh\fields.py", line 466, in index return [(txt, 1, 1.0, '') for txt in self._tiers(num)] File "build\bdist.win32\egg\whoosh\fields.py", line 454, in _tiers yield self.to_text(num, shift=shift) File "build\bdist.win32\egg\whoosh\fields.py", line 487, in to_text return self._to_text(self.prepare_number(x), shift=shift, File "build\bdist.win32\egg\whoosh\fields.py", line 476, in prepare_number x = self.type(x) UnicodeEncodeError: 'decimal' codec can't encode character u'\ufeff' in position 0: invalid decimal Unicode string My file: 2 2 3 كتاب 2 2 1 لعبة 1 1 1 كتاب Any 
help? -- http://mail.python.org/mailman/listinfo/python-list