On Thu, Dec 22, 2011 at 11:30 AM, Rami Chowdhury <rami.chowdh...@gmail.com> wrote:
> Could you try using the 'open' function from the 'codecs' module? > I believe this is what you meant: file = codecs.open(p + "2.txt", "r", "utf-8") for line in file: print line but got this error: 141 file = codecs.open(p + "2.txt", "r", "utf-8") 142 for line in file: 143 print line 144 *line* = '\r\n', *file* = <open file 'index2.txt', mode 'rb'> /usr/lib64/python2.4/codecs.py in *next*(self=<open file 'index2.txt', mode 'rb'>) 492 493 """ Return the next decoded line from the input stream.""" 494 return self.reader.next() 495 496 def __iter__(self): *self* = <open file 'index2.txt', mode 'rb'>, self.*reader* = <open file 'index2.txt', mode 'rb'>, self.reader.*next* = <bound method StreamReader.next of <open file 'index2.txt', mode 'rb'>> /usr/lib64/python2.4/codecs.py in *next*(self=<open file 'index2.txt', mode 'rb'>) 429 430 """ Return the next decoded line from the input stream.""" 431 line = self.readline() 432 if line: 433 return line line *undefined*, *self* = <open file 'index2.txt', mode 'rb'>, self.* readline* = <bound method StreamReader.readline of <open file 'index2.txt', mode 'rb'>> /usr/lib64/python2.4/codecs.py in *readline*(self=<open file 'index2.txt', mode 'rb'>, size=None, keepends=True) 344 # If size is given, we call read() only once 345 while True: 346 data = self.read(readsize, firstline=True) 347 if data: 348 # If we're at a "\r" read one extra character (which might data *undefined*, *self* = <open file 'index2.txt', mode 'rb'>, self.*read* = <bound method StreamReader.read of <open file 'index2.txt', mode 'rb'>>, * readsize* = 72, firstline *undefined*, *builtin* *True* = True /usr/lib64/python2.4/codecs.py in *read*(self=<open file 'index2.txt', mode 'rb'>, size=72, chars=-1, firstline=True) 291 data = self.bytebuffer + newdata 292 try: 293 newchars, decodedbytes = self.decode(data, self.errors) 294 except UnicodeDecodeError, exc: 295 if firstline: *newchars* = u'', *decodedbytes* = 0, *self* = <open file 'index2.txt', mode 'rb'>, 
self.*decode* = <built-in function utf_8_decode>, *data* = '\xe1intentado para ellos bastante sabios para discernir lo obvio. Tales perso', self.*errors* = 'strict' *UnicodeDecodeError*: 'utf8' codec can't decode bytes in position 0-2: invalid data args = ('utf8', '\xe1 intentado para ellos bastante sabios para discernir lo obvio. Tales perso', 0, 3, 'invalid data') encoding = 'utf8' end = 3 object = '\xe1 intentado para ellos bastante sabios para discernir lo obvio. Tales perso' reason = 'invalid data' start = 0 which is the letter á (a with accent). So I tried with utf-16 and got this error: 141 file = codecs.open(p + "2.txt", "r", "utf-16") 142 for line in file: 143 print line 144 *line* = '\r\n', *file* = <open file 'index2.txt', mode 'rb'> /usr/lib64/python2.4/codecs.py in *next*(self=<open file 'index2.txt', mode 'rb'>) 492 493 """ Return the next decoded line from the input stream.""" 494 return self.reader.next() 495 496 def __iter__(self): *self* = <open file 'index2.txt', mode 'rb'>, self.*reader* = <open file 'index2.txt', mode 'rb'>, self.reader.*next* = <bound method StreamReader.next of <open file 'index2.txt', mode 'rb'>> /usr/lib64/python2.4/codecs.py in *next*(self=<open file 'index2.txt', mode 'rb'>) 429 430 """ Return the next decoded line from the input stream.""" 431 line = self.readline() 432 if line: 433 return line line *undefined*, *self* = <open file 'index2.txt', mode 'rb'>, self.* readline* = <bound method StreamReader.readline of <open file 'index2.txt', mode 'rb'>> /usr/lib64/python2.4/codecs.py in *readline*(self=<open file 'index2.txt', mode 'rb'>, size=None, keepends=True) 344 # If size is given, we call read() only once 345 while True: 346 data = self.read(readsize, firstline=True) 347 if data: 348 # If we're at a "\r" read one extra character (which might data *undefined*, *self* = <open file 'index2.txt', mode 'rb'>, self.*read* = <bound method StreamReader.read of <open file 'index2.txt', mode 'rb'>>, * readsize* = 72, 
firstline *undefined*, *builtin* *True* = True /usr/lib64/python2.4/codecs.py in *read*(self=<open file 'index2.txt', mode 'rb'>, size=72, chars=-1, firstline=True) 291 data = self.bytebuffer + newdata 292 try: 293 newchars, decodedbytes = self.decode(data, self.errors) 294 except UnicodeDecodeError, exc: 295 if firstline: newchars *undefined*, decodedbytes *undefined*, *self* = <open file 'index2.txt', mode 'rb'>, self.*decode* = <bound method StreamReader.decode of <open file 'index2.txt', mode 'rb'>>, *data* = '<span class="text">\r\n<i>Noticia: Este sitio web entre este portal est\xe1 i', self.*errors* = 'strict' /usr/lib64/python2.4/encodings/utf_16.py in *decode*(self=<open file 'index2.txt', mode 'rb'>, input='<span class="text">\r\n<i>Noticia: Este sitio web entre este portal est\xe1 i', errors='strict') 47 self.decode = codecs.utf_16_be_decode 48 elif consumed>=2: 49 raise UnicodeError,"UTF-16 stream does not start with BOM" 50 return (object, consumed) 51 *builtin* *UnicodeError* = <class exceptions.UnicodeError> *UnicodeError*: UTF-16 stream does not start with BOM args = ('UTF-16 stream does not start with BOM',)
-- http://mail.python.org/mailman/listinfo/python-list