On Thu, Dec 22, 2011 at 11:30 AM, Rami Chowdhury
wrote:
> Could you try using the 'open' function from the 'codecs' module?
>
I believe this is what you meant:
file = codecs.open(p + "2.txt", "r", "utf-8")
for line in file:
    print line
but I got this error:
141 file = codecs.open(p + "2.txt", "r", "utf-8")
142 for line in file:
143 print line
144
*line* = '\r\n', *file* =
/usr/lib64/python2.4/codecs.py in *next*(self=) 492
493 """ Return the next decoded line from the input stream."""
494 return self.reader.next()
495
496 def __iter__(self):
*self* = , self.*reader* = , self.reader.*next* = >
/usr/lib64/python2.4/codecs.py in *next*(self=) 429
430 """ Return the next decoded line from the input stream."""
431 line = self.readline()
432 if line:
433 return line
line *undefined*, *self* = , self.*readline* = >
/usr/lib64/python2.4/codecs.py in *readline*(self=, size=None, keepends=True)
344 # If size is given, we call read() only once
345 while True:
346 data = self.read(readsize, firstline=True)
347 if data:
348 # If we're at a "\r" read one extra character (which might
data *undefined*, *self* = , self.*read* = >, *readsize* = 72, firstline *undefined*, *builtin* *True* = True
/usr/lib64/python2.4/codecs.py in *read*(self=, size=72, chars=-1, firstline=True)
291 data = self.bytebuffer + newdata
292 try:
293 newchars, decodedbytes = self.decode(data, self.errors)
294 except UnicodeDecodeError, exc:
295 if firstline:
*newchars* = u'', *decodedbytes* = 0, *self* = , self.*decode* = , *data* = '\xe1intentado para ellos bastante sabios para discernir lo obvio. Tales perso', self.*errors* = 'strict'
*UnicodeDecodeError*: 'utf8' codec can't decode bytes in position 0-2: invalid data
args = ('utf8', '\xe1 intentado para ellos bastante sabios para discernir lo obvio. Tales perso', 0, 3, 'invalid data')
encoding = 'utf8'
end = 3
object = '\xe1 intentado para ellos bastante sabios para discernir lo obvio. Tales perso'
reason = 'invalid data'
start = 0
The byte it cannot decode, '\xe1', is the letter á (a with an acute accent).
So I tried utf-16 instead and got this error:
141 file = codecs.open(p + "2.txt", "r", "utf-16")
142 for line in file:
143 print line
144
*line* = '\r\n', *file* =
/usr/lib64/python2.4/codecs.py in *next*(self=) 492
493 """ Return the next decoded line from the input stream."""
494 return self.reader.next()
495
496 def __iter__(self):
*self* = , self.*reader* = , self.reader.*next* = >
/usr/lib64/python2.4/codecs.py in *next*(self=) 429
430 """ Return the next decoded line from the input stream."""
431 line = self.readline()
432 if line:
433 return line
line *undefined*, *self* = , self.*readline* = >
/usr/lib64/python2.4/codecs.py in *readline*(self=, size=None, keepends=True)
344 # If size is given, we call read() only once
345 while True:
346 data = self.read(readsize, firstline=True)
347 if data:
348 # If we're at a "\r" read one extra character (which might
data *undefined*, *self* = , self.*read* = >, *readsize* = 72, firstline *undefined*, *builtin* *True* = True
/usr/lib64/python2.4/codecs.py in *read*(self=, size=72, chars=-1, firstline=True)
291 data = self.bytebuffer + newdata
292 try:
293 newchars, decodedbytes = self.decode(data, self.errors)
294 except UnicodeDecodeError, exc:
295 if firstline:
newchars *undefined*, decodedbytes *undefined*, *self* = , self.*decode* = >, *data* = '\r\nNoticia: Este sitio web entre este portal est\xe1 i', self.*errors* = 'strict'
/usr/lib64/python2.4/encodings/utf_16.py in *decode*(self=, input='\r\nNoticia: Este sitio web entre este portal est\xe1 i', errors='strict')
47 self.decode = codecs.utf_16_be_decode
48 elif consumed>=2:
49 raise UnicodeError,"UTF-16 stream does not start with BOM"
50 return (object, consumed)
51
*builtin* *UnicodeError* =
*UnicodeError*: UTF-16 stream does not start with BOM
args = ('UTF-16 stream does not start with BOM',)
--
http://mail.python.org/mailman/listinfo/python-list