I'm using the code below to read a PDF document, and the imported text contains no line feeds or carriage returns. I'm therefore trying to replace the symbol that appears to mark the end of each line, unichr(167) (the section sign, which I found by examining the characters in the "for" loop), with a newline. Unfortunately, the replace isn't working. Does anyone know what I'm doing wrong? I tried a number of things, so I left some of them in place as comments; they are a subset of everything I tried, to no avail.
Any help? Kurt #!/usr/bin/python # -*- coding: utf-8 -+- from pyPdf import PdfFileWriter, PdfFileReader import unicodedata fileencoding = "utf-16-LE" #"iso-8859-1" # "utf-8" doc = PdfFileReader(file(r"C:\Documents and Settings\kpeters\My Documents \SUA.pdf", "rb") # print the title of document1.pdf print "title = %s" % (doc.getDocumentInfo().title) print "Subject:", doc.getDocumentInfo().subject print "PDF Version:", doc.getDocumentInfo().producer page4 = doc.getPage(3) textu= page4.extractText() #textu=textu.decode(fileencoding) print type(textu) #print type(textu.encode(fileencoding)) #textu=textu.encode(fileencoding) #Converts to str fn = unichr(167) print('The char is %s' % fn) textu.replace(unichr(167),'\n') #print unicodedata.bidirectional(fn) unichr(167) for i, c in enumerate(textu): if (i!=302): print('# %d has char %s, ord: %d , char: %s, category %s, and Name: %s' % (i, c, ord(c), unichr(ord(c)), unicodedata.category(c), unicodedata.name(c))) #if (ord(c)==167): # print('Found it!') #textu[i]='\n' print('----------------------------------------------------') print textu print textu.encode(fileencoding) -- http://mail.python.org/mailman/listinfo/python-list