On 4 Jul 2006 08:38:47 -0700, Gaurav Agarwal <[EMAIL PROTECTED]> wrote: > Thanks Steven. Actually, I wanted to do text processing for my office, > where I can view all files in the system and use the first three to > give a summary of the document, instead of having somebody actually > enter the summary. It seems there is no single piece of code that can act as > a converter across formats, so I'll have to check out converters for > individual formats.
# I have some old code that does just that. It uses pdftotext, catdoc and
# links to convert .doc, .pdf and .html to text.
##################################################################
import mimetypes
import os
from subprocess import call, Popen, PIPE
import sys


class ConversionError(Exception):
    """Base class for every error raised while converting a document."""


class UnknownMimeType(ConversionError):
    """The MIME type was detected but no converter is registered for it."""


class NotAMimeType(ConversionError):
    """No MIME type could be guessed from the document's file name."""


class ParseError(ConversionError):
    """The external converter exited with a non-zero status."""


# External programs that must be reachable through ``which``.
REQUIRED_PROGRAMS = ("catdoc", "pdftotext", "links")


def has_program(progname):
    """Return True if ``which progname`` exits with status 0."""
    # Send the path printed by ``which`` to the null device instead of an
    # unread pipe; only the exit status matters here.
    devnull = open(os.devnull, "w")
    try:
        return call(["which", progname], stdout=devnull) == 0
    finally:
        devnull.close()


def check_requirements():
    """Report missing external programs.

    Prints the names of the missing programs, if any, and returns False;
    returns True when every program in REQUIRED_PROGRAMS was found.
    """
    missing = [prog for prog in REQUIRED_PROGRAMS if not has_program(prog)]
    if missing:
        print("You need to have the programs: " + " ".join(missing))
        return False
    return True


# The check runs at import time, so merely importing this module exits the
# process when a converter is missing.
if not check_requirements():
    print("Needed external programs not found, quitting")
    sys.exit(1)


def get_catdoc_args(infile):
    """Return the argument list used to run catdoc on *infile*."""
    return ["catdoc", "-s", "8859-1", infile]


def get_pdftotext_args(infile):
    """Return the argument list used to run pdftotext on *infile*."""
    # NOTE(review): the trailing "-" presumably selects stdout as the output
    # file; totext() reads the converted text from the child's stdout.
    return ["pdftotext", infile, "-"]


def get_links_args(infile):
    """Return the argument list used to run links on *infile*."""
    return ["links", infile, "-dump"]


# Maps a guessed MIME type to the function that builds the converter's
# command line. Built once instead of on every totext() call.
FILETYPE_TO_ARGS_MAP = {
    "application/msword": get_catdoc_args,
    "application/pdf": get_pdftotext_args,
    "text/html": get_links_args,
}


def totext(document):
    """Convert *document* to text with an external program.

    The converter is chosen from the MIME type that ``mimetypes.guess_type``
    derives from the file name.

    Returns:
        Whatever the converter wrote to its standard output, exactly as
        read from the pipe (a byte string).

    Raises:
        NotAMimeType: no MIME type could be guessed for *document*.
        UnknownMimeType: the MIME type has no registered converter.
        ParseError: the converter exited with a non-zero status.
        IOError: the converter failed and *document* cannot be opened.
    """
    ftype, _encoding = mimetypes.guess_type(document)
    if not ftype:
        raise NotAMimeType("Couldn't detect mimetype for %s" % document)

    try:
        argfunc = FILETYPE_TO_ARGS_MAP[ftype]
    except KeyError:
        raise UnknownMimeType("Don't know how to handle %s documents" % ftype)

    p = Popen(argfunc(document), stdout=PIPE, stderr=PIPE)
    # communicate() drains stdout and stderr together. Reading only stdout
    # and then waiting can block forever once the child fills the stderr
    # pipe.
    text, _errors = p.communicate()
    if p.returncode:
        # Force a better exception to be thrown if the file doesn't exist.
        open(document).close()
        raise ParseError("Failed to parse %s" % document)
    return text


if __name__ == "__main__":
    print(totext("testpdf.pdf"))

# -- mvh Björn
# -- http://mail.python.org/mailman/listinfo/python-list