On 4 Jul 2006 08:38:47 -0700, Gaurav Agarwal wrote:
> Thanks Steven. Actually, I wanted to do text processing for my office
> where I can view all files in the system and use the first three to
> give a summary of the document, instead of having somebody actually
> entering the summary. It seems there is no single piece of code that can
> act as a converter across formats; I'll have to check out converters for
> individual formats.

I have some old code that does just that. It uses catdoc, pdftotext
and links to convert .doc, .pdf and .html files, respectively, to text.

import mimetypes
from subprocess import call, Popen, PIPE
import sys

class ConversionError(Exception):
    """Base class for every error raised while converting a document."""


class UnknownMimeType(ConversionError):
    """A MIME type was detected, but no converter is registered for it."""


class NotAMimeType(ConversionError):
    """No MIME type could be guessed for the document."""


class ParseError(ConversionError):
    """The external converter program reported a failure."""

def has_program(progname):
    """Return True if the ``which`` command can locate *progname*.

    The lookup is delegated to the external ``which`` utility; its exit
    status decides the result and its output is discarded.
    """
    # communicate() drains both pipes before waiting, so the child can
    # never block on a full pipe, and nothing from ``which`` reaches the
    # caller's terminal.
    finder = Popen(["which", progname], stdout=PIPE, stderr=PIPE)
    finder.communicate()
    return finder.returncode == 0

def check_requirements():
    """Check that catdoc, pdftotext and links are all installed.

    Prints the names of any missing programs and returns False when at
    least one is absent; returns True when all three are found.
    """
    missing = [prog for prog in ("catdoc", "pdftotext", "links")
               if not has_program(prog)]
    if missing:
        # Single-argument print produces the same output on Python 2 and 3.
        print("You need to have the programs: " + " ".join(missing))
        return False
    return True

# Stop right away when a converter program is unavailable: the message
# announces that the script quits, so actually terminate with a failure
# status instead of carrying on.
if not check_requirements():
    print("Needed external programs not found, quitting")
    sys.exit(1)

def get_catdoc_args(infile):
    """Build the argument list for running catdoc on *infile*.

    The command passes ``-s 8859-1`` (catdoc's charset option) ahead of
    the input file name.
    """
    command = ["catdoc", "-s", "8859-1"]
    command.append(infile)
    return command

def get_pdftotext_args(infile):
    """Build the argument list for running pdftotext on *infile*.

    The trailing ``-`` is given in the output-file position.
    """
    program = "pdftotext"
    output_target = "-"
    return [program, infile, output_target]

def get_links_args(infile):
    """Build the argument list for running links on *infile* with ``-dump``."""
    command = ["links"]
    command.extend((infile, "-dump"))
    return command

def totext(document):
    """Convert *document* to text by running an external converter.

    The converter is picked from the MIME type that
    ``mimetypes.guess_type`` derives from *document*: catdoc for
    ``application/msword``, pdftotext for ``application/pdf`` and links
    for ``text/html``.

    Returns:
        Everything the converter wrote to its standard output, exactly as
        read from the pipe.

    Raises:
        NotAMimeType: no MIME type could be guessed for *document*.
        UnknownMimeType: the guessed MIME type has no converter.
        ParseError: the converter exited with a non-zero status.
    """
    filetype_to_args_map = {"application/msword": get_catdoc_args,
                            "application/pdf": get_pdftotext_args,
                            "text/html": get_links_args}

    ftype, _encoding = mimetypes.guess_type(document)
    if not ftype:
        raise NotAMimeType("Couldn't detect mimetype for %s" % document)
    try:
        argfunc = filetype_to_args_map[ftype]
    except KeyError:
        s = "Don't know how to handle %s documents" % ftype
        raise UnknownMimeType(s)

    p = Popen(argfunc(document), stdout=PIPE, stderr=PIPE)
    # communicate() reads stdout and stderr together; reading only stdout
    # and then waiting could hang once the child fills the stderr pipe.
    text, _errors = p.communicate()
    if p.returncode:
        # A failing converter (for instance when the file doesn't exist)
        # is reported as ParseError rather than returning partial output.
        raise ParseError("Failed to parse %s" % document)
    return text

if __name__ == "__main__":
    # Convert the file named on the command line, falling back to the
    # sample PDF when no argument is given, and print the extracted text.
    path = sys.argv[1] if len(sys.argv) > 1 else "testpdf.pdf"
    print(totext(path))

mvh Björn

Reply via email to