Thanks Jean, I have thought about adding docstrings several times, but I was stumped at how to tell a docstring from a regular triple-quoted string ;) I have been thinking hard about the problem and I think I have an idea: if the line has nothing before the start of the string, it must be a docstring. Sounds simple enough, but in Python there are 12 or so 'types' of strings. Here is my crack at it; feel free to improve it ;) I reversed the logic on the comments and docstrings so I could add a special mode to docstring stripping... pep8 mode. Pep8 mode only strips double triple quotes from your source code, leaving the offending single triple quotes behind. Probably just stupid, but someone might find it useful.
######################################################################
# Python source stripper
######################################################################
import os
import sys
import token
import keyword
import tokenize
import traceback

try:
    import StringIO  # Python 2: module that provides StringIO.StringIO
except ImportError:
    # Python 3: io.StringIO offers the same in-memory text file interface,
    # so the module is aliased to keep the ``StringIO.StringIO`` spelling.
    import io as StringIO

__credits__ = '''
Jürgen Hermann
M.E.Farmer
Jean Brouwers
'''
__version__ = '.8'
__author__ = 'M.E.Farmer'
__date__ = 'Apr 16, 2005,' \
           'Jan 15 2005,' \
           'Oct 24 2004'


######################################################################

class Stripper:
    """Python source stripper.

    Re-emits Python source text token by token, optionally dropping
    comments and docstrings, collapsing runs of blank lines, expanding
    tabs and normalising the end-of-line convention.
    """

    def __init__(self, raw):
        """Remember the source text to process.

        raw -- the complete Python source as a single string.
        """
        self.raw = raw

    def format(self, out=sys.stdout, comments=0, docstrings=0,
               spaces=1, untabify=1, eol='unix'):
        """Strip the source given to the constructor and write it to *out*.

        out        -- file-like object with a write() method.
        comments   -- true: remove comments.
        docstrings -- true: remove triple-quoted strings that start a
                      line; 'pep8' or '8': remove only the ones written
                      with three double quotes and keep the
                      three-single-quote ones.
        spaces     -- maximum number of consecutive blank lines to keep;
                      0 removes all blank lines, -1 keeps every line.
        untabify   -- true: expand tabs to spaces before processing.
        eol        -- 'mac' (CR), 'win' (CR LF), anything else: unix (LF).

        A tokenize.TokenError raised while scanning is printed as a
        traceback and whatever was collected so far is still written.

        NOTE: every output line is right-stripped and blank-line limiting
        works on physical lines, so this also applies to lines that lie
        inside multi-line string literals that are kept.
        """
        # Store line offsets in self.lines.  Token rows are 1-based, so
        # index 0 is a dummy entry and index 1 is the first line.
        self.lines = [0, 0]
        pos = 0

        # Counts consecutive blank lines; starting at 1 makes a leading
        # blank line count as the second one in a run.
        self.lasttoken = 1
        self.temp = StringIO.StringIO()
        self.spaces = spaces
        self.comments = comments
        self.docstrings = docstrings

        if untabify:
            self.raw = self.raw.expandtabs()
        # The text always ends in exactly one space and never in a newline
        # (presumably to give the tokenizer a well-defined last line --
        # TODO confirm the original intent).
        self.raw = self.raw.rstrip() + ' '
        self.out = out

        # Have you ever had a multiple line ending script?
        # They can be nasty so lets get them all the same.
        self.raw = self.raw.replace('\r\n', '\n')
        self.raw = self.raw.replace('\r', '\n')
        self.lineend = '\n'

        # Gather the offset at which every line starts.
        while 1:
            pos = self.raw.find(self.lineend, pos) + 1
            if not pos:
                break
            self.lines.append(pos)
        # The end-of-file tokens are reported one row past the last line;
        # map that row to the end of the text.
        self.lines.append(len(self.raw))
        self.pos = 0

        # Wrap text in a filelike object and feed every token to __call__.
        # generate_tokens() exists in Python 2 and 3 alike, unlike the
        # callback form of tokenize.tokenize().
        text = StringIO.StringIO(self.raw)
        try:
            for tok in tokenize.generate_tokens(text.readline):
                self(*tok)
        except tokenize.TokenError:
            traceback.print_exc()

        # Ok now we write it to a file
        # but we also need to clean the whitespace
        # between the lines and at the ends.
        self.temp.seek(0)

        # All this should be written into the
        # __call__ method just haven't yet...
        if eol == 'mac':
            # Mac CR
            self.lineend = '\r'
        elif eol == 'win':
            # Windows CR LF
            self.lineend = '\r\n'
        else:
            # Unix LF
            self.lineend = '\n'

        for line in self.temp.readlines():
            if spaces == -1:
                self.out.write(line.rstrip() + self.lineend)
            elif not line.isspace():
                self.lasttoken = 0
                self.out.write(line.rstrip() + self.lineend)
            else:
                # Blank line: keep it only while the run is short enough.
                self.lasttoken += 1
                if self.lasttoken <= self.spaces and self.spaces:
                    self.out.write(self.lineend)

    def _is_docstring(self, toktext, prefix):
        """Tell whether a STRING token should be removed as a docstring.

        toktext -- the text of the string token.
        prefix  -- the source text between the start of the token's line
                   and the start of the token.

        NOTE: this is a heuristic.  Any triple-quoted string with nothing
        but whitespace to its left counts, including one that starts a
        continuation line, and removing a docstring that is the only
        statement of a block leaves that block empty.
        """
        # Something other than whitespace precedes the string, so it is
        # part of a larger expression or statement.
        if prefix.strip():
            return False
        body = toktext.lstrip('rRuU')
        if body.startswith('"""'):
            return True
        # pep8 mode leaves the three-single-quote strings in place.
        if self.docstrings == 'pep8' or self.docstrings == '8':
            return False
        return body.startswith("'''")

    def __call__(self, toktype, toktext, start, end, line):
        """Token handler: copy one token into the temporary buffer.

        toktype -- token type constant from token / tokenize.
        toktext -- the text of the token.
        start   -- (row, column) where the token begins.
        end     -- (row, column) where the token ends (unused).
        line    -- the physical line(s) holding the token (unused).
        """
        srow, scol = start

        # calculate new positions
        oldpos = self.pos
        linestart = self.lines[srow]
        newpos = linestart + scol
        self.pos = newpos + len(toktext)

        # kill comments
        if self.comments and toktype == tokenize.COMMENT:
            return

        # kill doc strings
        if self.docstrings and toktype == tokenize.STRING:
            if self._is_docstring(toktext, self.raw[linestart:newpos]):
                return

        # handle newlines
        if toktype in (token.NEWLINE, tokenize.NL):
            self.temp.write(self.lineend)
            return

        # send the original whitespace
        if newpos > oldpos:
            self.temp.write(self.raw[oldpos:newpos])

        # skip indenting tokens; their text is the whitespace that was
        # either just copied or will be copied with the next token
        if toktype in (token.INDENT, token.DEDENT):
            self.pos = newpos
            return

        # send text to the temp file
        self.temp.write(toktext)
        return


######################################################################

def Main():
    """Strip docstrings from the file named by the first command-line
    argument and write the result to standard output with Windows line
    endings.
    """
    import sys
    if len(sys.argv) < 2:
        sys.stderr.write('usage: %s <python-source-file>\n' % sys.argv[0])
        return
    if sys.argv[1]:
        source = open(sys.argv[1])
        try:
            filein = source.read()
        finally:
            source.close()
        Stripper(filein).format(out=sys.stdout, comments=0, docstrings=1,
                                untabify=1, eol='win')


######################################################################

if __name__ == '__main__':
    Main()

# Mailing-list footer of the original post:
# -- http://mail.python.org/mailman/listinfo/python-list