There is an issue with both my code and yours: it only works if docstrings are triple-quoted and if there are no other triple-quoted strings in the Python code.
A triple-quoted string used in an assignment will be removed, for example in this case: s = '''this string should not be removed'''. It is still unclear how to distinguish docstrings from other strings. Also, I have not checked the precise Python syntax, but docstrings do not need to be enclosed by triple quotes; a single quote may be allowed too. Maybe this rule will work: a docstring is any string preceded by a COLON token followed by zero, one or more INDENT or NEWLINE tokens. Untested! /Jean Brouwers M.E.Farmer wrote: > Thanks Jean, > I have thought about adding docstrings several times, but I was stumped > at how to distinguish a docstring from a regular triple-quoted string ;) > I have been thinking hard about the problem and I think I have an idea. > If the line has nothing before the start of the string, it must be a > docstring. > Sounds simple enough, but in Python there are 12 or so 'types' of > strings. > Here is my crack at it; feel free to improve it ;) > I reversed the logic on the comments and docstrings so I could add a > special mode to docstring stripping ...pep8 mode. > Pep8 mode only strips double triple quotes from your source code, > leaving the offending single triple quotes behind. Probably just stupid, > but someone might find it useful. 
######################################################################
# Python source stripper
######################################################################

import sys
import token
import tokenize
import traceback
from io import StringIO

__credits__ = '''
Jürgen Hermann
M.E.Farmer
Jean Brouwers
'''
__version__ = '.8'
__author__ = 'M.E.Farmer'
__date__ = 'Apr 16, 2005,' \
           'Jan 15 2005,' \
           'Oct 24 2004'


######################################################################

class Stripper:
    """Strip comments, docstrings and extra whitespace from Python source.

    The raw source text is tokenized and re-emitted, dropping whichever
    token categories the caller asked to remove.
    """

    def __init__(self, raw):
        # Raw source text to be processed by format().
        self.raw = raw

    def format(self, out=sys.stdout, comments=0, docstrings=0,
               spaces=1, untabify=1, eol='unix'):
        """Write the stripped source to *out*.

        out        -- writable file-like object for the result.
        comments   -- if true, drop comment tokens.
        docstrings -- if true, drop docstrings; the special values
                      'pep8' / '8' drop only double-triple-quoted ones,
                      leaving '''...''' strings behind.
        spaces     -- maximum number of consecutive blank lines to keep;
                      -1 keeps every line (only right-stripped).
        untabify   -- if true, expand tabs to spaces first.
        eol        -- 'unix', 'win' or 'mac' line endings on output.
        """
        # Store line start offsets in self.lines; token rows are
        # 1-based, so an extra leading 0 keeps the indices aligned.
        self.lines = [0, 0]
        pos = 0
        # Starting at 1 strips any leading blank line of the output.
        self.lasttoken = 1
        self.temp = StringIO()
        self.spaces = spaces
        self.comments = comments
        self.docstrings = docstrings

        if untabify:
            self.raw = self.raw.expandtabs()
        self.raw = self.raw.rstrip() + ' '
        self.out = out

        # Scripts can arrive with mixed line endings; normalize them
        # all to '\n' so the offsets computed below stay consistent.
        self.raw = self.raw.replace('\r\n', '\n')
        self.raw = self.raw.replace('\r', '\n')
        self.lineend = '\n'

        # Gather the character offset of the start of each line.
        while 1:
            pos = self.raw.find(self.lineend, pos) + 1
            if not pos:
                break
            self.lines.append(pos)
        self.lines.append(len(self.raw))
        self.pos = 0

        # Wrap the text in a file-like object for the tokenizer.
        text = StringIO(self.raw)

        # Parse the source; __call__ filters each token into self.temp.
        try:
            for tok in tokenize.generate_tokens(text.readline):
                self(tok.type, tok.string, tok.start, tok.end, tok.line)
        except tokenize.TokenError:
            traceback.print_exc()

        # Now copy the buffered result to the output stream, cleaning
        # up trailing whitespace and surplus blank lines on the way.
        self.temp.seek(0)

        # Mac CR / Windows CR LF / Unix LF.
        if eol == 'mac':
            self.lineend = '\r'
        elif eol == 'win':
            self.lineend = '\r\n'
        else:
            self.lineend = '\n'

        for line in self.temp.readlines():
            if spaces == -1:
                self.out.write(line.rstrip() + self.lineend)
            else:
                if not line.isspace():
                    self.lasttoken = 0
                    self.out.write(line.rstrip() + self.lineend)
                else:
                    self.lasttoken += 1
                    # Keep at most self.spaces consecutive blank lines.
                    if self.lasttoken <= self.spaces and self.spaces:
                        self.out.write(self.lineend)

    def __call__(self, toktype, toktext, start, end, line):
        """Token handler: filter one token into the temp buffer."""
        srow, scol = start
        # Calculate old/new character positions in the raw text.
        oldpos = self.pos
        newpos = self.lines[srow] + scol
        self.pos = newpos + len(toktext)

        # Kill comments.
        if self.comments:
            if toktype == tokenize.COMMENT:
                return

        # Kill docstrings.
        if self.docstrings:
            # Heuristic: if there is nothing to the left of the string
            # on its line (string-prefix letters aside) assume it is a
            # docstring.  NOTE(review): as discussed in the thread, this
            # also removes any triple-quoted expression statement that
            # merely starts a line — it is not a true docstring test.
            if toktype == tokenize.STRING and \
               line.lstrip(' rRuU')[0] in ("'", '"'):
                t = toktext.lstrip('rRuU')
                if (t.startswith('"""') and
                        (self.docstrings == 'pep8' or
                         self.docstrings == '8')):
                    return
                elif t.startswith('"""') or t.startswith("'''"):
                    return

        # Handle newlines.
        if toktype in (token.NEWLINE, tokenize.NL):
            self.temp.write(self.lineend)
            return

        # Send the original whitespace between the previous token and
        # this one.
        if newpos > oldpos:
            self.temp.write(self.raw[oldpos:newpos])

        # Skip indenting tokens; their whitespace is already handled.
        if toktype in (token.INDENT, token.DEDENT):
            self.pos = newpos
            return

        # Send the token text to the temp buffer.
        self.temp.write(toktext)
        return


######################################################################

def Main():
    """Command-line entry point: strip the file named in argv[1]."""
    # Guard against a missing argument instead of raising IndexError.
    if len(sys.argv) > 1:
        with open(sys.argv[1]) as f:
            filein = f.read()
        Stripper(filein).format(out=sys.stdout, comments=0,
                                docstrings=1, untabify=1, eol='win')

######################################################################

if __name__ == '__main__':
    Main()