Is there a Python script or tool available which can remove comments and docstrings from Python source?
It should take care of cases like:
"""
aas
"""
def f():
m = {
u'x':
u'y'
} # faake docstring ;)
if 1:
'string' >> m
if 2:
'string' , m
if 3:
'string' > m
So at last I have come up with a simple script, which uses the tokenize module and removes comment tokens. It seems to work pretty well, except that I am not able to remove docstrings in all cases. See if you can improve it to remove docstrings.
import cStringIO
import tokenize
def remove_comments(src):
"""
This reads tokens using tokenize.generate_tokens and recombines them
using tokenize.untokenize, and skipping comment/docstring tokens in between
"""
f = cStringIO.StringIO(src)
class SkipException(Exception): pass
processed_tokens = []
last_token = None
# go thru all the tokens and try to skip comments and docstrings
for tok in tokenize.generate_tokens(f.readline):
t_type, t_string, t_srow_scol, t_erow_ecol, t_line = tok
try:
if t_type == tokenize.COMMENT:
raise SkipException()
elif t_type == tokenize.STRING:
if last_token is None or last_token[0] in [tokenize.INDENT]:
# FIXEME: this may remove valid strings too?
#raise SkipException()
pass
except SkipException:
pass
else:
processed_tokens.append(tok)
last_token = tok
return tokenize.untokenize(processed_tokens)
Also I would like to test it on a very large collection of scripts with good unit test coverage. Can you suggest such a open source project?
This does the job:
""" Strip comments and docstrings from a file.
"""
import sys, token, tokenize
def do_file(fname):
""" Run on just one file.
"""
source = open(fname)
mod = open(fname + ",strip", "w")
prev_toktype = token.INDENT
first_line = None
last_lineno = -1
last_col = 0
tokgen = tokenize.generate_tokens(source.readline)
for toktype, ttext, (slineno, scol), (elineno, ecol), ltext in tokgen:
if 0: # Change to if 1 to see the tokens fly by.
print("%10s %-14s %-20r %r" % (
tokenize.tok_name.get(toktype, toktype),
"%d.%d-%d.%d" % (slineno, scol, elineno, ecol),
ttext, ltext
))
if slineno > last_lineno:
last_col = 0
if scol > last_col:
mod.write(" " * (scol - last_col))
if toktype == token.STRING and prev_toktype == token.INDENT:
# Docstring
mod.write("#--")
elif toktype == tokenize.COMMENT:
# Comment
mod.write("##\n")
else:
mod.write(ttext)
prev_toktype = toktype
last_col = ecol
last_lineno = elineno
if __name__ == '__main__':
do_file(sys.argv[1])
I'm leaving stub comments in the place of docstrings and comments since it simplifies the code. If you remove them completely, you also have to get rid of indentation before them.