Comment extractor
An answer to this question on Stack Overflow.
Question
I am looking for a command line tool or a library for C, C++, Python, or Node.js which can extract just the comments from source files in a wide variety of languages.
For instance, given "bob.c":
int main(){ //Here is a comment
int i=3; /*Another comment*/
}
The following should be returned:
Here is a comment
Another comment
Possibly with line numbers included.
This should work for "bob.py", "bob.js", "bob.css", "bob.rb", "bob.asm", and so on.
This question differs from this other one because I am interested not only in C-style comments, but others as well.
Additionally, I am deeply suspicious of regular expressions as a solution. Comment-esque phrases can be sited within quoted text in deeply convoluted ways; I have yet to see a regex solution on SO addressing this.
Answer
With helpful suggestions from Ira Baxter, I tracked down Pygments via searching for lexers.
Pygments understands a massive number of languages and converts input in any one of these languages to standardized HTML output appropriate for highlighting.
The following takes a path to a directory, searches it recursively for code files, and returns a dictionary of filenames and the comments within each file:
import glob
import io
import os
import pathlib
import git
from pygments.formatter import Formatter
import pygments
import pygments.lexers
class CommentExtractor(Formatter):
    """Pygments formatter that emits only the text of comment tokens."""

    def __init__(self, **options):
        # No extra state; simply forward all options to the base Formatter.
        super().__init__(**options)

    def format(self, tokensource, outfile):
        """Write the value of every comment token in ``tokensource`` to ``outfile``.

        ``tokensource`` is iterated as ``(token_type, text)`` pairs; pairs whose
        type is not contained in ``pygments.token.Comment`` are skipped.
        """
        comment_family = pygments.token.Comment
        for token_type, text in tokensource:
            if token_type not in comment_family:
                continue
            outfile.write(text)
def GetCommentsFromFile(path):
    """Return the concatenated comment text found in the file at ``path``.

    The lexer is selected from the file name with
    ``pygments.lexers.get_lexer_for_filename``; any exception raised by that
    lookup propagates to the caller. The file is opened in text mode with no
    explicit encoding, so Python's default text encoding applies.

    Args:
        path: Path of the source file to scan.

    Returns:
        A string holding everything ``CommentExtractor`` wrote, i.e. the
        values of the file's comment tokens in order.
    """
    # Resolve the lexer first so an unsupported file is rejected before it
    # is opened.
    lexer = pygments.lexers.get_lexer_for_filename(path)

    # Read inside a context manager so the handle is closed immediately
    # rather than whenever the garbage collector gets to it; this matters
    # when the function is called for every file in a large tree.
    with open(path, 'r') as source_file:
        code = source_file.read()

    comments = io.StringIO()
    pygments.highlight(
        code=code,
        lexer=lexer,
        formatter=CommentExtractor(),
        outfile=comments,
    )
    return comments.getvalue()
def GetCommentsFromFiles(rootpath, excluded):
    """Collect the comments of every eligible file beneath ``rootpath``.

    Directories and files whose names start with ``.`` are ignored, as are
    files whose suffix (as reported by ``pathlib.Path(...).suffix``) is in
    ``excluded``. Files for which ``GetCommentsFromFile`` raises
    ``pygments.util.ClassNotFound`` are left out of the result.

    Args:
        rootpath: Directory at which the recursive walk starts.
        excluded: Container of file suffixes (e.g. ``".md"``) to skip.

    Returns:
        A dict mapping each processed file's joined path to the comment
        text returned by ``GetCommentsFromFile``.
    """
    comments_by_path = {}
    for folder, subfolders, names in os.walk(rootpath):
        # Slice-assign so the list os.walk holds is pruned in place; this is
        # what stops the walk from descending into hidden directories.
        subfolders[:] = [sub for sub in subfolders if not sub.startswith('.')]

        for name in names:
            # Hidden files and excluded suffixes are both skipped outright.
            if name.startswith('.') or pathlib.Path(name).suffix in excluded:
                continue

            full_path = os.path.join(folder, name)
            try:
                extracted = GetCommentsFromFile(full_path)
            except pygments.util.ClassNotFound:
                # Best-effort scan: this file is silently omitted.
                continue
            comments_by_path[full_path] = extracted
    return comments_by_path
# File suffixes that the scan should leave out.
excluded_files_types = {
    ".md",
    ".yml",
    ".bat",
    ".sh",
}

# Map each scanned file path to its extracted comments. The root path string
# looks like a placeholder to be replaced with a real directory -- confirm
# before running.
files_and_comments = GetCommentsFromFiles(
    "root_of_code_directories",
    excluded_files_types,
)