Created
December 15, 2015 13:43
-
-
Save sils/53c2f1a999e49983ffcf to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import re | |
from coalib.bearlib.languages.documentation.DocstyleDefinition import ( | |
DocstyleDefinition) | |
from coalib.bearlib.languages.documentation.DocumentationComment import ( | |
DocumentationComment) | |
from coalib.results.TextRange import TextRange | |
# Used to break out of outer loops via exception raise. | |
class _BreakOut(Exception): | |
pass | |
def _compile_multi_match_regex(strings): | |
""" | |
Compiles a regex object that matches each of the given strings. | |
:param strings: The strings to match. | |
:return: A regex object. | |
""" | |
return re.compile("|".join(re.escape(s) for s in strings)) | |
def get_matching_marker_sets(content, line: str, start_column, markers: [(str, str, str)]) -> [(str, str, str)]: | |
raise NotImplementedError | |
def get_comment(content: [str], line: int, column: int, markers: (str, str, str)): | |
matching_marker_sets = get_matching_marker_sets(content, line, column, markers) | |
for marker in matching_marker_sets: | |
textrange = extract_comment(content, i, marker) | |
if textrange: | |
return textrange.end.line, textrange.end.column, textrange | |
return line+1, 0, None | |
def pseuo_coe(content: [str], markers: [(str, str, str)]): | |
line, column = 0, 0 | |
while line < len(content): | |
line, column, comment = get_comment(content, line, column, markers) | |
if comment: | |
yield comment | |
def extract_documentation_with_docstyle(content, docstyle_definition): | |
""" | |
Extracts all documentation texts inside the given source-code-string. | |
:param content: The source-code-string where to extract | |
documentation from or an iterable with strings | |
where each string is a single line (including | |
ending whitespaces like `\\n`). | |
:param docstyle_definition: The DocstyleDefinition that identifies the | |
documentation comments. | |
:return: An iterator returning each documentation text | |
found in the content. | |
""" | |
# Prepare marker-tuple dict that maps a begin pattern to the corresponding | |
# marker_set(s). This makes it faster to retrieve a marker-set from a | |
# begin sequence we initially want to search for in source code. Then | |
# the possible found documentation match is processed further with the | |
# rest markers. | |
begin_sequence_dict = {} | |
for marker_set in docstyle_definition.markers: | |
if marker_set[0] not in begin_sequence_dict: | |
begin_sequence_dict[marker_set[0]] = [marker_set] | |
else: | |
begin_sequence_dict[marker_set[0]].append(marker_set) | |
# Using regexes to perform a variable match is faster than finding each | |
# substring with `str.find()` choosing the lowest match. | |
begin_regex = _compile_multi_match_regex( | |
marker_set[0] for marker_set in docstyle_definition.markers) | |
line = 0 | |
line_pos = 0 | |
while line < len(content): | |
begin_match = begin_regex.search(content[line], line_pos) | |
if begin_match: | |
begin_match_line = line | |
# Prevents infinite loop when the start marker matches but not the | |
# complete documentation comment. | |
line_pos = begin_match.end() | |
# begin_sequence_dict[begin_match.group()] returns the marker set | |
# the begin sequence from before matched. | |
for marker_set in begin_sequence_dict[begin_match.group()]: | |
try: | |
# If the each-line marker and the end marker do equal, | |
# search for the each-line marker until it runs out. | |
if marker_set[1] == marker_set[2]: | |
docstring = content[line][begin_match.end():] | |
line2 = line + 1 | |
stripped_content = content[line2].lstrip() | |
# Now the each-line marker is no requirement for a | |
# docstring any more, just extract as long as there are | |
# no each-line markers any more. | |
while (stripped_content[:len(marker_set[1])] == | |
marker_set[1]): | |
docstring += stripped_content[len(marker_set[1]):] | |
line2 += 1 | |
if line2 >= len(content): | |
# End of content reached, done with | |
# doc-extraction. | |
break | |
stripped_content = content[line2].lstrip() | |
line = line2 - 1 | |
line_pos = len(content[line]) | |
else: | |
end_marker_pos = content[line].find(marker_set[2], | |
begin_match.end()) | |
if end_marker_pos == -1: | |
docstring = content[line][begin_match.end():] | |
line2 = line + 1 | |
if line2 >= len(content): | |
continue | |
end_marker_pos = content[line2].find(marker_set[2]) | |
while end_marker_pos == -1: | |
if marker_set[1] == "": | |
# When no each-line marker is set (i.e. for | |
# Python docstrings), then align the | |
# comment to the start-marker. | |
stripped_content = ( | |
content[line2][begin_match.start():]) | |
else: | |
# Check whether we violate the each-line | |
# marker "rule". | |
current_each_line_marker = (content[line2] | |
[begin_match.start(): | |
begin_match.start() | |
+ len(marker_set[1])]) | |
if (current_each_line_marker != | |
marker_set[1]): | |
# Effectively a 'continue' for the | |
# outer for-loop. | |
raise _BreakOut | |
stripped_content = ( | |
content[line2][begin_match.start() | |
+ len(marker_set[1]):]) | |
docstring += stripped_content | |
line2 += 1 | |
if line2 >= len(content): | |
# End of content reached, so there's no | |
# closing marker and that's a mismatch. | |
raise _BreakOut | |
end_marker_pos = content[line2].find( | |
marker_set[2]) | |
docstring += (content[line2] | |
[begin_match.start():end_marker_pos]) | |
line = line2 | |
else: | |
docstring = (content[line] | |
[begin_match.end():end_marker_pos]) | |
line_pos = end_marker_pos + len(marker_set[2]) | |
rng = TextRange.from_values(begin_match_line + 1, | |
begin_match.start() + 1, | |
line + 1, | |
line_pos + 1) | |
yield DocumentationComment(docstring, | |
docstyle_definition, | |
marker_set, | |
rng) | |
break | |
except _BreakOut: | |
# Continues the marker_set loop. | |
pass | |
else: | |
line += 1 | |
line_pos = 0 | |
def extract_documentation(content, language, docstyle): | |
""" | |
Extracts all documentation texts inside the given source-code-string using | |
the coala docstyle definition files. | |
The documentation texts are sorted by their order appearing in `content`. | |
For more information about how documentation comments are identified and | |
extracted, see DocstyleDefinition.doctypes enumeration. | |
:param content: The source-code-string where to extract | |
documentation from. | |
:param language: The programming language used. | |
:param docstyle: The documentation style/tool used | |
(i.e. doxygen). | |
:raises FileNotFoundError: Raised when the docstyle definition file was not | |
found. This is a compatability exception from | |
`coalib.misc.Compatability` module. | |
:raises KeyError: Raised when the given language is not defined in | |
given docstyle. | |
:raises ValueError: Raised when a docstyle definition setting has an | |
invalid format. | |
:return: An iterator returning each DocumentationComment | |
found in the content. | |
""" | |
docstyle_definition = DocstyleDefinition.load(language, docstyle) | |
return extract_documentation_with_docstyle(content, docstyle_definition) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment