Created
August 3, 2017 20:57
-
-
Save HarlanH/277006989774372515c7130e63809315 to your computer and use it in GitHub Desktop.
extracts structured comments/documentation from a dbt directory tree
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
# This Q&D script scans through SQL files in the models directory and outputs a Markdown document | |
# with per-model comments. | |
import os | |
from os import path | |
import re | |
import warnings | |
import time | |
dirs = ['models/staging', 'models/analytics', 'models/reporting'] # in order | |
for dir in dirs: | |
sql_files = [file for file in os.listdir(dir) if file.lower().endswith('.sql')] | |
for sql_file in sql_files: | |
# iterate over the file | |
# save all rows that look like a block comment | |
# save all rows that look like a column comment | |
# save all table reference rows | |
# these should never happen overlapping, which makes this easy... | |
with open(path.join(dir, sql_file), 'r') as infile: | |
# header | |
model_name = sql_file[:len(sql_file)-4] | |
print("# %s\n\n" % model_name) | |
block_comments = [] | |
column_comments = [] | |
table_refs = [] | |
for line in infile: | |
line = line.strip() | |
#print(line) | |
# block comments | |
if line.startswith('--@'): | |
block_comments.append(line[3:].strip()) | |
# column comments | |
# <from_col>(::<cast>) (AS <to_col>)(,) -- @col <comment> | |
# using an iterative regex strategy because I'm not smart enough to write it | |
# all at once... | |
col = re.search(r'[\s,]*(.+?)[\s,]*--\s*@col\s*(.*)', line) | |
if col: | |
sql = col.group(1) | |
comment = col.group(2) | |
rename = re.search(r'^(.*?)\s+(?:AS|as)?\s*(\w+)$', sql) | |
if rename: | |
src = rename.group(1) | |
target = rename.group(2) | |
else: | |
src = sql | |
target = None | |
cast = re.search(r'(.*)::(.*)', src) | |
if cast: | |
src = cast.group(1) | |
cast = cast.group(2) | |
if len(src) >= 30: | |
src = "_complex_" | |
column_comments.append({"from_col": src, | |
"cast": cast, | |
"to_col": target, | |
"comment": comment}) | |
# table refs | |
res = re.search(r"{{\s+ref\('(\w+)'\)\s+}}", line) | |
if res: | |
table_refs.append(res.group(1)) | |
for bc in block_comments: | |
print(bc) | |
print("\nColumns:\n") | |
for cc in column_comments: | |
if cc['to_col']: | |
col_name = cc['to_col'] | |
col_from = "from `%s`" % cc['from_col'] | |
else: | |
col_name = cc['from_col'] | |
col_from = None | |
paran_stuff = [] | |
if cc['cast']: | |
paran_stuff.append("`%s`" % cc['cast']) | |
if col_from: | |
paran_stuff.append(col_from) | |
print("* `%s` (%s) -- %s" % (col_name, | |
', '.join(paran_stuff), | |
cc['comment'])) | |
print("\nTable References:\n") | |
for tr in table_refs: | |
print("* [%s](#%s)" % (tr, tr)) | |
print("\n") | |
print("\n\nGenerated %s" % time.ctime()) | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment