Last active
October 19, 2021 14:26
-
-
Save kkirsche/4f31f2153ed7a3248be1ec44ca6ddbc9 to your computer and use it in GitHub Desktop.
Reserved Word Retriever for SQLAlchemy
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
"""Usage Example: | |
$ python reserved_words_retriever.py --help | |
usage: reserved_words_retriever.py [-h] [-y] [-a] [-w] [-d] | |
Retrieve the reserved keywords for MySQL and/or MariaDB | |
options: | |
-h, --help show this help message and exit | |
-y, --mysql Retrieve the keywords for MySQL (default: True) | |
-a, --mariadb Retrieve the keywords for MariaDB (default: True) | |
-w, --write Write the keywords to a file (default: True) | |
-d, --diff Print a diff between the original list and the new list, if available (default: True) | |
$ python reserved_words_retriever.py | |
Parsed MariaDB: 249 reserved words | |
Parsed MySQL: 275 reserved words | |
No original list set, ignoring diffs... | |
""" | |
from argparse import ArgumentDefaultsHelpFormatter, ArgumentParser, Namespace | |
# python -m pip install -U beautifulsoup4 requests lxml | |
from typing import List, Text, TypedDict | |
from bs4 import BeautifulSoup | |
from requests import Session, get | |
original_list = set() # removed for gist | |
reserved_words_mariadb = set() | |
reserved_words_mysql = set() | |
class MysqlDict(TypedDict): | |
"""The structure used to relate HTML source code to a MySQL version.""" | |
html: str | |
version: str | |
def get_mysql_word_list_htmls( | |
versions: List[str] = ["8.0", "5.7", "5.6"] | |
) -> List[MysqlDict]: | |
"""Retrieve the HTML of the keywords page, for each version. | |
Parameters: | |
versions: A list of version numbers with documented keywords. | |
(default: ["8.0", "5.7", "5.6"]) | |
Returns: | |
List[MySQLDict]: A list of dictionaries containing the HTML and the | |
version that was used to retrieve it. | |
Raises: | |
requests.HTTPError: If a non-successful status code is received | |
requests.Timeout: If the request times out. | |
""" | |
responses = [] | |
session = Session() # use a session to keepalive | |
try: | |
for version in versions: | |
url = f"https://dev.mysql.com/doc/refman/{version}/en/keywords.html" | |
response = session.get(url=url, timeout=10) | |
response.raise_for_status() | |
responses.append(MysqlDict(html=response.text, version=version)) | |
finally: | |
session.close() | |
return responses | |
def parse_mysql_word_list_htmls(version_contents: List[MysqlDict]) -> None: | |
"""Parses the HTML source code to extract the reserved keywords. | |
Parameters: | |
version_contents: A list of MysqlDict's containing the HTML and | |
the version number used to retrieve it. | |
Raises: | |
ValueError: Raised if the MySQL keywords page has an unexpected number | |
of segments in it, indicating the structure of the page has changed | |
since this was written. | |
""" | |
for version in version_contents: | |
soup = BeautifulSoup(version["html"], "lxml") | |
sects = soup.find_all("div", {"class": "simplesect"}) | |
if len(sects) != 3: | |
raise ValueError("Unexpected number of sections") | |
# section 1: MySQL x.0 Keywords and Reserved Words | |
# section 2: MySQL x.0 New Keywords and Reserved Words | |
# section 3: MySQL x.0 Removed Keywords and Reserved Words | |
for sect_num, sect in enumerate(sects): | |
if sect_num > 0: | |
# sect_num is zero based | |
# skip the new / removed diff | |
continue | |
for row in sect.find_all("li", {"class": "listitem"}): | |
paragraph = row.find("p") | |
paragraph_text = paragraph.get_text() | |
word = row.find("code", {"class": "literal"}).get_text() | |
if ( | |
paragraph_text.endswith("(R)") | |
or "(reserved)" in paragraph_text | |
or "(R);" | |
in paragraph_text # used to handle FUNCTION (R); reserved in 8.0.1 | |
or "became nonreserved in" | |
in paragraph_text # used to handle PERSIST; became nonreserved in 8.0.16 | |
): | |
reserved_words_mysql.add(word.lower()) | |
print(f"Parsed MySQL: {len(reserved_words_mysql)} reserved words") | |
def get_mariadb_word_list_html( | |
url: Text | bytes = "https://mariadb.com/kb/en/reserved-words/", | |
) -> str: | |
"""Retrieve the HTML of the reserved words page. | |
Parameters: | |
url: The url of the MariaDB reserved word list website. | |
(default: https://mariadb.com/kb/en/reserved-words) | |
Returns: | |
str: The HTML of the MariaDB reserved word list website. | |
Raises: | |
requests.HTTPError: If a non-successful status code is received | |
requests.Timeout: If the request times out. | |
""" | |
response = get(url=url, timeout=10) | |
response.raise_for_status() | |
return response.text | |
def parse_mariadb_word_list_html(html: str) -> None: | |
"""Parses the HTML source code to extract the reserved keywords. | |
Parameters: | |
html: The HTML source code of the MariaDB reserved word list website. | |
Raises: | |
ValueError: Raised if the MariaDB reserved words page has an unexpected number | |
of tables in it, indicating the structure of the page has changed | |
since this was written. | |
""" | |
soup = BeautifulSoup(html, "lxml") | |
tables = soup.find_all("table") | |
if len(tables) != 3: | |
raise ValueError("Unexpected number of tables") | |
# table 1: reserved words | |
# table 2: exceptions | |
# table 3: oracle mode resered words | |
for table_num, table in enumerate(tables): | |
if table_num > 0: | |
# table_num is zero-based, skip exceptions | |
# exceptions are not actually reserved | |
continue | |
for row in table.find_all("tr"): | |
cells = row.find_all("td") | |
if len(cells): | |
word = cells[0].find(text=True) | |
reserved_words_mariadb.add(word.lower()) | |
print(f"Parsed MariaDB: {len(reserved_words_mariadb)} reserved words") | |
def print_diffs() -> None: | |
"""Prints the additions and removals between the original list and new lists.""" | |
if len(original_list) == 0: | |
print("No original list set, ignoring diffs...") | |
return | |
mariadb_additions = reserved_words_mariadb - original_list | |
mysql_additions = reserved_words_mysql - original_list | |
mariadb_removals = original_list - reserved_words_mariadb | |
mysql_removals = original_list - reserved_words_mysql | |
print("# MariaDB") | |
for word in mariadb_additions: | |
print(f"+ {word}") | |
for word in mariadb_removals: | |
print(f"- {word}") | |
print("# MySQL") | |
for word in mysql_additions: | |
print(f"+ {word}") | |
for word in mysql_removals: | |
print(f"- {word}") | |
def write_reserved_words_to_file( | |
parsed_args: Namespace, filename: str = "reserved_words.py", encoding="utf-8" | |
) -> None: | |
"""Writes the MariaDB and MySQL word lists to a file. | |
Parameters: | |
parsed_args: The arguments that were parsed from argparse. | |
filename: The filename of the file to write to. Note, this will overwrite the | |
file. (default: reserved_words.py) | |
encoding: The encoding of the file when it is opened. (default: utf-8) | |
""" | |
with open(filename, "w", encoding=encoding) as f: | |
f.write( | |
"\n".join( | |
[ | |
"# generated using:", | |
"# https://gist.github.com/kkirsche/4f31f2153ed7a3248be1ec44ca6ddbc9", | |
"#\n", # include a trailing new line | |
] | |
) | |
) | |
if parsed_args.retrieve_mariadb: | |
mariadb_headers = [ | |
"# https://mariadb.com/kb/en/reserved-words/", | |
"# includes: Reserved Words", | |
"# excludes: Exceptions, Oracle Mode, Function Names", | |
"RESERVED_WORDS_MARIADB = set([", | |
] | |
for header in mariadb_headers: | |
f.write(f"{header}\n") | |
for word in sorted(reserved_words_mariadb, key=str.lower): | |
f.write(f' "{word.lower()}",\n') | |
f.write("])\n\n") | |
if parsed_args.retrieve_mysql: | |
mysql_headers = [ | |
"# https://dev.mysql.com/doc/refman/8.0/en/keywords.html", | |
"# https://dev.mysql.com/doc/refman/5.7/en/keywords.html", | |
"# https://dev.mysql.com/doc/refman/5.6/en/keywords.html", | |
"# includes: MySQL x.0 Keywords and Reserved Words", | |
"# excludes: MySQL x.0 New Keywords and Reserved Words,", | |
"# MySQL x.0 Removed Keywords and Reserved Words", | |
"RESERVED_WORDS_MYSQL = set([", | |
] | |
for header in mysql_headers: | |
f.write(f"{header}\n") | |
for word in sorted(reserved_words_mysql, key=str.lower): | |
f.write(f' "{word.lower()}",\n') | |
f.write("])\n") | |
def main() -> None: | |
"""The entrypoint of the script.""" | |
parser = ArgumentParser( | |
description="Retrieve the reserved keywords for MySQL and/or MariaDB", | |
formatter_class=ArgumentDefaultsHelpFormatter, | |
) | |
parser.add_argument( | |
"-y", | |
"--mysql", | |
action="store_false", | |
default=True, | |
dest="retrieve_mysql", | |
help="Retrieve the keywords for MySQL", | |
) | |
parser.add_argument( | |
"-a", | |
"--mariadb", | |
action="store_false", | |
default=True, | |
dest="retrieve_mariadb", | |
help="Retrieve the keywords for MariaDB", | |
) | |
parser.add_argument( | |
"-w", | |
"--write", | |
action="store_false", | |
default=True, | |
dest="write_changes", | |
help="Write the keywords to a file", | |
) | |
parser.add_argument( | |
"-d", | |
"--diff", | |
action="store_false", | |
default=True, | |
dest="show_diff", | |
help="Print a diff between the original list and the new list, if available", | |
) | |
args = parser.parse_args() | |
if args.retrieve_mariadb: | |
html = get_mariadb_word_list_html() | |
parse_mariadb_word_list_html(html=html) | |
if args.retrieve_mysql: | |
version_contents = get_mysql_word_list_htmls() | |
parse_mysql_word_list_htmls(version_contents=version_contents) | |
if args.write_changes: | |
write_reserved_words_to_file(parsed_args=args) | |
if args.show_diff: | |
print_diffs() | |
if __name__ == "__main__": | |
main() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment