Last active
March 23, 2021 19:49
-
-
Save Rubix982/895a27d6673f35326530b8bf6ac8deb9 to your computer and use it in GitHub Desktop.
Extract UTF 8 encodings from W3Schools
We can make this file beautiful and searchable if this error is corrected: Illegal quoting in line 3.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
' ','%20', | |
'!','%21', | |
'"','%22', | |
'#','%23', | |
'$','%24', | |
'%','%25', | |
'&','%26', | |
''','%27', | |
'(','%28', | |
')','%29', | |
'*','%2A', | |
'+','%2B', | |
',','%2C', | |
'-','%2D', | |
'.','%2E', | |
':','%3A', | |
';','%3B', | |
'<','%3C', | |
'=','%3D', | |
'>','%3E', | |
'?','%3F', | |
'@','%40', | |
'[','%5B', | |
']','%5D', | |
'^','%5E', | |
'_','%5F', | |
'`','%60', | |
'{','%7B', | |
'|','%7C', | |
'}','%7D', | |
'~','%7E', | |
' ','%7F', | |
'€','%E2%82%AC', | |
'Â','%81', | |
'‚','%E2%80%9A', | |
'ƒ','%C6%92', | |
'„','%E2%80%9E', | |
'…','%E2%80%A6', | |
'†','%E2%80%A0', | |
'‡','%E2%80%A1', | |
'ˆ','%CB%86', | |
'‰','%E2%80%B0', | |
'Š','%C5%A0', | |
'‹','%E2%80%B9', | |
'Œ','%C5%92', | |
'Â','%C5%8D', | |
'Ž','%C5%BD', | |
'Â','%8F', | |
'Â','%C2%90', | |
'‘','%E2%80%98', | |
'’','%E2%80%99', | |
'“','%E2%80%9C', | |
'”','%E2%80%9D', | |
'•','%E2%80%A2', | |
'–','%E2%80%93', | |
'—','%E2%80%94', | |
'˜','%CB%9C', | |
'™','%E2%84%A2', | |
'š','%C5%A1', | |
'›','%E2%80%BA', | |
'œ','%C5%93', | |
'Â','%9D', | |
'ž','%C5%BE', | |
'Ÿ','%C5%B8', | |
' ','%C2%A0', | |
'¡','%C2%A1', | |
'¢','%C2%A2', | |
'£','%C2%A3', | |
'¤','%C2%A4', | |
'¥','%C2%A5', | |
'¦','%C2%A6', | |
'§','%C2%A7', | |
'¨','%C2%A8', | |
'©','%C2%A9', | |
'ª','%C2%AA', | |
'«','%C2%AB', | |
'¬','%C2%AC', | |
'Â','%C2%AD', | |
'®','%C2%AE', | |
'¯','%C2%AF', | |
'°','%C2%B0', | |
'±','%C2%B1', | |
'²','%C2%B2', | |
'³','%C2%B3', | |
'´','%C2%B4', | |
'µ','%C2%B5', | |
'¶','%C2%B6', | |
'·','%C2%B7', | |
'¸','%C2%B8', | |
'¹','%C2%B9', | |
'º','%C2%BA', | |
'»','%C2%BB', | |
'¼','%C2%BC', | |
'½','%C2%BD', | |
'¾','%C2%BE', | |
'¿','%C2%BF', | |
'À','%C3%80', | |
'Á','%C3%81', | |
'Â','%C3%82', | |
'Ã','%C3%83', | |
'Ä','%C3%84', | |
'Å','%C3%85', | |
'Æ','%C3%86', | |
'Ç','%C3%87', | |
'È','%C3%88', | |
'É','%C3%89', | |
'Ê','%C3%8A', | |
'Ë','%C3%8B', | |
'Ì','%C3%8C', | |
'Í','%C3%8D', | |
'Î','%C3%8E', | |
'Ï','%C3%8F', | |
'Ð','%C3%90', | |
'Ñ','%C3%91', | |
'Ò','%C3%92', | |
'Ó','%C3%93', | |
'Ô','%C3%94', | |
'Õ','%C3%95', | |
'Ö','%C3%96', | |
'×','%C3%97', | |
'Ø','%C3%98', | |
'Ù','%C3%99', | |
'Ú','%C3%9A', | |
'Û','%C3%9B', | |
'Ü','%C3%9C', | |
'Ý','%C3%9D', | |
'Þ','%C3%9E', | |
'ß','%C3%9F', | |
'à','%C3%A0', | |
'á','%C3%A1', | |
'â','%C3%A2', | |
'ã','%C3%A3', | |
'ä','%C3%A4', | |
'å','%C3%A5', | |
'æ','%C3%A6', | |
'ç','%C3%A7', | |
'è','%C3%A8', | |
'é','%C3%A9', | |
'ê','%C3%AA', | |
'ë','%C3%AB', | |
'ì','%C3%AC', | |
'í','%C3%AD', | |
'î','%C3%AE', | |
'ï','%C3%AF', | |
'ð','%C3%B0', | |
'ñ','%C3%B1', | |
'ò','%C3%B2', | |
'ó','%C3%B3', | |
'ô','%C3%B4', | |
'õ','%C3%B5', | |
'ö','%C3%B6', | |
'÷','%C3%B7', | |
'ø','%C3%B8', | |
'ù','%C3%B9', | |
'ú','%C3%BA', | |
'û','%C3%BB', | |
'ü','%C3%BC', | |
'ý','%C3%BD', | |
'þ','%C3%BE', |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import requests as req | |
from bs4 import BeautifulSoup | |
import os | |
# Scrape the W3Schools URL-encoding reference table and write each
# non-alphanumeric character, paired with the value from the table's third
# column, to "data/encodings.csv" next to this script.

# A timeout keeps the script from hanging forever on a stalled connection.
res = req.get("https://www.w3schools.com/tags/ref_urlencode.asp", timeout=30)
# Fail loudly on an HTTP error instead of trying to parse an error page.
res.raise_for_status()

# Pass the raw bytes so BeautifulSoup detects the document's own encoding.
# `res.text` relies on the charset announced in the HTTP headers; when that is
# missing, non-ASCII characters can come out as mojibake.
# NOTE(review): the previously generated CSV contains rows that look
# double-decoded (e.g. 'Â¥' for '%C2%A5') -- confirm this was the cause.
soup = BeautifulSoup(res.content, features='lxml')

# First table on the page; the slice drops its first and last rows
# (presumably header/footer rows -- verify against the live page).
table_row_parse_tree = soup.find_all('table')[0].find_all('tr')[1:-1]

# Characters that are deliberately left out of the output file.
numerical_skip = list('0123456789')
uppercase_alphabatical_skip = list('ABCDEFGHIJKLMNOPQRSTUVWXYZ')
lowercase_alphabatical_skip = list('abcdefghijklmnopqrstuvwxyz')
misc_skip = ['/', '\\']
total_skip_characters = frozenset(
    numerical_skip
    + uppercase_alphabatical_skip
    + lowercase_alphabatical_skip
    + misc_skip
)

# Collected (character, encoding) pairs, in table order.
encodings = []
for row in table_row_parse_tree:
    cells = row.find_all('td')
    character = cells[0].contents[0]
    if character in total_skip_characters:
        continue
    # The table spells the space character out as the word "space".
    if character == 'space':
        character = ' '
    utf_8 = cells[2].contents[0]
    encodings.append((character, utf_8))

# Output directory lives beside this script, independent of the current
# working directory.
folder_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "data")
os.makedirs(folder_path, exist_ok=True)

# NOTE(review): values are wrapped in single quotes without any escaping, so a
# row whose character is itself a quote is not parseable by standard CSV
# readers. The row format is kept unchanged for existing consumers.
with open(os.path.join(folder_path, "encodings.csv"), encoding='utf-8', mode='w') as file:
    for character, utf_8 in encodings:
        file.write(f"'{character}','{utf_8}',\n")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment