Last active
January 8, 2025 09:41
-
-
Save lances101/5cfe430ff32d900454133f0a837e350f to your computer and use it in GitHub Desktop.
Scrape locales from https://lh.2xlibre.net/locales/
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
###
# Downloads https://lh.2xlibre.net/locales/ and parses it into a
# JSON file whose entries are split into the following fields:
# - code: locale code, e.g. 'en_GB'
# - suffix: locale code suffix (only present when the code has one), e.g. 'latin' from 'be_BY@latin'
# - name: locale name, e.g. 'English' from 'en_GB'
# - country: locale country, title-cased, e.g. 'United Kingdom' from 'en_GB'
# The settings for where to save the HTML file and the locale file can be found below.
###
# <===== CONFIG =====> | |
# Directory that holds both the cached HTML page and the generated JSON file.
STORAGE_PATH = 'tools/locale_parser'
# File name (inside STORAGE_PATH) of the cached copy of the locales page.
HTML_FILENAME = 'locales.html'
# File name (inside STORAGE_PATH) of the JSON file the script writes.
LOCALE_FILENAME = 'locales.json'
# <===== CONFIG END =====> | |
import json, os, re, requests | |
from bs4 import BeautifulSoup | |
# Make sure the storage directory exists; exist_ok avoids the
# check-then-create race of a separate os.path.exists() test.
os.makedirs(STORAGE_PATH, exist_ok=True)

# From here on both names hold full paths inside STORAGE_PATH.
HTML_FILENAME = os.path.join(STORAGE_PATH, HTML_FILENAME)
LOCALE_FILENAME = os.path.join(STORAGE_PATH, LOCALE_FILENAME)

text = ''
if not os.path.exists(HTML_FILENAME):
    # Download before creating the cache file, so a failed request cannot
    # leave an empty file behind that later runs would treat as a valid cache.
    resp = requests.get('https://lh.2xlibre.net/locales/', timeout=30)
    resp.raise_for_status()
    text = resp.text
    # Explicit UTF-8: the page contains non-ASCII text and the platform
    # default encoding is not guaranteed to be able to represent it.
    with open(HTML_FILENAME, 'w', encoding='utf-8') as html_file:
        html_file.write(text)
else:
    # Reuse the cached copy instead of hitting the network again.
    with open(HTML_FILENAME, 'r', encoding='utf-8') as html_file:
        text = html_file.read()
# Parse the page and collect one entry per table row of class 'glibc-HEAD'.
node = BeautifulSoup(text, 'html.parser')
result = {'locales': []}
rows = node.find_all('tr', attrs={'class': 'glibc-HEAD'})
print(f'Found {len(rows)} rows')

# Longest value seen for each field; printed at the end of the script.
max_len = {
    'code': 0,
    'suffix': 0,
    'name': 0,
    'country': 0,
}

# The locale name sits between two em dashes in the second cell.
# Compiled once, outside the loop.
_LOCALE_NAME_RE = re.compile(r'— (.*) —')

for row in rows:
    # The first three children of the row are read as its cells.
    if len(row.contents) < 3:
        raise ValueError(f'Row has fewer than 3 cells: {row!r}')

    entry = {}

    # First cell: locale code, optionally followed by '@<suffix>'.
    code, *modifiers = row.contents[0].text.split('@')
    entry['code'] = code
    if modifiers:
        entry['suffix'] = modifiers[0]

    # Second cell: locale name wrapped in em dashes.
    name_match = _LOCALE_NAME_RE.search(row.contents[1].text)
    if name_match is None:
        # Fail with a message that names the offending row instead of an
        # opaque AttributeError on None.
        raise ValueError(
            f'Could not find a locale name for {code!r} in '
            f'{row.contents[1].text!r}'
        )
    entry['name'] = name_match.group(1)

    # Third cell: country, title-cased.
    entry['country'] = row.contents[2].text.title()

    # Every key of entry is also a key of max_len.
    for field, value in entry.items():
        max_len[field] = max(max_len[field], len(value))

    result['locales'].append({'locale': entry})
# Serialize the collected locales and write them to LOCALE_FILENAME.
json_dumped = json.dumps(result, indent=4)
print('Dumping into file locale')
# Mode 'w' truncates an existing file, so no prior exists()/remove() is
# needed; the context manager closes the handle even if the write fails.
with open(LOCALE_FILENAME, 'w', encoding='utf-8') as locale_file:
    locale_file.write(json_dumped)
# Size is reported in units of 1000 bytes, rounded down.
print(f'Dumped. File size {os.path.getsize(LOCALE_FILENAME) // 1000} KB')
print(max_len)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment