Created
October 15, 2019 09:31
-
-
Save K0lb3/71aa811eedfd29d4953b41da56e67e9b to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from mwclient import Site | |
import os | |
import re | |
import json | |
from lib import PATH, resPath | |
# Directory holding the locally cached wiki pages (one file per page).
page_path = os.path.join(resPath, 'pages')
# Create the cache directory on a first run so that listing it here, and
# writing downloaded pages into it later, cannot fail on a missing folder.
os.makedirs(page_path, exist_ok=True)

# Map of cached file name -> page text, decoded as UTF-8.
pages = {}
for fp in os.listdir(page_path):
    # A context manager closes each handle promptly instead of leaking one
    # open file per cached page.
    with open(os.path.join(page_path, fp), 'rb') as fh:
        pages[fp] = fh.read().decode('utf8')
# download pages
# Fetch every page in the wiki's "Translations" category that is not already
# present in the local cache, store it on disk and add it to `pages`.
wiki = Site(host='finalfantasy.fandom.com', path='/')
for page in wiki.categories['Translations']:
    # ':' and '/' are replaced so the page title can be used as a file name.
    title = page.page_title.replace(':', ' ').replace('/', '-')
    if title not in pages:
        text = page.text()
        # Write through a context manager so the data is flushed and the
        # handle is closed before moving on to the next page.
        with open(os.path.join(page_path, title), 'wb') as fh:
            fh.write(text.encode('utf8'))
        pages[title] = text
# extract translations
# theory: first column - jp, last column - en
# -> re
# Section heading (two or more '=' on each side); group 1 is the heading text,
# or the part before '|' when the heading contains a piped link.
reCategory = re.compile(r'==+\s*\[*(.+?)(\|.+?)?\]*\s*==+')
# One table cell that starts with a `|class="..."` attribute; group 1 is the
# cell text up to the first '|', ']', '<', '(' or newline.
reRow = re.compile(r'\|class=".+?\|\[*(.+?)\s*[\|\]<\(\n].+?\n')


def _extract_sections(text):
    """Return ``{heading: {first_cell: second_cell}}`` for one page's text.

    The text is cut into sections at each heading matched by ``reCategory``
    and each section is split into table rows on ``'\\n|-\\n'``. Only rows in
    which ``reRow`` finds exactly two cells are kept; the first cell becomes
    the key and the second (with trailing spaces stripped) the value.
    Sections that yield no rows are omitted.
    """
    # (heading text, offset just past the heading match) for every heading.
    headings = [(m.group(1), m.end()) for m in reCategory.finditer(text)]
    sections = {}
    for i, (cname, begin) in enumerate(headings):
        # A section runs up to the end of the next heading match. The last
        # section runs to the end of the text; slicing with -1 here would
        # silently drop the final character of the page.
        end = headings[i + 1][1] if i + 1 < len(headings) else len(text)
        ltrans = {}
        for row in text[begin:end].split('\n|-\n'):
            # reRow needs a terminating newline, which split() may have removed.
            cells = reRow.findall(f'{row}\n')
            if len(cells) == 2:
                ltrans[cells[0]] = cells[1].rstrip(' ')
        if ltrans:
            sections[cname] = ltrans
    return sections


# Page name -> section heading -> {first cell: second cell}.
translations = {name: _extract_sections(text) for name, text in pages.items()}

# Write the result as tab-indented UTF-8 JSON; the context manager guarantees
# the file is flushed and closed.
with open(os.path.join(resPath, 'Translations.json'), 'wb') as fh:
    fh.write(json.dumps(translations, ensure_ascii=False, indent='\t').encode('utf8'))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment