Skip to content

Instantly share code, notes, and snippets.

@K0lb3
Created October 15, 2019 09:31
Show Gist options
  • Save K0lb3/71aa811eedfd29d4953b41da56e67e9b to your computer and use it in GitHub Desktop.
Save K0lb3/71aa811eedfd29d4953b41da56e67e9b to your computer and use it in GitHub Desktop.
from mwclient import Site
import os
import re
import json
from lib import PATH, resPath
# Local cache directory: one file per downloaded wiki page, named after the page.
page_path = os.path.join(resPath, 'pages')
# Create the cache directory on first run so that listing it below cannot fail.
os.makedirs(page_path, exist_ok=True)


def _read_cached_page(file_name):
    """Return the UTF-8 decoded content of one file in the page cache."""
    # ``with`` closes the handle right away instead of leaving it to the GC.
    with open(os.path.join(page_path, file_name), 'rb') as cached:
        return cached.read().decode('utf8')


# Cached pages, keyed by file name (which is the sanitized page title).
pages = {fp: _read_cached_page(fp) for fp in os.listdir(page_path)}

# download pages
wiki = Site(host='finalfantasy.fandom.com', path='/')
for page in wiki.categories['Translations']:
    # ':' and '/' are replaced because the title is used as a file name.
    title = page.page_title.replace(':', ' ').replace('/', '-')
    if title not in pages:
        # Only pages missing from the cache are fetched and stored.
        text = page.text()
        with open(os.path.join(page_path, title), 'wb') as cached:
            cached.write(text.encode('utf8'))
        pages[title] = text
# extract translations
# theory: first column - jp, last column - en
# -> re
# Section heading such as "== Title ==" or "== [[Target|Label]] ==";
# group 1 is the title (or the link target when a "|Label" part is present).
reCategory = re.compile(r'==+\s*\[*(.+?)(\|.+?)?\]*\s*==+')
# Table cell that starts with |class="..."|; group 1 is the cell text, with
# leading '[' skipped, captured up to a '|', ']', '<', '(' or line break.
reRow = re.compile(r'\|class=".+?\|\[*(.+?)\s*[\|\]<\(\n].+?\n')


def _extract_sections(text):
    """Return ``{section name: {first cell: second cell}}`` for one page.

    The page is cut into sections at each heading matched by ``reCategory``.
    Every section is split into table rows on ``'\\n|-\\n'``, and a row
    contributes an entry only when ``reRow`` finds exactly two cells in it.
    Sections without any such row are left out of the result.
    """
    # (heading name, offset just past the heading) for every heading found.
    headings = [(found[1], found.end()) for found in reCategory.finditer(text)]
    sections = {}
    for i, (cname, begin) in enumerate(headings):
        # A section runs up to the end of the next heading match; the last
        # one runs to the end of the text (a -1 bound would drop the final
        # character of the page).
        end = headings[i + 1][1] if i + 1 < len(headings) else len(text)
        ltrans = {}
        for row in text[begin:end].split('\n|-\n'):
            # The pattern needs a terminating newline, which the split removed.
            cells = reRow.findall(f'{row}\n')
            if len(cells) == 2:
                ltrans[cells[0]] = cells[1].rstrip(' ')
        if ltrans:
            sections[cname] = ltrans
    return sections


# Every page gets an entry, even when nothing could be extracted from it.
translations = {name: _extract_sections(text) for name, text in pages.items()}

# Written as UTF-8 bytes; ``with`` guarantees the file is flushed and closed.
with open(os.path.join(resPath, 'Translations.json'), 'wb') as out:
    out.write(
        json.dumps(translations, ensure_ascii=False, indent='\t').encode('utf8'))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment