Basic script to find old Wikipedia XML dump sizes and number of words/tokens in them
# Basically I was trying to find out the size of the mBERT model's pretraining data for Indian languages
# Dump date based on: https://github.com/google-research/bert/blob/f18bd94b8fee9bda3c293e0932d100add35b780c/multilingual.md
import requests
from bs4 import BeautifulSoup
from time import sleep

ARCHIVE_API = 'http://web.archive.org/cdx/search/cdx?url=%s&output=json'
WIKIDUMP_URL = 'https://dumps.wikimedia.org/%swiki/%s'
ARCHIVE_URL = 'http://web.archive.org/web/%s/%s'

LANGS = ['as', 'bn', 'gu', 'kn', 'hi', 'ml', 'mr', 'or', 'pa', 'ta', 'te']

def get_wikidump_size(lang_code, date):
    # Find an archived copy of the dump index page for this language and dump date
    dump_url = WIKIDUMP_URL % (lang_code, date)
    query = ARCHIVE_API % dump_url
    response = requests.get(query).json()
    if len(response) < 2:  # only the CDX header row: no snapshot exists
        print('Failed for:', lang_code, date)
        return None
    # response[1][1] is the timestamp of the earliest snapshot
    dump_url = ARCHIVE_URL % (response[1][1], dump_url)
    dump_html = requests.get(dump_url).text
    soup_dump = BeautifulSoup(dump_html, 'html.parser')
    # The first <li class="file"> is the main dump file; its size appears
    # as plain text between the closing </a> and </li> tags
    main_dump = str(soup_dump.find_all('li', {'class': 'file'})[0])
    size = main_dump[main_dump.find('</a>') + len('</a>') : main_dump.find('</li>')]
    return size.strip()

def get_lang_sizes(dump_date, save_to='wikidump_sizes.txt'):
    with open(save_to, 'w', encoding='utf-8') as f:
        for lang in LANGS:
            f.write('%s %s\n' % (lang, get_wikidump_size(lang, dump_date)))
    return

WIKI_STATS_URL = 'https://%s.wikipedia.org/wiki/Special:Statistics'

def get_wiki_words(lang_code, year):
    # Follow redirects to get the canonical Special:Statistics URL for this wiki
    actual_url = requests.get(WIKI_STATS_URL % lang_code).url
    archive_query = ARCHIVE_API % actual_url
    response = requests.get(archive_query).json()
    if len(response) < 2:
        print('Failed for:', lang_code, year)
        return None
    # Pick the first snapshot captured in the requested year
    year = str(year)
    stats_url = None
    for row in response[1:]:
        if row[1].startswith(year):
            stats_url = ARCHIVE_URL % (row[1], actual_url)
            break
    if not stats_url:
        return None
    dump_html = requests.get(stats_url).content
    soup_dump = BeautifulSoup(dump_html, 'html.parser')
    print(stats_url)
    try:
        # "Words in all content pages" row of Special:Statistics
        stats_row = soup_dump.find_all('tr', {'id': 'mw-cirrussearch-article-words'})[0]
        num_words = stats_row.find_all('td', {'class': 'mw-statistics-numbers'})[0].text
        return num_words
    except (IndexError, AttributeError):
        return None

def get_lang_words(year, save_to='wiki_words.txt'):
    with open(save_to, 'w', encoding='utf-8') as f:
        for lang in LANGS:
            f.write('%s %s\n' % (lang, get_wiki_words(lang, year)))
            sleep(0.5)
    return

get_lang_sizes('20181001')
get_lang_words(2018)
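For context on the response[1][1] indexing above: the Wayback Machine CDX endpoint with output=json returns a JSON array whose first row is a column header and whose remaining rows are captures, so [1][1] is the timestamp of the earliest capture. A minimal sketch of that shape (first_snapshot_timestamp is just an illustrative helper, and the dump URL in the usage comment is only an example):

import requests

CDX_API = 'http://web.archive.org/cdx/search/cdx?url=%s&output=json'

def first_snapshot_timestamp(url):
    rows = requests.get(CDX_API % url).json()
    if len(rows) < 2:                 # header row only: never archived
        return None
    header, first = rows[0], rows[1]  # 'timestamp' sits at index 1 of the header
    return first[header.index('timestamp')]   # same value as response[1][1] above

# Illustrative usage:
# print(first_snapshot_timestamp('https://dumps.wikimedia.org/hiwiki/20181001'))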
wiki_words.txt (output of get_lang_words(2018)):
as 28,18,782
bn ২,৪৯,১৯,৮৪৩
gu ૭૬,૧૪,૭૨૨
kn ೧,೫೬,೧೭,೦೭೮
hi 3,74,55,014
ml 1,54,66,075
mr ६६,२५,४४४
or None
pa 83,23,194
ta 2,80,52,682
te 3,47,61,585
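These counts are copied verbatim from each wiki's Special:Statistics page, so several rows use native numerals (Bengali, Gujarati, Kannada, Devanagari). If plain integers are needed, Python's int() already accepts any Unicode decimal digit once the grouping commas are stripped; a small sketch (to_int is just an illustrative helper):

def to_int(count):
    # int() handles Unicode decimal digits (category Nd), so stripping
    # the separators is enough for the native-numeral rows.
    if count is None or count == 'None':
        return None
    return int(count.replace(',', ''))

print(to_int('২,৪৯,১৯,৮৪৩'))  # 24919843 (bn)
print(to_int('3,74,55,014'))   # 37455014 (hi)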
wikidump_sizes.txt (output of get_lang_sizes('20181001')):
as 17.9 MB
bn 135.2 MB
gu 26.7 MB
kn 68.0 MB
hi 131.9 MB
ml 109.8 MB
mr 47.8 MB
or 23.7 MB
pa 38.7 MB
ta 136.7 MB
te 110.3 MB
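Since the original goal was to gauge how much Wikipedia text mBERT had for these languages, the per-language dump sizes can be totalled. A quick sketch over the figures listed above (parse_mb is just an illustrative helper; every row here happens to be reported in MB):

def parse_mb(size_str):
    value, unit = size_str.split()
    assert unit == 'MB'   # all rows above are in MB
    return float(value)

sizes = ['17.9 MB', '135.2 MB', '26.7 MB', '68.0 MB', '131.9 MB', '109.8 MB',
         '47.8 MB', '23.7 MB', '38.7 MB', '136.7 MB', '110.3 MB']
print('Total: %.1f MB' % sum(parse_mb(s) for s in sizes))   # 846.7 MB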