@GokulNC
Last active May 30, 2020 14:18
Basic script to find old Wikipedia XML dump sizes and number of words/tokens in them
# Basically I was trying to find out the size of the mBERT model's training data for Indian languages
# Dump date based on: https://github.com/google-research/bert/blob/f18bd94b8fee9bda3c293e0932d100add35b780c/multilingual.md
import requests
from bs4 import BeautifulSoup
from time import sleep
ARCHIVE_API = 'http://web.archive.org/cdx/search/cdx?url=%s&output=json'
WIKIDUMP_URL = 'https://dumps.wikimedia.org/%swiki/%s'
ARCHIVE_URL = 'http://web.archive.org/web/%s/%s'
LANGS = ['as', 'bn', 'gu', 'kn', 'hi', 'ml', 'mr', 'or', 'pa', 'ta', 'te']
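# Note: with output=json, the Wayback Machine CDX API returns a JSON array whose
# first row is a header (["urlkey", "timestamp", "original", ...]) and each later
# row is one capture, so response[1][1] below is the timestamp of the first
# listed snapshot of the queried URL.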
def get_wikidump_size(lang_code, date):
    # Find archived snapshots of the dump index page for this language/date
    dump_url = WIKIDUMP_URL % (lang_code, date)
    query = ARCHIVE_API % dump_url
    response = requests.get(query).json()
    if len(response) < 2:  # only the header row (or nothing) => no captures
        print('Failed for: ', lang_code, date)
        return None
    # Fetch the first listed capture of the dump index page from the Wayback Machine
    dump_url = ARCHIVE_URL % (response[1][1], dump_url)
    dump_html = requests.get(dump_url).text
    soup_dump = BeautifulSoup(dump_html, 'html.parser')
    # The first <li class="file"> entry is the main dump file; its size appears
    # as plain text between the closing </a> and the closing </li>
    main_dump = str(soup_dump.find_all('li', {'class': 'file'})[0])
    size = main_dump[main_dump.find('</a>') + len('</a>') : main_dump.find('</li>')]
    return size.strip()
def get_lang_sizes(dump_date, save_to='wikidump_sizes.txt'):
    with open(save_to, 'w', encoding='utf-8') as f:
        for lang in LANGS:
            f.write('%s %s\n' % (lang, get_wikidump_size(lang, dump_date)))
WIKI_STATS_URL = 'https://%s.wikipedia.org/wiki/Special:Statistics'
def get_wiki_words(lang_code, year):
    # Follow redirects to get the canonical Special:Statistics URL for this wiki
    actual_url = requests.get(WIKI_STATS_URL % lang_code).url
    archive_query = ARCHIVE_API % actual_url
    response = requests.get(archive_query).json()
    if len(response) < 2:
        print('Failed for: ', lang_code, year)
        return None
    year = str(year)
    stats_url = None
    # Use the first archived snapshot whose timestamp falls in the requested year
    for row in response[1:]:
        if row[1].startswith(year):
            stats_url = ARCHIVE_URL % (row[1], actual_url)
            break
    if not stats_url:
        return None
    dump_html = requests.get(stats_url).content
    soup_dump = BeautifulSoup(dump_html, 'html.parser')
    print(stats_url)
    try:
        # "Words in all content pages" row of the CirrusSearch statistics table
        num_words = soup_dump.find_all('tr', {'id': 'mw-cirrussearch-article-words'})[0].find_all('td', {'class': 'mw-statistics-numbers'})[0].text
        return num_words
    except (IndexError, AttributeError):  # row missing in older snapshots
        return None
def get_lang_words(year, save_to='wiki_words.txt'):
    with open(save_to, 'w', encoding='utf-8') as f:
        for lang in LANGS:
            f.write('%s %s\n' % (lang, get_wiki_words(lang, year)))
            sleep(0.5)  # be gentle with the Wayback Machine between languages
get_lang_sizes('20181001')
get_lang_words(2018)

wiki_words.txt (words in all content pages, from the 2018 Special:Statistics snapshots):
as 28,18,782
bn ২,৪৯,১৯,৮৪৩
gu ૭૬,૧૪,૭૨૨
kn ೧,೫೬,೧೭,೦೭೮
hi 3,74,55,014
ml 1,54,66,075
mr ६६,२५,४४४
or None
pa 83,23,194
ta 2,80,52,682
te 3,47,61,585
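
Not part of the original gist: the counts above come back in each wiki's local numeral script (Bengali, Gujarati, Kannada, Devanagari) with Indian-style digit grouping. A minimal sketch, assuming the wiki_words.txt format written by get_lang_words above, that normalizes them to plain integers:

import unicodedata

def to_int(count_str):
    # Keep only digit characters (Unicode decimal digits such as '২' or '೦' included)
    # and map each to its numeric value; the comma grouping is dropped along the way.
    digits = [str(unicodedata.digit(ch)) for ch in count_str if ch.isdigit()]
    return int(''.join(digits)) if digits else None

with open('wiki_words.txt', encoding='utf-8') as f:
    for line in f:
        lang, count = line.strip().split(maxsplit=1)
        print(lang, to_int(count) if count != 'None' else None)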

wikidump_sizes.txt (size of the main XML dump file per language, 20181001 dump):
as 17.9 MB
bn 135.2 MB
gu 26.7 MB
kn 68.0 MB
hi 131.9 MB
ml 109.8 MB
mr 47.8 MB
or 23.7 MB
pa 38.7 MB
ta 136.7 MB
te 110.3 MB
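
Also not part of the gist: a rough follow-up sketch, assuming the wikidump_sizes.txt format written by get_lang_sizes above, that converts the human-readable sizes to bytes and totals them. It treats the units as decimal (1 MB = 10**6 bytes); the convention used on dumps.wikimedia.org may differ.

UNITS = {'KB': 1e3, 'MB': 1e6, 'GB': 1e9}

def size_to_bytes(size_str):
    value, unit = size_str.split()
    return float(value) * UNITS[unit]

total_bytes = 0
with open('wikidump_sizes.txt', encoding='utf-8') as f:
    for line in f:
        lang, size = line.strip().split(maxsplit=1)
        if size != 'None':
            total_bytes += size_to_bytes(size)
print('Total across languages: %.1f MB' % (total_bytes / 1e6))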