Last active
February 18, 2016 23:02
-
-
Save mayhewsw/1600aeade3693db38195 to your computer and use it in GitHub Desktop.
Scrape script information from scriptsource
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python
"""Print the text of the first <td> cell of every table row in a saved page.

Input: ``iso15924list.html`` in the current directory, a saved copy of
http://unicode.org/iso15924/iso15924-codes.html
Output: one UTF-8 encoded line per table row on standard output.
"""
import sys

from bs4 import BeautifulSoup

# Read raw bytes and let BeautifulSoup work out the document encoding.
with open("iso15924list.html", "rb") as f:
    html_doc = f.read()

soup = BeautifulSoup(html_doc, 'html.parser')

# Python 3 exposes the byte stream as sys.stdout.buffer; on Python 2
# sys.stdout itself accepts encoded bytes.
stdout_bytes = getattr(sys.stdout, "buffer", sys.stdout)

for tr in soup.find_all("tr"):
    cell = tr.td
    if cell is None:
        # Rows without any <td> (for example rows made only of <th> cells)
        # have no first cell to print; tr.td is None for them.
        continue
    stdout_bytes.write(cell.text.encode("utf8") + b"\n")
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python
"""Count the writing systems ScriptSource lists for each ISO 15924 script.

Steps:
  1. Read script codes, one per line, from ``iso15924codes.txt``.
  2. For each code, load its detail page from the ``scriptpages/`` cache,
     or download it from scriptsource.org and save it to the cache.
  3. Extract N from the text "Writing systems that use this script (N)".
     If the text is present without a number the count is taken as 0
     ("prob 0"); if the text is missing entirely the count is also 0
     ("weird").
  4. Plot the 50 largest counts and print their sum.

Progress is printed as ``<code> <count or note>`` lines.
"""
import contextlib
import os
import os.path
import re
import sys

try:
    from urllib2 import urlopen  # Python 2
except ImportError:
    from urllib.request import urlopen  # Python 3

import matplotlib.pyplot as plt

url = "http://scriptsource.org/cms/scripts/page.php?item_id=script_detail&key={0}"
cache_dir = "scriptpages"

with open("iso15924codes.txt") as f:
    # Skip blank lines so an empty code is never requested or cached.
    codes = [line.strip() for line in f if line.strip()]

# Pages are kept as raw bytes (read and written in binary mode), so the
# patterns are bytes patterns as well.
pat = re.compile(br"Writing systems that use this script \((\d+)\)")
otherpat = re.compile(br"Writing systems that use this script")

# The cache directory must exist before the first page is saved.
if not os.path.isdir(cache_dir):
    os.makedirs(cache_dir)

langs = []
for code in codes:
    # Code and result share one output line: "<code> <result>".
    sys.stdout.write(code + " ")

    fname = os.path.join(cache_dir, code + ".html")
    if os.path.isfile(fname):
        with open(fname, "rb") as cached:
            html = cached.read()
    else:
        # closing() guarantees the HTTP response is released on both
        # Python 2 and Python 3.
        with contextlib.closing(urlopen(url.format(code))) as response:
            html = response.read()
        with open(fname, "wb") as out:
            out.write(html)

    groups = pat.search(html)
    if groups is not None:
        count = int(groups.group(1))
        print(count)
    elif otherpat.search(html) is not None:
        # Heading present but without a "(N)" suffix.
        print("prob 0")
        count = 0
    else:
        print("weird")
        count = 0
    langs.append((code, count))

# Hard coded entry for Latin. Remove any scraped "Latn" row first so the
# script is not listed, and summed, twice.
langs = [pair for pair in langs if pair[0] != "Latn"]
langs.sort(key=lambda p: p[1], reverse=True)
langs.insert(0, ("Latn", 513))

# Keep only the 50 largest entries.
langs = langs[:50]

x = list(range(len(langs)))
y = [pair[1] for pair in langs]

plt.plot(x, y)
plt.xlabel('Individual Script')
plt.ylabel('Num langs per script')
plt.title('Number of languages by script')
plt.show()

print(sum(y))
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Adlm 0 | |
Afak 1 | |
Aghb 0 | |
Ahom 1 | |
Arab 171 | |
Aran 69 | |
Armi 0 | |
Armn 1 | |
Avst 1 | |
Bali 3 | |
Bamu 1 | |
Bass 1 | |
Batk 6 | |
Beng 39 | |
Bhks 0 | |
Blis 1 | |
Bopo 6 | |
Brah 0 | |
Brai 97 | |
Bugi 3 | |
Buhd 1 | |
Cakm 2 | |
Cans 26 | |
Cari 0 | |
Cham 2 | |
Cher 1 | |
Cirt 0 | |
Copt 4 | |
Cprt 1 | |
Cyrl 116 | |
Cyrs 2 | |
Deva 182 | |
Dsrt 1 | |
Dupl 0 | |
Egyd 0 | |
Egyh 0 | |
Egyp 0 | |
Elba 1 | |
Ethi 43 | |
Geok 1 | |
Geor 6 | |
Glag 1 | |
Goth 1 | |
Gran 0 | |
Grek 12 | |
Gujr 12 | |
Guru 3 | |
Hanb 0 | |
Hang 1 | |
Hani 24 | |
Hano 1 | |
Hans 14 | |
Hant 13 | |
Hatr 0 | |
Hebr 24 | |
Hira 0 | |
Hluw 0 | |
Hmng 4 | |
Hrkt 0 | |
Hung 0 | |
Inds 0 | |
Ital 3 | |
Jamo 0 | |
Java 5 | |
Jpan 1 | |
Jurc 0 | |
Kali 2 | |
Kana 3 | |
Khar 0 | |
Khmr 7 | |
Khoj 4 | |
Kitl 0 | |
Kits 0 | |
Knda 11 | |
Kore 1 | |
Kpel 1 | |
Kthi 3 | |
Lana 5 | |
Laoo 19 | |
Latf 2 | |
Latg 1 | |
Latn 513 | |
Leke 0 | |
Lepc 1 | |
Limb 1 | |
Lina 1 | |
Linb 1 | |
Lisu 3 | |
Loma 1 | |
Lyci 0 | |
Lydi 0 | |
Mahj 0 | |
Mand 2 | |
Mani 0 | |
Marc 1 | |
Maya 0 | |
Mend 1 | |
Merc 0 | |
Mero 0 | |
Mlym 10 | |
Modi 0 | |
Mong 11 | |
Moon 0 | |
Mroo 1 | |
Mtei 1 | |
Mult 0 | |
Mymr 27 | |
Narb 0 | |
Nbat 0 | |
Newa 0 | |
Nkgb 1 | |
Nkoo 4 | |
Nshu 2 | |
Ogam 3 | |
Olck 1 | |
Orkh 0 | |
Orya 22 | |
Osge 1 | |
Osma 1 | |
Palm 0 | |
Pauc 0 | |
Perm 2 | |
Phag 5 | |
Phli 0 | |
Phlp 0 | |
Phlv 0 | |
Phnx 1 | |
Piqd 0 | |
Plrd 8 | |
Prti 0 | |
Qaaa 0 | |
Qabx 0 | |
Rjng 4 | |
Roro 0 | |
Runr 1 | |
Samr 2 | |
Sara 0 | |
Sarb 0 | |
Saur 1 | |
Sgnw 12 | |
Shaw 1 | |
Shrd 2 | |
Sidd 0 | |
Sind 0 | |
Sinh 3 | |
Sora 1 | |
Sund 1 | |
Sylo 1 | |
Syrc 9 | |
Syre 0 | |
Syrj 0 | |
Syrn 0 | |
Tagb 1 | |
Takr 9 | |
Tale 3 | |
Talu 1 | |
Taml 10 | |
Tang 1 | |
Tavt 5 | |
Telu 19 | |
Teng 0 | |
Tfng 12 | |
Tglg 3 | |
Thaa 1 | |
Thai 35 | |
Tibt 33 | |
Tirh 1 | |
Ugar 1 | |
Vaii 1 | |
Visp 0 | |
Wara 1 | |
Wole 1 | |
Xpeo 1 | |
Xsux 1 | |
Yiii 10 | |
Zinh 0 | |
Zmth 0 | |
Zsye 0 | |
Zsym 0 | |
Zxxx 0 | |
Zyyy 0 | |
Zzzz 0 |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment