Created
August 17, 2016 17:02
-
-
Save tzengyuxio/d54a5f1075862640a59459a5c65d2a6b to your computer and use it in GitHub Desktop.
Fetch all county-level administrative divisions of the PRC
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
from bs4 import BeautifulSoup | |
from urllib.request import urlopen | |
from urllib.parse import urljoin | |
import io | |
import json | |
# Entry point of the crawl: a zh.wikipedia.org category page (served through
# the /zh-tw/ variant path) whose links are followed one by one further down.
homepage = "https://zh.wikipedia.org/zh-tw/Category:%E4%B8%AD%E5%8D%8E%E4%BA%BA%E6%B0%91%E5%85%B1%E5%92%8C%E5%9B%BD%E5%90%84%E7%9C%81%E7%BA%A7%E8%A1%8C%E6%94%BF%E5%8C%BA%E8%A1%8C%E6%94%BF%E5%8C%BA%E5%88%92%E6%A8%A1%E6%9D%BF"  # noqa

# Parse the page while the HTTP response is open, then release the connection
# instead of leaving the response object to be garbage-collected.
with urlopen(homepage) as response:
    soup = BeautifulSoup(response, 'html.parser')

# Result accumulator: first-level division name -> {second-level name: [third-level names]}.
divisions = {}
def generate_admin2_dict(tr):
    """Turn one navbox table row into a ``{group_name: [link_texts]}`` mapping.

    Args:
        tr: A BeautifulSoup ``<tr>`` tag. Its optional ``<th>`` supplies the
            group name and its ``<td>`` holds the ``<a>`` links to collect.

    Returns:
        A dict with a single entry mapping the group name to the list of
        collected link texts, or an empty dict when the row has no ``<td>``
        or no link survives the filtering.
    """
    # A row without a data cell has nothing to collect.
    if tr.td is None:
        return {}

    # Rows without a header cell are filed under a fixed placeholder name.
    admin2_name = "(直轄縣級行政區)"
    if tr.th is not None:
        # Prefer the text of the link inside the header; fall back to the
        # header's own text when it contains no link.
        header = tr.th.a if tr.th.a is not None else tr.th
        admin2_name = header.get_text()
        # These generic labels are wrapped in parentheses, matching the
        # style of the placeholder name above.
        if admin2_name in ('自治縣', '縣', '市轄區'):
            admin2_name = '(' + admin2_name + ')'

    admin3 = []
    for a in tr.td.find_all('a'):
        # Look at the text node following the link: links followed by text
        # starting with '*' are left out.
        # NOTE(review): presumably '*' flags a footnoted entry -- confirm.
        postfix = a.find_next_sibling(string=True)
        if postfix is not None and postfix.startswith('*'):
            continue
        admin3_name = a.get_text()
        if admin3_name == '無縣級行政區':
            admin3_name = '(無縣級行政區)'
        admin3.append(admin3_name)

    # Groups that end up with no entries are omitted entirely.
    return {admin2_name: admin3} if admin3 else {}
# Walk every link on the category page and scrape each template page found.
for link in soup.find_all('a'):
    link_text = link.get_text()
    # Only follow "Template:..." links, and skip templates whose name starts
    # with '中'. The slice (rather than an index) keeps a bare "Template:"
    # link text from raising IndexError.
    # NOTE(review): the '中' filter presumably excludes the nation-wide
    # template -- confirm.
    if not link_text.startswith('Template:') or link_text[9:10] == '中':
        continue

    # Request the page through the /zh-tw/ variant path, like the homepage.
    page2 = urljoin(homepage, link.get('href')).replace("/wiki/", "/zh-tw/")
    with urlopen(page2) as response:
        soup2 = BeautifulSoup(response, 'html.parser')

    # The first-level division name is the title of the third link in the
    # navbox title bar.
    # NOTE(review): the first two links are presumably navbox view/edit
    # links -- confirm against the live markup.
    title_cell = soup2.find_all('th', {'class': 'navbox-title'})[0]
    admin1_name = title_cell.find_all('a')[2].get('title')

    # The first group header tells which of the two layouts this page uses.
    group_headers = soup2.find_all('th', {'class': 'navbox-group'})
    cate = group_headers[0].a.get('title')
    print(admin1_name + " : " + cate)

    admin2 = {}
    if cate == '地級行政區':
        # Rows live inside nested "navbox-subgroup" tables.
        for table in soup2.find_all('table', {'class': 'navbox-subgroup'}):
            for tr in table.find_all("tr"):
                if tr.td is None:
                    continue
                admin2.update(generate_admin2_dict(tr))
    elif cate == '縣級行政區':
        # Every group header after the first marks a row to collect.
        for th in group_headers[1:]:
            tr = th.parent
            if tr.td is None:
                continue
            admin2.update(generate_admin2_dict(tr))
    divisions[admin1_name] = admin2

# Dump the collected hierarchy as UTF-8 JSON, keeping non-ASCII characters
# readable instead of \u-escaped.
with io.open('data.text', 'w', encoding='utf8') as outfile:
    json.dump(divisions, outfile, ensure_ascii=False)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment