Skip to content

Instantly share code, notes, and snippets.

@tzengyuxio
Created August 17, 2016 17:02
Show Gist options
  • Save tzengyuxio/d54a5f1075862640a59459a5c65d2a6b to your computer and use it in GitHub Desktop.
Save tzengyuxio/d54a5f1075862640a59459a5c65d2a6b to your computer and use it in GitHub Desktop.
Fetch all county-level administrative divisions of PRC
#!/usr/bin/env python3
from bs4 import BeautifulSoup
from urllib.request import urlopen
from urllib.parse import urljoin
import io
import json
# Chinese-Wikipedia category page (zh-tw variant) whose links point at the
# per-province administrative-division navbox templates scraped further down.
homepage = "https://zh.wikipedia.org/zh-tw/Category:%E4%B8%AD%E5%8D%8E%E4%BA%BA%E6%B0%91%E5%85%B1%E5%92%8C%E5%9B%BD%E5%90%84%E7%9C%81%E7%BA%A7%E8%A1%8C%E6%94%BF%E5%8C%BA%E8%A1%8C%E6%94%BF%E5%8C%BA%E5%88%92%E6%A8%A1%E6%9D%BF" # noqa
# Parse the page inside a ``with`` block so the HTTP response is closed as
# soon as BeautifulSoup has consumed it instead of being left to the GC.
with urlopen(homepage) as response:
    soup = BeautifulSoup(response, 'html.parser')
# Result accumulator written out as JSON at the end of the script:
# first-level division name -> {second-level name: [third-level names]}.
divisions = {}
def generate_admin2_dict(tr):
    """Build a one-entry mapping for a single navbox table row.

    The row's header cell (``tr.th``) supplies the second-level division
    name and every link inside the row's data cell (``tr.td``) supplies a
    third-level division name.

    Args:
        tr: A table-row element exposing ``th`` and ``td`` children; in this
            script it is a BeautifulSoup ``Tag``.

    Returns:
        ``{admin2_name: [admin3_name, ...]}``, or an empty dict when the row
        has no data cell or no link survives the filtering below.
    """
    # A row without a data cell has nothing to contribute.
    if tr.td is None:
        return {}

    # Rows without a header cell are filed under this placeholder name.
    admin2_name = "(直轄縣級行政區)"
    header = tr.th
    if header is not None:
        # Prefer the linked header text; fall back to the cell's plain text.
        if header.a is not None:
            admin2_name = header.a.get_text()
        else:
            admin2_name = header.get_text()
        # These three header texts are wrapped in parentheses, like the
        # placeholder above (presumably because they are category labels
        # rather than real division names -- confirm against the pages).
        if admin2_name in ('自治縣', '縣', '市轄區'):
            admin2_name = '(' + admin2_name + ')'

    admin3 = []
    for a in tr.td.find_all('a'):
        # First text node among the link's following siblings, if any.
        postfix = a.find_next_sibling(string=True)
        # Links followed by text starting with '*' are skipped. startswith()
        # is used so an empty text node cannot raise IndexError.
        if postfix is not None and postfix.startswith('*'):
            continue
        admin3_name = a.get_text()
        if admin3_name == '無縣級行政區':
            admin3_name = '(無縣級行政區)'
        admin3.append(admin3_name)

    # Rows that yielded no third-level names produce no entry at all.
    return {admin2_name: admin3} if admin3 else {}
# Visit every "Template:" link on the category page, scrape the navbox on the
# linked page, and record its rows under the first-level division's name.
for link in soup.find_all('a'):
    link_text = link.get_text()
    # Keep only template links and skip those whose name starts with '中'
    # (presumably the country-level template -- confirm against the page).
    # A slice is used instead of an index so a bare 'Template:' link text
    # cannot raise IndexError.
    if not link_text.startswith('Template:') or link_text[9:10] == '中':
        continue
    # Resolve the relative href and force the zh-tw language variant.
    page2 = urljoin(homepage, link.get('href')).replace("/wiki/", "/zh-tw/")
    # Close the HTTP response as soon as the page has been parsed.
    with urlopen(page2) as response:
        soup2 = BeautifulSoup(response, 'html.parser')
    # The first-level name is the title of the third link in the navbox title.
    title_cell = soup2.find_all('th', {'class': 'navbox-title'})[0]
    admin1_name = title_cell.find_all('a')[2].get('title')
    admin2 = {}
    # The first group header's link title decides how the page is parsed.
    group_headers = soup2.find_all('th', {'class': 'navbox-group'})
    cate = group_headers[0].a.get('title')
    print(admin1_name + " : " + cate)
    if cate == '地級行政區':
        # Rows live inside nested sub-group tables.
        for table in soup2.find_all('table', {'class': 'navbox-subgroup'}):
            for tr in table.find_all("tr"):
                if tr.td is None:
                    continue
                admin2.update(generate_admin2_dict(tr))
    elif cate == '縣級行政區':
        # Rows are the parents of every group header after the first one.
        for th in group_headers[1:]:
            tr = th.parent
            if tr.td is None:
                continue
            admin2.update(generate_admin2_dict(tr))
    divisions[admin1_name] = admin2
# Dump the collected hierarchy as UTF-8 JSON, keeping non-ASCII names readable.
with io.open('data.text', 'w', encoding='utf8') as outfile:
    json.dump(divisions, outfile, ensure_ascii=False)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment