@smellman
Created May 16, 2016 02:50
A script that scrapes the open data catalog from the Chiba City open data site and dumps it to JSON.
aiohttp==0.20.2
beautifulsoup4==4.4.1
cchardet==1.0.0
chardet==2.3.0
wheel==0.24.0
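
These pinned versions date from early 2016: aiohttp 0.20.x predates the modern client API, and cchardet/chardet are the detectors aiohttp falls back on when a response does not declare its character encoding. The script below follows the asyncio idioms of that era (get_event_loop() plus run_until_complete()); running it on a current Python/aiohttp would need the adjustments noted in the comments.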
# coding: utf-8
import asyncio
import aiohttp
import bs4
import urllib.parse  # "import urllib" alone does not expose urllib.parse
import json

ROOT_URL = "http://www.city.chiba.jp"
BASE_URL = ROOT_URL + "/opendata/index.php"
PAGER_PARAM = "?p="


async def fetch_page(client, url):
    # Fetch a URL and return the raw response body.
    async with client.get(url) as response:
        assert response.status == 200
        return await response.read()
# Fetch the first catalog page once to find out how many pages exist.
loop = asyncio.get_event_loop()
client = aiohttp.ClientSession(loop=loop)
content = loop.run_until_complete(fetch_page(client, BASE_URL))
soup = bs4.BeautifulSoup(content, "html.parser")
client.close()  # synchronous in aiohttp 0.20.x; a coroutine in current versions

# Read the highest page number out of the second "next pager" link;
# the p query parameter looks like "<page>_<something>".
link = soup.find_all("p", class_='next pager')[1].find('a').get('href')
qs = urllib.parse.urlparse(link).query
max_pager = int(urllib.parse.parse_qs(qs)['p'][0].split("_")[0])
min_pager = 1
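
# Worked example (hypothetical href, for illustration only):
# if link == "/opendata/index.php?p=12_40", then qs == "p=12_40",
# parse_qs(qs)['p'][0] == "12_40", and max_pager == 12.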
# memo: a row's cells can be reached with
# soup.find_all('table')[0].find_all('tr')[1].find_all('td')


def parse(content):
    # Turn one catalog page into a list of dataset records.
    soup = bs4.BeautifulSoup(content, "html.parser")
    results = []
    trs = soup.find_all('table')[0].find_all('tr')
    for i in range(1, len(trs)):  # skip the header row
        ret = {}
        tds = trs[i].find_all('td')
        ret['title'] = tds[0].string
        ret['description'] = tds[1].string
        # The license cell holds either a Creative Commons badge image
        # or plain text.
        img = tds[2].find('img')
        if img:
            url = img.get('src')
            if "cc-by.png" in url:
                ret['license'] = "CC-BY"
            elif "cc-by-nc.png" in url:
                ret['license'] = "CC-BY-NC"
            elif "cc-by-sa.png" in url:
                ret['license'] = "CC-BY-SA"
            elif "cc-by-nc-sa.png" in url:
                ret['license'] = "CC-BY-NC-SA"
            elif "cc-by-nd.png" in url:
                ret['license'] = "CC-BY-ND"
            elif "cc-by-nc-nd.png" in url:
                ret['license'] = "CC-BY-NC-ND"
            else:
                ret['license'] = ""
        else:
            ret['license'] = tds[2].string
        ret['category'] = tds[3].string
        ret['format'] = tds[4].string
        ret['data_time'] = tds[5].string
        ret['updated_time'] = tds[6].string
        ret['download_link'] = ROOT_URL + tds[7].find('a').get('href')
        ret['data_website'] = ROOT_URL + tds[8].find('a').get('href')
        ret['department'] = tds[9].string
        # Strip surrounding whitespace from string values; empty cells
        # stay None.
        for key in ret:
            if hasattr(ret[key], 'strip'):
                ret[key] = ret[key].strip()
        results.append(ret)
    return results
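
# Each record returned by parse() has this shape (the values here are
# illustrative placeholders, not data from the live site):
# {'title': '...', 'description': '...', 'license': 'CC-BY',
#  'category': '...', 'format': 'CSV', 'data_time': '...',
#  'updated_time': '...', 'download_link': 'http://www.city.chiba.jp/...',
#  'data_website': 'http://www.city.chiba.jp/...', 'department': '...'}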
# Fetch one catalog page and parse it, holding the semaphore so that
# at most five requests run at once.
async def fetch_catalog_site(client, page_num):
    url = BASE_URL + PAGER_PARAM + str(page_num)
    with (await sem):  # Python 3.5-era idiom; use "async with sem:" on modern Python
        content = await fetch_page(client, url)
        return parse(content)


sem = asyncio.Semaphore(5)
loop = asyncio.get_event_loop()
client = aiohttp.ClientSession(loop=loop)
f = asyncio.wait([fetch_catalog_site(client, i)
                  for i in range(min_pager, max_pager + 1)])
results = loop.run_until_complete(f)
client.close()
# asyncio.wait() returns a (done, pending) pair; flatten the rows
# from every completed task into a single list.
values = []
for task in results[0]:
    for value in task.result():
        values.append(value)

with open("../data/chibacity_opendata_catalog.json", "w") as fp:
    fp.write(json.dumps(values, indent=4))
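
Two practical notes: open() does not create intermediate directories, so the relative ../data directory must already exist when the script runs; creating it up front (for example with os.makedirs("../data", exist_ok=True), assuming that path layout) avoids a FileNotFoundError. Also, json.dumps() escapes non-ASCII by default, so the Japanese text is written as \uXXXX sequences; passing ensure_ascii=False would keep it human-readable in the output file.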