A script that pulls the open data listing from the City of Chiba open data site and dumps it to JSON. It reads the page count from the catalog's pager, fetches every catalog page concurrently with aiohttp (at most five requests in flight), parses the table on each page, and writes all entries out as a single JSON file.
aiohttp==0.20.2
beautifulsoup4==4.4.1
cchardet==1.0.0
chardet==2.3.0
wheel==0.24.0
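A minimal way to run it, assuming the pinned list above is saved as requirements.txt and the script below as chiba_opendata.py (both file names are assumptions; the gist does not record them), on a Python 3.5-era interpreter matching these 2016 package versions:

pip install -r requirements.txt
python chiba_opendata.py

The script writes its output to ../data/chibacity_opendata_catalog.json, so a data directory must already exist one level above the script.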
# coding: utf-8
import asyncio
import aiohttp
import bs4
import urllib.parse
import json

ROOT_URL = "http://www.city.chiba.jp"
BASE_URL = ROOT_URL + "/opendata/index.php"
PAGER_PARAM = "?p="


async def fetch_page(client, url):
    async with client.get(url) as response:
        assert response.status == 200
        return await response.read()

# Fetch the first catalog page to discover how many pages exist.
loop = asyncio.get_event_loop()
client = aiohttp.ClientSession(loop=loop)
content = loop.run_until_complete(fetch_page(client, BASE_URL))
soup = bs4.BeautifulSoup(content, "html.parser")
client.close()

# Read the highest page number from the pager: the second 'next pager'
# element links to the last page, and its 'p' query value carries the
# page number before an underscore.
link = soup.find_all("p", class_='next pager')[1].find('a').get('href')
qs = urllib.parse.urlparse(link).query
max_pager = int(urllib.parse.parse_qs(qs)['p'][0].split("_")[0])
min_pager = 1

# memo: a row's cells can be inspected with
# soup.find_all('table')[0].find_all('tr')[1].find_all('td')


def parse(content):
    """Parse one catalog page and return its table rows as dicts."""
    soup = bs4.BeautifulSoup(content, "html.parser")
    results = []
    trs = soup.find_all('table')[0].find_all('tr')
    for tr in trs[1:]:  # skip the header row
        ret = {}
        tds = tr.find_all('td')
        ret['title'] = tds[0].string
        ret['description'] = tds[1].string
        # The license cell is either a Creative Commons badge image or
        # plain text.
        img = tds[2].find('img')
        if img:
            url = img.get('src')
            if "cc-by.png" in url:
                ret['license'] = "CC-BY"
            elif "cc-by-nc.png" in url:
                ret['license'] = "CC-BY-NC"
            elif "cc-by-sa.png" in url:
                ret['license'] = "CC-BY-SA"
            elif "cc-by-nc-sa.png" in url:
                ret['license'] = "CC-BY-NC-SA"
            elif "cc-by-nd.png" in url:
                ret['license'] = "CC-BY-ND"
            elif "cc-by-nc-nd.png" in url:
                ret['license'] = "CC-BY-NC-ND"
            else:
                ret['license'] = ""
        else:
            ret['license'] = tds[2].string
        ret['category'] = tds[3].string
        ret['format'] = tds[4].string
        ret['data_time'] = tds[5].string
        ret['updated_time'] = tds[6].string
        ret['download_link'] = ROOT_URL + tds[7].find('a').get('href')
        ret['data_website'] = ROOT_URL + tds[8].find('a').get('href')
        ret['department'] = tds[9].string
        # Strip surrounding whitespace from every string value.
        for key in ret:
            if hasattr(ret[key], 'strip'):
                ret[key] = ret[key].strip()
        results.append(ret)
    return results


# Fetch and parse one catalog page; the semaphore keeps at most five
# requests in flight at a time.
async def fetch_catalog_site(client, page_num):
    url = BASE_URL + PAGER_PARAM + str(page_num)
    async with sem:
        content = await fetch_page(client, url)
        return parse(content)

sem = asyncio.Semaphore(5)
loop = asyncio.get_event_loop()
client = aiohttp.ClientSession(loop=loop)
f = asyncio.wait([fetch_catalog_site(client, i)
                  for i in range(min_pager, max_pager + 1)])
done, pending = loop.run_until_complete(f)
client.close()

# Flatten the per-page results into a single list (asyncio.wait does
# not preserve page order) and write them out as JSON.
values = []
for task in done:
    values.extend(task.result())

with open("../data/chibacity_opendata_catalog.json", 'w') as fp:
    fp.write(json.dumps(values, indent=4))
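For reference, each element of the resulting JSON array has the shape produced by parse(); the values below are illustrative placeholders, not real catalog rows:

[
    {
        "title": "...",
        "description": "...",
        "license": "CC-BY",
        "category": "...",
        "format": "...",
        "data_time": "...",
        "updated_time": "...",
        "download_link": "http://www.city.chiba.jp/...",
        "data_website": "http://www.city.chiba.jp/...",
        "department": "..."
    }
]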