# -*- coding: utf-8 -*-
from time import sleep
import json
import pickle

import lxml.html

TARGET_URL = 'http://www.kyochari-navi.jp/churin/index.html'
BASE_URL = 'http://www.kyochari-navi.jp/churin/'


def _extract_urls(links):
    # Collect each link's text and href into a list of dicts.
    res = []
    for a in links:
        dic = {'name': a.text, 'url': a.get('href')}
        res.append(dic)
    return res


def get_all_link_list(url):
    # Fetch the page and return every link inside the "ensen_list" block.
    root = lxml.html.parse(url).getroot()
    content = root.get_element_by_id('right_contents')
    a_tags = content.find_class('ensen_list')[0].findall('.//a')
    station_list = _extract_urls(a_tags)
    return station_list


def get_churinjo_detail(churinjo):
    # Scrape one bicycle-parking detail page and map its table rows to fields.
    print("crawling " + churinjo['name'] + " ...\n")
    detail = {'name': churinjo['name'], 'url': BASE_URL + churinjo['url']}
    root = lxml.html.parse(BASE_URL + churinjo['url']).getroot()
    content = root.get_element_by_id('right_contents')
    rows = content.xpath('.//tr')
    for row in rows:
        index = row.find('th').text
        if index == '所在地':  # address
            detail['address'] = row.find('td').text
        elif index == '最寄駅':  # nearest station
            detail['nearest_station'] = row.find('td').text_content()
        elif index == '問合せ先':  # contact (phone)
            detail['tel'] = row.find('td').text
        elif index == '収容台数':  # capacity
            detail['capacity'] = row.find('td').text_content()
        elif index == '利用料金':  # fee
            detail['fee'] = row.find('td').text_content()
        elif index == '月極':  # monthly contract: 有 = yes, 無 = no
            if row.find('td').text == '有':
                detail['monthly'] = True
            elif row.find('td').text == '無':
                detail['monthly'] = False
            else:
                detail['monthly'] = None
        elif index == '利用可能時間':  # business hours
            detail['business_hours'] = row.find('td').text
    return detail


def main():
    churinjo_json = []
    with open('churinjo.json', 'w') as f:
        # Crawl: railway lines -> stations -> parking lots -> detail pages.
        ensen_list = get_all_link_list(TARGET_URL)
        for ensen in ensen_list:
            stations = get_all_link_list(BASE_URL + ensen['url'])
            for station in stations:
                churinjos = get_all_link_list(BASE_URL + station['url'])
                for churinjo in churinjos:
                    churinjo_detail = get_churinjo_detail(churinjo)
                    churinjo_json.append(churinjo_detail)
                    sleep(1)  # be polite: pause between requests
        with open('pickle.dump', 'wb') as pf:
            pickle.dump(churinjo_json, pf)
        json.dump(churinjo_json, f, sort_keys=True,
                  ensure_ascii=False, indent=2)


if __name__ == '__main__':
    main()
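
For reference, a minimal sketch of how the resulting churinjo.json could be read back. This assumes the crawler above has already been run and produced the file with the keys it writes (name, url, address, nearest_station, tel, capacity, fee, monthly, business_hours); the file name read_churinjo.py and the filtering shown are illustrative, not part of the original script.

# read_churinjo.py -- illustrative sketch; assumes churinjo.json was
# produced by the crawler above and sits in the current directory.
import json

with open('churinjo.json', encoding='utf-8') as f:
    churinjo_list = json.load(f)

# Print the name and address of every lot that accepts monthly contracts.
for lot in churinjo_list:
    if lot.get('monthly'):
        print(lot['name'], lot.get('address'))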