# -*- coding: utf-8 -*-
# Crawl the Kyoto bicycle-parking guide (kyochari-navi) and collect the
# detail page of every parking lot into churinjo.json / pickle.dump.
from time import sleep
import json
import pickle

import lxml.html

TARGET_URL = 'http://www.kyochari-navi.jp/churin/index.html'
BASE_URL = 'http://www.kyochari-navi.jp/churin/'


def _extract_urls(links):
    """Convert a list of <a> elements into {'name', 'url'} dicts."""
    res = []
    for a in links:
        dic = {'name': a.text, 'url': a.get('href')}
        res.append(dic)
    return res


def get_all_link_list(url):
    """Return the links listed in the 'ensen_list' block of the page at `url`."""
    root = lxml.html.parse(url).getroot()
    content = root.get_element_by_id('right_contents')
    a_tags = content.find_class('ensen_list')[0].findall('.//a')
    return _extract_urls(a_tags)


def get_churinjo_detail(churinjo):
    """Scrape one parking-lot detail page and return its fields as a dict."""
    print("crawling " + churinjo['name'] + " ...\n")
    detail = {'name': churinjo['name'], 'url': BASE_URL + churinjo['url']}
    root = lxml.html.parse(BASE_URL + churinjo['url']).getroot()
    content = root.get_element_by_id('right_contents')
    rows = content.xpath('.//tr')
    for row in rows:
        index = row.find('th').text
        if index == '所在地':  # address
            detail['address'] = row.find('td').text
        elif index == '最寄駅':  # nearest station
            detail['nearest_station'] = row.find('td').text_content()
        elif index == '問合せ先':  # contact (phone)
            detail['tel'] = row.find('td').text
        elif index == '収容台数':  # capacity
            detail['capacity'] = row.find('td').text_content()
        elif index == '利用料金':  # fee
            detail['fee'] = row.find('td').text_content()
        elif index == '月極':  # monthly contract available?
            if row.find('td').text == '有':
                detail['monthly'] = True
            elif row.find('td').text == '無':
                detail['monthly'] = False
            else:
                detail['monthly'] = None
        elif index == '利用可能時間':  # business hours
            detail['business_hours'] = row.find('td').text
    return detail


def main():
    churinjo_json = []
    with open('churinjo.json', 'w') as f:
        # ensen (railway line) pages -> station pages -> parking-lot pages
        ensen_list = get_all_link_list(TARGET_URL)
        for ensen in ensen_list:
            stations = get_all_link_list(BASE_URL + ensen['url'])
            for station in stations:
                churinjos = get_all_link_list(BASE_URL + station['url'])
                for churinjo in churinjos:
                    churinjo_detail = get_churinjo_detail(churinjo)
                    churinjo_json.append(churinjo_detail)
                    sleep(1)  # be polite to the server between requests
        with open('pickle.dump', 'wb') as pf:
            pickle.dump(churinjo_json, pf)
        json.dump(churinjo_json, f, sort_keys=True,
                  ensure_ascii=False, indent=2)


if __name__ == '__main__':
    main()
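
The crawl leaves two artifacts, churinjo.json and pickle.dump. A minimal sketch of reading them back (file names taken from the script above; the snippet itself is not part of the original gist):

# load_results.py -- hypothetical consumer of the crawler's output
import json
import pickle

with open('churinjo.json') as f:
    churinjo_list = json.load(f)

with open('pickle.dump', 'rb') as pf:
    churinjo_list_pickled = pickle.load(pf)

# e.g. show how many parking lots were collected and one sample entry
print(len(churinjo_list))
print(churinjo_list[0])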