# -*- coding: utf-8 -*-
from time import sleep
import json
import pickle

import lxml.html

TARGET_URL = 'http://www.kyochari-navi.jp/churin/index.html'
BASE_URL = 'http://www.kyochari-navi.jp/churin/'


def _extract_urls(links):
    # Collect each link's text and href into a list of dicts.
    res = []
    for a in links:
        dic = {'name': a.text, 'url': a.get('href')}
        res.append(dic)
    return res


def get_all_link_list(url):
    # Fetch the page and return every link inside the "ensen_list" block.
    root = lxml.html.parse(url).getroot()
    content = root.get_element_by_id('right_contents')
    a_tags = content.find_class('ensen_list')[0].findall('.//a')
    station_list = _extract_urls(a_tags)
    return station_list


def get_churinjo_detail(churinjo):
    # Scrape one bicycle-parking detail page and map its table rows to fields.
    print("crawling " + churinjo['name'] + " ...\n")
    detail = {'name': churinjo['name'], 'url': BASE_URL + churinjo['url']}
    root = lxml.html.parse(BASE_URL + churinjo['url']).getroot()
    content = root.get_element_by_id('right_contents')
    rows = content.xpath('.//tr')
    for row in rows:
        index = row.find('th').text
        if index == '所在地':  # address
            detail['address'] = row.find('td').text
        elif index == '最寄駅':  # nearest station
            detail['nearest_station'] = row.find('td').text_content()
        elif index == '問合せ先':  # contact (phone)
            detail['tel'] = row.find('td').text
        elif index == '収容台数':  # capacity
            detail['capacity'] = row.find('td').text_content()
        elif index == '利用料金':  # fee
            detail['fee'] = row.find('td').text_content()
        elif index == '月極':  # monthly contract: 有 = yes, 無 = no
            if row.find('td').text == '有':
                detail['monthly'] = True
            elif row.find('td').text == '無':
                detail['monthly'] = False
            else:
                detail['monthly'] = None
        elif index == '利用可能時間':  # business hours
            detail['business_hours'] = row.find('td').text
    return detail


def main():
    churinjo_json = []
    with open('churinjo.json', 'w') as f:
        # Crawl: railway lines -> stations -> parking lots -> detail pages.
        ensen_list = get_all_link_list(TARGET_URL)
        for ensen in ensen_list:
            stations = get_all_link_list(BASE_URL + ensen['url'])
            for station in stations:
                churinjos = get_all_link_list(BASE_URL + station['url'])
                for churinjo in churinjos:
                    churinjo_detail = get_churinjo_detail(churinjo)
                    churinjo_json.append(churinjo_detail)
                    sleep(1)  # be polite: pause between requests
        with open('pickle.dump', 'wb') as pf:
            pickle.dump(churinjo_json, pf)
        json.dump(churinjo_json, f, sort_keys=True,
                  ensure_ascii=False, indent=2)


if __name__ == '__main__':
    main()
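
For reference, a minimal sketch of how the resulting churinjo.json could be read back. This assumes the crawler above has already been run and produced the file with the keys it writes (name, url, address, nearest_station, tel, capacity, fee, monthly, business_hours); the file name read_churinjo.py and the filtering shown are illustrative, not part of the original script.

# read_churinjo.py -- illustrative sketch; assumes churinjo.json was
# produced by the crawler above and sits in the current directory.
import json

with open('churinjo.json', encoding='utf-8') as f:
    churinjo_list = json.load(f)

# Print the name and address of every lot that accepts monthly contracts.
for lot in churinjo_list:
    if lot.get('monthly'):
        print(lot['name'], lot.get('address'))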