bluele · April 2, 2013 22:47
diff --git a/tvschedule.py b/tvschedule.py
 #-*- coding:utf-8 -*-
 __author__ = 'bluele'

 from BeautifulSoup import BeautifulSoup as bs
 import requests
 import re

 # Complete listing URL for one fixed date/area. Nothing in the visible code
 # reads it; get_schedule() builds the same query from `host` plus parameters.
 url = 'http://program.tv.jp.msn.com/tv.php?site=032&mode=06&category=g&area=013&template=program&sdate=20130321&lhour=7&shour=05'
 # Endpoint requested by get_schedule().
 host = 'http://program.tv.jp.msn.com/tv.php'


 def get_items(soup):
    """Return the ``td`` elements with ``valign="top"`` found in ``soup``.

    These are the table cells that hold the programme entries.
    """
    programme_cells = soup.findAll('td', valign='top')
    return programme_cells

 def translate(string):
    """Turn the ``&lt;`` / ``&gt;`` entity references back into ``<`` and ``>``.

    Only these two entities are handled; everything else is left as is.
    """
    # Same order as before: all "&lt;" first, then all "&gt;".
    for entity, bracket in (('&lt;', '<'), ('&gt;', '>')):
        string = string.replace(entity, bracket)
    return string

 def strip_tag(string):
    """Return ``string`` with all markup tags removed.

    Escaped brackets are first converted back to real ``<`` / ``>`` by
    ``translate`` so that escaped tags are removed as well.
    """
    decoded = translate(string)
    # Anything between "<" and the next ">" counts as a tag.
    return re.sub(u'<[^>]+>', u'', decoded)

 def parse_item(item):
    """Extract the fields of one programme entry.

    The node returned by ``item.firstText()`` is searched for an ``h1``
    (title), an ``h2`` (channel and air time separated by one ideographic
    space) and at least two ``p`` elements (description, then cast list).

    @return {
        'channel': broadcasting station
        'time': air time
        'title': programme title with tags removed
        'description': programme description with tags removed
        'casts': list of cast names (empty list when none are given)
    }

    Raises ValueError when the ``h2`` text does not split into exactly
    two parts, and IndexError when fewer than two ``p`` elements exist.
    """
    info = item.firstText()
    # Programme title.
    title = strip_tag(info.find('h1').text)
    # Like: "NHK　04:30～08:00"
    channel, air_time = info.find('h2').text.split(u'　')
    paragraphs = info.findAll('p')
    description = strip_tag(paragraphs[0].text)
    casts_raw = strip_tag(paragraphs[1].text)
    # Names are separated by two spaces. A longer run of spaces would yield
    # empty or space-padded fragments, so trim each one and drop the empties
    # instead of reporting them as cast members.
    fragments = casts_raw.strip(u'　').strip(u' ').split(u'  ')
    casts = [name.strip() for name in fragments if name.strip()]

    return {
        'channel': channel,
        'time': air_time,
        'title': title,
        'description': description,
        'casts': casts
    }

 def get_schedule(cache=False):
    """Return the raw content of the programme listing page.

    When ``cache`` is true the bytes of the local file
    ``dat/sample.html`` are returned and no request is made. Otherwise
    the page is requested from ``host`` with a fixed set of query
    parameters and the response body is returned.
    """
    if not cache:
        query = {
            'site': '032',
            'mode': '06',
            'category': 'g',
            'area': '013',
            'template': 'program',
            'sdate': '20130321',
            'lhour': '7',
            'shour': '05'
        }
        return requests.get(host, headers={}, params=query).content

    # Use the locally stored copy.
    with open('dat/sample.html', 'rb') as sample:
        return sample.read()

 def execute():
    countd = dict()
    soup = bs(get_schedule(cache=True))
    for item in get_items(soup):
        info = parse_item(item)
        # print info['channel'] ,info['title'] , info['channel'], info['time'], info['description']
        # print info['title'], u','.join(info['casts'])
        for cast in info['casts']:
            countd.setdefault(cast, 0)
            countd[cast] += 1

    # print countd
    for k, v in countd.iteritems():
        if v >= 2:
            print k, v

 def main():
    """Script entry point: run ``execute``."""
    execute()

 # Run only when executed as a script, not when imported as a module.
 if __name__ == '__main__':
    main()
	#-*- coding:utf-8 -*-
	__author__ = 'bluele'

	from BeautifulSoup import BeautifulSoup as bs
	import requests
	import re

	# Complete listing URL for one fixed date/area. Nothing in the visible code
	# reads it; get_schedule() builds the same query from `host` plus parameters.
	url = 'http://program.tv.jp.msn.com/tv.php?site=032&mode=06&category=g&area=013&template=program&sdate=20130321&lhour=7&shour=05'
	# Endpoint requested by get_schedule().
	host = 'http://program.tv.jp.msn.com/tv.php'


	def get_items(soup):
	    """Return the ``td`` elements with ``valign="top"`` found in ``soup``.

	    These are the table cells that hold the programme entries.
	    """
	    return soup.findAll('td', valign='top')

	def translate(string):
	    """Turn the ``&lt;`` / ``&gt;`` entity references back into ``<`` and ``>``.

	    Only these two entities are handled; everything else is left as is.
	    """
	    # The search strings must be the escaped forms; replacing '<' with '<'
	    # would leave the input unchanged.
	    _string = string.replace('&lt;', '<')
	    return _string.replace('&gt;', '>')

	def strip_tag(string):
	    """Return ``string`` with all markup tags removed.

	    Escaped brackets are first converted back to real ``<`` / ``>`` by
	    ``translate`` so that escaped tags are removed as well.
	    """
	    # Anything between "<" and the next ">" counts as a tag.
	    pat = re.compile(u'<[^>]+>')
	    return pat.sub(u'', translate(string))

	def parse_item(item):
	    """Extract the fields of one programme entry.

	    The node returned by ``item.firstText()`` is searched for an ``h1``
	    (title), an ``h2`` (channel and air time separated by one ideographic
	    space) and at least two ``p`` elements (description, then cast list).

	    @return {
	        'channel': broadcasting station
	        'time': air time
	        'title': programme title with tags removed
	        'description': programme description with tags removed
	        'casts': list of cast names (empty list when none are given)
	    }

	    Raises ValueError when the ``h2`` text does not split into exactly
	    two parts, and IndexError when fewer than two ``p`` elements exist.
	    """
	    info = item.firstText()
	    # Programme title.
	    title = strip_tag(info.find('h1').text)
	    # Like: "NHK　04:30～08:00"
	    channel, air_time = info.find('h2').text.split(u'　')
	    paragraphs = info.findAll('p')
	    description = strip_tag(paragraphs[0].text)
	    casts_raw = strip_tag(paragraphs[1].text)
	    # Names are separated by TWO spaces (a single space would split a name
	    # that itself contains a space). Trim each fragment and drop the empty
	    # ones so runs of spaces do not produce bogus cast members.
	    fragments = casts_raw.strip(u'　').strip(u' ').split(u'  ')
	    casts = [name.strip() for name in fragments if name.strip()]

	    return {
	        'channel': channel,
	        'time': air_time,
	        'title': title,
	        'description': description,
	        'casts': casts
	    }

	def get_schedule(cache=False):
	    """Return the raw content of the programme listing page.

	    When ``cache`` is true the bytes of the local file
	    ``dat/sample.html`` are returned and no request is made. Otherwise
	    the page is requested from ``host`` with a fixed set of query
	    parameters and the response body is returned.
	    """
	    if cache:  # use the locally stored copy
	        with open('dat/sample.html', 'rb') as f:
	            return f.read()
	    headers = {}
	    params = {
	        'site': '032',
	        'mode': '06',
	        'category': 'g',
	        'area': '013',
	        'template': 'program',
	        'sdate': '20130321',
	        'lhour': '7',
	        'shour': '05'
	    }

	    response = requests.get(
	        host,
	        headers=headers,
	        params=params
	    )
	    return response.content

	def execute():
	countd = dict()
	soup = bs(get_schedule(cache=True))
	for item in get_items(soup):
	info = parse_item(item)
	# print info['channel'] ,info['title'] , info['channel'], info['time'], info['description']
	# print info['title'], u','.join(info['casts'])
	for cast in info['casts']:
	countd.setdefault(cast, 0)
	countd[cast] += 1

	# print countd
	for k, v in countd.iteritems():
	if v >= 2:
	print k, v

	def main():
	    """Script entry point: run ``execute``."""
	    execute()

	# Run only when executed as a script, not when imported as a module.
	if __name__ == '__main__':
	    main()