Created
April 2, 2013 22:47
-
-
Save bluele/5296898 to your computer and use it in GitHub Desktop.
Get the TV schedule.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#-*- coding:utf-8 -*- | |
__author__ = 'bluele' | |
from BeautifulSoup import BeautifulSoup as bs | |
import requests | |
import re | |
# Complete example request URL. Its query string spells out the same
# parameters that get_schedule() sends; it is not referenced by any other code
# visible in this file.
url = (
    'http://program.tv.jp.msn.com/tv.php'
    '?site=032&mode=06&category=g&area=013'
    '&template=program&sdate=20130321&lhour=7&shour=05'
)

# Endpoint that get_schedule() requests, without the query string.
host = 'http://program.tv.jp.msn.com/tv.php'
def get_items(soup):
    """Return the elements of *soup* that hold the programmes.

    Args:
        soup: Parsed document exposing ``findAll``.
            NOTE(review): presumably a BeautifulSoup 3 object - confirm.

    Returns:
        Whatever ``soup.findAll`` yields for ``td`` tags whose ``valign``
        attribute is ``top``.
    """
    cell_attrs = {'valign': 'top'}
    return soup.findAll('td', **cell_attrs)
def translate(string):
    """Turn ``&lt;`` / ``&gt;`` entity references into literal angle brackets.

    Only these two entities are converted; every other character, including
    other entity references, is returned unchanged.

    Args:
        string: Text that may contain escaped markup.

    Returns:
        *string* with each ``&lt;`` replaced by ``<`` and each ``&gt;``
        replaced by ``>``.
    """
    # The search strings must be the entity forms: replacing '<' with '<'
    # (and '>' with '>') leaves the text untouched.
    return string.replace('&lt;', '<').replace('&gt;', '>')
# Matches one markup tag: '<', one or more non-'>' characters, then '>'.
# Compiled once here rather than on every strip_tag() call. A plain u''
# literal is used because the pattern has no backslashes and the ur'' prefix
# is not valid syntax on Python 3.
_TAG_PATTERN = re.compile(u'<[^>]+>')


def strip_tag(string):
    """Remove markup tags from *string*.

    The text is first passed through ``translate`` so that entity references
    become tags, then every tag is deleted.

    Args:
        string: Text that may contain tags or entity references.

    Returns:
        The text with every tag removed.
    """
    return _TAG_PATTERN.sub(u'', translate(string))
def parse_item(item):
    """Extract the details of one programme from a schedule cell.

    Args:
        item: One element returned by ``get_items``. It must provide
            ``firstText()``; the result of that call is searched for the
            ``h1``, ``h2`` and ``p`` tags read below.
            NOTE(review): presumably a BeautifulSoup 3 tag - confirm.

    Returns:
        A dict with the keys:
            'channel': broadcasting station (``h2`` text before its last space)
            'time': air time (``h2`` text after its last space)
            'title': ``h1`` text with tags stripped
            'description': text of the first ``p`` with tags stripped
            'casts': list of names taken from the second ``p``; empty when
                that paragraph holds no names

    Raises:
        ValueError: if the ``h2`` text contains no space to split on.
        IndexError: if fewer than two ``p`` tags are found.
    """
    # Get the programme title.
    info = item.firstText()
    title = strip_tag(info.find('h1').text)

    # Like: "NHK 04:30~08:00". Split on the last space only, so a station
    # name that itself contains a space no longer breaks the two-way unpack.
    channel, air_time = info.find('h2').text.rsplit(u' ', 1)

    paragraphs = info.findAll('p')
    description = strip_tag(paragraphs[0].text)
    casts_raw = strip_tag(paragraphs[1].text)

    # Dropping empty pieces covers leading/trailing separators and an empty
    # paragraph, and also stops consecutive separators from producing ''
    # entries that callers would count as a cast name.
    # NOTE(review): the separator shows as an ASCII space here; confirm that
    # the original file does not use a different space character.
    casts = [name for name in casts_raw.split(u' ') if name]

    return {
        'channel': channel,
        'time': air_time,
        'title': title,
        'description': description,
        'casts': casts,
    }
def get_schedule(cache=False):
    """Return the raw HTML of the TV schedule page.

    Args:
        cache: When true, read the saved copy at ``dat/sample.html`` instead
            of contacting the server.

    Returns:
        The page body as read from the file (binary mode) or as given by
        ``response.content``.

    Raises:
        IOError: if ``cache`` is true and the sample file cannot be opened.
        requests.exceptions.RequestException: if the request fails, times
            out, or the server answers with an HTTP error status.
    """
    if cache:  # use the cached copy
        with open('dat/sample.html', 'rb') as f:
            return f.read()

    headers = {}
    params = {
        'site': '032',
        'mode': '06',
        'category': 'g',
        'area': '013',
        'template': 'program',
        'sdate': '20130321',
        'lhour': '7',
        'shour': '05',
    }
    # Without a timeout requests can wait forever on an unresponsive server.
    response = requests.get(
        host,
        headers=headers,
        params=params,
        timeout=10,
    )
    # Fail loudly instead of handing an error page to the parser.
    response.raise_for_status()
    return response.content
def execute():
    """Print each cast name that appears in at least two programmes.

    The schedule is loaded with ``get_schedule(cache=True)``, parsed, and
    every name in each programme's 'casts' list is tallied. Each name whose
    tally is 2 or more is printed on its own line, followed by a space and
    the tally.
    """
    appearances = {}
    soup = bs(get_schedule(cache=True))
    for item in get_items(soup):
        for cast in parse_item(item)['casts']:
            appearances[cast] = appearances.get(cast, 0) + 1

    for name, count in appearances.items():
        if count < 2:
            continue
        # A single pre-formatted string prints the same "name count" line
        # whether print is a statement or a function.
        print('%s %s' % (name, count))
def main():
    """Script entry point: run ``execute``."""
    execute()


# Run only when executed as a script, not when imported.
if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment