Skip to content

Instantly share code, notes, and snippets.

@maowug
Last active December 16, 2015 14:39
Show Gist options
  • Save maowug/5450387 to your computer and use it in GitHub Desktop.
Save maowug/5450387 to your computer and use it in GitHub Desktop.
dict3 nhk enews
#!/usr/bin/env python
#encoding: utf-8
import urllib
from bs4 import BeautifulSoup as bs
import cPickle
def getDay(url):
    """
    Fetch one day page and extract the markup of its two content tabs.

    The page is downloaded with urllib, parsed with BeautifulSoup, and the
    children of ``<div id="tab_1">`` and ``<div id="tab_2">`` are each
    serialised back to unicode markup and concatenated.

    :param url: day page URL, e.g.
        http://cgi2.nhk.or.jp/e-news/news/index.cgi?ymd=20090831
    :return: tuple ``(contentE, contentJ)`` of unicode strings: the inner
        markup of ``tab_1`` and of ``tab_2``. Going by the variable names
        these are presumably the English and Japanese texts -- TODO confirm
        against the live page.
    :raises AttributeError: if either div is not found on the page
        (``find`` then returns None, which has no ``contents``)
    """
    response = urllib.urlopen(url)
    try:
        html = response.read()
    finally:
        # Close explicitly: the object returned by Python 2's
        # urllib.urlopen() cannot be used in a ``with`` statement.
        response.close()

    bsDay = bs(html)

    raw_tab_1 = bsDay.find('div', id='tab_1')
    contentE = u''.join(unicode(elem) for elem in raw_tab_1.contents)

    raw_tab_2 = bsDay.find('div', id='tab_2')
    contentJ = u''.join(unicode(elem) for elem in raw_tab_2.contents)

    return contentE, contentJ
def getMonth(url):
    """
    Fetch a monthly archive page and collect every entry listed on it.

    Each ``<table class="tbl_data">`` on the page is treated as one entry.
    For every entry the linked day page is downloaded through getDay(), so
    one call performs one HTTP request per entry plus one for the archive.

    :param url: archive page URL, e.g.
        http://cgi2.nhk.or.jp/e-news/news/archive.cgi?ym=201002
    :return: list of dicts, one per entry in page order, with the keys
        ``date``, ``titleE``, ``titleJ``, ``catList``, ``contentE`` and
        ``contentJ``
    :raises AttributeError: if an entry lacks one of the expected elements
    :raises TypeError: if an entry's heading contains no ``<a>`` link
    """
    response = urllib.urlopen(url)
    try:
        html = response.read()
    finally:
        # Close explicitly: the object returned by Python 2's
        # urllib.urlopen() cannot be used in a ``with`` statement.
        response.close()

    bsMonth = bs(html)

    monthList = []
    for entry in bsMonth.find_all('table', class_='tbl_data'):
        date = unicode(entry.find('strong').get_text())

        cat_raw = unicode(entry.find('div', class_='fl_rgt').get_text())
        # split() without arguments never yields empty tokens, so only the
        # label token has to be filtered out.
        # NOTE(review): the filter runs before the colon is stripped, so a
        # token such as u'カテゴリ:' would pass through as u'カテゴリ' --
        # verify against the real page text.
        catList = [c.replace(u':', u'')
                   for c in cat_raw.split()
                   if c != u'カテゴリ']

        # The heading supplies both titles and the link; look it up once.
        heading = entry.find('h4', class_='archives')
        titleJ = unicode(heading.get_text())
        # The second title is whatever node directly follows the heading;
        # if there is none this becomes the string u'None'.
        titleE = unicode(heading.next_sibling)

        # The first character of the href is dropped before it is appended
        # to the site prefix -- presumably the leading '.' of a relative
        # link such as './index.cgi?...'; TODO confirm.
        urlDay = (r'http://cgi2.nhk.or.jp/e-news/news' +
                  heading.find('a')['href'][1:])

        contentE, contentJ = getDay(urlDay)

        monthList.append({
            'date': date,
            'titleE': titleE,
            'titleJ': titleJ,
            'catList': catList,
            'contentE': contentE,
            'contentJ': contentJ,
        })

    return monthList
#from 200903 to now
from datetime import datetime
from dateutil.rrule import rrule, DAILY, MONTHLY
# Crawl every monthly archive from March 2009 up to the current month and
# accumulate all entries in one flat list (despite its name, yearList spans
# the whole period, not a single year).
yearList = []
dayStart = datetime(2009, 3, 1)
dayEnd = datetime.now()
for i in rrule(MONTHLY, dtstart=dayStart, until=dayEnd):
    # The archive page is addressed by a YYYYMM query parameter.
    urlMonth = (r'http://cgi2.nhk.or.jp/e-news/news/archive.cgi?ym=' +
                i.strftime('%Y%m'))
    # Progress line; a single parenthesised argument prints identically
    # under the Python 2 print statement.
    print('%s %s' % (i.strftime('%Y-%m'), urlMonth))
    monthList = getMonth(urlMonth)
    yearList += monthList

# Persist the collected entries with cPickle. The ``with`` block makes sure
# the file is closed even if dump() raises.
with open(u'dict3_NHK_enews.data', 'wb') as data_NHK:
    cPickle.dump(yearList, data_NHK)
# testDayURL=r'http://cgi2.nhk.or.jp/e-news/news/index.cgi?ymd=20090831'
# getDay(testDayURL)
#
# testMonthURL=r'http://cgi2.nhk.or.jp/e-news/news/archive.cgi?ym=201303'
# getMonth(testMonthURL)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment