maowug · December 16, 2015 14:39
diff --git a/dict3_NHK_enews.py b/dict3_NHK_enews.py
 #!/usr/bin/env python
 #encoding: utf-8

 import urllib
 from bs4 import BeautifulSoup as bs
 import cPickle


 def getDay(url):
     """
     Fetch one article page and return the markup of its two text tabs.

     :param url: article page URL, e.g.
         http://cgi2.nhk.or.jp/e-news/news/index.cgi?ymd=20090831
     :return: tuple ``(contentE, contentJ)`` of unicode strings: the
         serialized children of ``<div id="tab_1">`` and ``<div id="tab_2">``
         (presumably the English and Japanese text -- confirm against the page)
     :raises AttributeError: if either div is missing from the page
     """
     response = urllib.urlopen(url)
     try:
         html = response.read()
     finally:
         # Release the connection even if read() fails.
         response.close()
     # NOTE(review): no parser is named, so bs4 picks one from whatever is
     # installed; consider pinning it so results do not vary by machine.
     bsDay = bs(html)
     raw_tab_1 = bsDay.find('div', id='tab_1')
     contentE = u''.join(unicode(elem) for elem in raw_tab_1.contents)
     raw_tab_2 = bsDay.find('div', id='tab_2')
     contentJ = u''.join(unicode(elem) for elem in raw_tab_2.contents)
     return contentE, contentJ


 def getMonth(url):
     """
     Scrape a monthly archive page together with every article it links to.

     :param url: archive page URL, e.g.
         http://cgi2.nhk.or.jp/e-news/news/archive.cgi?ym=201002
     :return: list with one dict per ``<table class="tbl_data">`` entry, in
         page order, with keys ``date``, ``titleE``, ``titleJ``, ``catList``,
         ``contentE`` and ``contentJ``
     :raises AttributeError: if an entry lacks one of the expected elements
     """
     response = urllib.urlopen(url)
     try:
         html = response.read()
     finally:
         # Release the connection even if read() fails.
         response.close()
     bsMonth = bs(html)
     monthList = []
     for entry in bsMonth.find_all('table', class_='tbl_data'):
         date = unicode(entry.find('strong').get_text())
         cat_raw = unicode(entry.find('div', class_='fl_rgt').get_text())
         # split() with no argument never yields empty tokens, so only the
         # label itself has to be filtered out.
         # NOTE(review): the label filter runs before ':' is stripped, so a
         # token like u'カテゴリ:' would survive as u'カテゴリ' -- confirm
         # against the page markup.
         catList = [c.replace(u':', u'') for c in cat_raw.split()
                    if c != u'カテゴリ']
         # The heading supplies both titles and the article link; look it up
         # once instead of three times.
         heading = entry.find('h4', class_='archives')
         titleJ = unicode(heading.get_text())
         # unicode() makes a plain-string copy of whatever node follows the
         # heading.
         titleE = unicode(heading.next_sibling)
         # Drop the first character of the href before appending it to the
         # site base path.
         urlDay = (r'http://cgi2.nhk.or.jp/e-news/news' +
                   heading.find('a')['href'][1:])
         contentE, contentJ = getDay(urlDay)
         monthList.append(dict(
             date=date,
             titleE=titleE,
             titleJ=titleJ,
             catList=catList,
             contentE=contentE,
             contentJ=contentJ,
         ))
     return monthList


 #from 200903 to now
 from datetime import datetime
 from dateutil.rrule import rrule, DAILY, MONTHLY
 # Collect every archive month from March 2009 through the current month
 # into one flat list of article dicts.
 yearList = []
 dayStart = datetime(2009, 3, 1)
 dayEnd = datetime.now()
 for i in rrule(MONTHLY, dtstart=dayStart, until=dayEnd):
     # The archive endpoint takes the month as YYYYMM.
     urlMonth = (r'http://cgi2.nhk.or.jp/e-news/news/archive.cgi?ym=' +
                 i.strftime('%Y%m'))
     # Progress line; this form prints the same text on Python 2 and 3.
     print('%s %s' % (i.strftime('%Y-%m'), urlMonth))
     monthList = getMonth(urlMonth)
     yearList += monthList

 # Persist the whole result with cPickle; the with-block closes the file
 # even if dump() raises.
 with open(u'dict3_NHK_enews.data', 'wb') as data_NHK:
     cPickle.dump(yearList, data_NHK)


 # testDayURL=r'http://cgi2.nhk.or.jp/e-news/news/index.cgi?ymd=20090831'
 # getDay(testDayURL)
 #
 # testMonthURL=r'http://cgi2.nhk.or.jp/e-news/news/archive.cgi?ym=201303'
 # getMonth(testMonthURL)
	#!/usr/bin/env python
	#encoding: utf-8

	import urllib
	from bs4 import BeautifulSoup as bs
	import cPickle


	def getDay(url):
		"""
		Fetch one article page and return the markup of its two text tabs.

		:param url: article page URL, e.g.
		    http://cgi2.nhk.or.jp/e-news/news/index.cgi?ymd=20090831
		:return: tuple ``(contentE, contentJ)`` of unicode strings: the
		    serialized children of ``<div id="tab_1">`` and ``<div id="tab_2">``
		    (presumably the English and Japanese text -- confirm against the page)
		:raises AttributeError: if either div is missing from the page
		"""
		response = urllib.urlopen(url)
		try:
			html = response.read()
		finally:
			# Release the connection even if read() fails.
			response.close()
		# NOTE(review): no parser is named, so bs4 picks one from whatever is
		# installed; consider pinning it so results do not vary by machine.
		bsDay = bs(html)
		raw_tab_1 = bsDay.find('div', id='tab_1')
		contentE = u''.join(unicode(elem) for elem in raw_tab_1.contents)
		raw_tab_2 = bsDay.find('div', id='tab_2')
		contentJ = u''.join(unicode(elem) for elem in raw_tab_2.contents)
		return contentE, contentJ


	def getMonth(url):
		"""
		Scrape a monthly archive page together with every article it links to.

		:param url: archive page URL, e.g.
		    http://cgi2.nhk.or.jp/e-news/news/archive.cgi?ym=201002
		:return: list with one dict per ``<table class="tbl_data">`` entry, in
		    page order, with keys ``date``, ``titleE``, ``titleJ``, ``catList``,
		    ``contentE`` and ``contentJ``
		:raises AttributeError: if an entry lacks one of the expected elements
		"""
		response = urllib.urlopen(url)
		try:
			html = response.read()
		finally:
			# Release the connection even if read() fails.
			response.close()
		bsMonth = bs(html)
		monthList = []
		for entry in bsMonth.find_all('table', class_='tbl_data'):
			date = unicode(entry.find('strong').get_text())
			cat_raw = unicode(entry.find('div', class_='fl_rgt').get_text())
			# split() with no argument never yields empty tokens, so only the
			# label itself has to be filtered out.
			# NOTE(review): the label filter runs before ':' is stripped, so a
			# token like u'カテゴリ:' would survive as u'カテゴリ' -- confirm
			# against the page markup.
			catList = [c.replace(u':', u'') for c in cat_raw.split()
				if c != u'カテゴリ']
			# The heading supplies both titles and the article link; look it
			# up once instead of three times.
			heading = entry.find('h4', class_='archives')
			titleJ = unicode(heading.get_text())
			# unicode() makes a plain-string copy of whatever node follows the
			# heading.
			titleE = unicode(heading.next_sibling)
			# Drop the first character of the href before appending it to the
			# site base path.
			urlDay = (r'http://cgi2.nhk.or.jp/e-news/news' +
				heading.find('a')['href'][1:])
			contentE, contentJ = getDay(urlDay)
			monthList.append(dict(
				date=date,
				titleE=titleE,
				titleJ=titleJ,
				catList=catList,
				contentE=contentE,
				contentJ=contentJ,
			))
		return monthList


	#from 200903 to now
	from datetime import datetime
	from dateutil.rrule import rrule, DAILY, MONTHLY
	# Collect every archive month from March 2009 through the current month
	# into one flat list of article dicts.
	yearList = []
	dayStart = datetime(2009, 3, 1)
	dayEnd = datetime.now()
	for i in rrule(MONTHLY, dtstart=dayStart, until=dayEnd):
		# The archive endpoint takes the month as YYYYMM.
		urlMonth = (r'http://cgi2.nhk.or.jp/e-news/news/archive.cgi?ym=' +
			i.strftime('%Y%m'))
		# Progress line; this form prints the same text on Python 2 and 3.
		print('%s %s' % (i.strftime('%Y-%m'), urlMonth))
		monthList = getMonth(urlMonth)
		yearList += monthList

	# Persist the whole result with cPickle; the with-block closes the file
	# even if dump() raises.
	with open(u'dict3_NHK_enews.data', 'wb') as data_NHK:
		cPickle.dump(yearList, data_NHK)


	# testDayURL=r'http://cgi2.nhk.or.jp/e-news/news/index.cgi?ymd=20090831'
	# getDay(testDayURL)
	#
	# testMonthURL=r'http://cgi2.nhk.or.jp/e-news/news/archive.cgi?ym=201303'
	# getMonth(testMonthURL)