billy3321 · February 15, 2014 13:19
diff --git a/get_metadata.py b/get_metadata.py
 #!/usr/local/bin/python
 # -*- coding: utf-8 -*-

 import urllib, urllib2
 import json
 import cookielib
 from BeautifulSoup import BeautifulSoup, SoupStrainer

 # Root of the IVOD site.
 base_url = 'http://ivod.ly.gov.tw/'
 # Endpoint queried by get_date_list() for a committee's meeting dates.
 committee_url = 'http://ivod.ly.gov.tw/Committee/CommsDate'
 
 # Committee display name -> numeric id sent to the site as the `comtid`
 # form field.
 # NOTE(review): the Transportation and Judiciary entries both map to 9.
 # One of the two is presumably a typo -- confirm against the site before
 # relying on either.
 committee = {
     u'院會': 19,               # plenary sitting
     u'內政': 1,                # internal administration
     u'外交及國防': 17,         # foreign affairs and national defense
     u'經濟': 5,                # economics
     u'財政': 6,                # finance
     u'教育及文化': 8,          # education and culture
     u'交通': 9,                # transportation
     u'司法及法制': 9,          # judiciary and organic laws
     u'社會福利及衛生環境': 12,  # social welfare and environmental hygiene
     u'程序': 13,               # procedure
     u'紀律': 23,               # discipline
 }

 def init_cookie():
     """Install a cookie-keeping urllib2 opener globally, then prime it.

     A fresh CookieJar is wrapped in an HTTPCookieProcessor and installed
     with urllib2.install_opener(), so later urllib2.urlopen() calls go
     through it. reset_cookie() is then called to make a first request.
     """
     jar = cookielib.CookieJar()
     cookie_handler = urllib2.HTTPCookieProcessor(jar)
     urllib2.install_opener(urllib2.build_opener(cookie_handler))
     reset_cookie()

 def reset_cookie():
     """Request the site's front page and discard the body.

     The request goes through whichever opener is currently installed in
     urllib2; when that opener carries an HTTPCookieProcessor (see
     init_cookie), cookies set by the server are stored in its jar.

     Raises:
         urllib2.URLError: propagated from urllib2.urlopen().
     """
     http_header = {'User-Agent': 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)', 'Host': 'ivod.ly.gov.tw'}
     req = urllib2.Request('http://ivod.ly.gov.tw/', None, http_header)
     web = urllib2.urlopen(req)
     try:
         # The page content is not needed; read it only to drain the reply.
         web.read()
     finally:
         # Always release the connection, even if the read fails.
         web.close()

 def get_date_list(comt):
     """Fetch the meeting dates available for one committee.

     Sends `comtid=<comt>` as form data to `committee_url` (a POST, since
     the request carries a body) and parses the reply as JSON.

     Args:
         comt: committee id placed in the `comtid` form field.

     Returns:
         A list holding the 'METDAT' value of every entry under the reply's
         'mdate' key, or False when the HTTP status is not 200.

     Raises:
         urllib2.URLError: propagated from urllib2.urlopen().
         ValueError: if the reply body is not valid JSON.
         KeyError: if the reply lacks 'mdate' or an entry lacks 'METDAT'.
     """
     http_header = {'Referer': 'http://ivod.ly.gov.tw/Committee',
         'Accept': '*/*',
         'User-Agent': 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)',
         'Host': 'ivod.ly.gov.tw',
         'Connection': 'keep-alive',
         'X-Requested-With': 'XMLHttpRequest',
         'Pragma': 'no-cache'}
     req = urllib2.Request(committee_url, urllib.urlencode({'comtid': comt}), http_header)
     web = urllib2.urlopen(req)
     try:
         if web.getcode() != 200:
             return False
         result = json.loads(web.read())
     finally:
         # Release the connection on every path, including the early return.
         web.close()
     return [entry['METDAT'] for entry in result['mdate']]

 def get_movie_by_date(comit, date, page):
     """Fetch one page of recordings for a committee on a given date.

     Sends `comtid`, `date` and `page` as form data to the site's
     /Committee/MovieByDate endpoint and parses the reply as JSON.

     Args:
         comit: committee id placed in the `comtid` form field.
         date: date value placed in the `date` form field (main() passes
             the values returned by get_date_list()).
         page: page number placed in the `page` form field.

     Returns:
         The decoded JSON reply (main() reads its 'full' and 'result'
         entries), or False when the HTTP status is not 200.

     Raises:
         urllib2.URLError: propagated from urllib2.urlopen().
         ValueError: if the reply body is not valid JSON.
     """
     http_header = {'Referer': 'http://ivod.ly.gov.tw/Committee',
         'Accept': '*/*',
         'User-Agent': 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)',
         'Host': 'ivod.ly.gov.tw',
         'Connection': 'keep-alive',
         'X-Requested-With': 'XMLHttpRequest',
         'Pragma': 'no-cache'}
     req = urllib2.Request('http://ivod.ly.gov.tw/Committee/MovieByDate', urllib.urlencode({'comtid': comit, 'date': date, 'page': page}), http_header)
     web = urllib2.urlopen(req)
     try:
         if web.getcode() != 200:
             return False
         return json.loads(web.read())
     finally:
         # Release the connection on every path.
         web.close()

 def get_movie_url(wzs_id):
     """Scrape the player argument for one recording from its playback page.

     Fetches /Play/Full/<wzs_id>/1M, finds the <script> inside
     <div class="movie_large">, and strips the surrounding
     readyPlayer('http://ivod.ly.gov.tw/public/scripts/',' ... '); call
     text from it. The remaining text is printed and returned.

     Args:
         wzs_id: recording id interpolated into the playback URL.

     Returns:
         The stripped script text, or None when the HTTP status is not 200,
         the div is absent, or the div contains no <script> element.

     Raises:
         urllib2.URLError: propagated from urllib2.urlopen().
     """
     http_header = {'Referer': 'http://ivod.ly.gov.tw/Committee',
         'User-Agent': 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)',
         'Host': 'ivod.ly.gov.tw',
         'Connection': 'keep-alive'}
     url = 'http://ivod.ly.gov.tw/Play/Full/%s/1M' % wzs_id
     req = urllib2.Request(url, None, http_header)
     web = urllib2.urlopen(req)
     try:
         if web.getcode() != 200:
             return None
         html_result = web.read()
     finally:
         # Release the connection on every path.
         web.close()
     xml = BeautifulSoup(html_result)
     div_movie_large = xml.find('div', {'class': 'movie_large'})
     if not div_movie_large:
         return None
     script_tag = div_movie_large.find('script')
     if script_tag is None:
         # find() returns None when nothing matches; reading .text from it
         # would raise AttributeError.
         return None
     script_text = script_tag.text
     script_text = script_text.replace("readyPlayer('http://ivod.ly.gov.tw/public/scripts/','", '')
     script_text = script_text.replace("');", '')
     # Parenthesised single-argument print is identical under Python 2.
     print(script_text)
     return script_text

 def main():
     """Walk every committee and meeting date and print the recordings found.

     For each id in `committee`, the dates from get_date_list() are visited
     and page 1 of get_movie_by_date() is fetched. Two lists of dicts are
     built per date and printed: one from the reply's 'full' entries
     (url / committee_name / desc) and one from its 'result' entries
     (url / name / desc).

     NOTE(review): only page 1 is requested for each date; whether further
     pages exist cannot be told from this file.
     NOTE(review): init_cookie() is defined in this file but is not called
     here -- confirm whether the site needs the session cookie.
     """
     for comt_id in committee.values():
         date_list = get_date_list(comt_id)
         if not date_list:
             # get_date_list() returns False on a non-200 reply; iterating
             # that would raise TypeError.
             continue
         for date in date_list:
             movie_list = get_movie_by_date(comt_id, date, 1)
             if not movie_list:
                 # Same False-on-failure convention as above.
                 continue
             full_list = [
                 {
                     'url': get_movie_url(clip['MEREID']),
                     'committee_name': clip['CM_NAM'],
                     'desc': clip['METDEC'],
                 }
                 for clip in movie_list['full']
             ]
             single_list = [
                 {
                     'url': get_movie_url(clip['WZS_ID']),
                     'name': clip['CH_NAM'],
                     'desc': clip['METDEC'],
                 }
                 for clip in movie_list['result']
             ]
             # Parenthesised single-argument print is identical under Python 2.
             print(full_list)
             print(single_list)


 if __name__ == '__main__':
     main()
	#!/usr/local/bin/python
	# -*- coding: utf-8 -*-

	import urllib, urllib2
	import json
	import cookielib
	from BeautifulSoup import BeautifulSoup, SoupStrainer

	# Root of the IVOD site.
	base_url = 'http://ivod.ly.gov.tw/'
	# Endpoint queried by get_date_list() for a committee's meeting dates.
	committee_url = 'http://ivod.ly.gov.tw/Committee/CommsDate'

	# Committee display name -> numeric id sent to the site as the `comtid`
	# form field.
	# NOTE(review): the Transportation and Judiciary entries both map to 9.
	# One of the two is presumably a typo -- confirm against the site before
	# relying on either.
	committee = {
	    u'院會': 19,               # plenary sitting
	    u'內政': 1,                # internal administration
	    u'外交及國防': 17,         # foreign affairs and national defense
	    u'經濟': 5,                # economics
	    u'財政': 6,                # finance
	    u'教育及文化': 8,          # education and culture
	    u'交通': 9,                # transportation
	    u'司法及法制': 9,          # judiciary and organic laws
	    u'社會福利及衛生環境': 12,  # social welfare and environmental hygiene
	    u'程序': 13,               # procedure
	    u'紀律': 23,               # discipline
	}

	def init_cookie():
	    """Install a cookie-keeping urllib2 opener globally, then prime it.

	    A fresh CookieJar is wrapped in an HTTPCookieProcessor and installed
	    with urllib2.install_opener(), so later urllib2.urlopen() calls go
	    through it. reset_cookie() is then called to make a first request.
	    """
	    cookie = cookielib.CookieJar()
	    opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cookie))
	    urllib2.install_opener(opener)
	    reset_cookie()

	def reset_cookie():
	    """Request the site's front page and discard the body.

	    The request goes through whichever opener is currently installed in
	    urllib2; when that opener carries an HTTPCookieProcessor (see
	    init_cookie), cookies set by the server are stored in its jar.

	    Raises:
	        urllib2.URLError: propagated from urllib2.urlopen().
	    """
	    http_header = {'User-Agent': 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)', 'Host': 'ivod.ly.gov.tw'}
	    req = urllib2.Request('http://ivod.ly.gov.tw/', None, http_header)
	    web = urllib2.urlopen(req)
	    try:
	        # The page content is not needed; read it only to drain the reply.
	        web.read()
	    finally:
	        # Always release the connection, even if the read fails.
	        web.close()

	def get_date_list(comt):
	    """Fetch the meeting dates available for one committee.

	    Sends `comtid=<comt>` as form data to `committee_url` (a POST, since
	    the request carries a body) and parses the reply as JSON.

	    Args:
	        comt: committee id placed in the `comtid` form field.

	    Returns:
	        A list holding the 'METDAT' value of every entry under the reply's
	        'mdate' key, or False when the HTTP status is not 200.

	    Raises:
	        urllib2.URLError: propagated from urllib2.urlopen().
	        ValueError: if the reply body is not valid JSON.
	        KeyError: if the reply lacks 'mdate' or an entry lacks 'METDAT'.
	    """
	    http_header = {'Referer': 'http://ivod.ly.gov.tw/Committee',
	        'Accept': '*/*',
	        'User-Agent': 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)',
	        'Host': 'ivod.ly.gov.tw',
	        'Connection': 'keep-alive',
	        'X-Requested-With': 'XMLHttpRequest',
	        'Pragma': 'no-cache'}
	    req = urllib2.Request(committee_url, urllib.urlencode({'comtid': comt}), http_header)
	    web = urllib2.urlopen(req)
	    try:
	        if web.getcode() != 200:
	            return False
	        result = json.loads(web.read())
	    finally:
	        # Release the connection on every path, including the early return.
	        web.close()
	    return [entry['METDAT'] for entry in result['mdate']]

	def get_movie_by_date(comit, date, page):
	    """Fetch one page of recordings for a committee on a given date.

	    Sends `comtid`, `date` and `page` as form data to the site's
	    /Committee/MovieByDate endpoint and parses the reply as JSON.

	    Args:
	        comit: committee id placed in the `comtid` form field.
	        date: date value placed in the `date` form field (main() passes
	            the values returned by get_date_list()).
	        page: page number placed in the `page` form field.

	    Returns:
	        The decoded JSON reply (main() reads its 'full' and 'result'
	        entries), or False when the HTTP status is not 200.

	    Raises:
	        urllib2.URLError: propagated from urllib2.urlopen().
	        ValueError: if the reply body is not valid JSON.
	    """
	    http_header = {'Referer': 'http://ivod.ly.gov.tw/Committee',
	        'Accept': '*/*',
	        'User-Agent': 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)',
	        'Host': 'ivod.ly.gov.tw',
	        'Connection': 'keep-alive',
	        'X-Requested-With': 'XMLHttpRequest',
	        'Pragma': 'no-cache'}
	    req = urllib2.Request('http://ivod.ly.gov.tw/Committee/MovieByDate', urllib.urlencode({'comtid': comit, 'date': date, 'page': page}), http_header)
	    web = urllib2.urlopen(req)
	    try:
	        if web.getcode() != 200:
	            return False
	        return json.loads(web.read())
	    finally:
	        # Release the connection on every path.
	        web.close()

	def get_movie_url(wzs_id):
	    """Scrape the player argument for one recording from its playback page.

	    Fetches /Play/Full/<wzs_id>/1M, finds the <script> inside
	    <div class="movie_large">, and strips the surrounding
	    readyPlayer('http://ivod.ly.gov.tw/public/scripts/',' ... '); call
	    text from it. The remaining text is printed and returned.

	    Args:
	        wzs_id: recording id interpolated into the playback URL.

	    Returns:
	        The stripped script text, or None when the HTTP status is not 200,
	        the div is absent, or the div contains no <script> element.

	    Raises:
	        urllib2.URLError: propagated from urllib2.urlopen().
	    """
	    http_header = {'Referer': 'http://ivod.ly.gov.tw/Committee',
	        'User-Agent': 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)',
	        'Host': 'ivod.ly.gov.tw',
	        'Connection': 'keep-alive'}
	    url = 'http://ivod.ly.gov.tw/Play/Full/%s/1M' % wzs_id
	    req = urllib2.Request(url, None, http_header)
	    web = urllib2.urlopen(req)
	    try:
	        if web.getcode() != 200:
	            return None
	        html_result = web.read()
	    finally:
	        # Release the connection on every path.
	        web.close()
	    xml = BeautifulSoup(html_result)
	    div_movie_large = xml.find('div', {'class': 'movie_large'})
	    if not div_movie_large:
	        return None
	    script_tag = div_movie_large.find('script')
	    if script_tag is None:
	        # find() returns None when nothing matches; reading .text from it
	        # would raise AttributeError.
	        return None
	    script_text = script_tag.text
	    script_text = script_text.replace("readyPlayer('http://ivod.ly.gov.tw/public/scripts/','", '')
	    script_text = script_text.replace("');", '')
	    # Parenthesised single-argument print is identical under Python 2.
	    print(script_text)
	    return script_text

	def main():
	    """Walk every committee and meeting date and print the recordings found.

	    For each id in `committee`, the dates from get_date_list() are visited
	    and page 1 of get_movie_by_date() is fetched. Two lists of dicts are
	    built per date and printed: one from the reply's 'full' entries
	    (url / committee_name / desc) and one from its 'result' entries
	    (url / name / desc).

	    NOTE(review): only page 1 is requested for each date; whether further
	    pages exist cannot be told from this file.
	    NOTE(review): init_cookie() is defined in this file but is not called
	    here -- confirm whether the site needs the session cookie.
	    """
	    for comt_id in committee.values():
	        date_list = get_date_list(comt_id)
	        if not date_list:
	            # get_date_list() returns False on a non-200 reply; iterating
	            # that would raise TypeError.
	            continue
	        for date in date_list:
	            movie_list = get_movie_by_date(comt_id, date, 1)
	            if not movie_list:
	                # Same False-on-failure convention as above.
	                continue
	            full_list = [
	                {
	                    'url': get_movie_url(clip['MEREID']),
	                    'committee_name': clip['CM_NAM'],
	                    'desc': clip['METDEC'],
	                }
	                for clip in movie_list['full']
	            ]
	            single_list = [
	                {
	                    'url': get_movie_url(clip['WZS_ID']),
	                    'name': clip['CH_NAM'],
	                    'desc': clip['METDEC'],
	                }
	                for clip in movie_list['result']
	            ]
	            # Parenthesised single-argument print is identical under Python 2.
	            print(full_list)
	            print(single_list)


	if __name__ == '__main__':
	    main()