Fetch VOA Special English RSS feed & convert it to html file
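Based on the paths used in the script, it expects to run from a working directory that already contains an rss/ folder holding the page template, with jpg/ and mp3/ subfolders for the optional local media buffers:

    rss/
        voa_special_english_content.html   (article page template, filled in by generateArticle)
        jpg/                               (picture buffer, used when localjpg is True)
        mp3/                               (audio buffer, used when localmp3 is True)

Each article is written to rss/1.html, rss/2.html, ... and an rss.xml index is written to the working directory.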
# -*- coding: utf-8 -*-
# Fetch VOA Special English
# wangmengyin 2012-02-26
import urllib.request
import urllib.error
from xml.dom import minidom
from os import path
import os
import re
from subprocess import call

# set to True to download media into the local buffer folders (rss/jpg, rss/mp3)
# instead of linking to the remote files
localmp3 = False
localjpg = False

#rss_link = 'http://www.voanews.com/templates/Articles.rss?' \
#           'sectionPath=/learningenglish/home'
rss_link = 'http://rss2proxy.appspot.com'
proxy_url = 'http://203.208.46.1:80/'  # ip of google.cn

# article content page template
template_file = 'voa_special_english_content.html'

def main():
    # clear buffer files
    clearBuf()

    # read html template file
    template = readtemplateFile('rss', template_file)

    # fetch rss via proxy
    print('Fetch RSS feed via {%s} proxy, wait.' % proxy_url)
    proxies = {'http': proxy_url}
    opener = urllib.request.FancyURLopener(proxies)
    link = opener.open(rss_link)

    # parse rss content
    print('Parse content')
    articles = []
    dom = minidom.parse(link)
    page_num = 0
    for node in dom.getElementsByTagName('item'):
        mp3 = handleMp3(node)
        jpg = handlePic(node)
        article_content = handleArticleContent(node)
        # skip invalid page
        if mp3 == '' or jpg == '' or article_content == '':
            continue

        # parse summary
        article = {
            'title': handleText(node, 'title'),
            'link': handleText(node, 'link'),
            'pubDate': handleText(node, 'pubDate'),
            'description': handleText(node, 'description'),
            'mp3': mp3,
            'jpg': jpg
        }
        articles.append(article)
        print('Fetch article[%d]: %s' % (page_num + 1, article['title']))

        if localjpg:
            # save pic
            pathPic = savePic(article['jpg'])
            if pathPic != '':
                article['jpg'] = pathPic  # replace pic with local buffer.

        if localmp3:
            # save mp3
            pathMp3 = saveMp3(article['mp3'])
            if pathMp3 != '':
                article['mp3'] = pathMp3  # replace mp3 with local buffer.

        # generate article page
        html = generateArticle(template, article, article_content)
        html_path = path.join('rss',
                              "{0:d}{1}".format(page_num + 1, '.html'))
        f = open(html_path, 'w', encoding='utf-8')
        f.write(html)
        f.close()
        page_num = page_num + 1

    print('\nGot %d articles, saved to html pages.' % page_num)

    # generate rss index
    f = open('rss.xml', 'w', encoding='utf-8')
    f.write(genXml(articles))
    f.close()

def getText(nodelist):
    rc = []
    for node in nodelist:
        if node.nodeType == node.TEXT_NODE:
            rc.append(node.data)
    return ''.join(rc)


def handleText(node, s):
    return getText(node.getElementsByTagName(s)[0].childNodes)


def handleMp3(node):
    elements = node.getElementsByTagName('media:content')
    if len(elements) != 0:
        for element in elements:
            if element.hasAttribute('type'):
                return element.getAttribute('url')
    return ''


def handlePic(node):
    elements = node.getElementsByTagName('enclosure')
    if len(elements) != 0:
        for element in elements:
            if element.hasAttribute('url'):
                return element.getAttribute('url')
    return ''


def handleArticleContent(node):
    # parse embedded html content
    elements = node.getElementsByTagName('content:encoded')
    if len(elements) != 0:
        content = elements[0].toxml()
        # format html file
        match = re.search(r'</object>', content)
        if match:
            # 27 = len('</div>]]></content:encoded>')
            content = content[match.end():-27]
            return content
    return ''

def generateArticle(template, article, article_content):
    html = template.format(link=article['link'],
                           title=article['title'],
                           pubDate=article['pubDate'],
                           jpg=article['jpg'],
                           mp3=article['mp3'],
                           content=article_content)
    return html


def readtemplateFile(directory, filename):
    fp = path.join(os.getcwd(), directory, filename)
    print('Read html template file: %s' % fp)
    f = open(fp, 'r', encoding='utf-8')
    template = f.read()
    f.close()
    return template

# function that downloads a file
def downloadFile(file_name, file_mode, url):
    ok = False
    # Open the url
    try:
        print("downloading ", url)
        f = urllib.request.urlopen(url)
        # Open our local file for writing
        local_file = open(file_name, "w" + file_mode)
        # Write to our local file
        local_file.write(f.read())
        local_file.close()
        ok = True
    # handle errors
    except urllib.error.HTTPError as e:
        print("HTTP Error:", e.code, url)
    except urllib.error.URLError as e:
        print("URL Error:", e.reason, url)
    return ok

# save pic
def savePic(url):
    print('Pic ', end='')
    file_name = path.join(os.getcwd(), 'rss', 'jpg', url.split('/')[-1])
    ok = downloadFile(file_name, 'b', url)
    if ok:
        return path.join('jpg', url.split('/')[-1])
    else:
        # remove any partial file left behind on failure
        if path.exists(file_name):
            os.remove(file_name)
        return ''


# save mp3 using curl, because the file size is big.
def saveMp3(url):
    print('Mp3 downloading %s' % url)
    file_name = path.join(os.getcwd(), 'rss', 'mp3', url.split('/')[-1])
    # pass curl its arguments as a list so no shell is required
    ret = call(['curl', '--connect-timeout', '50', '-o', file_name, url])
    if ret != 0:
        print('curl Error %d refer to http://linux.die.net/man/1/curl' % ret)
        # remove any partial file left behind on failure
        if path.exists(file_name):
            os.remove(file_name)
        return ''
    else:
        return path.join('mp3', url.split('/')[-1])

# clear temp buffer files
def clearBuf():
    curdir = os.getcwd()
    rmfiles(path.join(curdir, 'rss', 'jpg'))  # rm rss/jpg/*.jpg
    rmfiles(path.join(curdir, 'rss', 'mp3'))  # rm rss/mp3/*.mp3
    # rm rss/[0-9]+.html
    for root, dirs, files in os.walk(path.join(curdir, 'rss'), topdown=False):
        for name in files:
            p = os.path.join(root, name)
            if re.search(r'[0-9]+\.html', p):
                os.remove(p)
    # rm rss.xml
    if os.path.exists(os.path.join(curdir, 'rss.xml')):
        os.remove(os.path.join(curdir, 'rss.xml'))


# remove files in curdir
def rmfiles(curdir):
    for root, dirs, files in os.walk(curdir, topdown=False):
        for name in files:
            os.remove(os.path.join(root, name))

# generate article list xml
def genXml(articles):
    s = ''
    s = s + '<rssfeed>' + '\n'
    for article in articles:
        s = s + handleNode(article) + '\n'
    s = s + '</rssfeed>'
    return s


def handleNode(article):
    s = ''
    s = s + '<item>' + '\n'
    for key in article.keys():
        s = s + '\t' + handleElement(article, key)
    s = s + '</item>' + '\n'
    return s


def handleElement(article, i):
    return '<{0}>{1}</{0}>\n'.format(i, article[i])

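# For reference, genXml() emits a small non-standard index like the sketch
# below (values are illustrative; element order follows the article dict's
# key order):
#
#   <rssfeed>
#   <item>
#       <title>Example article title</title>
#       <link>http://example.com/learningenglish/article.html</link>
#       <pubDate>Sun, 26 Feb 2012 00:00:00 GMT</pubDate>
#       <description>One-paragraph summary of the article</description>
#       <mp3>http://example.com/audio/article.mp3</mp3>
#       <jpg>http://example.com/images/article.jpg</jpg>
#   </item>
#   </rssfeed>
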
# Standard boilerplate to call the main() function to begin
# the program.
if __name__ == '__main__':
    main()
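The gist does not include the rss/voa_special_english_content.html template itself. Since generateArticle() fills it with str.format(), any file containing the {link}, {title}, {pubDate}, {jpg}, {mp3} and {content} placeholders will work. The helper below is a hypothetical stand-in (not the original template) that writes a minimal version:

# make_sample_template.py -- hypothetical helper, not part of the original gist.
# Writes a minimal stand-in for rss/voa_special_english_content.html containing
# the placeholders that generateArticle() fills via str.format().
from os import path

SAMPLE_TEMPLATE = """<!DOCTYPE html>
<html>
<head>
<meta charset="utf-8"/>
<title>{title}</title>
</head>
<body>
<h1><a href="{link}">{title}</a></h1>
<p>{pubDate}</p>
<img src="{jpg}" alt="{title}"/>
<audio controls="controls" src="{mp3}"></audio>
{content}
</body>
</html>
"""

if __name__ == '__main__':
    # assumes the rss/ folder already exists, as the main script does
    with open(path.join('rss', 'voa_special_english_content.html'),
              'w', encoding='utf-8') as f:
        f.write(SAMPLE_TEMPLATE)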