-
-
Save ustbgaofan/6095027 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf8 -*-
# NOTE: downloading is very slow.
import urllib2, urllib | |
import sys | |
import os | |
import socket | |
import re | |
import socks | |
# Send every socket opened by this process through a local SOCKS5 proxy.
# NOTE(review): 9050 is the conventional Tor SOCKS port -- presumably a local
# Tor client is expected to be running; confirm.
_PROXY_HOST = '127.0.0.1'
_PROXY_PORT = 9050
# Applies to all sockets created afterwards, which is how urllib/urllib2
# requests get their timeout (value is in seconds).
_SOCKET_TIMEOUT = 300

socks.setdefaultproxy(socks.PROXY_TYPE_SOCKS5, _PROXY_HOST, _PROXY_PORT,
                      rdns=False)
# Replace the socket class globally so urllib/urllib2 pick up the proxy too.
socket.socket = socks.socksocket
socket.setdefaulttimeout(_SOCKET_TIMEOUT)
# Configuration: site entry points, download directory and retry budget.
INDEX_PAGE = 'http://learningenglish.voanews.com/programindex.html'
HOST = 'http://learningenglish.voanews.com'
VOA_DIR = os.path.join('D:\\', 'VOA')
RETRY_TIMES = 3

# Regular expressions used to scrape the site's HTML.  They are raw strings so
# that sequences such as ``\d`` reach the regex engine untouched.
# (theme page URL, theme name) pairs on the index page.
re_themes = re.compile(r'''<h4><a href=['"](.*?http.*?latest.*?)['"]>(.*?)</a></h4>''')
# Site-relative article paths on a theme page.
re_articles = re.compile(r'<h4.*?(/content/.*?/\d+\.html).*?</h4>')
# Text of the <title> element when it is surrounded by whitespace.
re_article_title = re.compile(r'<title>\s+(.*)\s+</title>')
# PDF link: the capture may not cross a quote, so it cannot swallow several
# attributes that happen to share a line.
re_article_pdf = re.compile(r'''href=['"]([^'"]*\.pdf)['"]''')
# Site-relative path of the page that embeds the audio.
re_audio_page = re.compile(r'/audio/Audio/\d+\.html')
# First complete MP3 URL; stops at quotes, whitespace and angle brackets so it
# cannot run on into a later URL on the same line.
re_article_audio = re.compile(r'''(http:[^'"\s<>]*?\.mp3)''')
# helper | |
def download_data(url):
    """Fetch ``url`` and return the response body, retrying on failure.

    Up to ``RETRY_TIMES`` attempts are made.  An attempt fails when the
    request raises ``IOError`` (the base class of ``urllib2.URLError`` and of
    socket errors/timeouts) or when the body comes back empty.

    Returns the body as a string, or ``''`` when every attempt failed.
    """
    for _attempt in range(RETRY_TIMES):
        try:
            response = urllib2.urlopen(url)
            try:
                data = response.read()
            finally:
                # Release the connection even when the read is cut short.
                response.close()
        except IOError:
            # Network-level failure: use up this attempt and try again.
            continue
        if data:
            return data
    return ''
def save_url_to_file(url, file_path):
    """Download ``url`` to ``file_path`` unless a complete copy already exists.

    An existing file is kept when the server's ``Content-Length`` equals the
    local file size, or when the server sends no ``Content-Length`` at all.
    Otherwise the file is (re)downloaded, reporting progress through
    ``reporthook``.

    Returns True once the file is in place.  Errors raised while downloading
    propagate to the caller after any partially written file is removed.
    """
    if os.path.isfile(file_path):
        response = urllib.urlopen(url)
        try:
            length_s = response.info().get('Content-Length', 0)
        finally:
            # Only the headers are needed; do not leave the connection open.
            response.close()
        length_l = os.path.getsize(file_path)
        # ``int`` promotes to ``long`` by itself when the value is large.
        if length_s == 0 or int(length_s) == length_l:
            return True

    # Make sure the destination directory exists before writing into it.
    target_dir = os.path.dirname(file_path)
    if target_dir and not os.path.isdir(target_dir):
        os.makedirs(target_dir)

    try:
        urllib.urlretrieve(url, file_path, reporthook)
    except BaseException:
        # Drop the partial download, but only if one was actually created;
        # otherwise os.remove would raise and hide the original error.
        if os.path.isfile(file_path):
            os.remove(file_path)
        raise
    return True
# show download progress | |
def reporthook(blocks_read, block_size, total_size):
    """Progress callback for ``urllib.urlretrieve``.

    blocks_read -- number of blocks transferred so far (0 on the first call)
    block_size  -- size of one block in bytes
    total_size  -- size of the whole download in bytes, negative if unknown

    Rewrites a single status line on stdout using a leading carriage return.
    """
    if not blocks_read:
        print("Connection opened")
    if total_size < 0:
        status = "\rRead %d blocks " % blocks_read
    else:
        # The final block is usually only partly filled, so the raw product
        # can overshoot; never report more than the total.
        received = min(blocks_read * block_size, total_size)
        status = "\rdownloading: %d KB, totalsize: %d KB " % (
            received / 1024.0, total_size / 1024.0)
    sys.stdout.write(status)
    sys.stdout.flush()
# Step 1: collect the themes from the index page.
# Each theme is a (latest-articles page URL, theme name) pair.
print('From %s parsing themes ...' % INDEX_PAGE)
html = download_data(INDEX_PAGE)
themes = set(re.findall(re_themes, html))
if not themes:
    sys.exit('No themes found on %s' % INDEX_PAGE)
print('Got %d themes:' % len(themes))
for theme in themes:
    print('Theme: %s. Page: %s.' % (theme[1], theme[0]))


def _first_match(pattern, text, group=0):
    """Return ``group`` of the first match of ``pattern`` in ``text``, or None."""
    match = pattern.search(text)
    if match is None:
        return None
    return match.group(group)


# Step 2: for every article of every theme, locate and download the PDF
# transcript and the MP3 audio.
for theme_index, theme_name in themes:
    theme_html = download_data(theme_index)
    if not theme_html:
        sys.exit('Could not download theme page %s' % theme_index)

    for article in re.findall(re_articles, theme_html):
        article_url = HOST + article
        print('Getting info from %s' % article_url)
        article_html = download_data(article_url)
        if not article_html:
            sys.exit('Could not download article page %s' % article_url)
        print('Got it!')

        # The title becomes the file name: keep only word characters and
        # join the pieces with dashes.
        raw_title = _first_match(re_article_title, article_html, 1)
        article_title = '-'.join(re.findall(r'(\w+)', raw_title or ''))
        if not article_title:
            print('Skipping %s: no usable title found' % article_url)
            continue
        print('Got article title: %s' % article_title)

        article_pdf = _first_match(re_article_pdf, article_html, 1)
        if article_pdf is None:
            print('Skipping %s: no PDF link found' % article_url)
            continue
        print('Got pdf url: %s' % article_pdf)

        # The MP3 link lives on a separate audio page.
        audio_page = _first_match(re_audio_page, article_html)
        if audio_page is None:
            print('Skipping %s: no audio page link found' % article_url)
            continue
        audio_url = HOST + audio_page
        print('Getting info from audio_url %s' % audio_url)
        audio_html = download_data(audio_url)
        if not audio_html:
            sys.exit('Could not download audio page %s' % audio_url)
        article_audio = _first_match(re_article_audio, audio_html)
        if article_audio is None:
            print('Skipping %s: no MP3 link found' % article_url)
            continue
        print('Got audio url: %s' % article_audio)

        for label, source_url, extension in (('PDF', article_pdf, '.pdf'),
                                             ('MP3', article_audio, '.mp3')):
            print('Downloading %s ...' % label)
            target_path = os.path.join(VOA_DIR, article_title + extension)
            try:
                succeeded = save_url_to_file(source_url, target_path)
            except IOError:
                # One failed download should not abort the whole run.
                succeeded = False
            print('OK' if succeeded else 'Failed')

print('end')
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment