ustbgaofan · July 27, 2013 14:31
diff --git a/download_voa.py b/download_voa.py
 # -*- coding: utf8 -*-

 # NOTE: download speed is very slow.
 import urllib2, urllib
 import sys
 import os
 import socket
 import re

 import socks
 # Route every socket this process opens through the SOCKS5 proxy at
 # 127.0.0.1:9050 by swapping socket.socket for the proxy-aware class.
 # NOTE(review): rdns=False presumably means host names are resolved locally
 # rather than by the proxy -- confirm against the socks module in use.
 socks.setdefaultproxy(socks.PROXY_TYPE_SOCKS5, '127.0.0.1', 9050, rdns=False)
 socket.socket = socks.socksocket

 # Default timeout, in seconds, for every socket created from here on;
 # this is how the urllib2/urllib requests below get their timeout.
 socket.setdefaulttimeout(300)

 # config
 # INDEX_PAGE is the page scraped for themes; HOST is prefixed to the
 # site-relative article and audio links found on the pages.
 INDEX_PAGE = 'http://learningenglish.voanews.com/programindex.html'
 HOST = 'http://learningenglish.voanews.com'
 # Directory the downloaded .pdf and .mp3 files are written to.
 VOA_DIR = os.path.join('D:\\', 'VOA')
 # Upper bound on the attempts download_data makes for one URL.
 RETRY_TIMES = 3

 # Regex patterns used to scrape the pages. Raw strings keep \d, \s and \.
 # from being interpreted as string escapes.
 # Index page: each match is a (latest-articles page URL, theme name) pair.
 re_themes = re.compile(r'''<h4><a href=['"](.*?http.*?latest.*?)['"]>(.*?)</a></h4>''')
 # Theme page: captures the site-relative path of each article.
 re_articles = re.compile(r'<h4.*?(/content/.*?/\d+\.html).*?</h4>')
 # Article page: captures the title text between the padding whitespace.
 re_article_title = re.compile(r'<title>\s+(.*)\s+</title>')
 # Article page: captures one href value ending in "pdf". [^'"]* keeps the
 # match inside a single attribute value; a greedy .* would run from the first
 # href on the line to the last "pdf" and swallow unrelated markup.
 re_article_pdf = re.compile(r'''href=['"]([^'"]*pdf)['"]''')
 # Article page: site-relative link to the page that holds the mp3 link.
 re_audio_page = re.compile(r'/audio/Audio/\d+\.html')
 # Audio page: captures one http URL ending in "mp3"; quotes, whitespace and
 # angle brackets end the URL so the match cannot span several links.
 re_article_audio = re.compile(r'''(http:[^'"\s<>]*mp3)''')

 # helper
 def download_data( url ):
    """Fetch ``url`` and return the response body.

    Makes at most RETRY_TIMES attempts. An attempt that raises a network
    error or yields an empty body is retried. When every attempt fails, ''
    is returned so that callers can simply test the result for truthiness.
    """
    for _attempt in range(RETRY_TIMES):
        try:
            response = urllib2.urlopen(url)
            try:
                data = response.read()
            finally:
                # Release the connection even when read() fails part-way.
                response.close()
        except (urllib2.URLError, socket.error):
            # A failed attempt must not abort the loop, otherwise the
            # retry count is meaningless; move on to the next attempt.
            continue
        if data:
            return data
    return ''

 def save_url_to_file(url, file_path):
    """Download ``url`` to ``file_path`` unless a complete copy already exists.

    An existing file is kept when the server reports a Content-Length equal
    to the local file size, or reports no Content-Length at all. Otherwise
    the file is (re)downloaded with reporthook showing progress.

    Returns True once the file is in place. If the download raises, the
    partial file is removed and the exception is re-raised.
    """
    if os.path.isfile(file_path):
        response = urllib.urlopen( url )
        try:
            length_s = response.info().get('Content-Length', 0)
        finally:
            # Only the headers are needed; do not leave the connection open.
            response.close()
        length_l = os.path.getsize( file_path )
        # The header value is a string; the integer 0 is the "header
        # missing" default supplied above.
        if length_s == 0 or long(length_s) == length_l:
            return True

    try:
        urllib.urlretrieve(url, file_path, reporthook)
    except:
        # Bare except on purpose: an interrupt must clean up as well.
        # The file may not have been created yet; removing a missing file
        # would raise and hide the original error.
        if os.path.exists(file_path):
            os.remove( file_path)
        raise
    return True
    
 # show download progress
 def reporthook(blocks_read,block_size,total_size):
    """Progress callback passed to urllib.urlretrieve; rewrites one status line.

    blocks_read -- number of blocks transferred so far
    block_size  -- size of one block in bytes
    total_size  -- total size in bytes; a negative value is treated as
                   unknown and only the block count is shown
    """
    if not blocks_read:
        print ("Connection opened")
    if total_size <0:
        sys.stdout.write("\rRead %d blocks   "  % blocks_read)
    else:
        # The final block is usually only partly filled, so the raw product
        # can exceed the real size; cap it at total_size.
        received = min(blocks_read*block_size, total_size)
        sys.stdout.write("\rdownloading: %d KB, totalsize: %d KB   " % (received/1024.0,total_size/1024.0))
    # "\r" keeps everything on one line, so flush to make it visible now.
    sys.stdout.flush()
    	
 # get themes
 # theme name and page for latest articles
 print 'From %s parsing themes ...' % INDEX_PAGE
 html = download_data(INDEX_PAGE)
 themes = re.findall(re_themes, html)
 if themes:
    themes = set(themes)
    print 'Got %d themes:' % len(themes)
    for theme in themes:
        print 'Theme: %s. Page: %s.' % (theme[1], theme[0])
 else:
    sys.exit()
  
 for theme in themes:
    theme_name = theme[1]
    theme_index = theme[0]

    # get article's page
    theme_html = download_data(theme_index)
    if not theme_html: sys.exit()
    article_urls = re.findall(re_articles, theme_html)

    for article in article_urls:
        article_url = HOST + article
        print 'Getting info from %s' % article_url
        article_html = download_data(article_url)
        if not article_html: sys.exit()
        print 'Got it!'
        
        try:
            # get article title          
            article_title = re.search(re_article_title, article_html).groups()
            if not article_title: sys.exit()
            article_title =  '-'.join( re.findall('(\w+)', article_title[0]) )
            #article_title = '-'.join( article_title.split(' ') )
            print 'Got article title: %s' % article_title
          
            # get pdf url
            article_pdf = re.search(re_article_pdf, article_html).groups()
            if not article_pdf: sys.exit()
            article_pdf = article_pdf[0]
            print 'Got pdf url: %s' % article_pdf
            
            # get audio url
            audio_url = HOST + re.search(re_audio_page, article_html).group()
            print 'Getting info from audio_url %s' % audio_url
            audio_html = download_data(audio_url)
            if not audio_html: sys.exit()
            article_audio = re.search(re_article_audio, audio_html).group()
            print 'Got audio url: %s' % article_audio

            print 'Downloading PDF ...'
            file = os.path.join(VOA_DIR, article_title + '.pdf')
            if save_url_to_file( article_pdf, file):
                print 'OK'
            else:
                print 'Failed'
            print 'Downloading MP3 ...'
            #print str(article_audio)
            file = os.path.join(VOA_DIR, article_title + '.mp3')
            if save_url_to_file( article_audio, file):
                print 'OK'
            else:
                print 'Failed' 
            
        except AttributeError as e:
            pass
                       
 print 'end'
	# -*- coding: utf8 -*-

	# NOTE: download speed is very slow.
	import urllib2, urllib
	import sys
	import os
	import socket
	import re

	import socks
	# Route every socket this process opens through the SOCKS5 proxy at
	# 127.0.0.1:9050 by swapping socket.socket for the proxy-aware class.
	# NOTE(review): rdns=False presumably means host names are resolved locally
	# rather than by the proxy -- confirm against the socks module in use.
	socks.setdefaultproxy(socks.PROXY_TYPE_SOCKS5, '127.0.0.1', 9050, rdns=False)
	socket.socket = socks.socksocket

	# Default timeout, in seconds, for every socket created from here on;
	# this is how the urllib2/urllib requests below get their timeout.
	socket.setdefaulttimeout(300)

	# config
	# INDEX_PAGE is the page scraped for themes; HOST is prefixed to the
	# site-relative article and audio links found on the pages.
	INDEX_PAGE = 'http://learningenglish.voanews.com/programindex.html'
	HOST = 'http://learningenglish.voanews.com'
	# Directory the downloaded .pdf and .mp3 files are written to.
	VOA_DIR = os.path.join('D:\\', 'VOA')
	# Upper bound on the attempts download_data makes for one URL.
	RETRY_TIMES = 3

	# Regex patterns used to scrape the pages. Raw strings keep \d, \s and \.
	# from being interpreted as string escapes. The "*" quantifiers that were
	# lost from re_themes and re_articles in this copy are restored.
	# Index page: each match is a (latest-articles page URL, theme name) pair.
	re_themes = re.compile(r'''<h4><a href=['"](.*?http.*?latest.*?)['"]>(.*?)</a></h4>''')
	# Theme page: captures the site-relative path of each article.
	re_articles = re.compile(r'<h4.*?(/content/.*?/\d+\.html).*?</h4>')
	# Article page: captures the title text between the padding whitespace.
	re_article_title = re.compile(r'<title>\s+(.*)\s+</title>')
	# Article page: captures one href value ending in "pdf". [^'"]* keeps the
	# match inside a single attribute value; a greedy .* would run from the first
	# href on the line to the last "pdf" and swallow unrelated markup.
	re_article_pdf = re.compile(r'''href=['"]([^'"]*pdf)['"]''')
	# Article page: site-relative link to the page that holds the mp3 link.
	re_audio_page = re.compile(r'/audio/Audio/\d+\.html')
	# Audio page: captures one http URL ending in "mp3"; quotes, whitespace and
	# angle brackets end the URL so the match cannot span several links.
	re_article_audio = re.compile(r'''(http:[^'"\s<>]*mp3)''')

	# helper
	def download_data( url ):
	    """Fetch ``url`` and return the response body.

	    Makes at most RETRY_TIMES attempts. An attempt that raises a network
	    error or yields an empty body is retried. When every attempt fails, ''
	    is returned so that callers can simply test the result for truthiness.
	    """
	    for _attempt in range(RETRY_TIMES):
	        try:
	            response = urllib2.urlopen(url)
	            try:
	                data = response.read()
	            finally:
	                # Release the connection even when read() fails part-way.
	                response.close()
	        except (urllib2.URLError, socket.error):
	            # A failed attempt must not abort the loop, otherwise the
	            # retry count is meaningless; move on to the next attempt.
	            continue
	        if data:
	            return data
	    return ''

	def save_url_to_file(url, file_path):
	    """Download ``url`` to ``file_path`` unless a complete copy already exists.

	    An existing file is kept when the server reports a Content-Length equal
	    to the local file size, or reports no Content-Length at all. Otherwise
	    the file is (re)downloaded with reporthook showing progress.

	    Returns True once the file is in place. If the download raises, the
	    partial file is removed and the exception is re-raised.
	    """
	    if os.path.isfile(file_path):
	        response = urllib.urlopen( url )
	        try:
	            length_s = response.info().get('Content-Length', 0)
	        finally:
	            # Only the headers are needed; do not leave the connection open.
	            response.close()
	        length_l = os.path.getsize( file_path )
	        # The header value is a string; the integer 0 is the "header
	        # missing" default supplied above.
	        if length_s == 0 or long(length_s) == length_l:
	            return True

	    try:
	        urllib.urlretrieve(url, file_path, reporthook)
	    except:
	        # Bare except on purpose: an interrupt must clean up as well.
	        # The file may not have been created yet; removing a missing file
	        # would raise and hide the original error.
	        if os.path.exists(file_path):
	            os.remove( file_path)
	        raise
	    return True

	# show download progress
	def reporthook(blocks_read,block_size,total_size):
	    """Progress callback passed to urllib.urlretrieve; rewrites one status line.

	    blocks_read -- number of blocks transferred so far
	    block_size  -- size of one block in bytes
	    total_size  -- total size in bytes; a negative value is treated as
	                   unknown and only the block count is shown
	    """
	    if not blocks_read:
	        print ("Connection opened")
	    if total_size <0:
	        sys.stdout.write("\rRead %d blocks " % blocks_read)
	    else:
	        # The final block is usually only partly filled, so the raw product
	        # can exceed the real size; cap it at total_size.
	        received = min(blocks_read*block_size, total_size)
	        sys.stdout.write("\rdownloading: %d KB, totalsize: %d KB " % (received/1024.0,total_size/1024.0))
	    # "\r" keeps everything on one line, so flush to make it visible now.
	    sys.stdout.flush()

	# get themes
	# theme name and page for latest articles
	print 'From %s parsing themes ...' % INDEX_PAGE
	html = download_data(INDEX_PAGE)
	themes = re.findall(re_themes, html)
	if themes:
	themes = set(themes)
	print 'Got %d themes:' % len(themes)
	for theme in themes:
	print 'Theme: %s. Page: %s.' % (theme[1], theme[0])
	else:
	sys.exit()

	for theme in themes:
	theme_name = theme[1]
	theme_index = theme[0]

	# get article's page
	theme_html = download_data(theme_index)
	if not theme_html: sys.exit()
	article_urls = re.findall(re_articles, theme_html)

	for article in article_urls:
	article_url = HOST + article
	print 'Getting info from %s' % article_url
	article_html = download_data(article_url)
	if not article_html: sys.exit()
	print 'Got it!'

	try:
	# get article title
	article_title = re.search(re_article_title, article_html).groups()
	if not article_title: sys.exit()
	article_title = '-'.join( re.findall('(\w+)', article_title[0]) )
	#article_title = '-'.join( article_title.split(' ') )
	print 'Got article title: %s' % article_title

	# get pdf url
	article_pdf = re.search(re_article_pdf, article_html).groups()
	if not article_pdf: sys.exit()
	article_pdf = article_pdf[0]
	print 'Got pdf url: %s' % article_pdf

	# get audio url
	audio_url = HOST + re.search(re_audio_page, article_html).group()
	print 'Getting info from audio_url %s' % audio_url
	audio_html = download_data(audio_url)
	if not audio_html: sys.exit()
	article_audio = re.search(re_article_audio, audio_html).group()
	print 'Got audio url: %s' % article_audio

	print 'Downloading PDF ...'
	file = os.path.join(VOA_DIR, article_title + '.pdf')
	if save_url_to_file( article_pdf, file):
	print 'OK'
	else:
	print 'Failed'
	print 'Downloading MP3 ...'
	#print str(article_audio)
	file = os.path.join(VOA_DIR, article_title + '.mp3')
	if save_url_to_file( article_audio, file):
	print 'OK'
	else:
	print 'Failed'

	except AttributeError as e:
	pass

	print 'end'