Skip to content

Instantly share code, notes, and snippets.

@redmoses
Created April 6, 2014 10:37
Show Gist options
  • Save redmoses/10004294 to your computer and use it in GitHub Desktop.
Save redmoses/10004294 to your computer and use it in GitHub Desktop.
RSS Downloader
"""
RSS Downloader
~~~~~~~~~~~~~~
The application parses rss feeds from a given url and downloads the files
from the feed to a given output directory.
Features:
~ Resume partially completed download
~ Supports the HTTP and FTP protocols
Logging:
~ All logs are written to the downloader.log file
Dependencies:
~ feedparser (https://pypi.python.org/pypi/feedparser)
Usage:
python downloader.py --feed=<RSS-Feed-URL> --output=<PATH-TO-DIRECTORY>
:Author: Musa Nasrullah
:Email: [email protected]
:Website: http://www.redmoses.org
"""
## Code Starts ##
from __future__ import print_function
import sys
import getopt
import logging
import os
import urllib
import time
import feedparser # feedparser package for parsing rss feeds
# default file downloader for protocols : HTTP, HTTPS, FTP
def default_downloader(url, download_location, logger):
    """Download ``url`` into ``download_location``, resuming a partial file if possible.

    Data is streamed into ``<file name>.tmp`` and renamed to the final name
    only once the transfer is complete, so an interrupted run leaves a
    temporary file behind that the next run tries to resume from.

    :param url: address of the remote file; the text after the last ``/`` is
        used as the local file name.
    :param download_location: directory in which the file is stored.
    :param logger: object providing ``info``, ``warning`` and ``error``
        logging methods.
    :return: ``None``.
    """
    # get the file name from the url
    file_name = url.split('/')[-1]
    if not file_name:
        logger.error("Cannot determine a file name from %s" % url)
        return
    # os.path.join keeps this working when the directory has no trailing separator
    file_path = os.path.join(download_location, file_name)
    temp_file_path = file_path + ".tmp"

    # check if the file already exist in the location
    if os.path.exists(file_path):
        logger.error("%s already downloaded." % file_name)
        return

    # initiate url opener
    url_opener = urllib.FancyURLopener()
    # bytes already on disk from an earlier, interrupted run
    resume_from = 0
    if os.path.exists(temp_file_path):
        resume_from = os.path.getsize(temp_file_path)
    if resume_from > 0:
        # ask the server for the missing tail only
        url_opener.addheader("Range", "bytes=%s-" % resume_from)

    # open connection for downloading the file
    remote_file = url_opener.open(url)
    try:
        # getcode() may be None for non-HTTP transfers, so guard the comparison
        http_status = remote_file.getcode()
        if http_status is not None and http_status >= 400:
            logger.error("Cannot download %s [status %s]" % (file_name, http_status))
            return

        # only a 206 reply means the server honoured the Range request;
        # anything else sends the whole file, so start over from byte 0
        resumed = resume_from > 0 and http_status == 206
        if not resumed:
            resume_from = 0

        # total size in bytes, or None when the server does not announce it
        file_size = None
        content_length = remote_file.info().get("Content-Length")
        if content_length is not None:
            try:
                # on a 206 reply Content-Length counts the remaining bytes only
                file_size = float(content_length) + resume_from
            except ValueError:
                file_size = None

        # start download
        print("\n")
        if file_size is None:
            size_text = "unknown size"
        else:
            size_text = "%3.1f bytes" % file_size
        logger.info("Downloading: %s [%s]" % (file_name, size_text))

        file_size_dl = resume_from
        # set each chunk size to be 8192 bytes
        block_sz = 8192
        # append when resuming, otherwise (re)create the temporary file
        with open(temp_file_path, 'ab' if resumed else 'wb') as temp_file:
            # keep on looping till finish reading the remote file completely
            while True:
                remote_data = remote_file.read(block_sz)
                if not remote_data:
                    break
                file_size_dl += len(remote_data)
                temp_file.write(remote_data)
                if file_size:
                    status = r"%10d bytes [%3.1f%%]" % (file_size_dl, file_size_dl * 100. / file_size)
                else:
                    status = r"%10d bytes" % file_size_dl
                # print the new status message, replacing the previous one
                print(status, end='\r')
    finally:
        remote_file.close()

    # without an announced size, end-of-stream is the only completion signal
    if file_size is None or file_size_dl == file_size:
        logger.info("\nDownload completed: %s [%s]" % (file_name, size_text))
        # rename the temporary file to its original name
        os.rename(temp_file_path, file_path)
    else:
        # keep the temporary file so that a later run can resume it
        logger.warning("Download incomplete: %s [%d of %3.1f bytes]"
                       % (file_name, file_size_dl, file_size))
# determine protocol and download file accordingly
def download_file(url, download_location, protocol, logger):
    """Dispatch the download of ``url`` to the downloader for ``protocol``.

    :param url: address of the remote file.
    :param download_location: directory in which the file is stored.
    :param protocol: lower-case URL scheme of ``url`` (e.g. ``"http"``).
    :param logger: object providing the logging methods used by the
        downloaders and a ``warning`` method.
    :return: ``None``.
    """
    # https is handled by the same default downloader as http and ftp
    if protocol in ("http", "https", "ftp"):
        default_downloader(url, download_location, logger)
    else:
        # TODO: add dedicated downloaders (e.g. sftp) as further branches
        logger.warning("Skipping %s: unsupported protocol '%s'" % (url, protocol))
# get the download links from the feed
def download_feed(feed_url, download_location, logger):
    """Parse the feed at ``feed_url`` and download the link of every entry.

    Entries without a link are skipped, and an I/O failure on one entry is
    logged without stopping the remaining downloads.

    :param feed_url: address of the RSS feed handed to ``feedparser.parse``.
    :param download_location: directory in which the files are stored.
    :param logger: object providing ``info``, ``warning`` and ``error``
        logging methods.
    :return: ``None``.
    """
    logger.info("Retrieving download links from the feed...")
    # get the rss feed
    feed = feedparser.parse(feed_url)
    logger.info("Downloading files...")
    # loop through all the items in the feed
    for item in feed.entries:
        # attribute access would raise for an entry that has no link
        file_url = item.get('link')
        if not file_url:
            logger.warning("Skipping a feed entry without a link")
            continue
        # get url protocol
        protocol = file_url.split(':')[0].lower()
        try:
            download_file(file_url, download_location, protocol, logger)
        except (IOError, OSError) as error:
            # one unreachable file must not abort the rest of the feed
            logger.error("Failed to download %s: %s" % (file_url, error))
    logger.info("Operations completed")
# print the command line usage hint
def show_usage():
    """Write a one-line example of the expected command line to standard output."""
    usage_example = 'Usage example: downloader.py --feed=<RSS-Feed-URL> --output=<PATH-TO-DIRECTORY>'
    print(usage_example)
# the main function
def main(argv):
    """Parse the command line, configure logging and download the feed.

    Exits with status 2 when the options cannot be parsed or when the feed
    URL or the output directory is missing, and with status 0 after
    printing the usage text for ``-h``/``--help``.

    :param argv: command line arguments without the program name.
    :return: ``None``.
    """
    # initiate the parameters
    feed_url = ''
    output_location = ''
    try:
        # define parameters
        params, _ = getopt.getopt(argv, "hf:o:", ["help", "feed=", "output="])
    except getopt.GetoptError:
        # an unknown option or a missing option value was supplied
        show_usage()
        sys.exit(2)
    for param, arg in params:
        # help
        if param in ("-h", "--help"):
            show_usage()
            sys.exit()
        # get feed url
        elif param in ("-f", "--feed"):
            feed_url = arg
        # get output location
        elif param in ("-o", "--output"):
            output_location = arg
    # configure logger
    logger = logging.getLogger('RSS_Downloader')
    # getLogger returns the same object on every call, so attach the
    # handlers only once to avoid duplicated log lines
    if not logger.handlers:
        # configure log file
        file_handler = logging.FileHandler('downloader.log')
        # set log formatting
        formatter = logging.Formatter('%(asctime)s %(levelname)s %(message)s')
        file_handler.setFormatter(formatter)
        # set log to print and write log messages
        logger.addHandler(logging.StreamHandler())
        logger.addHandler(file_handler)
    # set logging level
    logger.setLevel(logging.INFO)
    # both the feed url and the output location are required
    if feed_url != '' and output_location != '':
        logger.info('Feed URL: %s | Download Destination: %s ' % (feed_url, output_location))
        # download the feed
        download_feed(feed_url, output_location, logger)
    else:
        logger.error('Error: Required parameters are missing.')
        show_usage()
        sys.exit(2)
# run the downloader only when the file is executed as a script, passing
# the command line arguments without the program name
if __name__ == "__main__":
    main(sys.argv[1:])
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment