Skip to content

Instantly share code, notes, and snippets.

@mrdaemon
Created January 23, 2012 18:55
Show Gist options
  • Save mrdaemon/1664867 to your computer and use it in GitHub Desktop.
Save mrdaemon/1664867 to your computer and use it in GitHub Desktop.
#!/usr/bin/env python
#
# Shittiest script to scrape http://irssi.org/themes and fetch the theme
# image screenshots, because frankly, that shit is unviewable.
from os import path, mkdir
import re
import sys
import urllib2
from urlparse import urlsplit, urljoin
from BeautifulSoup import BeautifulSoup
# Page that main() fetches and scans for links to theme files.
LOCATION = "http://irssi.org/themes"
def parse_url(pageurl):
""" Parse url, returns soup." """
try:
page_handle = urllib2.urlopen(pageurl)
except urllib2.URLError as e:
print "Error: Failed to connect to %s: %s" % (pageurl, e.reason)
sys.exit(1)
else:
return BeautifulSoup(page_handle)
def download(url):
""" Download a file from url to destination """
try:
data = urllib2.urlopen(url).read()
except urllib2.URLError as e:
print "Failed to download %s: %s" % (url, e.reason)
return False
else:
filename = path.basename(urlsplit(url)[2])
try:
out = open(path.join("out", filename), 'wb')
out.write(data)
out.close()
except IOError as e:
print "Error occured saving file: %s" % (e)
return False
else:
return True
def main():
print "Shitty image scraper v1"
print "(c) Alexandre Gauthier 2010-2011"
print "--------------------------------------"
print "Downloading all the themes from %s" % (LOCATION)
print "Terribly hardcoded output dir is %s" % "out"
soup = parse_url(LOCATION)
p = re.compile('^\/themefiles\/.+\.\w+', re.IGNORECASE)
if path.isdir('out'):
pass
elif path.isfile('out'):
print "A file named 'out' already exists here. Bailing."
sys.exit(1)
else:
# Bah, let it throw an exception, that's enough try/catch
# bullshit for today, I think.
mkdir('out')
print "NOTE: Created directory 'out'"
for link in soup.findAll('a'):
if p.match(link['href']):
print "%s matches pattern, fetching... " % link['href'],
if download(urljoin(LOCATION,link['href'])):
print "OK"
else:
print "FAILED!"
# Run the scraper only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment