Skip to content

Instantly share code, notes, and snippets.

@mrdaemon
Created February 22, 2011 15:41
Show Gist options
  • Save mrdaemon/838851 to your computer and use it in GitHub Desktop.
Save mrdaemon/838851 to your computer and use it in GitHub Desktop.
[0:527] callisto:one_off_hacks $ python ScrapeThemes_irssi.py
Shitty image scraper v1
(c) Alexandre Gauthier 2010-2011
--------------------------------------
Downloading all the themes from http://irssi.org/themes
Terribly hardcoded output dir is out
NOTE: Created directory 'out'
/themefiles/h3rbz.png matches pattern, fetching... OK
/themefiles/h3rbz.theme matches pattern, fetching... OK
/themefiles/spring.png matches pattern, fetching... OK
/themefiles/spring.theme matches pattern, fetching... OK
/themefiles/dark_winter.png matches pattern, fetching... OK
/themefiles/dark_winter.theme matches pattern, fetching... OK
/themefiles/elite.png matches pattern, fetching... OK
/themefiles/elite.theme matches pattern, fetching... OK
/themefiles/revolutionary.png matches pattern, fetching... OK
/themefiles/revolutionary.theme matches pattern, fetching... OK
/themefiles/revolutionaryv2.png matches pattern, fetching... OK
[...]
#!/usr/bin/env python
#
# Shittiest script to scrape http://irssi.org/themes and fetch everything
# linked under /themefiles/ (the theme files and their image screenshots),
# because frankly, that shit is unviewable.
from os import path, mkdir
import re
import sys
import urllib2
from urlparse import urlsplit, urljoin
from BeautifulSoup import BeautifulSoup
# Page that main() scrapes for links; relative hrefs found on it are
# resolved against this URL before being downloaded.
LOCATION = "http://irssi.org/themes"
def parse_url(pageurl):
""" Parse url, returns soup." """
try:
page_handle = urllib2.urlopen(pageurl)
except urllib2.URLError as e:
print "Error: Failed to connect to %s: %s" % (pageurl, e.reason)
sys.exit(1)
else:
return BeautifulSoup(page_handle)
def download(url):
""" Download a file from url to destination """
try:
data = urllib2.urlopen(url).read()
except urllib2.URLError as e:
print "Failed to download %s: %s" % (url, e.reason)
return False
else:
filename = path.basename(urlsplit(url)[2])
try:
out = open(path.join("out", filename), 'wb')
out.write(data)
out.close()
except IOError as e:
print "Error occured saving file: %s" % (e)
return False
else:
return True
def main():
print "Shitty image scraper v1"
print "(c) Alexandre Gauthier 2010-2011"
print "--------------------------------------"
print "Downloading all the themes from %s" % (LOCATION)
print "Terribly hardcoded output dir is %s" % "out"
soup = parse_url(LOCATION)
p = re.compile('^\/themefiles\/.+\.\w+', re.IGNORECASE)
if path.isdir('out'):
pass
elif path.isfile('out'):
print "A file named 'out' already exists here. Bailing."
sys.exit(1)
else:
# Bah, let it throw and exception, that's enough try/catch
# bullshit for today, I think.
mkdir('out')
print "NOTE: Created directory 'out'"
for link in soup.findAll('a'):
if p.match(link['href']):
print "%s matches pattern, fetching... " % link['href'],
if download(urljoin(LOCATION,link['href'])):
print "OK"
else:
print "FAILED!"
# Run the scraper only when this file is executed as a script, so that
# importing it has no side effects.
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment