Created
February 22, 2011 15:41
-
-
Save mrdaemon/838851 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
[0:527] callisto:one_off_hacks $ python ScrapeThemes_irssi.py
Shitty image scraper v1
(c) Alexandre Gauthier 2010-2011
--------------------------------------
Downloading all the themes from http://irssi.org/themes
Terribly hardcoded output dir is out
NOTE: Created directory 'out'
/themefiles/h3rbz.png matches pattern, fetching... OK
/themefiles/h3rbz.theme matches pattern, fetching... OK
/themefiles/spring.png matches pattern, fetching... OK
/themefiles/spring.theme matches pattern, fetching... OK
/themefiles/dark_winter.png matches pattern, fetching... OK
/themefiles/dark_winter.theme matches pattern, fetching... OK
/themefiles/elite.png matches pattern, fetching... OK
/themefiles/elite.theme matches pattern, fetching... OK
/themefiles/revolutionary.png matches pattern, fetching... OK
/themefiles/revolutionary.theme matches pattern, fetching... OK
/themefiles/revolutionaryv2.png matches pattern, fetching... OK
[...]
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
# | |
# Shittiest script to scrape http://irssi.org/themes and fetch the theme | |
# image screenshots, because frankly, that shit is unviewable. | |
from os import path, mkdir | |
import re | |
import sys | |
import urllib2 | |
from urlparse import urlsplit, urljoin | |
from BeautifulSoup import BeautifulSoup | |
# Page to scrape: the irssi theme gallery index.
LOCATION = "http://irssi.org/themes"
def parse_url(pageurl): | |
""" Parse url, returns soup." """ | |
try: | |
page_handle = urllib2.urlopen(pageurl) | |
except urllib2.URLError as e: | |
print "Error: Failed to connect to %s: %s" % (pageurl, e.reason) | |
sys.exit(1) | |
else: | |
return BeautifulSoup(page_handle) | |
def download(url): | |
""" Download a file from url to destination """ | |
try: | |
data = urllib2.urlopen(url).read() | |
except urllib2.URLError as e: | |
print "Failed to download %s: %s" % (url, e.reason) | |
return False | |
else: | |
filename = path.basename(urlsplit(url)[2]) | |
try: | |
out = open(path.join("out", filename), 'wb') | |
out.write(data) | |
out.close() | |
except IOError as e: | |
print "Error occured saving file: %s" % (e) | |
return False | |
else: | |
return True | |
def main(): | |
print "Shitty image scraper v1" | |
print "(c) Alexandre Gauthier 2010-2011" | |
print "--------------------------------------" | |
print "Downloading all the themes from %s" % (LOCATION) | |
print "Terribly hardcoded output dir is %s" % "out" | |
soup = parse_url(LOCATION) | |
p = re.compile('^\/themefiles\/.+\.\w+', re.IGNORECASE) | |
if path.isdir('out'): | |
pass | |
elif path.isfile('out'): | |
print "A file named 'out' already exists here. Bailing." | |
sys.exit(1) | |
else: | |
# Bah, let it throw and exception, that's enough try/catch | |
# bullshit for today, I think. | |
mkdir('out') | |
print "NOTE: Created directory 'out'" | |
for link in soup.findAll('a'): | |
if p.match(link['href']): | |
print "%s matches pattern, fetching... " % link['href'], | |
if download(urljoin(LOCATION,link['href'])): | |
print "OK" | |
else: | |
print "FAILED!" | |
# Run the scraper only when executed as a script, not when imported.
if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment