Skip to content

Instantly share code, notes, and snippets.

@lukas-buergi
Created December 9, 2012 19:47
Show Gist options
  • Select an option

  • Save lukas-buergi/4246699 to your computer and use it in GitHub Desktop.

Select an option

Save lukas-buergi/4246699 to your computer and use it in GitHub Desktop.
Web comic downloader
##############################################################################
# comicdl.py
# mainly for downloading web comics
# will maybe expand to general purpose scraping tool for other stuff too
##############################################################################
# module imports
import re, random, time, pickle, urllib.request, os, urllib.parse, pprint, math, collections, argparse
##############################################################################
# settings
saveFolder='/home/t4b/comics/'
waitTime=3.0
waitOffset=0
debug=1
##############################################################################
# helper functions
def downloadDelay():
time.sleep((random.random()+waitOffset)*waitTime)
##############################################################################
class scrapeOne:
"""Data and methods to scrape one value of a certain web site."""
def __init__(self, name, scraper, regex=None, valueType=None):
self.name=name
self.scraper=scraper
self.regex=regex
self.valueType=valueType
def value(self, info):
"""Return the value based on the information (downloaded html page, context) given in info.
Depending on the type of the value some stuff to improve it can be done, like making links absolute."""
# get the value
if self.scraper == 'regex' and self.regex:
match=re.search(self.regex, info.page['raw'])
if match:
self.retValue=match.group(self.name)
else:
self.retValue=None
elif self.scraper == 'bs4': ## There should at least also be 'bs4' for Beautiful Soup 4
pass
elif self.scraper == 'none':
self.retValue=None
# now maybe do some stuff with the raw value before returning it
if self.retValue:
if self.valueType == 'link': # value is a link which may be relativ, so we change it to an absolute one
self.retValue=urllib.parse.urljoin(info.url, self.retValue)
elif self.valueType == 'month': # value is a month name as text, but we want it as a number
pass
else: # nothing special
self.retValue=self.retValue
return(self.retValue)
##############################################################################
class comicsClass:
"""A class containing methods and information to scrape one website.
Data structure:
self.info: dictionary with general information about the website
self.info['name']: the websites name, spelled so it's okay for filenames etc. I suggest [a-zA-Z0-9]
self.info['nameN']: the websites name, this time the way it's normally spelled. Should probably still be ASCII, haven't got a clue about encoding-handling by Python etc.
self.info['baseLink']: initial url
self.info['regExprs']: dictionary containing pairs of regular expressions and names describing what kind of information can be found with the regex
self.url: the url of the present page
self.page: dictionary with information on the present page. The contained information depends on self.info['regExprs']. Always available is:
self.page['raw']: the raw download of the present page as a string"""
def __init__(self, info):
"""Sets up self.info and self.url, then calls self.populate"""
self.info=info
self.url=self.info['baseLink']
self.page={}
self.populatePage()
def populatePage(self):
"""Updates self.page after self.url changed."""
# Download the new raw html page
self.page['raw']=str(list(urllib.request.urlopen(self.url)))
downloadDelay()
# Update the rest of self.page
for findValue in self.info['findValues']:
self.page[findValue.name]=findValue.value(self)
def move(self, where):
"""Moves to another page, that is: modifies self.url and then calls self.populate.
The only argument is where to move: to the first, prev, next or last page."""
if self.page[where] and not self.page[where] == self.url:
self.url=self.page[where]
self.populatePage()
return(1)
else:
return(0)
def populatePages(self, direction='forward'):
"""Populates self.pages with information about the self.page-values encountered when going through the website from the beginning to the end (default) or the other way round."""
if direction == 'forward':
movAbs='first'
movRel='next'
elif direction == 'backward':
movAbs='last'
movRel='prev'
self.pages=[]
# This of course removes a feature in debug mode which means it also can't be debugged - but this is worth it because when debugging I certainly never want to go through the entire site. It just means that while debugging I need to set self.info['baseLink'] right
if not debug:
self.move(movAbs)
self.pages.append(dict(self.page))
while self.move(movRel):
self.pages.append(dict(self.page))
if direction == 'backward':
self.pages.reverse()
# Now I can fill 'number' no matter whether it was possible to already get from each individual page
for i in range(len(self.pages)):
self.pages[i]['number']=i+1
def download(self):
#Get all the data
self.populatePages('backward')
#Make a folder for the downloads
try:
os.mkdir(saveFolder + self.info['name'])
except OSError as FileExistsError:
pass # I don't care, idiots
#Actually download
for page in self.pages:
fileExtension=re.search(r'(\.[^\.]+)$', page['imageLink']).group(1)
urllib.request.urlretrieve(page['imageLink'], saveFolder + self.info['name'] + '/' + str(page['number']) + fileExtension)
downloadDelay()
comics={
'questionablecontent' : {
'baseLink' : 'http://www.questionablecontent.net/view.php?comic=3',
'nameN' : 'Questionable Content',
'name' : 'questionablecontent',
'findValues' : [
scrapeOne('first', 'regex', regex=r'<li><a href="(?P<first>[^"]+)">First</a></li>', valueType='link'),
scrapeOne('prev', 'regex', regex=r'<li><a href="(?P<prev>[^"]+)">Previous</a></li>', valueType='link'),
scrapeOne('next', 'regex', regex=r'<li><a href="(?P<next>[^"]+)">Next</a></li>', valueType='link'),
scrapeOne('last', 'regex', regex=r'<li><a href="(?P<last>[^"]+)">Latest</a></li>', valueType='link'),
scrapeOne('imageLink', 'regex', regex=r'<img id="strip" src="(?P<imageLink>[^"]+)">', valueType='link'),
scrapeOne('mouseOver', 'none'),
scrapeOne('width', 'none'),
scrapeOne('height', 'none'),
scrapeOne('title', 'none'),
scrapeOne('day', 'none'),
scrapeOne('month', 'none'),
scrapeOne('year', 'none'),
scrapeOne('number', 'regex', regex=r'<img id="strip" src="\./comics/(?P<number>[0-9]+)\.[^"]{3,4}">'),
]
},
'intrepidgirlbot' : {
'baseLink' : 'http://www.intrepidgirlbot.com/2009/03/11/a-visit-to-the-country/',
'nameN' : 'The Intrepid Girlbot',
'name' : 'intrepidgirlbot',
'findValues' : [
scrapeOne('first', 'regex', regex=r'<div class="comic-nav-first"><a href="(?P<first>[^"]+)" title="([^"]*)"', valueType='link'),
scrapeOne('prev', 'regex', regex=r'<div class="comic-nav-prev"><a href="(?P<prev>[^"]+)" title="([^"]*)"', valueType='link'),
scrapeOne('next', 'regex', regex=r'<div class="comic-nav-next"><a href="(?P<next>[^"]+)" title="([^"]*)"', valueType='link'),
scrapeOne('last', 'regex', regex=r'<div class="comic-nav-last"><a href="(?P<last>[^"]+)" title="([^"]*)"', valueType='link'),
scrapeOne('imageLink', 'regex', regex=r'<img src="(?P<imageLink>[^"]+)" width="[^"]+" height="[^"]+" alt="[^"]+" title="[^"]+" class="[^"]+" />', valueType='link'),
scrapeOne('mouseOver', 'regex', regex=r'<img src="[^"]+" width="[^"]+" height="[^"]+" alt="[^"]+" title="(?P<mouseOver>[^"]+)" class="[^"]+" />'),
scrapeOne('width', 'regex', regex=r'<img src="[^"]+" width="(?P<width>[^"]+)" height="[^"]+" alt="[^"]+" title="[^"]+" class="[^"]+" />'),
scrapeOne('height', 'regex', regex=r'<img src="[^"]+" width="[^"]+" height="(?P<height>[^"]+)" alt="[^"]+" title="[^"]+" class="[^"]+" />'),
scrapeOne('title', 'regex', regex=r'<img src="[^"]+" width="[^"]+" height="[^"]+" alt="(?P<title>[^"]+)" title="[^"]+" class="[^"]+" />'),
scrapeOne('day', 'regex', regex=r'<p>Posted on (?P<month>[^ ]+) (?P<day>[0-9]+), (?P<year>[0-9]+) by Diana Nock in '),
scrapeOne('month', 'regex', regex=r'<p>Posted on (?P<month>[^ ]+) (?P<day>[0-9]+), (?P<year>[0-9]+) by Diana Nock in ', valueType='month'),
scrapeOne('year', 'regex', regex=r'<p>Posted on (?P<month>[^ ]+) (?P<day>[0-9]+), (?P<year>[0-9]+) by Diana Nock in '),
scrapeOne('number', 'none')
]
},
}
##############################################################################
#do work, finished with defining stuff to set everything up
if 1:
#main program
parser = argparse.ArgumentParser()
parser.add_argument("comic", help="select which comic to download")
args = parser.parse_args()
comic=comicsClass(comics[args.comic])
comic.download()
else:
#temporary testing routines
tmp=comicsClass(comics['questionablecontent'])
pprint.pprint(tmp.page)
To do:
High priority:
Support incremental updates (fetch only strips that are not yet downloaded) instead of re-downloading everything
Low priority:
Maybe add some numbering capabilities in comicsClass.populatePage instead of just in comicsClass.populatePages?
Add the possibility to use Beautiful Soup to get the values from the html?
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment