Skip to content

Instantly share code, notes, and snippets.

@lukas-buergi
Created December 9, 2012 19:47
Show Gist options
  • Select an option

  • Save lukas-buergi/4246699 to your computer and use it in GitHub Desktop.

Select an option

Save lukas-buergi/4246699 to your computer and use it in GitHub Desktop.
Web comic downloader
##############################################################################
# comicdl.py
# mainly for downloading web comics
# will maybe expand to general purpose scraping tool for other stuff too
##############################################################################
# module imports
import re, random, time, pickle, urllib.request, os, urllib.parse, pprint, math, collections, argparse
##############################################################################
# settings
saveFolder='/home/t4b/comics/'
waitTime=3.0
waitOffset=0
debug=1
##############################################################################
# helper functions
def downloadDelay():
time.sleep((random.random()+waitOffset)*waitTime)
##############################################################################
class scrapeOne:
"""Data and methods to scrape one value of a certain web site."""
def __init__(self, name, scraper, regex=None, valueType=None):
self.name=name
self.scraper=scraper
self.regex=regex
self.valueType=valueType
def value(self, info):
"""Return the value based on the information (downloaded html page, context) given in info.
Depending on the type of the value some stuff to improve it can be done, like making links absolute."""
# get the value
if self.scraper == 'regex' and self.regex:
match=re.search(self.regex, info.page['raw'])
if match:
self.retValue=match.group(self.name)
else:
self.retValue=None
elif self.scraper == 'bs4': ## There should at least also be 'bs4' for Beautiful Soup 4
pass
elif self.scraper == 'none':
self.retValue=None
# now maybe do some stuff with the raw value before returning it
if self.retValue:
if self.valueType == 'link': # value is a link which may be relativ, so we change it to an absolute one
self.retValue=urllib.parse.urljoin(info.url, self.retValue)
elif self.valueType == 'month': # value is a month name as text, but we want it as a number
pass
else: # nothing special
self.retValue=self.retValue
return(self.retValue)
##############################################################################
class comicsClass:
"""A class containing methods and information to scrape one website.
Data structure:
self.info: dictionary with general information about the website
self.info['name']: the websites name, spelled so it's okay for filenames etc. I suggest [a-zA-Z0-9]
self.info['nameN']: the websites name, this time the way it's normally spelled. Should probably still be ASCII, haven't got a clue about encoding-handling by Python etc.
self.info['baseLink']: initial url
self.info['regExprs']: dictionary containing pairs of regular expressions and names describing what kind of information can be found with the regex
self.url: the url of the present page
self.page: dictionary with information on the present page. The contained information depends on self.info['regExprs']. Always available is:
self.page['raw']: the raw download of the present page as a string"""
def __init__(self, info):
"""Sets up self.info and self.url, then calls self.populate"""
self.info=info
self.url=self.info['baseLink']
self.page={}
self.populatePage()
def populatePage(self):
"""Updates self.page after self.url changed."""
# Download the new raw html page
self.page['raw']=str(list(urllib.request.urlopen(self.url)))
downloadDelay()
# Update the rest of self.page
for findValue in self.info['findValues']:
self.page[findValue.name]=findValue.value(self)
def move(self, where):
"""Moves to another page, that is: modifies self.url and then calls self.populate.
The only argument is where to move: to the first, prev, next or last page."""
if self.page[where] and not self.page[where] == self.url:
self.url=self.page[where]
self.populatePage()
return(1)
else:
return(0)
def populatePages(self, direction='forward'):
"""Populates self.pages with information about the self.page-values encountered when going through the website from the beginning to the end (default) or the other way round."""
if direction == 'forward':
movAbs='first'
movRel='next'
elif direction == 'backward':
movAbs='last'
movRel='prev'
self.pages=[]
# This of course removes a feature in debug mode which means it also can't be debugged - but this is worth it because when debugging I certainly never want to go through the entire site. It just means that while debugging I need to set self.info['baseLink'] right
if not debug:
self.move(movAbs)
self.pages.append(dict(self.page))
while self.move(movRel):
self.pages.append(dict(self.page))
if direction == 'backward':
self.pages.reverse()
# Now I can fill 'number' no matter whether it was possible to already get from each individual page
for i in range(len(self.pages)):
self.pages[i]['number']=i+1
def download(self):
#Get all the data
self.populatePages('backward')
#Make a folder for the downloads
try:
os.mkdir(saveFolder + self.info['name'])
except OSError as FileExistsError:
pass # I don't care, idiots
#Actually download
for page in self.pages:
fileExtension=re.search(r'(\.[^\.]+)$', page['imageLink']).group(1)
urllib.request.urlretrieve(page['imageLink'], saveFolder + self.info['name'] + '/' + str(page['number']) + fileExtension)
downloadDelay()
comics={
'questionablecontent' : {
'baseLink' : 'http://www.questionablecontent.net/view.php?comic=3',
'nameN' : 'Questionable Content',
'name' : 'questionablecontent',
'findValues' : [
scrapeOne('first', 'regex', regex=r'<li><a href="(?P<first>[^"]+)">First</a></li>', valueType='link'),
scrapeOne('prev', 'regex', regex=r'<li><a href="(?P<prev>[^"]+)">Previous</a></li>', valueType='link'),
scrapeOne('next', 'regex', regex=r'<li><a href="(?P<next>[^"]+)">Next</a></li>', valueType='link'),
scrapeOne('last', 'regex', regex=r'<li><a href="(?P<last>[^"]+)">Latest</a></li>', valueType='link'),
scrapeOne('imageLink', 'regex', regex=r'<img id="strip" src="(?P<imageLink>[^"]+)">', valueType='link'),
scrapeOne('mouseOver', 'none'),
scrapeOne('width', 'none'),
scrapeOne('height', 'none'),
scrapeOne('title', 'none'),
scrapeOne('day', 'none'),
scrapeOne('month', 'none'),
scrapeOne('year', 'none'),
scrapeOne('number', 'regex', regex=r'<img id="strip" src="\./comics/(?P<number>[0-9]+)\.[^"]{3,4}">'),
]
},
'intrepidgirlbot' : {
'baseLink' : 'http://www.intrepidgirlbot.com/2009/03/11/a-visit-to-the-country/',
'nameN' : 'The Intrepid Girlbot',
'name' : 'intrepidgirlbot',
'findValues' : [
scrapeOne('first', 'regex', regex=r'<div class="comic-nav-first"><a href="(?P<first>[^"]+)" title="([^"]*)"', valueType='link'),
scrapeOne('prev', 'regex', regex=r'<div class="comic-nav-prev"><a href="(?P<prev>[^"]+)" title="([^"]*)"', valueType='link'),
scrapeOne('next', 'regex', regex=r'<div class="comic-nav-next"><a href="(?P<next>[^"]+)" title="([^"]*)"', valueType='link'),
scrapeOne('last', 'regex', regex=r'<div class="comic-nav-last"><a href="(?P<last>[^"]+)" title="([^"]*)"', valueType='link'),
scrapeOne('imageLink', 'regex', regex=r'<img src="(?P<imageLink>[^"]+)" width="[^"]+" height="[^"]+" alt="[^"]+" title="[^"]+" class="[^"]+" />', valueType='link'),
scrapeOne('mouseOver', 'regex', regex=r'<img src="[^"]+" width="[^"]+" height="[^"]+" alt="[^"]+" title="(?P<mouseOver>[^"]+)" class="[^"]+" />'),
scrapeOne('width', 'regex', regex=r'<img src="[^"]+" width="(?P<width>[^"]+)" height="[^"]+" alt="[^"]+" title="[^"]+" class="[^"]+" />'),
scrapeOne('height', 'regex', regex=r'<img src="[^"]+" width="[^"]+" height="(?P<height>[^"]+)" alt="[^"]+" title="[^"]+" class="[^"]+" />'),
scrapeOne('title', 'regex', regex=r'<img src="[^"]+" width="[^"]+" height="[^"]+" alt="(?P<title>[^"]+)" title="[^"]+" class="[^"]+" />'),
scrapeOne('day', 'regex', regex=r'<p>Posted on (?P<month>[^ ]+) (?P<day>[0-9]+), (?P<year>[0-9]+) by Diana Nock in '),
scrapeOne('month', 'regex', regex=r'<p>Posted on (?P<month>[^ ]+) (?P<day>[0-9]+), (?P<year>[0-9]+) by Diana Nock in ', valueType='month'),
scrapeOne('year', 'regex', regex=r'<p>Posted on (?P<month>[^ ]+) (?P<day>[0-9]+), (?P<year>[0-9]+) by Diana Nock in '),
scrapeOne('number', 'none')
]
},
}
##############################################################################
#do work, finished with defining stuff to set everything up
if 1:
#main program
parser = argparse.ArgumentParser()
parser.add_argument("comic", help="select which comic to download")
args = parser.parse_args()
comic=comicsClass(comics[args.comic])
comic.download()
else:
#temporary testing routines
tmp=comicsClass(comics['questionablecontent'])
pprint.pprint(tmp.page)
To do:
High priority:
Support incremental updates (fetch only strips that are not yet downloaded) instead of re-downloading everything
Low priority:
Maybe add some numbering capabilities in comicsClass.populatePage instead of just in comicsClass.populatePages?
Add the possibility to use Beautiful Soup to get the values from the html?
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment