-
-
Save lukas-buergi/4246699 to your computer and use it in GitHub Desktop.
Web comic downloader
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| ############################################################################## | |
| # comicdl.py | |
| # mainly for downloading web comics | |
| # will maybe expand to general purpose scraping tool for other stuff too | |
| ############################################################################## | |
| # module imports | |
| import re, random, time, pickle, urllib.request, os, urllib.parse, pprint, math, collections, argparse | |
| ############################################################################## | |
| # settings | |
saveFolder='/home/t4b/comics/'  # destination root; one sub-folder per comic is created beneath it
waitTime=3.0  # upper bound (seconds) of the randomized delay between downloads
waitOffset=0  # added to random.random() before scaling, so minimum delay is waitOffset*waitTime
debug=1  # non-zero: populatePages() starts from baseLink instead of jumping to the first/last page
| ############################################################################## | |
| # helper functions | |
def downloadDelay():
    """Sleep for a randomized interval to throttle successive downloads.

    The pause is drawn uniformly from [waitOffset, 1 + waitOffset) and
    scaled by waitTime, both module-level settings.
    """
    delay = (random.random() + waitOffset) * waitTime
    time.sleep(delay)
| ############################################################################## | |
class scrapeOne:
    """Data and methods to scrape one value of a certain web site.

    One instance describes how a single named value (a navigation link,
    an image url, a date component, ...) is extracted from a downloaded
    page.

    Attributes:
        name: key the value is stored under; for the 'regex' scraper it
            is also the named group fetched from the match.
        scraper: extraction strategy: 'regex', 'bs4' (reserved, not
            implemented yet) or 'none' (always yields None).
        regex: pattern used when scraper == 'regex'.
        valueType: optional post-processing hint: 'link' makes relative
            urls absolute against info.url; 'month' is reserved for a
            month-name-to-number conversion (not implemented yet).
    """
    def __init__(self, name, scraper, regex=None, valueType=None):
        self.name=name
        self.scraper=scraper
        self.regex=regex
        self.valueType=valueType
    def value(self, info):
        """Return the value based on the information (downloaded html page, context) given in info.

        info must provide info.page['raw'] (the downloaded page as a string)
        and info.url (used to absolutize relative links).
        Returns None when nothing matched or the scraper yields nothing.
        Depending on the type of the value some stuff to improve it can be
        done, like making links absolute.
        """
        # Start from a clean slate on every call. Previously retValue was
        # never set on the 'bs4'/unknown-scraper paths (AttributeError on a
        # fresh instance) and kept a stale value from the last call on reuse.
        self.retValue=None
        # get the value
        if self.scraper == 'regex' and self.regex:
            match=re.search(self.regex, info.page['raw'])
            if match:
                self.retValue=match.group(self.name)
        elif self.scraper == 'bs4':
            # TODO: Beautiful Soup 4 support not implemented yet
            pass
        # now maybe do some stuff with the raw value before returning it
        if self.retValue:
            if self.valueType == 'link': # value is a link which may be relative, so we change it to an absolute one
                self.retValue=urllib.parse.urljoin(info.url, self.retValue)
            elif self.valueType == 'month': # value is a month name as text, but we want it as a number
                # TODO: conversion not implemented yet; raw text is returned
                pass
        return(self.retValue)
| ############################################################################## | |
class comicsClass:
    """A class containing methods and information to scrape one website.
    Data structure:
    self.info: dictionary with general information about the website
    self.info['name']: the websites name, spelled so it's okay for filenames etc. I suggest [a-zA-Z0-9]
    self.info['nameN']: the websites name, this time the way it's normally spelled. Should probably still be ASCII, haven't got a clue about encoding-handling by Python etc.
    self.info['baseLink']: initial url
    self.info['regExprs']: dictionary containing pairs of regular expressions and names describing what kind of information can be found with the regex
    self.url: the url of the present page
    self.page: dictionary with information on the present page. The contained information depends on self.info['regExprs']. Always available is:
    self.page['raw']: the raw download of the present page as a string"""
    # NOTE(review): the docstring mentions self.info['regExprs'], but the code
    # below actually reads self.info['findValues'] (a list of scrapeOne objects).
    def __init__(self, info):
        """Sets up self.info and self.url, then calls self.populate"""
        self.info=info
        self.url=self.info['baseLink']
        self.page={}
        # Downloads the base page immediately: network side effect in the constructor.
        self.populatePage()
    def populatePage(self):
        """Updates self.page after self.url changed."""
        # Download the new raw html page.
        # NOTE(review): str(list(...)) produces the repr of a list of bytes
        # objects (e.g. "[b'<html>\\n', ...]"), not decoded html. The regexes in
        # `comics` were presumably written against that representation — confirm
        # before "fixing" this to a proper .read().decode().
        self.page['raw']=str(list(urllib.request.urlopen(self.url)))
        downloadDelay()  # throttle so we don't hammer the site
        # Update the rest of self.page by running every configured scraper.
        for findValue in self.info['findValues']:
            self.page[findValue.name]=findValue.value(self)
    def move(self, where):
        """Moves to another page, that is: modifies self.url and then calls self.populate.
        The only argument is where to move: to the first, prev, next or last page.
        Returns 1 when the move happened, 0 when the link is missing or points
        at the current page (i.e. we are already at that end of the site)."""
        if self.page[where] and not self.page[where] == self.url:
            self.url=self.page[where]
            self.populatePage()
            return(1)
        else:
            return(0)
    def populatePages(self, direction='forward'):
        """Populates self.pages with information about the self.page-values encountered when going through the website from the beginning to the end (default) or the other way round."""
        if direction == 'forward':
            movAbs='first'
            movRel='next'
        elif direction == 'backward':
            movAbs='last'
            movRel='prev'
        self.pages=[]
        # This of course removes a feature in debug mode which means it also can't be debugged - but this is worth it because when debugging I certainly never want to go through the entire site. It just means that while debugging I need to set self.info['baseLink'] right
        if not debug:
            self.move(movAbs)
        # dict(self.page) snapshots the page info; self.page is mutated on every move.
        self.pages.append(dict(self.page))
        while self.move(movRel):
            self.pages.append(dict(self.page))
        if direction == 'backward':
            self.pages.reverse()
        # Now I can fill 'number' no matter whether it was possible to already get from each individual page
        for i in range(len(self.pages)):
            self.pages[i]['number']=i+1
    def download(self):
        """Walk the whole site, then save every strip as <number>.<ext> under saveFolder/<name>/."""
        # Get all the data (walking backward, then reversed, so strip 1 comes first)
        self.populatePages('backward')
        # Make a folder for the downloads
        try:
            os.mkdir(saveFolder + self.info['name'])
        except OSError as FileExistsError:
            # NOTE(review): this does NOT catch only FileExistsError — it catches
            # every OSError and merely *names* the caught exception
            # "FileExistsError", shadowing the builtin. Permission errors,
            # missing parent dirs etc. are silently swallowed here too.
            pass # I don't care, idiots
        # Actually download
        for page in self.pages:
            # keep the original file extension (last ".xyz" of the image url)
            fileExtension=re.search(r'(\.[^\.]+)$', page['imageLink']).group(1)
            urllib.request.urlretrieve(page['imageLink'], saveFolder + self.info['name'] + '/' + str(page['number']) + fileExtension)
            downloadDelay()
# Per-site scraping configuration, keyed by the comic's short name (the value
# accepted on the command line). Each entry supplies:
#   'baseLink'   : url where scraping starts
#   'nameN'      : human-readable display name
#   'name'       : filesystem-safe name, used as the download sub-folder
#   'findValues' : scrapeOne instances, one per value extracted from each page;
#                  'none' scrapers are placeholders so every site exposes the
#                  same set of keys in self.page.
comics={
    'questionablecontent' : {
        'baseLink' : 'http://www.questionablecontent.net/view.php?comic=3',
        'nameN' : 'Questionable Content',
        'name' : 'questionablecontent',
        'findValues' : [
            # navigation links (relative, absolutized via valueType='link')
            scrapeOne('first', 'regex', regex=r'<li><a href="(?P<first>[^"]+)">First</a></li>', valueType='link'),
            scrapeOne('prev', 'regex', regex=r'<li><a href="(?P<prev>[^"]+)">Previous</a></li>', valueType='link'),
            scrapeOne('next', 'regex', regex=r'<li><a href="(?P<next>[^"]+)">Next</a></li>', valueType='link'),
            scrapeOne('last', 'regex', regex=r'<li><a href="(?P<last>[^"]+)">Latest</a></li>', valueType='link'),
            scrapeOne('imageLink', 'regex', regex=r'<img id="strip" src="(?P<imageLink>[^"]+)">', valueType='link'),
            # metadata QC's markup doesn't expose — kept as placeholders
            scrapeOne('mouseOver', 'none'),
            scrapeOne('width', 'none'),
            scrapeOne('height', 'none'),
            scrapeOne('title', 'none'),
            scrapeOne('day', 'none'),
            scrapeOne('month', 'none'),
            scrapeOne('year', 'none'),
            # strip number taken from the image filename
            scrapeOne('number', 'regex', regex=r'<img id="strip" src="\./comics/(?P<number>[0-9]+)\.[^"]{3,4}">'),
        ]
    },
    'intrepidgirlbot' : {
        'baseLink' : 'http://www.intrepidgirlbot.com/2009/03/11/a-visit-to-the-country/',
        'nameN' : 'The Intrepid Girlbot',
        'name' : 'intrepidgirlbot',
        'findValues' : [
            scrapeOne('first', 'regex', regex=r'<div class="comic-nav-first"><a href="(?P<first>[^"]+)" title="([^"]*)"', valueType='link'),
            scrapeOne('prev', 'regex', regex=r'<div class="comic-nav-prev"><a href="(?P<prev>[^"]+)" title="([^"]*)"', valueType='link'),
            scrapeOne('next', 'regex', regex=r'<div class="comic-nav-next"><a href="(?P<next>[^"]+)" title="([^"]*)"', valueType='link'),
            scrapeOne('last', 'regex', regex=r'<div class="comic-nav-last"><a href="(?P<last>[^"]+)" title="([^"]*)"', valueType='link'),
            # the same <img ...> tag is matched repeatedly, each scrapeOne
            # capturing a different attribute of it
            scrapeOne('imageLink', 'regex', regex=r'<img src="(?P<imageLink>[^"]+)" width="[^"]+" height="[^"]+" alt="[^"]+" title="[^"]+" class="[^"]+" />', valueType='link'),
            scrapeOne('mouseOver', 'regex', regex=r'<img src="[^"]+" width="[^"]+" height="[^"]+" alt="[^"]+" title="(?P<mouseOver>[^"]+)" class="[^"]+" />'),
            scrapeOne('width', 'regex', regex=r'<img src="[^"]+" width="(?P<width>[^"]+)" height="[^"]+" alt="[^"]+" title="[^"]+" class="[^"]+" />'),
            scrapeOne('height', 'regex', regex=r'<img src="[^"]+" width="[^"]+" height="(?P<height>[^"]+)" alt="[^"]+" title="[^"]+" class="[^"]+" />'),
            scrapeOne('title', 'regex', regex=r'<img src="[^"]+" width="[^"]+" height="[^"]+" alt="(?P<title>[^"]+)" title="[^"]+" class="[^"]+" />'),
            # the "Posted on ..." line is matched three times; each scrapeOne
            # pulls out its own named group from the same pattern
            scrapeOne('day', 'regex', regex=r'<p>Posted on (?P<month>[^ ]+) (?P<day>[0-9]+), (?P<year>[0-9]+) by Diana Nock in '),
            scrapeOne('month', 'regex', regex=r'<p>Posted on (?P<month>[^ ]+) (?P<day>[0-9]+), (?P<year>[0-9]+) by Diana Nock in ', valueType='month'),
            scrapeOne('year', 'regex', regex=r'<p>Posted on (?P<month>[^ ]+) (?P<day>[0-9]+), (?P<year>[0-9]+) by Diana Nock in '),
            scrapeOne('number', 'none')
        ]
    },
}
| ############################################################################## | |
| #do work, finished with defining stuff to set everything up | |
def _main():
    """Parse the command line and download the selected comic.

    The single positional argument is a key of the `comics` dict; a wrong
    name raises KeyError (argparse already handles missing arguments).
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("comic", help="select which comic to download")
    args = parser.parse_args()
    comic = comicsClass(comics[args.comic])
    comic.download()

if __name__ == '__main__':
    # Guarding on __name__ (instead of the previous `if 1:`) keeps the
    # download from firing as a side effect when this file is imported,
    # and removes the permanently-dead `else:` testing branch.
    _main()
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| To do: | |
| High priority: | |
| Updating instead of downloading everything | |
| Low priority: | |
| Maybe add some numbering capabilities in comicsClass.populatePage instead of just in comicsClass.populatePages? | |
| Add the possibility to use Beautiful Soup to get the values from the html? |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment