Created
January 24, 2016 16:16
-
-
Save HakurouKen/ce2cdae743676fc443bb to your computer and use it in GitHub Desktop.
get xkcd comic
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import urllib,urllib2,json | |
import re | |
import os | |
import logging | |
from HTMLParser import HTMLParser | |
class HTMLStripper(HTMLParser):
    '''
    Collect the text content of an HTML fragment and discard the tags.

    Usage: create an instance, pass markup to ``feed()``, then read the
    accumulated text back with ``get_data()``.

    Solution from: http://stackoverflow.com/questions/753052/strip-html-from-strings-in-python#answer-925630
    '''

    def __init__(self):
        # Run the base initialiser rather than calling reset() directly, so
        # any state the parser sets up in __init__ is in place before feed().
        HTMLParser.__init__(self)
        # Text fragments in the order the parser reported them.
        self.fed = []

    def handle_data(self, d):
        '''Record a run of plain text found between tags.'''
        self.fed.append(d)

    def handle_entityref(self, name):
        '''
        Keep a named reference such as ``&amp;`` as its decoded character.

        Without this override the reference is reported here instead of to
        handle_data() and would be missing from the result. unescape()
        leaves references it does not recognise unchanged.
        '''
        self.fed.append(self.unescape('&%s;' % name))

    def handle_charref(self, name):
        '''Keep a numeric reference such as ``&#38;`` as its decoded character.'''
        self.fed.append(self.unescape('&#%s;' % name))

    def get_data(self):
        '''Return all text collected so far as a single string.'''
        return ''.join(self.fed)
def strip_tags(html):
    '''
    Return the text of ``html`` with the markup removed.

    Needed because some comic titles contain html tags, such as comic 472.
    '''
    stripper = HTMLStripper()
    stripper.feed(html)
    text = stripper.get_data()
    return text
class Xkcd(object):
    '''
    Fetch the metadata and the image of a single xkcd comic.

    ``get()`` returns a ``(status, payload)`` pair:
         0  success; payload is a dict with keys 'img' and 'title'
        -1  HTTP or unexpected error; payload is a message string
        -2  the comic does not exist; payload is a message string

    ``save()`` returns 0 on success, -1 when the image URL answers with
    HTTP 404, and -2 for any other failure.
    '''
    # URL template of the per-comic JSON metadata, formatted with the index.
    link = 'http://www.xkcd.com/{}/info.0.json'

    def __init__(self, index):
        '''
        Remember the comic number and build its metadata URL.

        Raises TypeError when ``index`` is not an int.
        '''
        if not isinstance(index, int):
            raise TypeError('Parameter "index" must be integer.')
        self.index = index
        # The instance attribute shadows the class-level template.
        self.link = Xkcd.link.format(index)

    def get(self):
        '''
        Download and parse the comic's JSON metadata.

        On success the payload dict holds the image URL under 'img' and a
        file name of the form "<index> - <title><suffix>" under 'title'.
        '''
        # Comic No.404 does not exist, just like its name says.
        if self.index == 404:
            return -2, 'The comic 404 is a joke.'
        try:
            resp = urllib2.urlopen(self.link)
            try:
                info = json.loads(resp.read())
            finally:
                # Release the connection even when parsing fails.
                resp.close()
            img = info['img']
            # Reuse the image's extension; fall back to '.jpg' if it has none.
            suffix_match = re.search(r'\.\w*?$', img)
            suffix = suffix_match.group() if suffix_match else '.jpg'
            # make the title with '/' a valid filename
            title = re.sub('/', '_', strip_tags(info['title']))
            filename = "{index} - {title}{suffix}".format(
                index=self.index, title=title, suffix=suffix)
            return 0, {'img': img, 'title': filename}
        except urllib2.HTTPError as error:
            if error.code == 404:
                return -2, 'Comic {} does not exists.'.format(self.index)
            return -1, 'Error at comic {} ,HTTPError {} happend.'.format(self.index, error.code)
        except Exception:
            # Deliberately broad so one bad comic does not stop a long run,
            # but KeyboardInterrupt and SystemExit still propagate.
            return -1, 'Error at comic {}, unknown error happend.'.format(self.index)

    def save(self, src, path):
        '''
        Download ``src`` and write its bytes to ``path``.

        The download finishes before the destination is opened, so a failed
        request does not leave an empty file behind.
        '''
        try:
            resp = urllib2.urlopen(src)
            try:
                data = resp.read()
            finally:
                resp.close()
            with open(path, 'wb') as f:
                f.write(data)
            return 0
        except urllib2.HTTPError as error:
            # some xkcd has no picture, such as comic 1608
            if error.code == 404:
                return -1
            return -2
        except Exception:
            return -2
class Progress(object):
    '''
    Save and load the download progress (a comic index) to a file.

    The class is a singleton: every call to ``Progress(...)`` returns the
    same object. ``__init__`` still runs on each call, so the most recently
    passed ``filename`` is the one in effect.
    '''
    # The single shared instance, created on first construction.
    _instance = None

    def __new__(cls, *args, **kwargs):
        # Constructor arguments are accepted here only so that
        # Progress(filename) works; they must not be forwarded to
        # object.__new__, which does not take extra arguments.
        if cls._instance is None:
            cls._instance = super(Progress, cls).__new__(cls)
        return cls._instance

    def __init__(self, filename):
        '''Remember the path of the file that stores the progress.'''
        self.filename = filename

    def load(self):
        '''
        Return the stored index.

        Returns 1 when the file does not exist or when its first line is
        not an integer.
        '''
        if not os.path.isfile(self.filename):
            return 1
        with open(self.filename) as f:
            s = f.readline().strip()
        try:
            return int(s)
        except ValueError:
            return 1

    def save(self, index):
        '''
        Overwrite the progress file with ``index``.

        Raises TypeError when ``index`` is not an int.
        '''
        if not isinstance(index, int):
            raise TypeError('Parameter "index" must be integer.')
        with open(self.filename, 'w') as f:
            f.write(str(index))
if __name__ == '__main__':
    # Download comics one after another, starting at the saved index, until
    # the first comic that does not exist. Results are logged to xkcd.log.
    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s [%(levelname)s] %(message)s',
        datefmt='[%Y-%m-%d %H:%M:%S]',
        filename='xkcd.log'
    )
    progress = Progress('progress.data')
    index = progress.load()
    if not os.path.isdir('comic'):
        os.mkdir('comic')
    try:
        while True:
            comic = Xkcd(index)
            ret, info = comic.get()
            if ret == 0:
                saved = comic.save(info['img'], os.path.join('comic', info['title']))
                if saved == 0:
                    logging.info('Comic {} saved successfully.'.format(index))
                elif saved == -1:
                    logging.info('Comic {} has no image.'.format(index))
                else:
                    logging.warning('Comic {} picture saved error.'.format(index))
            else:
                logging.warning(info)
                # -2 means "no such comic": that is the end of the series,
                # except for No.404, which reports -2 but is followed by
                # further comics.
                if ret == -2 and index != 404:
                    break
            index += 1
    finally:
        # Store the position on a normal finish and also when the run is
        # interrupted, so the next run resumes here instead of starting over.
        progress.save(index)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment