Last active
June 7, 2016 08:04
-
-
Save yue82/b6bdadaf564fc2bddbf5f93a95f73cca to your computer and use it in GitHub Desktop.
photozou image crawler
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import argparse
import contextlib
import json
import os
import urllib
import urllib2
class Crawler(object):
    """Search photozou for public photos and download the matching images.

    Written for Python 2: HTTP access goes through ``urllib2`` and the query
    string is built with ``urllib.urlencode``.
    """

    # Public-photo search endpoint queried by ``search``.
    endpoint = 'https://api.photozou.jp/rest/search_public.json'
    # Base URL; ``fetch_img`` appends a photo id to it to request an image.
    imgurl = 'http://photozou.jp/p/img/'
    # Thumbnail base URL. Not referenced by any method of this class.
    thumbrurl = 'http://photozou.jp/p/thumb/'
    # Upper bound applied to the ``limit`` sent to the search endpoint.
    maxlimit = 1000

    def __init__(self):
        # Zero-padding width used for numbered file names; ``search``
        # narrows it to match the limit actually requested.
        self.digit = len(str(self.maxlimit))

    def search(self, keyword, isthumb, limit):
        """Return the photo ids the search endpoint reports for *keyword*.

        Args:
            keyword: Search keyword sent as the ``keyword`` query parameter.
            isthumb: Accepted for interface compatibility but currently
                ignored. NOTE(review): presumably meant to select thumbnail
                downloads via ``thumbrurl`` -- confirm before wiring it up.
            limit: Maximum number of results to request; values above
                ``maxlimit`` are clamped to ``maxlimit``.

        Returns:
            A list with the ``photo_id`` of every photo in the response
            (empty when the response carries no photos).

        Side effects:
            Updates ``self.digit`` to the number of digits in the effective
            limit, which controls zero padding in ``fetch_img``.
        """
        limit = min(limit, self.maxlimit)
        self.digit = len(str(limit))

        params = {'keyword': keyword, 'limit': limit}
        query = '{}?{}'.format(self.endpoint, urllib.urlencode(params))

        # urllib2 responses are not context managers in Python 2, so wrap
        # them in ``closing`` to guarantee the socket is released even if
        # reading or JSON decoding fails.
        with contextlib.closing(urllib2.urlopen(query)) as res:
            resjson = json.loads(res.read())

        # ``info`` stays a strict lookup so an unexpected response still
        # fails loudly. The ``photo`` entry is tolerated as missing/null --
        # presumably that is how an empty result is reported; verify
        # against the API.
        photos = resjson['info'].get('photo') or []
        return [photo['photo_id'] for photo in photos]

    def _image_name(self, photo_id, index, filename):
        """Return the output file name for one downloaded image.

        With ``filename`` set to ``None`` the name is ``<photo_id>.jpg``;
        otherwise it is ``<filename>-<index>.jpg`` with the index
        zero-padded to ``self.digit`` characters.
        """
        if filename is None:
            return '{}.jpg'.format(photo_id)
        return '{}-{:0>{}}.jpg'.format(filename, index, self.digit)

    def fetch_img(self, ids, dirname, filename):
        """Download the image for every id in *ids* into *dirname*.

        Args:
            ids: Iterable of photo ids, e.g. the list returned by ``search``.
            dirname: Directory the images are written to. It is created
                (including parents) when it does not exist yet.
            filename: Prefix for numbered output names, or ``None`` to name
                each file after its photo id.
        """
        # Without this a fresh checkout fails on the first ``open`` because
        # the output directory does not exist.
        if dirname and not os.path.isdir(dirname):
            os.makedirs(dirname)

        for index, photo_id in enumerate(ids):
            out_imgpath = os.path.join(
                dirname, self._image_name(photo_id, index, filename))
            query_url = '{}{}'.format(self.imgurl, photo_id)

            # ``urlopen`` follows redirects itself, so this response already
            # carries the final image bytes; re-requesting ``geturl()`` would
            # download everything twice and leak the first connection.
            with contextlib.closing(urllib2.urlopen(query_url)) as res:
                payload = res.read()

            # Read before opening the file so a failed download does not
            # leave an empty file behind.
            with open(out_imgpath, 'wb') as fo:
                fo.write(payload)
if __name__ == '__main__':
    # Command-line entry point: search photozou for the given keyword and
    # download the matching images, naming them after the keyword.
    parser = argparse.ArgumentParser(description='photozou image crawler')
    parser.add_argument('keyword',
                        help='image search keyword')
    # The result limit and output directory used to be hard-coded; they are
    # now optional flags whose defaults reproduce the previous behaviour.
    parser.add_argument('-n', '--limit', type=int, default=20,
                        help='maximum number of images to fetch '
                             '(default: %(default)s)')
    parser.add_argument('-o', '--output-dir', default='./output',
                        help='directory the images are written to '
                             '(default: %(default)s)')
    args = parser.parse_args()

    # Passed through to ``Crawler.search``, which accepts the flag but does
    # not act on it.
    isthumb = True

    crawler = Crawler()
    ids = crawler.search(args.keyword, isthumb, args.limit)
    # The keyword doubles as the file-name prefix for the saved images.
    crawler.fetch_img(ids, args.output_dir, args.keyword)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment