Skip to content

Instantly share code, notes, and snippets.

@yue82
Last active June 7, 2016 08:04
Show Gist options
  • Save yue82/b6bdadaf564fc2bddbf5f93a95f73cca to your computer and use it in GitHub Desktop.
photozou image crawler
import os
import urllib
import urllib2
import argparse
import json
class Crawler(object):
    """Search the Photozou public API for photos and download them.

    Typical use::

        ids = Crawler().search('cat', True, 20)
        Crawler().fetch_img(ids, './output', 'cat')

    NOTE: written for Python 2 (``urllib``/``urllib2``).
    """

    # REST endpoint for the public photo search API.
    endpoint = 'https://api.photozou.jp/rest/search_public.json'
    # '<imgurl><photo_id>' redirects to the full-size image.
    imgurl = 'http://photozou.jp/p/img/'
    # Thumbnail variant of the image URL (currently unused by the code).
    thumbrurl = 'http://photozou.jp/p/thumb/'
    # API-side cap on results per query.
    maxlimit = 1000

    def __init__(self):
        # Zero-pad width for sequence numbers in output filenames;
        # recomputed by search() once the actual limit is known.
        self.digit = len(str(self.maxlimit))

    def search(self, keyword, isthumb, limit):
        """Return a list of photo ids matching *keyword*.

        *limit* is clamped to ``maxlimit``.  *isthumb* is accepted for
        interface compatibility but is not used by the search request.
        Raises ``urllib2.URLError``/``HTTPError`` on network failure.
        """
        if limit > self.maxlimit:
            limit = self.maxlimit
        self.digit = len(str(limit))
        params = urllib.urlencode({'keyword': keyword, 'limit': limit})
        res = urllib2.urlopen('{}?{}'.format(Crawler.endpoint, params))
        try:
            resjson = json.loads(res.read())
        finally:
            # Close the connection even if the body is not valid JSON.
            res.close()
        # An empty result set may omit the 'photo' list entirely;
        # return [] instead of raising KeyError in that case.
        return [photo['photo_id']
                for photo in resjson.get('info', {}).get('photo', [])]

    def fetch_img(self, ids, dirname, filename):
        """Download each photo id in *ids* into directory *dirname*.

        ``filename is None`` -> files are named '<photo_id>.jpg';
        otherwise            -> '<filename>-<zero-padded index>.jpg'.
        Creates *dirname* if it does not exist.
        """
        # Bug fix: the output directory was previously assumed to exist.
        if not os.path.isdir(dirname):
            os.makedirs(dirname)
        for i, photo_id in enumerate(ids):  # renamed: `id` shadowed the builtin
            if filename is None:
                out_imgname = '{}.jpg'.format(photo_id)
            else:
                out_img_format = '{{}}-{{:0>{}}}.jpg'.format(self.digit)
                out_imgname = out_img_format.format(filename, i)
            out_imgpath = os.path.join(dirname, out_imgname)
            # The img URL redirects to the actual image location.
            query_url = '{}{}'.format(self.imgurl, photo_id)
            redirect_url = urllib2.urlopen(query_url).geturl()
            res = urllib2.urlopen(redirect_url)
            try:
                with open(out_imgpath, 'wb') as fo:
                    fo.write(res.read())
            finally:
                # Release the connection even if the write fails.
                res.close()
if __name__ == '__main__':
    # CLI entry point: search Photozou for a keyword and download the hits.
    parser = argparse.ArgumentParser(
        description='Download Photozou images matching a keyword.')
    parser.add_argument('keyword',
                        help='image search keyword')
    # Previously hardcoded values, now optional flags with the same defaults.
    parser.add_argument('--limit', type=int, default=20,
                        help='maximum number of images to fetch (default: 20)')
    parser.add_argument('--outdir', default='./output',
                        help='directory to save images into (default: ./output)')
    args = parser.parse_args()

    crawler = Crawler()
    # isthumb=True is passed for interface compatibility; search() ignores it.
    ids = crawler.search(args.keyword, True, args.limit)
    # Saved files are named '<keyword>-<index>.jpg'.
    crawler.fetch_img(ids, args.outdir, args.keyword)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment