# -*- coding: utf-8 -*-
"""
custom_search_api.py
"""
from cgi import parse_header
from logging import getLogger, StreamHandler, Formatter, DEBUG
from mimetypes import guess_all_extensions
from hashlib import sha512
from pathlib import Path
import requests
# Module-wide logger; accepts every record from DEBUG upwards.
LOGGER = getLogger('custom_search_api')
LOGGER.setLevel(DEBUG)

# Single stream handler that prints only the bare message text.
HANDLER = StreamHandler()
HANDLER.setFormatter(Formatter('%(message)s'))
HANDLER.setLevel(DEBUG)
LOGGER.addHandler(HANDLER)
def fetch(url: str, params: dict = None, timeout: float = 30.0):
    """Send an HTTP GET request and return the successful response.

    Args:
        url: Address to request.
        params: Optional query-string parameters passed to ``requests.get``.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` waits indefinitely, so one stalled server
            would hang the whole script.

    Returns:
        The ``requests.Response`` object of the request.

    Raises:
        requests.HTTPError: If the response status is 4xx or 5xx.
        requests.RequestException: On connection failures or timeouts.
    """
    res = requests.get(url, params=params, timeout=timeout)
    # Turn 4xx/5xx responses into exceptions so callers never save error pages.
    res.raise_for_status()
    return res
# Path separators and characters that Windows rejects in file names; each one
# found in the search keyword is replaced with "_".
_UNSAFE_FILENAME_CHARS = str.maketrans({char: "_" for char in '\\/:*?"<>|'})


def create_file_name(params: dict, idx: int, url: str, ext: str):
    """Build the file name under which a downloaded image is saved.

    The name has the form ``<keyword>_<idx>_<sha512 of url><ext>``.

    Args:
        params: Search parameters; only ``params['q']`` (the keyword) is read.
        idx: Position of the image in the search results.
        url: Download URL of the image; its SHA-512 hex digest becomes part
            of the name.
        ext: File extension including the leading dot, e.g. ``".jpg"``.

    Returns:
        The file name (no directory part) as a string.

    Raises:
        KeyError: If ``params`` has no ``'q'`` entry.
    """
    # The keyword is free text: without this replacement a "/" or "\" in it
    # would be interpreted as a directory separator when the file is written.
    keyword = str(params['q']).translate(_UNSAFE_FILENAME_CHARS)
    url_digest = sha512(url.encode('utf-8')).hexdigest()
    # Alternative scheme (sequential-number file names): str(idx) + ext
    return "_".join([keyword, str(idx), url_digest]) + ext
def main():
    """Download Google Custom Search image results into an ``images`` folder.

    Requests up to 10 pages of 10 image results each from the Custom Search
    JSON API, downloads every result link and writes each response body into
    an ``images`` directory next to this script, using the name produced by
    ``create_file_name``.

    A failure on a single image is logged and the run continues with the next
    image. A failure of a search request is logged and ends the run. Paging
    also stops as soon as the API reports no further page.
    """
    image_dir = Path(__file__).parent / 'images'
    image_dir.mkdir(parents=True, exist_ok=True)
    auth = {
        "cx": "",  # search engine ID
        "key": "",  # API key
    }
    start_index = 1
    keyword = "xxx"
    params = {
        "q": keyword,  # search term
        "searchType": "image",
        "start": start_index,  # index of the first result to return
        "num": 10,  # results fetched per request (10 is the default)
    }
    params.update(auth)
    # Loop-invariant: every extension the mimetypes table knows for JPEG.
    jpeg_extensions = guess_all_extensions('image/jpeg')
    try:
        for _page in range(10):
            # @see https://developers.google.com/custom-search/json-api/v1/reference/cse/list?hl=ja
            res = fetch('https://www.googleapis.com/customsearch/v1', params)
            LOGGER.info('#' * 80)
            res_json = res.json()
            # .get(): a response without 'items' is an empty page, not an error.
            for idx, item in enumerate(res_json.get('items', []), start=start_index):
                try:
                    download_link = item['link']
                    LOGGER.info(f'url:{download_link}')
                    r = fetch(download_link)
                    # NOTE(review): parse_header comes from the cgi module,
                    # which was removed in Python 3.13 (PEP 594).
                    mime_type, _options = parse_header(r.headers.get('Content-Type', ''))
                    extensions = guess_all_extensions(mime_type)
                    if not extensions:
                        # Missing or unrecognised Content-Type: there is no
                        # extension to save under, so skip this image.
                        LOGGER.warning(f'skip:{download_link} (unknown content type {mime_type!r})')
                        continue
                    ext = extensions[0]
                    if ext in jpeg_extensions:
                        # The first JPEG extension returned can be ".jpe";
                        # normalise every JPEG variant to ".jpg".
                        ext = ".jpg"
                    file_name = create_file_name(params, idx, download_link, ext)
                    print(file_name)
                    out_file = Path(image_dir, file_name)
                    out_file.write_bytes(r.content)
                    LOGGER.info(f'save:{out_file}')
                except Exception as ex:
                    # Best effort: one broken image must not abort the run.
                    LOGGER.exception(ex)
            # Start index of the next page, or None when there is none.
            next_page = res_json.get('queries', {}).get('nextPage')
            next_start = next_page[0].get('startIndex') if next_page else None
            LOGGER.info(f'next:{next_start}')
            if next_start is None:
                # No further page: stop instead of failing on a missing key.
                break
            start_index = next_start
            params['start'] = start_index
        # Requesting results beyond the 100th yields status 400 Bad Request.
    except Exception as ex:
        LOGGER.exception(ex)


if __name__ == '__main__':
    main()
Last active
August 27, 2018 07:09
-
-
Save umyuu/724e08d8189825858fd8cbe309db4fd5 to your computer and use it in GitHub Desktop.
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment