Skip to content

Instantly share code, notes, and snippets.

@umyuu
Last active August 27, 2018 07:09
Show Gist options
  • Save umyuu/724e08d8189825858fd8cbe309db4fd5 to your computer and use it in GitHub Desktop.
Save umyuu/724e08d8189825858fd8cbe309db4fd5 to your computer and use it in GitHub Desktop.
# -*- coding: utf-8 -*-
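"""
    custom_search_api.py

    Search for images with the Google Custom Search JSON API and save each
    result into an ``images`` directory next to this script.
"""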
from cgi import parse_header
from logging import getLogger, StreamHandler, Formatter, DEBUG
from mimetypes import guess_all_extensions
from hashlib import sha512
from pathlib import Path
import requests

# Stream handler that prints only the bare message text, at DEBUG and above.
HANDLER = StreamHandler()
HANDLER.setFormatter(Formatter('%(message)s'))
HANDLER.setLevel(DEBUG)

# Module-wide logger; everything from DEBUG upwards goes through HANDLER.
LOGGER = getLogger('custom_search_api')
LOGGER.addHandler(HANDLER)
LOGGER.setLevel(DEBUG)


def fetch(url: str, params: dict = None, *, timeout: float = 30.0):
    """Send a GET request and return the response, raising on HTTP errors.

    Args:
        url: Address to request.
        params: Optional query-string parameters; ``None`` sends none.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, requests waits indefinitely, so a single stalled server
            would hang the whole run.

    Returns:
        The ``requests.Response`` of a request that did not end in a
        4xx/5xx status.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.Timeout: If the server does not respond within ``timeout``.
    """
    res = requests.get(url, params=params, timeout=timeout)
    res.raise_for_status()
    return res


# Path separators, characters reserved in Windows file names, and ASCII
# control characters; each one is replaced by "_" in the keyword part.
_UNSAFE_FILENAME_CHARS = str.maketrans(
    {ch: '_' for ch in '\\/:*?"<>|' + ''.join(map(chr, range(32)))}
)


def create_file_name(params: dict, idx: int, url: str, ext: str):
    """Build the file name under which a downloaded image is saved.

    The name has the form ``<keyword>_<idx>_<sha512 of url><ext>``.

    Args:
        params: Search parameters; only ``params['q']`` (the keyword) is read.
        idx: Index of the search result.
        url: URL the image was downloaded from; its SHA-512 hex digest
            becomes part of the name.
        ext: File extension including the leading dot, e.g. ``".jpg"``.

    Returns:
        The file name (no directory part). Characters in the keyword that
        would act as path separators or are not allowed in file names are
        replaced with ``_`` so the result is always a single path component.

    Raises:
        KeyError: If ``params`` has no ``'q'`` entry.
    """
    keyword = str(params['q']).translate(_UNSAFE_FILENAME_CHARS)
    url_digest = sha512(url.encode('utf-8')).hexdigest()
    # Alternative: `str(idx) + ext` gives purely sequential file names.
    return "_".join([keyword, str(idx), url_digest]) + ext


def _guess_extension(content_type: str) -> str:
    """Return a file extension for a Content-Type header value.

    Returns an empty string when no extension is known for the MIME type.
    """
    mime_type, _options = parse_header(content_type)
    candidates = guess_all_extensions(mime_type)
    if not candidates:
        return ''
    ext = candidates[0]
    if ext in guess_all_extensions('image/jpeg'):
        # The first JPEG candidate can be ".jpe"; normalize it to ".jpg".
        ext = ".jpg"
    return ext


def _download_image(url: str, params: dict, idx: int, image_dir: Path) -> None:
    """Download one image and save it into ``image_dir``.

    The image is skipped (with a warning) when its Content-Type does not map
    to a known file extension. Errors raised by the download or the write
    propagate to the caller.
    """
    LOGGER.info(f'url:{url}')
    response = fetch(url)
    content_type = response.headers.get('Content-Type', '')
    ext = _guess_extension(content_type)
    if not ext:
        LOGGER.warning(f'skip:{url} (unknown Content-Type: {content_type!r})')
        return
    out_file = image_dir / create_file_name(params, idx, url, ext)
    out_file.write_bytes(response.content)
    LOGGER.info(f'save:{out_file}')


def main():
    """Search images for a keyword and save every hit into ``images/``.

    Requests up to 10 result pages of 10 images each from the Google Custom
    Search JSON API and downloads every linked image into an ``images``
    directory next to this script. A failure on a single image is logged and
    does not stop the run; a failure of the search request itself is logged
    and ends the run.
    """
    image_dir = Path(__file__).parent / 'images'
    image_dir.mkdir(parents=True, exist_ok=True)
    auth = {
        "cx": "",  # search engine ID
        "key": "",  # API key
    }
    start_index = 1
    keyword = "xxx"
    params = {
        "q": keyword,  # search keyword
        "searchType": "image",
        "start": start_index,  # index of the first result to return
        "num": 10,  # results per request (10 is also the API default)
    }
    params.update(auth)
    try:
        for _ in range(10):
            # @see https://developers.google.com/custom-search/json-api/v1/reference/cse/list?hl=ja
            res = fetch('https://www.googleapis.com/customsearch/v1', params)
            LOGGER.info('#' * 80)
            res_json = res.json()
            # "items" is absent when the query has no (more) results.
            items = res_json.get('items')
            if not items:
                break
            for idx, item in enumerate(items, start=start_index):
                try:
                    _download_image(item['link'], params, idx, image_dir)
                except Exception as ex:
                    # Best effort: one broken image must not stop the others.
                    LOGGER.exception(ex)
            # Start index of the next page; absent on the last page.
            next_page = res_json.get('queries', {}).get('nextPage')
            if not next_page or next_page[0].get('startIndex') is None:
                break
            start_index = next_page[0].get('startIndex')
            LOGGER.info(f'next:{start_index}')
            params['start'] = start_index
            # Going beyond 100 results yields status code 400 Bad Request.
    except Exception as ex:
        LOGGER.exception(ex)


if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment