A utility for web scraping.
# coding:utf-8
"""
Performs web scraping.
"""
import logging
import socket
import urllib.error
import urllib.request

from bs4 import BeautifulSoup

__author__ = "t.ebinuma"
__version__ = "1.1"
__date__ = "14 July 2018"


def scrape(url):
    """
    Fetch the page at `url` and return it as a parsed BeautifulSoup object.
    Retries a fixed number of times, then raises an exception.
    """
    # Define the User-Agent to send with the request
    ua = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) ' \
         'AppleWebKit/537.36 (KHTML, like Gecko) ' \
         'Chrome/55.0.2883.95 Safari/537.36'
    # Maximum number of attempts before giving up
    retry_max_count = 3
    logging.debug("url=%s", url)
    for _ in range(retry_max_count):
        try:
            # Attach the User-Agent header to the request
            req = urllib.request.Request(url, headers={'User-Agent': ua})
            # Read the response (30-second timeout)
            html = urllib.request.urlopen(req, timeout=30).read()
            # Parse the HTML with the lxml parser
            return BeautifulSoup(html, 'lxml')
        except socket.timeout as e:
            # Log the timeout and retry
            logging.debug("socket.timeout:%s", e)
        except urllib.error.URLError as e:
            # Log the URL error and retry
            logging.debug("urllib.error.URLError:%s", e)
    # Every attempt failed
    raise Exception("Exceeded the maximum number of retries")
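As a quick illustration, here is a minimal usage sketch. The URL is hypothetical and not part of the original gist; it assumes the target page is reachable and that lxml is installed alongside BeautifulSoup.

# Usage sketch (hypothetical URL; replace with a real target)
if __name__ == '__main__':
    logging.basicConfig(level=logging.DEBUG)
    soup = scrape('https://example.com/')
    # BeautifulSoup exposes the parsed tree for querying, e.g. the page title:
    print(soup.title.string if soup.title else 'no <title> found')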