A low-effort, simple crawler
# -*- coding: utf-8 -*-
import os
import re
from encodings.aliases import aliases

import nkf
import tornado.ioloop
from tornado import httpclient, gen

UA = ('Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) '
      'Chrome/31.0.1650.57 Safari/537.36')

httpclient.AsyncHTTPClient.configure(None, max_clients=1000)

# Every encoding name Python's codec machinery knows about
all_encodings = set(aliases.values()) | set(aliases.keys())
header_encoding_pattern = re.compile(r'charset=([\w\-]+)', re.I)
meta_encoding_pattern = re.compile(rb'<meta [^>]*charset="?([^">\s]+)', re.I)


class SimpleCrawler(object):

    def __init__(self, urls):
        self.urls = urls

    def extract_encoding_by_request(self, headers, body):
        # Prefer the charset in the Content-Type header; fall back to a
        # <meta charset=...> declaration in the HTML body
        encoding = None
        content_type = headers.get('Content-Type')
        if content_type:
            m = header_encoding_pattern.search(content_type)
            if m:
                encoding = m.group(1)
        if not encoding:
            m = meta_encoding_pattern.search(body)
            if m:
                # body is bytes, so decode the captured charset name
                encoding = m.group(1).decode('ascii', 'replace')
        return encoding

    def normalize_encoding(self, encoding):
        # Map the common Shift_JIS spellings onto Python's cp932 codec
        encoding = encoding.lower()
        if encoding in ('windows-31j', 'shift-jis', 'shift_jis', 'x-sjis', 'sjis'):
            return 'cp932'
        return encoding

    def decode(self, headers, body):
        encoding = self.extract_encoding_by_request(headers, body)
        # ISO-8859-1 is a frequent mislabel, so treat it as unknown and
        # let nkf guess the encoding from the raw bytes instead
        if not encoding or encoding.upper() == 'ISO-8859-1':
            encoding = nkf.guess(body)
            if encoding in ('BINARY', 'ISO-8859-1'):
                encoding = 'utf8'
        encoding = self.normalize_encoding(encoding)
        if encoding not in all_encodings:
            # Python has no codec for it; have nkf convert to UTF-8
            return nkf.nkf('-w', body).decode('utf8')
        return body.decode(encoding, 'replace')

    @gen.coroutine
    def store_data(self):
        client = httpclient.AsyncHTTPClient()
        os.makedirs('pages', exist_ok=True)
        for i, url in enumerate(self.urls):
            if i % 500 == 0:
                print(i, url)  # progress marker every 500 URLs
            try:
                request = httpclient.HTTPRequest(url, follow_redirects=True, user_agent=UA,
                                                 validate_cert=False, allow_ipv6=False)
                # Each fetch is yielded before the next one is issued, so the
                # crawl runs sequentially despite max_clients=1000
                response = yield client.fetch(request)
                url = url.replace("http://", "").replace("https://", "")
                filename = url.replace("/", " ").rstrip()
                filename = os.path.join("pages", filename)
                with open(filename, "w") as fd:
                    body = self.decode(response.headers, response.body)
                    fd.write(body)
            except Exception as e:
                print('%s %s %s' % (type(e), e, url))
                continue

    def run(self):
        tornado.ioloop.IOLoop.current().run_sync(self.store_data)


if __name__ == "__main__":
    import sys
    with open(sys.argv[1]) as fd:
        urls = fd.read().splitlines()
    crawler = SimpleCrawler(urls)
    crawler.run()
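
To run it, pass a text file with one URL per line, e.g. python crawler.py urls.txt (the script name crawler.py is an assumption; the gist does not fix a filename). The sketch below walks through the decode() fallback chain on a canned Shift_JIS response; the headers dict and the sample bytes are illustrative only:

# A minimal sketch of the decode() fallback chain, assuming the gist is
# saved as crawler.py (the filename is an assumption, not part of the gist)
from crawler import SimpleCrawler

crawler = SimpleCrawler([])
headers = {'Content-Type': 'text/html'}  # no charset in the header...
# ...so decode() falls back to the <meta charset> declaration in the body
body = b'<meta charset="Shift_JIS"><p>\x93\xfa\x96\x7b\x8c\xea</p>'
# 'Shift_JIS' is normalized to cp932, and the bytes decode to 日本語
print(crawler.decode(headers, body))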