Created
April 6, 2017 09:32
-
-
Save hidesakai/27432c9372e959da4895cc5d9304b2f2 to your computer and use it in GitHub Desktop.
[Python] AWSのサーバーレスアーキテクチャを使って、イベントドリブンなWebクローラーを作ってみる ref: http://qiita.com/hidesakai/items/963bfebeed7230650dc2
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*- | |
import elasticsearch | |
from requests_aws4auth import AWS4Auth | |
import json | |
if __name__ == '__main__':
    # One-off setup script: creates the 'website' index on the Amazon
    # Elasticsearch Service domain and registers the mapping read from
    # mapping.json.

    # Endpoint of the Elasticsearch domain (masked placeholder).
    host = 'search-***************.ap-northeast-1.es.amazonaws.com'

    # Request signer for the 'es' service in ap-northeast-1. The first two
    # arguments are placeholders for the AWS user's access key ID and
    # secret access key.
    awsauth = AWS4Auth(
        'ACCESS_KRY_ID',
        'SECRET_ACCESS_KEY',
        'ap-northeast-1', 'es')

    es = elasticsearch.Elasticsearch(
        hosts=[{'host': host, 'port': 443}],
        http_auth=awsauth,
        use_ssl=True,
        verify_certs=True,
        connection_class=elasticsearch.connection.RequestsHttpConnection
    )

    # Use a context manager so the file handle is closed even if parsing
    # the JSON fails.
    with open('mapping.json', 'r') as f:
        mapping = json.load(f)

    # Create the index first, then attach the 'article' mapping to it.
    es.indices.create(index='website')
    es.indices.put_mapping(index='website', doc_type='article', body=mapping['mappings'])
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*- | |
import scrapy | |
from scrapy.conf import settings | |
import boto3 | |
import json | |
# Module-level Kinesis client shared by the spider below. Credentials are
# read from the Scrapy settings; the region is fixed to ap-northeast-1.
kinesis = boto3.client(
    service_name='kinesis',
    region_name='ap-northeast-1',
    aws_access_key_id=settings['AWS_ACCESS_KEY_ID'],
    aws_secret_access_key=settings['AWS_SECRET_ACCESS_KEY'],
)
class HotEntrySpider(scrapy.Spider):
    """Spider that sends the entry URLs found on the start page to Kinesis.

    Every entry link found on the Hatena Bookmark hot-entry page is put on
    the "scraping_url" Kinesis stream as one record.
    """

    name = "hotentry"
    allowed_domains = ["b.hatena.ne.jp"]
    start_urls = ['http://b.hatena.ne.jp/hotentry/general']

    def parse(self, response):
        """Extract each entry's link and put it on the Kinesis stream.

        Args:
            response: the Scrapy response for a page in ``start_urls``.

        Entries without an ``a.entry-link`` href are skipped. Nothing is
        yielded or returned; the only effect is the ``put_record`` calls.
        """
        for sel in response.css("li.hb-entry-unit-with-favorites"):
            url = sel.css("a.entry-link::attr('href')").extract_first()
            if url is None:
                continue
            # Reuse the URL extracted above instead of running the CSS
            # query a second time.
            kinesis.put_record(
                StreamName="scraping_url",
                Data=url,
                PartitionKey="scraper"
            )
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*- | |
import os | |
import base64 | |
from readability import Document | |
import html2text | |
import requests | |
import elasticsearch | |
from elasticsearch import helpers | |
from requests_aws4auth import AWS4Auth | |
def lambda_handler(event, context):
    """Fetch the URLs carried by Kinesis records and index new articles.

    For each record the base64-encoded URL is decoded and downloaded, the
    title and main content are extracted and converted to text, and the
    article is queued for indexing unless a search of the 'website' index
    for that URL already returns hits. All queued articles are written
    with a single bulk request at the end.

    Args:
        event: Lambda event; ``event['Records'][i]['kinesis']['data']`` is
            read as a base64-encoded URL.
        context: Lambda context object (unused).

    Raises:
        KeyError: if ES_HOST, ACCESS_ID or SECRET_KEY is missing from the
            environment, or the event lacks the expected keys.
        Elasticsearch client errors from the search and bulk calls are not
        caught and propagate to the caller.
    """
    host = os.environ['ES_HOST']
    # Sign requests to the Elasticsearch Service with the access key pair
    # supplied through environment variables.
    awsauth = AWS4Auth(
        os.environ['ACCESS_ID'],
        os.environ['SECRET_KEY'], 'ap-northeast-1', 'es')
    es = elasticsearch.Elasticsearch(
        hosts=[{'host': host, 'port': 443}],
        http_auth=awsauth,
        use_ssl=True,
        verify_certs=True,
        connection_class=elasticsearch.connection.RequestsHttpConnection
    )

    articles = []
    # Process every record delivered by the Kinesis stream.
    for record in event['Records']:
        # Decode to text so the URL can be serialized to JSON in the
        # Elasticsearch query and document (bytes are not serializable).
        url = base64.b64decode(record['kinesis']['data']).decode('utf-8')
        try:
            response = requests.get(url)
            if not response.ok:
                continue

            # Build the readability document once and reuse it for both
            # the title and the summary.
            document = Document(response.content)
            title_text = html2text.html2text(document.title())
            contents_text = html2text.html2text(document.summary())

            # Skip URLs that the index already returns hits for.
            res = es.search(index="website", body={"query": {"match": {"url": url}}})
            # Compare by value; `is 0` tests object identity, not equality.
            if res['hits']['total'] == 0:
                doc = {
                    'url': url,
                    'title': title_text,
                    'contents': contents_text
                }
                # NOTE(review): the index-setup script in this gist registers
                # its mapping under doc_type 'article', while documents here
                # use _type 'scraper' — confirm which one is intended.
                articles.append({'_index': 'website', '_type': 'scraper', '_source': doc})
        except requests.exceptions.RequestException as err:
            # requests.get() raises connection/timeout errors rather than
            # HTTPError, so catch the common base class; one unreachable
            # URL must not abort the whole batch.
            print("Request failed for {}: {}".format(url, err))

    # Bulk insert
    helpers.bulk(es, articles)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment