Scrapy spider for Gmail API, using Django AllAuth as the token source
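The setup assumes a Django project in which users have already connected their Google account through django-allauth, so that allauth's SocialToken rows hold valid OAuth2 access and refresh tokens; the spider reads its credentials from there. With the files below in place it runs like any other Scrapy spider:

    scrapy crawl gmail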
The Gmail spider:
# -*- coding: utf-8 -*-
import base64

from items import EmailItem, EmailLabelItem
from loader import JSONItemLoader
from oauth2spider import OAuth2Spider


class GmailSpider(OAuth2Spider):
    name = "gmail"
    provider = 'google'
    allowed_domains = ["googleapis.com"]
    api_url = 'https://www.googleapis.com/gmail/v1/users/me'
    credential_type = 'Bearer'
    refresh_url = 'https://www.googleapis.com/oauth2/v3/token'
    #max_items = 5
    start_verbs = [
        ('list', 'messages'),
        ('list', 'labels'),
    ]

    def parse_list_messages(self, response):
        # get messages from list
        for message in response.obj['messages']:
            url = self.build_url_from_verb('get', 'messages')
            url = '{0}/{1}'.format(url, message['id'])
            credentials = self.credentials_from(response)
            request = self.build_request(url, credentials, 'get', 'messages')
            yield request
        # get additional pages
        pageToken = response.obj.get('nextPageToken')
        if pageToken:
            credentials = self.credentials_from(response)
            url = self.build_url_from_verb('list', 'messages')
            url = '{0}?pageToken={1}'.format(url, pageToken)
            request = self.build_request(url, credentials, 'list', 'messages')
            yield request
    def parse_get_messages(self, response):
        # parse a single message
        l = JSONItemLoader(item=EmailItem(), response=response,
                           selector='payload')
        l.add_value('subject', 'headers[name=Subject].value')
        l.add_value('received', 'headers[name=Delivery-date].value')
        l.add_value('sent', 'headers[name=Date].value')
        l.add_value('from_email', 'headers[name=From].value')
        l.add_value('to_email', 'headers[name=To].value')
        l.add_value('mime', 'mimeType')
        l.add_value('text', 'parts[mimeType=text/plain].body.data')
        item = l.load_item()
        item['labels'] = '.'.join(response.obj.get('labelIds', []))
        item['is_read'] = 'UNREAD' not in item['labels']
        item['threadid'] = response.obj.get('threadId')
        # decode the base64url-encoded body,
        # see http://stackoverflow.com/a/24481560/890242
        text = item['text']
        text = text.replace('-', '+')
        text = text.replace('_', '/')
        text = base64.decodestring(text)
        item['text'] = text
        # record the length of the decoded text, not of the base64 input
        item['text_length'] = len(text)
        yield item
    def parse_list_labels(self, response):
        # parse list of labels
        for label in response.obj['labels']:
            l = JSONItemLoader(item=EmailLabelItem(), obj=label)
            l.add_value('itemid', 'id')
            l.add_value('name', 'name')
            l.add_value('messagesTotal', 'messagesTotal')
            l.add_value('messagesUnread', 'messagesUnread')
            item = l.load_item()
            yield item
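For reference, parse_get_messages() walks the JSON that the Gmail API returns for users.messages.get. A trimmed sketch of that shape, with illustrative values, shows what the selectors above resolve against:

# trimmed users.messages.get response (values are illustrative)
message = {
    'id': '14bd53a1c0f0b000',
    'threadId': '14bd53a1c0f0b000',
    'labelIds': ['INBOX', 'UNREAD'],
    'payload': {
        'mimeType': 'multipart/alternative',
        'headers': [
            {'name': 'From', 'value': 'alice@example.com'},
            {'name': 'To', 'value': 'bob@example.com'},
            {'name': 'Subject', 'value': 'hello'},
            {'name': 'Date', 'value': 'Mon, 2 Mar 2015 15:31:00 +0000'},
        ],
        'parts': [
            {'mimeType': 'text/plain',
             'body': {'data': 'aGVsbG8gd29ybGQ='}},  # base64url of 'hello world'
        ],
    },
}

Against the 'payload' selector, 'headers[name=Subject].value' resolves to 'hello' and 'parts[mimeType=text/plain].body.data' yields the base64url-encoded body text.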
items.py, as imported by the spider:
import scrapy


class EmailItem(scrapy.Item):
    user = scrapy.Field()
    threadid = scrapy.Field()
    from_email = scrapy.Field()
    to_email = scrapy.Field()
    subject = scrapy.Field()
    text = scrapy.Field()
    is_read = scrapy.Field()
    labels = scrapy.Field()
    received = scrapy.Field()
    sent = scrapy.Field()
    mime = scrapy.Field()
    text_length = scrapy.Field()


class EmailLabelItem(scrapy.Item):
    user = scrapy.Field()
    itemid = scrapy.Field()
    name = scrapy.Field()
    messagesTotal = scrapy.Field()
    messagesUnread = scrapy.Field()
loader.py, the JSON item loader:
import json
import re

from scrapy.contrib.loader import ItemLoader


class JSONItemLoader(ItemLoader):
    """
    item loader for json

    if a selector is given, this is the default selector applied to
    all subsequent add_value() calls.
    """
    def __init__(self, item=None, selector=None, response=None, obj=None,
                 **context):
        self.item = item
        self.selector = selector or ''
        self.response = response
        self.context = context
        context.update(selector=selector, response=response)
        self.obj = obj or self.get_value(self.json_obj, selector)
        self._values = {}

    @property
    def json_obj(self):
        return json.loads(self.response.body)
    def get_value(self, obj, selector):
        """
        in dict obj access subsequent keys given by selector.

        A selector is a sequence of keys within the dict. Returns '' if
        a key is not found

        obj = {'foo': {'bar': 'baz'}}
        get_value(obj, 'foo.bar')
        => 'baz'

        Selectors can specify filter criteria for dicts in sublists, e.g.

        obj = {'foo': [
            {
                'name': 'foobar',
                'value': 'foofoo'
            },
            ...
        ]}
        get_value(obj, 'foo[name=foobar].value')
        => 'foofoo'
        """
        selector = selector or ''
        value = obj
        for key in selector.split('.'):
            # if key is empty, stop
            if not key:
                break
            # extract filter, if any
            criteria = None
            if '[' in key and ']' in key:
                key, k, v = re.search(r'(.*)\[(.*)=(.*)\]', key).groups()
                criteria = k, v
            # see if we have a dict to get value from
            if isinstance(value, dict):
                value = value.get(key, '')
            else:
                # nothing else to process, stop
                value = ''
                break
            # apply filter to list
            if criteria and isinstance(value, list):
                k, v = criteria
                value = filter(lambda i: i.get(k) == v, value)
                if len(value):
                    value = value[0]
        return value
    def add_value(self, field, selector):
        """
        get the value given by selector
        """
        self._values.update({field: self.get_value(self.obj, selector)})

    def load_item(self):
        """
        return the item loaded with all values previously added by add_value()
        """
        for k, v in self._values.iteritems():
            self.item[k] = v
        return self.item
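A quick sketch of the selector syntax in isolation, using hypothetical data (passing obj= skips the response parsing):

from items import EmailItem
from loader import JSONItemLoader

obj = {'foo': [{'name': 'foobar', 'value': 'foofoo'},
               {'name': 'other', 'value': 'x'}]}
l = JSONItemLoader(item=EmailItem(), obj=obj)
print l.get_value(obj, 'foo[name=foobar].value')   # => 'foofoo'
print l.get_value(obj, 'foo[name=missing].value')  # => '' (no match)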
oauth2spider.py, the OAuth2 base spider:
import json

import scrapy
from scrapy import log
from scrapy.http.request import Request

from scrapytest.scrapydjango import setup_django


class OAuth2Spider(scrapy.Spider):
    """
    An API spider for OAuth2 REST resources

    works with allauth.SocialTokens. The user needs to have authorized
    API access via django-allauth beforehand.

    Configuration:

    api_url = the start URI
    credential_type = first string in Authorization header
    provider = provider name in social app
    refresh_url = the refresh url to refresh OAuth tokens

    Build API requests using

    request = self.build_request(url, credentials, api_method,
                                 api_type)

    Parse responses by implementing methods of format

    parse_<api_method>_<api_type>

    e.g. parse_list_messages(response)
         parse_get_messages(response)

    response objects provide the .api_method and .api_type attributes.
    You can build a new request from a response as follows:

    credentials = self.credentials_from(response)
    request = self.build_request(url, credentials, api_method, api_type)
    """
    api_url = ''
    refresh_url = ''
    provider = ''
    credential_type = 'Bearer'
    max_items = 0
    start_verbs = []
    trailing_slash = False

    def start_requests(self):
        """
        build initial requests

        this builds initial requests from .api_url and start_verbs. start_verbs
        is a list of tuples ('verb', 'resource'), where verb is the semantic
        method for the api and resource is the name of the resource

        e.g. start_verbs = [('list', 'messages'), ('list', 'labels')]
        => GET <api_url>/messages with api_method=list, api_type=messages
        => GET <api_url>/labels with api_method=list, api_type=labels

        Using start_verbs you can have multiple types of index resources
        queried.

        If start_verbs is empty, it defaults to [('list', '<resource>')] where
        <resource> is the last path segment in api_url.

        e.g. api_url = 'some.domain.com/api/messages'
        => start_verbs = [('list', 'messages')]

        see http://doc.scrapy.org/en/latest/topics/spiders.html?highlight=spider#scrapy.spider.Spider.start_requests
        """
        for credential in self.get_credentials():
            for api_method, api_type in self.get_start_verbs():
                url = self.build_url_from_verb(api_method, api_type)
                request = self.build_request(url,
                                             api_method=api_method,
                                             api_type=api_type,
                                             credentials=credential)
                yield request

    def get_start_verbs(self):
        if not len(self.start_verbs):
            self.start_verbs = [('list', '')]
        return self.start_verbs
    def build_url_from_verb(self, api_method, api_type, url=None):
        """
        from a given api_method and api_type build the actual
        url to call.

        the url is of the form <api_url>/<api_type>

        :param api_method: the api semantic method (e.g. list, get etc.)
        :param api_type: the api resource type
        :param url: the base url. defaults to .api_url
        """
        base_url = url or self.api_url
        if api_type:
            url = '{0}/{1}'.format(base_url, api_type)
        else:
            url = base_url
            # derive the resource name from the url when no api_type is given
            path = base_url.split('/')
            api_type = path[-1] or path[-2]
        if self.trailing_slash and url[-1] != '/':
            url = '%s/' % url
        return url

    def get_credentials(self):
        """
        list all credentials for which to access the api_url

        returns an iterable of credentials. each credential is
        of format <credential_type> <token>. The default implementation
        returns credentials for all SocialTokens that match self.provider
        """
        from allauth.socialaccount.models import SocialToken
        for token in SocialToken.objects.filter(app__provider=self.provider):
            credentials = '{0} {token}'.format(self.credential_type,
                                               **token.__dict__)
            yield credentials

    def refresh_kwargs(self, token):
        """
        return the dict of values required to build a
        refresh token request

        :param token: the SocialToken instance
        """
        return {
            'client_id': token.app.client_id,
            'refresh_token': token.token_secret,
            'client_secret': token.app.secret,
        }
    def refresh_token(self, request):
        """
        refresh the access token in a request

        called by error() in case of a 401 response. This assumes
        the token has expired and attempts to refresh it
        """
        # get current token from request
        from requests_oauthlib import OAuth2Session
        token_value = self.credentials_from(request)
        token_value = token_value.replace(self.credential_type, '').strip()
        # get SocialToken and app credentials from django db
        setup_django()
        from allauth.socialaccount.models import SocialToken
        token = SocialToken.objects.get(token=token_value)
        app = token.app
        # create an oauth2session with auto refresh url
        # see http://requests-oauthlib.readthedocs.org/en/latest/oauth2_workflow.html#third-recommended-define-automatic-token-refresh-and-update
        # and http://requests-oauthlib.readthedocs.org/en/latest/api.html#requests_oauthlib.OAuth2Session.refresh_token
        refresh_kwargs = self.refresh_kwargs(token)
        client = OAuth2Session(app.client_id, token=token.token)
        resp = client.refresh_token(self.refresh_url, **refresh_kwargs)
        # we finally have a new token, save it
        token.token = resp['access_token']
        token.save()
        # rebuild the request with the refreshed token and schedule it
        # (credentials_from(request) would return the expired header)
        credentials = '{0} {1}'.format(self.credential_type, token.token)
        request = self.build_request(request.url,
                                     api_method=request.api_method,
                                     api_type=request.api_type,
                                     credentials=credentials)
        return [request]
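    # Note: client.refresh_token() above performs a standard OAuth2 refresh
    # grant, i.e. a POST to refresh_url carrying grant_type=refresh_token
    # plus the refresh_kwargs (client_id, client_secret, refresh_token);
    # it returns the token response dict whose 'access_token' replaces the
    # stored SocialToken value.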
    def error(self, failure):
        """
        process error

        if the error is a 401 response, assume we need a new token and
        return the requests rebuilt by refresh_token() so they get
        rescheduled. otherwise print the error to the log
        """
        try:
            response = failure.value.response
        except AttributeError:
            pass
        else:
            if response.status == 401:
                return self.refresh_token(failure.request)
        self.log(failure.value, level=log.ERROR)
    def build_request(self, url, credentials=None, api_method=None,
                      api_type=None):
        """
        build a request object

        :param url: the url to access
        :param credentials: the credentials (to be set in Authorization header)
        :param api_method: semantic api method (e.g. list, get)
        :param api_type: semantic api type (e.g. messages, threads)
        """
        request = scrapy.Request(url,
                                 headers={'Authorization': credentials},
                                 errback=self.error,
                                 callback=self.parse)
        request.api_method = api_method
        request.api_type = api_type
        return request

    def credentials_from(self, request_or_response, header=False):
        """
        get the credentials from the authorization headers.

        this ensures we can build subsequent Requests from a previous
        request or response.

        :return: Authorization header value (header=True => the full header)
        """
        rr = request_or_response
        request = rr if isinstance(rr, Request) else rr.request
        credentials = request.headers.get('Authorization')
        if header:
            credentials = {'Authorization': credentials}
        return credentials
    def parser_for(self, request):
        """
        return the method that can parse this request. defaults
        to

        parse_<api_method>_<api_type>

        where api_method and api_type are the attributes set on the
        request by build_request()
        """
        url_type = 'parse_{0}_{1}'.format(request.api_method,
                                          request.api_type)
        return getattr(self, url_type)
    def parse(self, response):
        """
        parse api response.

        1. from the body create an obj by deserializing it into a python dict
        2. find the parser for the response from the request's api_method
           and api_type, then yield whatever it produces (up to max_items)
        """
        response.obj = json.loads(response.body)
        parser = self.parser_for(response.request)
        self.log('%s - %s' % (parser.__name__, response), level=log.DEBUG)
        for i, r in enumerate(parser(response)):
            self.log('> %s - %s' % (parser.__name__, r), level=log.DEBUG)
            if self.max_items == 0 or i < self.max_items:
                if self.trailing_slash and isinstance(r, Request):
                    if r.url[-1] != '/':
                        # Request.url is read-only, rebuild via replace()
                        r = r.replace(url='%s/' % r.url)
                yield r
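As a usage sketch, a subclass only needs the configuration attributes plus parse_<api_method>_<api_type>() callbacks. Everything below (the endpoint, the notes resource, NoteItem) is hypothetical, for illustration only:

import scrapy

from oauth2spider import OAuth2Spider


class NoteItem(scrapy.Item):
    itemid = scrapy.Field()
    text = scrapy.Field()


class NotesSpider(OAuth2Spider):
    name = 'notes'
    provider = 'google'
    api_url = 'https://api.example.com/v1/users/me'
    refresh_url = 'https://api.example.com/oauth2/token'
    start_verbs = [('list', 'notes')]

    def parse_list_notes(self, response):
        # one detail request per listed note, as in parse_list_messages
        for note in response.obj.get('notes', []):
            url = self.build_url_from_verb('get', 'notes')
            url = '{0}/{1}'.format(url, note['id'])
            credentials = self.credentials_from(response)
            yield self.build_request(url, credentials, 'get', 'notes')

    def parse_get_notes(self, response):
        item = NoteItem()
        item['itemid'] = response.obj.get('id')
        item['text'] = response.obj.get('text', '')
        yield item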
The Django setup, to be added to the scrapy app's settings:
# add in your scrapy app's settings.py
from app import settings as dj_app_settings
from django.conf import settings as dj_conf

if not dj_conf.configured:
    # https://docs.djangoproject.com/en/dev/topics/settings/#using-settings-without-setting-django-settings-module
    dj_conf.configure(**dj_app_settings.__dict__)
    from django.apps import apps
    apps.populate(dj_conf.INSTALLED_APPS)
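oauth2spider.py imports setup_django from scrapytest.scrapydjango, which this gist does not show. A minimal sketch, assuming it simply wraps the snippet above in a function (the module path and the app settings import are assumptions):

# scrapytest/scrapydjango.py (sketch)
def setup_django():
    from app import settings as dj_app_settings
    from django.conf import settings as dj_conf
    if not dj_conf.configured:
        dj_conf.configure(**dj_app_settings.__dict__)
        from django.apps import apps
        apps.populate(dj_conf.INSTALLED_APPS)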