@miraculixx, created March 2, 2015 15:31
Scrapy spider for Gmail API, using Django AllAuth as the token source
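Run sketch, assuming the snippets below are split into a standard Scrapy project (spiders/gmail.py, items.py, loader.py and oauth2spider.py are file names inferred from the imports) and django-allauth already holds an authorized Google token:

    scrapy crawl gmail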
# -*- coding: utf-8 -*-
# the gmail spider
import base64

from items import EmailItem, EmailLabelItem
from loader import JSONItemLoader
from oauth2spider import OAuth2Spider


class GmailSpider(OAuth2Spider):
    name = "gmail"
    provider = 'google'
    allowed_domains = ["googleapis.com"]
    api_url = 'https://www.googleapis.com/gmail/v1/users/me'
    credential_type = 'Bearer'
    refresh_url = 'https://www.googleapis.com/oauth2/v3/token'
    #max_items = 5
    start_verbs = [
        ('list', 'messages'),
        ('list', 'labels'),
    ]
    def parse_list_messages(self, response):
        # get the messages on this result page
        for message in response.obj['messages']:
            url = self.build_url_from_verb('get', 'messages')
            url = '{0}/{1}'.format(url, message['id'])
            credentials = self.credentials_from(response)
            request = self.build_request(url, credentials, 'get', 'messages')
            yield request
        # get additional pages, if any
        pageToken = response.obj.get('nextPageToken')
        if pageToken:
            credentials = self.credentials_from(response)
            url = self.build_url_from_verb('list', 'messages')
            url = '{0}?pageToken={1}'.format(url, pageToken)
            request = self.build_request(url, credentials, 'list', 'messages')
            yield request
    def parse_get_messages(self, response):
        # parse a single message
        l = JSONItemLoader(item=EmailItem(), response=response,
                           selector='payload')
        l.add_value('subject', 'headers[name=Subject].value')
        l.add_value('received', 'headers[name=Delivery-date].value')
        l.add_value('sent', 'headers[name=Date].value')
        l.add_value('from_email', 'headers[name=From].value')
        l.add_value('to_email', 'headers[name=To].value')
        l.add_value('mime', 'mimeType')
        l.add_value('text', 'parts[mimeType=text/plain].body.data')
        item = l.load_item()
        item['labels'] = '.'.join(response.obj.get('labelIds', []))
        item['is_read'] = 'UNREAD' not in item['labels']
        item['threadid'] = response.obj.get('threadId')
        # the body is URL-safe base64; map it back to standard base64
        # before decoding, see http://stackoverflow.com/a/24481560/890242
        text = item['text']
        text = text.replace('-', '+')
        text = text.replace('_', '/')
        item['text'] = base64.decodestring(text)
        # length of the decoded text, not of the base64 payload
        item['text_length'] = len(item['text'])
        yield item
    def parse_list_labels(self, response):
        # parse the list of labels
        for label in response.obj['labels']:
            l = JSONItemLoader(item=EmailLabelItem(), obj=label)
            l.add_value('itemid', 'id')
            l.add_value('name', 'name')
            l.add_value('messagesTotal', 'messagesTotal')
            l.add_value('messagesUnread', 'messagesUnread')
            item = l.load_item()
            yield item
# items.py
import scrapy


class EmailItem(scrapy.Item):
    user = scrapy.Field()
    threadid = scrapy.Field()
    from_email = scrapy.Field()
    to_email = scrapy.Field()
    subject = scrapy.Field()
    text = scrapy.Field()
    is_read = scrapy.Field()
    labels = scrapy.Field()
    received = scrapy.Field()
    sent = scrapy.Field()
    mime = scrapy.Field()
    text_length = scrapy.Field()


class EmailLabelItem(scrapy.Item):
    user = scrapy.Field()
    itemid = scrapy.Field()
    name = scrapy.Field()
    messagesTotal = scrapy.Field()
    messagesUnread = scrapy.Field()
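# For illustration only, a loaded EmailItem carries values along these
# lines (made-up sample values, not real Gmail output):
#
#   {'subject': 'hello', 'from_email': 'Alice <alice@example.com>',
#    'to_email': 'bob@example.com', 'mime': 'multipart/alternative',
#    'labels': 'INBOX.UNREAD', 'is_read': False,
#    'text': 'hi bob', 'text_length': 6, ...}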
# loader.py
import json
import re

from scrapy.contrib.loader import ItemLoader


class JSONItemLoader(ItemLoader):
    """
    item loader for JSON responses

    if a selector is given, it is the default selector applied to
    all subsequent add_value() calls.
    """
    def __init__(self, item=None, selector=None, response=None, obj=None,
                 **context):
        self.item = item
        self.selector = selector or ''
        self.response = response
        self.context = context
        context.update(selector=selector, response=response)
        # use the given obj, or deserialize the response body
        self.obj = obj or self.get_value(self.json_obj, selector)
        self._values = {}

    @property
    def json_obj(self):
        return json.loads(self.response.body)
    def get_value(self, obj, selector):
        """
        in dict obj, access subsequent keys given by selector.

        A selector is a dot-separated sequence of keys within the dict.
        Returns '' if a key is not found.

        obj = {'foo': {'bar': 'baz'}}
        get_value(obj, 'foo.bar')
        => 'baz'

        Selectors can specify filter criteria for dicts in sublists, e.g.

        obj = {'foo': [
            {
                'name': 'foobar',
                'value': 'foofoo'
            },
            ...
        ]}
        get_value(obj, 'foo[name=foobar].value')
        => 'foofoo'
        """
        selector = selector or ''
        value = obj
        for key in selector.split('.'):
            # if the key is empty, stop
            if not key:
                break
            # extract the filter criteria, if any, e.g. 'headers[name=Subject]'
            criteria = None
            if '[' in key and ']' in key:
                key, k, v = re.search(r'(.*)\[(.*)=(.*)\]', key).groups()
                criteria = k, v
            # see if we have a dict to get the value from
            if isinstance(value, dict):
                value = value.get(key, '')
            else:
                # nothing else to process, stop
                value = ''
                break
            # apply the filter to the list, keeping the first match
            if criteria and isinstance(value, list):
                k, v = criteria
                value = filter(lambda i: i.get(k) == v, value)
                if len(value):
                    value = value[0]
        return value
    def add_value(self, field, selector):
        """
        store the value given by selector for field
        """
        self._values.update({field: self.get_value(self.obj, selector)})

    def load_item(self):
        """
        return the item loaded with all values previously added by add_value()
        """
        for k, v in self._values.iteritems():
            self.item[k] = v
        return self.item
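# A standalone sketch of the selector mini-language implemented by
# get_value() above (hypothetical payload, not a real Gmail message):
#
#   payload = {'headers': [{'name': 'Subject', 'value': 'hello'}],
#              'mimeType': 'text/plain'}
#   loader = JSONItemLoader(item=EmailItem(), obj=payload)
#   loader.add_value('subject', 'headers[name=Subject].value')
#   loader.add_value('mime', 'mimeType')
#   item = loader.load_item()
#   # item['subject'] == 'hello', item['mime'] == 'text/plain'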
# oauth2spider.py
import json

import scrapy
from scrapy import log
from scrapy.http.request import Request

from scrapytest.scrapydjango import setup_django


class OAuth2Spider(scrapy.Spider):
    """
    An API spider for OAuth2 REST resources

    Works with allauth SocialTokens. The user needs to have authorized
    API access beforehand via django-allauth.

    Configuration:

    api_url = the start URI
    credential_type = first string in the Authorization header
    provider = provider name of the social app
    refresh_url = the refresh URL to refresh OAuth tokens

    Build API requests using

        request = self.build_request(url, credentials, api_method,
                                     api_type)

    Parse responses by implementing methods of the format

        parse_<api_method>_<api_type>

    e.g. parse_list_messages(response)
         parse_get_messages(response)

    Requests built this way carry the .api_method and .api_type
    attributes (available on response.request). You can build a new
    request from a response as follows:

        credentials = self.credentials_from(response)
        request = self.build_request(url, credentials, method, type)
    """
    api_url = ''
    refresh_url = ''
    provider = ''
    credential_type = 'Bearer'
    max_items = 0
    start_verbs = []
    trailing_slash = False
    def start_requests(self):
        """
        build the initial requests

        This builds the initial requests from .api_url and .start_verbs.
        start_verbs is a list of tuples ('verb', 'resource'), where verb
        is the semantic method of the API and resource is the name of the
        resource, e.g.

        start_verbs = [('list', 'messages'), ('list', 'labels')]
        => GET <api_url>/messages with api_method=list, api_type=messages
        => GET <api_url>/labels with api_method=list, api_type=labels

        Using start_verbs you can have multiple types of index resources
        queried.

        If start_verbs is empty, it defaults to [('list', '<resource>')]
        where <resource> is the last path element of api_url,
        e.g. api_url = 'some.domain.com/api/messages'
        => start_verbs = [('list', 'messages')]

        see http://doc.scrapy.org/en/latest/topics/spiders.html?highlight=spider#scrapy.spider.Spider.start_requests
        """
        for credential in self.get_credentials():
            for api_method, api_type in self.get_start_verbs():
                url = self.build_url_from_verb(api_method, api_type)
                request = self.build_request(url,
                                             api_method=api_method,
                                             api_type=api_type,
                                             credentials=credential)
                yield request

    def get_start_verbs(self):
        if not len(self.start_verbs):
            # default to listing the resource named by the last path
            # element of api_url, as documented in start_requests()
            path = self.api_url.rstrip('/').split('/')
            self.start_verbs = [('list', path[-1])]
        return self.start_verbs
    def build_url_from_verb(self, api_method, api_type, url=None):
        """
        from a given api_method and api_type build the actual
        url to call

        the url is of the form <api_url>/<api_type>

        :param api_method: the api semantic method (e.g. list, get)
        :param api_type: the api resource type
        :param url: the base url, defaults to .api_url
        """
        base_url = url or self.api_url
        # append the resource type unless the base url already ends in it
        # (as with the default start_verbs derived from api_url)
        if api_type and not base_url.rstrip('/').endswith('/' + api_type):
            url = '{0}/{1}'.format(base_url, api_type)
        else:
            url = base_url
        if self.trailing_slash and url[-1] != '/':
            url = '%s/' % url
        return url
    def get_credentials(self):
        """
        list all credentials with which to access the api_url

        returns an iterable of credentials. each credential is of the
        format '<credential_type> <token>'. The default implementation
        returns credentials for all SocialTokens that match self.provider
        """
        from allauth.socialaccount.models import SocialToken
        for token in SocialToken.objects.filter(app__provider=self.provider):
            yield '{0} {1}'.format(self.credential_type, token.token)
    def refresh_kwargs(self, token):
        """
        return the dict of values required to build a
        refresh token request

        :param token: the SocialToken instance
        """
        return {
            'client_id': token.app.client_id,
            'refresh_token': token.token_secret,
            'client_secret': token.app.secret,
        }
    def refresh_token(self, request):
        """
        refresh the access token used in a request

        called by error() in case of a 401 response. This assumes
        the token has expired, attempts to refresh it and re-schedules
        the request with the new token.
        """
        from requests_oauthlib import OAuth2Session
        # get the current token from the request headers
        token_value = self.credentials_from(request)
        token_value = token_value.replace(self.credential_type, '').strip()
        # get the SocialToken and app credentials from the django db
        setup_django()
        from allauth.socialaccount.models import SocialToken
        token = SocialToken.objects.get(token=token_value)
        app = token.app
        # create an oauth2 session and refresh the token
        # see http://requests-oauthlib.readthedocs.org/en/latest/oauth2_workflow.html#third-recommended-define-automatic-token-refresh-and-update
        # and http://requests-oauthlib.readthedocs.org/en/latest/api.html#requests_oauthlib.OAuth2Session.refresh_token
        refresh_kwargs = self.refresh_kwargs(token)
        client = OAuth2Session(app.client_id, token=token.token)
        resp = client.refresh_token(self.refresh_url, **refresh_kwargs)
        # we finally have a new token, save it
        token.token = resp['access_token']
        token.save()
        # rebuild the request with the new token and re-schedule it
        credentials = '{0} {1}'.format(self.credential_type, token.token)
        request = self.build_request(request.url,
                                     api_method=request.api_method,
                                     api_type=request.api_type,
                                     credentials=credentials)
        return [request]
    def error(self, failure):
        """
        process a failed request

        if the error is a 401, assume the token has expired and let
        refresh_token() re-schedule the request, otherwise log the error
        """
        try:
            response = failure.value.response
        except AttributeError:
            pass
        else:
            if response.status == 401:
                return self.refresh_token(failure.request)
        self.log(failure.value, level=log.ERROR)
    def build_request(self, url, credentials=None, api_method=None,
                      api_type=None):
        """
        build a request object

        :param url: the url to access
        :param credentials: the credentials (set in the Authorization header)
        :param api_method: semantic api method (e.g. list, get)
        :param api_type: semantic api type (e.g. messages, threads)
        """
        request = scrapy.Request(url,
                                 headers={'Authorization': credentials},
                                 errback=self.error,
                                 callback=self.parse)
        request.api_method = api_method
        request.api_type = api_type
        return request
    def credentials_from(self, request_or_response, header=False):
        """
        get the credentials from the Authorization header

        this ensures we can build subsequent requests from a previous
        request or response.

        :return: the Authorization header value (header=True => the full
            header dict)
        """
        rr = request_or_response
        request = rr if isinstance(rr, Request) else rr.request
        credentials = request.headers.get('Authorization')
        if header:
            credentials = {'Authorization': credentials}
        return credentials
    def parser_for(self, request):
        """
        return the method that can parse this request, of the format

            parse_<api_method>_<api_type>

        where api_method and api_type are the attributes set on the
        request by build_request()
        """
        name = 'parse_{0}_{1}'.format(request.api_method,
                                      request.api_type)
        return getattr(self, name)
    def parse(self, response):
        """
        parse an api response

        1. deserialize the JSON body into a python dict, set as response.obj
        2. find the parser for the response from its request's api_method
           and api_type, then dispatch to it, honoring max_items and
           trailing_slash
        """
        response.obj = json.loads(response.body)
        parser = self.parser_for(response.request)
        self.log('%s - %s' % (parser.__name__, response), level=log.DEBUG)
        for i, r in enumerate(parser(response)):
            self.log('> %s - %s' % (parser.__name__, r), level=log.DEBUG)
            if self.max_items == 0 or i < self.max_items:
                if self.trailing_slash and isinstance(r, Request):
                    if r.url[-1] != '/':
                        # Request.url is read-only, rebuild the request
                        r = self.build_request('%s/' % r.url,
                                               credentials=self.credentials_from(r),
                                               api_method=r.api_method,
                                               api_type=r.api_type)
                yield r
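# A minimal subclass sketch (hypothetical 'github' provider, for
# illustration only) showing the parse_<api_method>_<api_type>
# convention the base class dispatches on:
#
#   class RepoSpider(OAuth2Spider):
#       name = 'repos'
#       provider = 'github'
#       api_url = 'https://api.github.com/user'
#       start_verbs = [('list', 'repos')]
#
#       def parse_list_repos(self, response):
#           # response.obj is the deserialized JSON body set by
#           # OAuth2Spider.parse()
#           for repo in response.obj:
#               self.log(repo.get('name'))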
# add in your scrapy app's settings.py
from app import settings as dj_app_settings
from django.conf import settings as dj_conf

if not dj_conf.configured:
    # https://docs.djangoproject.com/en/dev/topics/settings/#using-settings-without-setting-django-settings-module
    dj_conf.configure(**dj_app_settings.__dict__)
    from django.apps import apps
    apps.populate(dj_conf.INSTALLED_APPS)
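# setup_django() (imported from scrapytest.scrapydjango above) presumably
# wraps the same configure-and-populate steps so the spider can bootstrap
# django lazily; a sketch under that assumption:
#
#   def setup_django():
#       from django.conf import settings as dj_conf
#       if not dj_conf.configured:
#           from app import settings as dj_app_settings
#           dj_conf.configure(**dj_app_settings.__dict__)
#           from django.apps import apps
#           apps.populate(dj_conf.INSTALLED_APPS)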