Skip to content

Instantly share code, notes, and snippets.

@hkurokawa
Last active April 28, 2018 09:48
Show Gist options
  • Save hkurokawa/1487a76879e1f1b1cd75f36209f8ea0c to your computer and use it in GitHub Desktop.
Save hkurokawa/1487a76879e1f1b1cd75f36209f8ea0c to your computer and use it in GitHub Desktop.
Twitter crawler for Elasticsearch 6.x. This crawler simply crawls my Twitter timeline and indexes the tweets.
import sys
import time
from os import environ
import elasticsearch
import twitter
def sanitise_place(place):
    """Rewrite a degenerate polygon bounding box of *place* as a point.

    The ``place`` mapping is modified in place; nothing is returned.  When the
    first two vertices of the bounding box's first ring are identical, the box
    is turned into a ``Point`` whose coordinates are that single vertex.

    The function is a no-op when *place* is ``None``, has no usable
    ``bounding_box`` (missing, ``None``, or not of type ``Polygon``), or when
    the first ring holds fewer than two vertices.
    """
    if place is None or 'bounding_box' not in place:
        return
    bb = place['bounding_box']
    # The key may be present but hold no value; there is nothing to fix then.
    if not bb or bb.get('type') != 'Polygon':
        return
    rings = bb.get('coordinates')
    ring = rings[0] if rings else None
    # Comparing the first two vertices needs at least two of them.
    if not ring or len(ring) < 2:
        return
    # Sometimes the given coordinates all have exactly the same value.
    # In that case, change the shape from Polygon to Point to avoid an
    # Elasticsearch error.
    if ring[0] == ring[1]:
        bb['type'] = 'Point'
        bb['coordinates'] = ring[0]
def sanitise_geo(geo):
    """Reverse the coordinate order of a ``Point`` *geo* value in place.

    Nothing is returned.  The function does nothing when *geo* is ``None`` or
    its ``type`` is not ``Point``.
    """
    is_point = geo is not None and geo['type'] == 'Point'
    if is_point:
        # This attribute is deprecated and lists its coordinates as
        # [latitude, longitude], so the pair is flipped here.
        # See https://developer.twitter.com/en/docs/tweets/data-dictionary/overview/tweet-object
        flipped = list(geo['coordinates'])
        flipped.reverse()
        geo['coordinates'] = flipped
if __name__ == '__main__':
    # Crawl the authenticated user's home timeline page by page and index
    # every tweet into the "my_twitter_timeline" Elasticsearch index.  The id
    # of the newest tweet seen is stored in the file ".last_id" so that the
    # next run only asks for tweets newer than it.
    ck = environ.get('TWITTER_CONSUMER_KEY')
    cs = environ.get('TWITTER_CONSUMER_SECRET')
    tk = environ.get('TWITTER_ACCESS_TOKEN')
    ts = environ.get('TWITTER_ACCESS_TOKEN_SECRET')
    if not ck or not cs or not tk or not ts:
        print("Environmental variables are not set:",
              "TWITTER_CONSUMER_KEY, TWITTER_CONSUMER_SECRET, TWITTER_ACCESS_TOKEN, TWITTER_ACCESS_TOKEN_SECRET",
              file=sys.stderr)
        # sys.exit is always available; the bare exit() builtin is only
        # injected by the site module.
        sys.exit(1)

    auth = twitter.OAuth(consumer_key=ck, consumer_secret=cs, token=tk, token_secret=ts)
    api = twitter.Twitter(auth=auth)
    opt = {'count': 200}

    # Resume from the id recorded by the previous run, if there is one.
    last_id = None
    try:
        with open('.last_id', 'r') as f:
            last_id = int(f.read())
    except FileNotFoundError:
        # First run: no lower bound, crawl as far back as the API allows.
        pass
    except ValueError:
        # An empty or corrupt state file must not abort the crawl.
        print("Ignoring unreadable .last_id file; crawling without since_id",
              file=sys.stderr)
    else:
        print("since_id: ", last_id)
        opt['since_id'] = last_id

    first_id = None  # id of the first tweet returned in this run
    max_id = None    # upper bound (inclusive) for the next page request
    es = elasticsearch.Elasticsearch()
    while max_id is None or last_id is None or last_id < max_id:
        if max_id is not None:
            opt['max_id'] = max_id
        tweets = api.statuses.home_timeline(**opt)
        if not tweets:
            break
        for t in tweets:
            sanitise_place(t['place'])
            sanitise_geo(t['geo'])
            tid = t['id_str']
            if first_id is None:
                first_id = tid
            print('indexing ', tid, flush=True)
            es.index(index="my_twitter_timeline", doc_type='tweets', body=t, id=tid)
            # The next page must end just below the tweet handled last.
            max_id = int(tid) - 1
        # NOTE(review): the original indentation was lost; this pause is
        # assumed to apply once per timeline request, not once per tweet.
        time.sleep(1)

    # Remember where this run started so the next one can use it as since_id.
    if first_id is not None:
        with open('.last_id', 'w') as f:
            f.write(first_id)
{
"settings": {
"index": {
"analysis": {
"analyzer": {
"kuromoji_analyzer": {
"type": "custom",
"tokenizer": "kuromoji_tokenizer"
}
}
}
}
},
"mappings": {
"tweets": {
"properties": {
"coordinates": {
"type": "geo_shape"
},
"created_at": {
"format": "EEE MMM dd HH:mm:ss Z YYYY",
"type": "date"
},
"entities": {
"properties": {
"hashtags": {
"properties": {
"indices": {
"type": "long"
},
"text": {
"type": "keyword"
}
}
},
"urls": {
"properties": {
"display_url": {
"type": "keyword"
},
"expanded_url": {
"type": "keyword"
},
"indices": {
"type": "long"
},
"url": {
"type": "keyword"
}
}
}
}
},
"favorite_count": {
"type": "long"
},
"favorited": {
"type": "boolean"
},
"filter_level": {
"type": "keyword"
},
"geo": {
"type": "geo_shape"
},
"id": {
"type": "long"
},
"id_str": {
"type": "keyword"
},
"lang": {
"type": "keyword"
},
"place": {
"properties": {
"attributes": {
"type": "object"
},
"bounding_box": {
"type": "geo_shape",
"coerce": true
},
"country": {
"type": "keyword"
},
"country_code": {
"type": "keyword"
},
"full_name": {
"type": "keyword"
},
"id": {
"type": "keyword"
},
"name": {
"type": "keyword"
},
"place_type": {
"type": "keyword"
},
"url": {
"type": "keyword"
}
}
},
"possibly_sensitive": {
"type": "boolean"
},
"retweet_count": {
"type": "long"
},
"retweeted": {
"type": "boolean"
},
"source": {
"type": "keyword"
},
"text": {
"type": "text",
"analyzer": "kuromoji_analyzer"
},
"timestamp_ms": {
"type": "date"
},
"truncated": {
"type": "boolean"
},
"user": {
"properties": {
"contributors_enabled": {
"type": "boolean"
},
"created_at": {
"format": "EEE MMM dd HH:mm:ss Z YYYY",
"type": "date"
},
"default_profile": {
"type": "boolean"
},
"default_profile_image": {
"type": "boolean"
},
"description": {
"type": "text"
},
"favourites_count": {
"type": "long"
},
"followers_count": {
"type": "long"
},
"friends_count": {
"type": "long"
},
"geo_enabled": {
"type": "boolean"
},
"id": {
"type": "long"
},
"id_str": {
"type": "keyword"
},
"is_translator": {
"type": "boolean"
},
"lang": {
"type": "keyword"
},
"listed_count": {
"type": "long"
},
"location": {
"type": "keyword"
},
"name": {
"type": "keyword"
},
"profile_background_color": {
"type": "keyword"
},
"profile_background_image_url": {
"type": "keyword"
},
"profile_background_image_url_https": {
"type": "keyword"
},
"profile_background_tile": {
"type": "boolean"
},
"profile_banner_url": {
"type": "keyword"
},
"profile_image_url": {
"type": "keyword"
},
"profile_image_url_https": {
"type": "keyword"
},
"profile_link_color": {
"type": "keyword"
},
"profile_sidebar_border_color": {
"type": "keyword"
},
"profile_sidebar_fill_color": {
"type": "keyword"
},
"profile_text_color": {
"type": "keyword"
},
"profile_use_background_image": {
"type": "boolean"
},
"protected": {
"type": "boolean"
},
"screen_name": {
"type": "keyword"
},
"statuses_count": {
"type": "long"
},
"time_zone": {
"type": "keyword"
},
"url": {
"type": "keyword"
},
"utc_offset": {
"type": "long"
},
"verified": {
"type": "boolean"
}
}
}
}
}
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment