Last active
April 28, 2018 09:48
-
-
Save hkurokawa/1487a76879e1f1b1cd75f36209f8ea0c to your computer and use it in GitHub Desktop.
Twitter crawler for Elasticsearch 6.x. This crawler simply crawls my Twitter timeline and indexes the tweets.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import sys | |
import time | |
from os import environ | |
import elasticsearch | |
import twitter | |
def sanitise_place(place):
    """Rewrite a degenerate Polygon bounding box of *place* as a Point, in place.

    Sometimes the vertices of the given bounding box have exactly the same
    values. Indexing such a polygon causes an Elasticsearch error, so the box
    is changed from a ``Polygon`` to a ``Point`` located at the first vertex.

    Args:
        place: The ``place`` dict of a tweet, or ``None``.

    Returns:
        None. ``place['bounding_box']`` is mutated when a rewrite is needed;
        anything that is not a well-formed Polygon box is left untouched.
    """
    if not place:
        return
    bb = place.get('bounding_box')
    # The key may be present but null, or lack a type; neither is a Polygon.
    if not bb or bb.get('type') != 'Polygon':
        return
    rings = bb.get('coordinates')
    # Need the outer ring with at least two vertices to compare.
    if not rings or len(rings[0]) < 2:
        return
    coords = rings[0]
    if coords[0] == coords[1]:
        bb['type'] = 'Point'
        bb['coordinates'] = coords[0]
def sanitise_geo(geo):
    """Swap the coordinate order of a Point *geo* object, in place.

    The ``geo`` attribute is deprecated and lists its coordinates as
    [latitude, longitude]; this reverses them.
    See https://developer.twitter.com/en/docs/tweets/data-dictionary/overview/tweet-object

    Args:
        geo: The ``geo`` dict of a tweet, or ``None``.

    Returns:
        None. ``geo['coordinates']`` is replaced when ``geo`` is a Point.
    """
    if geo is not None and geo['type'] == 'Point':
        lat_lon = geo['coordinates']
        geo['coordinates'] = list(reversed(lat_lon))
if __name__ == '__main__':
    # Crawl the authenticated user's home timeline and index every tweet into
    # the "my_twitter_timeline" Elasticsearch index. The id of the first tweet
    # seen is stored in ".last_id" and used as since_id on the next run.
    ck = environ.get('TWITTER_CONSUMER_KEY')
    cs = environ.get('TWITTER_CONSUMER_SECRET')
    tk = environ.get('TWITTER_ACCESS_TOKEN')
    ts = environ.get('TWITTER_ACCESS_TOKEN_SECRET')
    if not ck or not cs or not tk or not ts:
        print("Environmental variables are not set:",
              "TWITTER_CONSUMER_KEY, TWITTER_CONSUMER_SECRET, TWITTER_ACCESS_TOKEN, TWITTER_ACCESS_TOKEN_SECRET",
              file=sys.stderr)
        # sys.exit rather than the site-provided exit(), which is not
        # guaranteed to exist in every interpreter configuration.
        sys.exit(1)

    auth = twitter.OAuth(consumer_key=ck, consumer_secret=cs, token=tk, token_secret=ts)
    api = twitter.Twitter(auth=auth)

    opt = {'count': 200}
    last_id = None
    try:
        # The context manager closes the file even when int() raises.
        with open('.last_id', 'r') as id_file:
            last_id = int(id_file.read())
    except FileNotFoundError:
        # First run: no lower bound, fetch whatever the timeline offers.
        pass
    else:
        print("since_id: ", last_id)
        opt['since_id'] = last_id

    first_id = None
    max_id = None
    es = elasticsearch.Elasticsearch()
    # Page backwards through the timeline: each request is capped at the id
    # just below the last tweet already indexed, until the previous run's
    # last_id is reached or a page comes back empty.
    while max_id is None or last_id is None or last_id < max_id:
        if max_id is not None:
            opt['max_id'] = max_id
        tweets = api.statuses.home_timeline(**opt)
        if len(tweets) == 0:
            break
        for t in tweets:
            sanitise_place(t['place'])
            sanitise_geo(t['geo'])
            tid = t['id_str']
            if first_id is None:
                # Remember the first tweet of the whole crawl (presumably the
                # newest one -- confirm the timeline ordering).
                first_id = tid
            print('indexing ', tid, flush=True)
            # The tweet id doubles as the document id, so re-indexing the same
            # tweet overwrites it rather than duplicating it.
            es.index(index="my_twitter_timeline", doc_type='tweets', body=t, id=tid)
            max_id = int(tid) - 1
        # Pause between timeline requests.
        time.sleep(1)

    if first_id is not None:
        with open('.last_id', 'w') as id_file:
            id_file.write(first_id)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"settings": { | |
"index": { | |
"analysis": { | |
"analyzer": { | |
"kuromoji_analyzer": { | |
"type": "custom", | |
"tokenizer": "kuromoji_tokenizer" | |
} | |
} | |
} | |
} | |
}, | |
"mappings": { | |
"tweets": { | |
"properties": { | |
"coordinates": { | |
"type": "geo_shape" | |
}, | |
"created_at": { | |
"format": "EEE MMM dd HH:mm:ss Z YYYY", | |
"type": "date" | |
}, | |
"entities": { | |
"properties": { | |
"hashtags": { | |
"properties": { | |
"indices": { | |
"type": "long" | |
}, | |
"text": { | |
"type": "keyword" | |
} | |
} | |
}, | |
"urls": { | |
"properties": { | |
"display_url": { | |
"type": "keyword" | |
}, | |
"expanded_url": { | |
"type": "keyword" | |
}, | |
"indices": { | |
"type": "long" | |
}, | |
"url": { | |
"type": "keyword" | |
} | |
} | |
} | |
} | |
}, | |
"favorite_count": { | |
"type": "long" | |
}, | |
"favorited": { | |
"type": "boolean" | |
}, | |
"filter_level": { | |
"type": "keyword" | |
}, | |
"geo": { | |
"type": "geo_shape" | |
}, | |
"id": { | |
"type": "long" | |
}, | |
"id_str": { | |
"type": "keyword" | |
}, | |
"lang": { | |
"type": "keyword" | |
}, | |
"place": { | |
"properties": { | |
"attributes": { | |
"type": "object" | |
}, | |
"bounding_box": { | |
"type": "geo_shape", | |
"coerce": true | |
}, | |
"country": { | |
"type": "keyword" | |
}, | |
"country_code": { | |
"type": "keyword" | |
}, | |
"full_name": { | |
"type": "keyword" | |
}, | |
"id": { | |
"type": "keyword" | |
}, | |
"name": { | |
"type": "keyword" | |
}, | |
"place_type": { | |
"type": "keyword" | |
}, | |
"url": { | |
"type": "keyword" | |
} | |
} | |
}, | |
"possibly_sensitive": { | |
"type": "boolean" | |
}, | |
"retweet_count": { | |
"type": "long" | |
}, | |
"retweeted": { | |
"type": "boolean" | |
}, | |
"source": { | |
"type": "keyword" | |
}, | |
"text": { | |
"type": "text", | |
"analyzer": "kuromoji_analyzer" | |
}, | |
"timestamp_ms": { | |
"type": "date" | |
}, | |
"truncated": { | |
"type": "boolean" | |
}, | |
"user": { | |
"properties": { | |
"contributors_enabled": { | |
"type": "boolean" | |
}, | |
"created_at": { | |
"format": "EEE MMM dd HH:mm:ss Z YYYY", | |
"type": "date" | |
}, | |
"default_profile": { | |
"type": "boolean" | |
}, | |
"default_profile_image": { | |
"type": "boolean" | |
}, | |
"description": { | |
"type": "text" | |
}, | |
"favourites_count": { | |
"type": "long" | |
}, | |
"followers_count": { | |
"type": "long" | |
}, | |
"friends_count": { | |
"type": "long" | |
}, | |
"geo_enabled": { | |
"type": "boolean" | |
}, | |
"id": { | |
"type": "long" | |
}, | |
"id_str": { | |
"type": "keyword" | |
}, | |
"is_translator": { | |
"type": "boolean" | |
}, | |
"lang": { | |
"type": "keyword" | |
}, | |
"listed_count": { | |
"type": "long" | |
}, | |
"location": { | |
"type": "keyword" | |
}, | |
"name": { | |
"type": "keyword" | |
}, | |
"profile_background_color": { | |
"type": "keyword" | |
}, | |
"profile_background_image_url": { | |
"type": "keyword" | |
}, | |
"profile_background_image_url_https": { | |
"type": "keyword" | |
}, | |
"profile_background_tile": { | |
"type": "boolean" | |
}, | |
"profile_banner_url": { | |
"type": "keyword" | |
}, | |
"profile_image_url": { | |
"type": "keyword" | |
}, | |
"profile_image_url_https": { | |
"type": "keyword" | |
}, | |
"profile_link_color": { | |
"type": "keyword" | |
}, | |
"profile_sidebar_border_color": { | |
"type": "keyword" | |
}, | |
"profile_sidebar_fill_color": { | |
"type": "keyword" | |
}, | |
"profile_text_color": { | |
"type": "keyword" | |
}, | |
"profile_use_background_image": { | |
"type": "boolean" | |
}, | |
"protected": { | |
"type": "boolean" | |
}, | |
"screen_name": { | |
"type": "keyword" | |
}, | |
"statuses_count": { | |
"type": "long" | |
}, | |
"time_zone": { | |
"type": "keyword" | |
}, | |
"url": { | |
"type": "keyword" | |
}, | |
"utc_offset": { | |
"type": "long" | |
}, | |
"verified": { | |
"type": "boolean" | |
} | |
} | |
} | |
} | |
} | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment