Last active
October 31, 2016 11:28
-
-
Save jlecour/5eae6b24cfe62a08826f422ba9e5fb68 to your computer and use it in GitHub Desktop.
An unoptimized Ruby script that imports tweets from a Twitter archive into an Elasticsearch index. The mapping can be PUT to the index beforehand to optimize indexing.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{
  "mappings": {
    "tweet": {
      "properties": {
        "created_at": {
          "format": "YYYY-MM-dd HH:mm:ss Z",
          "type": "date"
        },
        "entities": {
          "properties": {
            "hashtags": {
              "properties": {
                "indices": {
                  "type": "long"
                },
                "text": {
                  "type": "string"
                }
              }
            },
            "urls": {
              "properties": {
                "display_url": {
                  "type": "string"
                },
                "expanded_url": {
                  "index": "not_analyzed",
                  "type": "string"
                },
                "indices": {
                  "type": "long"
                },
                "url": {
                  "type": "string"
                }
              }
            },
            "media": {
              "properties": {
                "display_url": {
                  "type": "string"
                },
                "expanded_url": {
                  "index": "not_analyzed",
                  "type": "string"
                },
                "indices": {
                  "type": "long"
                },
                "url": {
                  "type": "string"
                },
                "media_url": {
                  "type": "string"
                },
                "media_url_https": {
                  "type": "string"
                },
                "sizes": {
                  "properties": {
                    "h": {
                      "type": "long"
                    },
                    "resize": {
                      "type": "string"
                    },
                    "w": {
                      "type": "long"
                    }
                  }
                }
              }
            }
          }
        },
        "geo": {
          "type": "geo_point"
        },
        "id": {
          "type": "long"
        },
        "id_str": {
          "type": "string"
        },
        "source": {
          "index": "not_analyzed",
          "type": "string"
        },
        "text": {
          "type": "string"
        },
        "user": {
          "properties": {
            "id": {
              "type": "long"
            },
            "id_str": {
              "type": "string"
            },
            "name": {
              "index": "not_analyzed",
              "type": "string"
            },
            "profile_image_url_https": {
              "type": "string"
            },
            "protected": {
              "type": "boolean"
            },
            "screen_name": {
              "type": "string"
            },
            "verified": {
              "type": "boolean"
            }
          }
        }
      }
    }
  }
}
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env ruby
# Imports the tweets found under ./data/js/tweets/ into the "twitter"
# Elasticsearch index, one document of type "tweet" per tweet, using the
# tweet's "id" as the document id. Tweets are indexed one request at a time.

require "pathname"
require "elasticsearch"
require "json"

# Rewrites tweet["geo"] in place and returns the tweet.
#
# When the tweet carries coordinates, "geo" becomes the coordinate array in
# reversed order; otherwise (missing, null or empty "geo", or no coordinates)
# the key is removed so no empty geo value is sent to the index.
#
# NOTE(review): the reversal presumably converts the archive's [lat, lon]
# pair into the [lon, lat] order Elasticsearch expects for geo_point arrays —
# confirm against the archive format and the index mapping.
def normalize_geo(tweet)
  coordinates = tweet.dig("geo", "coordinates")

  if coordinates.nil? || coordinates.empty?
    tweet.delete("geo")
  else
    tweet["geo"] = coordinates.reverse
  end

  tweet
end

# Parses the content of one archive file into an array of tweet hashes.
#
# Everything before the first "[" is dropped so the remainder can be parsed
# as a JSON array (the files presumably start with a JavaScript assignment —
# verify against a real archive). Raises JSON::ParserError when what is left
# is not valid JSON.
def parse_tweets(file_content)
  JSON.parse(file_content.sub(/\A[^\[]+/, ""))
end

client = Elasticsearch::Client.new
data_dir = Pathname.new("./data/js/tweets/")

# Only regular ".js" files are read, so sub-directories and stray files
# (e.g. editor or OS metadata) cannot abort the import; sorting makes the
# processing order reproducible across runs.
tweet_files = data_dir.children.select { |path| path.file? && path.extname == ".js" }.sort

tweet_files.each do |file|
  puts file

  parse_tweets(file.read).each do |tweet|
    client.index index: "twitter", type: "tweet", id: tweet["id"], body: normalize_geo(tweet)
  end
end

puts "Done."
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment