Last active
November 12, 2022 21:04
-
-
Save dbreunig/d4f94756c1c607e0fb3b8d53be04990c to your computer and use it in GitHub Desktop.
Convert your downloaded Twitter data to a sqlite database. Comments detail how to run it. #twitter
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# tweetToSqlite.rb
# Drew Breunig, 2022
# Use however you want.
#
# To run this script, put the 'tweets.js' file in the same directory
# as this script. There's one non-standard gem requirement, `sequel`.
# Install it with `$ gem install sequel`. Then run `ruby tweetToSqlite.rb`.
#
# TODO
# - Extract and save hashtag entities
# - Extract and save symbol entities
# - Extract and save geolocation
# - Extract and save media associated with tweets
# Dependencies. `sequel` is the only non-standard gem (`gem install sequel`);
# the rest are stdlib.
require 'sequel'
require 'time'
require 'json'
require 'csv'

#
# Database connection
#
# Opens (creating if necessary) the SQLite file `tweets.db` in the
# current working directory.
DB = Sequel.sqlite('tweets.db')
#
# Database tables creation
#
# `create_table?` only creates the table if it does not already exist,
# so the script is safe to re-run against an existing tweets.db.
#
# Ids are declared String to match how they appear in the archive JSON.
DB.create_table?(:tweets) do
  String :id, primary_key: true
  TrueClass :retweeted, default: false
  Integer :favorite_count, default: 0
  String :in_reply_to_user_id
  String :in_reply_to_status_id
  TrueClass :truncated, default: false
  Integer :retweet_count, default: 0
  Time :created_at
  TrueClass :favorited, default: false
  String :full_text, text: true
  String :lang
end

# Users mentioned in tweets.
DB.create_table?(:users) do
  String :id, primary_key: true
  String :name
  String :screen_name
end

# Join table: which users are mentioned in which tweets.
DB.create_table?(:tweets_users) do
  String :tweet_id
  String :user_id
end

# Urls extracted from tweet entities.
DB.create_table?(:urls) do
  String :url, primary_key: true
  String :expanded_url
  String :display_url
  String :host
end

# Join table: which urls appear in which tweets.
DB.create_table?(:tweets_urls) do
  String :tweet_id
  String :url_id
end
#
# Helper Methods
#
# Returns true if `json` parses as JSON, false otherwise. Nil input is
# handled too: JSON.parse(nil) raises TypeError, which we also rescue.
def valid_json?(json)
  JSON.parse(json)
  true
rescue JSON::ParserError, TypeError
  false
end
#
# JS Parsing
#
# Walk through tweets.js and parse each tweet into a Ruby hash.
# The archive is JavaScript, not JSON, which trips up Ruby's parser.
# Thankfully the data itself is pretty predictable: an array of objects,
# one per tweet, each starting on a line matching /^\s{2}{/. Every time
# we hit such a start line, we parse the accumulated `holder` (the
# previous tweet) and push it onto `tweets`.
tweets = []
holder = ""
File.readlines('./tweets.js').each do |line|
  if line =~ /^\s{2}{/
    # A new tweet begins: flush the previous one. The trailing comma that
    # separates array elements must be stripped for JSON.parse to accept it.
    holder = holder.strip.sub(/,\z/, "")
    tweets << JSON.parse(holder) if valid_json?(holder)
    # Start accumulating the new tweet.
    holder = line
  else
    holder = holder + line
  end
end
# Flush the final tweet, which is never followed by another start line and
# would otherwise be silently lost. It carries the array's closing "]"
# (and possibly a trailing comma), so strip those before parsing.
holder = holder.strip.sub(/\]\z/, "").strip.sub(/,\z/, "")
tweets << JSON.parse(holder) if valid_json?(holder)
#
# Preparation
#
# Plain arrays of rows, built up here for mass insertion below.
tweets_to_insert = []
users_to_insert = []
urls_to_insert = []
tweets_users_to_insert = []
tweets_urls_to_insert = []

# Extracts the host portion of a url string ("scheme://host..."). Returns
# nil when the url is missing or doesn't match, instead of raising — the
# archive is not guaranteed to have an expanded_url on every entity.
def host_for(url)
  return nil unless url
  match = url.match(%r{\A(.*:)//([A-Za-z0-9.-]+)(:[0-9]+)?(.*)\z})
  match && match[2]
end

# Enumerate each tweet and load the necessary data into the arrays.
tweets.each do |t|
  t = t["tweet"]
  tweets_to_insert << [
    t["id"], t["retweeted"], t["favorite_count"], t["in_reply_to_user_id"],
    t["in_reply_to_status_id"], t["truncated"], t["retweet_count"],
    Time.parse(t["created_at"]), t["favorited"], t["full_text"], t["lang"]
  ]
  # Mentioned users, plus the tweet<->user join rows. `dig` guards against
  # tweets whose entities lack a user_mentions array.
  (t.dig("entities", "user_mentions") || []).each do |user|
    users_to_insert << [user['id'], user['name'], user['screen_name']]
    tweets_users_to_insert << [t['id'], user['id']]
  end
  # Urls, plus the tweet<->url join rows.
  (t.dig("entities", "urls") || []).each do |url|
    urls_to_insert << [
      url["url"], url["expanded_url"], url["display_url"], host_for(url["expanded_url"])
    ]
    tweets_urls_to_insert << [t['id'], url['url']]
  end
end
#
# DB Insertion
#
# Import tweets.
keys = [:id, :retweeted, :favorite_count, :in_reply_to_user_id, :in_reply_to_status_id, :truncated, :retweet_count, :created_at, :favorited, :full_text, :lang]
DB[:tweets].import(keys, tweets_to_insert)

# Users: dedupe by id (the primary key), not by the whole row — the same
# account can be mentioned under different names/screen_names over time,
# and inserting both rows would violate the primary key constraint.
users_to_insert = users_to_insert.uniq { |u| u[0] }
users_to_insert = users_to_insert.reject { |u| u[0] == '-1' } # Unmatched users get an id of -1
keys = [:id, :name, :screen_name]
DB[:users].import(keys, users_to_insert)

# Import join for users to tweets.
keys = [:tweet_id, :user_id]
DB[:tweets_users].import(keys, tweets_users_to_insert)

# Urls: likewise dedupe by the url itself, the table's primary key.
keys = [:url, :expanded_url, :display_url, :host]
DB[:urls].import(keys, urls_to_insert.uniq { |u| u[0] })

# Import join for urls to tweets.
keys = [:tweet_id, :url_id]
DB[:tweets_urls].import(keys, tweets_urls_to_insert)

puts "Done!"
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment