Convert your downloaded Twitter data to a SQLite database. Comments detail how to run it. #twitter
# tweetToSqlite.rb
# Drew Breunig, 2022
# Use however you want.
#
# To run this script, put the 'tweets.js' file from your downloaded Twitter
# archive in the same directory as this script. There's one non-standard gem
# requirement, `sequel`. Install it with `$ gem install sequel`, then run
# `$ ruby tweetToSqlite.rb`.
#
# TODO
# - Extract and save hashtag entities
# - Extract and save symbol entities
# - Extract and save geolocation
# - Extract and save media associated with tweets
require 'sequel'
require 'time'
require 'json'
#
# Database connection
#
DB = Sequel.sqlite('tweets.db')
#
# Database tables creation
#
DB.create_table?(:tweets) do
  String :id, primary_key: true
  TrueClass :retweeted, default: false
  Integer :favorite_count, default: 0
  String :in_reply_to_user_id
  String :in_reply_to_status_id
  TrueClass :truncated, default: false
  Integer :retweet_count, default: 0
  Time :created_at
  TrueClass :favorited, default: false
  String :full_text, text: true
  String :lang
end
DB.create_table?(:users) do
  String :id, primary_key: true
  String :name
  String :screen_name
end
DB.create_table?(:tweets_users) do
  String :tweet_id
  String :user_id
end
DB.create_table?(:urls) do
  String :url, primary_key: true
  String :expanded_url
  String :display_url
  String :host
end
DB.create_table?(:tweets_urls) do
  String :tweet_id
  String :url_id
end
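# Optional: once the data is loaded, indexes on the join tables make
# tweet-to-user and tweet-to-url lookups faster. A minimal sketch using
# Sequel's alter_table API; safe to skip for small archives.
#
#   DB.alter_table(:tweets_users) { add_index :tweet_id }
#   DB.alter_table(:tweets_urls)  { add_index :tweet_id }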
#
# Helper Methods
#
# Checks whether a string is valid JSON; returns a bool
def valid_json?(json)
  JSON.parse(json)
  true
rescue JSON::ParserError, TypeError
  false
end
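# e.g. valid_json?('{"a": 1}') #=> true
#      valid_json?('not json') #=> false
#      valid_json?(nil)        #=> false (the TypeError is rescued)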
#
# JS Parsing
#
# Walk through the file tweets.js and parse each tweet into a Ruby hash.
# The tweets are archived as JavaScript, not JSON, which trips up Ruby's
# parser. Thankfully the data itself is predictable: an array of objects,
# each representing a tweet. Object start lines match /^\s{2}{/ (a line
# opening with two spaces and a brace). Every time one appears, parse the
# accumulated `holder` and append it to `tweets`.
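# For reference, tweets.js looks roughly like this (fields abbreviated):
#
#   window.YTD.tweets.part0 = [
#     {
#       "tweet" : { "id" : "...", "full_text" : "...", ... }
#     },
#     ...
#   ]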
tweets = []
holder = ""
File.readlines('./tweets.js').each do |line|
  if line =~ /^\s{2}{/
    # Store the previous tweet, trimming the trailing comma between array entries
    holder = holder.strip.gsub(/,\z/, "")
    tweets << JSON.parse(holder) if valid_json?(holder)
    # Start accumulating a new tweet
    holder = line
  else
    holder = holder + line
  end
end
# The final tweet is followed by the array's closing `]` rather than another
# start line, so parse whatever remains in the holder after the loop.
holder = holder.strip.gsub(/\]\z/, "").strip
tweets << JSON.parse(holder) if valid_json?(holder)
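# Quick sanity check that the line-oriented parse recovered everything
puts "Parsed #{tweets.length} tweets from tweets.js"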
#
# Preparation
#
# Arrays to load up for mass insertion
tweets_to_insert = []
users_to_insert = []
urls_to_insert = []
tweets_users_to_insert = []
tweets_urls_to_insert = []
# Enumerate each tweet and load the necessary data into the arrays
tweets.each do |t|
  t = t["tweet"]
  # The archive stores counts as strings; cast so the Integer columns sort numerically
  tweets_to_insert << [
    t["id"], t["retweeted"], t["favorite_count"].to_i, t["in_reply_to_user_id"],
    t["in_reply_to_status_id"], t["truncated"], t["retweet_count"].to_i,
    Time.parse(t["created_at"]), t["favorited"], t["full_text"], t["lang"]
  ]
  t["entities"]["user_mentions"].each do |user|
    users_to_insert << [
      user['id'], user['name'], user['screen_name']
    ]
    tweets_users_to_insert << [
      t['id'], user['id']
    ]
  end
  t["entities"]["urls"].each do |url|
    # Pull the host out of the expanded URL; guard against URLs the regex can't parse
    host_match = url["expanded_url"].match(%r{^(.*:)//([A-Za-z0-9.-]+)(:[0-9]+)?(.*)$})
    urls_to_insert << [
      url["url"], url["expanded_url"], url["display_url"], host_match && host_match[2]
    ]
    tweets_urls_to_insert << [
      t['id'], url['url']
    ]
  end
end
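# A possible sketch for the hashtag TODO above: inside the `tweets.each` loop,
# next to the urls handling, something like the following would collect
# hashtags (assuming the archive's standard entity shape, where each hashtag
# entity carries a "text" field). A matching table and import would also be
# needed; `hashtags_to_insert` is a hypothetical array like the ones above.
#
#   t["entities"]["hashtags"].each do |tag|
#     hashtags_to_insert << [t["id"], tag["text"]]
#   end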
#
# DB Insertion
#
# Import Tweets
keys = [:id, :retweeted, :favorite_count, :in_reply_to_user_id, :in_reply_to_status_id, :truncated, :retweet_count, :created_at, :favorited, :full_text, :lang]
DB[:tweets].import(keys, tweets_to_insert)
# Dedupe users by id so the primary key doesn't collide on import
# (the same user can appear with different names or screen names over time)
users_to_insert = users_to_insert.uniq { |u| u[0] }
users_to_insert = users_to_insert.reject { |u| u[0] == '-1' } # Unmatched users get an id of -1
keys = [:id, :name, :screen_name]
DB[:users].import(keys, users_to_insert)
# Import join for users to tweets
keys = [:tweet_id, :user_id]
DB[:tweets_users].import(keys, tweets_users_to_insert)
# Import urls, deduped by their primary key
keys = [:url, :expanded_url, :display_url, :host]
DB[:urls].import(keys, urls_to_insert.uniq { |u| u[0] })
# Import join for urls to tweets
keys = [:tweet_id, :url_id]
DB[:tweets_urls].import(keys, tweets_urls_to_insert)
puts "Done!"
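# Once the import finishes, the data is ready to query with the sqlite3 CLI,
# e.g. to see your most-liked tweets:
#
#   $ sqlite3 tweets.db "SELECT favorite_count, full_text FROM tweets ORDER BY favorite_count DESC LIMIT 10;"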