Last active
October 25, 2023 21:13
-
-
Save pjaspers/e946e3dfd8915ed916c658d6c221800b to your computer and use it in GitHub Desktop.
If you have your twitter archive laying around, this will put it in a sqlite. Save it somewhere and run it with `ruby twitter-archive.rb` and it should just work (if you did a gem install sqlite3 first)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
gem "sqlite3", "~> 1.6.7" | |
require "sqlite3" | |
require "json" | |
require "time" | |
archive_directory = "~/Documents/Archives/twitter-2023-07-01-826d9daac5cf75f140476a88ff1e97b0f4ba6b8bba8b47c62f8b8c1e65f7aee0" | |
output_db_path = "~/Sync/twitter.db" | |
begin | |
lines = File.readlines(File.expand_path(File.join(archive_directory, "data/tweets.js")), chomp: true); nil | |
rescue Errno::ENOENT | |
abort <<~STR | |
Nope, nope, nope | |
Couldn't find a data/tweets.js in | |
#{File.expand_path(archive_directory)}" | |
STR | |
end | |
lines[0] = lines[0].split(" = ").last # remove twitters weird window thing and make it json | |
parsed = JSON.parse(lines.join("\n")); nil | |
# json.first["tweet"].keys has the columns | |
#=> ["edit_info", "retweeted", "source", "entities", "display_text_range", "favorite_count", "in_reply_to_status_id_str", "id_str", "in_reply_to_user_id", "truncated", "retweet_count", "id", "in_reply_to_status_id", "created_at", "favorited", "full_text", "lang", "in_reply_to_screen_name", "in_reply_to_user_id_str"] | |
db = SQLite3::Database.new(File.expand_path(output_db_path)) | |
puts "Setting up DB at #{output_db_path}..." | |
db.execute_batch(<<~SQL) | |
CREATE TABLE if not exists tweets ( | |
id varchar(500) primary key, | |
in_reply_to_status_id varchar(100), | |
in_reply_to_user_id varchar(100), | |
truncated boolean, | |
retweet_count integer, | |
created_at datetime, | |
favorited boolean, | |
text text, | |
lang varchar(20), | |
in_reply_to_screen_name varchar(200), | |
source text | |
); | |
SQL | |
db.execute_batch(<<~SQL) | |
CREATE VIRTUAL TABLE if not exists "tweets_fts" USING FTS5 ( | |
id, | |
text, | |
created_at, | |
content="tweets" | |
); | |
SQL | |
insert = db.prepare(<<~SQL) | |
insert or ignore into tweets(id, in_reply_to_status_id, in_reply_to_user_id, truncated, retweet_count, created_at, favorited, text, lang, in_reply_to_screen_name, source) | |
values ( :id, :in_reply_to_status_id, :in_reply_to_user_id, :truncated, :retweet_count, :created_at, :favorited, :text, :lang, :in_reply_to_screen_name, :source) | |
SQL | |
puts "Moving JSON to sqlite with high tech computering" | |
db.transaction do | |
parsed.each do |tweet| | |
tweet = tweet["tweet"] | |
insert.execute( | |
id: tweet["id_str"], | |
in_reply_to_status_id: tweet["in_reply_to_status_id"], | |
in_reply_to_user_id: tweet["in_reply_to_user_id"], | |
truncated: tweet["truncated"] ? 1 : 0, | |
retweet_count: tweet["retweet_count"], | |
created_at: Time.parse(tweet["created_at"]).iso8601, | |
favorited: tweet["favorited"] ? 1 : 0, | |
text: tweet["full_text"], | |
lang: tweet["lang"], | |
in_reply_to_screen_name: tweet["in_reply_to_screen_name"], | |
source: tweet["source"] | |
) | |
end | |
end | |
insert = db.prepare(<<~SQL) | |
insert into tweets_fts(rowid, id, text, created_at) | |
select tweets.rowid, tweets.id, tweets.text, tweets.created_at from tweets; | |
SQL | |
insert.execute | |
puts "\n\nSome things to try\n\n" | |
[ | |
["# Tweets per year", %Q[select strftime("%Y", created_at), count(*) as count from tweets group by strftime("%Y", created_at) order by count desc]], | |
["Favorite reply to's", %Q[select in_reply_to_user_id, in_reply_to_screen_name, count(*) as cnt from tweets where in_reply_to_user_id <> "" group by in_reply_to_user_id order by cnt desc limit 10]], | |
["Most tweets in a single day", %Q[select count(*) as count, strftime("%Y%m%d", created_at) from tweets group by strftime("%j%Y", created_at) order by count desc limit 10]], | |
].each do |name, query| | |
puts name | |
puts | |
puts " sqlite3 #{output_db_path} '#{query}'" | |
puts | |
db.execute(query).each do |(thing, count)| | |
puts " #{thing} => #{count}" | |
end | |
end | |
puts <<~STR | |
Search for tweets with: | |
sqlite3 -column #{output_db_path} "select text, created_at, id from tweets where text like '%<term here>%'"; | |
Or full text search with: | |
sqlite3 -column #{output_db_path} "select text, created_at, id from tweets_fts where text match '<term here>'"; | |
STR |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment