Created
July 15, 2021 15:59
-
-
Save robmiller/0b523deac3bf83663074ff6fc6f4e3d1 to your computer and use it in GitHub Desktop.
Script for deleting tweets from a Twitter data export
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env ruby

gem "twitter", "~> 7.0"
gem "http", "~> 4.4"

require "json"
require "pathname"
require "time" # provides Time.parse, used when loading the archive
require "yaml"
require "yaml/store"

require "http"
require "twitter"
# things you must configure
TWITTER_USER = ""
MAX_AGE_IN_DAYS = 365 # anything older than this is deleted

# Download your Twitter archive and point this to the
# tweet.js file contained in the archive
DATA_FILE = "path/to/tweet.js"

# archive old tweets?
ARCHIVE = false

# directory to store archived tweets in
ARCHIVE_DIR = Pathname("path/to/old_tweets")

# get these from dev.twitter.com
CONSUMER_KEY = ""
CONSUMER_SECRET = ""
OAUTH_TOKEN = ""
OAUTH_TOKEN_SECRET = ""

# any special favorites? (integer tweet IDs; they are compared against
# the integer IDs parsed from the archive)
IDS_TO_SAVE_FOREVER = [].freeze

# any special keywords? (matched as case-sensitive substrings)
KEYWORDS_TO_SAVE_FOREVER = [].freeze

# don't delete tweets with this many likes or more
LIKE_THRESHOLD = 5

# don't delete tweets with this many RTs or more
RT_THRESHOLD = 5

### you shouldn't have to change anything below this line ###

MAX_AGE_IN_SECONDS = MAX_AGE_IN_DAYS * 24 * 60 * 60

# Despite the name, this is a Time object rather than a number; subtracting
# a tweet's creation Time from it yields the tweet's age in seconds.
NOW_IN_SECONDS = Time.now

TWEETS_PER_REQUEST = 200
### METHODS ###

# There's enough data in the archive for us to construct this data type,
# which quacks like a Twitter API-returned Tweet and allows us to decide
# whether or not to delete a Tweet without hitting the API.
#
# Members, in positional order: id, text, created_at, favorite_count,
# retweet_count.
OfflineTweet = Struct.new(
  :id,
  :text,
  :created_at,
  :favorite_count,
  :retweet_count
)
# Persistent record of tweet IDs known to be gone from Twitter, kept in
# "deleted_tweets.yaml" (path is relative to the working directory).
# Tweets found here are skipped on later runs.
$already_deleted = YAML::Store.new("deleted_tweets.yaml")
# Deletes +tweet+ from Twitter using +client+ and records the result.
#
# tweet  - object responding to #id.
# client - object responding to #destroy_status(id).
#
# On success, or when Twitter reports the tweet as not found, the tweet ID
# is written to $already_deleted so later runs skip it.
#
# Raises Twitter::Error::TooManyRequests unchanged, so the caller can wait
# for the rate limit to reset and retry. Any other StandardError is printed
# and the process exits with a non-zero status.
def delete_from_twitter(tweet, client)
  client.destroy_status(tweet.id)
rescue Twitter::Error::NotFound
  $stderr.puts "Tweet already deleted"
  $already_deleted.transaction { $already_deleted[tweet.id] = 1 }
rescue Twitter::Error::TooManyRequests
  # Must be listed before StandardError: otherwise the generic handler
  # below swallows the rate-limit error and terminates the script.
  raise
rescue StandardError => e
  puts e.inspect
  puts "Error deleting #{tweet.id}; exiting"
  exit 1
else
  # Remember successful deletions too, so a re-run does not spend an API
  # call per tweet just to learn that it is already gone.
  $already_deleted.transaction { $already_deleted[tweet.id] = 1 }
  puts "Deleted #{tweet.id}"
end
# Follows HTTP redirects starting at +url+ and returns the last URL reached,
# always as a String.
#
# At most +max_hops+ requests are made. Following stops early when a
# response has status 200 or carries no Location header. If a request
# raises, the error message is printed to stderr and the last URL reached
# so far is returned.
def resolve_redirects(url, max_hops = 5)
  current = url.to_s
  max_hops.times do
    response = HTTP.timeout(connect: 5, read: 10).get(current)
    location = response["Location"]
    break if response.code == 200 || location.nil?

    current = location.to_s
  end
  current
rescue StandardError => e
  $stderr.puts e.message
  current
end

# Writes a YAML snapshot of +tweet+ to "<tweet id>.txt" inside ARCHIVE_DIR.
#
# The snapshot has three keys: "tweet" (the text), "date" (tweet.created_at)
# and "urls" (each t.co link found in the text, resolved through its
# redirects and stored as a plain string). ARCHIVE_DIR is created if it does
# not exist yet.
#
# Returns the number of bytes written.
def archive_tweet(tweet)
  text = tweet.text

  # The dot is escaped so that only real t.co links match. Matches consist
  # of a fixed prefix plus word characters, so they need no further
  # validation before being requested.
  short_links = text.scan(%r{https?://t\.co/\w+})
  urls = short_links.map { |link| resolve_redirects(link) }

  payload = YAML.dump({ "tweet" => text, "date" => tweet.created_at, "urls" => urls })

  ARCHIVE_DIR.mkpath
  File.write(ARCHIVE_DIR + "#{tweet.id}.txt", payload)
end
# Reports whether the text of +tweet+ contains any of the strings listed in
# KEYWORDS_TO_SAVE_FOREVER. Matching is a case-sensitive substring check.
#
# Returns true or false.
def keyword_in(tweet)
  KEYWORDS_TO_SAVE_FOREVER.any? { |keyword| tweet.text.include?(keyword) }
end
### WE BEGIN ###

# Reads the tweet.js file at +path+ and returns an Array of OfflineTweet.
#
# The export is a JavaScript file, not pure JSON: it typically starts with
# an assignment such as "window.YTD.tweet.part0 = " before the array.
# Everything before the first "[" is dropped so the rest parses as JSON; a
# file that already starts with "[" is left as it is.
def load_archive_tweets(path)
  json = File.read(path).sub(/\A[^\[]*/, "")
  JSON.parse(json).map do |entry|
    data = entry["tweet"]
    OfflineTweet.new(
      data["id_str"].to_i,
      data["full_text"],
      Time.parse(data["created_at"]),
      data["favorite_count"].to_i,
      data["retweet_count"].to_i
    )
  end
end

# Returns the message explaining why +tweet+ must be kept, or nil when the
# tweet should be deleted. The checks run in a fixed order: age, saved IDs,
# saved keywords, likes, retweets.
def reason_to_keep(tweet, tweet_age, tweet_age_in_days)
  if tweet_age < MAX_AGE_IN_SECONDS
    "Ignored a tweet #{tweet_age_in_days} days old"
  elsif IDS_TO_SAVE_FOREVER.include?(tweet.id)
    "Ignored a tweet that is to be saved forever"
  elsif keyword_in(tweet)
    "Ignored a tweet with a favored keyword"
  elsif tweet.favorite_count >= LIKE_THRESHOLD
    "Ignored a tweet with >= #{LIKE_THRESHOLD} likes"
  elsif tweet.retweet_count >= RT_THRESHOLD
    "Ignored a tweet with >= #{RT_THRESHOLD} RTs"
  end
end

client = Twitter::REST::Client.new do |config|
  config.consumer_key = CONSUMER_KEY
  config.consumer_secret = CONSUMER_SECRET
  config.access_token = OAUTH_TOKEN
  config.access_token_secret = OAUTH_TOKEN_SECRET
end

puts
puts "What's that sound...?"
puts

tweets = load_archive_tweets(DATA_FILE)

puts
puts "Found #{tweets.length} tweets in archive data"
puts

tweets.each do |tweet|
  next if $already_deleted.transaction { $already_deleted[tweet.id] }

  # NOTE(review): archiving happens for every tweet not yet recorded as
  # deleted, including tweets that end up being kept - confirm this is
  # intended rather than archiving only the tweets about to be deleted.
  archive_tweet(tweet) if ARCHIVE

  tweet_age = NOW_IN_SECONDS - tweet.created_at
  tweet_age_in_days = (tweet_age / (24 * 60 * 60)).round

  keep_message = reason_to_keep(tweet, tweet_age, tweet_age_in_days)
  if keep_message
    puts keep_message
  else
    puts "Deleting a tweet #{tweet_age_in_days} days old"
    delete_from_twitter(tweet, client)
  end

  puts " #{tweet.text}"
  puts ""
rescue Twitter::Error::TooManyRequests => e
  # NOTE(review): reset_in may be nil when the response carries no reset
  # header; fall back to 15 minutes so sleep never receives nil - confirm
  # the fallback duration.
  wait_seconds = e.rate_limit.reset_in || (15 * 60)
  $stderr.puts "Hit the rate limit; pausing for #{wait_seconds} seconds"
  sleep wait_seconds
  retry
rescue StandardError => e
  puts e.inspect
  # Exit with a failure status so wrappers can tell the run was aborted.
  exit 1
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment