Last active
December 2, 2022 18:19
-
-
Save gwire/0ea2b4424a2c7aa203c558b2b3d3cc26 to your computer and use it in GitHub Desktop.
Download missing mastodon avatars to the cache
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env ruby | |
# Remote mastodon accounts can be refreshed with the command | |
# tootctl accounts refresh --all | |
# https://docs.joinmastodon.org/admin/tootctl/#accounts-refresh | |
# however, tootctl was having issues, so I ended up writing this
# helper script to scan a mastodon cache for missing avatar/header images,
# prioritising recently active accounts, and download them directly
# | |
# note: make sure $cache_path is correct! | |
# assumes a path like .../accounts/avatars/888/888/888/888/888/888/original/filename.jpeg | |
require 'pg' | |
require 'net/http' | |
require 'uri' | |
require 'fileutils' | |
# Root directory of the media cache on disk (no trailing slash); the
# per-account directories produced by the query below are appended to it.
$cache_path = "/home/mastodon/live/public/system/cache"

# Prefix of the admin page URL for an account; the account id is appended
# when a message suggesting a manual refresh is printed.
$admin_url = "https://example.social/admin/accounts/"

# When true, files that already exist in the cache also have their size on
# disk compared with the size recorded in the database.
$check_size = false

conn = PG.connect(dbname: 'mastodon')

# One row per account, most recently updated first. The avatar/header
# directory is built by cutting the textual id into six 3-character groups,
# i.e. the .../888/888/888/888/888/888/original/ layout this script assumes.
accounts_query = <<~SQL
  SELECT
  id,
  CONCAT(username,'@',domain) AS user,
  CONCAT('/accounts/avatars/',
  SUBSTRING(id::text, 1, 3), '/',
  SUBSTRING(id::text, 4, 3), '/',
  SUBSTRING(id::text, 7, 3), '/',
  SUBSTRING(id::text, 10, 3),'/',
  SUBSTRING(id::text, 13, 3),'/',
  SUBSTRING(id::text, 16, 3),'/original/') AS avatar_file_path,
  avatar_file_name,
  avatar_file_size,
  avatar_remote_url,
  CONCAT('/accounts/headers/',
  SUBSTRING(id::text, 1, 3), '/',
  SUBSTRING(id::text, 4, 3), '/',
  SUBSTRING(id::text, 7, 3), '/',
  SUBSTRING(id::text, 10, 3),'/',
  SUBSTRING(id::text, 13, 3),'/',
  SUBSTRING(id::text, 16, 3),'/original/') AS header_file_path,
  header_file_name,
  header_file_size,
  header_remote_url,updated_at
  FROM public.accounts
  ORDER BY updated_at DESC
SQL
res = conn.exec(accounts_query)

# Optional clauses kept for reference; splice them into the query above to
# narrow or reorder the scan:
#WHERE avatar_updated_at <'2022-11-29'::date
#AND updated_at > '2022-11-24'::date
#LIMIT 6000
#ORDER BY RANDOM ()
# Prints the admin page URL of an account to STDOUT, followed by the URL
# that failed and a short reason, so the account can be refreshed by hand.
#
# @param id [String, Integer] account id appended to $admin_url
# @param user [String] account handle, printed after "/#@"
# @param url [String] the URL that could not be fetched
# @param reason [String] HTTP status code or short error label
# @param note [String] optional suffix for the first output line
# @return [void]
def report_account_problem(id, user, url, reason, note = "")
  STDOUT.puts $admin_url + id.to_s + '/#@' + user + note
  STDOUT.puts " " + url.to_s + " " + reason
end

# Fetches one remote avatar/header image and stores it in the local cache.
#
# On HTTP 200 the body is written to +path+ + +filename+, creating the
# directory first. Redirects (301/302/303/307/308) are followed at most
# three times, and a relative Location is resolved against the current URL.
# Every other outcome is only reported: listed HTTP error codes and the
# rescued network/URL errors go to STDOUT together with the account's admin
# URL, anything else goes to STDERR. The rescued errors are not re-raised,
# so one bad account does not abort a scan.
#
# @param type [String] label used in messages, e.g. "avatar" or "header"
# @param user [String] account handle used in messages
# @param id [String] account id used to build the admin URL in messages
# @param url [String] remote URL to fetch
# @param path [String] destination directory, including the trailing "/"
# @param filename [String] name of the file to create inside +path+
# @param expected_size [Integer] size recorded in the database. It is not
#   enforced: a body of a different size is stored anyway, and the
#   parameter is kept so existing callers keep working.
# @param attempt [Integer] number of redirects already followed
# @return [void]
def download_file(type, user, id, url, path, filename, expected_size, attempt = 0)
  file_dest = path + filename
  resp = Net::HTTP.get_response(URI(url))

  case resp.code
  when "200"
    FileUtils.mkdir_p(path)
    File.open(file_dest, "wb") { |file| file.write(resp.body) }
    STDERR.puts "___ error writing file?" + file_dest unless File.exist?(file_dest)
  when "301", "302", "303", "307", "308"
    location = resp['location']
    if location.nil? || location.empty?
      STDERR.puts "___ problem downloading " + type + " for " + user + " code " + resp.code + " without location"
    elsif attempt < 3
      # Location may be relative, so resolve it against the URL just requested.
      next_url = URI.join(url, location).to_s
      download_file(type, user, id, next_url, path, filename, expected_size, attempt + 1)
    else
      STDERR.puts "___ problem downloading " + type + " for " + user + " too many redirects"
    end
  when "401", "403", "404", "500", "502", "503", "520", "521", "522"
    ## 404 usually means the user has changed their uploads
    ## 403 usually means the same when an object store like s3 is in use
    ## the profile should be refreshed at this point; the easiest way is to
    ## print the account admin URL so refresh can be clicked manually
    report_account_problem(id, user, url, resp.code, " refresh needed?")
  else
    STDERR.puts "___ problem downloading " + type + " for " + user + " code " + resp.code
  end
rescue Net::OpenTimeout
  ## the most common cause appears to be servers with an IPv6 address that doesn't respond
  report_account_problem(id, user, url, "open_timeout")
rescue Net::ReadTimeout
  report_account_problem(id, user, url, "read_timeout")
rescue OpenSSL::SSL::SSLError
  report_account_problem(id, user, url, "ssl_error")
rescue SocketError
  ## most common cause is the domain missing from DNS
  report_account_problem(id, user, url, "socket_error")
rescue Errno::ECONNREFUSED
  report_account_problem(id, user, url, "connection_refused")
rescue Errno::EHOSTUNREACH, Errno::ENETUNREACH
  report_account_problem(id, user, url, "unreachable")
rescue Errno::ECONNRESET, EOFError
  report_account_problem(id, user, url, "connection_reset")
rescue URI::InvalidURIError
  ## stored URL or redirect target that cannot be parsed (e.g. contains spaces)
  report_account_problem(id, user, url, "invalid_url")
end
# Makes sure the cached copy of one image of an account exists, and
# optionally checks its size.
#
# Reads the "<kind>_file_name", "<kind>_file_path", "<kind>_file_size" and
# "<kind>_remote_url" columns of +row+. If the file is missing from the
# cache and a remote URL is known, it is downloaded via download_file. When
# $check_size is set and the file exists, a line is printed to STDOUT if the
# size on disk differs from the size recorded in the database.
#
# @param row [Hash] one result row of the accounts query
# @param kind [String] "avatar" or "header"
# @return [void]
def ensure_cached_image(row, kind)
  filename = row["#{kind}_file_name"]
  # An empty name would make the cache directory itself look like the file.
  return if filename.nil? || filename.empty?

  dir = $cache_path + row["#{kind}_file_path"]
  cached_file = dir + filename
  remote_url = row["#{kind}_remote_url"]
  db_size = row["#{kind}_file_size"]

  if !File.exist?(cached_file) && remote_url && !remote_url.empty?
    download_file(kind, row['user'], row['id'], remote_url, dir, filename, db_size.to_i)
  end

  # Checked again after the download attempt so a freshly fetched file is verified too.
  return unless $check_size && File.exist?(cached_file)

  actual_size = File.size(cached_file)
  return if actual_size == db_size.to_i

  # db_size may be nil (NULL column), hence to_s rather than plain concatenation.
  STDOUT.puts " " + cached_file + " size " + actual_size.to_s + " is not db_size " + db_size.to_s + " " + row['user']
end

# Walk the accounts in query order, handling the avatar before the header.
res.each do |row|
  %w[avatar header].each { |kind| ensure_cached_image(row, kind) }
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment