gwire · December 2, 2022 18:19
diff --git a/download_missing_avatars.rb b/download_missing_avatars.rb
 #!/usr/bin/env ruby

 # Remote mastodon accounts can be refreshed with the command
 #   tootctl accounts refresh --all
 # https://docs.joinmastodon.org/admin/tootctl/#accounts-refresh

 # however, tootctl was having issues, so I ended up writing this
 # helper script to scan a mastodon cache for missing avatar/header images
 # prioritising recently active accounts, and download directly
 # 
 # note: make sure $cache_path is correct!
 #  assumes a path like .../accounts/avatars/888/888/888/888/888/888/original/filename.jpeg

 require 'pg'
 require 'net/http'
 require 'uri'
 require 'fileutils'

 # Root directory of the on-disk media cache; the path fragments produced by
 # the query below are appended to it to locate each file.
 $cache_path="/home/mastodon/live/public/system/cache"
 # Base URL of the admin account pages; download_file appends the account id
 # to it when it prints accounts that may need a manual refresh.
 $admin_url="https://example.social/admin/accounts/"
 # When true, files already present on disk are also compared against the
 # size recorded in the database and mismatches are printed.
 $check_size=false

 # Connect to the database named 'mastodon'. No host/user is given here, so
 # the client library defaults apply (presumably a local connection; confirm).
 conn = PG.connect(dbname: 'mastodon')
 # Fetch every row of public.accounts, most recently updated first, with:
 #   - id and "username@domain" (as "user")
 #   - avatar/header file name, recorded size and remote URL
 #   - avatar_file_path / header_file_path: the cache sub-directory, built by
 #     slicing the id's decimal text into six 3-character groups, e.g.
 #     /accounts/avatars/888/888/888/888/888/888/original/
 # NOTE(review): the slicing assumes ids with up to 18 digits; shorter ids
 # would give empty trailing groups - confirm against the real cache layout.
 res  = conn.exec("
 SELECT
  id,
  CONCAT(username,'@',domain) AS user,
  CONCAT('/accounts/avatars/',
         SUBSTRING(id::text, 1, 3), '/',
         SUBSTRING(id::text, 4, 3), '/',
         SUBSTRING(id::text, 7, 3), '/',
         SUBSTRING(id::text, 10, 3),'/',
         SUBSTRING(id::text, 13, 3),'/',
         SUBSTRING(id::text, 16, 3),'/original/') AS avatar_file_path,
   avatar_file_name,
   avatar_file_size,
   avatar_remote_url,
   CONCAT('/accounts/headers/',
         SUBSTRING(id::text, 1, 3), '/',
         SUBSTRING(id::text, 4, 3), '/',
         SUBSTRING(id::text, 7, 3), '/',
         SUBSTRING(id::text, 10, 3),'/',
         SUBSTRING(id::text, 13, 3),'/',
         SUBSTRING(id::text, 16, 3),'/original/') AS header_file_path,
    header_file_name,
    header_file_size,
    header_remote_url,updated_at
 FROM public.accounts
 ORDER BY updated_at DESC
 ")

 #WHERE avatar_updated_at <'2022-11-29'::date
 #AND updated_at > '2022-11-24'::date
 #LIMIT 6000
 #ORDER BY RANDOM ()

 # Downloads one remote avatar/header image into the local cache.
 #
 # type          - label used in messages ("avatar" or "header")
 # user          - "username@domain" of the account, used in messages
 # id            - account id as a String; appended to $admin_url in reports
 # url           - remote URL to fetch
 # path          - destination directory, including its trailing slash
 # filename      - name of the file to create inside path
 # expected_size - size recorded in the database; kept for interface
 #                 compatibility, the body is written whatever its size
 # attempt       - number of redirects already followed (at most 3 are followed)
 #
 # Network and URL failures are reported on STDOUT/STDERR instead of being
 # raised, so that a single bad account does not abort the whole scan.
 def download_file(type,user,id,url,path,filename,expected_size,attempt=0)
   file_dest = path + filename
   resp = Net::HTTP.get_response(URI(url))
   case resp.code
   when "200"
     # The body is stored even when its length differs from expected_size:
     # both branches of the earlier size check wrote the file anyway.
     FileUtils.mkdir_p(path)
     File.binwrite(file_dest, resp.body.to_s)
     STDERR.puts "___ error writing file?" + file_dest unless File.exist?(file_dest)
   when "301","302","303","307","308"
     location = resp['location']
     if location.nil? || location.empty?
       # a redirect without a target cannot be followed
       STDERR.puts "___ problem downloading " + type + " for " + user + " code " + resp.code + " without location"
     elsif attempt < 3
       # Location may be relative, so resolve it against the current URL
       download_file(type,user,id,URI.join(url, location).to_s,path,filename,expected_size,attempt + 1)
     else
       STDERR.puts "___ problem downloading " + type + " for " + user + " too many redirects"
     end
   when "401","403","404","500","502","503","520","521","522"
     ## 404 usually means the user has changed their uploads
     ## 403 usually means the user has changed their uploads (and an object store like s3 is in use)
     ## we want to trigger updating the account profile information at this point
     ## but the easiest way for me is to output the account admin URL and manually click refresh
     report_download_problem(id, user, url, resp.code, " refresh needed?")
   else
     STDERR.puts "___ problem downloading " + type + " for " + user + " code " + resp.code
   end
 rescue URI::InvalidURIError
   ## the stored remote URL (or a redirect target) is not a parseable URI
   report_download_problem(id, user, url, "invalid_url")
 rescue Net::OpenTimeout
   ## the most common cause for this appears to be servers that have an IPv6 address that doesn't respond
   report_download_problem(id, user, url, "open_timeout")
 rescue Net::ReadTimeout
   report_download_problem(id, user, url, "read_timeout")
 rescue OpenSSL::SSL::SSLError
   report_download_problem(id, user, url, "ssl_error")
 rescue SocketError
   ## most common cause is domain is missing from DNS
   report_download_problem(id, user, url, "socket_error")
 rescue Errno::ECONNREFUSED
   report_download_problem(id, user, url, "connection_refused")
 rescue Errno::ECONNRESET
   report_download_problem(id, user, url, "connection_reset")
 rescue Errno::EHOSTUNREACH, Errno::ENETUNREACH
   report_download_problem(id, user, url, "unreachable")
 end

 # Prints the admin URL of the account followed by the failing URL and a short
 # reason, so the account can be looked up and refreshed by hand.
 def report_download_problem(id, user, url, reason, note = "")
   # single quotes keep '#@' literal (no instance-variable interpolation)
   STDOUT.puts $admin_url + id + '/#@' + user + note
   STDOUT.puts "  " + url + " " + reason
 end

 # Walk every account row and, for both image kinds, download the file when it
 # is missing from the cache and (optionally) report size mismatches.
 res.each do |row|
   # avatars and headers use the same column naming scheme (<type>_file_name,
   # <type>_file_path, ...), so both are handled by the same code
   %w[avatar header].each do |type|
     file_name = row["#{type}_file_name"]
     next unless file_name

     dir = $cache_path + row["#{type}_file_path"]
     local_file = dir + file_name
     remote_url = row["#{type}_remote_url"]
     db_size = row["#{type}_file_size"]

     # only files that are absent locally and have a remote source are fetched
     if !File.exist?(local_file) && remote_url && !remote_url.empty?
       download_file(type,row['user'],row['id'],remote_url,dir,file_name,db_size.to_i)
     end

     next unless $check_size && File.exist?(local_file)

     actual_size = File.size(local_file)
     unless actual_size == db_size.to_i
       # interpolation keeps this working when the recorded size is NULL (nil)
       STDOUT.puts "  #{local_file} size #{actual_size} is not db_size #{db_size} #{row['user']}"
     end
   end
 end
	#!/usr/bin/env ruby

	# Remote mastodon accounts can be refreshed with the command
	# tootctl accounts refresh --all
	# https://docs.joinmastodon.org/admin/tootctl/#accounts-refresh

	# however, tootctl was having issues, so I ended up writing this
	# helper script to scan a mastodon cache for missing avatar/header images
	# prioritising recently active accounts, and download directly
	#
	# note: make sure $cache_path is correct!
	# assumes a path like .../accounts/avatars/888/888/888/888/888/888/original/filename.jpeg

	require 'pg'
	require 'net/http'
	require 'uri'
	require 'fileutils'

	# Root directory of the on-disk media cache; the path fragments produced by
	# the query below are appended to it to locate each file.
	$cache_path="/home/mastodon/live/public/system/cache"
	# Base URL of the admin account pages; download_file appends the account id
	# to it when it prints accounts that may need a manual refresh.
	$admin_url="https://example.social/admin/accounts/"
	# When true, files already present on disk are also compared against the
	# size recorded in the database and mismatches are printed.
	$check_size=false

	# Connect to the database named 'mastodon'. No host/user is given here, so
	# the client library defaults apply (presumably a local connection; confirm).
	conn = PG.connect(dbname: 'mastodon')
	# Fetch every row of public.accounts, most recently updated first, with the
	# id, "username@domain" (as "user"), the avatar/header file name, recorded
	# size and remote URL, and the cache sub-directory for each image, built by
	# slicing the id's decimal text into six 3-character groups.
	# NOTE(review): the slicing assumes ids with up to 18 digits; shorter ids
	# would give empty trailing groups - confirm against the real cache layout.
	res = conn.exec("
	SELECT
	id,
	CONCAT(username,'@',domain) AS user,
	CONCAT('/accounts/avatars/',
	SUBSTRING(id::text, 1, 3), '/',
	SUBSTRING(id::text, 4, 3), '/',
	SUBSTRING(id::text, 7, 3), '/',
	SUBSTRING(id::text, 10, 3),'/',
	SUBSTRING(id::text, 13, 3),'/',
	SUBSTRING(id::text, 16, 3),'/original/') AS avatar_file_path,
	avatar_file_name,
	avatar_file_size,
	avatar_remote_url,
	CONCAT('/accounts/headers/',
	SUBSTRING(id::text, 1, 3), '/',
	SUBSTRING(id::text, 4, 3), '/',
	SUBSTRING(id::text, 7, 3), '/',
	SUBSTRING(id::text, 10, 3),'/',
	SUBSTRING(id::text, 13, 3),'/',
	SUBSTRING(id::text, 16, 3),'/original/') AS header_file_path,
	header_file_name,
	header_file_size,
	header_remote_url,updated_at
	FROM public.accounts
	ORDER BY updated_at DESC
	")

	#WHERE avatar_updated_at <'2022-11-29'::date
	#AND updated_at > '2022-11-24'::date
	#LIMIT 6000
	#ORDER BY RANDOM ()

	# Downloads one remote avatar/header image into the local cache.
	#
	# type          - label used in messages ("avatar" or "header")
	# user          - "username@domain" of the account, used in messages
	# id            - account id as a String; appended to $admin_url in reports
	# url           - remote URL to fetch
	# path          - destination directory, including its trailing slash
	# filename      - name of the file to create inside path
	# expected_size - size recorded in the database; kept for interface
	#                 compatibility, the body is written whatever its size
	# attempt       - number of redirects already followed (at most 3 are followed)
	#
	# Network and URL failures are reported on STDOUT/STDERR instead of being
	# raised, so that a single bad account does not abort the whole scan.
	def download_file(type,user,id,url,path,filename,expected_size,attempt=0)
	  file_dest = path + filename
	  resp = Net::HTTP.get_response(URI(url))
	  case resp.code
	  when "200"
	    # The body is stored even when its length differs from expected_size:
	    # both branches of the earlier size check wrote the file anyway.
	    FileUtils.mkdir_p(path)
	    File.binwrite(file_dest, resp.body.to_s)
	    STDERR.puts "___ error writing file?" + file_dest unless File.exist?(file_dest)
	  when "301","302","303","307","308"
	    location = resp['location']
	    if location.nil? || location.empty?
	      # a redirect without a target cannot be followed
	      STDERR.puts "___ problem downloading " + type + " for " + user + " code " + resp.code + " without location"
	    elsif attempt < 3
	      # Location may be relative, so resolve it against the current URL
	      download_file(type,user,id,URI.join(url, location).to_s,path,filename,expected_size,attempt + 1)
	    else
	      STDERR.puts "___ problem downloading " + type + " for " + user + " too many redirects"
	    end
	  when "401","403","404","500","502","503","520","521","522"
	    ## 404 usually means the user has changed their uploads
	    ## 403 usually means the user has changed their uploads (and an object store like s3 is in use)
	    ## we want to trigger updating the account profile information at this point
	    ## but the easiest way for me is to output the account admin URL and manually click refresh
	    report_download_problem(id, user, url, resp.code, " refresh needed?")
	  else
	    STDERR.puts "___ problem downloading " + type + " for " + user + " code " + resp.code
	  end
	rescue URI::InvalidURIError
	  ## the stored remote URL (or a redirect target) is not a parseable URI
	  report_download_problem(id, user, url, "invalid_url")
	rescue Net::OpenTimeout
	  ## the most common cause for this appears to be servers that have an IPv6 address that doesn't respond
	  report_download_problem(id, user, url, "open_timeout")
	rescue Net::ReadTimeout
	  report_download_problem(id, user, url, "read_timeout")
	rescue OpenSSL::SSL::SSLError
	  report_download_problem(id, user, url, "ssl_error")
	rescue SocketError
	  ## most common cause is domain is missing from DNS
	  report_download_problem(id, user, url, "socket_error")
	rescue Errno::ECONNREFUSED
	  report_download_problem(id, user, url, "connection_refused")
	rescue Errno::ECONNRESET
	  report_download_problem(id, user, url, "connection_reset")
	rescue Errno::EHOSTUNREACH, Errno::ENETUNREACH
	  report_download_problem(id, user, url, "unreachable")
	end

	# Prints the admin URL of the account followed by the failing URL and a short
	# reason, so the account can be looked up and refreshed by hand.
	def report_download_problem(id, user, url, reason, note = "")
	  # single quotes keep '#@' literal (no instance-variable interpolation)
	  STDOUT.puts $admin_url + id + '/#@' + user + note
	  STDOUT.puts " " + url + " " + reason
	end

	# Walk every account row and, for both image kinds, download the file when it
	# is missing from the cache and (optionally) report size mismatches.
	res.each do |row|
	  # avatars and headers use the same column naming scheme (<type>_file_name,
	  # <type>_file_path, ...), so both are handled by the same code
	  %w[avatar header].each do |type|
	    file_name = row["#{type}_file_name"]
	    next unless file_name

	    dir = $cache_path + row["#{type}_file_path"]
	    local_file = dir + file_name
	    remote_url = row["#{type}_remote_url"]
	    db_size = row["#{type}_file_size"]

	    # only files that are absent locally and have a remote source are fetched
	    if !File.exist?(local_file) && remote_url && !remote_url.empty?
	      download_file(type,row['user'],row['id'],remote_url,dir,file_name,db_size.to_i)
	    end

	    next unless $check_size && File.exist?(local_file)

	    actual_size = File.size(local_file)
	    unless actual_size == db_size.to_i
	      # interpolation keeps this working when the recorded size is NULL (nil)
	      STDOUT.puts " #{local_file} size #{actual_size} is not db_size #{db_size} #{row['user']}"
	    end
	  end
	end