Last active
December 20, 2015 08:29
-
-
Save mildmojo/6100700 to your computer and use it in GitHub Desktop.
Script for pulling out all the Twitter haters blogged at http://gamerfury.tumblr.com (all sorts o' trigger warnings on that one).
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env ruby | |
# | |
# haters.rb | |
# | |
# Finds all the Twitter users posted to http://gamerfury.tumblr.com and prints | |
# them, newline-delimited, to STDOUT. Require this file in your own script to | |
# access ScumbagHaterFinder directly. | |
# | |
# Tested with Ruby 2.0.0. | |
# | |
# Example usage: | |
# $ ruby haters.rb --verbose | |
# hater1 https://twitter.com/hater1 | |
# ... | |
# | |
# Example mass-block ('gem install t' for the lovely 't' CLI Twitter client) | |
# $ ruby haters.rb | xargs t block | |
# | |
require 'open-uri' | |
require 'json' | |
require 'csv' | |
require 'optparse' | |
require 'rss' | |
require 'rexml/document' | |
# ScumbagHaterFinder parses post descriptions for haters. | |
# | |
# Examples: | |
# # Just get post descriptions, no parsing. | |
# descriptions = ScumbagHaterFinder.new('http://gamerfury.tumblr.com').get_all | |
# | |
# # Provide a parser that pulls out the first '@username' in each description. | |
# haters = ScumbagHaterFinder.new('http://gamerfury.tumblr.com').get_all { |desc| | |
# desc.scan(/@(\S+)/).first | |
# } | |
# | |
# ScumbagHaterFinder walks a Tumblr blog's paginated RSS feeds and collects
# one parsed value per post description.
class ScumbagHaterFinder < Struct.new(:base_url)
  # Fetches every page of the blog's RSS feed until an empty/unfetchable
  # page is reached.
  #
  # parser - optional block called with each post description String; should
  #          return a hater name String, or nil/'' to skip that post.
  #          Defaults to returning the raw description.
  #
  # Returns an Array of parser results (nils and empty strings removed).
  def get_all &parser
    # By default, return the whole description.
    parser ||= lambda { |desc| desc }
    haters = []
    page_num = 0
    loop do
      page_num += 1
      url = base_url + (page_num == 1 ? '/rss' : "/page/#{page_num}/rss")
      STDERR.puts "Fetching page #{page_num}..."
      page_haters = get_haters_from_rss(url, parser)
      break if page_haters.empty?  # past the last page (or fetch failed)
      haters.concat page_haters
    end
    haters
  end

  ##############################################################################
  private
  ##############################################################################

  # Fetches and parses a single RSS page, returning the parser's results for
  # its items. Network or parse failures return [] so get_all treats them as
  # "no more pages" instead of crashing.
  def get_haters_from_rss url, parser
    # BUG FIX: Kernel#open lost its URL-opening ability in Ruby 3.0;
    # URI.open is the open-uri entry point.
    rss = URI.open(url) rescue nil  # deliberate best-effort fetch
    return [] unless rss
    # BUG FIX: the original parsed an empty StringIO on fetch failure, which
    # raises RSS::NotWellFormedError (uncaught) instead of ending the loop.
    feed = RSS::Parser.parse(rss) rescue nil
    return [] unless feed
    page_haters = feed.items.map { |item| parser.call(item.description) }
    page_haters.compact.reject(&:empty?)
  end
end
# ScumbagHaterPresenter formats hater arrays into string representations. | |
# | |
# Example: | |
# haters = ScumbagHaterFinder.new('http://gamerfury.tumblr.com').get_all | |
# presenter = ScumbagHaterPresenter.new(haters) | |
# | |
# presenter.as_text | |
# => "hater1\nhater2\n" | |
# | |
# presenter.as_text_verbose | |
# => "hater1 https://twitter.com/hater1\n" | |
# | |
# presenter.as_csv | |
# => "hater1,https://twitter.com/hater1\nhater2,https://twitter.com/hater2\n" | |
# | |
# presenter.as_json | |
# => "{\"haters\":[[\"hater1\",\"https://twitter.com/hater1\"]]}"
# | |
# ScumbagHaterPresenter formats hater arrays into string representations
# (plain text, aligned text, CSV, JSON).
class ScumbagHaterPresenter < Struct.new(:haters)
  # Newline-delimited usernames.
  def as_text
    haters.join("\n")
  end

  # Column-aligned "username  url" lines, newline-delimited.
  def as_text_verbose
    lines = verbose_haters.map { |name, url| format('%-25s %s', name, url) }
    lines.join("\n")
  end

  # One "username,url" CSV row per hater.
  def as_csv
    CSV.generate do |csv|
      verbose_haters.each { |row| csv << row }
    end
  end

  # JSON object of the form {"haters": [[username, url], ...]}.
  def as_json
    JSON.generate(haters: verbose_haters)
  end

  ##############################################################################
  private
  ##############################################################################

  # Pairs each username with its Twitter profile URL.
  def verbose_haters
    haters.map { |name| [name, "https://twitter.com/#{name}"] }
  end
end
if __FILE__ == $0
  options = { format: :as_text }

  # Command-line arg parsing.
  ARGV.options do |opts|
    opts.banner = "Usage: #{File.basename($PROGRAM_NAME)} [--text|--json|--csv] [--output <FILE>] [--verbose]"
    opts.separator ''
    opts.on('-h', '--help', 'Show this help') { raise 'help' }
    opts.on('-v', '--verbose', 'Verbose output') { options[:verbose] = true }
    opts.on('-o', '--output <FILE>', 'Write output to a file') do |file|
      options[:output] = file
    end
    opts.on('-t', '--text', 'Print hater list as newline-delimited text') do
      options[:format] = :as_text
    end
    opts.on('-j', '--json', 'Print hater list as json') do
      options[:format] = :as_json
    end
    opts.on('-c', '--csv', 'Print hater list as CSV') do
      options[:format] = :as_csv
    end
    begin
      opts.parse!
      # --verbose only changes the plain-text format.
      if options[:format] == :as_text && options[:verbose]
        options[:format] = :as_text_verbose
      end
    rescue
      puts opts
      exit
    end
  end

  BASE_URL = 'http://gamerfury.tumblr.com'
  hater_finder = ScumbagHaterFinder.new(BASE_URL)

  # GamerFury usernames are inside <em> tags in post descriptions.
  hater_parser = ->(desc) {
    # Quick & dirty XML sanitization: escape bare ampersands so REXML can
    # parse the HTML fragment. BUG FIX: the original `sub('&', '&')` was a
    # no-op (the '&amp;' replacement was mangled by HTML-entity decoding).
    desc = desc.to_s.gsub('&', '&amp;')
    # Collapse a malformed unclosed tag ("<foo<") down to a single '<'.
    desc = desc.sub(/<[^>]*</, '<')
    # Wrap description in <root> tags so REXML doesn't throw a fit.
    # BUG FIX: a still-malformed description used to raise ParseException and
    # kill the whole run; now that post is just skipped.
    begin
      doc = REXML::Document.new("<root>#{desc}</root>")
    rescue REXML::ParseException
      next nil
    end
    ems = REXML::XPath.match(doc, '//em')
    hater = ems.last
    # Undo the ampersand escaping on the extracted username.
    (hater && hater.text.to_s.gsub('&amp;', '&')) || nil
  }

  haters = hater_finder.get_all(&hater_parser)
  output = ScumbagHaterPresenter.new(haters).send(options[:format])

  if options[:output]
    File.open(options[:output], 'w') do |f|
      f.write output + "\n"
    end
  else
    puts output
  end
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment