Last active
August 29, 2015 14:18
-
-
Save WardCunningham/9792cb44076a5ca60c29 to your computer and use it in GitHub Desktop.
Federation Scraper
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
logs
sites
.DS_Store
*.numbers
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
require 'net/http'
require 'json'

# Crawl entry point: the first federation site to visit.
root = 'h2.ward.asia.wiki.org'

# @fed maps each visited site to the list of page slugs found there.
@fed = {}
# @nxt is the FIFO queue of sites waiting to be fetched.
@nxt = [root]
# Add +site+ to the crawl queue unless it has already been visited
# (recorded in @fed) or is already waiting in @nxt.
def queue site
  unless @fed[site] || @nxt.include?(site)
    @nxt << site
    puts "queueing #{site} of #{@nxt.length}"
  end
end
# Yield the 'site' attribute of every story item and journal action
# in +page+ that carries one. Malformed pages (missing story or
# journal) are logged and skipped rather than aborting the crawl.
def contexts site, slug, page
  page['story'].each do |item|
    yield item['site'] if item['site']
  end
  page['journal'].each do |action|
    yield action['site'] if action['site']
  end
rescue
  puts "context #{site}/#{slug} rescued"
end
# Report and enqueue every external site cited by +pages+ (a hash of
# slug => page). Each referenced site is printed once per page; every
# occurrence is handed to +queue+, which deduplicates.
def tally site, pages
  pages.each do |slug, page|
    marks = {}
    contexts site, slug, page do |ref|
      puts "\t#{site}: #{page['title']} => #{ref}" unless marks[ref]
      marks[ref] = true
      queue ref
    end
  end
end
# Fallback crawl of +site+ via its sitemap: fetch /system/sitemap.json,
# then each listed page individually, throttled to ~5 requests/second.
# Records the site's slugs in @fed and yields (site, pages) on success;
# network or parse errors are logged and swallowed.
def sitemap site
  res = Net::HTTP.get_response site, '/system/sitemap.json'
  if res.code == '200'
    @fed[site] = slugs = JSON.parse(res.body).map {|each| each['slug']}
    pages = {}
    bytes = 0
    slugs.each do |slug|
      res = Net::HTTP.get_response site, "/#{slug}.json"
      pages[slug] = JSON.parse(res.body)
      # puts "fetch #{site} slug #{slug} got #{res.body.length} bytes"
      bytes += res.body.length
      sleep 0.2 # be polite to the remote server
    end
    puts "fetch #{site} sitemap got #{bytes} bytes"
    yield site, pages
  else
    puts "fetch #{site} sitemap got code #{res.code}"
  end
rescue
  puts "fetch #{site} sitemap rescued"
end
# Fetch the complete page export for +site+ (/system/export.json).
# On success, record its slugs in @fed and yield (site, pages).
# Network or parse errors are logged and swallowed.
def fetch site
  res = Net::HTTP.get_response site, '/system/export.json'
  if res.code == '200'
    puts "fetch #{site} export got #{res.body.length} bytes"
    pages = JSON.parse res.body
    @fed[site] = pages.keys
    yield site, pages
  else
    puts "fetch #{site} export got code #{res.code}"
    # sitemap site
  end
rescue
  puts "fetch #{site} export rescued"
end
# Breadth-first crawl: pull sites off the queue, cache each export
# under sites/<site>, and tally its citations. +max+ caps the total
# number of fetches so a runaway crawl terminates.
max = 999
while @nxt.length > 0 && (max -= 1) >= 0
  fetch @nxt.shift do |site, pages|
    File.open("sites/#{site}", "w") do |file|
      file.write JSON.pretty_generate pages
      tally site, pages
    end
  end
end
puts "done"
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
require 'net/http'
require 'json'

# Seed site for the crawl.
root = 'h2.ward.asia.wiki.org'

# @fed: site => list of page slugs, for every site already fetched.
@fed = {}
# @nxt: FIFO queue of sites still to fetch.
@nxt = [root]
# Enqueue +site+ for fetching unless already visited or pending.
def queue site
  unless @fed[site] || @nxt.include?(site)
    @nxt << site
    puts "queueing #{site} of #{@nxt.length}"
  end
end
# Yield each referenced site from the page's story items and journal
# actions. Pages without the expected structure are logged and skipped.
def contexts site, slug, page
  page['story'].each do |item|
    yield item['site'] if item['site']
  end
  page['journal'].each do |action|
    yield action['site'] if action['site']
  end
rescue
  puts "context #{site}/#{slug} rescued"
end
# For every page, print each cited site once and enqueue it for
# crawling via +queue+ (which deduplicates repeat references).
def tally site, pages
  pages.each do |slug, page|
    marks = {}
    contexts site, slug, page do |ref|
      puts "\t#{site}: #{page['title']} => #{ref}" unless marks[ref]
      marks[ref] = true
      queue ref
    end
  end
end
# Fallback crawl of +site+ via its sitemap: fetch /system/sitemap.json,
# then each listed page individually, throttled to ~5 requests/second.
# Records the site's slugs in @fed and tallies the pages on success;
# errors are logged and swallowed.
def sitemap site
  res = Net::HTTP.get_response site, '/system/sitemap.json'
  if res.code == '200'
    @fed[site] = slugs = JSON.parse(res.body).map {|each| each['slug']}
    pages = {}
    bytes = 0
    slugs.each do |slug|
      res = Net::HTTP.get_response site, "/#{slug}.json"
      pages[slug] = JSON.parse(res.body)
      # puts "fetch #{site} slug #{slug} got #{res.body.length} bytes"
      bytes += res.body.length
      sleep 0.2 # be polite to the remote server
    end
    puts "fetch #{site} sitemap got #{bytes} bytes"
    tally site, pages
  else
    puts "fetch #{site} sitemap got code #{res.code}"
  end
rescue
  puts "fetch #{site} sitemap rescued"
end
# Fetch the complete export for +site+; on success record its slugs
# in @fed and tally citations. If the export endpoint is unavailable,
# fall back to crawling the site page-by-page via +sitemap+.
def fetch site
  res = Net::HTTP.get_response site, '/system/export.json'
  if res.code == '200'
    puts "fetch #{site} export got #{res.body.length} bytes"
    pages = JSON.parse res.body
    @fed[site] = pages.keys
    tally site, pages
  else
    puts "fetch #{site} export got code #{res.code}"
    sitemap site
  end
rescue
  puts "fetch #{site} export rescued"
end
# Breadth-first crawl, capped at 5 sites (a short test run).
max = 5
while @nxt.length > 0 && (max -= 1) >= 0
  fetch @nxt.shift
end
puts "done"
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
require 'json'
require 'csv'
require 'pp'

# Offline analysis pass: walk the page exports cached under sites/
# by the crawler, starting from the root site.
root = 'h2.ward.asia.wiki.org'

# @fed: site => list of page slugs, for every cached site processed.
@fed = {}
# @nxt: FIFO queue of sites still to process.
@nxt = [root]
# Accumulates a sparse table of integer counts keyed by (row, column)
# and writes it out as a CSV spreadsheet.
class Tally
  def initialize
    # Block form gives each row its own Hash (a shared default would
    # alias one hash across all rows).
    @rows = Hash.new {|h,k| h[k] = Hash.new(0)}
    @columns = Hash.new(0)
  end

  # Add +n+ to the cell at (row, column); also bumps the per-column
  # update counter used to discover the set of column headings.
  def count row, column, n=1
    @rows[row][column] += n
    @columns[column] += 1
  end

  # Write the table to csv/<name>.csv: a header row of column keys,
  # then one row per accumulated row key. Also pretty-prints the rows.
  def save name
    keys = @columns.keys.to_a
    CSV.open "csv/#{name}.csv", 'wb' do |file|
      file << [name, keys].flatten
      @rows.each do |row, hash|
        file << [row, keys.collect {|key| hash[key]}].flatten
      end
    end
    pp @rows
  end
end

# Global accumulator of per-site statistics.
@sites = Tally.new
# Enqueue +site+ for analysis unless already processed or pending.
def queue site
  unless @fed[site] || @nxt.include?(site)
    @nxt << site
    # puts "queueing #{site} of #{@nxt.length}"
  end
end
# Yield each referenced site from the page's story items and journal
# actions. Pages without the expected structure are logged and skipped.
def contexts site, slug, page
  page['story'].each do |item|
    yield item['site'] if item['site']
  end
  page['journal'].each do |action|
    yield action['site'] if action['site']
  end
rescue
  puts "context #{site}/#{slug} rescued"
end
# Accumulate statistics for one cached site into @sites: page, story
# item, and journal action counts; a 'citations' credit for every
# distinct page-level reference to a site already in @fed; and the
# number of distinct neighbor sites referenced anywhere on the site.
def tally site, pages
  cites = Hash.new(0)
  pages.each do |slug, page|
    @sites.count site, 'pages'
    @sites.count site, 'actions', (page["journal"]||[]).length
    @sites.count site, 'items', (page['story']||[]).length
    marks = {}
    contexts site, slug, page do |ref|
      cites[ref] += 1
      # puts "\t#{site}: #{page['title']} => #{ref}" unless marks[ref]
      marks[ref] = true
      queue ref
    end
    marks.keys.each do |cite|
      @sites.count cite, 'citations' if @fed[cite]
    end
  end
  @sites.count site, 'neighbors', cites.keys.size
end
# Load the cached export for +site+ from sites/<site>, record its
# slugs in @fed, and tally its statistics. Sites never cached (or
# unreadable) are logged and skipped.
def fetch site
  file = "sites/#{site}"
  if File.exist? file
    body = File.read file
    puts "fetch #{site} export got #{body.length} bytes"
    pages = JSON.parse body
    @fed[site] = pages.keys
    tally site, pages
  else
    puts "fetch #{site} not in cache"
  end
rescue
  puts "fetch #{site} fetch rescued"
end
puts "start"
# Drain the queue (bounded at 999 sites), then write the per-site
# statistics table to csv/sites.csv.
max = 999
while @nxt.length > 0 && (max -= 1) >= 0
  fetch @nxt.shift
end
# pp @fed
@sites.save 'sites'
puts "done"
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment