Skip to content

Instantly share code, notes, and snippets.

@kiyoto
Created November 7, 2014 07:11
Show Gist options
  • Save kiyoto/f9dd1d7709d6a92698be to your computer and use it in GitHub Desktop.
Save kiyoto/f9dd1d7709d6a92698be to your computer and use it in GitHub Desktop.
require 'nokogiri'
require 'open-uri'
require 'json'
# Root of the site being scraped; category paths are appended to it.
BASE_URL = 'http://www.tomtunguz.com'

# Raised by fetch_article_detail when an article URL does not point at the
# expected host.
class URLFormatError < RuntimeError
end
# Collects the category paths linked from the page at +base_url+.
#
# @param base_url [String] URL of the page to scan for category links
# @return [Array<String>] distinct href values starting with '/categories/',
#   in the order they first appear on the page
def fetch_categories(base_url)
  # URI.open (not Kernel#open, which no longer opens URLs on Ruby 3.0+);
  # the block form closes the handle as soon as the body has been read.
  html = URI.open(base_url, &:read)
  dom = Nokogiri::HTML(html)
  hrefs = dom.css('a').map { |a| a['href'] }.compact # anchors may lack an href
  # The same category may be linked several times on one page; return each once
  # so callers do not process a category repeatedly.
  hrefs.select { |href| href.start_with?('/categories/') }.uniq
end
# Lists the article links shown on a category page.
#
# @param base_url [String] site root, e.g. 'http://www.tomtunguz.com'
# @param category [String] category path appended to +base_url+
# @return [Array<String>] href values of the links matched by
#   'ul.posts h2 a', in page order
def fetch_article_urls_in_category(base_url, category)
  # URI.open (not Kernel#open, which no longer opens URLs on Ruby 3.0+);
  # the block form closes the handle as soon as the body has been read.
  html = URI.open(base_url + category, &:read)
  dom = Nokogiri::HTML(html)
  # Skip anchors that carry no href instead of failing on nil.
  dom.css('ul.posts h2 a').map { |a| a['href'] }.compact
end
# Returns the text of +node+ with every run of whitespace collapsed to a single
# space and the ends trimmed, or '' when +node+ is nil.
def clean_node_text(node)
  node ? node.text.gsub(/\s+/, ' ').strip : ''
end

# Downloads one article and summarises it as a row of values.
#
# @param article_url [String] absolute article URL; must contain 'tomtunguz.com/'
# @return [Array] [date, url, paragraph count, image count, table count,
#   section (h2) count, title]; date and title are '' when the page has no
#   matching element
# @raise [URLFormatError] if +article_url+ does not contain 'tomtunguz.com/'
def fetch_article_detail(article_url)
  # The dot is escaped so that only the literal host name is accepted.
  unless %r{tomtunguz\.com/}.match(article_url)
    raise URLFormatError, "unexpected article URL: #{article_url.inspect}"
  end

  # URI.open (not Kernel#open, which no longer opens URLs on Ruby 3.0+);
  # the block form closes the handle as soon as the body has been read.
  html = URI.open(article_url, &:read)
  dom = Nokogiri::HTML(html)
  paragraphs = dom.css('div.post p')

  [
    # The last paragraph of the post is taken as the date. Text fields are
    # whitespace-normalised so they cannot inject tabs or newlines into a
    # tab-separated row.
    clean_node_text(paragraphs.last),
    article_url,
    paragraphs.length,                 # num paragraphs
    dom.css('div.post img').length,    # num images
    dom.css('div.post table').length,  # num tables
    dom.css('div.post h2').length,     # num sections
    clean_node_text(dom.css('div.post h1').first) # title
  ]
end
# Header row of the tab-separated report written to stdout.
puts %w[
  date url num_paragraph
  num_images num_tables num_sections
  title category
].join("\t")

# Walk every category, then every article in it, printing one row per article.
fetch_categories(BASE_URL).each do |category|
  fetch_article_urls_in_category(BASE_URL, category).each do |article_url|
    begin
      data = fetch_article_detail(article_url)
      data << category
      puts data.join("\t")
    rescue URLFormatError
      STDERR.puts "failed to parse #{article_url} in #{category}"
      next # the URL was rejected before any request was made; no delay needed
    rescue StandardError => e
      # One failing article (HTTP error, timeout, unexpected markup) is logged
      # and skipped so it does not abort the rest of the crawl.
      STDERR.puts "failed to fetch #{article_url} in #{category}: #{e.class}: #{e.message}"
    end
    # Pause between requests, including after a failed one.
    sleep 1
  end
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment