Created
April 24, 2012 02:58
-
-
Save danneu/2475824 to your computer and use it in GitHub Desktop.
Crawls http://isbullsh.it and stores each blog post title & tag into MongoDB. RE: http://news.ycombinator.com/item?id=3878605
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
require 'anemone'
require 'mongo'
# URL patterns used to decide which links to follow and which pages to scrape.
#
# Each pattern starts with "/" so it only matches on a path-segment boundary
# (e.g. "/homepage/4" is not mistaken for a pagination link), and the post
# patterns use \z rather than $ so they anchor at the end of the whole string,
# not at the end of a line.

# Post URL without a trailing slash, e.g. http://isbullsh.it/2012/66/here-is-a-title
# (the site 301-redirects these to the trailing-slash form).
POST_WITHOUT_SLASH = %r{/\d{4}/\d{2}/[^/]+\z}

# Canonical post URL, e.g. http://isbullsh.it/2012/66/here-is-a-title/
POST_WITH_SLASH = %r{/\d{4}/\d{2}/[\w-]+/\z}

# Either form of a post URL.
ANY_POST = Regexp.union(POST_WITHOUT_SLASH, POST_WITH_SLASH)

# Pagination URL, e.g. http://isbullsh.it/page/4
ANY_PAGE = %r{/page/\d+}

# Anything worth crawling: pagination pages and posts.
ANY_PATTERN = Regexp.union(ANY_PAGE, ANY_POST)
# MongoDB: scraped posts are stored in the "posts" collection of the
# "scraped" database.
db = Mongo::Connection.new.db("scraped")
posts_collection = db["posts"]

# Crawl the blog, following only pagination and post links, and store the
# title and tag of every post page in MongoDB.
Anemone.crawl("http://isbullsh.it") do |anemone|
  # Only crawl links that are pages or blog posts. `select` returns a filtered
  # copy rather than pruning the page's own link list in place.
  anemone.focus_crawl do |page|
    page.links.select { |link| link.to_s =~ ANY_PATTERN }
  end

  # Only scrape blog post pages (URLs that don't end in a slash are just
  # 301 redirects to the slash form).
  anemone.on_pages_like(POST_WITH_SLASH) do |page|
    # Explicit nil guards replace the original inline `rescue nil`, which
    # silenced every error rather than just a missing document or node.
    doc = page.doc
    next unless doc

    title_node = doc.at_xpath("//div[@role='main']/header/h1")
    tag_node = doc.at_xpath("//header/div[@class='post-data']/p/a")
    # Skip pages that lack either element; only complete posts are stored.
    next unless title_node && tag_node

    post = { title: title_node.text, tag: tag_node.text }
    puts "Inserting #{post.inspect}"
    posts_collection.insert(post)
  end
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment