Created
February 4, 2010 05:22
-
-
Save johnholdun/294344 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# tl;dr
# grabs some tumblr posts found via a google search and records each post's
# word count (plus its url and date) in tldr.yml!
# for nostrich!
require 'rubygems'
require 'json'
require 'open-uri'
require 'yaml'

require 'nokogiri'
# one hash per successfully fetched post: :url, :length (word count), :date
lengths = []

# where are our results?
search_path = 'http://www.google.com/search?hl=en&safe=off&tbo=1&tbs=rltm:1&q=%22tl;dr%22+site:tumblr.com+-tagged&aq=f&aqi=&oq='

# how do we find our result links?
selector = '.l'

# what might our body field be called?
possible_fields = %w[regular-body photo-caption]

# Builds the tumblr JSON API url for a post permalink.
#
# The blog root and the post id are both taken from the "/post/<digits>" path
# segment, so hosts that merely contain "/post" after the scheme slashes
# (postsecret.tumblr.com) or that start with digits are handled correctly.
#
# @param url [String, nil] a post permalink, e.g. "http://x.tumblr.com/post/123/slug"
# @return [String, nil] the api url, or nil when url is not a post permalink
def tumblr_api_path(url)
  match = url.to_s.match(%r{\A(.+?)/post/(\d+)})
  match && "#{match[1]}/api/read/json?id=#{match[2]}"
end

# tumblr's json output is not actually well-formed json -_-
# Drops everything before the first "{" and the trailing ";" so the remainder
# can be handed to JSON.parse. Anchored to the whole string (\A / \z) rather
# than to each line, so a multi-line payload is left intact.
#
# @param raw [String] the raw api response body
# @return [String] the response with the leading and trailing wrapper removed
def tumblr_json_payload(raw)
  raw.to_s.sub(/\A[^{]*/, '').sub(/;\s*\z/, '')
end

# Picks the post's body text: the value of the first field in +fields+ that
# the post actually has.
#
# @param post [Hash] one entry of the api's "posts" array
# @param fields [Array<String>] candidate field names, in priority order
# @return [String] the body, or '' when none of the fields is present
def extract_post_body(post, fields)
  field = fields.find { |name| !post[name].nil? }
  field ? post[field].to_s : ''
end

# Counts whitespace-separated words after removing HTML tags.
# String#split with no pattern already collapses runs of whitespace.
#
# @param html [String, nil] post body, possibly containing markup
# @return [Integer] number of words
def word_count(html)
  html.to_s.gsub(/<[^>]+>/, '').split.size
end

# fetch those post urls; a failed search leaves us with an empty list instead
# of a nil that would blow up in the loop below
urls =
  begin
    # URI.open (not Kernel#open): the latter no longer fetches urls on
    # ruby 3+, and it treats strings starting with "|" as shell commands
    page = Nokogiri::HTML(URI.open(search_path, &:read))
    page.css(selector).map { |link| link[:href] }.compact
  rescue StandardError => e
    warn "could not fetch search results: #{e.message}"
    []
  end

urls.each do |url|
  api_path = tumblr_api_path(url)
  next unless api_path

  # just catch errors all at once like a lazy person:
  # if there's an error, say so and move on to the next url
  begin
    raw_json = URI.open(api_path, &:read)
    post = Array(JSON.parse(tumblr_json_payload(raw_json))['posts']).first
    next unless post

    body = extract_post_body(post, possible_fields)
    lengths << { url: url, length: word_count(body), date: post['date-gmt'] }
  rescue StandardError => e
    warn "skipping #{url}: #{e.message}"
  end
end

File.write('tldr.yml', lengths.to_yaml)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment