Created
June 22, 2012 20:16
-
-
Save harrisj/2974940 to your computer and use it in GitHub Desktop.
Basic code for the nytimes_ebooks Twitter bot
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# encoding: UTF-8 | |
require 'rubygems' | |
require 'rest-client' | |
require 'nokogiri' | |
# Downloads nytimes.com pages and extracts their body text and quotations.
class ArticleFetcher
  # Returns the body text of the page at +url+, choosing a scraper by host.
  #
  # @param url [String] address of the page
  # @return [String] the extracted text; '' for video pages and for any URL
  #   that matches none of the patterns below
  def self.fetch_text(url)
    case url
    when /video\.nytimes\.com/
      # Video pages are deliberately skipped.
      ''
    when /\.blogs\.nytimes\.com/, /dealbook\.nytimes\.com/
      fetch_blog_text(url)
    when /\.nytimes\.com/
      fetch_article_text(url)
    else
      # Always hand back a String so callers can safely call #empty? or #scan.
      ''
    end
  end

  # Returns the text inside <div class="entry-content"> of a blog page.
  #
  # @param url [String] address of the blog post
  # @return [String]
  def self.fetch_blog_text(url)
    page_text(url, "div.entry-content")
  end

  # Returns the text inside <div class="articleBody"> of an article page,
  # requesting it with the pagewanted=all query parameter.
  #
  # @param url [String] address of the article
  # @return [String]
  def self.fetch_article_text(url)
    # Append to an existing query string if there is one, else start one.
    separator = url =~ /\?/ ? "&" : "?"
    page_text("#{url}#{separator}pagewanted=all", "div.articleBody")
  end

  # Collects every double-quoted passage (straight or curly quotes) in +text+.
  #
  # @param text [String] text to search
  # @return [String] the quoted passages joined by newlines, with a comma at
  #   the end of any line (including the last) replaced by a full stop
  def self.extract_quotes(text)
    quotes = text.scan(/["“]([^"”]*)[”"]/m).join("\n")
    # The lookahead also covers the final quote, which has no newline after it.
    quotes.gsub(/,(?=\n|\z)/, '.')
  end

  # Fetches the page at +url+ and extracts its quotations in one step.
  #
  # @param url [String] address of the page
  # @return [String] see {extract_quotes}
  def self.fetch_quotes(url)
    extract_quotes(fetch_text(url))
  end

  # Downloads +url+ and returns the inner text of the nodes matching the CSS
  # +selector+.
  def self.page_text(url, selector)
    response_html = RestClient.get(url).to_s
    Nokogiri::HTML(response_html).css(selector).inner_text
  end
  private_class_method :page_text
end
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
$:.unshift(File.dirname(__FILE__)) | |
require 'rubygems' | |
require 'article_fetcher' | |
require 'markov' | |
require 'open-uri' | |
require 'simple-rss' | |
require 'colorize' | |
require 'twitter_db' | |
# Feed whose items are turned into queued tweets.
RSS_FEED_URL = 'http://www.nytimes.com/services/xml/rss/nyt/HomePage.xml'

rss = SimpleRSS.parse open(RSS_FEED_URL)
Tweet.connect_to_db

# When DEBUG is set, items already in the database are processed again.
debugging = ENV['DEBUG']

rss.items.each do |item|
  next if Tweet.in_db?(item.guid) && !debugging

  text = ArticleFetcher.fetch_text(item.guid)
  # Skip items that produced no usable text (nil or empty).
  next if text.nil? || text.empty?

  # Prefer quoted passages; fall back to the whole text when there are
  # fewer than 400 characters of quotes.
  quotes = ArticleFetcher.extract_quotes(text)
  if quotes.length < 400
    puts "Not using quotes for this one: #{item.guid}"
    quotes = text
  end

  # Remove the periods from runs of capital-letter-plus-period and from a
  # few abbreviations before the text goes to the chain.
  quotes.gsub!(/(([A-Z]\.)+)/) { |w| w.gsub('.', '') }
  quotes.gsub!(/(Dr|Mr|Mrs|Gov|Amb|Hon|Ave)\./, '\1')

  markov = MarkovChainer.new(1)
  markov.add_text(quotes)

  # Try up to five times for a sentence that does not look like it contains
  # a name; the last attempt is used regardless.
  body = nil
  5.times do
    body = markov.generate_sentence
    case body
    when /[A-Z][A-Z]+/, /^.+\b[A-Z][a-z]+/, /(Dr|Mr|Mrs|Gov|Rep)\s/
      puts "Retrying since this has a name in it: #{body}"
    else
      break
    end
  end

  # Trim to 100 characters, then drop the last (possibly cut) word.
  if body.length > 100
    body = body[0, 100]
    body.gsub!(/\s+\S+$/, '')
  end
  body.gsub!(/\.$/, '')

  Tweet.queue(item.title, body, item.guid)
end

Tweet.post_pending
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
require 'rubygems' | |
require 'sqlite3' | |
require 'bitly' | |
gem 'activesupport', '~> 2.3.11' | |
gem 'activerecord', '~> 2.3.11' | |
require 'active_support' | |
require 'active_record' | |
gem 'twitter', '~> 2.4.0' | |
require 'twitter' | |
# SQLite database file, kept in the same directory as this file.
DB_PATH = File.join(File.dirname(__FILE__), "tweets.db")

Bitly.use_api_version_3

# Twitter API credentials (the values are redacted in this copy).
Twitter.configure do |config|
  config.consumer_key = 'REDACTED'
  config.consumer_secret = 'REDACTED'
  config.oauth_token = 'REDACTED'
  config.oauth_token_secret = 'REDACTED'
end
# Migration that creates the two tables used by this file: +tweets+ and
# +tweet_metadata+.
class CreateTwitterDb < ActiveRecord::Migration
  # Creates both tables and the indexes on +tweets+.
  def self.up
    create_table :tweets do |t|
      t.string :nyt_title
      t.string :body
      t.string :expanded_link
      t.boolean :posted, :default => false, :null => false
      t.boolean :expired, :default => false, :null => false
      t.datetime :created_at
      t.datetime :posted_at
    end
    # These columns are the ones the Tweet class filters on.
    add_index :tweets, :posted
    add_index :tweets, :expanded_link

    create_table :tweet_metadata do |t|
      t.datetime :next_post_at
    end
  end
end
# Stores, in the first row of +tweet_metadata+, the earliest time at which
# the next tweet may be posted.
class TweetMetadata < ActiveRecord::Base
  set_table_name 'tweet_metadata'

  # @return [Boolean] true when there is no row yet, when the row has no
  #   +next_post_at+ value, or when that time is already in the past
  def self.can_post_again?
    r = first
    # next_post_at is a nullable column; treat a missing value as "no limit".
    r.nil? || r.next_post_at.nil? || r.next_post_at < Time.now
  end

  # Records that a tweet was just posted by setting +next_post_at+ to five
  # minutes from now plus a random extra of up to ten minutes.
  def self.tweet_posted
    next_time = Time.now + 5.minutes + rand(10.minutes)
    r = first
    if r.nil?
      create :next_post_at => next_time
    else
      r.update_attribute(:next_post_at, next_time)
    end
  end
end
# A generated tweet waiting in, or already sent from, the +tweets+ table.
class Tweet < ActiveRecord::Base
  BITLY_KEY = 'REDACTED'

  # Tweets that are neither posted nor expired.
  named_scope :pending, :conditions => {:posted => false, :expired => false}

  # Opens the SQLite database at DB_PATH, creating the schema first if the
  # file did not exist before connecting.
  def self.connect_to_db
    # Must be checked before the connection is established.
    should_create = !File.exist?(DB_PATH)

    ActiveRecord::Base.establish_connection({
      :adapter => 'sqlite3',
      :database => DB_PATH
    })

    create_db if should_create
  end

  # Shortens +link+ through Bitly, reusing one client across calls.
  #
  # @param link [String] URL to shorten
  # @return [String] the short URL
  def self.shorten_link(link)
    @bitly ||= Bitly.new('nytimesebooks', BITLY_KEY)
    @bitly.shorten(link, :history => 1).short_url
  end

  # @param link [String] article URL
  # @return [Boolean] whether a tweet for +link+ is already stored
  def self.in_db?(link)
    exists?(:expanded_link => link)
  end

  # Stores a new tweet made of +text+ followed by a shortened +link+, unless
  # one for +link+ already exists. When the DEBUG environment variable is set
  # it only prints the inputs and stores nothing.
  #
  # Errors raised while shortening or saving are not rescued here.
  #
  # @param title [String] headline, stored as +nyt_title+
  # @param text [String] generated tweet text
  # @param link [String] article URL
  def self.queue(title, text, link)
    if ENV['DEBUG']
      puts title
      puts link
      puts text.colorize(:red)
      return
    end

    return if in_db?(link)

    short_link = shorten_link(link)
    body = "#{text.gsub(/\s+$/, '')} #{short_link}"
    create :nyt_title => title, :body => body, :expanded_link => link
    puts title
    puts link
    puts body.colorize(:red)
  end

  # Flags this tweet as posted.
  #
  # @param add_timeout [Boolean] when true, also tells TweetMetadata that a
  #   tweet was posted so the next-post time is pushed forward
  def mark_posted!(add_timeout=true)
    update_attribute(:posted, true)
    TweetMetadata.tweet_posted if add_timeout
  end

  # Sends the first pending tweet, if there is one and TweetMetadata allows
  # posting. A "Status is a duplicate" error marks the tweet as posted without
  # moving the next-post time; any other error is re-raised.
  def self.post_pending
    t = pending.first
    return if t.nil?

    unless TweetMetadata.can_post_again?
      puts "Can't post yet"
      return
    end

    begin
      Twitter.update(t.body)
      t.mark_posted!
    rescue => ex
      raise unless ex.message =~ /Status is a duplicate/
      t.mark_posted!(false)
    end
  end

  # Runs the migration that creates the tables.
  # NOTE(review): a bare `private` used to precede this method, but that
  # keyword does not apply to `def self.` methods, so it has always been
  # publicly callable; it is left public here to keep the interface unchanged.
  def self.create_db
    CreateTwitterDb.up
  end
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
My hackish code for nytimes_ebooks (minus the Markov class I'm using). Feel free to use anywhere. MIT license I guess.