Skip to content

Instantly share code, notes, and snippets.

@harrisj
Created June 22, 2012 20:16
Show Gist options
  • Save harrisj/2974940 to your computer and use it in GitHub Desktop.
Save harrisj/2974940 to your computer and use it in GitHub Desktop.
Basic code for the nytimes_ebooks Twitter bot
# encoding: UTF-8
require 'rubygems'
require 'rest-client'
require 'nokogiri'
# Downloads New York Times pages and extracts their plain text and the
# quoted passages inside that text.
class ArticleFetcher
  # Returns the body text for +url+, choosing the extraction strategy from
  # the host name.
  #
  # Video pages yield an empty string. URLs that match none of the known
  # patterns also yield an empty string (rather than nil) so callers can
  # always treat the result as a String.
  def self.fetch_text(url)
    case url
    when /video\.nytimes\.com/
      ''
    when /\.blogs\.nytimes\.com/, /dealbook\.nytimes\.com/
      fetch_blog_text(url)
    when /\.nytimes\.com/
      fetch_article_text(url)
    else
      ''
    end
  end

  # Returns the text inside the "div.entry-content" elements of a blog post.
  def self.fetch_blog_text(url)
    fetch_css_text(url, "div.entry-content")
  end

  # Returns the text inside the "div.articleBody" elements of an article,
  # adding the "pagewanted=all" query parameter to the request URL.
  def self.fetch_article_text(url)
    separator = url.include?('?') ? '&' : '?'
    fetch_css_text("#{url}#{separator}pagewanted=all", "div.articleBody")
  end

  # Returns the contents of every double-quoted passage (straight or curly
  # quotes) in +text+, one per line. A quote that ends in a comma and is
  # followed by another quote has that comma replaced with a period.
  def self.extract_quotes(text)
    out = text.scan(/["“]([^"”]*)[”"]/m).join("\n")
    out.gsub(/,\n/m, ".\n")
  end

  # Fetches +url+ and returns only the quoted passages from its text.
  def self.fetch_quotes(url)
    extract_quotes(fetch_text(url))
  end

  # Fetches +url+, parses the response as HTML and returns the combined
  # inner text of all nodes matching the CSS +selector+.
  def self.fetch_css_text(url, selector)
    response_html = RestClient.get(url).to_s
    Nokogiri::HTML(response_html).css(selector).inner_text
  end
  private_class_method :fetch_css_text
end
$:.unshift(File.dirname(__FILE__))
require 'rubygems'
require 'article_fetcher'
require 'markov'
require 'open-uri'
require 'simple-rss'
require 'colorize'
require 'twitter_db'
# Feed whose items are turned into queued tweets.
RSS_FEED_URL = 'http://www.nytimes.com/services/xml/rss/nyt/HomePage.xml'
rss = SimpleRSS.parse open(RSS_FEED_URL)
Tweet.connect_to_db
# When DEBUG is set, items already in the database are processed again.
debugging = ENV['DEBUG']

rss.items.each do |item|
  next if Tweet.in_db?(item.guid) && !debugging

  text = ArticleFetcher.fetch_text(item.guid)
  # Guard against a nil result as well as an empty one, so an item whose URL
  # the fetcher has no handler for is skipped instead of raising.
  next if text.nil? || text.empty?

  # Prefer quoted passages as source material; fall back to the full text
  # when there are fewer than 400 characters of quotes.
  quotes = ArticleFetcher.extract_quotes(text)
  if quotes.length < 400
    puts "Not using quotes for this one: #{item.guid}"
    quotes = text
  end

  # Strip periods from initialisms ("U.S." -> "US") and from common title
  # abbreviations ("Dr." -> "Dr").
  # NOTE(review): presumably so the Markov chainer does not treat them as
  # sentence endings - confirm against the MarkovChainer class.
  quotes.gsub!(/(([A-Z]\.)+)/) {|w| w.gsub('.', '')}
  quotes.gsub!(/(Dr|Mr|Mrs|Gov|Amb|Hon|Ave)\./, '\1')

  markov = MarkovChainer.new(1)
  markov.add_text(quotes)

  # Generate up to five sentences, stopping at the first one that does not
  # look like it contains a name (an all-caps run, a capitalised word that is
  # not at the start, or a title). If every attempt is rejected, the last
  # generated sentence is used anyway.
  retries = 5
  while retries > 0
    retries -= 1
    body = markov.generate_sentence
    case body
    when /[A-Z][A-Z]+/, /^.+\b[A-Z][a-z]+/, /(Dr|Mr|Mrs|Gov|Rep)\s/
      puts "Retrying since this has a name in it: #{body}"
    else
      break
    end
  end

  # Cap the sentence at 100 characters, then drop the trailing (possibly
  # cut-off) word.
  if body.length > 100
    body = body[0,100]
    body.gsub!(/\s+\S+$/, '')
  end
  body.gsub!(/\.$/, '')

  Tweet.queue(item.title, body, item.guid)
end

Tweet.post_pending
require 'rubygems'
require 'sqlite3'
require 'bitly'
gem 'activesupport', '~> 2.3.11'
gem 'activerecord', '~> 2.3.11'
require 'active_support'
require 'active_record'
gem 'twitter', '~> 2.4.0'
require 'twitter'
# SQLite database file, located in the same directory as this file.
DB_PATH = File.join(File.dirname(__FILE__), 'tweets.db')

Bitly.use_api_version_3

# Credentials used by the Twitter client when posting updates.
Twitter.configure do |twitter|
  twitter.consumer_key = 'REDACTED'
  twitter.consumer_secret = 'REDACTED'
  twitter.oauth_token = 'REDACTED'
  twitter.oauth_token_secret = 'REDACTED'
end
# Migration that builds the schema used by Tweet and TweetMetadata.
class CreateTwitterDb < ActiveRecord::Migration
  # Creates the tweets table (with indexes on :posted and :expanded_link)
  # and the single-column tweet_metadata table.
  def self.up
    create_table :tweets do |tweets|
      tweets.string :nyt_title, :body, :expanded_link
      tweets.boolean :posted, :expired, :default => false, :null => false
      tweets.datetime :created_at, :posted_at
    end
    [:posted, :expanded_link].each { |column| add_index :tweets, column }

    create_table :tweet_metadata do |metadata|
      metadata.datetime :next_post_at
    end
  end
end
# Stores the earliest time at which the next tweet may be posted, in the
# first row of the tweet_metadata table.
class TweetMetadata < ActiveRecord::Base
  set_table_name 'tweet_metadata'

  # True when no row exists yet, or when the stored next_post_at is in the
  # past.
  def self.can_post_again?
    record = first
    return true if record.nil?

    record.next_post_at < Time.now
  end

  # Records that a tweet was just posted by moving next_post_at to five
  # minutes from now plus a random extra delay of up to ten minutes.
  # Creates the row on first use, otherwise updates the existing one.
  def self.tweet_posted
    next_time = Time.now + 5.minutes + rand(10.minutes)
    record = first
    return create(:next_post_at => next_time) if record.nil?

    record.update_attribute(:next_post_at, next_time)
  end
end
# A generated tweet tied to the article it was derived from. Rows are queued
# with Tweet.queue and later sent one at a time by Tweet.post_pending.
class Tweet < ActiveRecord::Base
  BITLY_KEY = 'REDACTED'

  # Tweets that have been neither posted nor expired.
  named_scope :pending, :conditions => {:posted => false, :expired => false}

  # Connects ActiveRecord to the SQLite file at DB_PATH, creating the schema
  # first if the file did not exist before connecting.
  def self.connect_to_db
    # Must be checked before connecting: the decision to build the schema is
    # based on whether the file existed beforehand.
    should_create = !File.exist?(DB_PATH)
    ActiveRecord::Base.establish_connection({
      :adapter => 'sqlite3',
      :database => DB_PATH
    })
    create_db if should_create
  end

  # Returns the bit.ly short URL for +link+, reusing one Bitly client for all
  # calls.
  def self.shorten_link(link)
    @bitly ||= Bitly.new('nytimesebooks', BITLY_KEY)
    @bitly.shorten(link, :history => 1).short_url
  end

  # True when a tweet for the article at +link+ is already stored.
  def self.in_db?(link)
    exists?(:expanded_link => link)
  end

  # Stores a new pending tweet made of +text+ followed by a shortened +link+,
  # unless one already exists for that link, and prints what was queued.
  #
  # When the DEBUG environment variable is set, the title, link and text are
  # only printed; nothing is shortened or stored.
  #
  # Errors from link shortening or record creation propagate to the caller.
  def self.queue(title, text, link)
    if ENV['DEBUG']
      puts title
      puts link
      puts text.colorize(:red)
      return
    end
    return if in_db?(link)

    short_link = shorten_link(link)
    body = "#{text.gsub(/\s+$/, '')} #{short_link}"
    create :nyt_title => title, :body => body, :expanded_link => link
    puts title
    puts link
    puts body.colorize(:red)
  end

  # Flags this tweet as posted. Unless +add_timeout+ is false, also schedules
  # the next allowed posting time via TweetMetadata.tweet_posted.
  def mark_posted!(add_timeout=true)
    update_attribute(:posted, true)
    TweetMetadata.tweet_posted if add_timeout
  end

  # Posts the first pending tweet, if there is one and the posting delay
  # recorded in TweetMetadata has elapsed.
  #
  # A "Status is a duplicate" error marks the tweet as posted without
  # scheduling a new delay; any other error is re-raised.
  def self.post_pending
    tweet = pending.first
    return if tweet.nil?

    unless TweetMetadata.can_post_again?
      puts "Can't post yet"
      return
    end

    begin
      Twitter.update(tweet.body)
      tweet.mark_posted!
    rescue => ex
      raise unless ex.message =~ /Status is a duplicate/
      tweet.mark_posted!(false)
    end
  end

  # Runs the schema migration against the current connection.
  def self.create_db
    CreateTwitterDb.up
  end
  # A bare `private` does not apply to `def self.` methods, so the class
  # method is hidden explicitly.
  private_class_method :create_db
end
@harrisj
Copy link
Author

harrisj commented Jun 22, 2012

My hackish code for nytimes_ebooks (minus the Markov class I'm using). Feel free to use anywhere. MIT license I guess.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment