Created
June 22, 2009 19:50
-
-
Save toto/134154 to your computer and use it in GitHub Desktop.
Produces full-text feeds for Heise Online.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/opt/local/bin/ruby
# Builds a full-text Atom feed for the Heise Online newsticker and writes it
# to STDOUT. The script scrapes the headline list, fetches the articles linked
# from the first LIMIT headlines, and emits one feed entry per article whose
# text and timestamp could be extracted.
require 'rubygems'
require 'scrubyt'
require 'open-uri'
require 'builder'

# Only the first LIMIT headlines of the ticker page are considered.
LIMIT = 5

# Page listing the current headlines.
NEWSTICKER_URL = 'http://www.heise.de/newsticker/classic/'

# Prefix prepended to the site-relative article links found on the ticker page.
SITE_ROOT = 'http://heise.de'

# Browser-like User-Agent header sent with every request.
BROWSER_USER_AGENT = 'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_5_7; de-de) AppleWebKit/530.17 (KHTML, like Gecko) Version/4.0 Safari/530.17'

# Timestamp layout used for the <updated> elements of the feed.
ATOM_TIME_FORMAT = "%Y-%m-%dT%H:%M:%SZ"

# Turns the scraped date text into a Time.
#
# The text is split on '.', ':' and ' '. The first three fields are read as
# day, month and year, and the last two as hour and minute.
# NOTE(review): the value is built with Time.utc, although the site presumably
# prints local (German) time - confirm before relying on the offset.
#
# Returns nil when fewer than five fields are present.
def parse_heise_timestamp(text)
  fields = text.split(/[\.: ]/).map { |field| field.to_i }
  return nil if fields.size < 5

  day, month, year = fields[0], fields[1], fields[2]
  Time.utc(year, month, day, fields[-2], fields[-1])
end

# Cleans scraped article text: collapses runs of spaces, removes the literal
# word 'Anzeige' (German for "advertisement") and squeezes blank lines.
def tidy_article_text(text)
  text.gsub(/ +/, ' ').gsub('Anzeige', '').gsub(/\n\n+/, "\n")
end

# Step 1: collect headline titles together with their link targets.
headlines = Scrubyt::Extractor.define do
  fetch NEWSTICKER_URL, :user_agent => BROWSER_USER_AGENT
  link_title "//h3/a", :write_text => true do
    link_url
  end
end

# Step 2: fetch each linked article and keep the ones that can be parsed.
articles = []
headlines.to_hash.first(LIMIT).each do |headline|
  path = headline[:link_url]
  # Only site-relative links can be prefixed with SITE_ROOT. This check has to
  # run here, outside the extractor definition: a `next` inside that block
  # would merely leave the block instead of skipping the headline.
  next if path.nil? || path =~ /\Ahttps?:/

  article_url = SITE_ROOT + path
  page = Scrubyt::Extractor.define do
    fetch article_url, :user_agent => BROWSER_USER_AGENT
    item '//div[@id="mitte_news"]' do
      item_text '//div[@class="meldung_wrapper"]'
      created_at '//p[@class="news_datum"]'
    end
  end

  article = page.to_hash.first
  # Skip pages where the expected markup was not found.
  next unless article && article[:item_text] && article[:created_at]

  published = parse_heise_timestamp(article[:created_at])
  next unless published

  articles << {:title => headline[:link_title],
               :link => article_url,
               :content => tidy_article_text(article[:item_text]),
               :created_at => published}
end

# Step 3: render the Atom document.
# With nothing scraped there is no article timestamp to report, so fall back
# to the current time instead of failing on a nil entry.
feed_updated = articles.empty? ? Time.now.utc : articles.first[:created_at]

builder = Builder::XmlMarkup.new(:target => STDOUT)
# The Atom namespace declaration is required for the document to be a valid
# Atom feed (RFC 4287).
builder.feed(:xmlns => 'http://www.w3.org/2005/Atom') do |feed|
  feed.title("Heise Online Newsticker")
  feed.updated(feed_updated.strftime(ATOM_TIME_FORMAT))
  articles.each do |article|
    feed.entry do |entry|
      entry.title(article[:title])
      entry.link(:href => article[:link])
      entry.id article[:link]
      entry.content(article[:content], :type => 'text')
      entry.updated(article[:created_at].strftime(ATOM_TIME_FORMAT))
      entry.author do |author|
        author.name('Heise Online')
      end
    end
  end
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment