Skip to content

Instantly share code, notes, and snippets.

@metade
Created August 12, 2008 10:02
Show Gist options
  • Save metade/5025 to your computer and use it in GitHub Desktop.
Save metade/5025 to your computer and use it in GitHub Desktop.
require 'rubygems'
require 'open-uri'
require 'hpricot'
require 'addressable/uri'
require 'feed-normalizer'
require 'rdelicious'
# Bookmarks news stories on Delicious, tagging each story with the
# MusicBrainz artist gids whose known URLs the story links to.
#
# The URL -> gids mapping is loaded from 'artists_urls.txt' (tab-separated
# "gid<TAB>url" lines) in the current working directory when an instance is
# created.
class DeliciousMusicNews
  # Scheme prefix of an absolute URL, e.g. "http://".
  URL_SCHEME = %r{\A[A-Za-z][A-Za-z0-9+\-.]*://}.freeze
  # Links back into the BBC site (bbc.co.uk, www.bbc.co.uk, news*.bbc.co.uk).
  BBC_SITE = %r{\Ahttp://(news.*?\.|www\.)?bbc\.co\.uk}.freeze
  # BBC artist pages are the one kind of BBC link that is kept.
  BBC_ARTIST_PAGE = %r{\Ahttp://(www\.)?bbc\.co\.uk/music/artist/\w+}.freeze
  # Links to Delicious itself.
  DELICIOUS_SITE = %r{\Ahttp://del\.icio\.us}.freeze

  # @param username [String] Delicious account name, passed to Rdelicious.new
  # @param password [String] Delicious password, passed to Rdelicious.new
  def initialize(username, password)
    @links_to_gids = links_to_gids
    @delicious = Rdelicious.new(username, password)
  end

  # Parses the feed at +feed_url+, follows every entry to its story page,
  # maps the story's outbound links to artist gids and bookmarks each
  # matching story.
  #
  # @param feed_url [String] URL (or local path) of an RSS/Atom feed
  # @return [Array<String>] the artist gids found in this feed
  def process_feed(feed_url)
    feed = fetch(feed_url) { |io| FeedNormalizer::FeedNormalizer.parse(io) }
    results = {}
    feed.entries.each do |entry|
      external_links_from_news_story(entry.url).each do |uri|
        gids_for_uri(uri).each do |gid|
          (results[gid] ||= []) << entry
        end
      end
    end
    bookmark_entries(results)
    results.keys
  end

  private

  # Posts each story once, carrying the tags of *all* artists it links to.
  # Posting per gid would lose every tag after the first, because the URL
  # already exists by the time the second gid is processed.
  #
  # @param results [Hash{String => Array}] gid => entries linking to that artist
  def bookmark_entries(results)
    entries_by_url = {}
    tags_by_url = {}
    results.each do |gid, entries|
      tag = "musicbrainz/artist/#{gid}"
      entries.each do |entry|
        entries_by_url[entry.url] ||= entry
        tags = (tags_by_url[entry.url] ||= [])
        tags << tag unless tags.include?(tag)
      end
    end

    entries_by_url.each do |url, entry|
      next if @delicious.url_exists?(url)

      # NOTE(review): tags are joined with spaces (the Delicious tag
      # convention); assumes Rdelicious passes the string through as-is --
      # TODO confirm.
      @delicious.add(url, entry.title, entry.description, tags_by_url[url].join(' '))
    end
  end

  # Looks up the gids registered for +uri+, trying the URL both with and
  # without a trailing slash.
  #
  # @param uri [#to_s]
  # @return [Array<String>] unique gids, empty when the URL is unknown
  def gids_for_uri(uri)
    bare = uri.to_s.sub(%r{/\z}, '')
    [bare, "#{bare}/"].flat_map { |candidate| @links_to_gids[candidate] || [] }.uniq
  end

  # Collects the absolute, non-BBC, non-Delicious links found in the page at
  # +url+ (BBC artist pages are kept).
  #
  # @param url [String] story page URL
  # @return [Array<Addressable::URI>]
  def external_links_from_news_story(url)
    doc = fetch(url) { |io| Hpricot(io) }
    links = []
    doc.search('//a').each do |link|
      uri = parse_href(link.attributes['href'])
      next if uri.nil? || uri.relative?
      next if ignorable_link?(uri)

      links << uri
    end
    links
  end

  # Parses an href, returning nil when it is missing or malformed so that a
  # single bad link cannot abort the whole run.
  def parse_href(href)
    Addressable::URI.parse(href)
  rescue Addressable::URI::InvalidURIError
    nil
  end

  # True for links that should not be treated as artist references:
  # Delicious links, and BBC links other than artist pages.
  #
  # @param uri [#to_s]
  def ignorable_link?(uri)
    url = uri.to_s
    return true if url =~ DELICIOUS_SITE
    return false unless url =~ BBC_SITE

    (url =~ BBC_ARTIST_PAGE).nil?
  end

  # Opens +location+ and yields the IO to the block, closing it afterwards.
  # Deliberately avoids Kernel#open: feed-supplied strings beginning with
  # "|" would be run as shell commands, and Kernel#open no longer opens URLs
  # on current Ruby. Strings with a URL scheme go through open-uri's
  # URI#open; anything else is treated as a local file path.
  #
  # @param location [#to_s] URL or file path
  # @return the block's value (or the open IO when no block is given)
  def fetch(location, &block)
    location = location.to_s
    if location =~ URL_SCHEME
      uri = URI.parse(location)
      return uri.open(&block) if uri.respond_to?(:open)
    end
    File.open(location, &block)
  end

  # Reads 'artists_urls.txt' and builds the url => [gid, ...] lookup.
  # Blank lines and lines lacking either field are skipped instead of
  # polluting the map with a nil key.
  #
  # @return [Hash{String => Array<String>}]
  def links_to_gids
    mapping = {}
    File.foreach('artists_urls.txt') do |line|
      gid, url = line.chomp.split("\t")
      next if gid.nil? || gid.empty? || url.nil? || url.empty?

      (mapping[url] ||= []) << gid
    end
    mapping
  end
end
# Script entry point: run every BBC news feed below through one shared
# DeliciousMusicNews instance, in the order listed.
bookmarker = DeliciousMusicNews.new('bbcmusicnews', '********')

feed_urls = %w[
  http://newsrss.bbc.co.uk/rss/newsbeat/newsbeat/rss.xml
  http://newsrss.bbc.co.uk/rss/newsbeat/music/rss.xml
  http://newsrss.bbc.co.uk/rss/newsbeat/entertainment/rss.xml
  http://newsrss.bbc.co.uk/rss/newsonline_uk_edition/entertainment/rss.xml
]

feed_urls.each do |feed_url|
  bookmarker.process_feed(feed_url)
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment