Skip to content

Instantly share code, notes, and snippets.

@serhei
Created June 28, 2009 00:11
Show Gist options
  • Save serhei/137161 to your computer and use it in GitHub Desktop.
Save serhei/137161 to your computer and use it in GitHub Desktop.
# One-off script I wrote to slurp sample sentences off http://smart.fm
require 'rubygems'
require 'open-uri'
require 'hpricot'
# Normalizes the whitespace of a scraped HTML fragment.
#
# str - the raw fragment (a String).
#
# Returns a new String in which every run of whitespace (spaces, tabs,
# newlines) is squeezed into a single space and both ends are trimmed.
# Markup such as <b>...</b> is left untouched; for human-readable output it
# could additionally be stripped with gsub(/<\/?b>/, "").
def cleanup_text(str)
  str.gsub(/\s+/, " ").strip
end
# Currently equipped for creating a tab-separated Anki file
# Appends one sentence to savefile as a single tab-separated row.
#
# savefile    - any object responding to puts (e.g. an open File).
# original    - text for the first column.
# explanation - text for the second column.
#
# Newlines inside either field are replaced by "<br>" so that each sentence
# occupies exactly one line of the output. A plain-text alternative would be
# to write the two fields separated by blank lines instead.
def save_sentence(savefile, original, explanation)
  front, back = [original, explanation].map { |field| field.gsub(/\n/, "<br>") }
  savefile.puts "#{front}\t#{back}\n"
end
# Scrapes one page of a sentence listing and appends every sentence found on
# it to savefile.
#
# source_url - base URL of the sentence listing, without a query string.
# savefile   - writable object handed on to save_sentence.
# page_num   - page number, sent as the "page" query parameter.
#
# Each li.sentence_package element produces one row: the sentence text with
# all whitespace removed in the first column, and the transliteration and
# translation (separated by a newline) in the second.
def download_one_page(source_url, savefile, page_num)
  page_url = "#{source_url}?page=#{page_num}"
  # Fetch through URI#open (mixed in by open-uri) rather than Kernel#open:
  # URL support in Kernel#open was deprecated in Ruby 2.7 and removed in 3.0,
  # while URI#open works on old and new interpreters alike. The block form
  # closes the response stream as soon as the document has been parsed.
  doc = URI.parse(page_url).open { |io| Hpricot(io) }

  (doc/"li.sentence_package").each do |sentence|
    original = cleanup_text((sentence/"p.text > a").first.inner_html)
    # The sentence itself is stored without any whitespace at all.
    original = original.gsub(/\s/, "")
    transliteration = cleanup_text((sentence/"p.transliteration").inner_html)
    translation = cleanup_text((sentence/"p.translation > a").inner_html)
    save_sentence(savefile, original, "#{transliteration}\n#{translation}")
  end
end
# Downloads every page of sentences for one list into "<list_title>.txt" in
# the current directory.
#
# list_num   - numeric id of the list (used to build the listing URL).
# list_title - title of the list; used as the output file's base name.
#
# Prints a progress line to standard output before downloading.
def download_sentences(list_num, list_title)
  source_url = "http://smart.fm/lists/#{list_num}/sentences"
  out_path = list_title + ".txt"
  puts "Saving #{source_url} to #{out_path}"

  # File.open, not Kernel#open: the title is scraped from a web page, and
  # Kernel#open would run a subprocess for a name starting with "|". The
  # block form also guarantees the file is closed if a download raises.
  File.open(out_path, "w") do |savefile|
    # URI#open instead of Kernel#open, which no longer accepts URLs on
    # Ruby 3.0+; the block closes the response stream after parsing.
    doc = URI.parse(source_url).open { |io| Hpricot(io) }

    # The page count is read from the pagination links.
    num_pages = (doc/"div.pagination > a:nth-last-of-type(1)").inner_html.to_i
    # to_i yields 0 when no usable pagination link is found (presumably the
    # case for lists that fit on one page -- TODO confirm against the site);
    # fetch the first page anyway instead of writing an empty file.
    num_pages = 1 if num_pages < 1

    (1..num_pages).each do |page_num|
      download_one_page(source_url, savefile, page_num)
    end
  end
end
# Downloads the sentences of every list linked from a series page.
#
# series_num - numeric id of the series on smart.fm.
#
# Each link under "div.list-det > h4" whose href has the form
# http://smart.fm/lists/<number>-<title> is passed to download_sentences as
# (number, title). Links that do not have that form are skipped.
def download_series(series_num)
  series_url = "http://smart.fm/series/#{series_num}"
  # URI#open instead of Kernel#open, which no longer accepts URLs on
  # Ruby 3.0+; the block closes the response stream after parsing.
  doc = URI.parse(series_url).open { |io| Hpricot(io) }

  (doc/"div.list-det > h4 > a").each do |element|
    # Dots are escaped so that only the literal host name matches, and the
    # captures are read from the MatchData rather than from $1/$2.
    list_link = %r{http://smart\.fm/lists/(\d+)-(.*)}.match(element.attributes['href'].to_s)
    # Without this guard a non-matching href would call
    # download_sentences(nil, nil).
    next unless list_link

    download_sentences(list_link[1], list_link[2])
  end
end
# Script entry point: download the sentences for each of these series ids,
# in order.
[3318, 3321].each { |series_num| download_series(series_num) }
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment