Created
June 28, 2009 00:11
-
-
Save serhei/137161 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# One-off script I wrote to slurp sample sentences off http://smart.fm
require 'rubygems'
require 'open-uri'
require 'hpricot'
# Normalizes a scraped text fragment so it fits on a single line.
#
# Every run of whitespace (spaces, tabs, newlines) is collapsed into one
# space and leading/trailing whitespace is trimmed. +nil+ is treated as an
# empty string so a missing fragment does not raise.
#
# @param str [String, nil] raw text, e.g. the inner HTML of an element
# @return [String] the whitespace-normalized text
def cleanup_text(str)
  # Enable to also drop <b>/</b> tags for human-readable output:
  # str = str.gsub(/<\/?b>/, "")
  str.to_s.gsub(/\s+/, ' ').strip
end
# Writes one sentence record to +savefile+.
#
# Currently equipped for creating a tab-separated Anki file: each record is
# a single line of the form "original<TAB>explanation". Newlines inside a
# field become "<br>" so the record stays on one line, and literal tabs
# inside a field become spaces so they cannot be mistaken for the field
# separator.
#
# @param savefile [#puts] writable IO-like object receiving the record
# @param original [String] text for the first column
# @param explanation [String] text for the second column
# @return [nil] the result of +savefile.puts+
def save_sentence(savefile, original, explanation)
  # Human-readable alternative:
  # savefile.puts original + "\n\n" + explanation + "\n\n"
  front, back = [original, explanation].map do |field|
    field.tr("\t", ' ').gsub("\n", '<br>')
  end
  # puts appends the terminating newline itself.
  savefile.puts "#{front}\t#{back}"
end
# Fetches one page of a sentence listing and saves every sentence on it.
#
# The page at "<source_url>?page=<page_num>" is parsed with Hpricot. For each
# "li.sentence_package" element the original text, transliteration and
# translation are extracted and written through +save_sentence+, with the
# transliteration and translation joined by a newline as the explanation.
#
# @param source_url [String] listing URL without the page query
# @param savefile [#puts] writable IO-like object passed to +save_sentence+
# @param page_num [Integer, #to_s] page number placed in the query string
# @return [Object] the result of iterating the matched elements
def download_one_page(source_url, savefile, page_num)
  # URI.open is required on Ruby 3.0+, where Kernel#open no longer fetches URLs.
  doc = Hpricot(URI.open("#{source_url}?page=#{page_num}"))

  (doc / 'li.sentence_package').each do |sentence|
    text_link = (sentence / 'p.text > a').first
    unless text_link
      # Without the link there is no original sentence; skip the entry
      # instead of raising NoMethodError on nil and aborting the whole run.
      warn "Skipping sentence without text link on page #{page_num}"
      next
    end

    # All whitespace is removed from the original sentence.
    # NOTE(review): presumably the source language is written without
    # spaces -- confirm before reusing this for other languages.
    original = cleanup_text(text_link.inner_html).gsub(/\s/, '')
    transliteration = cleanup_text((sentence / 'p.transliteration').inner_html)
    translation = cleanup_text((sentence / 'p.translation > a').inner_html)

    save_sentence(savefile, original, "#{transliteration}\n#{translation}")
  end
end
# Downloads every sentence page of one list into "<list_title>.txt".
#
# The first listing page is fetched to read the page count from the last
# pagination link, then each page is handed to +download_one_page+.
#
# @param list_num [Integer, String] list id used in the listing URL
# @param list_title [String] base name of the output file
# @return [nil]
def download_sentences(list_num, list_title)
  source_url = "http://smart.fm/lists/#{list_num}/sentences"
  # NOTE(review): list_title is used verbatim as a file name; callers that
  # derive it from remote data should make sure it contains no path parts.
  filename = "#{list_title}.txt"
  puts "Saving #{source_url} to #{filename}"

  # Fetch the index before touching the output file, so a failed request
  # does not leave a truncated file behind.
  # URI.open is required on Ruby 3.0+, where Kernel#open no longer fetches URLs.
  doc = Hpricot(URI.open(source_url))
  last_page = (doc / 'div.pagination > a:nth-last-of-type(1)').inner_html.to_i

  # to_i yields 0 when no pagination link is found (or its label is not a
  # number); still fetch page 1 in that case rather than saving nothing.
  # NOTE(review): assumes short lists simply omit the pagination block.
  num_pages = [last_page, 1].max

  # File.open (not Kernel#open) with a block: the name is always treated as
  # a plain path and the file is closed even if a page download raises.
  File.open(filename, 'w') do |savefile|
    (1..num_pages).each do |page_num|
      download_one_page(source_url, savefile, page_num)
    end
  end
  nil
end
# Downloads the sentences of every list linked from a series page.
#
# Each "div.list-det > h4 > a" link whose href looks like
# "http://smart.fm/lists/<number>-<title>" is passed to +download_sentences+
# with the number and title captured from the URL.
#
# @param series_num [Integer, String] series id used in the series URL
# @return [Object] the result of iterating the matched links
def download_series(series_num)
  # URI.open is required on Ruby 3.0+, where Kernel#open no longer fetches URLs.
  doc = Hpricot(URI.open("http://smart.fm/series/#{series_num}"))

  (doc / 'div.list-det > h4 > a').each do |element|
    href = element.attributes['href'].to_s
    list_link = %r{http://smart\.fm/lists/(\d+)-(.*)}.match(href)
    # Skip links that are not list URLs; otherwise stale or nil captures
    # from an earlier match would be passed on.
    next unless list_link

    download_sentences(list_link[1], list_link[2])
  end
end
# Script entry point: download the sentences for each series id below.
[3318, 3321].each { |series_num| download_series(series_num) }
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment