Created
October 17, 2010 10:01
-
-
Save m040601/630706 to your computer and use it in GitHub Desktop.
description of this gist - one or more files containing scraps (anki_lookup_sentences_wget_sed_jedict_server etc...)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/sh
# lookup-sentences: append WWWJDIC lookup results to an exported Anki deck.
#
# Usage: ./lookup-sentences FIELD < ./deck-exported.txt > ./deck-toimport.txt
#
# Reads tab-separated lines on stdin. For every line, field number FIELD is
# extracted, HTML tags are stripped from it, and the remaining text is sent to
# the WWWJDIC CGI. The input line is written back unchanged, followed by a tab
# and every "<li> " result line of the response joined with <br>, recoded from
# EUC-JP to UTF-8.
#
# Requires: wget, awk, sed, tr, recode.
# $Revision: 1.3 $ $Date: 2010/02/23 02:36:09 $

# The field number ends up as an awk column index, so insist on digits only.
case "$1" in
    ''|*[!0-9]*)
        echo "usage: $0 FIELD < deck-exported.txt > deck-toimport.txt" >&2
        exit 2
        ;;
esac
field=$1

# IFS= and -r keep leading/trailing tabs and backslashes of the line intact;
# the -n test still processes a final line that lacks a trailing newline.
while IFS= read -r line || [ -n "$line" ]
do
    # -v hands the column number to awk as data rather than splicing it into
    # the program text. [^>]* ends each tag at its own closing bracket, so a
    # later stray ">" in the text no longer swallows what precedes it.
    sentence=$(printf '%s\n' "$line" |
        awk -F'\t' -v col="$field" '{print $col}' |
        sed 's/<[^>]*>//g')

    # printf instead of "echo -ne": echo options are not portable under
    # /bin/sh, and -e would expand backslash sequences found in the deck data.
    printf '%s\t' "$line"

    # NOTE(review): $sentence goes into the query string without explicit
    # URL-encoding or charset conversion - confirm what the server expects.
    wget -O - "http://www.csse.monash.edu.au/~jwb/cgi-bin/wwwjdic.cgi?9MIH$sentence" |
        grep '<li> ' |
        sed 's|.*<li> ||' |
        sed 's|<\/li>$|<br>|' |
        tr -d '\n' |
        tr '\t' ' ' |
        recode EUCJP..u8

    # Terminate the output record.
    echo
done
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| # found this on some guy's gist | |
| # One-off script I wrote to slurp sample sentences off http://smart.fm | |
| require 'rubygems' | |
| require 'open-uri' | |
| require 'hpricot' | |
# Normalize whitespace in +str+: every run of whitespace becomes a single
# space and the result is trimmed at both ends. Returns a new String; the
# argument is left untouched.
def cleanup_text(str)
  # Disabled variant kept from the original for human-readable output:
  #   str = str.gsub /<\/?b>/, ""
  str.gsub(/\s+/, " ").strip
end
# Write one record to +savefile+ in the tab-separated layout used for the
# Anki import file: +original+, a tab, +explanation+, then a newline.
# Newlines inside either field are replaced by "<br>" so that a record always
# occupies exactly one line.
#
# savefile    - any object responding to #puts (e.g. an open File)
# original    - String for the first column
# explanation - String for the second column
def save_sentence(savefile, original, explanation)
  # Human-readable alternative from the original, left disabled:
  #   savefile.puts original + "\n\n" + explanation + "\n\n"
  front, back = [original, explanation].map { |field| field.gsub(/\n/, "<br>") }
  savefile.puts("#{front}\t#{back}\n")
end
# Fetch page +page_num+ of the sentence listing at +source_url+ and append one
# record per "li.sentence_package" element to +savefile+.
#
# For each element the first column is the text of the first "p.text > a"
# link with all whitespace removed; the second column is the transliteration
# and the translation, separated by a newline (save_sentence turns it into
# "<br>").
def download_one_page(source_url, savefile, page_num)
  page = Hpricot(open("#{source_url}?page=#{page_num}"))

  (page / "li.sentence_package").each do |item|
    headline = (item / "p.text > a").first.inner_html
    # cleanup_text first collapses whitespace; the gsub then drops it entirely.
    original = cleanup_text(headline).gsub(/\s/, "")

    reading = cleanup_text((item / "p.transliteration").inner_html)
    meaning = cleanup_text((item / "p.translation > a").inner_html)

    save_sentence(savefile, original, "#{reading}\n#{meaning}")
  end
end
# Download every page of sentences for one smart.fm list into
# "<list_title>.txt" in the current directory.
#
# list_num   - list id used to build the listing URL (converted with #to_s)
# list_title - String used as the output file's base name
#
# Returns nil.
def download_sentences(list_num, list_title)
  source_url = "http://smart.fm/lists/" + list_num.to_s + "/sentences"
  filename = list_title + ".txt"
  puts "Saving " + source_url + " to " + filename

  # Fetch the listing before creating the output file so that a failed
  # request does not leave an empty file behind.
  doc = Hpricot(open(source_url))
  num_pages = (doc/"div.pagination > a:nth-last-of-type(1)").inner_html.to_i
  # to_i yields 0 when the pagination link is missing or not numeric; still
  # fetch the first page in that case instead of silently saving nothing.
  # NOTE(review): presumably short lists render no pagination block - confirm.
  num_pages = 1 if num_pages < 1

  # File.open rather than Kernel#open: list_title is scraped from a remote
  # page, and Kernel#open would spawn a subprocess for a name starting with
  # "|". The block form closes the file even when a page download raises.
  # NOTE(review): a title containing "/" still addresses another directory.
  File.open(filename, "w") do |savefile|
    (1..num_pages).each do |page_num|
      download_one_page(source_url, savefile, page_num)
    end
  end

  nil
end
# Download the sentences of every list linked from the smart.fm series page
# with id +series_num+.
#
# Each "div.list-det > h4 > a" link whose href has the form
# http://smart.fm/lists/<digits>-<title> is passed to download_sentences with
# the digits as list number and the remainder as title.
def download_series(series_num)
  # The dot is escaped so that only the literal host name matches.
  list_link = %r{http://smart\.fm/lists/(\d+)-(.*)}

  doc = Hpricot(open("http://smart.fm/series/" + series_num.to_s))
  (doc/"div.list-det > h4 > a").each do |element|
    # Skip links without a matching href; otherwise $1/$2 would be nil and
    # download_sentences would fail while building the file name.
    next unless list_link =~ element.attributes['href']
    download_sentences($1, $2)
  end
end
# Script entry point: download the two hard-coded series, in this order.
[3318, 3321].each { |series_num| download_series(series_num) }
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment