m040601 · October 17, 2010 10:01
diff --git a/anki_lookup_sentences_wget_sed_jedict_server.sh b/anki_lookup_sentences_wget_sed_jedict_server.sh
 #!/bin/sh

 # ./lookup-sentences 1 < ./deck-exported.txt > ./deck-toimport.txt
 # $Revision: 1.3 $ $Date: 2010/02/23 02:36:09 $

 # For each tab-separated line on stdin: take field number $1 (the script's
 # first argument), strip HTML tags from it, query the WWWJDIC CGI with the
 # result, and print the original line followed by a tab and the returned
 # <li> entries joined with <br>, converted from EUC-JP to UTF-8.
 while IFS= read -r line
 do
   # IFS= and -r keep leading/trailing whitespace and backslashes intact.
   # The column number is passed with -v instead of being spliced into the
   # awk program text.
   sentence=$(printf '%s\n' "$line" | awk -F'\t' -v col="$1" '{print $col}' | sed 's/<[^<]*>//g')
   # printf instead of `echo -ne`: echo's options are not portable under
   # /bin/sh, and echo would interpret backslash sequences inside the data.
   printf '%s\t' "$line"
   wget -O - "http://www.csse.monash.edu.au/~jwb/cgi-bin/wwwjdic.cgi?9MIH$sentence" | \
     grep '<li> ' | \
     sed 's|.*<li> ||' | \
     sed 's|<\/li>$|<br>|' | \
     tr -d '\n' | \
     tr '\t' ' ' | \
     recode EUCJP..u8
   # Terminate the output record.
   printf '\n'
 done
diff --git a/slurp_sentences_smart.fm.rb b/slurp_sentences_smart.fm.rb
 # Found this in someone else's gist.
 # One-off script I wrote to slurp sample sentences off http://smart.fm

 require 'rubygems'
 require 'open-uri'
 require 'hpricot'

 # Collapses every run of whitespace in +str+ into a single space and trims
 # both ends. Returns a new string; +str+ itself is not modified.
 def cleanup_text(str)
   # Optional extra step for human-readable output (left disabled):
   #   str.gsub(/<\/?b>/, "")
   collapsed = str.gsub(/\s+/, " ")
   collapsed.strip
 end

 # Writes one tab-separated record (the format used for the Anki import
 # file) to +savefile+: +original+, a tab, then +explanation+. Newlines
 # inside either field are replaced with "<br>" so the record stays on a
 # single line.
 def save_sentence(savefile, original, explanation)
   # Human-readable alternative (left disabled):
   #   savefile.puts original + "\n\n" + explanation + "\n\n"
   front, back = [original, explanation].map { |field| field.gsub(/\n/, "<br>") }
   savefile.puts front + "\t" + back + "\n"
 end

 # Fetches page +page_num+ of the sentence listing at +source_url+ and writes
 # one record to +savefile+ for every "li.sentence_package" element on it.
 def download_one_page(source_url, savefile, page_num)
   page_url = source_url + "?page=" + page_num.to_s
   page = Hpricot(open(page_url))
   (page/"li.sentence_package").each do |package|
     text_html = (package/"p.text > a").first.inner_html
     # The original text has all whitespace removed, not merely collapsed.
     original = cleanup_text(text_html).gsub(/\s/, "")
     transliteration = cleanup_text((package/"p.transliteration").inner_html)
     translation = cleanup_text((package/"p.translation > a").inner_html)
     # Transliteration and translation share the second field, one per line.
     save_sentence(savefile, original, transliteration + "\n" + translation)
   end
 end

 # Downloads every page of sentences for one smart.fm list and saves them
 # as tab-separated records in "<list_title>.txt" in the current directory.
 #
 # list_num   - list id used to build the URL (converted with to_s).
 # list_title - String used as the base name of the output file.
 def download_sentences(list_num, list_title)
   source_url = "http://smart.fm/lists/" + list_num.to_s + "/sentences"
   puts "Saving " + source_url + " to " + list_title + ".txt"
   doc = Hpricot(open(source_url))
   # The last pagination link carries the page count. When no such link is
   # found the expression yields 0, so fall back to fetching page 1.
   num_pages = [(doc/"div.pagination > a:nth-last-of-type(1)").inner_html.to_i, 1].max
   # File.open rather than Kernel#open: the title is scraped from a page, and
   # Kernel#open treats a name starting with "|" as a command to run. The
   # block form closes the file even if a page download raises.
   File.open(list_title + ".txt", "w") do |savefile|
     (1..num_pages).each do |page_num|
       download_one_page(source_url, savefile, page_num)
     end
   end
 end

 # Downloads the sentences of every list linked from the smart.fm series
 # page for +series_num+. Links whose href does not look like
 # "http://smart.fm/lists/<digits>-<title>" are skipped.
 def download_series(series_num)
   doc = Hpricot(open("http://smart.fm/series/" + series_num.to_s))
   (doc/"div.list-det > h4 > a").each do |element|
     # to_s guards against a link without an href attribute.
     match = %r{http://smart\.fm/lists/(\d+)-(.*)}.match(element.attributes['href'].to_s)
     # Without this guard a non-matching link would pass nil for both
     # arguments and crash inside download_sentences.
     next unless match
     download_sentences(match[1], match[2])
   end
 end

 # Script entry point: download the two hard-coded series, in this order.
 [3318, 3321].each { |series_num| download_series(series_num) }
	#!/bin/sh

	# ./lookup-sentences 1 < ./deck-exported.txt > ./deck-toimport.txt
	# $Revision: 1.3 $ $Date: 2010/02/23 02:36:09 $

	# For each tab-separated line on stdin: take field number $1 (the script's
	# first argument), strip HTML tags from it, query the WWWJDIC CGI with the
	# result, and print the original line followed by a tab and the returned
	# <li> entries joined with <br>, converted from EUC-JP to UTF-8.
	# The pipes below are real pipes; written as "\|" they would be passed to
	# the commands as literal arguments and no pipeline would be formed.
	while IFS= read -r line
	do
	  # IFS= and -r keep leading/trailing whitespace and backslashes intact.
	  # The column number is passed with -v instead of being spliced into the
	  # awk program text.
	  sentence=$(printf '%s\n' "$line" | awk -F'\t' -v col="$1" '{print $col}' | sed 's/<[^<]*>//g')
	  # printf instead of `echo -ne`: echo's options are not portable under
	  # /bin/sh, and echo would interpret backslash sequences inside the data.
	  printf '%s\t' "$line"
	  wget -O - "http://www.csse.monash.edu.au/~jwb/cgi-bin/wwwjdic.cgi?9MIH$sentence" | \
	    grep '<li> ' | \
	    sed 's|.*<li> ||' | \
	    sed 's|<\/li>$|<br>|' | \
	    tr -d '\n' | \
	    tr '\t' ' ' | \
	    recode EUCJP..u8
	  # Terminate the output record.
	  printf '\n'
	done
	# Found this in someone else's gist.
	# One-off script I wrote to slurp sample sentences off http://smart.fm

	require 'rubygems'
	require 'open-uri'
	require 'hpricot'

	# Collapses every run of whitespace in +str+ into a single space and trims
	# both ends. Returns a new string; +str+ itself is not modified.
	def cleanup_text(str)
	  # Optional extra step for human-readable output (left disabled):
	  #   str.gsub(/<\/?b>/, "")
	  squeezed = str.gsub(/\s+/, " ")
	  squeezed.strip
	end

	# Writes one tab-separated record (the format used for the Anki import
	# file) to +savefile+: +original+, a tab, then +explanation+. Newlines
	# inside either field are replaced with "<br>" so the record stays on a
	# single line.
	def save_sentence(savefile, original, explanation)
	  # Human-readable alternative (left disabled):
	  #   savefile.puts original + "\n\n" + explanation + "\n\n"
	  front, back = [original, explanation].map { |field| field.gsub(/\n/, "<br>") }
	  savefile.puts front + "\t" + back + "\n"
	end

	# Fetches page +page_num+ of the sentence listing at +source_url+ and writes
	# one record to +savefile+ for every "li.sentence_package" element on it.
	def download_one_page(source_url, savefile, page_num)
	  doc = Hpricot(open(source_url + "?page=" + page_num.to_s))
	  # Block parameters use plain pipes; the backslash-escaped form
	  # ("\|sentence\|") does not parse as Ruby.
	  (doc/"li.sentence_package").each do |sentence|
	    original = cleanup_text((sentence/"p.text > a").first.inner_html)
	    # The original text has all whitespace removed, not merely collapsed.
	    original = original.gsub(/\s/, "")
	    transliteration = cleanup_text((sentence/"p.transliteration").inner_html)
	    translation = cleanup_text((sentence/"p.translation > a").inner_html)
	    # Transliteration and translation share the second field, one per line.
	    save_sentence(savefile, original, transliteration + "\n" + translation)
	  end
	end

	# Downloads every page of sentences for one smart.fm list and saves them
	# as tab-separated records in "<list_title>.txt" in the current directory.
	#
	# list_num   - list id used to build the URL (converted with to_s).
	# list_title - String used as the base name of the output file.
	def download_sentences(list_num, list_title)
	  source_url = "http://smart.fm/lists/" + list_num.to_s + "/sentences"
	  puts "Saving " + source_url + " to " + list_title + ".txt"
	  doc = Hpricot(open(source_url))
	  # The last pagination link carries the page count. When no such link is
	  # found the expression yields 0, so fall back to fetching page 1.
	  num_pages = [(doc/"div.pagination > a:nth-last-of-type(1)").inner_html.to_i, 1].max
	  # File.open rather than Kernel#open: the title is scraped from a page, and
	  # Kernel#open treats a name starting with "|" as a command to run. The
	  # block form closes the file even if a page download raises.
	  File.open(list_title + ".txt", "w") do |savefile|
	    (1..num_pages).each do |page_num|
	      download_one_page(source_url, savefile, page_num)
	    end
	  end
	end

	# Downloads the sentences of every list linked from the smart.fm series
	# page for +series_num+. Links whose href does not look like
	# "http://smart.fm/lists/<digits>-<title>" are skipped.
	def download_series(series_num)
	  doc = Hpricot(open("http://smart.fm/series/" + series_num.to_s))
	  # Block parameters use plain pipes; the backslash-escaped form
	  # ("\|element\|") does not parse as Ruby.
	  (doc/"div.list-det > h4 > a").each do |element|
	    # to_s guards against a link without an href attribute.
	    match = %r{http://smart\.fm/lists/(\d+)-(.*)}.match(element.attributes['href'].to_s)
	    # Without this guard a non-matching link would pass nil for both
	    # arguments and crash inside download_sentences.
	    next unless match
	    download_sentences(match[1], match[2])
	  end
	end

	# Script entry point: download the two hard-coded series, in this order.
	[3318, 3321].each { |series_num| download_series(series_num) }