-
-
Save shouya/4571764 to your computer and use it in GitHub Desktop.
#
# This script is a convenience tool that reads ColorDict history from stdin,
# filters all starred vocabulary entries, queries the dictionary for their
# meaning, and then writes a file for Anki to import.
#
# Requires:
#   * sdcv (StarDict command-line version)
#
#
# This script is licensed under the WTFPL.
# Copyright (c) Shou Ya, 2013
#
require 'ap' | |
# Prefix that marks a starred entry in the history read from stdin
# (see read_words). The bytes are kept exactly as in the original script.
STAR_PREFIX = "\xC2\xA1\xC3\x9A ".force_encoding('utf-8').freeze

# Names of the sdcv dictionaries used by query_word / determine_dict.
# Marked "deprecated" in the original script.
# The literal is split on character boundaries so each fragment is valid UTF-8.
E2CDICT_NAME = ("\xE6\x87\xB6\xE8\x9F\xB2\xE7\xB0\xA1\xE6\x98\x8E" \
                "\xE8\x8B\xB1\xE6\xBC\xA2\xE8\xA9\x9E\xE5\x85\xB8")
               .force_encoding('utf-8').freeze
C2EDICT_NAME = ('MDBG Chinese-English' \
                ' dictionary (www.mdbg.net)')
               .force_encoding('utf-8').freeze

# Directory handed to sdcv via --data-dir.
DICT_DIR = "#{ENV['HOME']}/backup/usefuldict/".freeze

# File-name prefix of the per-run backups that record already exported words.
PREVIOUS_OUT_PREFIX = 'prev_'.freeze
# Collects the words recorded by earlier runs.
#
# Reads every regular file in the current directory whose name starts with
# PREVIOUS_OUT_PREFIX, most recently changed first, and returns their lines
# with the line ending removed. Lines starting with '#' and blank lines are
# skipped.
#
# @return [Array<String>] previously recorded words (duplicates are kept)
def get_prev
  Dir.glob("#{PREVIOUS_OUT_PREFIX}*")
     .select { |path| File.file?(path) } # a matching directory would make the read raise
     .sort_by { |path| File.ctime(path) }
     .reverse
     .flat_map do |path|
       File.readlines(path).map(&:chomp).reject do |line|
         line.start_with?('#') || line =~ /\A\s*\z/
       end
     end
end
# Reads the exported history from stdin and extracts the starred words.
#
# A line counts as starred when it begins with STAR_PREFIX. The prefix is
# removed, entries containing non-ASCII characters are dropped, and the rest
# is stripped of surrounding whitespace. Empty entries and repeated words
# are left out so that each word is looked up only once.
#
# @return [Array<String>] unique starred ASCII words, in input order
def read_words
  words = []
  $stdin.each_line do |raw|
    # STAR_PREFIX is UTF-8; tag the input the same way so the comparison
    # cannot raise Encoding::CompatibilityError under a non-UTF-8 locale.
    line = raw.chomp.force_encoding(Encoding::UTF_8)
    next unless line.start_with?(STAR_PREFIX)

    word = line[STAR_PREFIX.length..-1]
    next unless word.ascii_only?

    word = word.strip
    words << word unless word.empty?
  end
  words.uniq
end
# The following function is adapted from:
# http://svn.ruby-lang.org/repos/ruby/trunk/lib/shellwords.rb
# GPLv2 Licensed
#
# Escapes a string so that a Bourne-style shell reads it as one literal
# argument. The argument itself is never modified.
#
# @param str [#to_s] text to escape
# @return [String] a new, escaped string ("''" for empty input)
def shellescape(str)
  str = str.to_s

  # An empty argument will be skipped, so return empty quotes.
  return "''" if str.empty?

  # Put a backslash in front of every character outside the safe set.
  escaped = str.gsub(/[^A-Za-z0-9_\-.,:\/@\n]/) { |char| "\\#{char}" }

  # A LF cannot be escaped with a backslash because a backslash + LF
  # combo is regarded as line continuation and simply ignored.
  escaped.gsub("\n", "'\n'")
end
# Looks a word up with sdcv and returns the result as an HTML fragment.
#
# Echoes the word to stdout, runs sdcv non-interactively against the given
# dictionary in DICT_DIR, discards the first four output lines and the last
# one (presumably sdcv's banner and trailing blank line -- confirm against
# your sdcv version), joins the remaining lines with "<br/>" and replaces
# tabs with spaces.
#
# @param word [String] word to look up
# @param dict [String] dictionary name passed to --use-dict
# @return [String] the formatted definition (empty when sdcv printed too little)
def query_word(word, dict)
  puts word

  # Array form: sdcv is executed directly, without a shell, so the word must
  # be passed as-is. Shell-escaping it here would send literal backslashes.
  command = ['sdcv', '--non-interactive',
             '--utf8-input', '--utf8-output',
             '--data-dir', DICT_DIR,
             '--use-dict', dict,
             word]

  IO.popen(command) do |io|
    io.read.each_line.drop(4)[0..-2].map(&:chomp).join('<br/>').gsub("\t", ' ')
  end
end
# Writes a tab-separated "word<TAB>translation" file.
#
# Words are paired with translations by position. A word with no matching
# translation gets an empty second field rather than raising TypeError.
#
# @param word_list [Array<String>] words, one per output line
# @param trans_list [Array<String, nil>] translations, parallel to word_list
# @param output_filename [String] path of the file to (over)write
def produce_dict(word_list, trans_list, output_filename)
  File.open(output_filename, 'w') do |f|
    word_list.zip(trans_list).each do |word, translation|
      f.puts "#{word}\t#{translation}"
    end
  end
end
# Picks the dictionary for a word: E2CDICT_NAME when the word consists only
# of ASCII characters, C2EDICT_NAME otherwise.
#
# @param word [String] word to classify
# @return [String] dictionary name
def determine_dict(word)
  word.ascii_only? ? E2CDICT_NAME : C2EDICT_NAME
end
# Entry point: turns the starred words read from stdin into an Anki import
# file.
#
# * The output path is ARGV[0], defaulting to "out.txt".
# * Words already listed in earlier backup files (see get_prev) are skipped.
# * Each remaining word is looked up by running ./check_dict.rb; its output
#   becomes one line of the output file.
# * Finally every starred word of this run is written, sorted, to a backup
#   file named PREVIOUS_OUT_PREFIX + <unix time> + '-' + <output base name>
#   in the current directory.
#
# The older sdcv-based path (query_word / determine_dict / produce_dict) is
# not used here.
def main
  output_filename = ARGV[0] || 'out.txt'

  new_word_list = read_words
  word_list = new_word_list - get_prev
  word_count = word_list.length

  File.open(output_filename, 'w') do |f|
    word_list.each.with_index(1) do |word, current|
      # Plain interpolation: a '%' inside the word must not be treated as a
      # format directive.
      puts "producing #{word} ... (#{current}/#{word_count})"

      # Array form runs the helper without a shell, so quotes, '$' or
      # backticks in the word cannot break or inject into the command.
      record = IO.popen(['ruby', './check_dict.rb', word], &:read).chomp
      if record.empty?
        warn "no result for #{word}, skipped"
        next
      end
      f.puts record
    end
  end

  # Use only the base name so an output path with a directory part still
  # yields a valid backup name that get_prev's glob can find.
  backup_filename = "#{PREVIOUS_OUT_PREFIX}#{Time.now.to_i}-#{File.basename(output_filename)}"
  File.write(backup_filename, new_word_list.map { |x| "#{x}\n" }.sort.join)
end
# Run main only when this file is executed directly, not when it is required.
main if $PROGRAM_NAME == __FILE__
# | |
# check_dict.rb -- looks one word up on dictionary.reference.com and prints a
# single line with three tab-separated fields: the word, the cleaned-up
# header markup and the cleaned-up body markup of the first result.
#
# Usage: ruby check_dict.rb WORD
require 'open-uri'
require 'nokogiri'
require 'ap'

DICTIONARY_URL = 'http://dictionary.reference.com/browse/%s'

# Elements removed from the header and the body before printing.
HEADER_NOISE = ['.pronset span[audio]', '.pronset .ipapr',
                '.pronset .questionmark', '.pronset .pron_toggle'].freeze
BODY_NOISE = ['#rltqns', '#topfcrds', '#fcrds', '.fwcrp', '.tail'].freeze

# Inline event-handler attributes stripped from every node.
HEADER_EVENT_ATTRS = %w[onmouseover onmouseout onclick].freeze
BODY_EVENT_ATTRS = %w[onmouseover onmousedown onmouseout onclick].freeze

word = ARGV[0]
abort 'usage: ruby check_dict.rb WORD' if word.nil? || word.strip.empty?

# Percent-encode the word so spaces and punctuation give a valid URL path
# (encode_www_form_component writes spaces as '+', which is only right in a
# query string).
escaped_word = URI.encode_www_form_component(word.strip).gsub('+', '%20')

# URI.open is required on Ruby 3+, where Kernel#open no longer accepts URLs;
# the block form closes the response once it has been parsed.
html = URI.open(DICTIONARY_URL % escaped_word) { |page| Nokogiri::HTML(page) }

exp = html.css('.results_content').first
header = exp && exp.css('.header').first
body = exp && exp.css('.body').first
abort "no dictionary entry found for #{word.strip.inspect}" unless header && body

HEADER_NOISE.each { |selector| header.css(selector).remove }
header.traverse do |ele|
  HEADER_EVENT_ATTRS.each { |attr| ele.remove_attribute(attr) }
end

BODY_NOISE.each { |selector| body.css(selector).remove }
body.traverse do |ele|
  BODY_EVENT_ATTRS.each { |attr| ele.remove_attribute(attr) }
end

# Newlines and tabs would break the one-record-per-line, tab-separated output.
one_line = lambda { |text| text.gsub("\n", '').gsub("\t", ' ') }

puts [one_line.call(word),
      one_line.call(header.to_xml),
      one_line.call(body.to_xml)].join("\t")
/* Card-level styles */
.header {
  text-align: center;
}
.card {
  font-family: "verdana";
  font-size: small;
}
/* Generic text styles for .lunatext content */
.lunatext, .lunatext *, .lunatext a {
  font-family: Verdana, Arial, Helvetica, sans-serif;
  font-size: 13px;
  color: #333333;
  line-height: 1.25em;
  margin: 0;
  padding: 0;
}
.lunatext a {
  text-decoration:underline;
}
.lunatext a:hover {
  color:#0050BB;
}
.lunatext a:visited {
  color:#663399;
}
.lunatext p {
  margin:0 0 1em;
}
.lunatext ol {
  margin-left:2em;
}
.lunatext ol ol {
  margin-bottom:0;
}
.lunatext ul, .lunatext ol {
  font-size:0.925em;
  margin-bottom:1.5em;
  padding-left:0.75em;
}
.lunatext ul li {
  background:transparent url(http://cache.lexico.com/g/d/bullet_gray.gif) no-repeat scroll 0 0.5em;
  list-style-image:none;
  list-style-position:outside;
  list-style-type:none;
  margin-bottom:0.33em;
  padding-left:8px;
}
.lunatext ul ul, .lunatext ul ol, .lunatext ol ul, .lunatext ol ol {
  font-size:1em;
}
.lunatext blockquote {
  background:#EFEFEF none repeat scroll 0 50%;
  border:1px solid #DCDCDC;
  margin:1em 0;
  padding:1em;
  text-align:left;
}
/* END OF NONE NEED STUFF FOR LUNA */
/* luna HTML project styles */
div.luna-Ent, div.luna-Pronkey {
  background-color: #FFF;
  color: #333333;
  display: block;
  font-family: Verdana, Arial, Helvetica, sans-serif;
  font-size: 1em;
  padding-bottom:5px;
}
div.dndata {
  padding-left: 37px;
}
.tail, .body {
  margin-bottom: 0em;
  margin-left: 0em;
  margin-right: 0;
  margin-top: 0em;
  color: #333333;
}
.tail {
  padding-top: 20px;
}
h2.me {
  font-family: "Arial Unicode MS", Arial, Helvetica, sans-serif;
  font-size: 18px;
  display: inline;
  color: #000000;
}
.luna-Img {
  vertical-align: text-top;
}
.luna-Ent a:link, .luna-Pronkey a:link, .luna-Ent a:visited, .luna-Pronkey a:visited, .x {
  color: #333333;
  text-decoration: underline;
}
.luna-Ent a:hover, .luna-Pronkey a:hover {
  color: #0055BB;
  text-decoration: underline;
}
.pg {
  font-weight: bold;
  color: #333333;
  font-family: Verdana, Arial, Helvetica, sans-serif;
  display: inline;
  font-style: italic;
  font-size: 13px;
  padding-right: 3px;
}
.rom-inline {
  color: #333333;
  display: inline;
  font-style: normal;
  font-weight: normal;
}
.labset {
  color: #333333;
  display: inline;
  font-style: normal;
}
.secondary-bf {
  font-size: 13px;
  color: #333333;
  display: inline;
  font-weight: bold;
}
.body .rom-inline {
  display: inline;
  font-style: normal;
  font-weight: normal;
}
.body .pg {
  color: #333333;
  font-style: italic;
}
.body .sectionlabel {
  color: #333333;
  font-style: italic;
  display: block;
  padding-top: 15px;
}
.body .secondary-bf {
  color: #333333;
}
table.luna-Ent {
  font-size: 13px;
  background-color: #FFF;
  color: #333333;
  display: block;
  padding-bottom: 0;
  width: 100%;
}
table.luna-Ent td {
  text-align: left;
  vertical-align: top;
}
table.luna-Ent td.dn {
  width: 1.5em;
}
table.luna-Ent td span.labset {
  color: #333333;
}
td span.secondary-bf {
  color: #333333;
}
/* Pronunciation display */
.show_ipapr .pron {
  font-size: 1.1em;
  color: #333333;
  font-family: "Arial Unicode MS", "Lucida Grande", Verdana, Arial, Helvetica, sans-serif;
  display: inline;
}
.show_rhpr .pron {
  font-size: 13px;
  color: #333333;
  font-family: Verdana, Arial, Helvetica, sans-serif;
  display: inline;
}
.show_spellpr .pron {
  font-size: 13px;
  color: #333333;
  font-family: Verdana, Arial, Helvetica, sans-serif;
  display: inline;
}
.pronlink {
  font-size: 13px;
  color: #333333;
  cursor: pointer;
  text-decoration: underline;
}
.pronset {
  color: #333333;
}
.pronset iframe{
  position : relative;
  top: 4px;
  left: 5px;
}
.pronset div{
  padding-bottom: 4px;
}
.me {
  display: inline;
  font-weight: bold;
}
.homno {
  font-family: Verdana, Arial, Helvetica, sans-serif;
  font-size:10px;
  display: inline;
  color: #333333;
  vertical-align: top;
}
.dnindex {
  font-weight: bold;
  color: #7B7B7B;
  display:block;
  float:left;
  width:28px;
}
.luna-Nested {
  position:relative;
}
.luna-Nested .dndata {
  padding-left: 30px;
}
.luna-Nested span {
  /* underscore-prefixed property: legacy IE-only hack, ignored by other browsers */
  _left:-28px;
}
.dn {
  color: #333333;
  display: inline;
}
.ital-inline {
  display: inline;
  font-style: italic;
  font-family: Georgia, Verdana, Arial, Helvetica, sans-serif; /*georgia for example sent. */
}
.sc {
  display: inline;
  font-variant: small-caps;
}
.tail .rom-inline {
  color: #333333;
  display: inline;
  font-style: normal;
  font-weight: normal;
}
.prondelim {
  color: #333333;
  font-family: Verdana, Arial, Helvetica, sans-serif;
}
.sectionLabel {
  font-weight: bold;
  color: #333333;
  font-family: Verdana, Arial, Helvetica, sans-serif;
  display: inline;
  font-style: italic;
}
.shape {
  display: inline;
  /* font-family: Helvetica, Arial, sans-serif;*/
}
sub {
  position: relative;
}
table.luna-Pronkey {
  display: block;
  width: 100%;
}
table.luna-Pronkey td {
  white-space: nowrap;
}
table.luna-Pronkey td.pr {
  width: 3.5em;
}
caption.luna-Pronkey {
  display: inline;
  font-weight: bold;
  text-align: left;
}
div.luna-Pronkey .info {
  display: block;
  font-size: .9em;
}
.indefinitionword {
  font-size: 13px;
  color: #000000;
  display: inline;
  font-weight: bold;
}
span.x sup {
  font-size: .75em;
}
.ety .ital-inline {
  font-family: "Arial Unicode MS", "Lucida Grande", Verdana, Arial, Helvetica, sans-serif;
}
hr.ety {
  display: none;
}
.boldface {
  font-weight: 700;
}
.lightface {
  font-weight: 100;
}
@Luxor This version could currently work without sdcv support :)
First, export the query history from ColorDict to a file, in any way you like. What I do is share it by email to my own mailbox and then copy the text into a file on my PC.
Put the two Ruby scripts in the same directory, and then run:
$ cat /tmp/colordict-export-history.txt | ruby ./anki_helper.rb
It will produce an out.txt and a prev_xxxx.txt. You can leave the latter alone: the next time you import, the script automatically skips the words already listed in that file. You can now give out.txt to Anki to import. Each line has three tab-separated fields: the original word, the front (including syllables and pronunciation), and the back. In addition, you might need a new note type for this kind of flashcard, since it needs an extra stylesheet, which I've posted as 'note_style.css'.
Have fun :)
Hello,
Can you provide an example of usage?
As far as I know, ColorDict stores its history in an SQLite database and runs on Android.
Thanks