Skip to content

Instantly share code, notes, and snippets.

@bchase
Created August 3, 2018 09:01
Show Gist options
  • Save bchase/f9738b9b28a198bfd16ce0309b050466 to your computer and use it in GitHub Desktop.
Save bchase/f9738b9b28a198bfd16ce0309b050466 to your computer and use it in GitHub Desktop.
#!/usr/bin/env ruby
require 'pathname'
require 'uri'
# Tiny HTML string builders, written as instance methods so the module can be
# mixed into a class with `include HTML`.
#
# Text and attribute values are interpolated as given (nothing is escaped),
# which is what allows the helpers to be nested: td(ul(li("x"))).
module HTML
  # Wraps +txt+ in a <table> element.
  def table(txt)
    node('table', txt)
  end

  # Wraps +txt+ in a <tr> element.
  def tr(txt)
    node('tr', txt)
  end

  # Wraps +txt+ in a <td> element.
  def td(txt)
    node('td', txt)
  end

  # Wraps +txt+ in a <ul> element.
  def ul(txt)
    node('ul', txt)
  end

  # Wraps +txt+ in an <li> element.
  def li(txt)
    node('li', txt)
  end

  # Wraps +txt+ in an <em> element.
  def em(txt)
    node('em', txt)
  end

  # Wraps +txt+ in an <a> element carrying the given attributes
  # (e.g. href: "https://...").
  def a(txt, attrs)
    node('a', txt, attrs)
  end

  # Returns the HTML entity for a non-breaking space. The entity is spelled
  # out rather than embedding a raw whitespace character, which cannot be told
  # apart from an ordinary space in the source.
  def nbsp
    '&nbsp;'
  end

  # Returns a self-closing line-break tag.
  def br
    '<br/>'
  end

  # Builds "<tag k="v" ...>txt</tag>".
  #
  # tag   - element name
  # txt   - inner content; anything that responds to to_s
  # attrs - optional Hash of attribute name => value, rendered in hash order
  def node(tag, txt, attrs = {})
    # Each attribute carries its own leading space, so an empty Hash yields "".
    attr_text = attrs.map { |name, value| %( #{name}="#{value}") }.join
    "<#{tag}#{attr_text}>#{txt}</#{tag}>"
  end
end
# Keyword codes that are treated as tags when they appear in parentheses at the
# start of a definition (see the tag regex in AedictNotepad::Entry::Definition.parse).
# Presumably the EDICT/JMdict tag vocabulary used by Aedict -- TODO confirm.
# A few codes are listed more than once; that is harmless for regex alternation.
# Frozen so the shared list cannot be mutated by accident.
AEDICT_KEYWORDS = %w[gikun go ik jouyou kan kanyou kun name oik ok on rad tou uK ateji iK ik io oK gA gai ichi news nf spec abbr aphorism arch chn col company derog eK fam fem given hon hum id joc m-sl male obs obsc on-mim organization person place poet pol product proverb quote rare sens sl station surname uk unclass vulg work X yoji adj-f adj-i adj-ix adj-kari adj-ku adj-na adj-nari adj-no adj-pn adj-shiku adj-t adv adv-to aux aux-adj aux-v conj cop-da ctr exp int n n-adv n-pr n-pref n-suf n-t num pn pref prt suf unc v-unspec v1 v1-s v2a-s v2b-k v2b-s v2d-k v2d-s v2g-k v2g-s v2h-k v2h-s v2k-k v2k-s v2m-k v2m-s v2n-s v2r-k v2r-s v2s-s v2t-k v2t-s v2w-s v2y-k v2y-s v2z-s v4b v4g v4h v4k v4m v4n v4r v4s v4t v5aru v5b v5g v5k v5k-s v5m v5n v5r v5r-i v5s v5t v5u v5u-s v5uru vi vk vn vr vs vs-c vs-i vs-s vt vz anat archit astron baseb biol bot Buddh bus chem comp econ engr finc food geol geom law ling MA mahj math med mil music physics Shinto shogi sports sumo zool hob ksb ktb kyb kyu nab osb rkb std thb tsb tsug eng afr ain alg amh ara arn bnt bre bul bur chi chn cze dan dut epo est fil fin fre geo ger glg grc gre haw heb hin hun ice ind ita khm kor kur lat mal mao may mnc mol mon nor per pol por rum rus san scr slo slv som spa swa swe tah tam tha tib tur urd vie yid equ expl fig lit jmdict jmnedict test A D R ant cf ex kvar pref see syn uses].freeze
# Converts the text of an Aedict notepad export into tab-separated rows
# (one "front<TAB>back" row per entry) built by AnkiCard.
class AedictNotepad
  # Reads the file at +filepath+ and returns the TSV text.
  #
  # The first line consisting solely of a bracketed name (e.g. "[Default]") is
  # dropped, blank and whitespace-only lines are skipped, and every remaining
  # line is parsed as an entry.
  #
  # Raises ArgumentError (from Entry.parse) when a line is not a valid entry.
  def self.to_anki_tsv(filepath)
    Pathname.new(filepath).read
      .sub(/^\[(\w|\s)+\]\n?$/, '')
      .split(/\n/)
      .reject { |line| line.strip.empty? }
      .map { |line| AnkiCard.new(entry: Entry.parse(line)).tab_separated }
      .join("\n")
  end

  # One notepad line: an optional kanji form, a kana reading, a "common word"
  # flag and one or more definitions.
  class Entry
    # "<kanji> [<kana>]: <definitions>" with an optional trailing "(P)" marker.
    LINE_RE = %r{^(.+?)\s*\[(.+)\]:\s*(.+?)([(]P[)])?$}
    # Start of a numbered sense: an optional "(pos) " group followed by "(N)".
    MULTI_GLOSS_INITIAL_RE = %r{([(][^/]+?[)] )?[(]\d+[)]}

    attr_reader :kanji, :kana, :common, :defns

    def initialize(kanji:, kana:, common:, defns:)
      @kanji = kanji
      @kana = kana
      @common = common
      @defns = defns
    end

    # Parses a single notepad line into an Entry.
    #
    # A kanji field that is literally "null" becomes nil. +common+ is true when
    # the line ends with "(P)".
    #
    # Raises ArgumentError when +str+ does not have the expected shape.
    def self.parse(str)
      m = str.match(LINE_RE)
      raise ArgumentError, "unparseable Aedict notepad line: #{str.inspect}" unless m

      # Put every numbered sense on its own line, then split on those lines and
      # drop the "/" separator left dangling at the end of each sense.
      raw_defns = m[3]
        .gsub(MULTI_GLOSS_INITIAL_RE, "\n\\0")
        .sub(/^\n/, '')
        .split("\n")
        .map { |sense| sense.chomp('/') }

      defns =
        if raw_defns.length == 1
          # An unnumbered definition is treated as sense number 1.
          [Definition.parse(Definition::SINGLE_RE, raw_defns.first, num: 1)]
        else
          raw_defns.map { |sense| Definition.parse(Definition::MULTI_RE, sense) }
        end

      new(
        kanji: m[1] == "null" ? nil : m[1],
        kana: m[2],
        common: !!m[4],
        defns: defns
      )
    end

    # One sense of an entry: its number, part-of-speech codes, tag codes and
    # the list of glosses.
    class Definition
      # "(pos) glosses" -- an unnumbered definition.
      SINGLE_RE = %r{([(](?<pos>[^)]+?)[)] )?(?<rest>.+)}
      # "(pos) (N) glosses" -- a numbered sense.
      MULTI_RE = %r{([(](?<pos>[^)]+?)[)] )?[(](?<num>\d+)[)] (?<rest>.+)}

      attr_reader :num, :glosses, :pos, :tags

      # +num+ may be nil (treated as 1) or anything that responds to to_i.
      def initialize(num:, glosses:, pos:, tags:)
        @num = num.nil? ? 1 : num.to_i
        @glosses = glosses
        @pos = pos
        @tags = tags
      end

      # Parses one definition string.
      #
      # regex - SINGLE_RE or MULTI_RE
      # str   - the definition text
      # num   - sense number to use instead of the regex's +num+ capture;
      #         required with SINGLE_RE, which has no such capture
      #
      # +pos+ and +tags+ are nil when the corresponding parenthesised groups
      # are absent.
      #
      # Raises ArgumentError when +str+ does not match +regex+.
      def self.parse(regex, str, num: nil)
        m1 = str.match(regex)
        raise ArgumentError, "unparseable definition: #{str.inspect}" unless m1

        m2 = m1[:rest].match(tagged_rest_re)
        parens = m2[:parens]
        tags = parens && parens.scan(%r{[(]([^)]+)[)]}).flatten.flat_map { |tag| tag.split(',') }

        new(
          # `num ||` must short-circuit: SINGLE_RE has no :num group to read.
          num: num || m1[:num],
          pos: m1[:pos]&.split(','),
          tags: tags,
          glosses: m2[:glosses].split('/')
        )
      end

      # Regex separating leading "(keyword) " tag groups from the glosses.
      # Built lazily, once, because the keyword alternation is large and
      # AEDICT_KEYWORDS is defined outside this class.
      def self.tagged_rest_re
        @tagged_rest_re ||= begin
          keywords = Regexp.union(AEDICT_KEYWORDS).source
          %r{(?<parens>([(](#{keywords})[)] )+)?(?<glosses>.+)}
        end
      end
      private_class_method :tagged_rest_re
    end
  end
end
# Renders one parsed entry as the front and back fields of a flash card,
# using the HTML helper module for markup.
class AnkiCard
  include HTML

  attr_reader :entry

  # entry - object exposing kanji, kana, common and defns
  #         (as AedictNotepad::Entry does)
  def initialize(entry:)
    @entry = entry
  end

  # Returns the card as a single "front<TAB>back" row.
  def tab_separated
    "#{front}\t#{back}"
  end

  # Front of the card: the kanji form, or the kana reading when there is none.
  def front
    entry.kanji || entry.kana
  end

  # Back of the card: a heading (kanji, "[kana]" and an emphasised "(P)" for
  # common words), the definitions table and the dictionary links, separated
  # by two line breaks.
  def back
    heading = [
      entry.kanji,
      "[#{entry.kana}]",
      (em("(P)") if entry.common)
    ].compact.join(nbsp)

    [heading, defns_table, dict_links].compact.join(br * 2)
  end

  # Table with one row per definition: number, "(pos)", "[tags]" and a
  # bulleted list of glosses. Missing pos/tags render as empty cells.
  def defns_table
    rows = entry.defns.map do |defn|
      pos = defn.pos.nil? ? "" : "(#{defn.pos.join(',')})"
      tags = defn.tags.nil? ? "" : "[#{defn.tags.join(',')}]"
      glosses = ul(defn.glosses.map { |gloss| li(gloss) }.join(''))

      tr([td(defn.num), td(pos), td(tags), td(glosses)].join(''))
    end

    table(rows.join(''))
  end

  # Links that look the word up (kanji form if present, otherwise kana) in
  # several online dictionaries.
  def dict_links
    txt = entry.kanji || entry.kana
    [
      a("Jisho", href: jisho(txt)),
      a("goo", href: goo(txt)),
      a("Weblio JJ", href: weblio_jj(txt)),
      a("Weblio JE", href: weblio_je(txt)),
    ].join(nbsp)
  end

  def weblio_je(txt)
    "https://ejje.weblio.jp/content/#{path_escape(txt)}"
  end

  def weblio_jj(txt)
    "https://www.weblio.jp/content/#{path_escape(txt)}"
  end

  def goo(txt)
    "https://dictionary.goo.ne.jp/srch/jn/#{path_escape(txt)}/m0u/"
  end

  def jisho(txt)
    "https://jisho.org/search/#{path_escape(txt)}"
  end

  private

  # Percent-encodes +txt+ for use as a single URL path segment.
  #
  # Replaces URI.encode, which was removed in Ruby 3.0. Form encoding writes a
  # space as "+", which is only meaningful in query strings, so it is turned
  # into "%20"; a literal "+" in the input is already emitted as "%2B" and is
  # therefore unaffected.
  def path_escape(txt)
    URI.encode_www_form_component(txt).gsub('+', '%20')
  end
end
# Script entry point: convert the notepad file named by the first command-line
# argument and write the result next to it as
# "<input without extension>-<unix timestamp>.tsv".
# Without an argument, Pathname.new(nil) would fail with a TypeError, so stop
# early with a usage message instead.
abort "usage: #{File.basename($PROGRAM_NAME)} NOTEPAD_FILE" if ARGV.empty?

input = Pathname.new(ARGV[0])
output = Pathname.new("#{input.sub_ext ''}-#{Time.now.to_i}.tsv")
output.write AedictNotepad.to_anki_tsv(input)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment