Created
August 3, 2018 09:01
-
-
Save bchase/f9738b9b28a198bfd16ce0309b050466 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env ruby
require 'erb'
require 'pathname'
require 'uri'
# Tiny HTML-building mixin. Every helper returns the markup as a String;
# +txt+ is inserted verbatim (it is not escaped), which is what allows the
# helpers to be nested, e.g. ul(li('a') + li('b')).
module HTML
  # @param txt [#to_s] inner markup
  # @return [String] a <table> element wrapping +txt+
  def table(txt)
    node('table', txt)
  end

  # @return [String] a <tr> element wrapping +txt+
  def tr(txt)
    node('tr', txt)
  end

  # @return [String] a <td> element wrapping +txt+
  def td(txt)
    node('td', txt)
  end

  # @return [String] a <ul> element wrapping +txt+
  def ul(txt)
    node('ul', txt)
  end

  # @return [String] an <li> element wrapping +txt+
  def li(txt)
    node('li', txt)
  end

  # @return [String] an <em> element wrapping +txt+
  def em(txt)
    node('em', txt)
  end

  # @param txt [#to_s] link text
  # @param attrs [Hash] attributes for the anchor, typically +href:+
  # @return [String] an <a> element
  def a(txt, attrs = {})
    node('a', txt, attrs)
  end

  # Uses the named entity rather than a literal space character so the
  # separator survives HTML whitespace collapsing.
  # @return [String] the non-breaking-space entity
  def nbsp
    '&nbsp;'
  end

  # @return [String] a self-closing line break
  def br
    '<br/>'
  end

  # Generic element builder used by all the tag helpers above.
  #
  # @param tag [String] element name
  # @param txt [#to_s] inner markup, inserted as-is
  # @param attrs [Hash] attribute name => value pairs, rendered in hash order
  # @return [String] "<tag k="v" ...>txt</tag>"
  def node(tag, txt, attrs = {})
    # Each attribute carries its own leading space, so an empty hash
    # renders as nothing at all.
    rendered = attrs.map { |name, value| %( #{name}="#{value}") }.join
    "<#{tag}#{rendered}>#{txt}</#{tag}>"
  end
end
# Marker codes that may appear in parentheses in front of a gloss, e.g. "(uk) ".
# Definition parsing treats a leading run of these as tags rather than as part
# of the gloss text. The codes look like JMdict/EDICT entity abbreviations
# (presumably copied from Aedict's legend -- confirm against the app).
# Frozen so the shared list cannot be mutated at runtime.
AEDICT_KEYWORDS = %w[
  gikun go ik jouyou kan kanyou kun name oik ok on rad tou uK
  ateji iK ik io oK gA gai ichi news nf spec
  abbr aphorism arch chn col company derog eK fam fem given hon hum id joc
  m-sl male obs obsc on-mim organization person place poet pol product
  proverb quote rare sens sl station surname uk unclass vulg work X yoji
  adj-f adj-i adj-ix adj-kari adj-ku adj-na adj-nari adj-no adj-pn adj-shiku
  adj-t adv adv-to aux aux-adj aux-v conj cop-da ctr exp int
  n n-adv n-pr n-pref n-suf n-t num pn pref prt suf unc
  v-unspec v1 v1-s v2a-s v2b-k v2b-s v2d-k v2d-s v2g-k v2g-s v2h-k v2h-s
  v2k-k v2k-s v2m-k v2m-s v2n-s v2r-k v2r-s v2s-s v2t-k v2t-s v2w-s v2y-k
  v2y-s v2z-s v4b v4g v4h v4k v4m v4n v4r v4s v4t
  v5aru v5b v5g v5k v5k-s v5m v5n v5r v5r-i v5s v5t v5u v5u-s v5uru
  vi vk vn vr vs vs-c vs-i vs-s vt vz
  anat archit astron baseb biol bot Buddh bus chem comp econ engr finc food
  geol geom law ling MA mahj math med mil music physics Shinto shogi sports
  sumo zool
  hob ksb ktb kyb kyu nab osb rkb std thb tsb tsug
  eng afr ain alg amh ara arn bnt bre bul bur chi chn cze dan dut epo est
  fil fin fre geo ger glg grc gre haw heb hin hun ice ind ita khm kor kur
  lat mal mao may mnc mol mon nor per pol por rum rus san scr slo slv som
  spa swa swe tah tam tha tib tur urd vie yid
  equ expl fig lit jmdict jmnedict test A D R ant cf ex kvar pref see syn
  uses
].freeze
# Parses an Aedict notepad export (one dictionary entry per line, optionally
# preceded by a "[Category]" header line) and renders it as Anki TSV.
class AedictNotepad
  # Converts the notepad export at +filepath+ into tab-separated Anki rows.
  #
  # @param filepath [String, Pathname] path of the exported notepad file
  # @return [String] one "front<TAB>back" row per entry, joined with "\n"
  # @raise [ArgumentError] if a non-empty line is not a recognisable entry
  def self.to_anki_tsv(filepath)
    # Normalise CRLF / CR line endings first: the header pattern below is
    # anchored with "$", which does not match in front of a "\r".
    text = Pathname.new(filepath).read.gsub(/\r\n?/, "\n")
    entry_lines = text.sub(/^\[(\w|\s)+\]\n?$/, '')
                      .split("\n")
                      .map(&:strip)
                      .reject(&:empty?)
    entry_lines
      .map { |line| AnkiCard.new(entry: Entry.parse(line)).tab_separated }
      .join("\n")
  end

  # One notepad line: headword, reading, "common word" flag and definitions.
  class Entry
    # "KANJI [KANA]: DEFINITIONS(P)" -- the trailing "(P)" is optional.
    LINE_RE = %r{^(.+?)\s*\[(.+)\]:\s*(.+?)([(]P[)])?$}
    # Start of a numbered sense: an optional "(pos) " prefix, then "(N)".
    SENSE_START_RE = %r{([(][^/]+?[)] )?[(]\d+[)]}

    attr_reader :kanji, :kana, :common, :defns

    # @param kanji [String, nil] headword, nil for kana-only entries
    # @param kana [String] reading
    # @param common [Boolean] whether the line carried the "(P)" marker
    # @param defns [Array<Definition>] parsed senses
    def initialize(kanji:, kana:, common:, defns:)
      @kanji = kanji
      @kana = kana
      @common = common
      @defns = defns
    end

    # Parses a single notepad line.
    #
    # @param str [String] one line of the export; surrounding whitespace
    #   (including a stray "\r") is ignored
    # @return [Entry]
    # @raise [ArgumentError] if the line does not look like an entry
    def self.parse(str)
      line = str.strip
      m = line.match(LINE_RE)
      raise ArgumentError, "unrecognised Aedict notepad line: #{line.inspect}" unless m

      # Put a newline in front of every numbered sense so the senses can be
      # split apart, then drop the "/" separator left at the end of each one.
      raw_defns = m[3].gsub(SENSE_START_RE, "\n\\0")
                      .sub(/\A\n/, '')
                      .split("\n")
                      .map { |sense| sense.chomp('/') }

      defns =
        if raw_defns.length == 1
          # A lone definition carries no "(N)" marker; number it 1.
          [Definition.parse(Definition::SINGLE_RE, raw_defns.first, num: 1)]
        else
          raw_defns.map { |sense| Definition.parse(Definition::MULTI_RE, sense) }
        end

      new(
        kanji: (m[1] == 'null' ? nil : m[1]), # the export writes "null" when there is no kanji
        kana: m[2],
        common: !m[4].nil?,
        defns: defns
      )
    end

    # One sense of an entry: its number, part-of-speech codes, tags and glosses.
    class Definition
      SINGLE_RE = %r{([(](?<pos>[^)]+?)[)] )?(?<rest>.+)}
      MULTI_RE = %r{([(](?<pos>[^)]+?)[)] )?[(](?<num>\d+)[)] (?<rest>.+)}

      attr_reader :num, :glosses, :pos, :tags

      # @param num [Integer, String, nil] sense number; nil means 1
      # @param glosses [Array<String>] translations
      # @param pos [Array<String>, nil] part-of-speech codes
      # @param tags [Array<String>, nil] keyword tags found before the glosses
      def initialize(num:, glosses:, pos:, tags:)
        @num = (num || 1).to_i
        @glosses = glosses
        @pos = pos
        @tags = tags
      end

      # Parses one sense.
      #
      # @param regex [Regexp] SINGLE_RE or MULTI_RE
      # @param str [String] sense text without the trailing "/"
      # @param num [Integer, nil] overrides the number captured by +regex+
      # @return [Definition]
      # @raise [ArgumentError] if +str+ does not match +regex+
      def self.parse(regex, str, num: nil)
        m1 = str.match(regex)
        raise ArgumentError, "unrecognised definition: #{str.inspect}" unless m1

        # :rest is never empty and :glosses accepts any non-empty text, so
        # this second match always succeeds.
        m2 = m1[:rest].match(tagged_glosses_re)
        tags = m2[:parens].scan(/[(]([^)]+)[)]/).flatten.flat_map { |t| t.split(',') } if m2[:parens]

        new(
          num: num || m1[:num],
          pos: m1[:pos]&.split(','),
          tags: tags,
          glosses: m2[:glosses].split('/')
        )
      end

      # Splits "(kw) (kw) glosses" into the keyword run and the gloss text.
      # Built lazily and cached so the large alternation is compiled once
      # instead of once per definition.
      def self.tagged_glosses_re
        @tagged_glosses_re ||=
          %r{(?<parens>([(](#{Regexp.union(AEDICT_KEYWORDS)})[)] )+)?(?<glosses>.+)}
      end
      private_class_method :tagged_glosses_re
    end
  end
end
# Renders a parsed notepad entry as the two fields of an Anki note.
# The entry is expected to respond to #kanji, #kana, #common and #defns.
class AnkiCard
  include HTML

  attr_reader :entry

  # @param entry [#kanji, #kana, #common, #defns] the entry to render
  def initialize(entry:)
    @entry = entry
  end

  # @return [String] "front<TAB>back", one row of the TSV import file
  def tab_separated
    "#{front}\t#{back}"
  end

  # @return [String] the headword: the kanji when present, otherwise the kana
  def front
    entry.kanji || entry.kana
  end

  # @return [String] HTML for the answer side: headword line, definitions
  #   table and dictionary links, separated by blank lines
  def back
    heading = [
      entry.kanji,
      "[#{entry.kana}]",
      (em('(P)') if entry.common)
    ].compact.join(nbsp)

    [heading, defns_table, dict_links].compact.join(br * 2)
  end

  # @return [String] a <table> with one row per definition:
  #   number, part of speech, tags, and the glosses as a bullet list
  def defns_table
    rows = entry.defns.map do |defn|
      pos = defn.pos.nil? ? '' : "(#{defn.pos.join(',')})"
      tags = defn.tags.nil? ? '' : "[#{defn.tags.join(',')}]"
      glosses = ul(defn.glosses.map { |gloss| li(gloss) }.join)

      tr([td(defn.num), td(pos), td(tags), td(glosses)].join)
    end
    table(rows.join)
  end

  # @return [String] links to online dictionaries for the headword
  def dict_links
    txt = front
    [
      a('Jisho', href: jisho(txt)),
      a('goo', href: goo(txt)),
      a('Weblio JJ', href: weblio_jj(txt)),
      a('Weblio JE', href: weblio_je(txt))
    ].join(nbsp)
  end

  # @param txt [String] search term
  # @return [String] Weblio Japanese-English lookup URL
  def weblio_je(txt)
    "https://ejje.weblio.jp/content/#{url_escape(txt)}"
  end

  # @param txt [String] search term
  # @return [String] Weblio Japanese-Japanese lookup URL
  def weblio_jj(txt)
    "https://www.weblio.jp/content/#{url_escape(txt)}"
  end

  # @param txt [String] search term
  # @return [String] goo dictionary search URL
  def goo(txt)
    "https://dictionary.goo.ne.jp/srch/jn/#{url_escape(txt)}/m0u/"
  end

  # @param txt [String] search term
  # @return [String] Jisho search URL
  def jisho(txt)
    "https://jisho.org/search/#{url_escape(txt)}"
  end

  private

  # Percent-encodes +txt+ for use as a single URL path segment.
  # URI.encode, which this script used before, was removed in Ruby 3.0;
  # ERB::Util.url_encode escapes everything outside the unreserved set and
  # writes a space as %20 (not "+"), which is what a path needs.
  def url_escape(txt)
    ERB::Util.url_encode(txt)
  end
end
# Command-line entry point: converts the notepad export given as the first
# argument and writes "<input-without-extension>-<unix-time>.tsv" next to it.
# Guarded so that loading this file from another script does not trigger a
# conversion.
if $PROGRAM_NAME == __FILE__
  abort "usage: #{File.basename($PROGRAM_NAME)} NOTEPAD_FILE" if ARGV.empty?

  input = Pathname.new(ARGV[0])
  # The timestamp suffix keeps repeated runs from overwriting earlier output.
  output = Pathname.new("#{input.sub_ext('')}-#{Time.now.to_i}.tsv")
  output.write(AedictNotepad.to_anki_tsv(input))
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment