Ishotihadus · December 7, 2018 13:37
diff --git a/nicoime.rb b/nicoime.rb
 =begin
 ニコニコ大百科と pixiv 百科事典の and をとった感じの IME 辞書を作るスクリプト
 およそ https://www.ncaq.net/2017/03/10/ に近い挙動をする。
 読みはニコニコ大百科からとる。

 ニコニコとピクシブの両方に単語登録されているものだけ選ぶ。
 次のものは取り除いてある。
 ・ブラックリストに入っているもの
 ・カッコを含むのに読みに「かっこ」が含まれないもの
 ・読みが 1 文字以下のもの

 表記ゆれのリダイレクトは入力時に便利な場合があるので残してある。
 実行時間は 3 分ほど。
 =end

 require 'cgi'
 require 'net/http'
 require 'nokogiri'
 require 'open-uri'
 require 'parallel'
 require 'retryable'

 # Downloads +url+ and parses the response body as an HTML document.
 #
 # The whole fetch-and-parse step runs inside Retryable.retryable, called with
 # `sleep: 1, tries: 15`.
 #
 # @param url [String] address of the page to download
 # @return the document object produced by Nokogiri::HTML.parse
 def get_document(url)
    Retryable.retryable(sleep: 1, tries: 15) do
        # URI.open is open-uri's explicit entry point. Bare Kernel#open no longer
        # accepts URLs since Ruby 3.0 and also treats a leading "|" as a command.
        URI.open(url) do |f|
            # The charset reported for the response is handed to Nokogiri as the encoding.
            Nokogiri::HTML.parse(f.read, nil, f.charset)
        end
    end
 end

 # Decides whether a dictionary entry should be kept in the output.
 #
 # An entry is rejected when any of the following holds:
 # * the title contains "(" or "（" but the reading does not contain "かっこ";
 # * the reading is shorter than two characters;
 # * the title matches one of the blacklist patterns listed in the body.
 #
 # @param title [String] entry title
 # @param ruby [String] reading of the entry
 # @return [Boolean] true when the entry passes every check
 def whitelist(title, ruby)
    has_paren = title.include?('(') || title.include?('（')
    if has_paren && !ruby.include?('かっこ')
        false
    elsif ruby.size < 2
        false
    else
        rejected_titles = [/のサムネ画像集\z/, /用語集\z/, /の歴史\z/, /の一覧\d*\z/, /\A\d+月\d+日\z/]
        rejected_titles.none? { |pattern| pattern =~ title }
    end
 end

 # Step 1: collect the per-part sitemap URLs listed in the pixiv dictionary sitemap index.
 pixiv_index = get_document('https://dic.pixiv.net/sitemap.xml')
 pixiv_sitemaps = pixiv_index.xpath('//sitemap/loc').map(&:text).select do |loc|
    loc.start_with?('https://dic.pixiv.net/sitemap/part/')
 end

 # Step 2: fetch every part and turn each article URL into a title string
 # (URL-decoded, NFC-normalized, underscores removed). The list is sorted
 # because the membership test in step 5 uses Array#bsearch.
 pixiv_pages = Parallel.map(pixiv_sitemaps, progress: 'Downloading pixiv word list') do |url|
    get_document(url).xpath('//url/loc').map do |loc|
        encoded = loc.text.sub('https://dic.pixiv.net/a/', '')
        CGI.unescape(encoded).unicode_normalize(:nfc).tr('_', '')
    end
 end
 pixiv_list = pixiv_pages.flatten(1).sort

 # Step 3: build the list of niconico index page URLs. Each index link yields
 # one URL per 50 entries, based on the number read from the link text.
 nico_top = get_document('https://dic.nicovideo.jp/m/a/a/')
 nico_indexlist = nico_top.xpath('//a[starts-with(@href, "/m/yp/a/")]').flat_map do |link|
    base = link['href'].sub('/1-', '')
    total = link.text.tr('()', '').to_i
    pages = (total + 49) / 50
    pages.times.map { |page| "https://dic.nicovideo.jp#{base}/#{page * 50 + 1}-" }
 end

 # Step 4: download every index page and extract { title:, ruby: } pairs.
 nico_pages = Parallel.map(nico_indexlist, progress: 'Downloading niconico word list') do |url|
    get_document(url).xpath('//li/a[starts-with(@href,"/a/")]/..').map do |item|
        # Title: link text, NFC-normalized, control whitespace dropped,
        # full-width spaces converted and runs of spaces collapsed.
        raw_title = item.xpath('a').text.unicode_normalize(:nfc)
        title = raw_title.tr("\r\n\t", '').tr('　', ' ').squeeze(' ')

        # Reading: first parenthesized group of the item's last child node,
        # whitespace removed and katakana mapped to hiragana.
        trailing = item.children.last.text
        reading = trailing.match(/\((.*?)\)/)[1].unicode_normalize(:nfc)
        ruby = reading.tr(" 　\r\n\t", '').tr('ァ-ン', 'ぁ-ん')

        # Redirect entries are kept on purpose; one could be detected with:
        # is_redirect = item.children.last.text.strip.end_with?('(リダイレクト)')
        { title: title, ruby: ruby }
    end
 end
 word_list = nico_pages.flatten(1)

 # Step 5: write one tab-separated line per entry that passes whitelist and
 # whose space-stripped title is also present in the pixiv list.
 File.open('nicoime.txt', 'w') do |out|
    word_list.each do |entry|
        title = entry[:title]
        ruby = entry[:ruby]
        next unless whitelist(title, ruby)

        key = title.tr(' ', '')
        next unless pixiv_list.bsearch { |candidate| key <=> candidate }

        out.puts("#{ruby}\t#{title}\t固有名詞")
    end
 end
	=begin
	ニコニコ大百科と pixiv 百科事典の and をとった感じの IME 辞書を作るスクリプト
	およそ https://www.ncaq.net/2017/03/10/ に近い挙動をする。
	読みはニコニコ大百科からとる。

	ニコニコとピクシブの両方に単語登録されているものだけ選ぶ。
	次のものは取り除いてある。
	・ブラックリストに入っているもの
	・カッコを含むのに読みに「かっこ」が含まれないもの
	・読みが 1 文字以下のもの

	表記ゆれのリダイレクトは入力時に便利な場合があるので残してある。
	実行時間は 3 分ほど。
	=end

	require 'cgi'
	require 'net/http'
	require 'nokogiri'
	require 'open-uri'
	require 'parallel'
	require 'retryable'

	# Downloads +url+ and parses the response body as an HTML document.
	#
	# The whole fetch-and-parse step runs inside Retryable.retryable, called with
	# `sleep: 1, tries: 15`.
	#
	# @param url [String] address of the page to download
	# @return the document object produced by Nokogiri::HTML.parse
	def get_document(url)
	    Retryable.retryable(sleep: 1, tries: 15) do
	        # URI.open is open-uri's explicit entry point. Bare Kernel#open no longer
	        # accepts URLs since Ruby 3.0 and also treats a leading "|" as a command.
	        URI.open(url) do |f|
	            # The charset reported for the response is handed to Nokogiri as the encoding.
	            Nokogiri::HTML.parse(f.read, nil, f.charset)
	        end
	    end
	end

	# Decides whether a dictionary entry should be kept in the output.
	#
	# An entry is rejected when any of the following holds:
	# * the title contains "(" or "（" but the reading does not contain "かっこ";
	# * the reading is shorter than two characters;
	# * the title matches one of the blacklist patterns.
	#
	# @param title [String] entry title
	# @param ruby [String] reading of the entry
	# @return [Boolean] true when the entry passes every check
	def whitelist(title, ruby)
	    blacklist = [/のサムネ画像集\z/, /用語集\z/, /の歴史\z/, /の一覧\d*\z/, /\A\d+月\d+日\z/]
	    return false if (title.index('(') || title.index('（')) && !ruby.index('かっこ')
	    return false if ruby.size < 2
	    return false if blacklist.any? { |e| title =~ e }

	    true
	end

	# Step 1: collect the per-part sitemap URLs listed in the pixiv dictionary sitemap index.
	pixiv_sitemaps = get_document('https://dic.pixiv.net/sitemap.xml').xpath('//sitemap/loc').map(&:text).select do |e|
	    e.start_with?('https://dic.pixiv.net/sitemap/part/')
	end

	# Step 2: fetch every part and turn each article URL into a title string
	# (URL-decoded, NFC-normalized, underscores removed). The list is sorted
	# because the membership test in step 5 uses Array#bsearch.
	pixiv_list = Parallel.map(pixiv_sitemaps, progress: 'Downloading pixiv word list') do |url|
	    get_document(url).xpath('//url/loc').map do |e|
	        CGI.unescape(e.text.sub('https://dic.pixiv.net/a/', '')).unicode_normalize(:nfc).tr('_', '')
	    end
	end.flatten(1).sort

	# Step 3: build the list of niconico index page URLs. Each index link yields
	# one URL per 50 entries, based on the number read from the link text.
	nico_indexlist = get_document('https://dic.nicovideo.jp/m/a/a/').xpath('//a[starts-with(@href, "/m/yp/a/")]').map do |e|
	    url = e['href'].sub('/1-', '')
	    count = e.text.tr('()', '').to_i
	    ((count + 49) / 50).times.map { |i| "https://dic.nicovideo.jp#{url}/#{(i * 50 + 1)}-" }
	end.flatten(1)

	# Step 4: download every index page and extract { title:, ruby: } pairs.
	word_list = Parallel.map(nico_indexlist, progress: 'Downloading niconico word list') do |url|
	    get_document(url).xpath('//li/a[starts-with(@href,"/a/")]/..').map do |e|
	        # Title: link text, NFC-normalized, control whitespace dropped,
	        # full-width spaces converted and runs of spaces collapsed.
	        title = e.xpath('a').text.unicode_normalize(:nfc).tr("\r\n\t", '').tr('　', ' ').gsub(/ +/, ' ')
	        # Reading: first parenthesized group of the item's last child node,
	        # whitespace removed and katakana mapped to hiragana.
	        ruby = e.children.last.text.match(/\((.*?)\)/)[1].unicode_normalize(:nfc).tr(" 　\r\n\t", '').tr('ァ-ン', 'ぁ-ん')
	        # Redirect entries are kept on purpose; one could be detected with:
	        # is_redirect = e.children.last.text.strip.end_with?('(リダイレクト)')
	        { title: title, ruby: ruby }
	    end
	end.flatten(1)

	# Step 5: write one tab-separated line per entry that passes whitelist and
	# whose space-stripped title is also present in the pixiv list.
	File.open('nicoime.txt', 'w') do |file|
	    word_list.each do |e|
	        pixiv_title = e[:title].tr(' ', '')
	        next unless whitelist(e[:title], e[:ruby]) && pixiv_list.bsearch { |t| pixiv_title <=> t }

	        file.puts("#{e[:ruby]}\t#{e[:title]}\t固有名詞")
	    end
	end