Created
February 11, 2011 04:41
-
-
Save chsh/821931 to your computer and use it in GitHub Desktop.
Build language-selection master data from Google's language_tools page. :-D
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# encoding: utf-8 | |
require 'open-uri' | |
require 'yaml' | |
require 'rubygems' | |
require 'nokogiri' | |
# Scrapes Google's language_tools page and dumps a nested map of language
# names as YAML.
#
# The dumped structure is a Hash of the form
#   { ui_lang_code => { lang_code => name_of_lang_as_shown_in_ui_lang } }
# where the set of language codes is taken from the English ('en') page.
class LangsBuilder
  # Matches a per-language Google link such as "http://www.google.com/intl/ja/".
  # Capture group 1 is the language code.
  INTL_LINK = %r{\Ahttp://www\.google\.com/intl/([^/]+)/?\z}.freeze

  # Codes starting with this prefix (e.g. "xx-bork") are skipped everywhere.
  PSEUDO_LANG_PREFIX = 'xx-'.freeze

  private_constant :INTL_LINK, :PSEUDO_LANG_PREFIX

  # Builds the language map and writes it out as YAML.
  #
  # @param writer_or_file [String, #write] a file path to (over)write, or any
  #   writer object responding to +write+ (IO, File, StringIO, ...)
  # @raise [RuntimeError] when the argument is neither a String nor a writer
  # @return [Object] whatever YAML.dump returns for the given writer
  def dump_langs(writer_or_file)
    if writer_or_file.is_a?(String)
      # Scrape before opening the file: opening with 'w' truncates it, and a
      # failed download should not leave an empty file behind.
      lang_map = build_lang_map
      File.open(writer_or_file, 'w') { |file| dump_langs_to_writer(file, lang_map) }
    elsif writer_or_file.respond_to?(:write)
      dump_langs_to_writer(writer_or_file)
    else
      raise "parameter must be a file path (String) or an IO-like writer. But '#{writer_or_file.class}'"
    end
  end

  private

  # Serializes the language map to +writer+ as YAML.
  def dump_langs_to_writer(writer, lang_map = build_lang_map)
    YAML.dump(lang_map, writer)
  end

  # Returns { ui_lang => translation_map(ui_lang) } for every language code
  # linked from the English page.
  def build_lang_map
    pickup_langs(doc_by_lang('en')).each_with_object({}) do |lang, lang_map|
      lang_map[lang] = translation_map(lang)
    end
  end

  # Returns the distinct language codes linked from +doc+, in page order.
  def pickup_langs(doc)
    language_links(doc).map(&:first).uniq
  end

  # Returns { lang_code => link text } as shown on the page rendered in
  # +lang_for+. When a code is linked more than once, the last link wins.
  def translation_map(lang_for)
    language_links(doc_by_lang(lang_for)).each_with_object({}) do |(code, anchor), names|
      names[code] = normalize_text(anchor.text)
    end
  end

  # Returns [code, anchor] pairs for every <a> in +doc+ whose href is a
  # per-language link, excluding "xx-" pseudo-language codes.
  def language_links(doc)
    doc.xpath('//a').each_with_object([]) do |anchor, links|
      code = lang_from_href(anchor['href'])
      links << [code, anchor] if code && !code.start_with?(PSEUDO_LANG_PREFIX)
    end
  end

  # Extracts the language code from a per-language link; nil for a missing
  # href or any other URL.
  def lang_from_href(href)
    match = href && INTL_LINK.match(href)
    match && match[1]
  end

  # Replaces full-width parentheses (U+FF08 / U+FF09) with ASCII ones.
  #
  # NOTE(review): the patterns in the original were `/(/` and `/)/`, which are
  # not valid regexps; they look like full-width parentheses that were
  # flattened to ASCII in transit. Confirm against the original file.
  def normalize_text(text)
    text.tr("\uFF08\uFF09", '()')
  end

  # Returns the parsed language_tools page for +lang+, downloading it at most
  # once per builder instance.
  def doc_by_lang(lang)
    @doc_by_lang ||= {}
    @doc_by_lang[lang] ||= fetch_doc(lang)
  end

  # Downloads and parses the language_tools page rendered in +lang+.
  # URI.open is used explicitly because Kernel#open no longer accepts URLs on
  # Ruby 3.0+; the block form closes the connection once the body is read.
  def fetch_doc(lang)
    url = "http://www.google.com/language_tools?hl=#{URI.encode_www_form_component(lang)}"
    Nokogiri::HTML(URI.open(url, &:read))
  end
end
# Run the builder only when this file is executed directly, so that merely
# requiring it does not trigger a scrape. The first command-line argument is
# the output path; without one the YAML is written to standard output.
LangsBuilder.new.dump_langs(ARGV[0] || $stdout) if __FILE__ == $PROGRAM_NAME
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment