Skip to content

Instantly share code, notes, and snippets.

@chsh
Created February 11, 2011 04:41
Show Gist options
  • Save chsh/821931 to your computer and use it in GitHub Desktop.
Save chsh/821931 to your computer and use it in GitHub Desktop.
Build language-selection master data from Google's language_tools page. :-D
# encoding: utf-8
require 'open-uri'
require 'yaml'
require 'rubygems'
require 'nokogiri'
# Builds a language-selection master table by scraping Google's
# language_tools page and dumps the result as YAML.
#
# The English page is read first to discover the available language codes.
# The page is then fetched once per discovered code, and the link texts on
# each page become that language's names for all the other languages.
#
# Resulting structure:
#
#   { "en" => { "en" => "English", "ja" => "Japanese", ... },
#     "ja" => { "en" => "...",     "ja" => "...",      ... } }
#
# NOTE(review): the scraped URL and link format date from 2011; confirm the
# page still exists in this shape before relying on the output.
class LangsBuilder
  # Matches per-language links such as http://www.google.com/intl/ja/ and
  # captures the language code ("ja").
  INTL_HREF = %r{^http://www\.google\.com/intl/([^/]+)/?$}.freeze

  # Codes starting with this prefix are skipped (presumably Google's novelty
  # locales such as "xx-bork"; verify against the live page).
  PSEUDO_LANG_PREFIX = 'xx-'.freeze

  # Full-width parentheses, written as escapes so the non-ASCII characters
  # cannot be lost when the source is copied or re-encoded.
  FULLWIDTH_PARENS = "\uFF08\uFF09".freeze

  # Scrapes the language table and writes it as YAML.
  #
  # @param writer_or_file [String, #write] a file path to (over)write, or any
  #   IO-like object responding to +write+ (IO, File, StringIO, ...)
  # @return [Object] whatever YAML.dump returns for the writer
  # @raise [RuntimeError] if the argument is neither a String nor a writer
  def dump_langs(writer_or_file)
    if writer_or_file.is_a?(String)
      # Scrape before opening: File.open(..., 'w') truncates immediately, so
      # a failed fetch would otherwise leave an empty file behind.
      lang_map = build_lang_map
      File.open(writer_or_file, 'w') { |file| YAML.dump(lang_map, file) }
    elsif writer_or_file.respond_to?(:write)
      YAML.dump(build_lang_map, writer_or_file)
    else
      raise "parameter must be a file path (String) or an IO-like writer. But '#{writer_or_file.class}'"
    end
  end

  private

  # @return [Hash{String => Hash{String => String}}] language code => that
  #   language's name for every language code
  def build_lang_map
    pickup_langs(doc_by_lang('en')).each_with_object({}) do |lang, lang_map|
      lang_map[lang] = translation_map(lang)
    end
  end

  # @param doc [#xpath] parsed language_tools page
  # @return [Array<String>] language codes linked from the page, in page order
  def pickup_langs(doc)
    language_links(doc).map(&:first)
  end

  # @param lang_for [String] interface language to render the page in
  # @return [Hash{String => String}] language code => name shown on that page
  def translation_map(lang_for)
    pairs = language_links(doc_by_lang(lang_for)).map do |code, anchor|
      [code, normalize_text(anchor.text)]
    end
    pairs.to_h
  end

  # Collects the per-language links of a page, skipping pseudo languages.
  #
  # @param doc [#xpath] parsed language_tools page
  # @return [Array<Array(String, Object)>] [code, anchor element] pairs
  def language_links(doc)
    links = doc.xpath('//a').map do |anchor|
      code = lang_from_href(anchor['href'])
      [code, anchor] if code && !code.start_with?(PSEUDO_LANG_PREFIX)
    end
    links.compact
  end

  # @param href [String, nil] value of an anchor's href attribute
  # @return [String, nil] the language code, or nil when the link is not a
  #   per-language link (or the anchor has no href at all)
  def lang_from_href(href)
    href.to_s[INTL_HREF, 1]
  end

  # Replaces full-width parentheses with their ASCII counterparts.
  #
  # @param text [String] link text taken from the page
  # @return [String] a new string using "(" and ")"
  def normalize_text(text)
    text.tr(FULLWIDTH_PARENS, '()')
  end

  # Fetches and parses the language_tools page for +lang+, memoized per
  # instance so each page is downloaded at most once.
  #
  # @param lang [String] value for the +hl+ query parameter
  # @return [Nokogiri::HTML::Document]
  def doc_by_lang(lang)
    @doc_by_lang ||= {}
    # URI.open (open-uri, Ruby 2.5+) is required on Ruby 3.0+, where
    # Kernel#open no longer handles URLs; the block form closes the handle.
    @doc_by_lang[lang] ||= Nokogiri::HTML(
      URI.open("http://www.google.com/language_tools?hl=#{lang}", &:read)
    )
  end
end
# Script entry point: the first command-line argument is passed to
# LangsBuilder#dump_langs as the output target.
output_target = ARGV.first
LangsBuilder.new.dump_langs(output_target)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment