Last active
April 20, 2017 04:59
-
-
Save shouya/0fb1bb1869e34b9d0a5c63729fefa931 to your computer and use it in GitHub Desktop.
DOM 全文翻譯試驗
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# frozen_string_literal: true
require 'nokogiri'
require 'rest-client'
require 'json'
require 'securerandom'
require 'byebug'
PRESERVING_TAGS = ['code', 'pre'] | |
SOGOU_TRANSLATE_PRIVATE_API = 'http://XXXXXX' | |
def uniq_id(node) | |
node.css_path | |
end | |
def shallow_text?(node) | |
return false if node.text? | |
return false unless node.children.any?(&:text?) | |
return false unless node.children | |
.reject(&:text?) | |
.map(&:children) | |
.flatten | |
.all?(&:text?) | |
true | |
end | |
def transform_shallow_text(node) | |
pieces = {} | |
text = '' | |
node.children.to_a.each do |x| | |
if x.text? | |
text += x.text | |
else | |
key = SecureRandom.hex | |
pieces[key] = { | |
node: x, | |
text: x.text, | |
} | |
pieces[key][:preserve] = true if PRESERVING_TAGS.include?(x.name) | |
text += " #{key} " | |
end | |
end | |
[pieces, text] | |
end | |
def traverse(node, &block) | |
if shallow_text?(node) | |
pieces, txt = transform_shallow_text(node) | |
yield :shallow, [pieces, txt, node] | |
return | |
elsif node.text? && node.text !~ /^\s+$/ | |
yield :text, [node.text, node] | |
return | |
end | |
node.children.each do |node_| | |
next if PRESERVING_TAGS.include?(node_.name) | |
traverse(node_, &block) | |
end | |
end | |
def extract_translation_fragments(dom) | |
frags = [] | |
slot_frags = [] | |
piece_frags = [] | |
traverse(dom) do |type, arg| | |
case type | |
when :shallow | |
pieces, txt, node = *arg | |
slot_frags << { | |
uniq_id: uniq_id(node), | |
text: txt | |
} | |
pieces.each do |k, v| | |
piece_frags << { | |
uniq_id: uniq_id(v[:node]), | |
text: v[:text], | |
key: k, | |
node: v[:node].clone, | |
preserve: v[:preserve] | |
} | |
end | |
when :text | |
txt, node = *arg | |
frags << { | |
uniq_id: uniq_id(node), | |
text: txt | |
} | |
end | |
end | |
[frags, slot_frags, piece_frags] | |
end | |
def translate(frags, with: :sogou) | |
send("#{with}_translate", frags) | |
end | |
# NOT Working Yet | |
def google_translate(text) | |
resp = | |
RestClient.get('https://translation.googleapis.com/language/translate/v2/', | |
params: { | |
key: 'AIzaSyA8qN5akKl2OlFq8q_Wn1KQfwuVGGcoPz0', | |
q: text, | |
source: 'en', | |
target: 'zh', | |
model: 'nmt' | |
}, | |
origin: 'https://cloud.google.com') | |
JSON.parse(resp)['data']['translations'].first['translatedText'] | |
end | |
def sogou_translate(frags_) | |
process_frags = lambda do |frags| | |
trans_frag = frags.reject { |x| x[:preserve] } | |
.map { |x| { text: x[:text], sendback: x[:uniq_id] } } | |
req = { | |
uuid: SecureRandom.uuid, | |
from_lang: 'en', | |
to_lang: 'zh-CHS', | |
trans_frag: trans_frag, | |
sendback: '1' | |
} | |
res = RestClient.post(SOGOU_TRANSLATE_PRIVATE_API, | |
req.to_json, | |
content_type: :json, | |
accept: :json) | |
result = JSON.parse(res.to_s)['trans_result'] | |
.map { |x| [x['sendback'], x['trans_text']] } | |
.to_h | |
frags.map do |x| | |
{ **x, trans: result[x[:uniq_id]] || x[:text] } | |
end | |
end | |
frags_.each_slice(100).flat_map(&process_frags) | |
end | |
def dummy_translate(frags) | |
frags.map { |x| { **x, trans: x[:text] } } | |
end | |
def apply_translation(all_frags, dom) | |
frags, slot_frags, piece_frags = *all_frags | |
frags.each do |frag| | |
dom.css(frag[:uniq_id]).first.content = frag[:trans] | |
end | |
piece_frags.each do |frag| | |
frag[:node].content = frag[:trans] | |
end | |
slot_frags.each do |frag| | |
frag[:inner] = frag[:trans].clone | |
piece_frags.each do |piece| | |
frag[:inner].sub!( | |
piece[:key], | |
piece[:node].to_html | |
) | |
end | |
dom.css(frag[:uniq_id]).first.inner_html = frag[:inner] | |
end | |
end | |
indom = Nokogiri::HTML(File.read(ARGV[0])) | |
outdom = indom.clone | |
all_frags = extract_translation_fragments(indom) | |
all_frags.map! { |frags| translate(frags, with: :sogou) } | |
apply_translation(all_frags, outdom) | |
File.write("out.html", outdom.to_s) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Steps to play with the script:
1. Set the SOGOU_TRANSLATE_PRIVATE_API variable to the corresponding api endpoint.
2. Run `gem install nokogiri rest-client byebug` to install required gems.
3. Prepare an html file.
4. Run `ruby translate.rb <YOUR_HTML_FILE>`.
5. Check the out.html file for result.