Skip to content

Instantly share code, notes, and snippets.

@shouya
Last active April 20, 2017 04:59
Show Gist options
  • Save shouya/0fb1bb1869e34b9d0a5c63729fefa931 to your computer and use it in GitHub Desktop.
Save shouya/0fb1bb1869e34b9d0a5c63729fefa931 to your computer and use it in GitHub Desktop.
DOM 全文翻譯試驗
# frozen_string_literal: true
require 'nokogiri'
require 'rest-client'
require 'json'
require 'securerandom'
require 'byebug'
# Tag names whose content is kept as-is: `traverse` does not descend into
# them and `transform_shallow_text` marks their pieces with `preserve: true`.
# Frozen so the shared list cannot be mutated by accident.
PRESERVING_TAGS = %w[code pre].freeze

# Endpoint that `sogou_translate` POSTs to. The value below is a placeholder
# and has to be replaced with the real URL before running the script.
SOGOU_TRANSLATE_PRIVATE_API = 'http://XXXXXX'
# Identifier used to locate +node+ again later (see `dom.css(frag[:uniq_id])`
# in `apply_translation`): simply the node's CSS path.
#
# @param node [#css_path] a DOM node
# @return whatever +node.css_path+ returns
def uniq_id(node)
  selector = node.css_path
  selector
end
# Tells whether +node+ is a "shallow text" container: a non-text node that
# has at least one direct text child and whose non-text children contain
# nothing but text nodes themselves.
#
# @param node [#text?, #children]
# @return [Boolean]
def shallow_text?(node)
  return false if node.text?

  kids = node.children
  return false if kids.none?(&:text?)

  # Every grandchild reached through an element child must be plain text.
  grandchildren = kids.reject(&:text?).map(&:children).flatten
  grandchildren.all?(&:text?)
end
# Flattens the children of +node+ into one string in which every non-text
# child is replaced by a random hex placeholder surrounded by spaces.
#
# @param node [#children] a node for which `shallow_text?` holds
# @return [Array(Hash, String)] +pieces+ maps each placeholder key to
#   `{ node:, text: }` (plus `preserve: true` when the child's tag name is in
#   PRESERVING_TAGS); the string is the concatenated text with placeholders.
def transform_shallow_text(node)
  pieces = {}

  segments = node.children.to_a.map do |child|
    next child.text if child.text?

    key = SecureRandom.hex
    entry = { node: child, text: child.text }
    entry[:preserve] = true if PRESERVING_TAGS.include?(child.name)
    pieces[key] = entry
    " #{key} "
  end

  [pieces, segments.join]
end
# Walks the tree rooted at +node+ depth-first and yields translatable units.
#
# Yields one of:
# * `:shallow, [pieces, text, node]` for a node accepted by `shallow_text?`
#   (the result of `transform_shallow_text` plus the node itself); its
#   children are not visited any further.
# * `:text, [text, node]` for a text node that is not blank.
#
# Children whose tag name is listed in PRESERVING_TAGS are skipped entirely.
#
# @param node [#text?, #text, #children]
# @yieldparam type [Symbol] :shallow or :text
# @yieldparam arg [Array] payload described above
def traverse(node, &block)
  if shallow_text?(node)
    pieces, txt = transform_shallow_text(node)
    yield :shallow, [pieces, txt, node]
    return
  end

  if node.text?
    # \A and \z anchor the whole string. The previous /^\s+$/ used per-line
    # anchors, so any text containing a blank line in the middle was
    # mistaken for whitespace-only and silently left untranslated.
    yield :text, [node.text, node] if node.text !~ /\A\s*\z/
    return
  end

  node.children.each do |child|
    next if PRESERVING_TAGS.include?(child.name)

    traverse(child, &block)
  end
end
# Collects everything `traverse` yields for +dom+ into three fragment lists.
#
# @param dom the root node handed to `traverse`
# @return [Array(Array<Hash>, Array<Hash>, Array<Hash>)]
#   1. plain text fragments: `{ uniq_id:, text: }`
#   2. slot fragments (shallow-text containers, text with placeholders):
#      `{ uniq_id:, text: }`
#   3. piece fragments (the inline children behind the placeholders):
#      `{ uniq_id:, text:, key:, node:, preserve: }`, where +node+ is a clone
#      of the original child
def extract_translation_fragments(dom)
  plain = []
  slots = []
  parts = []

  traverse(dom) do |kind, payload|
    if kind == :shallow
      pieces, txt, node = payload
      slots << { uniq_id: uniq_id(node), text: txt }
      pieces.each_pair do |key, info|
        parts << {
          uniq_id: uniq_id(info[:node]),
          text: info[:text],
          key: key,
          node: info[:node].clone,
          preserve: info[:preserve]
        }
      end
    elsif kind == :text
      txt, node = payload
      plain << { uniq_id: uniq_id(node), text: txt }
    end
  end

  [plain, slots, parts]
end
# Dispatches +frags+ to the backend method named `<with>_translate`.
#
# @param frags [Array<Hash>] fragments to translate
# @param with [Symbol, String] backend prefix (`:sogou` by default)
# @return whatever the selected backend returns
def translate(frags, with: :sogou)
  backend = "#{with}_translate"
  # Top-level methods are private, hence __send__ rather than public_send.
  __send__(backend, frags)
end
# NOT working yet.
#
# Translates a single English string to Chinese through the Google Cloud
# Translation v2 REST endpoint and returns the first translated text.
#
# NOTE(review): unlike the other backends this takes one string rather than a
# list of fragment hashes, so it does not fit `translate(frags, with: ...)`
# as written.
#
# The API key is read from the GOOGLE_TRANSLATE_API_KEY environment variable
# instead of being committed to the source.
#
# @param text [String] English text
# @return [String] the `translatedText` of the first translation
# @raise [KeyError] if GOOGLE_TRANSLATE_API_KEY is not set
def google_translate(text)
  resp = RestClient.get(
    'https://translation.googleapis.com/language/translate/v2/',
    params: {
      key: ENV.fetch('GOOGLE_TRANSLATE_API_KEY'),
      q: text,
      source: 'en',
      target: 'zh',
      model: 'nmt'
    },
    origin: 'https://cloud.google.com'
  )
  # Parse the body explicitly, the same way sogou_translate does.
  JSON.parse(resp.to_s)['data']['translations'].first['translatedText']
end
# Translates fragments from English to Simplified Chinese by POSTing them as
# JSON to SOGOU_TRANSLATE_PRIVATE_API, 100 fragments per request.
#
# Fragments flagged `:preserve` are left out of the request. Every input
# fragment comes back with a `:trans` entry: the service's `trans_text` for
# its `uniq_id` when present, otherwise its own `:text`.
#
# @param frags_ [Array<Hash>] fragments with at least `:uniq_id` and `:text`
# @return [Array<Hash>] the fragments, in order, each extended with `:trans`
def sogou_translate(frags_)
  frags_.each_slice(100).flat_map do |batch|
    payload = batch.each_with_object([]) do |frag, acc|
      next if frag[:preserve]

      acc << { text: frag[:text], sendback: frag[:uniq_id] }
    end

    body = {
      uuid: SecureRandom.uuid,
      from_lang: 'en',
      to_lang: 'zh-CHS',
      trans_frag: payload,
      sendback: '1'
    }.to_json

    response = RestClient.post(SOGOU_TRANSLATE_PRIVATE_API,
                               body,
                               content_type: :json,
                               accept: :json)

    # uniq_id (echoed back as "sendback") => translated text
    translated = {}
    JSON.parse(response.to_s)['trans_result'].each do |entry|
      translated[entry['sendback']] = entry['trans_text']
    end

    batch.map do |frag|
      frag.merge(trans: translated[frag[:uniq_id]] || frag[:text])
    end
  end
end
# No-op backend: returns each fragment with `:trans` set to its own `:text`.
#
# @param frags [Array<Hash>] fragments with a `:text` entry
# @return [Array<Hash>] new hashes; the input hashes are not modified
def dummy_translate(frags)
  frags.map do |frag|
    frag.merge(trans: frag[:text])
  end
end
# Writes translated fragments back into +dom+.
#
# * plain fragments: the node found via `dom.css(uniq_id)` gets the
#   translation as its content;
# * piece fragments: each cloned inline node gets its translation as content;
# * slot fragments: every placeholder key in the translated text is replaced
#   (first occurrence) by the HTML of the matching piece, and the result
#   becomes the `inner_html` of the node found via `dom.css(uniq_id)`. The
#   final string is also stored in the fragment under `:inner`.
#
# A fragment whose selector matches nothing is reported on stderr and skipped
# instead of aborting the whole run.
#
# @param all_frags [Array(Array<Hash>, Array<Hash>, Array<Hash>)] translated
#   plain, slot and piece fragments (each hash carrying `:trans`)
# @param dom [#css] document to modify in place
# @return [Array<Hash>] the slot fragments
def apply_translation(all_frags, dom)
  frags, slot_frags, piece_frags = all_frags

  frags.each do |frag|
    target = dom.css(frag[:uniq_id]).first
    if target.nil?
      warn "apply_translation: no node matches #{frag[:uniq_id]}; skipped"
      next
    end
    target.content = frag[:trans]
  end

  # Render every piece once instead of once per slot.
  substitutions = piece_frags.map do |piece|
    piece[:node].content = piece[:trans]
    [piece[:key], piece[:node].to_html]
  end

  slot_frags.each do |frag|
    # dup (not clone) so that a frozen translation string can still be edited.
    inner = frag[:trans].dup
    substitutions.each do |key, html|
      # Block form inserts the HTML verbatim; a replacement *string* would
      # have its backslash sequences (\0, \1, \\) interpreted by sub!.
      inner.sub!(key) { html }
    end
    frag[:inner] = inner

    target = dom.css(frag[:uniq_id]).first
    if target.nil?
      warn "apply_translation: no node matches #{frag[:uniq_id]}; skipped"
      next
    end
    target.inner_html = inner
  end
end
# Script entry point: translate the HTML file named on the command line and
# write the result to out.html in the current directory.
input_path = ARGV[0]
# Fail with a usage hint instead of a TypeError from File.read(nil).
abort "Usage: ruby #{$PROGRAM_NAME} <YOUR_HTML_FILE>" if input_path.nil?

indom = Nokogiri::HTML(File.read(input_path))
# Fragments are extracted from the original and applied to the copy.
outdom = indom.clone
all_frags = extract_translation_fragments(indom)
all_frags.map! { |frags| translate(frags, with: :sogou) }
apply_translation(all_frags, outdom)
File.write('out.html', outdom.to_s)
@shouya
Copy link
Author

shouya commented Apr 20, 2017

Steps to play with the script:

  1. Set the SOGOU_TRANSLATE_PRIVATE_API constant to the URL of the corresponding API endpoint.
  2. Run `gem install nokogiri rest-client byebug` to install the required gems.
  3. Save the webpage you want to translate
  4. Put this script in the same directory as your .html file
  5. Run ruby translate.rb <YOUR_HTML_FILE>
  6. Check the generated out.html file for result

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment