Created
June 5, 2018 15:32
-
-
Save nihal111/ac02babe3769bff8cdba537fa0af62c8 to your computer and use it in GitHub Desktop.
Translates a large file, with pre- and post-processing, while respecting the rate limit of the Google Translate API.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python | |
# -*- coding: utf-8 -*- | |
from googletrans import Translator | |
import time | |
# Input, intermediate and output file names.
INPUT_FILE = 'strings.xml'
PRE_PROCESSED_FILE = 'pre_strings.xml'
OUTPUT_FILE = 'en_strings.xml'

# Number of lines handled per batch, and the pause (in seconds) between batches.
BATCH_SIZE = 300
WAIT = 105

batches = []
translator = Translator()
# Start from an empty output file. Mode 'w' creates the file when it is
# missing and truncates it when it exists, so one open covers both the
# "create" and the "empty" step; the with-block guarantees it is closed.
with open(OUTPUT_FILE, 'w'):
    pass
# Pre-processing: remove the bold markup tags from the input and save the
# result as the pre-processed file.
with open(INPUT_FILE, 'r') as source:
    text = source.read()

for tag in ("<strong>", "</strong>", "<b>", "</b>"):
    text = text.replace(tag, "")

with open(PRE_PROCESSED_FILE, 'w') as target:
    target.write(text)

print("Pre processing done!")
# Load the pre-processed file as a list of lines (line endings are kept).
with open(PRE_PROCESSED_FILE, 'r') as handle:
    text = list(handle)
# Create batches of at most BATCH_SIZE lines to handle the API limit.
# RATE LIMIT: 1,000,000 characters per 100 sec
# https://cloud.google.com/translate/quotas
#
# `range` is used instead of `xrange`: `xrange` only exists on Python 2,
# while `range` produces the same start offsets on Python 2 and Python 3.
batches = [text[i:i + BATCH_SIZE] for i in range(0, len(text), BATCH_SIZE)]
# Translate each batch and append the result to the output file.


def _is_string_line(line):
    """Return True when line holds both '<string name=' and '</string>'."""
    return "<string name=" in line and "</string>" in line


def _split_string_line(line):
    """Split a <string> line into (opening part, text, closing part).

    The opening part runs up to and including the first ">", the closing part
    starts at the first "</string>", and the text is what follows the opening
    part up to the next "</string>".
    """
    head_end = line.index(">") + 1
    remainder = line[head_end:]
    body = remainder[:remainder.index("</string>")]
    return line[:head_end], body, line[line.index("</string>"):]


def _to_native_str(value):
    """Return value as the running interpreter's native ``str`` type.

    On Python 2 a ``unicode`` object is encoded to UTF-8 bytes so it can be
    concatenated with the byte strings read from the input file. On Python 3
    text is returned unchanged and bytes are decoded as UTF-8. No reference to
    the Python 2-only name ``unicode`` is needed.
    """
    if isinstance(value, str):
        return value
    if isinstance(value, bytes):
        # Only reachable on Python 3, where bytes and str are distinct types.
        return value.decode('utf-8')
    # Only reachable on Python 2, for unicode objects.
    return value.encode('utf-8')


batch_ctr = 0
for batch_ctr, batch in enumerate(batches, start=1):
    # Collect the text of every single-line <string> element in this batch.
    strings_to_translate = [
        _split_string_line(line)[1] for line in batch if _is_string_line(line)
    ]

    translations = translator.translate(strings_to_translate, dest='en')
    translated_strings = [
        _to_native_str(translation.text) for translation in translations
    ]

    # Rebuild the batch: translated text goes back between the original
    # opening and closing parts; every other line is copied unchanged.
    en_text = []
    counter = 0
    for line in batch:
        en_line = line
        if _is_string_line(line):
            left, _, right = _split_string_line(line)
            en_line = left + translated_strings[counter] + right
            counter += 1
        en_text.append(en_line)

    print("\nBATCH {} DONE".format(batch_ctr))

    with open(OUTPUT_FILE, 'a') as output:
        output.writelines(en_text)

    # Pause only when another batch follows; after the last batch there is no
    # further request, so waiting would just delay post-processing.
    if batch_ctr < len(batches):
        print("Waiting for {} secs".format(WAIT))
        time.sleep(WAIT)
        print("Wait complete")
# Post-processing: apply a fixed sequence of literal substitutions to the
# translated file. The pairs are applied strictly in the order listed, each
# one operating on the result of the previous one.
_POST_REPLACEMENTS = (
    # Apostrophes: drop any existing backslash escape, then escape them all.
    ('\\\'', '\''),
    ('\'', '\\\''),
    # Newline escape sequences: remove the spaces inside and around them and
    # lower-case an upper-case N.
    ('\\ n', '\\n'),
    (' \\n', '\\n'),
    ('\\n ', '\\n'),
    (' \\n ', '\\n'),
    ('\\ N', '\\n'),
    (' \\N', '\\n'),
    ('\\N ', '\\n'),
    (' \\N ', '\\n'),
    ('%% s', '%s'),
    # Spaces around slashes and backslashes.
    (' / ', '/'),
    (' \\ ', '\\'),
    # Space-broken "lt"/"gt" entities become the bare angle brackets.
    ('& lt; ', '<'),
    ('& lt;', '<'),
    (' & gt;', '>'),
    ('& gt;', '>'),
    # Spaces around percent and dollar signs.
    (' % ', '%'),
    ('% ', '%'),
    (' %', '%'),
    (' $ ', '$'),
    ('$ ', '$'),
    (' $', '$'),
    # NOTE(review): this pair replaces '&' with itself and so has no effect;
    # confirm what substitution was intended here.
    ('&', '&'),
    # Spaces around angle brackets and closing-tag openers.
    (' < ', '<'),
    (' <', '<'),
    ('< ', '<'),
    (' </ ', '</'),
    (' </', '</'),
    ('</ ', '</'),
    (' > ', '>'),
    ('> ', '>'),
    (' >', '>'),
)

with open(OUTPUT_FILE, 'r') as translated_file:
    en_text = translated_file.read()

for old, new in _POST_REPLACEMENTS:
    en_text = en_text.replace(old, new)

with open(OUTPUT_FILE, 'w') as translated_file:
    translated_file.write(en_text)

print("Post processing done!")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment