Skip to content

Instantly share code, notes, and snippets.

@nihal111
Created June 5, 2018 15:32
Show Gist options
  • Save nihal111/ac02babe3769bff8cdba537fa0af62c8 to your computer and use it in GitHub Desktop.
Save nihal111/ac02babe3769bff8cdba537fa0af62c8 to your computer and use it in GitHub Desktop.
Translates a large file, with pre- and post-processing. Respects the rate limit of the Google Translate API.
#!/usr/bin/python
# -*- coding: utf-8 -*-
import re
import time

from googletrans import Translator
# Number of lines of the pre-processed file that go into one translation batch.
BATCH_SIZE = 300
# Seconds to pause (time.sleep) between batches, to respect the API rate limit.
WAIT = 105
# File to translate; it is only ever read, during pre-processing.
INPUT_FILE = 'strings.xml'
# Copy of INPUT_FILE with the <strong>/<b> tags removed; this is the file
# whose lines are actually translated.
PRE_PROCESSED_FILE = 'pre_strings.xml'
# Translated result: appended to batch by batch, then rewritten in place by
# the post-processing step.
OUTPUT_FILE = 'en_strings.xml'
# Placeholder; reassigned to the real list of batches once the lines are read.
batches = []
# googletrans client used for every translate() call.
translator = Translator()
# Start from an empty output file. Mode 'w' creates the file when it is
# missing and truncates it when it already exists, so a separate append-mode
# open just to "create" it first is unnecessary.
open(OUTPUT_FILE, 'w').close()
# Pre-processing: copy INPUT_FILE to PRE_PROCESSED_FILE with the bold markup
# tags (<strong>, </strong>, <b>, </b>) removed. Only the tags themselves are
# dropped; the text between them is kept.
with open(INPUT_FILE, 'r') as raw_file:
    text = raw_file.read()

for tag in ("<strong>", "</strong>", "<b>", "</b>"):
    text = text.replace(tag, "")

with open(PRE_PROCESSED_FILE, 'w') as stripped_file:
    stripped_file.write(text)

print("Pre processing done!")
# Load the pre-processed file as a list of lines (each entry keeps its line
# ending); these are the lines that will be batched and translated.
with open(PRE_PROCESSED_FILE, 'r') as pre_processed:
    text = list(pre_processed)
# Split the lines into consecutive batches of at most BATCH_SIZE lines each,
# to handle the API limit.
# RATE LIMIT: 1,000,000 characters per 100 sec
# https://cloud.google.com/translate/quotas
# Note: batches are sized in lines here, not in characters.
batches = []
for start in xrange(0, len(text), BATCH_SIZE):
    batches.append(text[start:start + BATCH_SIZE])
# Translate the file one batch at a time, appending every translated batch to
# OUTPUT_FILE.
#
# Only lines that hold a complete `<string name=...>text</string>` element are
# translated; every other line is copied through unchanged.
#
# NOTE: `unicode` is the Python 2 text type, so this block targets Python 2.
batch_ctr = 0
for batch in batches:
    batch_ctr += 1

    # Split each translatable line once into (opening tag, text, remainder) so
    # that extraction and re-assembly cannot disagree about where the text
    # starts and ends. Lines that are not translated are recorded as None.
    split_lines = []
    for line in batch:
        if "<string name=" in line and "</string>" in line:
            text_start = line.index(">") + 1
            # Search from text_start so the closing tag found is the one that
            # follows the opening tag.
            text_end = line.index("</string>", text_start)
            split_lines.append(
                (line[:text_start], line[text_start:text_end], line[text_end:]))
        else:
            split_lines.append(None)

    strings_to_translate = [
        parts[1] for parts in split_lines if parts is not None]
    translations = translator.translate(strings_to_translate, dest='en')

    translated_strings = []
    for translation in translations:
        translated = translation.text
        # Encode unicode results to UTF-8 so they can be concatenated with the
        # surrounding pieces of the original line.
        if isinstance(translated, unicode):
            translated = translated.encode('utf-8')
        translated_strings.append(translated)

    # Re-assemble the batch; translations are consumed in the same order in
    # which their source strings were collected.
    en_text = []
    counter = 0
    for line, parts in zip(batch, split_lines):
        if parts is None:
            en_text.append(line)
            continue
        left, _, right = parts
        en_text.append(left + translated_strings[counter] + right)
        counter += 1

    print("\nBATCH {} DONE".format(batch_ctr))

    with open(OUTPUT_FILE, 'a') as out_file:
        for en_line in en_text:
            if isinstance(en_line, unicode):
                en_line = en_line.encode('utf-8')
            try:
                out_file.write(en_line)
            except UnicodeDecodeError as e:
                # Best effort: report the offending line and keep going.
                print(type(en_line))
                print(("Exception: " + en_line + " "), e.args)

    # Pause only when another batch follows; sleeping after the final batch
    # would just delay post-processing without protecting any request.
    if batch_ctr < len(batches):
        print("Waiting for {} secs".format(WAIT))
        time.sleep(WAIT)
        print("Wait complete")
# Post-processing: normalise escaping and spacing in the translated output
# (presumably damaged by the translation step) and rewrite OUTPUT_FILE.
#
# The literal replacements run strictly in the order listed; several entries
# only make sense after the ones before them have been applied, so do not
# reorder them.

# Literal (old, new) replacements applied BEFORE ampersands are escaped.
_REPLACEMENTS_BEFORE_AMP = (
    # Apostrophes: un-escape first so already-escaped ones are not escaped
    # twice, then escape every apostrophe.
    ('\\\'', '\''),
    ('\'', '\\\''),
    # Re-join split "\n" escapes (either case) and drop spaces around them.
    ('\\ n', '\\n'),
    (' \\n', '\\n'),
    ('\\n ', '\\n'),
    (' \\n ', '\\n'),
    ('\\ N', '\\n'),
    (' \\N', '\\n'),
    ('\\N ', '\\n'),
    (' \\N ', '\\n'),
    # Format placeholder and slash/backslash spacing.
    ('%% s', '%s'),
    (' / ', '/'),
    (' \\ ', '\\'),
    # Re-join split &lt; / &gt; entities.
    ('& lt; ', '&lt;'),
    ('& lt;', '&lt;'),
    (' & gt;', '&gt;'),
    ('& gt;', '&gt;'),
    # Remove spaces around % and $.
    (' % ', '%'),
    ('% ', '%'),
    (' %', '%'),
    (' $ ', '$'),
    ('$ ', '$'),
    (' $', '$'),
)

# An "&" that does not already begin an entity such as &lt;, &amp; or &#39;.
# Escaping every "&" unconditionally would turn the entities repaired above
# into "&amp;lt;" / "&amp;gt;" and any existing "&amp;" into "&amp;amp;".
_BARE_AMPERSAND = re.compile(
    r'&(?!(?:[A-Za-z][A-Za-z0-9]*|#[0-9]+|#[xX][0-9A-Fa-f]+);)')

# Literal (old, new) replacements applied AFTER ampersands are escaped:
# remove spaces around angle brackets.
_REPLACEMENTS_AFTER_AMP = (
    (' < ', '<'),
    (' <', '<'),
    ('< ', '<'),
    (' </ ', '</'),
    (' </', '</'),
    ('</ ', '</'),
    (' > ', '>'),
    ('> ', '>'),
    (' >', '>'),
)

with open(OUTPUT_FILE, 'r') as translated_file:
    en_text = translated_file.read()

for old, new in _REPLACEMENTS_BEFORE_AMP:
    en_text = en_text.replace(old, new)

# Escape only the ampersands that are not already part of an entity.
en_text = _BARE_AMPERSAND.sub('&amp;', en_text)

for old, new in _REPLACEMENTS_AFTER_AMP:
    en_text = en_text.replace(old, new)

with open(OUTPUT_FILE, 'w') as translated_file:
    translated_file.write(en_text)

print("Post processing done!")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment