Created
April 24, 2019 10:29
-
-
Save powersee/416fe645b61fe86acca8d90a7b51f2c5 to your computer and use it in GitHub Desktop.
批量将文本翻译为中文,代码来源:https://blog.csdn.net/Fly_TheWind/article/details/84011981
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
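# This script runs fine under Python 3. Install the googletrans and tqdm modules first, then set the source text directory and the output directory below.
# Purpose: batch-translate all the text files in a folder into Chinese.
# Supplying the correct text encoding matters: utf-8 kept failing at first; after some digging I suspected the encoding was wrong, ran `:set fileencoding` in vim, and found that the texts to translate were utf-16.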
from googletrans import Translator | |
from tqdm import tqdm | |
import os | |
import random | |
import time | |
import re | |
# Source directory holding the texts to translate, and the directory that
# receives the translated files. Edit both paths before running.
# expanduser("~") gives the same value as $HOME on POSIX systems but does not
# raise KeyError when the HOME variable is unset.
home = os.path.expanduser("~")
path = "/Users/ver/Desktop/wait"
dest = "/Users/ver/Desktop/done"
# Names of the entries in the source directory, listed once at start-up.
files = os.listdir(path)
# Not referenced by any code visible in this file; kept so the module-level
# name stays available.
s = []
# Split a long text into short pieces. The cut position is randomised out of
# concern that Google might check request lengths; that is probably stricter
# than needed, and a fixed length would likely work as well.
def getText(string):
    """Split *string* into chunks that are translated one request at a time.

    While more than 1500 characters remain, the text is cut at the first
    newline found at or after a random offset in [1000, 1500). If no newline
    exists from that offset on, the text is cut at the offset itself, so a
    text without line breaks is still split into bounded pieces.

    Returns a list of chunks whose concatenation equals *string*; every
    chunk after the first starts at the cut position (normally a newline).
    """
    chunks = []
    while len(string) > 1500:
        # Same range as before (1000 + [0, 500)), drawn per chunk so that
        # consecutive requests differ in length.
        start = 1000 + int(random.random() * 500)
        index = string.find("\n", start)
        if index == -1:
            # str.find reports "not found" as -1, never None. Slicing with
            # -1 would emit almost the whole remainder as a single chunk, so
            # fall back to a hard cut at the random offset.
            index = start
        chunks.append(string[:index])
        string = string[index:]
    chunks.append(string)
    return chunks
# Save the translated pieces of one file.
def save2file(title, result, dest_dir=None):
    """Write the translated chunks in *result* to a file named *title*.

    Args:
        title: file name to create inside the output directory.
        result: iterable of strings, written back to back with no separator.
        dest_dir: output directory; defaults to the module-level ``dest``.

    The directory is created when missing. The file is always written as
    UTF-8 so Chinese output does not depend on the platform's default
    encoding. The ``with`` block closes the file; no explicit close needed.
    """
    target_dir = dest if dest_dir is None else dest_dir
    os.makedirs(target_dir, exist_ok=True)
    with open(os.path.join(target_dir, title), 'w', encoding='utf-8') as d:
        d.writelines(result)
# Emoji that cannot be processed make the translation fail, so they are
# filtered out of the text before it is sent.
emoji_pattern = re.compile(
    # Real (non-BMP) code points. A decoded Python 3 str holds each emoji as
    # one code point, so the surrogate-pair forms further down never match it.
    u"[\U0001F600-\U0001F64F]|"  # emoticons
    u"[\U0001F300-\U0001F5FF]|"  # symbols & pictographs
    u"[\U0001F680-\U0001F6FF]|"  # transport & map symbols
    u"[\U0001F1E0-\U0001F1FF]|"  # flags (iOS)
    # Original surrogate-pair forms, kept unchanged so text that still
    # carries raw UTF-16 surrogates is filtered exactly as before.
    u"(\ud83d[\ude00-\ude4f])|"  # emoticons
    u"(\ud83c[\udf00-\uffff])|"  # symbols & pictographs (1 of 2)
    u"(\ud83d[\u0000-\uddff])|"  # symbols & pictographs (2 of 2)
    u"(\ud83d[\ude80-\udeff])|"  # transport & map symbols
    u"(\ud83c[\udde0-\uddff])|"  # flags (iOS)
    # NOTE(review): the next two alternatives are project-specific and, per
    # the original author, may be deleted. As written they also strip braces
    # (with an optional dash on either side) and the characters 'R', ' ',
    # 'e', 'f' and 'n' -- confirm this is wanted for non-CJK input.
    u"((-{0,1}[{}]-{0,1}))|"  # needed by the original project; removable
    u"([R efn]|)"  # needed by the original project; removable
    "+", flags=re.UNICODE)


def remove_emoji(text):
    """Return *text* with everything matched by ``emoji_pattern`` removed."""
    return emoji_pattern.sub(r'', text)
# Print the translation progress of the chunks of a single text.
def printProcess(cnt, txt_len, tatal_size=None, error=None):
    """Show ``file completed <cnt>/<txt_len>`` on the current console line.

    Args:
        cnt: number of chunks handled so far.
        txt_len: total number of chunks in the file.
        tatal_size: accepted for compatibility with existing callers; unused.
        error: accepted for compatibility with existing callers; unused.

    The line ends with a carriage return so the next update overwrites it.
    Because no newline is written, the output is flushed explicitly;
    otherwise a line-buffered stdout may hold the progress text back.
    """
    content = "file completed " + str(cnt) + "/" + str(txt_len)
    print(content, end="\r", flush=True)
# Translate one piece of text into Simplified Chinese. A fresh Translator is
# created per request, as the original code did for every chunk.
def _translate_to_chinese(piece):
    # The original author translated into English here; this script is used
    # to turn Japanese into Chinese, hence dest='zh-CN'.
    return Translator().translate(text=piece, dest='zh-CN').text


# When a short text fails to translate, bisect it to locate the failing part
# and drop the sentences that cannot be translated.
def binarySearch(text, translate=None):
    """Translate as much of *text* as possible after a failed request.

    The text is split at the first "。" found at or after its midpoint (the
    separator itself is dropped). Each half is translated; a half that
    still fails is bisected again recursively. A piece with no usable split
    point is discarded.

    Args:
        text: the text whose translation raised an error.
        translate: optional callable ``str -> str`` used for each piece;
            defaults to a googletrans request with ``dest='zh-CN'``.

    Returns:
        List of translated pieces in their original order (possibly empty).
    """
    if translate is None:
        translate = _translate_to_chinese
    mid = len(text) // 2
    split_index = text.find("。", mid)
    if split_index == -1 or split_index == 0:
        return []
    # Both halves are strictly shorter than text, so the recursion ends.
    halves = (text[:split_index], text[split_index + 1:])
    result = []
    for piece in halves:
        if not piece:
            # Nothing to translate (the separator was the last character).
            continue
        try:
            result.append(translate(piece))
        except Exception:
            result.extend(binarySearch(piece, translate))
    return result


# Translate the text.
def getTranslateTextList(txt):
    """Translate every chunk in *txt* and return the translated pieces.

    Args:
        txt: list of text chunks (as produced by ``getText``).

    Returns:
        List of translated strings in chunk order. A chunk whose request
        fails is replaced by whatever ``binarySearch`` can still translate.

    Only the translation request itself is guarded by ``try``, so a failure
    in the progress display cannot send an already translated chunk through
    the fallback a second time.
    """
    result = []
    time.sleep(1)
    error = 0  # number of chunks that needed the bisecting fallback
    txtsize = 0
    total = len(txt)
    for cnt, text in enumerate(txt, start=1):
        text = remove_emoji(text)
        txtsize += len(text)
        try:
            translated = _translate_to_chinese(text)
        except Exception:
            error += 1
            result.extend(binarySearch(text))
        else:
            result.append(translated)
        printProcess(cnt, total, txtsize, error)
        # Random pause: a fixed interval might still be detected by Google.
        time.sleep(1.2 + random.random())
    return result
## Script entry point: translate every file found in the source directory.
for file in tqdm(files):
    # os.listdir returns bare names, so the directory test must use the full
    # path; testing the bare name would resolve it against the CWD instead.
    source_path = os.path.join(path, file)
    if os.path.isdir(source_path):
        continue
    try:
        # The output file is named after the English translation of the
        # source file name. This request is inside the try block so that one
        # failed title lookup skips this file instead of ending the batch.
        title = Translator().translate(text=file, dest='en').text
        # A translated title may contain a path separator; keep it a plain
        # file name inside the output directory.
        title = title.replace(os.sep, "_")
        with open(source_path, 'r', encoding='utf-16', errors='ignore') as f:
            string = f.read()
        txt = getText(string)
        print("analysis:"+title)
        result = getTranslateTextList(txt)
        save2file(title, result)
    except Exception as e:
        print(str(e))
        continue
    # Long pause between files so Google does not block the IP; lower it if
    # your situation allows. With 3 seconds the IP was banned fairly quickly,
    # with 10 there were no problems.
    # NOTE(review): the original indentation was lost; the pause is assumed
    # to belong inside the loop, after each successfully processed file.
    time.sleep(10)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment