Created
August 4, 2017 05:38
-
-
Save lttzzlll/89aa8f6ea05ef3907f4ee477a086f3f5 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
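'''
Python Training

Replace the text in column 8 of a tab-separated, UTF-16 ``.hyp`` file with the
``Text`` attribute found under the matching ``SpeechIn`` element of a UTF-16
XML result file, and write the result to a new ``.hyp`` file.
'''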
import argparse | |
import codecs | |
import xml.etree.ElementTree as ET | |
import pandas as pd | |
# Default file names, used when the matching command-line option is omitted.
# XML result file whose SpeechIn elements supply the replacement text.
INPUT_XML = 'CortanaLiveData_ja-JP_2015_Audio_test_Result.xml'
# Tab-separated hyp file to be updated.
INPUT_HYP = 'task3_output.hyp'
# Destination for the updated hyp file.
OUTPUT_HYP = 'task4_output.hyp'
def _load_transcriptions(input_xml):
    '''
    Build a ``{wave file key: text}`` mapping from the XML result file.

    For every ``SpeechIn`` element the key is its ``WaveFilePath`` attribute
    truncated at the first ``.``; the value is the ``Text`` attribute of the
    element's first great-grandchild (first child of first child of first
    child). The value is ``None`` when that node has no ``Text`` attribute.

    Raises ``IndexError`` if a ``SpeechIn`` element lacks that nested node and
    ``AttributeError`` if it has no ``WaveFilePath`` attribute.
    '''
    # The file is parsed with an explicit UTF-16 parser, as in the original.
    tree = ET.parse(input_xml, parser=ET.XMLParser(encoding='utf-16'))
    transcriptions = {}
    for speech_in in tree.iter('SpeechIn'):
        # NOTE(review): split('.')[0] cuts at the FIRST dot, so a dot inside a
        # directory name would shorten the key too - confirm paths never
        # contain one.
        wave_key = speech_in.get('WaveFilePath').split('.')[0]
        # Element.getchildren() was removed in Python 3.9; index the element
        # directly to reach the same node.
        transcriptions[wave_key] = speech_in[0][0][0].get('Text')
    return transcriptions


def run(input_xml, input_hyp, output_hyp):
    '''
    Replace column 8 of a hyp file with the text found in the XML file.

    input_xml  -- path of the UTF-16 XML file containing ``SpeechIn`` elements.
    input_hyp  -- path of the UTF-16, tab-separated, header-less hyp file.
    output_hyp -- path the updated hyp file is written to (UTF-16, one
                  tab-joined row per line, each terminated by a newline).

    A row is updated when ``<column 1>\\<column 0>`` equals a key taken from
    the XML file; every other row is written back unchanged.
    '''
    transcriptions = _load_transcriptions(input_xml)

    # Read every field as text and keep empty fields as '' so numeric-looking
    # or blank columns are written back verbatim instead of becoming
    # int/float/NaN values that '\t'.join() cannot handle.
    data = pd.read_csv(input_hyp, sep='\t', header=None, encoding='utf-16',
                       dtype=str, keep_default_na=False)

    keys = data[1].str.cat(data[0], sep='\\')
    for index, key in keys.items():
        text = transcriptions.get(key)
        if text is not None:
            # Assign through the frame itself: writing to the row yielded by
            # iterrows() may only modify a copy and be silently lost.
            data.at[index, 8] = text

    # newline='' disables newline translation, matching what codecs.open did
    # (codecs.open is reportedly deprecated in recent Python releases).
    with open(output_hyp, 'w', encoding='utf-16', newline='') as out:
        out.writelines('\t'.join(row) + '\n'
                       for row in data.itertuples(index=False, name=None))
def main(argv=None):
    '''
    Parse the command-line options and run the conversion.

    argv -- list of argument strings; ``None`` makes argparse read
            ``sys.argv[1:]``.

    Each option falls back to the corresponding module-level default
    (INPUT_XML, INPUT_HYP, OUTPUT_HYP) when it is not given.
    '''
    parser = argparse.ArgumentParser(description='process file')
    parser.add_argument('--input_xml', type=str, default=INPUT_XML,
                        help='input xml file')
    parser.add_argument('--input_hyp', type=str, default=INPUT_HYP,
                        help='input hyp file')
    parser.add_argument('--output_hyp', type=str, default=OUTPUT_HYP,
                        help='output hyp file')
    args = parser.parse_args(argv)
    run(args.input_xml, args.input_hyp, args.output_hyp)


if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment