Skip to content

Instantly share code, notes, and snippets.

@lttzzlll
Created August 4, 2017 05:38
Show Gist options
  • Save lttzzlll/89aa8f6ea05ef3907f4ee477a086f3f5 to your computer and use it in GitHub Desktop.
Save lttzzlll/89aa8f6ea05ef3907f4ee477a086f3f5 to your computer and use it in GitHub Desktop.
'''
Python Training
'''
import argparse
import codecs
import xml.etree.ElementTree as ET
import pandas as pd
# Default file names, used when the matching command-line option is omitted.
# Default for --input_xml: the XML file whose SpeechIn elements are read.
INPUT_XML = 'CortanaLiveData_ja-JP_2015_Audio_test_Result.xml'
# Default for --input_hyp: the tab-separated hypothesis file to update.
INPUT_HYP = 'task3_output.hyp'
# Default for --output_hyp: where the updated hypothesis rows are written.
OUTPUT_HYP = 'task4_output.hyp'
def run(input_xml, input_hyp, output_hyp):
    '''
    Replace hypothesis texts with the transcriptions found in an XML file.

    For every ``SpeechIn`` element in ``input_xml`` the ``Text`` attribute of
    its first great-grandchild is stored under the element's ``WaveFilePath``
    truncated at the first ``'.'``. Each row of the tab-separated
    ``input_hyp`` file is then looked up with the key
    ``<column 1>\\<column 0>``; when a transcription exists, column 8 of that
    row is replaced by it. All rows are written to ``output_hyp``.

    Args:
        input_xml: path of the UTF-16 XML file to read.
        input_hyp: path of the UTF-16, tab-separated, header-less file with
            at least nine columns.
        output_hyp: path of the UTF-16, tab-separated file to write.

    Raises:
        AttributeError: if a ``SpeechIn`` element has no ``WaveFilePath``.
        IndexError: if a ``SpeechIn`` element lacks the nested child
            elements, or a matching row has fewer than nine columns.
    '''
    xml_parser = ET.XMLParser(encoding='utf-16')
    tree = ET.parse(input_xml, parser=xml_parser)

    transcriptions = {}
    for speech_in in tree.iter('SpeechIn'):
        # Keep everything before the first '.', i.e. drop the extension.
        wave_file_path = speech_in.get('WaveFilePath').split('.')[0]
        # Element.getchildren() was removed in Python 3.9; index the
        # element directly to reach its first great-grandchild.
        text = speech_in[0][0][0].get('Text')
        # A missing Text attribute must not put None into a row, otherwise
        # the join below fails; such rows keep their original text.
        if text is not None:
            transcriptions[wave_file_path] = text

    # Read every field as a plain string so numeric-looking or empty fields
    # are written back unchanged and can be joined with '\t'.
    data = pd.read_csv(input_hyp, sep='\t', header=None, encoding='utf-16',
                       dtype=str, keep_default_na=False)

    # Work on plain lists: assigning to the rows yielded by iterrows() is
    # not guaranteed to modify the underlying DataFrame.
    rows = data.values.tolist()
    for row in rows:
        key = '\\'.join((row[1], row[0]))
        if key in transcriptions:
            row[8] = transcriptions[key]

    with codecs.open(output_hyp, 'w', encoding='utf-16') as out_file:
        for row in rows:
            out_file.write('\t'.join(row) + '\n')
if __name__ == '__main__':
    # Command-line entry point: each option falls back to the matching
    # module-level default file name when it is not supplied.
    parser = argparse.ArgumentParser(description='process file')
    parser.add_argument('--input_xml', type=str, default=INPUT_XML,
                        help='input xml file')
    parser.add_argument('--input_hyp', type=str, default=INPUT_HYP,
                        help='input hyp file')
    parser.add_argument('--output_hyp', type=str, default=OUTPUT_HYP,
                        help='output hyp file')
    args = parser.parse_args()
    run(args.input_xml, args.input_hyp, args.output_hyp)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment