Skip to content

Instantly share code, notes, and snippets.

@lttzzlll
Created August 4, 2017 05:40
Show Gist options
  • Save lttzzlll/685967cc898c18cc129517cfc1bc1860 to your computer and use it in GitHub Desktop.
Save lttzzlll/685967cc898c18cc129517cfc1bc1860 to your computer and use it in GitHub Desktop.
'''
Python Training
'''
import os
import codecs
import argparse
# import xml.etree.ElementTree as ET
from bs4 import BeautifulSoup
# Default directory scanned for input XML files; used as the default of the
# --input_path command-line option.
INPUT_PATH = 'All_xml'
# Default path of the report file that gets written; used as the default of
# the --output_path command-line option.
OUTPUT_PATH = 'task5_output_pcent.txt'
# Substrings that must all be present for a line to be treated as a word entry.
_WORD_LINE_MARKERS = ('Word Text', 'Pron', 'Confidence', 'BS', 'DR', 'Correct')


def _parse_word_line(line):
    '''
    Return (trans, utter) for a word-entry line, or None for any other line.

    trans is the first double-quoted value on the line and utter the second
    one (presumably the Text and Pron attributes -- confirm against real data).
    '''
    if not all(marker in line for marker in _WORD_LINE_MARKERS):
        return None
    # Split on the quote character instead of on '=': quoted values then sit
    # at the odd indexes, and a value that itself contains '=' stays intact.
    pieces = line.split('"')
    if len(pieces) < 5:
        # Fewer than two complete quoted values: the substring filter matched
        # something that is not a real word entry, so skip it rather than
        # abort the whole scan with an IndexError.
        return None
    return pieces[1], pieces[3]


def run(input_path, out_path):
    '''
    Count utterances in the MPS xml files and write their relative frequency.

    Every file in the directory input_path whose name ends with '.xml' and
    contains 'MPS' is read as UTF-16 and scanned line by line for word
    entries. Each entry increments the count of its utter value. Entries
    whose trans value contains 'bing' or 'xbox' additionally select their
    utter value for the report.

    The report is written to out_path as UTF-16, one line per selected utter
    value: the value, a tab, and its count divided by the total number of
    word entries, formatted with ten decimals.

    Each processed file name and the number of distinct utter values are
    printed to stdout. Errors from listing or opening files are not caught.
    '''
    utterance_counts = {}
    trans_utter = {}
    # Sorted so that "last entry wins" in trans_utter and the order of the
    # report lines do not depend on the arbitrary order of os.listdir().
    for filename in sorted(os.listdir(input_path)):
        if not (filename.endswith('.xml') and 'MPS' in filename):
            continue
        print(filename)
        with codecs.open(os.path.join(input_path, filename), 'r', encoding='utf-16') as f:
            # Iterate the stream directly; no need to hold the whole file.
            for line in f:
                parsed = _parse_word_line(line)
                if parsed is None:
                    continue
                trans, utter = parsed
                utterance_counts[utter] = utterance_counts.get(utter, 0) + 1
                if 'bing' in trans or 'xbox' in trans:
                    trans_utter[trans] = utter

    total = sum(utterance_counts.values())
    utter_pcent = {}
    # total cannot be zero inside this loop: trans_utter is only filled
    # together with utterance_counts.
    for utter in trans_utter.values():
        utter_pcent[utter] = float(utterance_counts[utter]) / total
    print(len(utterance_counts))
    with codecs.open(out_path, 'w', encoding='utf-16') as f:
        for utter, pcent in utter_pcent.items():
            f.write('\t'.join((utter, "{:10.10f}".format(pcent))) + '\n')
if __name__ == '__main__':
    # Command-line entry point: both paths are optional and fall back to the
    # module-level defaults INPUT_PATH and OUTPUT_PATH.
    parser = argparse.ArgumentParser(description='process file')
    # run() lists this path with os.listdir(), so it must be a directory.
    parser.add_argument('--input_path', type=str, default=INPUT_PATH,
                        help='directory containing the input xml files')
    parser.add_argument('--output_path', type=str, default=OUTPUT_PATH, help='output file path')
    args = parser.parse_args()
    run(args.input_path, args.output_path)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment