Created
August 4, 2017 05:40
-
-
Save lttzzlll/685967cc898c18cc129517cfc1bc1860 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
'''
Python Training
'''
import argparse
import codecs
import os
from collections import Counter

# import xml.etree.ElementTree as ET
from bs4 import BeautifulSoup
# Default value of --input_path: the location passed to os.listdir() in run().
INPUT_PATH = 'All_xml'
# Default value of --output_path: the file run() writes its results to.
OUTPUT_PATH = 'task5_output_pcent.txt'
# Substrings that must all be present in a line for it to be parsed.
_WORD_LINE_MARKERS = ('Word Text', 'Pron', 'Confidence', 'BS', 'DR', 'Correct')


def _parse_word_line(line):
    '''
    Extract ``(trans, utter)`` from one line, or return ``None``.

    A line is considered only if it contains every marker substring. It is
    then split on ``=``; ``trans`` is the first double-quoted value in the
    second piece and ``utter`` the first double-quoted value in the third
    piece. NOTE(review): presumably these are the ``Word Text`` and ``Pron``
    attribute values -- confirm against real input files.

    Lines that contain the markers but do not have that layout are skipped
    (``None``) instead of raising ``IndexError``.
    '''
    if not all(marker in line for marker in _WORD_LINE_MARKERS):
        return None
    pieces = line.split('=')
    try:
        trans = pieces[1].split('"')[1]
        utter = pieces[2].split('"')[1]
    except IndexError:
        # Marker words present, but not in the attribute="value" shape.
        return None
    return trans, utter


def run(input_path, out_path):
    '''
    Count utterances in MPS XML files and write selected relative frequencies.

    Every file in ``input_path`` whose name ends with ``.xml`` and contains
    ``MPS`` is read as UTF-16. Each parsable line (see ``_parse_word_line``)
    adds one to the count of its ``utter`` value. When the line's ``trans``
    value contains ``bing`` or ``xbox``, ``trans`` is mapped to ``utter``
    (a later line with the same ``trans`` overwrites the earlier mapping).

    For every utterance kept in that mapping, one line
    ``<utter>\\t<count / total count of all utterances>`` is written to
    ``out_path`` (UTF-16), the ratio formatted with ``{:10.10f}``.

    Each processed filename and the number of distinct utterances are printed
    to stdout.

    Args:
        input_path: directory to scan for input files.
        out_path: path of the output file; it is overwritten.
    '''
    utterance_counts = Counter()
    trans_utter = {}
    # Sorted so the last-wins mapping and the output order are reproducible;
    # os.listdir() returns entries in arbitrary order.
    for filename in sorted(os.listdir(input_path)):
        if not (filename.endswith('.xml') and 'MPS' in filename):
            continue
        print(filename)
        with codecs.open(os.path.join(input_path, filename), 'r', encoding='utf-16') as f:
            for line in f:
                parsed = _parse_word_line(line)
                if parsed is None:
                    continue
                trans, utter = parsed
                utterance_counts[utter] += 1
                if 'bing' in trans or 'xbox' in trans:
                    trans_utter[trans] = utter

    total = sum(utterance_counts.values())
    # trans_utter is non-empty only if at least one utterance was counted,
    # so total is never zero when a division actually happens.
    utter_pcent = {
        utter: float(utterance_counts[utter]) / total
        for utter in trans_utter.values()
    }
    print(len(utterance_counts))
    with codecs.open(out_path, 'w', encoding='utf-16') as f:
        for utter, pcent in utter_pcent.items():
            f.write('\t'.join((utter, "{:10.10f}".format(pcent))) + '\n')
if __name__ == '__main__':
    # Command-line entry point: both options fall back to the module-level
    # defaults, then the parsed values are handed to run().
    parser = argparse.ArgumentParser(description='process file')
    parser.add_argument('--input_path', type=str, default=INPUT_PATH, help='input xml file path')
    parser.add_argument('--output_path', type=str, default=OUTPUT_PATH, help='output file path')
    args = parser.parse_args()
    run(args.input_path, args.output_path)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment