Skip to content

Instantly share code, notes, and snippets.

@lttzzlll
Created August 1, 2017 12:12
Show Gist options
  • Save lttzzlll/ff7ae65382954f7a291a2cee600ac423 to your computer and use it in GitHub Desktop.
Save lttzzlll/ff7ae65382954f7a291a2cee600ac423 to your computer and use it in GitHub Desktop.
task 2
'''
Python Training
https://microsoft.sharepoint.com/teams/stca/ipe/sr/_layouts/15/WopiFrame.aspx?sourcedoc={57bcddd2-341f-4151-84f7-f332fca4d07a}&action=edit&wd=target%28LearningCorner%2Eone%7CDF7F96A9-2186-462F-A0EC-8772881176AA%2FPerl%20and%20C%23%20Training%7C1C6AD0B9-765A-4DD5-A3C7-0822927FC9D1%2F%29
'''
import argparse
from collections import Counter
from operator import itemgetter

import pandas as pd
# Default input path, used when --src is not given on the command line.
SRC_FILE_NAME = 'Desktop_Merino_ThresholdCortana_Train_de-de_Li_1.hyp'
# Default output path, used when --des is not given on the command line.
DES_FILE_NAME = 'task2_output.txt'
def run(src, des):
    '''
    Count how often each word occurs in the ninth column of a TSV file.

    The source file is read as tab-separated, gb18030-encoded text whose
    first line is treated as the header (pandas default). Every cell of the
    column at position 8 is split on single spaces and the occurrences of
    each non-empty token are tallied. The result is written to ``des`` as
    UTF-8, one ``word count`` pair per line, most frequent first; words with
    equal counts keep the order in which they were first seen.

    Args:
        src: path of the tab-separated input file.
        des: path of the output file; overwritten if it already exists.

    Raises:
        IndexError: if the input has fewer than nine columns.
    '''
    # Read every column as text and disable NA detection so that tokens such
    # as "null", "NA" or "007" are counted verbatim instead of being turned
    # into NaN (and then the word "nan") or reformatted as numbers.
    data = pd.read_csv(src, sep='\t', encoding='gb18030',
                       dtype=str, keep_default_na=False)

    # Select the column by position explicitly; integer keys on a
    # label-indexed row (item[8]) rely on a deprecated pandas fallback.
    sentences = data.iloc[:, 8].dropna()

    word_counts = Counter()
    for sentence in sentences:
        # Blank cells and repeated spaces yield empty tokens; skip them.
        word_counts.update(token for token in sentence.split(' ') if token)

    # sorted() is stable (also with reverse=True), so ties stay in
    # first-seen order.
    ranked = sorted(word_counts.items(), key=itemgetter(1), reverse=True)

    # A fixed output encoding keeps the result independent of the locale.
    with open(des, 'w', encoding='utf-8') as f:
        # Plain %-formatting only: words are data and may contain '{' or '}',
        # so the finished line must not be passed through str.format().
        f.writelines('%s %d\n' % (word, count) for word, count in ranked)
if __name__ == '__main__':
    # Command-line entry point: both paths are optional and fall back to the
    # module-level default file names.
    cli = argparse.ArgumentParser(description='process file')
    for flag, fallback, text in (
        ('--src', SRC_FILE_NAME, 'source file path'),
        ('--des', DES_FILE_NAME, 'destination file path'),
    ):
        cli.add_argument(flag, type=str, default=fallback, help=text)
    options = cli.parse_args()
    run(src=options.src, des=options.des)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment