Created
August 1, 2017 12:12
-
-
Save lttzzlll/ff7ae65382954f7a291a2cee600ac423 to your computer and use it in GitHub Desktop.
task 2
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
''' | |
Python Training | |
https://microsoft.sharepoint.com/teams/stca/ipe/sr/_layouts/15/WopiFrame.aspx?sourcedoc={57bcddd2-341f-4151-84f7-f332fca4d07a}&action=edit&wd=target%28LearningCorner%2Eone%7CDF7F96A9-2186-462F-A0EC-8772881176AA%2FPerl%20and%20C%23%20Training%7C1C6AD0B9-765A-4DD5-A3C7-0822927FC9D1%2F%29 | |
''' | |
import pandas as pd | |
from operator import itemgetter | |
import argparse | |
# Default input path, used by the command line when --src is omitted.
SRC_FILE_NAME = 'Desktop_Merino_ThresholdCortana_Train_de-de_Li_1.hyp'
# Default output path, used by the command line when --des is omitted.
DES_FILE_NAME = 'task2_output.txt'
def run(src, des):
    '''
    Count word frequencies in one column of a tab-separated file.

    Reads ``src`` as gb18030-encoded, tab-separated text (pandas treats the
    first line as the header), splits every cell of the column at position 8
    on single spaces, and writes one ``word count`` line per distinct token
    to ``des``, most frequent first. Tokens with equal counts keep the order
    in which they were first seen.

    Args:
        src: path of the input file.
        des: path of the output file; it is opened in 'w' mode, so existing
            content is replaced.

    Returns:
        None. The result is the file written to ``des``.
    '''
    data = pd.read_csv(src, sep='\t', encoding='gb18030')

    word_counts = {}
    # Select the column by position: integer keys on a label-indexed row
    # (the old ``item[8]``) are deprecated in pandas.
    for cell in data.iloc[:, 8]:
        # str() keeps the original handling of non-string cells, so a missing
        # cell is counted as the token 'nan'.
        # NOTE(review): confirm whether empty cells should be skipped instead.
        for word in str(cell).split(' '):
            word_counts[word] = word_counts.get(word, 0) + 1

    # sorted() is stable, so ties stay in first-seen order.
    ranked = sorted(word_counts.items(), key=itemgetter(1), reverse=True)

    # Explicit UTF-8 so non-ASCII words do not depend on the locale encoding.
    with open(des, 'w', encoding='utf-8') as f:
        for word, count in ranked:
            # Plain %-formatting only: running the finished line through
            # str.format would fail on words containing '{' or '}'.
            f.write('%s %d\n' % (word, count))
if __name__ == '__main__':
    # Command-line entry point: both paths are optional and fall back to the
    # module-level defaults.
    cli = argparse.ArgumentParser(description='process file')
    cli.add_argument(
        '--src',
        type=str,
        default=SRC_FILE_NAME,
        help='source file path',
    )
    cli.add_argument(
        '--des',
        type=str,
        default=DES_FILE_NAME,
        help='destination file path',
    )
    options = cli.parse_args()
    run(src=options.src, des=options.des)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment