hankcs · November 23, 2017 03:26
diff --git a/OOV.py b/OOV.py
 # -*- coding:utf-8 -*-
 # Filename: OOV.py
 # Author：hankcs
 # Date: 2017-11-21 17:51

 def load_words(path, dict):
     """Add every whitespace-separated token in the file at *path* to *dict*.

     :param path: path of a text file, read line by line as UTF-8.
     :param dict: set-like collection that receives the tokens through its
         ``update`` method; it is mutated in place.  The name shadows the
         built-in ``dict`` but is kept so keyword callers keep working.
     :return: ``None``; the result is the mutation of *dict*.
     :raises OSError: if the file cannot be opened.
     :raises UnicodeDecodeError: if the file is not valid UTF-8.
     """
     # Pin the encoding: the locale default is not UTF-8 on every platform,
     # and the corpus text must decode the same way everywhere.
     with open(path, encoding='utf-8') as src:
         for line in src:
             # split() with no argument drops the newline and yields nothing
             # for blank lines, so empty tokens are never added.
             dict.update(line.split())


 # Input locations, relative to the working directory.
 gold_file = 'data/datasets/sighan2005-pku/test.txt'
 train_file = 'data/datasets/sighan2005-pku/train.txt'
 dev_file = 'data/datasets/sighan2005-pku/dev.txt'
 embeddings_file = 'data/embeddings/news_tensite.pku.words.w2v50'

 # One vocabulary set per source; embeddings_dict is filled further down.
 gold_dict, train_dict, embeddings_dict = set(), set(), set()

 # Test-set words form the gold vocabulary.
 load_words(gold_file, gold_dict)

 # Train and dev words are pooled into the same set.
 load_words(train_file, train_dict)
 load_words(dev_file, train_dict)
 # Collect the embeddings vocabulary: the first whitespace-separated token
 # of every non-blank line.
 # errors='replace' lets the scan continue past undecodable bytes; wrapping
 # the whole loop in `except UnicodeDecodeError: pass` would stop at the
 # first bad line and silently drop the rest of the vocabulary.
 # The handle gets its own name so `embeddings_file` keeps holding the path.
 # NOTE(review): if the file begins with a word2vec "count dim" header line,
 # its first token is added as well - confirm whether that matters.
 with open(embeddings_file, encoding='utf-8', errors='replace') as embeddings_src:
     for line in embeddings_src:
         fields = line.split()
         if fields:  # a blank line has no first token
             embeddings_dict.add(fields[0])

 # Distinct test-set words that never occur in train/dev.
 print('Test Word OOV Type : %d' % len(gold_dict.difference(train_dict)))
 # Same count after also discounting words covered by the embeddings;
 # difference(a, b) removes everything found in either a or b.
 print('Test Word OOV Type (+WE) : %d'
       % len(gold_dict.difference(train_dict, embeddings_dict)))
	# -*- coding:utf-8 -*-
	# Filename: OOV.py
	# Author：hankcs
	# Date: 2017-11-21 17:51

	def load_words(path, dict):
	    """Add every whitespace-separated token in the file at *path* to *dict*.

	    :param path: path of a text file, read line by line as UTF-8.
	    :param dict: set-like collection that receives the tokens through its
	        ``update`` method; it is mutated in place.  The name shadows the
	        built-in ``dict`` but is kept so keyword callers keep working.
	    :return: ``None``; the result is the mutation of *dict*.
	    :raises OSError: if the file cannot be opened.
	    :raises UnicodeDecodeError: if the file is not valid UTF-8.
	    """
	    # Pin the encoding: the locale default is not UTF-8 on every platform.
	    with open(path, encoding='utf-8') as src:
	        for line in src:
	            # split() with no argument yields nothing for blank lines.
	            dict.update(line.split())


	# Input locations, relative to the working directory.
	gold_file = 'data/datasets/sighan2005-pku/test.txt'
	train_file = 'data/datasets/sighan2005-pku/train.txt'
	dev_file = 'data/datasets/sighan2005-pku/dev.txt'
	embeddings_file = 'data/embeddings/news_tensite.pku.words.w2v50'

	# One vocabulary set per source; embeddings_dict is filled further down.
	gold_dict, train_dict, embeddings_dict = set(), set(), set()

	# Test-set words form the gold vocabulary.
	load_words(gold_file, gold_dict)

	# Train and dev words are pooled into the same set.
	load_words(train_file, train_dict)
	load_words(dev_file, train_dict)
	# Collect the embeddings vocabulary: the first whitespace-separated token
	# of every non-blank line.
	# errors='replace' lets the scan continue past undecodable bytes instead
	# of ending at the first bad line and dropping the rest of the vocabulary.
	# The handle gets its own name so `embeddings_file` keeps holding the path.
	with open(embeddings_file, encoding='utf-8', errors='replace') as embeddings_src:
	    for line in embeddings_src:
	        fields = line.split()
	        if fields:  # a blank line has no first token
	            embeddings_dict.add(fields[0])

	# Distinct test-set words that never occur in train/dev.
	print('Test Word OOV Type : %d' % len(gold_dict - train_dict))
	# Same count after also discounting words covered by the embeddings.
	# The set-union operator is a plain `|`; a backslash before it is a
	# syntax error in Python.
	print('Test Word OOV Type (+WE) : %d' % len(gold_dict - (train_dict | embeddings_dict)))