Skip to content

Instantly share code, notes, and snippets.

@hankcs
Created November 23, 2017 03:26
Show Gist options
  • Save hankcs/317d7426afbef98bdae45a78d9ed483b to your computer and use it in GitHub Desktop.
Save hankcs/317d7426afbef98bdae45a78d9ed483b to your computer and use it in GitHub Desktop.
OOV recognition trick in convseg
# -*- coding:utf-8 -*-
# Filename: OOV.py
# Author:hankcs
# Date: 2017-11-21 17:51
def load_words(path, dict, encoding='utf-8'):
    """Add every whitespace-separated token of a text file to a set.

    Args:
        path: Path of the text file to read.
        dict: Mutable set that receives the tokens; it is updated in place.
            The name shadows the builtin but is kept so that existing
            callers (including keyword callers) keep working.
        encoding: Text encoding used to decode the file. Defaults to UTF-8
            so the result does not depend on the platform locale.

    Returns:
        None. The tokens are accumulated in ``dict``.
    """
    with open(path, encoding=encoding) as src:
        for line in src:
            # split() without arguments also drops the trailing newline and
            # yields an empty list for blank lines.
            dict.update(line.split())
# Input locations: the three sighan2005-pku corpus splits and the
# word-embedding file.
train_file = 'data/datasets/sighan2005-pku/train.txt'
dev_file = 'data/datasets/sighan2005-pku/dev.txt'
gold_file = 'data/datasets/sighan2005-pku/test.txt'
embeddings_file = 'data/embeddings/news_tensite.pku.words.w2v50'

# One word set per source; all start empty.
gold_dict, train_dict, embeddings_dict = set(), set(), set()

# Words of the test split go into gold_dict.
load_words(gold_file, gold_dict)

# Train and dev words are merged into the single train_dict set.
load_words(train_file, train_dict)
load_words(dev_file, train_dict)
# Collect the words covered by the embeddings file: the first
# whitespace-separated token of each line is taken as the word.
# errors='replace' lets the scan continue past undecodable bytes, so a single
# bad byte cannot silently drop the remaining words of the file.
with open(embeddings_file, encoding='utf-8', errors='replace') as embeddings_src:
    for line in embeddings_src:
        fields = line.split()
        if fields:  # a blank line carries no word
            embeddings_dict.add(fields[0])

# Distinct test words that do not occur in train/dev.
print('Test Word OOV Type : %d' % len(gold_dict - train_dict))
# Distinct test words found neither in train/dev nor in the embeddings file.
print('Test Word OOV Type (+WE) : %d' % len(gold_dict - (train_dict | embeddings_dict)))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment