# Tokenize Chinese with CkipTagger
# Reference source of the code: https://github.com/ckiplab/ckiptagger
# Install dependencies
## !pip install -U ckiptagger[gdown]
## !pip install -U ckiptagger[tfgpu]  # for TensorFlow GPU
import os
from ckiptagger import data_utils, WS, POS, NER
# Download the model files to ./data.zip (2GB) and extract them to ./data/
## data_utils.download_data_url("./")  # iis-ckip (Option 1)
data_utils.download_data_gdown("./")  # gdrive-ckip (Option 2)
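# A minimal guard, not part of the original gist: skip the ~2GB download when
# ./data/ already exists from a previous run. The path check is an assumption
# that the archive was extracted to ./data/ as described above.
## if not os.path.isdir("./data"):
##     data_utils.download_data_gdown("./")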
# To use GPU:
## 1. Install tensorflow-gpu (see Installation)
## 2. Set the CUDA_VISIBLE_DEVICES environment variable, e.g.
### os.environ["CUDA_VISIBLE_DEVICES"] = "0"
## 3. Set disable_cuda=False, e.g.
### ws = WS("./data", disable_cuda=False)
# To use CPU:
ws = WS("./data")
pos = POS("./data")
ner = NER("./data")
# Define a basic tokenizer
def tokenize(text):
    res = ws([text])  # The argument has to be a list of strings.
    return res[0]  # Index 0 is needed because the input was a single-element list.
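# A hedged usage sketch: the sentence is a sample input of my choosing, and the
# output shown is illustrative of the segmentation format documented below.
## print(tokenize("中央流行疫情指揮中心今日表示"))
### ['中央', '流行', '疫情', '指揮', '中心', '今日', '表示']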
# Define the main function
def ckip(doc_list):
    word_sentence_list = ws(
        doc_list,
        # sentence_segmentation = True,  # To consider delimiters
        # segment_delimiter_set = {",", "。", ":", "?", "!", ";"},  # This is the default set of delimiters
        # recommend_dictionary = dictionary1,  # Words in this dictionary are encouraged (see the construct_dictionary sketch below).
        # coerce_dictionary = dictionary2,  # Words in this dictionary are forced (see the construct_dictionary sketch below).
    )
    pos_sentence_list = pos(word_sentence_list)
    ner_sentence_list = ner(word_sentence_list, pos_sentence_list)
    return word_sentence_list, pos_sentence_list, ner_sentence_list
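# A sketch of building the dictionary1/dictionary2 objects referenced above,
# using the construct_dictionary helper from the ckiptagger package; the
# example words and weights are placeholders of my choosing.
## from ckiptagger import construct_dictionary
## word_to_weight = {"指揮中心": 1, "疫情": 1}
## dictionary1 = construct_dictionary(word_to_weight)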
# word_sentence_list is a list of lists, one per document, each containing the tokenized words, like
## [['中央', '流行', '疫情', '指揮', '中心', '今', '(21)', '日', '表示', ',', '我國', '近日', '接獲', '日本', '官方']]
# pos_sentence_list is a list of lists, one per document, each containing the POS tags, like
## [['Nc', 'VH', 'Na', 'VC', 'Nc', 'Nd', 'Neu', 'Nd', 'VE', 'COMMACATEGORY', 'Nc', 'Nd', 'VC', 'Nc', 'Na']]
# ner_sentence_list is a list of sets, one per document, each containing 4-tuples of (start index, end index, NER label, NER term), like
## [{(0, 10, 'ORG', '中央流行疫情指揮中心'), (21, 23, 'DATE', '近日'), (25, 27, 'NORP', '日本')}]
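# A hedged usage sketch showing how the three parallel outputs line up: zip a
# document's words with its POS tags to get (word, tag) pairs. The document
# string is a sample of my choosing.
## docs = ["中央流行疫情指揮中心今(21)日表示"]
## words, tags, entities = ckip(docs)
## for word, tag in zip(words[0], tags[0]):
##     print(word, tag)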
# Define a function that prints a document's named entities
def print_ner(single_doc):
    print(f'{single_doc}\n')
    ner_sentence_list = ckip([single_doc])[2]  # ckip() expects a list, so wrap the single doc.
    for entity in sorted(ner_sentence_list[0]):  # Without sorting, entities won't be printed in the order they occur in the doc.
        print(entity)
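# A hedged usage sketch: the document string is a sample of my choosing, and
# any printed tuples follow the ner_sentence_list format shown above.
## print_ner("中央流行疫情指揮中心今(21)日表示,我國近日接獲日本官方通知")
# When finished, the loaded models can be released to free memory:
## del ws
## del pos
## del ner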