# Tokenize Chinese with CkipTagger
# Reference source of the code: https://github.com/ckiplab/ckiptagger
# Install dependencies
## !pip install -U ckiptagger[gdown]
## !pip install -U ckiptagger[tfgpu]  # for TensorFlow GPU
import os
from ckiptagger import data_utils, WS, POS, NER
# Download the model files to ./data.zip (2GB) and extract them to ./data/
## data_utils.download_data_url("./")  # iis-ckip (Option 1)
data_utils.download_data_gdown("./")  # gdrive-ckip (Option 2)
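# A minimal guard, not part of the original gist: skip the ~2GB download when
# ./data/ already exists from a previous run. The path check is an assumption
# that the archive was extracted to ./data/ as described above.
## if not os.path.isdir("./data"):
##     data_utils.download_data_gdown("./")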
# To use GPU:
## 1. Install tensorflow-gpu (see Installation)
## 2. Set the CUDA_VISIBLE_DEVICES environment variable, e.g.
### os.environ["CUDA_VISIBLE_DEVICES"] = "0"
## 3. Set disable_cuda=False, e.g.
### ws = WS("./data", disable_cuda=False)
# To use CPU:
ws = WS("./data")
pos = POS("./data")
ner = NER("./data")
# Define a basic tokenizer
def tokenize(text):
    res = ws([text])  # The argument has to be a list of strings.
    return res[0]  # Index 0 is needed because the input was a single-element list.
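# A hedged usage sketch: the sentence is a sample input of my choosing, and the
# output shown is illustrative of the segmentation format documented below.
## print(tokenize("中央流行疫情指揮中心今日表示"))
### ['中央', '流行', '疫情', '指揮', '中心', '今日', '表示']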
# Define the main function
def ckip(doc_list):
    word_sentence_list = ws(
        doc_list,
        # sentence_segmentation = True,  # To consider delimiters
        # segment_delimiter_set = {",", "。", ":", "?", "!", ";"},  # This is the default set of delimiters
        # recommend_dictionary = dictionary1,  # Words in this dictionary are encouraged (see the construct_dictionary sketch below).
        # coerce_dictionary = dictionary2,  # Words in this dictionary are forced (see the construct_dictionary sketch below).
    )
    pos_sentence_list = pos(word_sentence_list)
    ner_sentence_list = ner(word_sentence_list, pos_sentence_list)
    return word_sentence_list, pos_sentence_list, ner_sentence_list
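# A sketch of building the dictionary1/dictionary2 objects referenced above,
# using the construct_dictionary helper from the ckiptagger package; the
# example words and weights are placeholders of my choosing.
## from ckiptagger import construct_dictionary
## word_to_weight = {"指揮中心": 1, "疫情": 1}
## dictionary1 = construct_dictionary(word_to_weight)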
# word_sentence_list is a list of lists, one per document, each containing the tokenized words, like
## [['中央', '流行', '疫情', '指揮', '中心', '今', '(21)', '日', '表示', ',', '我國', '近日', '接獲', '日本', '官方']]
# pos_sentence_list is a list of lists, one per document, each containing the POS tags, like
## [['Nc', 'VH', 'Na', 'VC', 'Nc', 'Nd', 'Neu', 'Nd', 'VE', 'COMMACATEGORY', 'Nc', 'Nd', 'VC', 'Nc', 'Na']]
# ner_sentence_list is a list of sets, one per document, each containing 4-tuples of (start index, end index, NER label, NER term), like
## [{(0, 10, 'ORG', '中央流行疫情指揮中心'), (21, 23, 'DATE', '近日'), (25, 27, 'NORP', '日本')}]
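# A hedged usage sketch showing how the three parallel outputs line up: zip a
# document's words with its POS tags to get (word, tag) pairs. The document
# string is a sample of my choosing.
## docs = ["中央流行疫情指揮中心今(21)日表示"]
## words, tags, entities = ckip(docs)
## for word, tag in zip(words[0], tags[0]):
##     print(word, tag)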
# Define a function that prints a document's named entities
def print_ner(single_doc):
    print(f'{single_doc}\n')
    ner_sentence_list = ckip([single_doc])[2]  # ckip() expects a list, so wrap the single doc.
    for entity in sorted(ner_sentence_list[0]):  # Without sorting, entities won't be printed in the order they occur in the doc.
        print(entity)
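# A hedged usage sketch: the document string is a sample of my choosing, and
# any printed tuples follow the ner_sentence_list format shown above.
## print_ner("中央流行疫情指揮中心今(21)日表示,我國近日接獲日本官方通知")
# When finished, the loaded models can be released to free memory:
## del ws
## del pos
## del ner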