HIT LTP Parser
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#===============================================================================
#
# Copyright (c) 2017 <> All Rights Reserved
#
#
# File: /Users/hain/tmp/ltp_parser.py
# Author: Hai Liang Wang
# Date: 2018-04-12:18:49:38
#
#===============================================================================
"""
Dependency parsing demo with HIT's LTP (pyltp): segment a Chinese sentence,
POS-tag it, then run dependency parsing, named-entity recognition and
semantic role labelling, printing the parse in CoNLL-U format.
"""
from __future__ import print_function
from __future__ import division

__copyright__ = "Copyright (c) 2017 . All Rights Reserved"
__author__ = "Hai Liang Wang"
__date__ = "2018-04-12:18:49:38"

import os
import sys
curdir = os.path.dirname(os.path.abspath(__file__))
sys.path.append(curdir)

if sys.version_info[0] < 3:
    reload(sys)
    sys.setdefaultencoding("utf-8")
    # raise "Must be using Python 3"
else:
    unicode = str
# note: "SementicRoleLabeller" is spelled this way in pyltp itself
from pyltp import SentenceSplitter, Segmentor, Postagger, Parser, NamedEntityRecognizer, SementicRoleLabeller, VectorOfString
import jieba
import jieba.posseg as tokenizer
jieba.load_userdict("/Users/hain/tmp/jianxin.vocab.utf8")

# Get ENV
ENVIRON = os.environ.copy()
def any2utf8(text, errors='strict', encoding='utf8'):
    """Convert a string (unicode or bytestring in `encoding`), to bytestring in utf8."""
    if isinstance(text, unicode):
        return text.encode('utf8')
    # do bytestring -> unicode -> utf8 full circle, to ensure valid utf8
    return unicode(text, encoding, errors=errors).encode('utf8')
to_utf8 = any2utf8

def any2unicode(text, encoding='utf8', errors='strict'):
    """Convert a string (bytestring in `encoding` or unicode), to unicode."""
    if isinstance(text, unicode):
        return text
    return unicode(text, encoding, errors=errors)
to_unicode = any2unicode
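
# For illustration (not part of the original gist) -- under Python 2, where
# `unicode` is the builtin type:
#   to_utf8(u'中国')                          # -> '\xe4\xb8\xad\xe5\x9b\xbd'
#   to_unicode('\xe4\xb8\xad\xe5\x9b\xbd')    # -> u'中国'
# Under Python 3 (`unicode = str` above), to_utf8 returns a `bytes` object.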

# paragraph = '中国进出口银行与中国银行加强合作。中国进出口银行与中国银行加强合作!'
paragraph = '在购买建信安心保本二号混合时为何老是提示补充个人资料?'
sentence = SentenceSplitter.split(paragraph)[0]
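# SentenceSplitter.split cuts a paragraph into a sequence of sentences on
# sentence-ending punctuation; the active paragraph above holds a single
# sentence, so index 0 is the whole question.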
from absl import flags  # absl-py
from absl import logging  # absl-py
FLAGS = flags.FLAGS
flags.DEFINE_string('echo', None, 'Text to echo.')  # defined but unused in this demo
import unittest

# run testcase: python /Users/hain/tmp/ltp_parser.py Test.test_parser
class Test(unittest.TestCase):
    '''
    Drive the LTP pipeline: load the models in setUp, release them in tearDown.
    '''

    def setUp(self):
        self.segmentor = Segmentor()
        # Each line of the lexicon is one word; columns are tab-separated;
        # the first column is the word, the rest are ignored.
        self.segmentor.load_with_lexicon("/Users/hain/tmp/ltp_data_v3.4.0/cws.model", "/Users/hain/ai/pyltp/jianxin.vocab.utf8")
        # self.segmentor.load("/Users/hain/tmp/ltp_data_v3.4.0/cws.model")
        self.postagger = Postagger()
        self.postagger.load("/Users/hain/tmp/ltp_data_v3.4.0/pos.model")
        self.recognizer = NamedEntityRecognizer()
        self.recognizer.load("/Users/hain/tmp/ltp_data_v3.4.0/ner.model")
        self.parser = Parser()
        self.parser.load("/Users/hain/tmp/ltp_data_v3.4.0/parser.model")
        self.labeller = SementicRoleLabeller()
        self.labeller.load("/Users/hain/tmp/ltp_data_v3.4.0/pisrl.model")

    def tearDown(self):
        self.segmentor.release()
        self.postagger.release()
        self.parser.release()
        self.recognizer.release()
        self.labeller.release()
    def test_parser(self):
        logging.info("test_parser")
        # words = self.segmentor.segment(sentence)
        # postags = self.postagger.postag(words)
        # tokenize with jieba instead of LTP's segmentor; jieba also yields
        # a POS flag per token
        pairs = tokenizer.cut(sentence)
        words, postags = VectorOfString(), VectorOfString()
        for pair in pairs:
            words.append(any2utf8(pair.word))
            postags.append(any2utf8(pair.flag))
        print(" ".join(words))
        arcs = self.parser.parse(words, postags)
        netags = self.recognizer.recognize(words, postags)
        roles = self.labeller.label(words, postags, arcs)
        print("\t".join(words))
        print("\t".join(postags))
        # print("\t".join("%d:%s" % (arc.head, arc.relation) for arc in arcs))
        print("Parser, CoNLL-U Format\n\n# text = %s " % sentence)
        for i, arc in enumerate(arcs):
            # print(i + 1, "word: %s, tag: %s, head: %s, label: %s" % (words[i], postags[i], arc.head, arc.relation))
            print("%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s" % (
                i + 1,
                words[i],
                words[i],
                postags[i],
                "_",
                "_",
                arc.head,
                arc.relation,
                "_",
                "_"))
        print("\n\n")
        print("\t".join(netags))
        for role in roles:
            print(role.index, "".join(
                ["%s:(%d,%d)" % (arg.name, arg.range.start, arg.range.end) for arg in role.arguments]))
'''
Sample output (produced for the commented-out paragraph
'中国进出口银行与中国银行加强合作。...' above):

[dynet] random seed: 155442246
[dynet] allocating memory: 2000MB
[dynet] memory allocation done.
I0412 19:35:32.704823 140735696544640 ltp_parser.py:82] test_parser
中国 进出口 银行 与 中国银行 加强 合作 。
ns v n p ni v v wp
3:ATT 3:ATT 6:SBV 6:ADV 4:POB 0:HED 6:VOB 6:WP
B-Ni I-Ni E-Ni O S-Ni O O O
5 A0:(0,2)A1:(6,6)
.
----------------------------------------------------------------------
Ran 1 test in 4.143s
OK

Dependency relation labels:
SBV  subject-verb                     我送她一束花 (我 <-- 送)
VOB  verb-object (direct object)      我送她一束花 (送 --> 花)
IOB  indirect-object                  我送她一束花 (送 --> 她)
FOB  fronting-object (preposed)       他什么书都读 (书 <-- 读)
DBL  double (pivotal construction)    他请我吃饭 (请 --> 我)
ATT  attribute                        红苹果 (红 <-- 苹果)
ADV  adverbial                        非常美丽 (非常 <-- 美丽)
CMP  complement                       做完了作业 (做 --> 完)
COO  coordinate                       大山和大海 (大山 --> 大海)
POB  preposition-object               在贸易区内 (在 --> 内)
LAD  left adjunct                     大山和大海 (和 <-- 大海)
RAD  right adjunct                    孩子们 (孩子 --> 们)
IS   independent structure            two clauses structurally independent of each other
WP   punctuation
HED  head                             the root of the whole sentence

Tree visualization: http://blog.chatbot.io/conllu.js/
'''
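
# A minimal sketch (not part of the original gist): render one parsed
# sentence as a CoNLL-U string, mirroring the print loop in test_parser,
# e.g. for feeding the conllu.js viewer linked above. Call it with the
# `words`, `postags` and `arcs` built inside test_parser, for example
# `print(arcs_to_conllu(words, postags, arcs))`.
def arcs_to_conllu(words, postags, arcs):
    """Return the parse as CoNLL-U lines; the form doubles as the lemma."""
    rows = []
    for i, arc in enumerate(arcs):
        rows.append("\t".join([str(i + 1), words[i], words[i], postags[i],
                               "_", "_", str(arc.head), arc.relation, "_", "_"]))
    return "\n".join(rows)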

def test():
    unittest.main()

if __name__ == '__main__':
    FLAGS([__file__, '--verbosity', '1'])  # DEBUG 1; INFO 0; WARNING -1
    test()