Skip to content

Instantly share code, notes, and snippets.

@hailiang-wang
Last active April 13, 2018 06:27
Show Gist options
  • Save hailiang-wang/dcf0647bad498bfad783b4b54d4e1d31 to your computer and use it in GitHub Desktop.
HIT LTP Parser
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#===============================================================================
#
# Copyright (c) 2017 <> All Rights Reserved
#
#
# File: /Users/hain/tmp/ltp_parser.py
# Author: Hai Liang Wang
# Date: 2018-04-12:18:49:38
#
#===============================================================================
"""
"""
from __future__ import print_function
from __future__ import division
__copyright__ = "Copyright (c) 2017 . All Rights Reserved"
__author__ = "Hai Liang Wang"
__date__ = "2018-04-12:18:49:38"
import os
import sys

# Make sibling modules importable when the script is run directly.
curdir = os.path.dirname(os.path.abspath(__file__))
sys.path.append(curdir)

if sys.version_info[0] < 3:
    # Python 2: force utf-8 as the process-wide default codec so implicit
    # str<->unicode conversions below don't raise UnicodeDecodeError.
    reload(sys)
    sys.setdefaultencoding("utf-8")
    # raise "Must be using Python 3"
else:
    # Python 3: the `unicode` builtin is gone; alias it to `str` so the
    # py2-style helpers further down keep working.
    unicode = str

from pyltp import SentenceSplitter, Segmentor, Postagger, Parser, NamedEntityRecognizer, SementicRoleLabeller, VectorOfString
import jieba
import jieba.posseg as tokenizer

# Load a user dictionary so domain terms segment as single tokens.
# NOTE(review): hard-coded absolute path — confirm it exists on the target machine.
jieba.load_userdict("/Users/hain/tmp/jianxin.vocab.utf8")

# Get ENV — snapshot of the process environment (not used elsewhere in this file).
ENVIRON = os.environ.copy()
def any2utf8(text, errors='strict', encoding='utf8'):
    """Convert a string (unicode or bytestring in `encoding`), to bytestring in utf8.

    Bytestring input takes a full bytes -> unicode -> utf8 round trip so the
    returned bytes are guaranteed to be valid utf8.

    Args:
        text: a unicode string, or a bytestring encoded as `encoding`.
        errors: codec error handling, passed to the decode step.
        encoding: encoding of *text* when it is a bytestring.

    Returns:
        The text as utf8-encoded bytes.
    """
    # Checking for `bytes` (which is `str` on Python 2) instead of the
    # py2-only `unicode` builtin keeps this helper self-contained on both
    # Python 2 and Python 3, with identical behavior for string inputs.
    if isinstance(text, bytes):
        return text.decode(encoding, errors).encode('utf8')
    return text.encode('utf8')


to_utf8 = any2utf8
def any2unicode(text, encoding='utf8', errors='strict'):
    """Convert a string (bytestring in `encoding` or unicode), to unicode.

    Args:
        text: a unicode string (returned unchanged), or a bytestring
            encoded as `encoding`.
        encoding: encoding of *text* when it is a bytestring.
        errors: codec error handling, passed to the decode step.

    Returns:
        The text as a unicode string (`str` on Python 3).
    """
    # `bytes` is `str` on Python 2, so this one check covers py2 bytestrings
    # and py3 bytes alike without relying on the module-level `unicode` shim.
    if isinstance(text, bytes):
        return text.decode(encoding, errors)
    return text


to_unicode = any2unicode
# Alternative sample paragraph, kept for reference:
# paragraph = '中国进出口银行与中国银行加强合作。中国进出口银行与中国银行加强合作!'
paragraph = '在购买建信安心保本二号混合时为何老是提示补充个人资料?'
# Only the first sentence of the paragraph is parsed in the test below.
sentence = SentenceSplitter.split(paragraph)[0]

from absl import flags  # absl-py
from absl import logging  # absl-py
FLAGS = flags.FLAGS
# Declared but not read anywhere in this file; kept for CLI compatibility.
flags.DEFINE_string('echo', None, 'Text to echo.')
import unittest
# run testcase: python /Users/hain/tmp/ltp_parser.py Test.testExample
class Test(unittest.TestCase):
    '''
    Run one sample sentence through the full pyltp pipeline: word
    segmentation, POS tagging, named-entity recognition, dependency
    parsing and semantic role labelling, printing each stage's output.
    '''

    def setUp(self):
        # Word segmenter, loaded together with a user lexicon so that domain
        # terms (e.g. fund names) come out as single tokens.
        self.segmentor = Segmentor()
        # Lexicon format: one word per line, columns separated by \t; only the
        # first column (the word) is used, the rest are ignored.
        self.segmentor.load_with_lexicon("/Users/hain/tmp/ltp_data_v3.4.0/cws.model", "/Users/hain/ai/pyltp/jianxin.vocab.utf8")
        # self.segmentor.load("/Users/hain/tmp/ltp_data_v3.4.0/cws.model")
        self.postagger = Postagger()
        self.postagger.load("/Users/hain/tmp/ltp_data_v3.4.0/pos.model")
        self.recognizer = NamedEntityRecognizer()
        self.recognizer.load("/Users/hain/tmp/ltp_data_v3.4.0/ner.model")
        self.parser = Parser()
        self.parser.load("/Users/hain/tmp/ltp_data_v3.4.0/parser.model")
        self.labeller = SementicRoleLabeller()
        self.labeller.load("/Users/hain/tmp/ltp_data_v3.4.0/pisrl.model")

    def tearDown(self):
        # Release the native model memory held by each pyltp component.
        self.segmentor.release()
        self.postagger.release()
        self.parser.release()
        self.recognizer.release()
        self.labeller.release()

    def test_parser(self):
        logging.info("test_parser")
        # Alternative: use pyltp's own segmenter/POS-tagger instead of jieba.
        # words = self.segmentor.segment(sentence)
        # postags = self.postagger.postag(words)
        x = tokenizer.cut(sentence)
        # pyltp's native bindings take VectorOfString, not plain Python lists;
        # jieba's word/flag pairs are re-encoded to utf8 bytestrings here.
        words, postags = VectorOfString(), VectorOfString()
        for y in x:
            words.append(any2utf8(y.word))
            postags.append(any2utf8(y.flag))
        print(" ".join(words))
        # Dependency arcs, named-entity tags, and semantic roles for the sentence.
        arcs = self.parser.parse(words, postags)
        netags = self.recognizer.recognize(words, postags)
        roles = self.labeller.label(words, postags, arcs)
        print("\t".join(words))
        print("\t".join(postags))
        # print("\t".join("%d:%s" % (arc.head, arc.relation) for arc in arcs))
        # Emit the parse in CoNLL-U format: one token per line, ten
        # tab-separated columns; unused columns are filled with "_".
        print("Parser, CoNLL-U Format\n\n# text = %s " % sentence)
        for x,y in enumerate(arcs):
            # print(x + 1, "word: %s, tag: %s, head: %s, label: %s" % (words[x], postags[x], y.head, y.relation))
            # Columns: ID FORM LEMMA UPOS XPOS FEATS HEAD DEPREL DEPS MISC
            # (the word doubles as its own lemma; sample output below shows
            # head 0 paired with the HED relation, i.e. the sentence root).
            print("%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s" % (
                x + 1,
                words[x],
                words[x],
                postags[x],
                "_",
                "_",
                y.head,
                y.relation,
                "_",
                "_"))
        print("\n\n")
        print("\t".join(netags))
        # role.index is the predicate's word position; each argument prints as
        # name:(start,end) over word positions.
        for role in roles:
            print(role.index, "".join(
                ["%s:(%d,%d)" % (arg.name, arg.range.start, arg.range.end) for arg in role.arguments]))

    '''
[dynet] random seed: 155442246
[dynet] allocating memory: 2000MB
[dynet] memory allocation done.
I0412 19:35:32.704823 140735696544640 ltp_parser.py:82] test_parser
中国 进出口 银行 与 中国银行 加强 合作 。
ns v n p ni v v wp
3:ATT 3:ATT 6:SBV 6:ADV 4:POB 0:HED 6:VOB 6:WP
B-Ni I-Ni E-Ni O S-Ni O O O
5 A0:(0,2)A1:(6,6)
.
----------------------------------------------------------------------
Ran 1 test in 4.143s
OK
依存关系的 label 解释
SBV 主谓关系 subject-verb 我送她一束花 (我 <-- 送)
VOB 动宾关系 直接宾语,verb-object 我送她一束花 (送 --> 花)
IOB 间宾关系 间接宾语,indirect-object 我送她一束花 (送 --> 她)
FOB 前置宾语 前置宾语,fronting-object 他什么书都读 (书 <-- 读)
DBL 兼语 double 他请我吃饭 (请 --> 我)
ATT 定中关系 attribute 红苹果 (红 <-- 苹果)
ADV 状中结构 adverbial 非常美丽 (非常 <-- 美丽)
CMP 动补结构 complement 做完了作业 (做 --> 完)
COO 并列关系 coordinate 大山和大海 (大山 --> 大海)
POB 介宾关系 preposition-object 在贸易区内 (在 --> 内)
LAD 左附加关系 left adjunct 大山和大海 (和 <-- 大海)
RAD 右附加关系 right adjunct 孩子们 (孩子 --> 们)
IS 独立结构 independent structure 两个单句在结构上彼此独立
WP 标点符号 punctuation 标点符号
HED 核心关系 head 指整个句子的核心
展示Tree:http://blog.chatbot.io/conllu.js/
'''
def test():
    # Delegate to unittest's CLI runner; supports selecting a single case,
    # e.g. `python ltp_parser.py Test.test_parser`.
    unittest.main()
if __name__ == '__main__':
    # Initialize absl flags before logging is used in the tests.
    FLAGS([__file__, '--verbosity', '1'])  # DEBUG 1; INFO 0; WARNING -1
    test()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment