Created
February 24, 2021 15:10
-
-
Save cgpeter96/22f615505f6733cf4f187b3c37d8c88e to your computer and use it in GitHub Desktop.
街道抽取脚本(简易版)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
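@desc:
    en: a simple script for road-name extraction
    cn: 简单街道抽取脚本
@author: peter
@mail: [email protected]
@date: 2021/2/24
@note:
    该脚本可能存在问题,但由于目前数据就这么多所以就先这样吧,仅供参考。
    (en: this script may have problems, but it is left as is because this is
    all the data available for now; for reference only.)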
""" | |
import pkuseg | |
import jieba | |
from jieba import posseg | |
def tokenize_pku(word):
    """Segment ``word`` with the module-level pkuseg ``tokenizer``.

    Returns whatever ``tokenizer.cut`` produces for the given text.
    NOTE(review): the tokenizer is built with ``postag=True``, so this is
    presumably a list of (token, POS tag) pairs -- confirm against pkuseg.
    """
    tokens = tokenizer.cut(word)
    return tokens
def tokenize_jieba(word):
    """Segment ``word`` with jieba's ``posseg`` and return (token, flag) tuples."""
    pairs = []
    for item in posseg.cut(word):
        pairs.append((item.word, item.flag))
    return pairs
# Backend switch: True selects pkuseg, False selects jieba.
USE_PKU_TOKENIZER = True

# ``tokenize`` is the segmentation entry point used by the rest of the script.
tokenize = None
if not USE_PKU_TOKENIZER:
    # jieba's accuracy is limited.
    tokenize = tokenize_jieba
else:
    # tokenize_pku reads this module-level tokenizer instance.
    tokenizer = pkuseg.pkuseg(postag=True)
    tokenize = tokenize_pku
# Sample addresses, one entry per address.
data = [
    "向塘北大道西50米",
    "天龙路与龙华路交叉口北50米",
    "观澜大道490号附近",
    "成都市锦江区海椒市街13号附7号",
    "玉兰西路",
    "团结北路23号",
    "湖塘镇火炬北路12号",
    "昆明市晋宁区庄跷西路28",
    "金水路合作路28-1号",
    "长公大道浙江显家门业阆中总代理旁",
    "安阳街道岭下东路4号楼",
    "万顷沙珠江街珠江东路169号",
    "中央大街万达广场a座一层a17",
    "梅亭路18号民生银行旁",
    "北京市四川西路",
]

# Tokenizer output for every sample address, in the same order as ``data``.
pos_cands = list(map(tokenize, data))
# Substrings that mark a token as a street / avenue / road name.
road_keywords = ["街", "大道", "路"]


def check(word):
    """Return True when ``word`` contains any entry of ``road_keywords``."""
    return any(keyword in word for keyword in road_keywords)
def check_city(word):
    """Return True when ``word`` ends with an administrative-region suffix.

    The recognised suffixes are province, city, district, sub-district,
    county, village and town markers.
    """
    region_suffixes = ("省", "市", "区", "街道", "县", "村", "镇")
    # str.endswith accepts a tuple and reports whether any suffix matches.
    return word.endswith(region_suffixes)
def find_road(pos_cands, verbose=False):
    """Extract road-name candidates from one POS-tagged address.

    The tokens are scanned left to right. Tokens tagged ``ns`` that look
    like an administrative region (per ``check_city``) are skipped. The
    remaining tokens are glued together by hand-written rules over POS tags;
    the tag combinations the rules were written for are::

        n + n, v + ns, ns + n, ns + ns, j + n, or a single ns / n

    An ``n`` / ``ns`` token has to contain a road keyword (per ``check``)
    to complete a candidate.

    Args:
        pos_cands: list of ``(word, pos)`` tuples for one address,
            e.g. ``[("北京", "ns")]``.
        verbose: when true, print ``pos_cands`` before processing.

    Returns:
        The collected candidates that contain at least one road keyword,
        in the order they were collected.
    """
    noun_tags = ("ns", "n")
    # Tags allowed to start or extend the pending text.
    prefix_tags = ("v", "j", "n", "a", "ns")

    if verbose:
        print(pos_cands)

    res = []
    pre_pos = ""
    text = ""
    for word, pos in pos_cands:
        # Filter out region words (province, city, district, ...).
        if pos == "ns" and check_city(word):
            continue

        # Rules below were derived by inspecting the sample data.
        if pre_pos in prefix_tags and pos in noun_tags:
            if check(word):
                # A road word completes the pending candidate.
                text += word
                res.append(text)
                text = ""
                pre_pos = ""
            else:
                # Not a road word: restart the pending text from this token.
                text = word
                pre_pos = pos
        elif pos in noun_tags and check(word):
            # A road word with no usable predecessor stands on its own.
            res.append(word)
        elif pos in prefix_tags:
            text += word
            pre_pos = pos
        else:
            # Any other tag breaks the chain; the pending text is kept.
            pre_pos = ""

    # Flush whatever is still pending after the last token.
    if text:
        res.append(text)

    # Keep each candidate at most once per occurrence: testing with check()
    # avoids emitting the same candidate once for every keyword it contains.
    return [candidate for candidate in res if check(candidate)]
# Print the extracted road names for every sample address.
# To debug a single sample, call e.g. find_road(pos_cands[-4], True).
for cand in pos_cands:
    extracted = find_road(cand)
    print(extracted)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment