Skip to content

Instantly share code, notes, and snippets.

@cgpeter96
Created February 24, 2021 15:10
Show Gist options
  • Save cgpeter96/22f615505f6733cf4f187b3c37d8c88e to your computer and use it in GitHub Desktop.
Save cgpeter96/22f615505f6733cf4f187b3c37d8c88e to your computer and use it in GitHub Desktop.
街道抽取脚本(简易版)
"""
@desc:
en:a simple script for road name extraction
cn:简单街道抽取脚本
@author:peter
@mail:[email protected]
@date:2021/2/24
@note:
该脚本可能存在问题,但由于目前数据就这么多所以就先这样吧,仅供参考。
"""
import pkuseg
import jieba
from jieba import posseg
def tokenize_pku(word):
    """Tokenize ``word`` with the module-level pkuseg ``tokenizer``.

    ``tokenizer`` is only created when ``USE_PKU_TOKENIZER`` is true, so this
    function must not be called otherwise.

    Args:
        word: the text to segment.

    Returns:
        Whatever ``tokenizer.cut`` returns for ``word``.
    """
    tagged = tokenizer.cut(word)
    return tagged
def tokenize_jieba(word):
    """Tokenize ``word`` with jieba's ``posseg`` and return ``(word, flag)`` pairs.

    Args:
        word: the text to segment.

    Returns:
        list of ``(token.word, token.flag)`` tuples, one per item yielded by
        ``posseg.cut``.
    """
    pairs = []
    for token in posseg.cut(word):
        pairs.append((token.word, token.flag))
    return pairs
# Choose which segmenter backs ``tokenize``; pkuseg is used by default.
USE_PKU_TOKENIZER = True
tokenize = None
if not USE_PKU_TOKENIZER:
    # jieba's accuracy is limited.
    tokenize = tokenize_jieba
else:
    # pkuseg instance with POS tagging enabled; used by tokenize_pku.
    tokenizer = pkuseg.pkuseg(postag=True)
    tokenize = tokenize_pku
# Sample addresses, one per line; surrounding whitespace is stripped.
data = [
    line.strip()
    for line in """向塘北大道西50米
天龙路与龙华路交叉口北50米
观澜大道490号附近
成都市锦江区海椒市街13号附7号
玉兰西路
团结北路23号
湖塘镇火炬北路12号
昆明市晋宁区庄跷西路28
金水路合作路28-1号
长公大道浙江显家门业阆中总代理旁
安阳街道岭下东路4号楼
万顷沙珠江街珠江东路169号
中央大街万达广场a座一层a17
梅亭路18号民生银行旁
北京市四川西路""".split("\n")
]
# Tokenizer output for every sample address.
pos_cands = list(map(tokenize, data))
# Substrings that mark a token as a street / avenue / road name.
road_keywords = ["街", "大道", "路"]
def check(word):
    """Return True if ``word`` contains any entry of ``road_keywords``.

    Args:
        word: the token (or joined tokens) to test.

    Returns:
        bool: True when at least one road keyword is a substring of ``word``.
    """
    return any(keyword in word for keyword in road_keywords)
def check_city(word):
    """Return True if ``word`` ends with an administrative-region suffix.

    The suffixes tested are 省, 市, 区, 街道, 县, 村 and 镇.

    Args:
        word: the token to test.

    Returns:
        bool: True when ``word`` ends with one of the suffixes.
    """
    region_suffixes = ("省", "市", "区", "街道", "县", "村", "镇")
    # str.endswith accepts a tuple and matches if any suffix matches.
    return word.endswith(region_suffixes)
def find_road(pos_cands, verbose=False):
    """Extract road / street names from one POS-tagged address.

    Tokens are scanned left to right. Tokens tagged ``v``, ``j``, ``n``,
    ``a`` or ``ns`` are buffered as a possible name prefix. When an ``n`` or
    ``ns`` token containing a road keyword (see ``check``) follows such a
    token, the buffered text plus that token is emitted as one candidate.
    A road-keyword ``n``/``ns`` token with no preceding prefix token is
    emitted on its own. ``ns`` tokens that look like administrative regions
    (see ``check_city``) are skipped entirely.

    Tag combinations the rules were written for:
        n+n, v+ns, ns+n, ns+ns, j+n, ns, n
    where the ``n``/``ns`` token must contain a road keyword.

    Args:
        pos_cands: list of ``(word, pos)`` pairs, e.g. ``[("北京", "ns")]``.
        verbose: when true, print ``pos_cands`` before processing.

    Returns:
        list of str: the candidates that contain a road keyword, in the
        order they were found. Each collected candidate is returned once,
        even if it contains several road keywords.
    """
    # Tags that may start or extend a buffered prefix.
    prefix_tags = ("v", "j", "n", "a", "ns")
    # Tags allowed on the token that carries the road keyword.
    head_tags = ("ns", "n")

    found = []
    pre_pos = ""
    text = ""
    if verbose:
        print(pos_cands)
    for word, pos in pos_cands:
        # Drop region words (province, city, district, ...).
        if pos == "ns" and check_city(word):
            continue
        is_head = pos in head_tags
        if pre_pos in prefix_tags and is_head:
            if check(word):
                # Prefix + road word: emit the combined name and reset.
                found.append(text + word)
                text = ""
                pre_pos = ""
            else:
                # A non-road noun replaces (does not extend) the buffer.
                text = word
                pre_pos = pos
        elif is_head and check(word):
            # Road word with no usable prefix: emit it as-is.
            found.append(word)
        elif pos in prefix_tags:
            text += word
            pre_pos = pos
        else:
            # Any other tag breaks the prefix chain; the buffered text is
            # kept, matching the original rules.
            pre_pos = ""
    if text:
        found.append(text)
    # Keep only candidates containing a road keyword. Filtering with
    # check() yields each candidate once; looping over the keywords and
    # appending per match would duplicate a candidate with two keywords.
    return [candidate for candidate in found if check(candidate)]
# Run the extractor over every tokenized sample address and print the result.
for cand in pos_cands:
    extracted = find_road(cand)
    print(extracted)
# To debug a single address, pass verbose=True, e.g. find_road(pos_cands[-4], True).
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment