@theeluwin
Last active June 14, 2016 03:00
# -*- coding: utf-8 -*-
from __future__ import division, print_function, unicode_literals

import re
from collections import Counter

from konlpy.tag import Mecab

tagger = Mecab()

# Korean demonstratives ("this", "that", "that over there") treated as stopwords.
STOPWORDS = ("이", "그", "저")


def xplit(delimiters):
    # Build a tokenizer that splits on any of the given delimiters.
    return lambda value: re.split('|'.join([re.escape(delimiter) for delimiter in delimiters]), value)


def bagify(text, tokenizer=xplit(['.', ' '])):
    # Bag (multiset) of non-empty, non-stopword tokens.
    return Counter([token for token in tokenizer(text) if len(token) > 0 and token not in STOPWORDS])


def jaccard(text1, text2):
    # Multiset Jaccard similarity over the nouns extracted by Mecab.
    tokenizer = tagger.nouns
    bag1 = bagify(text1, tokenizer)
    bag2 = bagify(text2, tokenizer)
    return sum((bag1 & bag2).values()) / sum((bag1 | bag2).values())


if __name__ == '__main__':
    # Commented pair: two colloquial paraphrases of the same sentence
    # ("Our language differs from Chinese, so it is hard to convey meaning").
    # text1 = "나라 말씀이 중국말과는 좀 다른것 같은데 말이지 아무래도 영 뜻 전하기가 조금 어려워버리는거 인정하는 각?"
    # text2 = "나라 말이 중국과는 좀 다른데 아무래도 영 뜻 전하기가 꽤나 어려워 버리는거 ㅇㅈ?"
    # Active pair: "a pencil that drinks tasty water" vs. "a pencil that drinks tasteless water".
    text1 = "맛있는 물을 마시는 연필"
    text2 = "맛없는 물을 마시는 연필"
    print(jaccard(text1, text2))
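
For reference, a minimal sketch of the multiset Jaccard computation used in jaccard() above, with made-up English token bags standing in for Mecab's noun extraction (the tokens below are hypothetical, not actual Mecab output), so it runs without konlpy or MeCab installed:

from __future__ import division, print_function
from collections import Counter

# Hypothetical token bags; in the gist these would come from tagger.nouns().
bag1 = Counter(["water", "pencil"])
bag2 = Counter(["water", "pencil", "taste"])

# Counter & and | take the element-wise min and max of counts,
# giving the multiset (bag) version of intersection and union.
intersection = sum((bag1 & bag2).values())  # 2
union = sum((bag1 | bag2).values())         # 3
print(intersection / union)                 # 0.666...

Note that if neither text yields any tokens, the union is empty and jaccard() as written would raise ZeroDivisionError.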