Last active
June 14, 2016 03:00
-
-
Save theeluwin/651f0368f86cde0bdfb22635a05db72e to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| # -*- coding: utf-8 -*- | |
| from __future__ import division, print_function, unicode_literals | |
| import re | |
| from collections import Counter | |
| from konlpy.tag import Mecab | |
# Shared konlpy Mecab tagger, created once when the module is loaded.
tagger = Mecab()

# Tokens that bagify() leaves out of every bag (Korean demonstratives).
STOPWORDS = (
    "이",
    "그",
    "저",
)
def xplit(delimiters):
    """Build a tokenizer that splits a string on any of the given delimiters.

    Args:
        delimiters: Iterable of literal delimiter strings. Each one is
            regex-escaped, so characters such as ``.`` carry no special
            meaning. Empty strings are ignored.

    Returns:
        A one-argument callable mapping a string to the list of pieces found
        between delimiters. It follows ``re.split`` semantics, so adjacent
        delimiters produce empty strings in the result. When no usable
        delimiter is given, the callable returns the whole input as a
        single-element list.
    """
    # Drop empty delimiters: an empty alternative matches at every position
    # and would split the input into individual characters.
    # Longer delimiters go first so that a delimiter which is a prefix of
    # another one (e.g. "." and "..") cannot shadow the longer match.
    usable = sorted((d for d in delimiters if d), key=len, reverse=True)
    if not usable:
        return lambda value: [value]
    # Compile once here instead of rebuilding the pattern on every call.
    pattern = re.compile('|'.join(re.escape(d) for d in usable))
    return lambda value: pattern.split(value)
def bagify(text, tokenizer=xplit(['.', ' '])):
    """Turn ``text`` into a bag (multiset) of its tokens.

    Args:
        text: The input passed to ``tokenizer``.
        tokenizer: Callable returning an iterable of tokens for ``text``.
            The default splits on ``.`` and on a space.

    Returns:
        A ``Counter`` mapping each kept token to its number of occurrences.
        Zero-length tokens and tokens listed in ``STOPWORDS`` are left out.
    """
    bag = Counter()
    for piece in tokenizer(text):
        # Splitting on adjacent delimiters produces empty pieces; skip them.
        if len(piece) == 0:
            continue
        if piece in STOPWORDS:
            continue
        bag[piece] += 1
    return bag
def jaccard(text1, text2):
    """Return the multiset Jaccard similarity of two texts.

    Both texts are tokenized with ``tagger.nouns`` and turned into bags via
    ``bagify``. The score is the size of the bag intersection (per-token
    minimum count) divided by the size of the bag union (per-token maximum
    count).

    Args:
        text1: First text.
        text2: Second text.

    Returns:
        A float in ``[0.0, 1.0]``. When neither text yields any token the
        union is empty and the ratio is undefined; ``1.0`` is returned in
        that case, treating two empty bags as identical.
    """
    tokenizer = tagger.nouns
    bag1 = bagify(text1, tokenizer)
    bag2 = bagify(text2, tokenizer)
    union_size = sum((bag1 | bag2).values())
    if union_size == 0:
        # Both bags are empty: avoid ZeroDivisionError and follow the
        # convention that two empty sets have similarity 1.
        return 1.0
    # True division; the file enables it on Python 2 via __future__.
    return sum((bag1 & bag2).values()) / union_size
if __name__ == '__main__':
    # Demo: score two short sample sentences that differ only in their
    # first word, then print the resulting similarity.
    text1, text2 = (
        "맛있는 물을 마시는 연필",
        "맛없는 물을 마시는 연필",
    )
    similarity = jaccard(text1, text2)
    print(similarity)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment