Last active
January 5, 2018 17:20
-
-
Save shiumachi/71885ced3937bba1c27730c374216604 to your computer and use it in GitHub Desktop.
tfidfとbigramによるコサイン類似度 (cosine similarity using TF-IDF and bigrams)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from collections import Counter

import MeCab
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
# Two MeCab taggers: ChaSen output for node-by-node traversal, and
# wakati output for plain space-separated tokenization.
m = MeCab.Tagger("-Ochasen")
m2 = MeCab.Tagger("-Owakati")

# Load the message corpus, drop rows with missing values, and renumber
# so row positions line up with matrix row indices later on.
df = pd.read_csv("messages.csv").dropna().reset_index()
texts = df["text"]
def gen_bag_of_words(node):
    """Walk a MeCab parse-node chain and count surface-form occurrences.

    Args:
        node: head of a MeCab node linked list (as returned by
            ``Tagger.parseToNode``); traversal follows ``node.next``.

    Returns:
        Counter mapping surface string -> frequency.  Nodes with an
        empty surface (e.g. BOS/EOS markers) are ignored.
    """
    counts = Counter()
    while node is not None:
        surface = node.surface
        if surface:
            counts[surface] += 1
        node = node.next
    return counts
def gen_bag_of_bigrams(node):
    """Count adjacent surface-form bigrams in a MeCab parse-node chain.

    Empty-surface nodes (e.g. BOS/EOS markers) are skipped without
    breaking the adjacency of the words around them.

    Args:
        node: head of a MeCab node linked list (from ``parseToNode``).

    Returns:
        Counter mapping (previous_surface, surface) tuples -> frequency.
    """
    c = Counter()
    prev = None
    while node:
        # Bug fix: the original wrote ``if node.surface == 0: continue``,
        # which (a) compares a string against 0 so empty surfaces were
        # never skipped, and (b) would loop forever if it ever matched,
        # because ``continue`` skipped the ``node = node.next`` advance.
        if node.surface:
            if prev is not None:
                c[(prev, node.surface)] += 1
            prev = node.surface
        node = node.next
    return c
# Build per-message bag-of-words vectors.  MeCab raises TypeError on
# non-string inputs; log and skip those rows.
word_vectors = list()
for text in texts:
    try:
        node = m.parseToNode(text)
    except TypeError as te:
        print("{}, text = {}".format(te, text))
        # Bug fix: the original fell through after the except and appended
        # a bag built from the *previous* iteration's stale ``node``.
        continue
    word_vectors.append(gen_bag_of_words(node))

# Space-joined unique tokens per message, as TF-IDF input documents.
texts_words_list = [' '.join(list(x.keys())) for x in word_vectors]
# Bug fix: ``tfidf_vectorizer`` was used but never instantiated in the
# original, raising NameError here.
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(texts_words_list)
# Wakati (space-separated) tokenization of every message.
texts_wakati = [m2.parse(t.strip()) for t in texts]

# Count unigrams and bigrams over the tokenized text; the permissive
# token pattern keeps single-character tokens.
bigram_vectorizer = CountVectorizer(
    ngram_range=(1, 2),
    token_pattern=r'\b\w+\b',
    min_df=1,
)
bigram_matrix = bigram_vectorizer.fit_transform(texts_wakati)
def pickup_similar_texts(scores, texts_words_list, min_score=0.5, max_score=1.0, min_length=20):
    """Select texts whose similarity falls strictly inside (min_score, max_score).

    Args:
        scores: 2-D array-like; ``scores[0]`` holds one similarity score
            per text, index-aligned with ``texts_words_list``.
        texts_words_list: candidate texts.
        min_score: exclusive lower bound on the score.
        max_score: exclusive upper bound (the default 1.0 drops the base
            text's self-similarity).
        min_length: minimum text length (in characters) to keep.

    Returns:
        set of (text, score, index) tuples.
    """
    hits = set()
    for idx, score in enumerate(scores[0]):
        if not (min_score < score < max_score):
            continue
        candidate = texts_words_list[idx]
        if len(candidate) >= min_length:
            hits.add((candidate, score, idx))
    return hits
def listup_similarities(start_idx=0, end_idx=100, min_length=50, min_score=0.5, max_score=0.8, matrix=None, texts=None):
    """Print, for each base text in [start_idx, end_idx), the other texts
    whose cosine similarity falls in (min_score, max_score).

    Args:
        start_idx: first base-text index to scan (inclusive).
        end_idx: last base-text index to scan (exclusive).
        min_length: skip base texts shorter than this; similar candidates
            must be at least half this long.
        min_score: exclusive lower similarity bound.
        max_score: exclusive upper similarity bound.
        matrix: document-term matrix; defaults to the module-level
            ``bigram_matrix`` at call time.
        texts: tokenized texts index-aligned with ``matrix`` rows;
            defaults to the module-level ``texts_wakati_with_original_form``.
    """
    # Bug fix: the original bound the module globals as *default argument
    # values*, which are evaluated at definition time —
    # ``texts_wakati_with_original_form`` is only defined later in the
    # script, so the original ``def`` statement raised NameError.
    # Resolving lazily also picks up the rebuilt ``bigram_matrix``.
    if matrix is None:
        matrix = bigram_matrix
    if texts is None:
        texts = texts_wakati_with_original_form
    for i in range(start_idx, end_idx):
        base_text = texts[i]
        base_word_vector = base_text.split(' ')
        # Skip short base texts and anything containing a URL token.
        if len(base_text) < min_length or 'https' in base_word_vector or 'http' in base_word_vector:
            continue
        base_vector = matrix[i]
        scores = cosine_similarity(base_vector, matrix)
        similar_texts = pickup_similar_texts(scores, texts, min_score=min_score,
                                             max_score=max_score, min_length=min_length / 2)
        if len(similar_texts) > 0:
            print("### base text = {}, idx = {} ###".format(base_text, i))
            print(df[i:i+1])
            for (text, score, idx) in similar_texts:
                print("{}, idx = {}, score = {}".format(text, idx, score))
                print(df[idx:idx+1])
def wakati_with_original_form(text):
    """Tokenize ``text`` into a space-separated string of base forms.

    For each MeCab node, uses the dictionary (base) form from the feature
    string; when MeCab reports no base form ('*'), falls back to the
    surface form.  The first and last nodes (BOS/EOS markers) are dropped
    by the ``[1:-1]`` slice.

    Args:
        text: raw text to tokenize via the module-level tagger ``m``.

    Returns:
        Space-joined base forms, or '' when nothing was parsed.
    """
    node = m.parseToNode(text)
    original_forms_list = list()
    while node:
        features = node.feature.split(',')
        # Field 6 of the ChaSen-format feature string is the base form;
        # '*' means none is available.
        if features[6] == '*':
            original_forms_list.append(node.surface)
        else:
            original_forms_list.append(features[6])
        node = node.next
    # Bug fix: the original tested an undefined name ``arr`` here, which
    # raised NameError on every call; it clearly meant this list.
    if len(original_forms_list) <= 0:
        return ''
    text_wakati = ' '.join(original_forms_list[1:-1])
    return text_wakati
# Re-tokenize every message using base forms, then rebuild the n-gram
# count matrix on the normalized text.
texts_wakati_with_original_form = list(map(wakati_with_original_form, texts))
bigram_matrix = bigram_vectorizer.fit_transform(texts_wakati_with_original_form)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment