Skip to content

Instantly share code, notes, and snippets.

@shiumachi
Last active January 5, 2018 17:20
Show Gist options
  • Save shiumachi/71885ced3937bba1c27730c374216604 to your computer and use it in GitHub Desktop.
Save shiumachi/71885ced3937bba1c27730c374216604 to your computer and use it in GitHub Desktop.
tfidfとbigramによるコサイン類似度
from collections import Counter

import MeCab
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
# Two MeCab taggers: `m` is used for node-by-node parsing, `m2` returns the
# text already split into space-separated tokens ("wakati" output).
m = MeCab.Tagger("-Ochasen")
m2 = MeCab.Tagger("-Owakati")

# Load the messages, discard rows containing missing values and renumber the
# remaining rows (reset_index keeps the previous index as an extra column).
df = pd.read_csv("messages.csv").dropna().reset_index()
texts = df["text"]
def gen_bag_of_words(node):
    """Count the surface forms found along a linked chain of nodes.

    Args:
        node: First node of the chain (or a falsy value for an empty chain).
            Each node exposes ``surface`` (str) and ``next``.

    Returns:
        A ``Counter`` mapping every non-empty surface form to the number of
        times it occurs in the chain.
    """
    surfaces = []
    current = node
    while current:
        # Nodes with an empty surface carry no word and are not counted.
        if len(current.surface) != 0:
            surfaces.append(current.surface)
        current = current.next
    return Counter(surfaces)
def gen_bag_of_bigrams(node):
    """Count adjacent pairs of surface forms along a linked chain of nodes.

    Nodes whose surface is empty (MeCab's BOS/EOS boundary nodes have an
    empty surface) are skipped, so no bigram ever contains an empty string.

    Args:
        node: First node of the chain (or a falsy value for an empty chain).
            Each node exposes ``surface`` (str) and ``next``.

    Returns:
        A ``Counter`` keyed by ``(previous_surface, surface)`` tuples.
    """
    surfaces = []
    while node:
        # Test the length, not ``surface == 0``: a string never equals 0.
        # The pointer is advanced on every iteration, including skipped
        # nodes, so the walk always terminates.
        if len(node.surface) > 0:
            surfaces.append(node.surface)
        node = node.next
    # Pair each surface with its successor.
    return Counter(zip(surfaces, surfaces[1:]))
# Build one bag-of-words Counter per message. The list stays index-aligned
# with `texts` / `df`, because later code looks rows up by position.
word_vectors = []
for text in texts:
    try:
        node = m.parseToNode(text)
    except TypeError as te:
        # Input MeCab cannot accept: report it and store an empty bag rather
        # than re-using the node left over from the previous iteration.
        print("{}, text = {}".format(te, text))
        word_vectors.append(Counter())
        continue
    word_vectors.append(gen_bag_of_words(node))

# TF-IDF over the distinct surface forms of each message (each word appears
# once per document because only the Counter keys are joined).
texts_words_list = [' '.join(bag) for bag in word_vectors]
# The vectorizer has to exist before fit_transform is called.
# NOTE(review): default TfidfVectorizer settings are an assumption; the
# default token_pattern ignores single-character tokens -- confirm that is
# acceptable for this corpus.
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(texts_words_list)

# Unigram + bigram counts over the space-separated (wakati) text.
texts_wakati = [m2.parse(x.strip()) for x in texts]
bigram_vectorizer = CountVectorizer(ngram_range=(1, 2), token_pattern=r'\b\w+\b', min_df=1)
bigram_matrix = bigram_vectorizer.fit_transform(texts_wakati)
def pickup_similar_texts(scores, texts_words_list, min_score=0.5, max_score=1.0, min_length=20):
    """Select the texts whose score lies strictly between two bounds.

    Args:
        scores: Two-dimensional score container; only its first row is read.
        texts_words_list: Texts, indexed in the same order as the score row.
        min_score: Exclusive lower bound on the score.
        max_score: Exclusive upper bound on the score.
        min_length: Minimum ``len()`` a text must have to be selected.

    Returns:
        A set of ``(text, score, index)`` tuples for every selected text.
    """
    matches = set()
    for idx, value in enumerate(scores[0]):
        if not (value < max_score and value > min_score):
            continue
        candidate = texts_words_list[idx]
        if len(candidate) < min_length:
            continue
        matches.add((candidate, value, idx))
    return matches
def listup_similarities(start_idx=0, end_idx=100, min_length=50, min_score=0.5, max_score=0.8, matrix=None, texts=None):
    """Print, for each base text in an index range, the texts similar to it.

    For every index in ``[start_idx, end_idx)`` the cosine similarity between
    that row of ``matrix`` and all rows is computed, and the texts whose score
    falls strictly between ``min_score`` and ``max_score`` are printed
    together with the matching ``df`` rows.

    Args:
        start_idx: First row index to use as a base text.
        end_idx: One past the last row index; clamped to ``len(texts)``.
        min_length: Minimum length of a base text. Similar texts must be at
            least half this long.
        min_score: Exclusive lower bound on the similarity score.
        max_score: Exclusive upper bound on the similarity score.
        matrix: Row-per-text feature matrix. ``None`` means the module-level
            ``bigram_matrix`` as it exists at call time.
        texts: Space-separated texts aligned with ``matrix`` rows. ``None``
            means the module-level ``texts_wakati_with_original_form`` as it
            exists at call time.

    Returns:
        None. Results are written to standard output.
    """
    # Resolve the defaults at call time. Binding them in the signature would
    # evaluate them when the function is defined, which fails if the globals
    # do not exist yet and otherwise pins a matrix that is later rebuilt.
    if matrix is None:
        matrix = bigram_matrix
    if texts is None:
        texts = texts_wakati_with_original_form

    # Never index past the end of the corpus.
    stop = min(end_idx, len(texts))
    for i in range(start_idx, stop):
        base_text = texts[i]
        base_word_vector = base_text.split(' ')
        # Skip short texts and texts containing a bare http/https token.
        if len(base_text) < min_length or 'https' in base_word_vector or 'http' in base_word_vector:
            continue
        scores = cosine_similarity(matrix[i], matrix)
        similar_texts = pickup_similar_texts(scores, texts, min_score=min_score, max_score=max_score, min_length=min_length / 2)
        if not similar_texts:
            continue
        print("### base text = {}, idx = {} ###".format(base_text, i))
        print(df[i:i+1])
        for (text, score, idx) in similar_texts:
            print("{}, idx = {}, score = {}".format(text, idx, score))
            print(df[idx:idx+1])
def wakati_with_original_form(text, tagger=None):
    """Return ``text`` as space-separated tokens in their base form.

    Each node's comma-separated ``feature`` string is split. Field 6 is used
    as the token unless it is ``'*'`` or missing, in which case the node's
    surface form is used instead.

    Args:
        text: The text to tokenise.
        tagger: Object providing ``parseToNode(text)``. ``None`` means the
            module-level tagger ``m``.

    Returns:
        The tokens joined by single spaces, without the first and last node
        of the chain, or ``''`` when the parser yields no nodes.
    """
    if tagger is None:
        tagger = m
    node = tagger.parseToNode(text)
    original_forms_list = []
    while node:
        fields = node.feature.split(',')
        # Guard the index: a feature string may have fewer than 7 fields.
        base_form = fields[6] if len(fields) > 6 else '*'
        original_forms_list.append(node.surface if base_form == '*' else base_form)
        node = node.next
    if not original_forms_list:
        return ''
    # The first and last entries are dropped.
    # NOTE(review): presumably these are MeCab's BOS/EOS nodes -- confirm.
    return ' '.join(original_forms_list[1:-1])
# Base-form, space-separated version of every message.
texts_wakati_with_original_form = list(map(wakati_with_original_form, texts))
# Refit the unigram+bigram counts on the base-form texts; this rebinds
# `bigram_matrix`.
bigram_matrix = bigram_vectorizer.fit_transform(texts_wakati_with_original_form)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment