Created
October 23, 2017 03:43
-
-
Save ratsgo/7726e8f9ccbd6c722f50a698f8771c94 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import glob
import os
import string

import frontmatter
import yaml
from konlpy.tag import Komoran
from sklearn.feature_extraction.text import TfidfVectorizer
def get_posts(folder='C:/Users/ratsgo/GoogleDrive/내폴더/git/blog/_posts'):
    """Load every post file in *folder* and map its slug to its body text.

    The default slug is the file name with its first 11 characters
    (presumably the Jekyll ``YYYY-MM-DD-`` date prefix) and its extension
    removed. A ``slug`` key in the post's front matter overrides it.

    Args:
        folder: Directory that contains the post files.

    Returns:
        dict mapping each slug to the corresponding ``post.content``. If two
        files resolve to the same slug, the one globbed later wins.
    """
    result = {}
    for filepath in glob.glob(folder + "/*"):
        if not os.path.isfile(filepath):
            # Sub-directories cannot be parsed as posts.
            continue
        # basename() understands "/" and "\\" on Windows and "/" elsewhere;
        # splitting on a hard-coded "\\" returned the whole path on POSIX.
        filename = os.path.basename(filepath)
        # Strip the extension by name rather than assuming it is ".md".
        stem = os.path.splitext(filename)[0]
        slug = stem[11:]
        post = frontmatter.load(filepath)
        if "slug" in post.keys():
            slug = post["slug"]
        result[slug] = post.content
    return result
def write_result_to_file(related, file='C:/Users/ratsgo/GoogleDrive/내폴더/git/blog/_data/related.yml'):
    """Dump the related-posts mapping to a YAML file.

    Args:
        related: Mapping from a post slug to the list of slugs related to it.
        file: Path of the YAML file to (over)write.

    The file receives a block-style YAML list with one entry per post, each
    entry holding the keys ``post`` and ``related``.
    """
    entries = [
        {'post': slug, 'related': related[slug]}
        for slug in related
    ]
    with open(file, 'w') as handle:
        yaml.dump(entries, handle, default_flow_style=False)
# Shared KoNLPy Komoran analyzer, created once when the module is loaded and
# used by tokenize(). Despite the name it is used to extract nouns, not stems.
stemmer = Komoran()
def tokenize(text):
    """Split *text* into noun tokens for the TF-IDF vectorizer.

    Args:
        text: Document text to tokenize.

    Returns:
        list of the nouns reported by ``stemmer.nouns(text)``.
    """
    # nouns() yields plain strings, so every item is already a whole token.
    # Indexing each item with [0] (a leftover from the (word, tag) pairs that
    # pos() returns) kept only the first character of every noun.
    return list(stemmer.nouns(text))
def cosine_sim(text1, text2, vectorizer):
    """Return the similarity score between two texts.

    Args:
        text1: First document.
        text2: Second document.
        vectorizer: Object whose ``fit_transform`` turns a list of documents
            into a sparse document-term matrix. It is fitted on just these
            two texts.

    Returns:
        The dot product of the two document vectors. This equals their cosine
        similarity when the vectorizer L2-normalises each row, which is the
        documented default of ``TfidfVectorizer``.
    """
    tfidf = vectorizer.fit_transform([text1, text2])
    # "@" is a matrix product for both scipy sparse matrices and sparse
    # arrays, whereas "*" is element-wise for arrays; toarray() likewise
    # exists on both, unlike the matrix-only ".A" shorthand.
    return (tfidf @ tfidf.T).toarray()[0, 1]
def get_similarity(num_best=5):
    """Find the most similar posts for every post and write them to YAML.

    Posts are loaded with ``get_posts()``, lower-cased, stripped of ASCII
    punctuation and vectorised with TF-IDF over the tokens produced by
    ``tokenize``. Pairwise similarities are the entries of the document-term
    matrix multiplied by its transpose. The result is handed to
    ``write_result_to_file``.

    Args:
        num_best: Maximum number of related posts to keep per post.

    Returns:
        None. The mapping is written to disk by ``write_result_to_file``.
    """
    vectorizer = TfidfVectorizer(tokenizer=tokenize)
    posts = get_posts()
    # Build the translation table once instead of once per post.
    strip_punctuation = str.maketrans('', '', string.punctuation)
    cleaned_posts = {
        slug: post.lower().translate(strip_punctuation)
        for slug, post in posts.items()
    }
    slugs = list(cleaned_posts)
    tfidf = vectorizer.fit_transform(list(cleaned_posts.values()))
    # "@" and toarray() work for scipy sparse matrices and sparse arrays
    # alike; "*" and ".A" only behave as intended on the matrix classes.
    matrix = (tfidf @ tfidf.T).toarray()
    result = {}
    for i, row in enumerate(matrix):
        # Rank by descending similarity; the stable sort keeps ties in post
        # order so the output is deterministic.
        ranked = (-row).argsort(kind="stable")
        # Drop the post itself by index. Assuming it is always the single
        # top-ranked entry breaks for duplicate posts (tie at the maximum)
        # and for posts with no tokens (self-similarity of 0).
        neighbours = [slugs[j] for j in ranked if j != i]
        result[slugs[i]] = neighbours[:num_best]
    write_result_to_file(result)
# Script entry point: build the related-posts table using the default settings.
get_similarity()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment