Last active
April 16, 2024 19:23
-
-
Save jrc03c/aff133ee43f4991dc863a1871b998d05 to your computer and use it in GitHub Desktop.
A quick-and-dirty tf-idf implementation in Python
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# pyds: https://github.com/jrc03c/pyds
from numpy import log | |
from pyds import sort | |
# Characters that survive cleaning: lowercase ASCII letters and the space.
alpha = "abcdefghijklmnopqrstuvwxyz "
# Straight and curly quote characters that are deleted outright.
quotes = "'\"‘’“”"


def clean(x):
    """Normalize text to lowercase ASCII letters separated by single spaces.

    Characters listed in ``quotes`` are removed, the remainder is
    lowercased, every character not in ``alpha`` becomes a space, and each
    run of spaces is collapsed to one space.  A single leading or trailing
    space is kept if the input produces one.

    Args:
        x: The string to normalize.

    Returns:
        The normalized string.
    """
    # Quotes are deleted (not turned into spaces) so the text on either
    # side of them is joined: "don't" becomes "dont".
    unquoted = "".join(char for char in x if char not in quotes)

    pieces = []
    for char in unquoted.lower():
        mapped = char if char in alpha else " "
        # Skip a space that directly follows another space.  Collapsing
        # runs here, in one pass, avoids a replace-until-stable loop that
        # can never finish if its search and replacement text are equal.
        if mapped == " " and pieces and pieces[-1] == " ":
            continue
        pieces.append(mapped)

    return "".join(pieces)
def tfidf(docs):
    """Compute a tf-idf score for every word of every document.

    Each document is normalized with ``clean`` and split on whitespace.
    For a word in a document, tf is the word's count divided by the
    document's total word count, and idf is ``log(N / n)`` where ``N`` is
    the number of distinct documents and ``n`` is the number of distinct
    documents containing the word.

    Args:
        docs: An iterable of document strings.  Repeated documents are
            counted once for ``N`` and ``n`` but still get their own row
            in the result.

    Returns:
        A list with one entry per item of ``docs``, in order.  Each entry
        is the result of passing that document's list of
        ``{"word": ..., "score": ...}`` dicts to ``sort`` with a comparator
        of ``b["score"] - a["score"]`` (presumably highest score first;
        confirm against the ``sort`` helper).
    """
    # The documents are walked twice below, so materialize them once in
    # case the caller passed a one-shot iterable.
    docs = list(docs)

    # Per distinct document: total word count and per-word counts.
    word_counts = {}
    for doc in docs:
        if doc in word_counts:
            # A repeated document has the same count/total ratios, so
            # there is nothing new to tally.
            continue
        counts = {}
        # split() without an argument discards the empty strings that a
        # leading or trailing space would otherwise produce, so "" is
        # never counted as a word.
        for word in clean(doc).split():
            counts[word] = counts.get(word, 0) + 1
        word_counts[doc] = {"total": sum(counts.values()), "words": counts}

    # Number of distinct documents each word appears in, computed once
    # instead of rescanning every document for every score.
    doc_frequency = {}
    for entry in word_counts.values():
        for word in entry["words"]:
            doc_frequency[word] = doc_frequency.get(word, 0) + 1

    total_docs = len(word_counts)
    idf = {
        word: log(total_docs / containing)
        for word, containing in doc_frequency.items()
    }

    out = []
    for doc in docs:
        entry = word_counts[doc]
        row = [
            {"word": word, "score": count / entry["total"] * idf[word]}
            for word, count in entry["words"].items()
        ]
        out.append(sort(lambda a, b: b["score"] - a["score"], row))
    return out
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment