Skip to content

Instantly share code, notes, and snippets.

@jrc03c
Last active April 16, 2024 19:23
Show Gist options
  • Save jrc03c/aff133ee43f4991dc863a1871b998d05 to your computer and use it in GitHub Desktop.
Save jrc03c/aff133ee43f4991dc863a1871b998d05 to your computer and use it in GitHub Desktop.
A quick-and-dirty tf-idf implementation in Python
# pyds: https://github.com/jrc03c/pyds
from numpy import log
from pyds import sort
# Characters that survive cleaning: lowercase ASCII letters and the space.
alpha = "abcdefghijklmnopqrstuvwxyz "
# Straight and curly quote characters; these are deleted outright (rather than
# turned into spaces) so that e.g. "don't" becomes "dont", not "don t".
quotes = "'\"‘’“”"


def clean(x):
    """Normalize *x* to lowercase ASCII letters separated by single spaces.

    Steps, in order:
      1. Every character listed in ``quotes`` is removed.
      2. The text is lowercased.
      3. Every character not listed in ``alpha`` becomes a space.
      4. Each run of consecutive spaces is collapsed to one space.

    The result is NOT stripped: a single leading and/or trailing space
    remains when the input starts or ends with a non-letter character.

    Args:
        x: The string to normalize.

    Returns:
        The normalized string.
    """
    without_quotes = "".join(char for char in x if char not in quotes)

    pieces = []
    for char in without_quotes.lower():
        mapped = char if char in alpha else " "
        # Collapse runs of spaces in a single pass: skip a space that would
        # directly follow another space already emitted.
        if mapped == " " and pieces and pieces[-1] == " ":
            continue
        pieces.append(mapped)

    return "".join(pieces)
def tfidf(docs):
    """Compute tf-idf scores for every word of every document in *docs*.

    Each document is normalized with ``clean`` and split on whitespace, so
    empty tokens (from leading/trailing spaces or empty documents) are never
    counted as words. For a word ``w`` in document ``d``:

        tf(w, d) = occurrences of w in d / total words in d
        idf(w)   = log(N / n)

    where ``N`` is the number of documents and ``n`` is the number of
    documents containing ``w``. Every entry of *docs* counts as its own
    document, even when two entries have identical text.

    Args:
        docs: An iterable of document strings.

    Returns:
        A list with one entry per document, in input order. Each entry is
        the document's ``{"word": ..., "score": ...}`` dicts passed through
        ``sort`` with a comparator that puts higher scores first. A document
        with no words yields an empty list.
    """
    # Per-document word counts (kept positionally so duplicate documents are
    # not merged) and, for each word, the number of documents containing it.
    per_doc_counts = []
    doc_freq = {}
    for doc in docs:
        counts = {}
        # split() with no argument drops the empty strings that split(" ")
        # would produce for leading/trailing spaces or an empty document.
        for word in clean(doc).split():
            counts[word] = counts.get(word, 0) + 1
        for word in counts:
            doc_freq[word] = doc_freq.get(word, 0) + 1
        per_doc_counts.append(counts)

    # idf depends only on the word, so compute it once per word instead of
    # rescanning every document for every (word, document) pair.
    n_docs = len(per_doc_counts)
    idf = {word: log(n_docs / n) for word, n in doc_freq.items()}

    out = []
    for counts in per_doc_counts:
        total = sum(counts.values())
        row = [
            {"word": word, "score": (count / total) * idf[word]}
            for word, count in counts.items()
        ]
        # Nothing to rank for a document with no words; skip the sorter.
        out.append(sort(lambda a, b: b["score"] - a["score"], row) if row else row)
    return out
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment