Created
November 28, 2022 04:13
-
-
Save dennisseah/ed7627863b6560630f65b74e41248462 to your computer and use it in GitHub Desktop.
TF-IDF with PyPDF2
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# nltk==3.7
# pandas==1.5.2
# PyPDF2==2.11.2
# scikit-learn==1.1.3
from PyPDF2 import PdfReader | |
import io | |
import pandas as pd | |
import requests | |
from sklearn.feature_extraction.text import TfidfVectorizer | |
from nltk import word_tokenize | |
from nltk.stem.porter import PorterStemmer | |
def tokenizeText(text: str) -> str:
    """Lower-case, tokenize and Porter-stem *text*.

    Args:
        text: Raw text to normalise.

    Returns:
        The stemmed tokens joined back into one space-separated string.

    NOTE(review): ``word_tokenize`` relies on NLTK tokenizer data being
    installed (e.g. the "punkt" resource) -- confirm it is available in the
    target environment.
    """
    # Build the stemmer once per call instead of once per token.
    stemmer = PorterStemmer()
    tokens = word_tokenize(text.lower())
    return " ".join(stemmer.stem(token) for token in tokens)
def main():
    """Download a PDF, weight its stemmed terms with TF-IDF and print them as JSON.

    The PDF is fetched over HTTP, the text of every page is extracted,
    stemmed with ``tokenizeText`` and joined into a single document. The
    resulting per-term weights are printed to stdout as JSON, sorted in
    ascending order of weight, under the keys ``word`` and ``count``.

    Raises:
        requests.exceptions.RequestException: if the download times out,
            fails, or returns an HTTP error status.
    """
    # A timeout keeps the script from hanging forever on a stalled connection.
    response = requests.get(
        "https://www.databricks.com/wp-content/uploads/2020/08/p975-armbrust.pdf",
        timeout=30,
    )
    # Fail fast on 4xx/5xx instead of handing an error page to the PDF parser.
    response.raise_for_status()

    reader = PdfReader(io.BytesIO(response.content))
    text = " ".join(
        tokenizeText(page.extract_text().replace("\n", " "))
        for page in reader.pages
    )

    # NOTE: the corpus holds a single document, so with the vectorizer's
    # default settings every term gets the same IDF and the weights are
    # effectively L2-normalised term frequencies.
    vectorizer = TfidfVectorizer()
    matrix = vectorizer.fit_transform([text]).todense()
    df_matrix = pd.DataFrame(matrix, columns=vectorizer.get_feature_names_out())

    # Collapse the one-row matrix into a per-term series, lowest weight first.
    words = df_matrix.sum(axis=0).sort_values(ascending=True)
    # The "count" column actually holds the TF-IDF weight; the key is kept
    # as-is so the emitted JSON stays unchanged.
    df_words = (
        words.to_frame(name="count")
        .reset_index()
        .rename(columns={"index": "word"})
    )
    print(df_words.squeeze().to_json(orient="records"))
# Run the pipeline only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment