Last active
June 21, 2023 12:43
-
-
Save snakeye/6c7f436670bc968a3fc929a4c91aba5d to your computer and use it in GitHub Desktop.
Automatic Relevant Post Suggestions for Static Blogs (Eleventy, Jekyll, etc.)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import argparse | |
import logging | |
import os | |
import string | |
from typing import List | |
import frontmatter | |
import nltk | |
import numpy as np | |
from nltk.corpus import stopwords | |
from nltk.stem import WordNetLemmatizer | |
from nltk.tokenize import word_tokenize | |
from sklearn.feature_extraction.text import TfidfVectorizer | |
from sklearn.metrics.pairwise import cosine_similarity | |
logger = logging.getLogger(__name__)

# Fetch the NLTK data this script depends on at import time.
nltk.download("stopwords", quiet=True)  # used by stopwords.words("english")
nltk.download("wordnet", quiet=True)  # used by WordNetLemmatizer
# word_tokenize() relies on the Punkt tokenizer models; without them it raises
# LookupError on a machine that has never downloaded them. Recent NLTK
# releases look the models up as "punkt_tab", older ones as "punkt", so both
# are requested.
nltk.download("punkt", quiet=True)
nltk.download("punkt_tab", quiet=True)
def collect_md_files(directory: str) -> List[str]:
    """Return the paths of all ``.md`` files found under *directory*.

    The tree is traversed recursively with ``os.walk``; every returned path
    is the walked folder joined with the matching file name.
    """
    return [
        os.path.join(folder, name)
        for folder, _subdirs, names in os.walk(directory)
        for name in names
        if name.endswith(".md")
    ]
def extract_tokens(
    text: str,
    stop_words: set,
    lemmatizer: WordNetLemmatizer,
    punctuation: str,
) -> List[str]:
    """Turn raw *text* into a list of lemmatized tokens.

    The text is lower-cased, every character listed in *punctuation* is
    deleted, and the result is split with ``word_tokenize``. Tokens shorter
    than three characters or contained in *stop_words* are discarded; the
    remaining ones are passed through ``lemmatizer.lemmatize``.
    """
    strip_table = str.maketrans("", "", punctuation)
    cleaned = text.lower().translate(strip_table)

    lemmas = []
    for word in word_tokenize(cleaned):
        if len(word) < 3 or word in stop_words:
            continue
        lemmas.append(lemmatizer.lemmatize(word))
    return lemmas
def find_similar_posts(
    posts: "List[frontmatter.Post]", similarity_matrix: np.ndarray, limit: int = 3
) -> dict:
    """Map every post to the posts most similar to it.

    Args:
        posts: The posts, in the same order as the rows/columns of
            *similarity_matrix*. Each post is used as a dictionary key.
        similarity_matrix: Square matrix where entry ``[i][j]`` is the
            similarity between ``posts[i]`` and ``posts[j]``.
        limit: Maximum number of similar posts to return per post.

    Returns:
        A dict mapping each post to a list of at most *limit* other posts,
        ordered from most to least similar. A post never appears in its own
        list.
    """
    similar_posts = {}
    for i, post in enumerate(posts):
        # Indices of all posts, highest similarity first.
        ranked = np.argsort(similarity_matrix[i])[::-1]
        # Drop the post itself *before* truncating. Slicing ``limit + 1``
        # entries first and filtering afterwards returns one post too many
        # whenever the post is not among its own top matches (e.g. a
        # zero vector from empty content, or ties with other posts).
        neighbours = [idx for idx in ranked if idx != i][:limit]
        similar_posts[post] = [posts[idx] for idx in neighbours]
    return similar_posts
def main(args):
    """Write a ``similar`` list into the frontmatter of every post under ``args.dir``.

    Steps:
        1. Collect all ``.md`` files below ``args.dir`` and parse them with
           ``frontmatter``.
        2. Tokenize each post's content and build a TF-IDF matrix.
        3. Compute pairwise cosine similarity and pick the most similar
           posts for each post.
        4. Store ``url``/``title``/``description``/``image``/``date`` of the
           similar posts under the ``similar`` metadata key and rewrite
           every post file in place.

    Args:
        args: Parsed command-line namespace; only ``args.dir`` is read.
    """
    md_files = collect_md_files(args.dir)

    # load posts
    posts = []
    for path in md_files:
        # Explicit UTF-8 so posts with non-ASCII text load identically
        # regardless of the platform's default encoding.
        with open(path, encoding="utf-8") as ifile:
            post = frontmatter.load(ifile)
        if "permalink" not in post:
            # Reported but not fatal: the post is still processed and will
            # appear in other posts' lists with a ``url`` of None.
            logger.error(f"Post {path} does not have permalink!")
        posts.append(post)

    if not posts:
        # Nothing to vectorize; fitting TF-IDF on an empty corpus would fail.
        logger.warning("No .md posts found in %s, nothing to do.", args.dir)
        return

    # tokenise post contents
    stop_words = set(stopwords.words("english"))
    punctuation = string.punctuation + "’–‘“”()"
    lemmatizer = WordNetLemmatizer()

    tokenized_sources = [
        " ".join(extract_tokens(post.content, stop_words, lemmatizer, punctuation))
        for post in posts
    ]

    # build similarity matrix
    tfidf_vectorizer = TfidfVectorizer()
    tfidf_matrix = tfidf_vectorizer.fit_transform(tokenized_sources)
    similarity_matrix = cosine_similarity(tfidf_matrix)

    # process similar posts
    similar_posts = find_similar_posts(posts, similarity_matrix)
    for post, similar in similar_posts.items():
        post["similar"] = [
            {
                "url": sim_post.get("permalink"),
                "title": sim_post.get("title"),
                "description": sim_post.get("description"),
                "image": sim_post.get("image"),
                "date": sim_post.get("date"),
            }
            for sim_post in similar
        ]

    # save posts
    # ``posts`` was built in ``md_files`` order, so the two lists pair up.
    # Keeping the path out of the post metadata avoids overwriting (and then
    # deleting) a genuine ``file`` key in a post's frontmatter.
    for path, post in zip(md_files, posts):
        with open(path, "wb") as ofile:
            frontmatter.dump(post, ofile)
if __name__ == "__main__":
    # Command-line entry point: a single positional argument names the
    # directory whose Markdown posts are rewritten in place.
    parser = argparse.ArgumentParser(
        description=(
            "Add a 'similar' list of related posts to the frontmatter of "
            "every Markdown post found under a directory."
        ),
    )
    parser.add_argument(
        "dir",
        type=str,
        help="root directory that is searched recursively for .md posts",
    )
    args = parser.parse_args()
    # os.walk() yields nothing for a path that is missing or not a
    # directory, so a typo would otherwise go unnoticed; fail loudly instead.
    if not os.path.isdir(args.dir):
        parser.error(f"not a directory: {args.dir}")
    main(args)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
This GitHub Gist provides a Python script,
relevance.py
, that leverages Natural Language Processing (NLP) and machine learning techniques to generate relevant post suggestions for static blogs. By utilizing NLTK (Natural Language Toolkit), scikit-learn (Sklearn), and the frontmatter
package, this script enhances the reader experience by automatically updating each blog post with the details of the three most relevant posts.
Features:
YAML Metadata Parsing: The script reads Markdown blog posts whose metadata is specified in YAML frontmatter. It retrieves key details such as the post title, description, image, date, permalink, and content.
Language Support: The script is designed for static blogs written in English. It incorporates language processing techniques optimized for English text analysis, including tokenization, lemmatization, and stop-word removal.
Text Preprocessing: Before generating relevant post suggestions, the script performs essential preprocessing steps to clean and normalize the text. This involves lower-casing the post content, stripping punctuation, and discarding stop words and tokens shorter than three characters.
TF-IDF Vectorization: The script employs the Term Frequency-Inverse Document Frequency (TF-IDF) technique to transform the preprocessed text into numerical vectors. TF-IDF captures the importance of each term in a blog post relative to the entire corpus, enabling accurate analysis of text similarity.
Cosine Similarity Calculation: Using the TF-IDF vectors, the script computes the cosine similarity between each blog post. Cosine similarity measures the angle between two vectors, providing a reliable metric to assess their similarity.
Relevant Post Suggestions: Based on the cosine similarity scores, the script identifies the three most similar blog posts to the current one. It updates the frontmatter of each blog post, storing the details of the relevant posts under the similar key (replacing any previous value).
Flexible Target Directory: The script accepts the target directory as an argument, allowing customization for different static blog setups. This flexibility ensures seamless integration with your existing file structure.
Integration with CI: The script is suitable for integration with Continuous Integration (CI) pipelines. By including it as part of your CI workflow, you can automate the generation of relevant post suggestions prior to each blog deployment, ensuring an up-to-date and engaging reader experience.
Dependencies:
pip install nltk
pip install scikit-learn
pip install python-frontmatter
Usage:
python relevance.py /path/to/blog/posts
By utilizing this script, you can significantly enhance your static blog by providing readers with personalized and relevant post recommendations. The automated process saves valuable time and effort, leading to increased reader engagement and an enriched blogging experience.