Last active
November 12, 2023 17:14
-
-
Save lordjabez/e12e3d7d01879833246ae7c6253177a1 to your computer and use it in GitHub Desktop.
Create theme reports from a WordPress blog export
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
import contextlib | |
import os | |
import sys | |
import cachier | |
import feedparser | |
import keyring | |
import openai | |
import numpy as np | |
import pandas as pd | |
import matplotlib.colors as mcolors | |
import matplotlib.pyplot as plt | |
from sklearn.cluster import KMeans | |
from sklearn.manifold import TSNE | |
# Command-line arguments: the blog's domain (used to build the feed URL) and a
# comma-separated list of theme counts; one report is produced per count.
try:
    blog_domain = sys.argv[1]
    num_themes = [int(t) for t in sys.argv[2].split(',')]
    # A cluster count below one can never be satisfied, so reject it up front
    # instead of failing later inside the clustering step.
    if any(count < 1 for count in num_themes):
        raise ValueError('theme counts must be positive integers')
except (IndexError, ValueError):
    # IndexError: an argument is missing; ValueError: a count is not a
    # positive integer.
    print(
        'USAGE: generate-theme-reports.py BLOG_DOMAIN NUM_THEMES[,NUM_THEMES...]',
        file=sys.stderr,
    )
    sys.exit(-1)

# Upper bound on how many posts of a cluster are sampled for summarization.
samples_per_cluster = 10
# Shared seed for clustering, projection and sampling.
random_state = 42
# Instruction placed at the start of every summarization prompt.
summarization_prompt = 'Summarize the theme of these posts in a single sentence. Do not include topic examples.'

# The API key is read from the keyring entry with service 'system' and
# username 'openai'.
openai_api_key = keyring.get_password('system', 'openai')
openai_client = openai.Client(api_key=openai_api_key)
@cachier.cachier()
def create_embedding(text):
    """Return the embedding vector for *text*.

    The text is echoed to stdout, then sent to the OpenAI embeddings endpoint
    using the 'text-embedding-ada-002' model. Calls are cached by the
    ``cachier`` decorator.
    """
    print(text)
    result = openai_client.embeddings.create(
        input=text,
        model='text-embedding-ada-002',
    )
    # A single input was sent, so only the first result item is used.
    first_item = result.data[0]
    return first_item.embedding
@cachier.cachier()
def create_completion(prompt):
    """Return the model's reply to *prompt* as a single line of text.

    The prompt is sent as one user message to the chat completions endpoint
    (temperature 0, at most 100 tokens). Newlines in the reply are replaced
    with spaces. Calls are cached by the ``cachier`` decorator.
    """
    request = {
        'model': 'gpt-4-1106-preview',
        'messages': [{'role': 'user', 'content': prompt}],
        'temperature': 0,
        'max_tokens': 100,
    }
    reply = openai_client.chat.completions.create(**request)
    answer = reply.choices[0].message.content
    return answer.replace('\n', ' ')
def create_post(entry):
    """Convert one parsed feed entry into a flat post record.

    Args:
        entry: Mapping with 'id', 'link' and 'title' keys, plus either a
            'content' list whose first item has a 'value' key, or a
            'summary' string.

    Returns:
        A dict with the keys 'id', 'link', 'title' and 'content'.

    Raises:
        KeyError: If a required key is missing, including when the entry has
            neither 'content' nor 'summary'.
    """
    # Feeds configured to publish excerpts only carry no 'content' element;
    # fall back to the summary so such entries are still usable.
    content_items = entry.get('content')
    if content_items:
        content = content_items[0]['value']
    else:
        content = entry['summary']
    return {
        'id': entry['id'],
        'link': entry['link'],
        'title': entry['title'],
        'content': content,
    }
@cachier.cachier()
def download_posts(blog_domain):
    """Fetch every post from the blog's paged RSS feed.

    Pages are requested starting at 1 until a page yields no entries. Each
    entry is converted with ``create_post``. Calls are cached by the
    ``cachier`` decorator.

    Returns:
        A pandas DataFrame with one row per post.
    """
    print(f'Downloading posts from {blog_domain}')
    collected = []
    page_number = 1
    page_entries = feedparser.parse(f'https://{blog_domain}?feed=rss2&paged={page_number}')['entries']
    # An empty page marks the end of the feed.
    while page_entries:
        for feed_entry in page_entries:
            collected.append(create_post(feed_entry))
        page_number += 1
        page_entries = feedparser.parse(f'https://{blog_domain}?feed=rss2&paged={page_number}')['entries']
    return pd.DataFrame(collected)
def compute_embeddings(posts):
    """Add an 'embedding' column to *posts*, computed from each row's content.

    The DataFrame is modified in place; nothing is returned.
    """
    print('Computing embeddings for posts')

    def embed_row(row):
        # One embedding per post, derived from the post's content field.
        return create_embedding(row.content)

    posts['embedding'] = posts.apply(embed_row, axis=1)
def generate_clusters(posts, num_clusters):
    """Cluster the post embeddings and project them to two dimensions.

    The k-means label of every post is written to the 'cluster_index' column
    of *posts* (in place).

    Args:
        posts: DataFrame with an 'embedding' column.
        num_clusters: Number of k-means clusters to compute.

    Returns:
        The array produced by a two-component t-SNE fit of the stacked
        embeddings, one row per post.
    """
    print(f'Computing {num_clusters} clusters from post embeddings')
    post_vectors = np.vstack(posts.embedding.values)
    kmeans = KMeans(n_init='auto', n_clusters=num_clusters, init='k-means++', random_state=random_state)
    posts['cluster_index'] = kmeans.fit(post_vectors).labels_
    # t-SNE rejects a perplexity that is not smaller than the number of
    # samples, so small blogs need a value below the usual 15.
    perplexity = min(15, max(len(post_vectors) - 1, 1))
    tsne = TSNE(
        n_components=2,
        perplexity=perplexity,
        learning_rate=200,
        init='random',
        random_state=random_state,
    )
    return tsne.fit_transform(post_vectors)
def get_cluster_theme(posts, cluster_index):
    """Summarize the theme of one cluster from a sample of its posts.

    At most ``samples_per_cluster`` posts of the cluster are sampled with the
    shared random seed. Their contents are joined into a prompt and passed to
    ``create_completion``.

    Returns:
        A ``(cluster_samples, theme)`` tuple: the sampled rows and the
        completion text.
    """
    print(f'Determining the theme of cluster {cluster_index}')
    members = posts[posts.cluster_index == cluster_index]
    sample_count = min(members.shape[0], samples_per_cluster)
    cluster_samples = members.sample(sample_count, random_state=random_state)
    combined_content = '\n'.join(cluster_samples.content.values)
    prompt_sections = (
        summarization_prompt,
        'Posts:',
        '###',
        combined_content,
        '###',
        'Theme:',
    )
    theme = create_completion('\n\n'.join(prompt_sections))
    return cluster_samples, theme
def get_cluster_themes(posts):
    """Return ``(cluster_index, cluster_samples, theme)`` for every cluster.

    The cluster count is taken as the largest 'cluster_index' value plus one.
    """
    cluster_count = posts.cluster_index.max() + 1
    themes = []
    for index in range(cluster_count):
        samples, theme = get_cluster_theme(posts, index)
        themes.append((index, samples, theme))
    return themes
def create_theme_report(cluster_themes, report_filename):
    """Write the HTML theme report.

    The report embeds 'clustering.png', then lists each cluster under a
    heading named after its colour, followed by the theme sentence and links
    to the sampled posts.

    Args:
        cluster_themes: Iterable of ``(cluster_index, cluster_samples, theme)``
            tuples, where ``cluster_samples`` has 'link' and 'title' columns.
        report_filename: Path of the HTML file to write.
    """
    import html  # local import keeps this block self-contained

    print(f'Writing theme report to {report_filename}')
    color_names = [c.replace('tab:', '').capitalize() for c in mcolors.TABLEAU_COLORS]
    # An explicit encoding keeps non-ASCII titles writable regardless of the
    # platform's default encoding.
    with open(report_filename, 'w', encoding='utf-8') as report_file, contextlib.redirect_stdout(report_file):
        print('<h1>Topic Distribution</h1>')
        print('<img src="clustering.png"/>')
        for cluster_index, cluster_samples, theme in cluster_themes:
            # The palette is finite; wrap around rather than raising
            # IndexError when there are more clusters than colours.
            color_name = color_names[cluster_index % len(color_names)]
            print(f'<h1>{color_name} Posts</h1>')
            # Titles, links and themes come from the feed and the language
            # model, so escape them before embedding them in markup.
            print(f'<p>{html.escape(str(theme))}</p>')
            print('<ul>')
            for link, title in zip(cluster_samples.link.values, cluster_samples.title.values):
                print(f'<li><a href="{html.escape(str(link))}">{html.escape(str(title))}</a></li>')
            print('</ul>')
def create_theme_graph(posts, post_vectors, graph_filename):
    """Save a scatter plot of the projected posts, coloured by cluster.

    Each cluster's points are drawn semi-transparently and its mean position
    is marked with an 'x' in the same colour.

    Args:
        posts: DataFrame with a 'cluster_index' column.
        post_vectors: Two-column array with one row per post.
        graph_filename: Path of the image file to write.
    """
    print(f'Writing theme graph to {graph_filename}')
    num_themes = posts.cluster_index.max() + 1
    x_values, y_values = post_vectors.T
    labels = posts.cluster_index.values
    palette = list(mcolors.TABLEAU_COLORS)
    figure = plt.figure()
    try:
        for cluster_index in range(num_themes):
            # Wrap around the palette so clusters beyond its size are still
            # drawn instead of being silently dropped.
            color = palette[cluster_index % len(palette)]
            in_cluster = labels == cluster_index
            x_points = x_values[in_cluster]
            y_points = y_values[in_cluster]
            plt.scatter(x_points, y_points, color=color, alpha=0.3)
            plt.scatter(x_points.mean(), y_points.mean(), marker='x', color=color, s=100)
        plt.savefig(graph_filename)
    finally:
        # One figure is opened per report; close it so figures do not
        # accumulate when several reports are generated in one run.
        plt.close(figure)
def create_report(posts, post_vectors, cluster_themes):
    """Create the report folder and write the HTML report and the graph.

    The folder is 'reports/<N>-themes', where N is the largest
    'cluster_index' value plus one. It is created if it does not exist.
    """
    theme_count = posts.cluster_index.max() + 1
    report_folder = os.path.join('reports', f'{theme_count}-themes')
    print(f'Creating report at {report_folder}')
    os.makedirs(report_folder, exist_ok=True)
    html_path = os.path.join(report_folder, 'index.html')
    image_path = os.path.join(report_folder, 'clustering.png')
    create_theme_report(cluster_themes, html_path)
    create_theme_graph(posts, post_vectors, image_path)
# Script driver: download the posts once, embed them once, then build one
# report per requested theme count.
posts = download_posts(blog_domain)
if posts.empty:
    # Without posts there is nothing to embed or cluster; stop with a clear
    # message instead of failing later inside the numeric code.
    print(f'No posts could be downloaded from {blog_domain}', file=sys.stderr)
    sys.exit(1)
compute_embeddings(posts)
# A distinct loop variable avoids rebinding the list that is being iterated.
for theme_count in num_themes:
    post_vectors = generate_clusters(posts, theme_count)
    cluster_themes = get_cluster_themes(posts)
    create_report(posts, post_vectors, cluster_themes)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment