classify_meta

#!/usr/bin/env ruby

require "fileutils"
require "open-uri"
require "json"

TOPICS_DIR = File.expand_path("./topics", __dir__)
LLM_MODEL = "Gemini Flash 2.0"

# Boot the Discourse Rails environment so LlmModel, DiscourseAi and
# Discourse.system_user are available (the path is this author's checkout).
Dir.chdir("/home/sam/Source/discourse")
require "/home/sam/Source/discourse/config/environment"

def ask_llm(system_message, user_message)
  llm_model = LlmModel.find_by(display_name: LLM_MODEL)
  llm = llm_model.to_llm
  messages = [{ type: :user, content: user_message }]
  prompt =
    DiscourseAi::Completions::Prompt.new(system_message, messages: messages)
  llm.generate(prompt, user: Discourse.system_user)
end
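
# Hypothetical example (question and output are placeholders; assumes an
# LlmModel with display_name "Gemini Flash 2.0" is configured on the site):
#   puts ask_llm("Reply with one word.", "What language is Discourse written in?")
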
def download_topic(site, topic_id)
  # Construct the URL for the topic JSON
  url = "https://#{site}/t/#{topic_id}.json"

  # Create TOPICS_DIR if it doesn't exist
  FileUtils.mkdir_p(TOPICS_DIR)

  # Define the output file path
  output_file = File.join(TOPICS_DIR, "#{topic_id}.json")

  # Download and save the topic JSON
  begin
    response = URI.open(url).read
    File.write(output_file, response)
  rescue OpenURI::HTTPError => e
    puts "Error downloading topic #{topic_id}: #{e.message}"
  rescue StandardError => e
    puts "Unexpected error: #{e.message}"
  end
end
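
# Example (topic id 42 is a placeholder):
#   download_topic("meta.discourse.org", 42) # writes ./topics/42.json
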
def list_topic_ids(site, count)
  all_ids = []
  page = 0

  # Page through /latest.json until we have enough ids or run out of topics
  while all_ids.size < count
    url = "https://#{site}/latest.json?no_definitions=true&page=#{page}"
    begin
      response = URI.open(url).read
      data = JSON.parse(response)
      topics = data["topic_list"]["topics"]
      break if topics.empty?
      all_ids.concat(topics.map { |t| t["id"] })
      page += 1
    rescue OpenURI::HTTPError => e
      puts "Error fetching topic list: #{e.message}"
      break
    rescue StandardError => e
      puts "Unexpected error: #{e.message}"
      break
    end
  end

  all_ids.uniq.first(count)
end
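
# Example (site and count are illustrative):
#   ids = list_topic_ids("meta.discourse.org", 30)
#   ids.size # => at most 30; fewer if the site has no more pages
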
def download_topics(site, count, only_new: true)
  topic_ids = list_topic_ids(site, count)
  topic_ids.each do |topic_id|
    output_file = File.join(TOPICS_DIR, "#{topic_id}.json")
    # Skip topics already on disk unless a re-download was requested
    next if File.exist?(output_file) && only_new
    download_topic(site, topic_id)
  end
end
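
# Pass only_new: false to re-download topics that already exist on disk:
#   download_topics("meta.discourse.org", 500, only_new: false)
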
def extract_concepts(topic_id, existing_concepts = nil)
  topic_file = File.join(TOPICS_DIR, "#{topic_id}.json")
  return [] unless File.exist?(topic_file)

  topic_data = JSON.parse(File.read(topic_file))
  posts = topic_data["post_stream"]["posts"]

  system_message = <<~PROMPT
    You are a concept extraction assistant. Extract key concepts from the given text.
    - Concepts should be 1-3 words
    - Return 3-5 key concepts that best represent the discussion
    - Format output as JSON array of [concept, [post_numbers]]
    Example:
    [
      ["Discourse update", [1,2]],
      ["Docker update", [3,4]]
    ]
    IMPORTANT: Only ever reply with valid JSON, do not return any other text
    IMPORTANT: Do not wrap the result with ```json or any other formatting
  PROMPT

  # On the second pass, nudge the model towards the normalized vocabulary
  context =
    if existing_concepts
      "Existing concepts: #{existing_concepts.join(", ")}. Please prefer these concepts when applicable.\n\n"
    else
      ""
    end

  # Add metadata section
  metadata = []
  metadata << "Title: #{topic_data["title"]}" if topic_data["title"]
  if topic_data["category_name"]
    metadata << "Category: #{topic_data["category_name"]}"
  end
  metadata << "Tags: #{topic_data["tags"].join(", ")}" if topic_data["tags"]&.any?

  content = [
    metadata.join("\n"),
    posts
      .map { |post| "Post ##{post["post_number"]}: #{post["cooked"]}" }
      .join("\n\n")
  ].join("\n\n")
  user_message = context + content

  retries = 0
  max_retries = 5
  begin
    result = ask_llm(system_message, user_message)
  rescue => e
    if e.message.include?("RESOURCE_EXHAUSTED") && retries < max_retries
      retries += 1
      puts "Rate limited, waiting 20 seconds... (attempt #{retries}/#{max_retries})"
      sleep 20
      retry
    else
      puts "Error after #{retries} retries: #{e.message}"
      return []
    end
  end

  # Gemini likes to wrap replies in ``` fences despite the instructions; strip them
  result = result.gsub(/^```.*/, "").strip

  begin
    JSON.parse(result)
  rescue JSON::ParserError
    puts "Error parsing LLM response for topic #{topic_id}"
    puts result
    []
  end
end
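
# A successful call returns the parsed JSON, shaped per the prompt's example:
#   extract_concepts(42) # hypothetical topic id
#   # => [["Discourse update", [1, 2]], ["Docker update", [3, 4]]]
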
def extract_all_concepts
  first_pass_dir = File.expand_path("./first_pass", __dir__)
  FileUtils.mkdir_p(first_pass_dir)

  Dir
    .glob(File.join(TOPICS_DIR, "*.json"))
    .each do |topic_file|
      topic_id = File.basename(topic_file, ".json")
      output_file = File.join(first_pass_dir, "#{topic_id}.json")

      # Skip if already processed
      next if File.exist?(output_file)

      puts "Processing topic #{topic_id}..."
      concepts = extract_concepts(topic_id)
      p concepts

      # Save concepts to file
      File.write(output_file, JSON.pretty_generate(concepts))
    end
end

def list_all_concepts
  first_pass_dir = File.expand_path("./first_pass", __dir__)
  concept_counts = Hash.new(0)

  Dir
    .glob(File.join(first_pass_dir, "*.json"))
    .each do |file|
      begin
        concepts = JSON.parse(File.read(file))
        concepts.each { |concept, _posts| concept_counts[concept] += 1 }
      rescue JSON::ParserError => e
        puts "Error parsing #{file}: #{e.message}"
      end
    end

  concepts = +""
  # Sort by count in descending order, one "concept: count" line per concept
  concept_counts
    .sort_by { |_, count| -count }
    .each { |concept, count| concepts << "#{concept}: #{count}\n" }
  concepts
end
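
# Returns one "concept: count" line per concept, most frequent first, e.g.
# (illustrative values): "Docker Updates: 45\nPerformance Optimization: 32\n"
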
def normalize_concepts(max_concepts = 100)
  normalized_file = File.expand_path("./normalized_concepts.json", __dir__)

  # Return cached results if they exist
  return JSON.parse(File.read(normalized_file)) if File.exist?(normalized_file)

  concepts = list_all_concepts

  system_message = <<~PROMPT
    You are a concept normalization assistant. Given a list of concepts and their frequencies,
    normalize them according to these rules:
    1. Use consistent title case for all concepts
    2. Merge similar or duplicate concepts (e.g. "docker update" and "updating docker")
    3. Generalize overly specific concepts while maintaining meaning
    4. Return only the top #{max_concepts} most relevant concepts
    Format output as JSON array of [normalized_concept, count]:
    [
      ["Docker Updates", 45],
      ["Performance Optimization", 32]
    ]
    IMPORTANT: Only reply with valid JSON, no other text
    IMPORTANT: Maintain relative usage counts when merging concepts
  PROMPT

  user_message = "Here are the concepts to normalize:\n#{concepts}"

  retries = 0
  max_retries = 5
  begin
    result = ask_llm(system_message, user_message)
    result = result.gsub(/^```.*/, "").strip
    normalized = JSON.parse(result)

    # Cache the results
    File.write(normalized_file, JSON.pretty_generate(normalized))
    normalized
  rescue => e
    if e.message.include?("RESOURCE_EXHAUSTED") && retries < max_retries
      retries += 1
      puts "Rate limited, waiting 20 seconds... (attempt #{retries}/#{max_retries})"
      sleep 20
      retry
    else
      puts "Error after #{retries} retries: #{e.message}"
      []
    end
  end
end
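
# The result mirrors the prompt's example (illustrative values):
#   normalize_concepts(100)
#   # => [["Docker Updates", 45], ["Performance Optimization", 32]]
# Delete ./normalized_concepts.json to force a fresh normalization pass.
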
def reclassify_topics
  normalized_concepts = normalize_concepts(100)
  concept_names = normalized_concepts.map { |concept, _| concept }

  final_dir = File.expand_path("./final_classifications", __dir__)
  FileUtils.mkdir_p(final_dir)

  Dir
    .glob(File.join(TOPICS_DIR, "*.json"))
    .each do |topic_file|
      topic_id = File.basename(topic_file, ".json")
      output_file = File.join(final_dir, "#{topic_id}.json")

      # Skip if already processed
      next if File.exist?(output_file)

      puts "Reclassifying topic #{topic_id}..."
      concepts = extract_concepts(topic_id, concept_names)

      # Save final classification
      File.write(output_file, JSON.pretty_generate(concepts))
    end
end

def summarize_classifications
  final_dir = File.expand_path("./final_classifications", __dir__)
  concept_counts = Hash.new(0)
  topic_concepts = Hash.new { |h, k| h[k] = [] }

  Dir
    .glob(File.join(final_dir, "*.json"))
    .each do |file|
      topic_id = File.basename(file, ".json")
      begin
        concepts = JSON.parse(File.read(file))
        concepts.each do |concept, _posts|
          concept_counts[concept] += 1
          topic_concepts[concept] << topic_id
        end
      rescue JSON::ParserError => e
        puts "Error parsing #{file}: #{e.message}"
      end
    end

  summary = {
    concept_counts: concept_counts.sort_by { |_, count| -count }.to_h,
    topic_concepts: topic_concepts
  }

  File.write(
    File.expand_path("./classification_summary.json", __dir__),
    JSON.pretty_generate(summary)
  )
  summary
end
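
# summary is also written to ./classification_summary.json; shape (illustrative):
#   { concept_counts: { "Docker Updates" => 12 },
#     topic_concepts: { "Docker Updates" => ["101", "205"] } }
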
def generate_graph_data
  final_dir = File.expand_path("./final_classifications", __dir__)
  topics_data = []

  Dir
    .glob(File.join(TOPICS_DIR, "*.json"))
    .each do |topic_file|
      topic_id = File.basename(topic_file, ".json")

      # Read topic data
      topic_json = JSON.parse(File.read(topic_file))

      # Read classification data
      classification_file = File.join(final_dir, "#{topic_id}.json")
      next unless File.exist?(classification_file)
      concepts = JSON.parse(File.read(classification_file))

      # Create topic entry
      topic_entry = {
        id: topic_id,
        slug: topic_json["slug"],
        title: topic_json["title"],
        concepts: concepts.to_h # Convert the array of [concept, posts] to a hash
      }
      topics_data << topic_entry
    end

  graph_data = { topics: topics_data }

  # Save to file
  output_file = File.expand_path("./graph_data.json", __dir__)
  File.write(output_file, JSON.pretty_generate(graph_data))
  graph_data
end
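
# Each entry in graph_data[:topics] looks like (illustrative values):
#   { id: "42", slug: "some-topic", title: "Some topic title",
#     concepts: { "Docker update" => [3, 4] } }
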
# Example usage -- run the pipeline stages in order:
# download_topics("meta.discourse.org", 500)
# extract_all_concepts
# list_all_concepts
# normalize_concepts(100)
# reclassify_topics
generate_graph_data