@SamSaffron, created February 19, 2025
classify_meta
#!/usr/bin/env ruby
require "fileutils"
require "json"
require "open-uri"

TOPICS_DIR = File.expand_path("./topics", __dir__)
LLM_MODEL = "Gemini Flash 2.0"

# Boot the full Discourse Rails environment so we can use LlmModel and
# DiscourseAi::Completions directly.
Dir.chdir("/home/sam/Source/discourse")
require "/home/sam/Source/discourse/config/environment"

# Send a single prompt to the configured LLM via Discourse AI and return the
# raw completion text.
def ask_llm(system_message, user_message)
  llm_model = LlmModel.find_by(display_name: LLM_MODEL)
  llm = llm_model.to_llm
  messages = [{ type: :user, content: user_message }]
  prompt =
    DiscourseAi::Completions::Prompt.new(system_message, messages: messages)
  llm.generate(prompt, user: Discourse.system_user)
end
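
# Quick smoke test (assumes an LLM with the display name "Gemini Flash 2.0"
# has been configured in the Discourse admin UI; the name must match exactly):
#   puts ask_llm("Reply with one word.", "What color is the sky?")
#   # => e.g. "Blue"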

def download_topic(site, topic_id)
  # Construct the URL for the topic JSON
  url = "https://#{site}/t/#{topic_id}.json"

  # Create TOPICS_DIR if it doesn't exist
  FileUtils.mkdir_p(TOPICS_DIR)

  # Define the output file path
  output_file = File.join(TOPICS_DIR, "#{topic_id}.json")

  # Download and save the topic JSON
  begin
    response = URI.open(url).read
    File.write(output_file, response)
  rescue OpenURI::HTTPError => e
    puts "Error downloading topic #{topic_id}: #{e.message}"
  rescue StandardError => e
    puts "Unexpected error: #{e.message}"
  end
end

# Page through /latest.json until we have collected `count` topic ids.
def list_topic_ids(site, count)
  all_ids = []
  page = 0

  while all_ids.size < count
    url = "https://#{site}/latest.json?no_definitions=true&page=#{page}"
    begin
      response = URI.open(url).read
      data = JSON.parse(response)
      topics = data["topic_list"]["topics"]
      break if topics.empty?
      all_ids.concat(topics.map { |t| t["id"] })
      page += 1
    rescue OpenURI::HTTPError => e
      puts "Error fetching topic list: #{e.message}"
      break
    rescue StandardError => e
      puts "Unexpected error: #{e.message}"
      break
    end
  end

  all_ids.uniq.first(count)
end

# Download up to `count` of the latest topics from `site`, skipping topics
# already on disk unless only_new is false.
def download_topics(site, count, only_new: true)
  topic_ids = list_topic_ids(site, count)
  topic_ids.each do |topic_id|
    output_file = File.join(TOPICS_DIR, "#{topic_id}.json")
    next if File.exist?(output_file) && only_new
    download_topic(site, topic_id)
  end
end
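
# Usage sketch, mirroring the example at the bottom of the script
# (topic ids are illustrative):
#   download_topics("meta.discourse.org", 500)
#   # => ./topics/101.json, ./topics/102.json, ...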

# Ask the LLM to extract 3-5 key concepts from a downloaded topic. When
# existing_concepts is given, the model is nudged to reuse those names so a
# second pass converges on a shared vocabulary.
def extract_concepts(topic_id, existing_concepts = nil)
  topic_file = File.join(TOPICS_DIR, "#{topic_id}.json")
  return [] unless File.exist?(topic_file)

  topic_data = JSON.parse(File.read(topic_file))
  posts = topic_data["post_stream"]["posts"]

  system_message = <<~PROMPT
    You are a concept extraction assistant. Extract key concepts from the given text.
    - Concepts should be 1-3 words
    - Return 3-5 key concepts that best represent the discussion
    - Format output as JSON array of [concept, [post_numbers]]
    Example:
    [
      ["Discourse update", [1,2]],
      ["Docker update", [3,4]]
    ]
    IMPORTANT: Only ever reply with valid JSON, do not return any other text
    IMPORTANT: Do not wrap the result with ```json or any other formatting
  PROMPT

  context =
    if existing_concepts
      "Existing concepts: #{existing_concepts.join(", ")}. Please prefer these concepts when applicable.\n\n"
    else
      ""
    end

  # Add metadata section
  metadata = []
  metadata << "Title: #{topic_data["title"]}" if topic_data["title"]
  if topic_data["category_name"]
    metadata << "Category: #{topic_data["category_name"]}"
  end
  if topic_data["tags"]&.any?
    metadata << "Tags: #{topic_data["tags"].join(", ")}"
  end

  content = [
    metadata.join("\n"),
    posts
      .map { |post| "Post ##{post["post_number"]}: #{post["cooked"]}" }
      .join("\n\n")
  ].join("\n\n")

  user_message = context + content

  retries = 0
  max_retries = 5
  begin
    result = ask_llm(system_message, user_message)
  rescue => e
    if e.message.include?("RESOURCE_EXHAUSTED") && retries < max_retries
      retries += 1
      puts "Rate limited, waiting 20 seconds... (attempt #{retries}/#{max_retries})"
      sleep 20
      retry
    else
      puts "Error after #{retries} retries: #{e.message}"
      return []
    end
  end

  # Gemini likes to wrap responses in a code fence despite the instructions,
  # so strip any fence lines before parsing.
  result = result.gsub(/^```.*/, "").strip
  begin
    JSON.parse(result)
  rescue JSON::ParserError
    puts "Error parsing LLM response for topic #{topic_id}"
    puts result
    []
  end
end
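
# The parsed result maps concepts to the post numbers they came from, e.g.
# (illustrative values):
#   extract_concepts(1234)
#   # => [["Docker update", [1, 2]], ["Backup restore", [3]]]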

# First pass: extract concepts for every downloaded topic, caching each
# result under ./first_pass so reruns only process new topics.
def extract_all_concepts
  first_pass_dir = File.expand_path("./first_pass", __dir__)
  FileUtils.mkdir_p(first_pass_dir)

  Dir.glob(File.join(TOPICS_DIR, "*.json")).each do |topic_file|
    topic_id = File.basename(topic_file, ".json")
    output_file = File.join(first_pass_dir, "#{topic_id}.json")

    # Skip if already processed
    next if File.exist?(output_file)

    puts "Processing topic #{topic_id}..."
    concepts = extract_concepts(topic_id)
    p concepts

    # Save concepts to file
    File.write(output_file, JSON.pretty_generate(concepts))
  end
end

# Aggregate the first-pass concepts into a "concept: count" text block,
# sorted by frequency, ready to paste into the normalization prompt.
def list_all_concepts
  first_pass_dir = File.expand_path("./first_pass", __dir__)
  concept_counts = Hash.new(0)

  Dir.glob(File.join(first_pass_dir, "*.json")).each do |file|
    begin
      concepts = JSON.parse(File.read(file))
      concepts.each { |concept, _posts| concept_counts[concept] += 1 }
    rescue JSON::ParserError => e
      puts "Error parsing #{file}: #{e.message}"
    end
  end

  concepts = +""
  # Sort by count in descending order and append one "concept: count" line each
  concept_counts
    .sort_by { |_, count| -count }
    .each { |concept, count| concepts << "#{concept}: #{count}\n" }
  concepts
end
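
# The returned text is one "concept: count" line per concept, most frequent
# first (illustrative values):
#   Docker Update: 45
#   Theme Development: 32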

# Second step: ask the LLM to merge and title-case the raw concepts down to
# the top `max_concepts`, caching the result in normalized_concepts.json.
def normalize_concepts(max_concepts = 100)
  normalized_file = File.expand_path("./normalized_concepts.json", __dir__)

  # Return cached results if they exist
  return JSON.parse(File.read(normalized_file)) if File.exist?(normalized_file)

  concepts = list_all_concepts

  system_message = <<~PROMPT
    You are a concept normalization assistant. Given a list of concepts and their frequencies,
    normalize them according to these rules:
    1. Use consistent title case for all concepts
    2. Merge similar or duplicate concepts (e.g. "docker update" and "updating docker")
    3. Generalize overly specific concepts while maintaining meaning
    4. Return only the top #{max_concepts} most relevant concepts
    Format output as JSON array of [normalized_concept, count]:
    [
      ["Docker Updates", 45],
      ["Performance Optimization", 32]
    ]
    IMPORTANT: Only reply with valid JSON, no other text
    IMPORTANT: Maintain relative usage counts when merging concepts
  PROMPT

  user_message = "Here are the concepts to normalize:\n#{concepts}"

  retries = 0
  max_retries = 5
  begin
    result = ask_llm(system_message, user_message)
    result = result.gsub(/^```.*/, "").strip
    normalized = JSON.parse(result)

    # Cache the results
    File.write(normalized_file, JSON.pretty_generate(normalized))
    normalized
  rescue => e
    if e.message.include?("RESOURCE_EXHAUSTED") && retries < max_retries
      retries += 1
      puts "Rate limited, waiting 20 seconds... (attempt #{retries}/#{max_retries})"
      sleep 20
      retry
    else
      puts "Error after #{retries} retries: #{e.message}"
      []
    end
  end
end
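
# The cached normalized_concepts.json is an array of [name, count] pairs,
# matching the prompt's example format. Delete the file to force a fresh
# normalization pass.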

# Final pass: re-run concept extraction over every topic, this time feeding
# in the normalized vocabulary so the model prefers canonical names.
def reclassify_topics
  normalized_concepts = normalize_concepts(100)
  concept_names = normalized_concepts.map { |concept, _| concept }

  final_dir = File.expand_path("./final_classifications", __dir__)
  FileUtils.mkdir_p(final_dir)

  Dir.glob(File.join(TOPICS_DIR, "*.json")).each do |topic_file|
    topic_id = File.basename(topic_file, ".json")
    output_file = File.join(final_dir, "#{topic_id}.json")

    # Skip if already processed
    next if File.exist?(output_file)

    puts "Reclassifying topic #{topic_id}..."
    concepts = extract_concepts(topic_id, concept_names)

    # Save final classification
    File.write(output_file, JSON.pretty_generate(concepts))
  end
end

# Roll the final classifications up into one summary file: how often each
# concept appears, and which topics it appears in.
def summarize_classifications
  final_dir = File.expand_path("./final_classifications", __dir__)
  concept_counts = Hash.new(0)
  topic_concepts = Hash.new { |h, k| h[k] = [] }

  Dir.glob(File.join(final_dir, "*.json")).each do |file|
    topic_id = File.basename(file, ".json")
    begin
      concepts = JSON.parse(File.read(file))
      concepts.each do |concept, _posts|
        concept_counts[concept] += 1
        topic_concepts[concept] << topic_id
      end
    rescue JSON::ParserError => e
      puts "Error parsing #{file}: #{e.message}"
    end
  end

  summary = {
    concept_counts: concept_counts.sort_by { |_, count| -count }.to_h,
    topic_concepts: topic_concepts
  }

  File.write(
    File.expand_path("./classification_summary.json", __dir__),
    JSON.pretty_generate(summary)
  )
  summary
end
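
# classification_summary.json ends up shaped like this (illustrative values):
#   {
#     "concept_counts": { "Docker Updates": 45, "Performance Optimization": 32 },
#     "topic_concepts": { "Docker Updates": ["1234", "1300"] }
#   }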

# Join each topic's metadata with its final classification into one
# graph_data.json, suitable for feeding a concept-graph visualization.
def generate_graph_data
  final_dir = File.expand_path("./final_classifications", __dir__)
  topics_data = []

  Dir.glob(File.join(TOPICS_DIR, "*.json")).each do |topic_file|
    topic_id = File.basename(topic_file, ".json")

    # Read topic data
    topic_json = JSON.parse(File.read(topic_file))

    # Read classification data
    classification_file = File.join(final_dir, "#{topic_id}.json")
    next unless File.exist?(classification_file)
    concepts = JSON.parse(File.read(classification_file))

    # Create topic entry
    topics_data << {
      id: topic_id,
      slug: topic_json["slug"],
      title: topic_json["title"],
      concepts: concepts.to_h # Convert the array of [concept, posts] pairs to a hash
    }
  end

  graph_data = { topics: topics_data }

  # Save to file
  output_file = File.expand_path("./graph_data.json", __dir__)
  File.write(output_file, JSON.pretty_generate(graph_data))
  graph_data
end
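
# graph_data.json pairs each topic with its concept map (illustrative values):
#   { "topics": [
#       { "id": "1234", "slug": "docker-update-fails",
#         "title": "Docker update fails",
#         "concepts": { "Docker Updates": [1, 2] } } ] }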

# Example usage: run the steps in order. Each pass caches its output to disk,
# so the script can be stopped and resumed safely.
# download_topics("meta.discourse.org", 500)
# extract_all_concepts
# list_all_concepts
# normalize_concepts(100)
# reclassify_topics
# summarize_classifications
generate_graph_data