@SamSaffron, created February 19, 2025
classify_meta
#!/usr/bin/env ruby
require "fileutils"
require "json"
require "open-uri"

TOPICS_DIR = File.expand_path("./topics", __dir__)
LLM_MODEL = "Gemini Flash 2.0"

# Boot the full Discourse Rails environment so we can use LlmModel and
# DiscourseAi::Completions directly.
Dir.chdir("/home/sam/Source/discourse")
require "/home/sam/Source/discourse/config/environment"

# Send a single prompt to the configured LLM via Discourse AI and return the
# raw completion text.
def ask_llm(system_message, user_message)
  llm_model = LlmModel.find_by(display_name: LLM_MODEL)
  llm = llm_model.to_llm
  messages = [{ type: :user, content: user_message }]
  prompt =
    DiscourseAi::Completions::Prompt.new(system_message, messages: messages)
  llm.generate(prompt, user: Discourse.system_user)
end
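
# Quick smoke test (assumes an LLM with the display name "Gemini Flash 2.0"
# has been configured in the Discourse admin UI; the name must match exactly):
#   puts ask_llm("Reply with one word.", "What color is the sky?")
#   # => e.g. "Blue"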

def download_topic(site, topic_id)
  # Construct the URL for the topic JSON
  url = "https://#{site}/t/#{topic_id}.json"

  # Create TOPICS_DIR if it doesn't exist
  FileUtils.mkdir_p(TOPICS_DIR)

  # Define the output file path
  output_file = File.join(TOPICS_DIR, "#{topic_id}.json")

  # Download and save the topic JSON
  begin
    response = URI.open(url).read
    File.write(output_file, response)
  rescue OpenURI::HTTPError => e
    puts "Error downloading topic #{topic_id}: #{e.message}"
  rescue StandardError => e
    puts "Unexpected error: #{e.message}"
  end
end

# Page through /latest.json until we have collected `count` topic ids.
def list_topic_ids(site, count)
  all_ids = []
  page = 0

  while all_ids.size < count
    url = "https://#{site}/latest.json?no_definitions=true&page=#{page}"
    begin
      response = URI.open(url).read
      data = JSON.parse(response)
      topics = data["topic_list"]["topics"]
      break if topics.empty?
      all_ids.concat(topics.map { |t| t["id"] })
      page += 1
    rescue OpenURI::HTTPError => e
      puts "Error fetching topic list: #{e.message}"
      break
    rescue StandardError => e
      puts "Unexpected error: #{e.message}"
      break
    end
  end

  all_ids.uniq.first(count)
end

# Download up to `count` of the latest topics from `site`, skipping topics
# already on disk unless only_new is false.
def download_topics(site, count, only_new: true)
  topic_ids = list_topic_ids(site, count)
  topic_ids.each do |topic_id|
    output_file = File.join(TOPICS_DIR, "#{topic_id}.json")
    next if File.exist?(output_file) && only_new
    download_topic(site, topic_id)
  end
end
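
# Usage sketch, mirroring the example at the bottom of the script
# (topic ids are illustrative):
#   download_topics("meta.discourse.org", 500)
#   # => ./topics/101.json, ./topics/102.json, ...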

# Ask the LLM to extract 3-5 key concepts from a downloaded topic. When
# existing_concepts is given, the model is nudged to reuse those names so a
# second pass converges on a shared vocabulary.
def extract_concepts(topic_id, existing_concepts = nil)
  topic_file = File.join(TOPICS_DIR, "#{topic_id}.json")
  return [] unless File.exist?(topic_file)

  topic_data = JSON.parse(File.read(topic_file))
  posts = topic_data["post_stream"]["posts"]

  system_message = <<~PROMPT
    You are a concept extraction assistant. Extract key concepts from the given text.
    - Concepts should be 1-3 words
    - Return 3-5 key concepts that best represent the discussion
    - Format output as JSON array of [concept, [post_numbers]]
    Example:
    [
      ["Discourse update", [1,2]],
      ["Docker update", [3,4]]
    ]
    IMPORTANT: Only ever reply with valid JSON, do not return any other text
    IMPORTANT: Do not wrap the result with ```json or any other formatting
  PROMPT

  context =
    if existing_concepts
      "Existing concepts: #{existing_concepts.join(", ")}. Please prefer these concepts when applicable.\n\n"
    else
      ""
    end

  # Add metadata section
  metadata = []
  metadata << "Title: #{topic_data["title"]}" if topic_data["title"]
  if topic_data["category_name"]
    metadata << "Category: #{topic_data["category_name"]}"
  end
  if topic_data["tags"]&.any?
    metadata << "Tags: #{topic_data["tags"].join(", ")}"
  end

  content = [
    metadata.join("\n"),
    posts
      .map { |post| "Post ##{post["post_number"]}: #{post["cooked"]}" }
      .join("\n\n")
  ].join("\n\n")

  user_message = context + content

  retries = 0
  max_retries = 5
  begin
    result = ask_llm(system_message, user_message)
  rescue => e
    if e.message.include?("RESOURCE_EXHAUSTED") && retries < max_retries
      retries += 1
      puts "Rate limited, waiting 20 seconds... (attempt #{retries}/#{max_retries})"
      sleep 20
      retry
    else
      puts "Error after #{retries} retries: #{e.message}"
      return []
    end
  end

  # Gemini likes to wrap responses in a code fence despite the instructions,
  # so strip any fence lines before parsing.
  result = result.gsub(/^```.*/, "").strip
  begin
    JSON.parse(result)
  rescue JSON::ParserError
    puts "Error parsing LLM response for topic #{topic_id}"
    puts result
    []
  end
end
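
# The parsed result maps concepts to the post numbers they came from, e.g.
# (illustrative values):
#   extract_concepts(1234)
#   # => [["Docker update", [1, 2]], ["Backup restore", [3]]]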

# First pass: extract concepts for every downloaded topic, caching each
# result under ./first_pass so reruns only process new topics.
def extract_all_concepts
  first_pass_dir = File.expand_path("./first_pass", __dir__)
  FileUtils.mkdir_p(first_pass_dir)

  Dir.glob(File.join(TOPICS_DIR, "*.json")).each do |topic_file|
    topic_id = File.basename(topic_file, ".json")
    output_file = File.join(first_pass_dir, "#{topic_id}.json")

    # Skip if already processed
    next if File.exist?(output_file)

    puts "Processing topic #{topic_id}..."
    concepts = extract_concepts(topic_id)
    p concepts

    # Save concepts to file
    File.write(output_file, JSON.pretty_generate(concepts))
  end
end

# Aggregate the first-pass concepts into a "concept: count" text block,
# sorted by frequency, ready to paste into the normalization prompt.
def list_all_concepts
  first_pass_dir = File.expand_path("./first_pass", __dir__)
  concept_counts = Hash.new(0)

  Dir.glob(File.join(first_pass_dir, "*.json")).each do |file|
    begin
      concepts = JSON.parse(File.read(file))
      concepts.each { |concept, _posts| concept_counts[concept] += 1 }
    rescue JSON::ParserError => e
      puts "Error parsing #{file}: #{e.message}"
    end
  end

  concepts = +""
  # Sort by count in descending order and append one "concept: count" line each
  concept_counts
    .sort_by { |_, count| -count }
    .each { |concept, count| concepts << "#{concept}: #{count}\n" }
  concepts
end
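
# The returned text is one "concept: count" line per concept, most frequent
# first (illustrative values):
#   Docker Update: 45
#   Theme Development: 32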

# Second step: ask the LLM to merge and title-case the raw concepts down to
# the top `max_concepts`, caching the result in normalized_concepts.json.
def normalize_concepts(max_concepts = 100)
  normalized_file = File.expand_path("./normalized_concepts.json", __dir__)

  # Return cached results if they exist
  return JSON.parse(File.read(normalized_file)) if File.exist?(normalized_file)

  concepts = list_all_concepts

  system_message = <<~PROMPT
    You are a concept normalization assistant. Given a list of concepts and their frequencies,
    normalize them according to these rules:
    1. Use consistent title case for all concepts
    2. Merge similar or duplicate concepts (e.g. "docker update" and "updating docker")
    3. Generalize overly specific concepts while maintaining meaning
    4. Return only the top #{max_concepts} most relevant concepts
    Format output as JSON array of [normalized_concept, count]:
    [
      ["Docker Updates", 45],
      ["Performance Optimization", 32]
    ]
    IMPORTANT: Only reply with valid JSON, no other text
    IMPORTANT: Maintain relative usage counts when merging concepts
  PROMPT

  user_message = "Here are the concepts to normalize:\n#{concepts}"

  retries = 0
  max_retries = 5
  begin
    result = ask_llm(system_message, user_message)
    result = result.gsub(/^```.*/, "").strip
    normalized = JSON.parse(result)

    # Cache the results
    File.write(normalized_file, JSON.pretty_generate(normalized))
    normalized
  rescue => e
    if e.message.include?("RESOURCE_EXHAUSTED") && retries < max_retries
      retries += 1
      puts "Rate limited, waiting 20 seconds... (attempt #{retries}/#{max_retries})"
      sleep 20
      retry
    else
      puts "Error after #{retries} retries: #{e.message}"
      []
    end
  end
end
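
# The cached normalized_concepts.json is an array of [name, count] pairs,
# matching the prompt's example format. Delete the file to force a fresh
# normalization pass.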

# Final pass: re-run concept extraction over every topic, this time feeding
# in the normalized vocabulary so the model prefers canonical names.
def reclassify_topics
  normalized_concepts = normalize_concepts(100)
  concept_names = normalized_concepts.map { |concept, _| concept }

  final_dir = File.expand_path("./final_classifications", __dir__)
  FileUtils.mkdir_p(final_dir)

  Dir.glob(File.join(TOPICS_DIR, "*.json")).each do |topic_file|
    topic_id = File.basename(topic_file, ".json")
    output_file = File.join(final_dir, "#{topic_id}.json")

    # Skip if already processed
    next if File.exist?(output_file)

    puts "Reclassifying topic #{topic_id}..."
    concepts = extract_concepts(topic_id, concept_names)

    # Save final classification
    File.write(output_file, JSON.pretty_generate(concepts))
  end
end

# Roll the final classifications up into one summary file: how often each
# concept appears, and which topics it appears in.
def summarize_classifications
  final_dir = File.expand_path("./final_classifications", __dir__)
  concept_counts = Hash.new(0)
  topic_concepts = Hash.new { |h, k| h[k] = [] }

  Dir.glob(File.join(final_dir, "*.json")).each do |file|
    topic_id = File.basename(file, ".json")
    begin
      concepts = JSON.parse(File.read(file))
      concepts.each do |concept, _posts|
        concept_counts[concept] += 1
        topic_concepts[concept] << topic_id
      end
    rescue JSON::ParserError => e
      puts "Error parsing #{file}: #{e.message}"
    end
  end

  summary = {
    concept_counts: concept_counts.sort_by { |_, count| -count }.to_h,
    topic_concepts: topic_concepts
  }

  File.write(
    File.expand_path("./classification_summary.json", __dir__),
    JSON.pretty_generate(summary)
  )
  summary
end
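
# classification_summary.json ends up shaped like this (illustrative values):
#   {
#     "concept_counts": { "Docker Updates": 45, "Performance Optimization": 32 },
#     "topic_concepts": { "Docker Updates": ["1234", "1300"] }
#   }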

# Join each topic's metadata with its final classification into one
# graph_data.json, suitable for feeding a concept-graph visualization.
def generate_graph_data
  final_dir = File.expand_path("./final_classifications", __dir__)
  topics_data = []

  Dir.glob(File.join(TOPICS_DIR, "*.json")).each do |topic_file|
    topic_id = File.basename(topic_file, ".json")

    # Read topic data
    topic_json = JSON.parse(File.read(topic_file))

    # Read classification data
    classification_file = File.join(final_dir, "#{topic_id}.json")
    next unless File.exist?(classification_file)
    concepts = JSON.parse(File.read(classification_file))

    # Create topic entry
    topics_data << {
      id: topic_id,
      slug: topic_json["slug"],
      title: topic_json["title"],
      concepts: concepts.to_h # Convert the array of [concept, posts] pairs to a hash
    }
  end

  graph_data = { topics: topics_data }

  # Save to file
  output_file = File.expand_path("./graph_data.json", __dir__)
  File.write(output_file, JSON.pretty_generate(graph_data))
  graph_data
end
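
# graph_data.json pairs each topic with its concept map (illustrative values):
#   { "topics": [
#       { "id": "1234", "slug": "docker-update-fails",
#         "title": "Docker update fails",
#         "concepts": { "Docker Updates": [1, 2] } } ] }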

# Example usage: run the steps in order. Each pass caches its output to disk,
# so the script can be stopped and resumed safely.
# download_topics("meta.discourse.org", 500)
# extract_all_concepts
# list_all_concepts
# normalize_concepts(100)
# reclassify_topics
# summarize_classifications
generate_graph_data