Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Save zk-1/eed6de21935d2bd04455b87516198c3d to your computer and use it in GitHub Desktop.
Save zk-1/eed6de21935d2bd04455b87516198c3d to your computer and use it in GitHub Desktop.
Ruby script to export Slack conversations to a genericized markdown format suitable for ingestion by AI LLMs. Supports CSV, JSONL and JSON.
# typed: false
# frozen_string_literal: true
# usage: ruby ./slack_conversations_export_genericized.rb
require "slack-ruby-client"
require "nokogiri"
require "yaml"
require "json"
require "csv"
# Template written to disk on first run so the user can fill in credentials
# and filters. since/until_timestamp are passed straight to the Slack API
# (oldest/newest); agent_slack_ids lists the user IDs to label as agents.
CONFIG_TEMPLATE = <<~HEREDOC
slack_bot_token:
slack_channel_id:
since_timestamp:
until_timestamp:
skip_bot_threads: true
skip_attachment_threads: true
skip_empty_threads: true
include_bot_replies: false
genericize_tagged_users: true
niceify_channel_and_team_tags: true
export_format: csv
agent_slack_ids:
- SLACK_USER_ID
HEREDOC
# Placeholder names substituted for real identities in the export, so the
# output contains no PII and reads uniformly for LLM ingestion.
SLACKBOT_NAME = "SLACKBOT"
REQUESTER_NAME = "REQUESTER"
SUPPORT_AGENT_NAME = "SUPPORT_AGENT"
ANOTHER_USER_NAME = "ANOTHER_USER"
PRIVATE_SLACK_CHANNEL_NAME = "PRIVATE_SLACK_CHANNEL"
# Config is read from (and, on first run, written to) the working directory.
CONFIG_FILE = "config.yaml"
# First-run bootstrap: when no config file exists yet, drop the template on
# disk, tell the user what to do next, and stop — nothing below can run
# without a token and channel ID.
unless File.file?(CONFIG_FILE)
  File.write(CONFIG_FILE, CONFIG_TEMPLATE)
  [
    "Config template written to #{CONFIG_FILE}, please add configs and re-run script",
    "The Slack bot should be a member of the channel, and have the channels:history",
    "scope for public channels, and groups:history for private channels",
    "Supported export formats are csv, json and jsonl",
  ].each { |line| puts line }
  exit
end
# Load the config file.
# safe_load restricts deserialization to plain data types (strings, numbers,
# booleans, arrays, hashes) — all this config needs — instead of YAML.load,
# which can instantiate arbitrary Ruby objects from a crafted file.
config = YAML.safe_load(File.read(CONFIG_FILE))
channel = config["slack_channel_id"]

# Instantiate a Slack web API client authenticated with the bot token.
slack_bot_client = Slack::Web::Client.new(token: config["slack_bot_token"])

# Collect the timestamps of all top-level messages ("threads") in the channel.
# conversations_history paginates: the block is invoked once per page of
# results, so concat accumulates across pages.
puts "Enumerating threads..."
thread_timestamps = []
slack_bot_client.conversations_history(
  channel:,
  oldest: config["since_timestamp"],
  newest: config["until_timestamp"],
) do |res|
  thread_timestamps.concat(res.messages.map { |m| m["ts"] })
end
prepared_conversations = []
puts "#{thread_timestamps.count} threads found. Proceeding to transcribe threads..."

# Iterate through all the threads, turning each into a {request:, replies:}
# hash with identities and channel/team tags genericized.
thread_timestamps.each_with_index do |ts, i|
  # Progress indicator
  print "\n#{i + 1}/#{thread_timestamps.count}"

  # Fetch the full thread (top-level message plus replies); paginated like
  # conversations_history above.
  thread_messages = []
  slack_bot_client.conversations_replies(channel:, ts:) do |res|
    thread_messages.concat(res.messages)
  end

  # Skip bot threads if so configured
  next if config["skip_bot_threads"] && thread_messages.first.key?("bot_id")
  # Skip threads that have attachments if so configured
  next if config["skip_attachment_threads"] && thread_messages.first.key?("files")

  # The thread's author is treated as the requester throughout.
  thread_author = thread_messages.first["user"]

  # Transcribe the thread as an array of "AUTHOR: text" strings.
  prepared_messages = []
  thread_messages.each do |message|
    next if message.key?("bot_id") && !config["include_bot_replies"]
    # Some message subtypes carry no "text" key; to_s guards against the
    # NoMethodError that nil.empty? would raise.
    next if message["text"].to_s.empty?

    # Determine the message attribution (author).
    message_attribution = if message.key?("bot_id")
      SLACKBOT_NAME
    elsif message["user"] == thread_author
      REQUESTER_NAME
    elsif config["agent_slack_ids"]&.include?(message["user"])
      SUPPORT_AGENT_NAME
    else
      ANOTHER_USER_NAME
    end

    message_text = message["text"].dup

    # Genericize tagged users if so configured. This only applies to human
    # users, not bots; match[2..-2] extracts the user ID from "<@U…>".
    if config["genericize_tagged_users"]
      message_text.gsub!(/<@[A-Z0-9]+>/) do |match|
        if match[2..-2] == thread_author
          REQUESTER_NAME
        elsif config["agent_slack_ids"]&.include?(match[2..-2])
          SUPPORT_AGENT_NAME
        else
          ANOTHER_USER_NAME
        end
      end
    end

    # Niceify channel and team names if so configured.
    if config["niceify_channel_and_team_tags"]
      # Public channels: "<#C123|general>" -> "#general"
      message_text.gsub!(/<#C[A-Z0-9]+\|([a-z0-9-]+)>/, "#\\1")
      # Private channels have no name in the tag: "<#C123|>"
      message_text.gsub!(/<#C[A-Z0-9]+\|>/, PRIVATE_SLACK_CHANNEL_NAME)
      # Teams: "<!subteam^S123|@handle>" -> "@handle team"
      message_text.gsub!(/<!subteam\^[A-Z0-9]+\|(@[a-zA-Z0-9\-_]+)>/, "\\1 team")
    end

    # Convert Slack links to Markdown links (labelled, then bare URLs).
    message_text.gsub!(%r{<(https?:\/\/[^|]+)\|([^>]+)>}, "[\\2](\\1)")
    message_text.gsub!(%r{<(https?:\/\/[^|]+)>}, "\\1")
    # Parse emails & phone numbers down to their display text.
    message_text.gsub!(/<mailto:[^|]+\|([^>]+)>/, "\\1")
    message_text.gsub!(/<tel:[^|]+\|([^>]+)>/, "\\1")

    # Parse HTML entities such as &amp;, &gt;, etc.
    doc = Nokogiri::HTML.parse(message_text)
    message_text = doc.text

    prepared_messages << "#{message_attribution}: #{message_text}"
  end

  next if prepared_messages.count == 0
  # A thread with a single prepared message has no replies worth exporting.
  next if config["skip_empty_threads"] && prepared_messages.count == 1

  # Arrange the request and its replies into a hash. \A (not ^) so only the
  # leading attribution is stripped — ^ would also strip "REQUESTER: " at the
  # start of later lines of a multi-line message; sub (not gsub) because at
  # most one prefix can exist.
  prepared_conversations << {
    request: prepared_messages.first.sub(/\A#{REQUESTER_NAME}: /, ""),
    replies: prepared_messages.drop(1).join("\n"),
  }
  print " ✔"
end
# Write the prepared data to a local file.
# Guard the empty case: the CSV branch would otherwise crash on
# prepared_conversations.first.keys (nil), and an empty export is useless.
abort("No conversations matched the filters; nothing to export") if prepared_conversations.empty?

export_file_name = "exported_conversations" \
  "_from_#{channel}" \
  "_at_#{Time.now.to_i}" \
  "_since_#{config["since_timestamp"]}" \
  "_until_#{config["until_timestamp"]}"

case config["export_format"]
when "csv"
  export_file = "#{export_file_name}.csv"
  CSV.open(export_file, "wb") do |csv|
    # Header row from the hash keys (:request, :replies), then one row each.
    csv << prepared_conversations.first.keys
    prepared_conversations.each do |hash|
      csv << hash.values
    end
  end
when "jsonl"
  export_file = "#{export_file_name}.jsonl"
  File.write(export_file, prepared_conversations.map { |r| JSON.generate(r) }.join("\n"))
when "json"
  export_file = "#{export_file_name}.json"
  File.write(export_file, prepared_conversations.to_json)
else
  # Previously an unrecognized format fell through the case, leaving
  # export_file undefined and crashing with NameError on the puts below;
  # fail with a clear message instead.
  abort("Unknown export_format #{config["export_format"].inspect}; use csv, json or jsonl")
end

puts "\n✔ Exported #{prepared_conversations.count} threads to #{export_file}"
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment