Last active
June 8, 2023 18:29
-
-
Save zk-1/eed6de21935d2bd04455b87516198c3d to your computer and use it in GitHub Desktop.
Ruby script to export Slack conversations to a genericized markdown format suitable for ingestion by AI LLMs. Supports CSV, JSONL and JSON.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# typed: false
# frozen_string_literal: true

# usage: ruby ./slack_conversations_export_genericized.rb

require "slack-ruby-client"
require "nokogiri"
require "yaml"
require "json"
require "csv"

# Skeleton written to CONFIG_FILE on first run. Keys, as consumed by this script:
#   slack_bot_token / slack_channel_id  - credentials and the channel to export
#   since_timestamp / until_timestamp   - passed as oldest/newest to conversations_history
#   skip_bot_threads                    - skip threads whose first message has a bot_id
#   skip_attachment_threads             - skip threads whose first message has files
#   skip_empty_threads                  - skip threads with only one transcribed message
#   include_bot_replies                 - keep bot messages inside threads
#   genericize_tagged_users             - replace <@USERID> mentions with role labels
#   niceify_channel_and_team_tags       - rewrite channel and subteam tags as readable text
#   export_format                       - csv, json or jsonl
#   agent_slack_ids                     - user IDs labelled as SUPPORT_AGENT_NAME
CONFIG_TEMPLATE = <<~HEREDOC
  slack_bot_token:
  slack_channel_id:
  since_timestamp:
  until_timestamp:
  skip_bot_threads: true
  skip_attachment_threads: true
  skip_empty_threads: true
  include_bot_replies: false
  genericize_tagged_users: true
  niceify_channel_and_team_tags: true
  export_format: csv
  agent_slack_ids:
    - SLACK_USER_ID
HEREDOC

# Generic labels substituted for real identities in the exported transcript.
SLACKBOT_NAME = "SLACKBOT" # any message carrying a bot_id
REQUESTER_NAME = "REQUESTER" # the author of the thread's first message
SUPPORT_AGENT_NAME = "SUPPORT_AGENT" # users listed under agent_slack_ids
ANOTHER_USER_NAME = "ANOTHER_USER" # everybody else
PRIVATE_SLACK_CHANNEL_NAME = "PRIVATE_SLACK_CHANNEL" # channel tags that carry no name

# Path of the YAML config, relative to the current working directory.
CONFIG_FILE = "config.yaml"

# First run: write the config template, explain what to fill in, and stop so the
# user can edit the file before any Slack API call is attempted.
unless File.file?(CONFIG_FILE)
  File.write(CONFIG_FILE, CONFIG_TEMPLATE)
  puts(
    "Config template written to #{CONFIG_FILE}, please add configs and re-run script",
    "The Slack bot should be a member of the channel, and have the channels:history",
    "scope for public channels, and groups:history for private channels",
    "Supported export formats are csv, json and jsonl",
  )
  exit
end
# Load the config file. safe_load only builds plain YAML types, which is all this
# config needs, and behaves the same across Psych versions.
config = YAML.safe_load(File.read(CONFIG_FILE))

# An empty or malformed file parses to nil/false/a scalar; fail with a readable
# message instead of a NoMethodError on the first lookup.
abort "#{CONFIG_FILE} is empty or is not a YAML mapping" unless config.is_a?(Hash)

missing_keys = %w[slack_bot_token slack_channel_id].select { |key| config[key].to_s.strip.empty? }
abort "Missing required value(s) in #{CONFIG_FILE}: #{missing_keys.join(", ")}" unless missing_keys.empty?

# Check the export format before spending API calls on a run that could not be written out.
unless %w[csv json jsonl].include?(config["export_format"])
  abort "Unsupported export_format #{config["export_format"].inspect} in #{CONFIG_FILE}; " \
        "expected csv, json or jsonl"
end

channel = config["slack_channel_id"]

# Instantiate a Slack web API client
slack_bot_client = Slack::Web::Client.new(token: config["slack_bot_token"])

# Get list of timestamps of all top-level messages ('threads') in the channel.
# Passing a block makes the client yield one response per page of results.
puts "Enumerating threads..."
thread_timestamps = []
slack_bot_client.conversations_history(
  channel:,
  oldest: config["since_timestamp"],
  newest: config["until_timestamp"],
) do |res|
  # Replies that were also sent to the channel appear in the history with a
  # thread_ts that differs from their own ts; they are not thread roots, so
  # they are left out here and picked up via their parent thread instead.
  # NOTE(review): relies on Slack's thread_broadcast message shape - confirm.
  thread_roots = res.messages.reject { |m| m["thread_ts"] && m["thread_ts"] != m["ts"] }
  thread_timestamps.concat(thread_roots.map { |m| m["ts"] })
end
prepared_conversations = []
puts "#{thread_timestamps.count} threads found. Proceeding to transcribe threads..."

# Tolerate a missing agent_slack_ids entry in the config.
agent_ids = Array(config["agent_slack_ids"])

# Maps a Slack user ID to its generic label, relative to the thread's author.
role_label = lambda do |user_id, thread_author|
  if user_id == thread_author
    REQUESTER_NAME
  elsif agent_ids.include?(user_id)
    SUPPORT_AGENT_NAME
  else
    ANOTHER_USER_NAME
  end
end

# Converts the Slack markup of one message into genericized Markdown text.
# Every pattern excludes ">" from the part before the "|" so a match can never
# run past the end of its own <...> token into a later one.
render_text = lambda do |raw_text, thread_author|
  text = raw_text.dup

  # Genericize tagged users if so configured. This is only for human users, not bots.
  # Accepts both <@U123> and the labelled <@U123|name> form.
  if config["genericize_tagged_users"]
    text.gsub!(/<@[A-Z0-9]+(?:\|[^>]*)?>/) do |mention|
      role_label.call(mention[/[A-Z0-9]+/], thread_author)
    end
  end

  # Niceify channel and team names if so configured
  if config["niceify_channel_and_team_tags"]
    # Public channels (names may contain underscores and non-ASCII characters)
    text.gsub!(/<#C[A-Z0-9]+\|([^|>]+)>/, "#\\1")
    # Private channels (the tag carries no name)
    text.gsub!(/<#C[A-Z0-9]+\|>/, PRIVATE_SLACK_CHANNEL_NAME)
    # Teams
    text.gsub!(/<!subteam\^[A-Z0-9]+\|(@[a-zA-Z0-9\-_]+)>/, "\\1 team")
  end

  # Convert Slack links to Markdown links
  text.gsub!(%r{<(https?://[^|>]+)\|([^>]+)>}, "[\\2](\\1)")
  text.gsub!(%r{<(https?://[^|>]+)>}, "\\1")

  # Parse emails & phone numbers, keeping only their display text
  text.gsub!(/<mailto:[^|>]+\|([^>]+)>/, "\\1")
  text.gsub!(/<tel:[^|>]+\|([^>]+)>/, "\\1")

  # Decode HTML entities such as &amp;, &gt;, etc.
  Nokogiri::HTML.parse(text).text
end

# Iterate through all the threads
thread_timestamps.each_with_index do |ts, i|
  # Progress indicator
  print "\n#{i + 1}/#{thread_timestamps.count}"

  # API call to retrieve the thread's messages (one yielded response per page)
  thread_messages = []
  slack_bot_client.conversations_replies(channel:, ts:) do |res|
    thread_messages.concat(res.messages)
  end

  root_message = thread_messages.first
  # Nothing came back for this timestamp, so there is nothing to transcribe.
  next if root_message.nil?
  # Skip bot threads if so configured
  next if config["skip_bot_threads"] && root_message.key?("bot_id")
  # Skip threads that have attachments if so configured
  next if config["skip_attachment_threads"] && root_message.key?("files")

  thread_author = root_message["user"]

  # Transcribe the thread as an array of "LABEL: text" strings
  prepared_messages = thread_messages.filter_map do |message|
    from_bot = message.key?("bot_id")
    next if from_bot && !config["include_bot_replies"]
    # to_s guards against messages that carry no text at all
    next if message["text"].to_s.empty?

    speaker = from_bot ? SLACKBOT_NAME : role_label.call(message["user"], thread_author)
    "#{speaker}: #{render_text.call(message["text"], thread_author)}"
  end

  next if prepared_messages.empty?
  next if config["skip_empty_threads"] && prepared_messages.size == 1

  # Arrange the request and replies into a hash. Only the leading label is
  # removed from the request; later lines of a multi-line message are untouched.
  prepared_conversations << {
    request: prepared_messages.first.delete_prefix("#{REQUESTER_NAME}: "),
    replies: prepared_messages.drop(1).join("\n"),
  }
  print " ✔"
end
# Write the prepared data to a local file in the current working directory
export_format = config["export_format"]

# Refuse unknown formats explicitly rather than writing nothing and reporting success.
unless %w[csv jsonl json].include?(export_format)
  abort "\nUnsupported export_format #{export_format.inspect}; expected csv, json or jsonl"
end

export_file_name = "exported_conversations" \
                   "_from_#{channel}" \
                   "_at_#{Time.now.to_i}" \
                   "_since_#{config["since_timestamp"]}" \
                   "_until_#{config["until_timestamp"]}"
export_file = "#{export_file_name}.#{export_format}"

case export_format
when "csv"
  # Fall back to the known column names so an empty export still yields a valid
  # header-only CSV instead of crashing on nil.
  csv_headers = prepared_conversations.first&.keys || %i[request replies]
  CSV.open(export_file, "wb") do |csv|
    csv << csv_headers
    prepared_conversations.each { |conversation| csv << conversation.values }
  end
when "jsonl"
  # One JSON object per line
  File.write(export_file, prepared_conversations.map { |r| JSON.generate(r) }.join("\n"))
when "json"
  # A single JSON array holding every conversation
  File.write(export_file, prepared_conversations.to_json)
end

puts "\n✔ Exported #{prepared_conversations.count} threads to #{export_file}"
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment