Skip to content

Instantly share code, notes, and snippets.

@krsyoung
Last active December 3, 2021 17:50
Show Gist options
  • Save krsyoung/ee171d283628f7d8689ba2e7eb5064b1 to your computer and use it in GitHub Desktop.
Save krsyoung/ee171d283628f7d8689ba2e7eb5064b1 to your computer and use it in GitHub Desktop.
Helper tasks for migrating Ruby on Rails audited gem from YAML to JSONB
require 'benchmark'
# This file includes several tasks to help with the management of the audited gem. The two main tasks are:
#
# yaml_to_jsonb - this will migrate yaml data stored in a column named audited_changes_yaml to jsonb data stored
# in a column named audited_changes.
#
# scrub - this will look at all of the columns that have been excluded from audited classes and ensure that
# they are all removed from the jsonb data.
#
# Here is a sample of the migration needed:
# # 20211126022727_change_audited_changes_from_yaml_to_json.rb
# # The general approach is:
# # 1. Quick migration, change the schema
# # 2. Slow task, migrate the old YAML column to the new JSONB column
# # 3. Quick migration, drop the old YAML column once the data migration is trusted
# #
# class ChangeAuditedChangesFromYamlToJson < ActiveRecord::Migration[6.1]
# def up
# # rename the current column
# rename_column :audits, :audited_changes, :audited_changes_yaml
# # add a new column to catch everything going forward, this will make it look like all
# # audits are empty ... so best to run the data migration soon after
# add_column :audits, :audited_changes, :jsonb, default: {}
# end
# def down
# remove_column :audits, :audited_changes
# rename_column :audits, :audited_changes_yaml, :audited_changes
# end
# end
# prepare a string for a SQL statement (double up single quotes)
def escape_sql(string)
return string.gsub("'","''")
end
# converts a yaml column (audited_changes_yaml) to jsonb (audited_changes)
def convert_yaml_to_json(r)
# parse the yaml text into an object
parsed_yaml = YAML.safe_load(r['audited_changes_yaml'], permitted_classes: [Date, Time, ActiveSupport::HashWithIndifferentAccess, ActiveSupport::TimeZone, ActiveSupport::TimeWithZone, BigDecimal], aliases: true)
# dump the object to json using Oj (faster!?)
json_string = Oj.dump(parsed_yaml, mode: :compat)
# avoid any conflicts with sql (must escape single quotes)
escaped_json = escape_sql(json_string)
# Run a straight SQL update to avoid instantiating the model
ActiveRecord::Base.connection.execute(
"""
UPDATE audits
SET audited_changes = '#{escaped_json}'::jsonb
WHERE id = #{r['id']}
"""
)
end
namespace :audited do
desc "Transform Audited YAML to JSONB"
task yaml_to_jsonb: :environment do
Rails.logger = Logger.new(STDOUT)
# number of records to execute in a batch, locally 100,000 takes about 7 minutes
BATCH_SIZE = 100000
Rails.logger.info "About to process #{BATCH_SIZE} records"
# track the conversion and insertion time of all records
time = Benchmark.realtime {
ActiveRecord::Base.connection.execute('SELECT id, audited_changes_yaml FROM audits WHERE audited_changes = \'{}\' LIMIT ' + BATCH_SIZE.to_s).map do |r|
convert_yaml_to_json(r)
end
}
# track how long the overall batch took to complete
Rails.logger.info "Completed in: #{time/60}m"
# count the number of records remaining to process
count = Audited::Audit.where(audited_changes: {}).count
Rails.logger.info "Records remaining: #{count}"
# calculate an estimate of the remaining time based on the current processing time
remaining_time = (time / BATCH_SIZE * count) / 60
Rails.logger.info "Approx remaining time: #{remaining_time}"
end
desc "Remove sensitive data from JSONB audited fields based on non-audited fields defined"
task scrub: :environment do
Rails.logger = Logger.new(STDOUT)
# max number of records to process
BATCH_SIZE = 1000
Rails.logger.info "About to process #{BATCH_SIZE} records"
# get a list of audit records we're looking at based on all existing audited records
auditable_types = Audited::Audit.group(:auditable_type).count
# track the overall runtime
time = Benchmark.realtime {
# loop over the different audited classes
auditable_types.each do |key, value|
Rails.logger.info " - processing #{key}"
# if a class no longer exists, it either means we renamed it or it was removed.
begin
non_audited_columns = key.constantize.non_audited_columns
rescue => e
# could possibly just delete all of the records here
next
end
non_audited_columns.each do |column|
Rails.logger.info " - hunting for #{column}"
# confirmation_token
ActiveRecord::Base.connection.execute('SELECT id, audited_changes FROM audits WHERE auditable_type = \'' + key + '\' AND audited_changes -> \'' + column + '\' is not null LIMIT ' + BATCH_SIZE.to_s).map do |r|
object = Oj.load(r['audited_changes'])
# again try Oj to speed things up as much as possible
json_string = Oj.dump(object.except(*non_audited_columns), mode: :compat)
# escape for sql
escaped_json = escape_sql(json_string)
# update
ActiveRecord::Base.connection.execute(
"""
UPDATE audits
SET audited_changes = '#{escaped_json}'::jsonb
WHERE id = #{r['id']}
"""
)
end
end
end
}
Rails.logger.info "Completed in: #{time/60}m"
count = Audited::Audit.where("audited_changes -> 'encrypted_password' is not null").count
Rails.logger.info "Records remaining: #{count}"
remaining_time = (time / BATCH_SIZE * count) / 60
Rails.logger.info "Approx remaining time: #{remaining_time}"
end
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment