Skip to content

Instantly share code, notes, and snippets.

@mmasashi
Created April 28, 2015 23:45
Show Gist options
  • Save mmasashi/26a5463cda88b23a3593 to your computer and use it in GitHub Desktop.
Save mmasashi/26a5463cda88b23a3593 to your computer and use it in GitHub Desktop.
Encode a string for Redshift
class RedshiftString
  # Redshift supports UTF-8 but it enforces stricter rules than other
  # implementations such as MySQL or Ruby. Every public method below returns
  # a Redshift-safe UTF-8 string built from the given string. They share one
  # contract and differ only in how they walk the string, so that they can be
  # benchmarked against each other.
  #
  # Per the Redshift document
  # http://docs.aws.amazon.com/redshift/latest/dg/multi-byte-character-load-errors.html
  # the code points U+FDD0..U+FDEF, U+FFFE and U+FFFF are substituted.
  #
  # Shared contract:
  #   string  - the String to sanitize.
  #   options - Hash of String#encode keyword options (for example
  #             invalid: :replace, undef: :replace). options[:replace], when
  #             present, is also the text substituted for each unsafe code
  #             point; it may be of any length, including empty.
  #   Returns a new UTF-8 String; the argument itself is not modified.
  #   Errors raised by String#encode (for example on invalid bytes when no
  #   replacement policy is given) propagate to the caller.

  # Substitute used when options[:replace] is absent.
  DEFAULT_REPLACEMENT = "\uFFFD".freeze

  # Character-by-character variant: maps every character and joins the
  # pieces into a fresh string.
  def self.encode(string, options = {})
    replacement = options[:replace] || DEFAULT_REPLACEMENT
    pieces = to_utf8(string, options).each_char.map do |char|
      unsafe_codepoint?(char.ord) ? replacement : char
    end
    pieces.join
  end

  # Index-assignment variant that looks the replacement up on every hit.
  def self.encode1(string, options = {})
    result = to_utf8(string, options)
    # Walk from the end: a replacement whose length is not 1 would otherwise
    # shift the indexes of the characters that are still to be visited.
    result.codepoints.each_with_index.reverse_each do |codepoint, index|
      next unless unsafe_codepoint?(codepoint)

      result[index] = options[:replace] || DEFAULT_REPLACEMENT
    end
    result
  end

  # Index-assignment variant with the replacement looked up once.
  def self.encode2(string, options = {})
    result = to_utf8(string, options)
    rep_str = options[:replace] || DEFAULT_REPLACEMENT
    # Reverse order keeps the not-yet-visited indexes valid (see encode1).
    result.codepoints.each_with_index.reverse_each do |codepoint, index|
      result[index] = rep_str if unsafe_codepoint?(codepoint)
    end
    result
  end

  # Streaming variant: enumerates code points lazily instead of building the
  # whole code point array first.
  def self.encode3(string, options = {})
    result = to_utf8(string, options)
    rep_str = options[:replace] || DEFAULT_REPLACEMENT
    # Every substitution swaps one character for rep_str.length characters;
    # drift is how far the remaining characters have moved as a result.
    growth = rep_str.length - 1
    drift = 0
    # Enumerate a snapshot so the in-place edits cannot disturb the iteration.
    result.dup.each_codepoint.with_index do |codepoint, index|
      next unless unsafe_codepoint?(codepoint)

      result[index + drift] = rep_str
      drift += growth
    end
    result
  end

  # Transcodes string to UTF-8, forwarding options to String#encode as
  # keywords (a positional Hash is not read as encode options on Ruby 3).
  def self.to_utf8(string, options)
    if string.encoding == Encoding::UTF_8
      # Round trip through UTF-16 -- presumably so that the bytes are really
      # validated even though source and target encodings are the same.
      string.encode(Encoding::UTF_16, **options).encode(Encoding::UTF_8)
    else
      string.encode(Encoding::UTF_8, **options)
    end
  end

  # True for the code points that must be substituted.
  def self.unsafe_codepoint?(codepoint)
    codepoint.between?(0xFDD0, 0xFDEF) || codepoint == 0xFFFE || codepoint == 0xFFFF
  end

  private_class_method :to_utf8, :unsafe_codepoint?
end
# Encoding options that the test and benchmark helpers pass to RedshiftString:
# invalid and undefined sequences are replaced.
@option = { invalid: :replace, undef: :replace }

## test
# Each row is [raw input, expected output]; the rows are turned into the
# { string:, result: } hashes that run_test_cases reads.
@test_data = [
  ["Straße", "Straße"],
  ["\xeb\x13\x00", "\uFFFD\u0013\u0000"],
  ["\xef\xb7\x91", "\uFFFD"],
  ["\xef\xbf\xbf", "\uFFFD"],
].map do |input, expected|
  { string: input, result: expected.encode(Encoding::UTF_8) }
end
# Runs RedshiftString.<method> on input with the shared @option hash and
# raises unless the outcome equals expected_output; prints a progress dot
# on success. The encoding parameter is accepted but not used by the body.
def assert_encode(method, input, expected_output, encoding = Encoding::UTF_8)
  actual = RedshiftString.send(method, input, @option)
  if actual != expected_output
    raise "Invalid encode: #{method} input:#{input} expected:#{expected_output}"
  end

  print '.'
end
# Checks one RedshiftString class method against every entry of @test_data,
# printing a header first and " OK" once all entries have been asserted.
def run_test_cases(method)
  print "testing '#{method}'."
  @test_data.each { |test_case| assert_encode(method, test_case[:string], test_case[:result]) }
  puts " OK"
end
# Exercise every implementation, in the order they are defined.
%i[encode encode1 encode2 encode3].each { |variant| run_test_cases(variant) }
## benchmark
require 'benchmark'
# Number of passes over @benchmark_data made by each run_benchmark call.
@num_trial = 10_000

# Benchmark inputs: five text samples in different scripts, followed by the
# same three raw byte sequences that the test inputs use.
@benchmark_data = [
  "The Benchmark module provides methods to measure and report the time used to execute Ruby code.",
  "ネパール地震の緊急支援募金にご協力ください",
  "世界平和構築へ 日米同盟強化",
  "ᚠᛇᚻ᛫ᛒᛦᚦ᛫ᚠᚱᚩᚠᚢᚱ᛫ᚠᛁᚱᚪ᛫ᚷᛖᚻᚹᛦᛚᚳᚢᛗ",
  "Τη γλώσσα μου έδωσαν ελληνική",
] + ["\xeb\x13\x00", "\xef\xb7\x91", "\xef\xbf\xbf"].map { |bytes| bytes.encode(Encoding::UTF_8) }
# Calls RedshiftString.<method> with @option on every string of
# @benchmark_data, repeating the whole pass @num_trial times.
def run_benchmark(method)
  @num_trial.times do
    @benchmark_data.each { |sample| RedshiftString.send(method, sample, @option) }
  end
end
puts
puts "Benchmarking...."
# One unlabeled report row per implementation, in the order they are defined.
Benchmark.bm do |bm|
  %i[encode encode1 encode2 encode3].each do |variant|
    bm.report { run_benchmark(variant) }
  end
end
testing 'encode'..... OK
testing 'encode1'..... OK
testing 'encode2'..... OK
testing 'encode3'..... OK
Benchmarking....
user system total real
1.620000 0.010000 1.630000 ( 1.674482)
0.810000 0.010000 0.820000 ( 0.825965)
0.820000 0.020000 0.840000 ( 0.848739)
0.770000 0.000000 0.770000 ( 0.783537)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment