Created April 28, 2015 23:45
-
-
Save mmasashi/26a5463cda88b23a3593 to your computer and use it in GitHub Desktop.
Encode string for redshift
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Produces strings that Amazon Redshift accepts when loading UTF-8 data.
#
# Four interchangeable implementations of the same transformation are kept
# side by side (+encode+, +encode1+, +encode2+, +encode3+) so that they can be
# tested and benchmarked against each other. All four take the same arguments
# and return the same result.
class RedshiftString
  # Substituted for a rejected character when options[:replace] is not given.
  DEFAULT_REPLACEMENT = "\uFFFD".freeze

  # Redshift supports UTF-8 but it enforces stricter rule than other
  # implementations such as MySQL or Ruby. This method returns a
  # Redshift-safe string from the given string.
  #
  # Strategy: rebuild the string character by character.
  #
  # @param string [String] text to sanitize; it is not modified
  # @param options [Hash] options forwarded to String#encode (for example
  #   +invalid: :replace+); +options[:replace]+, when present, is also used
  #   in place of every character Redshift rejects
  # @return [String] a new UTF-8 string
  def self.encode(string, options = {})
    replacement = options[:replace] || DEFAULT_REPLACEMENT
    chars = to_utf8(string, options).each_char.map do |char|
      redshift_invalid?(char.ord) ? replacement : char
    end
    chars.join
  end

  # Same contract as +encode+.
  #
  # Strategy: patch the transcoded string in place, looking the replacement
  # up again for every rejected character.
  #
  # @param string [String] text to sanitize; it is not modified
  # @param options [Hash] see +encode+
  # @return [String] a new UTF-8 string
  def self.encode1(string, options = {})
    result = to_utf8(string, options)
    # Walk from the end: a replacement whose length is not exactly one
    # character would otherwise shift every index that follows it.
    result.codepoints.each_with_index.reverse_each do |codepoint, index|
      next unless redshift_invalid?(codepoint)

      result[index] = options[:replace] || DEFAULT_REPLACEMENT
    end
    result
  end

  # Same contract as +encode+.
  #
  # Strategy: like +encode1+, but the replacement is looked up only once.
  #
  # @param string [String] text to sanitize; it is not modified
  # @param options [Hash] see +encode+
  # @return [String] a new UTF-8 string
  def self.encode2(string, options = {})
    result = to_utf8(string, options)
    replacement = options[:replace] || DEFAULT_REPLACEMENT
    # Back to front so that earlier indices stay valid after each splice.
    result.codepoints.each_with_index.reverse_each do |codepoint, index|
      result[index] = replacement if redshift_invalid?(codepoint)
    end
    result
  end

  # Same contract as +encode+.
  #
  # Strategy: like +encode2+, but driven by the +each_codepoint+ enumerator
  # instead of the +codepoints+ array.
  #
  # @param string [String] text to sanitize; it is not modified
  # @param options [Hash] see +encode+
  # @return [String] a new UTF-8 string
  def self.encode3(string, options = {})
    result = to_utf8(string, options)
    replacement = options[:replace] || DEFAULT_REPLACEMENT
    # reverse_each drains the enumerator before the block runs, so the string
    # is never modified while it is being enumerated, and replacing back to
    # front keeps the remaining indices valid.
    result.each_codepoint.with_index(0).reverse_each do |codepoint, index|
      result[index] = replacement if redshift_invalid?(codepoint)
    end
    result
  end

  # Transcodes +string+ to a new UTF-8 string, applying +options+.
  #
  # UTF-8 input is round-tripped through UTF-16 -- presumably because a
  # UTF-8 to UTF-8 String#encode does not transcode and so would leave
  # invalid bytes untouched.
  #
  # The options are passed with a double splat so that they reach
  # String#encode as keyword options on Ruby 3 as well as Ruby 2.
  def self.to_utf8(string, options)
    if string.encoding == Encoding::UTF_8
      string.encode(Encoding::UTF_16, **options).encode(Encoding::UTF_8)
    else
      string.encode(Encoding::UTF_8, **options)
    end
  end

  # True for the code points Redshift refuses to load:
  # U+FDD0..U+FDEF, U+FFFE and U+FFFF.
  #
  # Per Redshift document
  # http://docs.aws.amazon.com/redshift/latest/dg/multi-byte-character-load-errors.html
  def self.redshift_invalid?(codepoint)
    codepoint.between?(0xFDD0, 0xFDEF) || codepoint == 0xFFFE || codepoint == 0xFFFF
  end

  private_class_method :to_utf8, :redshift_invalid?
end
# Transcoding options handed to every implementation under test.
@option = { invalid: :replace, undef: :replace }

## test
# Each case pairs the raw input (:string) with the UTF-8 text every
# implementation is expected to return for it (:result).
@test_data = [
  ["Straße", "Straße"],
  # bytes that are not valid UTF-8
  ["\xeb\x13\x00", "\uFFFD\u0013\u0000"],
  # U+FDD1
  ["\xef\xb7\x91", "\uFFFD"],
  # U+FFFF
  ["\xef\xbf\xbf", "\uFFFD"],
].map do |input, expected|
  { string: input, result: expected.encode(Encoding::UTF_8) }
end
# Runs one RedshiftString implementation on +input+ with the shared @option
# hash and checks the result; prints a dot on success.
#
# @param method [Symbol] name of the RedshiftString class method to call
# @param input [String] string handed to the implementation
# @param expected_output [String] value the implementation must return
# @param encoding [Encoding] accepted but not used; kept so that existing
#   callers passing a fourth argument keep working
# @raise [RuntimeError] when the returned value differs from +expected_output+
def assert_encode(method, input, expected_output, encoding = Encoding::UTF_8)
  actual = RedshiftString.send(method, input, @option)
  unless actual == expected_output
    # inspect keeps invalid bytes and control characters readable in the
    # failure message instead of embedding them raw.
    raise "Invalid encode: #{method} input:#{input.inspect} " \
          "expected:#{expected_output.inspect} actual:#{actual.inspect}"
  end
  print '.'
end
# Checks one implementation against every entry of @test_data, printing a
# progress line that ends in " OK" once all cases have passed.
#
# @param method [Symbol] name of the RedshiftString class method to check
def run_test_cases(method)
  print "testing '#{method}'."
  @test_data.each { |test_case| assert_encode(method, test_case[:string], test_case[:result]) }
  puts " OK"
end
# Run the correctness checks for each implementation in turn.
[:encode, :encode1, :encode2, :encode3].each do |implementation|
  run_test_cases(implementation)
end
## benchmark
require 'benchmark'

# Number of passes over @benchmark_data made for each implementation.
@num_trial = 10000

# Benchmark inputs: five text samples in different scripts, followed by the
# three byte-escaped strings that the test cases also use.
@benchmark_data = [
  "The Benchmark module provides methods to measure and report the time used to execute Ruby code.",
  "ネパール地震の緊急支援募金にご協力ください",
  "世界平和構築へ 日米同盟強化",
  "ᚠᛇᚻ᛫ᛒᛦᚦ᛫ᚠᚱᚩᚠᚢᚱ᛫ᚠᛁᚱᚪ᛫ᚷᛖᚻᚹᛦᛚᚳᚢᛗ",
  "Τη γλώσσα μου έδωσαν ελληνική",
] + [
  "\xeb\x13\x00",
  "\xef\xb7\x91",
  "\xef\xbf\xbf",
].map { |raw| raw.encode(Encoding::UTF_8) }
# Calls one implementation on every benchmark string, repeating the whole
# pass @num_trial times. Return values are discarded.
#
# @param method [Symbol] name of the RedshiftString class method to exercise
def run_benchmark(method)
  @num_trial.times do
    @benchmark_data.each { |sample| RedshiftString.send(method, sample, @option) }
  end
end
puts
puts "Benchmarking...."
# One unlabeled report row per implementation, in the same order as the tests.
Benchmark.bm do |bench|
  [:encode, :encode1, :encode2, :encode3].each do |implementation|
    bench.report { run_benchmark(implementation) }
  end
end
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
testing 'encode'..... OK
testing 'encode1'..... OK
testing 'encode2'..... OK
testing 'encode3'..... OK
Benchmarking....
user system total real
1.620000 0.010000 1.630000 ( 1.674482)
0.810000 0.010000 0.820000 ( 0.825965)
0.820000 0.020000 0.840000 ( 0.848739)
0.770000 0.000000 0.770000 ( 0.783537)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment